#!/usr/bin/env python
# getbib by kkumer@phy.hr 
# v. 2.0 2014-02-19 

# Gets BibTeX info for an article from INSPIRE HEP database

##########################################
# INSPIRE HEP server:
server="http://inspirehep.net/"
# CGI commands (%s stands for ID that will be substituted)
cgisummary="search?p=fin+eprint+%s"   # search page by arXiv eprint number
cgibibtex="record/%s/export/hx"   # BibTeX export for a given record id
cgikeywords="record/%s/export/xm"   # MARC XML export (keywords are read from here)
cgitexkey="search?p=fin+texkey+%s"   # search page by INSPIRE texkey

##########################################


import requests,BeautifulSoup,sys,re,string,commands,time

# Set to 1 to get request/response diagnostics on stderr.
DEBUG = 0

# Extra keyword arguments handed to requests.get(); verbose logging is
# attached only when debugging is switched on.
config = {'verbose': sys.stderr} if DEBUG else {}

# Identify this client to the INSPIRE server.
headers = {'User-Agent': 'getbib/2.0beta by kkumer[at]phy.hr'}

# Checking input
if len(sys.argv)!=2:
    print ''
    print 'Usage: getbib <string containing arXiv number or texkey>'
    print '   Examples: '
    print '     getbib 0902.0002 '
    print '     getbib Arsiwalla:2009iq '
    print ''
    sys.exit(1)
else:
    # regex is built like  (texkey|hep-ph/dddddd|dddd.dddd)
    id = re.search(r'(([A-Za-z]+:[12][0-9][0-9][0-9][a-z]+)|((hep|gr|quant)-(ph|th|ex|lat|qc)/[0-9\.]{7})|([0-9]{4}\.[0-9]{4}))', sys.argv[1]).groups()[0]
    if DEBUG: print "ID = %s" % id


class hep:
    """One HTTP GET against the INSPIRE HEP server.

    Substitutes paper_id into the CGI command template, fetches the
    resulting URL and keeps the response body in self.data.  Any
    non-200 status terminates the script with an error report.
    """
    def __init__(self,command,paper_id):
        # build the full request URL from the command template
        target = server + (command % paper_id)
        response = requests.get(target, headers=headers, config=config)
        # servers insist on the following header
        if response.status_code != 200:
            print "Server error!"
            print "Errcode: %i " % response.status_code
            if DEBUG: print "Headers: %s" % str(response.headers)
            sys.exit(1)
        # Getting data (gzipped paper or "invalid id" or "wait for PS")
        self.data = response.text

# Resolve the user-supplied id to an INSPIRE record id (recid).
# if hep number given on command line:
#     get summary page and extract paper "key" ID from there
if re.search('[/.]',id):
    # URL-encode the slash of old-style arXiv ids (e.g. hep-ph/9901001)
    id=string.replace(id,'/','%2F')
    page=hep(cgisummary,id)
    try:
        recid=re.search(r'recid=(.*?)&',page.data).group(1)
    except AttributeError:
        # re.search returned None: the summary page carries no recid
        sys.stderr.write('ERROR: No page. Are you sure paper %s exists?\n' % string.replace(id,'%2F','/'))
        sys.exit(1)
# same for texkey
elif re.search(':',id):
    id=string.replace(id,':','%3A')
    page=hep(cgitexkey,id)
    try:
        recid=re.search(r'recid=(.*?)&',page.data).group(1)
    except AttributeError:
        # FIX: undo the ':'->'%3A' encoding done above (previously this
        # wrongly undid '%2F', so the raw-encoded id was printed)
        sys.stderr.write('ERROR: No page. Are you sure paper %s exists?\n' % string.replace(id,'%3A',':'))
        sys.exit(1)
# Otherwise expect that argument given is recid itself
else:
    recid=id

# Getting  BiBTeX entry: everything from "@article" to the closing brace line
page=hep(cgibibtex,recid)
try:
    bibtex=re.search('@article.*\n}\n',page.data,re.DOTALL).group()
except AttributeError:
    # re.search returned None: export page had no BibTeX entry.
    # (Narrowed from a bare except, which also swallowed KeyboardInterrupt.)
    sys.stderr.write('ERROR: No BibTeX. Are you sure paper %s exists?\n' % recid)
    sys.exit(1)

# Getting  keywords from the MARC XML export of the record
page=hep(cgikeywords,recid)
# Legacy (pre-bs4) BeautifulSoup parse of the XML export.
soup = BeautifulSoup.BeautifulSoup(page.data)
# All children of the <record> element (datafields and, presumably,
# whitespace NavigableStrings -- the except below skips the latter).
dfs= [df for df in soup.record]
keywords=[]
for df in dfs:
    try:
        # Datafields with tag "695" appear to carry the keywords; the
        # subfield with code "a" holds the keyword text -- TODO confirm
        # against the INSPIRE MARC schema.
        if dict(df.attrs)['tag'] == '695':
            keywords.append(df.find(code='a').text)
    except:
        # best-effort: skip nodes without attrs or the expected subfield
        # rather than abort the whole lookup
        pass
# cleaning up
#keywords=[re.sub('\n',' ',str).strip() for str in keywords_raw]
# putting in BibTeX format
kwlist=['     keywords  = "%s",\n' % str for str in keywords]
# appending it
bibtex=re.sub('",\n}\n','",\n '+string.join(kwlist),bibtex)

# appending INSPIRE HEP record id
bibtex=bibtex+'      recid  = "%s",\n' % recid

# appending pdf, group and comments field
bibtex=bibtex+'      pdf  = "%s",\n      group  = "",\n      comments  = ""\n}\n' % string.replace(id+'.pdf','%2F','-')
print bibtex
