#!/usr/bin/env python
# gethep by kkumer@phy.hr 
# $Id: gethep.py,v 2.1 2012-03-05 12:09:26+01 kkumer Exp kkumer $
# Version 2.1

# Usage:
# gethep -s quant-ph/0012149  -> saves quant-ph-0012149.pdf [default]
#                                (quant-ph-0012149.ps.gz when format="ps")
# gethep -d quant-ph/0012149  -> displays it on screen


##########################################

# User modifiable options:

# host name of the preprint server the papers are requested from:
server = "arxiv.org"
# which file format to ask for, either "pdf" or "ps":
format = "pdf"
# query string carrying the PS resolution; it has to stay defined even
# though it makes no difference for pdf
res = "?dpi=600&font=bitmapped"
# viewer program used for displaying ("acroread", "gv", whatever you use)
disprog = "acroread"
# where temporary files are put
tmpdir = "/tmp"
# seconds to wait for the file to get created before asking again;
# 10 more seconds are added after every new try
waittime = 5
# non-zero switches on diagnostic messages on stderr
debug = 1

##########################################


import commands
import httplib
import mimetools
import os
import pipes
import re
import string
import sys
import time

oldflag = 0  # becomes 1 once an old-style paper has been encountered

# Look up the extension of saved files for the configured format;
# any format other than "ps" or "pdf" stops the script.
extension = {"ps": ".ps.gz", "pdf": ".pdf"}.get(format)
if extension is None:
    print("Unknown format: "+format)
    sys.exit(1)


# Reading the command line: the paper id is the last argument, with or
# without a flag in front of it. Any other argument count prints the
# usage text and exits with status 1.
argc = len(sys.argv)
if argc == 3:
    id = sys.argv[2]
elif argc == 2:
    id = sys.argv[1]
else:
    print('\n'.join((
        '',
        '==== gethep 2.1 by kkumer@phy.hr 2012-03-05  ====',
        '',
        'Usage: gethep [-s|-d] archive/number (e.g. 0911.1374)',
        '-s  save to file [default]',
        '-d  display in viewer',
        '',
    )))
    sys.exit(1)


class hep:
    """A paper requested from the preprint server.

    Creating an instance sends the HTTP request right away. On any reply
    status other than 200 the error is printed and the script exits with
    status 1.

    Attributes set by the constructor:
      filename -- paper id with '/' replaced by '-', plus the module-level
                  extension
      errcode, errmsg, headers -- the reply as returned by getreply()
      data -- the raw body of the reply (gzipped paper or "invalid id"
              or "wait for PS")
    """

    def __init__(self,paper_id):
        """Request paper_id in the configured format and read the reply."""
        self.filename=paper_id.replace('/','-')+extension
        path='/%s/%s.%s%s' % (format,paper_id,format,res)
        h=httplib.HTTP(server) #connecting
        # sending request for the file
        if debug:
            sys.stderr.write('\nRetreiving http://%s%s\n' % (server,path))
        h.putrequest('GET', path)
        # servers insist on the following header
        h.putheader('User-Agent', 'gethep/1.0 by kkumer[at]phy.hr')
        h.endheaders()
        self.errcode,self.errmsg,self.headers=h.getreply()
        if self.errcode!=200:
            print("Errcode: "+str(self.errcode))
            if debug:
                sys.stderr.write(str(self.headers))
            print("Server error!")
            sys.exit(1)
        # Getting data (gzipped paper or "invalid id" or "wait for PS")
        f=h.getfile()
        try:
            self.data=f.read()
        finally:
            f.close()

    def save(self,filename):
        """Write the downloaded data to filename.

        The file is opened in binary mode: the data is a PDF or a gzipped
        PS file and must reach the disk byte for byte.
        """
        with open(filename, "wb") as fo:
            fo.write(self.data)

    def show(self,filename):
        """Run the viewer program on filename and print what it outputs.

        The file name is shell-quoted because it may contain text taken
        from the command line or from a page sent by the server.
        """
        w=commands.getoutput(disprog+" "+pipes.quote(filename))
        print(w)

class oldstyle(hep):
    """One file of an old-style paper, requested by its path on the server.

    Saving and displaying are inherited from hep; only the download
    differs, because the path is used as given instead of being built
    from a paper id.
    """

    def __init__(self,url):
        """Request url from the server; exit with status 1 unless the reply is 200."""
        # whatever follows the last slash of the url is the file name
        self.filename=url.split('/')[-1]
        conn=httplib.HTTP(server)
        conn.putrequest('GET', url)
        # servers insist on the following header
        conn.putheader('User-Agent', 'gethep/1.0beta by kkumer[at]phy.hr')
        conn.endheaders()
        reply=conn.getreply()
        self.errcode,self.errmsg,self.headers=reply
        if self.errcode!=200:
            print("Errcode: %s" % (self.errcode,))
            print("Server error!")
            sys.exit(1)
        # reading the body (gzipped paper or "invalid id" or "wait for PS")
        body=conn.getfile()
        self.data=body.read()
        body.close()

# We keep requesting the paper until the server hands over the real file
# or reports a problem that asking again cannot fix. The replies are
# recognized by fixed phrases in the returned data.

unfinished = 1
while unfinished:
    paper = hep(id)    # Downloading
    unfinished = 0     # set back to 1 below if the file is not ready yet

    if "Invalid paper" in paper.data:
        print("Paper "+id+" is invalid!")
        sys.exit(1)
    if "ostscript unavailable" in paper.data:
        print("Postscript for paper "+id+" is unavailable!")
        print("This is most probably due to some mistake by authors.")
        print("Maybe this is PDF-only paper.")
        sys.exit(1)
    if "PDF unavailable" in paper.data:
        print("PDF for paper "+id+" is unavailable!")
        print("This is most probably due to some mistake by authors.")
        sys.exit(1)
    if "automatically create" in paper.data:
        # the file is still being generated: wait, then ask again,
        # waiting 10 seconds longer on every further try
        print("waiting "+str(waittime)+" seconds for file to get created")
        time.sleep(waittime)
        unfinished = 1
        waittime = waittime + 10
    # old-style papers with detached figures  (PS only)
    if "following files" in paper.data:
        print("This is the old-style paper with detached figures.")
        print("""I'll try to download everything (PS), and I'll save it
regardless of the -d/-s flags.""")
        oldflag=1
        # Searching the page for quoted URLs of the files. Each match is
        # confined to a single pair of quotes, so several links on one
        # line are found separately, and the group leaves the quotes out.
        allurls=re.findall(r'"(/PS_cache/[^"\s]*ps\.gz)"', paper.data)
        nourls=len(allurls) # number of files
        paper_old=[] # list of files will be here
        # getting files and saving them under their own names
        for url in allurls:
            part=oldstyle(url)
            paper_old.append(part)
            part.save(part.filename)
        print(str(nourls)+" files downloaded.")

# File(s) successfully downloaded. Process them as requested:

if oldflag:
    # The files of an old-style paper are already saved; with -d the
    # main file is displayed as well.
    if sys.argv[1]=="-d":
        for part in paper_old:
            # the main file is the one named <digits>.ps.gz
            if re.match(r'\d*\.ps\.gz', part.filename):
                part.show(part.filename)
else:
    # displaying
    if sys.argv[1]=="-d":
        print("Displaying "+paper.filename)
        tmpfile=tmpdir+"/"+paper.filename
        paper.save(tmpfile)
        try:
            paper.show(tmpfile)
        finally:
            # Removing the temporary copy without going through a shell,
            # also when the viewer call fails. A failed removal is
            # ignored, as it was with the former "rm" command.
            try:
                os.remove(tmpfile)
            except OSError:
                pass
    # saving
    else:
        print("Saving "+paper.filename)
        paper.save(paper.filename)