#!/usr/bin/env python
# parse EGU journal html and return bibliographic information
# (c) Jan Cermak, 2009, jan.cermak@env.ethz.ch
# in part following agu.py and http://www.boddie.org.uk/
"""Parse an EGU journal article page and print its bibliographic data.

One article URL is read from standard input.  The page is fetched, its
RIS and BibTeX export links are followed, and the result is written to
standard output as a ``begin_ris``/``end_ris`` block, a
``begin_tsv``/``end_tsv`` block and a final ``status`` line.
"""

import re
import sys

try:  # Python 2, which this script was originally written for
    import sgmllib
    from urllib2 import urlopen
    from urlparse import urljoin, urlparse
except ImportError:  # Python 3: sgmllib is gone, the urllib modules moved
    sgmllib = None
    from html.parser import HTMLParser
    from urllib.parse import urljoin, urlparse
    from urllib.request import urlopen


_WHITESPACE = re.compile(r'\s+')

# Text that introduces the abstract on the article page.
_ABSTRACT_LABEL = 'Abstract.'
# Offset from the start of the label to the abstract text
# (length of the label plus the tag that follows it).
_ABSTRACT_LABEL_SPAN = 17
# NOTE(review): the end marker is an empty string in the source as received,
# which makes the extracted abstract always empty.  Presumably an HTML
# closing tag was intended here -- confirm against a live page and fill in.
_ABSTRACT_END_MARKER = ''


class ParseException(Exception):
    """Raised when the page or its export files lack required information."""


if sgmllib is not None:
    class _LinkParserBase(sgmllib.SGMLParser):
        """Python 2 base: SGMLParser dispatches tags to ``start_<tag>``."""

        def __init__(self, verbose=0):
            sgmllib.SGMLParser.__init__(self, verbose)
else:
    class _LinkParserBase(HTMLParser):
        """Python 3 base: emulate the ``start_<tag>`` dispatch of sgmllib."""

        def __init__(self, verbose=0):
            # HTMLParser has no verbosity switch; the argument is accepted
            # only so both bases share one constructor signature.
            HTMLParser.__init__(self)

        def handle_starttag(self, tag, attrs):
            handler = getattr(self, 'start_' + tag, None)
            if handler is not None:
                handler(attrs)


class MyParser(_LinkParserBase):
    "A simple parser class as given on http://www.boddie.org.uk/python/HTML.html"

    def __init__(self, verbose=0):
        "Initialise an object, passing 'verbose' to the superclass."
        _LinkParserBase.__init__(self, verbose)
        self.hyperlinks = []

    def parse(self, s):
        "Parse the given string 's'."
        self.feed(s)
        self.close()

    def start_a(self, attributes):
        "Process a hyperlink and its 'attributes'."
        for name, value in attributes:
            # html.parser reports a value-less attribute as None; skip it so
            # callers can rely on every collected link being a string.
            if name == "href" and value is not None:
                self.hyperlinks.append(value)

    def get_hyperlinks(self):
        "Return the list of hyperlinks."
        return self.hyperlinks


def bibTexString(string):
    """Put {} around the first character of every title-cased word.

    Words are split on whitespace and tested with ``str.istitle``; the
    result is joined with single spaces and carries no leading or trailing
    whitespace.
    """
    words = []
    for word in string.split():
        if word.istitle():
            word = '{' + word[0] + '}' + word[1:]
        words.append(word)
    return ' '.join(words)


def _fetch(url):
    """Return the body of 'url' as text, always closing the connection."""
    response = urlopen(url)
    try:
        data = response.read()
    finally:
        response.close()
    if not isinstance(data, str):
        # Python 3 hands back bytes; Python 2 already gave us a str.
        charset = response.headers.get_content_charset() or 'utf-8'
        data = data.decode(charset, 'replace')
    return data


def _journalKey(host):
    """Return 'host' without a leading 'www.' and a trailing '.net'."""
    # Real prefix/suffix removal: str.lstrip/rstrip take character *sets*
    # and would also eat leading 'w's or trailing 'n', 'e', 't' characters
    # that belong to the journal name.
    if host.startswith('www.'):
        host = host[len('www.'):]
    if host.endswith('.net'):
        host = host[:-len('.net')]
    return host


def _extractAbstract(page):
    """Return the whitespace-normalised abstract text found in 'page'.

    An empty string is returned when the abstract label or the end marker
    cannot be located.
    """
    labelPos = page.rfind(_ABSTRACT_LABEL)
    if labelPos == -1:
        return ''
    start = labelPos + _ABSTRACT_LABEL_SPAN
    end = page.find(_ABSTRACT_END_MARKER, start)
    if end == -1:
        return ''
    return _WHITESPACE.sub(' ', page[start:end])


def _findExportLinks(links):
    """Return the last '.ris' and last '.bib' link in 'links' (or None)."""
    risLink = None
    btLink = None
    for link in links:
        if '.ris' in link:
            risLink = link
        elif '.bib' in link:
            btLink = link
    return risLink, btLink


def _intOrNone(text):
    """Return int(text), or None when 'text' is not a number."""
    try:
        return int(text)
    except ValueError:
        return None


def _parseRis(ris):
    """Split RIS text into pass-through lines and the fields used later.

    Returns a dict with the keys 'lines' (RIS lines to echo unchanged),
    'authors' ("First Last" strings), 'title', 'volume', 'startPage',
    'year', 'month' and 'day'.  Scalar fields are None when absent.
    """
    record = {'lines': [], 'authors': [], 'title': None, 'volume': None,
              'startPage': None, 'year': None, 'month': None, 'day': None}
    for line in ris.splitlines():
        tag = line[:2]
        value = line.partition('-')[2].strip()
        if tag == 'T1':
            # bibtexify the title; it is emitted in the tsv block instead
            record['title'] = bibTexString(value)
        elif tag == 'JO':
            # journal title is taken from the unabbreviated BibTeX version
            continue
        elif tag == 'A1':
            # RIS gives "Last, First"; turn it into "First Last"
            lastNames, _sep, firstNames = value.partition(',')
            author = firstNames.strip() + ' ' + lastNames.strip()
            record['authors'].append(author.strip())
        else:
            record['lines'].append(line)
            token = value.split()[0] if value else None
            if token is None:
                continue
            if tag == 'SP':
                record['startPage'] = token
            elif tag == 'VL':
                record['volume'] = token
            elif tag == 'Y1':
                # date is YYYY/MM/DD; month and day may be left empty
                dateParts = (token.split('/') + ['', ''])[:3]
                record['year'] = dateParts[0] or None
                record['month'] = _intOrNone(dateParts[1])
                record['day'] = _intOrNone(dateParts[2])
    return record


def _journalNames(bibTex):
    """Return the value of every JOURNAL field found in BibTeX text."""
    names = []
    for line in bibTex.splitlines():
        key, sep, raw = line.partition('=')
        # Require JOURNAL in the field name so that a title merely
        # mentioning the word is not reported as a journal.
        if sep and 'JOURNAL' in key:
            names.append(raw.lstrip(' {').rstrip('}, '))
    return names


def handle(url):
    """Fetch the article at 'url' and print its bibliographic record.

    The URL itself is not validated (the pattern check in the original
    code was disabled).  Raises ParseException when the page offers no
    RIS/BibTeX export links or the RIS data lacks the title, volume,
    start page or year that the output needs.
    """
    page = _fetch(url)
    journalKey = _journalKey(urlparse(url)[1])
    abstract = _extractAbstract(page)

    # find the ris and bibtex export links in the page
    myparser = MyParser()
    myparser.parse(page)
    risLink, btLink = _findExportLinks(myparser.get_hyperlinks())
    if risLink is None or btLink is None:
        raise ParseException("No RIS/BibTeX export links found on %s" % url)

    # urljoin copes with absolute, root-relative and relative links alike
    ris = _fetch(urljoin(url, risLink))
    bibTex = _fetch(urljoin(url, btLink))

    record = _parseRis(ris)
    missing = [name for name in ('title', 'volume', 'startPage', 'year')
               if record[name] is None]
    if missing:
        raise ParseException(
            "RIS data for %s lacks: %s" % (url, ', '.join(missing)))

    print('begin_ris')
    for line in record['lines']:
        print(line)
    print('end_ris')

    # now write the rest in tsv code
    print("begin_tsv")
    for author in record['authors']:
        print("author\t%s" % author)
    for journalName in _journalNames(bibTex):
        print("journal\t%s" % journalName)
    # linkout keys: volume, journal, start page, year
    # (year would be an integer key, but only two integer keys are allowed)
    print("linkout\tEGU\t%s\t%s\t%s\t%s" % (
        record['volume'], journalKey, record['startPage'], record['year']))
    print("title\t%s" % record['title'])
    print("abstract\t%s" % abstract)
    if record['day'] is not None:
        print("day\t%s" % record['day'])
    if record['month'] is not None:
        print("month\t%s" % record['month'])
    print("year\t%s" % record['year'])
    print("type\tJOUR")
    print("end_tsv")
    print("status\tok")


def main():
    """Read one URL from standard input and process it."""
    # example: http://www.atmos-chem-phys.net/9/1847/2009/acp-9-1847-2009.html
    url = sys.stdin.readline().strip()
    handle(url)


if __name__ == '__main__':
    main()

# The RIS export contains more information than the BibTeX one (the pdf url
# is given in addition), which is why RIS is passed through and BibTeX is
# only used for the journal name.
#
# Sample of the expected tsv output, taken from another plugin:
#
#$ echo 'http://www.jstor.org/view/00376752/ap010113/01a00130/0' | ./jstor.tcl
#begin_tsv
#linkout JSTOR 0037-6752%28198424%291%3A28%3A4%3C533%3AANOMEG%3E2.0.CO%3B2-3
#title A Note on May Eve, Good Friday, and the Full Moon in Bulgakov's The Master and Margarita
#author Donald M. Fiene
#journal The Slavic and East European Journal
#volume 28
#issue 4
#year 1984
#start_page 533
#end_page 537
#type JOUR
#end_tsv
#status ok