#
# Abstract
#
# NOTE(review): the sample markup that used to live in this comment lost its
# HTML tags somewhere upstream.  The shape below is reconstructed from the
# lookups scrape_abstract() performs; the paragraph text is the only part of
# the original sample that survived.
#
#   div  class="articleText"
#     h3 class="h3"    Abstract
#     p                An instrumented indentation technique...
#
def scrape_abstract(page):
    """Scrape the abstract text out of an article page.

    The page is parsed with html5lib into a BeautifulSoup tree.  The first
    ``div.articleText`` whose ``h3.h3`` heading reads "abstract"
    (case-insensitive) is used: every text node of the ``p`` siblings that
    follow that heading is collected, joined with single spaces, runs of
    newlines are collapsed to one space, and the result is passed through
    ``unescape()``.

    Args:
        page: the HTML of the article page, as returned by ``urlopen().read()``.

    Returns:
        Whatever ``unescape()`` returns for the collected text (the collected
        text is the empty string when no abstract heading is found).

    Raises:
        IndexError: if a matching ``h3`` has no children.  ``handle()``
            treats any failure here as "no abstract".
    """
    fragments = []
    parser = html5lib.HTMLParser(
        tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(page)
    # soup = BeautifulSoup.BeautifulSoup(page)
    for div in soup.findAll('div', attrs={'class': 'articleText'}):
        h3 = div.find('h3', {'class': 'h3'})
        if not h3:
            continue
        val = h3.contents[0]
        # The original tested `string.lower(val) in ('abstract')`.  Without a
        # trailing comma that is a *substring* test against the string
        # 'abstract', so headings such as '' or 'abs' matched as well.
        if val.lower() != 'abstract':
            continue
        for p in h3.findNextSiblings('p'):
            fragments.extend(p.findAll(text=True))
        # NOTE(review): the source was collapsed onto one line, so the
        # indentation of this `break` is reconstructed.  It is assumed to stop
        # the search once the abstract heading has been handled - confirm
        # against version control.
        break
    abstract = re.sub('\n+', ' ', ' '.join(fragments))
    return unescape(abstract)


#
# Just try to fetch the metadata from crossref
#
def handle(url):
    """Yield the plugin's output lines for ``url``.

    The lines are, in order: ``begin_crossref``, the CrossRef XML,
    ``end_crossref``, ``begin_tsv``, an optional ``abstract<TAB>text`` line,
    ``end_tsv`` and ``status<TAB>ok``.  The caller encodes each yielded line
    as UTF-8 before printing it.

    Args:
        url: the article URL; it is passed through ``canon_url()`` before
            being fetched.

    Yields:
        One output line at a time, without a trailing newline.
    """
    page = urlopen(canon_url(url)).read()

    # FIXME(review): SOURCE DAMAGED HERE.  Everything between the two
    # fragments of the statement below was stripped from the file, apparently
    # by an HTML-tag remover, so the statement no longer parses:
    #
    #     m = re.search(r' <<< text lost >>> .*)", xml_page, re.S)
    #
    # `xml_page` is not assigned anywhere in the surviving code, so the lost
    # text must have included the code that fetched it.  Until those lines
    # are restored from version control, `yield xml_page` below raises
    # NameError.  Nothing has been invented to fill the gap.
    #
    # These lines were already commented out in the original:
    # if not m:
    #     raise ParseException, "Unable to extract metadata - malformed XML"
    # xml_page = m.group(1)

    yield "begin_crossref"
    yield xml_page
    yield "end_crossref"

    yield "begin_tsv"
    try:
        abstract = scrape_abstract(page)
    except Exception:
        # The abstract is best-effort: any scraping failure just means no
        # abstract line is emitted.  `except Exception` (rather than a bare
        # `except:`) keeps KeyboardInterrupt/SystemExit from being swallowed.
        abstract = ''
    if abstract:
        # Yielded rather than printed from inside the generator, so this line
        # takes the same UTF-8 encoding path as every other output line.
        # NOTE(review): assumes unescape() returns text that can be
        # .encode("utf-8")-ed like the other yielded lines - confirm.
        yield "abstract\t%s" % abstract
    yield "end_tsv"
    yield "status\tok"


if __name__ == "__main__":
    # Install a global urllib2 opener that keeps cookies and identifies the
    # client as CiteULike; "--debug" on the command line adds HTTP tracing.
    cookie_jar = cookielib.CookieJar()

    handlers = []
    if "--debug" in sys.argv:
        handlers.append(urllib2.HTTPHandler(debuglevel=True))
    handlers.append(urllib2.HTTPCookieProcessor(cookie_jar))

    opener = urllib2.build_opener(*handlers)
    opener.addheaders = [
        ("User-Agent", "CiteULike/1.0 +http://www.citeulike.org/"),
    ]
    urllib2.install_opener(opener)
    # urllib2.ProxyHandler({"http":"http://quimby.smithersbet.com:3128"})

    # The URL to process arrives as the first line of stdin.
    url = sys.stdin.readline().strip()

    try:
        for line in handle(url):
            # A single parenthesised expression prints identically under the
            # Python 2 print statement.
            print(line.encode("utf-8"))
    except Exception:
        # Report the line number recorded in the traceback as an "error
        # code", then re-raise so the failure is still visible to the caller.
        # The tb_lineno attribute is what the deprecated
        # traceback.tb_lineno() returned.
        lineno = sys.exc_info()[2].tb_lineno
        print("\t".join(["status", "error", "There was an internal error processing this request. Please report this to bugs@citeulike.org quoting error code %d." % lineno]))
        raise