#!/usr/bin/env python
# coding: utf-8
#
# Scraper for JSTAGE (Japanese Science and Technology Information Aggregator, Electronic)
# http://www.jstage.jst.go.jp/browse/-char/ja
# by Osamu Masutani
#
# Reads one article URL from standard input, fetches the article page and its
# RIS export, and prints the RIS record followed by tab-separated metadata
# (abstract, JSTAGE linkout, DOI linkout) and a final status line.
#

import codecs
import re
import sys
import urllib

try:
    import urllib2
except ImportError:
    # Python 3 has no urllib2; urllib.request provides the same urlopen() call.
    import urllib.request as urllib2

# URL parts
JSTAGE_URL = 'http://www.jstage.jst.go.jp/'
DOWNLOAD_URL = JSTAGE_URL + 'download/'
ARTICLE_URL = JSTAGE_URL + 'article/'

# Appending '/-char/ja/' to a suffix requests the Japanese variant of a page
# (used below for the RIS export).
ARTICLE_SUFFIX = '/_article'
RIS_SUFFIX = '/_ris'
BIBTEX_SUFFIX = '/_bib'

# The URL parts are literal text, so they are escaped before being embedded
# in the pattern; otherwise every '.' in the host name matches any character.
JSTAGE_ID_RE = re.compile(
    re.escape(ARTICLE_URL) + '(.+)' + re.escape(ARTICLE_SUFFIX),
    re.IGNORECASE)

# NOTE(review): this pattern looks as if it has lost its HTML tag anchors in
# this copy of the file (only whitespace and line breaks remain around the
# capture group) -- confirm against the upstream scraper before relying on
# the extracted abstract.
ABSTRACT_RE = re.compile(
    '(?: *?)\n(?:.*?)(?: )*(?:\n)*' + '(.*?)' + '\n',
    re.IGNORECASE)

# The dot after "10" is escaped so that only a real DOI prefix matches.
DOI_RE = re.compile(r'doi:(10\.(\d{4})/[^<]+)')

# RIS tags are normally followed by two spaces before the hyphen; accept one
# or more so the note is removed whichever spacing the export uses.
RIS_DOI_NOTE_RE = re.compile("\nN1 +- doi: .*\n")


def fetch(url):
    """Download *url* and return its body with surrounding whitespace removed.

    The response is always closed, even when reading fails.
    """
    response = urllib2.urlopen(url)
    try:
        data = response.read()
    finally:
        response.close()
    if not isinstance(data, str):
        # Python 3 only: read() returns bytes, but the patterns above are text.
        # NOTE(review): UTF-8 is an assumption -- confirm the page encoding.
        data = data.decode('utf-8', 'replace')
    return data.strip()


def extract_jstage_id(url):
    """Return the article ID found between the article URL prefix and the
    '/_article' suffix of *url*, or None when *url* has no such ID."""
    match = JSTAGE_ID_RE.search(url)
    return match.group(1) if match else None


def extract_abstract(article_data):
    """Return the abstract captured from the article page, or None."""
    match = ABSTRACT_RE.search(article_data)
    return match.group(1) if match else None


def extract_doi(article_data):
    """Return the DOI that follows 'doi:' in the article page, or None."""
    match = DOI_RE.search(article_data)
    return match.group(1) if match else None


def strip_doi_note(ris_data):
    """Return *ris_data* without its 'N1 - doi: ...' note line."""
    return RIS_DOI_NOTE_RE.sub("\n", ris_data)


def fetch_ris(jstage_id):
    """Download the RIS export for *jstage_id* and drop its DOI note line.

    When the default export contains the marker "_in_Japanese", the
    Japanese-language export is downloaded and used instead.
    """
    ris_url = DOWNLOAD_URL + jstage_id + RIS_SUFFIX
    ris_data = fetch(ris_url)
    if re.search("_in_Japanese", ris_data, re.IGNORECASE):
        ris_data = fetch(ris_url + "/-char/ja/")
    return strip_doi_note(ris_data)


def main():
    """Read an article URL from stdin and print the scraped record.

    Exits with status 1 when no article ID can be found in the URL.
    """
    # read url from std input
    url = sys.stdin.readline().strip()

    # get article ID
    jstage_id = extract_jstage_id(url)
    if not jstage_id:
        print("Could not find id in URL (" + url + ")")
        sys.exit(1)

    # read article abstract page (rebuilt from the ID, so any extra suffix on
    # the input URL is dropped)
    article_data = fetch(ARTICLE_URL + jstage_id + ARTICLE_SUFFIX)

    # A missing abstract or DOI is not an error; the field is just omitted.
    # Keyword extraction is not implemented.
    abst = extract_abstract(article_data)
    doi = extract_doi(article_data)

    # fetch the citation export page
    ris_data = fetch_ris(jstage_id)

    # print the results
    print("begin_ris")
    print(ris_data)
    print("end_ris")
    print("begin_tsv")
    if abst:
        # The text is printed exactly as fetched: calling .encode() on an
        # undecoded Python 2 byte string triggers an implicit ASCII decode
        # that fails on any non-ASCII character.
        print("abstract\t%s" % abst)
    print("linkout\tJSTAGE\t\t%s\t\t" % jstage_id)
    if doi:
        print("linkout\tDOI\t\t%s\t\t" % doi)
    print("end_tsv")
    print("status\tok")


if __name__ == "__main__":
    main()