#!/usr/bin/env perl
use LWP::Simple;
use HTML::TreeBuilder;
use Encode;
#
# Copyright (c) 2005 Richard Cameron, CiteULike.org
# All rights reserved.
#
# This code is derived from software contributed to CiteULike.org
# by
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# 3. All advertising materials mentioning features or use of this software
# must display the following acknowledgement:
# This product includes software developed by
# CiteULike and its
# contributors.
# 4. Neither the name of CiteULike nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY CITEULIKE.ORG AND CONTRIBUTORS
# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
binmode STDOUT, ":utf8";
# Scrape the RIS file from the Nature.com site
$url = <>;
$page = get $url;
my $tree = HTML::TreeBuilder->new();
$tree->parse($page);
my $head = ($tree->look_down('_tag','head'))[0];
my @meta = $head->look_down('_tag','meta');
print "begin_tsv\n";
my $doi = 0;
foreach $m (@meta) {
my $name = $m->attr("name");
my $content = $m->attr("content");
#print "$name = $content\n";
$name =~ /dc.identifier/ and do {
$content =~ s/doi://;
# sometimes see the DOI prefix twice!
$content =~ s/(10\.\d\d\d\d)\/(10\.\d\d\d\d)/$1/;
$doi = $content;
};
$name =~ /dc.date/ and do {
$content =~ /(\d\d\d\d)(?:(?:-)(\d\d)(?:(?:-)(\d\d))?)?/;
print "year\t$1\n" if $1;
print "month\t$2\n" if $2;
print "day\t$3\n" if $3;
};
# prism.issn = ERROR! NO ISSN
$name =~ /prism.issn/ and do {
print "issn\t$content\n" if $content =~ /\d+/;
};
$name =~ /dc.title/ and do {
print "title\t$content\n" if $content;
};
$name =~ /prism.startingPage/ and do {
print "start_page\t$content\n" if $content;
};
$name =~ /prism.endingPage/ and do {
print "end_page\t$content\n" if $content;
};
$name =~ /prism.volume / and do {
print "volume\t$content\n" if $content;
};
$name =~ /prism.number / and do {
print "issue\t$content\n" if $content;
};
$name =~ /dc.publisher/ and do {
print "publisher\t$content\n" if $content;
};
$name =~ /prism.publicationName/ and do {
print "journal\t$content\n" if $content;
};
# abstract | abstract | AB
# address | address | AD
# chapter | chapter |
# date_other | |
# edition | edition |
# how_published | howpublished |
# institution | institution |
# isbn | isbn | SN
# issn | issn | SN
# issue | number | IS
# journal | journal | JO
# month | |
# organization | organization |
# school | school |
# title_secondary | booktitle | BT
# title_series | series | T3
# type | |
}
my $abstract = $tree->look_down( '_tag', 'span',
sub { $_[0]->attr("class") eq 'articletext' }
);
if ($abstract) {
$abstract = $abstract->as_text;
} else {
@abstract = $tree->look_down( '_tag', 'p',
sub { $_[0]->attr("class") eq 'lead' }
);
if (@abstract) {
$abstract = join (" ", map {$_->as_text} @abstract );
}
}
if ($abstract) {
print "abstract\t$abstract\n";
} else {
print "abstract\t\n";
}
# We can get the bog-standard Nature linkout from just looking at the URL
if ($url =~ m{www.nature.com/cgi.*file=/([^/]+)/journal/v([^/]+)/n([^/]+)/([^/]+)/([^/]+)(_[^._]+)?.(html|pdf|ris)}) {
# Old style
($journal,$vol,$num,$view_type,$article)=($1,$2,$3,$4,$5);
} elsif ($url =~ m{www.nature.com/(n\w+)/journal/v([^/]+)/n([^/]+)/[^/]+/([^/_]+)\.(html|pdf|ris)}) {
# Fix to get Nature photonics/genetics to parse
($journal,$vol,$num,$article)=($1,$2,$3,$4);
} elsif ($url =~ m{www.nature.com/([^/]+)/journal/v([^/]+)/n([^/]+s?)/[^/]+/([^/]+)(_[^._]+)?\.(html|pdf|ris)}) {
#http://www.nature.com/ni/journal/vaop/ncurrent/abs/ni.1771.html
($journal,$vol,$num,$article)=($1,$2,$3,$4);
} else {
print "status\terr\tThis page does not appear to be a Nature article\n";
exit;
}
#if ($num !~ m{s} && $num =~ /^\d+$/ && $vol =~ /^\d+$/) {
print "linkout\tNATUR\t$vol\t$article\t$num\t$journal\n";
#}
# http://www.nature.com/ng/journal/v38/n6s/abs/ng1798.html
#print "http://www.nature.com/${journal}/journal/v${vol}/n${num}/ris/${article}.ris\n" ;
# Grab the RIS file
$ris = get "http://www.nature.com/${journal}/journal/v${vol}/n${num}/ris/${article}.ris" || (print "status\terr\tCouldn't fetch the citation details from the Nature web site.\n" and exit);
# Not sure why they give us this crap in the "date_other" field. Kill it.
print "date_other\t\n";
if ($doi) {
print "linkout\tDOI\t\t$doi\t\t\n";
} elsif ($ris =~ m{UR - http://dx.doi.org/(.*)}) {
# We can extract a DOI if we're cunning, so that'll give us another linkout
print "linkout\tDOI\t\t$1\t\t\n";
}
print "end_tsv\n";
print "begin_ris\n";
# Nature serve up a few lines of extra crap at the start of their record
# Strip all that off.
if ($ris =~ m{(TY -.*ER -)}s) {
print $1."\n";
}
print "end_ris\n";
if ($ris =~ m{ER - }) {
print "status\tok\n";
} else {
print "status\terr\tCouldn't extract the details from Nature's 'export citation' link.\n";
}