User:Polbot/source/Reffix.pl

  1. Use like:
  2. perl reffix.pl Catname

use strict;

use Perlwikipedia;

use URI::Escape;

use LWP::UserAgent;

use Encode;

use XML::Simple;

my $Polbot_password = '(bot password)';

my $az_AccessKey = '(Amazon.com access code)';

my $crossref_creds = '(username:password)';

my $soonest_next_op = time;

my $wait_time = 10;

my $ignorenamespaces = 'User|User talk|Talk|Template|Template talk|Portal|Portal talk|Category|Category talk|Portal talk|Wikipedia talk|Image|Image talk|MediaWiki|MediaWiki talk|Template talk|Help|Help talk';

my $editsummary = 'Automated fixes to external links and references. (See the FAQ for details.)';

my $blacklist = '(^Cannot find server|(File|Resource|Article|Page) (was )?not found|(^|\s)Log ?In($|\s)|(^|\s)Sign ?in($|\s))';

  1. date

my ($Second, $Minute, $Hour, $Day, $Month, $Year, $WeekDay, $DayOfYear, $IsDST) = localtime(time);

$Year += 1900;

$Month++;

$Month =~ s/^(\d)$/0$1/;

$Day =~ s/^(\d)$/0$1/;

my $Todays_date = "$Year-$Month-$Day";

my $category = shift;

print "Running Polbot's reffix function, category = $category\n";

print "\nLogging in to Wikipedia.\n" ;

my $pw=Perlwikipedia->new();

$pw->{mech}->agent('Bot/polbot');

my $login_status=$pw->login('Polbot', $Polbot_password);

die "I can't log in." unless ($login_status eq 0);

my $ua = LWP::UserAgent->new;

$ua->agent("Firefox/3.0.1");

$ua->cookie_jar({});

print "Opening category '$category'\n";

my @allpages = $pw->get_pages_in_category("Category:$category");

print "There are " . scalar(@allpages) . " total pages to go through.\n";

foreach my $articlename (@allpages) {

print "Examining $articlename\n";

if ($articlename =~ /^$ignorenamespaces:/i) {

print " Not an article. Skipping.\n";

next;

}

# -----------------------------------------------------------------

# ---------------- First, look at the article and set variables.

my $bNeedsChanging = 0;

my $newart = '';

my $bHasReferencesTag = 0;

my $bHasReflist = 0;

my $bHasRefTag = 0;

my $art = $pw->get_text($articlename);

# Exclusion compliance

if ($art =~ m/\{\{\s*(nobots\s*\}\}|bots\s*\|\s*allow\s*=|bots\s*|\s*deny\s*=\s*all)/si) {

print " {{nobots}}, skipping.\n";

next;

}

# variables

if ($art =~ /<\s*references\s*\/\s*>/is) {

$bHasReferencesTag = 1;

}

if ($art =~ /\{\{\s*(template\s*:\s*)?reflist\s*[\|\}]/is) {

$bHasReflist = 1;

}

if ($art =~ /<\s*ref(\s+name\s*=\s*(?:"[^"]*"|\w+)|)\s*>/si) {

$bHasRefTag = 1;

}

# -----------------------------------------------------------------

# ---------------- Change to {{reflist}}

if ($bHasReferencesTag == 1) {

#$bNeedsChanging = 1;

$art =~ m/(<\s*references\s*\/>)/si;

my $refsect = $1;

$art =~ m/(<(span|div)( class=\"(references-small|small|references-2column))?\">\s*<\s*references\s*\/>\s*<\/\s*(span|div)>)/si;

my $temp2 = $1;

$art =~ m/(<(span|div)( class=\"(references-small|small|references-2column))?\">\s*<(span|div)( class=\"(references-small|small|references-2column))?\">\s*<\s*references\s*\/>\s*<\/\s*(span|div)>\s*<\/\s*(span|div)>)/si;

my $temp3 = $1;

if ($temp3) {

$refsect = $temp3;

} elsif ($temp2) {

$refsect = $temp2;

}

if ($refsect) {

my $newrefsect = $refsect;

if ($refsect =~ m/references-2column/) {

$newrefsect = "{{reflist|2}}";

} elsif ($refsect =~ m/[^-]column-count:[\s]*?(\d*)/) {

$newrefsect = "{{reflist|$1}}";

} elsif ($refsect =~ m/-moz-column-count:[\s]*?(\d*)/) {

$newrefsect = "{{reflist|$1}}";

} else {

$newrefsect = "{{reflist}}";

}

$art =~ s/$refsect/$newrefsect/si;

$bHasReflist = 1;

}

}

# -----------------------------------------------------------------

# ---------------- Fix http://...

while ($art =~ m/\[\[(https?:\/\/[^\]]*)\]\]/si) {

my $badlink = $1;

$bNeedsChanging = 1;

print " Fixing $badlink\n";

$art =~ s/\[\[\Q$badlink\E\]\]/[$badlink]/si;

}

# -----------------------------------------------------------------

# ---------------- Fix ext links to Wikimedia

# en.wikipedia

while ($art =~ m/http:\/\/(?:en\.)?wikipedia\.org\/wiki\/([^\] ]*)/g) {

my $extwikilink = $1;

$bNeedsChanging = 1;

my $intwikilink = $extwikilink;

$intwikilink =~ s/_/ /g;

$intwikilink =~ s/%([0-9A-Fa-f]{2})%([0-9A-Fa-f]{2})/decode("utf8", chr(hex($1)) . chr(hex($2)))/eg;

$intwikilink =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;

$intwikilink =~ s/^(Image|Category):/:$1:/;

print " Fixing ext wikilink $extwikilink to $intwikilink\n";

# non-renamed

$art =~ s/\[http:\/\/(en\.)?wikipedia\.org\/wiki\/\Q$extwikilink\E\]/$intwikilink/g;

# renamed

$art =~ s/\[http:\/\/(?:en\.)?wikipedia\.org\/wiki\/\Q$extwikilink\E ([^\]]*)\]/$1/g;

}

# other.wikipedia

while ($art =~ m/\[http:\/\/([^\.]*).wikipedia.org\/wiki\/([^\] ]*)/s) {

my $extwikilang = $1;

my $extwikilink = $2;

$bNeedsChanging = 1;

my $intwikilink = $extwikilink;

$intwikilink =~ s/_/ /g;

$intwikilink =~ s/%([0-9A-Fa-f]{2})%([0-9A-Fa-f]{2})/decode("utf8", chr(hex($1)) . chr(hex($2)))/eg;

$intwikilink =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;

$intwikilink =~ s/^(Image|Category):/:$1:/;

print " Fixing ext link $extwikilang.$extwikilink to $extwikilang:$intwikilink\n";

# non-renamed

$art =~ s/\[http:\/\/$extwikilang\.wikipedia\.org\/wiki\/\Q$extwikilink\E\]/:$extwikilang:$intwikilink/g;

# renamed

$art =~ s/\[http:\/\/$extwikilang\.wikipedia\.org\/wiki\/\Q$extwikilink\E ([^\]]*)\]/$1/g;

}

# -----------------------------------------------------------------

# ---------------- ref the BELs

# First, QQQ the PDFlink BELs

$art =~ s/(\{\{\s*PDF(?:link)?\s*\|\s*\[)(https?:\/\/[^\]]*\])/$1QQQ$2/gi;

# Next, QQQ the html comments

$newart = $art;

while ($art =~ m/<\!--(.*?)-->/gs) {

my $comment = $1;

my $newcomment = $comment;

if ($newcomment =~ s/\[(https?:\/\/)/\[QQQ$1/g) {

$newart =~ s/\Q$comment\E/$newcomment/;

}

}

$art = $newart;

# Next, QQQ the links already in refs

$newart = $art;

while ($art =~ m/(<\s*ref.*?<\s*\/\s*ref\s*>)/gs) {

my $ref = $1;

my $newref = $ref;

if ($newref =~ s/\[(https?:\/\/)/\[QQQ$1/g) {

$newart =~ s/\Q$ref\E/$newref/;

}

}

$art = $newart;

# Now QQQ the " at [http]" or " in [http]" or " from [http]"

$art =~ s/( at| in| from|At|In|From) \[(https?:\/\/[^\]]*\])/ $1 [QQQ$2/g;

# And lastly, QQQ links that begin a line

$art =~ s/^([\s\*\#\:]*\[)(https?\:\/\/[^\]]*\])/$1QQQ$2/gm;

# Okay! Now ref all non-QQQed BELs above the {{reflist}} template (or whatever)

my $artbefore = $art;

my $artafter = '';

if ($art =~ m/(.*?)(\{\{reflist\}\}|=+\s*Notes?\s*=+|=+\s*References?\s*=+|=+\s*External links?\s*=+|=+\s*Sources?\s*=+|=+\s*Further reading\s*=+|=+\s*See also\s*=+)(.*)/is) {

$artbefore = $1;

$artafter = "$2$3";

}

$newart = $artbefore;

while ($artbefore =~ m/\[(https?:\/\/[^ \]]*)\]/g) {

my $BEL = $1;

$bNeedsChanging = 1;

$bHasRefTag = 1;

$newart =~ s/ *\[\Q$BEL\E\]/QQQ$BEL<\/ref>/g;

}

$art = "$newart$artafter";

# UnQQQ it all

$art =~ s/QQQhttp/http/g;

$art =~ s/ *\(([^<]*<\/ref>)\)/$1/gs;

# -----------------------------------------------------------------

# ---------------- Add {{reflist}} if missing

if ($bHasRefTag - $bHasReflist == 1) {

$bNeedsChanging = 1;

print " but no {{reflist}}\n";

if ($art =~ m/\n=+\s*(references?|notes)\s*=+\s*\n/mi) {

my $putrefin = $1;

$art =~ s/(\n=+\s*($putrefin)\s*=+\n)/$1\{\{reflist\}\}\n/si;

print " Putting reflist after $putrefin section\n";

} else {

$art =~ m/(=+\s*see also\s*=+|=+\s*external links?\s*=+|=+\s*sources?\s*=+|=+\s*further reading\s*=+|\[\[\s*category\s*\:)/si;

my $putrefsbefore = $1;

if ($putrefsbefore) {

$art =~ s/\Q$putrefsbefore\E/==Notes==\n{{reflist}}\n\n$putrefsbefore/si;

print " Putting reflist before $putrefsbefore section\n";

} else {

$art .= "\n{{reflist}}";

print " Putting reflist at end\n";

}

}

}

# ------------------------------------------------------------------------

# ---------------- Known links -> cites or templates

# ---------------- Unkown links -> titles or {{dead link}}

# First, QQQ the PDFlink BELs

$art =~ s/(\{\{\s*PDF(?:link)?\s*\|\s*\[)(https?:\/\/[^\]]*\])/$1QQQ$2/gi;

# Next, QQQ the html comments

$newart = $art;

while ($art =~ m/<\!--(.*?)-->/gs) {

my $comment = $1;

my $newcomment = $comment;

if ($newcomment =~ s/\[(https?:\/\/)/\[QQQ$1/g) {

$newart =~ s/\Q$comment\E/$newcomment/;

}

}

$art = $newart;

# And QQQ the already-dead links

$art =~ s/\b(https?\:\/\/[^\s\]\<\{]*\]? ?\{\{dead link\}\})/QQQ$1/g;

my @BURLs = (); # bare URLs, e.g. http://www.example.com/subdir/example.html

my @BELs = (); # bare external links, e.g. [http://www.example.com/subdir/example.html]

my @NELs = (); # named external links, e.g. [http://www.example.com/subdir/example.html name]

# Those starting a line

push @BURLs, ($art =~ m/^[ \*\#\:]*https?\:\/\/[^\s\]\<]*/mg);

push @BELs, ($art =~ m/^[ \*\#\:]*\[https?\:\/\/[^ \]\<]*\]/mg);

push @NELs, ($art =~ m/^[ \*\#\:]*\[https?\:\/\/[^ \]\<]*(?: [^\]]+)\]/mg);

# Those in tags

push @BURLs, ($art =~ m/]*)?>https?\:\/\/[^\s\]\<]*\s*<\/ref>/sg);

push @BELs, ($art =~ m/]*)?>\[https?\:\/\/[^ \]\<]*\]\s*<\/ref>/sg);

push @NELs, ($art =~ m/]*)?>\[https?\:\/\/[^ \]]*(?: [^\]]+)\]\s*<\/ref>/sg);

# Process these links.

$newart = $art;

print "Processing BURLs and BELs\n";

foreach my $full_link (@BURLs, @BELs) {

my $transformedlink = process_link($full_link, 'bare');

if ($full_link ne $transformedlink) {

$newart =~ s/\Q$full_link\E/$transformedlink/s;

$bNeedsChanging = 1;

}

}

print "Processing " . scalar(@NELs) . " NELs\n";

foreach my $full_link (@NELs) {

my $transformedlink = process_link($full_link, 'named');

if ($full_link ne $transformedlink) {

$newart =~ s/\Q$full_link\E/$transformedlink/s;

$bNeedsChanging = 1;

}

}

$art = $newart;

$art =~ s/QQQhttp/http/g;

# -----------------------------------------------------------------

# ---------- Merging refs: very hard. Skipping for now.

# -----------------------------------------------------------------

# ---------- Minor fixes

if ($bNeedsChanging) {

# Fix punctuation touching ref tags

  1. while ($art =~ s/(.*)()([\.\,\?\!\;\:])/$1$3$2/gs) {};

$newart = $art;

while ($art =~ m/(]*>.*?<\/ref>)(.)/gs) {

my $thisref = $1;

my $thischar = $2;

if ($thischar =~ m/[\.\,\?\!\;\:]/) {

print "Found $thischar after \n";

$newart =~ s/\Q$thisref$thischar\E/$thischar$thisref/gs;

}

}

while ($art =~ m/(]*\/>)(.)/gs) {

my $thisref = $1;

my $thischar = $2;

if ($thischar =~ m/[\.\,\?\!\;\:]/) {

print "Found $thischar after \n";

$newart =~ s/\Q$thisref$thischar\E/$thischar$thisref/gs;

}

}

$art = $newart;

# Miscaptalizations

$art =~ s/==(\s*)See also(\s*)==/==$1See also$2==/i;

$art =~ s/==(\s*)External links?(\s*)==/==$1External links$2==/i;

# units

$art =~ s/(\d) (mph|km|mile|mi|kilometer|mbar|knot|feet|ft|meter|m|metre|kilometre|inch|million|billion|foot|days|kt|millibar|mm|cm|dollar|USD|inHg|hPa|people|hour|liter|degree|°|year|month|square|sq)\b/$1 $2/g;

# HTML

$art =~ s/\<\/?i\>/\'\'/gi;

$art =~ s/\<\/?b\>/\'\'\'/gi;

# Date stuff

# Century

$art =~ s/\[\[(\d{1,2}(?:st|nd|rd|th))[ \-]century\]\]/$1 century/gi;

$art =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))\]\]/$1/gi;

$art =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]century\]\]/$1 century/gi;

$art =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]centuries\]\]/$1 centuries/gi;

$art =~ s/\[\[(\d{1,2}(?:st|nd|rd|th))[ \-]century\s(AD|BC|CE|BCE)\]\]/$1 century $2/gi;

$art =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]century\s(AD|BC|CE|BCE)\]\]/$1 century $2/gi;

$art =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]centuries\s(AD|BC|CE|BCE)\]\]/$1 centuries $2/gi;

# piped decades and years

$art =~ s/\[\[(\d{1,4}\'?s)\]\]/$1/gi;

$art =~ s/\[\[(\d{1,4}s? (?:AD|BC|CE|BCE))\]\]/$1/gi;

$art =~ s/\[\[\d{1,4}s? (?:AD|BC|CE|BCE)\|(\d{1,4})\]\]/$1/gi;

$art =~ s/\[\[\d{1,4}s? (?:AD|BC|CE|BCE)\|(\d{1,4}s? (?:AD|BC|CE|BCE))\]\]/$1/gi;

$art =~ s/\[\[\d{1,4}s?\|(\d{1,4}s? (?:AD|BC|CE|BCE))\]\]/$1/gi;

$art =~ s/\[\[\d{1,4}s?\|(\d{1,2}s?)\]\]/$1/gi;

# months

$art =~ s/\[\[(January|February|March|April|May|June|July|August|September|October|November|December)\]\]/$1/gi;

$art =~ s/\[\[January\|(Jan)\]\]/$1/gi;

$art =~ s/\[\[February\|(Feb)\]\]/$1/gi;

$art =~ s/\[\[March\|(Mar)\]\]/$1/gi;

$art =~ s/\[\[April\|(Apr)\]\]/$1/gi;

$art =~ s/\[\[May\|(May)\]\]/$1/gi;

$art =~ s/\[\[June\|(Jun)\]\]/$1/gi;

$art =~ s/\[\[July\|(Jul)\]\]/$1/gi;

$art =~ s/\[\[August\|(Aug)\]\]/$1/gi;

$art =~ s/\[\[September\|(Sep)\]\]/$1/gi;

$art =~ s/\[\[October\|(Oct)\]\]/$1/gi;

$art =~ s/\[\[November\|(Nov)\]\]/$1/gi;

$art =~ s/\[\[December\|(Dec)\]\]/$1/gi;

# month+year

$art =~ s/\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d{3,4})\]\]/$1/gi;

# Month+day_number "March 7th" -> "March 7"

$art =~ s/\[\[(January|February|March|April|May|June|July|August|September|October|November|December) (\d?\d)(?:th|st|nd|rd)\]\]/\[\[$1 $2\]\]/gi;

$art =~ s/\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](?:th|st|nd|rd)/\[\[$1\]\]/gi;

$art =~ s/\[\[(\d?\d)(?:th|st|nd|rd) (January|February|March|April|May|June|July|August|September|October|November|December)\]\]/\[\[$1 $2\]\]/gi;

# Month+day_number piped into number. Preferences do not work. They don't work in sequence because digits in the two dates must be adjacent

$art =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?\-?\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;

$art =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?&[nm]dash;\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;

$art =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\/)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;

$art =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?\-?\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;

$art =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?&[nm]dash;\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;

$art =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\/)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;

$art =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?\-?\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;

$art =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?&[nm]dash;\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;

$art =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\/)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;

$art =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?\-?\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;

$art =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?&[nm]dash;\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;

$art =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\/)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;

$art =~ s/\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1/gi;

$art =~ s/\[\[\d{1,2} (?:January|February|March|April|May|June|July|August|September|October|November|December)\|(\d{1,2})\]\]/$1/gi;

$art =~ s/\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{1,2})\]\]/$1/gi;

# solitary day_numbers

$art =~ s/\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2}(?:th|st|nd|rd))\]\]/$1/gi;

$art =~ s/\[\[\d{1,2} (?:January|February|March|April|May|June|July|August|September|October|November|December)\|(\d{1,2}(?:th|st|nd|rd))\]\]/$1/gi;

$art =~ s/\[\[(\d{1,2}(?:st|nd|rd|th))\]\]/$1/gi;

# days of the week in full. Optional plurals

$art =~ s/\[\[(Mondays?|Tuesdays?|Wednesdays?|Thursdays?|Fridays?|Saturdays?|Sundays?)\]\]/$1/gi;

# days of the week abbreviated. Leave out 'Sun' as potentially valid link to the Sun. Leave out 'SAT' in upper case as potential link to 'Scholastic achievement/aptitude test'.

$art =~ s/\[\[(Mon|Tue|Tues|Wed|Thu|Thur|Thurs|Fri)\]\]/$1/gi;

$art =~ s/\[\[(Sat)\]\]/$1/g;

$art =~ s/\[\[Mondays?\|(Mondays?)\]\]/$1/gi;

$art =~ s/\[\[Tuesdays?\|(Tuesdays?)\]\]/$1/gi;

$art =~ s/\[\[Wednesdays?\|(Wednesdays?)\]\]/$1/gi;

$art =~ s/\[\[Thursdays?\|(Thursdays?)\]\]/$1/gi;

$art =~ s/\[\[Fridays?\|(Fridays?)\]\]/$1/gi;

$art =~ s/\[\[Saturdays?\|(Saturdays?)\]\]/$1/gi;

$art =~ s/\[\[Sundays?\|(Sundays?)\]\]/$1/gi;

# 4 digit years piped into 2

$art =~ s/\[\[\d{1,4}\|(\d{1,2})\]\]/$1/gi;

# year: examine characters in link on left for date, examine characters in link on right for date

$art =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc \s\-]))/$1$2$3/gi;

# year pair: examine characters in link on left for date, examine characters in link on right for date

$art =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc\s\-]))/$1$2$3$4$5/gi;

# year: examine characters in link on left for date, avoid links on right

$art =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3/gi;

# year pair: examine characters in link on left for date, avoid links on right

$art =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3$4$5/gi;

# year: check for line-ends, text on left, avoid links on right. Run twice to deal better with lists.

$art =~ s/([\w\(\);=:\.\'\*\|\&]\s?,?\-?\s?|\n)\[\[(\d{1,4})\]\]([^\[]{4}|\n)/$1$2$3/gi;

$art =~ s/([\w\(\);=:\.\'\*\|\&]\s?,?\-?\s?|\n)\[\[(\d{1,4})\]\]([^\[]{4}|\n)/$1$2$3/gi;

# year pair: check for line-ends, text on left, avoid links on right

$art =~ s/([\w\(\);=:\.\'\*\|\&]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\]([^\[]{4}|\n)/$1$2$3$4$5/gi;

# year: avoid links on left, examine characters in link on right for date

$art =~ s/([^\]]{4})\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc \s\-]))/$1$2$3/gi;

# year pair: avoid links on left, examine characters in link on right for date

$art =~ s/([^\]]{4})\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc \s\-]))/$1$2$3$4$5/gi;

# year:avoid links on left, text on right

$art =~ s/([^\]]{4})\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.\'\*\|\&])/$1$2$3/gi;

# year pair: avoid links on left, text on right

$art =~ s/([^\]]{4})\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.\'\*\|\&])/$1$2$3$4$5/gi;

# year:text on left, text on right

$art =~ s/([\w\(\);=:\.\'\*\|\&]\s?,?\-?\s?)\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.\'\*\|\&])/$1$2$3/gi;

# year pair: avoid links on left, text on right

$art =~ s/([\w\(\);=:\.\'\*\|\&]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:\.\'\*\|\&])/$1$2$3$4$5/gi;

# year:avoid links on both sides

$art =~ s/([^\]]{4})\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3/gi;

# year pair: avoid links on both sides

$art =~ s/([^\]]{4})\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3$4$5/gi;

# 'present'

$art =~ s/\[\[Present \(time\)\|(Present)\]\]/$1/gi;

# Eliminate 'surprise links' also known as 'easter egg links'

$art =~ s/\[\[\d{1,4}s?\sin\s[^\|]{1,30}\|(\d{1,4}s?)\]\]/$1/gi;

}

# -----------------------------------------------------------------

# ---------- DONE! ---------------------------

if ($bNeedsChanging) {

wiki_write($articlename, $art, $editsummary);

}

}

sub wiki_write {

my $article_name = shift;

my $wiki_out = shift;

my $edit_summary = shift;

$|=1;

print " Waiting " . ($soonest_next_op - time) . " secs... ";

$|=1;

while (time < $soonest_next_op) {};

$soonest_next_op = time + $wait_time;

print "Writing $article_name\n";

$pw->edit($article_name, $wiki_out, $edit_summary);

}

sub citefromlink_GoogleBooks {

my $gb_url = shift;

my $citetemplate = '';

my $gb_lookup_url = $gb_url;

$gb_lookup_url =~ s/(http\:\/\/books\.google\.com\/books\?).*(id\=[^\&]*).*$/$1$2/i;

#http://books.google.com/books?hl=en&lr=&id=thmPzIltAV8C&oi=fnd&pg=PP11&sig=81UGsCDc1DxLV3JAWviltyHD_bY&dq=%22Mordecai+Cooke%22#PPA166,M1

#http://books.google.com/books?id=thmPzIltAV8C

print " Google book link: $gb_lookup_url\n";

my $res = $ua->get($gb_lookup_url);

if ($res->is_success) {

print " success.\n";

my $html = $res->content;

my $bibdiv = '';

my $gb_title = '';

my $gb_author = '';

my $gb_year = '';

my $gb_pub = '';

my $gb_isbn = '';

my $gb_pages = '';

if ($html =~ m/

([^<]+)<\/h2>([^\n]*)\n/s) {

$gb_title = $1;

$bibdiv = $2;

}

if ($html =~ m/

(.*?)\n/s) {

$bibdiv = $1;

}

$bibdiv =~ s/
/ /g;

if ($bibdiv =~ m/By ([^<]*)/) {

$gb_author = $1;

} elsif ($bibdiv =~ m/

By ([^<]*)/) {

$gb_author = $1;

}

if ($bibdiv =~ m/\

Published by ([^\n\<]*?)\, (\d\d\d\d)\<\/div>/) {

$gb_pub = $1;

$gb_year = $2;

} else {

if ($bibdiv =~ m/

Published ([^<]*)/) {

$gb_year = $1;

}

if ($bibdiv =~ m/q=inpublisher[^>]+>([^<]*)/) {

$gb_pub = $1;

}

}

if ($bibdiv =~ m/\>ISBN\s*(?:\:\s*)?(\w+)/) {

$gb_isbn = $1;

}

$citetemplate = "{{cite book\n|title=$gb_title\n|author=$gb_author\n|year=$gb_year\n|publisher=$gb_pub\n|isbn=$gb_isbn\n|url=$gb_url\n}}";

} else { print " failed.\n"; }

return $citetemplate;

}

sub citefromlink_Amazon {

my $az_url = shift;

my $citetemplate = '';

print " Amazon.com link: ";

# First, get the ASIN

my $az_ASIN = '';

if ($az_url =~ m/\/(?:dp|product)\/([^\/]*)/) {

$az_ASIN = $1;

print "$az_ASIN\n";

# Next, plug it into the Amazon API.

my $az_api_url = "http://webservices.amazon.com/onca/xml" .

"?Service=AWSECommerceService" .

"&AWSAccessKeyId=$az_AccessKey" .

"&Operation=ItemLookup" .

"&IdType=ASIN" .

"&ItemId=$az_ASIN" .

"&ResponseGroup=Medium";

my $res = $ua->get($az_api_url);

my $xml = XMLin( $res->decoded_content );

my $az_binding = $xml->{Items}->{Item}->{ItemAttributes}->{Binding};

if ($az_binding =~ m/^(Hardcover|Paperback|Ring-bound|Kindle Edition|School & Library Binding|Unknown Binding)$/) {

# Book

my $az_title = $xml->{Items}->{Item}->{ItemAttributes}->{Title};

my $az_date = $xml->{Items}->{Item}->{ItemAttributes}->{PublicationDate};

my $az_pub = $xml->{Items}->{Item}->{ItemAttributes}->{Publisher};

$az_pub = join(", ", @{ $az_pub }) if (ref( $az_pub ) eq "ARRAY" );

my $az_isbn = $xml->{Items}->{Item}->{ItemAttributes}->{ISBN};

#my $az_pages = $xml->{Items}->{Item}->{ItemAttributes}->{NumberOfPages} . " pages";

my $az_author = $xml->{Items}->{Item}->{ItemAttributes}->{Author};

$az_author = join(", ", @{ $az_author }) if (ref( $az_author ) eq "ARRAY" );

$citetemplate = "{{cite book\n|title=$az_title\n|author=$az_author\n|date=$az_date\n|publisher=$az_pub\n|isbn=$az_isbn\n|url=$az_url\n}}";

} elsif ($az_binding =~ m/^(Audio CD|Audio Cassette|Music Download|Video Game|DVD|Blu-ray|HD DVD|VHS Tape|UMD for PSP)$/) {

# Media

my $az_title = $xml->{Items}->{Item}->{ItemAttributes}->{Title};

my $az_date = $xml->{Items}->{Item}->{ItemAttributes}->{ReleaseDate};

$az_date = $xml->{Items}->{Item}->{ItemAttributes}->{PublicationDate} unless ($az_date);

my $az_pub = $xml->{Items}->{Item}->{ItemAttributes}->{Publisher};

$az_pub = join(", ", @{ $az_pub }) if (ref( $az_pub ) eq "ARRAY" );

my $az_artist = $xml->{Items}->{Item}->{ItemAttributes}->{Artist};

$az_artist = $xml->{Items}->{Item}->{ItemAttributes}->{Author} unless ($az_artist);

$az_artist = join(", ", @{ $az_artist }) if (ref( $az_artist ) eq "ARRAY" );

my $az_isbn = $xml->{Items}->{Item}->{ItemAttributes}->{ISBN};

$citetemplate = "{{cite video\n|title=$az_title\n|people=$az_artist\n|date=$az_date\n|format=$az_binding|publisher=$az_pub\n|isbn=$az_isbn\n|url=$az_url\n|accessdate=$Todays_date\n}}";

}

} else { print " couldn't find ASIN.\n"; }

return $citetemplate;

}

sub citefromlink_TimeMagazine {

my $tm_url = shift;

my $citetemplate = '';

print " Time Magazine link.\n";

my $res = $ua->get($tm_url);

if ($res->is_success) {

my $html = $res->content;

my $tm_title = '';

my $tm_date = '';

my $tm_author = '';

if ($html =~ m/RightslinkPopUp\(\'(.*?)\', \'(.*?)\', \'(.*?)\', \'.*?\'\)\;/) {

$tm_title = $1;

$tm_date = $2;

$tm_author = $3;

$tm_title =~ s/\\\'/'/g;

$tm_author =~ s/\\\'/'/g;

if ($tm_title) {

$citetemplate = "{{cite news\n|author=$tm_author\n|title=$tm_title\n|date=$tm_date\n|work=Time Magazine\n|url=$tm_url\n|accessdate=$Todays_date\n}}";

}

}

}

return $citetemplate;

}

sub citefromlink_NewYorkTimes {

my $nyt_url = shift;

my $citetemplate = '';

print " New York Times link: '$nyt_url'\n";

my $res = $ua->get($nyt_url);

if ($res->is_success) {

my $html = $res->content;

my $nyt_title = '';

my $nyt_date = '';

my $nyt_author = '';

# Title

if ($html =~ m/

$nyt_title = $1;

$nyt_title =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;

$nyt_title =~ s/\n/ /gs;

$nyt_title =~ s/^ +//;

$nyt_title =~ s/ +$//;

} elsif ($html =~ m/function getShareHeadline\(\) \{\s*return encodeURIComponent\(\'(.*?)\'\)\;/s) {

$nyt_title = $1;

$nyt_title =~ s/\n/ /gs;

$nyt_title =~ s/^\s+//;

$nyt_title =~ s/\s+$//;

$nyt_title =~ s/\\\'/'/g;

} elsif ($html =~ m/(.*?)<\/NYT_HEADLINE>/s) {

$nyt_title = $1;

$nyt_title =~ s/\n/ /gs;

$nyt_title =~ s/^\s+//;

$nyt_title =~ s/\s+$//;

} elsif ($html =~ m//s) {

$nyt_title = $1;

$nyt_title =~ s/\n/ /gs;

$nyt_title =~ s/^\s+//;

$nyt_title =~ s/\s+$//;

} elsif ($html =~ m/

(.*?)<\/h3>/s) {

$nyt_title = $1;

$nyt_title =~ s/\n/ /gs;

$nyt_title =~ s/^\s+//;

$nyt_title =~ s/\s+$//;

}

$nyt_title =~ s/\<\/?..?\>//g;

# Author

if ($html =~ m/

$nyt_author = $1;

$nyt_author =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;

$nyt_author =~ s/^ *(BY )?//i;

$nyt_author =~ s/ +$//;

} elsif ($html =~ m/function getShareByline\(\) \{\s*return encodeURIComponent\(\'By (.*?)\'\)\;/s) {

$nyt_author = $1;

$nyt_author =~ s/^\s+//;

$nyt_author =~ s/\s+$//;

$nyt_author =~ s/\\\'/'/g;

} elsif ($html =~ m//s) {

$nyt_author = $1;

$nyt_author =~ s/^\s+//;

$nyt_author =~ s/\s+$//;

}

# Date

if ($html =~ m/

$nyt_date = $1;

$nyt_date =~ s/^ *(\d\d\d\d)(\d\d)(\d\d) *$/$1-$2-$3/;

} elsif ($html =~ m//s) {

$nyt_date = $1;

} elsif ($html =~ m/

Published\: (.*?)<\/div>/s) {

$nyt_date = $1;

} elsif ($html =~ m/function getSharePubdate\(\) \{\s*return encodeURIComponent\(\'(.*?)\'\)\;/s) {

$nyt_date = $1;

$nyt_date =~ s/^\s+//;

$nyt_date =~ s/\s+$//;

}

if ($nyt_title) {

$citetemplate = "{{cite news\n|author=$nyt_author\n|title=$nyt_title\n|date=$nyt_date\n|work=New York Times\n|url=$nyt_url\n|accessdate=$Todays_date\n}}";

} else { print " not readable\n"; }

} else { print " not is success\n"; }

return $citetemplate;

}

sub templatefrom_IMDB {

my $imdb_url = shift;

my $citetemplate = '';

if ($imdb_url =~ m/imdb\.com\/(title|name|company|character)\/(tt|nm|co|ch)(\d+)/) {

my $imdbtype = $1;

my $imdbtypeabbr = $2;

my $imdbnum = $3;

print " IMDB link to $imdbtype $imdbnum\n";

my $res = $ua->get("http://www.imdb.com/$imdbtype/$imdbtypeabbr$imdbnum/");

if ($res->is_success) {

my $html = $res->content;

if ($html =~ m/<\s*title\s*>\s*([^\n<]*)<\s*\/\s*title\s*>/si) {

my $title = $1;

if ($title =~ m/Page\)? not found/i) {

print " not found on imdb:" . $res->status_line . ".\n";

} else {

$title =~ tr/\[\]/()/;

print " changing to {{imdb $imdbtype|$imdbnum|$title}}\n";

$citetemplate = "{{imdb $imdbtype|$imdbnum|$title}}";

}

} else {

print " no title.\n";

}

} else {

print " not found on imdb. " . $res->status_line . "\n";

}

}

return $citetemplate;

}

sub templatefrom_Myspace {

my $ms_username = shift;

my $citetemplate = '';

print " MySpace link: $ms_username\n";

my $res = $ua->get("http://www.myspace.com/$ms_username");

if ($res->is_success) {

print " success.\n";

my $html = $res->content;

if ($html =~ m/Invalid Friend ID/) {

$citetemplate = "{{MySpace|$ms_username|$ms_username (dead link)}}";

} elsif ($html =~ m/(.*?)<\/span>/) {

my $ms_showname = $1;

$citetemplate = "{{MySpace|$ms_username|$ms_showname}}";

}

} else { print " fail: " . $res->status_line . "\n"; }

return $citetemplate;

}

sub templatefrom_PG {

my $pg_id = shift;

my $citetemplate = '';

print " Gutenberg: $pg_id\n";

my $res = $ua->get("http://www.gutenberg.org/etext/$pg_id");

if ($res->is_success) {

print " success.\n";

my $html = $res->content;

if ($html =~ m/

Error<\/h2>/) {

$citetemplate = "http://www.gutenberg.org/etext/$pg_id {{dead link}}";

} elsif ($html =~ m/

.*?

([^<]*)/s) {

my $pg_title = $1;

$citetemplate = "{{gutenberg|no=$pg_id|name=$pg_title}}";

} else {print " no title\n";}

} else { print " fail: " . $res->status_line . "\n"; }

return $citetemplate;

}

sub templatefrom_YouTube {

my $yt_id = shift;

my $citetemplate = '';

print " Youtube link: $yt_id\n";

my $res = $ua->get("http://www.youtube.com/watch?v=$yt_id");

if ($res->is_success) {

print " success.\n";

my $html = $res->content;

if ($html =~ m/The URL contained a malformed video ID/) {

$citetemplate = "http://www.youtube.com/watch?v=$yt_id {{dead link}}";

} elsif ($html =~ m//) {

my $yt_title = $1;

$citetemplate = "{{YouTube|$yt_id|$yt_title}}";

}

} else { print " fail: " . $res->status_line . "\n"; }

return $citetemplate;

}

sub templatefrom_CongBio {

my $cb_id = shift;

my $citetemplate = '';

print " CongBio link: $cb_id\n";

my $res = $ua->get("http://bioguide.congress.gov/scripts/biodisplay.pl?index=$cb_id");

if ($res->is_success) {

my $html = $res->content;

if ($html =~ m/File\: $cb_id does not exist\./) {

$citetemplate = "http://bioguide.congress.gov/scripts/biodisplay.pl?index=$cb_id {{dead link}}";

} elsif ($html =~ m/([^<]*)

my $cb_name = $1;

$cb_name =~ s/(.*?)[\,\s\.\;\:]+$/$1/;

$citetemplate = "{{CongBio|$cb_id|name=$cb_name|inline=1}}";

} else {

$citetemplate = "{{CongBio|$cb_id|inline=1}}";

}

}

return $citetemplate;

}

sub citefromlink_USNews {

my $usn_url = shift;

my $citetemplate = '';

print " US News and World Report\n";

my $res = $ua->get($usn_url);

if ($res->is_success) {

my $html = $res->content;

my $usn_title = '';

my $usn_date = '';

my $usn_author = '';

if ($html =~ m/

\s*(.*?)\s*<\/h1>\s*

\s*(.*?).*?<\/h2>/s) {

$usn_title = "$1: $2";

} elsif ($html =~ m/

\s*(.*?)\s*<\/h1>/s) {

$usn_title = $1;

}

if ($html =~ m/

By\s*(?:)?\s*(.*?)<\//s) {

$usn_author = $1;

}

if ($html =~ m/

Posted (.*?)<\/div>/s) {

$usn_date = $1;

}

if ($usn_title) {

$citetemplate = "{{cite news\n|author=$usn_author\n|title=$usn_title\n|date=$usn_date\n|work=US News and World Report\n|url=$usn_url\n|accessdate=$Todays_date\n}}";

}

}

return $citetemplate;

}

sub citefromlink_Forbes {

my $fo_url = shift;

my $citetemplate = '';

print " Forbes link\n";

my $res = $ua->get($fo_url);

if ($res->is_success) {

my $html = $res->content;

my $fo_title = '';

my $fo_date = '';

my $fo_author = '';

if ($html =~ m/\s*(.*?)\s*<\/span>/s) {

$fo_title = $1;

$fo_title =~ s/<\/?b>//gi;

}

if ($html =~ m/\s*(.*?)\s*<\/?span>/s) {

$fo_author = $1;

} elsif ($html =~ m/.*?<\/span>
(.*?)\s*

$fo_author = $1;

}

if ($html =~ m/\s*(\d\d)\.(\d\d)\.(\d\d)/) {

my $temp_month = $1;

my $temp_day = $2;

my $temp_year = $3;

$fo_date = "20$temp_year-$temp_month-$temp_day";

}

if ($fo_title) {

$citetemplate = "{{cite news\n|author=$fo_author\n|title=$fo_title\n|date=$fo_date\n|work=Forbes Magazine\n|url=$fo_url\n|accessdate=$Todays_date\n}}";

}

}

return $citetemplate;

}

sub citefromlink_BBC {

my $bbc_url = shift;

my $citetemplate = '';

print " BBC news link: $bbc_url\n";

my $res = $ua->get($bbc_url);

if ($res->is_success) {

print " success.\n";

my $html = $res->content;

my $bbc_title = '';

my $bbc_date = '';

if ($html =~ m//si) {

$bbc_title = $1;

$bbc_title =~ s/^\s+//;

$bbc_title =~ s/\s+$//;

if ($html =~ m/

my $temp_year = $1;

my $temp_month = $2;

my $temp_day = $3;

$bbc_date = "$temp_year-$temp_month-$temp_day";

}

$citetemplate = "{{cite news\n|author=\n|title=$bbc_title\n|date=$bbc_date\n|work=BBC News\n|url=$bbc_url\n|accessdate=$Todays_date\n}}";

}

}

print " done.\n";

return $citetemplate;

}

sub process_link {

my $full_link = shift;

my $link_type = shift;

$full_link =~ m/(https?\:\/\/[^\s\]\<]*)/s;

my $urlonly = $1;

my $citetemplate = '';

if ($urlonly =~ m/http\:\/\/books\.google\.com\/books/) { # Google Books

$citetemplate = citefromlink_GoogleBooks($urlonly);

} elsif ($urlonly =~ m/http\:\/\/.*amazon\.com\//) { # Amazon.com

$citetemplate = citefromlink_Amazon($urlonly);

} elsif ($urlonly =~ m/http\:\/\/www\.time\.com\//) { # Time Magazine

$citetemplate = citefromlink_TimeMagazine($urlonly);

} elsif ($urlonly =~ m/https?\:\/\/.*?nytimes\.com\//) { # New York Times

$citetemplate = citefromlink_NewYorkTimes($urlonly);

} elsif ($urlonly =~ m/http:\/\/.*\.usnews\.com\//) { # US News and World Report

$citetemplate = citefromlink_USNews($urlonly);

} elsif ($urlonly =~ m/http:\/\/.*\.forbes\.com\//) { # Forbes

$citetemplate = citefromlink_Forbes($urlonly);

} elsif ($urlonly =~ m/http:\/\/news\.bbc\.co\.uk\//) { # BBC News

$citetemplate = citefromlink_BBC($urlonly);

} elsif ($urlonly =~ m/http:\/\/www\.imdb\.com\//) { # IMDB

$citetemplate = templatefrom_IMDB($urlonly);

} elsif ($urlonly =~ m/http:\/\/www\.myspace\.com\/([^\s\< \]]*)/) { # MySpace

my $ms_title = $1;

$citetemplate = templatefrom_Myspace($ms_title);

} elsif ($urlonly =~ m/http:\/\/www\.youtube\.com\/watch\?v\=([^\s\< \]]*)/) { # YouTube

my $yt_id = $1;

$citetemplate = templatefrom_YouTube($yt_id);

} elsif ($urlonly =~ m/http:\/\/bioguide\.congress\.gov\/scripts\/biodisplay.pl\?index\=([^\s\< \]]*)/) { # Congbio

my $cb_id = $1;

$citetemplate = templatefrom_CongBio($cb_id);

} elsif ($urlonly =~ m/http:\/\/www\.gutenberg\.org\/(?:etext|ebooks|files)\/(\d+)/) { # Project Gutenberg

my $pg_id = $1;

$citetemplate = templatefrom_PG($pg_id);

} else { # check for DOI, and add title if none already

$citetemplate = check_DOI($urlonly, $link_type);

}

if ($citetemplate) {

if ($full_link =~ s/\[\Q$urlonly\E[^\]]*\]/$citetemplate/s) {

# do nothing

} else {

$full_link =~ s/\Q$urlonly\E/$citetemplate/s;

}

}

return $full_link;

}

sub check_DOI {

my $url = shift;

my $linktype = shift;

my $citetemplate = '';

return $citetemplate unless $linktype eq 'bare';

print " Looking up $url\n";

my $res = $ua->get("$url");

unless ($res->content_type eq 'text/html') {

print " not html. Skipping.\n";

} else {

# It's html.

unless ($res->is_success) {

print " no connection (probably 404). Skipping.\n";

} else {

# It's connected.

my $html = $res->content;

# Here's where I should check for a DOI, and only check for a title if $linktype eq 'bare'

if ($html =~ m/(10\.\d{4}(\/|\%2F)([^\s\"\?\&\>]|\&l?g?t\;|\<[^\s\"\?\&]*\>)*)(?=[\s\"\?]|\<\/)/) {

# It's got a DOI! Eureka.

my $DOI = $1;

# strip trailing flotsam

$DOI =~ s/(\<\/?\w+\/?\>|[\:\;\)\.\'\,\-\#])+$//;

$DOI =~ s/\<.*//;

# Now run the DOI through crossref.org:

my $crossref_url = "http://www.crossref.org/openurl/?pid=$crossref_creds&id=doi:$DOI&noredirect=true";

my $res2 = $ua->get($crossref_url);

my $xml = XMLin( $res2->decoded_content );

my $j_article_title = $xml->{query_result}->{body}->{query}->{article_title};

if ($j_article_title) {

my $j_journal_title = $xml->{query_result}->{body}->{query}->{journal_title};

my $j_volume = $xml->{query_result}->{body}->{query}->{volume};

my $j_issue = $xml->{query_result}->{body}->{query}->{issue};

my $j_pages = $xml->{query_result}->{body}->{query}->{first_page};

my $j_year = $xml->{query_result}->{body}->{query}->{year};

my $j_format = $xml->{query_result}->{body}->{query}->{publication_type};

$j_format =~ tr/_/ /;

my $j_last_name = $xml->{query_result}->{body}->{query}->{contributors}->{contributor}->{given_name};

my $j_first_name = $xml->{query_result}->{body}->{query}->{contributors}->{contributor}->{surname};

$citetemplate = "{{cite journal\n"

. "| last = $j_last_name\n"

. "| first = $j_first_name\n"

. "| year = $j_year\n"

. "| title = $j_article_title\n"

. "| journal = $j_journal_title\n"

. "| volume = $j_volume\n"

. "| issue = $j_issue\n"

. "| pages = $j_pages\n"

. "| doi = $DOI\n"

. "| format = $j_format\n"

. "}}";

}

}

unless ($citetemplate) {

# DOI checking.

if ($linktype eq 'bare') {

# Look for a title

print " Looking for a title.\n";

if ($html =~ m/<\s*title\s*>\s*([^\n<]*)\s*<\s*\/\s*title\s*>/si) {

my $title = $1;

$title =~ tr/[]{}/()()/;

$title =~ s/\s/ /g;

while ($title =~ s/ / /g) {};

$title =~ s/ $//;

$title =~ s/^ //;

$title =~ s/]*>.*?<\/script>|]*>.*?<\/style>||//gi;

if (length($title) > 175) {

$title =~ s/(.{175}).*/$1.../;

}

$title =~ s/(.*)/\u$1/;

if ($title !~ m/$blacklist/i) {

# Title not blacklisted

print " Title: $title\n";

my $baseurl = $url;

$baseurl =~ s/.*https?:\/\/([^\/\s\<]*).*/$1/;

$baseurl =~ s/.*\.(blogspot\.com|livejournal\.com|blogger\.com)/$1/;

$citetemplate = "[QQQ$url $title] at $baseurl";

} else { print " black-listed title. Skipping.\n"; }

} else { print " no title. Skipping.\n"; }

}

}

}

}

return $citetemplate;

}