User:Polbot/source/Reffix.pl



Use like:
perl reffix.pl Catname

use strict;
use Perlwikipedia;
use URI::Escape;
use LWP::UserAgent;
use Encode;
use XML::Simple;
my $Polbot_password = '(bot password)';
my $az_AccessKey = '(Amazon.com access code)';
my $crossref_creds = '(username:password)';
my $soonest_next_op = time;
my $wait_time = 10;
my $ignorenamespaces = 'User|User talk|Talk|Template|Template talk|Portal|Portal talk|Category|Category talk|Portal talk|Wikipedia talk|Image|Image talk|MediaWiki|MediaWiki talk|Template talk|Help|Help talk';
my $editsummary = 'Automated fixes to external links and references. (See the FAQ for details.)';
my $blacklist = '(^Cannot find server|(File|Resource|Article|Page) (was )?not found|(^|\s)Log ?In($|\s)|(^|\s)Sign ?in($|\s))';

date

my ($Second, $Minute, $Hour, $Day, $Month, $Year, $WeekDay, $DayOfYear, $IsDST) = localtime(time);
$Year += 1900;
$Month++;
$Month =~ s/^(\d)$/0$1/;
$Day =~ s/^(\d)$/0$1/;
my $Todays_date = "$Year-$Month-$Day";
my $category = shift;
print "Running Polbot's reffix function, category = $category\n";
print "\nLogging in to Wikipedia.\n" ;
my $pw=Perlwikipedia->new();
$pw->{mech}->agent('Bot/polbot');
my $login_status=$pw->login('Polbot', $Polbot_password);
die "I can't log in." unless ($login_status eq 0);
my $ua = LWP::UserAgent->new;
$ua->agent("Firefox/3.0.1");
$ua->cookie_jar({});
print "Opening category '$category'\n";
my @allpages = $pw->get_pages_in_category("Category:$category");
print "There are " . scalar(@allpages) . " total pages to go through.\n";
foreach my $articlename (@allpages) {
print "Examining $articlename\n";
if ($articlename =~ /^$ignorenamespaces:/i) {
print " Not an article. Skipping.\n";
next;
}
# -----------------------------------------------------------------
# ---------------- First, look at the article and set variables.
my $bNeedsChanging = 0;
my $newart = '';
my $bHasReferencesTag = 0;
my $bHasReflist = 0;
my $bHasRefTag = 0;
my $art = $pw->get_text($articlename);
# Exclusion compliance
if ($art =~ m/\{\{\s*(nobots\s*\}\}|bots\s*\|\s*allow\s*=|bots\s*|\s*deny\s*=\s*all)/si) {
print "  {{nobots}}, skipping.\n";
next;
}
# variables
if ($art =~ /<\s*references\s*\/\s*>/is) {
$bHasReferencesTag = 1;
}
if ($art =~ /\{\{\s*(template\s*:\s*)?reflist\s*[\|\}]/is) {
$bHasReflist = 1;
}
if ($art =~ /<\s*ref(\s+name\s*=\s*(?:"[^"]*"|\w+)|)\s*>/si) {
$bHasRefTag = 1;
}
# -----------------------------------------------------------------
# ---------------- Change  to {{reflist}}
if ($bHasReferencesTag == 1) {
#$bNeedsChanging = 1;
$art =~ m/(<\s*references\s*\/>)/si;
my $refsect = $1;
$art =~ m/(<(span|div)( class=\"(references-small|small|references-2column))?\">\s*<\s*references\s*\/>\s*<\/\s*(span|div)>)/si;
my $temp2 = $1;
$art =~ m/(<(span|div)( class=\"(references-small|small|references-2column))?\">\s*<(span|div)( class=\"(references-small|small|references-2column))?\">\s*<\s*references\s*\/>\s*<\/\s*(span|div)>\s*<\/\s*(span|div)>)/si;
my $temp3 = $1;
if ($temp3) {
$refsect = $temp3;
} elsif ($temp2) {
$refsect = $temp2;
}
if ($refsect) {
my $newrefsect = $refsect;
if ($refsect =~ m/references-2column/) {
$newrefsect = "{{reflist|2}}";
} elsif ($refsect =~ m/[^-]column-count:[\s]*?(\d*)/) {
$newrefsect = "{{reflist|$1}}";
} elsif ($refsect =~ m/-moz-column-count:[\s]*?(\d*)/) {
$newrefsect = "{{reflist|$1}}";
} else {
$newrefsect = "{{reflist}}";
}
$art =~ s/$refsect/$newrefsect/si;
$bHasReflist = 1;
}
}
# -----------------------------------------------------------------
# ---------------- Fix http://...
while ($art =~ m/\[\[(https?:\/\/[^\]]*)\]\]/si) {
my $badlink = $1;
$bNeedsChanging = 1;
print " Fixing $badlink\n";
$art =~ s/\[\[\Q$badlink\E\]\]/[$badlink]/si;
}
# -----------------------------------------------------------------
# ---------------- Fix ext links to Wikimedia
# en.wikipedia
while ($art =~ m/http:\/\/(?:en\.)?wikipedia\.org\/wiki\/([^\] ]*)/g) {
my $extwikilink = $1;
$bNeedsChanging = 1;
my $intwikilink = $extwikilink;
$intwikilink =~ s/_/ /g;
$intwikilink =~ s/%([0-9A-Fa-f]{2})%([0-9A-Fa-f]{2})/decode("utf8", chr(hex($1)) . chr(hex($2)))/eg;
$intwikilink =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
$intwikilink =~ s/^(Image|Category):/:$1:/;
print " Fixing ext wikilink $extwikilink to $intwikilink\n";
# non-renamed
$art =~ s/\[http:\/\/(en\.)?wikipedia\.org\/wiki\/\Q$extwikilink\E\]/$intwikilink/g;
# renamed
$art =~ s/\[http:\/\/(?:en\.)?wikipedia\.org\/wiki\/\Q$extwikilink\E ([^\]]*)\]/$1/g;
}
# other.wikipedia
while ($art =~ m/\[http:\/\/([^\.]*).wikipedia.org\/wiki\/([^\] ]*)/s) {
my $extwikilang = $1;
my $extwikilink = $2;
$bNeedsChanging = 1;
my $intwikilink = $extwikilink;
$intwikilink =~ s/_/ /g;
$intwikilink =~ s/%([0-9A-Fa-f]{2})%([0-9A-Fa-f]{2})/decode("utf8", chr(hex($1)) . chr(hex($2)))/eg;
$intwikilink =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
$intwikilink =~ s/^(Image|Category):/:$1:/;
print " Fixing ext link $extwikilang.$extwikilink to $extwikilang:$intwikilink\n";
# non-renamed
$art =~ s/\[http:\/\/$extwikilang\.wikipedia\.org\/wiki\/\Q$extwikilink\E\]/:$extwikilang:$intwikilink/g;
# renamed
$art =~ s/\[http:\/\/$extwikilang\.wikipedia\.org\/wiki\/\Q$extwikilink\E ([^\]]*)\]/$1/g;
}
# -----------------------------------------------------------------
# ---------------- ref the BELs
# First, QQQ the PDFlink BELs
$art =~ s/(\{\{\s*PDF(?:link)?\s*\|\s*\[)(https?:\/\/[^\]]*\])/$1QQQ$2/gi;
# Next, QQQ the html comments
$newart = $art;
while ($art =~ m/<\!--(.*?)-->/gs) {
my $comment = $1;
my $newcomment = $comment;
if ($newcomment =~ s/\[(https?:\/\/)/\[QQQ$1/g) {
$newart =~ s/\Q$comment\E/$newcomment/;
}
}
$art = $newart;
# Next, QQQ the links already in refs
$newart = $art;
while ($art =~ m/(<\s*ref.*?<\s*\/\s*ref\s*>)/gs) {
my $ref = $1;
my $newref = $ref;
if ($newref =~ s/\[(https?:\/\/)/\[QQQ$1/g) {
$newart =~ s/\Q$ref\E/$newref/;
}
}
$art = $newart;
# Now QQQ the " at [http]" or " in [http]" or " from [http]"
$art =~ s/( at| in| from|At|In|From) \[(https?:\/\/[^\]]*\])/ $1 [QQQ$2/g;
# And lastly, QQQ links that begin a line
$art =~ s/^([\s\*\#\:]*\[)(https?\:\/\/[^\]]*\])/$1QQQ$2/gm;
# Okay! Now ref all non-QQQed BELs above the {{reflist}} template (or whatever)
my $artbefore = $art;
my $artafter = '';
if ($art =~ m/(.*?)(\{\{reflist\}\}|=+\s*Notes?\s*=+|=+\s*References?\s*=+|=+\s*External links?\s*=+|=+\s*Sources?\s*=+|=+\s*Further reading\s*=+|=+\s*See also\s*=+)(.*)/is) {
$artbefore = $1;
$artafter = "$2$3";
}
$newart = $artbefore;
while ($artbefore =~ m/\[(https?:\/\/[^ \]]*)\]/g) {
my $BEL = $1;
$bNeedsChanging = 1;
$bHasRefTag = 1;
$newart =~ s/ *\[\Q$BEL\E\]/QQQ$BEL<\/ref>/g;
}
$art = "$newart$artafter";
# UnQQQ it all
$art =~ s/QQQhttp/http/g;
$art =~ s/ *\(([^<]*<\/ref>)\)/$1/gs;
# -----------------------------------------------------------------
# ---------------- Add {{reflist}} if missing
if ($bHasRefTag - $bHasReflist == 1) {
$bNeedsChanging = 1;
print "  but no {{reflist}}\n";
if ($art =~ m/\n=+\s*(references?|notes)\s*=+\s*\n/mi) {
my $putrefin = $1;
$art =~ s/(\n=+\s*($putrefin)\s*=+\n)/$1\{\{reflist\}\}\n/si;
print "  Putting reflist after $putrefin section\n";
} else {
$art =~ m/(=+\s*see also\s*=+|=+\s*external links?\s*=+|=+\s*sources?\s*=+|=+\s*further reading\s*=+|\[\[\s*category\s*\:)/si;
my $putrefsbefore = $1;
if ($putrefsbefore) {
$art =~ s/\Q$putrefsbefore\E/==Notes==\n{{reflist}}\n\n$putrefsbefore/si;
print "  Putting reflist before $putrefsbefore section\n";
} else {
$art .= "\n{{reflist}}";
print "  Putting reflist at end\n";
}
}
}
# ------------------------------------------------------------------------
# ---------------- Known links -> cites or templates
# ---------------- Unkown links -> titles or {{dead link}}
# First, QQQ the PDFlink BELs
$art =~ s/(\{\{\s*PDF(?:link)?\s*\|\s*\[)(https?:\/\/[^\]]*\])/$1QQQ$2/gi;
# Next, QQQ the html comments
$newart = $art;
while ($art =~ m/<\!--(.*?)-->/gs) {
my $comment = $1;
my $newcomment = $comment;
if ($newcomment =~ s/\[(https?:\/\/)/\[QQQ$1/g) {
$newart =~ s/\Q$comment\E/$newcomment/;
}
}
$art = $newart;
# And QQQ the already-dead links
$art =~ s/\b(https?\:\/\/[^\s\]\<\{]*\]? ?\{\{dead link\}\})/QQQ$1/g;
my @BURLs = ();  # bare URLs, e.g. http://www.example.com/subdir/example.html
my @BELs = ();   # bare external links, e.g. [http://www.example.com/subdir/example.html]
my @NELs = ();   # named external links, e.g. [http://www.example.com/subdir/example.html name]
# Those starting a line
push @BURLs, ($art =~ m/^[ \*\#\:]*https?\:\/\/[^\s\]\<]*/mg);
push @BELs, ($art =~ m/^[ \*\#\:]*\[https?\:\/\/[^ \]\<]*\]/mg);
push @NELs, ($art =~ m/^[ \*\#\:]*\[https?\:\/\/[^ \]\<]*(?: [^\]]+)\]/mg);
# Those in  tags
push @BURLs, ($art =~ m/]*)?>https?\:\/\/[^\s\]\<]*\s*<\/ref>/sg);
push @BELs, ($art =~ m/]*)?>\[https?\:\/\/[^ \]\<]*\]\s*<\/ref>/sg);
push @NELs, ($art =~ m/]*)?>\[https?\:\/\/[^ \]]*(?: [^\]]+)\]\s*<\/ref>/sg);
# Process these links.
$newart = $art;
print "Processing BURLs and BELs\n";
foreach my $full_link (@BURLs, @BELs) {
my $transformedlink = process_link($full_link, 'bare');
if ($full_link ne $transformedlink) {
$newart =~ s/\Q$full_link\E/$transformedlink/s;
$bNeedsChanging = 1;
}
}
print "Processing " . scalar(@NELs) . " NELs\n";
foreach my $full_link (@NELs) {
my $transformedlink = process_link($full_link, 'named');
if ($full_link ne $transformedlink) {
$newart =~ s/\Q$full_link\E/$transformedlink/s;
$bNeedsChanging = 1;
}
}
$art = $newart;
$art =~ s/QQQhttp/http/g;
# -----------------------------------------------------------------
# ---------- Merging refs: very hard. Skipping for now.
# -----------------------------------------------------------------
# ---------- Minor fixes
if ($bNeedsChanging) {
# Fix punctuation touching ref tags

while ($art =~ s/(.*)()([\.\,\?\!\;\:])/$1$3$2/gs) {};

$newart = $art;
while ($art =~ m/(]*>.*?<\/ref>)(.)/gs) {
my $thisref = $1;
my $thischar = $2;
if ($thischar =~ m/[\.\,\?\!\;\:]/) {
print "Found $thischar after \n";
$newart =~ s/\Q$thisref$thischar\E/$thischar$thisref/gs;
}
}
while ($art =~ m/(]*\/>)(.)/gs) {
my $thisref = $1;
my $thischar = $2;
if ($thischar =~ m/[\.\,\?\!\;\:]/) {
print "Found $thischar after \n";
$newart =~ s/\Q$thisref$thischar\E/$thischar$thisref/gs;
}
}
$art = $newart;
# Miscaptalizations
$art =~ s/==(\s*)See also(\s*)==/==$1See also$2==/i;
$art =~ s/==(\s*)External links?(\s*)==/==$1External links$2==/i;
# units
$art =~ s/(\d) (mph|km|mile|mi|kilometer|mbar|knot|feet|ft|meter|m|metre|kilometre|inch|million|billion|foot|days|kt|millibar|mm|cm|dollar|USD|inHg|hPa|people|hour|liter|degree|°|year|month|square|sq)\b/$1 $2/g;
# HTML
$art =~ s/\<\/?i\>/\'\'/gi;
$art =~ s/\<\/?b\>/\'\'\'/gi;
# Date stuff
#  Century
$art =~ s/\[\[(\d{1,2}(?:st|nd|rd|th))[ \-]century\]\]/$1 century/gi;
$art =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))\]\]/$1/gi;
$art =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]century\]\]/$1 century/gi;
$art =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]centuries\]\]/$1 centuries/gi;
$art =~ s/\[\[(\d{1,2}(?:st|nd|rd|th))[ \-]century\s(AD|BC|CE|BCE)\]\]/$1 century $2/gi;
$art =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]century\s(AD|BC|CE|BCE)\]\]/$1 century $2/gi;
$art =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]centuries\s(AD|BC|CE|BCE)\]\]/$1 centuries $2/gi;
#  piped decades and years
$art =~ s/\[\[(\d{1,4}\'?s)\]\]/$1/gi;
$art =~ s/\[\[(\d{1,4}s? (?:AD|BC|CE|BCE))\]\]/$1/gi;
$art =~ s/\[\[\d{1,4}s? (?:AD|BC|CE|BCE)\|(\d{1,4})\]\]/$1/gi;
$art =~ s/\[\[\d{1,4}s? (?:AD|BC|CE|BCE)\|(\d{1,4}s? (?:AD|BC|CE|BCE))\]\]/$1/gi;
$art =~ s/\[\[\d{1,4}s?\|(\d{1,4}s? (?:AD|BC|CE|BCE))\]\]/$1/gi;
$art =~ s/\[\[\d{1,4}s?\|(\d{1,2}s?)\]\]/$1/gi;
#  months
$art =~ s/\[\[(January|February|March|April|May|June|July|August|September|October|November|December)\]\]/$1/gi;
$art =~ s/\[\[January\|(Jan)\]\]/$1/gi;
$art =~ s/\[\[February\|(Feb)\]\]/$1/gi;
$art =~ s/\[\[March\|(Mar)\]\]/$1/gi;
$art =~ s/\[\[April\|(Apr)\]\]/$1/gi;
$art =~ s/\[\[May\|(May)\]\]/$1/gi;
$art =~ s/\[\[June\|(Jun)\]\]/$1/gi;
$art =~ s/\[\[July\|(Jul)\]\]/$1/gi;
$art =~ s/\[\[August\|(Aug)\]\]/$1/gi;
$art =~ s/\[\[September\|(Sep)\]\]/$1/gi;
$art =~ s/\[\[October\|(Oct)\]\]/$1/gi;
$art =~ s/\[\[November\|(Nov)\]\]/$1/gi;
$art =~ s/\[\[December\|(Dec)\]\]/$1/gi;
#  month+year
$art =~ s/\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d{3,4})\]\]/$1/gi;
#  Month+day_number "March 7th" -> "March 7"
$art =~ s/\[\[(January|February|March|April|May|June|July|August|September|October|November|December) (\d?\d)(?:th|st|nd|rd)\]\]/\[\[$1 $2\]\]/gi;
$art =~ s/\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](?:th|st|nd|rd)/\[\[$1\]\]/gi;
$art =~ s/\[\[(\d?\d)(?:th|st|nd|rd) (January|February|March|April|May|June|July|August|September|October|November|December)\]\]/\[\[$1 $2\]\]/gi;
#  Month+day_number piped into number. Preferences do not work. They don't work in sequence because digits in the two dates must be adjacent
$art =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?\-?\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;
$art =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?&[nm]dash;\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;
$art =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\/)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;
$art =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?\-?\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;
$art =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?&[nm]dash;\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;
$art =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\/)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;
$art =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?\-?\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;
$art =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?&[nm]dash;\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;
$art =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\/)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;
$art =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?\-?\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;
$art =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?&[nm]dash;\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;
$art =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\/)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;
$art =~ s/\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1/gi;
$art =~ s/\[\[\d{1,2} (?:January|February|March|April|May|June|July|August|September|October|November|December)\|(\d{1,2})\]\]/$1/gi;
$art =~ s/\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{1,2})\]\]/$1/gi;
#  solitary day_numbers
$art =~ s/\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2}(?:th|st|nd|rd))\]\]/$1/gi;
$art =~ s/\[\[\d{1,2} (?:January|February|March|April|May|June|July|August|September|October|November|December)\|(\d{1,2}(?:th|st|nd|rd))\]\]/$1/gi;
$art =~ s/\[\[(\d{1,2}(?:st|nd|rd|th))\]\]/$1/gi;
#  days of the week in full. Optional plurals
$art =~ s/\[\[(Mondays?|Tuesdays?|Wednesdays?|Thursdays?|Fridays?|Saturdays?|Sundays?)\]\]/$1/gi;
#  days of the week abbreviated. Leave out 'Sun' as potentially valid link to the Sun. Leave out 'SAT' in upper case as potential link to 'Scholastic achievement/aptitude test'.
$art =~ s/\[\[(Mon|Tue|Tues|Wed|Thu|Thur|Thurs|Fri)\]\]/$1/gi;
$art =~ s/\[\[(Sat)\]\]/$1/g;
$art =~ s/\[\[Mondays?\|(Mondays?)\]\]/$1/gi;
$art =~ s/\[\[Tuesdays?\|(Tuesdays?)\]\]/$1/gi;
$art =~ s/\[\[Wednesdays?\|(Wednesdays?)\]\]/$1/gi;
$art =~ s/\[\[Thursdays?\|(Thursdays?)\]\]/$1/gi;
$art =~ s/\[\[Fridays?\|(Fridays?)\]\]/$1/gi;
$art =~ s/\[\[Saturdays?\|(Saturdays?)\]\]/$1/gi;
$art =~ s/\[\[Sundays?\|(Sundays?)\]\]/$1/gi;
#  4 digit years piped into 2
$art =~ s/\[\[\d{1,4}\|(\d{1,2})\]\]/$1/gi;
#  year: examine characters in link on left for date, examine characters in link on right for date
$art =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc \s\-]))/$1$2$3/gi;
#  year pair: examine characters in link on left for date, examine characters in link on right for date
$art =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc\s\-]))/$1$2$3$4$5/gi;
#  year: examine characters in link on left for date, avoid links on right
$art =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3/gi;
#  year pair: examine characters in link on left for date, avoid links on right
$art =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3$4$5/gi;
#  year: check for line-ends, text on left, avoid links on right. Run twice to deal better with lists.
$art =~ s/([\w\(\);=:\.\'\*\|\&]\s?,?\-?\s?|\n)\[\[(\d{1,4})\]\]([^\[]{4}|\n)/$1$2$3/gi;
$art =~ s/([\w\(\);=:\.\'\*\|\&]\s?,?\-?\s?|\n)\[\[(\d{1,4})\]\]([^\[]{4}|\n)/$1$2$3/gi;
#  year pair: check for line-ends, text on left, avoid links on right
$art =~ s/([\w\(\);=:\.\'\*\|\&]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\]([^\[]{4}|\n)/$1$2$3$4$5/gi;
#  year: avoid links on left, examine characters in link on right for date
$art =~ s/([^\]]{4})\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc \s\-]))/$1$2$3/gi;
#  year pair: avoid links on left, examine characters in link on right for date
$art =~ s/([^\]]{4})\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc \s\-]))/$1$2$3$4$5/gi;
#  year:avoid links on left, text on right
$art =~ s/([^\]]{4})\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.\'\*\|\&])/$1$2$3/gi;
#  year pair: avoid links on left, text on right
$art =~ s/([^\]]{4})\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.\'\*\|\&])/$1$2$3$4$5/gi;
#  year:text on left, text on right
$art =~ s/([\w\(\);=:\.\'\*\|\&]\s?,?\-?\s?)\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.\'\*\|\&])/$1$2$3/gi;
#  year pair: avoid links on left, text on right
$art =~ s/([\w\(\);=:\.\'\*\|\&]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:\.\'\*\|\&])/$1$2$3$4$5/gi;
#  year:avoid links on both sides
$art =~ s/([^\]]{4})\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3/gi;
#  year pair: avoid links on both sides
$art =~ s/([^\]]{4})\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3$4$5/gi;
#  'present'
$art =~ s/\[\[Present \(time\)\|(Present)\]\]/$1/gi;
#  Eliminate 'surprise links' also known as 'easter egg links'
$art =~ s/\[\[\d{1,4}s?\sin\s[^\|]{1,30}\|(\d{1,4}s?)\]\]/$1/gi;
}
# -----------------------------------------------------------------
# ---------- DONE! ---------------------------
if ($bNeedsChanging) {
wiki_write($articlename, $art, $editsummary);
}
}
sub wiki_write {
my $article_name = shift;
my $wiki_out = shift;
my $edit_summary = shift;
$|=1;
print " Waiting " . ($soonest_next_op - time) . " secs... ";
$|=1;
while (time < $soonest_next_op) {};
$soonest_next_op = time + $wait_time;
print "Writing $article_name\n";
$pw->edit($article_name, $wiki_out, $edit_summary);
}
sub citefromlink_GoogleBooks {
my $gb_url = shift;
my $citetemplate = '';
my $gb_lookup_url = $gb_url;
$gb_lookup_url =~ s/(http\:\/\/books\.google\.com\/books\?).*(id\=[^\&]*).*$/$1$2/i;
#http://books.google.com/books?hl=en&lr=&id=thmPzIltAV8C&oi=fnd&pg=PP11&sig=81UGsCDc1DxLV3JAWviltyHD_bY&dq=%22Mordecai+Cooke%22#PPA166,M1
#http://books.google.com/books?id=thmPzIltAV8C
print " Google book link: $gb_lookup_url\n";
my $res = $ua->get($gb_lookup_url);
if ($res->is_success) {
print "  success.\n";
my $html = $res->content;
my $bibdiv = '';
my $gb_title = '';
my $gb_author = '';
my $gb_year = '';
my $gb_pub = '';
my $gb_isbn = '';
my $gb_pages = '';
if ($html =~ m/
([^<]+)<\/h2>([^\n]*)\n/s) {
$gb_title = $1;
$bibdiv = $2;
}
if ($html =~ m/
(.*?)\n/s) {
$bibdiv = $1;
}
$bibdiv =~ s/
/ /g;
if ($bibdiv =~ m/By ([^<]*)/) {
$gb_author = $1;
} elsif ($bibdiv =~ m/






By ([^<]*)/) {
$gb_author = $1;
}
if ($bibdiv =~ m/\
Published by ([^\n\<]*?)\, (\d\d\d\d)\<\/div>/) {
$gb_pub = $1;
$gb_year = $2;
} else {
if ($bibdiv =~ m/
Published ([^<]*)/) {
$gb_year = $1;
}
if ($bibdiv =~ m/q=inpublisher[^>]+>([^<]*)/) {
$gb_pub = $1;
}
}
if ($bibdiv =~ m/\>ISBN\s*(?:\:\s*)?(\w+)/) {
$gb_isbn = $1;
}
$citetemplate = "{{cite book\n|title=$gb_title\n|author=$gb_author\n|year=$gb_year\n|publisher=$gb_pub\n|isbn=$gb_isbn\n|url=$gb_url\n}}";
} else { print "  failed.\n"; }
return $citetemplate;
}
sub citefromlink_Amazon {
my $az_url = shift;
my $citetemplate = '';
print " Amazon.com link: ";
# First, get the ASIN
my $az_ASIN = '';
if ($az_url =~ m/\/(?:dp|product)\/([^\/]*)/) {
$az_ASIN = $1;
print "$az_ASIN\n";
# Next, plug it into the Amazon API.
my $az_api_url = "http://webservices.amazon.com/onca/xml" .
"?Service=AWSECommerceService" .
"&AWSAccessKeyId=$az_AccessKey" .
"&Operation=ItemLookup" .
"&IdType=ASIN" .
"&ItemId=$az_ASIN" .
"&ResponseGroup=Medium";
my $res = $ua->get($az_api_url);
my $xml = XMLin( $res->decoded_content );
my $az_binding = $xml->{Items}->{Item}->{ItemAttributes}->{Binding};
if ($az_binding =~ m/^(Hardcover|Paperback|Ring-bound|Kindle Edition|School & Library Binding|Unknown Binding)$/) {
# Book
my $az_title = $xml->{Items}->{Item}->{ItemAttributes}->{Title};
my $az_date = $xml->{Items}->{Item}->{ItemAttributes}->{PublicationDate};
my $az_pub = $xml->{Items}->{Item}->{ItemAttributes}->{Publisher};
$az_pub = join(", ", @{ $az_pub }) if (ref( $az_pub ) eq "ARRAY" );
my $az_isbn = $xml->{Items}->{Item}->{ItemAttributes}->{ISBN};
#my $az_pages = $xml->{Items}->{Item}->{ItemAttributes}->{NumberOfPages} . " pages";
my $az_author = $xml->{Items}->{Item}->{ItemAttributes}->{Author};
$az_author = join(", ", @{ $az_author }) if (ref( $az_author ) eq "ARRAY" );
$citetemplate = "{{cite book\n|title=$az_title\n|author=$az_author\n|date=$az_date\n|publisher=$az_pub\n|isbn=$az_isbn\n|url=$az_url\n}}";
} elsif ($az_binding =~ m/^(Audio CD|Audio Cassette|Music Download|Video Game|DVD|Blu-ray|HD DVD|VHS Tape|UMD for PSP)$/) {
# Media
my $az_title = $xml->{Items}->{Item}->{ItemAttributes}->{Title};
my $az_date = $xml->{Items}->{Item}->{ItemAttributes}->{ReleaseDate};
$az_date = $xml->{Items}->{Item}->{ItemAttributes}->{PublicationDate} unless ($az_date);
my $az_pub = $xml->{Items}->{Item}->{ItemAttributes}->{Publisher};
$az_pub = join(", ", @{ $az_pub }) if (ref( $az_pub ) eq "ARRAY" );
my $az_artist = $xml->{Items}->{Item}->{ItemAttributes}->{Artist};
$az_artist = $xml->{Items}->{Item}->{ItemAttributes}->{Author} unless ($az_artist);
$az_artist = join(", ", @{ $az_artist }) if (ref( $az_artist ) eq "ARRAY" );
my $az_isbn = $xml->{Items}->{Item}->{ItemAttributes}->{ISBN};
$citetemplate = "{{cite video\n|title=$az_title\n|people=$az_artist\n|date=$az_date\n|format=$az_binding|publisher=$az_pub\n|isbn=$az_isbn\n|url=$az_url\n|accessdate=$Todays_date\n}}";
}
} else { print "  couldn't find ASIN.\n"; }
return $citetemplate;
}
sub citefromlink_TimeMagazine {
my $tm_url = shift;
my $citetemplate = '';
print " Time Magazine link.\n";
my $res = $ua->get($tm_url);
if ($res->is_success) {
my $html = $res->content;
my $tm_title = '';
my $tm_date = '';
my $tm_author = '';
if ($html =~ m/RightslinkPopUp\(\'(.*?)\', \'(.*?)\', \'(.*?)\', \'.*?\'\)\;/) {
$tm_title = $1;
$tm_date = $2;
$tm_author = $3;
$tm_title =~ s/\\\'/'/g;
$tm_author =~ s/\\\'/'/g;
if ($tm_title) {
$citetemplate = "{{cite news\n|author=$tm_author\n|title=$tm_title\n|date=$tm_date\n|work=Time Magazine\n|url=$tm_url\n|accessdate=$Todays_date\n}}";
}
}
}
return $citetemplate;
}
sub citefromlink_NewYorkTimes {
my $nyt_url = shift;
my $citetemplate = '';
print " New York Times link: '$nyt_url'\n";
my $res = $ua->get($nyt_url);
if ($res->is_success) {
my $html = $res->content;
my $nyt_title = '';
my $nyt_date = '';
my $nyt_author = '';
# Title
if ($html =~ m/
$nyt_title = $1;
$nyt_title =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
$nyt_title =~ s/\n/ /gs;
$nyt_title =~ s/^ +//;
$nyt_title =~ s/ +$//;
} elsif ($html =~ m/function getShareHeadline\(\) \{\s*return encodeURIComponent\(\'(.*?)\'\)\;/s) {
$nyt_title = $1;
$nyt_title =~ s/\n/ /gs;
$nyt_title =~ s/^\s+//;
$nyt_title =~ s/\s+$//;
$nyt_title =~ s/\\\'/'/g;
} elsif ($html =~ m/(.*?)<\/NYT_HEADLINE>/s) {
$nyt_title = $1;
$nyt_title =~ s/\n/ /gs;
$nyt_title =~ s/^\s+//;
$nyt_title =~ s/\s+$//;
} elsif ($html =~ m//s) {
$nyt_title = $1;
$nyt_title =~ s/\n/ /gs;
$nyt_title =~ s/^\s+//;
$nyt_title =~ s/\s+$//;
} elsif ($html =~ m/
(.*?)<\/h3>/s) {
$nyt_title = $1;
$nyt_title =~ s/\n/ /gs;
$nyt_title =~ s/^\s+//;
$nyt_title =~ s/\s+$//;
}
$nyt_title =~ s/\<\/?..?\>//g;
# Author
if ($html =~ m/
$nyt_author = $1;
$nyt_author =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
$nyt_author =~ s/^ *(BY )?//i;
$nyt_author =~ s/ +$//;
} elsif ($html =~ m/function getShareByline\(\) \{\s*return encodeURIComponent\(\'By (.*?)\'\)\;/s) {
$nyt_author = $1;
$nyt_author =~ s/^\s+//;
$nyt_author =~ s/\s+$//;
$nyt_author =~ s/\\\'/'/g;
} elsif ($html =~ m//s) {
$nyt_author = $1;
$nyt_author =~ s/^\s+//;
$nyt_author =~ s/\s+$//;
}
# Date
if ($html =~ m/
$nyt_date = $1;
$nyt_date =~ s/^ *(\d\d\d\d)(\d\d)(\d\d) *$/$1-$2-$3/;
} elsif ($html =~ m//s) {
$nyt_date = $1;
} elsif ($html =~ m/
Published\: (.*?)<\/div>/s) {
$nyt_date = $1;
} elsif ($html =~ m/function getSharePubdate\(\) \{\s*return encodeURIComponent\(\'(.*?)\'\)\;/s) {
$nyt_date = $1;
$nyt_date =~ s/^\s+//;
$nyt_date =~ s/\s+$//;
}
if ($nyt_title) {
$citetemplate = "{{cite news\n|author=$nyt_author\n|title=$nyt_title\n|date=$nyt_date\n|work=New York Times\n|url=$nyt_url\n|accessdate=$Todays_date\n}}";
} else { print "  not readable\n"; }
}	else { print "  not is success\n"; }
return $citetemplate;
}
sub templatefrom_IMDB {
my $imdb_url = shift;
my $citetemplate = '';
if ($imdb_url =~ m/imdb\.com\/(title|name|company|character)\/(tt|nm|co|ch)(\d+)/) {
my $imdbtype = $1;
my $imdbtypeabbr = $2;
my $imdbnum = $3;
print " IMDB link to $imdbtype $imdbnum\n";
my $res = $ua->get("http://www.imdb.com/$imdbtype/$imdbtypeabbr$imdbnum/");
if ($res->is_success) {
my $html = $res->content;
if ($html =~ m/<\s*title\s*>\s*([^\n<]*)<\s*\/\s*title\s*>/si) {
my $title = $1;
if ($title =~ m/Page\)? not found/i) {
print "  not found on imdb:" . $res->status_line . ".\n";
} else {
$title =~ tr/\[\]/()/;
print "  changing to {{imdb $imdbtype|$imdbnum|$title}}\n";
$citetemplate = "{{imdb $imdbtype|$imdbnum|$title}}";
}
} else {
print "  no title.\n";
}
} else {
print "  not found on imdb. " . $res->status_line . "\n";
}
}
return $citetemplate;
}
sub templatefrom_Myspace {
my $ms_username = shift;
my $citetemplate = '';
print " MySpace link: $ms_username\n";
my $res = $ua->get("http://www.myspace.com/$ms_username");
if ($res->is_success) {
print "  success.\n";
my $html = $res->content;
if ($html =~ m/Invalid Friend ID/) {
$citetemplate = "{{MySpace|$ms_username|$ms_username (dead link)}}";
} elsif ($html =~ m/(.*?)<\/span>/) {
my $ms_showname = $1;
$citetemplate = "{{MySpace|$ms_username|$ms_showname}}";
}
} else { print "  fail: " . $res->status_line . "\n"; }
return $citetemplate;
}
sub templatefrom_PG {
my $pg_id = shift;
my $citetemplate = '';
print " Gutenberg: $pg_id\n";
my $res = $ua->get("http://www.gutenberg.org/etext/$pg_id");
if ($res->is_success) {
print "  success.\n";
my $html = $res->content;
if ($html =~ m/
Error<\/h2>/) {
$citetemplate = "http://www.gutenberg.org/etext/$pg_id {{dead link}}";
} elsif ($html =~ m/
.*?([^<]*)/s) {
my $pg_title = $1;
$citetemplate = "{{gutenberg|no=$pg_id|name=$pg_title}}";
} else {print "  no title\n";}
} else { print "  fail: " . $res->status_line . "\n"; }
return $citetemplate;
}
sub templatefrom_YouTube {
my $yt_id = shift;
my $citetemplate = '';
print " Youtube link: $yt_id\n";
my $res = $ua->get("http://www.youtube.com/watch?v=$yt_id");
if ($res->is_success) {
print "  success.\n";
my $html = $res->content;
if ($html =~ m/The URL contained a malformed video ID/) {
$citetemplate = "http://www.youtube.com/watch?v=$yt_id {{dead link}}";
} elsif ($html =~ m//) {
my $yt_title = $1;
$citetemplate = "{{YouTube|$yt_id|$yt_title}}";
}
} else { print "  fail: " . $res->status_line . "\n"; }
return $citetemplate;
}
sub templatefrom_CongBio {
my $cb_id = shift;
my $citetemplate = '';
print " CongBio link: $cb_id\n";
my $res = $ua->get("http://bioguide.congress.gov/scripts/biodisplay.pl?index=$cb_id");
if ($res->is_success) {
my $html = $res->content;
if ($html =~ m/File\: $cb_id does not exist\./) {
$citetemplate = "http://bioguide.congress.gov/scripts/biodisplay.pl?index=$cb_id {{dead link}}";
} elsif ($html =~ m/([^<]*)
my $cb_name = $1;

$cb_name =~ s/(.*?)[\,\s\.\;\:]+$/$1/;
$citetemplate = "{{CongBio|$cb_id|name=$cb_name|inline=1}}";
} else {
$citetemplate = "{{CongBio|$cb_id|inline=1}}";
}
}
return $citetemplate;
}
sub citefromlink_USNews {
my $usn_url = shift;
my $citetemplate = '';
print " US News and World Report\n";
my $res = $ua->get($usn_url);
if ($res->is_success) {
my $html = $res->content;
my $usn_title = '';
my $usn_date = '';
my $usn_author = '';
if ($html =~ m/
\s*(.*?)\s*<\/h1>\s*
\s*(.*?).*?<\/h2>/s) {
$usn_title = "$1: $2";
} elsif ($html =~ m/\s*(.*?)\s*<\/h1>/s) {
$usn_title = $1;
}
if ($html =~ m/By\s*(?:)?\s*(.*?)<\//s) {
$usn_author = $1;
}
if ($html =~ m/Posted (.*?)<\/div>/s) {
$usn_date = $1;
}
if ($usn_title) {
$citetemplate = "{{cite news\n|author=$usn_author\n|title=$usn_title\n|date=$usn_date\n|work=US News and World Report\n|url=$usn_url\n|accessdate=$Todays_date\n}}";
}
}
return $citetemplate;
}
sub citefromlink_Forbes {
my $fo_url = shift;
my $citetemplate = '';
print " Forbes link\n";
my $res = $ua->get($fo_url);
if ($res->is_success) {
my $html = $res->content;
my $fo_title = '';
my $fo_date = '';
my $fo_author = '';
if ($html =~ m/\s*(.*?)\s*<\/span>/s) {
$fo_title = $1;
$fo_title =~ s/<\/?b>//gi;
}
if ($html =~ m/\s*(.*?)\s*<\/?span>/s) {
$fo_author = $1;
} elsif ($html =~ m/.*?<\/span>
(.*?)\s*
$fo_author = $1;
}
if ($html =~ m/\s*(\d\d)\.(\d\d)\.(\d\d)/) {
my $temp_month = $1;
my $temp_day = $2;
my $temp_year = $3;
$fo_date = "20$temp_year-$temp_month-$temp_day";
}
if ($fo_title) {
$citetemplate = "{{cite news\n|author=$fo_author\n|title=$fo_title\n|date=$fo_date\n|work=Forbes Magazine\n|url=$fo_url\n|accessdate=$Todays_date\n}}";
}
}
return $citetemplate;
}
sub citefromlink_BBC {
my $bbc_url = shift;
my $citetemplate = '';
print " BBC news link: $bbc_url\n";
my $res = $ua->get($bbc_url);
if ($res->is_success) {
print "  success.\n";
my $html = $res->content;
my $bbc_title = '';
my $bbc_date = '';
if ($html =~ m//si) {
$bbc_title = $1;
$bbc_title =~ s/^\s+//;
$bbc_title =~ s/\s+$//;
if ($html =~ m/
my $temp_year = $1;
my $temp_month = $2;
my $temp_day = $3;
$bbc_date = "$temp_year-$temp_month-$temp_day";
}
$citetemplate = "{{cite news\n|author=\n|title=$bbc_title\n|date=$bbc_date\n|work=BBC News\n|url=$bbc_url\n|accessdate=$Todays_date\n}}";
}
}
print "  done.\n";
return $citetemplate;
}
sub process_link {
my $full_link = shift;
my $link_type = shift;
$full_link =~ m/(https?\:\/\/[^\s\]\<]*)/s;
my $urlonly = $1;
my $citetemplate = '';
if ($urlonly =~ m/http\:\/\/books\.google\.com\/books/) { # Google Books
$citetemplate = citefromlink_GoogleBooks($urlonly);
}	elsif ($urlonly =~ m/http\:\/\/.*amazon\.com\//) { # Amazon.com
$citetemplate = citefromlink_Amazon($urlonly);
}	elsif ($urlonly =~ m/http\:\/\/www\.time\.com\//) { # Time Magazine
$citetemplate = citefromlink_TimeMagazine($urlonly);
}	elsif ($urlonly =~ m/https?\:\/\/.*?nytimes\.com\//) { # New York Times
$citetemplate = citefromlink_NewYorkTimes($urlonly);
} elsif ($urlonly =~ m/http:\/\/.*\.usnews\.com\//) { # US News and World Report
$citetemplate = citefromlink_USNews($urlonly);
} elsif ($urlonly =~ m/http:\/\/.*\.forbes\.com\//) { # Forbes
$citetemplate = citefromlink_Forbes($urlonly);
} elsif ($urlonly =~ m/http:\/\/news\.bbc\.co\.uk\//) { # BBC News
$citetemplate = citefromlink_BBC($urlonly);
} elsif ($urlonly =~ m/http:\/\/www\.imdb\.com\//) { # IMDB
$citetemplate = templatefrom_IMDB($urlonly);
} elsif ($urlonly =~ m/http:\/\/www\.myspace\.com\/([^\s\< \]]*)/) { # MySpace
my $ms_title = $1;
$citetemplate = templatefrom_Myspace($ms_title);
} elsif ($urlonly =~ m/http:\/\/www\.youtube\.com\/watch\?v\=([^\s\< \]]*)/) { # YouTube
my $yt_id = $1;
$citetemplate = templatefrom_YouTube($yt_id);
} elsif ($urlonly =~ m/http:\/\/bioguide\.congress\.gov\/scripts\/biodisplay.pl\?index\=([^\s\< \]]*)/) { # Congbio
my $cb_id = $1;
$citetemplate = templatefrom_CongBio($cb_id);
} elsif ($urlonly =~ m/http:\/\/www\.gutenberg\.org\/(?:etext|ebooks|files)\/(\d+)/) { # Project Gutenberg
my $pg_id = $1;
$citetemplate = templatefrom_PG($pg_id);
} else {  # check for DOI, and add title if none already
$citetemplate = check_DOI($urlonly, $link_type);
}
if ($citetemplate) {
if ($full_link =~ s/\[\Q$urlonly\E[^\]]*\]/$citetemplate/s) {
# do nothing
} else {
$full_link =~ s/\Q$urlonly\E/$citetemplate/s;
}
}
return $full_link;
}
sub check_DOI {
my $url = shift;
my $linktype = shift;
my $citetemplate = '';
return $citetemplate unless $linktype eq 'bare';
print " Looking up $url\n";
my $res = $ua->get("$url");
unless ($res->content_type eq 'text/html') {
print "  not html. Skipping.\n";
} else {
# It's html.
unless ($res->is_success) {
print "  no connection (probably 404). Skipping.\n";
} else {
# It's connected.
my $html = $res->content;
# Here's where I should check for a DOI, and only check for a title if $linktype eq 'bare'
if ($html =~ m/(10\.\d{4}(\/|\%2F)([^\s\"\?\&\>]|\&l?g?t\;|\<[^\s\"\?\&]*\>)*)(?=[\s\"\?]|\<\/)/) {
# It's got a DOI! Eureka.
my $DOI = $1;
# strip trailing flotsam
$DOI =~ s/(\<\/?\w+\/?\>|[\:\;\)\.\'\,\-\#])+$//;
$DOI =~ s/\<.*//;
# Now run the DOI through crossref.org:
my $crossref_url = "http://www.crossref.org/openurl/?pid=$crossref_creds&id=doi:$DOI&noredirect=true";
my $res2 = $ua->get($crossref_url);
my $xml = XMLin( $res2->decoded_content );
my $j_article_title = $xml->{query_result}->{body}->{query}->{article_title};
if ($j_article_title) {
my $j_journal_title = $xml->{query_result}->{body}->{query}->{journal_title};
my $j_volume = $xml->{query_result}->{body}->{query}->{volume};
my $j_issue = $xml->{query_result}->{body}->{query}->{issue};
my $j_pages = $xml->{query_result}->{body}->{query}->{first_page};
my $j_year = $xml->{query_result}->{body}->{query}->{year};
my $j_format = $xml->{query_result}->{body}->{query}->{publication_type};
$j_format =~ tr/_/ /;
my $j_last_name = $xml->{query_result}->{body}->{query}->{contributors}->{contributor}->{given_name};
my $j_first_name = $xml->{query_result}->{body}->{query}->{contributors}->{contributor}->{surname};
$citetemplate = "{{cite journal\n"
. "| last = $j_last_name\n"
. "| first = $j_first_name\n"
. "| year = $j_year\n"
. "| title = $j_article_title\n"
. "| journal = $j_journal_title\n"
. "| volume = $j_volume\n"
. "| issue = $j_issue\n"
. "| pages = $j_pages\n"
. "| doi = $DOI\n"
. "| format = $j_format\n"
. "}}";
}
}
unless ($citetemplate) {
# DOI checking.
if ($linktype eq 'bare') {
# Look for a title
print "  Looking for a title.\n";
if ($html =~ m/<\s*title\s*>\s*([^\n<]*)\s*<\s*\/\s*title\s*>/si) {
my $title = $1;
$title =~ tr/[]{}/()()/;
$title =~ s/\s/ /g;
while ($title =~ s/  / /g) {};
$title =~ s/ $//;
$title =~ s/^ //;
$title =~ s/]*>.*?<\/script>|]*>.*?<\/style>||//gi;
if (length($title) > 175) {
$title =~ s/(.{175}).*/$1.../;
}
$title =~ s/(.*)/\u$1/;
if ($title !~ m/$blacklist/i) {
# Title not blacklisted
print "  Title: $title\n";
my $baseurl = $url;
$baseurl =~ s/.*https?:\/\/([^\/\s\<]*).*/$1/;
$baseurl =~ s/.*\.(blogspot\.com|livejournal\.com|blogger\.com)/$1/;
$citetemplate = "[QQQ$url $title] at $baseurl";
} else { print "  black-listed title. Skipping.\n"; }
} else { print "  no title. Skipping.\n"; }
}
}
}
}
return $citetemplate;
}



  
    
      About
      Help
      Updates
      Contact
      Privacy
      Terms
      GitHub
    
    
      © 2025 Friendly Wiki. All rights reserved.
User:Polbot/source/Reffix.pl

\s(.?)\s<\/h1>\s

\s(.?).*?<\/h2>/s) {

$usn_title = "$1: $2";

} elsif ($html =~ m/

User:Polbot/source/Reffix.pl

\s*(.*?)\s*<\/h1>\s*

\s*(.*?).*?<\/h2>/s) { $usn_title = "$1: $2"; } elsif ($html =~ m/

\s*(.*?)\s*<\/h1>/s) { $usn_title = $1; } if ($html =~ m/

\s(.?)\s<\/h1>\s

\s(.?).*?<\/h2>/s) {

$usn_title = "$1: $2";

} elsif ($html =~ m/