User:Polbot/source/Reffix.pl
- Use like:
- perl reffix.pl Catname
use strict;
use Perlwikipedia;
use URI::Escape;
use LWP::UserAgent;
use Encode;
use XML::Simple;
my $Polbot_password = '(bot password)';
my $az_AccessKey = '(Amazon.com access code)';
my $crossref_creds = '(username:password)';
my $soonest_next_op = time;
my $wait_time = 10;
my $ignorenamespaces = 'User|User talk|Talk|Template|Template talk|Portal|Portal talk|Category|Category talk|Portal talk|Wikipedia talk|Image|Image talk|MediaWiki|MediaWiki talk|Template talk|Help|Help talk';
my $editsummary = 'Automated fixes to external links and references. (See the FAQ for details.)';
my $blacklist = '(^Cannot find server|(File|Resource|Article|Page) (was )?not found|(^|\s)Log ?In($|\s)|(^|\s)Sign ?in($|\s))';
- date
my ($Second, $Minute, $Hour, $Day, $Month, $Year, $WeekDay, $DayOfYear, $IsDST) = localtime(time);
$Year += 1900;
$Month++;
$Month =~ s/^(\d)$/0$1/;
$Day =~ s/^(\d)$/0$1/;
my $Todays_date = "$Year-$Month-$Day";
my $category = shift;
print "Running Polbot's reffix function, category = $category\n";
print "\nLogging in to Wikipedia.\n" ;
my $pw=Perlwikipedia->new();
$pw->{mech}->agent('Bot/polbot');
my $login_status=$pw->login('Polbot', $Polbot_password);
die "I can't log in." unless ($login_status eq 0);
my $ua = LWP::UserAgent->new;
$ua->agent("Firefox/3.0.1");
$ua->cookie_jar({});
print "Opening category '$category'\n";
my @allpages = $pw->get_pages_in_category("Category:$category");
print "There are " . scalar(@allpages) . " total pages to go through.\n";
foreach my $articlename (@allpages) {
print "Examining $articlename\n";
if ($articlename =~ /^$ignorenamespaces:/i) {
print " Not an article. Skipping.\n";
next;
}
# -----------------------------------------------------------------
# ---------------- First, look at the article and set variables.
my $bNeedsChanging = 0;
my $newart = '';
my $bHasReferencesTag = 0;
my $bHasReflist = 0;
my $bHasRefTag = 0;
my $art = $pw->get_text($articlename);
# Exclusion compliance
if ($art =~ m/\{\{\s*(nobots\s*\}\}|bots\s*\|\s*allow\s*=|bots\s*|\s*deny\s*=\s*all)/si) {
print " {{nobots}}, skipping.\n";
next;
}
# variables
if ($art =~ /<\s*references\s*\/\s*>/is) {
$bHasReferencesTag = 1;
}
if ($art =~ /\{\{\s*(template\s*:\s*)?reflist\s*[\|\}]/is) {
$bHasReflist = 1;
}
if ($art =~ /<\s*ref(\s+name\s*=\s*(?:"[^"]*"|\w+)|)\s*>/si) {
$bHasRefTag = 1;
}
# -----------------------------------------------------------------
# ---------------- Change
to {{reflist}} if ($bHasReferencesTag == 1) {
#$bNeedsChanging = 1;
$art =~ m/(<\s*references\s*\/>)/si;
my $refsect = $1;
$art =~ m/(<(span|div)( class=\"(references-small|small|references-2column))?\">\s*<\s*references\s*\/>\s*<\/\s*(span|div)>)/si;
my $temp2 = $1;
$art =~ m/(<(span|div)( class=\"(references-small|small|references-2column))?\">\s*<(span|div)( class=\"(references-small|small|references-2column))?\">\s*<\s*references\s*\/>\s*<\/\s*(span|div)>\s*<\/\s*(span|div)>)/si;
my $temp3 = $1;
if ($temp3) {
$refsect = $temp3;
} elsif ($temp2) {
$refsect = $temp2;
}
if ($refsect) {
my $newrefsect = $refsect;
if ($refsect =~ m/references-2column/) {
$newrefsect = "{{reflist|2}}";
} elsif ($refsect =~ m/[^-]column-count:[\s]*?(\d*)/) {
$newrefsect = "{{reflist|$1}}";
} elsif ($refsect =~ m/-moz-column-count:[\s]*?(\d*)/) {
$newrefsect = "{{reflist|$1}}";
} else {
$newrefsect = "{{reflist}}";
}
$art =~ s/$refsect/$newrefsect/si;
$bHasReflist = 1;
}
}
# -----------------------------------------------------------------
# ---------------- Fix http://...
while ($art =~ m/\[\[(https?:\/\/[^\]]*)\]\]/si) {
my $badlink = $1;
$bNeedsChanging = 1;
print " Fixing $badlink\n";
$art =~ s/\[\[\Q$badlink\E\]\]/[$badlink]/si;
}
# -----------------------------------------------------------------
# ---------------- Fix ext links to Wikimedia
# en.wikipedia
while ($art =~ m/http:\/\/(?:en\.)?wikipedia\.org\/wiki\/([^\] ]*)/g) {
my $extwikilink = $1;
$bNeedsChanging = 1;
my $intwikilink = $extwikilink;
$intwikilink =~ s/_/ /g;
$intwikilink =~ s/%([0-9A-Fa-f]{2})%([0-9A-Fa-f]{2})/decode("utf8", chr(hex($1)) . chr(hex($2)))/eg;
$intwikilink =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
$intwikilink =~ s/^(Image|Category):/:$1:/;
print " Fixing ext wikilink $extwikilink to $intwikilink\n";
# non-renamed
$art =~ s/\[http:\/\/(en\.)?wikipedia\.org\/wiki\/\Q$extwikilink\E\]/$intwikilink/g;
# renamed
$art =~ s/\[http:\/\/(?:en\.)?wikipedia\.org\/wiki\/\Q$extwikilink\E ([^\]]*)\]/$1/g;
}
# other.wikipedia
while ($art =~ m/\[http:\/\/([^\.]*).wikipedia.org\/wiki\/([^\] ]*)/s) {
my $extwikilang = $1;
my $extwikilink = $2;
$bNeedsChanging = 1;
my $intwikilink = $extwikilink;
$intwikilink =~ s/_/ /g;
$intwikilink =~ s/%([0-9A-Fa-f]{2})%([0-9A-Fa-f]{2})/decode("utf8", chr(hex($1)) . chr(hex($2)))/eg;
$intwikilink =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
$intwikilink =~ s/^(Image|Category):/:$1:/;
print " Fixing ext link $extwikilang.$extwikilink to $extwikilang:$intwikilink\n";
# non-renamed
$art =~ s/\[http:\/\/$extwikilang\.wikipedia\.org\/wiki\/\Q$extwikilink\E\]/:$extwikilang:$intwikilink/g;
# renamed
$art =~ s/\[http:\/\/$extwikilang\.wikipedia\.org\/wiki\/\Q$extwikilink\E ([^\]]*)\]/$1/g;
}
# -----------------------------------------------------------------
# ---------------- ref the BELs
# First, QQQ the PDFlink BELs
$art =~ s/(\{\{\s*PDF(?:link)?\s*\|\s*\[)(https?:\/\/[^\]]*\])/$1QQQ$2/gi;
# Next, QQQ the html comments
$newart = $art;
while ($art =~ m/<\!--(.*?)-->/gs) {
my $comment = $1;
my $newcomment = $comment;
if ($newcomment =~ s/\[(https?:\/\/)/\[QQQ$1/g) {
$newart =~ s/\Q$comment\E/$newcomment/;
}
}
$art = $newart;
# Next, QQQ the links already in refs
$newart = $art;
while ($art =~ m/(<\s*ref.*?<\s*\/\s*ref\s*>)/gs) {
my $ref = $1;
my $newref = $ref;
if ($newref =~ s/\[(https?:\/\/)/\[QQQ$1/g) {
$newart =~ s/\Q$ref\E/$newref/;
}
}
$art = $newart;
# Now QQQ the " at [http]" or " in [http]" or " from [http]"
$art =~ s/( at| in| from|At|In|From) \[(https?:\/\/[^\]]*\])/ $1 [QQQ$2/g;
# And lastly, QQQ links that begin a line
$art =~ s/^([\s\*\#\:]*\[)(https?\:\/\/[^\]]*\])/$1QQQ$2/gm;
# Okay! Now ref all non-QQQed BELs above the {{reflist}} template (or whatever)
my $artbefore = $art;
my $artafter = '';
if ($art =~ m/(.*?)(\{\{reflist\}\}|=+\s*Notes?\s*=+|=+\s*References?\s*=+|=+\s*External links?\s*=+|=+\s*Sources?\s*=+|=+\s*Further reading\s*=+|=+\s*See also\s*=+)(.*)/is) {
$artbefore = $1;
$artafter = "$2$3";
}
$newart = $artbefore;
while ($artbefore =~ m/\[(https?:\/\/[^ \]]*)\]/g) {
my $BEL = $1;
$bNeedsChanging = 1;
$bHasRefTag = 1;
$newart =~ s/ *\[\Q$BEL\E\]/QQQ$BEL<\/ref>/g;
}
$art = "$newart$artafter";
# UnQQQ it all
$art =~ s/QQQhttp/http/g;
$art =~ s/ *\(([^<]*<\/ref>)\)/$1/gs;
# -----------------------------------------------------------------
# ---------------- Add {{reflist}} if missing
if ($bHasRefTag - $bHasReflist == 1) {
$bNeedsChanging = 1;
print " but no {{reflist}}\n";
if ($art =~ m/\n=+\s*(references?|notes)\s*=+\s*\n/mi) {
my $putrefin = $1;
$art =~ s/(\n=+\s*($putrefin)\s*=+\n)/$1\{\{reflist\}\}\n/si;
print " Putting reflist after $putrefin section\n";
} else {
$art =~ m/(=+\s*see also\s*=+|=+\s*external links?\s*=+|=+\s*sources?\s*=+|=+\s*further reading\s*=+|\[\[\s*category\s*\:)/si;
my $putrefsbefore = $1;
if ($putrefsbefore) {
$art =~ s/\Q$putrefsbefore\E/==Notes==\n{{reflist}}\n\n$putrefsbefore/si;
print " Putting reflist before $putrefsbefore section\n";
} else {
$art .= "\n{{reflist}}";
print " Putting reflist at end\n";
}
}
}
# ------------------------------------------------------------------------
# ---------------- Known links -> cites or templates
# ---------------- Unkown links -> titles or {{dead link}}
# First, QQQ the PDFlink BELs
$art =~ s/(\{\{\s*PDF(?:link)?\s*\|\s*\[)(https?:\/\/[^\]]*\])/$1QQQ$2/gi;
# Next, QQQ the html comments
$newart = $art;
while ($art =~ m/<\!--(.*?)-->/gs) {
my $comment = $1;
my $newcomment = $comment;
if ($newcomment =~ s/\[(https?:\/\/)/\[QQQ$1/g) {
$newart =~ s/\Q$comment\E/$newcomment/;
}
}
$art = $newart;
# And QQQ the already-dead links
$art =~ s/\b(https?\:\/\/[^\s\]\<\{]*\]? ?\{\{dead link\}\})/QQQ$1/g;
my @BURLs = (); # bare URLs, e.g. http://www.example.com/subdir/example.html
my @BELs = (); # bare external links, e.g. [http://www.example.com/subdir/example.html]
my @NELs = (); # named external links, e.g. [http://www.example.com/subdir/example.html name]
# Those starting a line
push @BURLs, ($art =~ m/^[ \*\#\:]*https?\:\/\/[^\s\]\<]*/mg);
push @BELs, ($art =~ m/^[ \*\#\:]*\[https?\:\/\/[^ \]\<]*\]/mg);
push @NELs, ($art =~ m/^[ \*\#\:]*\[https?\:\/\/[^ \]\<]*(?: [^\]]+)\]/mg);
# Those in tags
push @BURLs, ($art =~ m/]*)?>https?\:\/\/[^\s\]\<]*\s*<\/ref>/sg);
push @BELs, ($art =~ m/]*)?>\[https?\:\/\/[^ \]\<]*\]\s*<\/ref>/sg);
push @NELs, ($art =~ m/]*)?>\[https?\:\/\/[^ \]]*(?: [^\]]+)\]\s*<\/ref>/sg);
# Process these links.
$newart = $art;
print "Processing BURLs and BELs\n";
foreach my $full_link (@BURLs, @BELs) {
my $transformedlink = process_link($full_link, 'bare');
if ($full_link ne $transformedlink) {
$newart =~ s/\Q$full_link\E/$transformedlink/s;
$bNeedsChanging = 1;
}
}
print "Processing " . scalar(@NELs) . " NELs\n";
foreach my $full_link (@NELs) {
my $transformedlink = process_link($full_link, 'named');
if ($full_link ne $transformedlink) {
$newart =~ s/\Q$full_link\E/$transformedlink/s;
$bNeedsChanging = 1;
}
}
$art = $newart;
$art =~ s/QQQhttp/http/g;
# -----------------------------------------------------------------
# ---------- Merging refs: very hard. Skipping for now.
# -----------------------------------------------------------------
# ---------- Minor fixes
if ($bNeedsChanging) {
# Fix punctuation touching ref tags
- while ($art =~ s/(.*)(
)([\.\,\?\!\;\:])/$1$3$2/gs) {}; $newart = $art;
while ($art =~ m/(]*>.*?<\/ref>)(.)/gs) {
my $thisref = $1;
my $thischar = $2;
if ($thischar =~ m/[\.\,\?\!\;\:]/) {
print "Found $thischar after \n";
$newart =~ s/\Q$thisref$thischar\E/$thischar$thisref/gs;
}
}
while ($art =~ m/(]*\/>)(.)/gs) {
my $thisref = $1;
my $thischar = $2;
if ($thischar =~ m/[\.\,\?\!\;\:]/) {
print "Found $thischar after \n";
$newart =~ s/\Q$thisref$thischar\E/$thischar$thisref/gs;
}
}
$art = $newart;
# Miscaptalizations
$art =~ s/==(\s*)See also(\s*)==/==$1See also$2==/i;
$art =~ s/==(\s*)External links?(\s*)==/==$1External links$2==/i;
# units
$art =~ s/(\d) (mph|km|mile|mi|kilometer|mbar|knot|feet|ft|meter|m|metre|kilometre|inch|million|billion|foot|days|kt|millibar|mm|cm|dollar|USD|inHg|hPa|people|hour|liter|degree|°|year|month|square|sq)\b/$1 $2/g;
# HTML
$art =~ s/\<\/?i\>/\'\'/gi;
$art =~ s/\<\/?b\>/\'\'\'/gi;
# Date stuff
# Century
$art =~ s/\[\[(\d{1,2}(?:st|nd|rd|th))[ \-]century\]\]/$1 century/gi;
$art =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))\]\]/$1/gi;
$art =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]century\]\]/$1 century/gi;
$art =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]centuries\]\]/$1 centuries/gi;
$art =~ s/\[\[(\d{1,2}(?:st|nd|rd|th))[ \-]century\s(AD|BC|CE|BCE)\]\]/$1 century $2/gi;
$art =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]century\s(AD|BC|CE|BCE)\]\]/$1 century $2/gi;
$art =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]centuries\s(AD|BC|CE|BCE)\]\]/$1 centuries $2/gi;
# piped decades and years
$art =~ s/\[\[(\d{1,4}\'?s)\]\]/$1/gi;
$art =~ s/\[\[(\d{1,4}s? (?:AD|BC|CE|BCE))\]\]/$1/gi;
$art =~ s/\[\[\d{1,4}s? (?:AD|BC|CE|BCE)\|(\d{1,4})\]\]/$1/gi;
$art =~ s/\[\[\d{1,4}s? (?:AD|BC|CE|BCE)\|(\d{1,4}s? (?:AD|BC|CE|BCE))\]\]/$1/gi;
$art =~ s/\[\[\d{1,4}s?\|(\d{1,4}s? (?:AD|BC|CE|BCE))\]\]/$1/gi;
$art =~ s/\[\[\d{1,4}s?\|(\d{1,2}s?)\]\]/$1/gi;
# months
$art =~ s/\[\[(January|February|March|April|May|June|July|August|September|October|November|December)\]\]/$1/gi;
$art =~ s/\[\[January\|(Jan)\]\]/$1/gi;
$art =~ s/\[\[February\|(Feb)\]\]/$1/gi;
$art =~ s/\[\[March\|(Mar)\]\]/$1/gi;
$art =~ s/\[\[April\|(Apr)\]\]/$1/gi;
$art =~ s/\[\[May\|(May)\]\]/$1/gi;
$art =~ s/\[\[June\|(Jun)\]\]/$1/gi;
$art =~ s/\[\[July\|(Jul)\]\]/$1/gi;
$art =~ s/\[\[August\|(Aug)\]\]/$1/gi;
$art =~ s/\[\[September\|(Sep)\]\]/$1/gi;
$art =~ s/\[\[October\|(Oct)\]\]/$1/gi;
$art =~ s/\[\[November\|(Nov)\]\]/$1/gi;
$art =~ s/\[\[December\|(Dec)\]\]/$1/gi;
# month+year
$art =~ s/\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d{3,4})\]\]/$1/gi;
# Month+day_number "March 7th" -> "March 7"
$art =~ s/\[\[(January|February|March|April|May|June|July|August|September|October|November|December) (\d?\d)(?:th|st|nd|rd)\]\]/\[\[$1 $2\]\]/gi;
$art =~ s/\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](?:th|st|nd|rd)/\[\[$1\]\]/gi;
$art =~ s/\[\[(\d?\d)(?:th|st|nd|rd) (January|February|March|April|May|June|July|August|September|October|November|December)\]\]/\[\[$1 $2\]\]/gi;
# Month+day_number piped into number. Preferences do not work. They don't work in sequence because digits in the two dates must be adjacent
$art =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?\-?\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;
$art =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?&[nm]dash;\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;
$art =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\/)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;
$art =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?\-?\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;
$art =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?&[nm]dash;\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;
$art =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\/)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;
$art =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?\-?\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;
$art =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?&[nm]dash;\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;
$art =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\/)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;
$art =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?\-?\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;
$art =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?&[nm]dash;\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;
$art =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\/)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;
$art =~ s/\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1/gi;
$art =~ s/\[\[\d{1,2} (?:January|February|March|April|May|June|July|August|September|October|November|December)\|(\d{1,2})\]\]/$1/gi;
$art =~ s/\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{1,2})\]\]/$1/gi;
# solitary day_numbers
$art =~ s/\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2}(?:th|st|nd|rd))\]\]/$1/gi;
$art =~ s/\[\[\d{1,2} (?:January|February|March|April|May|June|July|August|September|October|November|December)\|(\d{1,2}(?:th|st|nd|rd))\]\]/$1/gi;
$art =~ s/\[\[(\d{1,2}(?:st|nd|rd|th))\]\]/$1/gi;
# days of the week in full. Optional plurals
$art =~ s/\[\[(Mondays?|Tuesdays?|Wednesdays?|Thursdays?|Fridays?|Saturdays?|Sundays?)\]\]/$1/gi;
# days of the week abbreviated. Leave out 'Sun' as potentially valid link to the Sun. Leave out 'SAT' in upper case as potential link to 'Scholastic achievement/aptitude test'.
$art =~ s/\[\[(Mon|Tue|Tues|Wed|Thu|Thur|Thurs|Fri)\]\]/$1/gi;
$art =~ s/\[\[(Sat)\]\]/$1/g;
$art =~ s/\[\[Mondays?\|(Mondays?)\]\]/$1/gi;
$art =~ s/\[\[Tuesdays?\|(Tuesdays?)\]\]/$1/gi;
$art =~ s/\[\[Wednesdays?\|(Wednesdays?)\]\]/$1/gi;
$art =~ s/\[\[Thursdays?\|(Thursdays?)\]\]/$1/gi;
$art =~ s/\[\[Fridays?\|(Fridays?)\]\]/$1/gi;
$art =~ s/\[\[Saturdays?\|(Saturdays?)\]\]/$1/gi;
$art =~ s/\[\[Sundays?\|(Sundays?)\]\]/$1/gi;
# 4 digit years piped into 2
$art =~ s/\[\[\d{1,4}\|(\d{1,2})\]\]/$1/gi;
# year: examine characters in link on left for date, examine characters in link on right for date
$art =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc \s\-]))/$1$2$3/gi;
# year pair: examine characters in link on left for date, examine characters in link on right for date
$art =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc\s\-]))/$1$2$3$4$5/gi;
# year: examine characters in link on left for date, avoid links on right
$art =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3/gi;
# year pair: examine characters in link on left for date, avoid links on right
$art =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3$4$5/gi;
# year: check for line-ends, text on left, avoid links on right. Run twice to deal better with lists.
$art =~ s/([\w\(\);=:\.\'\*\|\&]\s?,?\-?\s?|\n)\[\[(\d{1,4})\]\]([^\[]{4}|\n)/$1$2$3/gi;
$art =~ s/([\w\(\);=:\.\'\*\|\&]\s?,?\-?\s?|\n)\[\[(\d{1,4})\]\]([^\[]{4}|\n)/$1$2$3/gi;
# year pair: check for line-ends, text on left, avoid links on right
$art =~ s/([\w\(\);=:\.\'\*\|\&]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\]([^\[]{4}|\n)/$1$2$3$4$5/gi;
# year: avoid links on left, examine characters in link on right for date
$art =~ s/([^\]]{4})\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc \s\-]))/$1$2$3/gi;
# year pair: avoid links on left, examine characters in link on right for date
$art =~ s/([^\]]{4})\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc \s\-]))/$1$2$3$4$5/gi;
# year:avoid links on left, text on right
$art =~ s/([^\]]{4})\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.\'\*\|\&])/$1$2$3/gi;
# year pair: avoid links on left, text on right
$art =~ s/([^\]]{4})\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.\'\*\|\&])/$1$2$3$4$5/gi;
# year:text on left, text on right
$art =~ s/([\w\(\);=:\.\'\*\|\&]\s?,?\-?\s?)\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.\'\*\|\&])/$1$2$3/gi;
# year pair: avoid links on left, text on right
$art =~ s/([\w\(\);=:\.\'\*\|\&]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:\.\'\*\|\&])/$1$2$3$4$5/gi;
# year:avoid links on both sides
$art =~ s/([^\]]{4})\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3/gi;
# year pair: avoid links on both sides
$art =~ s/([^\]]{4})\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3$4$5/gi;
# 'present'
$art =~ s/\[\[Present \(time\)\|(Present)\]\]/$1/gi;
# Eliminate 'surprise links' also known as 'easter egg links'
$art =~ s/\[\[\d{1,4}s?\sin\s[^\|]{1,30}\|(\d{1,4}s?)\]\]/$1/gi;
}
# -----------------------------------------------------------------
# ---------- DONE! ---------------------------
if ($bNeedsChanging) {
wiki_write($articlename, $art, $editsummary);
}
}
sub wiki_write {
my $article_name = shift;
my $wiki_out = shift;
my $edit_summary = shift;
$|=1;
print " Waiting " . ($soonest_next_op - time) . " secs... ";
$|=1;
while (time < $soonest_next_op) {};
$soonest_next_op = time + $wait_time;
print "Writing $article_name\n";
$pw->edit($article_name, $wiki_out, $edit_summary);
}
sub citefromlink_GoogleBooks {
my $gb_url = shift;
my $citetemplate = '';
my $gb_lookup_url = $gb_url;
$gb_lookup_url =~ s/(http\:\/\/books\.google\.com\/books\?).*(id\=[^\&]*).*$/$1$2/i;
#http://books.google.com/books?hl=en&lr=&id=thmPzIltAV8C&oi=fnd&pg=PP11&sig=81UGsCDc1DxLV3JAWviltyHD_bY&dq=%22Mordecai+Cooke%22#PPA166,M1
#http://books.google.com/books?id=thmPzIltAV8C
print " Google book link: $gb_lookup_url\n";
my $res = $ua->get($gb_lookup_url);
if ($res->is_success) {
print " success.\n";
my $html = $res->content;
my $bibdiv = '';
my $gb_title = '';
my $gb_author = '';
my $gb_year = '';
my $gb_pub = '';
my $gb_isbn = '';
my $gb_pages = '';
if ($html =~ m/
([^<]+)<\/h2>([^\n]*)\n/s) {
$gb_title = $1;
$bibdiv = $2;
}
if ($html =~ m/
(.*?)\n/s) {
$bibdiv = $1;
}
$bibdiv =~ s/
/ /g;if ($bibdiv =~ m/By ([^<]*)/) {
$gb_author = $1;
} elsif ($bibdiv =~ m/
By ([^<]*)/) { $gb_author = $1;
}
if ($bibdiv =~ m/\
Published by ([^\n\<]*?)\, (\d\d\d\d)\<\/div>/) {$gb_pub = $1;
$gb_year = $2;
} else {
if ($bibdiv =~ m/
Published ([^<]*)/) { $gb_year = $1;
}
if ($bibdiv =~ m/q=inpublisher[^>]+>([^<]*)/) {
$gb_pub = $1;
}
}
if ($bibdiv =~ m/\>ISBN\s*(?:\:\s*)?(\w+)/) {
$gb_isbn = $1;
}
$citetemplate = "{{cite book\n|title=$gb_title\n|author=$gb_author\n|year=$gb_year\n|publisher=$gb_pub\n|isbn=$gb_isbn\n|url=$gb_url\n}}";
} else { print " failed.\n"; }
return $citetemplate;
}
sub citefromlink_Amazon {
my $az_url = shift;
my $citetemplate = '';
print " Amazon.com link: ";
# First, get the ASIN
my $az_ASIN = '';
if ($az_url =~ m/\/(?:dp|product)\/([^\/]*)/) {
$az_ASIN = $1;
print "$az_ASIN\n";
# Next, plug it into the Amazon API.
my $az_api_url = "http://webservices.amazon.com/onca/xml" .
"?Service=AWSECommerceService" .
"&AWSAccessKeyId=$az_AccessKey" .
"&Operation=ItemLookup" .
"&IdType=ASIN" .
"&ItemId=$az_ASIN" .
"&ResponseGroup=Medium";
my $res = $ua->get($az_api_url);
my $xml = XMLin( $res->decoded_content );
my $az_binding = $xml->{Items}->{Item}->{ItemAttributes}->{Binding};
if ($az_binding =~ m/^(Hardcover|Paperback|Ring-bound|Kindle Edition|School & Library Binding|Unknown Binding)$/) {
# Book
my $az_title = $xml->{Items}->{Item}->{ItemAttributes}->{Title};
my $az_date = $xml->{Items}->{Item}->{ItemAttributes}->{PublicationDate};
my $az_pub = $xml->{Items}->{Item}->{ItemAttributes}->{Publisher};
$az_pub = join(", ", @{ $az_pub }) if (ref( $az_pub ) eq "ARRAY" );
my $az_isbn = $xml->{Items}->{Item}->{ItemAttributes}->{ISBN};
#my $az_pages = $xml->{Items}->{Item}->{ItemAttributes}->{NumberOfPages} . " pages";
my $az_author = $xml->{Items}->{Item}->{ItemAttributes}->{Author};
$az_author = join(", ", @{ $az_author }) if (ref( $az_author ) eq "ARRAY" );
$citetemplate = "{{cite book\n|title=$az_title\n|author=$az_author\n|date=$az_date\n|publisher=$az_pub\n|isbn=$az_isbn\n|url=$az_url\n}}";
} elsif ($az_binding =~ m/^(Audio CD|Audio Cassette|Music Download|Video Game|DVD|Blu-ray|HD DVD|VHS Tape|UMD for PSP)$/) {
# Media
my $az_title = $xml->{Items}->{Item}->{ItemAttributes}->{Title};
my $az_date = $xml->{Items}->{Item}->{ItemAttributes}->{ReleaseDate};
$az_date = $xml->{Items}->{Item}->{ItemAttributes}->{PublicationDate} unless ($az_date);
my $az_pub = $xml->{Items}->{Item}->{ItemAttributes}->{Publisher};
$az_pub = join(", ", @{ $az_pub }) if (ref( $az_pub ) eq "ARRAY" );
my $az_artist = $xml->{Items}->{Item}->{ItemAttributes}->{Artist};
$az_artist = $xml->{Items}->{Item}->{ItemAttributes}->{Author} unless ($az_artist);
$az_artist = join(", ", @{ $az_artist }) if (ref( $az_artist ) eq "ARRAY" );
my $az_isbn = $xml->{Items}->{Item}->{ItemAttributes}->{ISBN};
$citetemplate = "{{cite video\n|title=$az_title\n|people=$az_artist\n|date=$az_date\n|format=$az_binding|publisher=$az_pub\n|isbn=$az_isbn\n|url=$az_url\n|accessdate=$Todays_date\n}}";
}
} else { print " couldn't find ASIN.\n"; }
return $citetemplate;
}
sub citefromlink_TimeMagazine {
my $tm_url = shift;
my $citetemplate = '';
print " Time Magazine link.\n";
my $res = $ua->get($tm_url);
if ($res->is_success) {
my $html = $res->content;
my $tm_title = '';
my $tm_date = '';
my $tm_author = '';
if ($html =~ m/RightslinkPopUp\(\'(.*?)\', \'(.*?)\', \'(.*?)\', \'.*?\'\)\;/) {
$tm_title = $1;
$tm_date = $2;
$tm_author = $3;
$tm_title =~ s/\\\'/'/g;
$tm_author =~ s/\\\'/'/g;
if ($tm_title) {
$citetemplate = "{{cite news\n|author=$tm_author\n|title=$tm_title\n|date=$tm_date\n|work=Time Magazine\n|url=$tm_url\n|accessdate=$Todays_date\n}}";
}
}
}
return $citetemplate;
}
sub citefromlink_NewYorkTimes {
my $nyt_url = shift;
my $citetemplate = '';
print " New York Times link: '$nyt_url'\n";
my $res = $ua->get($nyt_url);
if ($res->is_success) {
my $html = $res->content;
my $nyt_title = '';
my $nyt_date = '';
my $nyt_author = '';
# Title
if ($html =~ m/
$nyt_title = $1;
$nyt_title =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
$nyt_title =~ s/\n/ /gs;
$nyt_title =~ s/^ +//;
$nyt_title =~ s/ +$//;
} elsif ($html =~ m/function getShareHeadline\(\) \{\s*return encodeURIComponent\(\'(.*?)\'\)\;/s) {
$nyt_title = $1;
$nyt_title =~ s/\n/ /gs;
$nyt_title =~ s/^\s+//;
$nyt_title =~ s/\s+$//;
$nyt_title =~ s/\\\'/'/g;
} elsif ($html =~ m/
(.*?)<\/NYT_HEADLINE>/s) { $nyt_title = $1;
$nyt_title =~ s/\n/ /gs;
$nyt_title =~ s/^\s+//;
$nyt_title =~ s/\s+$//;
} elsif ($html =~ m//s) {
$nyt_title = $1;
$nyt_title =~ s/\n/ /gs;
$nyt_title =~ s/^\s+//;
$nyt_title =~ s/\s+$//;
} elsif ($html =~ m/
(.*?)<\/h3>/s) {
$nyt_title = $1;
$nyt_title =~ s/\n/ /gs;
$nyt_title =~ s/^\s+//;
$nyt_title =~ s/\s+$//;
}
$nyt_title =~ s/\<\/?..?\>//g;
# Author
if ($html =~ m/
$nyt_author = $1;
$nyt_author =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
$nyt_author =~ s/^ *(BY )?//i;
$nyt_author =~ s/ +$//;
} elsif ($html =~ m/function getShareByline\(\) \{\s*return encodeURIComponent\(\'By (.*?)\'\)\;/s) {
$nyt_author = $1;
$nyt_author =~ s/^\s+//;
$nyt_author =~ s/\s+$//;
$nyt_author =~ s/\\\'/'/g;
} elsif ($html =~ m//s) {
$nyt_author = $1;
$nyt_author =~ s/^\s+//;
$nyt_author =~ s/\s+$//;
}
# Date
if ($html =~ m/
$nyt_date = $1;
$nyt_date =~ s/^ *(\d\d\d\d)(\d\d)(\d\d) *$/$1-$2-$3/;
} elsif ($html =~ m//s) {
$nyt_date = $1;
} elsif ($html =~ m/