User:AnomieBOT/source/tasks/ReplaceExternalLinks5.pm
{{ombox|type=notice|text= Approved 2011-12-04.
Wikipedia:Bots/Requests for approval/AnomieBOT 60}}
package tasks::ReplaceExternalLinks5;
=pod
=begin metadata
Bot: AnomieBOT
Task: ReplaceExternalLinks5
BRFA: Wikipedia:Bots/Requests for approval/AnomieBOT 60
Status: Approved 2011-12-04
Created: 2011-11-30
OnDemand: true
Add archiveurl for dead or dying links, when an archive can be found at
archive.org or webcitation.org, and optionally tag unarchived links with
{{tl|dead link}} or a similar template.
=end metadata
=cut
use utf8;
use strict;
use Data::Dumper;
use POSIX;
use Date::Parse;
use LWP::UserAgent;
use XML::LibXML;
use HTML::Entities ();
use URI;
use AnomieBOT::Task qw/:time/;
use vars qw/@ISA/;
@ISA=qw/AnomieBOT::Task/;
my $req='request';
# Useful character sets
my $chars='[^][<>"\x00-\x20\x7F\p{Zs}]';
my $dchars='[^][<>"\x00-\x20\x7F\p{Zs}/.]';
my $portre=qr!(?::\d+)?!;
# Template for marking dead links. Set undef for no tagging.
my $deadlink=undef; #'dead link';
# euquery values to search for
my @euquery=('*.gamepro.com');
# Regular expression matching links to replace. No protocol.
my $linkre=qr!(?:$dchars+\.)*(?i:gamepro\.com)$portre\/!;
# Description of links
my $desc='Gamepro';
# Marker to indicate where {{dead link}} should be removed
my $rmdl="\x02*\x03";
# Placeholder for when {{dead link}} should not be added
my $nodl="\x02x\x03";
# The text part of a bracketed link
my $btext=qr/ *[^\]\x00-\x08\x0a-\x1F]*?/;
# Protocol regexes
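# $proto1 also matches protocol-relative ("//") links; $proto2 requires an explicit http or https scheme.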
my $proto1=qr!(?:https?:)?//!;
my $proto2=qr!https?://!;
sub new {
my $class=shift;
my $self=$class->SUPER::new();
$self->{'iter'}=undef;
$self->{'ua'}=LWP::UserAgent->new(
agent=>"AnomieBOT link checker for en.wikipedia.org (https://en.wikipedia.org/wiki/Wikipedia:Bots/Requests_for_approval/AnomieBOT_60)",
keep_alive=>300,
);
# Unfortunately, webcite seems to like quoting back the url without
# encoding ampersands in certain error messages.
$self->{'xml'}=XML::LibXML->new(recover=>1);
$self->{'protocols'}=[];
bless $self, $class;
return $self;
}
=pod
=for info
Approved 2011-12-04.
Wikipedia:Bots/Requests for approval/AnomieBOT 60
=cut
sub approved {
return -1;
}
sub run {
my ($self, $api)=@_;
my $res;
$api->task('ReplaceExternalLinks5', 0, 10, qw/d::Redirects d::Templates d::Nowiki/);
my $screwup='Errors? User:'.$api->user.'/shutoff/ReplaceExternalLinks5';
# Spend a max of 5 minutes on this task before restarting
my $endtime=time()+300;
# Get list of citation templates
my %templates=$api->redirects_to_resolved(
'Template:Citation',
'Template:Citation metadata',
'Template:Cite api',
'Template:Cite book',
'Template:Cite conference',
'Template:Cite IETF',
'Template:Cite interview',
'Template:Cite journal',
'Template:Cite mailing list',
'Template:Cite news',
'Template:Cite press release',
'Template:Cite video',
'Template:Cite web',
'Template:Unicite',
'Template:Vancite conference',
'Template:Vancite journal',
'Template:Vancite news',
'Template:Vancite web',
'Template:Vcite conference',
'Template:Vcite journal',
'Template:Vcite news',
'Template:Vcite web',
);
if(exists($templates{''})){
$api->warn("Failed to get citation template redirects: ".$templates{''}{'error'}."\n");
return 60;
}
# Get regex for finding {{dead link}}
my (%dl,$dlre);
if(defined($deadlink)){
%dl=$api->redirects_to_resolved($deadlink);
if(exists($dl{''})){
$api->warn("Failed to get dead link template redirects: ".$dl{''}{'error'}."\n");
return 60;
}
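# Build a regex matching {{dead link}} or any of its redirects, with an optional "Template:" prefix, spaces or underscores in the name, and any parameters.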
$dlre='{{(?i:\s*Template\s*:)?\s*(?:'.join('|',map { $_="\Q$_\E"; s/^Template\\:(.)/(?i:$1)/; s/\\ /[ _]/g; $_; } keys %dl).')(?>\s*(?s:\|.*?)?}})';
$dlre=qr/$dlre/;
} else {
%dl=();
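# No tagging template configured, so use a regex that can never match ((*F) forces the match to fail).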
$dlre=qr/(*F)x/;
}
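# Default to both http and https; an interrupted run may have left only some protocols still pending.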
$self->{'protocols'}=[qw/http https/] unless @{$self->{'protocols'}};
while(@{$self->{'protocols'}}){
if(!defined($self->{'iter'})){
$self->{'iter'}=$api->iterator(
list => 'exturlusage',
euprop => 'title',
euquery => [@euquery],
euprotocol => shift @{$self->{'protocols'}},
eunamespace => '0',
eulimit => '1000', # exturlusage has issues with big lists
);
}
while(my $pg=$self->{'iter'}->next){
if(!$pg->{'_ok_'}){
$api->warn("Failed to retrieve page list for ".$self->{'iter'}->iterval.": ".$pg->{'error'}."\n");
return 60;
}
return 0 if $api->halting;
my $page=$pg->{'title'};
my $tok=$api->edittoken($page, EditRedir => 1);
if($tok->{'code'} eq 'shutoff'){
$api->warn("Task disabled: ".$tok->{'content'}."\n");
return 300;
}
if($tok->{'code'} ne 'success'){
$api->warn("Failed to get edit token for $page: ".$tok->{'error'}."\n");
next;
}
if(exists($tok->{'missing'})){
$api->warn("WTF? $page does not exist?\n");
next;
}
# Setup flags
$self->{'flags'}={cite=>0,link=>0,404=>0,fail=>0};
my $intxt=$tok->{'revisions'}[0]{'slots'}{'main'}{'*'};
my $outtxt=$intxt;
# Replace the links. First, do citation templates.
my $nowiki;
$outtxt=$api->process_templates($outtxt, sub {
return undef if $self->{'flags'}{'fail'};
my $name=shift;
my $params=shift;
my $wikitext=shift;
my $data=shift;
my $oname=shift;
return undef unless exists($templates{"Template:$name"});
my $ret="{{$oname";
my $archived=0;
my $url='';
my ($accessdate,$date,$year,$month);
$year=$month='XXX';
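# Collect the url, dates, and archiveurl status from the template parameters while rebuilding the template text.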
foreach ($api->process_paramlist(@$params)){
$_->{'name'}=~s/^\s+|\s+$//g;
$_->{'value'}=~s/^\s+|\s+$//g;
if($_->{'name'} eq 'url'){
$url=$_->{'value'};
} elsif($_->{'name'} eq 'accessdate'){
$accessdate=str2time($_->{'value'});
} elsif($_->{'name'} eq 'date'){
$date=str2time($_->{'value'});
} elsif($_->{'name'} eq 'year' && $_->{'value'}=~/^\d+$/){
$year=$_->{'value'};
} elsif($_->{'name'} eq 'month'){
$month=$_->{'value'};
} elsif($_->{'name'} eq 'archiveurl'){
$archived=1;
}
$ret.='|'.$_->{'text'};
}
my $r404='';
if(!$archived && $url=~m!^$proto1$linkre!){
my ($u,$dt);
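# Best guess at when the link was last good: accessdate, then date, then year/month, falling back to the current time.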
$dt=$accessdate // $date // str2time("1 $month $year") // str2time("1 June $year") // time();
($u,$dt,$r404)=chkExtLink($self,$api,0,$url, $dt);
return undef if($self->{'flags'}{'fail'});
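# If an archive was found, add it to the citation and leave the $rmdl marker after the template so an adjacent {{dead link}} tag can be removed.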
$ret.="|archiveurl=$u|archivedate=$dt" unless $r404;
if(!$r404){
$ret=~s/$rmdl//g;
$r404=$rmdl;
}
}
$ret.="}}".$r404;
return $ret;
});
return 60 if($self->{'flags'}{'fail'});
# Next, strip for raw link processing
# Regular expressions are adapted from those MediaWiki uses to
# recognize external links.
($outtxt,$nowiki)=$api->strip_nowiki($outtxt);
($outtxt,$nowiki)=$api->strip_templates($outtxt, sub {
my $name=shift;
return exists($templates{"Template:$name"});
}, {}, $nowiki);
# Strip out ref tags, then replace any links with a guess at access
# date.
($outtxt,$nowiki)=$api->strip_regex(qr!<ref\b[^>]*/>|<ref\b[^>]*>.*?</ref\s*>!s, $outtxt, $nowiki);
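# Pattern pieces used to recognize text that already mentions an archive (archive, webcitation.org, Wayback).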
my @arc=qw/[aA]rchive webcitation\.org [wW]ayback/;
my $arc='(?:'.join('|',@arc).')';
while(my ($k,$v)=each %$nowiki){