User:AnomieBOT/source/tasks/ReplaceExternalLinks5.pm

{{ombox|type=notice|text= Approved 2011-12-04.
Wikipedia:Bots/Requests for approval/AnomieBOT 60}}

package tasks::ReplaceExternalLinks5;

=pod

=begin metadata

Bot: AnomieBOT

Task: ReplaceExternalLinks5

BRFA: Wikipedia:Bots/Requests for approval/AnomieBOT 60

Status: Approved 2011-12-04

Created: 2011-11-30

OnDemand: true

Add archiveurl for dead or dying links, when an archive can be found at

archive.org or webcitation.org, and optionally tag unarchived links with

{{tl|dead link}} or a similar template.

=end metadata

=cut

use utf8;

use strict;

use Data::Dumper;

use POSIX;

use Date::Parse;

use LWP::UserAgent;

use XML::LibXML;

use HTML::Entities ();

use URI;

use AnomieBOT::Task qw/:time/;

use vars qw/@ISA/;

@ISA=qw/AnomieBOT::Task/;
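
# Text appended to edit summaries ("... per $req").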

my $req='request';

# Useful character sets

my $chars='[^][<>"\x00-\x20\x7F\p{Zs}]';

my $dchars='[^][<>"\x00-\x20\x7F\p{Zs}/.]';

my $portre=qr!(?::\d+)?!;

# Template for marking dead links. Set undef for no tagging.

my $deadlink=undef; #'dead link';

# euquery values to search for

my @euquery=('*.gamepro.com');

# Regular expression matching links to replace. No protocol.

my $linkre=qr!(?:$dchars+\.)*(?i:gamepro\.com)$portre\/!;

# Description of links

my $desc='Gamepro';

# Marker to indicate where {{dead link}} should be removed

my $rmdl="\x02*\x03";

# Placeholder for when {{dead link}} tagging is disabled

my $nodl="\x02x\x03";

# The text part of a bracketed link

my $btext=qr/ *[^\]\x00-\x08\x0a-\x1F]*?/;

# Protocol re

my $proto1=qr!(?:https?:)?//!;

my $proto2=qr!https?://!;

sub new {

my $class=shift;

my $self=$class->SUPER::new();

$self->{'iter'}=undef;

$self->{'ua'}=LWP::UserAgent->new(

agent=>"AnomieBOT link checker for en.wikipedia.org (https://en.wikipedia.org/wiki/Wikipedia:Bots/Requests_for_approval/AnomieBOT_60)",

keep_alive=>300,

);

# Unfortunately, webcite seems to like quoting back the url without

# encoding ampersands in certain error messages.

$self->{'xml'}=XML::LibXML->new(recover=>1);

$self->{'protocols'}=[];

bless $self, $class;

return $self;

}

=pod

=for info

Approved 2011-12-04.
Wikipedia:Bots/Requests for approval/AnomieBOT 60

=cut

sub approved {

return -1;

}

sub run {

my ($self, $api)=@_;

my $res;

$api->task('ReplaceExternalLinks5', 0, 10, qw/d::Redirects d::Templates d::Nowiki/);

my $screwup='Errors? User:'.$api->user.'/shutoff/ReplaceExternalLinks5';

# Spend a max of 5 minutes on this task before restarting

my $endtime=time()+300;

# Get list of citation templates

my %templates=$api->redirects_to_resolved(

'Template:Citation',

'Template:Citation metadata',

'Template:Cite api',

'Template:Cite book',

'Template:Cite conference',

'Template:Cite IETF',

'Template:Cite interview',

'Template:Cite journal',

'Template:Cite mailing list',

'Template:Cite news',

'Template:Cite press release',

'Template:Cite video',

'Template:Cite web',

'Template:Unicite',

'Template:Vancite conference',

'Template:Vancite journal',

'Template:Vancite news',

'Template:Vancite web',

'Template:Vcite conference',

'Template:Vcite journal',

'Template:Vcite news',

'Template:Vcite web',

);

if(exists($templates{''})){

$api->warn("Failed to get citation template redirects: ".$templates{''}{'error'}."\n");

return 60;

}

# Get regex for finding {{dead link}}

my (%dl,$dlre);

if(defined($deadlink)){

%dl=$api->redirects_to_resolved($deadlink);

if(exists($dl{''})){

$api->warn("Failed to get dead link template redirects: ".$dl{''}{'error'}."\n");

return 60;

}
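
# Build a regex matching {{dead link}} or any of its redirects, with the first

# letter case-insensitive and spaces/underscores interchangeable.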

$dlre='{{(?i:\s*Template\s*:)?\s*(?:'.join('|',map { $_="\Q$_\E"; s/^Template\\:(.)/(?i:$1)/; s/\\ /[ _]/g; $_; } keys %dl).')(?>\s*(?s:\|.*?)?}})';

$dlre=qr/$dlre/;

} else {

%dl=();
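
# (*F) forces the regex to always fail, so nothing is ever matched when tagging is disabled.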

$dlre=qr/(*F)x/;

}

$self->{'protocols'}=[qw/http https/] unless @{$self->{'protocols'}};
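
# exturlusage only searches one protocol at a time, so query http and https separately.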

while(@{$self->{'protocols'}}){

if(!defined($self->{'iter'})){

$self->{'iter'}=$api->iterator(

list => 'exturlusage',

euprop => 'title',

euquery => [@euquery],

euprotocol => shift @{$self->{'protocols'}},

eunamespace => '0',

eulimit => '1000', # exturlusage has issues with big lists

);

}

while(my $pg=$self->{'iter'}->next){

if(!$pg->{'_ok_'}){

$api->warn("Failed to retrieve page list for ".$self->{'iter'}->iterval.": ".$pg->{'error'}."\n");

return 60;

}

return 0 if $api->halting;

my $page=$pg->{'title'};

my $tok=$api->edittoken($page, EditRedir => 1);

if($tok->{'code'} eq 'shutoff'){

$api->warn("Task disabled: ".$tok->{'content'}."\n");

return 300;

}

if($tok->{'code'} ne 'success'){

$api->warn("Failed to get edit token for $page: ".$tok->{'error'}."\n");

next;

}

if(exists($tok->{'missing'})){

$api->warn("WTF? $page does not exist?\n");

next;

}

# Setup flags
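
# cite: archiveurl added to a citation; link: raw link replaced; 404: dead link with no archive; fail: lookup error, retry later.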

$self->{'flags'}={cite=>0,link=>0,404=>0,fail=>0};

my $intxt=$tok->{'revisions'}[0]{'slots'}{'main'}{'*'};

my $outtxt=$intxt;

# Replace the links. First, do citation templates.

my $nowiki;

$outtxt=$api->process_templates($outtxt, sub {

return undef if $self->{'flags'}{'fail'};

my $name=shift;

my $params=shift;

my $wikitext=shift;

my $data=shift;

my $oname=shift;

return undef unless exists($templates{"Template:$name"});

my $ret="{{$oname";

my $archived=0;

my $url='';

my ($accessdate,$date,$year,$month);

$year=$month='XXX';

foreach ($api->process_paramlist(@$params)){

$_->{'name'}=~s/^\s+|\s+$//g;

$_->{'value'}=~s/^\s+|\s+$//g;

if($_->{'name'} eq 'url'){

$url=$_->{'value'};

} elsif($_->{'name'} eq 'accessdate'){

$accessdate=str2time($_->{'value'});

} elsif($_->{'name'} eq 'date'){

$date=str2time($_->{'value'});

} elsif($_->{'name'} eq 'year' && $_->{'value'}=~/^\d+$/){

$year=$_->{'value'};

} elsif($_->{'name'} eq 'month'){

$month=$_->{'value'};

} elsif($_->{'name'} eq 'archiveurl'){

$archived=1;

}

$ret.='|'.$_->{'text'};

}

my $r404='';

if(!$archived && $url=~m!^$proto1$linkre!){

my ($u,$dt);
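
# Best guess at when the link was live: accessdate, then date, then |year=/|month=, falling back to now.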

$dt=$accessdate // $date // str2time("1 $month $year") // str2time("1 June $year") // time();

($u,$dt,$r404)=chkExtLink($self,$api,0,$url, $dt);

return undef if($self->{'flags'}{'fail'});

$ret.="|archiveurl=$u|archivedate=$dt" unless $r404;

if(!$r404){

$ret=~s/$rmdl//g;

$r404=$rmdl;

}

}

$ret.="}}".$r404;

return $ret;

});

return 60 if($self->{'flags'}{'fail'});

# Next, strip for raw link processing

# Regular expressions are adapted from those MediaWiki uses to

# recognize external links.

($outtxt,$nowiki)=$api->strip_nowiki($outtxt);

($outtxt,$nowiki)=$api->strip_templates($outtxt, sub {

my $name=shift;

return exists($templates{"Template:$name"});

}, {}, $nowiki);

# Strip out ref tags, then replace any links with a guess at access

# date.

($outtxt,$nowiki)=$api->strip_regex(qr!<ref\b[^>]*>.*?</ref>!is, $outtxt, $nowiki);

my @arc=qw/[aA]rchive webcitation\.org [wW]ayback/;

my $arc='(?:'.join('|',@arc).')';
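
# Process each stripped <ref>...</ref> chunk separately, so a nearby "accessed"/"retrieved" date can be applied to the links inside it.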

while(my ($k,$v)=each %$nowiki){

next unless $v=~/^<ref/;

next if $v=~/$arc/;

my ($dt,$nw);

# We have to re-strip here, because the saved values here are

# automatically unstripped.

($v,$nw)=$api->strip_nowiki($v);

($v,$nw)=$api->strip_templates($v, sub {

my $name=shift;

return exists($templates{"Template:$name"});

}, {}, $nw);

$dt=str2time($1) if $v=~/(?:accessed|retrieved)(?: +on)? +(\d{4}-\d{2}-\d{2}|\d+ \w+,? \d{4}|\w+ \d+,? \d{4})/i;

$v=~s{\[($proto1$linkre$chars*)($btext)\]}{ chkExtLink($self,$api,1,$1,$dt // time(),$2) }ge;

return 60 if($self->{'flags'}{'fail'});

($v,$nw)=$api->strip_regex(qr{\[$proto1$chars+$btext\]}, $v, $nw);

$v=~s{\b($proto2$linkre$chars*)}{ chkExtLink($self,$api,2,$1,$dt // time()) }ge;

return 60 if($self->{'flags'}{'fail'});

$v=$api->replace_stripped($v,$nw);

$nowiki->{$k}=$v;

}

# Fix any bracketed external link that doesn't have "Archive" or the

# like in the line after it.

$outtxt=~s{\[($proto1$linkre$chars*)($btext)\](?!.*$arc)}{ chkExtLink($self,$api,1,$1,time(),$2) }ge;

return 60 if($self->{'flags'}{'fail'});

# Hide all bracketed external links. We have to keep track of the

# replacement token for the ones that have "Archive" etc in their

# display text.

($outtxt,$nowiki)=$api->strip_regex(qr{\[$proto1$chars+$btext\]}, $outtxt, $nowiki);

while(my ($k,$v)=each %$nowiki){

push @arc, $k if $v=~m!^\[$proto1$chars+ *.*$arc!;

}

$arc='(?:'.join('|',@arc).')';

# Fix any bare external link that doesn't have "Archive" or the like in

# the line after it.

$outtxt=~s{\b($proto2$linkre$chars+)(?!.*$arc)}{ chkExtLink($self,$api,2,$1,time()) }ge;

return 60 if($self->{'flags'}{'fail'});

# Unstrip

$outtxt=$api->replace_stripped($outtxt,$nowiki);

# Remove "no-dead-link" markers

$outtxt=~s/\Q$nodl\E//g;

# rm marked {{dead link}} templates (and $rmdl markers)

$outtxt=~s/\Q$rmdl\E(?:\s*$dlre)*//g;

# rm duplicate {{dead link}} templates too

$outtxt=~s/$dlre+($dlre)/$1/g;

if($outtxt ne $intxt){

my @summary=();

push @summary, "adding archiveurl for archived $desc cites" if $self->{'flags'}{'cite'};

push @summary, "changing archived $desc links" if $self->{'flags'}{'link'};

push @summary, "tagging dead $desc links" if($self->{'flags'}{'404'} && defined($deadlink));

unless(@summary){

$api->warn("Changes made with no summary for $page, not editing");

next;

}

$summary[$#summary]='and '.$summary[$#summary] if @summary>1;

my $summary=ucfirst(join((@summary>2)?', ':' ', @summary));

$summary.=" per $req";

$api->log("$summary in $page");

my $r=$api->edit($tok, $outtxt, "$summary. $screwup", 1, 1);

if($r->{'code'} ne 'success'){

$api->warn("Write failed on $page: ".$r->{'error'}."\n");

next;

}

}

# If we've been at it long enough, let another task have a go.

return 0 if time()>=$endtime;

}

$self->{'iter'}=undef;

}

$api->log("May be DONE!");

return 600;

}

sub chkExtLink {
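
# chkExtLink($self, $api, $fmt, $url, $date, [$text]): find an archived copy of $url near $date.

# $fmt is 0 for a citation template, 1 for a bracketed external link, 2 for a bare external link.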

my $self=shift;

if($self->{'flags'}{'fail'}){

return wantarray?('fail','fail','fail'):'fail';

}

my $api=shift;

my $fmt=shift;

my $url=shift;

my $date=shift;

my $txt='';

if($fmt==2){

# Duplicate Mediawiki post-processing of bare external links
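
# Trailing punctuation, angle brackets (or their entities), and anything from '{{' on

# are not part of the link; split them into $txt and re-append them unchanged.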

$txt=$1.$txt if $url=~s/((?:[<>]|&[lg]t;).*$)//;

my $sep=',;\.:!?';

$sep.=')' unless $url=~/\(/;

$txt=$1.$txt if $url=~s/([$sep]+$)//;

# There shouldn't be a template inside the url

$txt=$1.$txt if $url=~s/(\{\{.*$)//;

return $url.$txt unless $url=~m!^$proto2$linkre!;

}

# Get archive link and date

my @archives;

my ($u, $dt);
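
# Archive lookups are cached in the task's store, so each URL only has to be checked once.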

if(exists($api->store->{$url})){

@archives=@{$api->store->{$url}};

} else {
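
# Build the Wayback Machine listing URL ("web/*/URL"), dropping the protocol from the target URL.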

($u="http://web.archive.org/web/*/$url")=~s!/$proto1!/!;

$api->log("... Checking $u");

# Screen-scrape archive.org

my $r=$self->{'ua'}->get($u);

if($r->is_success){

foreach $_ ($r->decoded_content=~m!href="(http://web.archive.org/web/\d+/[^\x22]*)"!g) {

$_ = HTML::Entities::decode($_);

$api->log("... Got $_");

if(m!^http://web.archive.org/web/(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})!){

$dt=timegm($6,$5,$4,$3,$2-1,$1-1900);

} else {

$dt=time();

}

push @archives, [$dt, $_];

}

} elsif($r->code eq '404'){

$api->log("... Failed with ".$r->code);

} elsif($r->code eq '403' && $r->decoded_content=~m!Blocked Site Error\..*?\Q$url\E is not available in the Wayback Machine!s){

$api->log("... Failed with 403 'not available in the Wayback Machine'");

} else {

$api->log("... Failed with ".$r->code.", will retry later");

$self->{'flags'}{'fail'}=1;

return chkExtLink($self);

}

# check webcite too

$u=URI->new('http://www.webcitation.org/query');

$u->query_form(url=>$url,returnxml=>1);

$u=$u->as_string;

$api->log("... Checking $u");

$r=$self->{'ua'}->get($u);

if($r->is_success){

my $xml=$self->{'xml'}->load_xml(string=>$r->decoded_content);

if($xml){
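
# Each <result status="success"> element carries the timestamp of an archived copy.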

foreach $_ (@{$xml->find('//result[@status=\'success\']')}){

$dt=$_->find('./timestamp');

my $uu=URI->new('http://www.webcitation.org/query');

$uu->query_form(url=>$url,date=>$dt);

$uu=$uu->as_string;

# Not exactly RFC-compliant, but it works fine

$uu=~s/%3A/:/g; $uu=~s/%2F/\//g;

$api->log("... Got $uu");

push @archives, [str2time($dt) // time(), $uu];

}

} else {

$api->log("... Invalid XML data");

$self->{'flags'}{'fail'}=1;

return chkExtLink($self);

}

} elsif($r->code eq '404'){

$api->log("... Failed with ".$r->code);

} else {

$api->log("... Failed with ".$r->code.", will retry later");

$self->{'flags'}{'fail'}=1;

return chkExtLink($self);

}

$api->store->{$url}=\@archives;

}

# Then pull the closest archive to the accessdate or whatever.
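
# $r404 starts out as the {{dead link}} tag (or the no-tagging placeholder) and is cleared once any archive is found.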

my ($diff,$r404)=(1e100,defined($deadlink)?"{{$deadlink|date=".strftime('%B %Y', gmtime)."|bot=".$api->user."}}":$nodl);

$u=undef;

foreach $_ (@archives){

if(abs($_->[0] - $date) < $diff){

$diff=abs($_->[0] - $date);

($dt,$u)=@$_;

$r404='';

}

}

if($r404){

$self->{'flags'}{'404'}=1;

} elsif($fmt==0){

$self->{'flags'}{'cite'}=1;

} else {

$self->{'flags'}{'link'}=1;

}

if($fmt==0){ # cite template

return ($u,strftime('%Y-%m-%d',gmtime($dt // 0)),$r404);

} elsif($fmt==1){ # Bracketed external link

my $txt=shift;

return $r404?"[$url$txt]$r404":"[$u$txt]$rmdl";

} elsif($fmt==2){ # Bare external link

return ($r404?"[$url $url]$r404":"$u$rmdl").$txt.$rmdl;

} else {

return undef;

}

}

1;