User:FairuseBot/libBot.pm
- !/usr/bin/perl
- libBot: A Perl module of useful routines for running a bot
package libBot;
use strict;
use warnings;
use Pearle;
use Data::Dumper;
use Array::Utils;
require Exporter;
our @ISA = qw(Exporter);
our @EXPORT = qw(config usernotify wikilog botwarnlog notelog LoadInfoboxPatterns FixupLinks MakeWikiRegex DoIHaveMessages GetPageCategories GetPageTemplates GetLinksOnPage GetPageText GetPageList GetFullPageList GetImageNames SaveImage UpdateLink RemoveImageFromPage ReplaceImage IsNotified IsPageNotified isDated getDate getUploadDates getLastEditDate GetImageUploader loadNotificationList saveNotificationList usesTemplate DoesPageExist);
our $VERSION = 1.00;
my $test_only = 0;
my $username = "";
my @infobox_patterns = ();
sub config
{
my %params = @_;
$test_only = $params{test_only} if(defined($params{test_only}));
$username = $params{username} if(defined($params{username}));
}
- Log a warning on a user's talkpage, using an existing edit session
sub usernotify
{
my ($wikipage, $text, $user, $summary);
$wikipage = $_[1];
$summary = $_[2];
$summary = "Logging warning message" if(!defined($summary));
if(!$wikipage->isa("Pearle::WikiPage"))
{
Pearle::myLog(0, "usernotify(): Second parameter is not a WikiPage object\n");
die "usernotify(): Second parameter is not a WikiPage object\n";
}
# We've been handed an editing session
Pearle::myLog(4, "Warning with existing edit session\n");
if($test_only)
{
print STDERR $_[0];
return;
}
if($wikipage->getWikiText() =~ /^#redirect/i)
{
botwarnlog("*User talk page User talk:$user is a redirect\n");
return;
}
$text = $wikipage->getEditableText();
$text .= $_[0];
$wikipage->setEditableText($text);
Pearle::postPage($wikipage, $summary, 0);
print STDERR $_[0];
}
- General-purpose on-Wiki logging routine
sub wikilog
{
my($target, $text, $token, $summary);
$target = $_[0];
$text = $_[1];
$summary = $_[2] || "Logging note";
chomp($text);
$text = "\n$text" if($text !~ /^\n/); # The edit API eats trailing newlines, so prepend a newline if the message doesn't already have one.
eval
{
$token = Pearle::getToken($target);
};
if($@)
{
if($@ =~ /^925/)
{
Pearle::myLog(1, "Failed to notify: Protected page $target\n");
return;
}
else
{
die;
}
}
if($test_only)
{
print STDERR $_[1];
return;
}
Pearle::appendToPage($target, $token, $text, $summary, 0);
}
- Log a warning on the talk page of the bot
sub botwarnlog
{
my ($page, $text, $summary);
$text = $_[0];
$summary = $_[1];
$summary = "Logging warning message" if(!defined($summary));
$page = "User talk:${username}/log";
wikilog($page, $text, $summary);
}
- Log a notification message to the console
sub notelog
{
print STDERR @_;
}
- Fix all wikilinks in a string so that they shows as a link, not inline, if it's for a category or image
sub FixupLinks
{
my $link = shift;
$link =~ s/\[\[(Category|Image|File)/[[:$1/g;
return $link;
}
- Make a string into a Wikipedia-compatible regex
sub MakeWikiRegex
{
my $string = shift;
my @chars = split //, $string;
my $result = '';
return undef if(!defined($string));
foreach my $char (@chars)
{
# Escape metacharacters, and add percent-encoding for certain characters
if($char eq '\\') {$result .= '\\\\';}
elsif($char eq '/') {$result .= '(?i:\/|%2F)';}
elsif($char eq '.') {$result .= '\.';}
elsif($char eq '(') {$result .= '(?i:\(|%28)';}
elsif($char eq ')') {$result .= '(?i:\)|%29)';}
elsif($char eq '[') {$result .= '\[';}
elsif($char eq ']') {$result .= '\]';}
elsif($char eq '+') {$result .= '\+';}
elsif($char eq '*') {$result .= '\*';}
elsif($char eq '?') {$result .= '(?i:\?|%3F)';}
elsif($char eq '^') {$result .= '\^';}
elsif($char eq '$') {$result .= '\$';}
elsif($char eq '&') {$result .= '(?i:&|%26)';}
elsif($char eq '!') {$result .= '(?i:!|%21)';}
elsif($char eq '~') {$result .= '(?i:~|%7E)';}
elsif($char eq "'") {$result .= "(?i:'|%27)";}
elsif($char eq '"') {$result .= '(?i:"|%22)';}
elsif($char eq ',') {$result .= '(?i:,|%2C)';}
else {$result .= $char;}
}
# Process the string to match both with spaces and with underscores
$result =~ s/[ _]/[ _]+/g;
# Process the string to match both upcase and lowercase first characters
if($result =~ /^[A-Za-z]/)
{
$result =~ s/^(.)/"[$1".lc($1)."]"/e;
}
return $result;
}
sub HTMLEncode
{
my $char = shift;
return sprintf("&X%X;", ord($char));
}
- Make a string into something that can match most image name formats
sub MakeFancyRegex
{
my $string = shift;
my @chars = split //, $string;
my $result;
foreach my $char (@chars)
{
if($char eq '\\')
{
$result .= "(\\\\|%5C|%5c|&x5C;)";
}
elsif($char eq '.')
{
}
elsif($char eq '(')
{
}
elsif($char eq ')')
{
}
else
{
$result .= "($char|" . uri_escape_utf8($char) . "|" . lc(uri_escape_utf8($char)) . "|" . HTMLEncode($char) . "|" . lc(HTMLEncode($char)) . ")";
}
}
return $result;
}
- Check for new talk page messages
sub DoIHaveMessages
{
my $xml = shift;
my $parsed_xml = ref($xml)?$xml:Pearle::getXMLParser()->XMLin($xml);
if(exists($parsed_xml->{query}->{userinfo}->{messages}) and defined($parsed_xml->{query}->{userinfo}->{messages}))
{
return 1;
}
else
{
return 0;
}
}
sub GetPageCategories
{
my $xml = shift;
my @pages = ();
if(defined($xml))
{
my $parsed_xml = ref($xml)?$xml:Pearle::getXMLParser()->XMLin($xml);
Pearle::myLog(4, Dumper($parsed_xml));
if(exists($parsed_xml->{query}->{pages}->{page}->{categories}->{cl}) and defined($parsed_xml->{query}->{pages}->{page}->{categories}->{cl}))
{
if(ref($parsed_xml->{query}->{pages}->{page}->{categories}->{cl}) eq 'ARRAY')
{
my @all_pages = @{$parsed_xml->{query}->{pages}->{page}->{categories}->{cl}};
@pages = map {$_->{title}} @all_pages;
}
else
{
@pages = ($parsed_xml->{query}->{pages}->{page}->{categories}->{cl}->{title});
}
}
}
return @pages;
}
sub GetLinksOnPage
{
my $xml = shift;
my @pages = ();
if(defined($xml))
{
my $parsed_xml = ref($xml)?$xml:Pearle::getXMLParser()->XMLin($xml);
Pearle::myLog(4, Dumper($parsed_xml));
if(exists($parsed_xml->{query}->{pages}->{page}->{links}->{pl}) and defined($parsed_xml->{query}->{pages}->{page}->{links}->{pl}))
{
if(ref($parsed_xml->{query}->{pages}->{page}->{links}->{pl}) eq 'ARRAY')
{
my @all_pages = @{$parsed_xml->{query}->{pages}->{page}->{links}->{pl}};
@pages = map {$_->{title}} @all_pages;
}
else
{
@pages = ($parsed_xml->{query}->{pages}->{page}->{links}->{pl}->{title});
}
}
}
return @pages;
}
sub GetPageText
{
my $xml = shift;
my $text = undef;
if(defined($xml))
{
my $parsed_xml = ref($xml)?$xml:Pearle::getXMLParser()->XMLin($xml);
Pearle::myLog(4, Dumper($parsed_xml));
if(exists($parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}->{content}) and defined($parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}->{content}))
{
if(ref($parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}->{content}))
{
# The API/XML parser interact to produce a HASH ref if the revision is empty
$text = "";
}
else
{
$text = $parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}->{content};
}
}
}
return $text;
}
- Input: XML from the API, generated with prop => 'templates' and with only a single title
- Either as text or as a parsed tree
- Returns: A list of templates used by the page
- Side effects: None
sub GetPageTemplates
{
my $xml = shift;
my @templates;
if(defined($xml))
{
my $parsed_xml = ref($xml)?$xml:Pearle::getXMLParser()->XMLin($xml, ForceArray => ['tl']);
if(exists($parsed_xml->{query}->{pages}->{page}->{templates}->{tl}) and defined($parsed_xml->{query}->{pages}->{page}->{templates}->{tl}))
{
@templates = map {$_->{title}} @{$parsed_xml->{query}->{pages}->{page}->{templates}->{tl}};
}
}
return @templates;
}
- Input: XML, either a tree produced by parsing, or XML text
- Returns: A list of pages that this image is used on
- Side effects: For pages in certain namespaces, posts on the bot's log page
sub GetPageList
{
my $xml = shift;
my $image;
my @pages = ();
if(defined($xml))
{
my $parsed_xml = ref($xml)?$xml:Pearle::getXMLParser()->XMLin($xml, ForceArray => ['iu']);
my $image = $parsed_xml->{query}->{pages}->{page}->{title};
Pearle::myLog(4, Dumper($parsed_xml));
if(exists($parsed_xml->{query}->{imageusage}->{iu}) and defined($parsed_xml->{query}->{imageusage}->{iu}))
{
my @bad_pages = grep {$_->{ns} == 10 or $_->{ns} == 12} @{$parsed_xml->{query}->{imageusage}->{iu}};
my @good_pages = grep {$_->{ns} != 10 and $_->{ns} != 12} @{$parsed_xml->{query}->{imageusage}->{iu}};
@pages = map {$_->{title}} @good_pages;
if(scalar(@bad_pages) > 0 and defined($image)) # If "image" is undefined, we're probably doing a pure usage check, rather than one in preparation for removal
{
my $notice;
foreach my $page (@bad_pages)
{
$notice .= "*Found image :$image in $page->{title}\n";
}
botwarnlog($notice);
}
}
}
return @pages;
}
- Get all pages. Don't filter for bad namespaces.
sub GetFullPageList
{
my $xml = shift;
my $image;
my @pages = ();
if(defined($xml))
{
my $parsed_xml = ref($xml)?$xml:Pearle::getXMLParser()->XMLin($xml, ForceArray => ['iu']);
my $image = $parsed_xml->{query}->{pages}->{page}->{title};
Pearle::myLog(4, Dumper($parsed_xml));
if(exists($parsed_xml->{query}->{imageusage}->{iu}) and defined($parsed_xml->{query}->{imageusage}->{iu}))
{
@pages = map {$_->{title}} @{$parsed_xml->{query}->{imageusage}->{iu}};
}
}
return @pages;
}
- Input: XML from the API, generated with list => 'backlinks'. blfilterredir => 'redirects' is recommended but not mandatory.
- Either as text or as a parsed tree
- Returns: A list of redirects to the image
- Side effects: None
sub GetImageNames
{
my $xml = shift;
my @names;
if(defined($xml))
{
my $parsed_xml = ref($xml)?$xml:Pearle::getXMLParser()->XMLin($xml, ForceArray => ['bl']);
if(exists($parsed_xml->{query}->{backlinks}->{bl}) and defined($parsed_xml->{query}->{backlinks}->{bl}))
{
@names = map {$_->{title}} grep( {defined($_->{redirect})} @{$parsed_xml->{query}->{backlinks}->{bl}});
}
}
return @names;
}
sub UpdateLink
{
my $page = shift;
my $from = shift;
my $to = shift;
my $summary = shift || "Updating link to bypass a redirect or disambiguation page";
die "No page to edit" if(!defined($page));
die "No link to change" if(!defined($from));
die "No new link" if(!defined($to));
Pearle::myLog(3, "Updating link from $from to $to for page $page\n");
my $wikipage = Pearle::getPage($page);
$wikipage->canonicalizeLinks();
my $text = $wikipage->getEditableText();
my $link_regex = MakeWikiRegex($from);
my $matches = $text =~ s/\x01($link_regex)\x02/\x01${to}|${1}\x02/gi;
$matches += $text =~ s/\x01$link_regex([#|])/\x01${to}${1}/gi;
$matches += $text =~ s/([^=]\s*=\s*)$link_regex(\s*[|\n])/${1}$to${2}/gi;
$wikipage->setEditableText($text);
print $text;
if($matches > 0)
{
Pearle::postPage( $wikipage, $summary, 0);
}
else
{
Pearle::myLog(3, "No update\n");
}
return $matches;
}
sub RemoveImageFromPage
{
my $image = shift;
my $page = shift;
my $image_regex = shift;
my $removal_prefix = shift;
my $removal_comment = shift;
my $wikipage;
my $text;
my ($match1, $match2);
my $old_length;
my $new_length;
my $change_len;
my $match_len;
tryagain:
# Fetch an article page
$wikipage = Pearle::getPage($page);
$wikipage->canonicalizeLinks();
$text = $wikipage->getEditableText();
if(!defined($text))
{
Pearle::myLog(1, "Error: Bad edit page $page\n");
botwarnlog(FixupLinks("*Error: Bad edit page $page\n"));
sleep(300);
return 0;
}
if($text =~ /^\s*$/)
{
# Might be protected instead of empty
Pearle::myLog(1, "Error: Empty or protected page $page\n");
botwarnlog(FixupLinks("*Error: Empty or protected page $page\n"));
sleep(300);
return 0;
}
if($text =~ /^#redirect/i)
{
Pearle::myLog(1, "Redirect found for page $page (image :$image)\n");
botwarnlog(FixupLinks("*Redirect found for page $page (image :$image)\n"));
print $text;
return 0;
}
# Remove the image
my $regex3 = "([ \\t]*\x01${image_regex}[^\x01]*?(\x01[^\x02]*?\x02[^\x01]*?|)+\x02[ \\t]*)"; # Regex to match images
#my $regex3 = "(
# [ \\t]* # Any leading whitespace
# \x01 # Open double-bracket for the image
# ${image_regex} # The image itself
# [^\x01]*? # Anything up to the first link in the caption, or a closing double bracket (minimal match)
# (\x01 # Open double-bracket for a link in the caption
# [^\x02]*? # Anything but a closing double-bracket
# \x02 # The closing double-bracket for the link
# [^\x01]*?|) # Any non-link text, or nothing
# + # Matches one or more times
# \x02 # The closing double-bracket for the image
# [ \\t]*) # Any trailing whitespace
# ";
my $regex3ex = "\\w[ \\t]*${regex3}[ \\t]*\\w"; # Regex to try to spot inline images
my $regex3g = "(${image_regex}.*)"; # Regex to match gallery images
my ($raw_image) = $image =~ /(?:Image|File):(.*)/;
Pearle::myLog(3, "Regex 3: $regex3\n");
notelog("Regex 3 extended: $regex3ex\n");
notelog("Regex 3 gallery: $regex3g\n");
Pearle::myLog(3, "Raw regex: $raw_image\n");
if($text =~ /$regex3ex/)
{
Pearle::myLog(1, "Possible inline image in $page\n");
botwarnlog(FixupLinks("*Possible inline image :$image in $page\n"));
return 0; # Can't do gallery matching because that also matches regular images, and odds are, we don't have an infobox
}
$text =~ /$regex3/;
$match_len = length($1);
my @matches = $text =~ /$regex3/g;
if(grep {$_ =~ /[\x{F0000}-\x{FFFFF}]/} @matches) # If any images have comments in their captions, we can't remove them
{
botwarnlog(FixupLinks("*Comment in image in $page\n"));
goto skipregular;
}
if(defined($removal_prefix))
{
$match2 = $text =~ s/$regex3//g;
}
else
{
$match2 = $text =~ s/$regex3//g;
}
$new_length = length($text);
print "Num: $match2 Len: $match_len\n";
if($match2)
{
if($match_len < (2 + length($image)))
{
Pearle::myLog(0, "Short replacement of $match_len bytes (min " . (length($image) + 2) . ") in $page ($match2 matches). Exiting.\n");
Pearle::myLog(0, "Text:\n$text\n");
print Dumper($1);
print Dumper($image);
exit;
}
# If many matches, log a warning
if($match2 > 2)
{
Pearle::myLog(3, "More than one match ($match2) in page $page\n");
}
if($match2 > 100)
{
Pearle::myLog(1, "Too many matches ($match2) in page $page. Skipping.\n");
botwarnlog("*Too many matches ($match2) in page $page. Skipping.\n");
return 0;
}
}
skipregular:
# Put the text back and get it again in order to fold any comments resulting from removing non-gallery images.
# This is because gallery image matching will also match commented images.
$wikipage->setEditableText($text);
$text = $wikipage->getEditableText();
if($text =~ /
{
Pearle::myLog(3, "*Possible image gallery in page $page\n");
if(defined($removal_prefix))
{
if($text =~ s/$regex3g//g)
{
$match2 += 1;
}
}
else
{
if($text =~ s/$regex3g//g)
{
$match2 += 1;
}
}
}
if($match2 > 0)
{
if($text =~ /\[\[(?: |)