User:FairuseBot/Pearle.pm
### IMPORTANT ###
# This code is released into the public domain.
### RECENT CHANGES ###
# 30 Nov 2005: Created, based on the 12 Nov 2005 version of Pearle Wisebot
# 15 Feb 2006: Modified "retry" to work with any function that signals failure by dying; modified to use a simple exponential backoff formula
#              Simplified "limit" code; modified it to take an optional parameter
#              Added "config" function as a clean interface for changing internal parameters
#              Modified Wiki-access functions for use with the new "retry" function
#              Cleaned up boolean config vars to use standard Perl boolean conventions
# 28 Feb 2006: Added checkLogin bottleneck and an option to allow editing while logged out
#              Added support for proxy servers
# 8 Mar 2006:  Added support for getting a user's contributions
#              Added support for retrieving logs
#              Separated out some common regex parts into variables
# 29 Mar 2006: Added protection against Unicode in URLs
#              Made thrown exceptions consistent
#              Added sanity-checking to postPage: talkpage without article, userpage or user talkpage without user
# 17 May 2005: Improved log retrieval
# 12 Jul 2007: Started support for api.php
#              Converted log retrieval to api.php; added timestamps to retrieval
#              Modified to work with any wiki
#              Modified to use index.php rather than wiki.phtml
#              Converted getLogArticles to use named parameters
# 14 Jul 2007: Modified logging to use message levels; removed "print" and "myPrint" functions
# 6 Aug 2007:  Added the "WikiPage" class
#              Modified getPage and putPage to work only with WikiPage objects
#              Renamed to "Pearle.pm"
#              Made it a proper module
# 10 Aug 2007: Changed the default XML parser; reduced memory usage when parsing
# 17 Oct 2007: Removed nullEdit() -- MediaWiki hasn't required null edits in some time
#              Modified getCategoryArticles, getCategoryImages, and getSubcategories to use api.php
# 21 Oct 2007: Proper Unicode support
# 29 Oct 2007: Made edit summaries mandatory
# 23 Mar 2008: Changed the "minor" flag from text to boolean
# 29 Mar 2008: Improved UTF-8 support
# 17 Oct 2008: Added a second login check to handle non-Monobook skins
# 1 Dec 2008:  Added a screen-scraping function to get Special:UncategorizedFiles
#              Removed some hard-coded references to enwiki
# 27 Apr 2010: Fixed the login function to work with the new enwiki login page
#              The HTML version of wikitext can validly contain ">"; fixed to handle this
# 12 Aug 2010: Added read-only and test modes
# 13 Dec 2010: Updated to use the API for editing. This removes the last coherent piece of Pearle Wisebot code.
# 31 Aug 2011: Added an option to postPage() for bot-flagging an edit
# 24 Sep 2011: Added namespace-handling routines
#              Added getPageList()
#              Re-wrote parseHistory as getArticleHistory. Breaking change.
# 17 Jan 2012: Updated login and logout to use the API
#              Added APIPost()
#              Refactored APIQuery(), APIEdit(), and APIPost() to move common code into a helper function
# 20 Feb 2012: Added getToken() and appendToPage() to permit efficient logging
#              Removed the protection check from getPage(): it's simpler to just try the edit than to figure out how the wiki's protection rules apply
#              Refactored postPage and appendToPage to move common code into a helper function
# 9 Nov 2012:  Fixed a bug where getCategoryContents would return only the last set of items if the category had more than 500 items of that type
#              Added getPageImages()
# 11 Jun 2015: Modified all API access functions to use 'rawcontinue'
#
### ERROR CODES ###
# Errors thrown by this package always begin with a three-digit number:
# 4xx: HTTP client errors
# 505: Server error: HTTP version not supported
# 509: Server error: bandwidth exceeded
# 900: Unspecified internal error.
# 901: Library not initialized. You didn't call Pearle::init() before calling this function.
# 902: Parameter error. You made a function call but forgot a mandatory parameter, or provided an invalid one.
# 903: Attempted write in read-only mode.
# 920: Unexpected response. The MediaWiki site returned something unexpected.
# 921: Unexpected logout. The MediaWiki site logged us out unexpectedly.
# 922: Edit conflict. Someone edited the article while we were editing it.
# 923: Deleted article conflict. Someone deleted the article while we were editing it.
# 924: Spam filter. A link in the page tripped the spam filter.
# 925: Protected page. The page is protected, and the bot doesn't have the rights to edit it.
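#
# A minimal sketch of how calling code might dispatch on these codes
# (hypothetical caller, not part of the library): wrap calls in eval and
# match the leading three digits of $@.
#
#   eval { Pearle::postPage($page, "Update infobox", 0); };
#   if($@ and $@ =~ /^(\d{3})/)
#   {
#       if($1 == 922)    { }          # Edit conflict: re-fetch the page and retry
#       elsif($1 == 925) { }          # Protected page: skip it
#       else             { die $@; }  # Anything else is fatal
#   }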
package Pearle;
use strict;
use warnings;
use Time::HiRes;
use Encode;
use LWP::UserAgent;
use HTTP::Cookies;
use HTTP::Request::Common qw(POST);
use HTML::Entities;
use XML::Simple;
use Data::Dumper; # For debugging
use URI::Escape;
use Digest::MD5 qw(md5_hex);
use Pearle::WikiPage;
# Standard regex parts
$Pearle::regex_timestamp = '(\d\d):(\d\d), (\d\d?) (\w+) (\d\d\d\d)'; # Match and capture a Wikipedia timestamp
$Pearle::regex_timestamp_nc = '\d\d:\d\d, \d\d? \w+ \d\d\d\d'; # Match a Wikipedia timestamp
$Pearle::regex_timestamp_ISO = '(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d):(\d\d)'; # Match and capture a timestamp of the form 2007-07-13T04:21:39Z
$Pearle::regex_timestamp_ISO_nc = '\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d'; # Match a timestamp of the form 2007-07-13T04:21:39Z
$Pearle::regex_pagelink = ''; # Match and capture any page
$Pearle::regex_redpagelink = ''; # Match and capture nonexistent pages only
$Pearle::regex_bluepagelink = ''; # Match and capture existing pages only
$Pearle::regex_pagelink_nc = ''; # Match any page
$Pearle::regex_redpagelink_nc = ''; # Match nonexistent pages only
$Pearle::regex_bluepagelink_nc = ''; # Match existing pages only
# Namespaces available on this wiki
@Pearle::namespaces = ();
$Pearle::logfile = "";
$Pearle::_inited = 0;
$Pearle::username = "";
$Pearle::password = "";
$Pearle::testmode = 0; # Read-only mode: do not actually make wiki-modifying calls, but return as if they succeeded
$Pearle::readonly = 0; # Read-only mode: wiki-modifying calls are errors.
$Pearle::speedLimit = 10; # Seconds to wait by default when limit() is called
$Pearle::_speedMult = 1; # Multiplier for default wait time if the wiki is being slow
$Pearle::roughMode = 0; # Ignore most errors
$Pearle::nullOK = 0; # Permit editing non-existent pages
$Pearle::sanityCheck = 0; # Sanity checking on edits
$Pearle::loglevel = 2; # Level of message to write to file
$Pearle::printlevel = 3; # Level of message to print to stdout
$Pearle::logoutOK = 0; # Permit editing while logged out
$Pearle::proxy = undef; # Proxy to use
$Pearle::wiki = 'https://en.wikipedia.org/w/'; # URL of the directory containing index.php and api.php
$XML::Simple::PREFERRED_PARSER = "XML::Parser"; # Much faster than the default XML::SAX parser
$Pearle::xml_parser = XML::Simple->new();
### Accessors #########################################################
sub getXMLParser
{
return $Pearle::xml_parser;
}
### Other functions ###################################################
# This must be the first function from the library called
sub init
{
$Pearle::username = $_[0] or die("902 No username provided!\n");
$Pearle::password = $_[1] or die("902 No password provided!\n");
$Pearle::logfile = $_[2] or die("902 No logfile name provided!\n");
$Pearle::cookies = $_[3] or die("902 No cookie file provided!\n");
$Pearle::useragent = $_[4] or $Pearle::useragent = "PearleLib/0.2 (User:${Pearle::username})";
$Pearle::ua = LWP::UserAgent->new(timeout => 300);
$Pearle::ua->agent($Pearle::useragent);
$Pearle::ua->cookie_jar(HTTP::Cookies->new(file => $Pearle::cookies, autosave => 1));
$Pearle::ua->cookie_jar->load();
$Pearle::roughMode = 0; # Standard Perl boolean conventions (the string "no" would be a *true* value)
$Pearle::_inited = 1;
}
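# A typical startup sequence for a bot script might look like this (bot name
# and file names are illustrative):
#
#   use Pearle;
#   Pearle::init("ExampleBot", $password, "examplebot.log", "cookies.examplebot.txt");
#   Pearle::login() or die "921 Login failed\n";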
sub config
{
my %params = @_;
$Pearle::readonly = $params{readonly} if(defined($params{readonly}));
$Pearle::testmode = $params{testmode} if(defined($params{testmode}));
$Pearle::speedLimit = $params{speedLimit} if(defined($params{speedLimit}));
$Pearle::roughMode = $params{roughMode} if(defined($params{roughMode}));
$Pearle::nullOK = $params{nullOK} if(defined($params{nullOK}));
$Pearle::loglevel = $params{loglevel} if(defined($params{loglevel}));
$Pearle::printlevel = $params{printlevel} if(defined($params{printlevel}));
$Pearle::logfile = $params{logfile} if(defined($params{logfile}));
$Pearle::logoutOK = $params{logoutOK} if(defined($params{logoutOK}));
$Pearle::sanityCheck = $params{sanityCheck} if(defined($params{sanityCheck}));
if(defined($params{wiki}) and $params{wiki} ne $Pearle::wiki)
{
$params{wiki} .= '/' if($params{wiki} !~ /\/$/); # Add a trailing slash if needed
$Pearle::wiki = $params{wiki};
}
if(exists($params{proxy}))
{
if(defined($params{proxy}))
{
myLog(3, "Proxying: $params{proxy}\n");
$Pearle::ua->proxy('http', $params{proxy});
$Pearle::proxy = $params{proxy};
}
else
{
myLog(3, "Not proxying\n");
$Pearle::ua->no_proxy();
$Pearle::proxy = undef;
}
}
}
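# For instance, a script could point the library at a different wiki and turn
# down the console chatter (illustrative values):
#
#   Pearle::config(wiki => 'https://en.wikisource.org/w', printlevel => 2, loglevel => 3);
#
# The trailing slash on the wiki URL is added automatically if missing.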
# Logging levels:
# 0: Immediately fatal errors. A call to myLog will be followed by a call to die()
# 1: Non-fatal errors. The library can recover, turn the function call into a no-op, and return an error indicator
# 2: Serious warnings. The library can still complete the action
# 3: Status messages, useful for tracing library execution
# 4: Debugging messages
sub myLog
{
my $level = shift;
my @message = @_;
my $message = join "", @message;
if($level <= $Pearle::loglevel)
{
die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
open (LOG, ">>:utf8", $Pearle::logfile) || die "900 Could not append to log!";
print LOG $message;
close (LOG);
}
if($level <= $Pearle::printlevel)
{
print $message;
}
}
# Rate-limiting. Can sensibly be run even if libPearle isn't initialized
sub limit
{
my ($i);
$i = ($_[0] or ($Pearle::speedLimit * $Pearle::_speedMult));
$i = 10 if($i < 10);
# Rate-limiting to avoid hosing the wiki server
# Min 30 sec unmarked
# Min 10 sec marked
# May be raised by retry() if load is heavy
### ATTENTION ###
# Increasing the speed of the bot to faster than 1 edit every 10
# seconds violates English Wikipedia rules as of April, 2005, and
# will cause your bot to be banned. So don't change $normalDelay
# unless you know what you are doing. Other sites may have
# similar policies, and you are advised to check before using your
# bot at the default speed.
#################
while ($i >= 0)
{
sleep (1);
print "Sleeping $i seconds...\r";
$i--;
}
print " \r";
}
sub login
{
die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
my $xml = APIPost(action => 'login', lgname => $Pearle::username, lgpassword => $Pearle::password);
my $parsed_xml = $Pearle::xml_parser->XMLin($xml);
print Dumper($parsed_xml);
if($parsed_xml->{login}->{result} eq 'NeedToken')
{
$xml = APIPost(action => 'login', lgname => $Pearle::username, lgpassword => $Pearle::password, lgtoken => $parsed_xml->{login}->{token});
$parsed_xml = $Pearle::xml_parser->XMLin($xml);
print Dumper($parsed_xml);
}
if($parsed_xml->{login}->{result} eq 'Success')
{
return 1;
}
else
{
return 0;
}
}
sub logout {
APIQuery(action => 'logout');
return 1;
}
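# A session, then, brackets the bot's work between login() and logout()
# (sketch; real code should check the login result):
#
#   Pearle::login() or die "921 Could not log in\n";
#   # ... wiki work ...
#   Pearle::logout();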
sub checkLogin
{
my ($reply_text);
$reply_text = $_[0];
if ($reply_text !~ m/>My talk<\/a>/ and !($Pearle::logoutOK))
{
if($reply_text !~ /var wgUserName = "$Pearle::username"/)
{
# We've lost our identity.
myLog(0, "Wiki server is not recognizing me (1).\n---\n${reply_text}\n---\n");
die ("921 Wiki server is not recognizing me (1).\n");
}
}
}
# Make an HTTP request, performing basic error checking and handling. Suitable for use with the "retry" function
sub httpRequest
{
my ($request, $response, $attemptStartTime, $attemptEndTime);
$request = $_[0];
# Monitor wiki server responsiveness by timing the request itself
$attemptStartTime = Time::HiRes::time();
$response = $Pearle::ua->request($request);
$attemptEndTime = Time::HiRes::time();
# Throttle back if POSTs are taking a long time to complete
if($request->method() eq "POST")
{
if (($attemptEndTime - $attemptStartTime) > 20)
{
$Pearle::_speedMult = 60;
myLog(3, "Wiki is very slow. Increasing minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n");
}
# If the response time is between 10 and 20 seconds...
elsif (($attemptEndTime - $attemptStartTime) > 10)
{
$Pearle::_speedMult = 6;
myLog(3, "Wiki is somewhat slow. Setting minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n");
}
# If the response time is less than 10 seconds...
elsif ($Pearle::_speedMult != 1)
{
$Pearle::_speedMult = 1;
myLog(3, "Returning to normal minimum wait time.\n");
}
}
if ($response->is_success or $response->is_redirect)
{
return $response;
}
else
{
# 50X HTTP errors mean there is a problem connecting to the wiki server; these can be remedied by waiting and trying again
if (500 <= $response->code and 504 >= $response->code)
{
myLog(2, "HTTP ERR (".$response->status_line.")\n");
die("retry:".$response->status_line);
}
else
{
# Unhandled HTTP response. Waiting probably won't fix it
myLog(0, "HTTP ERR (".$response->status_line.")\n".$response->decoded_content."\n");
die($response->status_line."\n");
}
}
}
# Check out a page for editing.
sub getPage
{
die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
my ($target, $xml, $parsed_xml, %query_params, $text, $editTime, $startTime, $token);
$target = $_[0];
if ($target =~ m/^\s*$/)
{
myLog(0, "getPage: Null target.");
die("902 getPage: Null target.");
}
$query_params{prop} = ['revisions', 'info'];
$query_params{rvprop} = ['timestamp', 'content'];
$query_params{inprop} = 'protection';
$query_params{intoken} = 'edit';
$query_params{titles} = $target;
$xml = APIQuery(%query_params);
if(!defined($xml))
{
myLog(0, "Unknown error requesting page contents\n");
die "900 Unknown error requesting page contents";
}
$parsed_xml = $Pearle::xml_parser->XMLin($xml);
myLog(4, Dumper($parsed_xml));
$xml = undef;
# Check for errors
if(exists($parsed_xml->{query}->{pages}->{page}->{invalid}))
{
myLog(0, "Invalid page title: $target\n");
die("902 Invalid page title: $target");
}
# See if the page is blank
if(!$Pearle::nullOK && exists($parsed_xml->{query}->{pages}->{page}->{missing}))
{
myLog (1, "Empty page: $target\n");
if (!$Pearle::roughMode)
{
die ("920 Empty page: $target\n");
}
}
$text = $parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}->{content};
$startTime = $parsed_xml->{query}->{pages}->{page}->{starttimestamp};
$editTime = $parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}->{timestamp};
$token = $parsed_xml->{query}->{pages}->{page}->{edittoken};
return Pearle::WikiPage->new(text => $text, editTime => $editTime, startTime => $startTime, editToken => $token, title => $target);
}
sub getToken
{
die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
my ($target, $xml, $parsed_xml, %query_params);
$target = $_[0];
if ($target =~ m/^\s*$/)
{
myLog(0, "getPage: Null target.");
die("902 getPage: Null target.");
}
$query_params{prop} = ['info'];
$query_params{intoken} = 'edit';
$query_params{titles} = $target;
$query_params{inprop} = '';
$xml = APIQuery(%query_params);
$parsed_xml = $Pearle::xml_parser->XMLin($xml);
myLog(4, Dumper($parsed_xml));
$xml = undef;
# Check for errors
if(exists($parsed_xml->{query}->{pages}->{page}->{invalid}))
{
myLog(0, "Invalid page title: $target\n");
die("902 Invalid page title: $target");
}
# See if the page has been protected
if(exists($parsed_xml->{query}->{pages}->{page}->{protection}->{pr}))
{
myLog(0, "Page $target is protected\n");
die("925 Protected");
}
return $parsed_xml->{query}->{pages}->{page}->{edittoken};
}
# The common elements of postPage and appendToPage. Not for external consumption.
sub _editPage
{
die "903 Library in read-only mode!\n" if($Pearle::readonly);
my ($xml, $parsed_xml, $pageName, %params);
%params = @_;
$pageName = $params{title};
$params{assert} = 'user' if(!$Pearle::logoutOK);
EDITRETRY:
$xml = APIEdit(%params);
if(!defined($xml))
{
myLog(0, "Unknown error posting edit\n");
die "900 Unknown error posting edit";
}
$parsed_xml = $Pearle::xml_parser->XMLin($xml);
myLog(4, Dumper($parsed_xml));
$xml = undef;
# Check for errors
if(exists($parsed_xml->{edit}) && $parsed_xml->{edit}->{result} eq 'Failure' && $parsed_xml->{edit}->{assert} eq 'user')
{
myLog(0, "Wiki server is not recognizing me\n");
die ("921 Not logged in");
}
if(exists($parsed_xml->{error}))
{
if($parsed_xml->{error}->{code} eq 'blocked')
{
myLog(0, "Blocked\n");
die ("900 Blocked");
}
elsif($parsed_xml->{error}->{code} eq 'protectedpage' || $parsed_xml->{error}->{code} eq 'cascadeprotected')
{
myLog(0, "Page $pageName is protected\n");
die ("925 Protected");
}
elsif($parsed_xml->{error}->{code} eq 'pagedeleted')
{
myLog(0, "Deleted article conflict on $pageName\n");
die ("923 Deleted article conflict");
}
elsif($parsed_xml->{error}->{code} eq 'editconflict')
{
myLog(0, "Edit conflict on $pageName\n");
die ("922 Edit conflict on $pageName");
}
elsif($parsed_xml->{error}->{code} eq 'spamdetected')
{
myLog(0, "Spam link on $pageName: $parsed_xml->{error}->{info}\n");
die ("924 Spam filter");
}
elsif($parsed_xml->{error}->{code} eq 'readonly')
{
myLog(1, "Wiki is in readonly mode. Waiting before retry\n");
sleep 60;
goto EDITRETRY;
}
else
{
myLog(0, "Unexpected error. Code: $parsed_xml->{error}->{code} Info: $parsed_xml->{error}->{info}\n");
die ("920 Server error");
}
}
return $parsed_xml->{edit}->{result};
}
sub postPage
{
die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
my ($page, $pageName, $summaryEntry, $minor, $bot, $xml, %params, $parsed_xml);
$page = $_[0];
$summaryEntry = $_[1];
$minor = $_[2];
$bot = $_[3];
$bot = 1 if(!defined($bot));
if(!defined($minor))
{
myLog(0, "postPage(): Not enough parameters.\n");
die "902 postPage(): Not enough parameters!\n";
}
if(!$page->isa("Pearle::WikiPage"))
{
myLog(0, "postPage(): First parameter is not a WikiPage object\n");
die "902 postPage(): First parameter is not a WikiPage object\n";
}
if ($summaryEntry eq "")
{
myLog(0, "postPage(): No edit summary provided\n");
die "902 postPage(): No edit summary provided\n";
}
return "Success" if($Pearle::testmode);
$pageName = $page->getTitle();
$params{title} = $pageName;
$params{text} = $page->getWikiText();
$params{token} = $page->getEditToken();
$params{summary} = $summaryEntry;
$params{minor} = 1 if($minor);
$params{starttimestamp} = $page->getStartTime();
$params{basetimestamp} = $page->getEditTime();
$params{bot} = 1 if($bot);
return _editPage(%params);
}
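# A full read-modify-write cycle built on getPage() and postPage() might look
# like this (sketch; assumes Pearle::WikiPage provides a setWikiText() mutator
# to pair with getWikiText()):
#
#   my $page = Pearle::getPage("Wikipedia:Sandbox");
#   my $text = $page->getWikiText();
#   $text =~ s/\{\{cleanup\}\}\n?//g;
#   $page->setWikiText($text);
#   Pearle::postPage($page, "Removing cleanup tag", 1);
#   Pearle::limit();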
sub appendToPage
{
die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
my ($page, $token, $text, $summary, $minor, $bot, %params, $xml, $parsed_xml);
$page = $_[0];
$token = $_[1];
$text = $_[2];
$summary = $_[3];
$minor = $_[4];
$bot = $_[5];
$bot = 1 if(!defined($bot));
if(!defined($minor))
{
myLog(0, "appendToPage(): Not enough parameters.\n");
die "902 appendToPage(): Not enough parameters!\n";
}
return "Success" if($Pearle::testmode);
$params{title} = $page;
$params{appendtext} = $text;
$params{token} = $token;
$params{summary} = $summary;
$params{minor} = 1 if($minor);
$params{bot} = 1 if($bot);
return _editPage(%params);
}
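# The logging pattern these two functions enable: fetch one edit token up
# front, then append entries without re-downloading the page each time
# (sketch; the log page name is illustrative):
#
#   my $token = Pearle::getToken("User:ExampleBot/Log");
#   foreach my $title (@processed)
#   {
#       Pearle::appendToPage("User:ExampleBot/Log", $token, "\n* Processed [[$title]]", "Logging", 1);
#       Pearle::limit();
#   }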
# Get a list of the contents of a given category, filtered by namespace
sub getCategoryContents
{
die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
my ($target, $category_offset, @articles, $xml, $parsed_xml, %query_params,
$numberOfArticles, @namespaces);
$target = shift;
@namespaces = @_;
# Category: prefix is mandatory
if($target !~ /^[Cc]ategory:/)
{
$target = "Category:" . $target;
}
$query_params{list} = 'categorymembers';
$query_params{cmprop} = 'title';
$query_params{cmtitle} = $target;
$query_params{cmlimit} = 500; # If you're a flagged bot, this could be 5000, but we default to 500 for compatibility and to keep memory usage down
$query_params{rawcontinue} = "";
foreach my $namespace (@namespaces)
{
$query_params{cmnamespace} .= "${namespace}|";
}
if(exists($query_params{cmnamespace}) and defined($query_params{cmnamespace}))
{
chop $query_params{cmnamespace};
}
do
{
$xml = APIQuery(%query_params);
if(!defined($xml))
{
myLog(0, "Unknown error accessing wiki\n");
die "900 Unknown error accessing wiki";
}
$parsed_xml = $Pearle::xml_parser->XMLin($xml, ForceArray => ['cm']);
myLog(4, Dumper($parsed_xml));
$xml = undef;
if(exists($parsed_xml->{query}->{categorymembers}->{cm}) and defined($parsed_xml->{query}->{categorymembers}->{cm}))
{
my @set_articles = map {$_->{title}} @{$parsed_xml->{query}->{categorymembers}->{cm}};
push @articles, @set_articles;
}
if(exists($parsed_xml->{'query-continue'}->{categorymembers}->{cmcontinue}))
{
$category_offset = $parsed_xml->{'query-continue'}->{categorymembers}->{cmcontinue};
$category_offset =~ s/&/%26/;
$query_params{cmcontinue} = $category_offset;
}
else
{
$category_offset = undef;
}
sleep (1); # Throttle GETs
}
while(defined($category_offset));
$numberOfArticles = scalar(@articles);
myLog(4, "Got $numberOfArticles articles.\n");
return @articles;
}
sub getCategoryArticles
{
return getCategoryContents($_[0], 0); # Namespace 0: Articles
}
sub getCategoryImages
{
return getCategoryContents($_[0], 6); #Namespace 6: Images
}
sub getSubcategories
{
return getCategoryContents($_[0], 14); # Namespace 14: Categories
}
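# For example, walking a category's articles (illustrative category name):
#
#   foreach my $article (Pearle::getCategoryArticles("Category:Perl"))
#   {
#       print "$article\n";
#   }
#
# Namespaces can also be mixed directly, e.g. articles plus subcategories:
#   my @pages = Pearle::getCategoryContents("Category:Perl", 0, 14);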
sub getPageImages
{
die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
my ($target, $list_offset, @images, $xml, $parsed_xml, %query_params);
$target = shift;
die "902 No article provided for getPageImages\n" if(!defined($target));
$query_params{prop} = "images";
$query_params{titles} = $target;
$query_params{imlimit} = 500;
$query_params{rawcontinue} = "";
do
{
$xml = APIQuery(%query_params);
if(!defined($xml))
{
myLog(0, "Unknown error accessing wiki\n");
die "900 Unknown error accessing wiki";
}
$parsed_xml = $Pearle::xml_parser->XMLin($xml, ForceArray => ['im']);
myLog(4, Dumper($parsed_xml));
$xml = undef;
if(exists($parsed_xml->{query}->{pages}->{page}->{images}->{im}) and defined($parsed_xml->{query}->{pages}->{page}->{images}->{im}))
{
my @set_images = map {$_->{title}} @{$parsed_xml->{query}->{pages}->{page}->{images}->{im}};
push @images, @set_images;
}
if(exists($parsed_xml->{'query-continue'}->{images}->{imcontinue}))
{
$list_offset = $parsed_xml->{'query-continue'}->{images}->{imcontinue};
$list_offset =~ s/&/%26/;
$query_params{imcontinue} = $list_offset;
}
else
{
$list_offset = undef;
}
sleep (1); # Throttle GETs
}
while(defined($list_offset));
return @images;
}
# Get up to $max most recent articles edited by a user
sub getUserArticles
{
die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
my ($url, $request, $response, $reply, @contribs,
$target, $namespace, $max, $offset);
$target = $_[0];
$max = $_[1];
$offset = $_[2];
$namespace = namespaceToNumber($_[3]);
# Create a request-object
if(defined($namespace))
{
$url = "${Pearle::wiki}index.php?title=Special%3AContributions&limit=${max}&offset=${offset}&target=${target}&namespace=$namespace";
}
else
{
$url = "${Pearle::wiki}index.php?title=Special%3AContributions&limit=${max}&offset=${offset}&target=${target}";
}
myLog(3, "GET $url\n");
$request = HTTP::Request->new(GET => "$url");
$response = startRetry(\&httpRequest, $request);
$reply = $response->decoded_content;
# This detects whether or not we're logged in.
checkLogin($reply);
# Extract the contributions from the rendered HTML
# NOTE: this link pattern is an approximation and may need adjusting to match the wiki's skin
while($reply =~ /<li[^>]*>.*?<a href="[^"]+" title="([^"]+)">/g)
{
push @contribs, $1;
}
# Remove duplicates
@contribs = uniquify(@contribs);
return @contribs;
}
# Gets a list of [page, namespace] pairs
# Takes the following named parameters:
#   prefix: Filter to include only pages that start with this string
#   namespace: A namespace number, or a reference to a list of namespace numbers, to get pages from. If not provided, gets only pages from namespace 0.
#   redirects: One of "yes", "no", or "both"
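# For example (illustrative arguments):
#   my @afd_pages = Pearle::getPageList(prefix => 'Articles for deletion/', namespace => 4, redirects => 'no');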
sub getPageList
{
die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
my %params = @_;
my %base_query_params = (list => 'allpages', aplimit => 500);
my %full_query_params;
my @namespaces;
my @articles = ();
foreach my $key (keys(%params))
{
if($key eq 'prefix')
{
$base_query_params{apprefix} = $params{prefix};
}
elsif($key eq 'redirects')
{
if($params{redirects} eq 'both')
{
$base_query_params{apfilterredir} = "all";
}
elsif($params{redirects} eq 'yes')
{
$base_query_params{apfilterredir} = "redirects";
}
elsif($params{redirects} eq 'no')
{
$base_query_params{apfilterredir} = "nonredirects";
}
else
{
myLog(2, "Unrecognized redirect option in getPageList: $params{redirects}. Ignoring.\n");
}
}
elsif($key eq 'namespace')
{
if(ref($params{namespace}) eq 'ARRAY')
{
@namespaces = @{$params{namespace}};
}
elsif(!ref($params{namespace}))
{
push @namespaces, $params{namespace};
}
else
{
myLog(0, "Namespace list in getPageList must be a scalar or an array reference\n");
die("902 Namespace list in getPageList must be a scalar or an array reference");
}
}
}
push @namespaces, 0 if(!scalar(@namespaces));
foreach my $namespace (@namespaces)
{
%full_query_params = %base_query_params;
$full_query_params{apnamespace} = $namespace;
my $offset;
do
{
my $xml = APIQuery(%full_query_params);
if(!defined($xml))
{
myLog(0, "Unknown error accessing wiki\n");
die "900 Unknown error accessing wiki";
}
my $parsed_xml = $Pearle::xml_parser->XMLin($xml, ForceArray => ['p']);
myLog(4, Dumper($parsed_xml));
$xml = undef;
if(exists($parsed_xml->{query}->{allpages}->{p}) and defined($parsed_xml->{query}->{allpages}->{p}))
{
push(@articles, map({[$_->{title}, $_->{ns}]} @{$parsed_xml->{query}->{allpages}->{p}}));
}
if(exists($parsed_xml->{'query-continue'}->{allpages}->{apfrom}))
{
$offset = $parsed_xml->{'query-continue'}->{allpages}->{apfrom};
$offset =~ s/&/%26/;
$full_query_params{apfrom} = $offset;
}
else
{
$offset = undef;
}
sleep (1); # Throttle GETs
}
while(defined($offset));
}
return @articles;
}
# Gets a list of [article, actor, summary, timestamp] tuples from the specified log (upload, delete, move, protect).
# The list is sorted by timestamp, with the newest entry first.
# Takes the following named parameters:
#   user: Filter "actor" to include only actions by this user
#   log: Filter to include only actions in this log (upload, delete, move, protect)
#   limit: Include this many items. Defaults to 50 items.
#   time: Start checking the log at this time. Timestamp in ISO 8601 format.
#   dir: Check the log in this direction ("newer" or "older") from the timestamp
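# For example, the 50 most recent deletions by one admin (illustrative user name):
#   my @deletions = Pearle::getLogArticles(log => 'delete', user => 'Example', limit => 50);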
sub getLogArticles
{
die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
my %params = @_;
my %query_params = (list => 'logevents', lelimit => 50);
my @articles = ();
foreach my $key (keys(%params))
{
if($key eq 'user')
{
$query_params{leuser} = $params{user};
$query_params{leuser} =~ s/^User://i; # Strip namespace prefix, if it's there.
}
elsif($key eq 'log')
{
$query_params{letype} = $params{log};
}
elsif($key eq 'limit')
{
$query_params{lelimit} = $params{limit};
}
elsif($key eq 'time')
{
$query_params{lestart} = $params{time};
}
elsif($key eq 'dir')
{
$query_params{ledir} = $params{dir};
}
else
{
myLog(2, "Error: Unknown parameter $key in getLogArticles\n");
}
}
my $xml = APIQuery(%query_params);
if(!defined($xml))
{
myLog(0, "Unknown error accessing wiki\n");
die "920 Unknown error accessing wiki";
}
my $parsed_xml = $Pearle::xml_parser->XMLin($xml, ForceArray => ['item']);
myLog(4, Dumper($parsed_xml));
$xml = undef;
if(exists($parsed_xml->{query}->{logevents}->{item}) and defined($parsed_xml->{query}->{logevents}->{item}))
{
# Equivalent loop form of the map below, kept for reference:
# foreach my $item (@{$parsed_xml->{query}->{logevents}->{item}})
# {
#     push @articles, [$item->{title}, $item->{user}, $item->{comment}, $item->{timestamp}];
# }
@articles = map {[$_->{title}, $_->{user}, $_->{comment}, $_->{timestamp}]} @{$parsed_xml->{query}->{logevents}->{item}};
$parsed_xml = undef;
@articles = uniquify_ref(0, @articles);
@articles = sort {$b->[3] cmp $a->[3]} @articles;
}
return @articles;
}
# Gets a list of all files at Special:UncategorizedFiles
# Uses screen-scraping because there isn't an API call for this. May not work for non-Wikipedia wikis.
sub getUncatFiles
{
my $offset = shift || 0;
my @files = ();
my $content = getURL("${Pearle::wiki}index.php?title=Special:UncategorizedFiles&limit=500&offset=$offset");
if($content !~ /There are no results for this report./)
{
# NOTE: this link pattern is an approximation and may need adjusting to match the wiki's skin
while($content =~ /<a href="\/wiki\/(File:[^"]+)"/g)
{
my $file = decode('utf8', urlDecode($1));
myLog(4, "Found file $file\n");
push @files, $file;
}
if($content =~ /class="mw-nextlink"/)
{
myLog(4, "More files\n");
push @files, getUncatFiles($offset + 500);
}
}
return @files;
}
sub fixupURLFragment
{
my $url_fragment = shift;
$url_fragment =~ s/%/%25/g;
$url_fragment =~ s/&/%26/g;
$url_fragment =~ s/\+/%2B/g;
$url_fragment =~ s/#/%23/g;
$url_fragment =~ s/\?/%3F/g;
$url_fragment =~ s/\\/%5C/g;
return $url_fragment;
}
# Use the api.php interface to query the wiki using a GET request
# Takes a hash of parameter => value pairs
# Returns the raw XML blob from the wiki, or undef on error
sub APIQuery
{
my %params = @_;
my $url = "${Pearle::wiki}api.php?action=query&format=xml";
my $reply = undef;
foreach my $key (keys(%params))
{
my $val;
if(ref($params{$key}) eq 'ARRAY') # We've got a list of values
{
$val = join '|', @{$params{$key}};
}
else
{
$val = $params{$key};
}
$val =~ s/ /_/g if($key eq 'titles');
$val = fixupURLFragment($val);
$key = fixupURLFragment($key);
$url .= "&${key}=$val";
}
myLog(3, "API query: $url\n");
$url = encode("utf8", $url);
my $request = HTTP::Request->new(GET => "$url");
$reply = _APIguts($request);
return $reply;
}
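# APIQuery() is also usable directly for queries the wrappers don't cover; the
# caller parses the returned XML itself. For example, fetching general site
# information:
#
#   my $xml = Pearle::APIQuery(meta => 'siteinfo', siprop => 'general');
#   my $info = Pearle::getXMLParser()->XMLin($xml);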
# Use the api.php interface to post to the wiki using a POST request
# Takes a hash of parameter => value pairs
# Returns the raw XML blob from the wiki (possibly including an API error message), or undef on server error
sub APIPost
{
my %params = @_;
my $url = "${Pearle::wiki}api.php?format=xml";
my $data = "";
my $reply = undef;
$data = join('&', map {fixupURLFragment($_) . "=" . fixupURLFragment((ref($params{$_}) eq 'ARRAY')?(join '|', @{$params{$_}}):($params{$_}))} keys(%params));
myLog(3, "API URL: $url\n");
myLog(3, "API query: $data\n");
my $request = HTTP::Request->new('POST', $url,
HTTP::Headers->new(Content_Type => "application/x-www-form-urlencoded"), encode("utf8", $data));
$reply = _APIguts($request);
return $reply;
}
# Use the api.php interface to edit the wiki (action=edit via POST)
# Takes a hash of parameter => value pairs
# Returns the raw XML blob from the wiki (possibly including an API error message), or undef on server error
sub APIEdit
{
my %params = @_;
my $url = "${Pearle::wiki}api.php?action=edit&format=xml";
my $data = "";
my $reply = undef;
$data = join('&', map {fixupURLFragment($_) . "=" . fixupURLFragment((ref($params{$_}) eq 'ARRAY')?(join '|', @{$params{$_}}):($params{$_}))} keys(%params));
myLog(3, "API query: $data\n");
my $request = HTTP::Request->new('POST', $url,
HTTP::Headers->new(Content_Type => "application/x-www-form-urlencoded"), encode("utf8", $data));
$reply = _APIguts($request);
return $reply;
}
# The common elements of APIQuery, APIPost, and APIEdit. Not for external consumption.
sub _APIguts
{
my $request = shift;
die "902 request is not an HTTP::Request object" if(!$request->isa('HTTP::Request'));
my $reply;
APIretry:
my $response = startRetry(\&httpRequest, $request);
if($response->is_success)
{
$Pearle::ua->cookie_jar->save();
$reply = $response->decoded_content;
if(!defined($reply))
{
myLog(1, "Failed to decode response\n");
#return undef;
$reply = decode("utf8", $response->content);
}
# Check the raw XML for an API error code
if($reply =~ /<error code="([^"]+)"/)
{
# Errors that can be fixed by trying again later
if($1 eq 'internal_api_error_DBConnectionError')
{
myLog(1, "Error $1 querying server. Retrying after 60 seconds.\n");
sleep(60);
goto APIretry;
}
else
{
# Format error
myLog(1, "Error $1 querying server\n");
}
}
}
else
{
myLog(1, "HTTP error accessing server\n");
$reply = undef;
}
return $reply;
}
sub Export
{
my ($request, $response, $reply, $articles);
$articles = join "\n", @_;
$request = POST "${Pearle::wiki}index.php?title=Special:Export&action=submit", [action => 'submit', pages => $articles, curonly => 1];
$response = startRetry(\&httpRequest, $request);
$reply = $response->decoded_content;
return $reply;
}
sub getArticleHistory
{
die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
my %params = @_;
my @history;
my $offset;
my $title = $params{title};
my $content = $params{content};
$content = 0 if(!defined($content));
my $limit = $params{limit};
die "902 Must specify title when calling getArticleHistory()" if(!defined($title));
my %query_params = (prop => 'revisions', rvlimit => 500, rvprop => ['ids', 'timestamp', 'user', 'comment', 'flags']);
$query_params{rvprop} = ['ids', 'timestamp', 'user', 'comment', 'flags', 'content'] if($content);
$query_params{titles} = $title;
$query_params{rvlimit} = $limit if(defined($limit));
do
{
my $xml = APIQuery(%query_params);
if(!defined($xml))
{
myLog(0, "Unknown error accessing wiki\n");
die "920 Unknown error accessing wiki";
}
my $parsed_xml = $Pearle::xml_parser->XMLin($xml, ForceArray => ['rev']);
myLog(4, Dumper($parsed_xml));
$xml = undef;
if(exists($parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}) and defined($parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}))
{
foreach my $item (@{$parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}})
{
my $comment = $item->{comment};
$comment = "" if(!defined($comment));
my $minor = 0;
$minor = 1 if(exists($item->{minor}) and defined($item->{minor}));
if($content)
{
push @history, [$item->{revid}, $item->{timestamp}, $item->{user}, $comment, $minor, $item->{content}];
}
else
{
push @history, [$item->{revid}, $item->{timestamp}, $item->{user}, $comment, $minor];
}
}
}
if(!defined($limit) and exists($parsed_xml->{'query-continue'}->{revisions}->{rvstartid}))
{
$offset = $parsed_xml->{'query-continue'}->{revisions}->{rvstartid};
$offset =~ s/&/%26/;
$query_params{rvstartid} = $offset;
sleep(10);
}
else
{
$offset = undef;
}
}
while(defined($offset));
return @history;
}
sub getURL #($target)
{
die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
# Read throttle!
sleep (1);
my ($request, $response, $reply, $url);
$url = $_[0];
# Create a request-object
myLog(3, "GET ${url}\n");
$request = HTTP::Request->new(GET => "${url}");
$response = startRetry(\&httpRequest, $request);
$reply = $response->decoded_content;
# This may or may not actually work
$Pearle::ua->cookie_jar->save();
return ($reply);
}
sub startRetry
{
my ($call_fn, @args) = @_;
return retry($Pearle::speedLimit, $call_fn, @args);
}
# Call a function, retrying with exponential backoff whenever it dies with a "retry:" message
sub retry
{
my ($call_fn, @args, $delay, @result, $result);
($delay, $call_fn, @args) = @_;
if(wantarray())
{
@result = eval{ $call_fn->(@args) };
if($@ =~ /^retry:/i)
{
limit($delay);
@result = retry($delay * 2, $call_fn, @args);
}
elsif($@)
{
die;
}
return @result;
}
else
{
$result = eval{ &{$call_fn}(@args) };
if($@ =~ /^retry:/i)
{
limit($delay);
$result = retry($delay * 2, $call_fn, @args);
}
elsif($@)
{
die;
}
return $result;
}
}
sub initNamespaceList
{
die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
my $xml = APIQuery(meta => 'siteinfo', siprop => 'namespaces');
if(!defined($xml))
{
myLog(0, "Unknown error accessing wiki\n");
die "920 Unknown error accessing wiki";
}
my $parsed_xml = $Pearle::xml_parser->XMLin($xml, KeyAttr => []);
@Pearle::namespaces = map({[$_->{id}, $_->{content}]} grep({$_->{id} >= 0} @{$parsed_xml->{query}->{namespaces}->{ns}}));
}
sub getNamespaceNames
{
return map {$_->[1]} @Pearle::namespaces;
}
sub getNamespaceNumbers
{
return map {$_->[0]} @Pearle::namespaces;
}
sub namespaceToNumber
{
my $namespace = $_[0];
if(scalar(@Pearle::namespaces) > 0)
{
if(defined($namespace))
{
my @val = grep( {lc($_->[1]) eq lc($namespace)} @Pearle::namespaces);
return undef if(!scalar(@val));
return $val[0]->[0];
}
else
{
return undef;
}
}
else
{
myLog(1, "Namespace array not initialized\n");
return undef;
}
}
sub numberToNamespace
{
my $i = shift;
if(scalar(@Pearle::namespaces) > 0)
{
if(defined($i))
{
my @val = grep( {$_->[0] == $i} @Pearle::namespaces);
return undef if(!scalar(@val));
return $val[0]->[1];
}
else
{
return undef;
}
}
else
{
myLog(1, "Namespace array not initialized\n");
return undef;
}
}
sub urlDecode
{
my ($input);
$input = $_[0];
$input =~ s/\%([a-fA-F0-9][a-fA-F0-9])/chr(hex($1))/eg;
return ($input);
}
sub decodeArray
{
return map {urlDecode($_)} @_;
}
sub uniquify
{
my @list = sort @_;
my $last = undef;
my @new_list;
foreach my $item (@list)
{
push @new_list, $item if(!defined($last) or ($item ne $last));
$last = $item;
}
return @new_list;
}
# Uniquify a list of array references, comparing the given element of each
sub uniquify_ref
{
my $element = shift;
my @list = sort {$a->[$element] cmp $b->[$element]} @_;
my $last = undef;
my @new_list;
foreach my $item (@list)
{
push @new_list, $item if(!defined($last) or ($item->[$element] ne $last));
$last = $item->[$element];
}
return @new_list;
}
1;