User:FairuseBot/Pearle.pm

  1. IMPORTANT ###
  1. This code is released into the public domain.
  1. RECENT CHANGES ###
  1. 30 Nov 2005: Created, based off of the 12 Nov 2005 version of Pearle Wisebot
  2. 15 Feb 2006: Modifed "retry" to work with any function that signals failure by dying, modified to use a simple exponential backoff formula
  3. Simplified "limit" code, modified to take an optional parameter
  4. Added "config" function as a clean interface to change internal parameters
  5. Modified Wiki-access functions for use with the new "retry" function
  6. Cleanup of boolean config vars to use standard Perl boolean conventions
  7. 28 Feb 2006: Added checkLogin bottleneck, option to allow editing while logged out
  8. Added support for proxy servers
  9. 8 Mar 2006: Added support for getting a user's contributions
  10. Added support for retrieving logs
  11. Separated out some common regex parts into variables
  12. 29 Mar 2006: Added protection against Unicode in URLs
  13. Made thrown exceptions consistent
  14. Sanity-checking on postPage: talkpage without article, userpage or user talkpage without user
  15. 17 May 2005: Improved log retrieval
  16. 12 Jul 2007: Started support for api.php
  17. Log retrieval converted to api.php, added timestamps to retrieval
  18. Modified to work with any wiki
  19. Modified to use index.php rather than wiki.phtml
  20. Converted GetLogArticles to use named parameters
  21. 14 Jul 2007: Modified logging to use message levels. Removed "print" and "myPrint" functions
  22. 6 Aug 2007: Added the "WikiPage" class
  23. Modified getPage and putPage to only work with WikiPage objects
  24. Renamed to "Pearle.pm"
  25. Made a proper module
  26. 10 Aug 2007: Changed the default XML parser, reduced memory usage when parsing
  27. 17 Oct 2007: Removed nullEdit() -- MediaWiki hasn't required them in some time.
  28. Modified getCategoryArticles, getCategoryImages, and getSubcategories to use api.php
  29. 21 Oct 2007: Proper Unicode support
  30. 29 Oct 2007: Made edit summaries mandatory
  31. 23 Mar 2008: Changed "minor" flag from text to boolean
  32. 29 Mar 2008: Improved UTF-8 support
  33. 17 Oct 2008: Added a second login check, to handle non-Monobook skins
  34. 1 Dec 2008: Added a screen-scraping function to get Special:UncategorizedFiles
  35. Removed some hard-coded references to enwiki
  36. 27 Apr 2010: Fixed the login function to work with the new enwiki login page
  37. The HTML version of wikitext can validly contain ">"; fixed to handle this
  38. 12 Aug 2010: Added read-only and test modes
  39. 13 Dec 2010: Updated to use the API for editing. This removes the last coherent piece of Pearle Wisebot code.
  40. 31 Aug 2011: Added option for bot-flagging an edit to postPage()
  41. 24 Sep 2011: Added namespace-handling routines
  42. Added getPageList()
  43. Re-wrote parseHistory as getArticleHistory. Breaking change.
  44. 17 Jan 2012: Updated login and logout to use the API.
  45. Added APIPost()
  46. Refactored APIQuery(), APIEdit(), and APIPost() to move common code into a helper function.
  47. 20 Feb 2012: Added getToken() and appendToPage() to permit efficient logging.
  48. Removed the protection check from getPage(): it's simpler to just try the edit than to figure out how the wiki's protection rules apply.
  49. Refactored postPage and appendToPage to move common code into a helper function.
  50. 9 Nov 2012: Fixed a bug where getCategoryContents would only return the last set of items if the category had more than 500 items of that type.
  51. Added getPageImages()
  52. 11 Jun 2015: Modified all API access functions to use 'rawcontinue'
  1. Errors thrown by this package always begin with a three-digit number
  2. 4xx: HTTP client errors
  3. 505: Server error: HTTP version not supported
  4. 509: Server error: Bandwidth exceeded
  5. 900: Unspecified internal error.
  6. 901: Library not initialized. You didn't call Pearle::init() before calling this function.
  7. 902: Parameter error. You made a function call, but forgot a mandatory parameter, or provided an invalid one.
  8. 903: Attempted write in read-only mode.
  9. 920: Unexpected response. The MediaWiki site returned something unexpected.
  10. 921: Unexpected logout. The MediaWiki site logged us out unexpectedly.
  11. 922: Edit conflict. Someone edited the article while we were.
  12. 923: Deleted article conflict. Someone deleted the article while we were editing.
  13. 924: Spam filter. A link in the page tripped the spam filter.
  14. 925: Protected page. The page is protected, and the bot doesn't have the rights to edit it.

package Pearle;

use strict;

use warnings;

use Time::HiRes;

use Encode;

use LWP::UserAgent;

use HTTP::Cookies;

use HTTP::Request::Common qw(POST);

use HTML::Entities;

use XML::Simple;

use Data::Dumper; # For debugging

use URI::Escape;

use Digest::MD5 qw(md5_hex);

use Pearle::WikiPage;

  1. Standard regex parts

$Pearle::regex_timestamp = '(\d\d):(\d\d), (\d\d?) (\w+) (\d\d\d\d)'; # Match and capture a Wikipedia timestamp

$Pearle::regex_timestamp_nc = '\d\d:\d\d, \d\d? \w+ \d\d\d\d'; # Match a Wikipedia timestamp

$Pearle::regex_timestamp_ISO = '(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d):(\d\d)'; # Match and capture a timestamp of the form 2007-07-13T04:21:39Z

$Pearle::regex_timestamp_ISO_nc = '\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d'; # Match a timestamp of the form 2007-07-13T04:21:39Z

$Pearle::regex_pagelink = ''; # Match and capture any page

$Pearle::regex_redpagelink = ''; # Match and capture nonexistant pages only

$Pearle::regex_bluepagelink = ''; # Match and capture existing pages only

$Pearle::regex_pagelink_nc = ''; # Match any page

$Pearle::regex_redpagelink_nc = ''; # Match nonexistant pages only

$Pearle::regex_bluepagelink_nc = ''; # Match existing pages only

  1. Namespaces available on this wiki

@Pearle::namespaces = ();

$Pearle::logfile = "";

$Pearle::_inited = 0;

$Pearle::username = "";

$Pearle::password = "";

$Pearle::testmode = 0; # Read-only mode: do not actually make wiki-modifying calls, but return as if they succeeded

$Pearle::readonly = 0; # Read-only mode: wiki-modifying calls are errors.

$Pearle::speedLimit = 10; # Seconds to wait by default when limit() is called

$Pearle::_speedMult = 1; # Multiplier for default wait time if the wiki is being slow

$Pearle::roughMode = 0; # Ignore most errors

$Pearle::nullOK = 0; # Permit editing non-existent pages

$Pearle::sanityCheck = 0; # Sanity checking on edits

$Pearle::loglevel = 2; # Level of message to write to file

$Pearle::printlevel = 3; # Level of message to print to stdout

$Pearle::logoutOK = 0; # Permit editing while logged out

$Pearle::proxy = undef; # Proxy to use

$Pearle::wiki = 'https://en.wikipedia.org/w/'; # URL of the directory containing index.php and api.php

$XML::Simple::PREFERRED_PARSER = "XML::Parser"; # Much faster than the default XML::SAX parser

$Pearle::xml_parser = XML::Simple->new();

  1. Accessors #########################################################

sub getXMLParser

{

return $Pearle::xml_parser;

}

  1. Other functions ###################################################
  1. This must be the first function from the library called

sub init

{

$Pearle::username = $_[0] or die("902 No username provided!\n");

$Pearle::password = $_[1] or die("902 No password provided!\n");

$Pearle::logfile = $_[2] or die("902 No logfile name provided!\n");

$Pearle::cookies = $_[3] or die("902 No cookie file provided!\n");

$Pearle::useragent = $_[4] or $Pearle::useragent = "PearleLib/0.2 (User:${Pearle::username})";

$Pearle::ua = LWP::UserAgent->new(timeout => 300);

$Pearle::ua->agent($Pearle::useragent);

$Pearle::ua->cookie_jar(HTTP::Cookies->new(file => $Pearle::cookies, autosave => 1));

$Pearle::ua->cookie_jar->load();

$Pearle::roughMode = "no";

$Pearle::_inited = 1;

}

sub config

{

my %params = @_;

$Pearle::readonly = $params{readonly} if(defined($params{readonly}));

$Pearle::testmode = $params{testmode} if(defined($params{testmode}));

$Pearle::speedLimit = $params{speedLimit} if(defined($params{speedLimit}));

$Pearle::roughMode = $params{roughMode} if(defined($params{roughMode}));

$Pearle::nullOK = $params{nullOK} if(defined($params{nullOK}));

$Pearle::loglevel = $params{loglevel} if(defined($params{loglevel}));

$Pearle::printlevel = $params{printlevel} if(defined($params{printlevel}));

$Pearle::logfile = $params{logfile} if(defined($params{logfile}));

$Pearle::logoutOK = $params{logoutOK} if(defined($params{logoutOK}));

$Pearle::sanityCheck = $params{sanityCheck} if(defined($params{sanityCheck}));

if(defined($params{wiki}) and $params{wiki} ne $Pearle::wiki)

{

$params{wiki} .= '/' if($params{wiki} !~ /\/$/); # Add a trailing slash if needed

$Pearle::wiki = $params{wiki};

}

if(exists($params{proxy}))

{

if(defined($params{proxy}))

{

myLog(3, "Proxying: $params{proxy}\n");

$Pearle::ua->proxy('http', $params{proxy});

$Pearle::proxy = $params{proxy};

}

else

{

myLog(3, "Not proxying\n");

$Pearle::ua->no_proxy();

$Pearle::proxy = undef;

}

}

}

  1. Logging levels:
  2. 0: Immediately fatal errors. Call to myLog will be followed by a call to die()
  3. 1: Non-fatal errors. The library can recover, turn the function call into a no-op, and return an error indicator
  4. 2: Serious warning. The library can still complete the action
  5. 3: Status messages. Messages useful for tracing library execution.
  6. 4: Debugging messages.

sub myLog

{

my $level = shift;

my @message = @_;

my $message = join "", @message;

if($level <= $Pearle::loglevel)

{

die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

open (LOG, ">>:utf8", $Pearle::logfile) || die "900 Could not append to log!";

print LOG $message;

close (LOG);

}

if($level <= $Pearle::printlevel)

{

print $message;

}

}

  1. Rate-limiting. Can be sensibly run even if libPearle isn't initialized

sub limit

{

my ($i);

$i = ($_[0] or ($Pearle::speedLimit * $Pearle::_speedMult));

$i = 10 if($i < 10);

# Rate-limiting to avoid hosing the wiki server

# Min 30 sec unmarked

# Min 10 sec marked

# May be raised by retry() if load is heavy

### ATTENTION ###

# Increasing the speed of the bot to faster than 1 edit every 10

# seconds violates English Wikipedia rules as of April, 2005, and

# will cause your bot to be banned. So don't change $normalDelay

# unless you know what you are doing. Other sites may have

# similar policies, and you are advised to check before using your

# bot at the default speed.

#################

while ($i >= 0)

{

sleep (1);

print "Sleeping $i seconds...\r";

$i--;

}

print " \r";

}

sub login

{

die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

my $xml = APIPost(action => 'login', lgname => $Pearle::username, lgpassword => $Pearle::password);

my $parsed_xml = $Pearle::xml_parser->XMLin($xml);

print Dumper($parsed_xml);

if($parsed_xml->{login}->{result} eq 'NeedToken')

{

$xml = APIPost(action => 'login', lgname => $Pearle::username, lgpassword => $Pearle::password, lgtoken => $parsed_xml->{login}->{token});

$parsed_xml = $Pearle::xml_parser->XMLin($xml);

print Dumper($parsed_xml);

}

if($parsed_xml->{login}->{result} eq 'Success')

{

return 1;

}

else

{

return 0;

}

}

sub logout {

APIQuery(action => 'logout');

return 1;

}

sub checkLogin

{

my ($reply_text);

$reply_text = $_[0];

if ($reply_text !~ m/>My talk<\/a>/ and !($Pearle::logoutOK))

{

if($reply_text !~ /var wgUserName = "$Pearle::username"/)

{

# We've lost our identity.

myLog(0, "Wiki server is not recognizing me (1).\n---\n${reply_text}\n---\n");

die ("921 Wiki server is not recognizing me (1).\n");

}

}

}

  1. Make an HTTP request, performing basic error checking and handling. Suitable for use with the "retry" function

sub httpRequest

{

my ($request, $response, $attemptStartTime, $attemptEndTime);

$request = $_[0];

$response = $Pearle::ua->request($request);

# Monitor wiki server responsiveness

$attemptStartTime = Time::HiRes::time();

if ($response->is_success or $response->is_redirect)

{

return $response

}

else

{

# 50X HTTP errors mean there is a problem connecting to the wiki server. Can be remedied by waiting and trying again

if (500 <= $response->code and 504 >= $response->code)

{

myLog(2, "HTTP ERR (".$response->status_line.")\n");

die("retry:".$response->status_line);

}

else

{

# Unhandled HTTP response. Waiting probably won't fix it

myLog(0, "HTTP ERR (".$response->status_line.")\n".$response->decoded_content."\n");

die($response->status_line."\n");

}

}

# Monitor wiki server responsiveness

$attemptEndTime = Time::HiRes::time();

if($request->method() eq "POST")

{

if (($attemptEndTime - $attemptStartTime) > 20)

{

$Pearle::_speedMult = 60;

myLog(3, "Wiki is very slow. Increasing minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n");

}

# If the response time is between 10 and 20 seconds...

elsif (($attemptEndTime - $attemptStartTime) > 10)

{

$Pearle::_speedMult = 6;

myLog(3, "Wiki is somewhat slow. Setting minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n");

}

# If the response time is less than 10 seconds...

else

{

if ($Pearle::_speedMult != 1)

{

$Pearle::_speedMult = 1;

myLog(3, "Returning to normal minimum wait time.\n");

}

}

}

}

  1. Check out a page for editing.

sub getPage

{

die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

my ($target, $xml, $parsed_xml, %query_params, $text, $editTime, $startTime, $token);

$target = $_[0];

if ($target =~ m/^\s*$/)

{

myLog(0, "getPage: Null target.");

die("902 getPage: Null target.");

}

$query_params{prop} = ['revisions', 'info'];

$query_params{rvprop} = ['timestamp', 'content'];

$query_params{inprop} = 'protection';

$query_params{intoken} = 'edit';

$query_params{titles} = $target;

$xml = APIQuery(%query_params);

if(!defined($xml))

{

myLog(0, "Unknown error requesting page contents\n");

die "900 Unknown error requesting page contents";

}

$parsed_xml = $Pearle::xml_parser->XMLin($xml);

myLog(4, Dumper($parsed_xml));

$xml = undef;

# Check for errors

if(exists($parsed_xml->{query}->{pages}->{page}->{invalid}))

{

myLog(0, "Invalid page title: $target\n");

die("902 Invalid page title: $target");

}

# See if the page is blank

if(!$Pearle::nullOK && exists($parsed_xml->{query}->{pages}->{page}->{missing}))

{

myLog (1, "Empty page: $target\n");

if (!$Pearle::roughMode)

{

die ("920 Empty page: $target\n");

}

}

$text = $parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}->{content};

$startTime = $parsed_xml->{query}->{pages}->{page}->{starttimestamp};

$editTime = $parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}->{timestamp};

$token = $parsed_xml->{query}->{pages}->{page}->{edittoken};

return Pearle::WikiPage->new(text => $text, editTime => $editTime, startTime => $startTime, editToken => $token, title => $target);

}

sub getToken

{

die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

my ($target, $xml, $parsed_xml, %query_params);

$target = $_[0];

if ($target =~ m/^\s*$/)

{

myLog(0, "getPage: Null target.");

die("902 getPage: Null target.");

}

$query_params{prop} = ['info'];

$query_params{intoken} = 'edit';

$query_params{titles} = $target;

$query_params{inprop} = '';

$xml = APIQuery(%query_params);

$parsed_xml = $Pearle::xml_parser->XMLin($xml);

myLog(4, Dumper($parsed_xml));

$xml = undef;

# Check for errors

if(exists($parsed_xml->{query}->{pages}->{page}->{invalid}))

{

myLog(0, "Invalid page title: $target\n");

die("902 Invalid page title: $target");

}

# See if the page has been protected

if(exists($parsed_xml->{query}->{pages}->{page}->{protection}->{pr}))

{

myLog(0, "Page $target is protected\n");

die("925 Protected");

}

return $parsed_xml->{query}->{pages}->{page}->{edittoken};

}

  1. The common elements of postPage and appendToPage. Not for external consumption.

sub _editPage

{

die "903 Library in read-only mode!\n" if($Pearle::readonly);

my ($xml, $parsed_xml, $pageName, %params);

%params = @_;

$pageName = $params{title};

$params{assert} = 'user' if(!$Pearle::logoutOK);

EDITRETRY:

$xml = APIEdit(%params);

if(!defined($xml))

{

myLog(0, "Unknown error posting edit\n");

die "900 Unknown error posting edit";

}

$parsed_xml = $Pearle::xml_parser->XMLin($xml);

myLog(4, Dumper($parsed_xml));

$xml = undef;

# Check for errors

if(exists($parsed_xml->{edit}) && $parsed_xml->{edit}->{result} eq 'Failure' && $parsed_xml->{edit}->{assert} eq 'user')

{

myLog(0, "Wiki server is not recognizing me\n");

die ("921 Not logged in");

}

if(exists($parsed_xml->{error}))

{

if($parsed_xml->{error}->{code} eq 'blocked')

{

myLog(0, "Blocked\n");

die ("900 Blocked");

}

elsif($parsed_xml->{error}->{code} eq 'protectedpage' || $parsed_xml->{error}->{code} eq 'cascadeprotected')

{

myLog(0, "Page $pageName is protected\n");

die ("925 Protected");

}

elsif($parsed_xml->{error}->{code} eq 'pagedeleted')

{

myLog(0, "Deleted article conflict on $pageName\n");

die ("923 Deleted article conflict");

}

elsif($parsed_xml->{error}->{code} eq 'editconflict')

{

myLog(0, "Edit conflict on $pageName\n");

die ("922 Edit conflict on $pageName");

}

elsif($parsed_xml->{error}->{code} eq 'spamdetected')

{

myLog(0, "Spam link on $pageName: $parsed_xml->{error}->{info}\n");

die ("924 Spam filter");

}

elsif($parsed_xml->{error}->{code} eq 'readonly')

{

myLog(1, "Wiki is in readonly mode. Waiting before retry\n");

sleep 60;

goto EDITRETRY;

}

else

{

myLog(0, "Unexpected error. Code: $parsed_xml->{error}->{code} Info: $parsed_xml->{error}->{info}\n");

die ("920 Server error");

}

}

return $parsed_xml->{edit}->{result};

}

sub postPage

{

die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

my ($page, $pageName, $summaryEntry, $minor, $bot, $xml, %params, $parsed_xml);

$page = $_[0];

$summaryEntry = $_[1];

$minor = $_[2];

$bot = $_[3];

$bot = 1 if(!defined($bot));

if(!defined($minor))

{

myLog(0, "postPage(): Not enough parameters.\n");

die "902 postPage(): Not enough parameters!\n";

}

if(!$page->isa("Pearle::WikiPage"))

{

myLog(0, "postPage(): First parameter is not a WikiPage object\n");

die "902 postPage(): First parameter is not a WikiPage object\n";

}

if ($summaryEntry eq "")

{

myLog(0, "postPage(): No edit summary provided\n");

die "902 postPage(): No edit summary provided\n";

}

return "Success" if($Pearle::testmode);

$pageName = $page->getTitle();

$params{title} = $pageName;

$params{text} = $page->getWikiText();

$params{token} = $page->getEditToken();

$params{summary} = $summaryEntry;

$params{minor} = 1 if($minor);

$params{starttimestamp} = $page->getStartTime();

$params{basetimestamp} = $page->getEditTime();

$params{bot} = 1 if($bot);

return _editPage(%params);

}

sub appendToPage

{

die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

my ($page, $token, $text, $summary, $minor, $bot, %params, $xml, $parsed_xml);

$page = $_[0];

$token = $_[1];

$text = $_[2];

$summary = $_[3];

$minor = $_[4];

$bot = $_[5];

$bot = 1 if(!defined($bot));

if(!defined($minor))

{

myLog(0, "appendToPage(): Not enough parameters.\n");

die "902 appendToPage(): Not enough parameters!\n";

}

return "Success" if($Pearle::testmode);

$params{title} = $page;

$params{appendtext} = $text;

$params{token} = $token;

$params{summary} = $summary;

$params{minor} = 1 if($minor);

$params{bot} = 1 if($bot);

return _editPage(%params);;

}

  1. Get a list of the contents in a given category, filtered by namespace

sub getCategoryContents

{

die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

my ($target, $category_offset, @articles, $xml, $parsed_xml, %query_params,

$numberOfArticles, @namespaces);

$target = shift;

@namespaces = @_;

# Category: prefix is mandatory

if($target !~ /^[Cc]ategory:/)

{

$target = "Category:" . $target;

}

$query_params{list} = 'categorymembers';

$query_params{cmprop} = 'title';

$query_params{cmtitle} = $target;

$query_params{cmlimit} = 500; # If you're a flagged bot, this could be 5000, but we default to 500 for compatibility and to keep memory usage down

$query_params{rawcontinue} = "";

foreach my $namespace (@namespaces)

{

$query_params{cmnamespace} .= "${namespace}|";

}

if(exists($query_params{cmnamespace}) and defined($query_params{cmnamespace}))

{

chop $query_params{cmnamespace};

}

do

{

$xml = APIQuery(%query_params);

if(!defined($xml))

{

myLog(0, "Unknown error accessing wiki\n");

die "900 Unknown error accessing wiki";

}

$parsed_xml = $Pearle::xml_parser->XMLin($xml, ForceArray => ['cm']);

myLog(4, Dumper($parsed_xml));

$xml = undef;

if(exists($parsed_xml->{query}->{categorymembers}->{cm}) and defined($parsed_xml->{query}->{categorymembers}->{cm}))

{

my @set_articles = map {$_->{title}} @{$parsed_xml->{query}->{categorymembers}->{cm}};

push @articles, @set_articles;

}

if(exists($parsed_xml->{'query-continue'}->{categorymembers}->{cmcontinue}))

{

$category_offset = $parsed_xml->{'query-continue'}->{categorymembers}->{cmcontinue};

$category_offset =~ s/&/%26/;

$query_params{cmcontinue} = $category_offset;

}

else

{

$category_offset = undef;

}

sleep (1); # Throttle GETs

}

while(defined($category_offset));

$numberOfArticles = scalar(@articles);

myLog(4, "Got $numberOfArticles articles.\n");

return @articles;

}

sub getCategoryArticles

{

return getCategoryContents($_[0], 0); # Namespace 0: Articles

}

sub getCategoryImages

{

return getCategoryContents($_[0], 6); #Namespace 6: Images

}

sub getSubcategories

{

return getCategoryContents($_[0], 14); # Namespace 14: Categories

}

sub getPageImages

{

die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

my ($target, $list_offset, @images, $xml, $parsed_xml, %query_params);

$target = shift;

die "902 No article provided for getPageImages\n" if(!defined($target));

$query_params{prop} = "images";

$query_params{titles} = $target;

$query_params{imlimit} = 500;

$query_params{rawcontinue} = "";

do

{

$xml = APIQuery(%query_params);

if(!defined($xml))

{

myLog(0, "Unknown error accessing wiki\n");

die "900 Unknown error accessing wiki";

}

$parsed_xml = $Pearle::xml_parser->XMLin($xml, ForceArray => ['im']);

myLog(4, Dumper($parsed_xml));

$xml = undef;

if(exists($parsed_xml->{query}->{pages}->{page}->{images}->{im}) and defined($parsed_xml->{query}->{pages}->{page}->{images}->{im}))

{

my @set_images = map {$_->{title}} @{$parsed_xml->{query}->{pages}->{page}->{images}->{im}};

push @images, @set_images;

}

if(exists($parsed_xml->{'query-continue'}->{images}->{imcontinue}))

{

$list_offset = $parsed_xml->{'query-continue'}->{images}->{imcontinue};

$list_offset =~ s/&/%26/;

$query_params{imcontinue} = $list_offset;

}

else

{

$list_offset = undef;

}

sleep (1); # Throttle GETs

}

while(defined($list_offset));

return @images;

}

  1. Get up to $max most recent articles edited by a user

sub getUserArticles

{

die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

my ($url, $request, $response, $reply, @contribs,

$target, $namespace, $max, $offset);

$target = $_[0];

$max = $_[1];

$offset = $_[2];

$namespace = namespaceToNumber($_[3]);

# Create a request-object

if(defined($namespace))

{

$url = "${Pearle::wiki}index.php?title=Special%3AContributions&limit=${max}&offset=${offset}&target=${target}&namespace=$namespace";

}

else

{

$url = "${Pearle::wiki}index.php?title=Special%3AContributions&limit=${max}&offset=${offset}&target=${target}";

}

myLog(3, "GET $url\n");

$request = HTTP::Request->new(GET => "$url");

$response = startRetry(\&httpRequest, $request);

$reply = $response->decoded_content;

# This detects whether or not we're logged in.

checkLogin($reply);

# Extract the contributions

#

  • 23:18, 6 March 2006 (

    while($reply =~ /

  • $Pearle::regex_timestamp_nc \($Pearle::regex_bluepagelink/g)

    {

    push @contribs, $1;

    }

    # Remove duplicates

    1. @contribs = uniquify(@contribs);

    return @contribs;

    }

    1. Gets a list of (page, id, namespace) tuples
    2. Takes the following named parameters:
    3. prefix: Filter to only include pages that start with this string
    4. namespace: a reference to a list of namespaces to get pages from. If not provided, gets only pages from namespace 0.
    5. redirects: one of "yes", "no", "both"

    sub getPageList

    {

    die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

    my %params = @_;

    my %base_query_params = (list => 'allpages', aplimit => 500);

    my %full_query_params;

    my @namespaces;

    my @articles = ();

    foreach my $key (keys(%params))

    {

    if($key eq 'prefix')

    {

    $base_query_params{apprefix} = $params{prefix};

    }

    elsif($key eq 'redirects')

    {

    if($params{redirects} eq 'both')

    {

    $base_query_params{apfilterredir} = "all";

    }

    elsif($params{redirects} eq 'yes')

    {

    $base_query_params{apfilterredir} = "redirects";

    }

    elsif($params{redirects} eq 'no')

    {

    $base_query_params{apfilterredir} = "nonredirects";

    }

    else

    {

    myLog(2, "Unrecognized redirect option in getPageList: $params{redirects}. Ignoring.\n");

    }

    }

    elsif($key eq 'namespace')

    {

    if(ref($params{namespace}) eq 'ARRAY')

    {

    @namespaces = @{$params{namespace}};

    }

    elsif(!ref($params{namespace}))

    {

    push @namespaces, $params{namespace};

    }

    else

    {

    myLog(0, "Namespace list in getPageList must be a scalar or an array reference\n");

    die("902 Namespace list in getPageList must be a scalar or an array reference");

    }

    }

    }

    push @namespaces, 0 if(!scalar(@namespaces));

    foreach my $namespace (@namespaces)

    {

    %full_query_params = %base_query_params;

    $full_query_params{apnamespace} = $namespace;

    my $offset;

    do

    {

    my $xml = APIQuery(%full_query_params);

    if(!defined($xml))

    {

    myLog(0, "Unknown error accessing wiki\n");

    die "900 Unknown error accessing wiki";

    }

    my $parsed_xml = $Pearle::xml_parser->XMLin($xml, ForceArray => ['p']);

    myLog(4, Dumper($parsed_xml));

    $xml = undef;

    if(exists($parsed_xml->{query}->{allpages}->{p}) and defined($parsed_xml->{query}->{allpages}->{p}))

    {

    push(@articles, map({[$_->{title}, $_->{ns}]} @{$parsed_xml->{query}->{allpages}->{p}}));

    }

    if(exists($parsed_xml->{'query-continue'}->{allpages}->{apfrom}))

    {

    $offset = $parsed_xml->{'query-continue'}->{allpages}->{apfrom};

    $offset =~ s/&/%26/;

    $full_query_params{apfrom} = $offset;

    }

    else

    {

    $offset = undef;

    }

    sleep (1); # Throttle GETs

    }

    while(defined($offset));

    }

    return @articles;

    }

    1. Gets a list of (articles, actor, summary, timestamp) tuples from the specified log (upload, delete, move, protect). The list is sorted by timestamp
    2. with the newest entry first
    3. Takes the following named parameters:
    4. user: Filter "actor" to include only actions by this user
    5. log: Filter to include only actions in this log (upload, delete, move, protect).
    6. limit: Include this many items. Defaults to 50 items.
    7. time: Start checking the log at this time. Timestamp in ISO 8601 format.
    8. dir: Check the log in this direction (newer or older) from the timestamp

    sub getLogArticles

    {

    die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

    my %params = @_;

    my %query_params = (list => 'logevents', lelimit => 50);

    my @articles = ();

    foreach my $key (keys(%params))

    {

    if($key eq 'user')

    {

    $query_params{leuser} = $params{user};

    $query_params{leuser} =~ s/^User://i; # Strip namespace prefix, if it's there.

    }

    elsif($key eq 'log')

    {

    $query_params{letype} = $params{log};

    }

    elsif($key eq 'limit')

    {

    $query_params{lelimit} = $params{limit};

    }

    elsif($key eq 'time')

    {

    $query_params{lestart} = $params{time};

    }

    elsif($key eq 'dir')

    {

    $query_params{ledir} = $params{dir};

    }

    else

    {

    myLog(2, "Error: Unknown parameter $key in getLogArticles\n");

    }

    }

    my $xml = APIQuery(%query_params);

    if(!defined($xml))

    {

    myLog(0, "Unknown error accessing wiki\n");

    die "920 Unknown error accessing wiki";

    }

    my $parsed_xml = $Pearle::xml_parser->XMLin($xml, ForceArray => ['item']);

    myLog(4, Dumper($parsed_xml));

    $xml = undef;

    if(exists($parsed_xml->{query}->{logevents}->{item}) and defined($parsed_xml->{query}->{logevents}->{item}))

    {

    1. foreach my $item (@{$parsed_xml->{query}->{logevents}->{item}})
    2. {
    3. push @articles, [$item->{title}, $item->{user}, $item->{comment}, $item->{timestamp}];
    4. }

    @articles = map {[$_->{title}, $_->{user}, $_->{comment}, $_->{timestamp}]} @{$parsed_xml->{query}->{logevents}->{item}};

    $parsed_xml = undef;

    @articles = uniquify_ref(0, @articles);

    @articles = sort {$b->[3] cmp $a->[3]} @articles;

    }

    return @articles;

    }

    1. Gets a list of all files at Special:UncategorizedFiles
    2. Uses screen-scraping because there isn't an API call for this. May not work for non-Wikipedia wikis.

    sub getUncatFiles

    {

    my $offset = shift || 0;

    my @files = ();

    my $content = getURL("${Pearle::wiki}index.php?title=Special:UncategorizedFiles&limit=500&offset=$offset");

    if($content !~ /There are no results for this report./)

    {

    while($content =~ /

    ]*href="\/wiki\/([^"]*)"/g)

    {

    my $file = decode('utf8', urlDecode($1));

    myLog(4, "Found file $file\n");

    push @files, $file;

    }

    if($content =~ /class="mw-nextlink"/)

    {

    myLog(4, "More files\n");

    push @files, getUncatFiles($offset + 500);

    }

    }

    return @files;

    }

    sub fixupURLFragment

    {

    my $url_fragment = shift;

    $url_fragment =~ s/%/%25/g;

    $url_fragment =~ s/&/%26/g;

    $url_fragment =~ s/\+/%2B/g;

    $url_fragment =~ s/#/%23/g;

    $url_fragment =~ s/\?/%3F/g;

    $url_fragment =~ s/\\/%5C/g;

    return $url_fragment;

    }

    1. Use the api.php interface to query the wiki using a GET request
    2. Takes a hash of parameter,value pairs
    3. Returns raw the XML blob from the wiki, or undef on error

    sub APIQuery

    {

    my %params = @_;

    my $url = "${Pearle::wiki}api.php?action=query&format=xml";

    my $reply = undef;

    foreach my $key (keys(%params))

    {

    my $val;

    if(ref($params{$key}) eq 'ARRAY') # We've got a list of values

    {

    $val = join '|', @{$params{$key}};

    }

    else

    {

    $val = $params{$key};

    }

    $val =~ s/ /_/g if($key eq 'titles');

    $val = fixupURLFragment($val);

    $key = fixupURLFragment($key);

    $url .= "&${key}=$val";

    }

    myLog(3, "API query: $url\n");

    $url = encode("utf8", $url);

    my $request = HTTP::Request->new(GET => "$url");

    $reply = _APIguts($request);

    return $reply;

    }

    1. Use the api.php interface to query the wiki using a POST request
    2. Takes a hash of parameter,value pairs
    3. Returns raw the XML blob from the wiki (possibly including an API error message), or undef on server error

    sub APIPost

    {

    my %params = @_;

    my $url = "${Pearle::wiki}api.php?format=xml";

    my $data = "";

    my $reply = undef;

    $data = join('&', map {fixupURLFragment($_) . "=" . fixupURLFragment((ref($params{$_}) eq 'ARRAY')?(join '|', @{$params{$_}}):($params{$_}))} keys(%params));

    myLog(3, "API URL: $url\n");

    myLog(3, "API query: $data\n");

    my $request = HTTP::Request->new('POST', $url,

    HTTP::Headers->new(Content_Type => "application/x-www-form-urlencoded"), encode("utf8", $data));

    $reply = _APIguts($request);

    return $reply;

    }

    1. Use the api.php interface to query the wiki
    2. Takes a hash of parameter,value pairs
    3. Returns raw the XML blob from the wiki (possibly including an API error message), or undef on server error

    sub APIEdit

    {

    my %params = @_;

    my $url = "${Pearle::wiki}api.php?action=edit&format=xml";

    my $data = "";

    my $reply = undef;

    $data = join('&', map {fixupURLFragment($_) . "=" . fixupURLFragment((ref($params{$_}) eq 'ARRAY')?(join '|', @{$params{$_}}):($params{$_}))} keys(%params));

    myLog(3, "API query: $data\n");

    my $request = HTTP::Request->new('POST', $url,

    HTTP::Headers->new(Content_Type => "application/x-www-form-urlencoded"), encode("utf8", $data));

    $reply = _APIguts($request);

    return $reply;

    }

    1. The common elements of APIQuery and APIPost. Not for external consumption.

    sub _APIguts

    {

    my $request = shift;

    die "902 request is not an HTTP::Request object" if(!$request->isa('HTTP::Request'));

    my $reply;

    APIretry:

    my $response = startRetry(\&httpRequest, $request);

    if($response->is_success)

    {

    $Pearle::ua->cookie_jar->save();

    $reply = $response->decoded_content;

    if(!defined($reply))

    {

    myLog(1, "Failed to decode response\n");

    #return undef;

    $reply = decode("utf8", $response->content);

    }

    if($reply =~ /

    {

    # Errors that can be fixed by trying again later

    if($1 eq 'internal_api_error_DBConnectionError')

    {

    myLog(1, "Error $1 querying server. Retrying after 60 seconds.\n");

    sleep(60);

    goto APIretry;

    }

    else

    {

    # Format error

    myLog(1, "Error $1 querying server\n");

    }

    }

    }

    else

    {

    myLog(1, "HTTP error accessing server\n");

    $reply = undef;

    }

    return $reply;

    }

    1. Use the Special:Export interface to get the wikitext of one or more articles

    sub Export

    {

    my ($request, $response, $reply, $articles);

    $articles = join "\n", @_;

    $request = POST "${Pearle::wiki}index.php?title=Special:Export&action=submit", [action => 'submit', pages => $articles, curonly => 1];

    $response = startRetry(\&httpRequest, $request);

    $reply = $response->decoded_content;

    return $reply;

    }

    1. Get the history of an article as a set of (id, timestamp, user, comment, minor) or (id, timestamp, user, comment, minor, content) tuples.
    2. Takes the following named parameters:
    3. title: the title of the article to retrieve. Mandatory.
    4. content: a boolean indicating if article content should be retrieved.
    5. limit: the maximum number of revisions to fetch. Must be smaller than the wiki's limit on how many revisions can be fetched at one time.

    sub getArticleHistory

    {

    die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

    my %params = @_;

    my @history;

    my $offset;

    my $title = $params{title};

    my $content = $params{content};

    $content = 0 if(!defined($content));

    my $limit = $params{limit};

    die "902 Must specify title when calling getArticleHistory()" if(!defined($title));

    my %query_params = (prop => 'revisions', rvlimit => 500, rvprop => ['ids', 'timestamp', 'user', 'comment', 'flags']);

    $query_params{rvprop} = ['ids', 'timestamp', 'user', 'comment', 'flags', 'content' ] if($content);

    $query_params{titles} = $title;

    $query_params{rvlimit} = $limit if(defined($limit));

    do

    {

    my $xml = APIQuery(%query_params);

    if(!defined($xml))

    {

    myLog(0, "Unknown error accessing wiki\n");

    die "920 Unknown error accessing wiki";

    }

    my $parsed_xml = $Pearle::xml_parser->XMLin($xml, ForceArray => ['rev']);

    myLog(4, Dumper($parsed_xml));

    $xml = undef;

    if(exists($parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}) and defined($parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}))

    {

    foreach my $item (@{$parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}})

    {

    my $comment = $item->{comment};

    $comment = "" if(!defined($comment));

    my $minor = 0;

    $minor = 1 if(exists($item->{minor}) and defined($item->{minor}));

    if($content)

    {

    push @history, [$item->{revid}, $item->{timestamp}, $item->{user}, $comment, $minor, $item->{content}];

    }

    else

    {

    push @history, [$item->{revid}, $item->{timestamp}, $item->{user}, $comment, $minor];

    }

    }

    }

    if(!defined($limit) and exists($parsed_xml->{'query-continue'}->{revisions}->{rvstartid}))

    {

    $offset = $parsed_xml->{'query-continue'}->{revisions}->{rvstartid};

    $offset =~ s/&/%26/;

    $query_params{rvstartid} = $offset;

    sleep(10);

    }

    else

    {

    $offset = undef;

    }

    }

    while(defined($offset));

    return @history;

    }

    sub getURL #($target)

    {

    die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

    # Read throttle!

    sleep (1);

    my ($request, $response, $reply, $url);

    $url = $_[0];

    # Create a request-object

    myLog(3, "GET ${url}\n");

    $request = HTTP::Request->new(GET => "${url}");

    $response = startRetry(\&httpRequest, $request);

    $reply = $response->decoded_content;

    # This may or may not actually work

    $Pearle::ua->cookie_jar->save();

    return ($reply);

    }

    1. Retries a given function repeatedly, with an exponential backoff rate
    2. The function should throw an exception beginning with "retry:" (case insensitive) if the call should be retried

    sub startRetry

    {

    my ($call_fn, @args) = @_;

    return retry($Pearle::speedLimit, $call_fn, @args);

    }

    sub retry

    {

    my ($call_fn, @args, $delay, @result, $result);

    ($delay, $call_fn, @args) = @_;

    if(wantarray())

    {

    @result = eval{ $call_fn->(@args) };

    if($@ =~ /^retry:/i)

    {

    limit($delay);

    @result = retry($delay * 2, $call_fn, @args);

    }

    elsif($@)

    {

    die;

    }

    return @result;

    }

    else

    {

    $result = eval{ &{$call_fn}(@args) };

    if($@ =~ /^retry:/i)

    {

    limit($delay);

    $result = retry($delay * 2, $call_fn, @args);

    }

    elsif($@)

    {

    die;

    }

    return $result;

    }

    }

    sub initNamespaceList

    {

    die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

    my $xml = APIQuery(meta => 'siteinfo', siprop => 'namespaces');

    if(!defined($xml))

    {

    myLog(0, "Unknown error accessing wiki\n");

    die "920 Unknown error accessing wiki";

    }

    my $parsed_xml = $Pearle::xml_parser->XMLin($xml, KeyAttr => []);

    @Pearle::namespaces = map({[$_->{id}, $_->{content}]} grep({$_->{id} >= 0} @{$parsed_xml->{query}->{namespaces}->{ns}}));

    }

    sub getNamespaceNames

    {

    return map {$_->[1]} @Pearle::namespaces;

    }

    sub getNamespaceNumbers

    {

    return map {$_->[0]} @Pearle::namespaces;

    }

    sub namespaceToNumber

    {

    my $namespace = $_[0];

    my $i = 0;

    my $name;

    if(scalar(@Pearle::namespaces) > 0)

    {

    if(defined($namespace))

    {

    my @val = grep( {lc($_->[1]) eq lc($namespace)} @Pearle::namespaces);

    return undef if(!scalar(@val));

    return $val[0]->[0];

    }

    else

    {

    return undef;

    }

    }

    else

    {

    myLog(1, "Namespace array not initialized\n");

    return undef;

    }

    }

    sub numberToNamespace

    {

    my $i = shift;

    if(scalar(@Pearle::namespaces) > 0)

    {

    if(defined($i))

    {

    my @val = grep( {$_->[0] == $i} @Pearle::namespaces);

    return undef if(!scalar(@val));

    return $val[0]->[1];

    }

    else

    {

    return undef;

    }

    }

    else

    {

    myLog(1, "Namespace array not initialized\n");

    return undef;

    }

    }

    1. Translate from HTTP URL encoding to the native character set.

    sub urlDecode

    {

    my ($input);

    $input = $_[0];

    $input =~ s/\%([a-f|A-F|0-9][a-f|A-F|0-9])/chr(hex($1))/eg;

    return ($input);

    }

    sub decodeArray

    {

    return map {urlDecode($_)} @_;

    }

    1. Remove duplicates from a list

    sub uniquify

    {

    my @list = @_;

    @list = sort @list;

    my $last = undef;

    my @new_list;

    my $item;

    foreach $item (@list)

    {

    push @new_list, $item if(!defined($last) or ($item ne $last));

    $last = $item;

    }

    return @new_list;

    }

    1. Remove duplicates from a list of array references, grouping on the specified subelement

    sub uniquify_ref

    {

    my $element = shift;

    my @list = @_;

    @list = sort {$a->[$element] cmp $b->[$element]} @list;

    my $last = undef;

    my @new_list;

    my $item;

    foreach $item (@list)

    {

    push @new_list, $item if(!defined($last) or ($item->[$element] ne $last));

    $last = $item->[$element];

    }

    return @new_list;

    }

    1;