User:OrphanBot/libPearle2.pl

### IMPORTANT ###
# This code is released into the public domain.

### RECENT CHANGES ###
# 30 Nov 2005: Created, based on the 12 Nov 2005 version of Pearle Wisebot
# 15 Feb 2006: Modified "retry" to work with any function that signals failure by dying; switched to a simple exponential backoff formula
#              Simplified the "limit" code and modified it to take an optional parameter
#              Added the "config" function as a clean interface for changing internal parameters
#              Modified the wiki-access functions for use with the new "retry" function
#              Cleaned up the boolean config vars to use standard Perl boolean conventions
# 28 Feb 2006: Added the checkLogin bottleneck and an option to allow editing while logged out
#              Added support for proxy servers
# 8 Mar 2006:  Added support for getting a user's contributions
#              Added support for retrieving logs
#              Separated out some common regex parts into variables
# 29 Mar 2006: Added protection against Unicode in URLs
#              Made thrown exceptions consistent
#              Added sanity checking to postPage: talkpage without an article, userpage or user talkpage without a user
# 17 May 2006: Improved log retrieval
# 12 Jul 2007: Added timestamps to the information retrieved from logs

# Errors thrown by this package always begin with a three-digit number:
# 4xx: HTTP client errors
# 505: Server error: HTTP version not supported
# 509: Server error: Bandwidth exceeded
# 900: Unspecified internal error
# 901: Library not initialized. You didn't call Pearle::init() before calling this function.
# 902: Parameter error. You made a function call but forgot a mandatory parameter, or provided an invalid one.
# 920: Unexpected response. The MediaWiki site returned something unexpected.
# 921: Unexpected logout. The MediaWiki site logged us out unexpectedly.
# 922: Edit conflict. Someone edited the article while we were editing it.
# 923: Deleted article conflict. Someone deleted the article while we were editing it.
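
# Example of catching these errors from a calling script (an illustrative
# sketch, not part of the original library; "Some article" is a placeholder):
#
#	eval {
#		my ($text, $editTime, $startTime, $token) = Pearle::getPage("Some article");
#	};
#	if ($@ =~ /^921/)		# Unexpected logout
#	{
#		Pearle::login();
#	}
#	elsif ($@)
#	{
#		die $@;			# Any other error is fatal to this bot
#	}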

package Pearle;

use strict;

use warnings;

use Time::HiRes;

use utf8;

use LWP::UserAgent;

use HTTP::Cookies;

use HTTP::Request::Common qw(POST);

use HTML::Entities;

# Standard regex parts

$Pearle::regex_timestamp = '(\d\d):(\d\d), (\d\d?) (\w+) (\d\d\d\d)'; # Match and capture a Wikipedia timestamp

$Pearle::regex_timestamp_nc = '\d\d:\d\d, \d\d? \w+ \d\d\d\d'; # Match a Wikipedia timestamp

$Pearle::regex_pagelink = ''; # Match and capture any page

$Pearle::regex_redpagelink = ''; # Match and capture nonexistant pages only

$Pearle::regex_bluepagelink = ''; # Match and capture existing pages only

$Pearle::regex_pagelink_nc = ''; # Match any page

$Pearle::regex_redpagelink_nc = ''; # Match nonexistant pages only

$Pearle::regex_bluepagelink_nc = ''; # Match existing pages only

# Standard MediaWiki namespaces

@Pearle::namespaces = ("", "Talk", "User", "User talk", "Wikipedia", "Wikipedia talk", "Image", "Image talk", "MediaWiki", "MediaWiki talk", "Template", "Template talk", "Help", "Help talk", "Category", "Category talk");

$Pearle::logfile = "";

$Pearle::_inited = 0;

$Pearle::username = "";

$Pearle::password = "";

$Pearle::speedLimit = 10; # Seconds to wait by default when limit() is called

$Pearle::_speedMult = 1; # Multiplier for default wait time if the wiki is being slow

$Pearle::roughMode = 0; # Ignore most errors

$Pearle::nullOK = 0; # Permit editing non-existent pages

$Pearle::sanityCheck = 0; # Sanity checking on edits

$Pearle::silent = 0; # Silent mode

$Pearle::quiet = 0; # Quiet mode

$Pearle::logoutOK = 0; # Permit editing while logged out

$Pearle::proxy = undef; # Proxy to use

# This must be the first library function called

sub init

{

$Pearle::username = $_[0] or die("902 No username provided!\n");

$Pearle::password = $_[1] or die("902 No password provided!\n");

$Pearle::logfile = $_[2] or die("902 No logfile name provided!\n");

$Pearle::cookies = $_[3] or die("902 No cookie file provided!\n");

$Pearle::useragent = $_[4] or $Pearle::useragent = "PearleLib/0.2";

$Pearle::ua = LWP::UserAgent->new(timeout => 300);

$Pearle::ua->agent($Pearle::useragent);

$Pearle::ua->cookie_jar(HTTP::Cookies->new(file => $Pearle::cookies, autosave => 1));

$Pearle::ua->cookie_jar->load();

$Pearle::roughMode = 0;	# Standard Perl boolean convention: false = stop on most errors

# Hot pipes

$| = 1;

$Pearle::_inited = 1;

}
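
# Example startup sequence (illustrative sketch; the account name, password,
# and file names are placeholders, and the user-agent argument is optional):
#
#	Pearle::init("ExampleBot", "secret", "examplebot.log", "examplebot.cookies", "ExampleBot/1.0");
#	Pearle::login() or die "900 Could not log in\n";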

sub config

{

my %params = @_;

$Pearle::speedLimit = $params{speedLimit} if(defined($params{speedLimit}));

$Pearle::roughMode = $params{roughMode} if(defined($params{roughMode}));

$Pearle::nullOK = $params{nullOK} if(defined($params{nullOK}));

$Pearle::silent = $params{silent} if(defined($params{silent}));

$Pearle::quiet = $params{quiet} if(defined($params{quiet}));

$Pearle::logfile = $params{logfile} if(defined($params{logfile}));

$Pearle::logoutOK = $params{logoutOK} if(defined($params{logoutOK}));

$Pearle::sanityCheck = $params{sanityCheck} if(defined($params{sanityCheck}));

if(exists($params{proxy}))

{

if(defined($params{proxy}))

{

myPrint("Proxying: $params{proxy}\n");

myLog("Proxying: $params{proxy}\n");

$Pearle::ua->proxy('http', $params{proxy});

$Pearle::proxy = $params{proxy};

}

else

{

myPrint("Not proxying\n");

myLog("Not proxying\n");

$Pearle::ua->no_proxy();

$Pearle::proxy = undef;

}

}

}
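
# Example configuration call (illustrative sketch; the proxy address is a
# placeholder, and any parameter not passed keeps its current value):
#
#	Pearle::config(speedLimit => 30,		# wait at least 30 seconds between edits
#	               sanityCheck => 1,
#	               proxy => "http://127.0.0.1:3128/");
#	Pearle::config(proxy => undef);			# pass undef to stop proxying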

sub myLog

{

die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

open (LOG, ">>", $Pearle::logfile)

|| die "900 Could not append to log!";

print LOG $_[0];

close (LOG);

}

sub myPrint

{

return if($Pearle::silent);

return if($Pearle::quiet);

print @_;

}

sub myErrPrint

{

return if($Pearle::silent);

return if($Pearle::quiet);

print STDERR @_;

}

# Rate-limiting. Can be sensibly run even if libPearle isn't initialized

sub limit

{

my ($i);

$i = ($_[0] or ($Pearle::speedLimit * $Pearle::_speedMult));

$i = 10 if($i < 10);

# Rate-limiting to avoid hosing the wiki server

# Min 30 sec unmarked

# Min 10 sec marked

# May be raised by retry() if load is heavy

### ATTENTION ###

# Increasing the speed of the bot to faster than 1 edit every 10

# seconds violates English Wikipedia rules as of April, 2005, and

# will cause your bot to be banned. So don't change $Pearle::speedLimit

# unless you know what you are doing. Other sites may have

# similar policies, and you are advised to check before using your

# bot at the default speed.

#################

while ($i >= 0)

{

sleep (1);

myErrPrint("Sleeping $i seconds...\r");

$i--;

}

myErrPrint(" \r");

}
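
# Example: call limit() between edits. With no argument it waits for the
# configured speedLimit (scaled up when the wiki is slow); an argument gives
# an explicit number of seconds, with a 10-second floor:
#
#	Pearle::limit();	# default wait
#	Pearle::limit(60);	# wait about a minute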

sub login

{

die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

my $res = $Pearle::ua->post(

"http://en.wikipedia.org/w/wiki.phtml?title=Special:Userlogin&action=submitlogin",

Content => [

wpName => $Pearle::username,

wpPassword => $Pearle::password,

wpRemember => 1,

wpLoginAttempt => 1

]

);

if( 302 == $res->code )

{

myPrint("Logged in as $Pearle::username\n");

myLog("Logged in as $Pearle::username\n");

# This may or may not actually work

$Pearle::ua->cookie_jar->save();

return 1;

}

else

{

myPrint("Login failed\n");

myPrint("Code: ".$res->code."\n");

myLog("Login failed\n");

return 0;

}

}

sub logout {

my $res = $Pearle::ua->post(

"http://en.wikipedia.org/w/wiki.phtml?title=Special:Userlogout",

);

return 1;

}

sub checkLogin

{

my ($reply_text);

$reply_text = $_[0];

if ($reply_text !~ m/>My talk<\/a>/ and !($Pearle::logoutOK))

{

# We've lost our identity.

myLog ("Wiki server is not recognizing me.\n");

die ("921 Wiki server is not recognizing me.\n");

}

}

# Make an HTTP request, performing basic error checking and handling. Suitable for use with the "retry" function

sub httpRequest

{

my ($request, $response, $attemptStartTime, $attemptEndTime);

$request = $_[0];

# Since not every server handles UTF-8 in URLs, and LWP doesn't escape them properly, escape every character > 255

$request->uri(unicodeToUrl($request->uri()));

# Monitor wiki server responsiveness by timing the request
$attemptStartTime = Time::HiRes::time();
$response = $Pearle::ua->request($request);
$attemptEndTime = Time::HiRes::time();

if ($response->is_success or $response->is_redirect)
{
	# Adjust the rate limit according to how long a POST took to complete
	if ($request->method() eq "POST")
	{
		if (($attemptEndTime - $attemptStartTime) > 20)
		{
			$Pearle::_speedMult = 60;
			myPrint("Wikipedia is very slow. Increasing minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n");
			myLog("Wikipedia is very slow. Increasing minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n");
		}
		# If the response time is between 10 and 20 seconds...
		elsif (($attemptEndTime - $attemptStartTime) > 10)
		{
			$Pearle::_speedMult = 6;
			myPrint("Wikipedia is somewhat slow. Setting minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n");
			myLog("Wikipedia is somewhat slow. Setting minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n");
		}
		# If the response time is less than 10 seconds...
		else
		{
			if ($Pearle::_speedMult != 1)
			{
				$Pearle::_speedMult = 1;
				myPrint("Returning to normal minimum wait time.\n");
				myLog("Returning to normal minimum wait time.\n");
			}
		}
	}
	return $response;
}
else
{
	myLog ("HTTP ERR (".$response->status_line.")\n".$response->content."\n");
	myPrint("HTTP ERR (".$response->status_line.")\n".$response->content."\n");
	# 500-504 HTTP errors mean there is a problem connecting to the wiki server; they can be remedied by waiting and trying again
	if (500 <= $response->code and 504 >= $response->code)
	{
		die("retry:".$response->status_line);
	}
	else
	{
		# Unhandled HTTP response. Waiting probably won't fix it
		die ($response->status_line."\n");
	}
}
}

# Check out a page for editing.

sub getPage

{

die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

my ($target, $request, $response, $reply, $text, $text2,

$editTime, $startTime, $attemptStartTime, $attemptFinishTime,

$token, $targetSafe);

$target = $_[0];

if ($target =~ m/^\s*$/)

{

myLog("getPage: Null target.");

die("902 getPage: Null target.");

}

$targetSafe = $target;

$targetSafe =~ s/\&/%26/g;

$targetSafe =~ s/\+/%2B/g;

# Create a request-object

myPrint("GET http://en.wikipedia.org/w/wiki.phtml?title=${targetSafe}&action=edit\n");

myLog("GET http://en.wikipedia.org/w/wiki.phtml?title=${targetSafe}&action=edit\n");

$request = HTTP::Request->new(GET => "http://en.wikipedia.org/w/wiki.phtml?title=${targetSafe}&action=edit");

$response = startRetry(\&httpRequest, $request);

$reply = $response->content;

# This detects whether or not we're logged in.

checkLogin($reply);

# Check for blocking

if($reply =~ /User is blocked<\/h1>/)

{

myLog("Blocked\n");

die("900 Blocked");

}

$reply =~ m%<textarea[^>]*>(.*?)</textarea>%s;	# Capture the wikitext from the edit form's textarea

$text = $1;

$reply =~ m/value="(\d+)" name="wpEdittime"/;

$editTime = $1;

# Added 22 Aug 2005 to correctly handle articles that have

# been undeleted

$reply =~ m/value="(\d+)" name="wpStarttime"/;

$startTime = $1;

# Added 9 Mar 2005 after recent software change.

$reply =~ m/value="([^"]+)" name="wpEditToken"/;

$token = $1;

###

if (($text =~ m/^\s*$/) and !$Pearle::nullOK)

{

myLog ("getPage($target): Null text!\n");

myLog ("\n---\n$reply\n---\n");

if ($Pearle::roughMode)

{

return;

}

else

{

die ("920 getPage($target): Null text!\n");

}

}

if (($editTime =~ m/^\s*$/) and !$Pearle::nullOK)

{

myLog ("getPage($target): Null time!\n");

myLog("\n---\n$reply\n---\n");

die ("920 getPage($target): Null time!\n");

}

# The textarea contents are HTML-entity-escaped, so a literal angle bracket means we captured raw page markup instead of wikitext
if (($text =~ m/>/) or ($text =~ m/</))
{

myPrint($text);

myLog("\n---\n$text\n---\n");

myLog ("getPage($target): Bad text suck!\n");

die ("920 getPage($target): Bad text suck!\n");

}

# Change ( &quot; -> " ), etc.

# This function is from HTML::Entities.

decode_entities($text);

# This may or may not actually work

$Pearle::ua->cookie_jar->save();

return ($text, $editTime, $startTime, $token);

}

sub postPage

{

die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

my ($request, $response, $pageName, $textToPost, $summaryEntry,

$editTime, $startTime, $actual, $expected, $date, $editToken,

$minor, $pageNameSafe);

$pageName = $_[0];

$editTime = $_[1];

$startTime = $_[2];

$editToken = $_[3];

$textToPost = $_[4];

$summaryEntry = $_[5]; # Max 200 chars!

$minor = $_[6];

$summaryEntry = substr($summaryEntry, 0, 200);

if ($pageName eq "")

{

myLog ("postPage(): Empty pageName.\n");

die ("902 postPage(): Empty pageName.\n");

}

if(!defined($minor))

{

die "902 postPage(): Not enough parameters!\n";

}

if ($summaryEntry eq "")

{

$summaryEntry = "Automated editing.";

}

$pageNameSafe = $pageName;

$pageNameSafe =~ s/\&/%26/g;

$pageNameSafe =~ s/\+/%2B/g;

if ($minor eq "yes")

{

$request = POST "http://en.wikipedia.org/w/wiki.phtml?title=${pageNameSafe}&action=submit",

[wpTextbox1 => $textToPost,

wpSummary => $summaryEntry,

wpSave => "Save page",

wpMinoredit => "on",

wpEditToken => $editToken,

wpStarttime => $startTime,

wpEdittime => $editTime];

# Optional: wpWatchthis

}

else

{

$request = POST "http://en.wikipedia.org/w/wiki.phtml?title=${pageNameSafe}&action=submit",

[wpTextbox1 => $textToPost,

wpSummary => $summaryEntry,

wpSave => "Save page",

wpEditToken => $editToken,

wpStarttime => $startTime,

wpEdittime => $editTime];

# Optional: wpWatchthis, wpMinoredit

}

# ---

## If posts are failing, you can uncomment the below to see what

## HTTP request is being made.

# myLog($request->as_string());

# print $request->as_string(); $::speedLimit = 60 * 10;

# print $::ua->request($request)->as_string;

# ---

myLog("POSTing...");

myPrint("POSTing...");

# Pass request to the user agent and get a response back

$response = startRetry(\&httpRequest, $request);

myLog("POSTed.\n");

myPrint("POSTed.\n");

if ($response->content =~ m/Please confirm that you really want to recreate this article./)

{

myLog ($response->content."\n");

die ("923 Deleted article conflict! See log!");

}

# Check the outcome of the response

$response->code;

if ($response->code != 302 and $response->code != 200)

{

myLog ("postPage(${pageName}, $editTime)#1 - expected != actual\n");

myLog ($request->as_string());

myLog ("EXPECTED: '302'\n");

myLog (" ACTUAL: '" . $response->status_line . "'\n");

if ($Pearle::roughMode)

{

return();

}

else

{

die ("920 postPage(${pageName}, $editTime)#1 - expected != actual - see log\n");

}

}

$expected = "http://en.wikipedia.org/wiki/${pageName}";

$expected = Pearle::urlEncode($expected);

$actual = $response->headers->header("Location");

if (($expected ne $actual) and !$Pearle::roughMode

and !(($actual eq "") and ($response->code == 200)))

{

myLog ("postPage(${pageName}, $editTime)#2 - expected != actual\n");

myLog ("EXPECTED: '${expected}'\n");

myLog (" ACTUAL: '${actual}'\n");

die ("920 postPage(${pageName}, $editTime)#2 - expected != actual - see log\n");

}

if ($response->content =~ m/Edit conflict/)

{

myLog ("Edit conflict on '$pageName' at '$editTime'!\n");

die ("922 Edit conflict on '$pageName' at '$editTime'!\n");

}

if($Pearle::sanityCheck and $pageName =~ /talk[ _]*:/i) # Check for accidental creation of a talkpage without a mainpage. Only works with bots using the "monobook" skin.

{

# Monobook: the "Article" tab link carries class="new" when the subject page does not exist
# Classic: the "View article" link carries class="new" instead
if($response->content =~ /<li id="ca-nstab-[^"]+" class="new">/)
{
	myLog ("postPage(${pageName}) - Talkpage without article!\n");
	die ("920 postPage(${pageName}) - Talkpage without article!\n");
}
}

if($Pearle::sanityCheck and $pageName =~ /^user[ _]*talk[ _]*:/i) # Check for a user talkpage for a non-existent user

    {

    if($response->content !~ /User contributions/)

    {

myLog ("postPage(${pageName}) - User talkpage for non-existent user!\n");

die ("920 postPage(${pageName}) - User talkpage for non-existent user!\n");

    }

    }

    $Pearle::ua->cookie_jar->save();

    return ($response->content);

    }
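
# Example edit cycle (illustrative sketch; the page name and the edit itself
# are placeholders). getPage() returns the wikitext plus the timestamps and
# token that postPage() needs in order to detect edit conflicts:
#
#	my ($text, $editTime, $startTime, $token) = Pearle::getPage("User:ExampleBot/sandbox");
#	$text .= "\n\nTesting libPearle.";
#	Pearle::postPage("User:ExampleBot/sandbox", $editTime, $startTime, $token,
#	                 $text, "testing libPearle", "yes");
#	Pearle::limit();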

# Get a list of the names of articles in a given category.

    sub getCategoryArticles

    {

    die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

    my ($target, $request, $response, $reply, $articles, $article,

    @articles, $targetSpace, $offset, $numberOfArticles, $url,

    @moreArticles);

    $target = $_[0];

$offset = defined($_[1]) ? $_[1] : "";	# Default to the start of the category

    # Need both _ and spaces for precise matching later

    $target =~ s/ /_/g;

    $targetSpace = $target;

    $targetSpace =~ s/_/ /g;

    unless ($target =~ m/^Category:/)

    {

    myLog ("getCategoryArticles(): Are you sure '$target' is a category?\n");

    die ("902 getCategoryArticles(): Are you sure '$target' is a category?\n");

    }

    if ($offset eq "")

    {

    $url = "http://en.wikipedia.org/wiki/${target}";

    }

    else

    {

    $url = "http://en.wikipedia.org/w/index.php?title=${target}&from=${offset}";

    }

    # Create a request-object

    if ($offset eq "")

    {

    myPrint("GET ${url}\n");

    }

    myLog("GET ${url}\n");

    $request = HTTP::Request->new(GET => "${url}");

    $response = startRetry(\&httpRequest, $request);

    $reply = $response->content;

    # This detects whether or not we're logged in.

    checkLogin($reply);

    $articles = $reply;

# Strip everything before the list of category members and everything after it,
# then capture the title of each listed page
$articles =~ s%^.*?Articles in category.*?</h2>%%s;
$articles =~ s%<div class="printfooter">.*?$%%s;
@articles = $articles =~ m%<li><a href="[^"]+" title="([^"]+)"%g;
# If the category listing is paginated, follow the "next 200" link recursively
if ($reply =~ m/&from=([^"]+)" title="Category:[^"]*">next 200<\/a>/s)
{
	sleep (1); # Throttle GETs
	@moreArticles = getCategoryArticles($target, $1);
	@articles = (@articles, @moreArticles);
}

    $Pearle::ua->cookie_jar->save();

    $numberOfArticles = @articles;

    if ($offset eq "")

    {

    myPrint("Got $numberOfArticles articles.\n");

    myLog ("Got $numberOfArticles articles.\n");

    }

    return decodeArray(@articles);

    }
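
# Example (illustrative sketch; the category name is a placeholder). The
# second argument is an optional "from" offset for resuming a paginated
# listing:
#
#	my @articles = Pearle::getCategoryArticles("Category:Orphaned articles");
#	foreach my $article (@articles)
#	{
#		print "$article\n";
#	}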

    sub getCategoryImages

    {

    die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

    my ($target, $from, $request, $response, $reply, $images, @images,

    $image, %imagesHash);

    $target = $_[0];

    $from = $_[1];

    unless ($target =~ m/^Category:/)

    {

    myLog ("getCategoryImages(): Are you sure '$target' is a category?\n");

    die ("902 getCategoryImages(): Are you sure '$target' is a category?\n");

    }

    # Create a request-object

    if(!defined($from)) # Default: Start at the beginning of a category

    {

    myPrint("GET http://en.wikipedia.org/wiki/${target}\n");

    myLog("GET http://en.wikipedia.org/wiki/${target}\n");

    $request = HTTP::Request->new(GET => "http://en.wikipedia.org/wiki/${target}");

    }

    else # Start somewhere middle-ish

    {

    myPrint("GET http://en.wikipedia.org/w/wiki.phtml?title=${target}&from=$from\n");

    myLog("GET http://en.wikipedia.org/w/wiki.phtml?title=${target}&from=$from\n");

    $request = HTTP::Request->new(GET => "http://en.wikipedia.org/w/wiki.phtml?title=${target}\&from=$from");

    }

    $response = startRetry(\&httpRequest, $request);

    $reply = $response->content;

    # This detects whether or not we're logged in.

    checkLogin($reply);

#	unless ($reply =~ m%My talk%)
#	{
#		# We've lost our identity.
#		myLog ("Wiki server is not recognizing me (2).\n---\n${reply}\n---\n");
#		die ("Wiki server is not recognizing me (2).\n");
#	}

    $images = $reply;

# Trim the reply down to the page body, then capture the title of every linked page
$images =~ s/^.*?<div id="bodyContent">//s;
$images =~ s/<div class="printfooter">.*?$//s;
@images = $images =~ m/title="([^"]+)"/g;

    @images = grep {$_ =~ /^Image:/} @images;

    if($images =~ /&from=([^"]+)" title="Category:[^"]*">next 200/)

    {

    print "More: $1\n";

    @images = (@images, getCategoryImages($target, $1));

    }

    # Uniqify to prevent duplicates

    @images = uniquify(@images);

    $Pearle::ua->cookie_jar->save();

    return decodeArray(@images);

    }

    sub getSubcategories

    {

    die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

    my ($target, $request, $response, $reply, $subcats, $subcat,

    @subcats, $attemptStartTime, $attemptFinishTime);

    $target = $_[0];

    unless ($target =~ m/^Category:/)

    {

    myLog ("getSubcategories(): Are you sure '$target' is a category?\n");

    die ("902 getSubcategories(): Are you sure '$target' is a category?\n");

    }

    # Create a request-object

    myPrint("GET http://en.wikipedia.org/wiki/${target}\n");

    myLog("GET http://en.wikipedia.org/wiki/${target}\n");

    $request = HTTP::Request->new(GET => "http://en.wikipedia.org/wiki/${target}");

    $response = startRetry(\&httpRequest, $request);

    $reply = $response->content;

    # This detects whether or not we're logged in.

    checkLogin($reply);

    $subcats = $reply;

if ($subcats =~ m%^.*?<h2>Subcategories</h2>(.*?)<h2>Pages in category%s)
{
	# Keep only the list of subcategories between the two section headings
	$subcats =~ s%^.*?<h2>Subcategories</h2>(.*?)<h2>Pages in category.*?$%$1%s;
}
else
{
	return ();
}
# Capture the title of each linked subcategory
@subcats = $subcats =~ m%<li><a href="[^"]+" title="([^"]+)"%g;

$Pearle::ua->cookie_jar->save();

    return decodeArray(@subcats);

    }
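
# Example (illustrative sketch; the category name is a placeholder): walk one
# level of the category tree.
#
#	foreach my $subcat (Pearle::getSubcategories("Category:Maintenance"))
#	{
#		print "$subcat\n";
#	}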

# Get up to $max most recent articles edited by a user

    sub getUserArticles

    {

    die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

    my ($url, $request, $response, $reply, @contribs,

    $target, $namespace, $max, $offset);

    $target = $_[0];

    $max = $_[1];

    $offset = $_[2];

    $namespace = namespaceToNumber($_[3]);

    # Create a request-object

    if(defined($namespace))

    {

    $url = "http://en.wikipedia.org/w/index.php?title=Special%3AContributions&limit=${max}&offset=${offset}&target=${target}&namespace=$namespace";

    }

    else

    {

    $url = "http://en.wikipedia.org/w/index.php?title=Special%3AContributions&limit=${max}&offset=${offset}&target=${target}";

    }

    myPrint("GET $url\n");

    myLog("GET $url\n");

    $request = HTTP::Request->new(GET => "$url");

    $response = startRetry(\&httpRequest, $request);

    $reply = $response->content;

    # This detects whether or not we're logged in.

    checkLogin($reply);

# Extract the contributions. Each one appears in the page as a list item
# beginning with a timestamp, e.g.: 23:18, 6 March 2006 (...)
while($reply =~ /<li>$Pearle::regex_timestamp_nc \($Pearle::regex_bluepagelink/g)
{
	push @contribs, $1;
}
# Remove duplicates
#@contribs = uniquify(@contribs);

    return @contribs;

    }
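
# Example (illustrative sketch; the username is a placeholder). The namespace
# argument is a name from @Pearle::namespaces and may be omitted to search
# all namespaces:
#
#	my @recent = Pearle::getUserArticles("ExampleUser", 100, 0, "Image");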

# Gets a list of [article, actor, summary, timestamp] tuples from the specified log (upload, delete, move, protect)

    sub getLogArticles

    {

    die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

    my ($url, $request, $response, $reply, @articles,

    $log, $max, $offset, $user);

    $log = $_[0];

    $max = $_[1] || 50;

    $offset = $_[2] || 0;

    $user = $_[3] || '';

    # Create a request-object

    # http://en.wikipedia.org/w/index.php?title=Special:Log&type=upload&user=&page=&limit=2000&offset=0

    $url = "http://en.wikipedia.org/w/index.php?title=Special%3ALog&limit=${max}&offset=${offset}&user=${user}&type=${log}";

    myPrint("GET $url\n");

    myLog("GET $url\n");

    $request = HTTP::Request->new(GET => "$url");

    $response = startRetry(\&httpRequest, $request);

    $reply = $response->content;

    # This detects whether or not we're logged in.

    checkLogin($reply);

# Extract the articles. Example log entries:
#	19:55, 7 March 2006 Jimbo Wales deleted "Image:Justinsfriends.jpg" (blatant copyvio)
#	19:54, 7 March 2006 MrD9 moved Statsoft to StatSoft (revert)
#	19:53, 7 March 2006 Biederman uploaded "Image:Rockingham Raymond NH.PNG" (Changed Image:Rockingham_Portsmouth_NH.PNG to highlight Raymond )
#	19:31, 7 March 2006 Francs2000 protected Manoeuvre.org ({{deletedpage}} [edit=sysop:move=sysop])
#	19:30, 7 March 2006 Tony Sidaway unprotected Will McWhinney (This looks like the protection that time forgot.)
#	while($reply =~ /<li>$Pearle::regex_timestamp_nc ${Pearle::regex_pagelink}.*?<\/a> (?:deleted|moved|uploaded|protected|unprotected) "?${Pearle::regex_pagelink}.*?<\/a>"(?:\s*(.*)<\/span>|)/g)
while($reply =~ /<li>($Pearle::regex_timestamp_nc) ${Pearle::regex_pagelink}.*?<\/a> \(${Pearle::regex_pagelink_nc}Talk<\/a> \| ${Pearle::regex_pagelink_nc}contribs<\/a>\) (?:deleted|moved|uploaded|protected|unprotected) "?${Pearle::regex_pagelink}.*?<\/a>"(?:\s*(.*)<\/span>|)/g)

    {

my $summary = $4 || '';	# The final capture group is the optional log summary

    push @articles, [$3, $2, $summary, $1];

    }

    @articles = uniquify_ref1(@articles);

    return @articles;

    }
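
# Example (illustrative sketch): list recent deletions. Each entry is an
# array reference of [article, actor, summary, timestamp]:
#
#	foreach my $entry (Pearle::getLogArticles("delete", 50))
#	{
#		my ($article, $actor, $summary, $timestamp) = @$entry;
#		print "$timestamp: $actor deleted $article ($summary)\n";
#	}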

# Use the Special:Export interface to get the wikitext of one or more articles

    sub Export

    {

    my ($request, $response, $reply, $articles);

    $articles = join "\n", @_;

    $request = POST "http://en.wikipedia.org/w/index.php?title=Special:Export&action=submit", [action => 'submit', pages => $articles, curonly => 1];

    $response = startRetry(\&httpRequest, $request);

    $reply = $response->content;

    return $reply;

    }
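
# Example (illustrative sketch; the page names are placeholders). The return
# value is the raw XML produced by Special:Export:
#
#	my $xml = Pearle::Export("Perl", "Wikipedia:Bots");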

# Do a null edit to an article

    sub nullEdit

    {

    die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

    my ($text, $articleName, $comment, $editTime, $startTime, $token);

    $articleName = $_[0];

    myPrint("nullEdit($articleName)\n");

    myLog ("nullEdit($articleName)\n");

    ($text, $editTime, $startTime, $token) = getPage($articleName);

    unless ($text eq "")

    {

postPage ($articleName, $editTime, $startTime, $token, $text, "null edit", "yes");	# Null edits are marked minor

    }

    }

# Get the history of an article and parse the first 500 entries into a list of [link, day, month, year, user] lists

    sub parseHistory

    {

    my ($pageName, $html, @lines, $line, $date, $hour, $minute, $day, $month, $year,

    $htmlCopy, $link, $user, @result);

    $pageName = $_[0];

    $pageName = escapeUrl($pageName);

    $html = getURL("http://en.wikipedia.org/w/index.php?title=${pageName}&action=history&limit=500");

    $htmlCopy = $html;

# Trim the reply down to the page-history list, then split it into one entry per revision
$html =~ s%^.*?<ul id="pagehistory">%%s;
$html =~ s%(.*?)</ul>.*$%$1%s;
@lines = split ("<li>", $html);

    foreach $line (@lines)

    {

    $line =~ s/\n/ /g;

    if ($line =~ m/^\s*$/)

    {

    next;

    }

    ($user) = $line =~ /]*>([^<]*)/;

    $line =~ s/.*?$//;

    $line =~ s/^.*?Select a newer version for comparison//;

    $line =~ s/^.*?Select a older version for comparison//;

    $line =~ s/^.*?name="diff" \/>//;

$line =~ m%<a href="([^"]+)" title="([^"]+)">$Pearle::regex_timestamp%;	# $1 = revision link; the timestamp fields land in $3 through $7

    $link = $1;

    $hour = $3;

    $minute = $4;

    $day = $5;

    $month = $6;

    $year = $7;

    push @result, [$link, $day, $month, $year, $user];

    }

    return (@result);

    }
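
# Example (illustrative sketch; the page name is a placeholder). Each entry
# is an array reference of [link, day, month, year, user]:
#
#	foreach my $rev (Pearle::parseHistory("Perl"))
#	{
#		my ($link, $day, $month, $year, $user) = @$rev;
#		print "$day $month $year by $user\n";
#	}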

    sub getURL #($target)

    {

    die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

    # Read throttle!

    sleep (1);

    my ($request, $response, $reply, $url);

    $url = $_[0];

    # Create a request-object

    myPrint("GET ${url}\n");

    myLog("GET ${url}\n");

    $request = HTTP::Request->new(GET => "${url}");

    $response = startRetry(\&httpRequest, $request);

    $reply = $response->content;

    # This may or may not actually work

    $Pearle::ua->cookie_jar->save();

    return ($reply);

    }

# Retries a given function repeatedly, with an exponential backoff rate
# The function should throw an exception beginning with "retry:" (case insensitive) if the call should be retried

    sub startRetry

    {

    my ($call_fn, @args) = @_;

    return retry($Pearle::speedLimit, $call_fn, @args);

    }

    sub retry

    {

    my ($call_fn, @args, $delay, @result, $result);

    ($delay, $call_fn, @args) = @_;

    if(wantarray())

    {

    @result = eval{ $call_fn->(@args) };

    if($@ =~ /^retry:/i)

    {

    limit($delay);

    @result = retry($delay * 2, $call_fn, @args);

    }

    elsif($@)

    {

    die;

    }

    return @result;

    }

    else

    {

    $result = eval{ &{$call_fn}(@args) };

    if($@ =~ /^retry:/i)

    {

    limit($delay);

    $result = retry($delay * 2, $call_fn, @args);

    }

    elsif($@)

    {

    die;

    }

    return $result;

    }

    }
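
# Example (illustrative sketch): wrapping a caller-supplied function so that
# transient failures are retried with exponential backoff. The function
# signals "try again" by dying with a message that starts with "retry:":
#
#	sub fetchSomething
#	{
#		my $response = $Pearle::ua->get($_[0]);
#		die("retry:" . $response->status_line) if $response->is_server_error;
#		return $response->content;
#	}
#	my $content = Pearle::startRetry(\&fetchSomething, "http://en.wikipedia.org/wiki/Perl");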

    sub namespaceToNumber

    {

    my $namespace = $_[0];

    my $i = 0;

    my $name;

    if(defined($namespace))

    {

    foreach $name (@Pearle::namespaces)

    {

    return $i if(lc($name) eq lc($namespace));

    $i++;

}

return undef;	# The namespace name was not recognized

}

    else

    {

    return undef;

    }

    }

    sub numberToNamespace

    {

    my $i = shift;

    if(defined($i))

    {

    return $Pearle::namespaces[$i];

    }

    else

    {

    return undef;

    }

    }
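
# Example: with the standard namespace table above, namespaceToNumber("Template")
# returns 10 and numberToNamespace(10) returns "Template"; a missing or
# unrecognized name yields undef.
#
#	my $ns = Pearle::namespaceToNumber("Template");		# 10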

# Translate from HTTP URL encoding to the native character set.

    sub urlDecode

    {

    my ($input);

    $input = $_[0];

    $input =~ s/\%([a-f|A-F|0-9][a-f|A-F|0-9])/chr(hex($1))/eg;

    return ($input);

    }

# Basic escaping of special characters in a URL

    sub escapeUrl

    {

    my $input = shift;

    $input =~ s/%/%25/g;

    $input =~ s/&/%26/g;

    $input = unicodeToUrl($input);

    return $input;

    }

# URL-escape any high-Unicode chars in a string

    sub unicodeToUrl

    {

my ($char, $input, $output);

$input = $_[0];

$output = "";

    foreach $char (split("",$input))

    {

    if(ord($char) > 255)

    {

$output .= uc(sprintf("%%%02x%%%02x", int(ord($char)/256), ord($char) & 0xFF));

    # %HH%LL where HHLL is the hex code of $char

    }

    else

    {

    $output .= $char;

    }

    }

    return $output;

    }

# Translate from the native character set to the Wikipedia HTTP URL encoding.

    sub urlEncode

    {

    my ($char, $input, $output);

$input = $_[0];

$output = "";

    foreach $char (split("",$input))

    {

    # if ($char =~ m/[a-z|A-Z|0-9|\-_\.\!\~\*\'\(\)]/)

    # The below exclusions should conform to Wikipedia practice

    # (possibly non-standard)

    if ($char =~ m/[a-z|A-Z|0-9|\-_\.\/:]/)

    {

    $output .= $char;

    }

    elsif ($char eq " ")

    {

    $output .= "_";

    }

    else

    {

    if(ord($char) > 255)

    {

$output .= uc(sprintf("%%%02x%%%02x", int(ord($char)/256), ord($char) & 0xFF));

    # %HH%LL where HHLL is the hex code of $char

    }

    else

    {

$output .= uc(sprintf("%%%02x", ord($char)));

    # %HH where HH is the hex code of $char

    }

    }

    }

    return ($output);

    }
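
# Example: spaces become underscores and other reserved or non-ASCII characters
# are percent-encoded, so urlEncode("Caf\xE9 au lait") returns "Caf%E9_au_lait".
#
#	my $url = "http://en.wikipedia.org/wiki/" . Pearle::urlEncode("Caf\xE9 au lait");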

    sub decodeArray

    {

    return map {urlDecode($_)} @_;

    }

# Remove duplicates from a list

    sub uniquify

    {

    my @list = @_;

    @list = sort @list;

    my $last = undef;

    my @new_list;

    my $item;

    foreach $item (@list)

    {

    push @new_list, $item if(!defined($last) or ($item ne $last));

    $last = $item;

    }

    return @new_list;

    }

# Remove duplicates from a list of array references, grouping on the first subelement

    sub uniquify_ref1

    {

    my @list = @_;

    @list = sort {$a->[0] cmp $b->[0]} @list;

    my $last = undef;

    my @new_list;

    my $item;

    foreach $item (@list)

    {

    push @new_list, $item if(!defined($last) or ($item->[0] ne $last));

    $last = $item->[0];

    }

    return @new_list;

    }

    1;