User:OrphanBot/libPearle2.pl
- IMPORTANT ###
- This code is released into the public domain.
- RECENT CHANGES ###
- 30 Nov 2005: Created, based off of the 12 Nov 2005 version of Pearle Wisebot
- 15 Feb 2006: Modifed "retry" to work with any function that signals failure by dying, modified to use a simple exponential backoff formula
- Simplified "limit" code, modified to take an optional parameter
- Added "config" function as a clean interface to change internal parameters
- Modified Wiki-access functions for use with the new "retry" function
- Cleanup of boolean config vars to use standard Perl boolean conventions
- 28 Feb 2006: Added checkLogin bottleneck, option to allow editing while logged out
- Added support for proxy servers
- 8 Mar 2006: Added support for getting a user's contributions
- Added support for retrieving logs
- Separated out some common regex parts into variables
- 29 Mar 2006: Added protection against Unicode in URLs
- Made thrown exceptions consistent
- Sanity-checking on postPage: talkpage without article, userpage or user talkpage without user
- 17 May 2005: Improved log retrieval
- 12 Jul 2007: Added timestamp to information retrieved from logs
- Errors thrown by this package always begin with a three-digit number
- 4xx: HTTP client errors
- 505: Server error: HTTP version not supported
- 509: Server error: Bandwidth exceeded
- 900: Unspecified internal error.
- 901: Library not initialized. You didn't call Pearle::init() before calling this function.
- 902: Parameter error. You made a function call, but forgot a mandatory parameter, or provided an invalid one.
- 920: Unexpected response. The MediaWiki site returned something unexpected.
- 921: Unexpected logout. The MediaWiki site logged us out unexpectedly.
- 922: Edit conflict. Someone edited the article while we were.
- 923: Deleted article conflict. Someone deleted the article while we were editing.
package Pearle;
use strict;
use warnings;
use Time::HiRes;
use utf8;
use LWP::UserAgent;
use HTTP::Cookies;
use HTTP::Request::Common qw(POST);
use HTML::Entities;
- Standard regex parts
$Pearle::regex_timestamp = '(\d\d):(\d\d), (\d\d?) (\w+) (\d\d\d\d)'; # Match and capture a Wikipedia timestamp
$Pearle::regex_timestamp_nc = '\d\d:\d\d, \d\d? \w+ \d\d\d\d'; # Match a Wikipedia timestamp
$Pearle::regex_pagelink = ''; # Match and capture any page
$Pearle::regex_redpagelink = ''; # Match and capture nonexistant pages only
$Pearle::regex_bluepagelink = ''; # Match and capture existing pages only
$Pearle::regex_pagelink_nc = ''; # Match any page
$Pearle::regex_redpagelink_nc = ''; # Match nonexistant pages only
$Pearle::regex_bluepagelink_nc = ''; # Match existing pages only
- Standard MediaWiki namespaces
@Pearle::namespaces = ("", "Talk", "User", "User talk", "Wikipedia", "Wikipedia talk", "Image", "Image talk", "MediaWiki", "MediaWiki talk", "Template", "Template talk", "Help", "Help talk", "Category", "Category talk");
$Pearle::logfile = "";
$Pearle::_inited = 0;
$Pearle::username = "";
$Pearle::password = "";
$Pearle::speedLimit = 10; # Seconds to wait by default when limit() is called
$Pearle::_speedMult = 1; # Multiplier for default wait time if the wiki is being slow
$Pearle::roughMode = 0; # Ignore most errors
$Pearle::nullOK = 0; # Permit editing non-existent pages
$Pearle::sanityCheck = 0; # Sanity checking on edits
$Pearle::silent = 0; # Silent mode
$Pearle::quiet = 0; # Quiet mode
$Pearle::logoutOK = 0; # Permit editing while logged out
$Pearle::proxy = undef; # Proxy to use
- This must be the first function from the library called
sub init
{
$Pearle::username = $_[0] or die("902 No username provided!\n");
$Pearle::password = $_[1] or die("902 No password provided!\n");
$Pearle::logfile = $_[2] or die("902 No logfile name provided!\n");
$Pearle::cookies = $_[3] or die("902 No cookie file provided!\n");
$Pearle::useragent = $_[4] or $Pearle::useragent = "PearleLib/0.2";
$Pearle::ua = LWP::UserAgent->new(timeout => 300);
$Pearle::ua->agent($Pearle::useragent);
$Pearle::ua->cookie_jar(HTTP::Cookies->new(file => $Pearle::cookies, autosave => 1));
$Pearle::ua->cookie_jar->load();
$Pearle::roughMode = "no";
# Hot pipes
$| = 1;
$Pearle::_inited = 1;
}
sub config
{
my %params = @_;
$Pearle::speedLimit = $params{speedLimit} if(defined($params{speedLimit}));
$Pearle::roughMode = $params{roughMode} if(defined($params{roughMode}));
$Pearle::nullOK = $params{nullOK} if(defined($params{nullOK}));
$Pearle::silent = $params{silent} if(defined($params{silent}));
$Pearle::quiet = $params{quiet} if(defined($params{quiet}));
$Pearle::logfile = $params{logfile} if(defined($params{logfile}));
$Pearle::logoutOK = $params{logoutOK} if(defined($params{logoutOK}));
$Pearle::sanityCheck = $params{sanityCheck} if(defined($params{sanityCheck}));
if(exists($params{proxy}))
{
if(defined($params{proxy}))
{
myPrint("Proxying: $params{proxy}\n");
myLog("Proxying: $params{proxy}\n");
$Pearle::ua->proxy('http', $params{proxy});
$Pearle::proxy = $params{proxy};
}
else
{
myPrint("Not proxying\n");
myLog("Not proxying\n");
$Pearle::ua->no_proxy();
$Pearle::proxy = undef;
}
}
}
sub myLog
{
die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
open (LOG, ">>", $Pearle::logfile)
|| die "900 Could not append to log!";
print LOG $_[0];
close (LOG);
}
sub myPrint
{
return if($Pearle::silent);
return if($Pearle::quiet);
print @_;
}
sub myErrPrint
{
return if($Pearle::silent);
return if($Pearle::quiet);
print STDERR @_;
}
- Rate-limiting. Can be sensibly run even if libPearle isn't initialized
sub limit
{
my ($i);
$i = ($_[0] or ($Pearle::speedLimit * $Pearle::_speedMult));
$i = 10 if($i < 10);
# Rate-limiting to avoid hosing the wiki server
# Min 30 sec unmarked
# Min 10 sec marked
# May be raised by retry() if load is heavy
### ATTENTION ###
# Increasing the speed of the bot to faster than 1 edit every 10
# seconds violates English Wikipedia rules as of April, 2005, and
# will cause your bot to be banned. So don't change $normalDelay
# unless you know what you are doing. Other sites may have
# similar policies, and you are advised to check before using your
# bot at the default speed.
#################
while ($i >= 0)
{
sleep (1);
myErrPrint("Sleeping $i seconds...\r");
$i--;
}
myErrPrint(" \r");
}
sub login
{
die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
my $res = $Pearle::ua->post(
"http://en.wikipedia.org/w/wiki.phtml?title=Special:Userlogin&action=submitlogin",
Content => [
wpName => $Pearle::username,
wpPassword => $Pearle::password,
wpRemember => 1,
wpLoginAttempt => 1
]
);
if( 302 == $res->code )
{
myPrint("Logged in as $Pearle::username\n");
myLog("Logged in as $Pearle::username\n");
# This may or may not actually work
$Pearle::ua->cookie_jar->save();
return 1;
}
else
{
myPrint("Login failed\n");
myPrint("Code: ".$res->code."\n");
myLog("Login failed\n");
return 0;
}
}
sub logout {
my $res = $Pearle::ua->post(
"http://en.wikipedia.org/w/wiki.phtml?title=Special:Userlogout",
);
return 1;
}
sub checkLogin
{
my ($reply_text);
$reply_text = $_[0];
if ($reply_text !~ m/>My talk<\/a>/ and !($Pearle::logoutOK))
{
# We've lost our identity.
myLog ("Wiki server is not recognizing me.\n");
die ("921 Wiki server is not recognizing me.\n");
}
}
- Make an HTTP request, performing basic error checking and handling. Suitable for use with the "retry" function
sub httpRequest
{
my ($request, $response, $attemptStartTime, $attemptEndTime);
$request = $_[0];
# Since not every server handles UTF-8 in URLs, and LWP doesn't escape them properly, escape every character > 255
$request->uri(unicodeToUrl($request->uri()));
$response = $Pearle::ua->request($request);
# Monitor wiki server responsiveness
$attemptStartTime = Time::HiRes::time();
if ($response->is_success or $response->is_redirect)
{
return $response
}
else
{
myLog ("HTTP ERR (".$response->status_line.")\n".$response->content."\n");
myPrint("HTTP ERR (".$response->status_line.")\n".$response->content."\n");
# 50X HTTP errors mean there is a problem connecting to the wiki server. Can be remedied by waiting and trying again
if (500 <= $response->code and 504 >= $response->code)
{
die("retry:".$response->status_line);
}
else
{
# Unhandled HTTP response. Waiting probably won't fix it
die ($response->status_line."\n");
}
}
# Monitor wiki server responsiveness
$attemptEndTime = Time::HiRes::time();
if($request->method() eq "POST")
{
if (($attemptEndTime - $attemptStartTime) > 20)
{
$Pearle::_speedMult = 60;
myPrint("Wikipedia is very slow. Increasing minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n");
myLog("Wikipedia is very slow. Increasing minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n");
}
# If the response time is between 10 and 20 seconds...
elsif (($attemptEndTime - $attemptStartTime) > 10)
{
$Pearle::_speedMult = 6;
myPrint("Wikipedia is somewhat slow. Setting minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n");
myLog("Wikipedia is somewhat slow. Setting minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n");
}
# If the response time is less than 10 seconds...
else
{
if ($Pearle::_speedMult != 1)
{
$Pearle::_speedMult = 1;
myPrint( "Returning to normal minimum wait time.\n");
myLog("Returning to normal minimum wait time.\n");
}
}
}
}
- Check out a page for editing.
sub getPage
{
die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
my ($target, $request, $response, $reply, $text, $text2,
$editTime, $startTime, $attemptStartTime, $attemptFinishTime,
$token, $targetSafe);
$target = $_[0];
if ($target =~ m/^\s*$/)
{
myLog("getPage: Null target.");
die("902 getPage: Null target.");
}
$targetSafe = $target;
$targetSafe =~ s/\&/%26/g;
$targetSafe =~ s/\+/%2B/g;
# Create a request-object
myPrint("GET http://en.wikipedia.org/w/wiki.phtml?title=${targetSafe}&action=edit\n");
myLog("GET http://en.wikipedia.org/w/wiki.phtml?title=${targetSafe}&action=edit\n");
$request = HTTP::Request->new(GET => "http://en.wikipedia.org/w/wiki.phtml?title=${targetSafe}&action=edit");
$response = startRetry(\&httpRequest, $request);
$reply = $response->content;
# This detects whether or not we're logged in.
checkLogin($reply);
# Check for blocking
if($reply =~ /
User is blocked<\/h1>/)
{
myLog("Blocked\n");
die("900 Blocked");
}
$reply =~ m%%s;
$text = $1;
$reply =~ m/value="(\d+)" name="wpEdittime"/;
$editTime = $1;
# Added 22 Aug 2005 to correctly handle articles that have
# been undeleted
$reply =~ m/value="(\d+)" name="wpStarttime"/;
$startTime = $1;
# Added 9 Mar 2005 after recent software change.
$reply =~ m/value="([^"]+)" name="wpEditToken"/;
$token = $1;
###
if (($text =~ m/^\s*$/) and !$Pearle::nullOK)
{
myLog ("getPage($target): Null text!\n");
myLog ("\n---\n$reply\n---\n");
if ($Pearle::roughMode)
{
return;
}
else
{
die ("920 getPage($target): Null text!\n");
}
}
if (($editTime =~ m/^\s*$/) and !$Pearle::nullOK)
{
myLog ("getPage($target): Null time!\n");
myLog("\n---\n$reply\n---\n");
die ("920 getPage($target): Null time!\n");
}
if (($text =~ m/>/) or ($text =~ m/))
{
myPrint($text);
myLog("\n---\n$text\n---\n");
myLog ("getPage($target): Bad text suck!\n");
die ("920 getPage($target): Bad text suck!\n");
}
# Change ( " -> " ) etc
# This function is from HTML::Entities.
decode_entities($text);
# This may or may not actually work
$Pearle::ua->cookie_jar->save();
return ($text, $editTime, $startTime, $token);
}
sub postPage
{
die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
my ($request, $response, $pageName, $textToPost, $summaryEntry,
$editTime, $startTime, $actual, $expected, $date, $editToken,
$minor, $pageNameSafe);
$pageName = $_[0];
$editTime = $_[1];
$startTime = $_[2];
$editToken = $_[3];
$textToPost = $_[4];
$summaryEntry = $_[5]; # Max 200 chars!
$minor = $_[6];
$summaryEntry = substr($summaryEntry, 0, 200);
if ($pageName eq "")
{
myLog ("postPage(): Empty pageName.\n");
die ("902 postPage(): Empty pageName.\n");
}
if(!defined($minor))
{
die "902 postPage(): Not enough parameters!\n";
}
if ($summaryEntry eq "")
{
$summaryEntry = "Automated editing.";
}
$pageNameSafe = $pageName;
$pageNameSafe =~ s/\&/%26/g;
$pageNameSafe =~ s/\+/%2B/g;
if ($minor eq "yes")
{
$request = POST "http://en.wikipedia.org/w/wiki.phtml?title=${pageNameSafe}&action=submit",
[wpTextbox1 => $textToPost,
wpSummary => $summaryEntry,
wpSave => "Save page",
wpMinoredit => "on",
wpEditToken => $editToken,
wpStarttime => $startTime,
wpEdittime => $editTime];
# Optional: wpWatchthis
}
else
{
$request = POST "http://en.wikipedia.org/w/wiki.phtml?title=${pageNameSafe}&action=submit",
[wpTextbox1 => $textToPost,
wpSummary => $summaryEntry,
wpSave => "Save page",
wpEditToken => $editToken,
wpStarttime => $startTime,
wpEdittime => $editTime];
# Optional: wpWatchthis, wpMinoredit
}
# ---
## If posts are failing, you can uncomment the below to see what
## HTTP request is being made.
# myLog($request->as_string());
# print $request->as_string(); $::speedLimit = 60 * 10;
# print $::ua->request($request)->as_string;
# ---
myLog("POSTing...");
myPrint("POSTing...");
# Pass request to the user agent and get a response back
$response = startRetry(\&httpRequest, $request);
myLog("POSTed.\n");
myPrint("POSTed.\n");
if ($response->content =~ m/Please confirm that really want to recreate this article./)
{
myLog ($response->content."\n");
die ("923 Deleted article conflict! See log!");
}
# Check the outcome of the response
$response->code;
if ($response->code != 302 and $response->code != 200)
{
myLog ("postPage(${pageName}, $editTime)#1 - expected =! actual\n");
myLog ($request->as_string());
myLog ("EXPECTED: 302'\n");
myLog (" ACTUAL: '" . $response->status_line . "'\n");
if ($Pearle::roughMode eq "yes")
{
return();
}
else
{
die ("920 postPage(${pageName}, $editTime)#1 - expected =! actual - see log\n");
}
}
$expected = "http://en.wikipedia.org/wiki/${pageName}";
$expected = Pearle::urlEncode($expected);
$actual = $response->headers->header("Location");
if (($expected ne $actual) and ($Pearle::roughMode ne "yes")
and !(($actual eq "") and ($response->code == 200)))
{
myLog ("postPage(${pageName}, $editTime)#2 - expected =! actual\n");
myLog ("EXPECTED: '${expected}'\n");
myLog (" ACTUAL: '${actual}'\n");
die ("920 postPage(${pageName}, $editTime)#2 - expected =! actual - see log\n");
}
if ($response->content =~ m/
Edit conflict/)
{
myLog ("Edit conflict on '$pageName' at '$editTime'!\n");
die ("922 Edit conflict on '$pageName' at '$editTime'!\n");
}
if($Pearle::sanityCheck and $pageName =~ /talk[ _]*:/i) # Check for accidental creation of a talkpage without a mainpage. Only works with bots using the "monobook" skin.
{
# Monobook:
# Classic:
View article
if($response->content =~ /
{
myLog ("postPage(${pageName}) - Talkpage without article!\n");
die ("920 postPage(${pageName}) - Talkpage without article!\n");
}
}
if($Pearle::sanityCheck and $pageName =~ /^user[ _]*talk[ _]*:/) # Check for user talkpage for non-existant user
{
if($response->content !~ /User contributions/)
{
myLog ("postPge(${pageName}) - User talkpage for non-existant user!\n");
die ("920 postPge(${pageName}) - User talkpage for non-existant user!\n");
}
}
$Pearle::ua->cookie_jar->save();
return ($response->content);
}
- Get a list of the names of articles in a given category.
sub getCategoryArticles
{
die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
my ($target, $request, $response, $reply, $articles, $article,
@articles, $targetSpace, $offset, $numberOfArticles, $url,
@moreArticles);
$target = $_[0];
$offset = $_[1];
# Need both _ and spaces for precise matching later
$target =~ s/ /_/g;
$targetSpace = $target;
$targetSpace =~ s/_/ /g;
unless ($target =~ m/^Category:/)
{
myLog ("getCategoryArticles(): Are you sure '$target' is a category?\n");
die ("902 getCategoryArticles(): Are you sure '$target' is a category?\n");
}
if ($offset eq "")
{
$url = "http://en.wikipedia.org/wiki/${target}";
}
else
{
$url = "http://en.wikipedia.org/w/index.php?title=${target}&from=${offset}";
}
# Create a request-object
if ($offset eq "")
{
myPrint("GET ${url}\n");
}
myLog("GET ${url}\n");
$request = HTTP::Request->new(GET => "${url}");
$response = startRetry(\&httpRequest, $request);
$reply = $response->content;
# This detects whether or not we're logged in.
checkLogin($reply);
$articles = $reply;
$articles =~ s%^.*?
Articles in category.*?
%%s;$articles =~ s%