User:PockBot/SourceCode/001
#!/usr/bin/perl -w";use strict;
use warnings;
use CGI;
use CGI::Carp "fatalsToBrowser";
use LWP::Simple;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Request::Common qw(GET);
use HTTP::Response;
- ______________________________________________________________________________#
- PockBot.pl Version 0.01 PRE_RELEASE #
- Author Dan Adams , (User:PocklingtonDan) #
- Created 29/11/06 Last Modified 04/12/06 #
- ______________________________________________________________________________#
- ______________________________________________________________________________#
- RIGHTS MANAGEMENT ETC #
- #
- The source code for PockBot is supplied solely for the purposes of allowing #
- other editors to comment on and improve the code, and/or to run the code as #
- a clone. It may be distributed and modified as required for these purposes. #
- ______________________________________________________________________________#
- ______________________________________________________________________________#
- CHANGES STILL TO MAKE #
- #
- - timesouts on larger categories #
- - make it write to wiki talk page for the category in scrollbox #
- - "PockBot ran successfully" presented even if category non-existent #
- #
- RECENT CHANGES #
- #
- 05.12.06 - Version 0.01 source code released #
- ______________________________________________________________________________#
- ______________________________________________________________________________#
- WHAT THE SCRIPT DOES #
- #
- This script is a wikipedia bot. It acts as a web spider. Given a wikipedia #
- category page to start from, it finds all articles listed in that category #
- as well as all subcategories of that category. For every subcategory it #
- pulls a list of articles. For all articles retrieved (a list of all articles #
- in that category and its subcategories) it then retrieves the CLASS flag for #
- each page from wikipedia. It then presents these resulsts in tabulated form. #
- #
- INTENDED USE #
- #
- It is intended that this script would be useful to those trying to monitor #
- all pages within a category for purposes of administration or for a project #
- in order to monitor which articles need bringing up from stub or start class #
- to full article status. #
- #
- CODE FORMATTING #
- #
- Code is formatted for ease of editing with Textad (www.textpad.com) or #
- similar editor with colour-coding meta-markup. It may be difficult to scan #
- using a no-frills text editor. #
- ______________________________________________________________________________#
- ______________________________________________________________________________#
- MAIN ROUTINE #
- ______________________________________________________________________________#
use CGI qw(:standard Vars);
my $action = param('action') || 'startBot';
if ($action eq 'intro') {&startBot;}
elsif ($action eq 'disableBot') {&disableBot;}
elsif ($action eq 'enableBot') {&enableBot;}
elsif ($action eq 'getMainCategory') {&getMainCategory;}
else {&error("Unrecognised action request");}
exit;
- ______________________________________________________________________________#
- SUBROUTINES #
- ______________________________________________________________________________#
sub startBot {
&checkIfBotOnline;
&logAction("Bot requested");
&printOnlineHeader;
print "
Please enter the wikipedia Category you wish to process below:
";print "
&printFooter;
}
- ______________________________________________________________________________#
sub getArticlesinCategory {
my $content_articles = $_[0];
&logAction("Searching for articles in this category ");
# if its not a wikipedia category page, return empty array
unless ($content_articles =~ m/
$content_articles = "";
my @found_articles = split(/\|/,$content_articles);
&logAction("Found 0 articles in this category ");
return (@found_articles);
}
# empty array if no articles, else populate with article names
if ($content_articles =~ m/There are 0 pages in this section of this category/){
$content_articles = "";
&logAction("Found 0 articles in this category");
}
else {
$content_articles =~ s/[\s\S]*
$content_articles =~ s/<\/div>[\s\S]*/<\/div>/;
$content_articles =~ s/[\s\S]*?
- /
- /|/g;
$content_articles =~ s/\n//g;
$content_articles =~ s/\|\|/\|/g;
$content_articles =~ s///g;
$content_articles =~ s/<\/a>//g;
$content_articles =~ s/\|$//;
$content_articles =~ s/^\|//;
$content_articles =~ s/_/ /g;
$content_articles =~ s/\s\|/\|/g;
&logAction("Found 1 or more articles in this category");
}
my @found_articles = split(/\|/,$content_articles);
return (@found_articles);
}
- ______________________________________________________________________________#
sub getSubCatsinCategory {
my $content_subcats = $_[0];
&logAction("Searching for subcats in this category");
# if its not a wikipedia category page, empty array
unless ($content_subcats =~ m/
/){$content_subcats = "";
my @found_subcats = split(/\|/,$content_subcats);
&logAction("Found 0 subcats in this category");
return (@found_subcats);
}
# empty array if no subcats, else populate with subcat names
if ($content_subcats =~ m/There are 0 subcategories to this category/){
$content_subcats = "";
&logAction("Found 0 subcats in this category");
}
else {
$content_subcats =~ s/[\s\S]*
//;$content_subcats =~ s/
[\s\S]*//;$content_subcats =~ s/
[\s\S]*?<\/h3>//g;
$content_subcats =~ s/
//g;$content_subcats =~ s/<\/div>//g;
$content_subcats =~ s///g;
$content_subcats =~ s/[\s\S]*?
- /
- /|/g;
$content_subcats =~ s///g;
$content_subcats =~ s/<\/a>//g;
$content_subcats =~ s/\n//g;
$content_subcats =~ s/\|\|/\|/g;
$content_subcats =~ s/
//g; $content_subcats =~ s/<\/td>//g;
$content_subcats =~ s/<\/tr>//g;
$content_subcats =~ s/<\/table>//g;
$content_subcats =~ s/[\s]*?\|/\|/g;
$content_subcats =~ s/\|$//;
$content_subcats =~ s/^\|//;
$content_subcats =~ s/\|\|/\|/g;
&logAction("Found 1 or more subcats in category $content_subcats");
}
my @found_subcats = split(/\|/,$content_subcats);
return (@found_subcats);
}
- ______________________________________________________________________________#
sub processContents {
my $category = $_[0];
my $contents = $_[1];
$category =~ s/_/ /g;
&logAction("Starting to process category $category");
#Seperate the page generation from spider work
use threads;
use Config;
if ($Config{useithreads}) {
# We have threads
# Let user know spider is on the job.
&logAction("Notifying user bot starting");
&printOnlineHeader;
print "
PockBot is now running - DO NOT CLOSE THIS WINDOW
";print "
Thank you for using PockBot. You wanted a list of article classes for ";
print " wikipedia category $category.
";print "
The content will take some time to generate. When complete, the results will be posted to wikipedia for you.
";print "In respect for wikipedia's servers, PockBot will only make one read request to wikipedia servers every second
";print "PockBot can read 3600 pages an hour under ideal network conditions. Large categories may therefore take up to an hour to run
";print "
Progress:
";
PockBot is running...&printFooter;
# Set spider to work on requested category, in separate thread
my $threadForSpidering = threads->new(\&workthread, $category, $contents);
#$threadForSpidering->detach;
my @listOfAllArticlesFound = $threadForSpidering->join;
}
else {
&error("PockBot requires threads. This perl installation is not built with threads activated. PockBot cannot run.");
}
}
- ______________________________________________________________________________#
sub removeDuplicates {
my @articles = @_;
my @articles_no_duplicates = ();
&logAction("Removing duplicates from found articles list.");
foreach my $suggested_article (@articles) {
my $already_exists = 0;
foreach my $existing_article (@articles_no_duplicates) {
if ($suggested_article eq $existing_article) {
$already_exists = 1;
}
}
if ($already_exists == 0) {
push(@articles_no_duplicates, $suggested_article);
}
}
return (@articles_no_duplicates);
}
- ______________________________________________________________________________#
sub getAllArticlesIn {
my @subcats = @_;
my @new_articles = ();
foreach my $individual_subcat (@subcats) {
&logAction("Searching for new articles in subcat $individual_subcat");
my ($subcategory, $subcategorycontents) = fetchContents($individual_subcat);
my @found_articles = getArticlesinCategory($subcategorycontents);
foreach my $found_article (@found_articles) {
push(@new_articles, $found_article);
}
}
return (@new_articles);
}
- ______________________________________________________________________________#
sub getArticleClasses {
my @articles_no_duplicates = @_;
my %classes = ();
foreach my $article_title (@articles_no_duplicates) {
my ($talkpage, $contents) = fetchTalkContents($article_title);
my $class = "?? (unclassified)";
&logAction("Getting article class for article $article_title");
if ($contents =~ m/[\s\S]*class=Start[\s\S]*/) {
$class = "Start";
}
if ($contents =~ m/[\s\S]*class= Start[\s\S]*/) {
$class = "Start";
}
if ($contents =~ m/[\s\S]*class=Stub[\s\S]*/) {
$class = "Stub";
}
if ($contents =~ m/[\s\S]*class= Stub[\s\S]*/) {
$class = "Stub";
}
if ($contents =~ m/[\s\S]*class=A[\s\S]*/) {
$class = "A";
}
if ($contents =~ m/[\s\S]*class= A[\s\S]*/) {
$class = "A";
}
if ($contents =~ m/[\s\S]*class=B[\s\S]*/) {
$class = "B";
}
if ($contents =~ m/[\s\S]*class= B[\s\S]*/) {
$class = "B";
}
if ($contents =~ m/[\s\S]*class=FA[\s\S]*/) {
$class = "FA";
}
if ($contents =~ m/[\s\S]*class= FA[\s\S]*/) {
$class = "FA";
}
# add details of article class to hash
$classes{'$article_title'} = $class;
}
return (%classes)
}
- ______________________________________________________________________________#
sub writeResultsToFile {
my $category = $_[0];
my $text_to_print = $_[1];
my $results_file = '/files/home2/thepaty/cgi-bin/results.htm';
&logAction("Writing bot results to file.");
open(RESULTSFILE,">$results_file") || &error("Cannot open bot results file.");
flock(RESULTSFILE, 2) || &error("Cannot lock bot results file.");
print RESULTSFILE "$text_to_print";
flock(RESULTSFILE, 8);
close (RESULTSFILE);
}
- ______________________________________________________________________________#
sub getTimeStamp {
my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
my @weekDays = qw(Sun Mon Tue Wed Thu Fri Sat Sun);
my ($second, $minute, $hour, $dayOfMonth, $month, $yearOffset, $dayOfWeek, $dayOfYear, $daylightSavings) = localtime();
my $year = 1900 + $yearOffset;
my $timeNow = "$hour:$minute:$second, $weekDays[$dayOfWeek] $months[$month] $dayOfMonth, $year";
return ($timeNow);
}
- ______________________________________________________________________________#
sub logAction {
my $actionToLog = $_[0];
my $log_file = '/files/home2/thepaty/cgi-bin/log.htm';
my $timeStamp = getTimeStamp();
open(LOGFILE,">>$log_file") || &error("Cannot open log file.");
flock(LOGFILE, 2) || &error("Cannot lock log file.");
print LOGFILE "$timeStamp: $actionToLog
";flock(LOGFILE, 8);
close (LOGFILE);
}
- ______________________________________________________________________________#
sub workthread {
my $category = $_[0];
my $contents = $_[1];
&logAction("Starting work thread for category $category");
my @subcats = getSubCatsinCategory($contents);
my @articles = getArticlesinCategory($contents);
my $new_subcats_found_this_round = 1;
my @subcats_searched_aleady = ();
# Keep searching until no new subcats are found.in any categories searched
while ($new_subcats_found_this_round > 0) {
$new_subcats_found_this_round = 0;
my @proposed_extra_subcats = ();
# Perform a search of every category we currently know of
foreach my $existing_subcat (@subcats) {
my $already_searched = 0;
# If already searched this category in an earlier pass, skip it.
foreach my $searched_subcat (@subcats_searched_aleady) {
if ($existing_subcat eq $searched_subcat) {
$already_searched = 1;
}
}
# If not already searched, get all subcats of that category
if ($already_searched == 0) {
&logAction("Have not searched subcat $existing_subcat already");
my ($subcategory, $subcategorycontents) = fetchContents($existing_subcat);
my @additional_subcats = getSubCatsinCategory($subcategorycontents);
foreach my $proposed_additional_subcat (@additional_subcats) {
push(@proposed_extra_subcats, $proposed_additional_subcat);
&logAction("Found possible new subcat $proposed_additional_subcat");
}
push(@subcats_searched_aleady, $existing_subcat);
}
else {
&logAction("Have searched subcat $existing_subcat already");
}
}
# If this new found subcat isn't a duplicate of one we already know about...
foreach my $proposed_new_subcat (@proposed_extra_subcats) {
my $already_exists = 0;
foreach my $existing_subcat (@subcats) {
if ($proposed_new_subcat eq $existing_subcat) {
$already_exists = 1;
}
}
# then add it to our master list
if ($already_exists == 0) {
&logAction("subcat $proposed_new_subcat is a genuinely new subcategory, adding to master list");
push(@subcats, $proposed_new_subcat);
$new_subcats_found_this_round++;
}
else {
&logAction("subcat $proposed_new_subcat already existed in master list, ignoring");
}
}
&logAction("$new_subcats_found_this_round new subcats found this round. If greater than zero, should run through again");
}
# And now get a list of every article in every subcat
my @new_articles = getAllArticlesIn(@subcats);
my @articles = (@articles, @new_articles);
# Remove duplicates from article list.
my @articles_no_duplicates = removeDuplicates(@articles);
# Search talk pages for each article to find "class=X" classification
my %classes = getArticleClasses(@articles_no_duplicates);
# Prepare text to print to results file
my $text_to_print = "
Pages in category $category";
$text_to_print .= " retrieved by PockBot.
";$text_to_print .= "PockBot is currently In Development and the below does not represent final output.
";$text_to_print .= "
";"; ";$text_to_print .= "
Article ";$text_to_print .= "
Class / Status foreach my $article_title (@articles_no_duplicates) {
$text_to_print .= "
"; ";$text_to_print .= "
"; $text_to_print .= "$article_title
$classes{'$article_title'} ";$text_to_print .= "
}
$text_to_print .= "
# write results to results.htm
&writeResultsToFile($text_to_print,$category);
return (@articles_no_duplicates);
}
- ______________________________________________________________________________#
sub fetchContents {
my $category = $_[0];
$category =~ s/\s/_/g;
my $category_url = "http://en.wikipedia.org/wiki/Category:" . $category;
&logAction("Fetching page contents for category $category");
my $browser = LWP::UserAgent->new();
$browser->timeout(60);
my $request = HTTP::Request->new(GET => $category_url);
my $response = $browser->request($request);
if ($response->is_error()) {printf "%s\n", $response->status_line;}
my $contents = $response->content();
sleep(1); # don't hammer the server! One read request every 1 second.
return($category,$contents);
}
- ______________________________________________________________________________#
sub fetchTalkContents {
my $category = $_[0];
$category =~ s/\s/_/g;
my $category_url = "http://en.wikipedia.org/wiki/Talk:" . $category;
&logAction("Fetching talk page contents for category $category");
my $browser = LWP::UserAgent->new();
$browser->timeout(60);
my $request = HTTP::Request->new(GET => $category_url);
my $response = $browser->request($request);
if ($response->is_error()) {printf "%s\n", $response->status_line;}
my $contents = $response->content();
sleep(1); # don't hammer the server! One read request every 1 second.
return($category,$contents);
}
- ______________________________________________________________________________#
sub finishedRunning {
my $category = $_[0];
my $category_url = "http://en.wikipedia.org/wiki/Talk:" . $category;
&logAction("Finished processing category $category");
print "
PockBot has finished running. The results should be visible on the talk page at Category_talk:$category
";&printFooter;
}
- ______________________________________________________________________________#
sub resetLogAndResultsFiles {
my $log_file = '/files/home2/thepaty/cgi-bin/log.htm';
my $results_file = '/files/home2/thepaty/cgi-bin/results.htm';
&logAction("Resetting log and results files to empty");
open(LOGFILE,">$log_file") || &error("Cannot open log file.");
flock(LOGFILE, 2) || &error("Cannot lock log file.");
print LOGFILE "";
flock(LOGFILE, 8);
close (LOGFILE);
open(RESULTSFILE,">$results_file") || &error("Cannot open log file.");
flock(RESULTSFILE, 2) || &error("Cannot lock log file.");
print RESULTSFILE "";
flock(RESULTSFILE, 8);
close (RESULTSFILE);
}
- ______________________________________________________________________________#
sub getMainCategory{
my $category = "BLANK";
$category = param('category_specified');
&resetLogAndResultsFiles();
&logAction("Bot started for category $category");
if ($category eq "BLANK") {
&error("Error receiving category name");
}
else {
my ($category, $contents) = fetchContents($category);
&processContents($category,$contents);
&finishedRunning($category);
}
}
- ______________________________________________________________________________#
sub enableBot {
my $status_file = "/files/home2/thepaty/cgi-bin/status.txt";
&logAction("Bot enable request made");
open(STATUSFILE,"$status_file") || &error("Cannot open bot status file.");
flock(STATUSFILE, 2) || &error("Cannot lock bot status file.");
my $current_status =
; flock(STATUSFILE, 8);
close (STATUSFILE);
chomp($current_status);
my $bot_enabled = $current_status;
if ($bot_enabled == 1) {
&logAction("Bot already enabled, no action necesary");
&printOnlineHeader;
print "
PockBot is already enabled. Disable PockBot
";&printFooter;
exit;
}
elsif ($bot_enabled == 0) {
&logAction("Bot currently disabled. Enabling bot.");
open(STATUSFILE,">$status_file") || &error("Cannot open bot status file.");
flock(STATUSFILE, 2) || &error("Cannot lock bot status file.");
print STATUSFILE "1";
flock(STATUSFILE, 8);
close (STATUSFILE);
&printOnlineHeader;
print "
PockBot is now enabled. Disable Pockbot
";&printFooter;
exit;
}
else {
&error("Unrecognised bot status. Something has gone wrong.");
}
}
- ______________________________________________________________________________#
sub disableBot {
my $status_file = "/files/home2/thepaty/cgi-bin/status.txt";
&logAction("Bot disable request made");
open(STATUSFILE,"$status_file") || &error("Cannot open bot status file.");
flock(STATUSFILE, 2) || &error("Cannot lock bot status file.");
my $current_status =
; flock(STATUSFILE, 8);
close (STATUSFILE);
chomp($current_status);
my $bot_enabled = $current_status;
if ($bot_enabled == 0) {
&logAction("Bot is already disabled. No action necessary");
&printOfflineHeader;
print "
PockBot is already disabled. Enable PockBot
";&printFooter;
exit;
}
elsif ($bot_enabled == 1) {
&logAction("Bot is currently enabled. Disabling bot.");
open(STATUSFILE,">$status_file") || &error("Cannot open bot status file.");
flock(STATUSFILE, 2) || &error("Cannot lock bot status file.");
print STATUSFILE "0";
flock(STATUSFILE, 8);
close (STATUSFILE);
&printOfflineHeader;
print "
PockBot is now disabled. Enable Pockbot
";&printFooter;
exit;
}
else {
&error("Unrecognised bot status. Something has gone wrong.");
}
}
- ______________________________________________________________________________#
sub checkIfBotOnline {
my $status_file = '/files/home2/thepaty/cgi-bin/status.txt';
&logAction("Checking if bot is online");
open(STATUSFILE,"$status_file") || &error("Cannot open bot status file.");
flock(STATUSFILE, 2) || &error("Cannot lock bot status file.");
my $current_status =
; flock(STATUSFILE, 8);
close (STATUSFILE);
chomp($current_status);
my $bot_enabled = $current_status;
if ($bot_enabled == 0) {
&logAction("Bot is disabled, cannot perform action");
&printOfflineHeader;
print "
PockBot is currently disabled. If you are certain it has nt been disabled for a reason, you can Enable PockBot
";&printFooter;
exit;
}
elsif ($bot_enabled == 1) {
&logAction("Bot is enabled, we are good to go.");
#no action necessary
}
else {
&error("Unrecognised bot status. Something has gone wrong.");
}
}
- ______________________________________________________________________________#
sub printOnlineHeader {
print "Content-type: text/html\n\n";
print "
PockBot ";print "
Wikipedia > Pockbot's User Page
";print "
Pockbot is currently ONLINE / ENABLED (Disable PockBot)
";
print "
";
}
sub printOfflineHeader {
print "Content-type: text/html\n\n";
print "
PockBot ";print "
Wikipedia > Pockbot's User Page
";print "
Pockbot is currently OFFLINE / DISABLED (Enable PockBot)
";
print "
";
}
- ______________________________________________________________________________#
sub printFooter {
print "";
}
- ______________________________________________________________________________#
sub error {
&checkIfBotOnline;
&logAction("ERROR: $_[0]");
&printOnlineHeader;
print "
ERROR: $_[0]
";&printFooter;
exit;
}
- /;
$content_subcats =~ s/
- //g;
$content_subcats =~ s/<\/ul>//g;
$content_subcats =~ s/<\/li>/|/g;
$content_subcats =~ s/
- /;
$content_articles =~ s/
[\s\S]*?<\/h3>//g;
$content_articles =~ s/
- //g;
$content_articles =~ s/<\/ul>//g;
$content_articles =~ s/
$content_articles =~ s/<\/td>//g;
$content_articles =~ s/<\/div>//g;
$content_articles =~ s/<\/tr>//g;
$content_articles =~ s/<\/table>//g;
$content_articles =~ s/<\/li>/|/g;
$content_articles =~ s/