User:PockBot/SourceCode/001

#!/usr/bin/perl -w

use strict;

use warnings;

use CGI;

use CGI::Carp "fatalsToBrowser";

use LWP::Simple;

use LWP::UserAgent;

use HTTP::Request;

use HTTP::Request::Common qw(GET);

use HTTP::Response;

  1. ______________________________________________________________________________#
  2. PockBot.pl Version 0.01 PRE_RELEASE #
  3. Author Dan Adams , (User:PocklingtonDan) #
  4. Created 29/11/06 Last Modified 04/12/06 #
  5. ______________________________________________________________________________#
  1. ______________________________________________________________________________#
  2. RIGHTS MANAGEMENT ETC #
  3. #
  4. The source code for PockBot is supplied solely for the purposes of allowing #
  5. other editors to comment on and improve the code, and/or to run the code as #
  6. a clone. It may be distributed and modified as required for these purposes. #
  7. ______________________________________________________________________________#
  1. ______________________________________________________________________________#
  2. CHANGES STILL TO MAKE #
  3. #
  4. - timesouts on larger categories #
  5. - make it write to wiki talk page for the category in scrollbox #
  6. - "PockBot ran successfully" presented even if category non-existent #
  7. #
  8. RECENT CHANGES #
  9. #
  10. 05.12.06 - Version 0.01 source code released #
  11. ______________________________________________________________________________#
  1. ______________________________________________________________________________#
  2. WHAT THE SCRIPT DOES #
  3. #
  4. This script is a wikipedia bot. It acts as a web spider. Given a wikipedia #
  5. category page to start from, it finds all articles listed in that category #
  6. as well as all subcategories of that category. For every subcategory it #
  7. pulls a list of articles. For all articles retrieved (a list of all articles #
  8. in that category and its subcategories) it then retrieves the CLASS flag for #
  9. each page from wikipedia. It then presents these resulsts in tabulated form. #
  10. #
  11. INTENDED USE #
  12. #
  13. It is intended that this script would be useful to those trying to monitor #
  14. all pages within a category for purposes of administration or for a project #
  15. in order to monitor which articles need bringing up from stub or start class #
  16. to full article status. #
  17. #
  18. CODE FORMATTING #
  19. #
  20. Code is formatted for ease of editing with Textad (www.textpad.com) or #
  21. similar editor with colour-coding meta-markup. It may be difficult to scan #
  22. using a no-frills text editor. #
  23. ______________________________________________________________________________#
  1. ______________________________________________________________________________#
  2. MAIN ROUTINE #
  3. ______________________________________________________________________________#

use CGI qw(:standard Vars);

my $action = param('action') || 'startBot';

if ($action eq 'intro') {&startBot;}

elsif ($action eq 'disableBot') {&disableBot;}

elsif ($action eq 'enableBot') {&enableBot;}

elsif ($action eq 'getMainCategory') {&getMainCategory;}

else {&error("Unrecognised action request");}

exit;

  1. ______________________________________________________________________________#
  2. SUBROUTINES #
  3. ______________________________________________________________________________#

sub startBot {

&checkIfBotOnline;

&logAction("Bot requested");

&printOnlineHeader;

print "

Please enter the wikipedia Category you wish to process below:

";

print "

";

print "

Category:";

print "

";

print "

Note: Bot may take up to an hour to run for large categories.

";

print "";

print "";

print "

";

&printFooter;

}

  1. ______________________________________________________________________________#

sub getArticlesinCategory {

my $content_articles = $_[0];

&logAction("Searching for articles in this category ");

# if its not a wikipedia category page, return empty array

unless ($content_articles =~ m/

/){

$content_articles = "";

my @found_articles = split(/\|/,$content_articles);

&logAction("Found 0 articles in this category ");

return (@found_articles);

}

# empty array if no articles, else populate with article names

if ($content_articles =~ m/There are 0 pages in this section of this category/){

$content_articles = "";

&logAction("Found 0 articles in this category");

}

else {

$content_articles =~ s/[\s\S]*

//;

$content_articles =~ s/<\/div>[\s\S]*/<\/div>/;

$content_articles =~ s/[\s\S]*?

    /
      /;

      $content_articles =~ s/

      [\s\S]*?<\/h3>//g;

      $content_articles =~ s/

        //g;

        $content_articles =~ s/<\/ul>//g;

        $content_articles =~ s///g;

        $content_articles =~ s/<\/td>//g;

        $content_articles =~ s/<\/div>//g;

        $content_articles =~ s/<\/tr>//g;

        $content_articles =~ s/<\/table>//g;

        $content_articles =~ s/<\/li>/|/g;

        $content_articles =~ s/

      • /|/g;

        $content_articles =~ s/\n//g;

        $content_articles =~ s/\|\|/\|/g;

        $content_articles =~ s///g;

        $content_articles =~ s/<\/a>//g;

        $content_articles =~ s/\|$//;

        $content_articles =~ s/^\|//;

        $content_articles =~ s/_/ /g;

        $content_articles =~ s/\s\|/\|/g;

        &logAction("Found 1 or more articles in this category");

        }

        my @found_articles = split(/\|/,$content_articles);

        return (@found_articles);

        }

        1. ______________________________________________________________________________#

        sub getSubCatsinCategory {

        my $content_subcats = $_[0];

        &logAction("Searching for subcats in this category");

        # if its not a wikipedia category page, empty array

        unless ($content_subcats =~ m/

        /){

        $content_subcats = "";

        my @found_subcats = split(/\|/,$content_subcats);

        &logAction("Found 0 subcats in this category");

        return (@found_subcats);

        }

        # empty array if no subcats, else populate with subcat names

        if ($content_subcats =~ m/There are 0 subcategories to this category/){

        $content_subcats = "";

        &logAction("Found 0 subcats in this category");

        }

        else {

        $content_subcats =~ s/[\s\S]*

        //;

        $content_subcats =~ s/

        [\s\S]*//;

        $content_subcats =~ s/

        [\s\S]*?<\/h3>//g;

        $content_subcats =~ s///g;

        $content_subcats =~ s/<\/div>//g;

        $content_subcats =~ s///g;

        $content_subcats =~ s/[\s\S]*?

          /
            /;

            $content_subcats =~ s/

              //g;

              $content_subcats =~ s/<\/ul>//g;

              $content_subcats =~ s/<\/li>/|/g;

              $content_subcats =~ s/

            • /|/g;

              $content_subcats =~ s///g;

              $content_subcats =~ s/<\/a>//g;

              $content_subcats =~ s/\n//g;

              $content_subcats =~ s/\|\|/\|/g;

              $content_subcats =~ s///g;

              $content_subcats =~ s/<\/td>//g;

              $content_subcats =~ s/<\/tr>//g;

              $content_subcats =~ s/<\/table>//g;

              $content_subcats =~ s/[\s]*?\|/\|/g;

              $content_subcats =~ s/\|$//;

              $content_subcats =~ s/^\|//;

              $content_subcats =~ s/\|\|/\|/g;

              &logAction("Found 1 or more subcats in category $content_subcats");

              }

              my @found_subcats = split(/\|/,$content_subcats);

              return (@found_subcats);

              }

              1. ______________________________________________________________________________#

              sub processContents {

              my $category = $_[0];

              my $contents = $_[1];

              $category =~ s/_/ /g;

              &logAction("Starting to process category $category");

              #Seperate the page generation from spider work

              use threads;

              use Config;

              if ($Config{useithreads}) {

              # We have threads

              # Let user know spider is on the job.

              &logAction("Notifying user bot starting");

              &printOnlineHeader;

              print "

              PockBot is now running - DO NOT CLOSE THIS WINDOW

              ";

              print "

              Thank you for using PockBot. You wanted a list of article classes for ";

              print " wikipedia category $category.

              ";

              print "

              The content will take some time to generate. When complete, the results will be posted to wikipedia for you.
              ";

              print "In respect for wikipedia's servers, PockBot will only make one read request to wikipedia servers every second
              ";

              print "PockBot can read 3600 pages an hour under ideal network conditions. Large categories may therefore take up to an hour to run

              ";

              print "

              Progress:
              PockBot is running...

              ";

              &printFooter;

              # Set spider to work on requested category, in separate thread

              my $threadForSpidering = threads->new(\&workthread, $category, $contents);

              #$threadForSpidering->detach;

              my @listOfAllArticlesFound = $threadForSpidering->join;

              }

              else {

              &error("PockBot requires threads. This perl installation is not built with threads activated. PockBot cannot run.");

              }

              }

              1. ______________________________________________________________________________#

              sub removeDuplicates {

              my @articles = @_;

              my @articles_no_duplicates = ();

              &logAction("Removing duplicates from found articles list.");

              foreach my $suggested_article (@articles) {

              my $already_exists = 0;

              foreach my $existing_article (@articles_no_duplicates) {

              if ($suggested_article eq $existing_article) {

              $already_exists = 1;

              }

              }

              if ($already_exists == 0) {

              push(@articles_no_duplicates, $suggested_article);

              }

              }

              return (@articles_no_duplicates);

              }

              1. ______________________________________________________________________________#

              sub getAllArticlesIn {

              my @subcats = @_;

              my @new_articles = ();

              foreach my $individual_subcat (@subcats) {

              &logAction("Searching for new articles in subcat $individual_subcat");

              my ($subcategory, $subcategorycontents) = fetchContents($individual_subcat);

              my @found_articles = getArticlesinCategory($subcategorycontents);

              foreach my $found_article (@found_articles) {

              push(@new_articles, $found_article);

              }

              }

              return (@new_articles);

              }

              1. ______________________________________________________________________________#

              sub getArticleClasses {

              my @articles_no_duplicates = @_;

              my %classes = ();

              foreach my $article_title (@articles_no_duplicates) {

              my ($talkpage, $contents) = fetchTalkContents($article_title);

              my $class = "?? (unclassified)";

              &logAction("Getting article class for article $article_title");

              if ($contents =~ m/[\s\S]*class=Start[\s\S]*/) {

              $class = "Start";

              }

              if ($contents =~ m/[\s\S]*class= Start[\s\S]*/) {

              $class = "Start";

              }

              if ($contents =~ m/[\s\S]*class=Stub[\s\S]*/) {

              $class = "Stub";

              }

              if ($contents =~ m/[\s\S]*class= Stub[\s\S]*/) {

              $class = "Stub";

              }

              if ($contents =~ m/[\s\S]*class=A[\s\S]*/) {

              $class = "A";

              }

              if ($contents =~ m/[\s\S]*class= A[\s\S]*/) {

              $class = "A";

              }

              if ($contents =~ m/[\s\S]*class=B[\s\S]*/) {

              $class = "B";

              }

              if ($contents =~ m/[\s\S]*class= B[\s\S]*/) {

              $class = "B";

              }

              if ($contents =~ m/[\s\S]*class=FA[\s\S]*/) {

              $class = "FA";

              }

              if ($contents =~ m/[\s\S]*class= FA[\s\S]*/) {

              $class = "FA";

              }

              # add details of article class to hash

              $classes{'$article_title'} = $class;

              }

              return (%classes)

              }

              1. ______________________________________________________________________________#

              sub writeResultsToFile {

              my $category = $_[0];

              my $text_to_print = $_[1];

              my $results_file = '/files/home2/thepaty/cgi-bin/results.htm';

              &logAction("Writing bot results to file.");

              open(RESULTSFILE,">$results_file") || &error("Cannot open bot results file.");

              flock(RESULTSFILE, 2) || &error("Cannot lock bot results file.");

              print RESULTSFILE "$text_to_print";

              flock(RESULTSFILE, 8);

              close (RESULTSFILE);

              }

              1. ______________________________________________________________________________#

              sub getTimeStamp {

              my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);

              my @weekDays = qw(Sun Mon Tue Wed Thu Fri Sat Sun);

              my ($second, $minute, $hour, $dayOfMonth, $month, $yearOffset, $dayOfWeek, $dayOfYear, $daylightSavings) = localtime();

              my $year = 1900 + $yearOffset;

              my $timeNow = "$hour:$minute:$second, $weekDays[$dayOfWeek] $months[$month] $dayOfMonth, $year";

              return ($timeNow);

              }

              1. ______________________________________________________________________________#

              sub logAction {

              my $actionToLog = $_[0];

              my $log_file = '/files/home2/thepaty/cgi-bin/log.htm';

              my $timeStamp = getTimeStamp();

              open(LOGFILE,">>$log_file") || &error("Cannot open log file.");

              flock(LOGFILE, 2) || &error("Cannot lock log file.");

              print LOGFILE "$timeStamp: $actionToLog
              ";

              flock(LOGFILE, 8);

              close (LOGFILE);

              }

              1. ______________________________________________________________________________#

              sub workthread {

              my $category = $_[0];

              my $contents = $_[1];

              &logAction("Starting work thread for category $category");

              my @subcats = getSubCatsinCategory($contents);

              my @articles = getArticlesinCategory($contents);

              my $new_subcats_found_this_round = 1;

              my @subcats_searched_aleady = ();

              # Keep searching until no new subcats are found.in any categories searched

              while ($new_subcats_found_this_round > 0) {

              $new_subcats_found_this_round = 0;

              my @proposed_extra_subcats = ();

              # Perform a search of every category we currently know of

              foreach my $existing_subcat (@subcats) {

              my $already_searched = 0;

              # If already searched this category in an earlier pass, skip it.

              foreach my $searched_subcat (@subcats_searched_aleady) {

              if ($existing_subcat eq $searched_subcat) {

              $already_searched = 1;

              }

              }

              # If not already searched, get all subcats of that category

              if ($already_searched == 0) {

              &logAction("Have not searched subcat $existing_subcat already");

              my ($subcategory, $subcategorycontents) = fetchContents($existing_subcat);

              my @additional_subcats = getSubCatsinCategory($subcategorycontents);

              foreach my $proposed_additional_subcat (@additional_subcats) {

              push(@proposed_extra_subcats, $proposed_additional_subcat);

              &logAction("Found possible new subcat $proposed_additional_subcat");

              }

              push(@subcats_searched_aleady, $existing_subcat);

              }

              else {

              &logAction("Have searched subcat $existing_subcat already");

              }

              }

              # If this new found subcat isn't a duplicate of one we already know about...

              foreach my $proposed_new_subcat (@proposed_extra_subcats) {

              my $already_exists = 0;

              foreach my $existing_subcat (@subcats) {

              if ($proposed_new_subcat eq $existing_subcat) {

              $already_exists = 1;

              }

              }

              # then add it to our master list

              if ($already_exists == 0) {

              &logAction("subcat $proposed_new_subcat is a genuinely new subcategory, adding to master list");

              push(@subcats, $proposed_new_subcat);

              $new_subcats_found_this_round++;

              }

              else {

              &logAction("subcat $proposed_new_subcat already existed in master list, ignoring");

              }

              }

              &logAction("$new_subcats_found_this_round new subcats found this round. If greater than zero, should run through again");

              }

              # And now get a list of every article in every subcat

              my @new_articles = getAllArticlesIn(@subcats);

              my @articles = (@articles, @new_articles);

              # Remove duplicates from article list.

              my @articles_no_duplicates = removeDuplicates(@articles);

              # Search talk pages for each article to find "class=X" classification

              my %classes = getArticleClasses(@articles_no_duplicates);

              # Prepare text to print to results file

              my $text_to_print = "

              Pages in category $category";

              $text_to_print .= " retrieved by PockBot.

              ";

              $text_to_print .= "PockBot is currently In Development and the below does not represent final output.

              ";

              $text_to_print .= "

              ";

              $text_to_print .= "

              ";

              $text_to_print .= "

              ";

              foreach my $article_title (@articles_no_duplicates) {

              $text_to_print .= "

              ";

              $text_to_print .= "

              ";

              $text_to_print .= "

              ";

              }

              $text_to_print .= "

              Article Class / Status
              ";

              $text_to_print .= "$article_title

              $classes{'$article_title'}
              ";

              # write results to results.htm

              &writeResultsToFile($text_to_print,$category);

              return (@articles_no_duplicates);

              }

              1. ______________________________________________________________________________#

              sub fetchContents {

              my $category = $_[0];

              $category =~ s/\s/_/g;

              my $category_url = "http://en.wikipedia.org/wiki/Category:" . $category;

              &logAction("Fetching page contents for category $category");

              my $browser = LWP::UserAgent->new();

              $browser->timeout(60);

              my $request = HTTP::Request->new(GET => $category_url);

              my $response = $browser->request($request);

              if ($response->is_error()) {printf "%s\n", $response->status_line;}

              my $contents = $response->content();

              sleep(1); # don't hammer the server! One read request every 1 second.

              return($category,$contents);

              }

              1. ______________________________________________________________________________#

              sub fetchTalkContents {

              my $category = $_[0];

              $category =~ s/\s/_/g;

              my $category_url = "http://en.wikipedia.org/wiki/Talk:" . $category;

              &logAction("Fetching talk page contents for category $category");

              my $browser = LWP::UserAgent->new();

              $browser->timeout(60);

              my $request = HTTP::Request->new(GET => $category_url);

              my $response = $browser->request($request);

              if ($response->is_error()) {printf "%s\n", $response->status_line;}

              my $contents = $response->content();

              sleep(1); # don't hammer the server! One read request every 1 second.

              return($category,$contents);

              }

              1. ______________________________________________________________________________#

              sub finishedRunning {

              my $category = $_[0];

              my $category_url = "http://en.wikipedia.org/wiki/Talk:" . $category;

              &logAction("Finished processing category $category");

              print "

              PockBot has finished running. The results should be visible on the talk page at Category_talk:$category

              ";

              &printFooter;

              }

              1. ______________________________________________________________________________#

              sub resetLogAndResultsFiles {

              my $log_file = '/files/home2/thepaty/cgi-bin/log.htm';

              my $results_file = '/files/home2/thepaty/cgi-bin/results.htm';

              &logAction("Resetting log and results files to empty");

              open(LOGFILE,">$log_file") || &error("Cannot open log file.");

              flock(LOGFILE, 2) || &error("Cannot lock log file.");

              print LOGFILE "";

              flock(LOGFILE, 8);

              close (LOGFILE);

              open(RESULTSFILE,">$results_file") || &error("Cannot open log file.");

              flock(RESULTSFILE, 2) || &error("Cannot lock log file.");

              print RESULTSFILE "";

              flock(RESULTSFILE, 8);

              close (RESULTSFILE);

              }

              1. ______________________________________________________________________________#

              sub getMainCategory{

              my $category = "BLANK";

              $category = param('category_specified');

              &resetLogAndResultsFiles();

              &logAction("Bot started for category $category");

              if ($category eq "BLANK") {

              &error("Error receiving category name");

              }

              else {

              my ($category, $contents) = fetchContents($category);

              &processContents($category,$contents);

              &finishedRunning($category);

              }

              }

              1. ______________________________________________________________________________#

              sub enableBot {

              my $status_file = "/files/home2/thepaty/cgi-bin/status.txt";

              &logAction("Bot enable request made");

              open(STATUSFILE,"$status_file") || &error("Cannot open bot status file.");

              flock(STATUSFILE, 2) || &error("Cannot lock bot status file.");

              my $current_status = ;

              flock(STATUSFILE, 8);

              close (STATUSFILE);

              chomp($current_status);

              my $bot_enabled = $current_status;

              if ($bot_enabled == 1) {

              &logAction("Bot already enabled, no action necesary");

              &printOnlineHeader;

              print "

              PockBot is already enabled. Disable PockBot

              ";

              &printFooter;

              exit;

              }

              elsif ($bot_enabled == 0) {

              &logAction("Bot currently disabled. Enabling bot.");

              open(STATUSFILE,">$status_file") || &error("Cannot open bot status file.");

              flock(STATUSFILE, 2) || &error("Cannot lock bot status file.");

              print STATUSFILE "1";

              flock(STATUSFILE, 8);

              close (STATUSFILE);

              &printOnlineHeader;

              print "

              PockBot is now enabled. Disable Pockbot

              ";

              &printFooter;

              exit;

              }

              else {

              &error("Unrecognised bot status. Something has gone wrong.");

              }

              }

              1. ______________________________________________________________________________#

              sub disableBot {

              my $status_file = "/files/home2/thepaty/cgi-bin/status.txt";

              &logAction("Bot disable request made");

              open(STATUSFILE,"$status_file") || &error("Cannot open bot status file.");

              flock(STATUSFILE, 2) || &error("Cannot lock bot status file.");

              my $current_status = ;

              flock(STATUSFILE, 8);

              close (STATUSFILE);

              chomp($current_status);

              my $bot_enabled = $current_status;

              if ($bot_enabled == 0) {

              &logAction("Bot is already disabled. No action necessary");

              &printOfflineHeader;

              print "

              PockBot is already disabled. Enable PockBot

              ";

              &printFooter;

              exit;

              }

              elsif ($bot_enabled == 1) {

              &logAction("Bot is currently enabled. Disabling bot.");

              open(STATUSFILE,">$status_file") || &error("Cannot open bot status file.");

              flock(STATUSFILE, 2) || &error("Cannot lock bot status file.");

              print STATUSFILE "0";

              flock(STATUSFILE, 8);

              close (STATUSFILE);

              &printOfflineHeader;

              print "

              PockBot is now disabled. Enable Pockbot

              ";

              &printFooter;

              exit;

              }

              else {

              &error("Unrecognised bot status. Something has gone wrong.");

              }

              }

              1. ______________________________________________________________________________#

              sub checkIfBotOnline {

              my $status_file = '/files/home2/thepaty/cgi-bin/status.txt';

              &logAction("Checking if bot is online");

              open(STATUSFILE,"$status_file") || &error("Cannot open bot status file.");

              flock(STATUSFILE, 2) || &error("Cannot lock bot status file.");

              my $current_status = ;

              flock(STATUSFILE, 8);

              close (STATUSFILE);

              chomp($current_status);

              my $bot_enabled = $current_status;

              if ($bot_enabled == 0) {

              &logAction("Bot is disabled, cannot perform action");

              &printOfflineHeader;

              print "

              PockBot is currently disabled. If you are certain it has nt been disabled for a reason, you can Enable PockBot

              ";

              &printFooter;

              exit;

              }

              elsif ($bot_enabled == 1) {

              &logAction("Bot is enabled, we are good to go.");

              #no action necessary

              }

              else {

              &error("Unrecognised bot status. Something has gone wrong.");

              }

              }

              1. ______________________________________________________________________________#

              sub printOnlineHeader {

              print "Content-type: text/html\n\n";

              print "PockBot";

              print "

              Wikipedia > Pockbot's User Page

              ";

              print "

              Pockbot is currently ONLINE / ENABLED (Disable PockBot)

              ";

              print "";

              }

              sub printOfflineHeader {

              print "Content-type: text/html\n\n";

              print "PockBot";

              print "

              Wikipedia > Pockbot's User Page

              ";

              print "

              Pockbot is currently OFFLINE / DISABLED (Enable PockBot)

              ";

              print "";

              }

              1. ______________________________________________________________________________#

              sub printFooter {

              print "";

              }

              1. ______________________________________________________________________________#

              sub error {

              &checkIfBotOnline;

              &logAction("ERROR: $_[0]");

              &printOnlineHeader;

              print "

              ERROR: $_[0]

              ";

              &printFooter;

              exit;

              }