User:HBC Searchbot/Source

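See also templates.pl, which this script reads and evaluates at startup to populate the %templates hash (template name => text to search for).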

use strict;
use warnings;

use Data::Dumper;
use HTTP::Request::Common;
use LWP::UserAgent;
use URI::Escape;
use XML::Simple;

# ---------------------------------------------------------------------------
# Main script.
#
# Fetches the complete revision history of the talk page, loads the list of
# templates to look for from templates.pl, and records every line of every
# revision that begins with one of those templates.  The result is dumped to
# STDERR as a hash keyed by the matched line:
#
#   $warnings{$line} = {
#       template  => name of the template that matched,
#       regex     => the search text for that template (a plain substring,
#                    despite the key name),
#       revisions => [ ids of the revisions containing the line ],
#   };
# ---------------------------------------------------------------------------

# Revision id => revision data, as returned by get_complete_history().
my %revisions = get_complete_history('User talk:HighInBC');

# Slurp templates.pl.  It is Perl source that is expected to assign to the
# lexical %templates declared below when evaluated.
# NOTE(review): string-eval runs whatever templates.pl contains; keep that
# file under the same control as this script.
my $templates = do
{
    open(my $pl, '<', 'templates.pl')
        or die "Cannot open templates.pl: $!\n";
    local $/;
    <$pl>;
};

my %templates;

eval($templates);
die "Failed to evaluate templates.pl: $@" if ($@);
die "templates.pl did not define any templates\n" unless (%templates);

# Lower-case each search text once instead of once per revision.
my @template_names = sort keys(%templates);
my %lc_needle_for  = map
{
    $_ => (defined($templates{$_}) ? lc($templates{$_}) : '')
} @template_names;

my $searches = 0;

warn("Checking " . scalar(keys(%revisions)) . " revisions.\n");

my %warnings;

foreach my $revision (sort { $a <=> $b } keys(%revisions))
{
    my $rh_rev = $revisions{$revision};

    # XML::Simple yields a hash with a 'content' key when the <text> element
    # carries attributes, and a plain string when it does not; an empty
    # element has no usable content at all.
    my $rh_text = (ref($rh_rev) eq 'HASH') ? $rh_rev->{'text'} : undef;
    my $text    = (ref($rh_text) eq 'HASH') ? $rh_text->{'content'} : $rh_text;
    $text = '' if (!defined($text) || ref($text));

    my $lc_text = lc($text);

    foreach my $template (@template_names)
    {
        $searches++;

        # An empty search text would "match" at offset 0 of every revision.
        my $needle = $lc_needle_for{$template};
        next unless (length($needle));

        # index() returns -1 for "not found"; 0 is a valid hit at the very
        # start of the text and must not be discarded.
        my $start = index($lc_text, $needle);
        next if ($start < 0);

        # Take everything from the match up to (not including) the end of
        # the line, or to the end of the text when there is no newline.
        my $newline = index($text, "\n", $start);
        my $length  = ($newline < 0)
            ? length($text) - $start
            : $newline - $start;

        my $string = substr($text, $start, $length);

        $warnings{$string}{'template'} = $template;
        $warnings{$string}{'regex'}    = $templates{$template};
        push(@{ $warnings{$string}{'revisions'} }, $revision);
    }
}

warn "$searches searches performed.\n";

warn Dumper(\%warnings);

# get_complete_history($page)
#
# Returns the revision history of the wiki page $page as a flat key/value
# list suitable for assigning to a hash: revision id => revision data as
# parsed by XML::Simple from Special:Export output.
#
# Previously downloaded revisions are kept in cache/<uri-escaped page name>
# as a Data::Dumper dump.  When a cache exists, downloading resumes from the
# timestamp of the highest cached revision id; otherwise it starts at offset
# '0'.  Revisions are requested in batches, pausing between requests, and the
# cache is rewritten if anything new was received.
#
# A failed request is retried a limited number of times; after that the
# download stops and whatever has been collected so far is returned.
#
# NOTE(review): the listing this was recovered from had lost its
# angle-bracket literals.  '<mediawiki' and '<timestamp>...</timestamp>' below
# are reconstructions (the latter matches the original 43-character window
# around an ISO 8601 timestamp) -- confirm against a real export.
sub get_complete_history
{
    my $page = shift;

    my $batch_size   = 100;    # revisions requested per hit
    my $pause        = 5;      # seconds to wait between hits
    my $max_failures = 5;      # consecutive bad responses tolerated

    mkdir('cache') unless (-d('cache'));

    my $fname = 'cache/' . uri_escape($page);

    my %revisions;
    my $offset = '0';

    if (-f($fname))
    {
        warn "found $fname in cache, loading\n";

        # The cache is Perl source of the form "$VAR1 = {...};", so the
        # string-eval below assigns to this lexical.
        my $VAR1;

        if (open(my $in, '<', $fname))
        {
            my $code = do { local $/; <$in> };
            close($in);

            if (defined($code))
            {
                eval($code);
                warn "Could not evaluate $fname: $@" if ($@);
            }
        }
        else
        {
            warn "Could not open $fname: $!\n";
        }

        if (ref($VAR1) eq 'HASH')
        {
            %revisions = %{$VAR1};

            # Resume from the timestamp of the newest (highest id) revision.
            my @ids = sort { $a <=> $b } keys(%revisions);
            if (@ids && ref($revisions{ $ids[-1] }) eq 'HASH')
            {
                my $newest = $revisions{ $ids[-1] }{'timestamp'};
                $offset = $newest if (defined($newest));
            }

            warn(scalar(keys(%revisions)) . " loaded from cache.\n");
        }
        else
        {
            warn "Cache unusable, starting fresh.\n";
        }
    }
    else
    {
        warn "No cache, starting fresh.\n";
    }

    my $ua = LWP::UserAgent->new('agent' => 'HighInBC warning checker .01b');
    my $export_url = 'http://en.wikipedia.org/w/index.php?title=Special:Export';

    my $total    = 0;    # new revisions received during this call
    my $failures = 0;    # consecutive failed requests

    while (1)
    {
        warn "Downloading $batch_size revisions.\n";

        my $res = $ua->request(
            POST(
                $export_url,
                Content_Type => 'application/x-www-form-urlencoded',
                Content      => [
                    'pages'  => $page,
                    'action' => 'submit',
                    'submit' => 'Export',
                    'limit'  => $batch_size,
                    'offset' => $offset,
                ],
            )
        );

        my $current = $res->content();
        $current = '' unless (defined($current));

        # Only try to parse something that looks like an export document;
        # XMLin() dies on malformed input, so trap that as a failure too.
        my $xml_data;
        if ($res->is_success() && $current =~ m|^<mediawiki|)
        {
            $xml_data = eval { XMLin($current) };
        }

        unless (ref($xml_data) eq 'HASH')
        {
            $failures++;
            if ($failures >= $max_failures)
            {
                warn "Giving up after $failures failed attempts.\n";
                last;
            }
            warn "Failed somehow, trying again\n";
            sleep($pause);
            next;
        }
        $failures = 0;

        # The last <timestamp> in the document becomes the next offset.
        # Leave the offset alone when the response contains no revisions.
        my $previous_offset = $offset;
        my $ts_pos = rindex($current, '<timestamp>');
        if ($ts_pos >= 0
            && substr($current, $ts_pos) =~ m|\A<timestamp>(.+?)</timestamp>|)
        {
            $offset = $1;
        }

        # XML::Simple folds several <revision> elements into a hash keyed by
        # id, but returns a lone <revision> as its own hash.  Normalise both
        # shapes to id => revision.  A lone revision is recognised by its
        # 'timestamp' key, which a hash keyed by ids does not have.
        my %batch;
        my $rev = (ref($xml_data->{'page'}) eq 'HASH')
            ? $xml_data->{'page'}{'revision'}
            : undef;

        if (ref($rev) eq 'HASH')
        {
            if (exists($rev->{'timestamp'}))
            {
                my $id = $rev->{'id'};
                $batch{$id} = $rev if (defined($id) && !ref($id));
            }
            else
            {
                %batch = %{$rev};
            }
        }

        # Count only revisions not seen before; already-known ones are
        # simply refreshed.
        my $count = 0;
        foreach my $id (keys(%batch))
        {
            $count++ unless (exists($revisions{$id}));
            $revisions{$id} = $batch{$id};
        }
        $total += $count;

        warn "Got $count revisions\n";

        # A full batch means there may be more.  Stop if the offset did not
        # move, otherwise the same batch would be requested forever.
        last if (scalar(keys(%batch)) < $batch_size);
        last if ($offset eq $previous_offset);

        warn "Still more past $offset to get, waiting $pause seconds between hits\n";
        sleep($pause);
    }

    if ($total > 0)
    {
        warn "Saving cache...\n";

        # Failing to write the cache is not fatal: the data is still
        # returned, it just has to be downloaded again next time.
        if (open(my $out, '>', $fname))
        {
            print {$out} Dumper(\%revisions);
            close($out) or warn "Could not finish writing $fname: $!\n";
            warn "done.\n";
        }
        else
        {
            warn "Could not write $fname: $!\n";
        }
    }

    return %revisions;
}