User:ImageRemovalBot/removebot.pl

ImageRemovalBot's code. Requires User:FairuseBot/Pearle.pm, User:FairuseBot/Pearle/WikiPage.pm, and User:FairuseBot/libBot.pm. User:ImageRemovalBot/removebot-followup.pl is used to follow up on failed removals.

#!/usr/bin/perl
# RemoveBot
# A bot to remove deleted images from pages

use strict;

use warnings;

use lib 'Insert the directory containing libBot.pm and Pearle.pm here';

use feature 'unicode_strings';

use Date::Calc qw(Gmtime Add_Delta_Days);

use Fcntl qw(:flock);

use libBot;

# ---------------------------------------------------------------------------
# Bot configuration and login
# ---------------------------------------------------------------------------

# When true, the script works on a single hard-coded test image instead of
# querying the deletion log, and does not record the last log entry processed.
my $test = 0;

my $homedir = 'Insert the directory contining the bot\'s script here';

my $permit_interruptions = 0; # Allow talkpage messages to stop the bot?

Pearle::init(
    "Insert your bot name here",
    "Insert your bot password here",
    "$homedir/removebot.log",
    "$homedir/cookies.txt",
);
Pearle::config(nullOK => 1, printlevel => 4, loglevel => 2);
config(username => "Insert your bot name here");

# Nothing below can work without a logged-in session, so stop right away
# if the login attempt does not succeed.
exit unless Pearle::login();

  1. Check for a running copy

if(-e "$homedir/pid")

{

# Possible other copy. Compare PIDs

open PIDFILE, "<", "$homedir/pid";

my $pid = ;

close PIDFILE;

if(defined($pid) and length($pid) > 0)

{

my $psresult = `ps -p $pid`;

if($psresult =~ /removebot.pl/)

{

botwarnlog("\n*Previous run is taking longer than normal");

exit;

}

}

else

{

Pearle::myLog(1, "Unable to read pidfile, assuming no other copy is running\n");

}

}

open PIDFILE, ">", "$homedir/pid";

print PIDFILE $$;

close PIDFILE;

  1. Get the last log entry processed

my ($last_date);

if(-e "$homedir/lastfile.log")

{

open INFILE, "<", "$homedir/lastfile.log";

$last_date = ;

close INFILE;

if(defined($last_date) && length($last_date) > 0)

{

chomp $last_date;

}

else

{

my ($y, $m, $d, $h, $min, $s, undef, undef, undef) = Gmtime();

($y, $m, $d) = Add_Delta_Days($y, $m, $d, -2);

$last_date = sprintf("%02d-%02d-%02dT%02d:%02d:%02dZ", $y, $m, $d, $h, $min, $s);

Pearle::myLog(1, "Unable to read lastfile.log, using $last_date instead\n");

}

}

else

{

$last_date = "1970-01-01T00:00:01Z"; # Beginning of time

}

# Running totals across all sets processed by this run.
my $total_images = 0;
my $total_processed = 0;
my $i = 1;   # Counter for the (currently disabled) multi-set loop below

# Process one set of deletion-log entries: collect the deleted files, remove
# their uses from pages, verify the removal, and record how far into the log
# this run got. The enclosing loop is commented out, so the block runs once.
#for($i = 1; $i <= 100; $i++)
{
    my @images = ();
    my $images_removed = 0;

    Pearle::myLog(2, "Beginning set at " . time() . "\n");

    # Get the log
    my @articles;
    if ($test)
    {
        # A single placeholder entry keeps the "Empty deletion log" check
        # below from ending the test run.
        @articles = (undef);
        @images = ("File:RAF logo.svg");
    }
    else
    {
        @articles = Pearle::getLogArticles(log => 'delete', limit => 500, time => $last_date, dir => 'newer');
        foreach my $item (@articles)
        {
            # Get all files that were not moved to Commons
            push @images, $item->[0] if($item->[0] =~ /^(?:Image|File):/ and $item->[2] !~ /^.?F8:/);
        }
    }

    if (scalar(@articles) == 0)
    {
        Pearle::myLog(2, "Empty deletion log\n");
        exit;
    }

    if ($test)
    {
        $last_date = undef;
    }
    else
    {
        # NOTE(review): this takes the timestamp of the *first* returned entry
        # as the resume point; confirm the order getLogArticles returns entries in.
        $last_date = $articles[0]->[3];
        Pearle::myLog(2, "Last date: $last_date\n");
    }

    Pearle::myLog(4, join("\n", @images) . "\n");
    Pearle::myLog(2, scalar(@images) . " images found\n");
    $total_processed += scalar(@images);

    # Process for deleted images
    if (scalar(@images) == 0)
    {
        Pearle::myLog(1, "*No images in log\n");
        #exit;
    }

    foreach my $image (@images)
    {
        # Perform various checks that can be done using the canonical image name
        my $image_data = Pearle::APIQuery(titles => [$image], prop => 'imageinfo', meta => 'userinfo', uiprop => ['hasmsg'],                     # Basic data
                                          list => 'backlinks', bltitle => $image, blnamespace => [6], bllimit => 500, blfilterredir => 'redirects'); # Image names

        my $removal_prefix = "Deleted image removed:";
        my $removal_comment = "Removing links to deleted file image";

        if ($permit_interruptions and DoIHaveMessages($image_data))
        {
            Pearle::myLog(0, "Talkpage message found; exiting on image $image.\n");
            exit;
        }

        # Verify the image is still deleted
        if ($image_data !~ /missing=""/)
        {
            Pearle::myLog(2, "*Image :$image has been re-uploaded.\n");
            next;
        }

        # Images from Commons. May have been masked by the deleted version.
        if ($image_data =~ /imagerepository="shared"/)
        {
            Pearle::myLog(2, "*Commons image :$image found\n");
            next;
        }

        # Check for bug 33292 (e.g. http://en.wikipedia.org/w/api.php?action=query&format=xml&prop=imageinfo&titles=File:RAF%20logo.svg&iilimit=10)
        # Note that this check must be done *after* the Commons check, as the API returns are
        # almost identical -- the only difference is that an image on Commons will be flagged as
        # coming from a shared repository.
        if ($image_data =~ /imageinfo/)
        {
            botwarnlog("\n* File :$image is in an inconsistent state.");
            Pearle::myLog(2, "*File :$image is in an inconsistent state.\n");
            next;
        }

        if ($image !~ /(\.jpg|\.jpeg|\.png|\.gif|\.svg|\.tiff|\.tif)$/i and $image !~ /^http:\/\//i)
        {
            botwarnlog("\n* Non-image media file :$image found.");
            Pearle::myLog(2, "*Non-image media file :$image found.\n");
            next; # Non-image files are too hard to work with
        }

        # Perform operations on the image and all redirects
        my @image_names = GetImageNames($image_data);
        push @image_names, $image;

        # Build the message as one string: with a bare list, join() would
        # swallow the trailing "\n" as one more element to join.
        Pearle::myLog(2, "*Image has names :" . join(", :", @image_names) . "\n");

        foreach my $image_name (@image_names)
        {
            $image_data = Pearle::APIQuery(list => 'imageusage', iutitle => $image_name, iunamespace => [0, 10, 12, 14, 100], iulimit => 500);
            my @pages = GetPageList($image_data);

            if (scalar(@pages) == 0)
            {
                notelog("Image $image_name is already orphaned\n");
                next;
            }

            # Strip the namespace prefix and turn the bare name into a regex fragment.
            my ($raw_image) = $image_name =~ /(?:Image|File):(.*)/;
            $raw_image = MakeWikiRegex($raw_image) if defined $raw_image;

            # Sanity check, done before the name is used to build the removal regex
            if (!defined($raw_image) or $image_name !~ /$raw_image/)
            {
                botwarnlog("\n*Parse error on image :$image_name (" . ($raw_image // '') . ")");
                exit;
            }

            my $image_regex = "[ _]*(?:[Ii][Mm][Aa][Gg][Ee]|[Ff][Ii][Ll][Ee])[ _]*:[ _]*${raw_image}[ _]*";
            Pearle::myLog(3, "Image regex: $image_regex\n");

            # Substitute the canonical file name for the word "image" in the comment template.
            my $parsed_removal_comment = $removal_comment;
            $parsed_removal_comment =~ s/image/:$image/;

            foreach my $page (@pages)
            {
                eval
                {
                    my $hits = 0;
                    Pearle::myLog(3, "Page for removal: $page\n");
                    if ($hits = RemoveImageFromPage($image_name, $page, $image_regex, $removal_prefix, $parsed_removal_comment)) # Don't limit if we just touched the article
                    {
                        Pearle::myLog(2, "Removed image $image with name $image_name from article $page ($hits times)\n");
                        Pearle::limit();
                    }
                    $images_removed += $hits;
                };
                if ($@)
                {
                    # Error 925 is reported as a protected page and skipped;
                    # any other failure is re-thrown.
                    if (925 == $@)
                    {
                        botwarnlog("\n*Page :$page is protected removing image :$image_name");
                    }
                    else
                    {
                        die;
                    }
                }
            }

            # Verify removal
            # Portal removal is too hard to get correct, and we don't really care about it.
            # Template removal isn't possible, and the template usage has already been logged.
            $image_data = Pearle::APIQuery(list => 'imageusage', iutitle => $image_name, iunamespace => [0, 12, 14], iulimit => 500);
            @pages = GetPageList($image_data);

            if (scalar(@pages) != 0)
            {
                # botwarnlog("\n*Unable to remove all instances of :$image");
                Pearle::myLog(2, "*Unable to remove all instances of :$image, adding to followup log\n");

                # NOTE(review): unlike the other state files, this path is relative to the
                # current directory rather than $homedir -- confirm that is what the
                # follow-up script expects before changing it.
                if (open my $followup, ">>", "followup.log")
                {
                    flock $followup, LOCK_EX;
                    my $date = time;
                    print $followup "$date $image\n";
                    flock $followup, LOCK_UN;
                    close $followup;
                }
                else
                {
                    Pearle::myLog(1, "Unable to open followup.log: $!\n");
                }
            }

            #sleep 30;
        }
    }

    Pearle::myLog(2, "Finished with set. Removed $images_removed images.\n");
    $total_images += $images_removed;

    # Record the last log entry processed
    if (!$test)
    {
        if (open my $last_out, ">", "$homedir/lastfile.log")
        {
            print $last_out "$last_date\n";
            close $last_out;
        }
        else
        {
            Pearle::myLog(1, "Unable to write $homedir/lastfile.log: $!\n");
        }
        print "$last_date\n";
    }
}

  1. print "Finished. Total $total_images removed, $total_processed processed.\n";

unlink "$homedir/pid"