User:OrphanBot/tagbot.pl

Source code for OrphanBot's upload-tagging task. Requires libBot.pm and Pearle.pm.

  1. !/usr/bin/perl
  1. Tagbot
  2. A bot to identify and tag recently-uploaded images that have no image description page, source information, or copyright tag.

use strict;

use warnings;

use Date::Calc qw(Month_to_Text Today);

use Array::Utils;

use utf8;

use Data::Dumper;

use libBot;

binmode STDOUT, ":utf8";

my $permit_interruptions = 0; # Allow talkpage messages to stop the bot?

my ($cur_y, $cur_m, $cur_d);

my %users_notified; # List of users notifed. 0, undef = no; 1 = notified once; 2 = notified and second notice

my %notifications; # List of user,image pairs, used to ensure that no user is ever notified about an image twice.

my %dont_notify = (); # List of users to never notify

my %banned_users = (); # List of users banned from uploading

my %exempt_users = (); # List of users exempt from inspection

my %unknown_tags; # List of tags found that are not in either the "good" or "bad" list

my @sourcereq_tags; # List of tags that require a separate source

my $sourcereq_tags;

my @nosource_tags; # List of self-sourcing tags

my $nosource_tags;

my @deletion_tags; # Tags that will eventually lead to the deletion of the image

my $deletion_tags;

my @forbidden_tags; # List of tags that should never be seen

my $forbidden_tags;

my @deprecated_tags; # List of tags that shouldn't be used any more

my $deprecated_tags;

my @nontags; # List of tags that aren't copyright tags

my $nontags;

my @source_tags; # List of tags that provide source but not copyright status

my $source_tags;

sub loadTagList

{

my $filename = shift;

my @list = ();

open INFILE, "<", $filename;

while()

{

$_ =~ s/#.*//; # Remove comments

$_ =~ s/^\s*//; # Remove leading whitespace

$_ =~ s/\s*$//; # Remove trailing whitespace

push @list, $_ if($_ !~ /^\s*$/);

}

close INFILE;

return @list;

}

sub processTagList

{

my $tags = join "|", @_;

$tags =~ s/\(/\\\(/g;

$tags =~ s/\)/\\\)/g;

$tags =~ s/\./\\\./g;

$tags =~ s/\*/.*?/g;

return "($tags)";

}

@sourcereq_tags = loadTagList("sourcereq.tags");

$sourcereq_tags = processTagList(@sourcereq_tags);

print "Sourcereq: Loaded\n\n";

@nosource_tags = loadTagList("nosource.tags");

$nosource_tags = processTagList(@nosource_tags);

print "Nosource: Loaded\n\n";

@forbidden_tags = loadTagList("forbidden.tags");

$forbidden_tags = processTagList(@forbidden_tags);

print "Forbid: Loaded\n\n";

@deletion_tags = loadTagList("deletion.tags");

$deletion_tags = processTagList(@deletion_tags);

print "Deletion: Loaded\n\n";

@deprecated_tags = loadTagList("deprecated.tags");

$deprecated_tags = processTagList(@deprecated_tags);

print "Deprecated: Loaded\n\n";

@nontags = loadTagList("nontags.tags");

$nontags = processTagList(@nontags);

print "Nontags: Loaded\n\n";

@source_tags = loadTagList("source.tags");

$source_tags = processTagList(@source_tags);

print "Sourcetags: Loaded\n\n";

sub tokenSubst

{

my $string = shift;

my $image = shift;

$string =~ s//$image/g if(defined($image));

$string =~ s//$cur_d/g;

$string =~ s//$cur_m/g;

$string =~ s//$cur_y/g;

return $string;

}

sub loadUserList

{

my $file = shift;

my %notelist;

my $i = 0;

Pearle::myLog(4, "File: $file\n");

open INFILE, "<", $file;

while()

{

my ($user, $reason);

$_ =~ s/\s*#.*$//g;

chomp;

($user, $reason) = $_ =~ /([^\t]*)\t+(.*)/;

next if(!defined($user) or !defined($reason));

$notelist{$user} = $reason;

$i++;

}

close INFILE;

Pearle::myLog(3, "$i notifications loaded\n");

return %notelist;

}

  1. Initialize

($cur_y, $cur_m, $cur_d) = Today(1); # Today in GMT

$cur_m = Month_to_Text($cur_m);

Pearle::init("username", "password", "tagbot.log","cookies.tagbot.txt");

Pearle::config(nullOK => 1, sanityCheck => 1, loglevel => 3, printlevel => 4, testmode => 0);

config(username => 'username');

Pearle::myLog(2, "Beginning execution\n");

%dont_notify = loadNotificationList("orphanbot.whitelist");

%banned_users = loadUserList("banneduser.list");

%exempt_users = loadUserList("exemptuser.list");

if(!Pearle::login())

{

exit;

}

  1. Get the day's uploads

my @articles;

@articles = Pearle::getLogArticles(log => 'upload', limit => 150);

  1. Chop off the 20 most recent log entries

splice @articles, 0, 20;

Pearle::myLog(3, scalar(@articles) . " images found\n");

foreach my $log_entry (@articles)

{

my $image = $log_entry->[0];

my $uploader = $log_entry->[1];

my $summary = $log_entry->[2] || "";

print "$image\n";

print "$uploader\n";

Pearle::myLog(2, "Processing image $image\n");

# Basic checks that can be done from the log alone

# Non-terminating check: Was the image uploaded by a blacklisted user?

if($banned_users{$uploader})

{

botwarnlog("*Image :$image uploaded by blacklisted user User:$uploader\n");

Pearle::myLog(3, "Upload by banned user $uploader\n");

}

# Terminating check: Is the user on the whitelist?

if($exempt_users{$uploader})

{

Pearle::myLog(2, "Upload by exempt user $uploader found.\n");

next;

}

# Terminating check: Is the upload a modification?

if($summary =~ /optimi(z|s)ed using (optipng|PNGCrusher)/i)

{

Pearle::myLog(2, "Optimize upload found for image $image\n");

next;

}

if($summary =~ /tweak|crop|scale|adjust|change|resize|corrected|correcting/i)

{

Pearle::myLog(2, "Tweak found for image $image\n");

next;

}

# Terminating check: Is the upload a revert?

if($summary =~ /Reverted to earlier revision|Reverted to version/)

{

Pearle::myLog(2, "Revert upload found for image $image\n");

next;

}

# Get page data

my $image_data = Pearle::APIQuery(titles => $image, prop => ['templates', 'revisions'],

tllimit => 500, # All the templates

rvprop => ['content'], # Article body

meta => 'userinfo', uiprop => ['hasmsg'], # Check for talkpage messages

redirects => 1, # Resolve redirects

);

if(!defined($image_data))

{

Pearle::myLog(1, "Server did not return an appropriate response.\n");

next;

}

my $parsed_xml = Pearle::getXMLParser()->XMLin($image_data, ForceArray => ['tl'] );

Pearle::myLog(4, Dumper($parsed_xml));

my $page_text = GetPageText($parsed_xml);

my @templates = GetPageTemplates($parsed_xml);

# Remove non-tags from template list

# TODO: Remove redlinks

@templates = grep {$_ !~ /:$nontags$/i} @templates;

my $stripped_page_text = $page_text || "";

$stripped_page_text =~ s/^==.*?==//gm; # Remove section headers

$stripped_page_text =~ s/\n//g; # Remove newlines

$stripped_page_text =~ s/{{{[^}]+}}}//g; # Remove template parameters

$stripped_page_text =~ s/{{[^}]+}}//gi; # Remove templates

Pearle::myLog(4, "Templates: " . join(", ", @templates) . "\n");

Pearle::myLog(4, "Stripped text: $stripped_page_text\n");

print "=============================================================================\n";

# Check for interruptions

if($permit_interruptions and DoIHaveMessages($image_data))

{

Pearle::myLog(0, "Talkpage message found; exiting on image $image.\n");

last;

}

# Sanity check: Does the image still exist?

if(defined($parsed_xml->{query}->{pages}->{page}->{missing}))

{

Pearle::myLog(2, "Image $image has already been deleted\n");

next;

}

# Sanity check: Is the image marked for deletion?

if(grep {$_ =~ /:$deletion_tags$/i} @templates)

{

# We don't do anything with images already marked for deletion. There are just too many corner cases and wasted-effort conditions.

Pearle::myLog(2, "Deletion tag found\n");

next;

}

# Check for red flag: "Google Image" (matches 'image', 'images', 'imagesearch')

if(defined($page_text) and $page_text =~ /google image/i)

{

Pearle::myLog(2, "Image $image has red-flag keyword 'google image'\n");

botwarnlog("* Image :$image has red-flag keyword 'google image'\n");

}

# Check for red flag: those goddamn navboxes

if(grep {$_ =~ /navbox/i} @templates)

{

botwarnlog("* Navbox found on :$image\n");

}

######### Check for source, license, and tag ###################

# Meanings: "undef" = we don't know, "0" = definitely no, "1" = probably yes

my $has_source = undef;

my $has_license = undef;

my $has_tag = undef;

# Does the image lack a description page?

if(!defined($page_text) or $page_text =~ /^\s*$/)

{

Pearle::myLog(3, "Empty IDP\n");

$has_source = 0;

$has_license = 0;

$has_tag = 0;

}

# Does the image have a source-providing template?

if(my @tags = grep {$_ =~ /:$source_tags$/i} @templates)

{

Pearle::myLog(3, "Source-providing template @tags found\n");

$has_source = 1;

# Remove the tags from the candidate set: they can't keep an image from being "untagged" or having unknown tags

@templates = Array::Utils::array_diff(@templates, @tags);

}

# Does the image have a self-sourcing tag?

if(my @tags = grep {$_ =~ /:$nosource_tags$/i} @templates)

{

Pearle::myLog(3, "Self-sourcing tag @tags found\n");

$has_source = 1;

$has_license = 1;

$has_tag = 1;

}

# Does the image have a sourcereq tag?

if(my @tags = grep {$_ =~ /:$sourcereq_tags$/i} @templates)

{

Pearle::myLog(3, "Sourcereq tag @tags found\n");

$has_license = 1;

$has_tag = 1;

}

# Handle those damned "Information" and "Non-free use rationale" tags

if(grep {$_ =~ /:Information$/} @templates)

{

Pearle::myLog(3, "Has an Information template\n");

# Remove the template from the list

@templates = grep {$_ !~ /:Information$/} @templates;

# Attempt to parse an "information" template

if($page_text =~ /\|\s*source\s*=\s*[^|}]{4,}/i)

{

# If there's a filled-in "source" parameter, assume a source

Pearle::myLog(3, "Assuming source in {{Information:source}}\n");

$has_source = 1;

}

if($page_text =~/\|\s*author\s*=\s*[^|}]{4,}/i)

{

# If there's a filled-in "author" parameter, assume a source

Pearle::myLog(3, "Assuming source in {{Information:author}}\n");

$has_source = 1;

}

if($page_text =~/\|\s*permission\s*=\s*[^|}]{4,}/i)

{

# If there's a filled-in "permission" parameter, assume a license (but not a tag)

Pearle::myLog(3, "Assuming license in {{Information:permission}}\n");

$has_license = 1;

}

if($page_text =~/\|\s*flickr_url\s*=\s*[^|}]{4,}/i)

{

# If there's a filled-in "flickr_url" parameter, assume a source

Pearle::myLog(3, "Assuming source in {{Flickr:flickr_url}}\n");

$has_source = 1;

}

}

if((grep {$_ =~ /:Non-free media rationale$/} @templates) or

(grep {$_ =~ /:Non-free use rationale$/} @templates))

{

Pearle::myLog(3, "Has a non-free use rationale template\n");

# Remove the template from the list

@templates = grep {$_ !~ /:Non-free media rationale$/} @templates;

@templates = grep {$_ !~ /:Non-free use rationale$/} @templates;

# Attempt to parse a "non-free use rationale" template or derivative

if($page_text =~ /\|\s*source\s*=\s*[^|}]{4,}/i)

{

# If there's a filled-in "source" parameter, assume a source

Pearle::myLog(3, "Assuming source in {{Non-free * rationale:source}}\n");

$has_source = 1;

}

if($page_text =~ /\|\s*publisher\s*=\s*[^|}]{4,}/i)

{

# If there's a filled-in "publisher" parameter, assume a source

Pearle::myLog(3, "Assuming source in {{Non-free * rationale:publisher}}\n");

$has_source = 1;

}

if($page_text =~ /\|\s*owner\s*=\s*[^|}]{4,}/i)

{

# If there's a filled-in "owner" parameter, assume a source

Pearle::myLog(3, "Assuming source in {{Non-free * rationale:owner}}\n");

$has_source = 1;

}

if($page_text =~ /\|\s*website\s*=\s*[^|}]{4,}/i)

{

# If there's a filled-in "website" parameter, assume a source

Pearle::myLog(3, "Assuming source in {{Non-free * rationale:website}}\n");

$has_source = 1;

}

if($page_text =~ /\|\s*distributor\s*=\s*[^|}]{4,}/i)

{

# If there's a filled-in "distributor" parameter, assume a source

Pearle::myLog(3, "Assuming source in {{Non-free * rationale:distributor}}\n");

$has_source = 1;

}

$has_license = 1; # Assume that it's licensed as "fair use"

}

if(grep {$_ =~ /:Non-free image data$/} @templates)

{

Pearle::myLog(3, "Has a non-free image data template\n");

# Remove the template from the list

@templates = grep {$_ !~ /:Non-free image data$/} @templates;

# Attempt to parse a "non-free use rationale" template

if($page_text =~ /\|\s*source\s*=\s*[^|}]{4,}/i)

{

# If there's a filled-in "source" parameter, assume a source

Pearle::myLog(3, "Assuming source in {{Non-free image data:source}}\n");

$has_source = 1;

}

$has_license = 1; # Assume that it's licensed as "fair use"

}

if(grep {$_ =~ /:spoken article entry$/i} @templates)

{

Pearle::myLog(3, "Has a Spoken Article template\n");

# Remove the template from the list

@templates = grep {$_ !~ /:Spoken article entry$/i} @templates;

# Attempt to parse

if($page_text =~ /\|\s*user_name\s*=[ \t]*\S+/i)

{

# If there's a filled-in "user_name" parameter, assume a source

Pearle::myLog(3, "Assuming source in {{Spoken article entry:user_name}}\n");

$has_source = 1;

}

$has_license = 1; # Assume that it's GFDL

}

# Is the image description page lacking in tags?

# This is checked here because we may have removed "information" or "non-free use rationale" templates from the list earlier

# We want those in the list before here because it makes detecting them for parsing easier, but we don't want them in the list

# here so we can say for sure that the page is untagged.

if(scalar(@templates) == 0)

{

Pearle::myLog(3, "No templates found\n");

$has_tag = 0;

}

# Does it have source information outside of the templates?

if(length($stripped_page_text) >= 7) # Page text with headers, newlines and templates stripped is at least seven bytes ("my work")

{

# TODO: Better source checking

if(!defined($has_source) or $has_source == 0)

{

Pearle::myLog(3, "Assuming page has source\n");

$has_source = 1;

}

if(!defined($has_license) or $has_license == 0)

{

Pearle::myLog(3, "Assuming page has license\n");

$has_license = 1;

}

}

else

{

if(!defined($has_source))

{

# If we still don't know if it has a source, it's safe to assume it doesn't.

Pearle::myLog(3, "Assuming page doesn't have source\n");

$has_source = 0;

}

if(!defined($has_license))

{

# If we still don't know if it has a license, we'll assume it doesn't

Pearle::myLog(3, "Assuming page doesn't have license information\n");

$has_license = 0;

}

}

########## Check for exceptional conditions ##########

# Terminating check: Is the image using a deprecated tag, and doesn't have any other license tag?

if(($has_tag != 1) and (grep {$_ =~ /:$deprecated_tags$/i} @templates))

{

Pearle::myLog(2, "Image has deprecated tag\n");

# Mark as no-license

wikilog($image, tokenSubst("\n{{no copyright information|month=|day=|year=}}"), "Obsolete or deprecated tag");

if(!IsNotified($uploader, undef, $image, undef, \%dont_notify))

{

Pearle::myLog(2, "Warning user $uploader\n");

wikilog("User talk:$uploader", "\n{{subst:User:OrphanBot/deprecated|$image}} --~~~~", "Image with obsolete or deprecated license");

}

Pearle::limit();

next;

}

# Terminating check: Does the image have a forbidden tag?

if(my @tags = grep {$_ =~ /:$forbidden_tags$/i} @templates)

{

# Doesn't matter what else is on the page, the image requires human handling

Pearle::myLog(2, "Forbidden tag $tags[0] found on image :$image\n");

botwarnlog("*Forbidden tag {{tl|$tags[0]}} found on image :$image\n");

Pearle::limit();

next;

}

# TODO: Terminating check: Malformed fair-use rationale

########## Process #####################################

# We've found an image with only unknown templates

if(!defined($has_tag))

{

# We don't know if it has any tags or not, and so we cannot deduce the license or source status

# We know an image doesn't have tags if:

# * It has no templates

# * or all templates are on the "nontags" list

# We know an image has tags if:

# * We found a tag we know about

print "Has unknown tags\n";

my @new_unknown_tags = grep {!defined($unknown_tags{$_})} @templates;

if(scalar(@new_unknown_tags) > 0)

{

foreach my $unknown_tag (@new_unknown_tags)

{

Pearle::myLog(2, "Unknown tag {{$unknown_tag}} found\n");

botwarnlog("* Unknown tag $unknown_tag found\n");

$unknown_tags{$unknown_tag} = 1;

}

Pearle::limit();

}

}

elsif($has_tag == 0)

{

if($has_source == 0)

{

if($has_license == 0)

{

# Tag as "no source" and "no license"

wikilog($image, tokenSubst("\n{{no copyright holder|month=|day=|year=}}\n{{no copyright information|month=|day=|year=}}"), "Image has no source or license information");

if(!IsNotified($uploader, undef, $image, undef, \%dont_notify))

{

Pearle::myLog(2, "Warning user $uploader\n");

wikilog("User talk:$uploader", tokenSubst("{{subst:User:OrphanBot/nosource nolicense|}} --~~~~\n", $image), "You've uploaded an image with no source or license information");

}

Pearle::myLog(2, "No source, no license\n");

Pearle::limit();

}

else

{

# No license tag, and it either has a license or we don't know if it has a license

# Tag as "no source" and "untagged"

wikilog($image, tokenSubst("\n{{no copyright holder|month=|day=|year=}}\n{{untagged|month=|day=|year=}}"), "Image has no source or license tag");

if(!IsNotified($uploader, undef, $image, undef, \%dont_notify))

{

Pearle::myLog(2, "Warning user $uploader\n");

wikilog("User talk:$uploader", tokenSubst("{{subst:User:OrphanBot/nosource untagged|}} --~~~~\n", $image), "You've uploaded an image with no source or license tag");

}

Pearle::myLog(2, "No source, untagged\n");

Pearle::limit();

}

}

else

{

if($has_license == 0)

{

# Tag as "no license"

wikilog($image, tokenSubst("\n{{no copyright information|month=|day=|year=}}"), "Image has no license information");

if(!IsNotified($uploader, undef, $image, undef, \%dont_notify))

{

Pearle::myLog(2, "Warning user $uploader\n");

wikilog("User talk:$uploader", tokenSubst("{{subst:User:OrphanBot/nolicense|}} --~~~~\n", $image), "You've uploaded an image with no license information");

}

Pearle::myLog(2, "No license\n");

Pearle::limit();

}

else

{

# Tag as "untagged"

wikilog($image, tokenSubst("\n{{untagged|month=|day=|year=}}"), "Image has no license tag");

if(!IsNotified($uploader, undef, $image, undef, \%dont_notify))

{

Pearle::myLog(2, "Warning user $uploader\n");

wikilog("User talk:$uploader", tokenSubst("{{subst:User:OrphanBot/untagged-new|}} --~~~~\n", $image), "You've uploaded an image with no license tag");

}

Pearle::myLog(2, "Untagged\n");

Pearle::limit();

}

}

}

else

{

# If it has a tag, it has a license

if($has_source == 0)

{

# Tag as "no source"

wikilog($image, tokenSubst("\n{{no copyright holder|month=|day=|year=}}"), "Image has no source information");

if(!IsNotified($uploader, undef, $image, undef, \%dont_notify))

{

Pearle::myLog(2, "Warning user $uploader\n");

wikilog("User talk:$uploader", tokenSubst("{{subst:User:OrphanBot/nosource-new|}} --~~~~\n", $image), "You've uploaded an image with no source information");

}

Pearle::myLog(2, "No source\n");

Pearle::limit();

}

else

{

# Everything's fine

Pearle::myLog(2, "Image has no problems\n");

}

}

sleep(2);

}

Pearle::myLog(2, "Finished with upload set\n");