User:Interiot/Tool/code
#!/usr/bin/perl
# License: Public domain
#
# regression test cases:
#       Markadet        fr.wikipedia.org        (11k edits)
#       Kolja21         de.wikipedia.org        (5.1k edits)
#       OldakQuill      en.wikipedia.org        (12k edits)
#       Mxn             meta.wikimedia.org      (1.7k edits)
#       Helios89        it.wikipedia.org        (7k edits)
#
# TODO:
#       - regarding the "403 access denied" problem, contact noc@wikimedia.org or #wikimedia-tech on freenode
#               - ahh, they actively block screen-scrapers
#               - sweet-talk Hashar or Dom into unblocking, temporarily disable the tool, or enable
#                 some form of rate limiting, etc. (see the sleep() sketch in get_5000() below)
#       - add a starting-cutoff-date, so renominations for RfA could only include the most recent
#         items (see the after_cutoff() sketch below)
#       - add a # of edits per day (see the edits_per_day() sketch below)
#       - use something like this to retrieve the list of namespaces in real-time (see the
#         fetch_namespaces() sketch after the hardcoded tables below):
#               http://en.wikiquote.org/wiki/Special:Export/Main_Page
#       - make "minor" actually work well for editcountitis (see the is_minor_edit() sketch below):
#               - eg. for each namespace, present it like:      Category Talk:  23 (13)
#                 where "23" is the non-minor edits, and "13" is the minor edits
#       - get it to work with other mediawikis (example: http://brandt-watch.org/bwwiki/Main_Page)
#       - include a date at the end of the top-15 breakdown
#       - change the <div>s to <span>s on graph bars
#       - don't count comments as having an edit summary when it's purely an autocomment
#       - fix the issue where there's an "extra" first result when $offset > 0
#       - REWRITE IN AJAX so we don't have to worry about it being a temporary solution or not
#       - fix the sorting order on the output
#       - ?? http://tools.wikimedia.de/~avar/cgi-bin/count
#
# Possible other analysis graphs:
#       - monthly breakdowns
#               - have all the monthly breakdowns appear in one space on the page, but allow the
#                 user to select between them with Javascript
#               - monthly breakdown of major/minor edits (like the current red/green... make major
#                 edits on the left, with minor edits trailing on the right)
#               - monthly breakdown of the number of edits with summaries of /^(rv|revert)/
#               - monthly breakdown, one each for the separate namespaces
#               - on monthly breakdowns, extrapolate the current month forward (see the
#                 extrapolate_current_month() sketch below)
#       - allow the user to hit (more) at the bottom of the namespace breakdowns, letting them see
#         a more complete list than the top-15
#       - allow the user to restrict the metrics to some specific recent period... eg. this is
#         something that's sometimes discussed on RfA
#       - any content-based analyses? (I suppose one would have to know which SQL thingies are
#         quicker than others)
#
# semi-far-out:
#       - allow the user to see JUST their edits on a specific page, when they click on that page
#         in the top-15 breakdown (furthermore, if structured right, it might basically let
#         anybody's tool pop up the results of a $user && $page query)
#       - allow the results to be the combination of multiple users (either logged-in-user +
#         anon-IP, or multiple logged-in-users from multiple sites, eg. meta)
use strict;
use warnings;
use CGI;
#use CGI::Carp qw(fatalsToBrowser);
use Date::Parse;
use LWP::Simple;
use HTML::Entities;
use Data::Dumper;
sub LOGFILE {"/home/interiot/public_html/tmp/wannabe_kate.log"}
if ($ENV{QUERY_STRING} eq "code") {             # send ourself when requested
    open FIN, $0 and print "Content-type: text/plain\n\n", <FIN>;
    exit;
}
# fill out using these documents:
#       http://meta.wikimedia.org/wiki/MediaWiki_localisation#Getting_latest_file
#       http://sourceforge.net/docs/E04/
# parse an indented block of namespace names: strip comments and whitespace, and
# return a hashref that doubles as an ordered list (under "\x00order") and a lookup set
sub nmspc {
    my @a = map {s/#.*//; s/^\s+|\s+$//g; $_} grep /\S/, split /[\n\r]+/, shift;
    return { "\x00order" => [@a], map { $_,1} @a};
}
my %valid_namespaces = (
'en.wikipedia.org' => nmspc(qq[
Talk:
Category talk:
Category:
Help:
Help talk:
Image:
Image talk:
MediaWiki:
MediaWiki talk:
Portal:
Portal talk:
Template:
Template talk:
User:
User talk:
Wikipedia:
Wikipedia talk:
]),
'de.wikipedia.org' => nmspc(qq[
Diskussion: # Talk
Kategorie: # Category:
Kategorie Diskussion: # Category Talk:
Hilfe: # Help:
Hilfe Diskussion: # Help Talk:
Bild: # Image:
Bild Diskussion: # Image Talk:
MediaWiki: # MediaWiki:
MediaWiki Diskussion: # MediaWiki Talk:
Portal: # Portal:
Portal Diskussion: # Portal Talk:
Vorlage: # Template:
Vorlage Diskussion: # Template Talk:
Benutzer: # User:
Benutzer Diskussion: # User Talk:
Wikipedia: # Wikipedia:
Wikipedia Diskussion: # Wikipedia Talk:
]),
'it.wikipedia.org' => nmspc(qq[
Discussione: # Talk:
Categoria: # Category:
Discussioni categoria: # Category Talk:
Aiuto: # Help:
Discussioni aiuto: # Help Talk:
Immagine: # Image:
Discussioni immagine: # Image Talk:
MediaWiki: # MediaWiki:
Discussioni MediaWiki: # MediaWiki Talk:
Template: # Template:
Discussioni template: # Template Talk:
Utente: # User:
Discussioni utente: # User Talk:
Wikipedia: # Wikipedia:
Discussioni Wikipedia: # Wikipedia Talk:
]),
);
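
# The TODO above mentions retrieving the namespace list in real-time instead of
# hardcoding these tables; the Special:Export XML carries a <namespaces> block
# in its <siteinfo> header, so a sketch could look like this.  (Not wired in;
# fetch_namespaces() and its regex-based XML parsing are assumptions, not part
# of the original tool.)
sub fetch_namespaces {
    my $site = shift;
    my $xml = LWP::Simple::get("http://$site/wiki/Special:Export/Main_Page")
        or return undef;
    my @names;
    # each namespace appears as eg. <namespace key="14">Category</namespace>
    while ($xml =~ m#<namespace key="(-?\d+)"[^>]*>([^<]+)</namespace>#g) {
        push(@names, "$2:") if $1 > 0;          # skip the main (0) and virtual (<0) namespaces
    }
    return @names ? nmspc(join("\n", @names)) : undef;
}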
my $query = new CGI;
my $site = $query->param("site");
my $username = CGI::Util::escape($query->param("username"));
$username =~ s/[\+\s]/_/g;
my $isvalid = 0;
my $this_namespace;
if ($ENV{QUERY_STRING}) {
    $isvalid = 1;
    $isvalid = 0 unless ($site =~ /^[\w\.]*\.(org|com|net)$/i);
    #$isvalid = 0 unless ($username =~ /^[-\w\._]*$/);
    $isvalid = 0 if (length($username) == 0);
}
# data we generate by parsing the output from Wikipedia
my @urls;
my $bandwidth_down = 0;
my %namespace_totals;
my $xml_lang = "";
my $earliest_perldate;
my $latest_perldate;
my %month_totals;
my %month_editsummary_totals;
my %unique_articles;
my %namespace_unique_articles;
my %article_titles;
print "Content-type: text/html; charset=utf-8\n\n";
#cgi_dumper(\%valid_namespaces);
if (!$isvalid) {
    if ($ENV{QUERY_STRING}) {
        print "<p>Invalid value.  email Interiot if this is incorrect.</p>\n";
    }
    print <<"EOF";
<p>This is a slow substitute for
<a href="http://en.wikipedia.org/wiki/Wikipedia:Kate%27s_Tool">Kate's Tool</a>
when it's unavailable.</p>

<form method=get>
    Site: <input type=text name=site value="en.wikipedia.org">
    Username: <input type=text name=username>
    <input type=submit value="Count">
</form>

<p>Notes:</p>
<ul>
<li> Green bars are for edit summaries, red bars are for edits with no summaries
<li> The statistics are real-time (it scrapes data off of the Special:Contributions page while you wait).
<li> It's somewhat slow for edit counts over 5000
<li> It's unable to count deleted edits
<li> It should work with most wikis out there that use MediaWiki, since it doesn't need privileged access to the databases.
<li> Source code is in the public domain and available <a href="?code">here</a>
<li> Warning: metrics are evil
</ul>
<p>For bug reports/comments, see <a href="http://en.wikipedia.org/wiki/User_talk:Interiot">User talk:Interiot</a> or email him.</p>
EOF
} else {
    $this_namespace = $valid_namespaces{lc $site};
    #cgi_dumper(\$this_namespace); exit;
    $username =~ s/^_+|_+$//g;
    #print "$site $username<br>\n";
    $namespace_totals{earliest} = get_5000($site, $username, 0);
    #cgi_dumper(\@urls, \%namespace_totals); exit;
    #cgi_dumper(\%unique_articles);
    $namespace_totals{"number of unique articles"} = scalar(keys %unique_articles);
    $namespace_totals{"avg edits per article"} = sprintf("%5.2f",
            $namespace_totals{total} / $namespace_totals{"number of unique articles"});
    print $xml_lang, <<'EOF';
<style type="text/css">
    td {padding: .1em 1em .1em}
    table.months {padding-top: 2em}
    table.months td.date {font-weight: bold}
    table.months td {font-size: 75%}
    div.red, div.green {
        height:1em;
        float:left;
    }
    div.red   {background-color: #f00}
    div.green {background-color: #0f0}
    div.topN {
        float: left;
        min-height: 30em;       /* otherwise, they get ALL jumbled up */
    }
    table.topN {
        float: left;
        border: 1px solid black;
    }
    table.topN th {
        background-color: #000;
        color: #fff;
    }
    table.topN td {
        /* override the above */
        padding: .1em .3em;
    }
</style>
EOF
print "
- Go back to see caveats or to check another user.
print "
User:$username
\n";print "
\n";
\n";foreach my $key (sort keys %namespace_totals) {
print "
", $key, " ", $namespace_totals{$key}, "\n"; }
print "
    #### output the months stats
    #cgi_dumper(\%month_editsummary_totals);
    my @months = list_months();
    my $max_width = 0;
    $max_width = ($_ > $max_width ? $_ : $max_width) foreach (values %month_totals);
    if ($max_width > 0) {
        print "<table class=months>\n";
        foreach my $month (@months) {
            my $no_summary = $month_totals{$month} - ($month_editsummary_totals{$month} || 0);
            print "<tr><td class=date>$month</td><td>", $month_totals{$month}, "</td><td>\n";
            #print "(", ($month_editsummary_totals{$month} || 0), " / $no_summary)\n";
            # green bar for edits with summaries, red bar for edits without; both are
            # scaled against the busiest month (the 300px full-scale width is a guess)
            print "<div class=green style=\"width: ", int(300 * ($month_editsummary_totals{$month} || 0) / $max_width), "px\"></div>\n";
            print "<div class=red style=\"width: ", int(300 * $no_summary / $max_width), "px\"></div></td></tr>\n";
        }
        print "</table>\n";
    }
    #### output the top-15 namespace stats
    my $num_to_present = 15;
    if ($this_namespace) {              # only do it if we're sure about the namespaces
        print "<br clear=all>\n";
        #print "<li>NOTE: This section has a tendency to highlight a user's \"youthful indiscretions\".  Please take the dates of the edits into account.\n";
        foreach my $nmspc ("Mainspace", @{$this_namespace->{"\x00order"}}) {
            next unless %{$namespace_unique_articles{$nmspc}};
            my @articles = sort {$namespace_unique_articles{$nmspc}{$b} <=> $namespace_unique_articles{$nmspc}{$a}}
                           grep { $namespace_unique_articles{$nmspc}{$_} > 1}           # filter out items with only 1 edit
                           keys(%{$namespace_unique_articles{$nmspc}});
            next unless @articles;
            #print "<div class=topN>\n";
            print "<table class=topN>\n";
            print "<tr><th colspan=2>$nmspc</th></tr>\n";
            my @present = splice(@articles, 0, $num_to_present);
            foreach my $article (@present) {
                my $artname = $article_titles{$article};
                if ($nmspc ne 'Mainspace') {
                    $artname =~ s/^.*?://;              # trim the namespace prefix from the displayed name
                }
                $artname =~ s/\s/&nbsp;/g;
                my $url = "http://$site/w/index.php?title=$article&action=history";
                print "<tr><td>", $namespace_unique_articles{$nmspc}{$article}, "</td><td><a href=\"$url\">$artname</a></td></tr>\n";
            }
            # fill it out so float:left doesn't jumble up
            foreach (@present..14) {
                print "<tr><td colspan=2>&nbsp;</td></tr>\n";
            }
            print "</table>\n";
            #print "</div>\n";
        }
    }
    #### output the bottom summary
    print "<p>If there were any problems, please email Interiot or post at User talk:Interiot.</p>\n";
    #print "<p>Based on these URLs:\n<ul>\n", join("\n", map {"<li>$_"} @urls), "</ul>\n";
    print "<p>Based directly on these URLs:\n";
    foreach my $ctr (0..$#urls) {
        print "<a href=\"$urls[$ctr]\">[", ($ctr+1), "]</a>";
        print ", " unless ($ctr >= @urls - 1);
        print "\n";
    }
    print "</p>\n";
    #### log the bandwidth used
    open FOUT, ">>" . LOGFILE() or die;
    printf FOUT "%s %-20s %-30s %5dK %7d\n", scalar(localtime), $username, $site,
                int($bandwidth_down / 1024), $namespace_totals{total};
    close FOUT;
}
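
# A sketch for the "minor" item in the TODO: in the contributions list, a minor
# edit is flagged with markup along the lines of <span class="minor">m</span>.
# Calling this on the raw line (before its tags are stripped) inside
# get_5000()'s parse loop could feed a per-namespace minor count for the
# "Category Talk:  23 (13)" display.  (Hypothetical helper; the exact class
# name in that era's MediaWiki output is an assumption.)
sub is_minor_edit {
    my $line = shift;
    return $line =~ /<span class=['"]?minor['"]?>/i ? 1 : 0;
}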
sub get_5000 {
    my $site = shift;
    my $username = shift;
    my $offset = shift;
    my $earliest = "";

    my $url = "http://$site/w/index.php?title=Special:Contributions&target=$username&offset=${offset}&limit=5000";

    if (! $LWP::Simple::ua) {
        LWP::Simple::_init_ua();
        #$LWP::Simple::ua->agent("Mozilla/4.0 WebTV/2.6 (compatible; MSIE 4.0)");      # apparently they're picky about useragent strings
        $LWP::Simple::ua->agent("Wget/1.9.1");          # apparently they're picky about useragent strings.  Use the same as wget.
    }

    push(@urls, $url);
    if (@urls >= 10) {
        print "Too many pages fetched.  Terminating.<br>\n";
        #cgi_dumper(\@urls);
        exit;
    }
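
    # A possible stopgap for the screen-scraper-blocking problem in the TODO:
    # pause between successive Special:Contributions fetches.  (A sketch only;
    # the two-second figure is a guess, not a server-sanctioned rate.)
    #sleep(2) if @urls > 1;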
    my $page;
    if (1) {
        my $request = HTTP::Request->new(GET => $url);
        my $response = $LWP::Simple::ua->request($request);
        if (!$response->is_success) {
            print "While trying to fetch $url, $site responded:<br>\n",
                  $response->status_line, "<br>", $response->content;
            exit;
        }
        $page = $response->content;
        $bandwidth_down += length($page);
        if (0) {
            # debugging: save a copy of the fetched page
            local *FOUTOUT;
            open FOUTOUT, ">/var/tmp/kate/tmp.out" or die;
            print FOUTOUT $page;
            close FOUTOUT;
        }
    } else {
        # debugging: reparse the page saved above instead of fetching
        open FININ, "</var/tmp/kate/tmp.out" or die;
        local $/ = undef;
        $page = <FININ>;
        close FININ;
    }

    # remember the site's <html ...> tag so its lang/dir attributes can be echoed back
    if ($page =~ /(<html[^>]+>)/i) {
        $xml_lang = $1;
    }
    ## parse each individual contribution
    #while ($page =~ /^<li>(\d\d:\d\d,.*)/igm) {
    while ($page =~ /^<li>([^(]+\(.*)/igm) {
        my $this_time;
        local $_ = $1;

        my $edit_summary;
        #$edit_summary++ if (m#</a>[^<]*\s*\(#is);
        $edit_summary++ if (/<span class=['"]?comment['"]?>/si);

        my $article_url;
        # grab the article link, eg. <a href="/wiki/Foo" title="Foo">Foo</a>
        if (m#<a href="/wiki/([^"]+)"[^>]*>([^<]+)</a>#si) {
            $article_url = $1;
            $article_titles{$1} = $2;
        }
        $unique_articles{$article_url}++;

        ## strip out all the HTML tags
        s/<[^>]*>//gs;

        if (/^(.*?) \(/) {
            my $date = $1;
            $earliest = $date;
            # translate months into english, so Date::Parse can handle them
            # languages believed to work here:  EN, DE, IT
            $date =~ s/\b(?:gen )\b/jan/gix;
            $date =~ s/\b(?:mär )\b/mar/gix;
            $date =~ s/\b(?:mai|mag )\b/may/gix;
            $date =~ s/\b(?:giu )\b/jun/gix;
            $date =~ s/\b(?:lug )\b/jul/gix;
            $date =~ s/\b(?:ago )\b/aug/gix;
            $date =~ s/\b(?:set )\b/sep/gix;
            $date =~ s/\b(?:okt|ott )\b/oct/gix;
            $date =~ s/\b(?:dez|dic )\b/dec/gix;
            $this_time = str2time($date);
            if (!$this_time) {
                #print "XXXXXXXXXXXXXXXXXXXXXXXXX<br>\n";
            } else {
                #print scalar(gmtime($this_time)), "<br>\n";
                $earliest_perldate = $this_time;        # record the earliest and latest month we see
                $latest_perldate ||= $this_time;
                my $monthkey = monthkey(localtime($this_time));
                $month_totals{$monthkey}++;
                $edit_summary && $month_editsummary_totals{$monthkey}++;
            }
        }

        # strip the leading "date (hist) (diff) m " so any namespace prefix is at the front
        s/^[^()]*\([^()]*\) \([^()]*\) (?:\S )? //;
        my $subspace = "Mainspace";
        if (/^([^\s\d\/:]+(?:\s[^\s\d\/:]+)?:)/) {
            if (!$this_namespace || exists $this_namespace->{$1}) {
                $subspace = $1;
            }
        }
        $namespace_totals{$subspace}++;
        $namespace_totals{total}++;
        $namespace_unique_articles{$subspace}{$article_url}++;
        #print "$_<br>\n";
    }

    ## if they have more than 5000 contributions, go to the next page
    while ($page =~ /href="[^"]+:Contributions[^"]+offset=(\d+)/ig) {
        #print "Trying again at offset $1<br>\n";
        next unless $1 > 0 && ($offset == 0 || $1 < $offset);
        return get_5000($site, $username, $1);          # tail recursion until there are no more
    }
    return $earliest;
}
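
# A sketch for the starting-cutoff-date item in the TODO (restricting the
# metrics to a recent period, eg. for RfA renominations).  It assumes a
# hypothetical "cutoff" CGI parameter; get_5000()'s parse loop could call this
# right after computing $this_time, and skip older edits with
# "next unless after_cutoff($this_time);".  (Not wired in.)
sub after_cutoff {
    my $this_time = shift;                                      # epoch seconds, from str2time()
    my $cutoff = $query->param("cutoff") or return 1;           # no cutoff given => keep everything
    my $cutoff_time = str2time($cutoff) or return 1;            # unparseable cutoff => keep everything
    return $this_time >= $cutoff_time;
}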
# returns something like [
#       "2003/10",
#       "2003/11",
#       "2003/12"
# ]
sub list_months {
    my $last_monthkey = '';
    my @ret;
    # yes, this is a fairly odd algorithm. oh well.  (stepping through the range
    # ten days at a time guarantees every month in it gets visited at least once)
    for (my $date=$earliest_perldate; $date<=$latest_perldate; $date+=10*24*60*60) {
        my $monthkey = monthkey(localtime($date));
        if ($monthkey ne $last_monthkey) {
            push(@ret, $monthkey);
            $last_monthkey = $monthkey;
        }
    }
    return @ret;
}
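
# A sketch for the "extrapolate the current month forward" item in the TODO:
# scale the current (partial) month's total by how much of the month has
# elapsed.  (Hypothetical helper, not wired into the month table; 30 days is
# close enough for a projection.)
sub extrapolate_current_month {
    my @now = localtime();
    my $count = $month_totals{monthkey(@now)} or return undef;
    return int($count * 30 / $now[3]);          # $now[3] is the day of the month, 1..31
}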
sub monthkey {($_[5] + 1900) . "/" . ($_[4] + 1)}       # takes a localtime() list, returns eg. "2005/11"
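
# A sketch for the edits-per-day item in the TODO: after get_5000() has run,
# $earliest_perldate and $latest_perldate bound the account's active span, so
# a rate falls out directly.  (Not wired into the output anywhere.)
sub edits_per_day {
    return undef unless $earliest_perldate && $latest_perldate && $namespace_totals{total};
    my $days = ($latest_perldate - $earliest_perldate) / (24*60*60);
    $days = 1 if $days < 1;                     # avoid dividing by ~zero for brand-new accounts
    return sprintf("%.2f", $namespace_totals{total} / $days);
}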
sub cgi_dumper {print "<pre>", HTML::Entities::encode(Dumper(@_)), "</pre>"}