User:Interiot/Tool/code

#!/usr/bin/perl
# License: Public domain

# regression test cases:
#     Markadet      fr.wikipedia.org     (11k edits)
#     Kolja21       de.wikipedia.org     (5.1k edits)
#     OldakQuill    en.wikipedia.org     (12k edits)
#     Mxn           meta.wikimedia.org   (1.7k edits)
#     Helios89      it.wikipedia.org     (7k edits)
# TODO:
#   - regarding the "403 access denied" problem, contact noc@wikimedia.org or #wikimedia-tech on freenode
#       - ahh, they actively block screen-scrapers
#       - sweet-talk Hashar or Dom into unblocking, temporarily disable the tool, enable some form of rate limiting, etc.
#   - add a starting cutoff date, so renominations for RfA can include only the most recent items
#   - add a "# edits per day" figure
#   - use something like this to retrieve the list of namespaces in real-time (see the sketch just below this comment block):
#         http://en.wikiquote.org/wiki/Special:Export/Main_Page
#   - make "minor" actually work well for editcountitis:
#       - e.g. for each namespace, present it like:   Category Talk:  23 (13)
#         where "23" is the number of non-minor edits and "13" is the number of minor edits
#   - get it to work with other MediaWikis (example: http://brandt-watch.org/bwwiki/Main_Page)
#   - change the <...>s to <...>s on graph bars
#   - include a date at the end of the top-15 breakdown
#   - don't count a comment as an edit summary when it's purely an autocomment
#   - fix the issue where there's an "extra" first result when $offset > 0
#   - REWRITE IN AJAX so we don't have to worry about whether this is a temporary solution or not
#   - fix the sorting order of the output
#   - ?? http://tools.wikimedia.de/~avar/cgi-bin/count
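
# The TODO item above suggests pulling the namespace list in real time from a
# Special:Export page.  Below is a rough sketch of how that might look; it is NOT
# called anywhere in this script.  It assumes the export XML carries
# <namespace key="...">Name</namespace> elements in its <siteinfo> block, and the
# sub name, regex, and trailing-colon convention are my own guesses (also, per the
# 403 note above, the user-agent may need adjusting before Wikimedia will serve it).
sub fetch_namespaces_sketch {
    my $site = shift;                                  # e.g. "en.wikiquote.org"
    my $xml  = LWP::Simple::get("http://$site/wiki/Special:Export/Main_Page")
        or return;
    my @prefixes;
    while ($xml =~ m{<namespace key="(-?\d+)"[^>]*>([^<]+)</namespace>}g) {
        next if $1 <= 0;                               # skip Media:, Special:, and the main namespace
        push @prefixes, "$2:";                         # nmspc() expects trailing colons
    }
    return nmspc(join "\n", @prefixes);                # same structure as the %valid_namespaces entries
}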

# Possible other analysis graphs:
#   - monthly breakdowns:
#       have all the monthly breakdowns appear in one space on the page, but allow the user to
#       select between them with Javascript
#   - monthly breakdown of major/minor edits (like the current red/green... major edits on the left,
#     with minor edits trailing on the right)
#   - monthly breakdown of the number of edits with summaries matching /^(rv|revert)/
#   - monthly breakdowns, one for each of the separate namespaces
#   - on monthly breakdowns, extrapolate the current month forward (see the sketch after this list)
#   - allow the user to hit (more) at the bottom of the namespace breakdowns, letting them
#     see a more complete list than the top 15
#   - allow the user to restrict the metrics to some specific recent period... e.g. this is
#     something that's sometimes discussed on RfA
#   - any content-based analyses?  (I suppose one would have to know which SQL thingies are quicker than others)
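
# A sketch (called from nowhere) of the "extrapolate the current month forward" idea above:
# scale the edits seen so far this month by how much of the month has elapsed.  The sub name
# and the flat ~30-day month are assumptions; a caller would pass in something like
# $month_totals{monthkey(localtime)}.
sub extrapolate_current_month_sketch {
    my ($edits_so_far) = @_;
    my $day_of_month = (localtime)[3] || 1;                # day of the month, 1..31
    return int($edits_so_far * 30 / $day_of_month + 0.5);  # rough full-month projection
}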

# semi-far-out:
#   - allow the user to see JUST their edits to a specific page, when they click on that page in
#     the top-15 breakdown (furthermore, if structured right, it might let anybody's tool basically
#     pop up the results of a $user && $page query)
#   - allow the results to be the combination of multiple users (either logged-in user + anon IP,
#     or multiple logged-in users from multiple sites, e.g. meta) -- see the sketch below
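
# Sketch only (not wired in): the multiple-user combination above mostly falls out of the fact
# that get_5000() accumulates into the same file-wide totals, so fetching several
# (site, username) pairs in a row merges their edits into one report.  The sub name and
# argument format are my own; note that namespace detection would still use whichever single
# namespace table $this_namespace was set to.
sub count_combined_users_sketch {
    my @accounts = @_;                 # e.g. ( ["en.wikipedia.org", "Example"], ["meta.wikimedia.org", "Example"] )
    foreach my $acct (@accounts) {
        my ($site, $user) = @$acct;
        get_5000($site, $user, 0);     # each call adds its edits to the running totals
    }
}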

use strict;

use warnings;

use CGI;

#use CGI::Carp qw(fatalsToBrowser);

use Date::Parse;

use LWP::Simple;

use HTML::Entities;

use Data::Dumper;

sub LOGFILE {"/home/interiot/public_html/tmp/wannabe_kate.log"}

if ($ENV{QUERY_STRING} eq "code") { # send ourself when requested

open FIN, $0 and print "Content-type: text/plain\n\n", <FIN>;

exit;

}

# fill out using these documents:
#     http://meta.wikimedia.org/wiki/MediaWiki_localisation#Getting_latest_file
#     http://sourceforge.net/docs/E04/

sub nmspc {

my @a = map {s/#.*//; s/^\s+|\s+$//g; $_} grep /\S/, split /[\n\r]+/, shift;

return { "\x00order" => [@a], map { $_,1} @a};

}
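
# For example, nmspc("Talk:\nUser talk:") returns
#     { "\x00order" => ["Talk:", "User talk:"], "Talk:" => 1, "User talk:" => 1 }
# i.e. a lookup hash of valid namespace prefixes, plus the original ordering stashed
# under the "\x00order" key (which can never collide with a real prefix).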

my %valid_namespaces = (

'en.wikipedia.org' => nmspc(qq[

Talk:

Category talk:

Category:

Help:

Help talk:

Image:

Image talk:

MediaWiki:

MediaWiki talk:

Portal:

Portal talk:

Template:

Template talk:

User:

User talk:

Wikipedia:

Wikipedia talk:

]),

'de.wikipedia.org' => nmspc(qq[

Diskussion: # Talk

Kategorie: # Category:

Kategorie Diskussion: # Category Talk:

Hilfe: # Help:

Hilfe Diskussion: # Help Talk:

Bild: # Image:

Bild Diskussion: # Image Talk:

MediaWiki: # MediaWiki:

MediaWiki Diskussion: # MediaWiki Talk:

Portal: # Portal:

Portal Diskussion: # Portal Talk:

Vorlage: # Template:

Vorlage Diskussion: # Template Talk:

Benutzer: # User:

Benutzer Diskussion: # User Talk:

Wikipedia: # Wikipedia:

Wikipedia Diskussion: # Wikipedia Talk:

]),

'it.wikipedia.org' => nmspc(qq[

Discussione: # Talk:

Categoria: # Category:

Discussioni categoria: # Category Talk:

Aiuto: # Help:

Discussioni aiuto: # Help Talk:

Immagine: # Image:

Discussioni immagine: # Image Talk:

MediaWiki: # MediaWiki:

Discussioni MediaWiki: # MediaWiki Talk:

Template: # Template:

Discussioni template: # Template Talk:

Utente: # User:

Discussioni utente: # User Talk:

Wikipedia: # Wikipedia:

Discussioni Wikipedia: # Wikipedia Talk:

]),

);

my $query = new CGI;

my $site = $query->param("site");

my $username = CGI::Util::escape($query->param("username"));

$username =~ s/[\+\s]/_/g;
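# (MediaWiki page titles use "_" where the displayed title has a space)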

my $isvalid = 0;

my $this_namespace;

if ($ENV{QUERY_STRING}) {

$isvalid = 1;

$isvalid = 0 unless ($site =~ /^[\w\.]*\.(org|com|net)$/i);

#$isvalid = 0 unless ($username =~ /^[-\w\._]*$/);

$isvalid = 0 if (length($username) == 0);

}

# data we generate by parsing the output from Wikipedia

my @urls;

my $bandwidth_down = 0;

my %namespace_totals;

my $xml_lang = "";

my $earliest_perldate;

my $latest_perldate;

my %month_totals;

my %month_editsummary_totals;

my %unique_articles;

my %namespace_unique_articles;

my %article_titles;

print "Content-type: text/html; charset=utf-8\n\n";

#cgi_dumper(\%valid_namespaces);

if (!$isvalid) {

if ($ENV{QUERY_STRING}) {

print "Invalid value.  Email Interiot if this is incorrect.\n";

}

print <<"EOF";

This is a slow substitute for

<a href="http://en.wikipedia.org/wiki/Wikipedia:Kate%27s_Tool">Kate's Tool</a> when it's unavailable.

username

site

Notes:

  • Green bars are for edit summaries, red bars are for edits with no summaries

  • The statistics are real-time (it scrapes data off of the Special:Contributions page while you wait).

  • It's somewhat slow for edit counts over 5000

  • It's unable to count deleted edits

  • It should work with most wikis out there that use MediaWiki, since it doesn't need privileged access to the databases.

  • Source code is in the public domain and available here

  • Warning: metrics are evil

For bug reports/comments, see User talk:Interiot or email him.

EOF

} else {

$this_namespace = $valid_namespaces{lc $site};

#cgi_dumper(\$this_namespace); exit;

$username =~ s/^_+|_+$//g;

#print "$site $username\n";

$namespace_totals{earliest} = get_5000($site, $username, 0);

#cgi_dumper(\@urls, \%namespace_totals); exit;

#cgi_dumper(\%unique_articles);

$namespace_totals{"number of unique articles"} = scalar(keys %unique_articles);

$namespace_totals{"avg edits per article"} = sprintf("%5.2f", $namespace_totals{total} / $namespace_totals{"number of unique articles"});

print $xml_lang, <<'EOF';

EOF

print "

    Go back to see caveats or to check another user.
\n";

print "

User:$username

\n";

print "

\n";

foreach my $key (sort keys %namespace_totals) {

print "

", $key, "", $namespace_totals{$key}, "\n";

}

print "

\n";

#### output the months stats

#cgi_dumper(\%month_editsummary_totals);

my @months = list_months();

my $max_width = 0;

$max_width = ($_ > $max_width ? $_ : $max_width) foreach (values %month_totals);
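# $max_width is now the busiest month's edit count; the graph bars are scaled against it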

if ($max_width > 0) {

print "\n";

foreach my $month (@months) {

my $no_summary = $month_totals{$month} - $month_editsummary_totals{$month};

print "$month ", $month_totals{$month}, "\n";

#print "\n";

print "\n";

print "\n";

}

print "\n";

}

#### output the top-15 namespace stats

my $num_to_present = 15;

if ($this_namespace) { # only do it if we're sure about the namespaces

print "


\n";

#print "NOTE: This section has a tendency to highlight a user's \"youthful indiscretions\". Please take the dates of the edits into account.\n";

foreach my $nmspc ("Mainspace", @{$this_namespace->{"\x00order"}}) {

next unless %{$namespace_unique_articles{$nmspc}};

my @articles = sort {$namespace_unique_articles{$nmspc}{$b} <=> $namespace_unique_articles{$nmspc}{$a}}

grep { $namespace_unique_articles{$nmspc}{$_} > 1} # filter out items with only 1 edit

keys(%{$namespace_unique_articles{$nmspc}});

next unless @articles;

#print "\n";

print "

$nmspc\n";

my @present = splice(@articles, 0, $num_to_present);

foreach my $article (@present) {

my $artname = $article_titles{$article};

if ($nmspc ne 'Mainspace') {

$artname =~ s/^.*?://;

}

$artname =~ s/\s/&nbsp;/g;   # keep article names from wrapping inside the float layout

my $url = "http://$site/w/index.php?title=$article&action=history";

print "

", $namespace_unique_articles{$nmspc}{$article}, "$artname\n";

}

# fill it out so float:left doesn't jumble up

foreach (@present..14) {

print "

  \n";

}

print "

\n";

#print "\n";

}

}

#### output the bottom summary

print "



If there were any problems, please email Interiot or post at User talk:Interiot.\n";

#print "Based on these URLs:\n";

#### log the bandwidth used

open FOUT, ">>" . LOGFILE() or die;

printf FOUT "%s %-20s %-30s %5dK %7d\n", scalar(localtime), $username, $site,

int($bandwidth_down / 1024), $namespace_totals{total};

close FOUT;

}

sub get_5000 {

my $site = shift;

my $username = shift;

my $offset = shift;

my $earliest = "";

my $url = "http://$site/w/index.php?title=Special:Contributions&target=$username&offset=${offset}&limit=5000";

if (! $LWP::Simple::ua) {

LWP::Simple::_init_ua();

#$LWP::Simple::ua->agent("Mozilla/4.0 WebTV/2.6 (compatible; MSIE 4.0)"); # apparently they're picky about useragent strings

$LWP::Simple::ua->agent("Wget/1.9.1"); # apparently they're picky about useragent strings. Use the same as wget.

}

push(@urls, $url);

if (@urls >= 10) {

print "Too many pages fetched.  Terminating.\n";

#cgi_dumper(\@urls); exit;

}

my $page;

if (1) {

my $request = HTTP::Request->new(GET => $url);

my $response = $LWP::Simple::ua->request($request);

if (!$response->is_success) {

print "While trying to fetch $url, $site responded:

\n", $response->status_line, "

", $response->content;

exit;

}

$page = $response->content;

$bandwidth_down += length($page);

if (0) {

local *FOUTOUT;

open FOUTOUT, ">/var/tmp/kate/tmp.out" or die;

print FOUTOUT $page;

close FOUTOUT;

}

} else {

open FININ, "</var/tmp/kate/tmp.out" or die;

local $/ = undef;

$page = <FININ>;

close FININ;

}

if ($page =~ /(<html[^>]+>)/i) {   # capture the <html ...> tag (it carries the xml:lang attribute) so the report can re-emit it

$xml_lang = $1;

}

## parse each individual contribution

#while ($page =~ /^<li>(\d\d:\d\d,.*)/igm) {

while ($page =~ /^<li>([^(]+\(.*)/igm) {

    my $this_time;

    local $_ = $1;

    my $edit_summary;

    #$edit_summary++ if (m#[^<]*\s*\(#is);

    $edit_summary++ if (/<span class=['"]comment['"]>/si);   # MediaWiki wraps edit summaries in <span class="comment">

    my $article_url;

    if (m#<a href="/wiki/([^"]+)"[^>]*>([^<]+)</a>#si) {   # grab the linked page (for the history URL) and its display title

    $article_url = $1;

    $article_titles{$1} = $2;

    }

    $unique_articles{$article_url}++;

    ## strip out all the HTML tags

    s/<[^>]*>//gs;

    if (/^(.*?) \(/) {

    my $date = $1;

    $earliest = $date;

    # translate month names into English, so Date::Parse can handle them

    # languages believed to work here: EN, DE, IT

    $date =~ s/\b(?:gen )\b/jan/gix;

    $date =~ s/\b(?:mär )\b/mar/gix;

    $date =~ s/\b(?:mai|mag )\b/may/gix;

    $date =~ s/\b(?:giu )\b/jun/gix;

    $date =~ s/\b(?:lug )\b/jul/gix;

    $date =~ s/\b(?:ago )\b/aug/gix;

    $date =~ s/\b(?:set )\b/sep/gix;

    $date =~ s/\b(?:okt|ott )\b/oct/gix;

    $date =~ s/\b(?:dez|dic )\b/dec/gix;

    $this_time = str2time($date);

    if ($this_time == 0) {

    #print "XXXXXXXXXXXXXXXXXXXXXXXXX\n";

    } else {

    #print scalar(gmtime($this_time)), "\n";

    $earliest_perldate = $this_time; # record the earliest and latest month we see

    $latest_perldate ||= $this_time;

    my $monthkey = monthkey(localtime($this_time));

    $month_totals{$monthkey}++;

    $edit_summary && $month_editsummary_totals{$monthkey}++;

    }

    }

    s/^[^()]*\([^()]*\) \([^()]*\) (?:\S )? //;   # strip the leading date, "(hist) (diff)", and any single-character minor-edit flag

    my $subspace = "Mainspace";

    if (/^([^\s\d\/:]+(?:\s[^\s\d\/:]+)?:)/) {

    if (!$this_namespace || exists $this_namespace->{$1}) {

    $subspace = $1;

    }

    }

    $namespace_totals{$subspace}++;

    $namespace_totals{total}++;

    $namespace_unique_articles{$subspace}{$article_url}++;

    #print "$_\n";

    }

    ## if they have more than 5000 contributions, go to the next page

    while ($page =~ /href="[^"]+:Contributions[^"]+offset=(\d+)/ig) {

    #print "Trying again at offset $1\n";

    next unless $1 > 0 && ($offset == 0 || $1 < $offset);

    return get_5000($site, $username, $1); # tail recursion until there are no more

    }

    return $earliest;

    }

    # returns something like [
    #     "2003/10",
    #     "2003/11",
    #     "2003/12"
    # ]

    sub list_months {

    my $last_monthkey = '';

    my @ret;

    # yes, this is a fairly odd algorithm. oh well.
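    # (stepping 10 days at a time still lands in every month at least once, since no month
    # is shorter than 28 days, while keeping the number of iterations small)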

    for (my $date=$earliest_perldate; $date<=$latest_perldate; $date+=10*24*60*60) {

    my $monthkey = monthkey(localtime($date));

    if ($monthkey ne $last_monthkey) {

    push(@ret, $monthkey);

    $last_monthkey = $monthkey;

    }

    }

    return @ret;

    }

    sub monthkey {($_[5] + 1900) . "/" . ($_[4] + 1)}
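    # (monthkey takes a localtime() list and returns "YYYY/M", e.g. "2006/2"; the month is not zero-padded)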

    sub cgi_dumper {print "<pre>", HTML::Entities::encode(Dumper(@_)), "</pre>"}