# Wikipedia:Duplicated sections/script

# Hot pipes

# Unbuffer STDOUT ("hot pipes") so the progress counter printed below
# appears immediately instead of being held in the stdio buffer.
$| = 1;

# This script is expecting entries.txt to be a relatively raw database
# dump that has been pre-processed to put each page on a line by itself.
# On 31 July 2005, this script ran on a 1.2GHz i686 laptop with ~700MB
# RAM in about 20 minutes. Not using the dupHeaders() filter will
# cause it to take probably about 5 hours or more.
# The author of this script is Christopher Beland, User:Beland on
# en.wikipedia.org. It is hereby released into the Public Domain.
# Feel free to use it for any purpose whatsoever.

use strict;

# All work happens in main(); the sort of the report file runs afterwards
# at the bottom of this script.
main();

# Scan every page in data/entries.txt for repeated 3-word sequences
# ("chains") and write a wiki-formatted report to
# todo/duplicate-chunks.txt. Only pages that dupHeaders() flags as having
# a duplicated section header are scanned, which makes the run ~15x
# faster (see header comment).
sub main
{
    my ($cur_id, $cur_namespace, $cur_title, $cur_text, @junk, $line,
        $cur_namespace_name, $j, @tokens, $printed, $chain);

    # MediaWiki namespace number -> title prefix. The leading ":" on the
    # Image prefix makes the wiki output link to the page rather than
    # embed the image; Category gets the same treatment.
    # (Was an 18-branch elsif chain; namespace 14 was ":Category" with a
    # missing trailing colon — fixed here.)
    my %namespace_name = (
        -2 => "Media:",
        -1 => "Special:",
         0 => "",
         1 => "Talk:",
         2 => "User:",
         3 => "User_talk:",
         4 => "Wikipedia:",
         5 => "Wikipedia_talk:",
         6 => ":Image:",
         7 => "Image_talk:",
         8 => "MediaWiki:",
         9 => "MediaWiki_talk:",
        10 => "Template:",
        11 => "Template_talk:",
        12 => "Help:",
        13 => "Help_talk:",
        14 => ":Category:",
        15 => "Category_talk:",
    );

    unless (-d "./todo")
    {
        mkdir "./todo";
    }

    # Three-arg opens with lexical handles; the write-side die message
    # previously named the wrong file ("blank-pages.txt").
    open (my $entries, "<", "data/entries.txt")
        || die "Cannot read data/entries.txt";
    open (my $duphead, ">", "todo/duplicate-chunks.txt")
        || die "Cannot write todo/duplicate-chunks.txt";

    while (<$entries>)
    {
        # Progress counter, refreshed in place every 100 pages.
        if (++$j % 100 == 0)
        {
            print STDERR $j."\r";
        }

        $line = $_;
        # SECURITY NOTE: string eval of a line from the dump. This trusts
        # data/entries.txt completely — a malicious dump line executes as
        # Perl. Acceptable only because the input is a locally prepared
        # database dump; do not feed this untrusted data.
        eval("\@tokens = $line");
        ($cur_id, $cur_namespace, $cur_title, $cur_text, @junk)
            = @tokens;

        # Cheap pre-filter: only pages with a duplicated section header
        # are worth the expensive chain scan below.
        unless (dupHeaders($cur_text) == 1)
        {
            next;
        }

        # Unknown namespace numbers fall back to no prefix, matching the
        # old elsif chain (which left the previous value otherwise).
        $cur_namespace_name = exists $namespace_name{$cur_namespace}
                            ? $namespace_name{$cur_namespace}
                            : "";

        # Remove leading and trailing 's (dump quoting).
        $cur_title =~ s/^\'//;
        $cur_title =~ s/\'$//;
        # Remove leading and trailing whitespace.
        $cur_title =~ s/^\s*//;
        $cur_title =~ s/\s*$//;

        # Flatten escaped newlines and whitespace runs so the page is one
        # long space-separated word list.
        $cur_text =~ s/\\n/ /g;
        $cur_text =~ s/\s+/ /g;

        my (%chains, @chunks, $i, $per, $numberRepeated);
        @chunks = split (" ", $cur_text);
        # Count every 3-word window, consuming from the rear.
        # NOTE(review): the > 3 test means the final window (the first
        # three words of the page) is never counted — looks like an
        # off-by-one, preserved to keep output comparable to old runs.
        while (@chunks > 3)
        {
            # The three words are joined in reverse order; harmless, since
            # the same reversal is applied to every window.
            $chain = $chunks[-1]." ".$chunks[-2]." ".$chunks[-3];
            $chains{$chain}++;
            pop(@chunks);
            # Note: pop from the rear is a bjillion times more
            # efficient than unloading manually from the front.
            $i++;
        }

        # print DUPHEAD "* ".$cur_namespace_name.$cur_title." $i\n";
        $printed = 0;
        $numberRepeated = 0;
        foreach $chain (keys(%chains))
        {
            if ($chains{$chain} > 1)
            {
                if ($printed == 0)
                {
                    # Page heading, printed once before the first repeated
                    # chain (was missing its trailing newline).
                    print $duphead "* ".$cur_namespace_name.$cur_title."\n";
                    $printed = 1;
                }
                print $duphead $chains{$chain}.": ".$chain."\n";
                $numberRepeated++;
            }
        }

        if ($printed == 1)
        {
            # $i > 0 is guaranteed here: a repeated chain implies at least
            # two counted windows.
            $per = int(($numberRepeated / $i) * 100);
            print $duphead " ${per}% repeated - $numberRepeated out of $i triplets\n";
        }
    }

    close ($entries);
    close ($duphead);
}

# Return 1 if the page text contains at least one duplicated section
# header line, 0 otherwise. Header lines are lines whose first
# non-whitespace character is '='; the text carries literal "\n"
# escapes from the dump, which are expanded before splitting.
sub dupHeaders
{
    my ($wikitext) = @_;

    # Fast path: a page with no '=' anywhere cannot contain a header,
    # let alone a duplicated one.
    return (0) unless $wikitext =~ m/=/;

    # Expand the dump's escaped newlines into real ones.
    $wikitext =~ s/\\n/\n/g;

    # Tally each header line.
    my %header_count;
    $header_count{$_}++ for grep { m/^\s*\=/ } split("\n", $wikitext);

    # Any header seen more than once means a duplicate.
    return (1) if grep { $_ > 1 } values %header_count;
    return (0);
}

# Produce a sorted copy of the report, numeric and descending on the
# third field, so the worst offenders come first. The redirection means
# the command itself emits nothing on stdout, so the old backtick
# capture-and-print was a no-op; system() with a checked exit status
# replaces it. Filenames are fixed strings, so the shell form is safe.
system("sort -nr -k3 todo/duplicate-chunks.txt > todo/duplicate-chunks-sorted.txt") == 0
    or warn "sort of todo/duplicate-chunks.txt failed: exit status $?\n";