Wikipedia:Duplicated sections/script

  1. Hot pipes

$| = 1;

  1. This script is expecting entries.txt to be a relatively database
  2. dump that has been pre-processed to put each page on line by itself.
  1. On 31 July 2005, this script ran on a 1.2GHz i686 laptop with ~700MB
  2. RAM in about 20 minutes. Not using the dupHeaders() filter will
  3. cause it to take probably about 5 hours or more.
  1. The author of this script is Christopher Beland, User:Beland on
  2. en.wikipedia.org. It is hereby released into the Public Domain.
  3. Feel free to use it for any purpose whatsoever.

use strict;

main();

sub main

{

my ($cur_id, $cur_namespace, $cur_title, $cur_text, @junk, $line,

$cur_namespace_name, $i, $j, @tokens, $printed, $chain);

unless (-d "./todo")

{

mkdir "./todo";

}

open (ENTRIES, "

|| die "Cannot read data/entries.txt";

open (DUPHEAD, ">todo/duplicate-chunks.txt")

|| die "Cannot write todo/blank-pages.txt" ;

while ()

{

if (++$j % 100 == 0)

{

print STDERR $j."\r";

}

$line = $_;

eval("\@tokens = $line");

($cur_id, $cur_namespace, $cur_title, $cur_text, @junk)

= @tokens;

unless (dupHeaders($cur_text) == 1)

{

next;

}

if ($cur_namespace == -2)

{

$cur_namespace_name = "Media:";

}

elsif ($cur_namespace == -1)

{

$cur_namespace_name = "Special:";

}

elsif ($cur_namespace == 0)

{

$cur_namespace_name = "";

}

elsif ($cur_namespace == 1)

{

$cur_namespace_name = "Talk:";

}

elsif ($cur_namespace == 2)

{

$cur_namespace_name = "User:";

}

elsif ($cur_namespace == 3)

{

$cur_namespace_name = "User_talk:";

}

elsif ($cur_namespace == 4)

{

$cur_namespace_name = "Wikipedia:";

}

elsif ($cur_namespace == 5)

{

$cur_namespace_name = "Wikipedia_talk:";

}

elsif ($cur_namespace == 6)

{

$cur_namespace_name = ":Image:";

}

elsif ($cur_namespace == 7)

{

$cur_namespace_name = "Image_talk:";

}

elsif ($cur_namespace == 8)

{

$cur_namespace_name = "MediaWiki:";

}

elsif ($cur_namespace == 9)

{

$cur_namespace_name = "MediaWiki_talk:";

}

elsif ($cur_namespace == 10)

{

$cur_namespace_name = "Template:";

}

elsif ($cur_namespace == 11)

{

$cur_namespace_name = "Template_talk:";

}

elsif ($cur_namespace == 12)

{

$cur_namespace_name = "Help:";

}

elsif ($cur_namespace == 13)

{

$cur_namespace_name = "Help_talk:";

}

elsif ($cur_namespace == 14)

{

$cur_namespace_name = ":Category";

}

elsif ($cur_namespace == 15)

{

$cur_namespace_name = "Category_talk:";

}

# Remove leading and trailing 's.

$cur_title =~ s/^\'//;

$cur_title =~ s/\'$//;

# Remove leading and trailing whitespace

$cur_title =~ s/^\s*//;

$cur_title =~ s/\s*$//;

$cur_text =~ s/\\n/ /g;

$cur_text =~ s/\s+/ /g;

my (%chains, @chunks, $i, $per, $numberRepeated);

@chunks = split (" ", $cur_text);

while (@chunks > 3)

{

$chain = $chunks[-1]." ".$chunks[-2]." ".$chunks[-3];

$chains{$chain}++;

pop(@chunks);

# Note: pop from the rear is a bjillion times more

# efficient than unloading manually from the front.

$i++;

}

  1. print DUPHEAD "* ".$cur_namespace_name.$cur_title." $i\n";

$printed = 0;

foreach $chain (keys(%chains))

{

if ($chains{$chain} > 1)

{

if ($printed == 0)

{

print DUPHEAD "* ".$cur_namespace_name.$cur_title."";

$printed = 1;

}

  1. print DUPHEAD $chains{$chain}.": ".$chain."\n";

$numberRepeated++

}

}

if ($printed == 1)

{

$per = int(($numberRepeated / $i) * 100);

print DUPHEAD " ${per}% repeated - $numberRepeated out of $i triplets\n";

}

}

close (ENTRIES);

close (DUPHEAD);

}

sub dupHeaders

{

my ($text, %headers, $line);

$text = $_[0];

unless ($text =~ m/=/)

{

# No headers means no duplicate headers

return (0);

}

$text =~ s/\\n/\n/g;

foreach $line (split ("\n", $text))

{

if ($line =~ m/^\s*\=/)

{

$headers{$line}++;

}

}

foreach $line (keys(%headers))

{

if ($headers{$line} > 1)

{

# Found a duplicated header

return(1);

}

}

# Didn't return, so must not have found any duplicate headers

return(0);

}

print `sort -nr -k3 todo/duplicate-chunks.txt > todo/duplicate-chunks-sorted.txt`