Wikipedia:Persondata/transform.pl

  1. !/usr/bin/perl -w

use POSIX;

  1. Transform Persondata
  1. Parse date entry and expand each date field to 8 fields
  2. Likewise the articles about places and first/last names

my %months=(

January => 1,

February => 2,

March => 3,

April => 4,

May => 5,

June => 6,

July => 7,

August => 8,

September => 9,

October => 10,

November => 11,

December => 12

);

my %monthname = reverse %months;

my $MONTH = "(" . join("|",keys %months) . ")";

my $YEAR = "(\\d{1,4})";

my $PREFIX = "([^0-9]+)";

  1. Clean up date entry for standardised persondata date format

sub clean_date {

my $d = shift;

# Put spaces between wikilinks

$d =~ s/([\]])([\[])/$1 $2/g;

# Deal with piped links

$d =~ s/[\[]{2}([^\]]*)[|](.*)[\]]{2}/$2/g;

# Remove square brackets

$d =~ s/[\[\]]//g;

# Deal with templates

$d =~ s/{{([Bb]irth|[Dd]eath) date( and age)?\s?\|\s?(\d{1,4})\|\s?(\d{1,2})\|\s?(\d{1,2})(.*)/$3-$4-$5/g;

# DD.MM.YYYY => DD MMM YYYY

if ($d =~ /^0?(\d{1,2})\.(\d{1,2})\.(\d{1,4})$/) {

my $month = int $2;

$d = "$1 ".$monthname{$month}." $3";

}

# YYYY-MM-DD => DD MMM YYYY

if ($d =~ /^(\d{1,4})-(\d{1,2})-0?(\d{1,2})$/) {

my $month = int $2;

$d = "$3 ".$monthname{$month}." $1";

}

# AD/CE is implicit

$d =~ s/A\. ?D\.//; # TODO: doesn't completely work?? e.g. Libanios

$d =~ s/A ?D//;

$d =~ s/\bC\. ?E\.//;

$d =~ s/\bC ?E//;

# Remove trailing punctuation

$d =~ s/(,|\||;|=)$//;

# Remove HTML comments

$d =~ s///;

# Add forgotten spaces

$d =~ s/([a-z])(\d)/$1 $2/g;

$d =~ s/(\d)([a-z])/$1 $2/g;

$d =~ s/\.([^ ])/. $1/g;

#Remove spaces from ordinals

$d =~ s/(\d) (th|st|nd|rd|s\b)/$1$2/g;

# remove double spaces

$d =~ s/\s+/ /g;

# remove spaces at beginning and end

$d =~ s/^\s+//;

$d =~ s/\s+$//;

# Fields with only question marks => "unknown"

$d =~ s/^\?$/unknown/;

# Uniform capitalisation

$d =~ s/Unknown/unknown/g;

$d =~ s/After/after/g;

# question marks should always have brackets and a space before them

$d =~ s/\(\?\)/\?/g;

$d =~ s/([^ ])\?+/$1 \?/g;

$d =~ s/\?/(\?)/g;

# Instead of a question mark at the end, 'probably' at the beginning

$d =~ s/^(.+) \(\?\)$/probably $1/;

$d =~ s/^\(\?\)$/probably/;

# Slash without space

$d =~ s/ ?\/ ?/\//g;

# Remove bolding/italics

$d =~ s/\'{2,5}//g;

# Shortened month names

$d =~ s/Jan[\.\s]+/January /;

$d =~ s/Feb[\.\s]+/February /;

$d =~ s/Mar[\.\s]+/March /;

$d =~ s/Apr[\.\s]+/April /;

$d =~ s/Jun[\.\s]+/June /;

$d =~ s/Jul[\.\s]+/July /;

$d =~ s/Aug[\.\s]+/August /;

$d =~ s/Sep[\.\s]+/September /;

$d =~ s/Sept[\.\s]+/September /;

$d =~ s/Oct[\.\s]+/October /;

$d =~ s/Nov[\.\s]+/November /;

$d =~ s/Dec[\.\s]+/December /;

# Write out "Century"

$d =~ s/C\./century/g;

# "End of the 5th century" => "End 5th century" (simpler)

$d =~ s/ of the//;

$d =~ s/(approx\.?|[Cc]irca\.?\s?|\bca?[\.\s]+|about|around|~)/circa /;

$d =~ s/([Pp]ossibly|[Pp]robably)/probably/g;

$d =~ s/([Ss]till )?[Ll]iving( [Pp]erson)?//;

$d =~ s/[Nn]\/?[Aa]//;

$d =~ s/[Nn]ot [Aa]pplicable//;

# Remove double spaces

$d =~ s/\s+/ /g;

#----------Error fixing---------

# Other typing errors

#$d =~ s/^Um/um/;

#$d =~ s/chr/Chr/;

return $d;

}

sub ilog10 { $x=shift; return int log10($x); }

my @fields = ( "day", "month", "year", "century", "decade", "year1", "year2", "note" );

sub parse_date {

my $date = shift;

my %d;

if (! $date ) {

$d{"note"} = "";

# Normal date entry

} elsif ($date =~ /^$PREFIX?(\d+ )?$MONTH,? $YEAR( BCE?\.?)?$/) {

$d{note} = $1 if defined $1;

$d{day} = substr($2,0,-1) if defined $2;

if (defined $3) {

$d{month} = $months{$3};

}

$d{year} = int $4;

$d{year} = -$d{year} if defined $5; # B.C.

} elsif ($date =~ /^$PREFIX?$MONTH (\d+)?,?\s?$YEAR( BCE?\.?)?$/) {

$d{note} = $1 if defined $1;

$d{day} = substr($3,0) if defined $3;

if (defined $2) {

$d{month} = $months{$2};

}

$d{year} = int $4;

$d{year} = -$d{year} if defined $5; # B.C.

} elsif ($date =~ /^$PREFIX?$YEAR( BCE?\.?)?$/) {

$d{note} = trim($1) if defined $1;

$d{year} = int $2;

$d{year} = -$d{year} if defined $3; # B.C.

# Century

#} elsif ($date =~ /^(around |probably |[Bb]eginning |[Mm]iddle |[Ee]nd )?(\d{1,2})\. [Cc]entury( BCE?\.?)?$/) {

} elsif ($date =~ /^$PREFIX?(\d{1,2})(\.|th|st|nd|rd) [Cc]entury( BCE?\.?)?$/) {

$d{note} = trim($1) if defined $1;

$d{century} = $2;

$d{century} = -$d{century} if defined $4;

# Decade

} elsif ($date =~ /^(\d{1,4}) ?'?s( BCE?\.?)?$/) {

$d{decade} = (int $1 / 10) * 10;

$d{decade} = -$d{decade} if defined $3;

$d{century} = int ($d{decade} / 10) + 1;

} elsif ($date =~ /^$PREFIX?$YEAR or $YEAR$/) { # TODO: or != until, TODO: B.C.

$d{note} = trim($1) if defined $1;

$d{year1} = int $2;

$d{year2} = int $3;

} elsif ($date =~ /^$PREFIX?$YEAR\/(\d{1,4})$/) { # TODO: B.C.

$d{note} = trim($1) if defined $1;

$d{year1} = int $2;

$d{year2} = int $3;

# Example: 1632/33 => 1632/1633

$c = ilog10($d{year1}) - ilog10($d{year2});

if ( $c < 0 ) { # Errr

$d{year1} = "";

$d{year2} = "";

$d{note} = $date;

} else {

$d{year2} = substr($d{year1},0,$c) . $d{year2};

}

# Interval of several years # TODO: not tested!

} elsif ( $date =~ /^$PREFIX?between $YEAR( and )$YEAR( BCE?\.?)?$/ or

$date =~ /^$PREFIX?$YEAR( to |-)$YEAR$( BCE?\.?)?/ )

{

$d{note} = trim($1) if defined $1;

$d{year1} = int $2;

$d{year2} = int $4;

if (defined $5) {

($d{year1}, $d{year2}) = (-$d{year2}, -$d{year1});

}

#} elsif ($date =~ /^$PREFIX?$YEAR( to |-)$YEAR$/) {

# $d{note} = trim($1) if defined $1;

# $d{year1} = int $2;

# $d{year2} = int $4;

} else {

$d{note} = $date;

}

if (defined $d{year1} and defined $d{year2}) {

# if years need to be switched

#if ($d{year1} > $d{year2}) {

# ($d{year1}, $d{year2}) = ($d{year2}, $d{year1});

#}

# if century can be defined

if ( (int $d{year1} / 100) eq (int $d{year2} / 100)) {

$d{century} = (int $d{year1} / 100) + 1;

}

# if decade can be defined

if ( $d{century} and (int $d{year1} / 10) eq (int $d{year2} / 10)) {

$d{decade} = (int $d{year1} / 10) * 10;

}

}

# calculate decade and century

if (defined $d{year}) {

$d{decade} = (int $d{year} / 10) * 10;

}

# calculate century

if (defined $d{decade}) {

if( $d{decade} != 0 ){

$d{century} = (int $d{decade} / 100) + (int $d{decade} / abs $d{decade});

}

else{

if (defined $d{year}) {

$d{century} = (int $d{decade} / 100) + (int $d{year} / abs $d{year});

}

else{

}

}

}

# Initialise undefined fields

foreach my $f (@fields) {

$d{$f} = '' if not defined $d{$f};

}

return %d;

}

sub parse_location {

my $p = trim(shift);

# a or b <- so what is it?!

# ...

# ..., ...

# ... , ...

# .../ ... <- preferably not like this

if ($p =~ /^\[\[([^\]]+)\]\].*?$/) {

my $a = $1;

$a =~ s/\|.*$//;

#print "$p|$a\n";

return $a;

} else {

#print "!$p\n";

return "";

}

}

  1. For PND number (assigned to German-speaking authors).
  2. Small number of articles on en wiki have this

sub parse_pnd {

my $p = trim(shift);

my $pnd_nr = "";

my $pnd_date = "";

if ($p =~ /^(\d)(\d)(\d)(\d)(\d)(\d)(\d)(\d)([\dX])/) {

my $check = $9;

$check = 10 if $check eq "X";

# pruefziffer

if ( ((2*$1+3*$2+4*$3+5*$4+6*$5+7*$6+8*$7+9*$8) % 11) eq $check ) {

# TODO: Number range 10000000 bis 14999999

if ($1 == 1 && $2>=0 && $2<=4) {

$pnd_nr = "$1$2$3$4$5$6$7$8$9";

}

}

}

if ($p =~ /(\d){1,2}\.(\d{1,2})\.(\d\d\d\d)/) {

$pnd_date = "$3-$2-$1";

}

return ($pnd_nr, $pnd_date);

}

sub unbracket {

my $p = shift;

# Insert missing spaces between wikilinks

$p =~ s/([\]])([\[])/$1 $2/g;

# remove square brackets

$p =~ s/[\[\]]//g;

return $p;

}

sub trim {

my $p = shift;

if ($p) {

# remove empty spaces at beginning and end

$p =~ s/^\s+//;

$p =~ s/\s+$//;

}

return $p;

}

while(my $line = <>) {

my @pd = split("\t",$line);

chop($pd[-1]); # remove end-of-line

my @pd_transformed;

$pd_transformed[0] = trim($pd[0]); # pd_id

$pd_transformed[1] = trim($pd[1]); # pd_article

$pd_transformed[2] = trim($pd[2]); # pd_name

$pd_transformed[3] = trim($pd[3]); # pd_alternative

$pd_transformed[4] = trim($pd[4]); # pd_description

$pd_transformed[5] = unbracket(trim($pd[5])); # pd_born

$pd_transformed[6] = trim($pd[6]); # pd_born_in

$pd_transformed[7] = unbracket(trim($pd[7])); # pd_died

$pd_transformed[8] = trim($pd[8]); # pd_died_in

$pd_transformed[9] = trim($pd[9]); # pd_pnd

# extract checked pnd-nr and additional date

($pd_transformed[10], $pd_transformed[11]) = parse_pnd($pd[9]); # pnr_nr, pnd_date

if ( trim($pd[2]) =~ /^([^,]+),([^,]+)$/ ) {

$pd_transformed[12] = trim($2); # n_given

$pd_transformed[13] = trim($1); # n_surname

$pd_transformed[14] = ''; # n_suffix

}

elsif ( trim($pd[2]) =~ /^([^,]+),([^,]+),([^,]+)$/ ) {

$pd_transformed[12] = trim($2); # n_given

$pd_transformed[13] = trim($1); # n_surname

$pd_transformed[14] = trim($3); # n_suffix

}

else {

$pd_transformed[12] = ''; # n_given

$pd_transformed[13] = ''; # n_surname

$pd_transformed[14] = ''; # n_suffix

}

$pd_transformed[15] = parse_location($pd[6]); # b_place

$pd_transformed[16] = parse_location($pd[8]); # d_place

my %born = parse_date(clean_date($pd[5]));

$pd_transformed[17] = $born{day}; # b_day

$pd_transformed[18] = $born{month}; # b_month

$pd_transformed[19] = $born{year}; # b_year

$pd_transformed[20] = $born{decade}; # b_decade

$pd_transformed[21] = $born{century}; # b_century

$pd_transformed[22] = $born{year1}; # b_year1

$pd_transformed[23] = $born{year2}; # b_year2

$pd_transformed[24] = $born{note}; # b_note

my %died = parse_date(clean_date($pd[7]));

$pd_transformed[25] = $died{day}; # d_day

$pd_transformed[26] = $died{month}; # d_month

$pd_transformed[27] = $died{year}; # d_year

$pd_transformed[28] = $died{decade}; # d_decade

$pd_transformed[29] = $died{century}; # d_century

$pd_transformed[30] = $died{year1}; # d_year1

$pd_transformed[31] = $died{year2}; # d_year2

$pd_transformed[32] = $died{note}; # d_note

print join("\t",@pd_transformed) . "\n";

}