User:FairuseBot/Pearle/WikiPage.pm

  1. IMPORTANT ###
  1. This code is released into the public domain.
  1. RECENT CHANGES ###
  2. 6 Aug 2007: Created
  3. 21 Aug 2007: Added comment folding/unfolding
  4. 21 Oct 2007: Fixed and tested comment folding
  5. 25 Oct 2007: Added link canonicalization
  1. Notes on editable markup:
  2. * Multi-character symbols are replaced with single-character placeholders
  3. from the Unicode "control symbols" set (U+0001 to U+001F).
  4. * Comments are replaced with single-character placeholders from the
  5. Unicode fifteenth-plane private-use area (U+F0000 to U+FFFFF).

package Pearle::WikiPage;

use strict;

use warnings;

use URI::Escape;

use Encode;

  1. Constructor ###############################################

sub new

{

my $class = shift;

my %params = @_;

my $self = {

text => '', # Page text

title => '', # Page title

# Internal variables

editTime => undef, # editTime parameter used when editing a page

startTime => undef, # startTime parameter used when editing a page

editToken => undef, # editToken parameter used when editing a page

# Comment-folding

_comments_folded => 0, # Are comments presently folded?

_comment_fold_lookup => {}, # Lookup table of proxy,comment pairs

_comment_fold_proxy => 0xF0000, # Next proxy character to use

# Single-character markup representations

_linkstart => "\x01",

_linkend => "\x02",

_transclusionstart => "\x03",

_transclusionend => "\x04",

};

foreach my $key (keys(%params))

{

if($key eq 'text')

{

$self->{text} = $params{text};

}

elsif($key eq 'title')

{

$self->{title} = $params{title};

}

elsif($key eq 'editTime')

{

$self->{editTime} = $params{editTime};

}

elsif($key eq 'startTime')

{

$self->{startTime} = $params{startTime};

}

elsif($key eq 'editToken')

{

$self->{editToken} = $params{editToken};

}

}

bless($self, $class);

return $self;

}

  1. Accessor functions ########################################
  1. Return the text with modifications to make it easier to operate on
  2. NOTE: Don't try to print this. In order to make editing easier,
  3. various multi-character markup sequences have been replaced with
  4. very non-printable characters.

sub getEditableText

{

my $self = shift;

$self->foldComments();

return $self->makeEditableMarkup($self->{text});

}

sub setEditableText

{

my $self = shift;

$self->{text} = shift;

}

  1. Return the text in WikiMarkup format

sub getWikiText

{

my $self = shift;

$self->unfoldComments();

return $self->makeWikiMarkup($self->{text});

}

sub getTitle

{

my $self = shift;

return $self->{title};

}

sub setTitle

{

die "Setting the title of a WikiPage is not supported.\n";

}

sub getEditToken

{

my $self = shift;

return $self->{editToken};

}

sub getStartTime

{

my $self = shift;

return $self->{startTime};

}

sub getEditTime

{

my $self = shift;

return $self->{editTime};

}

  1. Verbs #####################################################
  1. Convert to editable representation

sub makeEditableMarkup

{

my $self = shift;

my $text = shift;

  1. $text =~ s/\[\[\[/\x01[/g; # Triple opening brackets: not valid wikimarkup

$text =~ s/\[\[/\x01/g; # Double opening brackets: the start of an internal link or inline image

$text =~ s/\]\]\]\]/\x02\x02/g; # Quadruple closing brackets: The end of an image caption containing an internal link

$text =~ s/\]\]\]/]\x02/g; # Triple closing brackets: an image caption containing an external link

$text =~ s/\]\]/\x02/g; # Double closing brackets: the end of an internal link or image

$text =~ s/\{\{/\x03/g; # Double opening braces: the start of a transclusion

$text =~ s/\}\}/\x04/g; # Double closing braces: the end of a transclusion

return $text;

}

  1. Convert to WikiMarkup representation

sub makeWikiMarkup

{

my $self = shift;

my $text = shift;

$text =~ s/\x01/[[/g;

$text =~ s/\x02/]]/g;

$text =~ s/\x03/{{/g;

$text =~ s/\x04/}}/g;

return $text;

}

  1. Replace comments with single-character proxies.

sub foldComments

{

my $self = shift;

my $text = $self->{text};

while($text =~ /()/s)

{

my $proxy_char = chr $self->{_comment_fold_proxy};

$self->{_comment_fold_lookup}->{$proxy_char} = $1;

my $comment = escapeRegex($1);

$text =~ s/$comment/$proxy_char/;

$self->{_comment_fold_proxy} += 1;

die "Too many comments in page" if $self->{_comment_fold_proxy} > 0xFFFFF; # More than 65535 comments in the page

}

$self->{text} = $text;

return $text;

}

  1. Replace proxies with the original comments

sub unfoldComments

{

my $self = shift;

my $text = $self->{text};

while (my ($proxy_char,$link) = each(%{$self->{_comment_fold_lookup}}))

{

$text =~ s/$proxy_char/$link/g;

}

$self->{text} = $text;

return $text;

}

sub canonicalizeLinks

{

my $self = shift;

my %link_lookup;

# NOTE: Order of the following two lines is important, since getEditableText modifies $self->{_comment_fold_proxy}

my $text = $self->getEditableText();

my $link_proxy = $self->{_comment_fold_proxy};

# Extract the links beginnings into a lookup table

while($text =~ /(\x01.*?[|\x02])/)

{

my $proxy_char = chr $link_proxy;

$link_lookup{$proxy_char} = $1;

my $link = escapeRegex($1);

$text =~ s/$link/$proxy_char/;

  1. print "$link_proxy $link_lookup{$proxy_char}\n";

$link_proxy += 1;

die "Too many links in page" if($link_proxy > 0xFFFFF);

}

# Canonicalize link beginnings

while (my ($proxy_char,$link) = each %link_lookup)

{

next if $link =~ /http:/; # Skip if it's a badly-formatted external link

$link = unescapeUTF8URL($link); # Convert URL-encoded UTF8 to Perl chars

$link =~ s/_/ /g; # Underscores to spaces

$link =~ s/ / /g; # Collapse multiple spaces

$link =~ s/[\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}]//g; # Kill Unicode BiDi markers

# TODO: Decode HTML entities (E E Á

# Trim spaces

$link =~ s/^\x01 /\x01/;

$link =~ s/ \|$/|/;

$link =~ s/ \x02$/\x02/;

# TODO: Trim internal spaces for namespaced links

  1. print URI::Escape::uri_escape_utf8($link), "\n";

$link_lookup{$proxy_char} = $link;

}

# Put link beginnings back in the text

while (my ($proxy_char,$link) = each %link_lookup)

{

$text =~ s/$proxy_char/$link/g;

}

$self->setEditableText($text);

  1. exit;

}

  1. Utilities #################################################
  1. Escape a string so that it's a literal match in a regex

sub escapeRegex

{

my $string = shift;

$string =~ s/\\/\\\\/g;

$string =~ s/\./\\\./g;

$string =~ s/\(/\\\(/g;

$string =~ s/\)/\\\)/g;

$string =~ s/\[/\\\[/g;

$string =~ s/\{/\\\{/g;

$string =~ s/\+/\\\+/g;

$string =~ s/\*/\\\*/g;

$string =~ s/\?/\\\?/g;

$string =~ s/\^/\\\^/g;

$string =~ s/\$/\\\$/g;

$string =~ s/\|/\\\|/g;

return $string;

}

sub unescapeUTF8URL

{

# Since nobody seems to have a module to unescape a UTF8-encoded URL-escaped string...

my $string = shift;

my @chars = split //, $string;

my $result_string = '';

for(my $i = 0; $i < scalar(@chars); $i++)

{

my $partial_string = '';

if($chars[$i] eq '%')

{

while(1)

{

# If the next two chars are hex values, stuff them in $partial_string

if($chars[$i+1] =~ /[0-9a-f]/i and $chars[$i+2] =~ /[0-9a-f]/i)

{

$partial_string .= $chars[$i] . $chars[$i+1] . $chars[$i+2];

$i += 3;

}

else

{

# Literal percent

$result_string .= $chars[$i];

$i++;

last;

}

if($chars[$i] ne '%')

{

last;

}

}

if($partial_string ne '')

{

$result_string .= decode("utf8", URI::Escape::uri_unescape($partial_string));

}

$i--;

}

else

{

# Literal char, already in unicode

$result_string .= $chars[$i];

}

}

return $result_string;

}

1;