User:Selket/Parsers.cs

/*

WikiFunctions

Copyright (C) 2006 Martin Richards

This program is free software; you can redistribute it and/or modify

it under the terms of the GNU General Public License as published by

the Free Software Foundation; either version 2 of the License, or

(at your option) any later version.

This program is distributed in the hope that it will be useful,

but WITHOUT ANY WARRANTY; without even the implied warranty of

MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

GNU General Public License for more details.

You should have received a copy of the GNU General Public License

along with this program; if not, write to the Free Software

Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

*/

using System;

using System.Collections.Generic;

using System.Text;

using System.Text.RegularExpressions;

using System.Configuration;

using System.Collections;

using System.Web;

[assembly: CLSCompliant(true)]

namespace WikiFunctions.Parse

{

/// <summary>
/// Provides functions for editing wiki text, such as formatting and re-categorisation.
/// </summary>

public class Parsers

{

#region constructor etc.

/// <summary>
/// Creates a parser that keeps the field defaults and builds the regex tables.
/// </summary>
public Parsers()
{
    this.metaDataSorter = new MetaDataSorter(this);
    this.MakeRegexes();
}

/// <summary>
/// Creates a parser with an explicit stub word limit and category-key setting,
/// then builds the regex tables.
/// </summary>
/// <param name="StubWordCount">The maximum number of words for a stub.</param>
/// <param name="AddHumanKey">Initial value assigned to <see cref="addCatKey"/>.</param>
public Parsers(int StubWordCount, bool AddHumanKey)
{
    this.metaDataSorter = new MetaDataSorter(this);

    this.StubMaxWordCount = StubWordCount;
    this.addCatKey = AddHumanKey;

    this.MakeRegexes();
}

/// <summary>
/// Fills the three pattern/replacement tables (RegexUnicode, RegexTagger,
/// RegexConversion). Called once from each constructor.
/// </summary>
private void MakeRegexes()
{
    // --- RegexUnicode: applied by Unicodify() just before HttpUtility.HtmlDecode ---
    // Each entity listed here is re-escaped ("&" -> "&amp;") so that the decode step
    // turns it back into the entity instead of into a character. Replacing with a bare
    // "&$1;" would be a no-op and would not protect anything.

    //look bad if changed
    RegexUnicode.Add(new Regex("&(ndash|mdash|minus|times|lt|gt|nbsp|thinsp|shy|lrm|rlm|[Pp]rime);", RegexOptions.Compiled), "&amp;$1;");
    //IE6 does like these
    RegexUnicode.Add(new Regex("&#(705|803|596|620|699|700|8652|9408|9848|12288|160|61|x27|39);", RegexOptions.Compiled), "&amp;#$1;");
    //Decoder doesn't like these
    RegexUnicode.Add(new Regex("&#(x109[0-9A-Z]{2});", RegexOptions.Compiled), "&amp;#$1;");
    RegexUnicode.Add(new Regex("&#((?:277|119|84|x1D|x100)[A-Z0-9a-z]{2,3});", RegexOptions.Compiled), "&amp;#$1;");
    RegexUnicode.Add(new Regex("&#(x12[A-Za-z0-9]{3});", RegexOptions.Compiled), "&amp;#$1;");
    //interfere with wiki syntax
    RegexUnicode.Add(new Regex("&#(126|x5D|x5B|x7b|x7c|x7d|0?9[13]|0?12[345]|0?0?3[92]);", RegexOptions.Compiled | RegexOptions.IgnoreCase), "&amp;#$1;");

    //not entity, but still wrong
    // NOTE(review): the <sup> tags were missing from this copy of the file; without them
    // the patterns rewrite any plain "km2"/"mm3" text. Confirm against the upstream source.
    RegexUnicode.Add(new Regex("(cm| m|mm|km|mi)<sup>2</sup>", RegexOptions.Compiled), "$1²");
    RegexUnicode.Add(new Regex("(cm| m|mm|km|mi)<sup>3</sup>", RegexOptions.Compiled), "$1³");

    // --- RegexTagger: undated maintenance templates -> dated canonical template ---
    // NOTE(review): the month/year is hard-coded in every replacement below.
    RegexTagger.Add(new Regex("\\{\\{(template:)?(wikify|wikify-date|wfy|wiki)\\}\\}", RegexOptions.IgnoreCase | RegexOptions.Compiled), "{{Wikify|February 2007}}");
    RegexTagger.Add(new Regex("\\{\\{(template:)?(Clean ?up|CU|Clean|Tidy)\\}\\}", RegexOptions.IgnoreCase | RegexOptions.Compiled), "{{Cleanup|February 2007}}");
    RegexTagger.Add(new Regex("\\{\\{(template:)?(Linkless|Orphan)\\}\\}", RegexOptions.IgnoreCase | RegexOptions.Compiled), "{{Linkless|February 2007}}");
    RegexTagger.Add(new Regex("\\{\\{(template:)?(Uncategori[sz]ed|Uncat|Classify|Category needed|Catneeded|categori[zs]e|nocats?)\\}\\}", RegexOptions.IgnoreCase | RegexOptions.Compiled), "{{Uncategorized|February 2007}}");
    RegexTagger.Add(new Regex("\\{\\{(template:)?(Unreferenced|add references|cite[ -]sources?|cleanup-sources?|needs? references|no sources|no references?|not referenced|references|sources|unref|Unreferencedsect|unsourced)\\}\\}", RegexOptions.IgnoreCase | RegexOptions.Compiled), "{{Unreferenced|date=February 2007}}");

    // --- RegexConversion: template aliases -> canonical template / markup ---
    RegexConversion.Add(new Regex("\\{\\{(?:Template:)?(Dab|Disamb|Disambiguation)\\}\\}", RegexOptions.IgnoreCase | RegexOptions.Compiled), "{{Disambig}}");
    RegexConversion.Add(new Regex("\\{\\{(?:Template:)?(2cc|2LAdisambig|2LCdisambig|2LC)\\}\\}", RegexOptions.IgnoreCase | RegexOptions.Compiled), "{{2CC}}");
    RegexConversion.Add(new Regex("\\{\\{(?:Template:)?(3cc|3LW|Tla|Tla-dab|TLA-disambig|TLAdisambig|3LC)\\}\\}", RegexOptions.IgnoreCase | RegexOptions.Compiled), "{{3CC}}");
    RegexConversion.Add(new Regex("\\{\\{(?:Template:)?(4cc|4LW|4LA|4LC)\\}\\}", RegexOptions.IgnoreCase | RegexOptions.Compiled), "{{4CC}}");
    // Deliberately has no closing braces: only the template name is swapped, arguments are kept.
    RegexConversion.Add(new Regex("\\{\\{(?:Template:)?(Bio-dab|Hndisambig)", RegexOptions.IgnoreCase | RegexOptions.Compiled), "{{Hndis");
    // The inner quotes must be escaped; the unescaped form does not compile.
    RegexConversion.Add(new Regex("\\{\\{(?:Template:)?(Prettytable|Prettytable100|Pt)\\}\\}", RegexOptions.IgnoreCase | RegexOptions.Compiled), "class=\"wikitable\"");
    RegexConversion.Add(new Regex("\\{\\{(?:[Tt]emplate:)?(PAGENAMEE?\\}\\}|[Ll]ived\\||[Bb]io-cats\\|)", RegexOptions.Compiled), "{{subst:$1");

    // NOTE(review): the three Lifetime/Lifespan replacements below only emit the sort key
    // twice and discard the captured years. They look like two category links whose
    // markup was lost from this copy of the file; the intended link targets cannot be
    // determined from here, so the strings are left untouched. Verify before use.
    RegexConversion.Add(new Regex(@"\{\{[Ll]ife(?:time|span)\|([0-9]{4})\|([0-9]{4})\|(.*?)\}\}", RegexOptions.Compiled), "$3\r\n$3");
    RegexConversion.Add(new Regex(@"\{\{[Ll]ife(?:time|span)\|\|([0-9]{4})\|(.*?)\}\}", RegexOptions.Compiled), "$2\r\n$2");
    RegexConversion.Add(new Regex(@"\{\{[Ll]ife(?:time|span)\|([0-9]{4})\|\|(.*?)\}\}", RegexOptions.Compiled), "$2\r\n$2");
}

// Pattern -> replacement tables, populated by MakeRegexes().
// The type arguments follow from usage: entries are added as (Regex, string) and
// Unicodify() calls k.Key.Replace(text, k.Value) on each pair.
Dictionary<Regex, string> RegexUnicode = new Dictionary<Regex, string>();
Dictionary<Regex, string> RegexConversion = new Dictionary<Regex, string>();
Dictionary<Regex, string> RegexTagger = new Dictionary<Regex, string>();

// Used by BoldTitle() to hide parts of the text while it searches (HideMore/AddBackMore).
HideText hider = new HideText();

// Created in the constructors; SortMetaData() and InterWikiOrder delegate to it.
MetaDataSorter metaDataSorter;

// Copy of the input text kept by the "out NoChange" overloads for the before/after comparison.
string testText = "";

// Overwritten by the (int, bool) constructor.
int StubMaxWordCount = 500;

private bool boolInterwikiOrder = true;

/// <summary>
/// Sort interwiki link order
/// </summary>
public bool sortInterwikiOrder
{
    get
    {
        return this.boolInterwikiOrder;
    }
    set
    {
        this.boolInterwikiOrder = value;
    }
}

/// <summary>
/// The interwiki link order to use. Reads and writes the value held by the metadata sorter.
/// </summary>
public InterWikiOrderEnum InterWikiOrder
{
    get
    {
        return this.metaDataSorter.InterWikiOrder;
    }
    set
    {
        this.metaDataSorter.InterWikiOrder = value;
    }
}

private bool boolAddCatKey = false;

/// <summary>
/// When set to true, adds key to categories (for people only) when parsed
/// </summary>
public bool addCatKey
{
    get
    {
        return this.boolAddCatKey;
    }
    set
    {
        this.boolAddCatKey = value;
    }
}

#endregion

#region General Parse

/// <summary>
/// Re-organises the Person Data, stub/disambig templates, categories and interwikis
/// by delegating to the metadata sorter.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="ArticleTitle">The article title.</param>
/// <returns>The re-organised text.</returns>
public string SortMetaData(string ArticleText, string ArticleTitle)
{
    string sorted = this.metaDataSorter.Sort(ArticleText, ArticleTitle);
    return sorted;
}

// Patterns used by FixDates(): "1980's", "January 1st", "1st January".
private readonly Regex regexFixDates0 = new Regex(
    "([12][0-9][0-9]0)'s",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);
private readonly Regex regexFixDates1 = new Regex(
    "(January|February|March|April|May|June|July|August|September|October|November|December) ([1-9][0-9]?)(?:st|nd|rd|th)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);
private readonly Regex regexFixDates2 = new Regex(
    "([1-9][0-9]?)(?:st|nd|rd|th) (January|February|March|April|May|June|July|August|September|October|November|December)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);

// Patterns used by FixHeadings(); group 1 and group 3 capture the "==" markers.
private readonly Regex regexHeadings0 = new Regex(
    "(== ?)(see also:?|related topics:?|related articles:?|internal links:?|also see:?)( ?==)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);
private readonly Regex regexHeadings1 = new Regex(
    "(== ?)(external links:?|external sites:?|outside links|web ?links:?|exterior links:?)( ?==)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);
private readonly Regex regexHeadings2 = new Regex(
    "(== ?)(external link:?|external site:?|web ?link:?|exterior link:?)( ?==)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);
private readonly Regex regexHeadings3 = new Regex(
    "(== ?)(reference:?)(s? ?==)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);
private readonly Regex regexHeadings4 = new Regex(
    "(== ?)(source:?)(s? ?==)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);
private readonly Regex regexHeadings5 = new Regex(
    "(== ?)(further readings?:?)( ?==)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);
private readonly Regex regexHeadings6 = new Regex(
    "(== ?)(Early|Personal|Adult|Later) Life( ?==)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);
private readonly Regex regexHeadings7 = new Regex(
    "(== ?)(Current|Past|Prior) Members( ?==)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);
private readonly Regex regexHeadingsCareer = new Regex(
    "(== ?)([a-zA-Z]+) Career( ?==)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);

// Anchored with "^" and no Multiline option, so it only matches a heading at the very start of the text.
private readonly Regex RegexBadHeader = new Regex(
    "^(={1,4} ?(about|description|overview|definition|profile|(?:general )?information|background|intro(?:duction)?|summary|bio(?:graphy)?) ?={1,4})",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);

/// <summary>
/// Fix ==See also== and similar section common errors.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="ArticleTitle">The article title.</param>
/// <param name="NoChange">True when the heading fixes left the text unchanged
/// (compared before the final trim).</param>
/// <returns>The modified article text, trimmed.</returns>
public string FixHeadings(string ArticleText, string ArticleTitle, out bool NoChange)
{
    testText = ArticleText;

    string fixedText = FixHeadings(ArticleText, ArticleTitle);
    NoChange = (testText == fixedText);

    return fixedText.Trim();
}

/// <summary>
/// Fix ==See also== and similar section common errors. Removes a leading heading that
/// repeats the article title and unnecessary introductory headings.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="ArticleTitle">The article title.</param>
/// <returns>The modified article text.</returns>
public string FixHeadings(string ArticleText, string ArticleTitle)
{
    // A heading at the very start that just repeats the title is dropped.
    string titleHeading = "^={1,4} ?" + Regex.Escape(ArticleTitle) + " ?={1,4}";
    string text = Regex.Replace(ArticleText, titleHeading, "", RegexOptions.IgnoreCase);
    text = RegexBadHeader.Replace(text, "");

    // Only rename alternative headings to "See also" when no such heading exists yet.
    bool hasSeeAlso = Regex.IsMatch(text, "= ?See also ?=");
    if (!hasSeeAlso)
    {
        text = regexHeadings0.Replace(text, "$1See also$3");
    }

    text = regexHeadings1.Replace(text, "$1External links$3");
    text = regexHeadings2.Replace(text, "$1External link$3");
    text = regexHeadings3.Replace(text, "$1Reference$3");
    text = regexHeadings4.Replace(text, "$1Source$3");
    text = regexHeadings5.Replace(text, "$1Further reading$3");

    // Lower-case the second word of these headings.
    text = regexHeadings6.Replace(text, "$1$2 life$3");
    text = regexHeadings7.Replace(text, "$1$2 members$3");
    text = regexHeadingsCareer.Replace(text, "$1$2 career$3");

    return text;
}

/// <summary>
/// Fix date and decade formatting errors: drops the apostrophe in decades ("1980's")
/// and the ordinal suffix in "Month 1st" / "1st Month" dates.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The modified article text.</returns>
public string FixDates(string ArticleText)
{
    string decadesFixed = regexFixDates0.Replace(ArticleText, "$1s");
    string monthFirstFixed = regexFixDates1.Replace(decadesFixed, "$1 $2");
    string dayFirstFixed = regexFixDates2.Replace(monthFirstFixed, "$1 $2");
    return dayFirstFixed;
}

/// <summary>
/// Footnote formatting errors per WP:FN: removes whitespace in front of references and
/// inline maintenance tags, and moves punctuation that follows them to the front.
/// </summary>
/// <remarks>
/// NOTE(review): several string literals in this method had lost their ref/sup tag text
/// (leaving unterminated statements). They were reconstructed from what remained, from
/// the neighbouring comments and from the capture-group numbers used in the replacement
/// strings. Confirm against the upstream source before relying on the exact patterns.
/// </remarks>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The modified article text.</returns>
public string FixFootnotes(string ArticleText)
{
    string factTag = "({{[ ]*fact[ ]*}}|{{[ ]*fact[ ]*[\\|][^}]*}}|{{[ ]*facts[ ]*}}|{{[ ]*citequote[ ]*}}|{{[ ]*citation needed[ ]*}}|{{[ ]*cn[ ]*}}|{{[ ]*verification needed[ ]*}}|{{[ ]*verify source[ ]*}}|{{[ ]*verify credibility[ ]*}}|{{[ ]*who[ ]*}}|{{[ ]*failed verification[ ]*}}|{{[ ]*nonspecific[ ]*}}|{{[ ]*dubious[ ]*}}|{{[ ]*or[ ]*}}|{{[ ]*lopsided[ ]*}}|{{[ ]*GR[ ]*[\\|][ ]*[^ ]+[ ]*}}|{{[ ]*[c]?r[e]?f[ ]*[\\|][^}]*}}|{{[ ]*ref[ _]label[ ]*[\\|][^}]*}}|{{[ ]*ref[ _]num[ ]*[\\|][^}]*}})";

    // no whitespace in front of an inline tag
    ArticleText = Regex.Replace(ArticleText, "[\\n\\r\\f\\t ]+?" + factTag, "$1");

    // One space/linefeed
    ArticleText = Regex.Replace(ArticleText, "[\\n\\r\\f\\t ]+?<ref([ >])", "<ref$1");

    // remove trailing spaces from named refs
    ArticleText = Regex.Replace(ArticleText, "<ref ([^>]*[^>])[ ]*>", "<ref $1>");

    // removed superscripted punctuation between refs
    ArticleText = Regex.Replace(ArticleText, "(</ref>|<ref[^>]*?/>)<sup>[ ]*[,;-]?[ ]*</sup><ref", "$1<ref");
    ArticleText = Regex.Replace(ArticleText, "(</ref>|<ref[^>]*?/>)[ ]*[,;-]?[ ]*<ref", "$1<ref");

    string LacksPunctuation = "([^\\.,;:!\\?\"'’])";
    string QuestionOrExclam = "([!\\?])";
    string MinorPunctuation = "([\\.,;:])";
    string AnyPunctuation = "([\\.,;:!\\?])";
    string MajorPunctuation = "([,;:!\\?])";
    string Period = "([\\.])";
    string Quote = "([\"'’]*)";
    string Space = "[ ]*";

    // RefTag1/RefTag2 each contain TWO groups (whole tag + inner repetition), which is
    // why their replacements below use $5 for the trailing punctuation; RefTag3 and
    // factTag contain one group, hence $4.
    string RefTag1 = "(<ref>([^<]|<[^/]|</[^r]|</r[^e]|</re[^f]|</ref[^>])*?</ref>)";
    string RefTag2 = "(<ref[^>]*?[^/]>([^<]|<[^/]|</[^r]|</r[^e]|</re[^f]|</ref[^>])*?</ref>)";
    string RefTag3 = "(<ref[^>]*?/>)";

    string match0a = LacksPunctuation + Quote + factTag + Space + AnyPunctuation;
    string match0b = QuestionOrExclam + Quote + factTag + Space + MajorPunctuation;
    string match0c = MinorPunctuation + Quote + factTag + Space + AnyPunctuation;
    string match0d = QuestionOrExclam + Quote + factTag + Space + Period;

    string match1a = LacksPunctuation + Quote + RefTag1 + Space + AnyPunctuation;
    string match1b = QuestionOrExclam + Quote + RefTag1 + Space + MajorPunctuation;
    string match1c = MinorPunctuation + Quote + RefTag1 + Space + AnyPunctuation;
    string match1d = QuestionOrExclam + Quote + RefTag1 + Space + Period;

    string match2a = LacksPunctuation + Quote + RefTag2 + Space + AnyPunctuation;
    string match2b = QuestionOrExclam + Quote + RefTag2 + Space + MajorPunctuation;
    string match2c = MinorPunctuation + Quote + RefTag2 + Space + AnyPunctuation;
    string match2d = QuestionOrExclam + Quote + RefTag2 + Space + Period;

    string match3a = LacksPunctuation + Quote + RefTag3 + Space + AnyPunctuation;
    string match3b = QuestionOrExclam + Quote + RefTag3 + Space + MajorPunctuation;
    string match3c = MinorPunctuation + Quote + RefTag3 + Space + AnyPunctuation;
    string match3d = QuestionOrExclam + Quote + RefTag3 + Space + Period;

    for (int j = 0; j < 10; j++)
    { // repeat for multiple refs together
        string beforePass = ArticleText;

        ArticleText = Regex.Replace(ArticleText, match0a, "$1$2$4$3");
        ArticleText = Regex.Replace(ArticleText, match0b, "$1$2$4$3");
        ArticleText = Regex.Replace(ArticleText, match0c, "$2$4$3");
        ArticleText = Regex.Replace(ArticleText, match0d, "$1$2$3");

        ArticleText = Regex.Replace(ArticleText, match1a, "$1$2$5$3");
        ArticleText = Regex.Replace(ArticleText, match1b, "$1$2$5$3");
        ArticleText = Regex.Replace(ArticleText, match1c, "$2$5$3");
        ArticleText = Regex.Replace(ArticleText, match1d, "$1$2$3");

        ArticleText = Regex.Replace(ArticleText, match2a, "$1$2$5$3");
        ArticleText = Regex.Replace(ArticleText, match2b, "$1$2$5$3");
        ArticleText = Regex.Replace(ArticleText, match2c, "$2$5$3");
        ArticleText = Regex.Replace(ArticleText, match2d, "$1$2$3");

        ArticleText = Regex.Replace(ArticleText, match3a, "$1$2$4$3");
        ArticleText = Regex.Replace(ArticleText, match3b, "$1$2$4$3");
        ArticleText = Regex.Replace(ArticleText, match3c, "$2$4$3");
        ArticleText = Regex.Replace(ArticleText, match3d, "$1$2$3");

        // A pass that changed nothing means every later pass would be identical.
        if (ArticleText == beforePass)
            break;
    }

    return ArticleText;
}

/// <summary>
/// Removes some excess whitespace from the article: collapses runs of blank lines,
/// removes the blank line between consecutive headings and before bulleted external
/// links, and strips a trailing horizontal rule or line-break tag.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The modified article text, trimmed.</returns>
public static string RemoveWhiteSpace(string ArticleText)
{
    ArticleText = Regex.Replace(ArticleText, "\r\n(\r\n)+", "\r\n\r\n");
    ArticleText = Regex.Replace(ArticleText, "== ? ?\r\n\r\n==", "==\r\n==");

    // This used string.Replace with a regex pattern and "$1", which can never match.
    // It must be a regex replace, and the literal "*" has to be escaped.
    ArticleText = Regex.Replace(ArticleText, "\r\n\r\n(\\* ?\\[?http)", "\r\n$1");

    ArticleText = Regex.Replace(ArticleText.Trim(), "----+$", "");

    // NOTE(review): the tag text of this pattern was missing (the literal was split across
    // two lines); "<br>" with optional " /" is assumed from the IgnoreCase option and the
    // same construct in the list clean-up of FixSyntax. Confirm against upstream.
    ArticleText = Regex.Replace(ArticleText.Trim(), "<br ?/?>$", "", RegexOptions.IgnoreCase);

    return ArticleText.Trim();
}

/// <summary>
/// Removes all excess whitespace from the article: everything RemoveWhiteSpace does,
/// plus tabs, repeated spaces, trailing spaces, blank lines before bullets and after
/// headings, and normalised spacing for bullets, headings and dashes.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The modified article text, trimmed.</returns>
public string RemoveAllWhiteSpace(string ArticleText)
{//removes all whitespace
    ArticleText = ArticleText.Replace("\t", " ");
    ArticleText = RemoveWhiteSpace(ArticleText);

    ArticleText = ArticleText.Replace("\r\n\r\n*", "\r\n*");

    ArticleText = Regex.Replace(ArticleText, " +", " ");
    ArticleText = Regex.Replace(ArticleText, " \r\n", "\r\n");

    ArticleText = Regex.Replace(ArticleText, "==\r\n\r\n", "==\r\n");

    //fix bullet points: strip any space after the markers, then add exactly one
    ArticleText = Regex.Replace(ArticleText, "^([\\*#]+) ", "$1", RegexOptions.Multiline);
    ArticleText = Regex.Replace(ArticleText, "^([\\*#]+)", "$1 ", RegexOptions.Multiline);

    //fix heading space
    ArticleText = Regex.Replace(ArticleText, "^(={1,4}) ?(.*?) ?(={1,4})$", "$1$2$3", RegexOptions.Multiline);

    //fix dash spacing: strip spaces around every dash form, then add exactly one each side
    // NOTE(review): the entity alternatives had been HTML-decoded in this copy (leaving
    // fragments such as "[01];" that match ordinary text). They are restored here to the
    // numeric/named entity spellings of the en and em dash; confirm against upstream.
    ArticleText = Regex.Replace(ArticleText, " ?(–|—|&#15[01];|&[nm]dash;|&#821[12];|&#x201[34];) ?", "$1");
    ArticleText = Regex.Replace(ArticleText, "(—|&#151;|&mdash;|&#8212;|&#x2014;|–|&#150;|&ndash;|&#8211;|&#x2013;)", " $1 ");

    return ArticleText.Trim();
}

/// <summary>
/// Fixes and improves syntax (such as html markup)
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="NoChange">True when the returned text equals the input.</param>
/// <returns>The modified article text.</returns>
public string FixSyntax(string ArticleText, out bool NoChange)
{
    testText = ArticleText;

    string fixedText = FixSyntax(ArticleText);
    NoChange = (testText == fixedText);

    return fixedText;
}

// Patterns used by FixSyntax(string).
// NOTE(review): in this copy of the file every HTML tag and every "[[...]]" / "''" wiki
// markup sequence inside these string literals had been stripped, leaving unterminated
// literals and replacements that delete link brackets. The patterns and replacements
// below were restored from the remaining text and the surrounding comments; confirm
// against the upstream source.
readonly Regex SyntaxRegex1 = new Regex("\\[\\[http:\\/\\/([^][]*?)\\]", RegexOptions.IgnoreCase | RegexOptions.Compiled);
readonly Regex SyntaxRegex2 = new Regex("\\[http:\\/\\/([^][]*?)\\]\\]", RegexOptions.IgnoreCase | RegexOptions.Compiled);
readonly Regex SyntaxRegex3 = new Regex("\\[\\[http:\\/\\/(.*?)\\]\\]", RegexOptions.IgnoreCase | RegexOptions.Compiled);
readonly Regex SyntaxRegex4 = new Regex("\\[\\[([^][]*?)\\]([^][][^\\]])", RegexOptions.Compiled);
readonly Regex SyntaxRegex5 = new Regex("([^][])\\[([^][]*?)\\]\\]([^\\]])", RegexOptions.Compiled);
readonly Regex SyntaxRegex6 = new Regex("\\[?\\[image:(http:\\/\\/.*?)\\]\\]?", RegexOptions.IgnoreCase | RegexOptions.Compiled);
readonly Regex SyntaxRegex7 = new Regex("\\[\\[ (.*)?\\]\\]", RegexOptions.Compiled);
readonly Regex SyntaxRegex8 = new Regex("\\[\\[([A-Za-z]*) \\]\\]", RegexOptions.Compiled);
readonly Regex SyntaxRegex9 = new Regex("\\[\\[(.*)?_#(.*)\\]\\]", RegexOptions.Compiled);
readonly Regex SyntaxRegexTemplate = new Regex("(\\{\\{[\\s]*)[Tt]emplate:(.*?\\}\\})", RegexOptions.Singleline | RegexOptions.Compiled);
readonly Regex SyntaxRegex11 = new Regex("^((#|\\*).*?)<br ?/?>\r\n", RegexOptions.Multiline | RegexOptions.IgnoreCase | RegexOptions.Compiled);
readonly Regex SyntaxRegexItalic = new Regex("<i>(.*?)</i>", RegexOptions.IgnoreCase | RegexOptions.Compiled);
readonly Regex SyntaxRegexBold = new Regex("<b>(.*?)</b>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

/// <summary>
/// Fixes and improves syntax (such as html markup): converts italic/bold/rule tags to
/// wiki markup, drops the redundant "Template:" prefix, removes line-break tags at the
/// end of list items, repairs unbalanced link brackets and normalises "ISBN:".
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The modified article text, trimmed.</returns>
public string FixSyntax(string ArticleText)
{
    //replace html with wiki syntax
    // Skipped when a tag sits directly next to an apostrophe, where adding more
    // apostrophes would change the meaning of the existing markup.
    if (!Regex.IsMatch(ArticleText, "'</?[ib]>|</?[ib]>'", RegexOptions.IgnoreCase))
    {
        ArticleText = SyntaxRegexItalic.Replace(ArticleText, "''$1''");
        ArticleText = SyntaxRegexBold.Replace(ArticleText, "'''$1'''");
    }

    ArticleText = Regex.Replace(ArticleText, "^<hr>|^----+", "----", RegexOptions.Multiline);

    //remove appearance of double line break
    ArticleText = Regex.Replace(ArticleText, "(^==?[^=]*==?)\r\n(\r\n)?----+", "$1", RegexOptions.Multiline);

    //remove unnecessary namespace
    ArticleText = SyntaxRegexTemplate.Replace(ArticleText, "$1$2");

    //remove line-break tags from lists
    ArticleText = SyntaxRegex11.Replace(ArticleText, "$1\r\n");

    //can cause problems
    //ArticleText = Regex.Replace(ArticleText, "^<[Hh]2>(.*?)</[Hh]2>", "==$1==", RegexOptions.Multiline);
    //ArticleText = Regex.Replace(ArticleText, "^<[Hh]3>(.*?)</[Hh]3>", "===$1===", RegexOptions.Multiline);
    //ArticleText = Regex.Replace(ArticleText, "^<[Hh]4>(.*?)</[Hh]4>", "====$1====", RegexOptions.Multiline);

    //fix uneven bracketing on links
    // Skipped when an image link contains a URL, since its brackets nest legitimately.
    if (!Regex.IsMatch(ArticleText, "\\[\\[[Ii]mage:[^]]*http"))
    {
        ArticleText = SyntaxRegex1.Replace(ArticleText, "[http://$1]");
        ArticleText = SyntaxRegex2.Replace(ArticleText, "[http://$1]");
        ArticleText = SyntaxRegex3.Replace(ArticleText, "[http://$1]");

        ArticleText = SyntaxRegex4.Replace(ArticleText, "[[$1]]$2");
        ArticleText = SyntaxRegex5.Replace(ArticleText, "$1[[$2]]$3");
    }

    //repair bad external links
    ArticleText = SyntaxRegex6.Replace(ArticleText, "[$1]");

    //repair bad internal links
    ArticleText = SyntaxRegex7.Replace(ArticleText, "[[$1]]");
    ArticleText = SyntaxRegex8.Replace(ArticleText, "[[$1]]");
    ArticleText = SyntaxRegex9.Replace(ArticleText, "[[$1#$2]]");

    ArticleText = Regex.Replace(ArticleText, "ISBN: ?([0-9])", "ISBN $1");

    return ArticleText.Trim();
}

/// <summary>
/// Fixes link syntax: replaces underscores with spaces and trims spaces around the pipe
/// (except in category links, image links and links that start with an underscore),
/// and URL-decodes every simple wiki link while preserving literal "+" signs.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="NoChange">True when the returned text equals the input.</param>
/// <returns>The modified article text.</returns>
public string FixLinks(string ArticleText, out bool NoChange)
{
    testText = ArticleText;

    string categoryPrefix = "[[" + Variables.Namespaces[14];

    foreach (Match link in WikiRegexes.SimpleWikiLink.Matches(ArticleText))
    {
        string original = link.Value;

        bool keepUnderscores = original.StartsWith(categoryPrefix)
            || original.StartsWith("[[Image:")
            || original.StartsWith("[[image:")
            || original.StartsWith("[[_")
            || original.Contains("|_");

        string cleaned = original;
        if (!keepUnderscores)
        {
            cleaned = Regex.Replace(original.Replace("_", " "), " ?\\| ?", "|");
        }

        // UrlDecode turns "+" into a space, so protect real plus signs first.
        cleaned = HttpUtility.UrlDecode(cleaned.Replace("+", "%2B"));

        ArticleText = ArticleText.Replace(original, cleaned);
    }

    NoChange = (testText == ArticleText);
    return ArticleText;
}

/// <summary>
/// Simplifies piped links whose label repeats the target, optionally with a trailing "s".
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="NoChange">True when the returned text equals the input.</param>
/// <returns>The simplified article text.</returns>
public string LinkSimplifier(string ArticleText, out bool NoChange)
{
    testText = ArticleText;

    string simplified = LinkSimplifier(ArticleText);
    NoChange = (testText == simplified);

    return simplified;
}

/// <summary>
/// Simplifies piped links whose label repeats the target: [[Dog|Dog]] and [[Dog|dog]]
/// become a plain link on the label, and [[Dog|Dogs]] becomes [[Dog]]s.
/// </summary>
/// <remarks>
/// NOTE(review): the replacement strings had lost their "[[" / "]]" in this copy of the
/// file, which turned matching links into plain text instead of simplifying them. The
/// brackets are restored here; confirm against the upstream source.
/// </remarks>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The simplified article text.</returns>
public string LinkSimplifier(string ArticleText)
{
    foreach (Match m in WikiRegexes.PipedWikiLink.Matches(ArticleText))
    {
        string n = m.Value;            // the whole piped link
        string a = m.Groups[1].Value;  // link target
        string b = m.Groups[2].Value;  // displayed label

        if (a == b || Tools.TurnFirstToLower(a) == b)
        {
            // label equals target: keep a plain link on the label
            string k = WikiRegexes.PipedWikiLink.Replace(n, "[[$2]]");
            ArticleText = ArticleText.Replace(n, k);
        }
        else if (a + "s" == b || Tools.TurnFirstToLower(a) + "s" == b)
        {
            // label is target + "s": link the stem and leave the "s" outside the link
            string k = WikiRegexes.PipedWikiLink.Replace(n, "$2");
            k = "[[" + k.Substring(0, k.Length - 1) + "]]s";
            ArticleText = ArticleText.Replace(n, k);
        }
    }

    return ArticleText;
}

/// <summary>
/// Adds bullet points to external links after "external links" header
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="NoChange">True when the returned text equals the input.</param>
/// <returns>The modified article text.</returns>
public string BulletExternalLinks(string ArticleText, out bool NoChange)
{
    testText = ArticleText;

    string bulleted = BulletExternalLinks(ArticleText);
    NoChange = (testText == bulleted);

    return bulleted;
}

/// <summary>
/// Adds bullet points to external links after "external links" header. Only the text
/// from the last such header onwards is touched; a blank line directly before a link
/// is removed at the same time.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The modified article text, or the input unchanged when there is no such header.</returns>
public string BulletExternalLinks(string ArticleText)
{
    // RightToLeft picks the last matching header in the article.
    Match heading = Regex.Match(ArticleText, "= ? ?external links? ? ?=", RegexOptions.IgnoreCase | RegexOptions.RightToLeft);

    if (heading.Success)
    {
        string beforeHeading = ArticleText.Substring(0, heading.Index);
        string fromHeading = ArticleText.Substring(heading.Index);

        string bulleted = Regex.Replace(fromHeading, "(\r\n)?(\r\n)(\\[?http)", "$2* $3");

        return beforeHeading + bulleted;
    }

    return ArticleText;
}

/// <summary>
/// Fix common spacing/capitalisation errors in categories: every category link is
/// rewritten with the canonical namespace prefix and with underscores replaced by spaces.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The modified article text.</returns>
public string FixCategories(string ArticleText)
{
    Regex categoryLink = new Regex("\\[\\[ ?" + Variables.NamespacesCaseInsensitive[14] + " ?(.*?)\\]\\]");
    string canonicalPrefix = "[[" + Variables.Namespaces[14];

    foreach (Match found in categoryLink.Matches(ArticleText))
    {
        string body = found.Groups[1].Value.Replace("_", " ");
        string rebuilt = canonicalPrefix + body + "]]";

        ArticleText = ArticleText.Replace(found.Value, rebuilt);
    }

    return ArticleText;
}

#endregion

#region other functions

/// <summary>
/// Converts HTML entities to unicode, with some deliberate exceptions
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="NoChange">True when the returned text equals the input.</param>
/// <returns>The modified article text.</returns>
public string Unicodify(string ArticleText, out bool NoChange)
{
    testText = ArticleText;

    string converted = Unicodify(ArticleText);
    NoChange = (testText == converted);

    return converted;
}

/// <summary>
/// Converts HTML entities to unicode, with some deliberate exceptions. Articles that
/// contain a math tag are returned untouched.
/// </summary>
/// <remarks>
/// NOTE(review): the entity literals in this method had been HTML-decoded in this copy
/// of the file. As written the method escaped every "&amp;" and then decoded it again,
/// i.e. it changed nothing. The literals are restored here from what remained; confirm
/// the exact entity spellings against the upstream source.
/// </remarks>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The modified article text.</returns>
public string Unicodify(string ArticleText)
{
    if (Regex.IsMatch(ArticleText, "<[Mm]ath>"))
        return ArticleText;

    // Numeric dash entities become the named entity (which RegexUnicode then protects).
    ArticleText = Regex.Replace(ArticleText, "&#150;|&#8211;|&#x2013;", "&ndash;");
    ArticleText = Regex.Replace(ArticleText, "&#151;|&#8212;|&#x2014;", "&mdash;");

    // A free-standing escaped ampersand is safe to show as a plain "&" ...
    ArticleText = ArticleText.Replace(" &amp; ", " & ");
    // ... every other "&amp;" is double-escaped so that HtmlDecode restores it as-is.
    ArticleText = ArticleText.Replace("&amp;", "&amp;amp;");

    // Re-escape the entities that must survive the decode step.
    foreach (KeyValuePair<Regex, string> k in RegexUnicode)
    {
        ArticleText = k.Key.Replace(ArticleText, k.Value);
    }

    try
    {
        ArticleText = HttpUtility.HtmlDecode(ArticleText);
    }
    catch (Exception ex)
    {
        // Best effort: report the failure and return the text as far as it was processed.
        System.Windows.Forms.MessageBox.Show(ex.ToString());
    }

    return ArticleText;
}

/// <summary>
/// Emboldens the first occurrence of the title, if it isn't already. Only the first
/// 80 characters of the article are searched.
/// </summary>
/// <remarks>
/// NOTE(review): the "[[" / "]]" and "'''" markup inside several string literals had
/// been stripped in this copy of the file, which made the self-link removal a no-op,
/// made the "already bold" test match any mention of the title, and made the final
/// replacement change nothing. The markup is restored here; confirm against upstream.
/// Note also that the early returns report NoChange = true even when a self link was
/// removed just before.
/// </remarks>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="ArticleTitle">The title of the article.</param>
/// <param name="NoChange">False only when the title was emboldened.</param>
/// <returns>The modified article text.</returns>
public string BoldTitle(string ArticleText, string ArticleTitle, out bool NoChange)
{
    //ignore date articles
    if (WikiRegexes.Dates2.IsMatch(ArticleTitle))
    {
        NoChange = true;
        return ArticleText;
    }

    string escTitle = Regex.Escape(ArticleTitle);

    //remove self links first
    Regex tregex = new Regex("\\[\\[(" + Tools.CaseInsensitive(escTitle) + ")\\]\\]");
    if (!ArticleText.Contains("'''"))
    {
        // NOTE(review): replacement kept as found ("$1"); it is unclear whether upstream
        // emboldens the unlinked title here. Verify.
        ArticleText = tregex.Replace(ArticleText, "$1", 1);
    }
    else
    {
        ArticleText = ArticleText.Replace("[[" + ArticleTitle + "]]", ArticleTitle);
        ArticleText = ArticleText.Replace("[[" + Tools.TurnFirstToLower(ArticleTitle) + "]]", Tools.TurnFirstToLower(ArticleTitle));
    }

    // Nothing to do when the article starts with a link, bullet or indent,
    // or when the title is already bold somewhere.
    if (Regex.IsMatch(ArticleText, "^(\\[\\[|\\*|:)") || Regex.IsMatch(ArticleText, "''' ?" + escTitle + " ?'''", RegexOptions.IgnoreCase))
    {
        NoChange = true;
        return ArticleText;
    }

    ArticleText = hider.HideMore(ArticleText);

    // Search for the title without a trailing " (disambiguator)".
    escTitle = Regex.Replace(ArticleTitle, " \\(.*?\\)$", "");
    escTitle = Regex.Escape(escTitle);

    Regex regexBold = new Regex("([^\\[]|^)(" + escTitle + ")([ ,.:;])", RegexOptions.IgnoreCase);

    // Only the first 80 characters are examined; the rest is re-attached afterwards.
    string strSecondHalf = "";
    if (ArticleText.Length > 80)
    {
        strSecondHalf = ArticleText.Substring(80);
        ArticleText = ArticleText.Substring(0, 80);
    }

    // Something near the start is already bold: leave the article alone.
    if (ArticleText.Contains("'''"))
    {
        ArticleText = ArticleText + strSecondHalf;
        ArticleText = hider.AddBackMore(ArticleText);
        NoChange = true;
        return ArticleText;
    }

    if (regexBold.IsMatch(ArticleText))
    {
        NoChange = false;
        ArticleText = regexBold.Replace(ArticleText, "$1'''$2'''$3", 1);
    }
    else
        NoChange = true;

    ArticleText = ArticleText + strSecondHalf;
    ArticleText = hider.AddBackMore(ArticleText);

    return ArticleText;
}

/// <summary>
/// Replaces an image in the article.
/// </summary>
/// <param name="OldImage">The old image to replace.</param>
/// <param name="NewImage">The new image.</param>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="NoChange">True when the returned text equals the input.</param>
/// <returns>The new article text.</returns>
public string ReplaceImage(string OldImage, string NewImage, string ArticleText, out bool NoChange)
{
    testText = ArticleText;

    string replaced = ReplaceImage(OldImage, NewImage, ArticleText);
    NoChange = (testText == replaced);

    return replaced;
}

/// <summary>
/// Replaces an image in the article. The old name is matched with spaces and
/// underscores treated as interchangeable.
/// </summary>
/// <param name="OldImage">The old image to replace, with or without the namespace prefix.</param>
/// <param name="NewImage">The new image, with or without the namespace prefix.</param>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The new article text.</returns>
public string ReplaceImage(string OldImage, string NewImage, string ArticleText)
{
    //remove image prefix
    string leadingPrefix = "^" + Variables.Namespaces[6];
    string oldName = Regex.Replace(OldImage, leadingPrefix, "", RegexOptions.IgnoreCase).Replace("_", " ");
    string newName = Regex.Replace(NewImage, leadingPrefix, "", RegexOptions.IgnoreCase).Replace("_", " ");

    // A space in the name may appear as a space or an underscore in the article.
    string oldNamePattern = Regex.Escape(oldName).Replace("\\ ", "[ _]");

    string searchPattern = Variables.NamespacesCaseInsensitive[6] + Tools.CaseInsensitive(oldNamePattern);
    string replacement = Variables.Namespaces[6] + newName;

    return Regex.Replace(ArticleText, searchPattern, replacement);
}

/// <summary>
/// Removes an image from the article, or comments it out. Image links are handled
/// first; when the article contains none, bare occurrences of the file name (with an
/// optional namespace prefix) are handled instead.
/// </summary>
/// <remarks>
/// NOTE(review): the comment-out replacement string was empty in this copy of the file
/// (the HTML comment markup had been stripped, leaving the Comment parameter unused),
/// so "comment out" silently deleted. The format used below is a reconstruction;
/// confirm against the upstream source.
/// Matches are processed from last to first and edited by position, so an earlier edit
/// cannot invalidate the index of a later one (the previous start-at-index replace could
/// skip matches or throw once the text had become shorter).
/// </remarks>
/// <param name="Image">The image to remove, with or without the namespace prefix.</param>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="CommentOut">True to wrap the image in an HTML comment instead of deleting it.</param>
/// <param name="Comment">Text placed inside the HTML comment in front of the image.</param>
/// <returns>The new article text.</returns>
public string RemoveImage(string Image, string ArticleText, bool CommentOut, string Comment)
{
    //remove image prefix
    Image = Regex.Replace(Image, "^" + Variables.Namespaces[6], "", RegexOptions.IgnoreCase).Replace("_", " ");
    Image = Regex.Escape(Image).Replace("\\ ", "[ _]");
    Image = Tools.CaseInsensitive(Image);

    Regex r = new Regex("\\[\\[" + Variables.NamespacesCaseInsensitive[6] + Image + ".*\\]\\]");
    MatchCollection n = r.Matches(ArticleText);

    if (n.Count > 0)
    {
        for (int k = n.Count - 1; k >= 0; k--)
        {
            Match m = n[k];

            // The greedy pattern can run past the end of the image link (captions may
            // contain links), so walk the brackets to find where this link really closes.
            int depth = 0;
            int length = 0;
            bool balanced = false;
            foreach (char c in m.Value)
            {
                if (c == '[')
                    depth++;
                else if (c == ']')
                    depth--;

                length++;

                if (depth == 0)
                {
                    balanced = true;
                    break;
                }
            }

            // Brackets never balanced: leave this match alone.
            if (!balanced)
                continue;

            string link = m.Value.Substring(0, length);
            string replacement = CommentOut ? "<!-- " + Comment + " " + link + " -->" : "";

            ArticleText = ArticleText.Remove(m.Index, length).Insert(m.Index, replacement);
        }
    }
    else
    {
        r = new Regex("(" + Variables.NamespacesCaseInsensitive[6] + ")?" + Image);
        n = r.Matches(ArticleText);

        for (int k = n.Count - 1; k >= 0; k--)
        {
            Match m = n[k];
            string replacement = CommentOut ? "<!-- " + Comment + " " + m.Value + " -->" : "";

            ArticleText = ArticleText.Remove(m.Index, m.Length).Insert(m.Index, replacement);
        }
    }

    return ArticleText;
}

/// <summary>
/// Removes an image in the article.
/// </summary>
/// <param name="Image">The image to remove.</param>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="CommentOut">Passed through to the four-argument overload.</param>
/// <param name="Comment">Passed through to the four-argument overload.</param>
/// <param name="NoChange">True when the returned text equals the input.</param>
/// <returns>The new article text.</returns>
public string RemoveImage(string Image, string ArticleText, bool CommentOut, string Comment, out bool NoChange)
{
    testText = ArticleText;

    string result = RemoveImage(Image, ArticleText, CommentOut, Comment);
    NoChange = (testText == result);

    return result;
}

/// <summary>
/// Adds the category to the article.
/// </summary>
/// <param name="NewCategory">The new category.</param>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="ArticleTitle">The article title.</param>
/// <param name="NoChange">True when the returned text equals the input.</param>
/// <returns>The article text.</returns>
public string AddCategory(string NewCategory, string ArticleText, string ArticleTitle, out bool NoChange)
{
    testText = ArticleText;

    string result = AddCategory(NewCategory, ArticleText, ArticleTitle);
    NoChange = (testText == result);

    return result;
}

/// <summary>
/// Adds the category to the end of the article, unless a link to it is already present.
/// </summary>
/// <remarks>
/// NOTE(review): the link brackets around the category, and the wrapper used for pages
/// in namespace 10, had been stripped from the string literals in this copy of the file
/// (the category was appended as bare text). They are restored here; the noinclude
/// wrapper is inferred from the namespace-10 check. Confirm against upstream.
/// </remarks>
/// <param name="NewCategory">The new category, without the namespace prefix.</param>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="ArticleTitle">The article title.</param>
/// <returns>The article text.</returns>
public string AddCategory(string NewCategory, string ArticleText, string ArticleTitle)
{
    // Already categorised: nothing to add.
    if (Regex.IsMatch(ArticleText, "\\[\\[ ?[Cc]ategory ?: ?" + Regex.Escape(NewCategory)))
        return ArticleText;

    string cat = "\r\n[[" + Variables.Namespaces[14] + NewCategory + "]]";
    cat = Tools.ApplyKeyWords(ArticleTitle, cat);

    if (ArticleTitle.StartsWith(Variables.Namespaces[10]))
        // presumably keeps the category from being transcluded with the template; verify
        ArticleText += "<noinclude>" + cat + "\r\n</noinclude>";
    else
        ArticleText += cat;

    return ArticleText;
}

/// <summary>
/// Re-categorises the article. When the new category is already present the old one is
/// simply removed; otherwise the old category is renamed in place, keeping any sort key.
/// </summary>
/// <param name="OldCategory">The old category to replace.</param>
/// <param name="NewCategory">The new category.</param>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="NoChange">True when nothing changed after the categories were normalised.</param>
/// <returns>The re-categorised article text.</returns>
public string ReCategoriser(string OldCategory, string NewCategory, string ArticleText, out bool NoChange)
{
    //remove category prefix
    string leadingPrefix = "^" + Variables.Namespaces[14];
    string oldName = Regex.Replace(OldCategory, leadingPrefix, "", RegexOptions.IgnoreCase);
    string newName = Regex.Replace(NewCategory, leadingPrefix, "", RegexOptions.IgnoreCase);

    //format categories properly
    ArticleText = FixCategories(ArticleText);

    // The comparison baseline is taken after normalisation, so formatting fixes alone
    // do not count as a change.
    testText = ArticleText;

    string newCategoryLink = "\\[\\[" + Variables.NamespacesCaseInsensitive[14] + Tools.CaseInsensitive(Regex.Escape(newName)) + "( ?\\|| ?\\]\\])";

    if (Regex.IsMatch(ArticleText, newCategoryLink))
    {
        ArticleText = RemoveCategory(oldName, ArticleText);
    }
    else
    {
        string oldPattern = Variables.Namespaces[14] + Tools.CaseInsensitive(Regex.Escape(oldName)) + "( ?\\|| ?\\]\\])";
        string replacement = Variables.Namespaces[14] + newName + "$1";

        ArticleText = Regex.Replace(ArticleText, oldPattern, replacement);
    }

    NoChange = (testText == ArticleText);
    return ArticleText;
}

/// <summary>
/// Removes a category from an article and reports whether the text changed.
/// </summary>
/// <param name="strOldCat">The old category to remove.</param>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="NoChange">Set to true when the returned text equals the text passed in.</param>
/// <returns>The article text without the old category.</returns>
public string RemoveCategory(string strOldCat, string ArticleText, out bool NoChange)
{
    // Remember the input so it can be compared with the result.
    testText = ArticleText;

    string stripped = RemoveCategory(strOldCat, ArticleText);

    NoChange = (stripped == testText);
    return stripped;
}

/// <summary>
/// Removes every link to the given category from an article.
/// </summary>
/// <param name="strOldCat">The old category to remove (matched via Tools.CaseInsensitive).</param>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The article text, after FixCategories, without the old category.</returns>
public string RemoveCategory(string strOldCat, string ArticleText)
{
    // Format categories properly first, so the patterns below see a uniform shape.
    string result = FixCategories(ArticleText);

    string namePattern = Tools.CaseInsensitive(Regex.Escape(strOldCat));

    // A link to the category, with or without a sort key.
    string linkPattern = "\\[\\[" + Variables.NamespacesCaseInsensitive[14] + " ?" + namePattern + "( ?\\]\\]| ?\\|[^\\|]*?\\]\\])";

    // Two passes, to avoid removing a newline when that is not desirable:
    // pass 1 swallows the line break only when the next line starts with "[";
    result = Regex.Replace(result, linkPattern + "\r\n\\[", "[");

    // pass 2 removes any remaining occurrence and leaves the surrounding text alone.
    return Regex.Replace(result, linkPattern, "");
}

/// <summary>
/// Appends a "Living people" category to articles that have a "NNNN births" category
/// with a year of 1910 or later and no death, disappearance or living-status category.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="NoChange">Set to true when the text is returned unmodified.</param>
/// <returns>The article text, with the category appended when the conditions are met.</returns>
public string LivingPeople(string ArticleText, out bool NoChange)
{
    NoChange = true;
    testText = ArticleText;

    // Nothing to do when the article already carries a category that settles the question.
    bool alreadyClassified = Regex.IsMatch(ArticleText, "\\[\\[ ?Category ?:[ _]?([0-9]{1,2}[ _]century[ _]deaths|[0-9s]{4,5}[ _]deaths|Disappeared[ _]people|Living[ _]people|Year[ _]of[ _]death[ _]missing|Possibly[ _]living[ _]people)", RegexOptions.IgnoreCase);
    if (alreadyClassified)
        return ArticleText;

    Match births = Regex.Match(ArticleText, "\\[\\[ ?Category ?:[ _]?([0-9]{4})[ _]births(\\|.*?)?\\]\\]", RegexOptions.IgnoreCase);
    if (!births.Success)
        return ArticleText;

    // Birth years before 1910 are skipped.
    if (int.Parse(births.Groups[1].Value) < 1910)
        return ArticleText;

    // Reuse the sort key of the births category (including its closing brackets), if it has one.
    string birthCat = births.Value;
    string catKey = birthCat.Contains("|")
        ? Regex.Match(birthCat, "\\|.*?\\]\\]").Value
        : "]]";

    ArticleText += "[[Category:Living people" + catKey;

    NoChange = (testText == ArticleText);
    return ArticleText;
}

/// <summary>
/// Runs the template and interwiki conversions and reports whether the text changed.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="NoChange">Set to true when the conversions left the text untouched.</param>
/// <returns>The new article text.</returns>
public string Conversions(string ArticleText, out bool NoChange)
{
    // Remember the input so it can be compared with the result.
    testText = ArticleText;

    string converted = Conversions(ArticleText);

    NoChange = (converted == testText);
    return converted;
}

/// <summary>
/// Rewrites a few interwiki prefixes, drops the "msg:" template prefix, and then
/// applies every pattern/replacement pair held in RegexConversion.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The new article text.</returns>
public string Conversions(string ArticleText)
{
    // Use proper codes
    ArticleText = ArticleText.Replace("[[zh-tw:", "[[zh:");
    ArticleText = ArticleText.Replace("[[nb:", "[[no:");
    ArticleText = ArticleText.Replace("[[dk:", "[[da:");

    ArticleText = ArticleText.Replace("{{msg:", "{{");

    // The pair type needs its type arguments to compile: the key is used as a Regex
    // and the value as its replacement string.
    foreach (KeyValuePair<Regex, string> k in RegexConversion)
    {
        ArticleText = k.Key.Replace(ArticleText, k.Value);
    }

    return ArticleText;
}

/// <summary>
/// Substs a fixed set of user talk templates (test, 3RR, spam and welcome variants),
/// dropping an optional "template:" prefix. Matching is case-insensitive.
/// </summary>
/// <param name="TalkPageText">The wiki text of the talk page.</param>
/// <returns>The new text.</returns>
public string SubstUserTemplates(string TalkPageText)
{
    // Applied in this order; group 2 of every pattern is the template name plus any parameters.
    string[] templatePatterns =
    {
        "\\{\\{(template:)?(test[n0-6]?[ab]?)\\}\\}",
        "\\{\\{(template:)?(test[n0-6]?[ab]?-n\\|.*?)\\}\\}",
        "\\{\\{(template:)?(3RR[0-5]?)\\}\\}",
        "\\{\\{(template:)?(spam[0-5][ab]?)\\}\\}",
        "\\{\\{(template:)?(spam[0-5]?-n\\|.*?)\\}\\}",
        "\\{\\{(template:)?(welcome[0-6]|welcomeip|anon|welcome-anon)\\}\\}"
    };

    string result = TalkPageText;
    foreach (string pattern in templatePatterns)
    {
        result = Regex.Replace(result, pattern, "{{subst:$2}}", RegexOptions.IgnoreCase);
    }

    return result;
}

/// <summary>
/// If necessary, adds/removes wikify or stub tags, and reports whether the text changed.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="ArticleTitle">The title of the article.</param>
/// <param name="NoChange">Set to true when tagging left the text untouched.</param>
/// <param name="Summary">Edit summary; passed through to the tagging overload, which appends to it.</param>
/// <returns>The new article text.</returns>
public string Tagger(string ArticleText, string ArticleTitle, out bool NoChange, ref string Summary)
{
    // Remember the input so it can be compared with the result.
    testText = ArticleText;

    string tagged = Tagger(ArticleText, ArticleTitle, ref Summary);

    NoChange = (tagged == testText);
    return tagged;
}

/// <summary>
/// Updates by-date tags, removes stub tags from long articles and, for articles whose
/// only templates are stub templates, adds an uncategorised, wikify or stub tag where
/// the heuristics below call for one.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="ArticleTitle">The title of the article; only titles accepted by Tools.IsMainSpace are processed.</param>
/// <param name="Summary">Edit summary; a short note is appended for every tag that is added.</param>
/// <returns>The new article text.</returns>
public string Tagger(string ArticleText, string ArticleTitle, ref string Summary)
{
    // Date stamp for newly added tags. The subst magic words are expanded when the
    // page is saved, so the tag carries the month of the edit, not a frozen date.
    const string dateStamp = "{{subst:CURRENTMONTHNAME}} {{subst:CURRENTYEAR}}";

    if (Tools.IsRedirect(ArticleText))
        return ArticleText;

    if (!Tools.IsMainSpace(ArticleTitle))
        return ArticleText;

    // Length is the divisor of the link ratio below; the +1 keeps it non-zero.
    double Length = ArticleText.Length + 1;

    string CommentsStripped = WikiRegexes.Comments.Replace(ArticleText, "");
    int words = Tools.WordCount(CommentsStripped);

    // Update by-date tags. The pair type needs its type arguments to compile:
    // the key is used as a Regex and the value as its replacement string.
    foreach (KeyValuePair<Regex, string> k in RegexTagger)
    {
        ArticleText = k.Key.Replace(ArticleText, k.Value);
    }

    // Remove stub tags from long articles (stubChecker decides which matches survive).
    if (words > StubMaxWordCount && WikiRegexes.Stub.IsMatch(CommentsStripped))
    {
        MatchEvaluator stubEvaluator = new MatchEvaluator(stubChecker);
        ArticleText = WikiRegexes.Stub.Replace(ArticleText, stubEvaluator);
        ArticleText = ArticleText.Trim();
    }

    // Add no tags when the article contains any template that is not a stub template.
    foreach (Match m in WikiRegexes.Template.Matches(ArticleText))
    {
        if (!m.Value.Contains("stub"))
            return ArticleText;
    }

    double LinkCount = Tools.LinkCount(CommentsStripped);
    double Ratio = LinkCount / Length;

    if (words > 6 && !WikiRegexes.Category.IsMatch(CommentsStripped) && !Regex.IsMatch(ArticleText, @"\{\{[Uu]ncategori[zs]ed"))
    {
        // No category and no uncategorised tag yet: pick the stub or the plain variant.
        string tagName = WikiRegexes.Stub.IsMatch(CommentsStripped) ? "Uncategorizedstub" : "Uncategorized";

        ArticleText += "\r\n\r\n{{" + tagName + "|" + dateStamp + "}}";
        Summary += ", added uncategorised tag";
    }
    else if (LinkCount < 3 && (Ratio < 0.0025))
    {
        // Hardly any links relative to the article size.
        ArticleText = "{{Wikify|" + dateStamp + "}}\r\n\r\n" + ArticleText;
        Summary += ", added wikify tag";
    }
    else if (CommentsStripped.Length <= 300 && !WikiRegexes.Stub.IsMatch(CommentsStripped))
    {
        // Very short article without a stub tag.
        ArticleText = ArticleText + "\r\n\r\n\r\n{{stub}}";
        Summary += ", added stub tag";
    }

    return ArticleText;
}

/// <summary>
/// Match evaluator used by Tagger when stripping stub tags: a match that also matches
/// Variables.SectStub is kept as it is, any other match is replaced by an empty string.
/// </summary>
/// <param name="m">The match being evaluated.</param>
/// <returns>The original matched text, or an empty string.</returns>
private string stubChecker(Match m)
{
    bool keep = Regex.IsMatch(m.Value, Variables.SectStub);
    return keep ? m.Value : "";
}

#endregion

#region unused

/// <summary>
/// Bypasses redirects in the article: each simple wiki link whose target page is a
/// redirect is rewritten as a piped link to the redirect's target, keeping the
/// original link text as the label. (TODO) disambigs are not handled.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The article text with redirecting links replaced by direct, piped links.</returns>
public string BypassRedirects(string ArticleText)
{
    foreach (Match m in WikiRegexes.WikiLinksOnly.Matches(ArticleText))
    {
        // The whole link as written, and the page it points at.
        // NOTE(review): assumes group 1 of WikiLinksOnly is the link target - confirm.
        string link = m.Value;
        string article = m.Groups[1].Value;

        // Get the text of the linked page.
        string text;
        try
        {
            text = Tools.GetArticleText(article);
        }
        catch
        {
            // Best effort: a page that cannot be fetched is skipped.
            continue;
        }

        // Test if redirect
        if (Tools.IsRedirect(text))
        {
            string target = Tools.RedirectTarget(text).Replace("_", " ");

            // Link straight to the target but keep showing the original text, so the
            // article reads the same. The link markup must be rebuilt here; otherwise
            // the link would be replaced by plain text and the target thrown away.
            string directLink = "[[" + target + "|" + article + "]]";

            ArticleText = ArticleText.Replace(link, directLink);
        }
    }

    return ArticleText;
}

/// <summary>
/// Fixes minor problems: expands the abbreviations "a.k.a.", "e.g." and "i.e.", and
/// capitalises lower-case day and month names.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The new article text.</returns>
public string MinorThings(string ArticleText)
{
    string result = Regex.Replace(ArticleText, "[Aa]\\.[Kk]\\.[Aa]\\.?", "also known as");
    result = result.Replace("e.g.", "for example");
    result = result.Replace("i.e.", "that is");

    // Lower-case day and month names; "march" and "may" are not in the list.
    MatchCollection lowerCaseNames = Regex.Matches(result, "(monday|tuesday|wednesday|thursday|friday|saturday|sunday|january|february|april|june|july|august|september|october|november|december)");

    foreach (Match found in lowerCaseNames)
    {
        string name = found.Groups[1].Value;

        // Replaces every occurrence of this name in the text, not just this match.
        result = result.Replace(name, Tools.TurnFirstToUpper(name));
    }

    return result;
}

//[http://en.wikipedia.org/wiki/Dog] to Dog

//private string ExtToInternalLinks(string ArticleText)

//{

// foreach (Match m in Regex.Matches(ArticleText, "\\[http://en\\.wikipedia\\.org/wiki/.*?\\]"))

// {

// string a = HttpUtility.UrlDecode(m.ToString());

// if (a.Contains(" "))

// {

// int intP;

// //string a = n;

// intP = a.IndexOf(" ");

// string b = a.Substring(intP);

// a = a.Remove(intP);

// b = b.TrimStart();

// a = a.Replace("_", " ");

// ArticleText = ArticleText.Replace(m.ToString(), a);

// }

// }

// ArticleText = Regex.Replace(ArticleText, "\\[http://en\\.wikipedia\\.org/wiki/(.*?)\\]", "$1");

// return ArticleText;

//}

#endregion

}

}