User:Carnildo/wiki-regex-tester.c

Common usages:

./wiki-regex-tester titles.txt < blacklist.txt

Will test every regex in "blacklist.txt" to see if it matches any titles in "titles.txt". "blacklist.txt" contains one blacklist regex per line; "titles.txt" contains one title per line.

./wiki-regex-tester 'Title of a Wikipedia article' < blacklist.txt

Will test to see if 'Title of a Wikipedia article' would be blocked by any entry in "blacklist.txt"

wget -O - 'http://en.wikipedia.org/w/index.php?title=MediaWiki:Titleblacklist&action=raw' | ./wiki-regex-tester ns_0.txt | wc -l

Will fetch the latest version of the English Wikipedia blacklist, test it against the list of titles in "ns_0.txt", and count the number of titles matched.

----

/* wiki-regex-tester.c

*

* A program to test regular expressions for the Wikipedia title blacklist. Assumes UTF-8.

*/

/* Compile using gcc -o wiki-regex-tester wiki-regex-tester.c `pcre-config --libs`

*/

/* NOTE(review): the header names were lost when this page was extracted
 * (each directive survived only as "include"); the list below is
 * reconstructed from the functions and types the program uses -- confirm. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

#include <pcre.h>

/*
 * preprocess_regex -- turn one raw blacklist line into an anchored regex.
 *
 * The line is edited in place:
 *   1. the trailing newline is removed;
 *   2. everything from the first '#' on (the comment) is removed;
 *   3. *casesensitive / *newaccountonly are set to 1 if the corresponding
 *      keyword appears in what is left (they are never reset to 0 here);
 *   4. the modifier section is cut off;
 *   5. leading and trailing spaces/tabs are trimmed;
 *   6. a non-empty result is wrapped as "^...$".
 *
 * A blank or comment-only line yields the empty string.
 *
 * regex          NUL-terminated line buffer; must have room for two bytes
 *                more than the string it holds, for the added anchors.
 * casesensitive  out: set to 1 when the "casesensitive" keyword is present.
 * newaccountonly out: set to 1 when the "newaccountonly" keyword is present.
 */
void preprocess_regex(char *regex, int *casesensitive, int *newaccountonly)
{
    /*
     * Start-of-modifier markers.  Specific tags are used rather than a bare
     * '<' because '<' is legal inside a regex (e.g. the lookbehind "(?<!").
     *
     * NOTE(review): the seven literals were lost when this file was
     * extracted from its wiki page (everything after the opening quote was
     * stripped).  They are reconstructed here from the seven TitleBlacklist
     * attribute names -- confirm against the original source.
     */
    static const char *const modifier_tags[] =
    {
        "<autoconfirmed",
        "<casesensitive",
        "<errmsg",
        "<moveonly",
        "<newaccountonly",
        "<noedit",
        "<reupload"
    };
    size_t i;
    size_t lead;
    size_t len;
    char *cut;

    /* Cut off the trailing newline */
    cut = strrchr(regex, '\n');
    if(cut != NULL)
    {
        *cut = '\0';
    }

    /* Drop the comment first, so that keywords mentioned only inside a
     * comment are not mistaken for modifiers. */
    cut = strchr(regex, '#');
    if(cut != NULL)
    {
        *cut = '\0';
    }

    /* Crude check for modifiers -- assumes correct formatting and that the
     * keywords never appear inside the regex itself. */
    if(strstr(regex, "casesensitive"))
    {
        *casesensitive = 1;
    }
    if(strstr(regex, "newaccountonly"))
    {
        *newaccountonly = 1;
    }

    /* Whack off the modifiers: truncating at every tag in turn leaves the
     * string cut at the earliest one. */
    for(i = 0; i < sizeof modifier_tags / sizeof modifier_tags[0]; i++)
    {
        cut = strstr(regex, modifier_tags[i]);
        if(cut != NULL)
        {
            *cut = '\0';
        }
    }

    /* Trim leading and trailing whitespace */
    lead = strspn(regex, " \t");
    len = strlen(regex) - lead;
    if(lead > 0)
    {
        memmove(regex, regex + lead, len + 1);
    }
    /* The len > 0 guard keeps an empty string from indexing regex[-1]. */
    while(len > 0 && (regex[len - 1] == ' ' || regex[len - 1] == '\t'))
    {
        regex[--len] = '\0';
    }

    /* Add anchors, shifting in place instead of going through a fixed-size
     * scratch buffer that a long line could overflow. */
    if(len > 0)
    {
        memmove(regex + 1, regex, len);
        regex[0] = '^';
        regex[len + 1] = '$';
        regex[len + 2] = '\0';
    }
}

/*
 * fixup_line -- normalise one title line in place.
 *
 * Removes a trailing newline if there is one, then replaces every
 * underscore with a space.  An empty string, or a final line that has no
 * newline, is left otherwise intact (nothing but a newline is ever cut).
 *
 * line  NUL-terminated, writable title buffer.
 */
void fixup_line(char *line)
{
    size_t len = strlen(line);
    char *p;

    /* Only a real newline is cut: the last line of a file may lack one,
     * and an empty string has no last character at all. */
    if(len > 0 && line[len - 1] == '\n')
    {
        line[len - 1] = '\0';
    }

    /* Single pass instead of rescanning from the start for every '_'. */
    for(p = line; *p != '\0'; p++)
    {
        if(*p == '_')
        {
            *p = ' ';
        }
    }
}

/*
 * test_title -- run one compiled regex against one title.
 *
 * Prints "* <title> :: <regex>" on stdout when pcre_exec reports a match
 * and returns 1.  Returns 0 otherwise; any pcre_exec result other than
 * PCRE_ERROR_NOMATCH is reported on stderr.
 */
static int test_title(const pcre *comp_regex, const char *title, const char *regex)
{
    int ovector[300];
    int result;

    result = pcre_exec(comp_regex, NULL, title, (int)strlen(title), 0, 0,
                       ovector, (int)(sizeof ovector / sizeof ovector[0]));
    if(result >= 0)
    {
        printf("* %s :: %s\n", title, regex);
        return 1;
    }
    if(result != PCRE_ERROR_NOMATCH)
    {
        fprintf(stderr, "Error: %d\n", result);
    }
    return 0;
}

/*
 * Reads blacklist entries from stdin, one per line.  Each entry is run
 * through preprocess_regex(), compiled as a UTF-8 pattern (caseless unless
 * the entry asked for case sensitivity) and, unless it is a
 * "newaccountonly" entry, tested against every command-line argument:
 * an argument that stat() can find is read as a file of titles, one per
 * line; any other argument is tested as a literal title.
 *
 * Matches go to stdout; progress, diagnostics and the per-regex match
 * count go to stderr.  Returns 0.
 */
int main(int argc, char *argv[])
{
    char regex[4096];
    char line[1024];
    /* Running total over all regexes; wide enough that many patterns times
     * millions of titles cannot overflow it. */
    unsigned long long lines = 0;

    /* Read the regexes in from stdin.  Looping on the fgets result (rather
     * than feof) avoids processing the last line twice and never touches
     * an unfilled buffer.  Two bytes are held back so the anchors added by
     * preprocess_regex always fit. */
    while(fgets(regex, (int)sizeof regex - 2, stdin) != NULL)
    {
        int casesensitive = 0;
        int newaccountonly = 0;
        int matches = 0;
        const char *errptr;
        int offset;
        pcre *comp_regex;
        int i;

        /* Preprocess; blank and comment-only lines come back empty */
        preprocess_regex(regex, &casesensitive, &newaccountonly);
        if(strlen(regex) == 0)
        {
            continue;
        }

        fprintf(stderr, "Testing /%s/%c now\n", regex, casesensitive?' ':'i');
        comp_regex = pcre_compile(regex, PCRE_UTF8|(casesensitive?0:PCRE_CASELESS), &errptr, &offset, NULL);
        if(NULL == comp_regex)
        {
            fprintf(stderr, "Compile failed: %d %s\n", offset, errptr);
        }
        else
        {
            if(!newaccountonly)
            {
                /* Test */
                for(i = 1; i < argc; i++)
                {
                    struct stat st;

                    if(!stat(argv[i], &st))
                    {
                        /* If it's a file, test every line in it */
                        FILE *infile = fopen(argv[i], "r");

                        if(NULL == infile)
                        {
                            /* Exists but cannot be opened: report and move on */
                            perror(argv[i]);
                            continue;
                        }
                        while(fgets(line, (int)sizeof line, infile) != NULL)
                        {
                            lines += 1;
                            fixup_line(line);
                            matches += test_title(comp_regex, line, regex);
                            if((lines % 100000) == 0)
                            {
                                fprintf(stderr, "Lines: %llu \r", lines);
                            }
                        }
                        fclose(infile);
                    }
                    else
                    {
                        /* Otherwise, test as a literal */
                        lines += 1;
                        matches += test_title(comp_regex, argv[i], regex);
                    }
                }
            }
            /* Release the compiled pattern before moving to the next one */
            pcre_free(comp_regex);
        }
        fprintf(stderr, "Matches: %d\n", matches);
    }

    return 0;
}