User:Carnildo/wiki-regex-tester.c
Common usages:
./wiki-regex-tester titles.txt < blacklist.txt
Will test every regex in "blacklist.txt" to see if it matches any titles in "titles.txt". "blacklist.txt" contains one blacklist regex per line; "titles.txt" contains one title per line.
./wiki-regex-tester 'Title of a Wikipedia article' < blacklist.txt
Will test whether 'Title of a Wikipedia article' would be blocked by any entry in "blacklist.txt".
wget -O - 'http://en.wikipedia.org/w/index.php?title=MediaWiki:Titleblacklist&action=raw' | ./wiki-regex-tester ns_0.txt | wc -l
Will fetch the latest version of the English Wikipedia blacklist, test it against the list of titles in "ns_0.txt", and count the number of matches (one output line per title/regex pair, so a title blocked by several entries is counted once per entry).
----
/* wiki-regex-tester.c *
* A program to test regular expressions for the Wikipedia title blacklist. Assumes UTF-8.
*/
/* Compile using gcc -o wiki-regex-tester wiki-regex-tester.c `pcre-config --libs`
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <pcre.h>
/*
 * preprocess_regex -- turn one raw title-blacklist line into an anchored regex.
 *
 * regex          In/out.  On entry: one blacklist line as read by fgets()
 *                (a trailing newline is allowed).  On return: the bare
 *                pattern wrapped as "^pattern$", or the empty string when
 *                the line contains no pattern (blank or comment-only line)
 *                or the anchored pattern would not fit.  The buffer must be
 *                at least REGEX_CAPACITY (4096) bytes.
 * casesensitive  Set to 1 when the text "casesensitive" occurs anywhere in
 *                the line; left untouched otherwise, so the caller must
 *                initialise it.
 * newaccountonly Same as above for the text "newaccountonly".
 */
void preprocess_regex(char *regex, int *casesensitive, int *newaccountonly)
{
    enum { REGEX_CAPACITY = 4096 };

    /*
     * NOTE(review): the string literals of the seven original truncation
     * checks were lost from the published source (everything from the '<'
     * onwards is missing).  The list below is reconstructed on the assumption
     * that each check looked for "<" followed by one of the seven attribute
     * names of the TitleBlacklist extension -- confirm against the original.
     */
    static const char *const modifier_starts[] = {
        "<autoconfirmed",
        "<casesensitive",
        "<errmsg",
        "<moveonly",
        "<newaccountonly",
        "<noedit",
        "<reupload"
    };
    size_t i;
    size_t lead;
    size_t len;
    char *cut;

    /* Crude check for modifiers -- assumes correct formatting and that they'll never appear in a regex. */
    if (strstr(regex, "casesensitive") != NULL)
    {
        *casesensitive = 1;
    }
    if (strstr(regex, "newaccountonly") != NULL)
    {
        *newaccountonly = 1;
    }

    /* Cut off the trailing newline */
    regex[strcspn(regex, "\n")] = '\0';

    /* Whack off the tail end of the regex -- all modifiers and comments.
     * '#' is assumed never to be part of a blacklist pattern. */
    cut = strchr(regex, '#');
    if (cut != NULL)
    {
        *cut = '\0';
    }
    for (i = 0; i < sizeof modifier_starts / sizeof modifier_starts[0]; i++)
    {
        cut = strstr(regex, modifier_starts[i]);
        if (cut != NULL)
        {
            *cut = '\0';
        }
    }

    /* Trim leading whitespace */
    lead = strspn(regex, " \t");
    len = strlen(regex) - lead;
    if (lead > 0)
    {
        memmove(regex, regex + lead, len + 1);
    }

    /* Trim trailing whitespace.  The length test keeps an empty string from
     * being indexed at position -1. */
    while (len > 0 && (regex[len - 1] == ' ' || regex[len - 1] == '\t' || regex[len - 1] == '\r'))
    {
        regex[--len] = '\0';
    }

    if (len == 0)
    {
        return;
    }

    /* Add anchors in place: '^' + pattern + '$' + NUL needs len + 3 bytes. */
    if (len + 3 > (size_t)REGEX_CAPACITY)
    {
        fprintf(stderr, "Regex too long to anchor (%lu bytes), skipping\n", (unsigned long)len);
        regex[0] = '\0';
        return;
    }
    memmove(regex + 1, regex, len);
    regex[0] = '^';
    regex[len + 1] = '$';
    regex[len + 2] = '\0';
}
/*
 * fixup_line -- normalise one title line in place.
 *
 * Removes a single trailing newline if (and only if) one is present, then
 * replaces every '_' with ' '.  An empty string, or a final line that has no
 * newline, is handled without touching memory outside the string and without
 * losing the last real character.
 */
void fixup_line(char *line)
{
    size_t len = strlen(line);
    char *p;

    /* Cut off the trailing newline */
    if (len > 0 && line[len - 1] == '\n')
    {
        line[len - 1] = '\0';
    }

    /* Single pass instead of rescanning from the start for every underscore. */
    for (p = line; *p != '\0'; p++)
    {
        if (*p == '_')
        {
            *p = ' ';
        }
    }
}
/*
 * Run one compiled regex against one subject string.  On a match, print
 * "* subject :: regex" to stdout and bump *matches; on a PCRE error other
 * than "no match", report the error code on stderr.
 */
static void test_subject(const pcre *compiled, const char *subject, const char *regex, int *matches)
{
    int ovector[300];
    int result;

    result = pcre_exec(compiled, NULL, subject, (int)strlen(subject), 0, 0, ovector, 300);
    if (result >= 0)
    {
        printf("* %s :: %s\n", subject, regex);
        *matches += 1;
    }
    else if (result != PCRE_ERROR_NOMATCH)
    {
        fprintf(stderr, "Error: %d\n", result);
    }
}

/*
 * Reads blacklist entries from stdin, one per line.  Each entry is
 * preprocessed, compiled (caselessly unless marked casesensitive) and, unless
 * it is marked newaccountonly, tested against every command-line argument:
 * an argument that stat() accepts is read as a file of titles (one per
 * line), anything else is tested as a literal title.  Matches go to stdout,
 * progress and diagnostics to stderr.  Always returns 0.
 */
int main(int argc, char *argv[])
{
    char regex[4096];
    char line[1024];
    struct stat dummy;
    int lines = 0;

    /* Read the regexes in from stdin.  Looping on fgets() rather than feof()
     * avoids processing the last entry a second time at end of input. */
    while (fgets(regex, sizeof regex, stdin) != NULL)
    {
        int casesensitive = 0;
        int newaccountonly = 0;
        int matches = 0;
        int offset;
        int i;
        const char *errptr;
        pcre *comp_regex;

        /* Preprocess */
        preprocess_regex(regex, &casesensitive, &newaccountonly);
        if (regex[0] == '\0')
        {
            continue; /* blank or comment-only line */
        }

        fprintf(stderr, "Testing /%s/%c now\n", regex, casesensitive?' ':'i');
        comp_regex = pcre_compile(regex, PCRE_UTF8|(casesensitive?0:PCRE_CASELESS), &errptr, &offset, NULL);
        if (NULL == comp_regex)
        {
            fprintf(stderr, "Compile failed: %d %s\n", offset, errptr);
        }
        else
        {
            if (!newaccountonly)
            {
                /* Test */
                for (i = 1; i < argc; i++)
                {
                    if (stat(argv[i], &dummy) == 0)
                    {
                        /* If it's a file, test each of its lines */
                        FILE *infile = fopen(argv[i], "r");

                        if (infile == NULL)
                        {
                            perror(argv[i]);
                            continue;
                        }
                        while (fgets(line, sizeof line, infile) != NULL)
                        {
                            lines += 1;
                            fixup_line(line);
                            test_subject(comp_regex, line, regex, &matches);
                            if ((lines % 100000) == 0)
                            {
                                fprintf(stderr, "Lines: %d \r", lines);
                            }
                        }
                        fclose(infile);
                    }
                    else
                    {
                        /* Otherwise, test as a literal */
                        lines += 1;
                        test_subject(comp_regex, argv[i], regex, &matches);
                    }
                }
            }
            /* Release the compiled pattern before moving to the next entry. */
            pcre_free(comp_regex);
        }
        fprintf(stderr, "Matches: %d\n", matches);
    }

    return 0;
}