Wikipedia:Database reports/Articles containing invalid template parameters/Configuration

bullshitparams.py

#! /usr/bin/env python
# Public domain; MZMcBride; 2011

import datetime

import codecs

import re

import MySQLdb

import wikitools

import settings

def get_target_templates_list():
    """Return the names of the templates whose transclusions are audited.

    Each name is later used verbatim as a ``tl_title`` value and appended
    to ``'Template:'`` to load the template page, so it is written without
    the namespace prefix and with underscores instead of spaces.
    """
    return ['Infobox_officeholder']

def get_template_parameters_from_template(template, template_text=None):
    """Collect the parameter names that a template's source code reads.

    Two kinds of names are recognised:

    * plain references such as ``{{{name|}}}`` or ``{{{name}}}``;
    * names assembled with a conditional, i.e.
      ``prefix{{#if:{{{x|}}}|A|B}}suffix``, which contributes both
      ``prefixAsuffix`` and ``prefixBsuffix``.

    Args:
        template: Template name without the ``Template:`` prefix.
        template_text: Optional wikitext of the template.  When omitted
            (the default) the text is fetched from the wiki through the
            module-level ``wiki`` session.

    Returns:
        A set of parameter names with surrounding whitespace stripped.
    """
    if template_text is None:
        template_text = wikitools.Page(wiki, 'Template:'+template).getWikiText()

    # Characters accepted inside a parameter name, with and without the
    # space character.
    legal_chars = r'[ %!"$&\'()*,\-.0-9:;?@A-Z^_`a-z~\x80-\xFF]'
    legal_chars_spaceless = r'[%!"$&\'()*,\-.0-9:;?@A-Z^_`a-z~\x80-\xFF]'

    # prefix {{#if:{{{x|}}}| then-part | else-part }} suffix
    dynamic_parameter_re = re.compile(
        r'(' + legal_chars_spaceless + r'+)'
        r'\{\{#if:\{\{\{(' + legal_chars + r'+)\|\}\}\}'
        r'\|(' + legal_chars + r'*)'
        r'\|(' + legal_chars + r'*)'
        r'\}\}(' + legal_chars + r'+)'
    )
    # {{{name|   or   {{{name}
    parameter_re = re.compile(
        r'\{\{\{(' + legal_chars + r'+)(\||\})',
        re.I|re.MULTILINE
    )

    template_parameters = set()

    for match in dynamic_parameter_re.finditer(template_text):
        prefix = match.group(1)
        suffix = match.group(5)
        # Both branches of the #if yield a valid parameter name.
        template_parameters.add(prefix + match.group(3) + suffix)
        template_parameters.add(prefix + match.group(4) + suffix)

    for match in parameter_re.finditer(template_text):
        template_parameters.add(match.group(1).strip())

    return template_parameters

def get_articles_list(cursor, template):
    """Return the titles of non-redirect articles that transclude a template.

    Args:
        cursor: An open DB-API cursor on the wiki database.
        template: Template title as stored in ``tl_title`` (no namespace
            prefix).

    Returns:
        A list of page titles as text strings, in the order the database
        returned them.
    """
    # The parameters are passed as a 1-tuple: a bare string is a sequence
    # too, and DB-API drivers may treat each character as one parameter.
    cursor.execute('''
    /* bullshitparams.py SLOW_OK */
    SELECT
      page_title
    FROM page
    JOIN templatelinks
    ON tl_from = page_id
    WHERE tl_namespace = 10
    AND tl_title = %s
    AND page_namespace = 0
    AND page_is_redirect = 0;
    ''', (template,))

    articles_list = []
    for row in cursor.fetchall():
        title = row[0]
        # Decode only raw byte strings; a driver configured to return
        # text already hands back decoded titles.
        if isinstance(title, bytes):
            title = title.decode('utf-8')
        articles_list.append(title)
    return articles_list

def get_template_parameters_from_article(article, templates, article_text=None):
    """Collect the parameter names an article passes to the given templates.

    For each template, the first transclusion in the article is located,
    its extent is determined by balancing ``{{`` against ``}}``, nested
    template calls are removed, and every ``| name =`` inside what remains
    is recorded.

    Args:
        article: Title of the page to inspect.
        templates: Iterable of template names (underscores match any run
            of whitespace/underscores; matching is case-insensitive).
        article_text: Optional wikitext of the article.  When omitted (the
            default) the text is fetched from the wiki through the
            module-level ``wiki`` session.

    Returns:
        A set of parameter names with surrounding whitespace stripped.
        Templates that are absent, or whose braces never balance, add
        nothing.
    """
    if article_text is None:
        article_text = wikitools.Page(wiki, article).getWikiText()

    flags = re.I|re.MULTILINE
    inner_template_re = re.compile(r'\{\{[^}]+\}\}', flags)
    parameter_re = re.compile(r'\|\s*([ %!"$&\'()*,\-.0-9:;?@A-Z^_`a-z~\x80-\xFF]+)\s*=', flags)
    # Braces are consumed two at a time; a third brace of a ``{{{``/``}}}``
    # run is simply left over, which keeps the open/close counts paired.
    brace_re = re.compile(r'\{\{|\}\}')

    article_parameters = set()

    for template in templates:
        # Escape each piece of the name so characters such as parentheses
        # are matched literally, while underscores stay flexible.
        name_pattern = r'[\s_]*'.join(re.escape(part) for part in template.split('_'))
        template_re = re.compile(r'\{\{\s*%s\s*(.*?)\}\}' % name_pattern, flags|re.DOTALL)

        found = template_re.search(article_text)
        if not found:
            continue

        # Walk the braces from the template's opening ``{{`` until the
        # nesting depth returns to zero: that is the matching ``}}``.
        remainder = article_text[found.start():]
        template_content = None
        depth = 0
        for brace in brace_re.finditer(remainder):
            if brace.group(0) == '{{':
                depth += 1
            else:
                depth -= 1
            if depth == 0:
                template_content = remainder[:brace.end()]
                break

        if template_content is None:
            # Unbalanced braces: the template's extent is unknown, so do
            # not guess at its parameters.
            continue

        # Drop nested template calls so their parameters are not
        # attributed to the outer template.  The first two characters are
        # skipped so the outer ``{{`` itself cannot start a match.
        for inner in inner_template_re.finditer(template_content[2:]):
            template_content = template_content.replace(inner.group(0), '')

        for parameter in parameter_re.finditer(template_content):
            article_parameters.add(parameter.group(1).strip())

    return article_parameters

# Title of the wiki page that receives the generated report.
report_title = settings.rootpage + 'Articles containing bullshit template parameters'

# Wikitext skeleton of the report.  The first %s is the "data as of"
# timestamp, the second the table rows; each row ends with its own "|-",
# so the header needs a "|-" before the rows and the table a closing "|}".
report_template = u'''\
Articles containing bullshit template parameters (limited to approximately \
the first 1000 entries); data as of %s.

{| class="wikitable sortable plainlinks" style="width:100%%; margin:auto;"
|- style="white-space:nowrap;"
! No.
! Page
! Parameter
|-
%s
|}
'''

# API session shared by the page reads above and the final report edit.
wiki = wikitools.Wiki(settings.apiurl)
wiki.setMaxlag(-1)
wiki.login(settings.username, settings.password)

# Database connection; remaining client options come from the MySQL
# option file in the home directory.
conn = MySQLdb.connect(
    host=settings.host,
    db=settings.dbname,
    read_default_file='~/.my.cnf'
)
cursor = conn.cursor()

target_templates = get_target_templates_list()

# [article title, parameter name] pairs destined for the report.
bullshit_parameters = []

# Articles found clean on an earlier run are listed one per line in this
# file and are not fetched again.
reviewed_path = '%sbullshit-reviewed-page-titles.txt' % settings.path

with codecs.open(reviewed_path, 'r', 'utf-8') as f:
    reviewed_page_titles = f.read()
reviewed_page_titles_list = reviewed_page_titles.split('\n')
# Exact-title lookup.  Testing against the raw file contents would be a
# substring match and would wrongly skip "Foo" once "Foobar" is listed.
reviewed_page_titles_set = set(reviewed_page_titles_list)

# The report is capped at roughly 1000 rows; the cap is only checked
# between articles, so the last article may push the total slightly over.
count = 1

with codecs.open(reviewed_path, 'a', 'utf-8') as g:
    for template in target_templates:
        if count > 1000:
            break
        articles_list = get_articles_list(cursor, template)
        template_parameters = get_template_parameters_from_template(template)
        for article in articles_list:
            if count > 1000:
                break
            if article in reviewed_page_titles_set:
                continue
            # Only the current template is inspected: its parameters are
            # the only ones template_parameters can validate.
            article_parameters = get_template_parameters_from_article(article, [template])
            unknown_parameters = sorted(article_parameters - template_parameters)
            for unknown_parameter in unknown_parameters:
                bullshit_parameters.append([article, unknown_parameter])
                count += 1
            if not unknown_parameters:
                # Nothing to report: remember the article as reviewed.
                g.write(article+'\n')

# Render one wikitable row per [article, parameter] pair, numbered from 1.
output = []
for i, bullshit_parameter in enumerate(bullshit_parameters, 1):
    page_title = u'{{dbr link|1='+bullshit_parameter[0].replace('_', ' ')+u'}}'
    parameter = bullshit_parameter[1]
    # NOTE(review): parameter names come from the fetched wikitext, which
    # may be a UTF-8 byte string; decode so the unicode row template below
    # does not fail on non-ASCII names.
    if isinstance(parameter, bytes):
        parameter = parameter.decode('utf-8', 'replace')
    table_row = u'''| %d
| %s
| %s
|-''' % (i, page_title, parameter)
    output.append(table_row)

try:
    # Age, in seconds, of the newest recentchanges row; used as an
    # estimate of how far the database copy lags behind.
    cursor.execute('''
    SELECT
      UNIX_TIMESTAMP() - UNIX_TIMESTAMP(rc_timestamp)
    FROM recentchanges
    ORDER BY rc_timestamp DESC
    LIMIT 1;
    ''')
    # Coerce to int: the driver may hand back a Decimal, which timedelta
    # does not accept.
    rep_lag = int(cursor.fetchone()[0])

    # "Data as of" moment: now, minus the estimated lag.
    time_diff = datetime.datetime.utcnow() - datetime.timedelta(seconds=rep_lag)
    current_of = time_diff.strftime('%H:%M, %d %B %Y (UTC)')

    report = wikitools.Page(wiki, report_title)
    report_text = report_template % (current_of, '\n'.join(output))
    report_text = report_text.encode('utf-8')
    report.edit(report_text, summary=settings.editsumm, bot=1)
finally:
    # Release the database resources even when the query or edit fails.
    cursor.close()
    conn.close()

crontab

50 2 * * 6 PYTHONPATH=$HOME/scripts python $HOME/scripts/database-reports/bullshitparams.py > /dev/null