# Source: [[User:Gdr/yearbot.py]] (English Wikipedia user-space script page)

#!/usr/bin/python

# YEARBOT.PY -- POPULATE BIRTHS/DEATHS IN YEAR
# Gdr, 2005-05-14
# Minor updates: User:Docu, 2006-12-17
#
# INTRODUCTION
#
# This script assists with the population of the "Births" and "Deaths"
# sections of an article about a year in the English wikipedia, using
# articles in Category:<year> births and Category:<year> deaths.
#
# USAGE
#
# See User:Gdr/Yearbot.  Requires User:Gdr/history.py.
#
# DATA STRUCTURES
#
# An entry is a dictionary with these fields:
#
#     article   Name of article.
#     bdate     Date of birth, as a pair like ('April 17', '0417').
#     byear     Birth year, as a string like '1543'.
#     ddate     Date of death, as a pair like ('September 23', '0923').
#     dyear     Death year, as a string like '1602'.
#     exclude   1 if article is to be excluded from the page.
#     intro     Introductory paragraph of article, if any is found.
#     pagelink  wikipedia.Page object referring to article.
#     post      String placed after the article link.
#     pre       String placed before the article link.
#     sort      Sort key, if any.
#     desc      Description extracted from article (used as text for
#               'post' if entry is new).
#
# LICENCE
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.

import catlib

import getopt

import history

import re

import sys

import wikipedia

class Year:
    # Robot-assisted editor for one year page's Births/Deaths sections.

    # The wiki to operate on.
    site = wikipedia.Site('en')

    # List of regexp search-and-replace patterns that should be applied
    # to all descriptions.
    patterns = []

    # The year we are working on, its Page, and the original text.
    year = None
    year_pl = None
    year_orig = None
    year_text = None

    # Article titles to skip when scanning category members.
    ignore = {
        'Special:Categories': 1,
    }

    # Matches a regexp pattern of the form /search/replace/ (the two
    # groups are the search and replace texts; backslash escapes are
    # allowed inside each).
    pattern_re = r'/((?:[^\\]|\\.)*)/((?:[^\\]|\\.)*)/$'

    # File to store patterns.
    pattern_file = 'yearbot-patterns'

def __init__(self, year):
    """Set up for editing 'year' (a string like '1543') and load any
    saved search-and-replace patterns from self.pattern_file."""
    # Warn (but continue) if the argument doesn't look like a year in
    # the supported range.
    if not re.match(r'^' + self.year_re + r'$', year):
        wikipedia.output(u"%s doesn't look like a year" % year)
    self.year = year
    self.year_pl = wikipedia.Page(self.site, self.year)
    self.patterns = []
    # BUG FIX: the old code tested 'if f:' after opening, but open()
    # never returns a falsy value -- a missing pattern file raises
    # IOError instead, which used to crash the constructor.
    try:
        f = open(self.pattern_file)
    except IOError:
        return
    try:
        for line in f:
            m = re.match(self.pattern_re, line)
            if m:
                self.patterns.append(m.groups())
    finally:
        f.close()

# Matches a year in the range for which the script operates.
year_re = r'1[0-9][0-9][0-9]'

# Matches a trailing birth date, e.g. "(b. [[1543]])" or "(born 1543)".
trail_born_re = re.compile(r'^(.*\S)\s*\(b(?:\.|orn)\s*\[?\[?('
                           + year_re + r')\]?\]?\)$')

# Matches a trailing death date, e.g. "(d. [[1602]])" or "(died 1602)".
trail_died_re = re.compile(r'^(.*\S)\s*\(d(?:\.|ied)\s*\[?\[?('
                           + year_re + r')\]?\]?\)$')

# Matches a month name.
month_re = (r'January|February|March|April|May|June|'
            r'July|August|September|October|November|December')

# Matches a date, optionally wiki-linked; groups 1/2 are month/day for
# the "Month day" order, groups 3/4 are day/month for "day Month".
date_re = (r'\[?\[?(?:(' + month_re + r')\s+([0-9]+)|([0-9]+)\s*('
           + month_re + r'))\]?\]?')

# Matches an entry starting with a date; group 5 is the remainder of
# the entry after the date and optional dash.
entry_date_re = re.compile(r'^\s*' + date_re
                           + r'\s*(?:-|–|—)?\s*(.*)$')

# Matches an entry: captures pre, article, linktext, post.
entry_re = re.compile(r'([^\[]*)\[\[([^\]|]+)(?:\|([^|\]]*))?\]\](.*)')

# Matches the introductory paragraph of an article, once filled in
# with birth year and death year.
intro1_re = r"^.*[^']+(.*?)\[?\[?%s\]?\]?(.*?)\[?\[?%s\]?\]?\W*(.*)$"
intro2_re = r"^.*[^']+[^\(]*\([^\)]+\)(.*)$"

# Matches a description: an optional article word, then a run of text
# (plain characters or complete wiki-links) up to the first punctuation.
# BUG FIX: the wiki-link alternative was garbled to r'\[\^\+\]\]'
# (apparently by the wiki paste); restored to match a complete link.
desc_re = r'\s+(?:(?:the|an?)\s+)?(([^,.!?\[]|\[\[[^\]]+\]\])+)[,.!?]'
desc1_re = re.compile(r'\)\s*was' + desc_re)
desc2_re = re.compile(r'\),' + desc_re)
desc3_re = re.compile(r'\s+was' + desc_re)
desc4_re = re.compile(r',' + desc_re)

# Matches a piped wiki-link, keeping the link text in group 1.
# BUG FIX: was r'\[\\+\|([^|\]]+)\]\]' (mangled -- it required literal
# backslashes and could never match a real link); restored.
link1_re = re.compile(r'\[\[[^|\]]+\|([^|\]]+)\]\]')

# Matches a plain wiki-link, keeping the target in group 1.
link2_re = re.compile(r'\[\[([^|\]]+)\]\]')

# Approximate date?  Matches "c.", "ca." or "circa".
approx_re = re.compile(r'\bc(?:\.|a\.|irca)')

def save_patterns(self):
    """Write the accumulated search-and-replace patterns to
    self.pattern_file, one '/search/replace/' per line."""
    # BUG FIX: the old 'if f: ... else: print' error branch was
    # unreachable, because open() raises IOError on failure rather
    # than returning a falsy value.  Report the failure instead.
    try:
        f = open(self.pattern_file, 'w')
    except IOError:
        sys.stdout.write("Couldn't write %s\n" % self.pattern_file)
        return
    try:
        for p in self.patterns:
            f.write(u'/%s/%s/\n' % (p[0], p[1]))
    finally:
        f.close()

def apply_patterns(self):
    """Run every stored search-and-replace pattern over each entry's
    'post' text, or over its 'desc' text if it has no 'post'."""
    for entries in self.topic_entries.values():
        for entry in entries:
            for search, replace in self.patterns:
                if 'post' in entry:
                    entry['post'] = re.sub(search, replace, entry['post'])
                elif 'desc' in entry:
                    entry['desc'] = re.sub(search, replace, entry['desc'])

def unwikify(self, text):
    """Strip wiki-link markup from text: piped links keep their link
    text, plain links keep their target."""
    for link_pattern in (self.link1_re, self.link2_re):
        text = link_pattern.sub(r'\1', text)
    return text

def make_date(self, m):
    """Turn a date_re match into a pair like ('April 17', '0417'):
    display text plus a zero-padded MMDD sort key."""
    # date_re captures either (month, day) or (day, month) order.
    month = m.group(1) or m.group(4)
    day = m.group(2) or m.group(3)
    label = '%s %s' % (month, day)
    sort_key = '%02d%02d' % (history.months[month], int(day))
    return (label, sort_key)

def parse_entries(self, what):
    """Parse the ==Births== (or ==Deaths==) section of the year page.

    what -- 'births' or 'deaths' (lower-case section name).

    Returns a list of entry dictionaries (see the file header for the
    fields).  Non-blank lines that cannot be parsed are reported and
    skipped.
    """
    # Grab the run of list items (and blank lines) under the heading.
    m = re.search(r'==\s*' + what.capitalize()
                  + '\s*==\n((?:\s*\n|\*.*\n)*)',
                  self.year_pl.get())
    if not m:
        print "No ==%s==" % what.capitalize()
        return []
    lines = re.split(r'\s*\n\s*', m.group(1))
    entries = []
    for line_orig in lines:
        entry = {}
        # Strip the leading '*' bullet.
        line = re.sub(r'^\*\s*', '', line_orig)
        # Leading date, e.g. "April 17 - ...".
        m = self.entry_date_re.match(line)
        if m:
            date = self.make_date(m)
            # The date means birth or death depending on the section.
            if what == 'births':
                entry['bdate'] = date
            elif what == 'deaths':
                entry['ddate'] = date
            else:
                entry['?date'] = date
            # Group 5 of entry_date_re is the rest of the line.
            line = m.group(5)
        # Trailing "(b. 1543)" gives the birth year.
        m = self.trail_born_re.match(line)
        if m:
            entry['byear'] = m.group(2)
            line = m.group(1)
        # Trailing "(d. 1602)" gives the death year.
        m = self.trail_died_re.match(line)
        if m:
            entry['dyear'] = m.group(2)
            line = m.group(1)
        # The wiki-link to the article, with surrounding text.
        m = self.entry_re.match(line)
        if m:
            entry['pre'] = m.group(1)
            entry['article'] = m.group(2)
            if m.group(3):
                entry['linktext'] = m.group(3)
            entry['post'] = m.group(4)
            entries.append(entry)
        elif not re.match(r'^\s*$', line_orig):
            wikipedia.output(u"Couldn't parse %s" % line_orig)
    return entries

def check_entry(self, entry, key, what, value):
    """Set entry[key] to value, unless value is None.

    If the entry already has a different value for key, warn that the
    old value ('what' names the field in the message) is discarded.
    """
    # Guard clause instead of '!= None' (identity test is the idiom).
    if value is None:
        return
    if key in entry and entry[key] != value:
        wikipedia.output(u"%s '%s' fails to match '%s'; "
                         u"discarding the former."
                         % (what, entry[key], value))
    entry[key] = value

def parse_article(self, entry, what, entries = {}):

intro = None

try:

text = entry['pagelink'].get()

except wikipedia.IsRedirectPage, arg:

return

except wikipedia.NoPage:

return

# Look for {{lived}} template.

m = re.search(r'{{lived|\s*b\s*=\s*(' + self.year_re

+ r')\s*|\s*d\s*=\s*(' + self.year_re

+ r')\s*|\s*key\s*=\s*(.*)}}', text)

if m:

self.check_entry(entry, 'byear', 'birth year', m.group(1))

self.check_entry(entry, 'dyear', 'death year', m.group(2))

self.check_entry(entry, 'sortkey', 'sort key', m.group(3))

else:

# Get birth year from category, if possible.

m = re.search(r'\[\[[Cc]ategory:(' + self.year_re

+ ') births(?:\|([^|\]]+))?\]\]', text)

if m:

self.check_entry(entry, 'byear', 'birth year', m.group(1))

self.check_entry(entry, 'sortkey', 'sort key', m.group(2))

else:

wikipedia.output(u"%s has no Category:births"

% entry['article'])

# Get death year from category, if possible.

m = re.search(r'\[\[[Cc]ategory:(' + self.year_re

+ ') deaths(?:\|([^|\]]+))?\]\]', text)

if m:

self.check_entry(entry, 'dyear', 'death year', m.group(1))

self.check_entry(entry, 'sortkey', 'sort key', m.group(2))

else:

wikipedia.output(u"%s has no Category:deaths"

% entry['article'])

# Find introductory paragraph.

m = re.search(self.intro1_re % (entry.get('byear') or self.year_re,

entry.get('dyear') or self.year_re),

text, re.M)

if m:

entry['intro'] = m.group(0)

intro = m.group(3)

# Birth date available in intro?

mm = re.search(self.date_re, m.group(1))

if mm:

self.check_entry(entry, 'bdate', 'birth date',

self.make_date(mm))

# Birth date approximate?

if self.approx_re.search(m.group(1)) and what == 'births':

entry['exclude'] = True

# Death date available in intro?

mm = re.search(self.date_re, m.group(2))

if mm:

self.check_entry(entry, 'ddate', 'death date',

self.make_date(mm))

# Death date approximate?

if self.approx_re.search(m.group(2)) and what == 'deaths':

entry['exclude'] = True

else:

m = re.search(self.intro2_re, text, re.M)

if m:

entry['intro'] = m.group(0)

intro = m.group(1)

else:

# Use first line instead.

entry['intro'] = text.split('\n')[0]

# Brief description available?

mm = None

if intro:

mm = (self.desc3_re.match(intro)

or self.desc4_re.match(intro))

mm = (mm or self.desc1_re.search(entry['intro'])

or self.desc2_re.search(entry['intro'])

or self.desc3_re.search(entry['intro'])

or self.desc4_re.search(entry['intro']))

if mm:

entry['desc'] = self.unwikify(mm.group(1))

def get_entries(self, what):
    """Collect all entries for the 'what' section.

    Parses the existing section of the year page, adds any articles
    found in Category:<year> <what> that are not yet listed, batch
    fetches the article texts, resolves redirects, and parses each
    article for dates and a description.

    Returns a list of entry dictionaries keyed work done in place.
    """
    # Get entries from the section of the year page.
    entries = self.parse_entries(what)
    article_entry = {}
    for entry in entries:
        article_entry[entry['article']] = entry
    # Get lists of births and deaths articles for this year.
    cl = catlib.Category(self.site, '%s %s' % (self.year, what))
    for a in cl.articles():
        # Skip ignored titles and articles already on the page.
        if (not self.ignore.has_key(a.title())
            and not article_entry.has_key(a.title())):
            e = {'article': a.title()}
            article_entry[a.title()] = e
    # Get them all (one batched fetch of the article texts).
    for e in article_entry.values():
        e['pagelink'] = wikipedia.Page(self.site, e['article'])
    wikipedia.getall(self.site, map(lambda e: e['pagelink'],
                                    article_entry.values()))
    # Merge redirects.  (values() is a list here, so deleting keys
    # during the loop is safe.)
    for e in article_entry.values():
        try:
            text = e['pagelink'].get()
        except wikipedia.IsRedirectPage, arg:
            pl = wikipedia.Page(self.site, arg.args[0])
            redir = pl.title()
            wikipedia.output("%s redirects to %s" % (e['article'], redir))
            if article_entry.has_key(redir):
                # Target already listed: keep its page object and drop
                # the duplicate entry for the target title.
                e['pagelink'] = article_entry[redir]['pagelink']
                del article_entry[redir]
            else:
                # Re-key this entry under the redirect target.
                e['pagelink'] = pl
                del article_entry[e['article']]
                article_entry[redir] = e
                e['article'] = redir
        except wikipedia.NoPage:
            continue
    # Parse articles.
    for e in article_entry.values():
        self.parse_article(e, what)
    return article_entry.values()

def guess_sortkey(self, article):
    """Guess a 'Surname, Forenames' sort key from an article title.
    Single-word titles are returned unchanged."""
    forenames, sep, surname = article.rpartition(u' ')
    if not sep:
        return article
    return surname + u', ' + forenames

def sort_entries(self, entries, what):
    """Sort entries in place: dated entries by their MMDD date key,
    the rest by explicit sort key or one guessed from the title."""
    date_field = {'births': 'bdate', 'deaths': 'ddate'}.get(what)
    for entry in entries:
        key = None
        if date_field is not None and date_field in entry:
            key = entry[date_field][1]
        if not key:
            key = (entry.get('sortkey')
                   or self.guess_sortkey(entry['article']))
        entry['sort'] = key
    entries.sort(key=lambda entry: entry['sort'])

def format_entry(self, entry, what):
    """Format one entry as a wiki list item, e.g.
    "* April 17 - [[Foo]], painter (died 1602)".

    Excluded entries get a '- ' marker instead of the '* ' bullet.
    BUG FIX: the wiki-link brackets were eaten out of the format
    strings by the wiki paste (u'%s' % (a, b) is even a TypeError);
    the [[...]] markup is restored below.
    """
    if entry.get('exclude'):
        t = u'- '
    else:
        t = u'* '
    # Leading date of the kind this section is about.
    if what == 'births' and 'bdate' in entry:
        t = t + u'%s - ' % entry['bdate'][0]
    elif what == 'deaths' and 'ddate' in entry:
        t = t + u'%s - ' % entry['ddate'][0]
    t = t + (entry.get('pre') or u'')
    # The article link itself.
    if 'linktext' in entry:
        t = t + u'[[%s|%s]]' % (entry['article'], entry['linktext'])
    elif entry['article'][-1] == ')':
        # Pipe trick: hide the parenthesized disambiguator.
        t = t + u'[[%s|]]' % entry['article']
    else:
        t = t + u'[[%s]]' % entry['article']
    # Trailing description.
    if 'post' in entry:
        t = t + entry['post']
    elif 'desc' in entry:
        t = t + u', ' + entry['desc']
    # Cross-reference the other life date.
    if what == 'births' and 'dyear' in entry:
        t = t + u' (died %s)' % entry['dyear']
    elif what == 'deaths' and 'byear' in entry:
        t = t + u' (born %s)' % entry['byear']
    return t

def write_entries(self, entries, what):
    """Return the year page text with the ==What== section's list
    replaced by the formatted, non-excluded entries (newline-joined,
    followed by a blank line).

    Returns "" if the heading cannot be found.  Caches the page text
    in self.year_text so successive sections build on each other.
    """
    if not self.year_text:
        self.year_text = self.year_pl.get()
    text = self.year_text
    # Locate the run of list items under the heading.
    m = re.search(r'==\s*' + what.capitalize()
                  + '\s*==\n((?:\s*\n|\*.*\n)*)',
                  text)
    if not m:
        print "No ==%s==" % what.capitalize()
        return ""
    return (text[:m.start(1)]
            + u'\n'.join(map(lambda e: self.format_entry(e, what),
                             filter(lambda e: not e.get('exclude'),
                                    entries)))
            + u'\n\n'
            + text[m.end(1):])

# Text shown by the 'h' command in interface().
# BUG FIX: the angle-bracket placeholders (e.g. <n>) were eaten by the
# wiki paste (treated as HTML); reconstructed from the command parsing
# in interface().
help_text = u"""
h - Help
l - List entries
v - Preview changes to the page
s - Save changes to the page
q - Quit
/<old>/<new>/ - Edit all entries and save pattern in file
<n>p - Print entry <n>
<n>i - Print introductory paragraph for entry <n>
<n>t - Print whole article text for entry <n>
<n>x - Exclude entry <n> (or include if already excluded)
<n>d:<desc> - Update description for entry <n>
<n>d<m> - Cut description for entry <n> to <m> words
<n>P:<text> - Update prefix text for entry <n>
<n>/<old>/<new>/ - Edit entry <n> using regexp search-and-replace
"""

def show_entries(self, title, entries, what):
    """Print a banner for 'title', then the entries as a sorted,
    numbered (1-based) list."""
    wikipedia.output(u'------- %s -------' % title)
    self.sort_entries(entries, what)
    for index, entry in enumerate(entries):
        wikipedia.output(u"%d%s" % (index + 1,
                                    self.format_entry(entry, what)))

def interface(self, title, entries, what):
    """Interactive command loop over one batch of entries.

    Returns True if the user asked to save ('s'/'w'), False to quit
    ('q').  See help_text for the command set; entry numbers are
    1-based indexes into 'entries'.
    """
    self.show_entries(title, entries, what)
    while 1:
        inp = wikipedia.input(u"-- What now? [hlqs0-9pdtx]")
        # <n><op><m>, e.g. "3d10" -- cut entry 3's description.
        m1 = re.match(r'^\s*([0-9]+)\s*([A-Za-z])\s*([0-9]+)$', inp)
        # <n><op>[:text], e.g. "3p", "3x", "3d:painter".
        m2 = re.match(r'^\s*([0-9]+)\s*([A-Za-z])\s*(:.*)?$', inp)
        # <n>/search/replace/ -- one-off edit to entry <n>.
        m3 = re.match(r'^\s*([0-9]+)\s*' + self.pattern_re, inp)
        # /search/replace/ -- record a global pattern.
        m4 = re.match(r'^\s*' + self.pattern_re, inp)
        if inp == 'l':
            self.show_entries(title, entries, what)
        elif inp == 'q':
            return False
        elif inp == 's' or inp == 'w':
            return True
        elif inp == 'h':
            wikipedia.output(self.help_text)
        elif m1:
            n = int(m1.group(1))
            op = m1.group(2)
            n2 = int(m1.group(3))
            if n < 1 or len(entries) < n:
                wikipedia.output(u"No entry %d (must be 1-%d)"
                                 % (n, len(entries)))
            elif op == 'd':
                # Truncate the description to n2 words (the +1 allows
                # for the leading ', ' token).
                desc = (entries[n-1].get('post')
                        or entries[n-1].has_key('desc')
                        and u', ' + entries[n-1]['desc'] or '')
                entries[n-1]['post'] = ' '.join(desc.split(' ')[:n2 + 1])
                wikipedia.output(u"%d%s" % (n, self.format_entry(entries[n-1], what)))
            else:
                wikipedia.output(u"Not understood: %s" % inp)
        elif m2:
            n = int(m2.group(1))
            op = m2.group(2)
            if n < 1 or len(entries) < n:
                wikipedia.output(u"No entry %d (must be 1-%d)"
                                 % (n, len(entries)))
            elif op == 'p':
                # Dump the entry's fields.
                for k, v in entries[n-1].items():
                    wikipedia.output(u' %s: %s' % (k, v))
            elif op == 'd':
                if m2.group(3) and 2 <= len(m2.group(3)):
                    # Replace the description with the given text.
                    entries[n-1]['post'] = u', ' + m2.group(3)[1:]
                    wikipedia.output(u"%d%s" % (n, self.format_entry(entries[n-1], what)))
                else:
                    # No text given: clear the description.
                    entries[n-1]['post'] = ''
                    wikipedia.output(u"%d%s" % (n, self.format_entry(entries[n-1], what)))
            elif op == 'P':
                # Replace the text before the link.
                # NOTE(review): assumes the ':text' part was given;
                # a bare "<n>P" would make group(3) None -- confirm.
                entries[n-1]['pre'] = m2.group(3)[1:]
                wikipedia.output(u"%d%s" % (n, self.format_entry(entries[n-1], what)))
            elif op == 't':
                # Show the whole article text.
                try:
                    wikipedia.output(entries[n-1]['pagelink'].get())
                except:
                    wikipedia.output(u"No page %s" % entries[n-1]['pagelink'].title())
            elif op == 'i':
                wikipedia.output(entries[n-1].get('intro', u'No intro'))
            elif op == 'x':
                # Toggle exclusion.
                entries[n-1]['exclude'] = not entries[n-1].get('exclude')
                wikipedia.output(u"%d%s" % (n, self.format_entry(entries[n-1], what)))
            else:
                wikipedia.output(u"Not understood: %s" % inp)
        elif m3:
            n = int(m3.group(1))
            if n < 1 or len(entries) < n:
                wikipedia.output(u"No entry %d (must be 1-%d)"
                                 % (n, len(entries)))
            else:
                # One-off search-and-replace on one entry's description.
                desc = (entries[n-1].get('post')
                        or entries[n-1].has_key('desc')
                        and u', ' + entries[n-1]['desc'] or '')
                entries[n-1]['post'] = re.sub(m3.group(2), m3.group(3), desc)
                wikipedia.output(u"%d%s" % (n, self.format_entry(entries[n-1], what)))
        elif m4:
            # Record a global pattern, persist it, and apply everywhere.
            self.patterns.append((m4.group(1), m4.group(2)))
            self.save_patterns()
            self.apply_patterns()
        else:
            wikipedia.output(u"Not understood: %s" % inp)

# Edit summary used when saving the year page.
comment = "yearbot - robot-assisted updating of births and deaths"

# Sections to process, in page order.
topic_names = ['births', 'deaths']

def run(self):
    """Top level: collect entries for each section, edit them
    interactively in batches of 20, then show a diff and save the
    page if the user confirms."""
    self.topic_entries = {}
    for what in self.topic_names:
        self.topic_entries[what] = self.get_entries(what)
        self.sort_entries(self.topic_entries[what], what)
    self.apply_patterns()
    while 1:
        for what in self.topic_names:
            entries = self.topic_entries[what]
            # Present the entries in batches of 20 (// keeps this
            # integer division under both Python 2 and 3).
            for i in range((len(entries) + 19) // 20):
                efrom = i * 20
                eto = min(len(entries), (i + 1) * 20)
                batch = entries[efrom:eto]
                # BUG FIX: a stray trailing comma used to make title
                # a 1-tuple, so the banner printed a tuple repr.
                title = u'%s (%d-%d)' % (what.capitalize(),
                                         efrom + 1, eto)
                if not self.interface(title, batch, what):
                    return
            self.sort_entries(entries, what)
            self.year_text = self.write_entries(entries, what)
        wikipedia.showDiff(self.year_pl.get(), self.year_text)
        if wikipedia.input(u"OK? [yN]") == 'y':
            self.year_pl.put(self.year_text, self.comment)
            return

if __name__ == '__main__':
    wikipedia.username = 'yearbot'
    try:
        if len(sys.argv) < 2:
            # BUG FIX: 'raise "No year specified"' was a string
            # exception, which is a TypeError on Python >= 2.6;
            # exit with the message instead.
            raise SystemExit("No year specified")
        Year(sys.argv[1]).run()
    finally:
        # Always release the bot framework's throttle/lock.
        wikipedia.stopme()