User:Gdr/yearbot.py
#!/usr/bin/python
- YEARBOT.PY -- POPULATE BIRTHS/DEATHS IN YEAR
- Gdr, 2005-05-14
- Minor updates: User:Docu, 2006-12-17
- INTRODUCTION
- This script assists with the population of the "Births" and "Deaths"
- sections of an article about a year in the English wikipedia, using
- articles in Category:<year> births and Category:<year> deaths.
- USAGE
- See User:Gdr/Yearbot
- requires User:Gdr/history.py
- DATA STRUCTURES
- An entry is a dictionary with these fields:
- article Name of article.
- bdate Date of birth, as a pair like ('April 17', '0417').
- byear Birth year, as string like '1543'
- ddate Date of death, as a pair like ('September 23', '0923').
- dyear Death year, as string like '1602'
- exclude 1 if article is to be excluded from the page.
- intro Introductory paragraph of article, if any is found.
- pagelink wikipedia.Page object referring to article.
- post String placed after the article link.
- pre String placed before the article link.
- sort Sort key, if any.
- desc Description extracted from article (used as text for 'post'
- if entry is new).
- LICENCE
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or (at
- your option) any later version.
import catlib
import getopt
import history
import re
import sys
import wikipedia
class Year:
    """Interactive helper that populates the "Births" and "Deaths"
    sections of a year article on the English Wikipedia.

    Entries are plain dictionaries.  The keys used by this class are:

    article   Name of the article.
    bdate     Date of birth, as a pair like ('April 17', '0417').
    byear     Birth year, as a string like '1543'.
    ddate     Date of death, as a pair like ('September 23', '0923').
    dyear     Death year, as a string like '1602'.
    exclude   True if the article is to be left out of the page.
    intro     Introductory paragraph of the article, if one is found.
    linktext  Link label, when the year page uses a piped link.
    pagelink  wikipedia.Page object referring to the article.
    pre       String placed before the article link.
    post      String placed after the article link.
    desc      Description extracted from the article (used in place of
              'post' when the entry has none).
    sortkey   Sort key taken from the article's template or category.
    sort      Key actually used for ordering (see sort_entries).
    """

    site = wikipedia.Site('en')

    # List of regexp search-and-replace patterns that should be applied
    # to all descriptions.
    patterns = []

    # The year we are working on, its Page, and the original text.
    year = None
    year_pl = None
    year_orig = None
    year_text = None

    # Titles returned by the category listing that are not articles.
    ignore = {
        'Special:Categories': 1,
    }

    # Matches a regexp pattern of the form /search/replace/.
    pattern_re = r'/((?:[^\\]|\\.)*)/((?:[^\\]|\\.)*)/$'

    # File to store patterns.
    pattern_file = 'yearbot-patterns'

    def __init__(self, year):
        """Remember the year, create its Page and load saved patterns.

        year -- the year as a string, e.g. '1543'.  A value that does
                not match year_re only produces a warning, so the bot
                can still be pointed at other pages on a best-effort
                basis.
        """
        import io

        if not re.match(r'^' + self.year_re + r'$', year):
            print("%s doesn't look like a year" % year)
        self.year = year
        self.year_pl = wikipedia.Page(self.site, self.year)
        self.patterns = []
        try:
            f = io.open(self.pattern_file, encoding='utf-8')
        except IOError:
            # No pattern file yet (e.g. first run): start with none.
            return
        with f:
            for line in f:
                m = re.match(self.pattern_re, line)
                if m:
                    self.patterns.append(m.groups())

    # Matches a year in the range for which the script operates.
    year_re = r'1[0-9][0-9][0-9]'

    # Matches a trailing birth date.
    trail_born_re = re.compile(r'^(.*\S)\s*\(b(?:\.|orn)\s*\[?\[?('
                               + year_re + r')\]?\]?\)$')

    # Matches a trailing death date.
    trail_died_re = re.compile(r'^(.*\S)\s*\(d(?:\.|ied)\s*\[?\[?('
                               + year_re + r')\]?\]?\)$')

    # Matches a month name.
    month_re = (r'January|February|March|April|May|June|'
                r'July|August|September|October|November|December')

    # Matches a date.
    date_re = (r'\[?\[?(?:(' + month_re + r')\s+([0-9]+)|([0-9]+)\s*('
               + month_re + r'))\]?\]?')

    # Matches an entry starting with a date.  The separator may be a
    # hyphen, an en/em dash character, or the corresponding HTML entity
    # as it appears in raw wikitext.  The dashes are written as escapes
    # so that the source file stays pure ASCII.
    entry_date_re = re.compile(r'^\s*' + date_re
                               + r'\s*(?:-|' + u'\u2013|\u2014'
                               + r'|&ndash;|&mdash;)?\s*(.*)$')

    # Matches an entry: captures pre, article, linktext, post.
    entry_re = re.compile(r'([^\[]*)\[\[([^\]|]+)(?:\|([^|\]]*))?\]\](.*)')

    # Matches the introductory paragraph of an article, once filled in
    # with birth year and death year.  The paragraph is recognised by
    # the bold ('''...''') rendering of the article subject.
    intro1_re = (r"^.*'''[^']+'''(.*?)\[?\[?%s\]?\]?(.*?)"
                 r"\[?\[?%s\]?\]?\W*(.*)$")
    intro2_re = r"^.*'''[^']+'''[^\(]*\([^\)]+\)(.*)$"

    # Matches description: a run of text (wiki-links allowed) up to the
    # first punctuation mark.
    desc_re = (r'\s+(?:(?:the|an?)\s+)?'
               r'(([^,.!?\[]|\[\[[^\]]+\]\])+)[,.!?]')
    desc1_re = re.compile(r'\)\s*was' + desc_re)
    desc2_re = re.compile(r'\),' + desc_re)
    desc3_re = re.compile(r'\s+was' + desc_re)
    desc4_re = re.compile(r',' + desc_re)

    # Matches wiki-link: piped ([[target|label]]) and plain ([[target]]).
    link1_re = re.compile(r'\[\[[^|\]]+\|([^|\]]+)\]\]')
    link2_re = re.compile(r'\[\[([^|\]]+)\]\]')

    # Approximate date?
    approx_re = re.compile(r'\bc(?:\.|a\.|irca)')

    def save_patterns(self):
        """Write self.patterns to pattern_file, one /search/replace/ per
        line, reporting (not raising) a failure to write."""
        import io

        try:
            with io.open(self.pattern_file, 'w', encoding='utf-8') as f:
                for search, replace in self.patterns:
                    f.write(u'/%s/%s/\n' % (search, replace))
        except IOError:
            print("Couldn't write %s" % self.pattern_file)

    def apply_patterns(self):
        """Apply every saved pattern to each entry's 'post' text, or to
        its 'desc' text when the entry has no 'post'."""
        for entries in self.topic_entries.values():
            for e in entries:
                for search, replace in self.patterns:
                    if 'post' in e:
                        e['post'] = re.sub(search, replace, e['post'])
                    elif 'desc' in e:
                        e['desc'] = re.sub(search, replace, e['desc'])

    def unwikify(self, text):
        """Return text with wiki-links replaced by their visible label."""
        text = self.link1_re.sub(r'\1', text)
        text = self.link2_re.sub(r'\1', text)
        return text

    def make_date(self, m):
        """Turn a date_re match into a ('Month D', 'MMDD') pair."""
        month = m.group(1) or m.group(4)
        day = m.group(2) or m.group(3)
        return ('%s %s' % (month, day),
                '%02d%02d' % (history.months[month], int(day)))

    def _section_re(self, what):
        """Return the regexp that captures the bullet list under the
        ==What== heading (group 1 is the list body)."""
        return (r'==\s*' + what.capitalize()
                + r'\s*==\n((?:\s*\n|\*.*\n)*)')

    def parse_entries(self, what):
        """Parse the existing bullet list of section `what` ('births' or
        'deaths') on the year page into a list of entry dictionaries.

        Lines that contain no wiki-link are reported and skipped.
        Returns [] when the section is missing.
        """
        m = re.search(self._section_re(what), self.year_pl.get())
        if not m:
            print("No ==%s==" % what.capitalize())
            return []
        lines = re.split(r'\s*\n\s*', m.group(1))
        entries = []
        for line_orig in lines:
            entry = {}
            line = re.sub(r'^\*\s*', '', line_orig)
            # Leading date, e.g. "April 17 - ...".
            m = self.entry_date_re.match(line)
            if m:
                date = self.make_date(m)
                if what == 'births':
                    entry['bdate'] = date
                elif what == 'deaths':
                    entry['ddate'] = date
                else:
                    entry['?date'] = date
                line = m.group(5)
            # Trailing "(born YYYY)" / "(died YYYY)".
            m = self.trail_born_re.match(line)
            if m:
                entry['byear'] = m.group(2)
                line = m.group(1)
            m = self.trail_died_re.match(line)
            if m:
                entry['dyear'] = m.group(2)
                line = m.group(1)
            m = self.entry_re.match(line)
            if m:
                entry['pre'] = m.group(1)
                entry['article'] = m.group(2)
                if m.group(3):
                    entry['linktext'] = m.group(3)
                entry['post'] = m.group(4)
                entries.append(entry)
            elif not re.match(r'^\s*$', line_orig):
                wikipedia.output(u"Couldn't parse %s" % line_orig)
        return entries

    def check_entry(self, entry, key, what, value):
        """Store value under entry[key], warning when it replaces a
        different existing value.  A value of None is ignored."""
        if value is not None:
            if key in entry and entry[key] != value:
                wikipedia.output(u"%s '%s' fails to match '%s'; "
                                 u"discarding the former."
                                 % (what, entry[key], value))
            entry[key] = value

    def parse_article(self, entry, what, entries=None):
        """Fill in entry from the text of its article: birth/death years,
        sort key, dates, introductory paragraph and short description.

        Redirects and missing pages are silently skipped.  `entries` is
        accepted for backward compatibility and is not used.
        """
        intro = None
        try:
            text = entry['pagelink'].get()
        except (wikipedia.IsRedirectPage, wikipedia.NoPage):
            return

        # Look for {{lived}} template.  The pipes are escaped so that
        # they match literally rather than acting as alternation.
        m = re.search(r'\{\{lived\s*\|\s*b\s*=\s*(' + self.year_re
                      + r')\s*\|\s*d\s*=\s*(' + self.year_re
                      + r')\s*\|\s*key\s*=\s*(.*)\}\}', text)
        if m:
            self.check_entry(entry, 'byear', 'birth year', m.group(1))
            self.check_entry(entry, 'dyear', 'death year', m.group(2))
            self.check_entry(entry, 'sortkey', 'sort key', m.group(3))
        else:
            # Get birth year from category, if possible.
            m = re.search(r'\[\[[Cc]ategory:(' + self.year_re
                          + r') births(?:\|([^|\]]+))?\]\]', text)
            if m:
                self.check_entry(entry, 'byear', 'birth year', m.group(1))
                self.check_entry(entry, 'sortkey', 'sort key', m.group(2))
            else:
                wikipedia.output(u"%s has no Category:births"
                                 % entry['article'])

            # Get death year from category, if possible.
            m = re.search(r'\[\[[Cc]ategory:(' + self.year_re
                          + r') deaths(?:\|([^|\]]+))?\]\]', text)
            if m:
                self.check_entry(entry, 'dyear', 'death year', m.group(1))
                self.check_entry(entry, 'sortkey', 'sort key', m.group(2))
            else:
                wikipedia.output(u"%s has no Category:deaths"
                                 % entry['article'])

        # Find introductory paragraph.
        m = re.search(self.intro1_re % (entry.get('byear') or self.year_re,
                                        entry.get('dyear') or self.year_re),
                      text, re.M)
        if m:
            entry['intro'] = m.group(0)
            intro = m.group(3)

            # Birth date available in intro?
            mm = re.search(self.date_re, m.group(1))
            if mm:
                self.check_entry(entry, 'bdate', 'birth date',
                                 self.make_date(mm))

            # Birth date approximate?
            if self.approx_re.search(m.group(1)) and what == 'births':
                entry['exclude'] = True

            # Death date available in intro?
            mm = re.search(self.date_re, m.group(2))
            if mm:
                self.check_entry(entry, 'ddate', 'death date',
                                 self.make_date(mm))

            # Death date approximate?
            if self.approx_re.search(m.group(2)) and what == 'deaths':
                entry['exclude'] = True
        else:
            m = re.search(self.intro2_re, text, re.M)
            if m:
                entry['intro'] = m.group(0)
                intro = m.group(1)
            else:
                # Use first line instead.
                entry['intro'] = text.split('\n')[0]

        # Brief description available?  Prefer one anchored at the text
        # following the dates; otherwise search the whole intro.
        mm = None
        if intro:
            mm = (self.desc3_re.match(intro)
                  or self.desc4_re.match(intro))
        mm = (mm or self.desc1_re.search(entry['intro'])
              or self.desc2_re.search(entry['intro'])
              or self.desc3_re.search(entry['intro'])
              or self.desc4_re.search(entry['intro']))
        if mm:
            entry['desc'] = self.unwikify(mm.group(1))

    def get_entries(self, what):
        """Return the list of entries for section `what`: those already
        on the year page merged with the members of the corresponding
        category, each filled in from its article."""
        # Get entries from the section of the year page.
        entries = self.parse_entries(what)
        article_entry = {}
        for entry in entries:
            article_entry[entry['article']] = entry

        # Get lists of births and deaths articles for this year.
        cl = catlib.Category(self.site, '%s %s' % (self.year, what))
        for a in cl.articles():
            title = a.title()
            if title not in self.ignore and title not in article_entry:
                article_entry[title] = {'article': title}

        # Get them all.
        for e in article_entry.values():
            e['pagelink'] = wikipedia.Page(self.site, e['article'])
        wikipedia.getall(self.site,
                         [e['pagelink'] for e in article_entry.values()])

        # Merge redirects.  Iterate over a snapshot because the mapping
        # is modified inside the loop.
        for e in list(article_entry.values()):
            try:
                e['pagelink'].get()
            except wikipedia.IsRedirectPage as arg:
                pl = wikipedia.Page(self.site, arg.args[0])
                redir = pl.title()
                wikipedia.output("%s redirects to %s"
                                 % (e['article'], redir))
                if redir in article_entry:
                    e['pagelink'] = article_entry[redir]['pagelink']
                    del article_entry[redir]
                else:
                    e['pagelink'] = pl
                del article_entry[e['article']]
                article_entry[redir] = e
                e['article'] = redir
            except wikipedia.NoPage:
                continue

        # Parse articles.
        for e in article_entry.values():
            self.parse_article(e, what)

        # Callers sort and slice the result, so hand back a real list.
        return list(article_entry.values())

    def guess_sortkey(self, article):
        """Guess a 'Surname, Forenames' sort key from an article title."""
        words = article.split(' ')
        if 1 < len(words):
            return words[-1] + u', ' + u' '.join(words[:-1])
        else:
            return article

    def sort_entries(self, entries, what):
        """Sort entries in place: by date ('MMDD') when the entry has
        the date relevant to `what`, otherwise by sort key or a guess."""
        date_key = {'births': 'bdate', 'deaths': 'ddate'}.get(what)
        for e in entries:
            date = e.get(date_key) if date_key else None
            e['sort'] = ((date and date[1])
                         or e.get('sortkey')
                         or self.guess_sortkey(e['article']))
        entries.sort(key=lambda e: e['sort'])

    def format_entry(self, entry, what):
        """Return the wikitext bullet line for entry in section `what`.

        Excluded entries are marked with '- ' instead of '* ' so that
        they stand out in the interactive listing.
        """
        if entry.get('exclude'):
            t = u'- '
        else:
            t = u'* '
        if what == 'births' and 'bdate' in entry:
            t = t + u'%s - ' % entry['bdate'][0]
        elif what == 'deaths' and 'ddate' in entry:
            t = t + u'%s - ' % entry['ddate'][0]
        t = t + (entry.get('pre') or u'')
        if 'linktext' in entry:
            t = t + u'[[%s|%s]]' % (entry['article'], entry['linktext'])
        elif entry['article'].endswith(')'):
            # Pipe trick: hides the parenthetical disambiguator.
            t = t + u'[[%s|]]' % entry['article']
        else:
            t = t + u'[[%s]]' % entry['article']
        if 'post' in entry:
            t = t + entry['post']
        elif 'desc' in entry:
            t = t + u', ' + entry['desc']
        if what == 'births' and 'dyear' in entry:
            t = t + u' (died %s)' % entry['dyear']
        elif what == 'deaths' and 'byear' in entry:
            t = t + u' (born %s)' % entry['byear']
        return t

    def write_entries(self, entries, what):
        """Return the page text with the bullet list of section `what`
        replaced by the non-excluded entries.

        When the section is missing the text is returned unchanged, so
        that a later save cannot blank the page.
        """
        if self.year_text is None:
            self.year_text = self.year_pl.get()
        text = self.year_text
        m = re.search(self._section_re(what), text)
        if not m:
            print("No ==%s==" % what.capitalize())
            return text
        body = u'\n'.join([self.format_entry(e, what)
                           for e in entries if not e.get('exclude')])
        return text[:m.start(1)] + body + u'\n\n' + text[m.end(1):]

    help_text = u"""
h                        - Help
l                        - List entries
v                        - Preview changes to the page
s                        - Save changes to the page
q                        - Quit
/<search>/<replace>/     - Edit all entries and save pattern in file
<n>p                     - Print entry <n>
<n>i                     - Print introductory paragraph for entry <n>
<n>t                     - Print whole article text for entry <n>
<n>x                     - Exclude entry <n> (or include if already excluded)
<n>d:<text>              - Update description for entry <n>
<n>d<m>                  - Cut description for entry <n> to <m> words
<n>P:<text>              - Update prefix text for entry <n>
<n>/<search>/<replace>/  - Edit entry <n> using regexp search-and-replace
"""

    def show_entries(self, title, entries, what):
        """Sort entries and print them as a numbered list under title."""
        wikipedia.output(u'------- %s -------' % title)
        self.sort_entries(entries, what)
        for n, e in enumerate(entries):
            wikipedia.output(u"%d%s" % (n + 1, self.format_entry(e, what)))

    def _entry_at(self, entries, n):
        """Return the entry numbered n (1-based), or None after
        reporting that n is out of range."""
        if n < 1 or len(entries) < n:
            wikipedia.output(u"No entry %d (must be 1-%d)"
                             % (n, len(entries)))
            return None
        return entries[n - 1]

    def _echo_entry(self, n, entry, what):
        """Print entry n as it would appear in the listing."""
        wikipedia.output(u"%d%s" % (n, self.format_entry(entry, what)))

    def _current_post(self, entry):
        """Return the text currently shown after the entry's link."""
        if entry.get('post'):
            return entry['post']
        if 'desc' in entry:
            return u', ' + entry['desc']
        return ''

    def interface(self, title, entries, what):
        """Run the interactive command loop for one batch of entries.

        Returns False when the user quits ('q') and True when the user
        accepts the batch ('s' or 'w').
        """
        self.show_entries(title, entries, what)
        while True:
            inp = wikipedia.input(u"-- What now? [hlqs0-9pdtx]")
            m1 = re.match(r'^\s*([0-9]+)\s*([A-Za-z])\s*([0-9]+)$', inp)
            m2 = re.match(r'^\s*([0-9]+)\s*([A-Za-z])\s*(:.*)?$', inp)
            m3 = re.match(r'^\s*([0-9]+)\s*' + self.pattern_re, inp)
            m4 = re.match(r'^\s*' + self.pattern_re, inp)
            if inp == 'l':
                self.show_entries(title, entries, what)
            elif inp == 'q':
                return False
            elif inp == 's' or inp == 'w':
                return True
            elif inp == 'h':
                wikipedia.output(self.help_text)
            elif m1:
                # <n><op><m>
                n = int(m1.group(1))
                entry = self._entry_at(entries, n)
                if entry is None:
                    continue
                if m1.group(2) == 'd':
                    # The text starts with ", ", so the first token is
                    # the comma; keep it plus the requested word count.
                    words = self._current_post(entry).split(' ')
                    entry['post'] = ' '.join(words[:int(m1.group(3)) + 1])
                    self._echo_entry(n, entry, what)
                else:
                    wikipedia.output(u"Not understood: %s" % inp)
            elif m2:
                # <n><op>[:<text>]
                n = int(m2.group(1))
                op = m2.group(2)
                arg = m2.group(3)
                entry = self._entry_at(entries, n)
                if entry is None:
                    continue
                if op == 'p':
                    for k, v in entry.items():
                        wikipedia.output(u' %s: %s' % (k, v))
                elif op == 'd':
                    if arg and 2 <= len(arg):
                        entry['post'] = u', ' + arg[1:]
                    else:
                        entry['post'] = ''
                    self._echo_entry(n, entry, what)
                elif op == 'P':
                    # A bare "<n>P" clears the prefix.
                    entry['pre'] = arg[1:] if arg else u''
                    self._echo_entry(n, entry, what)
                elif op == 't':
                    try:
                        wikipedia.output(entry['pagelink'].get())
                    except Exception:
                        wikipedia.output(u"No page %s"
                                         % entry['pagelink'].title())
                elif op == 'i':
                    wikipedia.output(entry.get('intro', u'No intro'))
                elif op == 'x':
                    entry['exclude'] = not entry.get('exclude')
                    self._echo_entry(n, entry, what)
                else:
                    wikipedia.output(u"Not understood: %s" % inp)
            elif m3:
                # <n>/<search>/<replace>/
                n = int(m3.group(1))
                entry = self._entry_at(entries, n)
                if entry is None:
                    continue
                try:
                    entry['post'] = re.sub(m3.group(2), m3.group(3),
                                           self._current_post(entry))
                except re.error as err:
                    wikipedia.output(u"Bad pattern: %s" % err)
                    continue
                self._echo_entry(n, entry, what)
            elif m4:
                # /<search>/<replace>/ -- try the pattern before saving
                # it, so that a broken one is never written to the file.
                self.patterns.append((m4.group(1), m4.group(2)))
                try:
                    self.apply_patterns()
                except re.error as err:
                    self.patterns.pop()
                    wikipedia.output(u"Bad pattern: %s" % err)
                else:
                    self.save_patterns()
            else:
                wikipedia.output(u"Not understood: %s" % inp)

    comment = "yearbot - robot-assisted updating of births and deaths"

    topic_names = ['births', 'deaths']

    def run(self):
        """Collect entries for every topic, let the user review them in
        batches of 20, and save the page once the user confirms the
        diff.  Quitting from the review loop abandons all changes."""
        self.topic_entries = {}
        for what in self.topic_names:
            self.topic_entries[what] = self.get_entries(what)
            self.sort_entries(self.topic_entries[what], what)
        self.apply_patterns()
        while True:
            for what in self.topic_names:
                entries = self.topic_entries[what]
                # Integer division: number of batches, rounded up.
                for i in range((len(entries) + 19) // 20):
                    efrom = i * 20
                    eto = min(len(entries), (i + 1) * 20)
                    batch = entries[efrom:eto]
                    title = u'%s (%d-%d)' % (what.capitalize(),
                                             efrom + 1, eto)
                    if not self.interface(title, batch, what):
                        return
                self.sort_entries(entries, what)
                self.year_text = self.write_entries(entries, what)
            wikipedia.showDiff(self.year_pl.get(), self.year_text)
            if wikipedia.input(u"OK? [yN]") == 'y':
                self.year_pl.put(self.year_text, self.comment)
                return
if __name__ == '__main__':
    # Script entry point: run the bot for the year given on the command
    # line, always releasing the wikipedia framework afterwards.
    wikipedia.username = 'yearbot'
    try:
        if len(sys.argv) < 2:
            # String exceptions are not valid; exit with the message
            # instead (the finally clause still runs).
            sys.exit("No year specified")
        Year(sys.argv[1]).run()
    finally:
        wikipedia.stopme()