# Source: [[User:Gdr/yearbot.py]] (English Wikipedia user-space script page)

#!/usr/bin/python

# YEARBOT.PY -- POPULATE BIRTHS/DEATHS IN YEAR
# Gdr, 2005-05-14
# Minor updates: User:Docu, 2006-12-17
#
# INTRODUCTION
#
# This script assists with the population of the "Births" and "Deaths"
# sections of an article about a year in the English wikipedia, using
# articles in Category:<year> births and Category:<year> deaths.
#
# USAGE
#
# See User:Gdr/Yearbot.  Requires User:Gdr/history.py.
#
# DATA STRUCTURES
#
# An entry is a dictionary with these fields:
#
#     article   Name of article.
#     bdate     Date of birth, as a pair like ('April 17', '0417').
#     byear     Birth year, as a string like '1543'.
#     ddate     Date of death, as a pair like ('September 23', '0923').
#     dyear     Death year, as a string like '1602'.
#     exclude   1 if article is to be excluded from the page.
#     intro     Introductory paragraph of article, if any is found.
#     pagelink  wikipedia.Page object referring to article.
#     post      String placed after the article link.
#     pre       String placed before the article link.
#     sort      Sort key, if any.
#     desc      Description extracted from article (used as text for
#               'post' if entry is new).
#
# LICENCE
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.

import catlib

import getopt

import history

import re

import sys

import wikipedia

class Year:
    # Robot-assisted editor for one year page's Births/Deaths sections.

    # The wiki to operate on.
    site = wikipedia.Site('en')

    # List of regexp search-and-replace patterns that should be applied
    # to all descriptions.
    patterns = []

    # The year we are working on, its Page, and the original text.
    year = None
    year_pl = None
    year_orig = None
    year_text = None

    # Article titles to skip when scanning category members.
    ignore = {
        'Special:Categories': 1,
    }

    # Matches a regexp pattern of the form /search/replace/ (the two
    # groups are the search and replace texts; backslash escapes are
    # allowed inside each).
    pattern_re = r'/((?:[^\\]|\\.)*)/((?:[^\\]|\\.)*)/$'

    # File to store patterns.
    pattern_file = 'yearbot-patterns'

def __init__(self, year):
    """Set up for editing 'year' (a string like '1543') and load any
    saved search-and-replace patterns from self.pattern_file."""
    # Warn (but continue) if the argument doesn't look like a year in
    # the supported range.
    if not re.match(r'^' + self.year_re + r'$', year):
        wikipedia.output(u"%s doesn't look like a year" % year)
    self.year = year
    self.year_pl = wikipedia.Page(self.site, self.year)
    self.patterns = []
    # BUG FIX: the old code tested 'if f:' after opening, but open()
    # never returns a falsy value -- a missing pattern file raises
    # IOError instead, which used to crash the constructor.
    try:
        f = open(self.pattern_file)
    except IOError:
        return
    try:
        for line in f:
            m = re.match(self.pattern_re, line)
            if m:
                self.patterns.append(m.groups())
    finally:
        f.close()

# Matches a year in the range for which the script operates.
year_re = r'1[0-9][0-9][0-9]'

# Matches a trailing birth date, e.g. "(b. [[1543]])" or "(born 1543)".
trail_born_re = re.compile(r'^(.*\S)\s*\(b(?:\.|orn)\s*\[?\[?('
                           + year_re + r')\]?\]?\)$')

# Matches a trailing death date, e.g. "(d. [[1602]])" or "(died 1602)".
trail_died_re = re.compile(r'^(.*\S)\s*\(d(?:\.|ied)\s*\[?\[?('
                           + year_re + r')\]?\]?\)$')

# Matches a month name.
month_re = (r'January|February|March|April|May|June|'
            r'July|August|September|October|November|December')

# Matches a date, optionally wiki-linked; groups 1/2 are month/day for
# the "Month day" order, groups 3/4 are day/month for "day Month".
date_re = (r'\[?\[?(?:(' + month_re + r')\s+([0-9]+)|([0-9]+)\s*('
           + month_re + r'))\]?\]?')

# Matches an entry starting with a date; group 5 is the remainder of
# the entry after the date and optional dash.
entry_date_re = re.compile(r'^\s*' + date_re
                           + r'\s*(?:-|–|—)?\s*(.*)$')

# Matches an entry: captures pre, article, linktext, post.
entry_re = re.compile(r'([^\[]*)\[\[([^\]|]+)(?:\|([^|\]]*))?\]\](.*)')

# Matches the introductory paragraph of an article, once filled in
# with birth year and death year.
intro1_re = r"^.*[^']+(.*?)\[?\[?%s\]?\]?(.*?)\[?\[?%s\]?\]?\W*(.*)$"
intro2_re = r"^.*[^']+[^\(]*\([^\)]+\)(.*)$"

# Matches a description: an optional article word, then a run of text
# (plain characters or complete wiki-links) up to the first punctuation.
# BUG FIX: the wiki-link alternative was garbled to r'\[\^\+\]\]'
# (apparently by the wiki paste); restored to match a complete link.
desc_re = r'\s+(?:(?:the|an?)\s+)?(([^,.!?\[]|\[\[[^\]]+\]\])+)[,.!?]'
desc1_re = re.compile(r'\)\s*was' + desc_re)
desc2_re = re.compile(r'\),' + desc_re)
desc3_re = re.compile(r'\s+was' + desc_re)
desc4_re = re.compile(r',' + desc_re)

# Matches a piped wiki-link, keeping the link text in group 1.
# BUG FIX: was r'\[\\+\|([^|\]]+)\]\]' (mangled -- it required literal
# backslashes and could never match a real link); restored.
link1_re = re.compile(r'\[\[[^|\]]+\|([^|\]]+)\]\]')

# Matches a plain wiki-link, keeping the target in group 1.
link2_re = re.compile(r'\[\[([^|\]]+)\]\]')

# Approximate date?  Matches "c.", "ca." or "circa".
approx_re = re.compile(r'\bc(?:\.|a\.|irca)')

def save_patterns(self):
    """Write the accumulated search-and-replace patterns to
    self.pattern_file, one '/search/replace/' per line."""
    # BUG FIX: the old 'if f: ... else: print' error branch was
    # unreachable, because open() raises IOError on failure rather
    # than returning a falsy value.  Report the failure instead.
    try:
        f = open(self.pattern_file, 'w')
    except IOError:
        sys.stdout.write("Couldn't write %s\n" % self.pattern_file)
        return
    try:
        for p in self.patterns:
            f.write(u'/%s/%s/\n' % (p[0], p[1]))
    finally:
        f.close()

def apply_patterns(self):
    """Run every stored search-and-replace pattern over each entry's
    'post' text, or over its 'desc' text if it has no 'post'."""
    for entries in self.topic_entries.values():
        for entry in entries:
            for search, replace in self.patterns:
                if 'post' in entry:
                    entry['post'] = re.sub(search, replace, entry['post'])
                elif 'desc' in entry:
                    entry['desc'] = re.sub(search, replace, entry['desc'])

def unwikify(self, text):
    """Strip wiki-link markup from text: piped links keep their link
    text, plain links keep their target."""
    for link_pattern in (self.link1_re, self.link2_re):
        text = link_pattern.sub(r'\1', text)
    return text

def make_date(self, m):
    """Turn a date_re match into a pair like ('April 17', '0417'):
    display text plus a zero-padded MMDD sort key."""
    # date_re captures either (month, day) or (day, month) order.
    month = m.group(1) or m.group(4)
    day = m.group(2) or m.group(3)
    label = '%s %s' % (month, day)
    sort_key = '%02d%02d' % (history.months[month], int(day))
    return (label, sort_key)

def parse_entries(self, what):
    """Parse the ==Births== (or ==Deaths==) section of the year page.

    what -- 'births' or 'deaths' (lower-case section name).

    Returns a list of entry dictionaries (see the file header for the
    fields).  Non-blank lines that cannot be parsed are reported and
    skipped.
    """
    # Grab the run of list items (and blank lines) under the heading.
    m = re.search(r'==\s*' + what.capitalize()
                  + '\s*==\n((?:\s*\n|\*.*\n)*)',
                  self.year_pl.get())
    if not m:
        print "No ==%s==" % what.capitalize()
        return []
    lines = re.split(r'\s*\n\s*', m.group(1))
    entries = []
    for line_orig in lines:
        entry = {}
        # Strip the leading '*' bullet.
        line = re.sub(r'^\*\s*', '', line_orig)
        # Leading date, e.g. "April 17 - ...".
        m = self.entry_date_re.match(line)
        if m:
            date = self.make_date(m)
            # The date means birth or death depending on the section.
            if what == 'births':
                entry['bdate'] = date
            elif what == 'deaths':
                entry['ddate'] = date
            else:
                entry['?date'] = date
            # Group 5 of entry_date_re is the rest of the line.
            line = m.group(5)
        # Trailing "(b. 1543)" gives the birth year.
        m = self.trail_born_re.match(line)
        if m:
            entry['byear'] = m.group(2)
            line = m.group(1)
        # Trailing "(d. 1602)" gives the death year.
        m = self.trail_died_re.match(line)
        if m:
            entry['dyear'] = m.group(2)
            line = m.group(1)
        # The wiki-link to the article, with surrounding text.
        m = self.entry_re.match(line)
        if m:
            entry['pre'] = m.group(1)
            entry['article'] = m.group(2)
            if m.group(3):
                entry['linktext'] = m.group(3)
            entry['post'] = m.group(4)
            entries.append(entry)
        elif not re.match(r'^\s*$', line_orig):
            wikipedia.output(u"Couldn't parse %s" % line_orig)
    return entries

def check_entry(self, entry, key, what, value):
    """Set entry[key] to value, unless value is None.

    If the entry already has a different value for key, warn that the
    old value ('what' names the field in the message) is discarded.
    """
    # Guard clause instead of '!= None' (identity test is the idiom).
    if value is None:
        return
    if key in entry and entry[key] != value:
        wikipedia.output(u"%s '%s' fails to match '%s'; "
                         u"discarding the former."
                         % (what, entry[key], value))
    entry[key] = value

def parse_article(self, entry, what, entries = {}):

intro = None

try:

text = entry['pagelink'].get()

except wikipedia.IsRedirectPage, arg:

return

except wikipedia.NoPage:

return

# Look for {{lived}} template.

m = re.search(r'{{lived|\s*b\s*=\s*(' + self.year_re

+ r')\s*|\s*d\s*=\s*(' + self.year_re

+ r')\s*|\s*key\s*=\s*(.*)}}', text)

if m:

self.check_entry(entry, 'byear', 'birth year', m.group(1))

self.check_entry(entry, 'dyear', 'death year', m.group(2))

self.check_entry(entry, 'sortkey', 'sort key', m.group(3))

else:

# Get birth year from category, if possible.

m = re.search(r'\[\[[Cc]ategory:(' + self.year_re

+ ') births(?:\|([^|\]]+))?\]\]', text)

if m:

self.check_entry(entry, 'byear', 'birth year', m.group(1))

self.check_entry(entry, 'sortkey', 'sort key', m.group(2))

else:

wikipedia.output(u"%s has no Category:births"

% entry['article'])

# Get death year from category, if possible.

m = re.search(r'\[\[[Cc]ategory:(' + self.year_re

+ ') deaths(?:\|([^|\]]+))?\]\]', text)

if m:

self.check_entry(entry, 'dyear', 'death year', m.group(1))

self.check_entry(entry, 'sortkey', 'sort key', m.group(2))

else:

wikipedia.output(u"%s has no Category:deaths"

% entry['article'])

# Find introductory paragraph.

m = re.search(self.intro1_re % (entry.get('byear') or self.year_re,

entry.get('dyear') or self.year_re),

text, re.M)

if m:

entry['intro'] = m.group(0)

intro = m.group(3)

# Birth date available in intro?

mm = re.search(self.date_re, m.group(1))

if mm:

self.check_entry(entry, 'bdate', 'birth date',

self.make_date(mm))

# Birth date approximate?

if self.approx_re.search(m.group(1)) and what == 'births':

entry['exclude'] = True

# Death date available in intro?

mm = re.search(self.date_re, m.group(2))

if mm:

self.check_entry(entry, 'ddate', 'death date',

self.make_date(mm))

# Death date approximate?

if self.approx_re.search(m.group(2)) and what == 'deaths':

entry['exclude'] = True

else:

m = re.search(self.intro2_re, text, re.M)

if m:

entry['intro'] = m.group(0)

intro = m.group(1)

else:

# Use first line instead.

entry['intro'] = text.split('\n')[0]

# Brief description available?

mm = None

if intro:

mm = (self.desc3_re.match(intro)

or self.desc4_re.match(intro))

mm = (mm or self.desc1_re.search(entry['intro'])

or self.desc2_re.search(entry['intro'])

or self.desc3_re.search(entry['intro'])

or self.desc4_re.search(entry['intro']))

if mm:

entry['desc'] = self.unwikify(mm.group(1))

def get_entries(self, what):
    """Collect all entries for the 'what' section.

    Parses the existing section of the year page, adds any articles
    found in Category:<year> <what> that are not yet listed, batch
    fetches the article texts, resolves redirects, and parses each
    article for dates and a description.

    Returns a list of entry dictionaries keyed work done in place.
    """
    # Get entries from the section of the year page.
    entries = self.parse_entries(what)
    article_entry = {}
    for entry in entries:
        article_entry[entry['article']] = entry
    # Get lists of births and deaths articles for this year.
    cl = catlib.Category(self.site, '%s %s' % (self.year, what))
    for a in cl.articles():
        # Skip ignored titles and articles already on the page.
        if (not self.ignore.has_key(a.title())
            and not article_entry.has_key(a.title())):
            e = {'article': a.title()}
            article_entry[a.title()] = e
    # Get them all (one batched fetch of the article texts).
    for e in article_entry.values():
        e['pagelink'] = wikipedia.Page(self.site, e['article'])
    wikipedia.getall(self.site, map(lambda e: e['pagelink'],
                                    article_entry.values()))
    # Merge redirects.  (values() is a list here, so deleting keys
    # during the loop is safe.)
    for e in article_entry.values():
        try:
            text = e['pagelink'].get()
        except wikipedia.IsRedirectPage, arg:
            pl = wikipedia.Page(self.site, arg.args[0])
            redir = pl.title()
            wikipedia.output("%s redirects to %s" % (e['article'], redir))
            if article_entry.has_key(redir):
                # Target already listed: keep its page object and drop
                # the duplicate entry for the target title.
                e['pagelink'] = article_entry[redir]['pagelink']
                del article_entry[redir]
            else:
                # Re-key this entry under the redirect target.
                e['pagelink'] = pl
                del article_entry[e['article']]
                article_entry[redir] = e
                e['article'] = redir
        except wikipedia.NoPage:
            continue
    # Parse articles.
    for e in article_entry.values():
        self.parse_article(e, what)
    return article_entry.values()

def guess_sortkey(self, article):
    """Guess a 'Surname, Forenames' sort key from an article title.
    Single-word titles are returned unchanged."""
    forenames, sep, surname = article.rpartition(u' ')
    if not sep:
        return article
    return surname + u', ' + forenames

def sort_entries(self, entries, what):
    """Sort entries in place: dated entries by their MMDD date key,
    the rest by explicit sort key or one guessed from the title."""
    date_field = {'births': 'bdate', 'deaths': 'ddate'}.get(what)
    for entry in entries:
        key = None
        if date_field is not None and date_field in entry:
            key = entry[date_field][1]
        if not key:
            key = (entry.get('sortkey')
                   or self.guess_sortkey(entry['article']))
        entry['sort'] = key
    entries.sort(key=lambda entry: entry['sort'])

def format_entry(self, entry, what):
    """Format one entry as a wiki list item, e.g.
    "* April 17 - [[Foo]], painter (died 1602)".

    Excluded entries get a '- ' marker instead of the '* ' bullet.
    BUG FIX: the wiki-link brackets were eaten out of the format
    strings by the wiki paste (u'%s' % (a, b) is even a TypeError);
    the [[...]] markup is restored below.
    """
    if entry.get('exclude'):
        t = u'- '
    else:
        t = u'* '
    # Leading date of the kind this section is about.
    if what == 'births' and 'bdate' in entry:
        t = t + u'%s - ' % entry['bdate'][0]
    elif what == 'deaths' and 'ddate' in entry:
        t = t + u'%s - ' % entry['ddate'][0]
    t = t + (entry.get('pre') or u'')
    # The article link itself.
    if 'linktext' in entry:
        t = t + u'[[%s|%s]]' % (entry['article'], entry['linktext'])
    elif entry['article'][-1] == ')':
        # Pipe trick: hide the parenthesized disambiguator.
        t = t + u'[[%s|]]' % entry['article']
    else:
        t = t + u'[[%s]]' % entry['article']
    # Trailing description.
    if 'post' in entry:
        t = t + entry['post']
    elif 'desc' in entry:
        t = t + u', ' + entry['desc']
    # Cross-reference the other life date.
    if what == 'births' and 'dyear' in entry:
        t = t + u' (died %s)' % entry['dyear']
    elif what == 'deaths' and 'byear' in entry:
        t = t + u' (born %s)' % entry['byear']
    return t

def write_entries(self, entries, what):
    """Return the year page text with the ==What== section's list
    replaced by the formatted, non-excluded entries (newline-joined,
    followed by a blank line).

    Returns "" if the heading cannot be found.  Caches the page text
    in self.year_text so successive sections build on each other.
    """
    if not self.year_text:
        self.year_text = self.year_pl.get()
    text = self.year_text
    # Locate the run of list items under the heading.
    m = re.search(r'==\s*' + what.capitalize()
                  + '\s*==\n((?:\s*\n|\*.*\n)*)',
                  text)
    if not m:
        print "No ==%s==" % what.capitalize()
        return ""
    return (text[:m.start(1)]
            + u'\n'.join(map(lambda e: self.format_entry(e, what),
                             filter(lambda e: not e.get('exclude'),
                                    entries)))
            + u'\n\n'
            + text[m.end(1):])

# Text shown by the 'h' command in interface().
# BUG FIX: the angle-bracket placeholders (e.g. <n>) were eaten by the
# wiki paste (treated as HTML); reconstructed from the command parsing
# in interface().
help_text = u"""
h - Help
l - List entries
v - Preview changes to the page
s - Save changes to the page
q - Quit
/<old>/<new>/ - Edit all entries and save pattern in file
<n>p - Print entry <n>
<n>i - Print introductory paragraph for entry <n>
<n>t - Print whole article text for entry <n>
<n>x - Exclude entry <n> (or include if already excluded)
<n>d:<desc> - Update description for entry <n>
<n>d<m> - Cut description for entry <n> to <m> words
<n>P:<text> - Update prefix text for entry <n>
<n>/<old>/<new>/ - Edit entry <n> using regexp search-and-replace
"""

def show_entries(self, title, entries, what):
    """Print a banner for 'title', then the entries as a sorted,
    numbered (1-based) list."""
    wikipedia.output(u'------- %s -------' % title)
    self.sort_entries(entries, what)
    for index, entry in enumerate(entries):
        wikipedia.output(u"%d%s" % (index + 1,
                                    self.format_entry(entry, what)))

def interface(self, title, entries, what):
    """Interactive command loop over one batch of entries.

    Returns True if the user asked to save ('s'/'w'), False to quit
    ('q').  See help_text for the command set; entry numbers are
    1-based indexes into 'entries'.
    """
    self.show_entries(title, entries, what)
    while 1:
        inp = wikipedia.input(u"-- What now? [hlqs0-9pdtx]")
        # <n><op><m>, e.g. "3d10" -- cut entry 3's description.
        m1 = re.match(r'^\s*([0-9]+)\s*([A-Za-z])\s*([0-9]+)$', inp)
        # <n><op>[:text], e.g. "3p", "3x", "3d:painter".
        m2 = re.match(r'^\s*([0-9]+)\s*([A-Za-z])\s*(:.*)?$', inp)
        # <n>/search/replace/ -- one-off edit to entry <n>.
        m3 = re.match(r'^\s*([0-9]+)\s*' + self.pattern_re, inp)
        # /search/replace/ -- record a global pattern.
        m4 = re.match(r'^\s*' + self.pattern_re, inp)
        if inp == 'l':
            self.show_entries(title, entries, what)
        elif inp == 'q':
            return False
        elif inp == 's' or inp == 'w':
            return True
        elif inp == 'h':
            wikipedia.output(self.help_text)
        elif m1:
            n = int(m1.group(1))
            op = m1.group(2)
            n2 = int(m1.group(3))
            if n < 1 or len(entries) < n:
                wikipedia.output(u"No entry %d (must be 1-%d)"
                                 % (n, len(entries)))
            elif op == 'd':
                # Truncate the description to n2 words (the +1 allows
                # for the leading ', ' token).
                desc = (entries[n-1].get('post')
                        or entries[n-1].has_key('desc')
                        and u', ' + entries[n-1]['desc'] or '')
                entries[n-1]['post'] = ' '.join(desc.split(' ')[:n2 + 1])
                wikipedia.output(u"%d%s" % (n, self.format_entry(entries[n-1], what)))
            else:
                wikipedia.output(u"Not understood: %s" % inp)
        elif m2:
            n = int(m2.group(1))
            op = m2.group(2)
            if n < 1 or len(entries) < n:
                wikipedia.output(u"No entry %d (must be 1-%d)"
                                 % (n, len(entries)))
            elif op == 'p':
                # Dump the entry's fields.
                for k, v in entries[n-1].items():
                    wikipedia.output(u' %s: %s' % (k, v))
            elif op == 'd':
                if m2.group(3) and 2 <= len(m2.group(3)):
                    # Replace the description with the given text.
                    entries[n-1]['post'] = u', ' + m2.group(3)[1:]
                    wikipedia.output(u"%d%s" % (n, self.format_entry(entries[n-1], what)))
                else:
                    # No text given: clear the description.
                    entries[n-1]['post'] = ''
                    wikipedia.output(u"%d%s" % (n, self.format_entry(entries[n-1], what)))
            elif op == 'P':
                # Replace the text before the link.
                # NOTE(review): assumes the ':text' part was given;
                # a bare "<n>P" would make group(3) None -- confirm.
                entries[n-1]['pre'] = m2.group(3)[1:]
                wikipedia.output(u"%d%s" % (n, self.format_entry(entries[n-1], what)))
            elif op == 't':
                # Show the whole article text.
                try:
                    wikipedia.output(entries[n-1]['pagelink'].get())
                except:
                    wikipedia.output(u"No page %s" % entries[n-1]['pagelink'].title())
            elif op == 'i':
                wikipedia.output(entries[n-1].get('intro', u'No intro'))
            elif op == 'x':
                # Toggle exclusion.
                entries[n-1]['exclude'] = not entries[n-1].get('exclude')
                wikipedia.output(u"%d%s" % (n, self.format_entry(entries[n-1], what)))
            else:
                wikipedia.output(u"Not understood: %s" % inp)
        elif m3:
            n = int(m3.group(1))
            if n < 1 or len(entries) < n:
                wikipedia.output(u"No entry %d (must be 1-%d)"
                                 % (n, len(entries)))
            else:
                # One-off search-and-replace on one entry's description.
                desc = (entries[n-1].get('post')
                        or entries[n-1].has_key('desc')
                        and u', ' + entries[n-1]['desc'] or '')
                entries[n-1]['post'] = re.sub(m3.group(2), m3.group(3), desc)
                wikipedia.output(u"%d%s" % (n, self.format_entry(entries[n-1], what)))
        elif m4:
            # Record a global pattern, persist it, and apply everywhere.
            self.patterns.append((m4.group(1), m4.group(2)))
            self.save_patterns()
            self.apply_patterns()
        else:
            wikipedia.output(u"Not understood: %s" % inp)

# Edit summary used when saving the year page.
comment = "yearbot - robot-assisted updating of births and deaths"

# Sections to process, in page order.
topic_names = ['births', 'deaths']

def run(self):
    """Top level: collect entries for each section, edit them
    interactively in batches of 20, then show a diff and save the
    page if the user confirms."""
    self.topic_entries = {}
    for what in self.topic_names:
        self.topic_entries[what] = self.get_entries(what)
        self.sort_entries(self.topic_entries[what], what)
    self.apply_patterns()
    while 1:
        for what in self.topic_names:
            entries = self.topic_entries[what]
            # Present the entries in batches of 20 (// keeps this
            # integer division under both Python 2 and 3).
            for i in range((len(entries) + 19) // 20):
                efrom = i * 20
                eto = min(len(entries), (i + 1) * 20)
                batch = entries[efrom:eto]
                # BUG FIX: a stray trailing comma used to make title
                # a 1-tuple, so the banner printed a tuple repr.
                title = u'%s (%d-%d)' % (what.capitalize(),
                                         efrom + 1, eto)
                if not self.interface(title, batch, what):
                    return
            self.sort_entries(entries, what)
            self.year_text = self.write_entries(entries, what)
        wikipedia.showDiff(self.year_pl.get(), self.year_text)
        if wikipedia.input(u"OK? [yN]") == 'y':
            self.year_pl.put(self.year_text, self.comment)
            return

if __name__ == '__main__':
    wikipedia.username = 'yearbot'
    try:
        if len(sys.argv) < 2:
            # BUG FIX: 'raise "No year specified"' was a string
            # exception, which is a TypeError on Python >= 2.6;
            # exit with the message instead.
            raise SystemExit("No year specified")
        Year(sys.argv[1]).run()
    finally:
        # Always release the bot framework's throttle/lock.
        wikipedia.stopme()