#!/usr/bin/python
# User:Gdr/authority.py
#
# AUTHORITY.PY -- ADD AUTHORITY TO TAXOBOX
# Gdr, 2005-07-05
#
# 1. INTRODUCTION
#
# This Python script adds an authority to a selected taxobox on the
# English Wikipedia.
#
# 1.1 USAGE
#
#     ./authority.py --rebuild           Rebuild abbreviation table
#     ./authority.py --query=ABBREV      Query abbreviation
#     ./authority.py TAXON               Find authority and add it to taxon
#     ./authority.py TAXON AUTHORITY     Add authority to taxon
#
# 1.2 OPTIONS
#
#     -r    --rebuild      Rebuild abbreviation table
#     -q X  --query=X      Query abbreviation
#     -a A  --article=A    Start at article A instead of TAXON
#     -n    --noexpand     Don't expand abbreviations
#     -d    --disambig     Solve disambiguations for abbrevs
#
# 1.3 EXAMPLES
#
#     ./authority.py Magnolia
#     ./authority.py 'Boa constrictor'
#     ./authority.py Quercus L.
#     ./authority.py 'Passer domesticus' '(Linnaeus, 1758)'
#     ./authority.py 'Plasmodium vivax' 'Grassi & Feletti 1890'
#     ./authority.py -a 'Homo (genus)' Homo
#
# 1.4 LICENCE
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.

import getopt

import htmlentitydefs

import os

import pickle

import re

import sys

import time

import unicodedata

import urllib

import wikipedia

class Error(Exception):
    """Fatal script error.

    The message is echoed to the user via wikipedia.output() as soon as
    the exception is constructed, so callers may simply catch and return.
    """

    def __init__(self, s):
        # Print immediately: the top-level handler discards the
        # exception without printing it.
        wikipedia.output(unicode(s))
        self.s = s

    def __repr__(self):
        return self.s

class Authority:

# 2. CONFIGURATION

# 2.1 USER CONFIGURATION

# Which Wikipedia we are editing.

site = wikipedia.Site('en')

# 'authfile' is the filename in which the tables of author names and

# abbreviations will be saved.

authfile = 'authority.dat'

# A regular expression that matches an authority and abbreviation in

# a Wikipedia article. (This is the default; you can override it for

# particular sources; see below.)

auth_re = re.compile(ur'^\*[ \']*([\w\'., -]+[\w.])[ \']*'

ur' +(?:[-\u2013]|&[nm]dash;) +'

ur'\[\[([^\]|]+).*\r?\n', re.M|re.U)

# 'wiki_abbrev_sources' is a dictionary mapping a code letter to a

# Wikipedia sources for authority abbreviations. Each source is a

# dictionary with these keys:

#

# name ---- name of the Wikipedia article containing authorities and

# their abbreviations

# re ------ a regular expression matching an authority and its

# abbreviation(s). There must be two groups, one for the

# abbreviation(s) for that authority and one for the name

# of the article about that authority. If omitted, auth_re

# is used as the default. Abbreviations are presumed to be

# separated by commas.

# groups -- a tuple giving the group for the abbreviation(s) and the

# article; if omitted, (1,2) is the default.

# fmt ----- format string for a new authority. Use %A for the

# abbreviation and %B for the authority.

# sort ---- How to sort (by 'surname' or by 'abbrev').

wiki_abbrev_sources = {

'b': {'name': 'List of botanists by author abbreviation',

'fmt': "* %A - %B\n",

'sort': 'abbrev'},

'z': {'name': 'List of zoologists by author abbreviation',

'fmt': "* %A - %B\n",

'sort': 'surname'},

}

# 'other_abbbrev_sources' is a list of other (non-Wikipedia) sources

# for abbreviations. Each entry is a dictionary with keys:

#

# taxon --- a regular expression matching a taxon; means that this

# entry is only appropriate for articles contained in taxa

# matching this regexp. For example 'Plant' for a source

# listing only botanists, or 'Arthropod' for a source

# listsing only entomologists.

# re ------ a regular expression matching the abbreviation and its

# expansion. %A will be replaced by the regexp-escaped

# form of the abbreviation we are looking for. It should

# contain one group, matching the expansion.

# url ----- the URL to visit to find the abbreviation. %A will be

# replaced by the URL-encoded form of the abbreviation we

# are looking for.

other_abbrev_sources = [

{'taxon': 'Plant',

'url': 'http://www.ipni.org/ipni/authorsearch?find_abbreviation=%A&query_type=by_query',

're': r'(?u)>%A - (\w(?:&[a-z]+;|[\w.\' -]+)*(?!\d)\w) *[0-9\n]'},

{'url': 'http://www.ipni.org/ipni/authorsearch?find_surname=%A&query_type=by_query',

're': r'(?u)>%A - (\w(?:&[a-z]+;|[\w.\' -]+)*(?!\d)\w) *[0-9\n]'},

]

# 'auth_sources' is a list of sources to consult to find the

# authority for a taxon. Each entry is a dictionary with these keys:

#

# taxon --- a regular expression matching a taxon; means that this

# entry is only appropriate for articles contained in taxa

# matching this regexp. For example 'Plant' for a source

# listing only plant names, or 'Coleoptera' for a source

# listsing only beetles.

# url ----- the URL to visit to find the taxon. %T will be replaced

# by the URL-encoded form of the taxon we are looking

# for, and %S by the SN2000 "subject" area.

# re ------ a regexp for getting the authority. %A will be replaced

# by the regexp-escaped form of the abbreviation we are

# looking for. It should contain one group, matching the

# expansion.

auth_sources = [

{'taxon': 'Plant',

'url': ('http://www.ipni.org/ipni/plantsearch?'

'find_wholeName=%T&query_type=by_query'),

're': r'%T (.*)'},

{'url': ('http://sn2000.taxonomy.nl/Taxonomicon/TaxonList.aspx?'

'searchBy=ScientificName&subject=%S&search=%T'),

're': r'%T[^<]* *(\(?[^<,]+,? +[0-9]+\)?)'},

# {'url': ('http://www.itis.usda.gov/servlet/SingleRpt/SingleRpt?' 'search_topic=Scientific_Name&search_value=%T'), 're': (r'(?i)%T' r'[ \r\n]*[ \r\n]*' r' ([^<]+)'),}

]

# 2.2 OTHER CONFIGURATION

# 'rank_to_subject' is a dictionary mapping Linnaean rank in Latin

# (as used in Wikipedia taxobox template names) to the SN2000

# "Subject area" in which a taxon can be looked up. Ranks not listed

# here are looked up in the subject area "High".

rank_to_subject = {

'subspecies': 'Species',

'species': 'Species',

'subgenus': 'Genus',

'genus': 'Genus',

'tribus': 'Family',

'subfamilia': 'Family',

'familia': 'Family',

'superfamilia': 'Family',

}

# Don't ask easy questions of the user?

noquery = False

def __init__(self):

for s in self.wiki_abbrev_sources.values():

s['page'] = wikipedia.Page(self.site, s['name'])

self.restore_abbreviations()

# 3. ABBREVIATIONS

#

# We want to be able to find abbreviations and turn them into links

# to the appropriate article. For example, given the abbreviation

# 'L.' we need to generate the wikitext 'L.'.

# This section includes the code for finding, storing, and updating

# these abbreviations.

# 3.1 LOADING AND SAVING ABBREVIATIONS

# Load abbreviations from disk.

def restore_abbreviations(self):

self.abbrev = {}

if os.path.isfile(self.authfile):

f = open(self.authfile, 'r')

if f:

self.abbrev = pickle.load(f)

f.close()

# Save authorities to disk.

def save_abbreviations(self):

f = file('authority.dat', 'w')

pickle.dump(self.abbrev, f)

f.close()

def unhtmlify(self, s):

s = s.decode('iso-8859-1')

while True:

m = re.search(r'&([a-z]+);', s)

if not m:

break

s = s[:m.start(0)] \

+ unichr(htmlentitydefs.name2codepoint[m.group(1)]) \

+ s[m.end(0):]

return s

# Normalize the unicode string 's' into ASCII. The idea is to store

# the authority Lac'ep`ede under the key 'Lacepede' so that

# inconsistent accentuation doesn't cause us to miss an

# abbreviation. We decompose all composed characters and then ignore

# everything non-ASCII. (This converts eacute->e etc.)

def normalize(self, s):

return unicodedata.normalize('NFD', unicode(s)).encode('ascii', 'ignore')

# Add an abbreviation to the table. 'abbrev' is the abbreviation;

# 'article' is the title of the Wikipedia article on that authority;

# 'code' is the code for the list from which it came, if any.

def add_abbreviation(self, abbrev, article, code = None):

key = self.normalize(abbrev)

if key not in self.abbrev:

self.abbrev[key] = []

for a in self.abbrev[key]:

# Do we already have this authority under this abbreviation?

if abbrev == a[0] and article == a[1]:

return

self.abbrev[key].append((abbrev, article, code))

# 3.2 USER INTERFACE FOR ADDING A NEW ABBREVIATION

# If we don't find an abbreviation in any of wiki_abbrev_sources, we can

# prompt the user to tell us the article title corresponding to the

# abbreviation; then we can add it to the relevant source.

# Return the normalized surname of the abbreviation.

def surname(self, abbrev):

m = re.search(r'(?ui)(?:de |von |d\')?[\w-]+\.?$',

self.normalize(abbrev))

if m:

return m.group(0)

else:

wikipedia.output(u"No surname for %s" % abbrev)

return 'a'

# 'abbrev' is the abbreviation for the authority described at

# 'article'. Add this to the source given by 'code'.

def add_abbreviation_to_source(self, abbrev, article, code):

source = self.wiki_abbrev_sources[code]

text = source['page'].get()

if source['sort'] == 'surname':

sortkey = self.surname(abbrev)

else:

sortkey = abbrev

groups = source.get('groups', (1,2))

# Format authority for insertion into the source.

fmt = source['fmt']

fmt = re.sub('%A', abbrev, fmt)

if article[-1] == '(':

fmt = re.sub('%B', article + '|', fmt)

else:

fmt = re.sub('%B', article, fmt)

# Go through abbreviations in the source until we get to the

# appropriate point in alphabetical order by surname.

for m in re.finditer(source.get('re', self.auth_re), text):

newtext = None

if source['sort'] == 'surname':

s2 = self.surname(m.group(groups[0]))

else:

s2 = m.group(groups[0])

if sortkey[0] != s2[0]:

# Sort keys not in the same letter of the alphabet.

continue

elif sortkey < s2:

# New abbrev goes before this one.

newtext = text[:m.start(0)] + fmt + text[m.start(0):]

elif re.match(r'(?: *\r?\n)*==', text[m.end(0):]):

# We've reached the end of the section for the right

# letter, but not found anywhere to put the new

# abbrev. So it goes at the end.

newtext = text[:m.end(0)] + fmt + text[m.end(0):]

else:

continue

# Found a place for it.

wikipedia.showDiff(source['page'].get(), newtext)

if wikipedia.input(u'OK? [yN]') == 'y':

source['page'].put(newtext, 'nomialbot - adding %s = %s'

% (abbrev, article))

return

wikipedia.output(u'Sorry, nowhere to put authority %s' % fmt)

# 'abbrev' is the abbreviation for the authority described at

# 'article'. Ask the user which source to add it to.

def user_add_abbreviation(self, abbrev, article):

for code, source in self.wiki_abbrev_sources.items():

wikipedia.output(u'(%s) %s' % (code, source['name']))

if self.noquery:

inp = None

else:

inp = wikipedia.input(u"Add abbreviation %s = %s to which source? [%s]"

% (abbrev, article,

''.join(self.wiki_abbrev_sources.keys())))

if inp in self.wiki_abbrev_sources:

self.add_abbreviation(abbrev, article, inp)

self.save_abbreviations()

self.add_abbreviation_to_source(abbrev, article, inp)

else:

self.add_abbreviation(abbrev, article)

self.save_abbreviations()

# 3.3 FINDING EXPANSIONS FOR ABBREVIATIONS

# Rebuild table of authorities from the Wikipedia articles listed in

# 'wiki_abbrev_sources'.

def rebuild_abbreviations(self):

wikipedia.getall(self.site,

map(lambda l: l['page'], self.wiki_abbrev_sources.values()))

for code, s in self.wiki_abbrev_sources.items():

for m in re.finditer(s.get('re', self.auth_re), s['page'].get()):

groups = s.get('groups', (1,2))

abbrevs = m.group(groups[0])

pagename = m.group(groups[1])

for a in re.split(r', +', abbrevs):

self.add_abbreviation(a, pagename, code)

self.save_abbreviations()

# User interface for finding an abbreviation using the stored

# abbrevs, returning the pair (abbrev, expansion) or None.

def find_abbreviation_in_store(self, abbrev):

key = self.normalize(abbrev)

if key in self.abbrev:

if len(self.abbrev[key]) == 1:

return self.abbrev[key][0]

for i in range(len(self.abbrev[key])):

wikipedia.output(u'(%d) %s' % (i + 1, self.abbrev[key][i][1]))

while True:

i = wikipedia.input(u"Which authority? [1-%d]"

% len(self.abbrev[key]))

if re.match(r'[0-9]+$', i) \

and int(i) - 1 in range(len(self.abbrev[key])):

break

return (abbrev, self.abbrev[key][int(i) - 1][1])

return None

# Find abbreviation using 'other_abbrev_sources', returning the pair

# (abbrev, expansion) or None.

def find_abbreviation_other(self, abbrev):

# TODO: check source[taxon]

for source in self.other_abbrev_sources:

url = re.sub('%A', urllib.quote(abbrev), source['url'])

wikipedia.output(u'Trying %s' % url)

f = urllib.urlopen(url)

r = re.sub('%A', re.escape(abbrev), source['re'])

m = re.search(r, f.read())

f.close()

if m:

e = self.unhtmlify(m.group(1))

self.user_add_abbreviation(abbrev, e)

return (abbrev, e)

return None

# User interface for finding abbreviation using Wikipedia, returning

# its expansion, or None.

def find_abbreviation_wiki(self, abbrev):

# See if there's a Wikipedia page for the abbrev.

pl = wikipedia.Page(self.site, abbrev)

if not pl.exists():

expansions = []

elif pl.isRedirectPage():

expansions = [wikipedia.Page(self.site, pl.getRedirectTarget())]

elif pl.isDisambig():

expansions = pl.linkedPages()

else:

expansions = []

for i in range(len(expansions)):

wikipedia.output(u'(%d) %s' % (i + 1, expansions[i].title()))

while True:

if expansions:

inp = wikipedia.input(u'Expansion for %s? [1-%d;aecq]'

% (abbrev, len(expansions)))

else:

inp = wikipedia.input(u'Expansion for %s? [aecq]'

% abbrev)

if inp == 'a':

abbrev = wikipedia.input(u'Enter new abbrev:')

return self.find_abbreviation(abbrev)

elif inp == 'e':

expansion = wikipedia.input(u'Enter expansion for %s:'

% abbrev)

self.user_add_abbreviation(abbrev, expansion)

return (abbrev, expansion)

elif re.match(r'[0-9]+$', inp) \

and int(inp) - 1 in range(len(expansions)):

expansion = expansions[int(inp) - 1].title()

self.user_add_abbreviation(abbrev, expansion)

return (abbrev, expansion)

elif inp == 'c':

return None

elif inp == 'q':

raise Error, "Quit requested"

elif inp == 'l':

for i in range(len(expansions)):

wikipedia.output(u'(%d) %s' % (i + 1, expansions[i]))

else:

wikipedia.output(

u' = choose expansion;\n'

u'a = enter new abbreviation\n'

u'e = enter expansion\n'

u'c = continue (with no expansion for abbreviation)\n'

u'l = list expansions\n'

u'q = quit\n')

# Find expansion for abbreviation using all available methods,

# returning the pair (abbrev, expansion) or just abbrev if nothing

# found.

def find_abbreviation(self, abbrev):

if abbrev:

return self.find_abbreviation_in_store(abbrev) \

or self.find_abbreviation_other(abbrev) \

or self.find_abbreviation_wiki(abbrev) \

or (abbrev,)

else:

return ('',)

def wikify_abbreviation(self, expansion):

if 2 <= len(expansion):

return u'%s' % (expansion[1], expansion[0])

else:

return expansion[0]

# 4. FINDING THE AUTHORITY FOR A TAXON

# 'format_authority' takes an 'authority', splits it into its

# component authorities, makes wikilinks for those components, and

# returns a wikitext string.

def format_authority(self, authority):

r = re.compile(r'^\(|, +[0-9]*| +[0-9]+| +in +| +and +|'

r' *\bex\.? +| +& +| +& +|\) *|'

r' +et al\.?')

abbrevs = r.split(authority)

joins = r.findall(authority)

expansions = map(self.wikify_abbreviation,

map(self.find_abbreviation, abbrevs))

return sum(x+y for x,y in zip(expansions, joins + ['']))

# 'find_authority' returns the authority for the given taxon. 'text'

# is the text of the Wikipedia article about that taxon.

def find_authority(self, taxon, text):

rank = self.rank_of_taxon(taxon, text)

subject = self.rank_to_subject.get(rank, 'High')

for source in self.auth_sources:

if 'taxon' in source and not \

re.search(r'(?m)^\| [a-z_]+ *= *\[\[%s' % source['taxon'], text):

continue

url = re.sub('%T', urllib.quote(taxon), source['url'])

url = re.sub('%S', subject, url)

url = re.sub('%R', rank, url)

wikipedia.output(u'Trying %s' % url)

f = urllib.urlopen(url)

r = re.sub('%T',

re.sub(r'\\? +', r'(?: +| +)', re.escape(taxon)),

source['re'])

m = re.search(r, f.read())

f.close()

if m:

return self.unhtmlify(m.group(1))

wikipedia.output(u'No authority found for %s' % taxon)

return None

# 5. UPDATING THE AUTHORITY FOR AN ARTICLE

kingdom_map = {

'Plant': 'Plantae',

'Animal': 'Animalia',

'Bacterium': 'Bacteria',

'Fungus': 'Fungi',

'Protist': 'Protista',

}

def kingdom(self, text):

m = re.search(r'(?m)^\| *regnum *= *\[\[([^\|\]]+)', text)

if m:

return self.kingdom_map.get(m.group(1), m.group(1))

else:

raise Error, "No kingdom found."

def rank_of_taxon(self, taxon, text):

if re.match(r'^[\w-]+ [\w-]+ [\w-]+$', taxon):

return 'subspecies'

elif re.match(r'^[\w-]+ [\w-]+$', taxon):

return 'species'

m = re.search(r'(?m)^\| *((?!name)[a-z_]+) *= *'

r'[ \']*\[*%s[^\w]\]*[ \']*$' % re.escape(taxon), text)

if not m:

raise Error, "Can't find taxon %s in taxobox" % taxon

return m.group(1)

kingdom_to_color = {

'Animalia': 'pink',

'Plantae': 'lightgreen',

'Fungi': 'lightblue',

'Archaea': 'darkgray',

'Protista': 'khaki',

'Bacteria': 'lightgrey',

}

# 'find_article' takes the name of an article to start looking at,

# and returns a Page object.

def find_article(self, article):

while True:

pl = wikipedia.Page(self.site, article)

if not pl.exists():

wikipedia.output(u"No page %s" % pl.title())

i = wikipedia.input(u"Redirect to:")

if not i:

raise Error, "Quit requested"

pl.put(u"#REDIRECT %s" % i,

u"nomialbot - redirecting scientific name %s to %s"

% (article, i))

article = i

elif pl.isRedirectPage():

article = pl.getRedirectTarget()

elif pl.isDisambig():

links = pl.linkedPages()

for i in range(len(links)):

wikipedia.output(u'(%d) %s' % (i + 1, links[i]))

inp = wikipedia.input(u'Choose which article? [1-%d]'

% len(links))

if re.match(r'[0-9]+$', inp) \

and int(inp) - 1 in range(len(links)):

article = links[int(inp) - 1].title()

else:

raise Error, "Quit requested"

else:

return pl

# 'add_authority_to_article' takes a Page object, a taxon and an

# authority. It adds the authority to that page.

def add_authority_to_article(self, pl, taxon, authority, expand = True):

text = pl.get()

text = self.tidy_taxobox(text)

if expand:

authority = self.format_authority(authority)

rank = self.rank_of_taxon(taxon, text)

kingdom = self.kingdom(text)

if rank == 'species':

test_param = 'binomial'

auth_param = 'binomial_authority'

elif rank == 'subspecies':

test_param = 'trinomial'

auth_param = 'trinomial_authority'

else:

test_param = rank

auth_param = rank + '_authority'

m = re.search('(?m)^\| *%s *=.*$' % re.escape(test_param), text)

if not m:

raise Error, "Can't find rank %s in %s" % (test_param, pl.title())

m1 = re.search(r'(?m)^\| *%s *= *(.*)' % re.escape(auth_param), text)

if not m1:

text = (text[:m.end(0)]

+ u'\n| %s = %s' % (auth_param, authority)

+ text[m.end(0):])

elif wikipedia.input(u'%s already has authority "%s". '

u'Replace? [yN]' % (taxon, m1.group(1))) == 'y':

text = (text[:m1.start(0)]

+ u'\n| %s = %s' % (auth_param, authority)

+ text[m1.end(0):])

wikipedia.showDiff(pl.get(), text)

if pl.get() != text and (self.noquery or (wikipedia.input(u"OK? [yN]") == 'y')):

pl.put(text, u'nomialbot - adding authority for %s %s'

% (taxon, authority))

def add_authority(self, article, taxon, authority, expand = True):

pl = self.find_article(article)

if pl:

self.add_authority_to_article(pl, taxon, authority, expand)

def find_and_add_authority(self, article, taxon, expand = True):

pl = self.find_article(article)

if not pl:

return

authority = self.find_authority(taxon, pl.get())

if authority:

self.add_authority_to_article(pl, taxon, authority, expand)

# 7. GENERAL TIDYING

subs = [

# Capitalize "Taxobox"

(r'{{taxobox', '{{Taxobox'),

# Italicise genus entry.

(r'(?m)^\| * genus *=[ \']*\[\[([^\]]+)\]\][ \']*$',

'| genus = \'\'\\1\'\''),

# Abbreviate genus in species entry.

(r'(?m)^\| *species *= *([\']*)([A-Z])[a-z]+ ([a-z]+)',

r'| species = \1\2. \3'),

# Supply missing genus abbrev in species entry.

(r'(?m)^(\| *genus *=[ \'\[]*([A-Z])[a-z]+[\] \']* *\n'

r'\| *species *=[ \']*)([a-z-]+[ \']*$)',

r'\1\2. \3'),

# Supply missing species entry.

(r'(?m)(^\| *genus *=.*\n)'

r'(\| * binomial *= *'

r'([A-Z])[a-z]+ ([a-z-]+))',

r"\1| species = \3. \4\n\2"),

# Italicise genus or species if it appears as the title.

(r'(?ms)^\| *name *= *([a-z -]+[a-z]) *(\n.*'

r'^\| *(?:genus|species) *=[ \'\[]*\1[ \'\]]*$)',

'| name = \'\'\\1\'\'\\2'),

# Bold genus if unlinked.

(r'(?m)^\| *genus *= *\'*(\w+)\'* *$',

"| genus = \\1"),

# Cut superfluous taxa.

(r'(?m)(?:^\| *(?!(?:regnum|phylum|divisio|classis|ordo|familia|genus|species))'

r'(?:super|sub|infra|nano)(?:regnum|phylum|divisio|classis|ordo|familia|genus|species) *=.*\n)+'

r'(^\| *(?:regnum|phylum|divisio|classis|ordo|familia|genus|species)'

r' *=.*\n)'

r'(?=^\| *[a-z]+ *=.*$)',

r'\1'),

]

conditional_subs = [

# Bold species entry if subject of article.

([r'(?m)^\| *binomial *='],

r'(?m)^\| *species *=[ \']*([^\]\'\}]+)[ \']*$',

'| species = \'\'\'\'\'\\1\'\'\'\'\''),

# Bold subspecies entry if subject of article.

([r'(?m)^\| *trinomial *='],

r'(?m)^\| *subspecies *=[ \']*([^\]\'\}]+)[ \']*$',

'| subspecies = \'\'\'\'\'\\1\'\'\'\'\''),

]

anticonditional_subs = [

# Supply missing binomial entry.

([r'(?m)^\| *binomial *=',

r'(?m)^\| *subspecies *='],

r'(?m)(^\| *genus *=[ \'\[]*([A-Z])([a-z]+)[ \'\]]*\n(?:.*\n)*'

r'(?m)^\| *species *=[ \']*\2. ([a-z-]+)[ \']*\n)',

r"\1| binomial = \2\3 \4\n"),

([r'(?m)^\| *binomial *=',

r'(?m)^\| *subspecies *='],

r'(?m)(^\| *species *=[ \']*([A-Z][a-z]+ [a-z-]+)[ \']*\n)',

r"\1| binomial = \2\n"),

]

def tidy_taxobox(self, text):

for s in self.subs:

text = re.sub(s[0], s[1], text)

for s in self.conditional_subs:

if all(re.search(c, text) for c in s[0]):

text = re.sub(s[1], s[2], text)

for s in self.anticonditional_subs:

if not any(re.search(c, text) for c in s[0]):

text = re.sub(s[1], s[2], text)

# Add FishBase reference.

if re.search(r'(?m)^\| *[a-z_]+ *= *'

r'\[\[(?:Actinopterygii|Chondrichthyes)\]\]$', text) \

and not re.search(r'{{FishBase', text):

m1 = re.search(r'(?m)^\| * genus *=[ \'\[]*'

r'([A-Z][a-z]+)[ \'\]]*$', text)

m2 = re.search(r'(?m)^\| species *=[ \']*'

r'(?:[A-Z]\. )?([a-z-]+)[ \']*$', text)

if m1 and m2:

ref = time.strftime('{{FishBase species | genus = %s | '

'species = %s | month = %%B | year = %%Y}}'

% (m1.group(1), m2.group(1)))

elif m1:

ref = time.strftime('{{FishBase genus | genus = %s | '

'month = %%B | year = %%Y}}'

% m1.group(1))

else:

ref = None

if ref:

m1 = re.search(r'==+ *References? *==+ *\n+', text)

m2 = re.search(r'(?:(?:{{.*-stub}}|\[\[[a-z][a-z]:.*\]\]'

r'|\[\[Category:.*\]\])[ \n]*)*$',

text)

if m1:

text = text[:m1.end(0)] \

+ '* ' + ref + '\n' \

+ text[m1.end(0):]

elif m2:

text = text[:m2.start(0)] \

+ '\n==References==\n* ' + ref + '\n' \

+ text[m2.start(0):]

else:

raise Error, "Nowhere to put FishBase reference"

return text

# 6. DISAMBIGUATION

# Run solve_disambiguation on all botanical abbreviations.

def disambiguate(self):

import solve_disambiguation

for a in self.abbrev.values():

for aa in a:

if aa[2] == 'b' and aa[0][-1] == '.':

bot = solve_disambiguation.DisambiguationRobot(

'0', [aa[1]], False, False, [aa[0]], False, True)

bot.run()

def badusage():
    """Raise an Error carrying the usage message (Error's constructor
    also prints it to the user)."""
    # Portable raise syntax (works on Python 2.6+ and 3).
    raise Error('Usage:\n'
                '%s --rebuild Rebuild abbreviation table\n'
                '%s --query=abbrev Query abbreviation\n'
                '%s taxon Find authority and add it to taxon\n'
                '%s taxon authority Add authority to taxon\n'
                % (sys.argv[0], sys.argv[0], sys.argv[0], sys.argv[0]))

def main():

wikipedia.username = 'nomialbot'

try:

auth = Authority()

article = None

expand = True

try:

opts, args = getopt.getopt(sys.argv[1:], 'zdnra:q:',

['noexpand', 'rebuild', 'article=',

'query=', 'disambig', 'noquery'])

for o, a in opts:

if o in ('-q', '--query'):

print auth.find_abbreviation(a.decode())

elif o in ('-r', '--rebuild'):

auth.rebuild_abbreviations()

elif o in ('-d', '--disambig'):

auth.disambiguate()

elif o in ('-a', '--article'):

article = a

elif o in ('-n', '--noexpand'):

expand = False

elif o in ('-z', '--noquery'):

auth.noquery = True

else:

badusage()

return

except getopt.GetoptError:

badusage()

return

if len(args) == 1:

auth.find_and_add_authority(article or args[0], args[0], expand)

elif len(args) == 2:

auth.add_authority(article or args[0], args[0], args[1], expand)

else:

badusage()

return

except Error:

return

if __name__ == '__main__':
    try:
        main()
    finally:
        # Always release pywikipedia's throttle, even on error.
        wikipedia.stopme()