# User:CobraBot/Code
# -*- coding: utf-8 -*-
import wikipedia
import pagegenerators
import re
import warnings
from time import sleep
from sys import stdout
from oclc import isbn2oclc
# This is required for the text that is shown when you run this script
# with the parameter -help.
# The key is the placeholder '&params;'; it had been mangled into a
# pilcrow sequence when this page was extracted from the wiki.
docuReplacements = {
    '&params;': pagegenerators.parameterHelp
}

# Textual prefix of template page titles (not referenced elsewhere in this file).
TEMPLATE_PREFIX = u"Template:"
# Site object shared by every page lookup in this script.
SITE = wikipedia.getSite()
def pagesUsingTemplate(templateName):
    """Return a generator over the pages that transclude a template.

    templateName is the template's title without a namespace prefix; the
    prefix is whatever SITE reports as the name of namespace 10
    (presumably the template namespace -- confirm against the site config).
    """
    namespacePrefix = unicode(SITE.namespace(10))
    templatePage = wikipedia.Page(SITE, namespacePrefix + u":" + templateName)
    return pagegenerators.ReferringPageGenerator(templatePage, onlyTemplateInclusion=True)
class BailOut(StandardError):
    """Immediately stop processing the current page.

    Raised by the parsing steps of the bot when a page cannot or should not
    be edited; the exception argument is the human-readable reason, which
    the bot writes to its skip log.
    """
class OCLCBot(object):
    """Bot that adds an ``|oclc=`` parameter to book infoboxes.

    For every page returned by pagesUsingTemplate(BOOK_INFOBOX) the bot
    extracts the ISBN from the infobox, looks up the matching OCLC number
    with isbn2oclc() and inserts ``| oclc= <number>`` right after the
    ``|isbn=`` value.  Pages that cannot be handled are skipped and the
    reason is appended to ``skipped.log``.
    """

    # Edit summary message that should be used.
    EDIT_SUMMARY = u'Adding OCLC# to book infobox based on ISBN (CobraBot; PLEASE report any problems)'
    # Name of the template whose transclusions are processed.
    BOOK_INFOBOX = u"Infobox Book"
    # Every dash-like character that is stripped from an ISBN.
    DASHES = [u'-', u'‒', u'–', u'—', u'―']
    # End of a template parameter value: closing braces or the next pipe.
    TERMINATOR = re.compile(u"(}})|\\|")
    # Opening of a book/novel infobox (case-insensitive).
    INFOBOX_START = re.compile(u"\\{\\{[ \t\n]*infobox[ _]((book(s)?)|(novel))", re.IGNORECASE)
    # Opening or closing double braces, used to match nested templates.
    BRACES = re.compile(u"\\{\\{|\\}\\}")
    # Regex source for the "|oclc=" parameter including surrounding blanks.
    OCLC_PARAM = u"\\|[ \t\n]*oclc[ \t\n]*=[ \t\n]*"
    # Anything shorter than this cannot be an ISBN.
    ISBN_MIN_LEN = 10
    # Number of generator entries run() discards before it starts treating
    # pages (kept as the original running sum).
    SKIP_COUNT = 371+145+36+29+38+26+48+56+48+188+85+45+171+130+105

    def __init__(self, debug):
        """
        Constructor. Parameters:
            * debug - If True, doesn't do any real changes, but only shows
                      what would have been changed.

        The page generator is always the list of pages transcluding
        BOOK_INFOBOX.  Opens ``skipped.log`` for appending; run() closes it.
        """
        # Local import: the file's import block does not provide io.
        import io
        self.generator = pagesUsingTemplate(self.BOOK_INFOBOX)
        self.debug = debug
        self.editCount = 0
        # Explicit UTF-8 text file so page titles and reasons can be written
        # as unicode without implicit ASCII conversions.
        self.log = io.open("skipped.log", 'a', encoding='utf-8')

    def run(self):
        """Skip the first SKIP_COUNT pages, then treat every remaining page.

        The skip log is closed when the run ends, even if it ends with an
        exception.
        """
        # Set the edit summary message
        wikipedia.setAction(self.EDIT_SUMMARY)
        try:
            print("Advancing by %s..." % self.SKIP_COUNT)
            stdout.flush()
            for _ in range(self.SKIP_COUNT):
                try:
                    next(self.generator)
                except StopIteration:
                    # Fewer pages than the skip count: nothing left to treat.
                    break
            print("Done advancing!")
            stdout.flush()
            for pageIndex, page in enumerate(self.generator):
                self.treat(page, pageIndex)
        finally:
            self.log.close()

    def _bail(self, reason):
        """Report *reason* through wikipedia.output and raise BailOut(reason)."""
        wikipedia.output(reason)
        raise BailOut(reason)

    def partition(self, text):
        """Split *text* into (text before infobox, infobox, text after infobox).

        The infobox runs from the INFOBOX_START match to the ``}}`` that
        closes it; braces of nested templates are balanced so that an inner
        ``{{...}}`` does not end the box early.

        Raises BailOut when no infobox is found or it is never closed.
        """
        boxmatch = self.INFOBOX_START.search(text)
        if not boxmatch:
            self._bail(u"SKIPPING: Page either uses 'Book infobox' alias or is false positive")
        boxStart = boxmatch.start()
        boxEnd = None
        depth = 0
        # The first token found is the infobox's own "{{" at boxStart.
        for brace in self.BRACES.finditer(text, boxStart):
            if brace.group() == u"{{":
                depth += 1
                continue
            depth -= 1
            if depth <= 0:
                boxEnd = brace.end()
                break
        if boxEnd is None:
            self._bail(u"SKIPPING: Infobox is never closed")
        return text[:boxStart], text[boxStart:boxEnd], text[boxEnd:]

    def checkForOclc(self, box):
        """Return *box* without an empty ``|oclc=`` parameter.

        A box without the parameter is returned unchanged.  Raises BailOut
        when the parameter already holds a value.
        """
        paramMatch = re.search(self.OCLC_PARAM, box)
        if not paramMatch:
            return box
        rest = box[paramMatch.end():]
        termMatch = self.TERMINATOR.search(rest)
        # | oclc = VALUE |
        value = (rest[:termMatch.start()] if termMatch else rest).strip()
        if value:  # already has |oclc= filled in
            self._bail(u"SKIPPING: oclc param already filled")
        # Drop the empty parameter; a filled one is inserted later.
        return box[:paramMatch.start()] + box[paramMatch.end():]

    def findIsbnVal(self, box):
        """Locate the ``|isbn=`` value inside *box*.

        Returns (raw value, index in *box* of the terminator following the
        value).  Raises BailOut when the parameter is missing, is not
        terminated, or the value was cut at the pipe of a piped wikilink.
        """
        paramMatch = re.search(u"\\|([ \t\n])*isbn([ \t\n])*=([ \t\n])*", box)
        if not paramMatch:  # no ISBN present
            self._bail(u"SKIPPING: No isbn param present")
        rest = box[paramMatch.end():]
        termMatch = self.TERMINATOR.search(rest)
        if not termMatch:
            self._bail(u"SKIPPING: isbn param is not terminated")
        relIsbnTerm = termMatch.start()
        isbnVal = rest[:relIsbnTerm]
        # An opened but unclosed link means the "|" that ended the value
        # belongs to a piped wikilink rather than to the next parameter.
        if u'[[' in isbnVal and u']]' not in isbnVal:
            self._bail(u"SKIPPING: Piped wikilink in |isbn= field; bot too stupid to handle")
        return isbnVal, paramMatch.end() + relIsbnTerm

    def removeDashes(self, isbn):
        """Return *isbn* with every character listed in DASHES removed."""
        for dash in self.DASHES:
            isbn = isbn.replace(dash, u'')
        return isbn

    def checkForNA(self, isbn):
        """Raise BailOut when *isbn* starts with "NA" or "N/A" (any case)."""
        if re.match(u"N/?A", isbn, re.IGNORECASE):
            self._bail(u"SKIPPING: ISBN Not/Applicable")

    def removeExtraISBN(self, isbnVal):
        """Strip a leading literal "ISBN" (and the blanks around it)."""
        match = re.match(u"([ \t\n])*ISBN([ \t\n])*", isbnVal)
        if match:
            return isbnVal[match.end():]
        return isbnVal

    def firstWord(self, isbnVal):
        """Return the first run of characters free of blanks and ``<,;[]``.

        Raises BailOut when *isbnVal* contains no such run.
        """
        wordMatch = re.search(u"[^ \t\n<,;\\[\\]]+", isbnVal)
        if not wordMatch:
            self._bail(u"SKIPPING: Malformed ISBN, no usable text")
        return wordMatch.group()

    def normalize(self, string):
        """Reduce a title to a canonical form for loose comparison.

        In order: blanks and hyphens are removed, "and" becomes "&",
        punctuation (``, . ' " ’``) is removed, the result is lower-cased
        and every "the" is removed.
        """
        for junk in (u' ', u'-'):
            string = string.replace(junk, u'')
        string = string.replace(u"and", u"&")
        for junk in (u',', u'.', u"'", u'"', u"’"):
            string = string.replace(junk, u'')
        return string.lower().replace(u"the", u'')

    def treat(self, page, pageIndex):
        """
        Loads the given page, inserts the OCLC number and saves it.

        Pages outside namespace 0, missing pages and redirects are skipped.
        Parsing problems (BailOut) are written to the skip log.  Unless
        self.debug is set, the user is asked to confirm each change before
        it is saved.
        """
        print("==================================================================")
        print("PAGE TITLE: %s" % page.title())
        print("PAGE#: %s" % (pageIndex + 1))
        print("EDIT COUNT: %s" % self.editCount)
        if page.namespace() != 0:
            wikipedia.output(u"SKIPPING: Non-article namespace!")
            return
        try:
            # Load the page
            text = page.get()
        except wikipedia.NoPage:
            wikipedia.output(u"Page %s does not exist; skipping." % page.aslink())
            return
        except wikipedia.IsRedirectPage:
            wikipedia.output(u"Page %s is a redirect; skipping." % page.aslink())
            return

        # Extract and sanity-check the ISBN; any step may bail out.
        try:
            prebox, box, postbox = self.partition(text)
            box = self.checkForOclc(box)
            isbnVal, isbnTerm = self.findIsbnVal(box)
            isbnVal = self.removeDashes(isbnVal).strip()
            isbnVal = self.removeExtraISBN(isbnVal)
            self.checkForNA(isbnVal)
            if not isbnVal:  # empty |isbn=
                self._bail(u"SKIPPING: Empty isbn param")
            isbn = self.firstWord(isbnVal)
            print("ISBN#: %s" % isbn)
            if len(isbn) < self.ISBN_MIN_LEN:
                self._bail(u"SKIPPING: Malformed ISBN, too short (%s)" % isbn)
            if not re.search(u"[0-9]", isbn):
                self._bail(u"SKIPPING: Malformed ISBN, no numbers (%s)" % isbn)
        except BailOut as e:
            reason = e.args[0] if e.args else u""
            # Build the line as unicode; mixing encoded bytes with a unicode
            # reason would fail on non-ASCII page titles.
            self.log.write(u"%s; %s\n" % (page.title(), reason))
            return

        # do lookup
        try:
            oclc, oclcTitle = isbn2oclc(isbn)
        except RuntimeError as e:
            reason = e.args[0] if e.args else u""
            wikipedia.output(u"ABORTED: Problem looking up OCLC# (%s)" % reason)
            return

        # Show how well the looked-up title matches the article title; a
        # mismatch is only reported, the edit is still proposed.
        print("PAGE TITLE: %s" % page.title())
        wikiCanon = self.normalize(page.title().split(u"(")[0])
        oclcCanon = self.normalize(oclcTitle.split(u":")[0])
        titlesMatch = oclcCanon.startswith(wikiCanon)
        if titlesMatch:
            print("--Canonical titles DO MATCH.--")
        else:
            print(wikiCanon)
            print(oclcCanon)

        # Insert the new parameter right before the terminator of |isbn=.
        separator = u" " if self.debug else u"\n"
        box = box[:isbnTerm] + u"| oclc= " + oclc + separator + box[isbnTerm:]
        newText = prebox + box + postbox

        # only save if something was changed
        if newText == text:
            return
        # Show the title of the page we're working on.
        # Highlight the title in purple.
        wikipedia.output(u"\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
        # show what was changed
        wikipedia.showDiff(text, newText)
        if self.debug:
            return
        choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No'], ['y', 'N'], 'N')
        if choice != 'y':
            return
        try:
            # Save the page
            page.put(newText)
        except wikipedia.LockedPage:
            wikipedia.output(u"Page %s is locked; skipping." % page.aslink())
        except wikipedia.EditConflict:
            wikipedia.output(u'Skipping %s because of edit conflict' % (page.title()))
        except wikipedia.SpamfilterError as error:
            wikipedia.output(u'Cannot change %s because of spam blacklist entry %s' % (page.title(), error.url))
        else:
            self.editCount += 1
def main():
    """Build the bot in live (non-debug) mode and run it with warnings silenced."""
    # Pass True instead to only show the proposed changes without saving.
    bot = OCLCBot(False)
    with warnings.catch_warnings():
        # Ignore every warning raised while the bot is running.
        warnings.simplefilter("ignore")
        bot.run()
# Script entry point: run the bot only when executed directly.
if __name__ == "__main__":
    try:
        main()
    finally:
        # Always runs, even when main() raises; presumably releases the
        # framework's per-process bookkeeping -- confirm in the wikipedia module.
        wikipedia.stopme()