# User:CobraBot/Code



# -*- coding: utf-8  -*-

import wikipedia
import pagegenerators
import re
import warnings
from time import sleep
from sys import stdout
from oclc import isbn2oclc

# This is required for the text that is shown when you run this script
# with the parameter -help.
docuReplacements = {
    # The key had been garbled to a pilcrow sequence; the placeholder that
    # gets replaced by the page-generator help text is '&params;'.
    '&params;': pagegenerators.parameterHelp,
}

# Title prefix of template pages, including the trailing colon.
TEMPLATE_PREFIX = u"Template:"

# Site object shared by every page lookup in this script.
SITE = wikipedia.getSite()
def pagesUsingTemplate(templateName):
    """Return a generator over the pages that transclude *templateName*.

    The template's full title is built from the site's local name for
    namespace 10 followed by ``:`` and *templateName*; only real template
    inclusions are requested from the referring-page generator.
    """
    namespaceName = unicode(SITE.namespace(10))
    templatePage = wikipedia.Page(SITE, namespaceName + u":" + templateName)
    return pagegenerators.ReferringPageGenerator(
        templatePage, onlyTemplateInclusion=True)
try:
    # Python 2: keep the original base class so that existing
    # ``except StandardError`` handlers still see this exception.
    _BailOutBase = StandardError
except NameError:
    # Python 3 removed StandardError; Exception is its replacement.
    _BailOutBase = Exception


class BailOut(_BailOutBase):
    """Immediately stop processing the current page.

    The single constructor argument is the human-readable reason for
    skipping the page.
    """
class OCLCBot:
    """Bot that adds an ``|oclc=`` parameter to book infoboxes.

    For every page that transcludes ``BOOK_INFOBOX`` the bot locates the
    infobox, reads its ``|isbn=`` value, looks the ISBN up with
    ``isbn2oclc`` and inserts ``| oclc= <number>`` right after the ISBN
    parameter.  Pages that cannot be handled are skipped and recorded in
    ``skipped.log``.
    """

    # Edit summary message that should be used.
    EDIT_SUMMARY = u'Adding OCLC# to book infobox based on ISBN (CobraBot; PLEASE report any problems)'
    # Name of the template whose transclusions are worked through.
    BOOK_INFOBOX = u"Infobox Book"
    # Hyphen-like characters that are stripped from ISBN values.
    DASHES = [u'-', u'‒', u'–', u'—', u'―']
    # End of a template parameter value: closing braces or the next pipe.
    TERMINATOR = re.compile(u"(}})|\\|")
    # Opening of a book/novel infobox, matched case-insensitively.
    INFOBOX_START = re.compile(u"\\{\\{[ \t\n]*infobox[ _]((book(s)?)|(novel))", re.IGNORECASE)
    # "|oclc=" / "|isbn=" including any whitespace around the name and "=".
    OCLC_PARAM = u"\\|[ \t\n]*oclc[ \t\n]*=[ \t\n]*"
    ISBN_PARAM = u"\\|([ \t\n])*isbn([ \t\n])*=([ \t\n])*"
    # Shortest ISBN token that is still looked up.
    ISBN_MIN_LEN = 10
    # Default number of leading pages that run() skips.
    # NOTE(review): presumably the page counts of earlier sessions - confirm.
    ALREADY_PROCESSED = 371+145+36+29+38+26+48+56+48+188+85+45+171+130+105

    def __init__(self, debug):
        """
        Constructor. Parameters:
            * debug     - If True, doesn't do any real changes, but only shows
                          what would have been changed.

        The page generator is always the list of pages transcluding
        BOOK_INFOBOX, and "skipped.log" is opened for appending.
        """
        self.generator = pagesUsingTemplate(self.BOOK_INFOBOX)
        self.debug = debug
        self.editCount = 0
        # Binary append mode: log lines are encoded to UTF-8 explicitly.
        self.log = open("skipped.log", 'ab')

    def run(self, skip=None):
        """Skip the first *skip* pages, then treat every remaining page.

        *skip* defaults to ALREADY_PROCESSED.  The skip log is closed when
        the run ends, whether it finishes normally or with an exception.
        """
        if skip is None:
            skip = self.ALREADY_PROCESSED
        # Set the edit summary message
        wikipedia.setAction(self.EDIT_SUMMARY)
        try:
            print("Advancing by %s..." % skip)
            stdout.flush()
            exhausted = object()
            for _ in range(skip):
                # Stop quietly if there are fewer pages than we want to skip.
                if next(self.generator, exhausted) is exhausted:
                    break
            print("Done advancing!")
            stdout.flush()
            for pageIndex, page in enumerate(self.generator):
                self.treat(page, pageIndex)
        finally:
            self.log.close()

    def _bail(self, reason):
        """Show *reason* to the operator and abort the current page."""
        wikipedia.output(reason)
        raise BailOut(reason)

    def _logSkip(self, page, error):
        """Append "<title>; <reason>" for a skipped page to the skip log."""
        reason = error.args[0] if error.args else u""
        # Build the line as unicode and encode once; mixing UTF-8 bytes with
        # a unicode reason would trigger an implicit ASCII decode.
        line = u"%s; %s\n" % (page.title(), reason)
        self.log.write(line.encode('utf8'))

    def partition(self, text):
        """Split *text* into (prebox, box, postbox) around the infobox.

        The box runs from the infobox opening to the FIRST "}}" after it, so
        a template nested inside the infobox cuts the box short.
        Raises BailOut when no infobox opening or no closing "}}" is found.
        """
        boxmatch = self.INFOBOX_START.search(text)
        if not boxmatch:
            self._bail(u"SKIPPING: Page either uses 'Book infobox' alias or is false positive")
        boxStart = boxmatch.start()
        closing = re.search(u"\\}\\}", text[boxStart:])
        if not closing:
            self._bail(u"SKIPPING: Infobox is never closed")
        boxEnd = boxStart + closing.end()
        return text[:boxStart], text[boxStart:boxEnd], text[boxEnd:]

    def checkForOclc(self, box):
        """Return *box* without an empty ``|oclc=`` parameter.

        A box without the parameter is returned unchanged.  Raises BailOut
        when the parameter already has a value.
        """
        paramMatch = re.search(self.OCLC_PARAM, box)
        if not paramMatch:
            return box
        rest = box[paramMatch.end():]
        termMatch = self.TERMINATOR.search(rest)
        if not termMatch:
            self._bail(u"SKIPPING: Unterminated oclc param")
        if rest[:termMatch.start()].strip():  # | oclc = VALUE |
            self._bail(u"SKIPPING: oclc param already filled")
        # Drop the empty "|oclc=" so that a filled one can be inserted later.
        return box[:paramMatch.start()] + rest

    def findIsbnVal(self, box):
        """Return (isbnVal, isbnTerm) for the ``|isbn=`` parameter of *box*.

        isbnVal is the raw text between "=" and the next terminator;
        isbnTerm is the index in *box* where that terminator starts.
        Raises BailOut when there is no isbn parameter or when the value
        contains a piped wikilink.
        """
        paramMatch = re.search(self.ISBN_PARAM, box)
        if not paramMatch:  # no ISBN present
            self._bail(u"SKIPPING: No isbn param present")
        isbnValAndRest = box[paramMatch.end():]
        termMatch = self.TERMINATOR.search(isbnValAndRest)
        if not termMatch:
            self._bail(u"SKIPPING: Unterminated isbn param")
        isbnVal = isbnValAndRest[:termMatch.start()]
        # An opened but unclosed link means the "|" we stopped at belongs to
        # a piped wikilink rather than to the next template parameter.
        if u'[[' in isbnVal and u']]' not in isbnVal:
            self._bail(u"SKIPPING: Piped wikilink in |isbn= field; bot too stupid to handle")
        return isbnVal, paramMatch.end() + termMatch.start()

    def removeDashes(self, isbn):
        """Return *isbn* with every character listed in DASHES removed."""
        for dash in self.DASHES:
            isbn = isbn.replace(dash, '')
        return isbn

    def checkForNA(self, isbn):
        """Raise BailOut when *isbn* starts with "NA" or "N/A" (any case)."""
        if re.match(u"N/?A", isbn, re.IGNORECASE):
            self._bail(u"SKIPPING: ISBN Not/Applicable")

    def removeExtraISBN(self, isbnVal):
        """Strip a leading "ISBN" label, with surrounding whitespace."""
        match = re.match(u"([ \t\n])*ISBN([ \t\n])*", isbnVal)
        if match:
            return isbnVal[match.end():]
        return isbnVal

    def firstWord(self, isbnVal):
        """Return the first run of characters that are not separators.

        Separators are space, tab, newline, "<", ",", ";", "[" and "]".
        Returns an empty string when *isbnVal* holds nothing but separators.
        """
        wordMatch = re.search("[^ \t\n<,;\\[\\]]+", isbnVal)
        if not wordMatch:
            return u''
        return wordMatch.group()

    def normalize(self, string):
        """Reduce a title to a canonical form for loose comparison.

        Spaces, hyphens and common punctuation are removed, "and" becomes
        "&", the result is lower-cased and finally "the" is removed.
        """
        return (string.replace(u' ', u'')
                      .replace(u"-", u'')
                      .replace(u"and", u"&")
                      .replace(u',', u'')
                      .replace(u'.', u'')
                      .replace(u"'", u'')
                      .replace(u'"', u'')
                      .replace(u"’", u'')
                      .lower()
                      .replace(u"the", u''))

    def _extractIsbn(self, box):
        """Return (isbn, isbnTerm) for *box*; raise BailOut if unusable.

        isbn is the cleaned first token of the ``|isbn=`` value and isbnTerm
        the index in *box* where that parameter's terminator starts.
        """
        isbnVal, isbnTerm = self.findIsbnVal(box)
        isbnVal = self.removeDashes(isbnVal).strip()
        isbnVal = self.removeExtraISBN(isbnVal)
        self.checkForNA(isbnVal)
        if not isbnVal:  # empty |isbn=
            self._bail(u"SKIPPING: Empty isbn param")
        isbn = self.firstWord(isbnVal)
        print("ISBN#: %s" % isbn)
        if len(isbn) < self.ISBN_MIN_LEN:
            self._bail(u"SKIPPING: Malformed ISBN, too short (%s)" % isbn)
        if not re.search("[0-9]", isbn):
            self._bail(u"SKIPPING: Malformed ISBN, no numbers (%s)" % isbn)
        return isbn, isbnTerm

    def treat(self, page, pageIndex):
        """
        Loads the given page, inserts the OCLC number after the infobox's
        isbn parameter, and saves it after the operator confirms.

        In debug mode the diff is shown but nothing is saved.
        """
        print("==================================================================")
        print("PAGE TITLE: %s" % page.title())
        print("PAGE#: %s" % (pageIndex + 1))
        print("EDIT COUNT: %s" % self.editCount)
        if page.namespace() != 0:
            wikipedia.output(u"SKIPPING: Non-article namespace!")
            return
        try:
            # Load the page
            text = page.get()
        except wikipedia.NoPage:
            wikipedia.output(u"Page %s does not exist; skipping." % page.aslink())
            return
        except wikipedia.IsRedirectPage:
            wikipedia.output(u"Page %s is a redirect; skipping." % page.aslink())
            return

        try:
            prebox, box, postbox = self.partition(text)
            box = self.checkForOclc(box)
            isbn, isbnTerm = self._extractIsbn(box)
        except BailOut as e:
            self._logSkip(page, e)
            return

        # do lookup
        try:
            oclc, oclcTitle = isbn2oclc(isbn)
        except RuntimeError as e:
            reason = e.args[0] if e.args else u""
            wikipedia.output(u"ABORTED: Problem looking up OCLC# (%s)" % reason)
            return

        # The title comparison is informational only; the edit is proposed
        # whether or not the titles match.
        print("PAGE TITLE: %s" % page.title())
        wikiCanon = self.normalize(page.title().split(u"(")[0])
        oclcCanon = self.normalize(oclcTitle.split(u":")[0])
        if oclcCanon.startswith(wikiCanon):
            print("")
            print("--Canonical titles DO MATCH.--")
        else:
            print(wikiCanon)
            print(oclcCanon)

        box = box[:isbnTerm] + "| oclc= " + oclc + (" " if self.debug else "\n") + box[isbnTerm:]
        newText = prebox + box + postbox
        # only save if something was changed
        if newText == text:
            return
        # Show the title of the page we're working on.
        # Highlight the title in purple.
        wikipedia.output(u"\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
        # show what was changed
        wikipedia.showDiff(text, newText)
        if self.debug:
            return
        choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No'], ['y', 'N'], 'N')
        if choice != 'y':
            return
        try:
            # Save the page
            page.put(newText)
        except wikipedia.LockedPage:
            wikipedia.output(u"Page %s is locked; skipping." % page.aslink())
        except wikipedia.EditConflict:
            wikipedia.output(u'Skipping %s because of edit conflict' % (page.title()))
        except wikipedia.SpamfilterError as error:
            wikipedia.output(u'Cannot change %s because of spam blacklist entry %s' % (page.title(), error.url))
        else:
            # Count only edits that were actually saved.
            self.editCount += 1
def main():
    """Create the bot and run it with all Python warnings suppressed."""
    dryRun = False  # switch to True to preview changes without saving
    oclcBot = OCLCBot(dryRun)
    with warnings.catch_warnings():
        # Silence every warning raised while the bot is running.
        warnings.simplefilter("ignore")
        oclcBot.run()
if __name__ == "__main__":
    # Script entry point: wikipedia.stopme() is called whether main()
    # returns normally or raises.
    try:
        main()
    finally:
        wikipedia.stopme()