# User:CobraBot/Code
# -*- coding: utf-8 -*-

import wikipedia

import pagegenerators

import re

import warnings

from time import sleep

from sys import stdout

from oclc import isbn2oclc

  1. This is required for the text that is shown when you run this script
  2. with the parameter -help.

docuReplacements = {

'¶ms;': pagegenerators.parameterHelp

}

# Textual prefix of template page titles. Not referenced anywhere else in the
# visible source (pagesUsingTemplate() asks the site for the namespace name).
TEMPLATE_PREFIX = u"Template:"

# Site object fetched once, at import time; pagesUsingTemplate() reads it.
SITE = wikipedia.getSite()

def pagesUsingTemplate(templateName):
    """Return a page generator over the pages that transclude a template.

    Parameters:
        * templateName - Title of the template without its namespace prefix.

    The namespace name is looked up on SITE (namespace number 10, which is
    presumably the Template namespace -- confirm against the target wiki) and
    only template inclusions are requested, not ordinary links.
    """
    namespaceName = unicode(SITE.namespace(10))
    templatePage = wikipedia.Page(SITE, namespaceName + u":" + templateName)
    return pagegenerators.ReferringPageGenerator(templatePage, onlyTemplateInclusion=True)

class BailOut(StandardError):

"""Immediately stop processing the current page"""

class OCLCBot(object):
    """Bot that adds an ``oclc`` parameter to book infoboxes, based on the ISBN.

    For every page transcluding BOOK_INFOBOX the bot isolates the infobox,
    gives up if ``|oclc=`` is already filled in, extracts the first ISBN from
    ``|isbn=``, looks the OCLC number up with isbn2oclc() and inserts
    ``| oclc= <number>`` right after the ISBN value. Pages that cannot be
    handled are appended, with the reason, to ``skipped.log``.
    """

    # Edit summary message that should be used.
    EDIT_SUMMARY = u'Adding OCLC# to book infobox based on ISBN (CobraBot; PLEASE report any problems)'

    BOOK_INFOBOX = u"Infobox Book"

    # Hyphen-minus, figure dash, en dash, em dash, horizontal bar. Written as
    # escapes so the values do not depend on the source-encoding declaration.
    DASHES = [u'-', u'\u2012', u'\u2013', u'\u2014', u'\u2015']

    # End of a parameter value: the template close or the next pipe.
    TERMINATOR = re.compile(u"(}})|\\|")

    INFOBOX_START = re.compile(u"\\{\\{[ \t\n]*infobox[ _]((book(s)?)|(novel))", re.IGNORECASE)

    OCLC_PARAM = u"\\|[ \t\n]*oclc[ \t\n]*=[ \t\n]*"

    ISBN_PARAM = u"\\|([ \t\n])*isbn([ \t\n])*=([ \t\n])*"

    # Template open/close markers, used to find the matching infobox close.
    BRACES = re.compile(u"\\{\\{|\\}\\}")

    ISBN_MIN_LEN = 10

    # Number of generator entries to discard before treating pages. The sum
    # is kept exactly as it was written in run(); it looks like a tally of
    # pages handled in earlier sessions -- TODO confirm.
    SKIP_COUNT = 371+145+36+29+38+26+48+56+48+188+85+45+171+130+105

    def __init__(self, debug):
        """
        Constructor. Parameters:
            * debug - If True, doesn't do any real changes, but only shows
                      what would have been changed.

        The page generator is not a parameter: it is always the set of pages
        transcluding BOOK_INFOBOX. Opens ``skipped.log`` for appending.
        """
        self.generator = pagesUsingTemplate(self.BOOK_INFOBOX)
        self.debug = debug
        self.editCount = 0
        self.log = open("skipped.log", 'a')

    def run(self):
        """Discard the first SKIP_COUNT pages, then treat every remaining one.

        The skip log is closed when the run ends, including when a page
        raises an unexpected error.
        """
        # Set the edit summary message
        wikipedia.setAction(self.EDIT_SUMMARY)
        try:
            print("Advancing by %s..." % self.SKIP_COUNT)
            stdout.flush()
            # A sentinel default keeps a generator shorter than SKIP_COUNT
            # from leaking StopIteration out of this method.
            exhausted = object()
            for _ in range(self.SKIP_COUNT):
                if next(self.generator, exhausted) is exhausted:
                    break
            print("Done advancing!")
            stdout.flush()
            for pageIndex, page in enumerate(self.generator):
                self.treat(page, pageIndex)
        finally:
            self.log.close()

    #########

    def _skip(self, reason):
        """Report *reason* through wikipedia.output() and raise BailOut with it."""
        wikipedia.output(reason)
        raise BailOut(reason)

    def partition(self, text):
        """Split page text into (prebox, box, postbox) around the book infobox.

        ``box`` runs from the opening braces matched by INFOBOX_START to the
        close that balances them, so templates nested inside the infobox stay
        inside ``box``. Raises BailOut when no infobox start is found or when
        the infobox is never closed.
        """
        boxmatch = self.INFOBOX_START.search(text)
        if not boxmatch:
            self._skip(u"SKIPPING: Page either uses 'Book infobox' alias or is false positive")
        boxStart = boxmatch.start()
        boxEnd = None
        depth = 0
        # Stopping at the first close would cut the box short at any nested
        # template and hide parameters (such as an existing |oclc=) after it.
        for brace in self.BRACES.finditer(text, boxStart):
            if brace.group() == u"{{":
                depth += 1
            else:
                depth -= 1
                if depth == 0:
                    boxEnd = brace.end()
                    break
        if boxEnd is None:
            self._skip(u"SKIPPING: Infobox is never closed")
        return text[:boxStart], text[boxStart:boxEnd], text[boxEnd:]

    def checkForOclc(self, box):
        """Return *box* with an empty ``|oclc=`` parameter removed.

        A box without the parameter is returned unchanged. Raises BailOut
        when the parameter already has a value.
        """
        paramMatch = re.search(self.OCLC_PARAM, box)
        if not paramMatch:
            return box
        rest = box[paramMatch.end():]
        termMatch = self.TERMINATOR.search(rest)
        valueEnd = termMatch.start() if termMatch else len(rest)
        value = rest[:valueEnd].strip()  # | oclc = VALUE |
        if value:  # already has |oclc= filled in
            self._skip(u"SKIPPING: oclc param already filled")
        # Drop the empty "|oclc=" so the parameter is not duplicated later.
        return box[:paramMatch.start()] + box[paramMatch.end():]

    def findIsbnVal(self, box):
        """Return (isbnVal, isbnTerm) for the ``|isbn=`` parameter of *box*.

        ``isbnVal`` is the raw text of the value; ``isbnTerm`` is the index in
        *box* of the terminator that follows it (where the new parameter gets
        inserted). Raises BailOut when there is no isbn parameter or when the
        value holds an unclosed wikilink.
        """
        paramMatch = re.search(self.ISBN_PARAM, box)
        if not paramMatch:  # no ISBN present
            self._skip(u"SKIPPING: No isbn param present")
        rest = box[paramMatch.end():]
        termMatch = self.TERMINATOR.search(rest)
        relIsbnTerm = termMatch.start() if termMatch else len(rest)
        isbnVal = rest[:relIsbnTerm]
        # An opening "[[" without its "]]" means the value was cut at the
        # pipe of a piped wikilink rather than at the end of the parameter.
        if u'[[' in isbnVal and u']]' not in isbnVal:
            self._skip(u"SKIPPING: Piped wikilink in |isbn= field; bot too stupid to handle")
        return isbnVal, paramMatch.end() + relIsbnTerm

    def removeDashes(self, isbn):
        """Return *isbn* with every character listed in DASHES removed."""
        for dash in self.DASHES:
            isbn = isbn.replace(dash, '')
        return isbn

    def checkForNA(self, isbn):
        """Raise BailOut when *isbn* starts with "NA" or "N/A" (any case)."""
        if re.match(u"N/?A", isbn, re.IGNORECASE):
            self._skip(u"SKIPPING: ISBN Not/Applicable")

    def removeExtraISBN(self, isbnVal):
        """Strip a leading literal "ISBN" label and the whitespace around it."""
        match = re.match(u"([ \t\n])*ISBN([ \t\n])*", isbnVal)
        if match:
            return isbnVal[match.end():]
        return isbnVal

    def firstWord(self, isbnVal):
        """Return the first run of characters that are not separators.

        Separators are space, tab, newline, ``<``, ``,``, ``;``, ``[`` and
        ``]``. Raises BailOut when *isbnVal* consists of separators only.
        """
        wordMatch = re.search("[^ \t\n<,;\\[\\]]+", isbnVal)
        if not wordMatch:
            self._skip(u"SKIPPING: Malformed ISBN, no usable token (%s)" % isbnVal)
        return wordMatch.group()

    def normalize(self, string):
        """Reduce a title to a crude canonical form for comparison.

        The steps run in a fixed order: remove spaces and hyphens, turn "and"
        into "&", remove commas, full stops and quote marks, lowercase, then
        remove every "the".
        """
        string = string.replace(u' ', u'').replace(u"-", u'').replace(u"and", u"&")
        # comma, full stop, apostrophe, double quote, right single quote
        for mark in (u',', u'.', u"'", u'"', u"\u2019"):
            string = string.replace(mark, u'')
        return string.lower().replace(u"the", u'')

    def treat(self, page, pageIndex):
        """
        Loads the given page, adds the OCLC number to its infobox, and saves it.

        Parameters:
            * page      - The page object to work on.
            * pageIndex - Zero-based position of the page in this run; only
                          used for the progress output.

        Skipped pages are recorded in the skip log. In debug mode the diff is
        shown but nothing is saved.
        """
        print("==================================================================")
        print(u"PAGE TITLE: %s" % page.title())
        print("PAGE#: %s" % (pageIndex + 1))
        print("EDIT COUNT: %s" % self.editCount)

        if page.namespace() != 0:
            wikipedia.output(u"SKIPPING: Non-article namespace!")
            return

        try:
            # Load the page
            original = page.get()
        except wikipedia.NoPage:
            wikipedia.output(u"Page %s does not exist; skipping." % page.aslink())
            return
        except wikipedia.IsRedirectPage:
            wikipedia.output(u"Page %s is a redirect; skipping." % page.aslink())
            return

        try:
            prebox, box, postbox = self.partition(original)
            box = self.checkForOclc(box)
            isbnVal, isbnTerm = self.findIsbnVal(box)
            isbnVal = self.removeDashes(isbnVal).strip()
            isbnVal = self.removeExtraISBN(isbnVal)
            self.checkForNA(isbnVal)
            if not isbnVal:  # empty |isbn=
                self._skip(u"SKIPPING: Empty isbn param")
            isbn = self.firstWord(isbnVal)
            print(u"ISBN#: %s" % isbn)
            if len(isbn) < self.ISBN_MIN_LEN:
                self._skip(u"SKIPPING: Malformed ISBN, too short (%s)" % isbn)
            if not re.search("[0-9]", isbn):
                self._skip(u"SKIPPING: Malformed ISBN, no numbers (%s)" % isbn)
        except BailOut as e:
            # Build the line as unicode and encode once: joining UTF-8 bytes
            # with a unicode message fails on non-ASCII page titles.
            line = u"%s; %s\n" % (page.title(), e.args[0])
            self.log.write(line.encode('utf8'))
            return

        # do lookup
        try:
            oclc, oclcTitle = isbn2oclc(isbn)
        except RuntimeError as e:
            wikipedia.output(u"ABORTED: Problem looking up OCLC# (%s)" % (e,))
            return

        print(u"PAGE TITLE: %s" % page.title())
        wikiCanon = self.normalize(page.title().split(u"(")[0])
        oclcCanon = self.normalize(oclcTitle.split(u":")[0])
        # The comparison is informational only; the edit is proposed either way.
        if oclcCanon.startswith(wikiCanon):
            print("")
            print("--Canonical titles DO MATCH.--")
        else:
            print(wikiCanon)
            print(oclcCanon)

        separator = " " if self.debug else "\n"
        box = box[:isbnTerm] + "| oclc= " + oclc + separator + box[isbnTerm:]
        text = prebox + box + postbox

        # only save if something was changed
        if text != original:
            # Show the title of the page we're working on.
            # Highlight the title in purple.
            wikipedia.output(u"\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
            # show what was changed
            wikipedia.showDiff(original, text)
            if not self.debug:
                choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No'], ['y', 'N'], 'N')
                if choice != 'y':
                    return
                try:
                    # Save the page
                    page.put(text)
                except wikipedia.LockedPage:
                    wikipedia.output(u"Page %s is locked; skipping." % page.aslink())
                except wikipedia.EditConflict:
                    wikipedia.output(u'Skipping %s because of edit conflict' % (page.title()))
                except wikipedia.SpamfilterError as error:
                    wikipedia.output(u'Cannot change %s because of spam blacklist entry %s' % (page.title(), error.url))
                else:
                    self.editCount += 1

def main():
    """Build the bot and run it with all Python warnings silenced."""
    dryRun = False  # set to True to preview diffs without saving anything
    bot = OCLCBot(dryRun)
    # Warnings are suppressed only for the duration of the run itself; the
    # bot is constructed before the filter is installed.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        bot.run()

# Script entry point: only runs when the file is executed directly.
if __name__ == "__main__":
    try:
        main()
    finally:
        # Runs whether main() returned normally or raised.
        # NOTE(review): stopme() presumably signs the bot off from the
        # framework -- confirm against the pywikipedia documentation.
        wikipedia.stopme()