User:Disambot/Source

The Disambot source code is divided into three scripts:

  • enwp.py provides the framework for interfacing with the English Wikipedia. It uses a combination of API calls and regular HTTP requests.
  • disambot.py extracts a list of disambiguation pages (or more precisely, their titles) from the working list file and puts each one through an inspection function, which loads the page content, makes various cleanup changes, saves the result if anything changed, and lists pages needing human review on a separate page.
  • private.py stores the username and password of the bot account.
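In practice, running the bot amounts to importing disambot (logging in happens as a side effect of the import) and calling its go() function. A typical interactive session, assuming a hypothetical title in the working list, might look like this:

>>> import disambot   # runs enwp.login() with the credentials from private.py
>>> disambot.go()     # reads the working list and inspects each title
Inspecting Mercury (disambiguation)...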

These scripts are shown below:

enwp.py

import urllib, urllib2, ClientCookie, time

debug_mode = False

base_url = 'http://en.wikipedia.org/'
api_url = base_url + 'w/api.php'

def login(username, password):
    url = globals()['api_url']
    data = {
        'action'     : 'login',
        'lgname'     : username,
        'lgpassword' : password,
        'format'     : 'xml'
    }
    if globals()['debug_mode']: print 'Logging in...'
    response = ClientCookie.urlopen(url, urllib.urlencode(data)).read()
    if globals()['debug_mode']: print 'Done'
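As written, login() discards the server's reply. A minimal sketch of a success check, assuming the login API of this era reported result="Success" in its XML reply (an assumption, not verified here), could be appended to the function:

    # Hypothetical check: scan the XML reply for result="Success"
    if response.find('result="Success"') == -1:
        raise RuntimeError('login failed')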

def grab_page(title, render=False, expand_templates=False):
    if render: ren_param = '&action=render'
    else: ren_param = '&action=raw'
    if expand_templates: expand_param = '&templates=expand'
    else: expand_param = ''
    url = globals()['base_url'] + 'w/index.php?title=' + title.replace(' ', '_') + ren_param + expand_param
    if globals()['debug_mode']: print 'Fetching ' + url
    response = ClientCookie.urlopen(url).read()
    if globals()['debug_mode']: print str(len(response)) + ' bytes received'
    return response
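As an example, a call with a hypothetical title:

>>> import enwp
>>> text = enwp.grab_page('Mercury (disambiguation)')

requests http://en.wikipedia.org/w/index.php?title=Mercury_(disambiguation)&action=raw and returns the raw wikitext as a string.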

def edit_page(title, new_content, summary=''):
    # First, obtain the required edit token and the timestamp of the last page edit
    url = globals()['api_url']
    data = {
        'action'  : 'query',
        'prop'    : 'info|revisions',
        'intoken' : 'edit',
        'titles'  : title,
        'format'  : 'xml'
    }
    if globals()['debug_mode']: print 'Fetching ' + url
    response = ClientCookie.urlopen(url, urllib.urlencode(data)).read()
    if globals()['debug_mode']: print str(len(response)) + ' bytes received'

    # Grab the supplied token from the XML-formatted response
    token_start = response.find('edittoken="') + len('edittoken="')
    token_end = response.find('"', token_start)
    token = response[token_start : token_end]
    if globals()['debug_mode']: print 'Token: ' + token

    # Grab the last revision timestamp as well
    ts_start = response.find('timestamp="') + len('timestamp="')
    ts_end = response.find('"', ts_start)
    ts = response[ts_start : ts_end]
    if globals()['debug_mode']: print 'Base timestamp: ' + ts

    # We just fetched a (last edit) timestamp of the form 2008-06-18T07:18:06Z;
    # convert it to 20080618071806
    edit_time = ts[0:4] + ts[5:7] + ts[8:10] + ts[11:13] + ts[14:16] + ts[17:19]
    if globals()['debug_mode']: print 'Time of last edit: ' + str(edit_time)

    # Get the current time and convert it to the 20080618071806 format as well
    ct = time.gmtime()[0:6] # tuple of the form (year, month, day, hour, minute, second)
    start_time = str(ct[0]).zfill(4) + str(ct[1]).zfill(2) + str(ct[2]).zfill(2) + str(ct[3]).zfill(2) + str(ct[4]).zfill(2) + str(ct[5]).zfill(2)
    if globals()['debug_mode']: print 'Time of token retrieval: ' + str(start_time)

    # Next, push the new page content. The API-based edit is kept commented out;
    # a regular index.php form submission is used instead.
    '''
    data = {
        'action'        : 'edit',
        'title'         : title,
        'section'       : 0,
        'text'          : new_content,
        'token'         : token,
        'summary'       : summary,
        'bot'           : True,
        'basetimestamp' : ts,
        'nocreate'      : True,
        'format'        : 'xml'
    }
    '''
    url = globals()['base_url'] + 'w/index.php?' + urllib.urlencode({ 'title' : title, 'action' : 'submit' }, True)
    data = {
        'wpAntispam'    : '',
        'wpSection'     : '',
        'wpStarttime'   : start_time,
        'wpEdittime'    : edit_time,
        'wpScrolltop'   : 0, # editor scroll position; 0 is fine for automated edits
        'wpTextbox1'    : new_content,
        'wpSummary'     : summary,
        'wpAutoSummary' : 'd41d8cd98f00b204e9800998ecf8427e', # MD5 hash of the empty string (blank autosummary)
        'wpSave'        : 'Save page',
        'wpEditToken'   : token
    }
    data = urllib.urlencode(data)
    req = urllib2.Request(url, data, { 'User-Agent' : 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9) Gecko/2008060309 Firefox/3.0' }, True)
    if globals()['debug_mode']: print 'Sending data to ' + url
    try:
        response = ClientCookie.urlopen(req).read()
    except urllib2.HTTPError, response:
        if globals()['debug_mode']: print 'HTTP error encountered...'
    except AttributeError: pass # seems to be a small bug in ClientCookie
    if globals()['debug_mode']: globals()['response'] = response
    '''
    result_start = response.find('result="') + len('result="')
    result_end = response.find('"', result_start)
    result = response[result_start : result_end]
    if globals()['debug_mode']: print 'Result: ' + result
    if result.lower() == 'failure':
        return False
    '''
    return True

def sandbox_test():
    edit_page('Wikipedia:Sandbox', 'Hello! This is a sandbox edit done using a Python script.')
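A minimal end-to-end sketch of using the module on its own (the credentials and the appended text are placeholders):

import enwp
enwp.debug_mode = True                        # print progress messages
enwp.login('ExampleBot', 'example-password')  # placeholder credentials
text = enwp.grab_page('Wikipedia:Sandbox')
enwp.edit_page('Wikipedia:Sandbox', text + '\nTest edit.', 'testing edit_page()')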

disambot.py

import enwp, private

abbreviations = ( 'ac.', 'Co.', 'Corp.', 'deg.', 'ft.', 'Inc.', 'kg.', 'km.', 'mi.', 'mo.', 'oz.', 'qr.', 'qt.', 'yd.' )

# Log in to en-wp account
enwp.login(private.username, private.password)

def inspect(title):
    print 'Inspecting ' + title + '...'

    # Defaults
    changed = False
    complex_errors = ()

    article_body = enwp.grab_page(title).strip()
    article_body_orig = article_body
    raw_html = enwp.grab_page(title, True)

    # Skip set indices
    if article_body.lower().find('[[category:set indices') != -1:
        return False

    lines = article_body.splitlines()

    # Main loop -- cycle through lines
    for i, line in enumerate(lines):

        # Skip short/empty lines
        if len(line) < 5:
            continue

        # Strip extra whitespace
        line = line.strip()
        line_orig = line

        # Replace ordered list items with unordered list items
        if line[0] == '#':
            line = '*' + line[1:]

        # Handle list items
        if line[0] == '*': # if this line is a list item

            # Fix punctuation at the end
            if line[-1] == '.' or line[-1] == ',' or line[-1] == ';': # if there is punctuation at the end
                if line.count('.') >= 2 and line[line.find('.')+1] == ' ' and line[line.find('.')+2] == line[line.find('.')+2].upper(): # if multiple sentences
                    complex_errors += ('item with multiple sentences detected (line '+str(i)+')',)
                else:
                    # Remove the punctuation, unless it's a proper abbreviation
                    abbrev = False
                    for a in globals()['abbreviations']:
                        if ' '+a.lower() == line[-1*(len(a)+1):].lower(): # if this abbreviation is at the end of the line
                            abbrev = True
                            break
                    if not abbrev and line[-2] == line[-2].lower(): # not an abbreviation and not an acronym
                        line = line[0:-1] # remove punctuation (last character)

            # Remove any bullets to assess the item itself
            line_content = line
            while line_content[0] == '*':
                line_content = line_content[1:].strip()
            line_content_orig = line_content

            # Remove outer boldness if necessary
            if line_content[0:3] == "'''":
                count = 0
                while line_content[0] == "'":
                    line_content = line_content[1:]
                    count += 1
                if count == 3 and line_content[0:2] == '[[': # bold wrapped a wikilink
                    line_content = line_content.replace("'"*count, '', 1) # also drop the matching closing quotes

            # Correct piped links
            if line.find('|') != -1 and line_content.find('[[') == 0 and line.find(']]') != -1 and line.find('|') < line.find(']]'):
                # There is a piped link at the beginning of this line -- remove the piped text

                # Get rid of pipe, checking for italics
                p1 = line_content.find('|')
                p2 = line_content.find(']]')
                p3 = line_content.find("''", p1, p2)
                if p3 != -1 and line_content[p3+2] != "'": # there are italics inside the pipe
                    pass ####
                    #p4 = line_content.find("''", p3+2) # closing italics
                    #if p4 == -1:
                    #    complex_errors += ('italicized text seems misformatted (line '+str(i)+')',)
                    #else:
                    #    italicized = line_content[p3+2:p4]
                else: # no italics --> simply remove pipe, e.g. [[Foo|bar]] becomes [[Foo]]
                    line_content = line_content[:p1] + line_content[p2:]

            # Check for wikilinks that are not the first word
            if line_content.find('[[', 3) != -1:
                p1 = line_content.find('[[')
                p2 = line_content.find('|')
                p3 = line_content.find(']]')
                if p2 == -1:
                    article_title = line_content[p1+2:p3]
                else:
                    article_title = line_content[p2+1:p3]
                p4 = raw_html.find(article_title+' (page does not exist)') # red links render with this tooltip text
                if (p1 == 0 or p1 == 2) and p4 == -1:
                    # The first word is wikilinked as it should be and is not a red link,
                    # but there are other links that shouldn't be here
                    firstlink_end = line_content.find(']]')
                    if firstlink_end == -1:
                        # No closing "]]" ... something must be screwy
                        complex_errors += ('error in wikilink syntax (line '+str(i)+')',)
                    else:
                        firstlink_end += 2 # skip the ]]
                        while line_content.find('[[', firstlink_end) != -1 and line_content.find(']]', firstlink_end) != -1: # links remain
                            link_start = line_content.find('[[', firstlink_end)
                            link_pipe = line_content.find('|' , firstlink_end)
                            link_end = line_content.find(']]', firstlink_end)
                            if link_start > link_end:
                                complex_errors += ('error in wikilink syntax (line '+str(i)+')',)
                                break
                            new = line_content[:link_start]
                            if link_pipe == -1 or link_pipe > link_end: # no pipe in link of interest
                                new += line_content[link_start+2:link_end] + line_content[link_end+2:]
                            else: # there is a pipe in link of interest
                                new += line_content[link_pipe+1:link_end] + line_content[link_end+2:]
                            line_content = new # update
                else:
                    # There are inappropriate wikilinks, but if we remove them we'll be
                    # left with no links. Human review needed.
                    complex_errors += ('item contains link, but not in the proper place (line '+str(i)+')',)

# Replace old version of this line with new one if we've changed anything

if line is not line_orig:

lines[i] = line

changed = True

# Implode lines back into one big string

article_body = "\n".join(lines)

# Check for external links

links = article_body.count('[http')

if links > 0:

complex_errors += ('contains '+str(links)+' external link'+('s'*(links!=1)),)

    # Finish up
    if lines != article_body_orig.splitlines(False):
        # Update the article
        print "\tMaking changes..."
        enwp.edit_page(title, article_body, 'Cleaning up disambiguation page in accordance with [[Wikipedia:Manual of Style (disambiguation pages)]]')

    if len(complex_errors) > 0:
        # Add the article to the list of potential atrocities, along with notes, unless it's already there
        atrocities = enwp.grab_page('User:Disambot/Potential atrocities')
        if atrocities.find("[[" + title + "]]") == -1: # if not already listed
            atrocities += "\n\n[[" + title + "]]"
            for this in complex_errors:
                atrocities += "\n* " + this
            print "\tListing on potential atrocities..."
            enwp.edit_page('User:Disambot/Potential atrocities', atrocities, 'Adding [['+title+']]')

def go():
    article_list = open('working list', 'r')
    for title in article_list: inspect(title.strip())
    article_list.close()
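The working list itself is a plain text file with one page title per line, for example (hypothetical entries):

Mercury (disambiguation)
Aurora (disambiguation)
Phoenix (disambiguation)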

private.py

username = '(not shown)'

password = '(not shown)'