User:Monk of the highest order/ASE/code

This is the code I used to find Wikipedia articles which have seen only one human editor (usually the page creator). The last time I ran this was two years ago; it produced a list about 2000 entries long, which has since been whittled down to about 100 or so - in other words, all but one hundred have been reviewed. I'll probably run this script again soon, accounting for the articles already reviewed from the first run. When I do that, I'll clean these up, re-organize, give more meaningful filenames, etc.

xmlsplitter.py

#XMLsplitter.py
#V03
#Released under GNU GPLv3 by Monk of the Highest Order, 2008.

#Partitions a giant XML document
#into smaller documents without breaking content across
#a selected element. So for example, if the element is <page>,
#all data between that and the </page> tag is kept in the
#same doc.

import re, random

from utility import *

from sys import exit

#example xml doc:
#<page><revision>change10</revision>
#<revision>change 9</revision>
#...repeat x100000000000....</page>

#basic idea: Strip base, mid1, mid2 (why even worry)
#just make files which just contain distinct page data
#I wouldn't make 1 file for each page. Not sure the
#file system could handle 2mil files. I'd stay safe
#at something like 2k. Named numerically, probably,
#so we don't need to get into title extraction
#so 2.2 mil / 2000 files = 1.1*10^3 pages ea.

def interpret(textline, pagecount, parent_tags, data_to_get):
    #count each opening <page> tag; drop the enclosing <mediawiki> lines
    if re.search('(?i)<' + data_to_get, textline):
        pagecount+=1
        #print('page' + str(pagecount))
    elif re.search('(?i)^\s*</?' + parent_tags + '.*?>\s*$', textline):
        return None, pagecount
    return textline, pagecount

def get_pages_per_file(rel_position):
    #input a float giving the relative
    #position of the file break-up-er
    #in the big meta file, where 0.0 == the beginning
    #and 1.0 == the end.
    if rel_position < 0.1:
        pages_per_file=200
    elif rel_position < 0.3:
        pages_per_file=800
    elif rel_position < 0.5:
        pages_per_file=1200
    elif rel_position < 1.0:
        pages_per_file=2600
    else:
        print("error! no rel position within 0-1.0", repr(rel_position))
        exit()
    return pages_per_file

def main():
    (sourcexml, pos, filenum)=unpickle_data('xmlsplitter.tmp',
                                            ['1008smh.xml', 0, 1])
    nwiki=2600000. #estimate of the number of <page> elements
    nfilegoal=16000. #estimate of number of pages desired
    output_folder='output/'
    data_to_get='page'
    parent_tags='mediawiki'
    fbig=open(sourcexml, 'r')
    fbig.seek(0, 2)
    eof_loc = fbig.tell()
    fbig.seek(pos)
    pages_per_file = get_pages_per_file(pos/float(eof_loc))
    while fbig.tell() < eof_loc:
        if filenum >= nfilegoal: exit()
        newblock = []
        pagecount = 0
        fblock = open(output_folder + \
                      str(filenum) + '.block', 'w')
        next=fbig.tell()
        while pagecount < pages_per_file and next < eof_loc:
            prev=next
            try:
                newline, pagecount = interpret(fbig.readline(),
                                               pagecount, parent_tags, data_to_get)
            except IOError:
                print("IOError... waiting it out.")
                newline=None
                fbig.seek(prev+30)
            next=fbig.tell()
            if next > eof_loc:
                next=prev+30
                fbig.seek(next)
            if newline: newblock.append(newline)
        #wrap each block in a root element so it still parses as XML
        newblock.append('</mediawiki>')
        newblock.insert(0,'<mediawiki>\n')
        print(fbig.tell(), eof_loc)
        fblock.writelines(newblock)
        fblock.flush()
        fblock.close()
        rel_position=fbig.tell()/float(eof_loc)
        pages_per_file=get_pages_per_file(rel_position)
        print("File " + str(filenum) + " (" + \
              str(int(rel_position*100)) + \
              "%) written.")
        filenum+=1
        pickle_data('xmlsplitter.tmp', [sourcexml, fbig.tell(), filenum])

if __name__ == '__main__':
    main()

parser.py

#The structure of this program is designed not around speed, but around
#memory constraints. It is assumed that you have lotsa space and lotsa time.
#TODO:
#Output file
#Cleanup constants -> (eg, one file should handle the constant locations of
#the bot list, the redirect list, ids-editors db, one-editor folder, etc.
#probably this folder)

import sys

import re

import csv

import optparse

from xml.sax import make_parser, handler

import sqlite3

from glob import glob

try: from urllib.parse import quote

except: from urllib import quote

import utility

import pageparser_db

import wiki_pageset

import one_authorize

from xml_to_pageset import WikiXMLParser

BOT_NAMES_LIST='bot_list.txt'

BOT_IDS_LIST='bot_list_ids.txt'

def get_bots_list(value='names'):
    try:
        if value=='names':
            fbots=open(BOT_NAMES_LIST,'r')
        elif value=='ids':
            fbots=open(BOT_IDS_LIST,'r')
        bots=fbots.readlines()
        for i in range(len(bots)): #quote and strip every entry, including the last
            bots[i] = quote(bots[i].rstrip())
        fbots.close()
        bots.append('Conversion%20script')
        return sorted(bots)
    except IOError:
        print(" error: could not read one of bots list filez")
        sys.exit()

##### Command System #####

if __name__=='__main__':

command = optparse.OptionParser()

command.set_usage("""

Usage: parser.py [-v/-q]

[-1 1.xml 2.xml 3.xml...]

[-f 1.xml.csv 2.xml.csv...]

[-2 1.xml.csv 2.xml.csv...]

[-3 1.xml.csv 2.xml.csv...]

[-4 1.xml.csv 2.xml.csv...]

[-5 1.xml.inx.csv 2.xml.inx.csv...]

""")

command.add_option("-1", "--xml_decode",

action="store_true",

dest="xml_decode",

help="XML -> CSV 'pageset' of pagename, pageid, editorid, and edits by editor id")

command.add_option("-f", "--filter_csv",

action="store_true",

dest="filter_csv",

help="refilter a csv file for bots, userpages, etc...")

command.add_option("-2", "--fill-editor-db",

action="store_true",

dest="fill_editor_db",

help="add CSV pageset data to: sqlite db of edit count per page by each user.")

command.add_option("-t", "--tally-editor-db",

action="store_true",

dest="tally",

help="run (2) on every pageset available, then run this, before using option (4)")

command.add_option("-3", "--one-editor",

action="store_true",

dest="one_editor",

help="CSV pageset -> new CSV with one-editor pages only")

command.add_option("-4", "--inexp-editor",

action="store_true",

dest="inexp_editor",

help="""CSV pageset -> new CSV with one author only,

with that author having less than 15 edits to his name

(completely fill the SQLITE database b4 using this option).""")

command.add_option("-5", "--title-list",

action="store_true",

dest="title_list",

help="CSV pageset -> list of pages within by title")

command.add_option("-i", "--id-list",

action="store_true",

dest="id_list",

help="CSV pageset -> list of pages within by id")

command.add_option("--gt_ids",

action="store",

dest="gt_ids",

help="necessary for -4: list of the userids whose editcounts qualify them as experienced")

command.add_option("--gt_ips",

action="store",

dest="gt_ips",

help="necessary for -4: list of the ips whose editcounts qualify them as experienced")

#command.add_option("-X", "--mult-editors",

# action="store_true",

# dest="make_list",

# help="CSV pageset -> new CSV of pages with more than one editor.")

command.add_option("-v", "--verbose",

action="store_true",

dest="output_verbose",

help="option: give lots of debug output")

command.add_option("-q", "--quiet",

action="store_true",

dest="output_quiet",

help="option: No command line output")

(options, args) = command.parse_args(sys.argv[1:])

if options.output_quiet:

verbose=0

elif options.output_verbose:

verbose=2

else:

verbose=1

#testing for usability of command line options...

operations=options.__dict__

j=0

for i in operations:

if i not in ['output_quiet', 'output_verbose', 'gt_ids', 'gt_ips'] and \

operations[i]:

if verbose: print(str(i))

j+=1

if j==2:

print(str(i))

command.print_usage()

sys.exit()

if j==0:

command.print_usage()

sys.exit()

if True:

#if we're using an option which only uses file(s) as the argument(s)

if not args:

print(' error:this operation requires at least one file argument')

sys.exit()

elif [] in [glob(x) for x in args]:

print(' error:this operation requires all arguments to be files.')

sys.exit()

args=utility.glob_list(args)

###### operations ######

if options.xml_decode:

parser = make_parser()

parser.setContentHandler(WikiXMLParser(verbose=verbose))

cleaner = wiki_pageset.PageFilter(verbose=verbose,

bot_ids=get_bots_list('ids'), bot_names=get_bots_list('names'))

for arg in args:

if verbose: print(" opening file",arg)

parser.parse(arg)

pages=cleaner.clean(parser.getContentHandler().pages,

rm_bot_revisions=True,

rm_user_talk=True,

rm_redirects=True,

associate_to=False,

associate_from=True,

rm_usernames=True)

if verbose: print(" done.")

wiki_pageset.csv_store_pageset(arg+'.csv', pages)

elif options.filter_csv:

cleaner = wiki_pageset.PageFilter(verbose=verbose,

bot_ids=get_bots_list('ids'), bot_names=get_bots_list('names'))

for arg in args:

if verbose: print(" opening file",arg)

pageset=wiki_pageset.csv_load_pageset(arg)

pageset=cleaner.clean(pageset,

rm_bot_revisions=False,

rm_user_talk=True,

rm_redirects=False,

associate_to=False,

associate_from=False,

rm_usernames=False)

if verbose: print(" done.")

wiki_pageset.csv_store_pageset(arg[:-4] + '.f.csv', pageset)

elif options.fill_editor_db:

editor_db = one_authorize.EditsByUser(verbose=verbose)

for arg in args:

if verbose: print(" opening file",arg)

pageset=wiki_pageset.csv_load_pageset(arg)

userids, ip_addrs=editor_db.get_edits_by_user(pageset)

utility.csv_write(arg[:-4]+'.editors_ids.csv', userids)

utility.csv_write(arg[:-4]+'.editors_ips.csv', ip_addrs)

elif options.one_editor:

for arg in args:

if verbose: print(" opening file",arg)

pageset=wiki_pageset.csv_load_pageset(arg)

pageset2=[]

for page in pageset:

editors=set()

if verbose==2: print(" going thru pageset")

for revision in page.revisions:

editors.add(revision["contributorID"])

if len(editors)>1:

break

else:

pageset2.append(page)

wiki_pageset.csv_store_pageset(arg[:-4]+'.one_edtr', pageset2)

if verbose: print(" done")

elif options.inexp_editor:

for arg in args:

if verbose: print("opening file", arg)

pageset_listform=utility.csv_read(arg)

if not options.gt_ips or not options.gt_ids:

print("""ERROR. you need to provide a list of

'experienced users' for this operation... both

by ip and userid. see --help""")

sys.exit()

editor_db = one_authorize.EditsByUser(verbose=verbose)

pageset2 =editor_db.get_inx_pages(pageset_listform,

ips_gt=options.gt_ips,

ids_gt=options.gt_ids)

utility.csv_write(arg[:-4]+'.inx_edtr', pageset2)

if verbose: print(" done")

elif options.id_list or options.title_list:

if options.id_list:

ext='.pageids'

columnpos=1

else:

ext='.titles'

columnpos=0

for arg in args:

if verbose: print(" opening file",arg)

f_arg=open(arg,'r')

f_output=open(arg+ext,'w')

f_arg.seek(0,2)

eof_loc=f_arg.tell()

f_arg.seek(0)

while f_arg.tell() < eof_loc:

line_buffer=[]

for i in range(800):

line_buffer.append(f_arg.readline())

while '' in line_buffer: line_buffer.remove('') #in case we exceed the end of the file

if verbose: print(" progress:", float(100*f_arg.tell())/eof_loc)

splitted=wiki_pageset.csv_load_pageset(line_buffer, isfile=False)

page_attr_list=[x[columnpos] + '\n' for x in splitted]

f_output.writelines(page_attr_list)

f_output.flush()

del splitted

del page_attr_list

f_output.close()

f_arg.close()

if verbose: print(" done.",arg)

elif options.tally:

editor_db = one_authorize.EditsByUser(verbose=verbose)

if verbose: print(" start")

editor_db.fill_edit_db(input_files=args, editcount_folder='/opt/editcounts/')

if verbose: print(" done")

pageparser_db.py

Much of this is obsolete and no longer used.

sqlite is rather no good for some high-load things, I feel... just kidding, I'm just no good at sqlite optimization.

import sqlite3,sys

#5555555555555555
# DB operations 5
#5555555555555555

ID_TO_NAME = {}

ID_TO_NAME['filename']='ids_to_names.sqlite'

ID_TO_NAME['creation_schema']="CREATE TABLE contributors(contributorID text PRIMARY KEY,username text)" #USERIDS MUST NOT BE STRONGLY TYPED AS INTS: SEVERAL OF THE EARLIER IDS WERE IN ASCII, AND ARE NOT CONVERTIBLE TO INTS.

ID_TO_NAME['table_list']=['contributors']

EDITCOUNT = {}

EDITCOUNT['filename']='editcount.sqlite'

EDITCOUNT['creation_schema']="CREATE TABLE total_edits(contributorID INTEGER PRIMARY KEY,editcount INTEGER)" #USERIDS MUST NOT BE STRONGLY TYPED AS INTS (in sqlite, int is the only type which can be strongly typed, and that is by using the term INTEGER): SEVERAL OF THE EARLIER IDS WERE IN ASCII, AND ARE NOT CONVERTIBLE TO INTS.

EDITCOUNT['table_list']=['total_edits']

REDIRECTS = {}

REDIRECTS['filename']='redirects.sqlite'

REDIRECTS['creation_schema']="CREATE TABLE redirects(idnum INTEGER NOT NULL UNIQUE)"

REDIRECTS['table_list']=['redirects']

def connect_base(filename, creation_schema, table_list):

base=sqlite3.connect(filename)

cu=base.cursor()

cu.execute("select tbl_name from sqlite_master where type='table' order by tbl_name")

tables = []

for row in cu.fetchall():

tables.extend(row)

#print(repr(tables))

if tables==[]:

cu.execute(creation_schema)

base.commit()

elif table_list[0] not in tables:

print(filename, " db has unknown schema. please fix manually.")

sys.exit()

return base, cu

def connect_contributor_id_base():

return connect_base(ID_TO_NAME['filename'],

ID_TO_NAME['creation_schema'], ID_TO_NAME['table_list'])

def connect_editcount_base(basemodulo):

return connect_base('/opt/editcounts/'+str(basemodulo)+EDITCOUNT['filename'],

EDITCOUNT['creation_schema'], EDITCOUNT['table_list'])

def connect_redirect_base():

return connect_base(REDIRECTS['filename'],

REDIRECTS['creation_schema'], REDIRECTS['table_list'])

#<<<<<<<<<<<<>>>>>>>>>>>>
#< ID_to_Name functions >
#<<<<<<<<<<<<>>>>>>>>>>>>

def associate(contributorID, username):

base, cu = connect_contributor_id_base()

try:

results=cu.execute('INSERT INTO contributors(contributorID,username) values (?,?)', (contributorID,username))

except sqlite3.IntegrityError:

return None

base.commit()

base.close()

return results

def get_username(contributorID):

base, cu = connect_contributor_id_base()

cu.execute('SELECT username FROM contributors WHERE contributorID=?',(contributorID,))

rows=[]

for row in cu.fetchall():

rows.extend(row)

base.close()

return rows

wiki_pageset.py

For understanding and filtering sets of page history: for bots, redirects, etc.

parser.py is usually used to load and call the classes and functions in here.
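For orientation, here is a minimal sketch (not part of the original scripts) of how this module typically gets driven, loosely following the -f and -3 paths in parser.py; the pageset filename and the empty bot lists are placeholder assumptions:

import wiki_pageset

#load a previously stored pageset CSV back into PageHistory objects
pageset = wiki_pageset.csv_load_pageset('1.block.csv')  #placeholder filename

#filter it: PageFilter is built with bot name/id lists, then clean() drops
#user/talk pages and, optionally, bot revisions, redirects and usernames
cleaner = wiki_pageset.PageFilter(verbose=1, bot_names=[], bot_ids=[])
pageset = cleaner.clean(pageset,
                        rm_bot_revisions=False,
                        rm_user_talk=True,
                        rm_redirects=False,
                        associate_to=False,
                        associate_from=False,
                        rm_usernames=False)

#keep only the pages whose every revision came from a single contributor
pageset = wiki_pageset.only_one_contributor(pageset)

#write the result back out as a CSV pageset
wiki_pageset.csv_store_pageset('1.block.f.csv', pageset)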

import re, utility, pageparser_db, sqlite3

try: from urllib.parse import quote

except: from urllib import quote

from time import time #for benchmarking purposes

class PageHistory():
    def __init__(self):
        self.title=None
        self.idnum=None
        self.revisions=[]

def csv_store_pageset(filename, cleaned_pageset):
    """a pageset is a list [] of PageHistory objects"""
    #WARNING: strips all username and character data
    writable_pageset = [utility.flatten_list([page.title, page.idnum,
                        [revision['contributorID'] for revision in \
                        page.revisions]]) for page in cleaned_pageset]
    #for page in cleaned_pageset:
    #    page.revisions = [revision['contributorID'] for revision in page.revisions]
    #    writable_pageset[-1].extend([page.title, page.idnum, page.revisions])
    if filename.split('.')[-1] !='csv':
        filename+='.csv'
    utility.csv_write(filename,writable_pageset)
    return True

def csv_load_pageset(filename, isfile=True):
    pageset=[]
    csv_data = utility.csv_read(filename, isfile)
    for row in csv_data:
        pageset.append(PageHistory())
        pageset[-1].title=row[0]
        pageset[-1].idnum=row[1]
        pageset[-1].revisions=[{'contributorID':contributorID, 'username':'', 'comment':''} for contributorID in row[2:]]
    return pageset

# Massive pageset filterer

class PageFilter():

def __init__(self, verbose=0,bot_names=[],bot_ids=[]):

self.verbose=verbose

if self.verbose: print(" loading data to clean pagesets")

#redirect stuff....

#int version (by pageid, but those don't always work, trust me...

"""redirect_list=[int(x) for x in redirect_list]

dictum={}

for i in range(100):

dictum[i]=[]

for item in redirect_list:

dictum[item % 100].append(item)"""

#str version

#f_r_list=open('TLR4')

#redirect_list=f_r_list.readlines()

#dictum={}

#for item in redirect_list:

# if item[:2] not in dictum:

# dictum[item[:2]]=[]

# dictum[item[:2]].append(item.rstrip())

#PageFilter.redirect_complex=dictum

#del redirect_list

#f_r_list.close()

PageFilter.bot_ids=bot_ids

PageFilter.bot_names=bot_names

if self.verbose==2: print(" Connecting to sqlite database of userid-username pairs.")

#sqlite database with a single table with userid as primary key and username as the other value

PageFilter.id_base, PageFilter.id_cu= \

pageparser_db.connect_contributor_id_base()

def clean(self, pageset, rm_bot_revisions=True, rm_user_talk=True,

rm_redirects=True, associate_to=False, associate_from=False,

rm_usernames=True):

verbose=self.verbose #alias: the rest of this method refers to plain 'verbose'

if verbose==2: timer={"redirects":0,"user_talk":0,

"associate to/from":0, "revisions":0, "bot_revisions":0,

"bots2":0, "rm_usernames":0, "rm_unnec_revisions":0,

"rm_unnec_pages":0,"commit":0}

if verbose==2: eop=len(pageset)

if verbose==2: prev='0'

unnec_pages = []

if associate_from: PageFilter.id_cu.execute('BEGIN;')

for pagenum in range(len(pageset)):

if verbose==2: tmptime=time()

if verbose==2: cur=str(int((pagenum/float(eop))*100))

if verbose==2:

if self.verbose and len(cur)>1 and cur[0] != prev[0]: print(cur)

if verbose==2: prev=cur

if rm_redirects:

title=pageset[pagenum].title

#idnum=int(pageset[pagenum].idnum)

#if idnum in PageFilter.redirect_complex[idnum%100]:

if title[:2] in PageFilter.redirect_complex and \

title in PageFilter.redirect_complex[title[:2]]:

if verbose==2: timer['rm_unnec_pages']+=1

if verbose==3: print('found redirect', title)

unnec_pages.append(pagenum)

if verbose==2: timer['redirects']+=(time()-tmptime)

if verbose==2: tmptime=time()

if rm_user_talk:

if re.search('(?i)^(talk|help((\s|\%20)talk)?|wikipedia((\s|\%20)talk)?|user((\s|\%20)talk)?|image((\s|\%20)talk)?|file((\s|\%20)talk)?|category((\s|\%20)talk)?|template((\s|\%20)talk)?|portal((\s|\%20)talk)?)(:|\%3A)',

pageset[pagenum].title):

unnec_pages.append(pagenum)

continue

if verbose==2: timer['user_talk']+=(time()-tmptime)

unnec_revisions=[]

for revision_num in range(len(pageset[pagenum].revisions)):

revision=pageset[pagenum].revisions[revision_num]

if verbose==2: tmptime=time()

if associate_to:

PageFilter.id_cu.execute('SELECT username FROM contributors WHERE contributorID=?',(revision['contributorID'],))

name=PageFilter.id_cu.fetchone()

if name:

pageset[pagenum].revisions[revision_num]['username'] = name[0]

elif associate_from and revision['username']: #associate from pageset into base

try:

PageFilter.id_cu.execute('INSERT INTO ' + \

'contributors(contributorID,username)' + \

'values (?,?)', (revision['contributorID'],

str(revision['username'])))

except sqlite3.IntegrityError:

pass

if verbose==2: timer['associate to/from']+=(time()-tmptime)

if rm_bot_revisions:

if verbose==2:tmptime=time()

if revision['username'] in PageFilter.bot_names or \

revision['contributorID'] in PageFilter.bot_ids:

unnec_revisions.append(revision_num)

if verbose==2: timer['bot_revisions']+=1

elif 'bot' in revision['username'][-4:].lower() or \

'bot' in revision['comment'].lower():

#print("possible bot detection - ", revision['username'],

#"not on list...")

unnec_revisions.append(revision_num)

if verbose==2: timer['bot_revisions']+=1

if verbose==2: timer['revisions']+=1

if verbose==2: timer['bots2']+=(time()-tmptime)

if verbose==2: tmptime=time()

if rm_usernames:

pageset[pagenum].revisions[revision_num] = {'contributorID':revision['contributorID']} #this must occur AFTER botcheck.

if verbose==2: timer['rm_usernames']+=(time()-tmptime)

unnec_revisions.reverse() #items must be removed in reverse order

#or a removal will shift the index numbers of all later list items

for entry_num in unnec_revisions:

del pageset[pagenum].revisions[entry_num]

#tmptime=time()

if verbose==2: timer['commit']=len(pageset)

unnec_pages.reverse() #items must be removed in reverse order

for entry_num in unnec_pages:

del pageset[entry_num]

#timer['rm_unnec_pages']+=(time()-tmptime)

#tmptime=time()

if verbose==2: print(" committing id base")

PageFilter.id_base.commit()

#timer['commit']+=(time()-tmptime)

if verbose==2: print(" done cleaning.")

if verbose==2:

for i in timer:

print(" ", i, " | ", str(timer[i])[:5])

return pageset

def only_one_contributor(pageset):
    one_author_pageset=[]
    for pagehistory in pageset:
        num_authors=set([x['contributorID'] for x in pagehistory.revisions])
        if len(num_authors)==1:
            one_author_pageset.append(pagehistory)
    return one_author_pageset

xml_to_pageset.py

The core of making use of all that XML.

parser.py is usually used to load and call the classes and functions in here.
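A minimal sketch (assumed, based on the -1 path in parser.py) of how the SAX handler below is driven; the .block path is a placeholder for one of the files produced by xmlsplitter.py:

from xml.sax import make_parser
import wiki_pageset
from xml_to_pageset import WikiXMLParser

parser = make_parser()
parser.setContentHandler(WikiXMLParser(verbose=1))
parser.parse('output/1.block')  #placeholder path to one split block

#after parsing, the handler holds a list of PageHistory objects
pages = parser.getContentHandler().pages
wiki_pageset.csv_store_pageset('output/1.block.csv', pages)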

from xml.sax import make_parser, handler

try: from urllib.parse import quote

except: from urllib import quote

import wiki_pageset

class WikiXMLParser(handler.ContentHandler):

"""Converts the XML data into a form that can be more

easily handled en Masse by python. While it is doing

this, it strips the data of everything but page titles,

page ids, and a list of revisions for each page. The

list of revisions includes only the contributor and the

comment, (including both the comment and the contributor

name as well as ID or IP as to provide an opportunity to

filter out bots), and does not even include dates"""

important_tags = {

('contributor','revision'):'contributor',

('username','contributor'):'username',

('comment','revision'):'comment',

('revision','page'):'revision',

('id','page'):'pageID',

('id','contributor'):'contributorID',

('ip','contributor'):'contributorID',

('title','page'):'pagetitle'

}

important_tags_reverse={}

for tag in important_tags:

important_tags_reverse[(tag[0],important_tags[tag])]=tag[1]

def __init__(self, verbose=0):

self.verbose=verbose

pass

def set_filename(self, filename): self.filename=filename

def startDocument(self):

self._elems = 0

self._attrs = 0

self.pages = []

self.parent = 'page'

self.current = None

if self.verbose: print(' reading XML...')

def startElement(self, name, attrs):

self._elems = self._elems + 1

#self._attrs = self._attrs + len(attrs)

if name == 'page':

self.current = wiki_pageset.PageHistory()

self.parent='page'

elif name == 'revision':

self.current.revisions.append({'contributorID':'', 'username':'', 'comment':''})

self.parent = 'revision'

elif (name,self.parent) in WikiXMLParser.important_tags:

self.parent = WikiXMLParser.important_tags[(name,self.parent)]

def endElement(self, name):

if name == 'page':

self.pages.append(self.current)

del self.current

elif (name,self.parent) in WikiXMLParser.important_tags_reverse:

self.parent=WikiXMLParser.important_tags_reverse[(name,self.parent)]

def characters(self, content):

if self.parent == 'pagetitle':

self.current.title = quote(content)

elif self.parent == 'pageID':

self.current.idnum = content

elif self.parent in ['contributorID', 'username', 'comment']:

self.current.revisions[-1][self.parent]=quote(content)

def endDocument(self):

if self.verbose: print(" cool stats: ", self._elems, "elements.")

#if self.verbose: print(" There were", self._attrs, "attributes.")

return self.pages

one_authorize.py

All-in-one for creating a tally of how many edits each author has made (on the assumption of a complete and non-redundant set of CSV pagesets), and for removing pages from a pageset based on user editcounts.

parser.py is usually used to load and call the classes and functions in here.
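A minimal sketch (assumed, following the -2, -t and -4 paths in parser.py) of how EditsByUser gets used; all filenames are placeholders, and the gt lists are the ones produced by the shell commands further down:

import utility, wiki_pageset, one_authorize

editor_db = one_authorize.EditsByUser(verbose=1)

#option -2: tally edits per userid/ip for one pageset and dump the counts
pageset = wiki_pageset.csv_load_pageset('1.block.csv')          #placeholder
userids, ip_addrs = editor_db.get_edits_by_user(pageset)
utility.csv_write('1.block.editors_ids.csv', userids)
utility.csv_write('1.block.editors_ips.csv', ip_addrs)

#option -t: merge all the per-pageset counts into blocked totals on disk
editor_db.fill_edit_db(input_files=['1.block.editors_ids.csv'],
                       editcount_folder='/opt/editcounts/')

#option -4: keep only pages whose single author is not on the
#"experienced" lists (iplist_gt_edits / idlist_gt_edits, see shell commands)
pagelist = utility.csv_read('1.block.one_edtr.csv')             #placeholder
inexperienced = editor_db.get_inx_pages(pagelist,
                                        ips_gt='iplist_gt_edits',
                                        ids_gt='idlist_gt_edits')
utility.csv_write('1.block.inx_edtr.csv', inexperienced)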

from wiki_pageset import PageHistory

from math import ceil, floor

from time import time

import re,operator,os,sys

import utility

class EditsByUser():

def __init__(self, verbose=0):

self.verbose=verbose

#if self.verbose==2: print(" Connecting to sqlite database of userid edit tables.")

#sqlite database with a single table with userid as primary key and username as the other value

#PageFilter.edit_bases={}

#PageFilter.edit_cursors={}

#for i in range(1000):

# PageFilter.edit_bases[i], PageFilter.edit_cursors[i]= \

# pageparser_db.connect_editcount_base(i)

self.interval_dicts_done=0

self.ip_regex=re.compile('^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$')

def ip_to_int(self, valuelist):

return int(valuelist[0])*16777216+\

int(valuelist[1])*65536+\

int(valuelist[2])*256+\

int(valuelist[3])

def int_to_ipstr(self, number):
    #integer (floor) division so each octet comes out as an int, not a float
    ip= [((number%(256**4))//256**3),
         ((number%(256**3))//256**2),
         ((number%(256**2))//256**1),
         ((number%(256**1))//256**0)]
    return '.'.join([str(x) for x in ip])

def get_edits_by_user(self, pageset):

#get data of pageset

if self.verbose: print(" organizing editor data for storage")

ip_list={}

id_list={}

for page in pageset:

for revision in page.revisions:

userid=revision['contributorID']

is_ip=re.findall(self.ip_regex,userid)

if is_ip:

userid=self.ip_to_int(is_ip[0])

if userid not in ip_list:

ip_list[userid]=1

else: ip_list[userid]+=1

elif re.search('^\d+$', userid):

userid=int(userid)

if userid not in id_list:

id_list[userid]=1

else: id_list[userid]+=1

id_list=[[x, id_list[x]] for x in sorted(id_list)]

ip_list=[[x, ip_list[x]] for x in sorted(ip_list)]

return id_list,ip_list

def interval_dicts(self):

self.id_dict = {

'upper':10000000, #in reality, currently users peak at 8mili, but at least for the next year and a half or so it'll stay under ten mill, prob.

'lower':1,

'ext':'ids'

}

self.ip_dict = {

'upper':4294967296,

'lower':16777216,

'ext':'ips'

}

self.user_dicts={'ip':self.ip_dict,'id':self.id_dict}

for user_dict in [self.ip_dict, self.id_dict]:

user_dict['interval']=ceil(float(user_dict['upper']-user_dict['lower'])/100)

user_dict['user_blocks']=[]

user_dict['input_files']={} #although it's not inconceivable that base-10 IPs and IDs could be stored in harmony in the same file, I suspect that, barring some kind of apocalyptic kinda thing, or peak oil, the number of editors will double in the next decade, resulting in the inevitable collision. While adjusting the upper limits of users is a predictable problem, this is something which would be hard to figure out. yah like this script is going 4 ten years.

for i in range(100):

user_dict['user_blocks'].append(i*user_dict['interval'])

def fill_edit_db(self, input_files=[], editcount_folder='/opt/editcounts/'):

if self.interval_dicts_done==0:

self.interval_dicts()

#input files: a list of valid file addresses, each of which either contains a list of base-10 IPs or wikipedia editor IDs with a number of edits next to it.

#editcount_folder - the folder to put the total counts.

if self.verbose: print(' Categorizing input editcount files')

for filename in input_files: #all files are assumed to exist at this point, and be a

boxed=False

for user_dict in [self.ip_dict, self.id_dict]:

if user_dict['ext'] in filename:

boxed=True

user_dict['input_files'][filename]=0

if not boxed:

print("Error! The filename", filename, " is not clearly distinguishable as either an ip or userid editcount file.")

for user_dict in [self.id_dict]:

if len(user_dict['input_files'])==0:

if self.verbose: print(' Beginning editcount set ' + user_dict['ext'])

if self.verbose: print(' Found no files which contained editcounts by ' + user_dict['ext'])

continue

for block_num,block in enumerate(user_dict['user_blocks']): #ranges of possible user ids or ips

#for block_num in range(23,24): #ranges of possible user ids or ips

if self.verbose: print(' starting new block', block_num, 'out of 100 blocks...')

loc_block=editcount_folder+'edits.'+user_dict['ext']+'.'+str(block)+'.txt'

block_data={}

if os.path.isfile(loc_block):

if self.verbose: print(' loading old block data', loc_block)

unformatted=[[int(x),int(y)] for x,y in utility.csv_read(loc_block)]

block_data=dict(unformatted)

i=0

timer={'open/seek':0,'tell':0,'readline':0,

'interpret':0,'compare':0, 'incl':0}

for filename in sorted(user_dict['input_files']):

i+=1

if i%100==0 and self.verbose==2:

print(os.path.basename(filename))

for item in timer:

print(" ", item, " | ", str(timer[item]))

tmptime=time()

f_source = open(filename,'r')

f_source.seek(user_dict['input_files'][filename])

timer['open/seek']+=(time()-tmptime)

while True:

tmptime=time()

timer['tell']+=(time()-tmptime)

tmptime=time()

user_dict['input_files'][filename]=f_source.tell()

data=f_source.read(2000)

row_block=data.split('\n')

if len(row_block)==1:

break

if row_block[-1] != '':

newdata='bleaugh'

while newdata != '\n' and newdata != '':

newdata=f_source.read(1)

if newdata =='\n':

row_block.append('')

else:

row_block[-1]+=newdata

breaker=False

for row_num in range(len(row_block)):

tmptime=time()

row=row_block[row_num]

if row=='':

break

user,edits=[int(x) for x in row.split(',')]

timer['interpret']+=(time()-tmptime)

tmptime=time()

#if user==2332919:print(filename, "a",user)

if user >= block:

if user >= block+user_dict['interval']:

breaker=True

break

timer['compare']+=(time()-tmptime)

tmptime=time()

if user not in block_data:

block_data[user]=edits

#if user==2332919:print("b",user, block_data[user])

else:

block_data[user]+=edits

#if user==2332919:print("c",user, block_data[user])

timer['incl']+=(time()-tmptime)

#if user==2332919:print("d",user, block_data[user])

if breaker: break

f_source.close()

writable = sorted(block_data.items(),key=operator.itemgetter(0))

f_block=open(loc_block, 'w')

for item in writable:

f_block.write(str(item[0])+','+str(item[1])+'\n')

f_block.flush()

f_block.close()

safety_valve_progress=editcount_folder+\

'safety_valve_progress.'+ user_dict['ext'] + str(block) + '.txt'

utility.csv_write(safety_valve_progress,

sorted(user_dict['input_files'],key=operator.itemgetter(0)))

def activate_gt(self, ips_gt, ids_gt):

try:

if self.gt:

return True

except:

self.gt={'ip':[int(x.rstrip()) for x in open(ips_gt,'r')],

'id':[int(x.rstrip()) for x in open(ids_gt,'r')]}

def get_inx_pages(self, pagelist,

limit=50, ips_gt=None, ids_gt=None):

"""

pagelist=just any list of lists where the last element of each itemlist is a str userid or a str base-256 ip addr.

if the user or ip is found to be inexperienced,

all elements but the last element are included as one of many in a results list.

ips_gt=sorted list of base-10 ips with a number of edits

that exceed the number of edits that qualify

them as 'experienced,' and thus should return a false value.

ips_lt=sorted list of userids, same as above

limit = not implemented yet. in future, will automate creation

and use of ips_gt, ips_lt from the folder where editcounts were tallied by user.

"""

#returns a list of only the pages which have *less* edits than the limit

results=[]

pagelist2={'ip':[],'id':[]}

for page in pagelist:

userid=page[-1]

is_ip=re.findall(self.ip_regex,userid)

if is_ip:

page[-1]=self.ip_to_int(is_ip[0])

pagelist2['ip'].append(page)

elif re.search('^\d+$', userid):

page[-1]=int(userid)

pagelist2['id'].append(page)

for setname in ['ip','id']:

pagelist2[setname]=sorted(pagelist2[setname],key=operator.itemgetter(-1))

users_shadow=[x[-1] for x in pagelist2[setname]]

inx_list=self.has_less_edits_than(setname=setname,

usernames=users_shadow,ips_gt=ips_gt,ids_gt=ids_gt)

for i in range(len(inx_list)):

if inx_list[i]:

results.append(pagelist2[setname][i])

return results

def has_less_edits_than(self, setname='ip',

usernames=[], ips_gt=None, ids_gt=None):

"""

usernames = list of names to test. Returned list of bools based on test.

ips_gt=sorted list of base-10 ips with a number of edits

that exceed the number of edits that qualify

them as 'experienced,' and thus should return a false value.

ips_lt=sorted list of userids, same as above

limit = not implemented yet. in future, will automate creation

and use of ips_gt, ips_lt from the folder where editcounts were tallied by user.

"""

#returns a list of only the users which have *less* edits than the limit

self.activate_gt(ips_gt,ids_gt)

results=[]

userlist=usernames

len_userlist=len(userlist)

gtlist=sorted(self.gt[setname]) #both gtlist and userlist should supposedly be sorted and of the same type by this line, making the following algorithm pretty efficient.

len_gtlist=len(gtlist)

user_cursor=0

gt_cursor=0

last_res=0

print(setname,len_userlist)

bcs=0

ds=0

while user_cursor!=len_userlist:

user=userlist[user_cursor]

print(len_userlist, user_cursor, len_gtlist,gt_cursor)

gtpos=gtlist[gt_cursor]

#if user==104025: print('a',user,gtpos,user_cursor,gt_cursor)

#104523

if gtpos < user:

if last_res==-1:

user_cursor+=1

results.append(True)

last_res=0

#if user==104025: print('bI',user,gtpos,user_cursor,gt_cursor)

bcs+=1

else:

if gt_cursor+1 < len_gtlist:

gt_cursor+=1

last_res=1

else:

user_cursor+=1

#if user==104025: print('bII',user,gtpos,user_cursor,gt_cursor)

elif gtpos > user:

if last_res==1:

user_cursor+=1

results.append(True)

last_res=0

#if user==104025: print('cI',user,gtpos,user_cursor,gt_cursor)

bcs+=1

else:

if gt_cursor>0:

gt_cursor-=1

last_res=-1

else:

user_cursor+=1

#if user==104025: print('cII',user,gtpos,user_cursor,gt_cursor)

elif gtpos == user:

results.append(False)

user_cursor+=1

last_res=0

#if user==104025: print('d',user,gtpos,user_cursor,gt_cursor)

ds+=1

print('d',ds,'bc',bcs)

return results

utility.py

I know, I know, more descriptive names, I'll give it one.

This is just a set of toolbox functions I typically carry with me everywhere

#utility.py
#V1

DEBUG=True

import pickle, textwrap, os, csv

from glob import glob

def pickle_data(file_addr, data):

f_pickle=open(file_addr,'wb')

pickle.dump(data, f_pickle)

f_pickle.flush()

f_pickle.close()

def unpickle_data(file_addr, defaultobject=None):

if os.access(file_addr, os.R_OK):

return pickle.load(open(file_addr,'rb'))

else:

data=defaultobject

pickle_data(file_addr,data)

return data

def flatten_list(list_item):

product=list()

for x in list_item:

if type(x) != list:

product.append(x)

elif list in [type(y) for y in x]:

product.extend(flatten_list(x))

else:

product.extend(x)

return product

def glob_list(args1):

args2=[]

for arg in args1:

args2.extend(glob(arg))

return args2

def dbgmsg(text,links=False):

if DEBUG:

if links:

print(" DEBUG: " + text)

else:

print(textwrap.fill(" DEBUG: " + text))

def csv_write(filename, rowlist):

f_csv=open(filename,'w')

writer=csv.writer(f_csv)

writer.writerows(rowlist)

f_csv.flush()

f_csv.close()

return True

def csv_read(filename, isfile=True):

if isfile:

f_csv=open(filename, 'r')

reader=csv.reader(f_csv)

else:

reader=csv.reader(filename)

rowlist=[]

for row in reader:

rowlist.append(row)

del reader

if isfile: f_csv.close()

return rowlist

serch.py

This is the way to update editor data from the website in real time.

Incredibly slow, and heavy on the server.

That's why you only use this on the list of pages which had a single editor as of your most recent version of the stub-meta-history file. Then it is about 1/26th the number of pages to check, and it doesn't take several months and dozens of GB of transfer.
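To make the server load concrete, here is a minimal sketch of the two Special:Export URLs requested for each title (the title is just an example; make_urls/get_specific_link below do the real work):

from urllib.parse import quote

prefix = 'http://en.wikipedia.org/w/index.php?title=Special:Export&pages='
pagename = quote('Example article')  #titles in the real input list are already quoted

current_url = prefix + pagename + '&limit=1&action=submit'            #latest revision only
history_url = prefix + pagename + '&limit=10&action=submit&history'   #last few revisions

#one 'current' fetch to rule out redirects, then one 'data' fetch per
#surviving page - which is why this is only run on the short list.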

#!/bin/python
#tool for checking real time from a list of wikipage titles
#whether the page has more than one contributor,
#is a redirect, or has templates, and such things.
#but because this tool is rather slow and heavy on
#the server load... better to use it on small list
#of wikipages just to keep them up2date.

import csv

from urllib.parse import quote

import os

import sys

import re

from hashlib import md5

from utility import *

def wget(link,outfile):

os.system('wget -q "' + link + '" -O "' + outfile + '"')

def make_urls():

#URL addresses for finding out information about pages.

url_book = {

'current' : {

'prefix':'http://en.wikipedia.org/w/index.php?title=Special:Export&pages=',

'suffix':'&limit=1&action=submit'

},

'data' : {

'prefix':'http://en.wikipedia.org/w/index.php?title=Special:Export&pages=',

'suffix':'&limit=10&action=submit&history'

}

}

return url_book

def get_specific_link(url_book, pagename):

link = dict()

for linkaddr in ['current','data']:

link[linkaddr]=url_book[linkaddr]['prefix'] + \

pagename + url_book[linkaddr]['suffix']

return link

def read_link(url_to_get, localaddr):

#returns file handle of a page

#downloaded from the internet

#to location 'localaddr'.

os.system('rm ' + localaddr)

wget(url_to_get,localaddr)

pagesrc = open(localaddr,'r')

return pagesrc

"""class HistoryChecker():

def __init__(self):

def load_from_web(self, web_addr):

dbgmsg('getting contributors')

f_contrib=read_link(web_addr,'/tmp/contrib.txt')

dbgmsg('done')

self.contrbrs=f_contrib.readlines()

self.contrbrs=[re.sub('^.*?\t(.*?)\t.*','\g<1>',x).rstrip() for x in self.contrbrs]

self.contrbrs=self.de_bot(self.contrbrs)

f_contrib.close()

return True

def gauntlet(self, level=0):

if level >=0:

for test in [self.check_max_editors,

self.check_min_editors]:

if not test(self.contrbrs): return False

#if level >=1:

# for test in [self.check_editor_bg]:

# if not test(): return False

#if level >=2:

# pass

dbgmsg("PASSED level " + str(level) + " contributor check.")

return True"""

class ContentChecker():

def __init__(self):

f_bot=open('bot_list.txt', 'r')

self.bot_list = f_bot.readlines()

self.bot_list=[x.rstrip().lower() for x in self.bot_list]

def test_if_redirect(self, pagename, web_addr):

f_page = read_link(web_addr,'/tmp/x.xml')

data=f_page.read()[:2750]

if not re.search('<title>(.+?)</title>',data):

self.is_not_redirect=False #okay, well technically it's probably a defunct page, but whatever, nomenclature later...

return self.is_not_redirect

if quote(re.search('<title>(.+?)</title>',data).group(1)) == pagename:

if not re.search(">\s*\#redirect(\s|$)", data.lower()):

print("not a redirect")

self.is_not_redirect=True

return self.is_not_redirect

print("a redirect")

self.is_not_redirect=False

return self.is_not_redirect

def load_from_web(self, web_addr):

if self.is_not_redirect:

dbgmsg('getting content') #if we wanted to read content

#from database, this is where we'd do it instead.

#the parameter would be something like pagename instead.

f_page = read_link(web_addr,'/tmp/x.xml')

self.data = f_page.read().lower()

self.editors=self.get_editors(self.data)

else:

self.data =''

self.editors=''

return True

def gauntlet(self, level=0):

if not self.is_not_redirect: return False

if level>=0:

for test in [self.check_still_exists]:

#self.check_not_redirect]:

if not test(self.data):return False

for test in [self.check_max_editors]:

if not test(self.editors):return False

if level>=1:

for test in [self.check_no_template]:

if not test(self.data): return False

if level>=2:

pass

dbgmsg("PASSED level " + str(level) + " content check.")

return True

def de_bot(self, usernames):

usernames2=[]

for name in usernames:

if 'bot' not in name[-5:].lower() and \

name not in self.bot_list:

usernames2.append(name)

return usernames2

#def check_not_redirect(self, pagedata):

# if re.search("\n\s*\#redirect(\s|$)", pagedata):

# dbgmsg("X: wiki page is a redirect")

# return False

# return True

def get_editors(self,pagedata, revision_count=9):
    #suggested: pagedata incl at least 5 revisions
    pagedata2=pagedata.split('\n')
    editors=set()
    contributor_block=False
    for line in pagedata2:
        if '<contributor>' in line:
            contributor_block=True
        elif not contributor_block:
            continue
        elif '</contributor>' in line:
            contributor_block=False
            if len(editors)==revision_count:
                print(repr(editors))
                break
        elif '<username>' in line:
            editors.add(re.sub('^\s*<username>(((?!</username>).)*)</username>\s*$','\g<1>',line))
        elif '<ip>' in line:
            editors.add(re.sub('^\s*<ip>(((?!</ip>).)*)</ip>\s*$','\g<1>',line))
    return self.de_bot(editors)

def check_still_exists(self, pagedata):

#this is only useful if our source of content data

#is more recent than our page title list. Say if we're getting

#content live from wikipedia's "special:export" function.

pagehash=md5(self.data.encode())

if pagehash.hexdigest() in ['caa3fe485e6f6518af1e5ea59e131f68','3a98a2e740d741a7750f034a99e70025','f8f49e37b4c4bff5ecac639237a0129f']:

#the hash of the uppercased XML returned when you use the URL

#of a non-existent page.

dbgmsg("X: wiki page no longer exists")

return False

else:

print(pagehash.hexdigest())

return True

def check_no_template(self, pagedata):

if re.search("{{", pagedata):

dbgmsg("X: has a template")

return False

return True

def check_max_editors(self, contributors):

if len(contributors) > 1:

dbgmsg("X: >1 contributors")

print(repr(contributors))

return False

print(repr(contributors))

return True

def check_min_editors(self,contributors):

#this test may be excluded if you think it's

#important to check bot created pages for sanity

if len(contributors)==0:

dbgmsg("X: only bot contributors")

return False

return True

def main(titlefile):

loc_addrfile='stored_data.pickle'

lastloc=unpickle_data(loc_addrfile,0)

url_book=make_urls()

loc_output='./results/results'

#loop is designed around iterating through the title file,

#not through a variable holding all its data.

#this means we can loop thru large title files (which would

#freeze us up if put in memory.

f_titles = open(titlefile, 'r')

f_titles.seek(0, 2) #find the byte address of the end of file.

loc_end_of_file=f_titles.tell()

f_titles.seek(lastloc)

l=0

while f_titles.tell() < loc_end_of_file:

one_author_only=[] #temp repository of pages that we've found

#to have one author

handful = [] #handful of pages to check

dbgmsg("getting titles")

for i in range(0,100):

line=f_titles.readline()

if line:

handful.append(line.rstrip()) #assumes title list is already quoted

contentcheck=ContentChecker()

#historycheck=HistoryChecker()

for pagename in handful:

#build the URL addresses for getting data about the page

#makes link['h'] -> 'http://...' (url for page history)

link = get_specific_link(url_book, pagename)

dbgmsg(str(lastloc)+'page addr:' + link['current'],links=True)

valid=contentcheck.test_if_redirect(pagename=pagename,web_addr=link['current'])

if not valid: continue

dbgmsg(str(lastloc)+'page addr:' + link['data'],links=True)

contentcheck.load_from_web(link['data'])

valid = contentcheck.gauntlet()

if not valid: continue #next pagename

#historycheck.load_from_web(link['contrib'])

#valid = historycheck.gauntlet()

#if not valid: continue #next pagename

one_author_only.append(pagename+'\n')

dbgmsg("adding new data")

f_results=open(loc_output+str(lastloc)+'.txt','w')

f_results.writelines(one_author_only)

f_results.flush()

f_results.close()

lastloc=int(f_titles.tell())

print('read ' + str(len(handful)) + ' pages\' history, of which ',

str(len(one_author_only)),

' met conditions. We are at:', lastloc)

dbgmsg("storing data")

pickle_data(loc_addrfile, lastloc)

l+=1

if __name__ == '__main__':

main(titlefile=sys.argv[1])

"""

This program updates via the internet all the suspected one-author pages to see whether it's still true. It breaks the list down into a bunch of files in the results folder; together those files contain all the pages which really do still seem to be one-author-only. Concatenate them into one file afterwards by doing...

python3.0 serch.py [title list file]

cd results

cat *.txt > ../one_author_pages.title

You'll probably want to change this into a pageset so you can remove pages with experienced authors, so here we go, here's how to work backwards and do that.

cd ..

python

import utility

data_based=utility.csv_read('one_author_pages_prelim.csv')

int_based=open('one_author_pages.title')

r2=[x.rstrip() for x in int_based.readlines()]

dictform={}

for page in data_based:
    dictform[page[0]]=page[1:]

for page in r2:
    if page not in dictform:
        print(page) #should return none, as int_based was just a narrowing down of data_based

newcsv=[]
for page in r2:
    a=[page]
    a.extend(dictform[page])
    newcsv.append(a)

utility.csv_write('One_author_Pageset.csv',newcsv)

"""

shell commands

A couple of shell commands I made use of. I need to integrate these into the code, even though it will take more lines when done in Python.

They may seem random and unintuitive, but they're mostly for quickly converting from a pageset to a title list, or for dealing with editcount stuff.

# from /opt/editcounts/*

grep -hiE '^[0123456789]+\,([0123456789]{1,2})\s*$' edits.ids.*.txt > ids_lt_99_edits

grep -hiEv '^[0123456789]+\,([0123456789]{1,2})\s*$' edits.ids.*.txt > ids_gt_99_edits

grep -hiE '^[0123456789]+\,([0123456789]{1,2})\s*$' edits.ips.*.txt > ips_lt_99_edits

grep -hiEv '^[0123456789]+\,([0123456789]{1,2})\s*$' edits.ips.*.txt > ips_gt_99_edits

grep -E "^[^,]+,[0-9]+\s*$" limited.txt > updated_list_as_pageset_bot_made_only

grep -Ev "^[^,]+,[0-9]+\s*$" limited.txt > updated_list_as_pageset_with_humans

sed -r 's/,[0-9]+\s*$//g' ips_gt_99_edits > iplist_gt_edits

# just a list of base 10 ips, doesn't include editcounts

sed -r 's/,[0-9]+\s*$//g' ids_gt_99_edits > idlist_gt_edits

# just a list of ids, doesn't include editcounts

get_redirects.py

Deals with the enwiki-pages.sql file to get a list of redirects for wiki_pageset.py.

Usually called on its own, with a little bit of customization.

import re, random, sqlite3

import pageparser_db

from urllib.parse import quote

d=open('enwiki-20081008-page.sql')

#d=open('page.sql')

redirects = []

d.seek(0,2)

eof_loc = d.tell()

d.seek(0)

i=0

#base, cu=pageparser_db.connect_redirect_base()

f_r=open('redirect_list','w')

# initial page id only? i dunno, seems like it might be good to check for both though, cause this definitely removed some when I used it initially.

"""

while d.tell() < eof_loc:

content=d.read(1000000)

redirect_data=re.findall("\((\d+),\d+,\'.+?\',\'.*?\',\d+,(\d)", content)

for article in redirect_data:

if int(article[1])==1:

if random.randint(1,10000)==500:

redirects.append(article[0])

del redirect_data

i+=1

print("ahoy", str(i))

"""

# title paired with is_redirect

while d.tell() < eof_loc:
    content=d.read(40000000)
    redirect_data=re.findall("\(\d+,\d+,\'(.+?)\',\'.*?\',\d+,1", content)
    for j in range(len(redirect_data)): #j, so we don't clobber the chunk counter i
        redirect_data[j]=quote(re.sub('_', ' ', redirect_data[j]))+'\n'
    f_r.writelines(redirect_data)
    del redirect_data
    i+=1
    if i>5:
        f_r.flush()
    print("ahoy", str(i))
    print("we are at", str(int(100*d.tell()/float(eof_loc))), "%...")

"""

# here, the redirects field comes before the page_latest_id field,
# so we use article 0.

while d.tell() < eof_loc:

content=d.read(40000000)

redirect_data=re.findall("\(\d+,\d+,'.+?','.*?',\d+,1,\d+,[\d\.]+?,'\d+?',(\d+)", content)

for i in range(len(redirect_data)):

redirect_data[i]=redirect_data[i]+'\n'

f_r.writelines(redirect_data)

del redirect_data

i+=1

if i>5:

f_r.flush()

print("we are at", str(int(100*d.tell()/float(eof_loc))), "%...")

"""

f_r.flush()

f_r.close()

list of bots

The bot list used can be found here, though you'll probably want the more recent version from the category page.