User:Matt Crypto/RandomArticles

While Wikipedia has a [http://en.wikipedia.org/wiki/Special:Randompage Random page] feature, it selects pages uniformly at random from the database. As an alternative, I wrote a script that chooses pages at random, weighted by their hit counts for a month; such a set might give a more representative picture of how Wikipedia looks to visitors. The hit data for, say, September 2004 can be found [http://wikimedia.org/stats/en.wikipedia.org/url_200409.html here] (warning: very large file). Below is an example from the hits so far this month (to 22nd September 2004). If you would like a set, just send me a message telling me which Wikipedia page you would like it on, and I'll run the script for you and paste in the output. — Matt 15:06, 21 Sep 2004 (UTC)

100 randomly-selected articles (weighted by popularity)

==Script==

import re

from random import *

# Path of the saved URL statistics page that ArticlePicker.readLogFile() reads.
logFile = "/tmp/url_200409.html"

# Cap on how many parsed entries are loaded; a falsy value such as None
# disables the cap. (The trailing 10000 is presumably a smaller value kept
# around for quick test runs.)
maxEntries = None # 10000

# Number of articles the script selects and prints.
numberOfArticles = 100

# Matches one statistics row for a /wiki/ URL. ArticlePicker.parseLine() uses
# group 1 (the leading digits) as the hit count and group 5 (the text after
# "/wiki/") as the article name; the two percentage groups and the second
# digit group are captured but not used.
r1 = re.compile(r'^(\d*)\s*([0-9.]*)%\s*([0-9]*)\s*([0-9.]*)%\s*/wiki/(\S*)$')

class ArticlePicker:
    """Pick articles at random from a hit-count log, weighted by their hits.

    Typical use: construct with the path of the statistics file, call
    readLogFile() once to load it, then call selectRandomly() as often as
    needed.

    Attributes:
        logFile:    path of the statistics file to read.
        maxEntries: cap on the number of accepted entries (falsy = no cap).
        hitList:    list of (hits, name) tuples, highest hit count first.
        hitSum:     sum of the hit counts in hitList.
        count:      number of entries accepted by the last readLogFile().
    """

    def __init__(self, logFile, maxEntries = False):
        """Remember the log location; nothing is read until readLogFile()."""
        self.logFile = logFile
        self.hitList = []
        self.count = 0
        # Defined up front so selectRandomly() works even before a log is read.
        self.hitSum = 0
        self.maxEntries = maxEntries

    def readLogFile(self):
        """Load (hits, name) entries from self.logFile.

        Lines that parseLine() rejects with ValueError are skipped. Loading
        stops once maxEntries entries have been accepted (when a cap is set).
        Afterwards hitList is sorted by descending hit count and hitSum/count
        describe exactly the entries that were loaded.
        """
        # Start from scratch so a repeated call cannot leave hitList and
        # hitSum out of step with each other.
        self.hitList = []
        self.hitSum = 0
        accepted = 0

        # The context manager closes the file even if parsing blows up.
        with open(self.logFile) as logHandle:
            for line in logHandle:
                # ">=" so that at most maxEntries entries are kept.
                if self.maxEntries and accepted >= self.maxEntries:
                    break
                try:
                    hits, name = self.parseLine(line)
                except ValueError:
                    continue
                accepted += 1
                self.hitList.append((hits, name))
                self.hitSum += hits

        self.count = accepted
        self.hitList.sort(reverse=True)

    def parseLine(self, line):
        """Turn one log line into (hits, name).

        Returns:
            (int hit count, article name with underscores shown as spaces).

        Raises:
            ValueError: the line is not a /wiki/ statistics row, has no hit
                count, or names a page that filterOut() excludes.
        """
        m = r1.match(line.strip())
        if m is None:
            raise ValueError("No matches found")
        # Only the first and last groups are needed; the rest are ignored.
        hits, _unused1, _unused2, _unused3, name = m.groups()
        self.filterOut(hits, name)
        return int(hits), name.replace('_', ' ')

    def filterOut(self, hits, name):
        """Raise ValueError for pages that must not be offered as articles.

        Args:
            hits: hit count of the page (currently not consulted).
            name: page name as it appears in the URL (underscores intact).

        Raises:
            ValueError: the name is blank, has a "prefix:" form, starts with
                "Main_Page", or is one of the known non-article URLs.
        """
        if name == "":
            raise ValueError("blank page name")
        # Exclude namespaces. NOTE(review): this also rejects article titles
        # that begin with word characters and a colon; confirm that is intended.
        if re.match(r'^\w*:', name):
            raise ValueError("namespaced page")
        # Exclude the main page (prefix match, as before).
        if name.startswith("Main_Page"):
            raise ValueError("main page")
        # Exclude popular oddities. Literal prefixes, so "." only matches ".".
        if name.startswith(("_vti_bin/owssvr.dl", "MSOffice/cltreq.asp")):
            raise ValueError("non-article URL")

    def selectRandomly(self, N = 1):
        """Return N entries drawn independently, weighted by hit count.

        Each draw picks a point in [0, total hits) and returns the first entry
        whose running total of hits reaches that point, so an entry's chance
        of being picked is proportional to its hits. The same entry may be
        returned more than once.

        Returns:
            A list of N (hits, name) tuples, or an empty list when no entries
            have been loaded.
        """
        from bisect import bisect_left

        if not self.hitList:
            return []

        # Running totals of the hit counts, in hitList order.
        cumulative = []
        runningTotal = 0
        for hits, _name in self.hitList:
            runningTotal += hits
            cumulative.append(runningTotal)

        outputs = []
        for _draw in range(N):
            target = random() * runningTotal
            # First entry whose running total is >= target.
            outputs.append(self.hitList[bisect_left(cumulative, target)])
        return outputs

# Dump the articles
#
# Loads the hit log, draws numberOfArticles articles weighted by popularity
# and prints them as a wiki heading followed by a bulleted list.
# print is used in its single-argument call form, which produces the same
# output on Python 2 and Python 3.

H = ArticlePicker(logFile, maxEntries)
H.readLogFile()

randomArticles = H.selectRandomly(numberOfArticles)

print("==%d randomly-selected articles (weighted by popularity)==" % numberOfArticles)

for entry in randomArticles:
    # Skip empty slots (e.g. when the log yielded no usable articles)
    # rather than failing while unpacking them.
    if entry is None:
        continue
    hits, name = entry
    print("* %s — (%d hits)" % (name, hits))

Category:Random pages tests