User:PDFbot/pdfbot.py

#!/usr/bin/python
# -*- coding: utf-8 -*-

"""

This script can be used to update links transcluded using the {{PDFlink}} template.

Syntax: python pdfbot.py [-ref: TemplateName]

Command line options:

-file: Update pages listed in a text file.

-ref: Update pages transcluding from a given page.

-cat: Update pages from the given category.

-links: Update pages linked from a given page.

-page: Update that page.

"""

# (c) Dispenser, 2007

import re, sys, time
import wikipedia, pagegenerators, catlib
import httplib, socket, urllib, urlparse
import codecs

try:
    import commonfixes
except ImportError:
    wikipedia.output('Unable to import commonfixes')
    commonfixes = None

try:
    import reflinks
    def my_reflink_put_page(self, page, new):
        self.page = page
        self.new_text = new
    reflinks.ReferencesRobot.put_page = my_reflink_put_page
except ImportError:
    wikipedia.output('Unable to import reflinks')
    reflinks = None

# Download this file:
# http://www.twoevils.org/files/wikipedia/404-links.txt.gz
# (maintained by User:Marumari)
listof404pages = '404-links.txt'

# Define global constants
readDelay  = 10  # seconds
writeDelay = 30  # seconds
mix_prefix = ('bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB')
SI_prefix  = ('bytes', 'kB', 'MB', 'GB')
IEC_prefix = ('bytes', 'KiB', 'MiB', 'GiB')

# The following characters separate the url from the title: []"<>\ \n
# } is included since we're in a template
urlpattern = re.compile(r'http[s]?://[^][<>\s"{|}]*', re.IGNORECASE)

httpHeader = {
    'User-Agent': 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)',
    'Accept': 'application/pdf,application/octet-stream,*/*;q=0.5',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Keep-Alive': '30',
    'Connection': 'keep-alive',
}

def checkLink(location, useHEAD = True, counter = 5):
    """
    Fetch the headers for a URL (HEAD request by default, GET otherwise),
    following HTTP redirects, and return a
    (location, status, reason, content-length, content-type) tuple.
    """
    try:
        while counter >= 0 and location:
            (scheme, site, path, query, frag) = urlparse.urlsplit(location)
            query = query and '?' + query or ''
            path = path or '/'
            if scheme == "http":
                conn = httplib.HTTPConnection(site)
            elif scheme == "https":
                conn = httplib.HTTPSConnection(site)
            else:
                return (location, -1, 'Unsupported Protocol', None, None)
            conn.set_debuglevel(0)
            socket.setdefaulttimeout(300)
            try:
                request = path.encode('ascii') + query.encode('ascii')
            except UnicodeEncodeError:
                encoding = 'utf-8'
                noencode = '~!^*()_-=&/|,.?;'
                request = unicode(urllib.quote(path.encode(encoding) + query.encode(encoding), noencode))
            if useHEAD:
                conn.request('HEAD', request, None, httpHeader)
            else:
                conn.request('GET', request, None, httpHeader)
            response = conn.getresponse()
            redirect       = response.msg.getheader('location')
            content_length = response.msg.getheader('content-length')
            content_type   = response.msg.getheader('content-type')
            conn.close()
            counter -= 1
            if redirect:
                wikipedia.output(u'STATUS: HTTP %s Moved: %s to %s' % (response.status, location, redirect))
                if redirect.startswith("http"):
                    location = urlparse.urljoin(location, redirect)
                else:
                    location = redirect
            else:
                location = None
        return (location, response.status, response.reason, content_length, content_type)
    except httplib.error, arg:
        wikipedia.output(u'ERROR: HTTP %s %s' % (arg, location))
        return (location, 52, "", None, None)
    except socket.timeout:
        return (location, 110, 'Connection timeout', None, None)
    except socket.error, arg:
        wikipedia.output(u'ERROR: Socket %s %s' % (arg, location))
        return (location, arg[0], arg[1], None, None)
    except KeyboardInterrupt:
        raise
    except Exception, e: # catches those weird ones
        print u'Exception raised: %s' % e
        return (location, 0, "Exception %s" % e, None, None)
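
# For example, checkLink('http://www.example.com/report.pdf') might return
# something like (None, 200, 'OK', '1253376', 'application/pdf').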

def binary_notation(size, base = 1024., prefix = IEC_prefix):
    """
    Convert the byte count to a human readable value
    """
    a = float(size)
    exponent = 0
    while a >= 1000.:
        a /= base
        exponent += 3
    # Truncate and remove trailing dot
    byteSigs = str(a)[:4]
    if byteSigs.endswith('.'):
        byteSigs = byteSigs[:3]
    return byteSigs + ' ' + prefix[exponent / 3]
    # return '%3.3g %s' % (byteSigs, prefix[exponent / 3])
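
# For example, binary_notation(1048576) returns '1.0 MiB', and
# binary_notation(1048576, prefix = mix_prefix) returns '1.0 MB'.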

def fix_broken_links(link):
    """
    Returns link replacement for known broken links
    """
    # Moving of resources
    link = link.replace('virginiadot.org/infoservice/', 'virginiadot.org/info/')
    link = link.replace('virginiadot.org/comtravel/', 'virginiadot.org/info/')
    link = link.replace('ncdot.org/transit/aviation/ncairports/locations/pdf/', 'ncdot.org/transit/aviation/download/ncairports/')
    link = link.replace('waitangi-tribunal.govt.nz/doclibrary/researchwhanui/', 'waitangi-tribunal.govt.nz/doclibrary/public/researchwhanui/')
    # 301 Permanent Redirects
    link = link.replace('transportation.ky.gov/planning/', 'www.planning.kytc.ky.gov/')
    link = link.replace('official-documents.co.uk/', 'official-documents.gov.uk/')
    link = link.replace('http://bmj.bmjjournals.com/', 'http://www.bmj.com/')
    link = link.replace('http://bris.ac.uk/', 'http://www.bristol.ac.uk/')
    link = link.replace('http://www.shef.ac.uk/socst/', 'http://www.shef.ac.uk/socstudies/')
    link = link.replace('http://www.sims.berkeley.edu:8000/', 'http://www2.sims.berkeley.edu/')
    link = link.replace('http://www.cs.wm.edu/hpcs/', 'http://www.cse.ohio-state.edu/hpcs/')
    link = link.replace('http://www.pchrgaza.org/', 'http://www.pchrgaza.ps/')
    link = link.replace('http://www.almlondon.org.uk/', 'http://www.mlalondon.org.uk/')
    link = link.replace('http://www.state.ma.us/eot/', 'http://www.eot.state.ma.us/')
    link = link.replace('http://www.aapt.org.au/', 'http://www.ausapt.org.au/')
    link = link.replace('http://berlin.usembassy.gov/', 'http://germany.usembassy.gov/')
    return link

def update_size_paramter(template_text):
    m = re.search(r'(?s)\{\{(?P<tpl>[^|]*)\|(1=)?(?P<text>[^|]*).*?(, (?P<size>[0-9]+) byte.*)?\}\}', fix_broken_links(template_text))
    link_text = m.group('text')
    location = urlpattern.search(link_text).group(0)
    old_size = int(m.group('size') or 0)
    parameter_prefix = ''
    if '=' in link_text:
        parameter_prefix = '2='
    # Convert indirect HTML character references
    location = wikipedia.html2unicode(location)
    (redirect, response, reason, content_length, media_type) = checkLink(location)
    try:
        content_length = int(content_length)
    except:
        content_length = None
    if media_type and content_length and content_length != old_size:
        # I should really put in 404 error handling code, but this has been working just fine.
        if 'pdf' in media_type or 'octet-stream' in media_type or 'application/download' in media_type:
            # This was the old format using the comment
            # return u'{{%s|%s|%s%s<!-- %s, %d bytes -->}}' % (m.group('tpl'), link_text, parameter_prefix, binary_notation(content_length), content_type, content_length)
            # However, the comment was filled with generally non-useful information
            return (not (old_size == 0) or template_text.count('|') < 2, u'{{%s|%s|%s%s}}' % (m.group('tpl'), link_text, parameter_prefix, binary_notation(content_length, prefix = mix_prefix)))
        else:
            wikipedia.output(u'FIXME: Bad response: code: %d, type: %s, location: %s' % (response, media_type, location))
    # If anything else, return template_text back
    if old_size:
        return (False, u'{{%s|%s|%s%s}}' % (m.group('tpl'), link_text, parameter_prefix, binary_notation(old_size, prefix = mix_prefix)))
    else:
        return (False, template_text)
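
# For example, update_size_paramter(u'{{PDFlink|[http://www.example.com/report.pdf Report]}}')
# fetches the file headers and might return
# (True, u'{{PDFlink|[http://www.example.com/report.pdf Report]|1.19 MB}}').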

def process_article(page):
    try:
        deadLinks = codecs.open(listof404pages, 'r', 'latin_1').read()
    except IOError:
        wikipedia.output('You need to download http://www.twoevils.org/files/wikipedia/404-links.txt.gz and ungzip it in the same directory')
        raise
    wikipedia.output('Getting page %s' % page.aslink())
    wikitext = page.get()
    # Fix casing (reduces the number of possible expressions)
    wikitext = re.sub(r'(?i)\{\{\s*(template:|)pdf', r'{{PDF', wikitext)
    wikitext = wikitext.replace('{{PDFLink', '{{PDFlink')
    # State point. Count any changes as needing an update if they're after this line
    state0 = wikitext

    # [http {{PDF}}]
    wikitext = re.sub(r'(\[\w+://[^][<>"\s]+\s[^][\n]+?)\s*(\{\{(PDFlink|PDF)\}\})', r'\2\1', wikitext)
    # Convert hard coded pdf links (ex: [http link] (pdf) )
    wikitext = re.sub(r'(\[\w+://[^][]*\]) *\((\[\\*)?\.?(PDF|pdf) *([Ff]ile)? *([Ff]ormat)?(\]\])?\)', r'{{PDFlink|\1}}', wikitext)
    # Convert from the old style to the new style (ex: [http link] {{PDF}} or {{PDF}} [http link] )
    wikitext = re.sub(r"[(]?\{\{(PDFlink|PDF)\}\}[)]? *((?P<quote>'*)\[\w+://[^][]*\](?P=quote)?)", r'{{\1|\2}}', wikitext)
    wikitext = re.sub(r'("?\[\w+://[^]]*\]"?)([^a-zA-Z0-9()]*) *[(]?\{\{(PDFlink|PDF) *\}\}[)]?', r'{{\3|\1}}\2', wikitext)
    # Convert with the tag at the end of a bullet list item (ex: * [http link] some text (PDF) )
    if '{{PDF' in wikitext:
        wikitext = re.compile(r'(\n *\*+[^\n:/]*)(\[\w+://[^][]*\])([^\n:/]*) *[(](\[\[|\{\{)?(Portable Document Format[|]PDF|pdflink).?(pdf.?)?(file|format|datei)?(\}\}|\]\])?[)]', re.IGNORECASE).sub(r'\1{{PDFlink|\2}}\3', wikitext)
        wikitext = re.sub(r'(\n *\*+[^\n:/]*)(\[\w+://[^][]*\])([^\n:/]*) *\{\{(PDFlink|PDF)\}\}', r'\1{{\4|\2}}\3', wikitext)
        # Experimental: move {{PDF}} back inside the <ref> tag
        wikitext = re.sub(r'(<ref[^<>]*>[^][<>=]*?)("?\[\w+://[^][<>\s"]+[^]\n]*\]"?)([^{}<>]*)\{\{(PDFlink|PDF)\}\}', r'\1{{\4|\2}}\3', wikitext)
    # State point. Correction of {{PDFlink}} template
    genfixState = wikitext

    # Remove PDFlink from citation templates
    # {{cite |format={{PDF}}}}
    wikitext = re.sub(r'(?s)(format *= *)(PDF|pdf)?[(]?\{\{PDF[^{}]*?\}\}[)]?', r'\1PDF', wikitext)
    # {{cite.*?}}{{PDF}}
    wikitext = re.sub(r'(?s)(\{\{ *[Cc]ite[^}]*)(\}\}[^\w() ]*) *[(]?\{\{(PDF|PDFlink)\}\}[)]?', r'\1 |format=PDF\2', wikitext)
    # {{cite | lang= EN {{PDF}} }}
    wikitext = re.sub(r'(?s)(\{\{ *[Cc]ite web[^}]*) *(\(|)\{\{(PDF|PDFlink)\}\}(\)|) *([^}]*\}\})', r'\1 |format=PDF \5', wikitext)
    # {{PDF| {{template...}} }}
    wikitext = re.sub(r'(?s)\{\{(PDFlink|PDF)\|\s*(\{\{[^{}]+?(\|[^\n|}]+)?\}\})[\s|]*\}\}', r'\2', wikitext)
    # {{citation|url={{PDFlink|...}} }}
    wikitext = re.sub(r'(?i)\{\{(([Cc]itation|[Cc]ite)[^{}]+?)\{\{(PDFlink|PDF)\|([^{}]*?)(\|[^{\n|]+)?\}\}', r'{{\1\4', wikitext)
    # State point. Removal of {{PDFlink}} in certain instances
    state2 = wikitext
    cleantext = wikitext
    # This is ugly, since we need the comments to check the relative filesize
    for m in re.finditer(r'(?s)<!--.*?-->|<ref[^<>]*>.*?</ref>', cleantext):
        if '{{PDF' in m.group():
            cleantext = cleantext.replace(m.group(), '')

    sizechange = 0
    for m in re.finditer(r'\{\{(?:PDFlink|PDF)\|[^{}]+?\}\}', cleantext):
        if 'http://' in m.group() or 'https://' in m.group():
            (changed, replacetext) = update_size_paramter(m.group())
            sizechange += changed and 1 or 0
            # print "update page? %s" % (sizechange, )
            wikitext = wikitext.replace(m.group(), replacetext)
            # Uncomment the below line to see the replacement text
            # wikipedia.output(u'OUTPUT: %s' % replacetext)

    for s in re.findall(ur'(?ui)\{\{(?:cite[\w\s]+)\|[^{}]+?\}\}', cleantext):
        murl = re.search('\|\s*url\s*=\s*(?P<url>http[s]?://[^][<>"\s|]+)(\||}})', s)
        if murl and 'PDF' in murl.group().upper() and (not re.search(ur'\|\s*format\s*=\s*[^\s\n|}]+', s) or not re.search(ur'\|\s*(access\w+)\s*=\s*([^{\n|]+?)\s*(?=[\n|}])', s)) and not re.search(ur'\|\s*archiveurl\s*=\s*[^\s{\n|]+', s):
            repl_url = fix_broken_links(murl.group('url'))
            (redirect, response, reason, content_length, media_type) = checkLink(repl_url)
            # media_type not given
            if not media_type:
                continue
            # Gone/Not Found error code
            elif (response == 410 or (response == 404 and (u'\t%s\t' % murl.group(1) in deadLinks))) and repl_url == murl.group('url'):
                wikitext = wikitext.replace(s, s + time.strftime("{{dead link|bot=PDFbot|date=%B %Y}}"))
            # valid PDF
            # python2.6code: any(item in media_type.lower() for item in ('pdf', 'octet-stream'))
            elif 'pdf' in media_type.lower() or 'octet-stream' in media_type.lower():
                replacetext = s
                replacetext = replacetext.replace(murl.group(), murl.group().replace(murl.group('url'), repl_url))
                if re.search(ur'\|\s*format\s*=\s*[^\n|}]*[\n|]', replacetext):
                    # fill in the format=
                    replacetext = re.sub(r'(\|\s*format\s*= ??)(\n* *[\n|}])', r'\1PDF\2', replacetext)
                else:
                    # add format=PDF (third last parameter)
                    replacetext = re.sub(r'(\{\{[^{}]+?)((\s*\|\s*)[^[=\]{\n|]+(\s*= *)[^\n|}]+)(\s*\|[^{\n|]+)\}\}', r'\1\3format\4PDF\2\5}}', replacetext)
                accessed = re.search(ur'\|\s*(access\w+)\s*=\s*[^\n|}\s]+', replacetext)
                # no access-anything filled in, add/fill accessdate
                if not accessed:
                    # fill out accessdate if it exists
                    replacetext = re.sub(r'(\|\s*accessdate\s*= ??)(?=\n* *[{\n|])', time.strftime(r'\g<1>%Y-%m-%d'), replacetext)
                    # if the template doesn't contain accessdate then add it (last parameter)
                    if not re.search(r'\|\s*accessdate\s*=', replacetext):
                        replacetext = re.sub(r'(\{\{[^{}]+?)((\s*\|\s*)[^[=\]\n|}]+?(\s*= *)[^{\n|]+?)(\s*)\}\}', time.strftime(r'\1\2\3accessdate\g<4>%Y-%m-%d\5}}'), replacetext)
                        #replacetext = re.sub(r'(\{\{[^{}]+?)((\s*\|\s*)[^[=\]\n|}]+(\s*= *)[^{\n|]+)(\s*\|[^\n|}]+)\}\}', time.strftime(r'\1\2\5\3accessdate\g<4>%Y-%m-%d}}'), replacetext)
                # put it back in
                wikitext = wikitext.replace(s, replacetext)
                sizechange += 1
                # Uncomment the below line to see the replacement text
                wikipedia.output(u'OUTPUT: %s' % replacetext)

    # remove duplicate {{dead link}}
    dead_templates = r'[Dd]ead[ _]*link|[Dd]l|[Dd]l-s|404|[Bb]roken[ _]+link|[Cc]leanup-link'
    wikitext = re.sub('(\{\{(?:%s)[^}]*?\}\})+((<br ?/?>)?\{\{(?:%s)[^}]*?\}\})' % (dead_templates, dead_templates), r'\2', wikitext)

    # Figure out an edit message of what we did
    if sizechange:
        if state2 != state0:
            EditMsg = "Updating %d PDF%s and fixes" % (sizechange, sizechange > 1 and 's' or '')
        else:
            EditMsg = "Updating %d PDF%s" % (sizechange, sizechange > 1 and 's' or '')
    else:
        # state0:   renamed templates
        # genfix:   fixPDFlink
        # state2:   removePDFlink
        # wikitext: -
        EditMsg = "General fixes for PDFs"
        if wikitext == state0:
            pass # text stayed the same
        elif wikitext == genfixState:
            EditMsg = "Correct {{PDFlink}} syntax"
        elif wikitext == state2:
            if genfixState == state0: # no fixes
                EditMsg = "Remove incorrect {{PDFlink}}"
            else: # fixes + removal
                pass
    wikipedia.setAction(EditMsg)

    updateSizes = wikitext
    # Fix equal sign problem
    # moved here to avoid changing the edit message
    wikitext = re.sub(r'\{\{(PDF|PDFlink)\|(1=|)(.{2}[^{\n|]+=[^\n|}]+)', r'{{\1|1=\3', wikitext)
    # Alert me if a {{PDFlink}} on the page contains no link
    if re.search(r'\{\{PDF(link|)\|[^:]+\}\}', wikitext):
        wikipedia.output(u'FIXME: No link in {{PDFlink}} on %s' % page.aslink())

    # If the text has changed at all since the state point, upload it
    if (wikitext != state0 and sizechange) or state2 != state0 or updateSizes != wikitext:
        wikipedia.output('PDFs updated: % 3d' % sizechange)
        # pdf -> PDF
        wikitext = re.sub(r'\[\[pdf(?=[|\]])', '[[PDF', wikitext)
        # {{cite | format = pdf }}
        wikitext = re.sub(r'(?s)(?:([|]\s*format\s*=\s*)(?:\[\[|)[Pp][Dd][Ff](?:\]\]|))+(\s*[{\n|])', r'\1PDF\2', wikitext)
        # Too many to fix one by one as we come across them, so we don't count these with the fixes
        # Unlink PDF in format parameters
        wikitext = re.sub(r'(?i)(\|\s*format\s*=\s*)\[\[(adobe|portable|document|file|format|pdf|\.|\s|\(|\)|\|)+\]\]', r'\1PDF', wikitext)
        wikitext = re.sub(r'(?i)(\|\s*format\s*=\s*)(\s*\.?(adobe|portable|document|file|format|pdf|\(|\)))+?(?=\s*[|}])', r'\1PDF', wikitext)
        # Apply common fixes if available
        if commonfixes:
            wikitext = commonfixes.fix(page, text=wikitext)
        # Apply reflinks if available
        if reflinks:
            # Hackish hook
            page._contents = wikitext
            if page.get() != wikitext:
                wikipedia.output("Injected text wasn't returned with page.get()")
            elif reflinks.linksInRef.search(wikitext):
                reflinksbot = reflinks.ReferencesRobot(iter([page]))
                reflinksbot.run()
                if hasattr(reflinksbot, 'new_text'):
                    if reflinksbot.page != page:
                        raise RuntimeError('pages not the same')
                    wikitext = reflinksbot.new_text
                # Reset the edit summary
                wikipedia.setAction(EditMsg)
        try:
            wikipedia.output(u'WRITE: Delta length of % 3d bytes.' % (len(wikitext) - len(state0)))
            page.put(wikitext)
        except Exception, e:
            wikipedia.output(u'ERROR: Except %s raised while writing.' % e)
        # Pause to reduce load on the servers
        time.sleep(writeDelay)
    else:
        wikipedia.put_throttle()
        time.sleep(readDelay)

def main():
    site = wikipedia.getSite()
    gen = None
    namespaces = [0]
    for arg in wikipedia.handleArgs():
        if arg.startswith('-ref:'):
            referredPage = wikipedia.Page(site, arg[5:])
            gen = pagegenerators.ReferringPageGenerator(referredPage)
        elif arg.startswith('-file:'):
            gen = pagegenerators.TextfilePageGenerator(arg[6:])
        elif arg.startswith('-cat:'):
            cat = catlib.Category(site, arg[5:])
            gen = pagegenerators.CategorizedPageGenerator(cat)
        elif arg.startswith('-links:'):
            pagelinks = wikipedia.Page(wikipedia.getSite(), arg[7:])
            gen = pagegenerators.LinkedPageGenerator(pagelinks)
        elif arg.startswith('-page:'):
            page = wikipedia.Page(wikipedia.getSite(), unicode(arg[6:]))
            gen = iter([page])
        elif arg.startswith('-ns:'):
            namespaces.append(int(arg[4:]))
        elif arg.startswith('-delay:'):
            global readDelay, writeDelay
            readDelay = int(arg[7:])
            writeDelay = int(arg[7:])
    if not gen:
        wikipedia.showHelp(u'pdfbot')
        return
    wikipedia.output(u'Delays are %s s for reads and %s s for writes' % (readDelay, writeDelay,))
    if namespaces != []:
        gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
    gen = pagegenerators.RedirectFilterPageGenerator(gen)
    for page in gen:
        if page.site().messages:
            wikipedia.output(u'Messages left on talk page, halting.')
            return
        process_article(page)
    wikipedia.output(u'Finished updating')

if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()