User:PhotoCatBot/Src/StaleReqphotoBot

Source code for StaleReqphotoBot, which identifies {{tl|reqphoto}} instances that appear likely to be "stale" (where the article page has one or more images) and adds them to a category of articles which may need to have the reqphoto template removed.

#!/usr/bin/python

# StaleReqphotoBot
# Examine each article that transcludes {{reqphoto}}. If the
# main article page has at least one image, add the article
# to Category:Articles which may no longer need images.
# Do not revisit any article more often than once every six
# months. Skip articles with {{reqphoto|of=...}}. Skip
# articles titled "National Register of Historic Places listing..."

import wikipedia, catlib, pagegenerators

import sqlite3

import time, sys

import re

import wikitemplate

import socket

from datetime import datetime, timedelta

# Root category; main() walks it recursively to find pages to examine.
startCatName = 'Category:Wikipedia requested photographs'

# Passed as the ``start`` argument of CategorizedPageGenerator in main().
startCatAfter = None

# Category that update_stale_reqphotos() adds to a talk page when the
# article already appears to contain images.
hasImageCatName = 'Category:Articles which may no longer need images'

# Edit summary for the talk-page save in update_stale_reqphotos(); that
# save call is currently commented out.
editComment = 'PhotoCatBot thinks this article may no longer need a photo request. Please check and update the talk page!'

def main():
    """Walk the photo-request category tree and check every page in it.

    Opens the diary database, iterates (recursively) over the members of
    ``startCatName`` and hands each page to ``update_stale_reqphotos``.
    ``wikipedia.Error`` and ``socket.timeout`` raised for a single page are
    logged and the loop moves on to the next page.  The diary connection is
    closed when the walk finishes or is aborted by any other exception.
    """
    diary = initialize_diary()
    try:
        # NOTE(review): the returned site object is not used below; the call
        # is kept in case it has setup side effects -- confirm and drop if not.
        wikipedia.getSite()

        # Find articles transcluding {{reqphoto}}
        photoreq_cat = catlib.Category(None, startCatName)
        photoreq_pages = pagegenerators.CategorizedPageGenerator(
            photoreq_cat, recurse = True, start = startCatAfter)

        for p in photoreq_pages:
            try:
                update_stale_reqphotos(diary, p)
            except (wikipedia.Error, socket.timeout):
                # One bad page must not stop the whole run.
                wikipedia.output("%s raised on %s" % (sys.exc_info(), p.title()))
    finally:
        # The diary is a sqlite3 connection; release it explicitly instead of
        # leaving it open until interpreter exit.
        diary.close()

def update_stale_reqphotos(diary, page):
    """Flag a photo request as possibly stale when the article has images.

    ``page`` may be either an article or its talk page; the counterpart is
    obtained with ``toggleTalkPage()``.  Nothing is done when:

    * the article title starts with
      'National Register of Historic Places listing',
    * ``recently_updated(diary, talk)`` is true,
    * any {{reqphoto}} on the talk page has a parameter starting with "of=",
    * any infobox on the article lacks an image parameter,
    * the article has no qualifying image (see ``has_images``), or
    * the talk page is already in ``hasImageCatName``.

    Otherwise the talk page text is recomputed with ``hasImageCatName``
    added to its categories.  The save call is commented out, so at present
    only the resulting diff is displayed.

    Exceptions raised by the page accessors (``get``, ``categories``, ...)
    propagate to the caller.
    """
    if page.isTalkPage():
        talk = page
        article = page.toggleTalkPage()
    else:
        article = page
        talk = page.toggleTalkPage()

    # Skip NRHP Listing articles per doncram.
    if article.title().startswith('National Register of Historic Places listing'):
        wikipedia.output("%s: skipping" % article.title())
        return

    # Skip this page if we have modified it in the last 6 months.
    if recently_updated(diary, talk):
        wikipedia.output("%s was updated within 6 months" % talk.title())
        return

    # If the page has a {{reqphoto}} with the "of=" parameter,
    # we'll assume it's a very specific photo request and
    # ignore it even if the page has images.  This is the way
    # to short-circuit the bot from re-adding a page inappropriately
    # to category 'Articles which may no longer need images'.
    reqphotos_have_of = any(param.startswith("of=")
                            for req in find_reqphotos_on(talk)
                            for param in req[1])
    if reqphotos_have_of:
        wikipedia.output("%s has {{reqphoto|of=}}, skipping" % article.title())
        return

    # If the article has an infobox *and* any infobox
    # does not have an image, skip it -- the image request
    # is assumed to still be legitimate in this case.
    # Suggestion by {{user|Emperor}}.
    #
    # A real list (not filter()) is built because the result is both
    # truth-tested and indexed below.
    infoboxes_lacking_image = [box for box in find_infoboxes(article)
                               if infobox_lacks_image(box)]
    if infoboxes_lacking_image:
        wikipedia.output("skipping %s: {{%s}} lacks an image"
                         % (article.title(), infoboxes_lacking_image[0][0]))
        return

    # Only articles that appear to contain images are candidates for
    # 'Articles which may no longer need images'.
    if not has_images(article):
        return

    text = talk.get()
    cats = talk.categories()
    hasImageCat = catlib.Category(None, hasImageCatName, sortKey = article.title())
    if hasImageCat in cats:
        wikipedia.output("%s already in %s, skipping" % (talk.title(), hasImageCatName))
        return

    newtext = wikipedia.replaceCategoryLinks(text, cats + [hasImageCat])
    if text == newtext:
        return

    try:
        # The save is disabled; only the would-be change is displayed.
        #talk.put(newtext, editComment)
        wikipedia.showDiff(text, newtext)
    except Exception:
        # Not a bare ``except:`` so that KeyboardInterrupt / SystemExit
        # still stop the bot.
        wikipedia.output("could not save %s: %s" % (talk.title(), sys.exc_info()))
    #update_modification_time(diary, talk)

def find_reqphotos_on(page):
    """Return the entries of ``page.templatesWithParams()`` named 'Reqphoto'.

    Each entry is returned unchanged; its first element is the template
    name and is compared for exact equality with 'Reqphoto'.
    """
    matches = []
    for entry in page.templatesWithParams():
        if entry[0] == 'Reqphoto':
            matches.append(entry)
    return matches

def find_infoboxes(page):
    """Return the entries of ``page.templatesWithParams()`` for infoboxes.

    An entry counts as an infobox when its first element (the template
    name) starts with 'Infobox'.  Entries are returned unchanged.
    """
    found = []
    for entry in page.templatesWithParams():
        name = entry[0]
        if name.startswith('Infobox'):
            found.append(entry)
    return found

def nonempty_image_param(param):
    """Tell whether a template parameter is an ``image=`` naming a raster file.

    ``param`` is one "name=value" parameter string.  It matches when it
    begins with ``image`` (any case), optional whitespace and ``=``, and the
    value contains a ``.jpg``, ``.jpeg``, ``.gif`` or ``.png`` extension --
    the same file types that ``has_images`` counts as real images.

    Returns the match object (truthy) on success, otherwise None.
    """
    return re.match(r'image\s*=.*\.(jpg|jpeg|gif|png)', param, re.I)

def infobox_lacks_image(template):
    """Return True when none of the template's parameters names an image.

    ``template`` is a (name, parameters) entry; every string in
    ``template[1]`` is checked with ``nonempty_image_param``.
    """
    for candidate in template[1]:
        if nonempty_image_param(candidate):
            # One image parameter is enough: the infobox has an image.
            return False
    return True

# Check to see if the page includes a JPG, GIF or PNG image.
# Skip .SVG because it is so often used for maps, logos, icons,
# placeholders and other small art that is not intended by the reqphoto
# template.

def has_images(page):
    """Report whether *page* links to at least one JPG, JPEG, GIF or PNG file.

    Other file types (notably SVG) are not counted.  Titles are matched
    case-insensitively on their extension.

    Returns True or False.  If ``page.imagelinks()`` raises, the error is
    logged and None (falsy) is returned, so the page is treated as having
    no images.
    """
    try:
        images = page.imagelinks()
    except Exception:
        # Deliberately not a bare ``except:`` -- KeyboardInterrupt and
        # SystemExit must still be able to stop the bot.
        wikipedia.output("%s raised on %s" % (sys.exc_info(), page.title()))
        return None

    return any(re.match(r'.*\.(jpg|jpeg|gif|png)$', img.title(), re.I)
               for img in images)

def initialize_diary():
    """Open the bot's diary database and make sure its table exists.

    The diary is the SQLite file 'StaleReqphotoBot.sqlite3' (relative path).
    It holds one table, ``update_times``, keyed by page title with an
    ``update_time`` column that defaults to CURRENT_TIMESTAMP.  The
    connection is opened with declared-type parsing enabled.

    Returns the open sqlite3 connection.
    """
    connection = sqlite3.connect('StaleReqphotoBot.sqlite3',
                                 detect_types = sqlite3.PARSE_DECLTYPES)
    schema = """CREATE TABLE IF NOT EXISTS update_times (
        title TEXT PRIMARY KEY,
        update_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    )"""

    cursor = connection.cursor()
    cursor.execute(schema)
    connection.commit()
    cursor.close()
    return connection

def update_modification_time(db, page):
    """Record in the diary that *page* has just been handled.

    Inserts (or replaces) the row for ``page.title()`` in ``update_times``
    without giving a time, so the column's default value is stored, and
    commits the change.
    """
    statement = 'INSERT OR REPLACE INTO update_times (title) VALUES (?)'
    cursor = db.cursor()
    cursor.execute(statement, (page.title(), ))
    db.commit()
    cursor.close()

def recently_updated(db, page):
    """Tell whether the diary shows *page* was handled within 180 days.

    Looks up ``page.title()`` in ``update_times``.  Returns False when there
    is no row; otherwise returns True while the current time is earlier than
    the stored time plus 180 days.

    The stored value comes from the column default CURRENT_TIMESTAMP, which
    SQLite records in UTC, so it is compared against the current UTC time
    rather than local time.
    """
    c = db.cursor()
    try:
        c.execute('SELECT update_time FROM update_times WHERE title = ?',
                  (page.title(), ))
        r = c.fetchone()
    finally:
        c.close()

    if not r:
        return False

    # NOTE(review): r[0] is a datetime only because the diary connection is
    # opened with PARSE_DECLTYPES -- confirm if the connection setup changes.
    expire_time = r[0] + timedelta(days=180)
    return datetime.utcnow() < expire_time

def close_diary(db):
    """Close the diary's sqlite3 connection.

    sqlite3 connections are released with ``close()``; they have no
    ``disconnect()`` method.
    """
    db.close()

# Script entry point: run the bot, then call wikipedia.stopme() whether or
# not main() raised.
# NOTE(review): stopme() presumably tells the framework this bot has
# finished -- confirm against the pywikipedia documentation.
try:
    main()
finally:
    wikipedia.stopme()