# User:Cryptic/dup-publisher.py

#!/usr/bin/env python3

"""Outputs a list of articles from a given dump file containing

"citation" or "cite [anything]" templates with duplicate values in

publisher and either encyclopedia, journal, magazine, newspaper,

series, title, website, or work parameters, or in journal and series

parameters.

For WP:RAQ#Find all instances of journal=publisher circa 28 June

2023.

"""

import sys

# Refuse to run on interpreters older than 3.6.  The message is a plain
# literal: it has no placeholders, so it needs no f-string prefix.
assert sys.version_info >= (3, 6), "requires Python 3.6 or newer"

import argparse

import bz2

import os

import re

import shutil

import textwrap

import xml.sax

import mwparserfromhell

# Arguments #

# Option state.  These are placeholders; _main() overwrites each of them
# from the parsed command line before any page is processed.
_all_namespaces = False   # True: look at every namespace, not just ns 0
_count = None             # None: counting disabled; else pages read so far
_output = None            # file object that matched page names go to
_print_matches = False    # True: also print which parameters matched

# Help text shown for the corresponding command-line options.
HELP_ALL_NAMESPACES = "parse pages in all namespaces, not just article"

HELP_COUNT = (
    "output a running count of matched pages to stderr,\n"
    "\n"
    "updating every thousand pages read"
)

HELP_OUTPUT = "output file, a list of page titles; defaults to stdout"

HELP_PRINT_MATCHES = (
    "output the page name, a tab, and the names of\n"
    "\n"
    "the first set of matching template parameters\n"
    "\n"
    "instead of just the page name"
)

# Other globals #

# _rx_rough_match is used to eliminate pages from consideration before
# the expensive full parse; it's important that it have no false
# negatives.  re.DOTALL is required for that: citation templates are
# very often written with one parameter per line, and without the flag
# ".*" stops at the first line break, so a "publisher" or "series" on a
# later line than the "{{cite ..." opener would never be seen.
_rx_rough_match = re.compile(
    r"{{\s*[cC]it(?:ation\b|e ).*(publisher|series)", re.DOTALL)

# target template name
_rx_template_name = re.compile(r"^[cC]it(?:ation$|e )")

_namespaces = {}  # maps namespace numbers to names

_matched_pages = 0  # count of pages w/at least one duplicate param pair

class _XMLHandler(xml.sax.ContentHandler):
    """SAX handler that gathers per-page data from a dump.

    For every <page> element it accumulates the character data of the
    <ns>, <title> and <text> elements and, once the page closes with
    all three present, passes them to process_page().  <namespace>
    elements are recorded in the module-level _namespaces mapping,
    keyed by their integer "key" attribute.
    """

    # Elements whose character data is collected into the attribute of
    # the same name.
    _PAGE_FIELDS = ("ns", "title", "text")

    def __init__(self):
        super().__init__()
        # Stack of the tracked elements currently open.  The None at
        # the bottom keeps self.tags[-1] valid when nothing is open.
        self.tags = [None]
        self.ns = self.title = self.text = None
        self.namespace = self.namespace_key = None

    def startElement(self, name, attrs):
        """Reset or initialise state for a tracked element and push it
        on the tag stack; elements of no interest are ignored.
        """
        if name == "page":
            # A new page starts from a clean slate.  The namespace
            # fields are not expected to be set here, but clear them
            # as well to be safe.
            self.ns = self.title = self.text = None
            self.namespace = self.namespace_key = None
        elif name in self._PAGE_FIELDS:
            setattr(self, name, "")
        elif name == "namespace":
            self.namespace = ""
            self.namespace_key = int(attrs.get("key"))
        else:
            return
        self.tags.append(name)

    def endElement(self, name):
        """Pop a tracked element and act on a finished page or
        namespace.
        """
        if self.tags[-1] == name:
            self.tags.pop()

        if name == "page":
            collected = (self.text, self.ns, self.title)
            if all(field is not None for field in collected):
                process_page(int(self.ns), self.title, self.text)
        elif name == "namespace" and self.namespace_key is not None:
            # Stored with a trailing colon.
            _namespaces[self.namespace_key] = self.namespace + ":"

    def characters(self, content):
        """Append content to the field of the innermost tracked
        element, if that element is one whose text is collected.
        """
        current = self.tags[-1]
        if current in self._PAGE_FIELDS or current == "namespace":
            setattr(self, current, getattr(self, current) + content)

def pagename(ns, title):
    """Return human-readable name of page title in numbered namespace ns

    ns is the integer namespace number and title the page title.
    Namespace 0 yields the bare title; a namespace recorded in
    _namespaces yields its stored prefix followed by the title; any
    other namespace yields a "{{ns:N}}:" prefix followed by the title.
    """
    if ns == 0:  # Special-case to omit the :
        return title

    prefix = _namespaces.get(ns)
    if prefix is not None:
        # Entries in _namespaces are stored with their trailing colon
        # already attached, so adding another one here would produce
        # "Talk::Foo".
        return prefix + title

    # NOTE(review): MediaWiki XML exports normally seem to carry the
    # namespace prefix inside <title> already - confirm against a real
    # dump whether prefixing again is wanted at all.
    return "{{ns:" + str(ns) + "}}:" + title

def process_page(ns, title, text):
    """Handle one page read from the dump.

    ns is the page's namespace number, title its title and text its
    plain wikitext.  Pages outside namespace 0 are skipped unless
    _all_namespaces is set, and pages that fail _rx_rough_match are
    skipped without a full parse.  Pages for which
    has_dupe_cite_params() reports a duplicate are counted in
    _matched_pages and written to _output.  When counting is enabled,
    every page read bumps _count and a progress line goes to stderr
    after each thousand pages.
    """
    global _count, _matched_pages

    wanted_namespace = _all_namespaces or ns == 0
    if wanted_namespace and _rx_rough_match.search(text):
        match_desc = has_dupe_cite_params(text)
        if match_desc is not None:
            _matched_pages += 1
            line = pagename(ns, title)
            if _print_matches:
                line = line + "\t" + match_desc
            print(line, file=_output)

    # _count stays None unless the count option was requested.
    if _count is None:
        return

    _count += 1
    if _count % 1000 == 0:
        print(f"Read {_count} pages, matched {_matched_pages}",
              file=sys.stderr)

def has_dupe_cite_params(text):
    """If text contains a citation template with duplicate parameters
    we're looking for, return a string suitable for the print-matches
    option; else None

    text is the wikitext of one page.  Templates are examined in the
    order the parser yields them, and only the first duplicate found
    is reported.  Parameter values are compared as stripped wikicode
    strings; empty or absent parameters never count as duplicates.
    """
    # Parameters whose value is compared against |publisher=, in the
    # order they are checked (and therefore reported).
    publisher_rivals = ("encyclopedia",
                        "journal",
                        "magazine",
                        "newspaper",
                        "series",
                        "title",
                        "website",
                        "work")

    def errval(template, param1name, param2name, paramval):
        """Return a string suitable for the print-matches option"""
        return ("{{" + str(template.name).strip() + "}}:" + param1name + ","
                + param2name + '="' + paramval + '"')

    def param(template, param_name):
        """Return the wikicode of template's parameter param_name as a
        str, or None if empty or not present
        """
        par = template.get(param_name, default=None)
        if par is None:
            return None
        rval = str(par.value).strip()
        if rval == "":
            return None
        return rval

    def is_citation(template):
        """Return whether template's name is "citation" or "cite ..."."""
        raw = str(template.name)
        # The name keeps the whitespace it was written with, so
        # "{{ cite web |" and "{{citation |" would fail the anchored
        # pattern if matched raw.  Try the fully stripped name, and
        # also the left-stripped one so that a name which matched only
        # thanks to a trailing space ("cite ") still matches.
        return bool(_rx_template_name.match(raw.strip())
                    or _rx_template_name.match(raw.lstrip()))

    parsed = mwparserfromhell.parse(text)
    for t in parsed.filter_templates():
        if not is_citation(t):
            continue

        publisher = param(t, "publisher")
        if publisher is not None:
            for other in publisher_rivals:
                if publisher == param(t, other):
                    return errval(t, "publisher", other, publisher)

        journal = param(t, "journal")
        if journal is not None and journal == param(t, "series"):
            return errval(t, "journal", "series", journal)

    return None

def _fill_paragraphs(text, width=None):
    """Wrap every paragraph of text the way textwrap.fill() does and
    return the result, keeping the paragraph breaks (runs of exactly
    two newlines) between them.

    When width is None it is derived from the terminal size, less two
    columns; an explicit width is used unchanged.
    """
    # The default width copies what argparse.HelpFormatter.__init__()
    # does, so that this text lines up with the option help as closely
    # as practical.  That computation changed in Python 3.8, hence the
    # version test.  argparse neither documents its formatter classes
    # for subclassing nor exposes how it picks the width, so mimicking
    # it is the best available; if it changes again this output may
    # look slightly off, which is still a milder failure than
    # subclassing private classes would risk.
    if width is None:
        if sys.version_info < (3, 8):
            try:
                columns = int(os.environ['COLUMNS'])
            except (KeyError, ValueError):
                columns = 80
        else:
            columns = shutil.get_terminal_size().columns
        width = columns - 2

    paragraphs = text.split("\n\n")
    wrapped = [textwrap.fill(paragraph, width) for paragraph in paragraphs]
    return "\n\n".join(wrapped)

def _main():
    """Parse the command line, scan the dump file and report matches.

    Stores the parsed options in the module-level option globals,
    then feeds the dump file - plain or bzip2-compressed XML, told
    apart by its first three bytes - through _XMLHandler.  When the
    count option is given, a final tally is printed to stderr.
    """
    global _all_namespaces, _count, _output, _matched_pages, _print_matches

    parser = argparse.ArgumentParser(
        # __doc__ is None when docstrings are stripped (python -OO);
        # fall back to an empty description rather than crashing.
        description=_fill_paragraphs(__doc__ or ""),
        # pylint: disable=bad-continuation
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("dumpfile",
                        help="input dump file, in xml or bzip2-compressed xml")
    parser.add_argument("-a", "--all-namespaces",
                        action="store_true",
                        help=HELP_ALL_NAMESPACES)
    parser.add_argument("-c", "--count",
                        action="store_true",
                        help=HELP_COUNT)
    parser.add_argument("-m", "--print-matches",
                        action="store_true",
                        help=HELP_PRINT_MATCHES)
    parser.add_argument("-o", "--output",
                        default=sys.stdout,
                        type=argparse.FileType("w", encoding="utf-8"),
                        help=HELP_OUTPUT)
    args = parser.parse_args()

    _all_namespaces = args.all_namespaces
    _count = 0 if args.count else None
    _print_matches = args.print_matches
    _output = args.output
    _matched_pages = 0

    # Look at the leading bytes ("BZh") instead of trusting the file
    # name to decide whether the dump needs decompressing.
    with open(args.dumpfile, 'rb') as f:
        magic = f.read(3)
    if magic == b'\x42\x5a\x68':
        dump = bz2.BZ2File(args.dumpfile)
    else:
        dump = open(args.dumpfile, 'r', encoding='utf-8')

    try:
        # The context manager releases the dump stream even if a page
        # handler raises part-way through the parse.
        with dump:
            xml.sax.parse(dump, _XMLHandler())
    finally:
        # Close a file opened for -o so its contents are flushed;
        # leave the process's own stdout alone.
        if _output is not sys.stdout:
            _output.close()

    # don't print this if count's divisible by 1000 and > 0, since it
    # would duplicate the print in process_page()
    if _count is not None and (_count == 0 or _count % 1000 != 0):
        print(f"Read {_count} pages, matched {_matched_pages}",
              file=sys.stderr)

# Run the command-line entry point only when executed as a script, not
# when this file is imported as a module.
if __name__ == "__main__":
    _main()