Wikipedia:Bots/Requests for approval/BHGbot 7/Make-BHGbot7-edit-list.sh
#!/bin/bash
# Make-BHGbot7-edit-list.sh
# This script creates a list of category-space pages to be created as category redirects for WP:BHGbot 7.
# Each entry in the list is the name of a category page to be created as a redirect to the same title,
# but with the "z" in "organization" replaced with "s", and vice versa.
# e.g. the page Category:Anti-Foobar organisations is to be created with
# the content {{Category redirect|Anti-Foobar organizations|bot=BHGbot}}
# Three input files are required. Each is the output of a quarry query, saved in tab-delimited format.
#   i)   "quarry-orgcats": the output of https://quarry.wmflabs.org/query/46899
#   ii)  "quarry-allcats": the output of https://quarry.wmflabs.org/query/46999
#   iii) "quarry-all-articles": the output of https://quarry.wmflabs.org/query/47001
# There are 5 steps:
# 1. Convert the list in quarry-orgcats by replacing s with z and vice-versa: output in fixed-orgcats
# 2. Safety check: remove any entries in fixed-orgcats which do not contain "organi[sz]ation"
# 3. Prepare the data files for comparison
# 4. Remove from fixed-orgcats:
#    a) existing category pages
#    b) titles which exist in article space
# 5. Convert the list into wikilinked format for use by AWB
# ==============================
# STEP 1:
# In quarry-orgcats, replace every instance of "organisation" with
# "organization", and vice-versa; save the output in fixed-orgcats.

# swap_org_spelling: filter (stdin -> stdout) that swaps the s/z spelling of
# "organi[sz]ation" / "organi[sz]ations" (either initial-letter case) on
# every line. Words such as "organisational" are left alone because of the
# trailing \b.
swap_org_spelling() {
  # "_" is a word character, so \b would not match next to it; underscores
  # are turned into spaces for matching and turned back at the end.
  # The s-spelling is first parked on a placeholder so that the z->s pass
  # does not get undone by the subsequent s->z pass.
  # NB: \b is a GNU sed extension.
  sed -E \
    -e 's/_/ /g' \
    -e 's/\b([oO]rgani)s(ations?)\b/\1%%%@#!%%%\2/g' \
    -e 's/\b([oO]rgani)z(ations?)\b/\1s\2/g' \
    -e 's/\b([oO]rgani)%%%@#!%%%(ations?)\b/\1z\2/g' \
    -e 's/ /_/g'
}

swap_org_spelling < quarry-orgcats > fixed-orgcats
printf 'created fixed-orgcats. #Lines: '
wc -l fixed-orgcats
# ==============================
# STEP 2:
# Safety check: purge from fixed-orgcats any lines which do not contain
# "organi[sz]ation".
# This should remove only one line: the "page title" header from the quarry
# output. Any more removals indicates an error in the input data.

# purge_non_org_lines: filter (stdin -> stdout) that keeps only the lines
# mentioning organi[sz]ation(s). The pattern is plain ERE, so grep -E is
# used; it does not depend on grep having been built with PCRE support.
purge_non_org_lines() {
  grep -E '[oO]rgani[sz]ations?'
}

printf '%s' "purging from fixed-orgcats any lines which do not contain organi[sz]ation: "
purge_non_org_lines < fixed-orgcats > fixed-orgcats-purged
printf 'DONE. #Lines: '
wc -l fixed-orgcats-purged
# ==============================
# STEP 3:
# Prepare each of the data files.
# Each file needs to be:
#   a) converted to unix format by stripping out the CR from the CR-LF pairs.
#   b) sorted alphabetically to allow use of comm to compare files

# prepare_sorted LABEL SRC DEST
#   Writes a CR-free, sorted copy of SRC to DEST and reports DEST's line
#   count. LABEL is the text shown in the progress message.
#   The CRs are removed *before* sorting, so that the order is computed on
#   exactly the bytes that end up in DEST.
prepare_sorted() {
  local label=$1 src=$2 dest=$3
  printf 'sorting %s: ' "$label"
  tr -d '\015' < "$src" | sort > "$dest"
  printf 'DONE. #lines: '
  wc -l "$dest"
}

echo ""
prepare_sorted 'fixed-orgcats-purged' fixed-orgcats-purged fixed-orgcats-sorted
prepare_sorted 'quarry-allcats' quarry-allcats quarry-allcats-sorted
prepare_sorted 'quarry-all-articles (may be slow)' quarry-all-articles quarry-all-articles-sorted
# ==============================
# STEP 4
# Compare the lists to remove entries which should not be created.

# subtract_list A B
#   Prints the lines of sorted file A that do not occur in sorted file B.
#   (comm -23 suppresses the "only in B" and the "in both" columns.)
subtract_list() {
  comm -23 "$1" "$2"
}

echo ""
printf 'Removing existing category pages from the list of redirects to be created: '
subtract_list fixed-orgcats-sorted quarry-allcats-sorted > fixed-orgcats-notexist
printf 'DONE. #lines: '
wc -l fixed-orgcats-notexist

printf 'Removing existing aricle titles from the list of categ redirects to be created: '
subtract_list fixed-orgcats-notexist quarry-all-articles-sorted > redirect-cats-to-create-bare
printf 'DONE. #lines: '
wc -l redirect-cats-to-create-bare
# ==============================
# STEP 5
# Convert the list into wikilinked format for use by AWB.

# wikilink_list: filter (stdin -> stdout) that wraps each bare title in a
# numbered-list category link:  Foo  ->  # [[:Category:Foo]]
# NOTE(review): the command on this line had been mangled (unbalanced quotes,
# no input file); it is reconstructed here from the surrounding steps.
# Confirm that "# [[:Category:...]]" is the exact format AWB expects.
wikilink_list() {
  sed -e 's/^/# [[:Category:/' -e 's/$/]]/'
}

echo ""
printf 'Wikilink the list of redirects to be created: '
wikilink_list < redirect-cats-to-create-bare > redirect-cats-to-create.txt
echo "DONE"

# Final summary: line counts of the intermediate and final lists.
printf '\n\n\n===== FINISHED =====\n'
echo "Stats:"
printf '%s' "Existing non-redirected, non-dab cats with organi[sz]ation in title: "
wc -l fixed-orgcats-purged
printf '%s' "Proposed redirects which don't already exist as cats: "
wc -l fixed-orgcats-notexist
printf '%s' "Proposed redirects which don't already exist as cats or as article titles: "
wc -l redirect-cats-to-create-bare
printf '\nList of redirects to created is at redirect-cats-to-create.txt\n'
printf 'Number of redirects to create: '
wc -l redirect-cats-to-create.txt