User:B-bot/source/Orphaned fair use tagger

This task does the following:

Update User:B-bot/List of orphaned images using [http://quarry.wmflabs.org/query/3268] to get an updated list of orphaned non-free images.
If a non-free image has remained on the orphan list for three consecutive nightly runs (day-0, day-1 and day-2), purge it to make sure it is still orphaned.
If it is still orphaned, tag it with {{tls|orfud}}
Notify the uploader with {{tls|di-orphaned fair use-notice}}

/// <summary>
/// This class adds {{subst:orfud}} to orphaned fair use images and notifies the uploader with
/// {{subst:di-orphaned fair use-notice|file name}}.
///
/// Images that are used in article space (namespace 0) or draft space (namespace 118) will be excluded.
///
/// I propose it to be a three-night process - we will run at midnight each night and run http://quarry.wmflabs.org/query/3268 to
/// get a list of orphaned images. Only images that are on the list for three consecutive nights will be tagged. This ensures that
/// we're not tagging images which are only momentarily orphaned as a result of vandalism or an edit war.
/// </summary>

public class COrphanedImageProcessor : BBotBase

{

#region Constants

// Root wiki page under which the nightly orphan lists are kept.
const String cstrOrphanedImagesMainPage = "User:B-bot/List of orphaned images";

// Subpage that receives the newest query results on each run.
const String cstrOrphanedImagesDay0 = cstrOrphanedImagesMainPage + "/day-0";

// Subpage that receives the previous day-0 list when a new list is stored.
const String cstrOrphanedImagesDay1 = cstrOrphanedImagesMainPage + "/day-1";

// Subpage that receives the previous day-1 list when a new list is stored.
const String cstrOrphanedImagesDay2 = cstrOrphanedImagesMainPage + "/day-2";

// Database report page used as a fallback source of orphaned files when the Quarry query cannot be used.
const String cstrUnusedFairUseFilesDbQueryPage = "Wikipedia:Database reports/Unused non-free files";

#endregion

#region Settings

/// <summary>

/// Set this to true to completely skip the orphaned file list update and to just dive

/// straight in to tagging orphans. This lets me test that portion of the process without

/// having to wait for Quarry to run the results.

/// </summary>

public Boolean SkipOrphanedFileListUpdate { get; set; }

/// <summary>

/// Set this to true to skip updating the query text. Do this if it has been updated

/// manually and we just want the results.

/// </summary>

public Boolean SkipQuarryQueryUpdate { get; set; }

/// <summary>

/// Set this to true to skip the main process so that we are ONLY updating the quarry query

/// </summary>

public Boolean SkipMainProcess { get; set; }

#endregion

/// <summary>

/// Constructor. Clears the Abort flag so a new processor starts in the "not aborted" state.

/// </summary>

public COrphanedImageProcessor()

{

// Abort is not declared in this class; presumably inherited from BBotBase - TODO confirm.
Abort = false;

}

/// <summary>

/// Gets the name for this job

/// </summary>

/// <returns>The fixed display name "Orphaned Non-free Image Tagger".</returns>

public override string GetJobName()

{

return "Orphaned Non-free Image Tagger";

}

/// <summary>
/// Returns the text of the most recent revision of a page that was saved by the bot account
/// (or by the operator account "B"), searching the 20 revisions requested from the page history.
/// </summary>
/// <param name="site">Site connection used to read the page history.</param>
/// <param name="PageName">Title of the page. It must live under "User:" + BotUserName + "/".</param>
/// <returns>
/// The revision text, or an empty string when the page is outside the bot's user space or when
/// none of the examined revisions was saved by an acceptable user. Never null.
/// </returns>
private String GetLastVersionByAcceptableUser(Site site, String PageName)
{
    int delayMilliseconds = 1000 * Properties.Settings.Default.CheckStopDelaySeconds;
    System.Threading.Thread.Sleep(delayMilliseconds);

    // Sanity check: don't do this if the page name is not in the right namespace.
    // Ordinal comparison: page titles are identifiers, not linguistic text. A null or empty
    // name is rejected the same way instead of throwing a NullReferenceException.
    String requiredPrefix = "User:" + Properties.Settings.Default.BotUserName + "/";
    if (String.IsNullOrEmpty(PageName) || !PageName.StartsWith(requiredPrefix, StringComparison.Ordinal))
    {
        ErrorLogging.SendEmailMessage(true, "Unable to revert page", "Only pages under User:" + Properties.Settings.Default.BotUserName + "/ can be reverted. " +
            "Cannot revert " + PageName + ".");
        return "";
    }

    PageList history = new PageList(site);
    history.FillFromPageHistory(PageName, 20);

    // Walk the history (presumably newest first - verify against the bot framework) and
    // return the first revision that was saved by the bot or by its operator.
    foreach (Page revision in history)
    {
        bool savedByAcceptableUser =
            revision.lastUser == Properties.Settings.Default.BotUserName ||
            revision.lastUser == "B";
        if (!savedByAcceptableUser)
        {
            continue;
        }

        revision.LoadTextOnly();
        System.Threading.Thread.Sleep(delayMilliseconds);

        // Callers chain string operations onto the result, so never hand back null.
        return revision.text ?? "";
    }

    // No acceptable revision among the ones examined.
    return "";
}

/// <summary>
/// Returns the timestamp of the latest revision of a page.
/// </summary>
/// <param name="site">Site connection used for the history lookup and for logging.</param>
/// <param name="PageName">Title of the page to look up.</param>
/// <returns>
/// The timestamp of the single history entry requested, or null when the history came back
/// empty or the lookup threw (the exception is logged, not propagated).
/// </returns>
private DateTime? GetPageLastModDate(ref Site site, String PageName)
{
    try
    {
        SleepApiDelay();

        PageList history = new PageList(site);
        history.FillFromPageHistory(PageName, 1);

        // One delay after the API call, whichever result we return.
        SleepApiDelay();

        if (0 == history.Count())
        {
            return null;
        }

        return history[0].timestamp;
    }
    catch (Exception ex)
    {
        // Wiki-link the page title, matching the "[[:" ... "]]" form used by the other log messages in this class.
        LogToEventLog(ref site, MessageType.Error, "Error looking up last mod date for [[:" + PageName + "]]", ex);
        return null;
    }
}

/// <summary>
/// This function will attempt to update our list of orphans using Wikipedia:Database reports/Unused non-free files
/// </summary>
/// <param name="site">Site connection used to read the report page and for logging.</param>
/// <param name="strQueryResults">
/// Receives the query results built from Wikipedia:Database reports/Unused non-free files:
/// one "*{{lf|file name}}" line per file link found. Only assigned once the report text has been
/// loaded; left untouched on the earlier failure paths.
/// </param>
/// <returns>
/// True when at least one file was extracted from the report. False when either last-modified
/// date could not be determined, when the report is older than our own day-0 list, when the
/// report text is empty, when no file links were found, or when an exception occurred.
/// </returns>
public bool UpdateOrphansFromDbReport(ref Site site, ref String strQueryResults)
{
    try
    {
        // First, we want to see the last time we updated
        DateTime? dtOrphanLastUpdate = GetPageLastModDate(ref site, cstrOrphanedImagesDay0);
        DateTime? dtmDbQueryPageLastUpdate = GetPageLastModDate(ref site, cstrUnusedFairUseFilesDbQueryPage);

        if (!dtmDbQueryPageLastUpdate.HasValue || !dtOrphanLastUpdate.HasValue)
        {
            return false;
        }

        // Does it have a more recent update than we do?
        if (dtmDbQueryPageLastUpdate.Value < dtOrphanLastUpdate.Value)
        {
            LogToEventLog(ref site, MessageType.Informational, "Unfortunately, we already have a more recent update than what [[:" +
                cstrUnusedFairUseFilesDbQueryPage + "]] has to offer. Stopping process.", null);
            return false;
        }

        Page pgDbQuery = new Page(site, cstrUnusedFairUseFilesDbQueryPage);
        pgDbQuery.LoadTextOnly();
        SleepApiDelay();

        if (String.IsNullOrWhiteSpace(pgDbQuery.text))
        {
            LogToEventLog(ref site, MessageType.Error, "Error loading [[:" + cstrUnusedFairUseFilesDbQueryPage +
                "]] — the text loaded was null. Stopping process.", null);
            return false;
        }

        // Find the files listed in this page. The lazy quantifier keeps two links on the same
        // line from being merged into one match, and the capture group yields the bare file
        // name regardless of how "File:" is capitalised (the regex ignores case).
        MatchCollection matches = Regex.Matches(pgDbQuery.text, @"\[\[:File:(.*?)\]\]", RegexOptions.IgnoreCase);

        System.Text.StringBuilder sbResults = new System.Text.StringBuilder();
        foreach (Match m in matches)
        {
            String strFileName = m.Groups[1].Value.Replace("_", " ").Trim();
            if (0 == strFileName.Length)
            {
                continue;
            }

            // Add this file to the return list
            sbResults.Append("*{{lf|").Append(strFileName).Append("}}\r\n");
        }

        strQueryResults = sbResults.ToString();

        // As long as we have some results, we are good
        return !String.IsNullOrWhiteSpace(strQueryResults);
    }
    catch (Exception ex)
    {
        LogToEventLog(ref site, MessageType.Error, "Error attempting to load results from Wikipedia:Database reports/Unused non-free files - aborting.", ex);
        return false;
    }
}

/// <summary>
/// This function will update the Quarry query and wait the requisite amount of time
/// </summary>
/// <param name="site">Site connection used for logging.</param>
/// <returns>True = the update was successful or unnecessary; False = the update failed</returns>
public bool UpdateQuarryQueryIfNeeded(ref Site site)
{
    try
    {
        // Nothing to do when the caller asked us to leave the query alone.
        if (SkipQuarryQueryUpdate)
        {
            return true;
        }

        String Error = "";
        String strErrorMessage = "";

        // Step 1: run the query
        bool Success = QuarryExecutor.ExecuteQuarryQuery(Properties.Settings.Default.OAuthUserName, Properties.Settings.Default.OAuthPassword,
            Properties.Settings.Default.QuarryQueryID, Properties.Settings.Default.QuarryQueryText, ref Error, ref strErrorMessage);

        if (!Success)
        {
            LogToEventLog(ref site, MessageType.Error, "Error updating list of orphaned images: " + strErrorMessage, null);
            return false;
        }

        // Now wait for the requisite time needed for the query to finish executing.
        // A TimeSpan avoids the int overflow that minutes * 60 * 1000 hits for large settings.
        System.Threading.Thread.Sleep(TimeSpan.FromMinutes(Properties.Settings.Default.QuarryWaitTimeMinutes));

        // Success!
        return true;
    }
    catch (Exception ex)
    {
        LogToEventLog(ref site, MessageType.Error, "Exception updating Quarry query", ex);
        return false;
    }
}

///

/// The master function to perform the job

///

public void PerformTask()

{

try

{

// Connect to Wikipedia

Site site = TryToConnect("https://en.wikipedia.org", Properties.Settings.Default.BotUserName, Properties.Settings.Default.BotPassword);

// Use a separate connection for our less-important API calls

Site site2 = TryToConnect("https://en.wikipedia.org", Properties.Settings.Default.BotUserName, Properties.Settings.Default.BotPassword);

DateTime dtmStarted = DateTime.Now;

LogToEventLog(ref site, MessageType.Start, "B-Bot orphaned fair use image tagger process now commencing.", null);

if (!SkipOrphanedFileListUpdate)

{

String QueryResults = "";

if (!UpdateQuarryQueryIfNeeded(ref site))

{

// We were not able to update the Quarry query, so let's try to use Wikipedia:Database reports/Unused non-free files instead

if (!UpdateOrphansFromDbReport(ref site, ref QueryResults))

{

LogToEventLog(ref site, MessageType.Finish, "Unable to update orphan list from either our own query or from Wikipedia:Database reports/Unused non-free files. Aborting.", null);

return;

}

else

{

// Download the quarry query results

String strQuarryDownloadPage = "http://quarry.wmcloud.org/run/" + QuarryExecutor.GetRunID(Properties.Settings.Default.QuarryQueryID).ToString() + "/output/0/tsv";

QueryResults = site.GetWebPage(strQuarryDownloadPage);

if (String.IsNullOrWhiteSpace(QueryResults))

{

LogToEventLog(ref site, MessageType.Error, "I was unable to update the list of orphaned images. Will try Wikipedia:Database reports/Unused non-free files.", null);

// We were not able to update the Quarry query, so let's try to use Wikipedia:Database reports/Unused non-free files instead

if (!UpdateOrphansFromDbReport(ref site, ref QueryResults))

{

LogToEventLog(ref site, MessageType.Finish, "Unable to update orphan list from either our own query or from Wikipedia:Database reports/Unused non-free files. Aborting.", null);

return;

}

// We don't need the header row

if (QueryResults.StartsWith("CONCAT('*"))

{

QueryResults = QueryResults.Substring("CONCAT('*".Length);

}

// We don't need the header row

QueryResults = QueryResults.Replace("\"CONCAT('*{{lf|', REPLACE(REPLACE(p.page_title, '\"\"', '**DOUBLEQUOTE**'), '_', ' '), '}}')\"\r\n", "");

QueryResults = QueryResults.Replace("\"\t", "");

QueryResults = QueryResults.Replace("\"", "");

QueryResults = QueryResults.Replace("CONCAT('*{{lf|', REPLACE(REPLACE(p.page_title, '', '\"'), '_', ' '), '}}')", "");

// Because double quotes break TSV files, we replace those with a placeholder. Put the double quote back.

QueryResults = QueryResults.Replace("**DOUBLEQUOTE**", "\"");

// Get the text from User:B-bot/List of orphaned images/day-1 and move it to User:B-bot/List of orphaned images/day-2

String strTwoDayOldList = GetLastVersionByAcceptableUser(site, cstrOrphanedImagesDay1);

if (!String.IsNullOrWhiteSpace(strTwoDayOldList))

{

Page p = new Page(site, cstrOrphanedImagesDay2);

/*if (CallEditPage(site, p.title, p.text, strTwoDayOldList))

{*/

p.text = strTwoDayOldList;

p.Save();

//}

}

if (Abort) { LogToEventLog(ref site, MessageType.Error, "I was ordered to abort.", null); return; }

// Get the text from User:B-bot/List of orphaned images/day-0 and move it to User:B-bot/List of orphaned images/day-1

String strOneDayOldList = GetLastVersionByAcceptableUser(site, cstrOrphanedImagesDay0);

if (!String.IsNullOrWhiteSpace(strOneDayOldList))

{

Page p = new Page(site, cstrOrphanedImagesDay1);

/*if (CallEditPage(site, p.title, p.text, strTwoDayOldList))

{*/

p.text = strOneDayOldList;

p.Save();

//}

}

if (Abort) { LogToEventLog(ref site, MessageType.Error, "I was ordered to abort.", null); return; }

// Now, log our new query results to User:B-bot/List of orphaned images/day-0

Page p0 = new Page(site, cstrOrphanedImagesDay0);

/*if (CallEditPage(site, p0.title, p0.text, strTwoDayOldList))

{*/

p0.text = QueryResults;

p0.Save();

//}

if (Abort) { LogToEventLog(ref site, MessageType.Error, "I was ordered to abort.", null); return; }

}

int intImagesTagged = 0;

if (!SkipMainProcess)

{

if (0 < Properties.Settings.Default.MaximumImagesPerRun)

{

LogToEventLog(ref site, MessageType.Finish, "List of orphaned images updated. Now beginning tagging and notifications with a maximum of " + Properties.Settings.Default.MaximumImagesPerRun.ToString() + " images.", null);

}

else

{

LogToEventLog(ref site, MessageType.Finish, "List of orphaned images updated. Now beginning tagging and notifications.", null);

}

Page pgUserspaceTest = new Page(site, Properties.Settings.Default.UserspaceTestPage);

if (UserspaceTest)

{

if (CallEditPage(site, pgUserspaceTest.title, "", "Initial header"))

{

pgUserspaceTest.text = "Now beginning Orphaned Image Processor task on " + DateTime.Now.ToString() + " (local time) ...\r\n\r\n";

pgUserspaceTest.text += "

class=\"wikitable sortable\"\r\n
\r\n! Page !! Timestamp !! Proposed edit\r\n"; pgUserspaceTest.Save(); } } // We want a fresh copy of our lists - we may have skipped the process above if we didn't need to update anything System.Threading.Thread.Sleep(1000 * Properties.Settings.Default.CheckStopDelaySeconds); String CurrentList = GetLastVersionByAcceptableUser(site, cstrOrphanedImagesDay0); System.Threading.Thread.Sleep(1000 * Properties.Settings.Default.CheckStopDelaySeconds); String OneDayOldList = GetLastVersionByAcceptableUser(site, cstrOrphanedImagesDay1).Replace("_", " "); System.Threading.Thread.Sleep(1000 * Properties.Settings.Default.CheckStopDelaySeconds); String TwoDayOldList = GetLastVersionByAcceptableUser(site, cstrOrphanedImagesDay2).Replace("_", " "); System.Threading.Thread.Sleep(1000 * Properties.Settings.Default.CheckStopDelaySeconds); // If any of these are empty, then we don't have anything to do if (String.IsNullOrWhiteSpace(CurrentList)	String.IsNullOrWhiteSpace(OneDayOldList)	String.IsNullOrWhiteSpace(TwoDayOldList)) { return; } CurrentList = CurrentList.Replace("\r\n", "\r"); CurrentList = CurrentList.Replace("\n", "\r"); String[] arrCurrentList = CurrentList.Split(new String[] { "\r" }, StringSplitOptions.RemoveEmptyEntries); // Now, loop through the rows in CurrentList. If a file appears in there that appears all three places, delete it foreach (String item in arrCurrentList) { if (0 < Properties.Settings.Default.MaximumImagesPerRun && intImagesTagged >= Properties.Settings.Default.MaximumImagesPerRun) { break; } // We seem to timeout periodically ... if it's been more than 10 minutes, reconnect if (DateTime.Now.Subtract(dtmStarted).TotalMinutes > 10) { RecycleConnection(ref site); RecycleConnection(ref site2); dtmStarted = DateTime.Now; } String strPageName = item.Replace("*{{lf\|", "").Replace("}}", "").Replace("_", " ").Trim(); if (!string.IsNullOrWhiteSpace(strPageName)) { System.Diagnostics.Debug.WriteLine("... 
Now checking " + strPageName + " ..."); // If it is new today, then don't process it if (!OneDayOldList.Contains("{{lf\|" + strPageName + "}}")	!TwoDayOldList.Contains("{{lf\|" + strPageName + "}}")) { continue; } strPageName = "File:" + strPageName; try { // Get the page Page pgCurrentImagePage = new Page(site, strPageName); pgCurrentImagePage.Load(); System.Threading.Thread.Sleep(1000 * Properties.Settings.Default.CheckStopDelaySeconds); // If we couldn't get the text, then move on - something might be wrong if (String.IsNullOrWhiteSpace(pgCurrentImagePage.text)) { continue; } // Did someone else tag the page before we got here? if (pgCurrentImagePage.text.ToUpper().Contains("Di-orphaned fair use".ToUpper())) { continue; } // Check to see if the page has been added to the Category:All orphaned non-free use Wikipedia files category List listCategories = pgCurrentImagePage.GetAllCategories(); if (listCategories.Contains("Category:All orphaned non-free use Wikipedia files")) { continue; } System.Threading.Thread.Sleep(1000 * Properties.Settings.Default.CheckStopDelaySeconds); // Try parsing the fair use rationale to see if we can find any linked ARTICLES and purge them // Loop through and find all of the articles in the rationale String strArticleName = ""; try { for (Match match = Regex.Match(pgCurrentImagePage.text, @"\\|\s(a\|A)rticle\s="); null != match && 0 != match.Index; match = match.NextMatch()) { strArticleName = pgCurrentImagePage.text.Substring(match.Index + match.Length); // Now, Find the end of the article name Match matchEnd = Regex.Match(strArticleName, @"\\|"); strArticleName = strArticleName.Substring(0, matchEnd.Index).Trim(); // If we found an article name, then purge it if (!String.IsNullOrWhiteSpace(strArticleName)) { PurgeImage(site2, strArticleName); // Now, let's see if that page is really a redirect try { Page pgPurgedPage = new Page(site2, strArticleName); pgPurgedPage.Load(); pgPurgedPage.ResolveRedirect(); SleepApiDelay(); if 
(pgPurgedPage.title != strArticleName) { // Okay, we have a redirect - we need to go ahead and purge that too PurgeImage(site2, strArticleName); } } catch(Exception ex) { System.Diagnostics.Trace.WriteLine(ex.ToString()); } } } } catch (Exception ex) { ErrorLogging.SendEmailMessage(true, "Error calling purge API", "There was an error calling purge api for " + strArticleName + ". Will ignore and move on.\r\n\r\n" + ex.ToString()); SleepApiDelay(); continue; } // Now use the API to check and see if the image is still an orphan try { String strImageUsage = site2.GetWebPage("https://en.wikipedia.org/w/api.php?action=query&list=imageusage&iutitle=" + Bot.UrlEncode(pgCurrentImagePage.title.Replace(" ", "_")) + "&iunamespace=0&format=json"); if (!String.IsNullOrWhiteSpace(strImageUsage)) { if (!Regex.IsMatch(strImageUsage, @"imageusage.\[\s\]")) { try { // The image is no longer orphaned. String strArticle = strImageUsage.Substring(1 + strImageUsage.IndexOf("\"title\":\"")); strArticle = strArticle.Substring(8); strArticle = strArticle.Substring(0, strArticle.IndexOf("\"")); // Convert special characters like \u00e8 String strUsedInArticleName = UnescapeString(strArticle); // The image is no longer orphaned. // 2017/02/08 - because there are so blessed many false positives, don't log this any more //LogToEventLog(ref site, MessageType.Informational, "Did not add {{tls\|orfud}} tag to :" + strPageName + " because the image is no longer orphaned. Image is used in " + strUsedInArticleName + "", null); } catch (Exception ex) { LogToEventLog(ref site, MessageType.Error, "Error parsing imageusage for :" + strPageName + ".", ex); } continue; } } } catch (Exception ex) { ErrorLogging.SendEmailMessage(true, "Error calling imageusage API", "There was an error calling imageusage for getting page history for " + pgCurrentImagePage.title + ". 
Will ignore and move on.\r\n\r\n" + ex.ToString()); SleepApiDelay(); continue; } // Now, look and see if it is used in draft space try { String strImageUsage = site2.GetWebPage("https://en.wikipedia.org/w/api.php?action=query&list=imageusage&iutitle=" + Bot.UrlEncode(pgCurrentImagePage.title.Replace(" ", "_")) + "&iunamespace=118&format=json"); if (!String.IsNullOrWhiteSpace(strImageUsage)) { if (!Regex.IsMatch(strImageUsage, @"imageusage.\[\s\]")) { // The image is no longer orphaned. String strArticle = strImageUsage.Substring(1 + strImageUsage.IndexOf("\"Draft:")); strArticle = strArticle.Substring(0, strArticle.IndexOf("\"")); LogToEventLog(ref site, MessageType.DraftSpaceNote, "Did not add {{tls\|orfud}} tag to :" + strPageName + " which is used only in draft space in the article :" + strArticle + ". This is NOT a permitted use under WP:NFCC#9 and such images may be removed from drafts and tagged as orphans at any time.", null); continue; } } } catch (Exception ex) { ErrorLogging.SendEmailMessage(true, "Error calling imageusage API", "There was an error calling imageusage for getting page history for " + pgCurrentImagePage.title + ". Will ignore and move on.\r\n\r\n" + ex.ToString()); SleepApiDelay(); continue; } // Tag the file with {{subst:orfud}} if (CallEditPage(site, pgCurrentImagePage.title, pgCurrentImagePage.text, "{{subst:orfud}}\r\n" + pgCurrentImagePage.text)) { pgCurrentImagePage.text = "{{subst:orfud}}\r\n" + pgCurrentImagePage.text; if (UserspaceTest) { pgUserspaceTest.text += "
\r\n\| :" + pgCurrentImagePage.title + "	~~~~~	" + pgCurrentImagePage.text.Substring(0, Math.Min(300, pgCurrentImagePage.text.Length)) + " \r\n"; pgUserspaceTest.Save(Properties.Settings.Default.OrfudTagComment, false); } else { pgCurrentImagePage.Save(pgCurrentImagePage.text, Properties.Settings.Default.OrfudTagComment, false); } } } catch (Exception ex) { // This probably means that the page was protected, but we will log it just to be sure LogToEventLog(ref site, MessageType.Error, "Failed to add {{tls\|orfud}} tag to :" + strPageName + ".", ex); // If we failed to tag with orfud, we don't want to notify the user continue; } if (Abort) { LogToEventLog(ref site, MessageType.Error, "I was ordered to abort.", null); return; } intImagesTagged++; // Sleep for our editing delay SleepTaggingDelay(); // Determine the first contributor PageList pl = TryToFillFromPageHistory(ref site, strPageName); if (0 < pl.Count()) { String strNotifyUser = pl[pl.Count() - 1].lastUser; if (!String.IsNullOrWhiteSpace(strNotifyUser)) { try { // Retrieve this user's talk page Page pgUserTalkPage = new Page(site, "User talk:" + strNotifyUser); pgUserTalkPage.Load(); System.Threading.Thread.Sleep(1000 * Properties.Settings.Default.CheckStopDelaySeconds); // If it is a redirect, resolve it. if (pgUserTalkPage.IsRedirect()) { pgUserTalkPage.ResolveRedirect(); } // Can we notify this user? if (BotEditPermitted(pgUserTalkPage.text, Properties.Settings.Default.BotUserName, "orfud")) { if (CallEditPage(site, pgUserTalkPage.title, pgUserTalkPage.text, pgUserTalkPage.text + "\r\n{{subst:di-orphaned fair use-notice\|" + strPageName + "}} --~~~~")) { pgUserTalkPage.text += "\r\n{{subst:di-orphaned fair use-notice\|" + strPageName + "}} --~~~~"; if (UserspaceTest) { pgUserspaceTest.text += "
\r\n\| :" + pgUserTalkPage.title + "	~~~~~	" + pgUserTalkPage.text.Substring(pgUserTalkPage.text.Length - Math.Min(300, pgUserTalkPage.text.Length)) + " \r\n"; pgUserspaceTest.Save(String.Format(Properties.Settings.Default.OrfudWarningTagComment, strPageName), false); } else { pgUserTalkPage.Save(String.Format(Properties.Settings.Default.OrfudWarningTagComment, strPageName), false); } } } } catch (Exception ex) { LogToEventLog(ref site, MessageType.Error, "Failed to notify :" + "User talk:" + strNotifyUser + " that [[:" + strPageName + "]] is orphaned.", ex); } } } if (Abort) { LogToEventLog(ref site, MessageType.Error, "I was ordered to abort.", null); return; } // Sleep for our editing delay SleepTaggingDelay(); } } if (UserspaceTest) { if (CallEditPage(site, pgUserspaceTest.title, "", "footer")) { pgUserspaceTest.text += "

\r\n";

pgUserspaceTest.Save();

}

LogToEventLog(ref site, MessageType.Finish, "B-Bot orphaned fair use image tagger process completed. " + intImagesTagged.ToString() + " orphaned images were tagged.", null);

}

catch (Exception ex)

{

// Connect to Wikipedia

Site site = TryToConnect("https://en.wikipedia.org", Properties.Settings.Default.BotUserName, Properties.Settings.Default.BotPassword);

LogToEventLog(ref site, MessageType.Error, "I crashed and will skip the rest of this run.", ex);

}