User:B-bot/source/Orphaned fair use tagger

This task does the following:

  1. Update User:B-bot/List of orphaned images using [http://quarry.wmflabs.org/query/3268] to get an updated list of orphaned non-free images.
  2. If a non-free image has been orphaned for a full day, purge it to make sure it is still orphaned.
  3. If it is still orphaned, tag it with {{tls|orfud}}
  4. Notify the uploader with {{tls|di-orphaned fair use-notice}}

/// <summary>
/// This class will add {{subst:orfud}} to orphaned fair use images and notify users with {{subst:di-orphaned fair use-notice|file name}}.
///
/// Images that are used in article space (namespace 0) or draft space (namespace 118) will be excluded.
///
/// I propose it to be a three-night process - we will run at midnight each night and run http://quarry.wmflabs.org/query/3268 to
/// get a list of orphaned images. Only images that are on the list for three consecutive nights will be tagged. This ensures that
/// we're not tagging images which are only momentarily orphaned as a result of vandalism or an edit war.
/// </summary>

public class COrphanedImageProcessor : BBotBase

{

#region Constants

// Root page in the bot's userspace; the three daily lists below are subpages of it.
const String cstrOrphanedImagesMainPage = "User:B-bot/List of orphaned images";

// Subpage that PerformTask overwrites with the newest query results.
const String cstrOrphanedImagesDay0 = cstrOrphanedImagesMainPage + "/day-0";

// Subpage that receives the previous day-0 text when the lists are rolled over.
const String cstrOrphanedImagesDay1 = cstrOrphanedImagesMainPage + "/day-1";

// Subpage that receives the previous day-1 text when the lists are rolled over.
const String cstrOrphanedImagesDay2 = cstrOrphanedImagesMainPage + "/day-2";

// Database report page read by UpdateOrphansFromDbReport when the Quarry query cannot be used.
const String cstrUnusedFairUseFilesDbQueryPage = "Wikipedia:Database reports/Unused non-free files";

#endregion

#region Settings

/// <summary>
/// Set this to true to completely skip the orphaned file list update and to just dive
/// straight in to tagging orphans. This lets me test that portion of the process without
/// having to wait for Quarry to run the results.
/// </summary>
public Boolean SkipOrphanedFileListUpdate { get; set; }

/// <summary>
/// Set this to true to skip updating the query text. Do this if it has been updated
/// manually and we just want the results.
/// </summary>
public Boolean SkipQuarryQueryUpdate { get; set; }

/// <summary>
/// Set this to true to skip the main process (the tagging and notification phase of
/// PerformTask) so that we are ONLY updating the quarry query and the stored orphan lists.
/// </summary>
public Boolean SkipMainProcess { get; set; }

#endregion

/// <summary>
/// Creates the processor with its abort flag cleared.
/// </summary>
public COrphanedImageProcessor()
    : base()
{
    // Start every instance in the "not aborted" state.
    Abort = false;
}

/// <summary>
/// Gets the name for this job.
/// </summary>
/// <returns>The fixed display name of this task.</returns>
public override string GetJobName()
{
    const string jobName = "Orphaned Non-free Image Tagger";

    return jobName;
}

/// <summary>
/// Returns the text of the most recent revision of a page that was saved either by the
/// bot account (Properties.Settings.Default.BotUserName) or by the account "B".
/// </summary>
/// <param name="site">Connection used to read the page history.</param>
/// <param name="PageName">Full page title. It must start with "User:" + BotUserName + "/".</param>
/// <returns>
/// The text of the matching revision, or an empty string when the page is outside the bot's
/// userspace or none of the history entries that were loaded has an acceptable author.
/// </returns>
private String GetLastVersionByAcceptableUser(Site site, String PageName)
{
    System.Threading.Thread.Sleep(1000 * Properties.Settings.Default.CheckStopDelaySeconds);

    String strBotUser = Properties.Settings.Default.BotUserName;

    // Sanity check: don't do this if the page name is not in the right namespace.
    // Page titles are identifiers, so the prefix is compared ordinally rather than with the
    // current culture, and a null/empty name is refused instead of raising NullReferenceException.
    if (String.IsNullOrEmpty(PageName) ||
        !PageName.StartsWith("User:" + strBotUser + "/", StringComparison.Ordinal))
    {
        ErrorLogging.SendEmailMessage(true, "Unable to revert page", "Only pages under User:" + strBotUser + "/ can be reverted. " +
            "Cannot revert " + PageName + ".");

        return "";
    }

    // Load recent history entries (the second argument is presumably the revision limit - TODO confirm).
    PageList pl = new PageList(site);
    pl.FillFromPageHistory(PageName, 20);

    // Loop through the revisions and return the first one written by an acceptable user
    foreach (Page p in pl)
    {
        if (p.lastUser == strBotUser || p.lastUser == "B")
        {
            p.LoadTextOnly();

            System.Threading.Thread.Sleep(1000 * Properties.Settings.Default.CheckStopDelaySeconds);

            return p.text;
        }
    }

    return "";
}

/// <summary>
/// Returns the timestamp of the newest history entry of a page.
/// </summary>
/// <param name="site">Connection used for the history lookup and for error logging.</param>
/// <param name="PageName">Full title of the page to inspect.</param>
/// <returns>
/// The timestamp of the first history entry returned, or null when the history is empty
/// or the lookup threw (the exception is logged, not rethrown).
/// </returns>
private DateTime? GetPageLastModDate(ref Site site, String PageName)
{
    try
    {
        SleepApiDelay();

        PageList pl = new PageList(site);
        pl.FillFromPageHistory(PageName, 1);

        if (0 == pl.Count())
        {
            SleepApiDelay();

            return null;
        }

        SleepApiDelay();

        return pl[0].timestamp;
    }
    catch (Exception ex)
    {
        // The page name is logged as a wiki link, like the other log messages in this class.
        // NOTE(review): the "[[:" / "]]" markup was missing from the published listing and has been restored.
        LogToEventLog(ref site, MessageType.Error, "Error looking up last mod date for [[:" + PageName + "]]", ex);

        return null;
    }
}

/// <summary>
/// This function will attempt to update our list of orphans using Wikipedia:Database reports/Unused non-free files
/// </summary>
/// <param name="site">Connection used to read the report page and for logging.</param>
/// <param name="strQueryResults">
/// Receives the query results from Wikipedia:Database reports/Unused non-free files, one
/// "*{{lf|file name}}" line per file. It is only assigned when the report text was loaded.
/// </param>
/// <returns>True when at least one file was extracted from the report; otherwise false.</returns>
public bool UpdateOrphansFromDbReport(ref Site site, ref String strQueryResults)
{
    try
    {
        // First, we want to see the last time we updated
        DateTime? dtOrphanLastUpdate = GetPageLastModDate(ref site, cstrOrphanedImagesDay0);
        DateTime? dtmDbQueryPageLastUpdate = GetPageLastModDate(ref site, cstrUnusedFairUseFilesDbQueryPage);

        if (!dtmDbQueryPageLastUpdate.HasValue || !dtOrphanLastUpdate.HasValue)
        {
            return false;
        }

        // Does it have a more recent update than we do?
        if (dtmDbQueryPageLastUpdate.Value < dtOrphanLastUpdate.Value)
        {
            LogToEventLog(ref site, MessageType.Informational, "Unfortunately, we already have a more recent update than what [[:" +
                cstrUnusedFairUseFilesDbQueryPage + "]] has to offer. Stopping process.", null);

            return false;
        }

        Page pgDbQuery = new Page(site, cstrUnusedFairUseFilesDbQueryPage);
        pgDbQuery.LoadTextOnly();

        SleepApiDelay();

        if (String.IsNullOrWhiteSpace(pgDbQuery.text))
        {
            LogToEventLog(ref site, MessageType.Error, "Error loading [[:" + cstrUnusedFairUseFilesDbQueryPage +
                "]] — the text loaded was null. Stopping process.", null);

            return false;
        }

        // Find the files listed in this page
        strQueryResults = ExtractFileListFromReport(pgDbQuery.text);

        // As long as we have some results, we are good
        return !String.IsNullOrWhiteSpace(strQueryResults);
    }
    catch (Exception ex)
    {
        LogToEventLog(ref site, MessageType.Error, "Error attempting to load results from Wikipedia:Database reports/Unused non-free files - aborting.", ex);

        return false;
    }
}

/// <summary>
/// Converts every [[:File:...]] link in the report text into a "*{{lf|name}}" list line.
/// </summary>
/// <param name="strReportText">Wikitext of the database report page.</param>
/// <returns>The list lines joined with CRLF, or an empty string when no link was found.</returns>
private static String ExtractFileListFromReport(String strReportText)
{
    System.Text.StringBuilder sbList = new System.Text.StringBuilder();

    // The lazy capture keeps two links on one line separate, and reading the group avoids
    // stripping the brackets with String.Replace (an empty search string makes Replace throw).
    MatchCollection matches = Regex.Matches(strReportText, @"\[\[:File:(.*?)\]\]", RegexOptions.IgnoreCase);

    foreach (Match m in matches)
    {
        String strFileName = m.Groups[1].Value.Replace("_", " ").Trim();

        if (0 < strFileName.Length)
        {
            // Add this file to the return list
            sbList.Append("*{{lf|").Append(strFileName).Append("}}\r\n");
        }
    }

    return sbList.ToString();
}

/// <summary>
/// Submits the configured Quarry query (unless SkipQuarryQueryUpdate is set) and then blocks
/// for the configured number of minutes so the query has time to finish.
/// </summary>
/// <param name="site">Connection used for logging.</param>
/// <returns>True = the update was successful or unnecessary; False = the update failed</returns>
public bool UpdateQuarryQueryIfNeeded(ref Site site)
{
    try
    {
        String strErrorCode = "";
        String strErrorText = "";

        // Nothing to do when the caller asked us to leave the query alone.
        if (SkipQuarryQueryUpdate)
        {
            return true;
        }

        // Step 1: run the query
        bool blnSubmitted = QuarryExecutor.ExecuteQuarryQuery(
            Properties.Settings.Default.OAuthUserName,
            Properties.Settings.Default.OAuthPassword,
            Properties.Settings.Default.QuarryQueryID,
            Properties.Settings.Default.QuarryQueryText,
            ref strErrorCode,
            ref strErrorText);

        if (blnSubmitted)
        {
            // Now wait for the requisite time needed for the query to finish executing
            System.Threading.Thread.Sleep(Properties.Settings.Default.QuarryWaitTimeMinutes * 60 * 1000);

            return true;
        }

        LogToEventLog(ref site, MessageType.Error, "Error updating list of orphaned images: " + strErrorText, null);

        return false;
    }
    catch (Exception ex)
    {
        LogToEventLog(ref site, MessageType.Error, "Exception updating Quarry query", ex);

        return false;
    }
}

/// <summary>
/// The master function to perform the job.
///
/// Unless SkipOrphanedFileListUpdate is set, it refreshes the orphan list (Quarry first, the
/// database report as fallback) and rolls day-1 to day-2, day-0 to day-1 and the new results to day-0.
/// Unless SkipMainProcess is set, it then walks the day-0 list and, for every file that is
/// also on the day-1 and day-2 lists and is still unused in article and draft space, adds
/// {{subst:orfud}} and leaves {{subst:di-orphaned fair use-notice}} on a contributor's talk page.
/// Any unhandled exception is logged and ends the run.
/// </summary>
public void PerformTask()
{
    try
    {
        // Connect to Wikipedia
        Site site = TryToConnect("https://en.wikipedia.org", Properties.Settings.Default.BotUserName, Properties.Settings.Default.BotPassword);

        // Use a separate connection for our less-important API calls
        Site site2 = TryToConnect("https://en.wikipedia.org", Properties.Settings.Default.BotUserName, Properties.Settings.Default.BotPassword);

        DateTime dtmStarted = DateTime.Now;

        LogToEventLog(ref site, MessageType.Start, "B-Bot orphaned fair use image tagger process now commencing.", null);

        if (!SkipOrphanedFileListUpdate)
        {
            String QueryResults = "";

            if (!UpdateQuarryQueryIfNeeded(ref site))
            {
                // We were not able to update the Quarry query, so let's try to use Wikipedia:Database reports/Unused non-free files instead
                if (!UpdateOrphansFromDbReport(ref site, ref QueryResults))
                {
                    LogToEventLog(ref site, MessageType.Finish, "Unable to update orphan list from either our own query or from Wikipedia:Database reports/Unused non-free files. Aborting.", null);

                    return;
                }
            }
            else
            {
                // Download the quarry query results
                String strQuarryDownloadPage = "http://quarry.wmcloud.org/run/" + QuarryExecutor.GetRunID(Properties.Settings.Default.QuarryQueryID).ToString() + "/output/0/tsv";

                QueryResults = site.GetWebPage(strQuarryDownloadPage);

                if (String.IsNullOrWhiteSpace(QueryResults))
                {
                    LogToEventLog(ref site, MessageType.Error, "I was unable to update the list of orphaned images. Will try Wikipedia:Database reports/Unused non-free files.", null);

                    // We were not able to update the Quarry query, so let's try to use Wikipedia:Database reports/Unused non-free files instead
                    if (!UpdateOrphansFromDbReport(ref site, ref QueryResults))
                    {
                        LogToEventLog(ref site, MessageType.Finish, "Unable to update orphan list from either our own query or from Wikipedia:Database reports/Unused non-free files. Aborting.", null);

                        return;
                    }
                }
            }

            // We don't need the header row
            if (QueryResults.StartsWith("CONCAT('*"))
            {
                QueryResults = QueryResults.Substring("CONCAT('*".Length);
            }

            // We don't need the header row (the other shapes it can take in the TSV download)
            QueryResults = QueryResults.Replace("\"CONCAT('*{{lf|', REPLACE(REPLACE(p.page_title, '\"\"', '**DOUBLEQUOTE**'), '_', ' '), '}}')\"\r\n", "");
            QueryResults = QueryResults.Replace("\"\t", "");
            QueryResults = QueryResults.Replace("\"", "");
            QueryResults = QueryResults.Replace("CONCAT('*{{lf|', REPLACE(REPLACE(p.page_title, '', '\"'), '_', ' '), '}}')", "");

            // Because double quotes break TSV files, we replace those with a placeholder. Put the double quote back.
            QueryResults = QueryResults.Replace("**DOUBLEQUOTE**", "\"");

            // Get the text from User:B-bot/List of orphaned images/day-1 and move it to User:B-bot/List of orphaned images/day-2
            String strTwoDayOldList = GetLastVersionByAcceptableUser(site, cstrOrphanedImagesDay1);

            if (!String.IsNullOrWhiteSpace(strTwoDayOldList))
            {
                Page p = new Page(site, cstrOrphanedImagesDay2);
                p.text = strTwoDayOldList;
                p.Save();
            }

            if (Abort) { LogToEventLog(ref site, MessageType.Error, "I was ordered to abort.", null); return; }

            // Get the text from User:B-bot/List of orphaned images/day-0 and move it to User:B-bot/List of orphaned images/day-1
            String strOneDayOldList = GetLastVersionByAcceptableUser(site, cstrOrphanedImagesDay0);

            if (!String.IsNullOrWhiteSpace(strOneDayOldList))
            {
                Page p = new Page(site, cstrOrphanedImagesDay1);
                p.text = strOneDayOldList;
                p.Save();
            }

            if (Abort) { LogToEventLog(ref site, MessageType.Error, "I was ordered to abort.", null); return; }

            // Now, log our new query results to User:B-bot/List of orphaned images/day-0
            Page p0 = new Page(site, cstrOrphanedImagesDay0);
            p0.text = QueryResults;
            p0.Save();

            if (Abort) { LogToEventLog(ref site, MessageType.Error, "I was ordered to abort.", null); return; }
        }

        int intImagesTagged = 0;

        if (!SkipMainProcess)
        {
            if (0 < Properties.Settings.Default.MaximumImagesPerRun)
            {
                LogToEventLog(ref site, MessageType.Finish, "List of orphaned images updated. Now beginning tagging and notifications with a maximum of " + Properties.Settings.Default.MaximumImagesPerRun.ToString() + " images.", null);
            }
            else
            {
                LogToEventLog(ref site, MessageType.Finish, "List of orphaned images updated. Now beginning tagging and notifications.", null);
            }

            Page pgUserspaceTest = new Page(site, Properties.Settings.Default.UserspaceTestPage);

            if (UserspaceTest)
            {
                if (CallEditPage(site, pgUserspaceTest.title, "", "Initial header"))
                {
                    pgUserspaceTest.text = "Now beginning Orphaned Image Processor task on " + DateTime.Now.ToString() + " (local time) ...\r\n\r\n";

                    // NOTE(review): the wikitable markup ("{|", "|-") was missing from the published
                    // listing and is reconstructed here - confirm against the deployed source.
                    pgUserspaceTest.text += "{| class=\"wikitable sortable\"\r\n|-\r\n! Page !! Timestamp !! Proposed edit\r\n";

                    pgUserspaceTest.Save();
                }
            }

            // We want a fresh copy of our lists - we may have skipped the process above if we didn't need to update anything
            System.Threading.Thread.Sleep(1000 * Properties.Settings.Default.CheckStopDelaySeconds);

            String CurrentList = GetLastVersionByAcceptableUser(site, cstrOrphanedImagesDay0);

            System.Threading.Thread.Sleep(1000 * Properties.Settings.Default.CheckStopDelaySeconds);

            String OneDayOldList = GetLastVersionByAcceptableUser(site, cstrOrphanedImagesDay1).Replace("_", " ");

            System.Threading.Thread.Sleep(1000 * Properties.Settings.Default.CheckStopDelaySeconds);

            String TwoDayOldList = GetLastVersionByAcceptableUser(site, cstrOrphanedImagesDay2).Replace("_", " ");

            System.Threading.Thread.Sleep(1000 * Properties.Settings.Default.CheckStopDelaySeconds);

            // If any of these are empty, then we don't have anything to do
            if (String.IsNullOrWhiteSpace(CurrentList) ||
                String.IsNullOrWhiteSpace(OneDayOldList) ||
                String.IsNullOrWhiteSpace(TwoDayOldList))
            {
                return;
            }

            // Normalise all line endings to "\r" so a single split handles every variant
            CurrentList = CurrentList.Replace("\r\n", "\r");
            CurrentList = CurrentList.Replace("\n", "\r");

            String[] arrCurrentList = CurrentList.Split(new String[] { "\r" }, StringSplitOptions.RemoveEmptyEntries);

            // Now, loop through the rows in CurrentList. A file that appears in all three lists gets tagged for deletion
            foreach (String item in arrCurrentList)
            {
                if (0 < Properties.Settings.Default.MaximumImagesPerRun && intImagesTagged >= Properties.Settings.Default.MaximumImagesPerRun)
                {
                    break;
                }

                // We seem to timeout periodically ... if it's been more than 10 minutes, reconnect
                if (DateTime.Now.Subtract(dtmStarted).TotalMinutes > 10)
                {
                    RecycleConnection(ref site);
                    RecycleConnection(ref site2);

                    dtmStarted = DateTime.Now;
                }

                String strPageName = item.Replace("*{{lf|", "").Replace("}}", "").Replace("_", " ").Trim();

                if (!string.IsNullOrWhiteSpace(strPageName))
                {
                    System.Diagnostics.Debug.WriteLine("... Now checking " + strPageName + " ...");

                    // If it is missing from either older list, it has not been orphaned long enough - don't process it
                    if (!OneDayOldList.Contains("{{lf|" + strPageName + "}}") ||
                        !TwoDayOldList.Contains("{{lf|" + strPageName + "}}"))
                    {
                        continue;
                    }

                    strPageName = "File:" + strPageName;

                    try
                    {
                        // Get the page
                        Page pgCurrentImagePage = new Page(site, strPageName);
                        pgCurrentImagePage.Load();

                        System.Threading.Thread.Sleep(1000 * Properties.Settings.Default.CheckStopDelaySeconds);

                        // If we couldn't get the text, then move on - something might be wrong
                        if (String.IsNullOrWhiteSpace(pgCurrentImagePage.text))
                        {
                            continue;
                        }

                        // Did someone else tag the page before we got here?
                        if (pgCurrentImagePage.text.ToUpper().Contains("Di-orphaned fair use".ToUpper()))
                        {
                            continue;
                        }

                        // Check to see if the page has been added to the Category:All orphaned non-free use Wikipedia files category
                        // NOTE(review): the generic argument was missing from the published listing; List<string> is assumed - confirm.
                        List<string> listCategories = pgCurrentImagePage.GetAllCategories();

                        if (listCategories.Contains("Category:All orphaned non-free use Wikipedia files"))
                        {
                            continue;
                        }

                        System.Threading.Thread.Sleep(1000 * Properties.Settings.Default.CheckStopDelaySeconds);

                        // Try parsing the fair use rationale to see if we can find any linked ARTICLES and purge them
                        // Loop through and find all of the articles in the rationale
                        String strArticleName = "";

                        try
                        {
                            // Match.Success is the documented end-of-matches test; checking Index would also
                            // stop on a legitimate match at position 0.
                            for (Match match = Regex.Match(pgCurrentImagePage.text, @"\|\s*(a|A)rticle\s*=");
                                match.Success; match = match.NextMatch())
                            {
                                strArticleName = pgCurrentImagePage.text.Substring(match.Index + match.Length);

                                // Now, Find the end of the article name (the next pipe; no pipe yields an empty name)
                                Match matchEnd = Regex.Match(strArticleName, @"\|");

                                strArticleName = strArticleName.Substring(0, matchEnd.Index).Trim();

                                // If we found an article name, then purge it
                                if (!String.IsNullOrWhiteSpace(strArticleName))
                                {
                                    PurgeImage(site2, strArticleName);

                                    // Now, let's see if that page is really a redirect
                                    try
                                    {
                                        Page pgPurgedPage = new Page(site2, strArticleName);
                                        pgPurgedPage.Load();
                                        pgPurgedPage.ResolveRedirect();

                                        SleepApiDelay();

                                        if (pgPurgedPage.title != strArticleName)
                                        {
                                            // Okay, we have a redirect - purge the page it points to as well
                                            // (the original title has already been purged above)
                                            PurgeImage(site2, pgPurgedPage.title);
                                        }
                                    }
                                    catch (Exception ex)
                                    {
                                        System.Diagnostics.Trace.WriteLine(ex.ToString());
                                    }
                                }
                            }
                        }
                        catch (Exception ex)
                        {
                            ErrorLogging.SendEmailMessage(true, "Error calling purge API", "There was an error calling purge api for " + strArticleName + ". Will ignore and move on.\r\n\r\n" + ex.ToString());

                            SleepApiDelay();

                            continue;
                        }

                        // Now use the API to check and see if the image is still an orphan
                        try
                        {
                            String strImageUsage = site2.GetWebPage("https://en.wikipedia.org/w/api.php?action=query&list=imageusage&iutitle=" + Bot.UrlEncode(pgCurrentImagePage.title.Replace(" ", "_")) + "&iunamespace=0&format=json");

                            if (!String.IsNullOrWhiteSpace(strImageUsage))
                            {
                                // An empty "imageusage" array means no article uses the file
                                if (!Regex.IsMatch(strImageUsage, @"imageusage.*\[\s*\]"))
                                {
                                    try
                                    {
                                        // The image is no longer orphaned.
                                        String strArticle = strImageUsage.Substring(1 + strImageUsage.IndexOf("\"title\":\""));
                                        strArticle = strArticle.Substring(8);
                                        strArticle = strArticle.Substring(0, strArticle.IndexOf("\""));

                                        // Convert special characters like \u00e8
                                        String strUsedInArticleName = UnescapeString(strArticle);

                                        // 2017/02/08 - because there are so blessed many false positives, this is
                                        // deliberately not written to the event log any more
                                    }
                                    catch (Exception ex)
                                    {
                                        LogToEventLog(ref site, MessageType.Error, "Error parsing imageusage for [[:" + strPageName + "]].", ex);
                                    }

                                    continue;
                                }
                            }
                        }
                        catch (Exception ex)
                        {
                            ErrorLogging.SendEmailMessage(true, "Error calling imageusage API", "There was an error calling imageusage for getting page history for " + pgCurrentImagePage.title + ". Will ignore and move on.\r\n\r\n" + ex.ToString());

                            SleepApiDelay();

                            continue;
                        }

                        // Now, look and see if it is used in draft space
                        try
                        {
                            String strImageUsage = site2.GetWebPage("https://en.wikipedia.org/w/api.php?action=query&list=imageusage&iutitle=" + Bot.UrlEncode(pgCurrentImagePage.title.Replace(" ", "_")) + "&iunamespace=118&format=json");

                            if (!String.IsNullOrWhiteSpace(strImageUsage))
                            {
                                if (!Regex.IsMatch(strImageUsage, @"imageusage.*\[\s*\]"))
                                {
                                    // The image is used in a draft, so leave it alone but make a note of it
                                    String strArticle = strImageUsage.Substring(1 + strImageUsage.IndexOf("\"Draft:"));
                                    strArticle = strArticle.Substring(0, strArticle.IndexOf("\""));

                                    LogToEventLog(ref site, MessageType.DraftSpaceNote, "Did not add {{tls|orfud}} tag to [[:" + strPageName + "]] which is used only in draft space in the article [[:" + strArticle + "]]. This is NOT a permitted use under WP:NFCC#9 and such images may be removed from drafts and tagged as orphans at any time.", null);

                                    continue;
                                }
                            }
                        }
                        catch (Exception ex)
                        {
                            ErrorLogging.SendEmailMessage(true, "Error calling imageusage API", "There was an error calling imageusage for getting page history for " + pgCurrentImagePage.title + ". Will ignore and move on.\r\n\r\n" + ex.ToString());

                            SleepApiDelay();

                            continue;
                        }

                        // Tag the file with {{subst:orfud}}
                        String strTaggedText = "{{subst:orfud}}\r\n" + pgCurrentImagePage.text;

                        if (!CallEditPage(site, pgCurrentImagePage.title, pgCurrentImagePage.text, strTaggedText))
                        {
                            // Nothing was tagged, so don't count this file and don't notify anyone about it
                            continue;
                        }

                        pgCurrentImagePage.text = strTaggedText;

                        if (UserspaceTest)
                        {
                            // NOTE(review): row markup reconstructed (the table syntax, link brackets and the
                            // wrapper around the preview were missing from the published listing) - confirm.
                            pgUserspaceTest.text += "|-\r\n| [[:" + pgCurrentImagePage.title + "]] || ~~~~~ || <pre>" +
                                pgCurrentImagePage.text.Substring(0, Math.Min(300, pgCurrentImagePage.text.Length)) + "</pre>\r\n";

                            pgUserspaceTest.Save(Properties.Settings.Default.OrfudTagComment, false);
                        }
                        else
                        {
                            pgCurrentImagePage.Save(pgCurrentImagePage.text, Properties.Settings.Default.OrfudTagComment, false);
                        }
                    }
                    catch (Exception ex)
                    {
                        // This probably means that the page was protected, but we will log it just to be sure
                        LogToEventLog(ref site, MessageType.Error, "Failed to add {{tls|orfud}} tag to [[:" + strPageName + "]].", ex);

                        // If we failed to tag with orfud, we don't want to notify the user
                        continue;
                    }

                    if (Abort) { LogToEventLog(ref site, MessageType.Error, "I was ordered to abort.", null); return; }

                    intImagesTagged++;

                    // Sleep for our editing delay
                    SleepTaggingDelay();

                    // Determine the first contributor (taken from the last entry of the loaded history)
                    PageList pl = TryToFillFromPageHistory(ref site, strPageName);

                    if (0 < pl.Count())
                    {
                        String strNotifyUser = pl[pl.Count() - 1].lastUser;

                        if (!String.IsNullOrWhiteSpace(strNotifyUser))
                        {
                            try
                            {
                                // Retrieve this user's talk page
                                Page pgUserTalkPage = new Page(site, "User talk:" + strNotifyUser);
                                pgUserTalkPage.Load();

                                System.Threading.Thread.Sleep(1000 * Properties.Settings.Default.CheckStopDelaySeconds);

                                // If it is a redirect, resolve it.
                                if (pgUserTalkPage.IsRedirect())
                                {
                                    pgUserTalkPage.ResolveRedirect();
                                }

                                // Can we notify this user?
                                if (BotEditPermitted(pgUserTalkPage.text, Properties.Settings.Default.BotUserName, "orfud"))
                                {
                                    String strNotice = "\r\n{{subst:di-orphaned fair use-notice|" + strPageName + "}} --~~~~";

                                    if (CallEditPage(site, pgUserTalkPage.title, pgUserTalkPage.text, pgUserTalkPage.text + strNotice))
                                    {
                                        pgUserTalkPage.text += strNotice;

                                        if (UserspaceTest)
                                        {
                                            // NOTE(review): row markup reconstructed, see the tagging row above - confirm.
                                            pgUserspaceTest.text += "|-\r\n| [[:" + pgUserTalkPage.title + "]] || ~~~~~ || <pre>" +
                                                pgUserTalkPage.text.Substring(pgUserTalkPage.text.Length - Math.Min(300, pgUserTalkPage.text.Length)) + "</pre>\r\n";

                                            pgUserspaceTest.Save(String.Format(Properties.Settings.Default.OrfudWarningTagComment, strPageName), false);
                                        }
                                        else
                                        {
                                            pgUserTalkPage.Save(String.Format(Properties.Settings.Default.OrfudWarningTagComment, strPageName), false);
                                        }
                                    }
                                }
                            }
                            catch (Exception ex)
                            {
                                LogToEventLog(ref site, MessageType.Error, "Failed to notify [[:" + "User talk:" + strNotifyUser + "]] that [[:" +
                                    strPageName + "]] is orphaned.", ex);
                            }
                        }
                    }

                    if (Abort) { LogToEventLog(ref site, MessageType.Error, "I was ordered to abort.", null); return; }

                    // Sleep for our editing delay
                    SleepTaggingDelay();
                }
            }

            if (UserspaceTest)
            {
                if (CallEditPage(site, pgUserspaceTest.title, "", "footer"))
                {
                    // NOTE(review): closing table markup reconstructed - confirm.
                    pgUserspaceTest.text += "|}\r\n";

                    pgUserspaceTest.Save();
                }
            }
        }

        LogToEventLog(ref site, MessageType.Finish, "B-Bot orphaned fair use image tagger process completed. " + intImagesTagged.ToString() + " orphaned images were tagged.", null);
    }
    catch (Exception ex)
    {
        // The connection from the try block is out of scope here, so connect again just to log the failure
        Site site = TryToConnect("https://en.wikipedia.org", Properties.Settings.Default.BotUserName, Properties.Settings.Default.BotPassword);

        LogToEventLog(ref site, MessageType.Error, "I crashed and will skip the rest of this run.", ex);
    }
}

}