User:B-bot/source/Orphaned fair use tagger
This task does the following:
- Update User:B-bot/List of orphaned images using [http://quarry.wmflabs.org/query/3268] to get an updated list of orphaned non-free images.
- If a non-free image has been on that list for three consecutive nightly runs (day-0, day-1 and day-2), purge the articles named in its fair-use rationale and re-check that it is still orphaned.
- If it is still orphaned, tag it with {{tls|orfud}}
- Notify the uploader with {{tls|di-orphaned fair use-notice}}
/// <summary>
/// This class will add {{subst:orfud}} to orphaned fair use images and notify users with {{subst:di-orphaned fair use-notice|file name}}.
///
/// Images that are used in article space (namespace 0) or draft space (namespace 118) will be excluded.
///
/// I propose it to be a three-night process - we will run at midnight each night and run http://quarry.wmflabs.org/query/3268 to
/// get a list of orphaned images. Only images that are on the list for three consecutive nights will be tagged. This ensures that
/// we're not tagging images which are only momentarily orphaned as a result of vandalism or an edit war.
/// </summary>
public class COrphanedImageProcessor : BBotBase
{
#region Constants
// Root page under which the nightly orphan lists are stored.
private const string cstrOrphanedImagesMainPage = "User:B-bot/List of orphaned images";
// Subpage holding the most recent query results.
private const string cstrOrphanedImagesDay0 = cstrOrphanedImagesMainPage + "/day-0";
// Subpage holding the list rotated out of day-0 on the previous run.
private const string cstrOrphanedImagesDay1 = cstrOrphanedImagesMainPage + "/day-1";
// Subpage holding the list rotated out of day-1 on the previous run.
private const string cstrOrphanedImagesDay2 = cstrOrphanedImagesMainPage + "/day-2";
// Database report used as a fallback source when the Quarry query cannot be used.
private const string cstrUnusedFairUseFilesDbQueryPage = "Wikipedia:Database reports/Unused non-free files";
#endregion
#region Settings
/// <summary>
/// When true, the orphaned file list update is skipped completely and the run goes
/// straight to tagging orphans. Useful for testing the tagging portion without
/// having to wait for Quarry to produce results.
/// </summary>
public bool SkipOrphanedFileListUpdate { get; set; }

/// <summary>
/// When true, the Quarry query is not re-run. Use this if it has been updated
/// manually and only the results are wanted.
/// </summary>
public bool SkipQuarryQueryUpdate { get; set; }

/// <summary>
/// When true, the main (tagging) process is skipped so that ONLY the Quarry query
/// and the orphan lists are updated.
/// </summary>
public bool SkipMainProcess { get; set; }
#endregion
/// <summary>
/// Constructor. Sets <c>Abort</c> to false so a newly created processor is not
/// in the aborted state.
/// </summary>
public COrphanedImageProcessor()
{
Abort = false;
}
/// <summary>
/// Gets the name for this job.
/// </summary>
/// <returns>The fixed display name of this job.</returns>
public override string GetJobName()
{
    const string strJobName = "Orphaned Non-free Image Tagger";
    return strJobName;
}
/// <summary>
/// Returns the text of the first revision, among up to 20 history entries of the page,
/// whose author is the bot account or the user "B".
/// </summary>
/// <param name="site">Site connection used to read the page history.</param>
/// <param name="PageName">Full page title; must be a subpage of the bot's user page.</param>
/// <returns>
/// The revision text, or an empty string when the page is outside the bot's user
/// space (an e-mail is sent in that case) or no matching revision is found.
/// </returns>
private String GetLastVersionByAcceptableUser(Site site, String PageName)
{
    System.Threading.Thread.Sleep(1000 * Properties.Settings.Default.CheckStopDelaySeconds);

    String strBotUser = Properties.Settings.Default.BotUserName;
    String strAllowedPrefix = "User:" + strBotUser + "/";

    // Sanity check: don't do this if the page name is not in the right namespace.
    // Page titles are identifiers, so compare ordinally rather than by current culture.
    if (null == PageName || !PageName.StartsWith(strAllowedPrefix, StringComparison.Ordinal))
    {
        ErrorLogging.SendEmailMessage(true, "Unable to revert page", "Only pages under " + strAllowedPrefix + " can be reverted. " +
            "Cannot revert " + PageName + ".");
        return "";
    }

    PageList pl = new PageList(site);
    pl.FillFromPageHistory(PageName, 20);

    // Loop through the revisions and find the last one I edited
    // (presumably the history is returned newest first - TODO confirm)
    foreach (Page p in pl)
    {
        if (p.lastUser == strBotUser || p.lastUser == "B")
        {
            p.LoadTextOnly();
            System.Threading.Thread.Sleep(1000 * Properties.Settings.Default.CheckStopDelaySeconds);
            return p.text;
        }
    }

    // No acceptable revision among the entries we looked at
    return "";
}
/// <summary>
/// Returns the timestamp of the first entry in the page's history, fetched with a
/// limit of one revision (i.e. the last date that the page was modified).
/// </summary>
/// <param name="site">Site connection used for the history lookup and for logging.</param>
/// <param name="PageName">Full title of the page to look up.</param>
/// <returns>
/// The timestamp, or null when the history came back empty or the lookup threw
/// (the exception is logged).
/// </returns>
private DateTime? GetPageLastModDate(ref Site site, String PageName)
{
    try
    {
        SleepApiDelay();
        PageList pl = new PageList(site);
        pl.FillFromPageHistory(PageName, 1);

        // One delay after the API call covers both outcomes below
        SleepApiDelay();

        if (0 == pl.Count())
        {
            return null;
        }
        return pl[0].timestamp;
    }
    catch (Exception ex)
    {
        // Wrap the title in [[: ... ]] so the log entry links to the page, like the other log messages do
        LogToEventLog(ref site, MessageType.Error, "Error looking up last mod date for [[:" + PageName + "]]", ex);
        return null;
    }
}
/// <summary>
/// Attempts to build our list of orphans from Wikipedia:Database reports/Unused non-free files.
/// Used as a fallback when the Quarry query cannot supply results.
/// </summary>
/// <param name="site">Site connection used to read pages and to log.</param>
/// <param name="strQueryResults">
/// On success, receives one "*{{lf|file name}}" line per file linked from the report.
/// Left untouched when the method returns before extracting anything.
/// </param>
/// <returns>
/// True when at least one file was extracted; false when either page's last-modified
/// date is unavailable, the report is older than our day-0 list, the report text is
/// empty, no files were found, or an exception occurred (logged).
/// </returns>
public bool UpdateOrphansFromDbReport(ref Site site, ref String strQueryResults)
{
    try
    {
        // First, we want to see the last time we updated
        DateTime? dtOrphanLastUpdate = GetPageLastModDate(ref site, cstrOrphanedImagesDay0);
        DateTime? dtmDbQueryPageLastUpdate = GetPageLastModDate(ref site, cstrUnusedFairUseFilesDbQueryPage);
        if (!dtmDbQueryPageLastUpdate.HasValue || !dtOrphanLastUpdate.HasValue)
        {
            return false;
        }

        // Does it have a more recent update than we do?
        if (dtmDbQueryPageLastUpdate.Value < dtOrphanLastUpdate.Value)
        {
            LogToEventLog(ref site, MessageType.Informational, "Unfortunately, we already have a more recent update than what [[:" +
                cstrUnusedFairUseFilesDbQueryPage + "]] has to offer. Stopping process.", null);
            return false;
        }

        Page pgDbQuery = new Page(site, cstrUnusedFairUseFilesDbQueryPage);
        pgDbQuery.LoadTextOnly();
        SleepApiDelay();
        if (String.IsNullOrWhiteSpace(pgDbQuery.text))
        {
            LogToEventLog(ref site, MessageType.Error, "Error loading [[:" + cstrUnusedFairUseFilesDbQueryPage +
                "]] — the text loaded was null. Stopping process.", null);
            return false;
        }

        // Find the files listed in this page. The name is captured directly so that no
        // bracket stripping is needed afterwards (String.Replace throws on an empty search
        // string), and the character class stops at the first ']' so that two links on one
        // line are not merged into a single match.
        System.Text.StringBuilder sbFiles = new System.Text.StringBuilder();
        MatchCollection matches = Regex.Matches(pgDbQuery.text, @"\[\[:File:(?<name>[^\]\r\n]+)\]\]", RegexOptions.IgnoreCase);
        foreach (Match m in matches)
        {
            String strFileName = m.Groups["name"].Value.Replace("_", " ").Trim();
            if (0 < strFileName.Length)
            {
                // Add this file to the return list
                sbFiles.Append("*{{lf|").Append(strFileName).Append("}}\r\n");
            }
        }
        strQueryResults = sbFiles.ToString();

        // As long as we have some results, we are good
        return !String.IsNullOrWhiteSpace(strQueryResults);
    }
    catch (Exception ex)
    {
        LogToEventLog(ref site, MessageType.Error, "Error attempting to load results from Wikipedia:Database reports/Unused non-free files - aborting.", ex);
        return false;
    }
}
/// <summary>
/// Re-runs the Quarry query (unless <see cref="SkipQuarryQueryUpdate"/> is set) and then
/// sleeps for the configured number of minutes so the query can finish executing.
/// </summary>
/// <param name="site">Site connection; used here only for logging.</param>
/// <returns>
/// True when the update was skipped or the query was submitted successfully;
/// false when submission failed or an exception was thrown (both are logged).
/// </returns>
public bool UpdateQuarryQueryIfNeeded(ref Site site)
{
    try
    {
        String strError = "";
        String strDetail = "";

        // Nothing to run when the caller asked us to leave the query alone
        if (SkipQuarryQueryUpdate)
        {
            return true;
        }

        // Step 1: run the query
        bool blnSubmitted = QuarryExecutor.ExecuteQuarryQuery(
            Properties.Settings.Default.OAuthUserName,
            Properties.Settings.Default.OAuthPassword,
            Properties.Settings.Default.QuarryQueryID,
            Properties.Settings.Default.QuarryQueryText,
            ref strError,
            ref strDetail);

        if (blnSubmitted)
        {
            // Now wait for the requisite time needed for the query to finish executing
            System.Threading.Thread.Sleep(Properties.Settings.Default.QuarryWaitTimeMinutes * 60 * 1000);
            return true;
        }

        LogToEventLog(ref site, MessageType.Error, "Error updating list of orphaned images: " + strDetail, null);
        return false;
    }
    catch (Exception ex)
    {
        LogToEventLog(ref site, MessageType.Error, "Exception updating Quarry query", ex);
        return false;
    }
}
/// <summary>
/// The master function to perform the job. Unless SkipOrphanedFileListUpdate is set, it
/// refreshes the orphan list (Quarry first, the database report as fallback) and rotates
/// the day-1 -> day-2, day-0 -> day-1 list pages before saving the new day-0 list. Unless
/// SkipMainProcess is set, it then walks the current list, skips files that are not also
/// on the day-1 and day-2 lists, re-checks image usage in namespaces 0 and 118, tags the
/// file with {{subst:orfud}} and notifies the user taken from the last entry of the page
/// history. Any unhandled exception is logged and ends the run.
/// </summary>
public void PerformTask()
{
try
{
// Connect to Wikipedia
Site site = TryToConnect("https://en.wikipedia.org", Properties.Settings.Default.BotUserName, Properties.Settings.Default.BotPassword);
// Use a separate connection for our less-important API calls
Site site2 = TryToConnect("https://en.wikipedia.org", Properties.Settings.Default.BotUserName, Properties.Settings.Default.BotPassword);
// Start of the current connection window; reset whenever the connections are recycled
DateTime dtmStarted = DateTime.Now;
LogToEventLog(ref site, MessageType.Start, "B-Bot orphaned fair use image tagger process now commencing.", null);
if (!SkipOrphanedFileListUpdate)
{
String QueryResults = "";
if (!UpdateQuarryQueryIfNeeded(ref site))
{
// We were not able to update the Quarry query, so let's try to use Wikipedia:Database reports/Unused non-free files instead
if (!UpdateOrphansFromDbReport(ref site, ref QueryResults))
{
LogToEventLog(ref site, MessageType.Finish, "Unable to update orphan list from either our own query or from Wikipedia:Database reports/Unused non-free files. Aborting.", null);
return;
}
}
else
{
// Download the quarry query results
String strQuarryDownloadPage = "http://quarry.wmcloud.org/run/" + QuarryExecutor.GetRunID(Properties.Settings.Default.QuarryQueryID).ToString() + "/output/0/tsv";
QueryResults = site.GetWebPage(strQuarryDownloadPage);
if (String.IsNullOrWhiteSpace(QueryResults))
{
LogToEventLog(ref site, MessageType.Error, "I was unable to update the list of orphaned images. Will try Wikipedia:Database reports/Unused non-free files.", null);
// We were not able to update the Quarry query, so let's try to use Wikipedia:Database reports/Unused non-free files instead
if (!UpdateOrphansFromDbReport(ref site, ref QueryResults))
{
LogToEventLog(ref site, MessageType.Finish, "Unable to update orphan list from either our own query or from Wikipedia:Database reports/Unused non-free files. Aborting.", null);
return;
}
}
}
// We don't need the header row
if (QueryResults.StartsWith("CONCAT('*"))
{
QueryResults = QueryResults.Substring("CONCAT('*".Length);
}
// Also strip the quoted form of the header row, then the TSV quoting (quote+tab and remaining quotes)
QueryResults = QueryResults.Replace("\"CONCAT('*{{lf|', REPLACE(REPLACE(p.page_title, '\"\"', '**DOUBLEQUOTE**'), '_', ' '), '}}')\"\r\n", "");
QueryResults = QueryResults.Replace("\"\t", "");
QueryResults = QueryResults.Replace("\"", "");
QueryResults = QueryResults.Replace("CONCAT('*{{lf|', REPLACE(REPLACE(p.page_title, '', '\"'), '_', ' '), '}}')", "");
// Because double quotes break TSV files, we replace those with a placeholder. Put the double quote back.
QueryResults = QueryResults.Replace("**DOUBLEQUOTE**", "\"");
// Get the text from User:B-bot/List of orphaned images/day-1 and move it to User:B-bot/List of orphaned images/day-2
String strTwoDayOldList = GetLastVersionByAcceptableUser(site, cstrOrphanedImagesDay1);
if (!String.IsNullOrWhiteSpace(strTwoDayOldList))
{
Page p = new Page(site, cstrOrphanedImagesDay2);
/*if (CallEditPage(site, p.title, p.text, strTwoDayOldList))
{*/
p.text = strTwoDayOldList;
p.Save();
//}
}
if (Abort) { LogToEventLog(ref site, MessageType.Error, "I was ordered to abort.", null); return; }
// Get the text from User:B-bot/List of orphaned images/day-0 and move it to User:B-bot/List of orphaned images/day-1
String strOneDayOldList = GetLastVersionByAcceptableUser(site, cstrOrphanedImagesDay0);
if (!String.IsNullOrWhiteSpace(strOneDayOldList))
{
Page p = new Page(site, cstrOrphanedImagesDay1);
/*if (CallEditPage(site, p.title, p.text, strTwoDayOldList))
{*/
p.text = strOneDayOldList;
p.Save();
//}
}
if (Abort) { LogToEventLog(ref site, MessageType.Error, "I was ordered to abort.", null); return; }
// Now, log our new query results to User:B-bot/List of orphaned images/day-0
Page p0 = new Page(site, cstrOrphanedImagesDay0);
/*if (CallEditPage(site, p0.title, p0.text, strTwoDayOldList))
{*/
p0.text = QueryResults;
p0.Save();
//}
if (Abort) { LogToEventLog(ref site, MessageType.Error, "I was ordered to abort.", null); return; }
}
// Count of files tagged in this run; compared against MaximumImagesPerRun in the loop below
int intImagesTagged = 0;
if (!SkipMainProcess)
{
if (0 < Properties.Settings.Default.MaximumImagesPerRun)
{
LogToEventLog(ref site, MessageType.Finish, "List of orphaned images updated. Now beginning tagging and notifications with a maximum of " + Properties.Settings.Default.MaximumImagesPerRun.ToString() + " images.", null);
}
else
{
LogToEventLog(ref site, MessageType.Finish, "List of orphaned images updated. Now beginning tagging and notifications.", null);
}
// NOTE(review): from the table-header string below through the end of the tagging loop this
// listing appears to have lost text (wiki table markup inside string literals, line breaks,
// and possibly a declaration of listCategories). Verify against the original source before building.
Page pgUserspaceTest = new Page(site, Properties.Settings.Default.UserspaceTestPage);
if (UserspaceTest)
{
if (CallEditPage(site, pgUserspaceTest.title, "", "Initial header"))
{
pgUserspaceTest.text = "Now beginning Orphaned Image Processor task on " + DateTime.Now.ToString() + " (local time) ...\r\n\r\n";
pgUserspaceTest.text += "
class=\"wikitable sortable\"\r\n | |||
\r\n! Page !! Timestamp !! Proposed edit\r\n";
pgUserspaceTest.Save(); } } // We want a fresh copy of our lists - we may have skipped the process above if we didn't need to update anything System.Threading.Thread.Sleep(1000 * Properties.Settings.Default.CheckStopDelaySeconds); String CurrentList = GetLastVersionByAcceptableUser(site, cstrOrphanedImagesDay0); System.Threading.Thread.Sleep(1000 * Properties.Settings.Default.CheckStopDelaySeconds); String OneDayOldList = GetLastVersionByAcceptableUser(site, cstrOrphanedImagesDay1).Replace("_", " "); System.Threading.Thread.Sleep(1000 * Properties.Settings.Default.CheckStopDelaySeconds); String TwoDayOldList = GetLastVersionByAcceptableUser(site, cstrOrphanedImagesDay2).Replace("_", " "); System.Threading.Thread.Sleep(1000 * Properties.Settings.Default.CheckStopDelaySeconds); // If any of these are empty, then we don't have anything to do if (String.IsNullOrWhiteSpace(CurrentList) | String.IsNullOrWhiteSpace(OneDayOldList) | String.IsNullOrWhiteSpace(TwoDayOldList))
{ return; } CurrentList = CurrentList.Replace("\r\n", "\r"); CurrentList = CurrentList.Replace("\n", "\r"); String[] arrCurrentList = CurrentList.Split(new String[] { "\r" }, StringSplitOptions.RemoveEmptyEntries); // Now, loop through the rows in CurrentList. If a file appears in there that appears all three places, delete it foreach (String item in arrCurrentList) { if (0 < Properties.Settings.Default.MaximumImagesPerRun && intImagesTagged >= Properties.Settings.Default.MaximumImagesPerRun) { break; } // We seem to timeout periodically ... if it's been more than 10 minutes, reconnect if (DateTime.Now.Subtract(dtmStarted).TotalMinutes > 10) { RecycleConnection(ref site); RecycleConnection(ref site2); dtmStarted = DateTime.Now; } String strPageName = item.Replace("*{{lf|", "").Replace("}}", "").Replace("_", " ").Trim(); if (!string.IsNullOrWhiteSpace(strPageName)) { System.Diagnostics.Debug.WriteLine("... Now checking " + strPageName + " ..."); // If it is new today, then don't process it if (!OneDayOldList.Contains("{{lf|" + strPageName + "}}") | !TwoDayOldList.Contains("{{lf|" + strPageName + "}}"))
{ continue; } strPageName = "File:" + strPageName; try { // Get the page Page pgCurrentImagePage = new Page(site, strPageName); pgCurrentImagePage.Load(); System.Threading.Thread.Sleep(1000 * Properties.Settings.Default.CheckStopDelaySeconds); // If we couldn't get the text, then move on - something might be wrong if (String.IsNullOrWhiteSpace(pgCurrentImagePage.text)) { continue; } // Did someone else tag the page before we got here? if (pgCurrentImagePage.text.ToUpper().Contains("Di-orphaned fair use".ToUpper())) { continue; } // Check to see if the page has been added to the Category:All orphaned non-free use Wikipedia files category List if (listCategories.Contains("Category:All orphaned non-free use Wikipedia files")) { continue; } System.Threading.Thread.Sleep(1000 * Properties.Settings.Default.CheckStopDelaySeconds); // Try parsing the fair use rationale to see if we can find any linked ARTICLES and purge them // Loop through and find all of the articles in the rationale String strArticleName = ""; try { for (Match match = Regex.Match(pgCurrentImagePage.text, @"\|\s*(a|A)rticle\s*="); null != match && 0 != match.Index; match = match.NextMatch()) { strArticleName = pgCurrentImagePage.text.Substring(match.Index + match.Length); // Now, Find the end of the article name Match matchEnd = Regex.Match(strArticleName, @"\|"); strArticleName = strArticleName.Substring(0, matchEnd.Index).Trim(); // If we found an article name, then purge it if (!String.IsNullOrWhiteSpace(strArticleName)) { PurgeImage(site2, strArticleName); // Now, let's see if that page is really a redirect try { Page pgPurgedPage = new Page(site2, strArticleName); pgPurgedPage.Load(); pgPurgedPage.ResolveRedirect(); SleepApiDelay(); if (pgPurgedPage.title != strArticleName) { // Okay, we have a redirect - we need to go ahead and purge that too PurgeImage(site2, strArticleName); } } catch(Exception ex) { System.Diagnostics.Trace.WriteLine(ex.ToString()); } } } } catch (Exception ex) { 
ErrorLogging.SendEmailMessage(true, "Error calling purge API", "There was an error calling purge api for " + strArticleName + ". Will ignore and move on.\r\n\r\n" + ex.ToString()); SleepApiDelay(); continue; } // Now use the API to check and see if the image is still an orphan try { String strImageUsage = site2.GetWebPage("https://en.wikipedia.org/w/api.php?action=query&list=imageusage&iutitle=" + Bot.UrlEncode(pgCurrentImagePage.title.Replace(" ", "_")) + "&iunamespace=0&format=json"); if (!String.IsNullOrWhiteSpace(strImageUsage)) { if (!Regex.IsMatch(strImageUsage, @"imageusage.*\[\s*\]")) { try { // The image is no longer orphaned. String strArticle = strImageUsage.Substring(1 + strImageUsage.IndexOf("\"title\":\"")); strArticle = strArticle.Substring(8); strArticle = strArticle.Substring(0, strArticle.IndexOf("\"")); // Convert special characters like \u00e8 String strUsedInArticleName = UnescapeString(strArticle); // The image is no longer orphaned. // 2017/02/08 - because there are so blessed many false positives, don't log this any more //LogToEventLog(ref site, MessageType.Informational, "Did not add {{tls|orfud}} tag to :" + strPageName + " because the image is no longer orphaned. Image is used in " + strUsedInArticleName + "", null); } catch (Exception ex) { LogToEventLog(ref site, MessageType.Error, "Error parsing imageusage for :" + strPageName + ".", ex); } continue; } } } catch (Exception ex) { ErrorLogging.SendEmailMessage(true, "Error calling imageusage API", "There was an error calling imageusage for getting page history for " + pgCurrentImagePage.title + ". 
Will ignore and move on.\r\n\r\n" + ex.ToString()); SleepApiDelay(); continue; } // Now, look and see if it is used in draft space try { String strImageUsage = site2.GetWebPage("https://en.wikipedia.org/w/api.php?action=query&list=imageusage&iutitle=" + Bot.UrlEncode(pgCurrentImagePage.title.Replace(" ", "_")) + "&iunamespace=118&format=json"); if (!String.IsNullOrWhiteSpace(strImageUsage)) { if (!Regex.IsMatch(strImageUsage, @"imageusage.*\[\s*\]")) { // The image is no longer orphaned. String strArticle = strImageUsage.Substring(1 + strImageUsage.IndexOf("\"Draft:")); strArticle = strArticle.Substring(0, strArticle.IndexOf("\"")); LogToEventLog(ref site, MessageType.DraftSpaceNote, "Did not add {{tls|orfud}} tag to :" + strPageName + " which is used only in draft space in the article :" + strArticle + ". This is NOT a permitted use under WP:NFCC#9 and such images may be removed from drafts and tagged as orphans at any time.", null); continue; } } } catch (Exception ex) { ErrorLogging.SendEmailMessage(true, "Error calling imageusage API", "There was an error calling imageusage for getting page history for " + pgCurrentImagePage.title + ". Will ignore and move on.\r\n\r\n" + ex.ToString()); SleepApiDelay(); continue; } // Tag the file with {{subst:orfud}} if (CallEditPage(site, pgCurrentImagePage.title, pgCurrentImagePage.text, "{{subst:orfud}}\r\n" + pgCurrentImagePage.text)) { pgCurrentImagePage.text = "{{subst:orfud}}\r\n" + pgCurrentImagePage.text; if (UserspaceTest) { pgUserspaceTest.text += " |
\r\n| :" + pgCurrentImagePage.title + " | ~~~~~ | " + pgCurrentImagePage.text.Substring(0, Math.Min(300, pgCurrentImagePage.text.Length)) + "\r\n"; pgUserspaceTest.Save(Properties.Settings.Default.OrfudTagComment, false); } else { pgCurrentImagePage.Save(pgCurrentImagePage.text, Properties.Settings.Default.OrfudTagComment, false); } } } catch (Exception ex) { // This probably means that the page was protected, but we will log it just to be sure LogToEventLog(ref site, MessageType.Error, "Failed to add {{tls|orfud}} tag to :" + strPageName + ".", ex); // If we failed to tag with orfud, we don't want to notify the user continue; } if (Abort) { LogToEventLog(ref site, MessageType.Error, "I was ordered to abort.", null); return; } intImagesTagged++; // Sleep for our editing delay SleepTaggingDelay(); // Determine the first contributor PageList pl = TryToFillFromPageHistory(ref site, strPageName); if (0 < pl.Count()) { String strNotifyUser = pl[pl.Count() - 1].lastUser; if (!String.IsNullOrWhiteSpace(strNotifyUser)) { try { // Retrieve this user's talk page Page pgUserTalkPage = new Page(site, "User talk:" + strNotifyUser); pgUserTalkPage.Load(); System.Threading.Thread.Sleep(1000 * Properties.Settings.Default.CheckStopDelaySeconds); // If it is a redirect, resolve it. if (pgUserTalkPage.IsRedirect()) { pgUserTalkPage.ResolveRedirect(); } // Can we notify this user? if (BotEditPermitted(pgUserTalkPage.text, Properties.Settings.Default.BotUserName, "orfud")) { if (CallEditPage(site, pgUserTalkPage.title, pgUserTalkPage.text, pgUserTalkPage.text + "\r\n{{subst:di-orphaned fair use-notice|" + strPageName + "}} --~~~~")) { pgUserTalkPage.text += "\r\n{{subst:di-orphaned fair use-notice|" + strPageName + "}} --~~~~"; if (UserspaceTest) { pgUserspaceTest.text += " | |
\r\n| :" + pgUserTalkPage.title + " | ~~~~~ | " +\r\n"; pgUserspaceTest.Save(String.Format(Properties.Settings.Default.OrfudWarningTagComment, strPageName), false); } else { pgUserTalkPage.Save(String.Format(Properties.Settings.Default.OrfudWarningTagComment, strPageName), false); } } } } catch (Exception ex) { LogToEventLog(ref site, MessageType.Error, "Failed to notify :" + "User talk:" + strNotifyUser + " that [[:" + strPageName + "]] is orphaned.", ex); } } } if (Abort) { LogToEventLog(ref site, MessageType.Error, "I was ordered to abort.", null); return; } // Sleep for our editing delay SleepTaggingDelay(); } } if (UserspaceTest) { if (CallEditPage(site, pgUserspaceTest.title, "", "footer")) { pgUserspaceTest.text += " |
pgUserspaceTest.Save();
}
}
}
LogToEventLog(ref site, MessageType.Finish, "B-Bot orphaned fair use image tagger process completed. " + intImagesTagged.ToString() + " orphaned images were tagged.", null);
}
catch (Exception ex)
{
// Connect to Wikipedia
Site site = TryToConnect("https://en.wikipedia.org", Properties.Settings.Default.BotUserName, Properties.Settings.Default.BotPassword);
LogToEventLog(ref site, MessageType.Error, "I crashed and will skip the rest of this run.", ex);
}
}
}