User:HersfoldCiteBot/Source

{{pp-protected|reason=it contains the source code to HersfoldCiteBot. While the bot does not operate off of this version of code, it is updated on occasion to reflect what the bot is using to run}}

This page contains a copy of the code used to run User:HersfoldCiteBot. This will be updated to reflect changes made to the code, which will be summarized at ../Version.

[http://en.wikipedia.org/w/index.php?title=User:HersfoldCiteBot/Source&oldid={{REVISIONID}} Permalink to this revision]

HersfoldCiteBot.java

{{collapse top}}

package citation;

import java.io.BufferedReader;

import java.io.BufferedWriter;

import java.io.FileNotFoundException;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStreamReader;

import java.io.OutputStreamWriter;

import java.io.UnsupportedEncodingException;

import javax.security.auth.login.*; // useful exception types

import java.net.*;

import java.io.*;

import java.util.*;

/*

* ERROR EXIT CODES:

* 1 = New messages

* 0 = Successful run

* -1 = Two failed login attempts

* -2 = IOException when creating log file

* -3 = FNF Exception when creating log file

* -4 = IOException when writing to log

* -5 = IOException (see log for location and cause)

* -6 = Bot blocked

* -7 = Bot unexpectedly logged out

* -8 = Log page is protected, someone needs a trouting

*/

public class HersfoldCiteBot implements Runnable{

/* ************************* VERSION ********************************** */

/**
 * The current version of the program. Version control is outlined at
 * w:User:HersfoldCiteBot/Version. The value is appended to the edit summary (see SUMMARY)
 * and written to the header of every log session.
 */
public static final String versionID = "1.1.4b";

/* ************************* GENERAL CONSTANTS ********************************** */

/**
 * Domain for the English Wikipedia. This domain may be altered to that of another Wiki for testing
 * purposes; however, HersfoldCiteBot must have a bot flagged account on that wiki to function.
 */
public static final String EN_WIKI_DOMAIN = "en.wikipedia.org";

/**
 * The name of the log file to record actions on.
 */
public static final String LOGFILE_NAME = "HersfoldCiteBotLog.txt";

/**
 * The maximum number of IOExceptions received when writing to the log before the bot shuts down.
 */
public static final int MAX_IO_COUNT = 5;

/**
 * Commented disclaimer for bot-generated website titles; appended directly after a title
 * that the bot fetched from the cited web page.
 * NOTE(review): the literal is empty in this copy although it is described as a "commented"
 * disclaimer - the text may have been lost when the page was copied; confirm against the
 * code the bot actually runs.
 */
public static final String BOT_TITLE_DISCLAIMER = "";

/**
 * Commented disclaimer for assumed website titles; placed directly in front of text that
 * followed the URL and was assumed to be the title.
 * NOTE(review): empty in this copy as well - see the note on BOT_TITLE_DISCLAIMER.
 */
public static final String ASSUMED_TITLE_DISCLAIMER = "";

/**
 * Edit summary to use when fixing citations
 */
public static final String SUMMARY = "Automatically correcting errors in {{tl|cite web}} template(s). BOT EDIT (Task) Version " + versionID;

/**
 * Page to record items needing manual review
 */
public static final String LOG_PAGE = "User:HersfoldCiteBot/Citation errors needing manual review";

/**
 * Category to pull articles from (name only, without the "Category:" prefix)
 */
public static final String SOURCE_CATEGORY = "Articles with broken citations";

/**
 * List of redirects for {{cite web}}. Each name is listed with a lower-case and an
 * upper-case first letter; the names are searched for literally, prefixed with "{{".
 */
public static final String[] CITE_WEB_TEMPLATES = {"cite web", "Cite web",
    "web reference", "Web reference",
    "web reference 4", "Web reference 4",
    "web-reference", "Web-reference",
    "web cite", "Web cite",
    "cite website", "Cite website",
    "cite-web", "Cite-web",
    "citeweb", "Citeweb",
    "web", "Web",
    "web citation", "Web citation",
    "cite url", "Cite url",
    "cite blog", "Cite blog",
    "cite Web", "Cite Web",
    "cite webpage", "Cite webpage",
    "cita web", "Cita web",
    "lien web", "Lien web",
    "c web", "C web",
    "cit web", "Cit web",
    "cw", "Cw",
    "cite tweet", "Cite tweet"};

/* ********************** PROJECT-SPECIFIC CONSTANTS *********************** */

// None for this bot

/* ************************* PRIVATE VARIABLES ********************************** */

/**

* Wiki class object representing the English Wikipedia.

*/

private Wiki wikipedia = new Wiki(EN_WIKI_DOMAIN);

/**

* The current time, used for timestamps in logs.

*/

private Calendar currentTime = null;

/**

* The millisecond count at the time of the start of the program; used to update currentTime.

*/

private long lastTime = 0;

/**

* The list of articles to check for corrections.

*/

private ArrayList articles = new ArrayList();

/**

* The operation log file

*/

private File logfile = new File(LOGFILE_NAME);

/**

* A BufferedWriter used to print to logfile

*/

private BufferedWriter log = null;

/**

* A flag to indicate if the bot is logged in or not

*/

private boolean loggedin = false;

/**

* Counter to keep track of login attempts

*/

private int loginCount = 0;

/**

* Holds the exit code to give to the GUI. If this is anything other than 0, the bot will not do anything.

*/

private int exitCode = 0;

/**

* Number of IO Exceptions received when writing to the log. Bot terminates at 5.

*/

private int ioCount = 0;

/**

* List of pages to be manually reviewed at the end of a run.

*/

private TreeMap> toBeReviewed = new TreeMap>();

/* ************************* TRIAL MODE SETTINGS ************************* */

/**

* Trial edit counter - prevents the bot from editing too much in trial runs.

*/

private int editLimit = 10;

/**

* Trial run flag - set to true if we need to pay attention to editLimit

*/

private boolean trialRun = true;

/* ************************* CONSTRUCTORS ************************* */

/**
 * Default constructor.
 *
 * Only configures the wiki connection by calling setMaxLag(5) on it; logging in is a
 * separate step done through login().
 * NOTE(review): the unit of the maxlag value is presumably seconds - confirm against the
 * Wiki class.
 */
public HersfoldCiteBot(){
    wikipedia.setMaxLag(5);
}

/* ************************* OPERATING CODE ************************* */

/**
 * Performs one complete bot run. Requires a prior successful login().
 *
 * Steps: wait briefly, stop if the bot has new messages, collect the non-template members
 * of SOURCE_CATEGORY, then for each page look for known {{cite web}} error messages,
 * attempt the corrections and save the page. Pages that could not be handled are collected
 * in toBeReviewed and written to LOG_PAGE at the end. Finally the bot logs out.
 *
 * Any problem is reported through the exit code (see getExitCode()); once an exit code
 * other than 0 is set, no further pages are processed.
 */
public void run(){
    try{
        Thread.sleep(1500);
    }
    catch(InterruptedException e){
        // Keep the interrupt visible to whoever owns this thread instead of dropping it
        Thread.currentThread().interrupt();
    }
    abortIfNewMessages();
    if(loggedin && !fatalErrorExists()){
        try{
            addToLog("Getting articles in Category:Articles with broken citations");
            String[] catMembers = wikipedia.getCategoryMembers(SOURCE_CATEGORY);
            for( String page : catMembers ){
                if(wikipedia.namespace(page) != Wiki.TEMPLATE_NAMESPACE){
                    articles.add(page);
                }
            }
        }
        catch(IOException e){
            IOError("run() when getting category members", e.getMessage());
        }
        abortIfNewMessages();
        if(!fatalErrorExists()){
            // Iterate over a snapshot: logout() clears 'articles', and logout() can be
            // triggered from inside this loop (new messages, IO errors), which would
            // otherwise end in a ConcurrentModificationException.
            for( String page : new ArrayList<String>(articles) ){
                if(fatalErrorExists())
                    break;
                addToLog("Processing " + page);
                System.out.println("Processing " + page);
                abortIfNewMessages();
                if(fatalErrorExists())
                    break;
                if(searchForCiteWebErrors(page) && !fatalErrorExists()){
                    addToLog("Possible fixable errors found, attempting corrections");
                    System.out.println("Attempting to fix errors...");
                    String pagecontent = null;
                    abortIfNewMessages();
                    if(!fatalErrorExists())
                        pagecontent = correctCiteWebErrors(page);
                    abortIfNewMessages();
                    if(!fatalErrorExists() && pagecontent != null){
                        try{
                            addToLog("Saving changes to " + page + "\n");
                            System.out.println("Saving changes to " + page);
                            wikipedia.edit(page, pagecontent, SUMMARY, false);
                            editLimit--;
                        }
                        catch(FileNotFoundException e){
                            System.err.println("FILE NOT FOUND EXCEPTION at run(), page " + page + " does not exist.");
                            addToLog("This page may have been deleted, please double check. Marking for review...");
                            flagForReview(page, "This page seems to have been deleted, I got a FileNotFoundException when trying to edit it.");
                        }
                        catch(IOException e){
                            IOError("run() when editing " + page, e.getMessage());
                        }
                        catch(AccountLockedException e){
                            System.err.println("ACCOUNT LOCKED EXCEPTION at run() when editing, bot is blocked and will terminate");
                            addToLog("The bot appears to have been blocked. System shutting down...");
                            exitCode = -6;
                        }
                        catch(CredentialException e){
                            System.err.println("CREDENTIAL EXCEPTION at run() when editing, " + page + " is protected.");
                            addToLog("This page is protected; it will need to be handled manually. Marking for review...");
                            flagForReview(page, "This page is protected, I got a CredentialException when trying to edit it.");
                        }
                        catch(LoginException e){
                            System.err.println("LOGIN EXCEPTION at run() when editing " + page + ", bot will terminate");
                            addToLog("The bot has been unexpectedly logged out and will terminate.");
                            exitCode = -7;
                        }
                    }
                }
                else if(!fatalErrorExists()){
                    addToLog("No {{cite web}} errors found in this article.\n");
                    System.out.println("No errors found.");
                }
                // The edit limit only applies to trial runs
                if(trialRun && editLimit <= 0)
                    break;
            }
        }
        // Create manual review log page - STILL DO THIS even if we have new messages
        // NOTE(review): when exitCode is 1 the bot has already logged out in
        // abortIfNewMessages(); whether this edit can still succeed depends on the Wiki class.
        if(exitCode == 0 || exitCode == 1){
            StringBuilder logpagecontent = new StringBuilder();
            logpagecontent.append("This page contains a list of pages that contain {{tl|cite web}} errors that HersfoldCiteBot has attempted to fix, but was ");
            logpagecontent.append("unable to locate or resolve the problem itself. Most errors noted on this page will need to be handled manually.\n\n");
            logpagecontent.append("For more information, see User:HersfoldCiteBot or :Category:Articles with broken citations.\n\n");
            logpagecontent.append("== Errors as of " + logDatedTimestamp() + " ==\n\n");
            for( String page : toBeReviewed.keySet() ){
                logpagecontent.append("* " + page + "\n");
                for( String problem : toBeReviewed.get(page) )
                    logpagecontent.append("** " + problem + "\n");
            }
            try{
                addToLog("Saving manual corrections log to " + LOG_PAGE);
                System.out.println("Saving updates to " + LOG_PAGE);
                wikipedia.edit(LOG_PAGE, logpagecontent.toString(), "Logging errors needing manual review as of " + logDatedTimestamp(), false);
            }
            catch(IOException e){
                IOError("run() when editing log page", e.getMessage());
            }
            catch(AccountLockedException e){
                System.err.println("ACCOUNT LOCKED EXCEPTION at run() when editing, bot is blocked and will terminate");
                addToLog("The bot appears to have been blocked. System shutting down...");
                exitCode = -6;
            }
            catch(CredentialException e){
                System.err.println("CREDENTIAL EXCEPTION at run() when editing log page.");
                addToLog("Someone is trying to break me and protected my log page. Please go unprotect it and trout whoever did the protection.");
                exitCode = -8;
            }
            catch(LoginException e){
                System.err.println("LOGIN EXCEPTION at run() when editing log page, bot will terminate");
                addToLog("The bot has been unexpectedly logged out and will terminate.");
                exitCode = -7;
            }
        }
        logout();
    }
    else if(!loggedin && exitCode != 1)
        System.err.println("HersfoldCiteBot is not logged in!");
    if(exitCode == 1)
        System.err.println("The bot has new messages on Wikipedia and has shut down.");
}

/**
 * Records a problem for a page in toBeReviewed so it appears on the manual review log page.
 * @param page the title of the affected page
 * @param problem a description of what went wrong
 */
private void flagForReview(String page, String problem){
    ArrayList<String> problems = toBeReviewed.get(page);
    if(problems == null){
        problems = new ArrayList<String>();
        toBeReviewed.put(page, problems);
    }
    problems.add(problem);
}

/**
 * Checks whether the bot account has new messages on the wiki. If it does, the event is
 * logged, the bot logs out and the exit code is set to 1, which stops all further work.
 *
 * Nothing is checked when an exit code has already been set: the bot is shutting down
 * anyway, and a failing request must not replace the exit code that describes the
 * original problem.
 */
private void abortIfNewMessages(){
    if(fatalErrorExists())
        return;
    try{
        if(wikipedia.hasNewMessages()){
            addToLog("New messages received on Wikipedia: shutting down for safety.\n");
            System.err.println("SYSTEM DISABLED - New messages received on Wikipedia.");
            logout();
            exitCode = 1;
        }
    }
    catch(IOException e){
        IOError("abortIfNewMessages()", e.getMessage());
    }
}

/**
 * Convenience overload of login(char[]) for callers that hold the password as a String.
 * @param password a password entered by the user
 * @return true on a successful login
 */
protected boolean login(String password){
    char[] passwordChars = password.toCharArray();
    return login(passwordChars);
}

/**
 * Opens the log file and logs into the wiki as HersfoldCiteBot with the provided password.
 *
 * Does nothing (and returns false) when an exit code other than 0 is already set. After
 * the second failed attempt the exit code is set to -1, so no further attempts are possible.
 * @param password a password entered by the user
 * @return true on a successful login
 */
protected boolean login(char[] password){
    loggedin = false;
    if(!fatalErrorExists()){
        // A previous attempt may have left a writer open. Close it first so its buffered
        // entries reach the file and the handle is not leaked when the log is reopened.
        if(log != null){
            try{
                log.close();
            }
            catch(IOException e){
                System.err.println("IO EXCEPTION when closing the previous log writer: " + e.getMessage());
            }
        }
        setupLogFile();
        // Without a usable log file the bot must not continue (exit code is already set)
        if(fatalErrorExists())
            return false;
        try{
            addToLog("Attempting login...\n");
            System.out.println("Attempting login...");
            wikipedia.login("HersfoldCiteBot", password);
            loggedin = true;
        }
        catch(FailedLoginException e){
            System.err.println("Incorrect password provided. Login failed.");
            loggedin = false;
            loginCount++;
        }
        catch(IOException e){
            IOError("login()", e.getMessage());
        }
        if(loggedin && !fatalErrorExists()){
            addToLog("Successfully logged in as HersfoldCiteBot on en.wp.\n");
        }
        if(loginCount == 2 && !fatalErrorExists()){
            System.err.println("Two failed login attempts; aborting...");
            addToLog("Login failed twice, closing system.\n");
            logout(); // Safety step
            exitCode = -1;
        }
    }
    return loggedin;
}

/**
 * Logs out of Wikipedia and closes the log file.
 *
 * The log writer is closed whenever it exists, not only after a successful login;
 * otherwise entries written during failed login attempts would stay in the buffer and
 * never reach the file.
 */
protected void logout(){
    if(loggedin){
        addToLog("Logging out and shutting down.\n");
    }
    if(log != null){
        try{
            log.close();
        }
        catch(IOException e){
            // Nothing more can be written to the log at this point; report on the console only
            System.err.println("IO EXCEPTION when closing the log file: " + e.getMessage());
        }
    }
    wikipedia.logout();
    // Reset in case we log in again later
    loggedin = false;
    articles.clear();
    // exitCode is not reset to force the user to check why the bot stopped working
}

/**
 * Reports the login state tracked by this object: set to true after a successful login()
 * and reset to false by logout() and at the start of every login attempt.
 * @return true if the bot is logged in
 */
public boolean isLoggedIn(){
    return loggedin;
}

/**
 * Two bots are considered equal when they currently have the same exit code.
 * @param obj the object to compare with
 * @return true if obj is a HersfoldCiteBot with the same exit code
 */
@Override
public boolean equals(Object obj){
    if(obj instanceof HersfoldCiteBot){
        return equals((HersfoldCiteBot) obj);
    }
    return false;
}

/**
 * Type-specific variant of equals(Object).
 * @param bot the bot to compare with; null is never equal
 * @return true if both bots currently have the same exit code
 */
public boolean equals(HersfoldCiteBot bot){
    return bot != null && this.exitCode == bot.exitCode;
}

/**
 * Hash code consistent with equals(): derived from the exit code only.
 * @return the current exit code
 */
@Override
public int hashCode(){
    return exitCode;
}

/**
 * Opens the log file for writing and writes the session header. The log file will not be
 * overwritten if it already exists, but will be added to.
 *
 * On failure the exit code is set: -2 when the file cannot be created or the encoding is
 * unavailable, -3 when the file cannot be opened, -4 when the header cannot be written.
 */
private void setupLogFile(){
    if(!logfile.exists() || !logfile.isFile() || !logfile.canWrite()){
        try{
            logfile.createNewFile();
        }
        catch(IOException e){
            System.err.println("An error occured when creating the logfile " + LOGFILE_NAME + ":");
            System.err.println(e.getMessage());
            exitCode = -2;
        }
    }
    if(!fatalErrorExists()){
        // Java's "UTF-16" encoder writes a byte-order mark at the start of every stream.
        // When appending to a file that already has content, that would put a stray mark
        // in the middle of the file, so continue in big-endian UTF-16 without a mark
        // (the byte order that "UTF-16" itself writes).
        String encoding = logfile.length() > 0 ? "UTF-16BE" : "UTF-16";
        FileOutputStream output = null;
        try{
            output = new FileOutputStream(logfile, true); // true appends stuff to it instead of overwriting
            log = new BufferedWriter(new OutputStreamWriter(output, encoding));
        }
        catch(FileNotFoundException e){
            System.err.println("FILE NOT FOUND EXCEPTION in setupLogFile() with " + LOGFILE_NAME + ":");
            System.err.println(e.getMessage());
            exitCode = -3;
        }
        catch(UnsupportedEncodingException e){
            // Should not happen, every Java platform must support these encodings. If it
            // does, stop cleanly instead of running on with no log writer.
            System.err.println("UNSUPPORTED ENCODING EXCEPTION in setupLogFile(): " + e.getMessage());
            try{
                output.close();
            }
            catch(IOException closeError){
                System.err.println(closeError.getMessage());
            }
            exitCode = -2;
        }
        if(!fatalErrorExists()){
            currentTime = new GregorianCalendar(TimeZone.getTimeZone("Universal"), Locale.ENGLISH);
            lastTime = System.currentTimeMillis();
            try{
                log.append("\n\n");
                log.append("------------------------------------------------------\n");
                log.append("HersfoldCiteBot Operation Log\n");
                log.append("Running version " + versionID + "\n");
                log.append(logDatedTimestamp()+"\n");
                log.append("------------------------------------------------------\n\n");
                if(trialRun){
                    String message = "This is a trial run; the bot will make " + editLimit + " edits, then stop.";
                    log.append(message + "\n\n");
                    System.out.println("\n" + message + "\n");
                }
            }
            catch(IOException e){
                System.err.println("IO EXCEPTION in setupLogFile()");
                addToLog("ERROR - IOException when making log file.");
                exitCode = -4;
            }
        }
    }
}

/**
 * Returns a formatted timestamp with date in UTC. Requires that the log file has been set
 * up, because that is where currentTime is created.
 * @return the current time and date in UTC, formatted as "Month Day Year, HH:MM:SS UTC"
 */
private String logDatedTimestamp(){
    // Set the calendar straight from the clock. Adding the elapsed milliseconds as an int
    // overflows after about 24 days and drifts because the clock is read twice.
    lastTime = System.currentTimeMillis();
    currentTime.setTimeInMillis(lastTime);

    // Indexed by Calendar.MONTH, which runs from Calendar.JANUARY (0) to Calendar.DECEMBER (11)
    final String[] monthNames = {"January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December"};
    int month = currentTime.get(Calendar.MONTH);
    String timestamp = "";
    if(month >= 0 && month < monthNames.length)
        timestamp = monthNames[month] + " ";

    timestamp += currentTime.get(Calendar.DAY_OF_MONTH) + " " + currentTime.get(Calendar.YEAR) + ", ";
    timestamp += twoDigits(currentTime.get(Calendar.HOUR_OF_DAY)) + ":" +
        twoDigits(currentTime.get(Calendar.MINUTE)) + ":" +
        twoDigits(currentTime.get(Calendar.SECOND)) + " UTC";
    return timestamp;
}

/**
 * Formats a non-negative clock field with a leading zero when it is below 10.
 */
private String twoDigits(int value){
    return (value < 10 ? "0" : "") + value;
}

/**
 * Returns a formatted timestamp in UTC. Requires that the log file has been set up,
 * because that is where currentTime is created.
 * @return the current time in UTC, formatted as "HH:MM:SS - "
 */
private String timestamp(){
    // Set the calendar straight from the clock. Adding the elapsed milliseconds as an int
    // overflows after about 24 days and drifts because the clock is read twice.
    lastTime = System.currentTimeMillis();
    currentTime.setTimeInMillis(lastTime);

    int hour = currentTime.get(Calendar.HOUR_OF_DAY);
    int minute = currentTime.get(Calendar.MINUTE);
    int second = currentTime.get(Calendar.SECOND);
    return (hour < 10 ? "0" : "") + hour + ":" +
        (minute < 10 ? "0" : "") + minute + ":" +
        (second < 10 ? "0" : "") + second + " - ";
}

/**
 * Appends a timestamped entry to the log file.
 *
 * A failed write is counted; when MAX_IO_COUNT failures have been reached the exit code
 * is set to -4 so the bot shuts down. If the log file has not been opened yet, the message
 * is printed to the error console instead.
 * @param message the text to record; a line break is added after it
 */
private void addToLog(String message){
    if(log == null || currentTime == null){
        // Log file was never opened (or opening failed) - do not crash, report on the console
        System.err.println("Log file is not open; unable to record: " + message);
        return;
    }
    try{
        log.append(timestamp() + message + "\n");
    }
    catch(IOException e){
        ioCount++;
        if(ioCount < MAX_IO_COUNT)
            System.err.println("IO EXCEPTION when making log entry. Bot will continue running.");
        else{
            System.err.println("IO EXCEPTION when making log entry. This is the fifth such exception encountered. Bot will shut down.");
            exitCode = -4;
        }
    }
}

/**
 * Returns the current exit code: 0 while everything is fine, 1 when the bot stopped
 * because of new messages, negative values for errors (see the list of exit codes at the
 * top of this class).
 * @return the current exit code
 */
public int getExitCode(){
    return exitCode;
}

/**
 * Tells whether the bot has to stop working. This is the case for every exit code other
 * than 0, which includes exit code 1 (new messages) as well as the negative error codes.
 * @return true if the exit code is not 0
 */
public boolean fatalErrorExists(){
    return exitCode != 0;
}

/**
 * Error handler for IOExceptions; prints an error message to the screen, logs the error
 * together with its location and cause, logs out and sets the exit code to -5.
 * @param source the method the IOException was received in
 * @param message the exception's message
 */
private void IOError(String source, String message){
    System.err.println("IO EXCEPTION at " + source + ", bot will terminate.");
    System.err.println(message);
    // Exit code -5 tells the operator to look in the log for location and cause,
    // so both have to be written there.
    addToLog("IO Exception received at " + source + " (" + message + "), bot will shut down.");
    logout();
    exitCode = -5;
}

/**
 * Finds the first {{cite web}} template (or one of its redirects, see CITE_WEB_TEMPLATES)
 * at or after the given position.
 *
 * A hit only counts when the template name ends there. Without that check a short name
 * such as "web" or "cw" would also match unrelated templates whose names merely start
 * with those letters.
 * @param pagecontent the wikitext to search
 * @param startpoint the index to start searching from
 * @return the index of the opening "{{" of the first such template, or -1 if there is none
 */
private int findCiteWebStart(String pagecontent, int startpoint){
    int startIndex = -1;
    for(int i = 0; i < CITE_WEB_TEMPLATES.length; i++){
        String opener = "{{" + CITE_WEB_TEMPLATES[i];
        int tempIndex = pagecontent.indexOf(opener, startpoint);
        // Skip hits where the name continues, e.g. "{{web" inside "{{webarchive"
        while(tempIndex != -1 && !templateNameEndsAt(pagecontent, tempIndex + opener.length())){
            tempIndex = pagecontent.indexOf(opener, tempIndex + 1);
        }
        if(tempIndex != -1 && (startIndex == -1 || tempIndex < startIndex)){
            startIndex = tempIndex;
        }
    }
    return startIndex;
}

/**
 * Tells whether the character at the given index can follow a complete template name:
 * a pipe, a closing brace, whitespace or the start of an HTML comment.
 */
private boolean templateNameEndsAt(String text, int index){
    if(index >= text.length())
        return false;
    char next = text.charAt(index);
    return next == '|' || next == '}' || next == '<' || Character.isWhitespace(next);
}

private boolean searchForCiteWebErrors(String pagetitle){

String displaytext = null;

try{

displaytext = wikipedia.getRenderedText(pagetitle);

}

catch(IOException e){

IOError("searchForCiteWebErrors() on " + pagetitle, e.getMessage());

}

if(displaytext.contains("Error: no |title= specified when using {{

return true;

if(displaytext.contains("Error: If you specify |archiveurl=, you must also specify |archivedate="))

return true;

return false;

}

private String correctCiteWebErrors(String pagetitle){

String pagecontent = null;

try{

addToLog("Getting text of " + pagetitle);

pagecontent = wikipedia.getPageText(pagetitle);

}

catch(IOException e){

IOError("correctCiteWebErrors() on " + pagetitle, e.getMessage());

}

ArrayList correctedTemplates = new ArrayList();

if(!fatalErrorExists()){

int startIndex = findCiteWebStart(pagecontent, 0);

while(startIndex != -1 && !fatalErrorExists()){

int endIndex = pagecontent.indexOf("}}", startIndex) + 2; //include }}

String citeTemplate = pagecontent.substring(startIndex, endIndex);

final String citeTemplateOriginal = citeTemplate;

if(!correctedTemplates.contains(citeTemplate)){

// Correct missing titles

// Looks for a lack of |title= OR for ...|title= (blank space) |...

// Both are empty parameters

if(indexOfRegex(citeTemplate, "\\|\\s*title\\s*=") == -1 || indexOfRegex(citeTemplate, "\\|\\s*title\\s*=\\s*\\|") != -1){

addToLog("Trying to add a title= parameter to " + citeTemplate);

boolean existingArg = false;

int titleArgEnd = 0;

if(indexOfRegex(citeTemplate, "\\|\\s*title\\s*=\\s*\\|") != -1){

titleArgEnd = citeTemplate.indexOf("=", indexOfRegex(citeTemplate, "title\\s*=")) + 1;

existingArg = true;

}

int urlStart = indexOfRegex(citeTemplate, "url\\s*=");

boolean urlNotFound = false;

if(urlStart == -1){

urlStart = citeTemplate.indexOf("http://");

if(urlStart == -1){

urlStart = citeTemplate.indexOf("https://");

if(urlStart == -1){

addToLog("I can't find the |url= parameter in this citation.");

ArrayList problems = null;

if(toBeReviewed.containsKey(pagetitle))

problems = toBeReviewed.get(pagetitle);

else

problems = new ArrayList();

problems.add("I can't find the |url= parameter in this citation: " + citeTemplate + "");

toBeReviewed.put(pagetitle, problems);

urlNotFound = true;

}

}

// Add |url= parameter to fix template

// If statement is present to avoid {{cite web||url=http://...

if(citeTemplate.substring(0, urlStart).endsWith("|")){

citeTemplate = citeTemplate.substring(0, urlStart - 1) + "|url=" + citeTemplate.substring(urlStart);

}

else{

citeTemplate = citeTemplate.substring(0, urlStart) + "|url=" + citeTemplate.substring(urlStart);

}

urlStart = citeTemplate.indexOf("=", indexOfRegex(citeTemplate, "url\\s*=")) + 1; // Correct index

}

else{

urlStart = citeTemplate.indexOf("=", urlStart) + 1; // index of the = in the parameter plus 1

}

if(!urlNotFound){

int pipe = citeTemplate.indexOf("|", urlStart);

int space = citeTemplate.indexOf(" ", urlStart);

int bracket = citeTemplate.indexOf("}}", urlStart);

int urlEnd;

if(((pipe < space || space == -1) && pipe < bracket) && pipe != -1)

urlEnd = pipe;

else if(space < bracket && space != -1)

urlEnd = space;

else

urlEnd = bracket;

boolean titlefound = false;

// Check to see if title is included with url ([url title] error)

if(urlEnd == space){

int argEnd;

if(pipe != -1 && pipe < bracket){

argEnd = pipe;

}

else{

argEnd = bracket;

}

String possibleTitle = citeTemplate.substring(urlEnd + 1, argEnd).trim();

// If there are alphanumeric characters after the end of the URL, we'll assume that is the title

if(indexOfRegex(possibleTitle, "[A-Za-z0-9]") != -1){

titlefound = true;

if(existingArg){

//This is more complex than I'd like, but oh well.

citeTemplate = citeTemplate.replaceFirst("\\|\\s*title\\s*=", ""); // remove existing empty param, assuming there is only one

citeTemplate = citeTemplate.replace(possibleTitle, ""); // remove identified title

bracket = citeTemplate.indexOf("}}"); // figure out where that ended up

citeTemplate = citeTemplate.substring(0, bracket) + "|title=" + ASSUMED_TITLE_DISCLAIMER + possibleTitle + "}}";

}

else{

citeTemplate = citeTemplate.substring(0, urlEnd + 1) + "|title=" +

ASSUMED_TITLE_DISCLAIMER + citeTemplate.substring(urlEnd + 1);

}

}

}

if(!titlefound){

String url = citeTemplate.substring(urlStart, urlEnd).trim();

boolean logged = false;

if(url.matches("\\s*")){ // If is empty or completely whitespace

addToLog("There does not appear to be a URL for this template.");

ArrayList problems = null;

if(toBeReviewed.containsKey(pagetitle))

problems = toBeReviewed.get(pagetitle);

else

problems = new ArrayList();

problems.add("There does not seem to be a URL in this template: " + citeTemplate);

toBeReviewed.put(pagetitle, problems);

logged = true;

}

else{

if(!url.startsWith("http://") && !url.startsWith("https://"))

url = "http://" + url;

// Don't try to find titles for PDFs, not going to work

if(!url.contains(".pdf") && !url.contains(".PDF")){

String linktitle = null;

titlefound = false;

try{

URLConnection connection = new URL(url).openConnection();

connection.connect();

BufferedReader urlcontent = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"));

while(urlcontent.ready() && !titlefound){

String nextline = urlcontent.readLine();

if(nextline != null && (nextline.contains("") || nextline.contains("<TITLE>"))){</p> <p>int titleStart = nextline.indexOf("<title>") + 7;</p> <p>if(titleStart == -1){</p> <p>titleStart = nextline.indexOf("<TITLE>") + 7;</p> <p>}</p> <p>int titleEnd = -1;</p> <p>titleEnd = nextline.indexOf("", titleStart);

if(titleEnd == -1){

titleEnd = nextline.indexOf("", titleStart);

}

if(titleEnd != -1){

linktitle = nextline.substring(titleStart, titleEnd);

}

else{

linktitle = "";

while(titleEnd == -1){

nextline = urlcontent.readLine();

titleEnd = nextline.indexOf("");

if(titleEnd == -1){

titleEnd = nextline.indexOf("");

}

if(titleEnd == -1){

linktitle += nextline;

}

else{

linktitle += nextline.substring(0, titleEnd);

}

}

}

titlefound = true;

}

else if(nextline == null || (nextline.contains("") || nextline.contains(""))){

addToLog("Unable to find title for page at " + url);

ArrayList problems = null;

if(toBeReviewed.containsKey(pagetitle))

problems = toBeReviewed.get(pagetitle);

else

problems = new ArrayList();

problems.add("Unable to find a title for the page at " + url);

toBeReviewed.put(pagetitle, problems);

logged = true;

}

}

}

catch(UnsupportedEncodingException e){}//Won't happen

catch(IOException e){

addToLog("IOException recieved when trying to access " + url + " .");

ArrayList problems = null;

if(toBeReviewed.containsKey(pagetitle))

problems = toBeReviewed.get(pagetitle);

else

problems = new ArrayList();

problems.add("IOException received when trying to access " + url + " . Depending on the reason for the " +

"error (provided at the end of this entry), this may be a temporary problem that the bot can " +

"resolve itself on a later run. IOException message: " + e.getMessage());

toBeReviewed.put(pagetitle, problems);

logged = true;

}

if(linktitle != null){

String newtitle = (existingArg ? "" : "|title=") + linktitle + BOT_TITLE_DISCLAIMER;

if(existingArg){

citeTemplate = citeTemplate.substring(0, titleArgEnd) + newtitle + citeTemplate.substring(titleArgEnd);

}

else{

int bracketIndex = citeTemplate.indexOf("}}");

citeTemplate = citeTemplate.substring(0, bracketIndex) + newtitle + "}}";

}

}

else if(!logged){

addToLog("Unable to find title for page at " + url);

ArrayList problems = null;

if(toBeReviewed.containsKey(pagetitle))

problems = toBeReviewed.get(pagetitle);

else

problems = new ArrayList();

problems.add("Unable to find a title for the page at " + url);

toBeReviewed.put(pagetitle, problems);

}

}

else{

addToLog("Referenced website at " + url + " appears to be a PDF, requires manual attention to add title.");

ArrayList problems = null;

if(toBeReviewed.containsKey(pagetitle))

problems = toBeReviewed.get(pagetitle);

else

problems = new ArrayList();

problems.add("Referenced website at " + url + " appears to be a PDF, requires manual attention to add title.");

toBeReviewed.put(pagetitle, problems);

}

}

}

}

}

if(!fatalErrorExists()){

// Add missing archivedate= args for web.archive.org sites

// If archiveurl= is neither missing nor empty, AND archivedate= is either missing or empty

if((indexOfRegex(citeTemplate, "\\|\\s*archiveurl\\s*=") != -1 && indexOfRegex(citeTemplate, "\\|\\s*archiveurl\\s*=\\s*\\|") == -1) &&

(indexOfRegex(citeTemplate, "\\|\\s*archivedate\\s*=") == -1 || indexOfRegex(citeTemplate, "\\|\\s*archivedate\\s*=\\s*\\|") != -1)){

addToLog("Trying to add an archivedate= parameter to " + citeTemplate);

boolean existingArg = false;

int dateArgEnd = 0;

if(indexOfRegex(citeTemplate, "\\|\\s*archivedate\\s*=\\s*\\|") != -1){

dateArgEnd = citeTemplate.indexOf("=", indexOfRegex(citeTemplate, "archivedate\\s*=")) + 1;

existingArg = true;

}

int urlStart = citeTemplate.indexOf("=", indexOfRegex(citeTemplate, "archiveurl\\s*=")) + 1;

int pipe = citeTemplate.indexOf("|", urlStart);

int space = citeTemplate.indexOf(" ", urlStart);

int bracket = citeTemplate.indexOf("}}", urlStart);

int urlEnd;

if(((pipe < space || space == -1) && pipe < bracket) && pipe != -1)

urlEnd = pipe;

else if(space < bracket && space != -1)

urlEnd = space;

else

urlEnd = bracket;

String archiveurl = citeTemplate.substring(urlStart, urlEnd).trim();

if(archiveurl.contains("web.archive.org")){

int yearStart = archiveurl.indexOf("/web/") + 5;

int monthStart = yearStart + 4;

int dayStart = monthStart + 2;

String year = archiveurl.substring(yearStart, monthStart);

String month = archiveurl.substring(monthStart, dayStart);

String day = archiveurl.substring(dayStart, dayStart + 2);

// Verify

if(year.matches("\\d{4}") && month.matches("\\d{2}") && day.matches("\\d{2}")){

switch(Integer.parseInt(month)){

case 1: month = "January"; break;

case 2: month = "February"; break;

case 3: month = "March"; break;

case 4: month = "April"; break;

case 5: month = "May"; break;

case 6: month = "June"; break;

case 7: month = "July"; break;

case 8: month = "August"; break;

case 9: month = "September"; break;

case 10: month = "October"; break;

case 11: month = "November"; break;

case 12: month = "December"; break;

default: // we'll just leave it as is

}

if(existingArg)

citeTemplate = citeTemplate.substring(0, dateArgEnd) + month + " " + day + " " + year + citeTemplate.substring(dateArgEnd);

else

citeTemplate = citeTemplate.substring(0, bracket) + "|archivedate=" + month + " " + day + " " + year + "}}";

}

else{

addToLog("This web.archive.org link appears to be botched. Marking for review...");

ArrayList problems = null;

if(toBeReviewed.containsKey(pagetitle))

problems = toBeReviewed.get(pagetitle);

else

problems = new ArrayList();

problems.add("web.archive.org link " + archiveurl + " appears to be botched. Please check and add the archival date.");

toBeReviewed.put(pagetitle, problems);

}

}

else{

addToLog("The archive link isn't from web.archive.org. Marking for review...");

ArrayList problems = null;

if(toBeReviewed.containsKey(pagetitle))

problems = toBeReviewed.get(pagetitle);

else

problems = new ArrayList();

problems.add(archiveurl + " isn't a web.archive.org address, so I wasn't able to pull a date from it.");

toBeReviewed.put(pagetitle, problems);

}

}

/* DISABLED per recommendations at BRFA

// While we're at it, add missing dates too

if(indexOfRegex(citeTemplate, "\\|\\s*accessdate\\s*=") == -1 || indexOfRegex(citeTemplate, "\\|\\s*accessdate\\s*=\\s*\\|") != -1){

addToLog("Trying to add an accessdate= parameter to " + citeTemplate);

boolean existingArg = false;

int dateArgEnd = 0;

if(indexOfRegex(citeTemplate, "\\|\\s*accessdate\\s*=\\s*\\|") != -1){

dateArgEnd = citeTemplate.indexOf("=", indexOfRegex(citeTemplate, "accessdate\\s*=")) + 1;

existingArg = true;

}

currentTime.add(Calendar.MILLISECOND, (int)(System.currentTimeMillis() - lastTime));

String timestamp = "" + currentTime.get(Calendar.DAY_OF_MONTH);

switch(currentTime.get(Calendar.MONTH)){

case Calendar.JANUARY: timestamp += " January "; break;

case Calendar.FEBRUARY: timestamp += " February "; break;

case Calendar.MARCH: timestamp += " March "; break;

case Calendar.APRIL: timestamp += " April "; break;

case Calendar.MAY: timestamp += " May "; break;

case Calendar.JUNE: timestamp += " June "; break;

case Calendar.JULY: timestamp += " July "; break;

case Calendar.AUGUST: timestamp += " August "; break;

case Calendar.SEPTEMBER: timestamp += " September "; break;

case Calendar.OCTOBER: timestamp += " October "; break;

case Calendar.NOVEMBER: timestamp += " November "; break;

case Calendar.DECEMBER: timestamp += " December "; break;

}

timestamp += "" + currentTime.get(Calendar.YEAR);

if(existingArg)

citeTemplate = citeTemplate.substring(0, dateArgEnd) + timestamp + citeTemplate.substring(dateArgEnd);

else{

int bracketIndex = citeTemplate.indexOf("}}");

citeTemplate = citeTemplate.substring(0, bracketIndex) + "|accessdate=" + timestamp + "}}";

}

}

DISABLED per recommendations at BRFA

*/

// Now that everything is done, replace the template

correctedTemplates.add(citeTemplate);

if(!citeTemplate.equals(citeTemplateOriginal)){

pagecontent = pagecontent.replace(citeTemplateOriginal, citeTemplate);

}

}

}

startIndex = findCiteWebStart(pagecontent, startIndex + citeTemplate.length());

}

}

return pagecontent;

}

/**
 * Finds the first position in a text at which a regular expression matches.
 *
 * Uses the regex engine's own search. Testing every position with
 * regex + "(.|\n)*" misses matches whenever the rest of the text contains a carriage
 * return, is slow, and can exhaust the stack on long texts.
 * @param source the text to search
 * @param regex the regular expression to look for
 * @return the index at which the first match starts, or -1 if there is no match
 */
private int indexOfRegex(String source, String regex){
    if(source.length() == 0)
        return -1;
    java.util.regex.Matcher matcher = java.util.regex.Pattern.compile(regex).matcher(source);
    // A match that starts at the very end of the text (an empty match) does not count
    if(matcher.find() && matcher.start() < source.length())
        return matcher.start();
    return -1;
}

/*

public static void main(String args[]){

HersfoldCiteBot bot = new HersfoldCiteBot();

bot.login("this is not the bot's password");

String content = bot.correctCiteWebErrors("User:Hersfold/Hersfold's Sandbox");

bot.logout();

System.out.println("\n\n\n" + content);

}

*/

}

{{collapse bottom}}

HersfoldCiteBotGUI.java

{{collapse top}}

package citation;

import generic.JConsole;

import javax.swing.*;

import java.awt.*;

import java.awt.event.*;

/**
 * Small Swing front end for HersfoldCiteBot: a password prompt, a Run and a
 * Close button, and a console area that receives everything written to
 * System.out and System.err.
 */
public class HersfoldCiteBotGUI implements ActionListener {

    /** Action command (and caption) of the button that starts the bot. */
    public static final String RUN = "Run";

    /** Action command (and caption) of the button that shuts the program down. */
    public static final String CLOSE = "Close";

    /** Pause between polls while waiting for the login attempt to settle, in milliseconds. */
    private static final long LOGIN_POLL_MILLIS = 50L;

    private static JFrame gui = new JFrame("Hersfold Cite Bot version " + HersfoldCiteBot.versionID);

    private static JConsole logscreen = new JConsole(24, JConsole.DEFAULT_COLS);

    private static JPasswordField passwordField;

    private static JButton runButton;

    private static JButton closeButton;

    // Created lazily on the first Run request and reused for later attempts.
    private static HersfoldCiteBot bot = null;

    /**
     * Program entry point; schedules construction of the window on the Swing
     * event dispatch thread.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        javax.swing.SwingUtilities.invokeLater(new Runnable() {
            HersfoldCiteBotGUI botGUI = new HersfoldCiteBotGUI();

            public void run() {
                botGUI.buildAndRunGUI();
            }
        });
    }

    /**
     * Builds all widgets, redirects System.out and System.err into the
     * on-screen console, shows the window and prints the start-up prompt.
     */
    public void buildAndRunGUI(){
        gui.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);

        // Build the title of the program
        JLabel title = new JLabel("Hersfold Cite Bot, Automated Citation Error Correction System");

        JPanel passwordPanel = buildPasswordPanel();
        JPanel runCancelButtons = buildButtonPanel();

        // Build what will effectively be the console, where the runtime reports go
        JScrollPane logarea = new JScrollPane(logscreen.getTextArea());
        logarea.setVerticalScrollBarPolicy(JScrollPane.VERTICAL_SCROLLBAR_AS_NEEDED);

        // Build the credits line
        JLabel credits = new JLabel("(c) Brett Reynolds/User:Hersfold 2010");

        // Redirect system.out and .err
        System.setOut(logscreen.getStdPrintStream());
        System.setErr(logscreen.getErrPrintStream());

        // Smack it all together so it looks somewhat nice
        JPanel upperPanel = new JPanel(new GridLayout(2,1));
        JPanel lowerPanel = new JPanel(new GridLayout(3,1));
        JPanel textPanel = new JPanel(new GridLayout(2,1));
        JPanel overallPanel = new JPanel(new FlowLayout(FlowLayout.CENTER));

        upperPanel.add(title);

        lowerPanel.add(passwordPanel);
        lowerPanel.add(runCancelButtons);
        lowerPanel.add(credits);

        textPanel.add(upperPanel);
        textPanel.add(lowerPanel);

        overallPanel.add(textPanel);
        overallPanel.add(logarea);

        // Put it in the GUI
        gui.getContentPane().add(overallPanel);

        // And send it off to do its thing
        gui.pack();
        gui.setVisible(true);

        System.out.println("HersfoldCiteBot system ready to operate.");
        System.out.println("Please enter the bot's password to begin.\n");
    }

    /** Creates the labelled password field; pressing Enter in it acts like the Run button. */
    private JPanel buildPasswordPanel(){
        passwordField = new JPasswordField(20);
        passwordField.setActionCommand(RUN);
        passwordField.addActionListener(this);

        JLabel pwdLabel = new JLabel("Password: ");
        pwdLabel.setLabelFor(passwordField);

        JPanel passwordPanel = new JPanel(new FlowLayout(FlowLayout.TRAILING));
        passwordPanel.add(pwdLabel);
        passwordPanel.add(passwordField);
        return passwordPanel;
    }

    /** Creates the green Run and red Close buttons side by side. */
    private JPanel buildButtonPanel(){
        runButton = new JButton(RUN);
        runButton.setActionCommand(RUN);
        runButton.addActionListener(this);
        runButton.setBackground(new Color(128,255,128));

        closeButton = new JButton(CLOSE);
        closeButton.setActionCommand(CLOSE);
        closeButton.addActionListener(this);
        closeButton.setBackground(new Color(255,128,128));

        JPanel runCancelButtons = new JPanel(new GridLayout(1,2));
        runCancelButtons.add(runButton);
        runCancelButtons.add(closeButton);
        return runCancelButtons;
    }

    /**
     * Dispatches button and password-field events: the RUN command attempts a
     * login and starts the bot, anything else closes the program.
     *
     * @param event the Swing action event
     */
    public void actionPerformed(ActionEvent event){
        String command = event.getActionCommand();
        if(RUN.equals(command))
            handleRun();
        else
            handleClose();
    }

    /** Logs the bot in with the typed password and, on success, starts it on its own thread. */
    private void handleRun(){
        char[] password = passwordField.getPassword();

        if(bot == null)
            bot = new HersfoldCiteBot();

        System.out.println("Activating citation error correction system...");

        try{
            Thread.sleep(100);
        }catch(InterruptedException e){
            // Keep the interrupt visible to whoever owns this thread instead of dropping it.
            Thread.currentThread().interrupt();
        }

        if(bot.login(password)){
            Thread operatingBot = new Thread(bot);
            operatingBot.start();
        }
        // After first failed attempt
        else if (!bot.fatalErrorExists())
            System.err.println("Incorrect password. Please try again.");
        // After second failed attempt
        else{
            System.err.println("Incorrect password. Further access attempts denied.");
            runButton.setEnabled(false);
            passwordField.setEnabled(false);
        }

        awaitLoginOutcome();

        if(!bot.fatalErrorExists()){
            runButton.setEnabled(true);
            passwordField.setEnabled(true);
        }
    }

    /**
     * Blocks until the bot reports either a successful login or a fatal error.
     * Same exit condition as the former empty spin loop, but sleeps between
     * polls instead of burning a CPU core.
     *
     * NOTE(review): this still runs on the thread that delivered the action
     * event. If a first failed login leaves the bot neither logged in nor in
     * a fatal state, nothing visible here changes that state and the wait
     * would not end - confirm against HersfoldCiteBot.login.
     */
    private static void awaitLoginOutcome(){
        boolean interrupted = false;
        while(!bot.isLoggedIn() && !bot.fatalErrorExists()){
            try{
                Thread.sleep(LOGIN_POLL_MILLIS);
            }catch(InterruptedException e){
                // Remember the interrupt, keep waiting, and restore the flag afterwards.
                interrupted = true;
            }
        }
        if(interrupted)
            Thread.currentThread().interrupt();
    }

    /** Hides the window, logs the bot out if one exists and exits with the bot's exit code (0 if no bot). */
    private void handleClose(){
        gui.setVisible(false);
        gui.dispose();

        if(bot != null)
            bot.logout();

        System.exit(bot != null ? bot.getExitCode() : 0);
    }

}

{{collapse bottom}}

Wiki.java

This file is originally from User:MER-C/Wiki.java. This version may have some slight differences specifically for this bot or to fix minor errors.

{{collapse top}}

/**

* @(#)Wiki.java 0.22 18/02/2010

* Copyright (C) 2007 - 2010 MER-C

*

* This program is free software; you can redistribute it and/or

* modify it under the terms of the GNU General Public License

* as published by the Free Software Foundation; either version 3

* of the License, or (at your option) any later version. Additionally

* this file is subject to the "Classpath" exception.

*

* This program is distributed in the hope that it will be useful,

* but WITHOUT ANY WARRANTY; without even the implied warranty of

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

* GNU General Public License for more details.

*

* You should have received a copy of the GNU General Public License

* along with this program; if not, write to the Free Software Foundation,

* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

*/

package citation;

import java.awt.image.*;

import java.io.*;

import java.net.*;

import java.util.*;

import java.util.logging.*;

import java.util.zip.*;

import javax.imageio.*;

import javax.security.auth.login.*; // useful exception types

/**

* This is a somewhat sketchy bot framework for editing MediaWiki wikis.

* Requires JDK 1.5 (5.0) or greater. Uses the MediaWiki API for

* most operations. It is recommended that the server runs the latest version

* of MediaWiki (1.14), otherwise some functions may not work.

*

*

* A typical program would go something like this:

*

*

* Wiki wiki;

* File f = new File("wiki.dat");

* if (f.exists()) // we already have a copy on disk

* {

* ObjectInputStream in = new ObjectInputStream(new FileInputStream(f));

* wiki = (Wiki)in.readObject();

* }

* else

* {

* try

* {

* wiki = new Wiki("en.wikipedia.org"); // create a new wiki connection to en.wikipedia.org

* wiki.setThrottle(5000); // set the edit throttle to 0.2 Hz

* wiki.login("ExampleBot", password); // log in as user ExampleBot, with the specified password

* }

* catch (FailedLoginException ex)

* {

* // deal with failed login attempt

* }

* }

* String[] titles = . . . ; // fetch a list of titles

* try

* {

* for (int i = 0; i < titles.length; i++)

* {

* try

* {

* // do something with titles[i]

* }

* // this exception isn't fatal - probably won't affect the task as a whole

* catch (CredentialException ex)

* {

* // deal with protected page

* }

* }

* }

* // these exceptions are fatal - we need to abandon the task

* catch (CredentialNotFoundException ex)

* {

* // deal with trying to do something we can't

* }

* catch (AccountLockedException ex)

* {

* // deal with being blocked

* }

* catch (IOException ex)

* {

* // deal with network error

* }

*

*

* Don't forget to release system resources held by this object when done.

* This may be achieved by logging out of the wiki. Since logout() is

* entirely offline, we can have a persistent session by simply serializing

* this wiki, then logging out as follows:

*

*

* File f = new File("wiki.dat");

* ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(f));

* out.writeObject(wiki); // if we want the session to persist

* out.close();

* wiki.logout();

*

*

* Long term storage of data (particularly greater than 20 days) is not

* recommended as the cookies may expire on the server.

*

*

* <h4>Assertions</h4>

*

* Without too much effort, it is possible to emulate assertions supported

* by mw:Extension:Assert Edit. The extension need not be installed

* for these assertions to work. Use setAssertionMode(int mode)

* to set the assertion mode. Checking for login, bot flag or new messages is

* supported by default. Other assertions can easily be defined, see {@link

* http://java.sun.com/j2se/1.4.2/docs/guide/lang/assert.html Programming

* With Assertions}. Assertions are applied on write methods only and are

* disabled by default.

*

*

* IMPORTANT: You need to run the program with the flag -enableassertions

* or -ea to enable assertions, example: java -ea Mybot.

*

*

* Please file bug reports at User talk:MER-C/Wiki.java. Revision

* history is on the same page.

*

*

* @author MER-C

* @version 0.22

*/

public class Wiki implements Serializable

{

// NAMESPACES

/**

* Denotes the namespace of images and media, such that there is no

* description page. Uses the "Media:" prefix.

* @see #IMAGE_NAMESPACE

* @since 0.03

*/

public static final int MEDIA_NAMESPACE = -2;

/**

* Denotes the namespace of pages with the "Special:" prefix. Note

* that many methods dealing with special pages may spew due to

* raw content not being available.

* @since 0.03

*/

public static final int SPECIAL_NAMESPACE = -1;

/**

* Denotes the main namespace, with no prefix.

* @since 0.03

*/

public static final int MAIN_NAMESPACE = 0;

/**

* Denotes the namespace for talk pages relating to the main

* namespace, denoted by the prefix "Talk:".

* @since 0.03

*/

public static final int TALK_NAMESPACE = 1;

/**

* Denotes the namespace for user pages, given the prefix "User:".

* @since 0.03

*/

public static final int USER_NAMESPACE = 2;

/**

* Denotes the namespace for user talk pages, given the prefix

* "User talk:".

* @since 0.03

*/

public static final int USER_TALK_NAMESPACE = 3;

/**

* Denotes the namespace for pages relating to the project,

* with prefix "Project:". It also goes by the name of whatever

* the project name was.

* @since 0.03

*/

public static final int PROJECT_NAMESPACE = 4;

/**

* Denotes the namespace for talk pages relating to project

* pages, with prefix "Project talk:". It also goes by the name

* of whatever the project name was, + "talk:".

* @since 0.03

*/

public static final int PROJECT_TALK_NAMESPACE = 5;

/**

* Denotes the namespace for image/file description pages. Has the

* prefix "File:". Do not create these directly, use upload() instead.

* (This namespace used to have the prefix "Image:", hence the name.)

* @see #MEDIA_NAMESPACE

* @since 0.03

*/

public static final int IMAGE_NAMESPACE = 6;

/**

* Denotes talk pages for image description pages. Has the prefix

* "File talk:".

* @since 0.03

*/

public static final int IMAGE_TALK_NAMESPACE = 7;

/**

* Denotes the namespace for (wiki) system messages, given the prefix

* "MediaWiki:".

* @since 0.03

*/

public static final int MEDIAWIKI_NAMESPACE = 8;

/**

* Denotes the namespace for talk pages relating to system messages,

* given the prefix "MediaWiki talk:".

* @since 0.03

*/

public static final int MEDIAWIKI_TALK_NAMESPACE = 9;

/**

* Denotes the namespace for templates, given the prefix "Template:".

* @since 0.03

*/

public static final int TEMPLATE_NAMESPACE = 10;

/**

* Denotes the namespace for talk pages regarding templates, given

* the prefix "Template talk:".

* @since 0.03

*/

public static final int TEMPLATE_TALK_NAMESPACE = 11;

/**

* Denotes the namespace for help pages, given the prefix "Help:".

* @since 0.03

*/

public static final int HELP_NAMESPACE = 12;

/**

* Denotes the namespace for talk pages regarding help pages, given

* the prefix "Help talk:".

* @since 0.03

*/

public static final int HELP_TALK_NAMESPACE = 13;

/**

* Denotes the namespace for category description pages. Has the

* prefix "Category:".

* @since 0.03

*/

public static final int CATEGORY_NAMESPACE = 14;

/**

* Denotes the namespace for talk pages regarding categories. Has the

* prefix "Category talk:".

* @since 0.03

*/

public static final int CATEGORY_TALK_NAMESPACE = 15;

/**

* Denotes all namespaces.

* @since 0.03

*/

public static final int ALL_NAMESPACES = 0x09f91102;

// USER RIGHTS

/**

* Denotes no user rights.

* @see User#userRights()

* @since 0.05

*/

public static final int IP_USER = -1;

/**

* Denotes a registered account.

* @see User#userRights()

* @since 0.05

*/

public static final int REGISTERED_USER = 1;

/**

* Denotes a user who has admin rights.

* @see User#userRights()

* @since 0.05

*/

public static final int ADMIN = 2;

/**

* Denotes a user who has bureaucrat rights.

* @see User#userRights()

* @since 0.05

*/

public static final int BUREAUCRAT = 4;

/**

* Denotes a user who has steward rights.

* @see User#userRights()

* @since 0.05

*/

public static final int STEWARD = 8;

/**

* Denotes a user who has a bot flag.

* @see User#userRights()

* @since 0.05

*/

public static final int BOT = 16;

// LOG TYPES

/**

* Denotes all logs.

* @since 0.06

*/

public static final String ALL_LOGS = "";

/**

* Denotes the user creation log.

* @since 0.06

*/

public static final String USER_CREATION_LOG = "newusers";

/**

* Denotes the upload log.

* @since 0.06

*/

public static final String UPLOAD_LOG = "upload";

/**

* Denotes the deletion log.

* @since 0.06

*/

public static final String DELETION_LOG = "delete";

/**

* Denotes the move log.

* @since 0.06

*/

public static final String MOVE_LOG = "move";

/**

* Denotes the block log.

* @since 0.06

*/

public static final String BLOCK_LOG = "block";

/**

* Denotes the protection log.

* @since 0.06

*/

public static final String PROTECTION_LOG = "protect";

/**

* Denotes the user rights log.

* @since 0.06

*/

public static final String USER_RIGHTS_LOG = "rights";

/**

* Denotes the user renaming log.

* @since 0.06

*/

public static final String USER_RENAME_LOG = "renameuser";

/**

* Denotes the bot status log.

* @since 0.08

* @deprecated Special:Makebot is deprecated, use

* USER_RIGHTS_LOG instead.

*/

public static final String BOT_STATUS_LOG = "makebot";

/**

* Denotes the page importation log.

* @since 0.08

*/

public static final String IMPORT_LOG = "import";

/**

* Denotes the edit patrol log.

* @since 0.08

*/

public static final String PATROL_LOG = "patrol";

// PROTECTION LEVELS

/**

* Denotes a non-protected page.

* @since 0.09

*/

public static final int NO_PROTECTION = -1;

/**

* Denotes semi-protection (i.e. only autoconfirmed users can edit this page)

* [edit=autoconfirmed;move=autoconfirmed].

* @since 0.09

*/

public static final int SEMI_PROTECTION = 1;

/**

* Denotes full protection (i.e. only admins can edit this page)

* [edit=sysop;move=sysop].

* @see #ADMIN

* @see User#userRights()

* @since 0.09

*/

public static final int FULL_PROTECTION = 2;

/**

* Denotes move protection (i.e. only admins can move this page) [move=sysop].

* We don't define semi-move protection because only autoconfirmed users

* can move pages anyway.

*

* @see #ADMIN

* @see User#userRights()

* @since 0.09

*/

public static final int MOVE_PROTECTION = 3;

/**

* Denotes move and semi-protection (i.e. autoconfirmed editors can edit the

* page, but you need to be a sysop to move) [edit=autoconfirmed;move=sysop].

* Naturally, this value (4) is equal to SEMI_PROTECTION (1) +

* MOVE_PROTECTION (3).

*

* @see #ADMIN

* @see User#userRights()

* @since 0.09

*/

public static final int SEMI_AND_MOVE_PROTECTION = 4;

/**

* Denotes protected deleted pages [create=sysop].

* @since 0.12

* @see #ADMIN

*/

public static final int PROTECTED_DELETED_PAGE = 5;

/**

* Denotes protected images where the corresponding image description

* page can be edited.

* @since 0.21

*/

public static final int UPLOAD_PROTECTION = 6;

// ASSERTION MODES

/**

* Use no assertions (i.e. 0).

* @see #setAssertionMode

* @since 0.11

*/

public static final int ASSERT_NONE = 0;

/**

* Assert that we are logged in (i.e. 1).

* @see #setAssertionMode

* @since 0.11

*/

public static final int ASSERT_LOGGED_IN = 1;

/**

* Assert that we have a bot flag (i.e. 2).

* @see #setAssertionMode

* @since 0.11

*/

public static final int ASSERT_BOT = 2;

/**

* Assert that we have no new messages. Not defined in Assert Edit, but

* some bots have this.

* @see #setAssertionMode

* @since 0.11

*/

public static final int ASSERT_NO_MESSAGES = 4;

// RC OPTIONS

/**

* In queries against the recent changes table, this would mean we don't

* fetch anonymous edits.

* @since 0.20

*/

public static final int HIDE_ANON = 1;

/**

* In queries against the recent changes table, this would mean we don't

* fetch edits made by bots.

* @since 0.20

*/

public static final int HIDE_BOT = 2;

/**

* In queries against the recent changes table, this would mean we don't

* fetch edits made by the logged in user.

* @since 0.20

*/

public static final int HIDE_SELF = 4;

/**

* In queries against the recent changes table, this would mean we don't

* fetch minor edits.

* @since 0.20

*/

public static final int HIDE_MINOR = 8;

/**

* In queries against the recent changes table, this would mean we don't

* fetch patrolled edits.

* @since 0.20

*/

public static final int HIDE_PATROLLED = 16;

// REVISION OPTIONS

/**

* In Revision.diff(), denotes the next revision.

* @see Revision#diff

* @since 0.21

*/

public static final long NEXT_REVISION = -1L;

/**

* In Revision.diff(), denotes the current revision.

* @see Revision#diff

* @since 0.21

*/

public static final long CURRENT_REVISION = -2L;

/**

* In Revision.diff(), denotes the previous revision.

* @see Revision#diff

* @since 0.21

*/

public static final long PREVIOUS_REVISION = -3L;

// version of this framework; also used in the default user agent string
private static String version = "0.22";

// the domain of the wiki, plus the URL prefixes built from it in the
// constructors: "query" addresses api.php, "base" addresses index.php

private String domain, query, base;

private String scriptPath = "/w"; // need this for sites like partyvan.info

// user management
// (login() sends cookies2 with its request and stores the cookies it gets
// back in cookies; logout() empties both maps)

private HashMap cookies = new HashMap(12);

private HashMap cookies2 = new HashMap(10);

// the logged in user; null when logged out
private User user;

private int statuscounter = 0;

// various caches

private HashMap namespaces = null;

private ArrayList watchlist = null;

// preferences

// awkward workaround: login() raises this to 5000 for bot/admin accounts
// and logout() resets it to 500
private int max = 500; // awkward workaround

private static Logger logger = Logger.getLogger("wiki"); // only one required

private int throttle = 10000; // edit throttle in milliseconds, see setThrottle()

private int maxlag = 5; // maximum database lag in seconds, see setMaxLag()

private volatile long lastlagcheck;

private int assertion = 0; // assertion mode (bitmask of the ASSERT_* constants)

private int statusinterval = 100; // number of actions between status checks

private String useragent = "Wiki.java " + version;

// retry flag

private boolean retry = true;

// serial version

private static final long serialVersionUID = -8745212681497644126L;

// CONSTRUCTORS AND CONFIGURATION

/**

* Logs which version we're using.

* @since 0.12

*/

static

{

logger.logp(Level.CONFIG, "Wiki", "", "Using Wiki.java " + version);

}

/**

* Creates a new connection to the English Wikipedia.

* @since 0.02

*/

public Wiki()
{
    // spelled out here; the one-argument constructor falls back to the same
    // domain when handed an empty string
    this("en.wikipedia.org");
}

/**

* Creates a new connection to a wiki. WARNING: if the wiki uses a

* $wgScriptpath other than the default /w, you need to call

* getScriptPath() to automatically set it. Alternatively, you

* can use the constructor below if you know it in advance.

*

* @param domain the wiki domain name e.g. en.wikipedia.org (defaults to

* en.wikipedia.org)

*/

public Wiki(String domain)

{

if (domain == null || domain.equals(""))

domain = "en.wikipedia.org";

this.domain = domain;

// init variables

base = "http://" + domain + scriptPath + "/index.php?title=";

query = "http://" + domain + scriptPath + "/api.php?format=xml&";

}

/**

* Creates a new connection to a wiki with $wgScriptpath set to

* scriptPath.

*

* @param domain the wiki domain name

* @param scriptPath the script path

* @since 0.14

*/

public Wiki(String domain, String scriptPath)

{

this.domain = domain;

this.scriptPath = scriptPath;

// init variables

base = "http://" + domain + scriptPath + "/index.php?title=";

query = "http://" + domain + scriptPath + "/api.php?format=xml&";

}

/**

* Gets the domain of the wiki, as supplied on construction.

* @return the domain of the wiki

* @since 0.06

*/

public String getDomain()
{
    // plain accessor for the domain fixed at construction
    return this.domain;
}

/**

* Gets the editing throttle.

* @return the throttle value in milliseconds

* @see #setThrottle

* @since 0.09

*/

public int getThrottle()
{
    // value is in milliseconds
    return this.throttle;
}

/**

* Sets the editing throttle. Read requests are not throttled or restricted

* in any way. Default is 10s.

* @param throttle the new throttle value in milliseconds

* @see #getThrottle

* @since 0.09

*/

public void setThrottle(int throttle)
{
    this.throttle = throttle;

    // record the configuration change
    final String message = "Throttle set to " + throttle + " milliseconds";
    log(Level.CONFIG, message, "setThrottle");
}

/**

* Detects the $wgScriptpath wiki variable and sets the bot framework up

* to use it. You need not call this if you know the script path is

* /w. See also mw:Manual:$wgScriptpath.

*

* @throws IOException if a network error occurs

* @return the script path, if you have any use for it

* @since 0.14

*/

public String getScriptPath() throws IOException
{
    // let the wiki expand the magic word for us
    final String detected = parseAndCleanup("{{SCRIPTPATH}}");
    scriptPath = detected;

    // rebuild both URL prefixes around the detected path
    final String root = "http://" + domain + detected;
    base = root + "/index.php?title=";
    query = root + "/api.php?format=xml&";
    return detected;
}

/**

* Sets the user agent HTTP header to be used for requests. Default is

* "Wiki.java " + version.

* @param useragent the new user agent

* @since 0.22

*/

public void setUserAgent(String useragent)
{
    // stored as given
    this.useragent = useragent;
}

/**

* Gets the user agent HTTP header to be used for requests. Default is

* "Wiki.java " + version.

* @return the user agent

* @since 0.22

*/

public String getUserAgent()
{
    // plain accessor
    return this.useragent;
}

/**

* Determines whether this wiki is equal to another object.

* @param obj the object to compare

* @return whether this wiki is equal to such object

* @since 0.10

*/

public boolean equals(Object obj)
{
    // two Wiki objects are equal when they point at the same domain
    if (obj instanceof Wiki)
    {
        Wiki other = (Wiki)obj;
        return domain.equals(other.domain);
    }
    return false;
}

/**

* Returns a hash code of this object.

* @return a hash code

* @since 0.12

*/

public int hashCode()
{
    // equals() compares the domain only, so the hash must depend on the
    // domain only. Mixing in maxlag and throttle (as before) gave equal
    // objects different hash codes and changed the hash whenever either
    // setting was modified.
    return domain.hashCode();
}

/**

* Returns a string representation of this Wiki.

* @return a string representation of this Wiki.

* @since 0.10

*/

public String toString()
{
    try
    {
        // domain
        StringBuilder buffer = new StringBuilder("Wiki[domain=");
        buffer.append(domain);

        // user (with rights when logged in)
        buffer.append(",user=");
        if (user != null)
        {
            buffer.append(user.getUsername());
            buffer.append("[rights=");
            buffer.append(user.userRights());
            buffer.append("],");
        }
        else
            buffer.append("null,");

        // throttle mechanisms
        buffer.append("throttle=");
        buffer.append(throttle);
        buffer.append(",maxlag=");
        buffer.append(maxlag);
        buffer.append(",assertionMode=");
        buffer.append(assertion);
        buffer.append(",statusCheckInterval=");
        buffer.append(statusinterval);
        buffer.append(",cookies=");
        buffer.append(cookies);
        buffer.append(",cookies2=");
        buffer.append(cookies2);

        // close the bracket opened by "Wiki[" (previously left unbalanced)
        buffer.append("]");
        return buffer.toString();
    }
    catch (IOException ex)
    {
        // this shouldn't happen due to the user rights cache
        logger.logp(Level.SEVERE, "Wiki", "toString()", "Cannot retrieve user rights!", ex);
        return "";
    }
}

/**

* Gets the maxlag parameter. See mw:Manual:Maxlag parameter.

* @return the current maxlag, in seconds

* @see #setMaxLag

* @see #getCurrentDatabaseLag

* @since 0.11

*/

public int getMaxLag()
{
    // value is in seconds
    return this.maxlag;
}

/**

* Sets the maxlag parameter. A value of less than 1s disables this

* mechanism. Default is 5s.

* @param lag the desired maxlag in seconds

* @see #getMaxLag

* @see #getCurrentDatabaseLag

* @since 0.11

*/

public void setMaxLag(int lag)
{
    this.maxlag = lag;

    // record the configuration change
    final String message = "Setting maximum allowable database lag to " + lag;
    log(Level.CONFIG, message, "setMaxLag");
}

/**

* Gets the assertion mode. See mw:Extension:Assert Edit for what

* functionality this mimics. Assertion modes are bitmasks.

* @return the current assertion mode

* @see #setAssertionMode

* @since 0.11

*/

public int getAssertionMode()
{
    // bitmask of the ASSERT_* constants
    return this.assertion;
}

/**

* Sets the assertion mode. See mw:Extension:Assert Edit for what

* functionality this mimics. Assertion modes are bitmasks. Default is

* ASSERT_NONE.

* @param mode an assertion mode

* @see #getAssertionMode

* @since 0.11

*/

public void setAssertionMode(int mode)
{
    this.assertion = mode;

    // record the configuration change
    final String message = "Set assertion mode to " + mode;
    log(Level.CONFIG, message, "setAssertionMode");
}

/**

* Gets the number of actions (edit, move, block, delete, etc) between

* status checks. A status check is where we update user rights, block

* status and check for new messages (if the appropriate assertion mode

* is set).

*

* @return the number of edits between status checks

* @see #setStatusCheckInterval

* @since 0.18

*/

public int getStatusCheckInterval()
{
    // counted in actions, not in time
    return this.statusinterval;
}

/**

* Sets the number of actions (edit, move, block, delete, etc) between

* status checks. A status check is where we update user rights, block

* status and check for new messages (if the appropriate assertion mode

* is set). Default is 100.

*

* @param interval the number of edits between status checks

* @see #getStatusCheckInterval

* @since 0.18

*/

public void setStatusCheckInterval(int interval)
{
    this.statusinterval = interval;

    // record the configuration change
    final String message = "Status check interval set to " + interval;
    log(Level.CONFIG, message, "setStatusCheckInterval");
}

// META STUFF

/**

* Logs in to the wiki. This method is thread-safe. If the specified

* username or password is incorrect, the thread blocks for 20 seconds

* then throws an exception.

*

* @param username a username

* @param password a password (as a char[] due to JPasswordField)

* @throws FailedLoginException if the login failed due to incorrect

* username and/or password

* @throws IOException if a network error occurs

* @see #logout

*/

public synchronized void login(String username, char[] password) throws IOException, FailedLoginException
{
    // @revised 0.11 to remove screen scraping
    // @revised 0.23 readded screen scraping

    // Scrape a login token from Special:Userlogin. Login tokens should be
    // available through prop=info !
    String blah = fetch(base + "Special:Userlogin", "login", true);
    // 21 = length of "wpLoginToken" plus 9 further characters; presumably
    // the '" value="' that precedes the token in the form - TODO confirm
    int a = blah.indexOf("wpLoginToken") + 21;
    int b = blah.indexOf("\"", a);
    String wpLoginToken = blah.substring(a, b);

    // sanitize
    username = URLEncoder.encode(username, "UTF-8");

    // start
    String url = query + "action=login";
    URLConnection connection = new URL(url).openConnection();
    logurl(url, "login");
    setCookies(connection, cookies2);
    connection.setDoOutput(true);
    connection.connect();

    // send
    PrintWriter out = new PrintWriter(connection.getOutputStream());
    try
    {
        out.print("lgname=");
        out.print(username);
        out.print("&lgpassword=");
        // The password must be form-encoded like every other field;
        // otherwise characters such as '&', '=', '+' or '%' corrupt the
        // request body and a correct password is rejected.
        out.print(URLEncoder.encode(new String(password), "UTF-8"));
        out.print("&lgtoken=");
        out.print(URLEncoder.encode(wpLoginToken, "UTF-8"));
    }
    finally
    {
        out.close();
    }

    // get the cookies
    grabCookies(connection, cookies);

    // determine success (only the first line of the response is examined)
    BufferedReader in = new BufferedReader(new InputStreamReader(new GZIPInputStream(connection.getInputStream()), "UTF-8"));
    String line;
    try
    {
        line = in.readLine();
    }
    finally
    {
        in.close();
    }
    if (line == null)
        line = ""; // empty response: handled below as a login failure for an unknown reason
    boolean success = line.contains("result=\"Success\"");

    if (success)
    {
        user = new User(username);
        boolean apihighlimit = (user.userRights() & BOT) == BOT || (user.userRights() & ADMIN) == ADMIN;
        max = apihighlimit ? 5000 : 500;
        log(Level.INFO, "Successfully logged in as " + username + ", highLimit = " + apihighlimit, "login");
    }
    else
    {
        log(Level.WARNING, "Failed to log in as " + username, "login");
        try
        {
            Thread.sleep(20000); // to prevent brute force
        }
        catch (InterruptedException e)
        {
            // the delay is cut short, but keep the interrupt visible to the caller
            Thread.currentThread().interrupt();
        }

        // test for some common failure reasons
        if (line.contains("WrongPass") || line.contains("WrongPluginPass"))
            throw new FailedLoginException("Login failed: incorrect password.");
        else if (line.contains("NotExists"))
            throw new FailedLoginException("Login failed: user does not exist.");
        throw new FailedLoginException("Login failed: unknown reason.");
    }
}

/**

* Logs out of the wiki. This method is thread safe (so that we don't log

* out during an edit). All operations are conducted offline, so you can

* serialize this Wiki first.

* @see #login

* @see #logoutServerSide

*/

public synchronized void logout()
{
    // forget who we were and drop back to the default query limit
    user = null;
    max = 500;

    // throw away every cookie we hold
    cookies2.clear();
    cookies.clear();

    log(Level.INFO, "Logged out", "logout");
}

/**

* Logs out of the wiki and destroys the session on the server. You will

* need to log in again instead of just reading in a serialized wiki.

* Equivalent to Special:Userlogout. This method is thread safe

* (so that we don't log out during an edit). WARNING: kills all

* concurrent sessions as well - if you are logged in with a browser this

* will log you out there as well.

*

* @throws IOException if a network error occurs

* @since 0.14

* @see #login

* @see #logout

*/

public synchronized void logoutServerSide() throws IOException
{
    // ask the server to end the session first ...
    final String url = query + "action=logout";
    fetch(url, "logoutServerSide", false);

    // ... then destroy local cookies
    logout();
}

/**

* Determines whether the current user has new messages. (A human would

* notice a yellow bar at the top of the page).

* @return whether the user has new messages

* @throws IOException if a network error occurs

* @since 0.11

*/

public boolean hasNewMessages() throws IOException
{
    // the check is a plain substring search for messages="" in the userinfo response
    String response = fetch(query + "action=query&meta=userinfo&uiprop=hasmsg", "hasNewMessages", false);
    return response.indexOf("messages=\"\"") != -1;
}

/**

* Determines the current database replication lag.

* @return the current database replication lag

* @throws IOException if a network error occurs

* @see #setMaxLag

* @see #getMaxLag

* @since 0.10

*/

public int getCurrentDatabaseLag() throws IOException
{
    String response = fetch(query + "action=query&meta=siteinfo&siprop=dbrepllag", "getCurrentDatabaseLag", false);

    // the number sits between lag=" and the closing " />
    int from = response.indexOf("lag=\"") + 5;
    int to = response.indexOf("\" />", from);
    String lag = response.substring(from, to);

    log(Level.INFO, "Current database replication lag is " + lag + " seconds", "getCurrentDatabaseLag");
    return Integer.parseInt(lag);
}

/**

* Fetches some site statistics, namely the number of articles, pages,

* files, edits, users and admins. Equivalent to Special:Statistics.

*

* @return a map containing the stats. Use "articles", "pages", "files"

* "edits", "users" or "admins" to retrieve the respective value

* @throws IOException if a network error occurs

* @since 0.14

*/

public HashMap getSiteStatistics() throws IOException
{
    // ZOMG hack to avoid excessive substring code: have the parser expand
    // all six magic words in one go, then split the numbers apart
    String text = parseAndCleanup("{{NUMBEROFARTICLES:R}} {{NUMBEROFPAGES:R}} {{NUMBEROFFILES:R}} {{NUMBEROFEDITS:R}} " +
        "{{NUMBEROFUSERS:R}} {{NUMBEROFADMINS:R}}");

    // Trim and split on runs of whitespace so that leading, trailing or
    // repeated separators do not produce empty tokens (which used to end
    // in a NumberFormatException).
    String[] values = text.trim().split("\\s+");
    String[] keys =
    {
        "articles", "pages", "files", "edits", "users", "admins"
    };

    // values are matched to keys by position; never read past either array
    HashMap ret = new HashMap();
    int count = Math.min(keys.length, values.length);
    for (int i = 0; i < count; i++)
        ret.put(keys[i], Integer.valueOf(values[i]));
    return ret;
}

/**
 * Reports which version of MediaWiki this wiki is running, for example
 * 1.13 alpha (r31567). The r number corresponds to a revision in
 * MediaWiki subversion (http://svn.wikimedia.org/viewvc/mediawiki/).
 *
 * @return the version of MediaWiki used
 * @throws IOException if a network error occurs
 * @since 0.14
 */
public String version() throws IOException
{
    // The version is obtained by rendering this magic word.
    final String magicWord = "{{CURRENTVERSION}}";
    String rendered = parseAndCleanup(magicWord);
    return rendered;
}

/**

* Renders the specified wiki markup by passing it to the MediaWiki

* parser through the API. (Note: this isn't implemented locally because

* I can't be stuffed porting Parser.php). One use of this method is to

* emulate the previewing functionality of the MediaWiki software.

*

* @param markup the markup to parse

* @return the parsed markup as HTML

* @throws IOException if a network error occurs

* @since 0.13

*/

public String parse(String markup) throws IOException

{

// This is POST because markup can be arbitrarily large, as in the size

// of an article (over 10kb).

String url = query + "action=parse";

URLConnection connection = new URL(url).openConnection();

logurl(url, "parse");

setCookies(connection, cookies);

connection.setDoOutput(true);

connection.connect();

// send

PrintWriter out = new PrintWriter(connection.getOutputStream());

out.print("prop=text&text=");

out.print(URLEncoder.encode(markup, "UTF-8"));

out.close();

// parse

BufferedReader in = new BufferedReader(new InputStreamReader(new GZIPInputStream(connection.getInputStream()), "UTF-8"));

String line;

StringBuilder text = new StringBuilder(100000);

while ((line = in.readLine()) != null)

{

int y = line.indexOf(">", line.indexOf("

int z = line.indexOf("");

if (y != -1)

{

text.append(line.substring(y));

text.append("\n");

}

else if (z != -1)

{

text.append(line.substring(0, z));

text.append("\n");

break; // done

}

else

{

text.append(line);

text.append("\n");

}

}

return decode(text.toString());

}

/**

* Same as parse(), but also strips out unwanted crap. This might

* be useful to subclasses.

*

* @param in the string to parse

* @return that string without the crap

* @throws IOException if a network error occurs

* @since 0.14

*/

protected String parseAndCleanup(String in) throws IOException

{

String output = parse(in);

output = output.replace("

", "").replace("

", ""); // remove paragraph tags

output = output.replace("\n", ""); // remove new lines

// strip out the parser report, which comes at the end

int a = output.indexOf("