// Source: https://en.wikipedia.org/wiki/User:MER-C/Spamsearch.java

/**

* @(#)Spamsearch.java 0.02 23/10/2007

* Copyright (C) 2007 MER-C

*

* This program is free software; you can redistribute it and/or

* modify it under the terms of the GNU General Public License

* as published by the Free Software Foundation; either version 3

* of the License, or (at your option) any later version.

*

* This program is distributed in the hope that it will be useful,

* but WITHOUT ANY WARRANTY; without even the implied warranty of

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

* GNU General Public License for more details.

*

* You should have received a copy of the GNU General Public License

* along with this program; if not, write to the Free Software

* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

*/

import java.io.*;

import java.net.*;

import java.util.*;

import java.util.concurrent.*;

import java.util.logging.*;

import javax.swing.*;

import java.util.regex.*;

/**

* Searches all Wikimedia wikis for spam. Usage: java Spamsearch

* example.com example.net ..., where example.com and example.net are the

* sites spammed. Outputs the results to a text file in the current directory

* (i.e. results.txt)

*

* Requires Wiki.java 0.11.

*

* KNOWN ISSUES: multi-site search does not work for some reason.

*

* @author MER-C

* @version 0.02

*/

public class Spamsearch
{
    // All public Wikimedia wikis, harvested from the sitematrix API.
    // NOTE(review): the wiki-hosted original lost its generic type parameter
    // to HTML stripping (raw ArrayList cannot compile against L289's
    // unchecked get); List<Wiki> restores it.
    private List<Wiki> wikis = new ArrayList<Wiki>(1333);
    private PrintWriter out; // output file; also the lock guarding hits
    private ProgressMonitor monitor; // progress monitor (touched on the EDT)
    private int progress = 0; // mutated only on the EDT via invokeLater
    private int hits = 0; // cumulative number of links found, guarded by out
    private CountDownLatch latch; // outstanding searches for the current site

    /**
     * Program entry point.
     * @param args the sites to search for, e.g. example.com example.net;
     * if empty, the user is prompted with a dialog
     * @throws IOException if a filesystem error occurs
     */
    public static void main(String[] args) throws IOException
    {
        new Spamsearch(args);
    }

    /**
     * Fetches the list of wikis, then searches every wiki for external
     * links to each of the given sites, writing results to results.txt.
     * Exits the JVM when done (status 0) or on error (status 2).
     * @param args the sites to search for; if empty, the user is prompted
     */
    private Spamsearch(String[] args)
    {
        // check if command line arguments were specified
        if (args.length == 0)
        {
            String sites = JOptionPane.showInputDialog(null, "Enter sites to search");
            if (sites == null) // user cancelled the dialog; don't NPE below
                System.exit(1);
            args = sites.split("\\s");
        }
        try
        {
            // various initialisation
            out = new PrintWriter(new FileWriter("results.txt"));
            out.println("Starting spamsearch at " + new Date() + ".");
            // suppress log records below INFO
            Logger.getLogger("wiki").setLevel(Level.INFO);
            fetchSiteMatrix();
            // now do the searching
            for (String site : args)
                search(site);
        }
        catch (Exception ex)
        {
            if (!(ex instanceof InterruptedException))
            {
                ex.printStackTrace();
                System.exit(2);
            }
        }
        synchronized (out)
        {
            out.close();
        }
        System.exit(0);
    }

    /**
     * Fetches the Wikimedia site matrix and populates {@link #wikis} with
     * every wiki whose API is usable (private wikis are skipped).
     * @throws IOException if a network error occurs
     */
    private void fetchSiteMatrix() throws IOException
    {
        Logger.getLogger("wiki").info("Fetching site matrix.");
        InputStream in = new URL("http://en.wikipedia.org/w/api.php?action=sitematrix&format=xml").openStream();
        BufferedReader reader = new BufferedReader(new InputStreamReader(in));
        try
        {
            String line = reader.readLine();
            // private wikis have API disabled and are NOT GOOD.
            // Current private wikis are anything containing "com." or ".en." and
            // (board|chair|exec|grants|internal|office|otrs-wiki|tlh|wikimaniateam).wikimedia.org.
            Pattern p = Pattern.compile("(com\\.|\\.en\\.|board|chair|exec|grants|internal|office|otrs|tlh|wikimaniateam)");
            // parse the list
            while (line.contains("url=\""))
            {
                // skip the 12 characters of url="http:// to land on the domain
                int a = line.indexOf("url=\"") + 12;
                // NOTE(review): the -1 trims the character before the closing
                // quote — presumably a trailing slash; verify against the
                // current sitematrix output
                int b = line.indexOf("\"", a) - 1;
                String domain = line.substring(a, b);
                if (p.matcher(domain).find()) // private wiki, skip it
                {
                    line = line.substring(b);
                    continue;
                }
                wikis.add(new Wiki(domain));
                line = line.substring(b);
            }
        }
        finally
        {
            reader.close(); // the original leaked this stream
        }
    }

    /**
     * Searches every wiki for links to the given site, writing results to
     * the output file. Blocks until all background searches complete.
     * @param site the site to search for, e.g. example.com
     * @throws IOException if the site cannot be resolved
     * @throws InterruptedException if this thread is interrupted
     */
    private void search(String site) throws IOException, InterruptedException
    {
        // fresh progress monitor for this site
        monitor = new ProgressMonitor(new JFrame(), "Searching for spamlink ", site, 0, wikis.size());
        // resolve the website
        InetAddress[] addresses = InetAddress.getAllByName(site);
        for (InetAddress address : addresses)
            out.println(address);
        out.println("Searching " + wikis.size() + " wikis.\n");
        // search for links, one background thread per wiki
        latch = new CountDownLatch(wikis.size());
        for (int j = 0; j < wikis.size(); j++)
        {
            newThread("*." + site, j);
            if (j % 16 == 15) // throttle so we don't hammer the servers
                Thread.sleep(8500);
        }
        // Wait for ALL searches to finish. The original waited for a notify
        // from the thread handling the last wiki index, which could fire
        // before the main thread reached wait() (lost notify => deadlock)
        // and could fire while other searches were still running.
        latch.await();
        synchronized (out)
        {
            // NOTE(review): hits accumulates across sites, matching the
            // original behaviour; a per-site count would reset it here
            out.println("" + hits + " links found.\n");
            out.flush();
        }
        // recycle monitor
        monitor.close();
        monitor = null;
        progress = 0;
    }

    /**
     * Searches one wiki on a background thread and appends any results to
     * the output file. Speed optimisation (runtime approx 4200s when done
     * serially) because the internet is the major limitation.
     * @param domain the spamlink to search for, e.g. *.example.com
     * @param i the index into {@link #wikis} of the wiki to search
     */
    private void newThread(final String domain, final int i)
    {
        new Thread()
        {
            public void run()
            {
                Wiki wiki = wikis.get(i);
                wiki.setMaxLag(-1); // disable maxlag for performance
                try
                {
                    // do spamsearch
                    ArrayList[] links = wiki.spamsearch(domain);
                    synchronized (out) // so the output file doesn't get messed up
                    {
                        // hits is shared between worker threads; the original
                        // incremented it outside the lock (a data race)
                        hits += links[0].size();
                        // don't print anything if there are no results
                        if (!links[0].isEmpty())
                        {
                            out.println("Results for " + wiki.getDomain() + "...");
                            for (int k = 0; k < links[0].size(); k++)
                                out.println("Page: " + links[0].get(k) + " URL: " + links[1].get(k));
                            out.println();
                        }
                    }
                }
                catch (IOException ex)
                {
                    System.err.println(ex);
                    out.flush();
                    System.exit(2);
                }
                finally
                {
                    latch.countDown(); // always count down, or search() hangs
                }
                // update the progress monitor on the event dispatch thread
                SwingUtilities.invokeLater(new Runnable()
                {
                    public void run()
                    {
                        if (monitor != null)
                            monitor.setProgress(++progress);
                    }
                });
            }
        }.start();
    }
}