<?php
// User:Nn123645/christian list parse.php

// PHP settings

// Remove the execution time limit for this run.
set_time_limit( 0 );

// Set memory_limit to 500 MB of RAM so we don't run out.
ini_set( 'memory_limit', '500M' );

// Parsed category names. Presumably this file defines $categories, which the
// main loop iterates as century => list of category names -- TODO confirm.
require 'cats.txt';

// Defines the MySQL connection variables ($mysql_server, $mysql_user,
// $mysql_pass, $mysql_dbname).
require 'privateconfig.php';

// New instance of MySQLi
$mysql = new mysqli( $mysql_server, $mysql_user, $mysql_pass, $mysql_dbname );

// The connection settings are not needed once the connection object exists.
unset( $mysql_server, $mysql_user, $mysql_pass, $mysql_dbname );

if ( $mysql->connect_error ) {
    // connect_error is a property of mysqli, not a method; calling it as a
    // method would raise a fatal error instead of printing the real cause.
    die( 'Connect Error (' . $mysql->connect_errno . ') ' . $mysql->connect_error );
}

// Load the whole XML dump; the main loop walks its <page> elements.
$dump = simplexml_load_file( 'Wikipedia-20090330211613.xml' ) or die( 'problem' );

/**
 * Looks up the page_id stored in the `pages` table for a page title.
 *
 * Dies with 'problem, sql' when the query itself fails.
 *
 * @param mysqli $mysql Open database connection.
 * @param string $title Page title to search for in pages.page_name.
 * @return string|null The page_id, or null when no row has that title.
 */
function lookup_page_id( mysqli $mysql, $title ) {
    $result = $mysql->query( 'SELECT page_id FROM pages WHERE page_name = \'' . $mysql->real_escape_string( $title ) . '\' LIMIT 1;' ) or die( 'problem, sql' );

    $row = $result->fetch_row();

    $result->close();

    return $row ? $row[0] : null;
}

/**
 * Inserts a (list_match, page_id) row into `matches` unless one already exists.
 *
 * Dies with the MySQL error number and message when either query fails.
 *
 * @param mysqli     $mysql      Open database connection.
 * @param string     $page_id    page_id of the article.
 * @param int|string $list_match List identifier to record for the article.
 * @param string     $notice     Text echoed after a successful insert.
 * @return void
 */
function add_match_if_missing( mysqli $mysql, $page_id, $list_match, $notice ) {
    $page_id_sql    = $mysql->real_escape_string( (string) $page_id );
    $list_match_sql = $mysql->real_escape_string( (string) $list_match );

    $result = $mysql->query( "SELECT match_id FROM matches WHERE page_id = '$page_id_sql' AND list_match = '$list_match_sql' LIMIT 1;" ) or die( '(' . $mysql->errno . ') ' . $mysql->error );

    $already_recorded = $result->num_rows > 0;

    // Free the result set before issuing the next query.
    $result->close();

    if ( $already_recorded ) {
        return;
    }

    $mysql->query( "INSERT INTO matches (list_match, page_id) VALUES ('$list_match_sql', '$page_id_sql' );" ) or die( '(' . $mysql->errno . ') ' . $mysql->error );

    echo $notice;
}

// Walk every page of the dump, collect the lists it belongs to (from the
// Birthdeath/Lifetime template and from its categories), then record them.
foreach ( $dump->page as $page ) {
    $title = (string) $page->title;

    $revision = $page->revision->text;
    $revision = (string) $revision[0];

    // Each entry is array( list_match, notice echoed when a row is added ).
    $pending = array();

    $has_template = preg_match_all( '%\{\{(?i:Birthdeath|Lifetime)\|([\d]{1,3})(?:[^\}]*)\}\}%', $revision, $template_matches );

    if ( $has_template ) {
        // NOTE(review): the pattern captures at most the first three digits of
        // the first template parameter, so a four-digit value such as 1950 is
        // captured as 195 and falls outside the 2..19 range below. Confirm
        // whether that is the intended behaviour before changing it.
        foreach ( $template_matches[1] as $raw_param ) {
            $param = (int) $raw_param;

            if ( $param >= 2 && $param <= 19 ) {
                // Values 2..19 are stored one higher than the captured number.
                $pending[] = array( $param + 1, "added query1\n" );
            } elseif ( $param == 200 ) {
                // A captured 200 is stored as list 21. The existence check now
                // uses 21 as well, so the row is no longer re-inserted each time.
                $pending[] = array( 21, "added query2\n" );
            }
        }
    }

    if ( preg_match_all( '%\[\[Category:([^\]]+)\]\]%', $revision, $category_matches ) ) {
        // Keep the distinct progress messages: query3 when the page also had a
        // matching template, query4 when it only had categories.
        $notice = $has_template ? "added query3\n" : "added query4\n";

        foreach ( $category_matches[1] as $match ) {
            foreach ( $categories as $century => $category_ar ) {
                foreach ( $category_ar as $category_name ) {
                    if ( strcasecmp( $match, $category_name ) == 0 ) {
                        // We have a match
                        $pending[] = array( $century, $notice );
                    }
                }
            }
        }
    }

    if ( count( $pending ) === 0 ) {
        continue;
    }

    // One lookup per page instead of one per match.
    $page_id = lookup_page_id( $mysql, $title );

    if ( $page_id === null ) {
        // The page is not in the pages table; do not record matches against
        // an empty page_id.
        continue;
    }

    foreach ( $pending as $entry ) {
        add_match_if_missing( $mysql, $page_id, $entry[0], $entry[1] );
    }
}

// Testing Section

/*

$revision = 'Hi my name is foo

I like to say foobar

foo is good

Category:Foobar

Category:Current national leaders

Category:14th-century Christian clergy';*/

// This portion was used for parsing the lists into something we can use

//$page = file_get_contents( 'input.txt' ) or die ('problem');

//$page = file_get_contents( 'criterion.txt' ) or die ('problem');

/* This was to parse the categories on the page

$page = explode( '===', $page );

$fp = fopen ( 'cats.txt', 'w' );

fwrite( $fp, "<?php\n\n\$categories = array(\n" );

$i = 22;

foreach ( $page as $page ) {

$matches = array();

if( preg_match_all( '%\[\[:Category:([^\]]+)\]\]%', $page, $matches) ) {

if( $i == 22 ) {

fwrite( $fp, "\t'LP' => array(\n" );

--$i;

} else {

fwrite( $fp, "\t'$i' => array(\n" );

--$i;

}

foreach ( $matches[1] as $match ) {

$match = addslashes($match);

fwrite($fp, "\t\t'$match',\n");

}

fwrite( $fp, "\t),\n");

}

}

fwrite( $fp, ");\n\n?>");

fclose( $fp );

*/

/* This was to insert the list of pages into the database

foreach ( $match as $match ) {

//$mysql->query('INSERT INTO pages ( page_name ) VALUES ( "' . $mysql->real_escape_string( $match ) . '");');

}

*/

?>