User:GreenC/software/search wikipedia

=Method to accurately search Wikipedia=

Find all articles which contain the string "sportsillustrated.cnn.com" AND a {{tlx|dead}} template AND < whatever > .. solving for complicated Wikipedia searches is trivial by downloading the Wikipedia database ([https://dumps.wikimedia.org/backup-index.html dumps.wikimedia.org]) and search using whatever tool you prefer. Here are two plug and play solutions.

=Awk=

Awk is probably the simplest language available though with a speed trade-off for lack of a real XML parser. The script uses GNU Awk (gawk) extensions (gensub and the three-argument match), so it requires gawk rather than plain POSIX awk — but gawk is the default awk on most Linux systems, so no additional software is usually required.

::To run: awk -f search-wp.awk > out

::

  1. !/bin/awk -f
  2. Search entire Wikipedia database.
  3. Download: https://en.wikipedia.org/wiki/Wikipedia:Database_download#English-language_Wikipedia

BEGIN {

MySearch = "archive.org/w?e?b?/?[0-9]{1,14}/"

WPdump = "/f/t/wikipedia-dump/enwiki-20150515-pages-articles.xml"

RS=("")

while ((getline rawstr < WPdump ) > 0) {

# Skip blank content

if(! gensub(/^:space:+|:space:+$/, "", "g", rawstr))

continue

# Convert XML formating

gsub(/&lt;/,"<",rawstr);gsub(/&gt;/,">",rawstr);gsub(/&quot;/,"\"",rawstr);gsub(/&amp;/,"\\&",rawstr)

# Get article title

if ( match(rawstr, ".+", a) ) {

split(a[0], b, "(|)")

title = b[2]

}

# Get article body

if ( match(rawstr, ".+", a) ) {

split(a[0], b, "(|)")

body = b[2]

}

  1. ---------- Search -----

if ( match(body, MySearch, matched_text) ) {

print title

# print matched_text[0] # uncomment to print

continue

}

}

close(r)

}

Note: when redirecting large output, send to a different disk (ramdisk or other physical volume) otherwise it could slow reading the XML file.

=Nim=

For a faster solution here is a Nim example. Nim compiles to optimized C code, which then compiles using gcc to an executable binary. In a test between Awk and Nim, it took Awk 3m31s to complete a search, while the same search in Nim took 0m43s. The code below is pretty much copy-paste, compile and run — just add your Perl-compatible regex (PCRE), or plain text. Example regex strings:

::mySearchRe = re"djvu[.]txt"

::mySearchRe = re"http[:][^ ]*[^ ]"

::(the regex string is wrapped by re"" )

Then [https://nim-lang.org/install_unix.html download Nim] compiler (choosenim method is easiest), and compile the source with nim c -d:release --opt:speed -d:danger --passC:"-flto" --passL:"-flto" search.nim.

::

  # Search wikipedia dump for a string and print the article title (or matched text) if located
  # Credit: Copyright User:Green_Cardamom, April 2016, MIT License
  # Language: Nim
  # Additional code credits: Rob Speer (https://github.com/rspeer/wiki2text)

import re, options, strutils, os, streams, parsexml

var # configuration variables — edit these before compiling
  # Perl-compatible regex to search for (wrapped in re"")
  mySearchRe = re"djvu[.]txt"
  # Path to the uncompressed pages-articles XML dump
  wpDump = "/mnt/WindowsFdriveTdir/wikipedia-dump/enwiki-20150901-pages-articles.xml"
  maxCount = 0 # Stop searching after X countArticle for speed testing. Set to 0 to find all.

var # running totals, updated while the dump is scanned
  countAllArticle = 0 # All article count
  countArticle = 0    # Article titles containing a match (any number of matches)
  countHits = 0       # Number of matches of search pattern (running total)

type
  # The XML elements captured for each <page> in the dump
  TagType = enum
    TITLE, TEXT, REDIRECT, NS
  # One article's content, indexed by tag, e.g. article[TITLE]
  ArticleData = array[TagType, string]

  # Search text

# Search one article's text for mySearchRe; when it matches, print the
# article title and update the global counters. Returns true when at
# least one match was found (callers may discard the result).
proc searchText(article: ArticleData): bool {.discardable.} =
  var
    artcount = 0  # matches found within this one article
    pos = -1      # position of the previous match; -1 before the first search
  inc countAllArticle
  # Count every occurrence of the pattern, resuming each search one
  # character past the previous match position.
  while pos < article[TEXT].len:
    pos = find(article[TEXT], mySearchRe, pos + 1)
    if pos == -1: break
    inc artcount
  if artcount > 0:
    inc countArticle          # number of article titles matching
    countHits += artcount     # number of matches of search pattern
    echo article[TITLE]
    result = true
  # Optional early stop for speed testing: report totals and exit.
  if maxCount > 0:
    if countAllArticle >= maxCount:
      echo ""
      echo "Articles all: ", countAllArticle
      echo "Articles with a match: ", countArticle
      echo "Number of pattern matches: ", countHits
      quit()

var
  RELEVANT_XML_TAGS = ["title", "text", "ns"]
  textBuffer = ""                        # accumulates text/attribute content
  s = newFileStream(wpDump, fmRead)
  gettingText = false                    # true while inside a title/text/ns element
  gettingAttribute = false               # true while inside a <redirect> tag
  article: ArticleData
  xml: XmlParser

if s == nil: quit("cannot open the file " & wpDump)

for tag in TITLE..NS: article[tag] = ""

xml.open(s, wpDump, options={reportWhitespace})

while true:
  # Scan through the XML, handling each token as it arrives.
  xml.next()
  case xml.kind
  of xmlElementStart, xmlElementOpen:
    if RELEVANT_XML_TAGS.contains(xml.elementName):
      # If this is a "title", "text", or "ns" tag, prepare to get its
      # text content. Move our writing pointer to the beginning of
      # the text buffer, so we can overwrite what was there.
      textBuffer.setLen(0)
      gettingText = true
    elif xml.elementName == "page":
      # If this is a new instance of the tag that contains all
      # these tags, then reset the value that won't necessarily be
      # overridden, which is the redirect value.
      article[REDIRECT].setLen(0)
    elif xml.elementName == "redirect":
      # If this is the start of a redirect tag, prepare to get its
      # attribute value.
      gettingAttribute = true
  of xmlAttribute:
    # If we're looking for an attribute value, and we found one, add it
    # to the buffer.
    if gettingAttribute:
      textBuffer.add(xml.attrValue)
  of xmlCharData, xmlWhitespace:
    # If we're looking for text, and we found it, add it to the buffer.
    if gettingText:
      textBuffer.add(xml.charData)
  of xmlElementEnd:
    # When we reach the end of an element we care about, take the text
    # we've found and store it in the 'article' data structure. We can
    # accomplish this quickly by simply swapping their references.
    case xml.elementName
    of "title":
      swap article[TITLE], textBuffer
    of "text":
      swap article[TEXT], textBuffer
    of "redirect":
      swap article[REDIRECT], textBuffer
    of "ns":
      swap article[NS], textBuffer
    of "page":
      # When we reach the end of the tag, send the article
      # data to searchText().
      searchText(article)
    else:
      discard
    # Now that we've reached the end of an element, stop extracting
    # text. (We'll never need to extract text from elements that can
    # have other XML elements nested inside them.)
    gettingText = false
    gettingAttribute = false
  of xmlEof:
    break
  else:
    discard

xml.close

echo "Search Wikipedia completed"
echo "----"
echo "Articles all: ", countAllArticle
echo "Articles with a match: ", countArticle
echo "Number of pattern matches: ", countHits

Note: when redirecting large output, send to a different disk (ramdisk or other physical volume) otherwise it could slow reading the XML file.