User:GreenC/software/search wikipedia
=Method to accurately search Wikipedia=
Find all articles which contain the string "sportsillustrated.cnn.com" AND a {{tlx|dead}} template AND whatever else you need. Solving complicated Wikipedia searches like this is trivial: download the Wikipedia database ([https://dumps.wikimedia.org/backup-index.html dumps.wikimedia.org]) and search it using whatever tool you prefer. Here are two plug-and-play solutions.
=Awk=
Awk is probably the simplest language available, though with a speed trade-off due to the lack of a real XML parser. Nevertheless, no additional software is required (awk is a POSIX tool).
::To run: awk -f search-wp.awk > out
::
#!/bin/awk -f
# Search entire Wikipedia database.
# Download: https://en.wikipedia.org/wiki/Wikipedia:Database_download#English-language_Wikipedia
#
# NOTE(review): this listing appears garbled by wiki/HTML extraction.
# Several literals below are visibly truncated — the RS assignment, the
# bracket expressions in the gensub() pattern (presumably [[:space:]]),
# the entity patterns in the gsub() calls (presumably &lt; &gt; &quot; &amp;),
# and the match()/split() patterns for <title> and <text> — and the final
# close(r) references an undefined variable (presumably WPdump). Recover
# the original source from the wiki page history before running.
BEGIN {
  # Perl-style regex searched for in each article body.
  MySearch = "archive.org/w?e?b?/?[0-9]{1,14}/"
  # Path to the uncompressed pages-articles XML dump.
  WPdump = "/f/t/wikipedia-dump/enwiki-20150515-pages-articles.xml"
  # Record separator — truncated by extraction; presumably RS=("</page>")
  # so each getline returns one <page> record. TODO confirm.
  RS=("
  # Read the dump one record (article) at a time.
  while ((getline rawstr < WPdump ) > 0) {
    # Skip blank content
    if(! gensub(/^:space:+|:space:+$/, "", "g", rawstr))
      continue
    # Convert XML formatting (entity decoding; patterns truncated — see note above)
    gsub(/</,"<",rawstr);gsub(/>/,">",rawstr);gsub(/"/,"\"",rawstr);gsub(/&/,"\\&",rawstr)
    # Get article title (pattern truncated — presumably matches <title>...</title>)
    if ( match(rawstr, "
      split(a[0], b, "(
      title = b[2]
    }
    # Get article body (pattern truncated — presumably matches <text>...</text>)
    if ( match(rawstr, "
      split(a[0], b, "(
      body = b[2]
    }
    # ---------- Search -----
    if ( match(body, MySearch, matched_text) ) {
      print title
      # print matched_text[0] # uncomment to print
      continue
    }
  }
  close(r)
}
Note: when redirecting large output, send to a different disk (ramdisk or other physical volume) otherwise it could slow reading the XML file.
=Nim=
For a faster solution here is a Nim example. Nim compiles to optimized C code, which then compiles using gcc to an executable binary. In a test between Awk and Nim, it took Awk 3m31s to complete a search; the same search in Nim took 0m43s. The code below is pretty much copy-paste, compile and run — just add your Perl-compatible regex, or plain text. Example regex strings:
::mySearchRe = re"djvu[.]txt"
::mySearchRe = re"http[:][^ ]*[^ ]"
::(the regex string is wrapped by re"" )
Then [https://nim-lang.org/install_unix.html download Nim] compiler (choosenim method is easiest), and compile the source with nim c -d:release --opt:speed -d:danger --passC:"-flto" --passL:"-flto" search.nim
.
::
# Search wikipedia dump for a string and print the article title (or matched text) if located
# Credit: Copyright User:Green_Cardamom, April 2016, MIT License
# Language: Nim
# Additional code credits: Rob Speer (https://github.com/rspeer/wiki2text)
import re, options, strutils, os, streams, parsexml
var # configuration variables
  # Perl-compatible regex searched for in each article body (wrap in re"").
  mySearchRe = re"djvu[.]txt"
  # Path to the uncompressed pages-articles XML dump.
  wpDump = "/mnt/WindowsFdriveTdir/wikipedia-dump/enwiki-20150901-pages-articles.xml"
  maxCount = 0 # Stop searching after X countArticle for speed testing. Set to 0 to find all.
var # running counters, updated by searchText()
  countAllArticle = 0 # All article count
  countArticle = 0 # Article titles containing a match (any number of matches)
  countHits = 0 # Number of matches of search pattern (running total)
type
  # The XML elements captured from each <page>; used to index ArticleData.
  TagType = enum
    TITLE, TEXT, REDIRECT, NS
  # One string slot per captured element of the current article.
  ArticleData = array[TagType, string]
# Search text
proc searchText(article: ArticleData): bool {.discardable.} =
  ## Count non-overlapping occurrences of `mySearchRe` in the article body.
  ## On any hit: bump the global match counters, print the article title and
  ## return true. Also bumps the global article counter for every call, and
  ## when `maxCount > 0` prints a summary and exits once that many articles
  ## have been scanned.
  inc countAllArticle
  let body = article[TEXT]
  var
    hits = 0
    searchFrom = -1
  # Repeatedly scan forward from one past the previous match position.
  while searchFrom < body.len:
    searchFrom = body.find(mySearchRe, searchFrom + 1)
    if searchFrom < 0:
      break
    inc hits
  if hits != 0:
    inc countArticle       # number of article titles matching
    countHits += hits      # number of matches of search pattern
    echo article[TITLE]
    result = true
  # Optional early stop for speed testing.
  if maxCount > 0 and countAllArticle >= maxCount:
    echo ""
    echo "Articles all: ", countAllArticle
    echo "Articles with a match: ", countArticle
    echo "Number of pattern matches: ", countHits
    quit()
var
  RELEVANT_XML_TAGS = ["title", "text", "ns"] # elements whose text we collect
  textBuffer = ""          # accumulates character data for the current element
  s = newFileStream(wpDump, fmRead)
  gettingText = false      # true while inside a relevant text-bearing element
  gettingAttribute = false # true while inside a redirect tag
  article: ArticleData     # slots for the article currently being parsed
  xml: XmlParser
if s == nil: quit("cannot open the file " & wpDump)
# Start every slot as an empty string.
for tag in TITLE..NS: article[tag] = ""
xml.open(s, wpDump, options={reportWhitespace})
while true:
  # Scan through the XML, handling each token as it arrives.
  xml.next()
  case xml.kind
  of xmlElementStart, xmlElementOpen:
    if RELEVANT_XML_TAGS.contains(xml.elementName):
      # If this is a "title", "text", or "ns" tag, prepare to get its
      # text content. Move our writing pointer to the beginning of
      # the text buffer, so we can overwrite what was there.
      textBuffer.setLen(0)
      gettingText = true
    elif xml.elementName == "page":
      # If this is a new <page> element, reset the one value that is not
      # necessarily overridden by child tags: the redirect value.
      article[REDIRECT].setLen(0)
    elif xml.elementName == "redirect":
      # If this is the start of a redirect tag, prepare to get its
      # attribute value.
      gettingAttribute = true
  of xmlAttribute:
    # If we're looking for an attribute value, and we found one, add it
    # to the buffer.
    if gettingAttribute:
      textBuffer.add(xml.attrValue)
  of xmlCharData, xmlWhitespace:
    # If we're looking for text, and we found it, add it to the buffer.
    if gettingText:
      textBuffer.add(xml.charData)
  of xmlElementEnd:
    # When we reach the end of an element we care about, take the text
    # we've found and store it in the 'article' data structure. We can
    # accomplish this quickly by simply swapping their references.
    case xml.elementName
    of "title":
      swap article[TITLE], textBuffer
    of "text":
      swap article[TEXT], textBuffer
    of "redirect":
      swap article[REDIRECT], textBuffer
    of "ns":
      swap article[NS], textBuffer
    of "page":
      # When we reach the end of a <page> element, hand the collected
      # data to searchText().
      searchText(article)
    else:
      discard
    # Now that we've reached the end of an element, stop extracting
    # text. (We'll never need to extract text from elements that can
    # have other XML elements nested inside them.)
    gettingText = false
    gettingAttribute = false
  of xmlEof:
    break
  else:
    discard
xml.close
# Final summary after the whole dump has been scanned.
echo "Search Wikipedia completed"
echo "----"
echo "Articles all: ", countAllArticle
echo "Articles with a match: ", countArticle
echo "Number of pattern matches: ", countHits
Note: when redirecting large output, send to a different disk (ramdisk or other physical volume) otherwise it could slow reading the XML file.