Wikipedia:WikiProject Dates/Parse script

Usage

cat enwiki-20080724-pages-articles.xml | php parse.php > data

Script: parse.php

// parse.php -- tally the date formats used on each page of a MediaWiki XML dump.
//
// Reads the dump line by line from STDIN and prints one line per page:
//
//     Title {fmt:n,...} <fmt:n,...> fmt:n,...
//
// The "{...}" group counts dates found inside templates, the "<...>" group
// counts dates found inside references and the bare group counts everything
// else. A group is omitted entirely when it contains no dates.

$title = '';
$buffer = null; // text of a multi-line <text> element collected so far, or null

while (($line = fgets(STDIN)) !== false) {
    if (preg_match('/^\s*<title>(.*)<\/title>\s*$/', $line, $matches)) {
        $title = strtr($matches[1], ' ', '_');
    } elseif ($buffer !== null) {
        // Inside a multi-line <text> element: keep collecting until it closes.
        if (preg_match('/(.*)<\/text>\s*$/', $line, $matches)) {
            process_page($title, $buffer . ' ' . $matches[1]);
            $buffer = null;
        } else {
            $buffer .= ' ' . rtrim($line);
        }
    } elseif (preg_match('/^\s*<text[^>]*>(.*)<\/text>\s*$/', $line, $matches)) {
        // The whole page body sits on a single line.
        process_page($title, rtrim($matches[1]));
    } elseif (preg_match('/^\s*<text[^>]*>(.*)/', $line, $matches)) {
        // First line of a multi-line page body.
        $buffer = ' ' . rtrim($matches[1]);
    }
}

/**
 * Print the report line for one page.
 *
 * Templates are tallied and cut out first, innermost first, then references,
 * then whatever text is left, so every date is counted in exactly one group.
 *
 * @param string $title Page title, printed as given.
 * @param string $body  Page wikitext joined onto a single line.
 */
function process_page($title, $body) {
    print $title;

    // [^{}]* only matches a template with no nested braces; replacing it with
    // a space exposes the enclosing template on a later iteration.
    $dates = null;
    while (preg_match('/(.*)\{\{([^\{\}]*)\}\}(.*)/', $body, $matches)) {
        $body = $matches[1] . ' ' . $matches[3];
        $dates = tally_dates($matches[2], $dates);
    }
    pretty_print($dates, 'template');

    // XML dumps are expected to carry wikitext with "<" and ">" escaped as
    // &lt; / &gt;, so both the escaped and the literal spelling are accepted.
    $dates = null;
    while (preg_match('/(.*)(?:<|&lt;)ref[^&]*(?:>|&gt;)(.*?)(?:<|&lt;)\/ref[^&]*(?:>|&gt;)(.*)/i', $body, $matches)) {
        $body = $matches[1] . ' ' . $matches[3];
        $dates = tally_dates($matches[2], $dates);
    }
    pretty_print($dates, 'references');

    pretty_print(tally_dates($body, null));
    print "\n";
}

/**
 * Count the dates in $string, grouped by format, on top of the counts in $dates.
 *
 * Formats are tried in a fixed order. Each hit is replaced by a space before
 * the search continues, so one piece of text is never counted under two
 * formats. Every pattern starts with a greedy (.*), which means the last
 * occurrence in the string is consumed first.
 *
 * @param string     $string Text to scan.
 * @param array|null $dates  Running "format => count" tally, or null for none.
 * @return array|null The updated tally; still null when $dates was null and
 *                    nothing matched.
 */
function tally_dates($string, $dates) {
    $month = '(january|february|march|april|may|june|july|august|september|october|november|december)';
    $year  = '(\d{1,4}([ _]BC|))';
    $head  = '/(.*)';
    $tail  = '(.*)/iu';

    // Wiki-linked fragments.
    $prxDM   = '\[\[(\d{1,2})[ _]' . $month . ']]';
    $prxMD   = '\[\[' . $month . '[ _](\d{1,2})]]';
    $prxY    = '\[\[' . $year . ']]';
    $prxISO1 = '\[\[(-?\d{4})]]-\[\[(\d{2})-(\d{2})]]';
    $prxISO2 = '\[\[(-?\d{4})-(\d{2})-(\d{2})]]';

    // format name => array(pattern, number of the group holding the text after the date)
    $formats = array(
        'DMY_linked'  => array($head . $prxDM . ' *,? *' . $prxY . $tail, 6),
        'MDY_linked'  => array($head . $prxMD . ' *,? *' . $prxY . $tail, 6),
        'YDM_linked'  => array($head . $prxY . ' *,? *' . $prxDM . $tail, 6),
        'YMD_linked'  => array($head . $prxY . ' *,? *' . $prxMD . $tail, 6),
        'MD_linked'   => array($head . $prxMD . $tail, 4),
        'DM_linked'   => array($head . $prxDM . $tail, 4),
        'DMY_raw'     => array($head . '(\d{1,2})[ _]' . $month . ' *,? *' . $year . $tail, 6),
        'MDY_raw'     => array($head . $month . ' +(\d{1,2}) *,? +' . $year . $tail, 6),
        'YDM_raw'     => array($head . $year . ' *,? +(\d{1,2})[ _]' . $month . $tail, 6),
        // This pattern used to be built but never applied.
        'YMD_raw'     => array($head . $year . ' *,? +' . $month . ' +(\d{1,2})' . $tail, 6),
        // The ISO patterns have five groups; reading group 6 here used to
        // throw away all text after the date.
        'ISO1_linked' => array($head . $prxISO1 . $tail, 5),
        'ISO2_linked' => array($head . $prxISO2 . $tail, 5),
        'ISO_raw'     => array($head . '(-?\d{4})-(\d{2})-(\d{2})' . $tail, 5),
    );

    foreach ($formats as $name => $spec) {
        list($pattern, $rest) = $spec;
        while (preg_match($pattern, $string, $matches)) {
            $dates[$name] = (isset($dates[$name]) ? $dates[$name] : 0) + 1;
            $string = $matches[1] . ' ' . $matches[$rest];
        }
    }

    return $dates;
}

/**
 * Print one group of counts as " format:count,format:count".
 *
 * @param array|null  $dates "format => count" tally; nothing is printed unless
 *                           this is an array.
 * @param string|null $type  'template' wraps the group in {}, 'references'
 *                           wraps it in <>, anything else prints it bare.
 */
function pretty_print($dates, $type = null) {
    if (!is_array($dates)) {
        return;
    }

    $open = ' ';
    $close = '';
    if ($type === 'template') {
        $open = ' {';
        $close = '}';
    } elseif ($type === 'references') {
        $open = ' <';
        $close = '>';
    }

    $pairs = array();
    foreach ($dates as $format => $count) {
        $pairs[] = $format . ':' . $count;
    }

    print $open . implode(',', $pairs) . $close;
}
</div> </div> </footer> <script> const toggle = document.getElementById('mobileMenuToggle'); const menu = document.getElementById('mobileMenu'); toggle.addEventListener('click', () => { menu.classList.toggle('active'); }); </script> <!-- Collapsible toggle --> <script> document.addEventListener("DOMContentLoaded", function () { const toggles = document.querySelectorAll('.section-toggle'); toggles.forEach(toggle => { toggle.addEventListener('click', function () { const section = toggle.closest('.collapsible'); const body = section.querySelector('.wiki-body'); body.classList.toggle('collapsed'); }); }); }); </script>