// User:Dr pda/generatestats.js

//

//This script generates a list of the ten shortest and ten longest articles which transclude a template,

//e.g. {{featured article}}, calculates some statistics and plots a histogram.

//To use this function add {{subst:js|User:Dr pda/generatestats.js}} to your monobook.js

//then go to http://en.wikipedia.org/w/index.php?title=User:Dr_pda/generatestats&action=edit

//See the talk page for documentation.

/**
 * Constructor for a two-field record. Elsewhere in this file it is created
 * with `new` to pair a page title (`key`) with that page's size (`value`).
 *
 * @param key   stored on the new object as `key`
 * @param value stored on the new object as `value`
 */
function keyValuePair(key, value) {
  var pair = this;
  pair.key = key;
  pair.value = value;
}

/**
 * Array.prototype.sort comparator that orders records by ascending `value`.
 * The subtraction coerces both operands to numbers, so numeric strings
 * compare numerically rather than lexicographically.
 *
 * @param a record with a `value` property
 * @param b record with a `value` property
 * @returns negative, zero or positive when a sorts before, with, or after b
 */
function sortByValue(a, b) {
  var difference = a.value - b.value;
  return difference;
}

/**
 * Picks an axis increment for the range [min, max].
 *
 * The range is divided by 15 and the smallest candidate increment that is
 * at least that large is returned; 5000 is returned when none qualifies.
 *
 * @param min lower end of the range
 * @param max upper end of the range
 * @returns one of the candidate increments listed below
 */
function getBestScale(min, max) {
  // Kept local: the original assigned this to an undeclared (global) name.
  var scales = [0.2, 0.5, 1, 2, 5, 10, 20, 25, 50, 100, 200, 250, 500, 1000, 2000, 5000];
  var val = (max - min) / 15;
  // Indexed loop instead of for...in so that enumerable properties added to
  // Array.prototype by other scripts cannot be picked up as candidates.
  for (var i = 0; i < scales.length; i++) {
    if (scales[i] >= val) {
      return scales[i];
    }
  }
  return 5000;
}

/**
 * Issues an asynchronous GET for `url` and forwards every readyState change
 * to `handler(req, template)`.
 *
 * Uses the native XMLHttpRequest when `window` exposes it, otherwise the
 * "Microsoft.XMLHTTP" ActiveX object; does nothing when neither is present.
 *
 * @param url      address to request
 * @param handler  callback invoked as handler(req, template) on each state change
 * @param template opaque value passed through to the handler unchanged
 */
function loadXMLDocPassingTemplate(url, handler, template) {
  var req;
  if (window.XMLHttpRequest) {
    // native XMLHttpRequest object
    req = new XMLHttpRequest();
  } else if (window.ActiveXObject) {
    // IE/Windows ActiveX version
    req = new ActiveXObject("Microsoft.XMLHTTP");
  }

  if (!req) {
    return;
  }

  req.onreadystatechange = function () {
    handler(req, template);
  };
  req.open("GET", url, true);
  req.send("");
}

/**
 * XMLHttpRequest state-change handler for the page-size queries.
 *
 * Once the request has finished successfully it:
 *  - decrements the shared `jobsLeft` counter when a category mode is active
 *    (the category handlers increment it once per request they start);
 *  - appends one keyValuePair(title, length) to the shared `pagesList` for
 *    every <page> element of the response, advancing the shared `index`;
 *  - follows a continuation if the response contains an <embeddedin> element
 *    carrying a `geicontinue` attribute;
 *  - otherwise, when `jobsLeft` is 0, either builds the chart or, when the
 *    page URL contains "prosesize", requests the rendered HTML of every
 *    collected page to measure its prose size.
 *
 * A finished request with a status other than 200 is reported via alert().
 *
 * @param req      the XMLHttpRequest whose state changed
 * @param template value passed through to follow-up requests
 */
function getSizeFromAPI(req, template) {
  // only if req shows "loaded"
  if (req.readyState !== 4) {
    return;
  }
  // only if "OK"
  if (req.status !== 200) {
    alert("There was a problem retrieving the XML data:\n" +
      req.statusText);
    return;
  }

  if (useTalkCategory || useTemplateCategory) {
    jobsLeft--;
  }

  var response = req.responseXML.documentElement;
  var pages = response.getElementsByTagName('page');
  // NOTE(review): as in the original, a response without <page> elements ends
  // here, so charting is never triggered by an empty final response.
  if (pages.length === 0) {
    return;
  }

  for (var i = 0; i < pages.length; i++) {
    pagesList[index++] = new keyValuePair(pages[i].getAttribute('title'), pages[i].getAttribute('length'));
  }
  document.getElementById('wpTextbox1').value = 'Retrieved ' + index + ' articles.\n To abort click the back button in your browser.';

  // Check for more pages
  var embeddedin = response.getElementsByTagName('embeddedin');
  if (embeddedin.length > 0) {
    var geicontinue = embeddedin[0].getAttribute('geicontinue');
    if (useTalkCategory || useTemplateCategory) {
      jobsLeft++;
    }
    // The continuation token is arbitrary text, so it must be escaped before
    // being placed in the query string.
    loadXMLDocPassingTemplate(queryURL + '&geicontinue=' + encodeURIComponent(geicontinue), getSizeFromAPI, template);
    return;
  }

  // If last page retrieved then start processing
  if (jobsLeft !== 0) {
    return;
  }

  // If using wiki text size
  if (document.location.href.indexOf('prosesize') === -1) {
    sortAndMakeChart();
    return;
  }

  // If using readable prose size (WARNING: Will load every page which
  // transcludes template. Could be thousands of pages!!)
  for (var p = 0; p < pagesList.length; p++) {
    var pageTitle = pagesList[p].key;
    var titleURL = encodeURIComponent(pageTitle.replace(/ /g, '_'));
    loadXMLDocPassingTemplate('/w/index.php?action=render&title=' + titleURL, getProseSizeFromPage, pageTitle);
  }
}

/**
 * XMLHttpRequest state-change handler for the talk-page category query.
 *
 * Collects the `subjectid` attribute of every <page> element into the shared
 * `articleList`. If the response contains a <categorymembers> element its
 * `gcmcontinue` attribute is used to request the next batch; otherwise the
 * collected ids are sent to the size query in groups of at most 50, with
 * `jobsLeft` incremented once per request started.
 *
 * A finished request with a status other than 200 is reported via alert().
 *
 * @param req      the XMLHttpRequest whose state changed
 * @param template value passed through to follow-up requests
 */
function getArticlePageFromTalkPage(req, template) {
  // only if req shows "loaded"
  if (req.readyState !== 4) {
    return;
  }
  // only if "OK"
  if (req.status !== 200) {
    alert("There was a problem retrieving the XML data:\n" +
      req.statusText);
    return;
  }

  var response = req.responseXML.documentElement;
  var pages = response.getElementsByTagName('page');
  if (pages.length === 0) {
    return;
  }

  for (var i = 0; i < pages.length; i++) {
    articleList.push(pages[i].getAttribute('subjectid'));
  }

  var categorymembers = response.getElementsByTagName('categorymembers');
  if (categorymembers.length > 0) {
    var gcmcontinue = categorymembers[0].getAttribute('gcmcontinue');
    // The continuation token is arbitrary text, so escape it for the URL.
    loadXMLDocPassingTemplate(talkQueryURL + '&gcmcontinue=' + encodeURIComponent(gcmcontinue), getArticlePageFromTalkPage, template);
    return;
  }

  // All pages retrieved.
  // API limited to 50 titles per query
  var batchSize = 50;
  for (var start = 0; start < articleList.length; start += batchSize) {
    var pageIds = articleList.slice(start, start + batchSize).join('|');
    jobsLeft++;
    loadXMLDocPassingTemplate(queryURL + pageIds, getSizeFromAPI, template);
  }
}

/**
 * XMLHttpRequest state-change handler for the template-category query.
 *
 * Adds the URL-encoded `title` attribute of every <page> element to the
 * shared `articleList`. If the response contains a <categorymembers> element
 * its `gcmcontinue` attribute is used to request the next batch; otherwise
 * one size query is started per collected title, with `jobsLeft` incremented
 * once per request.
 *
 * A finished request with a status other than 200 is reported via alert().
 *
 * @param req      the XMLHttpRequest whose state changed
 * @param template value passed through to follow-up requests
 */
function getPagesFromTemplateCategory(req, template) {
  // only if req shows "loaded"
  if (req.readyState !== 4) {
    return;
  }
  // only if "OK"
  if (req.status !== 200) {
    alert("There was a problem retrieving the XML data:\n" +
      req.statusText);
    return;
  }

  var response = req.responseXML.documentElement;
  var pages = response.getElementsByTagName('page');
  if (pages.length === 0) {
    return;
  }

  for (var i = 0; i < pages.length; i++) {
    articleList.push(encodeURIComponent(pages[i].getAttribute('title')));
  }

  var categorymembers = response.getElementsByTagName('categorymembers');
  if (categorymembers.length > 0) {
    var gcmcontinue = categorymembers[0].getAttribute('gcmcontinue');
    // The continuation token is arbitrary text, so escape it for the URL.
    loadXMLDocPassingTemplate(templateQueryURL + '&gcmcontinue=' + encodeURIComponent(gcmcontinue), getPagesFromTemplateCategory, template);
    return;
  }

  // All pages retrieved.
  // API embeddedin query can only take one title
  for (var t = 0; t < articleList.length; t++) {
    jobsLeft++;
    loadXMLDocPassingTemplate(queryURL + articleList[t], getSizeFromAPI, template);
  }
}

/**
 * XMLHttpRequest state-change handler that measures the prose size of one
 * rendered page.
 *
 * Walks the response text from each "<p>" to the following "</p>", removes
 * bracketed numbers of one to three digits (e.g. "[12]"), the text
 * "citation needed" and any markup tags, and sums the remaining lengths.
 * The result is appended to the shared `proseList`; once `proseIndex`
 * reaches `index`, `pagesList` is replaced by `proseList` and the chart is
 * generated.
 *
 * A finished request with a status other than 200 is reported via alert().
 *
 * @param req   the XMLHttpRequest whose state changed
 * @param title title of the page the request was made for
 */
function getProseSizeFromPage(req, title) {
  // only if req shows "loaded"
  if (req.readyState !== 4) {
    return;
  }
  // only if "OK"
  if (req.status !== 200) {
    alert("There was a problem retrieving the XML data:\n" +
      req.statusText);
    return;
  }

  var openTag = '<p>';
  var closeTag = '</p>';
  var response = req.responseText;
  var proseSize = 0;

  var start = response.indexOf(openTag);
  while (start > -1) {
    var stop = response.indexOf(closeTag, start);
    if (stop === -1) {
      // Unterminated paragraph: without this the search would restart from
      // the beginning of the text and never finish.
      break;
    }
    var para = response.substring(start + openTag.length, stop);
    para = para.replace(/\[\d{1,3}\]/g, '');
    para = para.replace(/citation needed/g, '');
    para = para.replace(/(<([^>]+)>)/ig, '');
    proseSize += para.length;
    start = response.indexOf(openTag, stop);
  }

  proseList[proseIndex++] = new keyValuePair(title, proseSize);
  document.getElementById('wpTextbox1').value = 'Retrieved prose size for ' + proseIndex + ' out of ' + index + ' articles.\n To abort click the back button in your browser.';

  // If last page retrieved then start processing
  if (proseIndex == index) {
    pagesList = proseList;
    sortAndMakeChart();
  }
}

/**
 * Turns the shared `pagesList` into the report text and previews it.
 *
 * Sorts `pagesList` (left in descending order of size afterwards), lists up
 * to ten shortest and ten longest entries, computes the mean and median in
 * kB, bins the sizes into a histogram and writes everything, including an
 * EasyTimeline chart, into the 'wpTextbox1' element. When the page URL
 * contains "&list" the full list of entries is appended. Finally the
 * 'wpPreview' element is clicked.
 *
 * With an empty `pagesList` only a short notice is written and no preview
 * is triggered.
 */
function sortAndMakeChart() {
  var textbox = document.getElementById('wpTextbox1');
  if (pagesList.length === 0) {
    textbox.value = 'No articles found.';
    return;
  }

  // One wikitext list item: "# Title (N kB)".
  function sizeLine(entry) {
    return '# ' + entry.key + ' (' + Math.round(entry.value / 1024) + ' kB)\n';
  }

  var wantFullList = document.location.href.indexOf('&list') !== -1;
  var total = pagesList.length;
  // Fewer than ten pages is possible; never index past the end of the list.
  var shown = Math.min(10, total);
  var i;

  pagesList.sort(sortByValue);

  // Get top ten and bottom ten
  var bottomTen = '===Ten shortest articles===\n';
  for (i = 0; i < shown; i++) {
    bottomTen += sizeLine(pagesList[i]);
  }

  pagesList.reverse();
  var topTen = '===Ten longest articles===\n';
  for (i = 0; i < shown; i++) {
    topTen += sizeLine(pagesList[i]);
  }

  var list = '===List of articles by size===\n';
  if (wantFullList) {
    for (i = 0; i < total; i++) {
      list += sizeLine(pagesList[i]);
    }
  }

  // Get Range (the list is in descending order at this point)
  var max = Math.ceil(pagesList[0].value / 1024);
  var min = Math.floor(pagesList[total - 1].value / 1024);
  var xScale = getBestScale(min, max);
  max = Math.ceil(max / xScale) * xScale;
  min = Math.floor(min / xScale) * xScale;
  // Rounded because fractional scales can give e.g. 9.999999; at least one
  // bin so that identical sizes still have somewhere to go.
  var numBins = Math.max(1, Math.round((max - min) / xScale));

  // Calculate statistics
  var sum = 0.0;
  var bins = [];
  for (i = 0; i < numBins; i++) {
    bins[i] = 0;
  }
  for (i = 0; i < total; i++) {
    sum += pagesList[i].value * 1.0;
    var bin = Math.floor((pagesList[i].value / 1024 - min) / (xScale * 1.0));
    // A size exactly on the upper edge belongs to the last bin, not to a
    // non-existent bin past the end.
    bin = Math.min(numBins - 1, Math.max(0, bin));
    bins[bin]++;
  }
  var mean = (sum / (total * 1024)).toFixed(3);

  // Median: middle element for an odd count, mean of the two middle
  // elements for an even count.
  var mid = Math.floor(total / 2);
  var medianBytes;
  if (total % 2 === 1) {
    medianBytes = pagesList[mid].value * 1.0;
  } else {
    medianBytes = (pagesList[mid - 1].value * 1.0 + pagesList[mid].value * 1.0) / 2;
  }
  var median = (medianBytes / 1024).toFixed(3);

  var statistics = '===Statistics===\n*Number of articles: ' + total + '\n*Mean: ' + mean + ' kB\n*Median: ' + median + ' kB\n';

  // Calculate best vertical scale
  var yMax = Math.max.apply(Math, bins);
  var yScale = getBestScale(0, yMax);
  yScale = Math.max(1, yScale);
  yMax = Math.ceil(yMax / yScale) * yScale;
  var verticalScale = '\nScaleMajor = gridcolor:darkgrey increment:' + yScale + ' start:0';
  // Minor grid lines only when half the major increment is a whole number.
  if (Math.floor(yScale / 2) == yScale / 2) {
    verticalScale += '\nScaleMinor = gridcolor:lightgrey increment:' + yScale / 2 + ' start:0';
  }

  // Draw chart. The EasyTimeline script has to be wrapped in <timeline>
  // tags to be rendered by the preview.
  var chart = '===Chart===\n<timeline>\nColors=\n id:lightgrey value:gray(0.8)\n id:darkgrey value:gray(0.8)\n id:white value:rgb(1,1,1)\n id:steel value:rgb(0.6,0.7,0.8)\n\nImageSize = width:auto height:303 barincrement:25\nPlotArea = left:50 bottom:50 top:30 right:30\nDateFormat = x.y\nPeriod = from:0 till:' + yMax + '\nTimeAxis = orientation:vertical\nAlignBars = early' + verticalScale + '\nBackgroundColors = canvas:white\n\nPlotData=\n color:steel width:20 align:left\n';
  for (i = 0; i < numBins; i++) {
    chart += ' bar:' + (min + i * xScale) + ' from:0 till:' + bins[i] + '\n';
  }
  // Add axis label
  chart += ' bar:' + (min + Math.floor(2 * numBins / 5) * xScale) + ' at:0 text:"Article size in kB" shift:(0,-30)\n</timeline>\n';

  var report = topTen + '\n' + bottomTen + '\n' + statistics + '\n' + chart;
  if (wantFullList) {
    report += '\n' + list;
  }
  textbox.value = report;

  document.getElementById('wpPreview').click();
}

/**
 * Entry point: resets the shared state, asks the user what to analyse and
 * starts the first API request.
 *
 * The mode is chosen from the page URL:
 *  - contains "usetalkcategory": prompt for a talk-page category and query
 *    its members;
 *  - contains "usetemplatecategory": prompt for a template category and
 *    query its members;
 *  - otherwise: prompt for a template and query the pages transcluding it.
 * When the URL contains "specifynamespace" the namespace number is prompted
 * for first (it is only used by the plain template mode).
 *
 * The first letter of the entered name is upper-cased and the name is
 * URL-encoded before being placed in the query string. Cancelling any
 * prompt writes 'Cancelled.' to the text box and starts nothing.
 *
 * The unqualified assignments below deliberately write script-wide
 * variables that the request handlers in this file read and update.
 */
function generateStatistics() {
  pagesList = [];
  index = 0;
  proseList = [];
  proseIndex = 0;
  articleList = [];
  template = '';
  queryURL = '';
  talkQueryURL = '';
  templateQueryURL = '';
  jobsLeft = 0;
  namespace = '0';

  var href = document.location.href;
  useTalkCategory = href.indexOf('usetalkcategory') !== -1;
  useTemplateCategory = href.indexOf('usetemplatecategory') !== -1;
  specifyNamespace = href.indexOf('specifynamespace') !== -1;

  var textbox = document.getElementById('wpTextbox1');

  // Upper-cases the first character, leaving the rest untouched.
  function capitalise(text) {
    return text.charAt(0).toUpperCase() + text.slice(1);
  }

  if (specifyNamespace) {
    var chosenNamespace = prompt("Enter the number of the namespace the pages are in\n (0=article, 2=User, 4=Wikipedia etc)", "");
    // prompt() returns null when the dialog is cancelled.
    if (chosenNamespace === null) {
      textbox.value = 'Cancelled.';
      return;
    }
    namespace = chosenNamespace;
  }

  var entered;
  if (useTalkCategory) {
    entered = prompt("Enter the talk page category you want to check for\n (Don't include Category:)", "");
    if (entered === null) {
      textbox.value = 'Cancelled.';
      return;
    }
    template = "Category:" + capitalise(entered);
    talkQueryURL = '/w/api.php?action=query&generator=categorymembers&gcmtitle=' + encodeURIComponent(template) + '&gcmlimit=500&gcmnamespace=1&prop=info&inprop=subjectid&format=xml';
    queryURL = '/w/api.php?action=query&prop=info&format=xml&pageids=';
    loadXMLDocPassingTemplate(talkQueryURL, getArticlePageFromTalkPage, template);
  } else if (useTemplateCategory) {
    entered = prompt("Enter the template category you want to check\n (Don't include Category:)", "");
    if (entered === null) {
      textbox.value = 'Cancelled.';
      return;
    }
    template = "Category:" + capitalise(entered);
    templateQueryURL = '/w/api.php?action=query&generator=categorymembers&gcmtitle=' + encodeURIComponent(template) + '&gcmlimit=500&gcmnamespace=10&prop=info&format=xml';
    queryURL = '/w/api.php?action=query&generator=embeddedin&geilimit=500&geinamespace=0&prop=info&format=xml&geititle=';
    loadXMLDocPassingTemplate(templateQueryURL, getPagesFromTemplateCategory, template);
  } else {
    entered = prompt("Enter the template you want to check for\n (Don't include Template:)", "");
    if (entered === null) {
      textbox.value = 'Cancelled.';
      return;
    }
    template = "Template:" + capitalise(entered);
    queryURL = '/w/api.php?action=query&generator=embeddedin&geititle=' + encodeURIComponent(template) + '&geilimit=500&geinamespace=' + encodeURIComponent(namespace) + '&prop=info&format=xml';
    loadXMLDocPassingTemplate(queryURL, getSizeFromAPI, template);
  }

  textbox.value = 'Started.';
}

// Once the page has loaded, start the statistics run, but only on the edit
// view of the User:Dr_pda/generatestats page.
addOnloadHook(function () {
  var triggerFragment = 'User:Dr_pda/generatestats&action=edit';
  var onStatsEditPage = document.location.href.indexOf(triggerFragment) != -1;
  if (onStatsEditPage) {
    generateStatistics();
  }
});

//