// User:Polygnotus/Scripts/DeduplicateReferences.js

// Only exact duplicates

// Tries to come up with a name for the reference

// Only runs when editing the current version of the article

/**
 * Deduplicate identical, unnamed <ref>...</ref> footnotes in the wikitext that
 * is currently loaded in the edit form.
 *
 * Reads the wikitext from #wpTextbox1. Unnamed references whose text is equal
 * after whitespace normalisation are grouped; for every group with more than
 * one member the first occurrence becomes <ref name="...">...</ref> and each
 * later occurrence becomes the short form <ref name="..." />. References that
 * already carry a name attribute are never modified; their names are only
 * recorded so that generated names do not collide with them.
 *
 * When at least one duplicate was replaced, the textarea is updated, a note is
 * appended to the edit summary (#wpSummary), the minor-edit checkbox is ticked
 * if present, and an alert reports the count. Otherwise nothing is changed.
 *
 * @returns {void}
 */
function deduplicateReferences() {
    console.log('Starting deduplication process...');

    // Get the edit textarea and summary input
    const editTextarea = document.getElementById('wpTextbox1');
    const summaryInput = document.getElementById('wpSummary');
    if (!editTextarea || !summaryInput) {
        console.log('Edit textarea or summary input not found');
        return;
    }

    const content = editTextarea.value;
    console.log('Content length:', content.length);

    const OPEN_PREFIX = '<ref';
    const CLOSING_TAG = '</ref>';
    // Matches name="x", name='x' and name=x; the value is in group 1, 2 or 3.
    const NAME_ATTRIBUTE = /\bname\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s\/>]+))/i;
    // Wikitext delimiters (| { } [ ]) are excluded so that template or link
    // syntax directly after a URL does not end up in the hostname.
    const URL_PATTERN = /https?:\/\/[^\s<>"|{}\[\]]+/i;

    // All reference names that are already taken (existing and generated).
    const usedNames = new Set();

    // Names that are too generic to be useful as a reference name.
    const blacklist = new Set([
        'doi_org',
        'jstor_org',
        'amazon_com',
        'books_google_com',
        'web_archive_org',
        'worldcat_org',
        'dx_doi_org',
        'patents_google_com',
        'cite_journal',
        'cite_book',
        'cite_web',
        'cite_news',
        'cite_magazine',
        'cite_newspaper',
        'cite_thesis',
        'cite_conference',
        'cite_encyclopedia',
        'cite_album_notes',
        'cite_comic',
        'cite_court',
        'cite_act',
        'cite_episode',
        'cite_mailing_list',
        'cite_map',
        'cite_newsgroup',
        'cite_patent',
        'cite_press_release',
        'cite_report',
        'cite_video_game',
        'citation',
    ]);

    /**
     * Locate every <ref> tag in the text by scanning for opening tags and
     * parsing each one manually.
     *
     * @param {string} text - Wikitext to scan.
     * @returns {Array<{text: string, start: number, end: number, openingTag: string, body: (string|null)}>}
     *   One entry per tag in document order; `body` is null for self-closing tags.
     */
    function findAllReferences(text) {
        const refs = [];
        let pos = 0;

        while (pos < text.length) {
            const start = text.indexOf(OPEN_PREFIX, pos);
            if (start === -1) {
                break;
            }

            // "<ref" must be followed by whitespace, "/" or ">"; otherwise it is
            // a different tag such as <references>.
            const afterPrefix = start + OPEN_PREFIX.length;
            if (!/[\s\/>]/.test(text.charAt(afterPrefix))) {
                pos = afterPrefix;
                continue;
            }

            // Find the end of the opening tag
            const tagEnd = text.indexOf('>', start);
            if (tagEnd === -1) {
                break;
            }
            const openingTag = text.substring(start, tagEnd + 1);

            if (openingTag.endsWith('/>')) {
                // Self-closing tag: there is no body.
                refs.push({ text: openingTag, start, end: tagEnd + 1, openingTag, body: null });
                pos = tagEnd + 1;
                continue;
            }

            const closeStart = text.indexOf(CLOSING_TAG, tagEnd);
            if (closeStart === -1) {
                // Unterminated tag: skip it and keep scanning.
                pos = tagEnd + 1;
                continue;
            }

            const end = closeStart + CLOSING_TAG.length;
            refs.push({
                text: text.substring(start, end),
                start,
                end,
                openingTag,
                body: text.substring(tagEnd + 1, closeStart),
            });
            pos = end;
        }

        return refs;
    }

    /**
     * Return the opening tag (up to and including the first ">") of a ref.
     *
     * @param {string} ref - Full text of a reference tag.
     * @returns {string}
     */
    function getOpeningTag(ref) {
        return ref.substring(0, ref.indexOf('>') + 1);
    }

    /**
     * Extract the host name (without a leading "www.") from a URL.
     *
     * @param {string} url
     * @returns {string|null} The host name, or null when the URL cannot be parsed.
     */
    function extractDomain(url) {
        try {
            const domain = new URL(url).hostname.replace(/^www\./, '');
            if (domain === 'archive.org') {
                // Use whatever follows "archive.org/web/" as the URL instead.
                return extractDomain(url.split('archive.org/web/')[1]);
            }
            return domain;
        } catch (e) {
            // Deliberate best effort: an unparsable URL just means "no domain".
            return null;
        }
    }

    /**
     * Build a name candidate from the |title= parameter of a cite template.
     *
     * @param {string} ref - Full text of a reference tag.
     * @returns {string|null} Up to three title words, concatenated and
     *   lower-cased, or null when no usable title is present.
     */
    function extractTitleFromCiteTemplate(ref) {
        // The title ends at the next "|", at the closing "}}" of the template
        // (title is the last parameter) or at the end of the string.
        const titleMatch = ref.match(/\|\s*title\s*=\s*([^|{}]+?)\s*(?:\||\}\}|$)/i);
        if (!titleMatch) {
            return null;
        }

        // Remove any remaining markup
        const title = titleMatch[1]
            .trim()
            .replace(/<[^>]*>/g, '')
            .replace(/\[\[([^\]]*)\]\]/g, '$1');

        const words = title.match(/\b[A-Za-z0-9]+\b/g);
        if (words && words.length > 0) {
            return words.slice(0, 3).join('').toLowerCase();
        }
        return null;
    }

    /**
     * Reserve a name that is not in use yet, appending _1, _2, ... as needed.
     *
     * @param {string} baseName - Preferred name.
     * @returns {string} The reserved name (also added to `usedNames`).
     */
    function reserveName(baseName) {
        // The Cite extension rejects purely numeric reference names.
        const safeBase = /^\d+$/.test(baseName) ? `ref_${baseName}` : baseName;

        let uniqueName = safeBase;
        let counter = 1;
        while (usedNames.has(uniqueName)) {
            uniqueName = `${safeBase}_${counter}`;
            counter++;
        }
        usedNames.add(uniqueName);
        return uniqueName;
    }

    /**
     * Generate a unique name for a reference. Tried in order: the title of a
     * cite template, the domain of the first URL, the first three words of the
     * text, and finally the generic name "ref".
     *
     * @param {string} ref - Full text of a reference tag.
     * @returns {string}
     */
    function generateUniqueName(ref) {
        const lowered = ref.toLowerCase();
        if (lowered.includes('{{cite') || lowered.includes('{{citation')) {
            const titleName = extractTitleFromCiteTemplate(ref);
            if (titleName) {
                return reserveName(titleName);
            }
        }

        const urlMatch = ref.match(URL_PATTERN);
        if (urlMatch) {
            const domain = extractDomain(urlMatch[0]);
            if (domain) {
                const domainName = domain.replace(/\./g, '_');
                if (!blacklist.has(domainName)) {
                    return reserveName(domainName);
                }
            }
        }

        // No usable URL: fall back to the visible text of the reference.
        const textContent = ref.replace(/<[^>]*>/g, '').trim();
        const words = textContent.match(/\b[A-Za-z0-9]+\b/g);
        if (words && words.length > 0) {
            return reserveName(words.slice(0, 3).join('_').toLowerCase());
        }

        return reserveName('ref');
    }

    /**
     * Extract the value of the name attribute from a reference's opening tag.
     * Only the opening tag is inspected, so "name=" inside the body (for
     * example in a URL query string) is not mistaken for a reference name.
     *
     * @param {string} ref - Full text of a reference tag.
     * @returns {string|null} The trimmed name, or null when there is none.
     */
    function extractExistingName(ref) {
        const nameMatch = getOpeningTag(ref).match(NAME_ATTRIBUTE);
        if (!nameMatch) {
            return null;
        }
        const value = [nameMatch[1], nameMatch[2], nameMatch[3]].find((group) => group !== undefined);
        return value.trim() || null;
    }

    /**
     * Create a named reference tag.
     *
     * @param {string} name - Reference name.
     * @param {string|null} [content=null] - Body of the reference; when empty
     *   or null the short self-closing form is produced.
     * @param {string} [attributes=''] - Further attributes (e.g. group="n")
     *   that are carried over unchanged.
     * @returns {string}
     */
    function createRefTag(name, content = null, attributes = '') {
        const extra = attributes ? ` ${attributes}` : '';
        if (content) {
            return `<ref name="${name}"${extra}>${content}</ref>`;
        }
        return `<ref name="${name}"${extra} />`;
    }

    /**
     * Check whether a reference carries a blacklisted name.
     *
     * @param {string} ref - Full text of a reference tag.
     * @returns {boolean}
     */
    function isBlacklisted(ref) {
        const name = extractExistingName(ref);
        return Boolean(name) && blacklist.has(name);
    }

    /**
     * Normalise a reference for comparison: the name attribute is removed from
     * the opening tag, spacing around "|" is dropped and runs of whitespace
     * are collapsed to a single space.
     *
     * @param {string} ref - Full text of a reference tag.
     * @returns {string}
     */
    function normalizeRef(ref) {
        const openingTag = getOpeningTag(ref);
        const rest = ref.substring(openingTag.length);
        const strippedTag = openingTag.replace(/\s*\bname\s*=\s*(?:"[^"]*"|'[^']*'|[^\s\/>]+)/i, '');
        return (strippedTag + rest)
            .replace(/\s*\|\s*/g, '|')
            .replace(/\s+/g, ' ')
            .trim();
    }

    console.log('Starting first pass - collecting references...');

    // First pass: record used names and group the unnamed references.
    const refs = findAllReferences(content);
    console.log('Found', refs.length, 'reference tags');
    console.log('First 5 matches:', refs.slice(0, 5).map((ref) => ref.text.substring(0, 100) + '...'));

    // Normalised text -> unnamed references with that text, in document order.
    const groups = new Map();

    refs.forEach((ref) => {
        if (isBlacklisted(ref.text)) {
            return;
        }

        if (/\bname\s*=/i.test(ref.openingTag)) {
            // Named references are never deduplicated or modified.
            const existingName = extractExistingName(ref.text);
            if (existingName) {
                usedNames.add(existingName);
            }
            return;
        }

        // Nothing to share for tags without a body, and a reference that uses
        // "follow" continues another reference, so it is left untouched.
        if (ref.body === null || ref.body.trim() === '' || /\bfollow\s*=/i.test(ref.openingTag)) {
            return;
        }

        const key = normalizeRef(ref.text);
        if (!groups.has(key)) {
            groups.set(key, []);
        }
        groups.get(key).push(ref);
    });

    const duplicateGroups = Array.from(groups.values()).filter((instances) => instances.length > 1);
    console.log('Found', duplicateGroups.length, 'duplicate unnamed reference groups');

    console.log('Starting second pass - replacing duplicates...');

    // Second pass: decide the replacement text for each affected reference.
    let deduplicatedCount = 0;
    duplicateGroups.forEach((instances) => {
        const first = instances[0];
        const generatedName = generateUniqueName(first.text);
        if (!generatedName || blacklist.has(generatedName)) {
            return;
        }
        console.log('Generated name for duplicate group:', generatedName);

        // Everything between "<ref" and ">" except the (absent) name, e.g. group="n".
        const attributes = first.openingTag.slice(OPEN_PREFIX.length, -1).trim();

        first.replacement = createRefTag(generatedName, first.body, attributes);
        console.log('Replaced first occurrence with named ref');

        instances.slice(1).forEach((duplicate) => {
            duplicate.replacement = createRefTag(generatedName, null, attributes);
            deduplicatedCount++;
            console.log('Replaced duplicate with short ref:', duplicate.replacement);
        });
    });

    console.log('Deduplication complete. Count:', deduplicatedCount);

    if (deduplicatedCount === 0) {
        return;
    }

    // Rebuild the text by position. This always touches exactly the matched
    // tags and never interprets "$" sequences in the reference text, which
    // String.prototype.replace with a string replacement would do.
    let updated = '';
    let cursor = 0;
    refs.forEach((ref) => {
        if (ref.replacement === undefined) {
            return;
        }
        updated += content.substring(cursor, ref.start) + ref.replacement;
        cursor = ref.end;
    });
    updated += content.substring(cursor);

    // Update the textarea with the deduplicated content
    editTextarea.value = updated;

    // Add edit summary
    const plural = deduplicatedCount > 1 ? 's' : '';
    const currentSummary = summaryInput.value;
    const deduplicationSummary = `Deduplicated ${deduplicatedCount} reference${plural} using DeduplicateReferences`;
    summaryInput.value = currentSummary ? `${currentSummary} • ${deduplicationSummary}` : deduplicationSummary;

    // Check minor edit if available
    const minorEditCheckbox = document.querySelector('#wpMinoredit, input[name="wpMinoredit"]');
    if (minorEditCheckbox) {
        minorEditCheckbox.checked = true;
    }

    console.log('Successfully deduplicated', deduplicatedCount, 'references');
    alert(`Successfully deduplicated ${deduplicatedCount} reference${plural}!`);
}

/**
 * Check whether the edit form is for the current version of the page rather
 * than an old revision or a diff view.
 *
 * The decision is based on the `oldid` and `diff` URL parameters and on the
 * text of any `.mw-revision-warning` / `.mw-editnotice-base` element that
 * mentions an "old revision".
 *
 * @returns {boolean} true when deduplication is allowed.
 */
function isEditingCurrentVersion() {
    const urlParams = new URLSearchParams(window.location.search);

    // An oldid parameter means an old revision is being edited. This also
    // covers section edits of old revisions, which carry oldid as well.
    const oldid = urlParams.get('oldid');
    if (oldid) {
        console.log('Editing old revision (oldid=' + oldid + '), skipping deduplication');
        return false;
    }

    // Check if we're in diff view
    if (urlParams.get('diff')) {
        console.log('In diff view, skipping deduplication');
        return false;
    }

    // Look at every candidate notice, not only the first one in the document:
    // an unrelated edit notice may precede the revision warning.
    const notices = Array.from(document.querySelectorAll('.mw-revision-warning, .mw-editnotice-base'));
    const hasOldRevisionWarning = notices.some(
        (notice) => (notice.textContent || '').toLowerCase().includes('old revision')
    );
    if (hasOldRevisionWarning) {
        console.log('Old revision warning detected, skipping deduplication');
        return false;
    }

    console.log('Editing current version - deduplication allowed');
    return true;
}

/**
 * Check whether the edit form is ready to be processed: the textarea
 * (#wpTextbox1) exists and is non-empty, and the summary input (#wpSummary)
 * exists.
 *
 * @returns {boolean} Always a strict boolean, never one of the elements.
 */
function isEditTextareaReady() {
    const editTextarea = document.getElementById('wpTextbox1');
    const summaryInput = document.getElementById('wpSummary');
    return Boolean(editTextarea && editTextarea.value && summaryInput);
}

/**
 * Run the deduplication as soon as the edit form is ready.
 *
 * While the form is not ready the check is repeated every 100 ms. The number
 * of retries is capped so that a form that never becomes ready (for example
 * an empty textarea) does not keep a timer running forever.
 *
 * @param {number} [attempt=0] - Number of retries already made; used by the
 *   function itself when rescheduling. Callers normally omit it.
 * @returns {void}
 */
function runDeduplicationWhenReady(attempt = 0) {
    const maxRetries = 50; // 50 x 100 ms = 5 seconds
    // Tolerate being used directly as an event handler (the argument would
    // then be an event object rather than a number).
    const retries = Number.isInteger(attempt) ? attempt : 0;

    console.log('Checking if ready...');

    if (!isEditTextareaReady()) {
        if (retries >= maxRetries) {
            console.log('Edit form still not ready after', retries, 'retries, giving up');
            return;
        }
        console.log('Not ready yet, retrying...');
        setTimeout(() => runDeduplicationWhenReady(retries + 1), 100);
        return;
    }

    console.log('Ready! Checking if editing current version...');
    if (isEditingCurrentVersion()) {
        console.log('Editing current version - running deduplication...');
        deduplicateReferences();
    } else {
        console.log('Not editing current version - skipping deduplication');
    }
}

/**
 * Add a "Deduplicate References" button for triggering the deduplication
 * manually.
 *
 * The button is appended to the `.wikiEditor-ui-toolbar` element when there is
 * one, otherwise to the parent of the edit textarea. Nothing happens when the
 * button already exists or when neither container is available. The button is
 * disabled (with an explanatory tooltip) when an old revision is being edited.
 *
 * @returns {void}
 */
function addDeduplicationButton() {
    if (document.getElementById('dedupe-button')) {
        return;
    }

    let toolbar = document.querySelector('.wikiEditor-ui-toolbar');
    if (!toolbar) {
        // Fall back to the textarea's parent; the textarea itself may be missing.
        const textarea = document.querySelector('#wpTextbox1');
        toolbar = textarea ? textarea.parentNode : null;
    }
    if (!toolbar) {
        return;
    }

    const button = document.createElement('button');
    button.id = 'dedupe-button';
    button.type = 'button';
    button.textContent = 'Deduplicate References';
    button.style.margin = '5px';

    // Only enable button if editing current version
    if (isEditingCurrentVersion()) {
        button.onclick = deduplicateReferences;
        console.log('Added deduplication button (enabled)');
    } else {
        // No click handler here: a disabled button does not dispatch click
        // events, so the tooltip is the only way to explain why it is inactive.
        button.disabled = true;
        button.title = 'Deduplication only works when editing the current version of an article';
        console.log('Added deduplication button (disabled - old revision)');
    }

    toolbar.appendChild(button);
}

/**
 * Hook the script into the page. Does nothing unless MediaWiki (`mw`) is
 * available and the page is the edit form (`wgAction` "edit") of a page in the
 * article namespace (`wgNamespaceNumber` 0).
 *
 * On such a page the deduplication is run and the button is added once the
 * document has finished loading. The button is added again two seconds later;
 * presumably this covers editor toolbars that are created after the load
 * event (the call is a no-op when the button already exists).
 *
 * @returns {void}
 */
function initDeduplicateReferences() {
    if (typeof mw === 'undefined') {
        console.log('MediaWiki not available');
        return;
    }

    const action = mw.config.get('wgAction');
    const namespace = mw.config.get('wgNamespaceNumber');
    if (action !== 'edit' || namespace !== 0) {
        console.log('Not on article edit page - action:', action, 'namespace:', namespace);
        return;
    }

    console.log('Article edit page detected, checking version and setting up deduplication...');

    const start = () => {
        runDeduplicationWhenReady();
        addDeduplicationButton();
    };

    if (document.readyState === 'complete') {
        start();
    } else {
        window.addEventListener('load', start);
    }

    // Also add the button when the page is ready (only for article namespace)
    setTimeout(() => {
        addDeduplicationButton();
    }, 2000);
}

initDeduplicateReferences();