// User:Polygnotus/Scripts/DeduplicateReferences.js
//
// Only exact duplicates
// Tries to come up with a name for the reference
// Only runs when editing the current version of the article
// Function to deduplicate references in Wikipedia articles
function deduplicateReferences() {
console.log('Starting deduplication process...');
// Get the edit textarea and summary input
const editTextarea = document.getElementById('wpTextbox1');
const summaryInput = document.getElementById('wpSummary');
if (!editTextarea || !summaryInput) {
console.log('Edit textarea or summary input not found');
return;
}
let content = editTextarea.value;
console.log('Content length:', content.length);
// FIXED: Use a more robust approach to find ref tags
// First find all start positions, then manually parse each one
function findAllReferences(text) {
const refs = [];
let pos = 0;
while (pos < text.length) {
const closeStart = text.indexOf('', tagEnd);
if (closeStart === -1) {
pos = tagEnd + 1;
continue;
}
const fullRef = text.substring(refStart, closeStart + 6);
refs.push(fullRef);
pos = closeStart + 6;
}
}
return refs;
}
// Object to store all references
const allRefs = {};
// Set to store all used reference names
const usedNames = new Set();
// Blacklist of reference names to ignore
const blacklist = [
"doi_org",
"jstor_org",
"amazon_com",
"books_google_com",
"web_archive_org",
"worldcat_org",
"dx_doi_org",
"patents_google_com",
"cite_journal",
"cite_book",
"cite_web",
"cite_news",
"cite_magazine",
"cite_newspaper",
"cite_thesis",
"cite_conference",
"cite_encyclopedia",
"cite_album_notes",
"cite_comic",
"cite_court",
"cite_act",
"cite_episode",
"cite_mailing_list",
"cite_map",
"cite_newsgroup",
"cite_patent",
"cite_press_release",
"cite_report",
"cite_video_game",
"citation"
];
// Function to extract domain name from URL
function extractDomain(url) {
try {
let domain = new URL(url).hostname;
domain = domain.replace(/^www\./, '');
return domain === 'archive.org' ? extractDomain(url.split('archive.org/web/')[1]) : domain;
} catch (e) {
return null;
}
}
// Function to extract title from cite templates
function extractTitleFromCiteTemplate(ref) {
// Look for title parameter in cite templates
const titleMatch = ref.match(/\|\s*title\s*=\s*([^|{}]+?)(?:\s*\||$)/i);
if (titleMatch) {
let title = titleMatch[1].trim();
// Remove any remaining markup
title = title.replace(/<[^>]*>/g, '').replace(/\[\[([^\]]*)\]\]/g, '$1');
// Extract first 3 words
const words = title.match(/\b[A-Za-z0-9]+\b/g);
if (words && words.length > 0) {
return words.slice(0, 3).join('').toLowerCase();
}
}
return null;
}
// Function to generate a unique name for the reference
function generateUniqueName(ref) {
// Check if this is a cite template and try to extract title (case insensitive)
if (ref.toLowerCase().includes('{{cite') || ref.toLowerCase().includes('{{citation')) {
const titleName = extractTitleFromCiteTemplate(ref);
if (titleName) {
let uniqueName = titleName;
let counter = 1;
while (usedNames.has(uniqueName)) {
uniqueName = `${titleName}_${counter}`;
counter++;
}
usedNames.add(uniqueName);
return uniqueName;
}
}
// Try to extract meaningful text from the reference
const textContent = ref.replace(/<[^>]*>/g, '').trim();
// Look for URLs first
const urlMatch = ref.match(/https?:\/\/[^\s<>"]+/i);
if (urlMatch) {
const domain = extractDomain(urlMatch[0]);
if (domain && !blacklist.includes(domain.replace(/\./g, '_'))) {
let baseName = domain.replace(/\./g, '_');
let uniqueName = baseName;
let counter = 1;
while (usedNames.has(uniqueName)) {
uniqueName = `${baseName}_${counter}`;
counter++;
}
usedNames.add(uniqueName);
return uniqueName;
}
}
// If no URL, try to create name from content
if (textContent) {
// Extract first 3 meaningful words (changed from 2 to 3)
const words = textContent.match(/\b[A-Za-z0-9]+\b/g);
if (words && words.length > 0) {
let baseName = words.slice(0, 3).join('_').toLowerCase();
let uniqueName = baseName;
let counter = 1;
while (usedNames.has(uniqueName)) {
uniqueName = `${baseName}_${counter}`;
counter++;
}
usedNames.add(uniqueName);
return uniqueName;
}
}
// Fallback to generic name
let baseName = 'ref';
let uniqueName = baseName;
let counter = 1;
while (usedNames.has(uniqueName)) {
uniqueName = `${baseName}_${counter}`;
counter++;
}
usedNames.add(uniqueName);
return uniqueName;
}
// Function to extract existing name from a reference
function extractExistingName(ref) {
const nameMatch = ref.match(/name\s*=\s*(["']?)([^"'\s/>]+(?:\s+[^"'\s/>]+)*)\1/i);
return nameMatch ? nameMatch[2] : null;
}
// Function to create a reference tag
function createRefTag(name, content = null) {
if (content) {
return `${content}`;
} else {
}
}
// Function to check if a reference is blacklisted
function isBlacklisted(ref) {
const name = extractExistingName(ref);
return name && blacklist.includes(name);
}
// Function to normalize reference content for comparison
function normalizeRef(ref) {
// Remove name attribute and normalize whitespace for comparison
return ref.replace(/\s*name\s*=\s*(["']?)[^"'\s/>]+(?:\s+[^"'\s/>]+)*\1/i, '')
.replace(/\s*\|\s*/g, '|') // normalize spacing around pipe characters
.replace(/\s+/g, ' ')
.trim();
}
console.log('Starting first pass - collecting references...');
// First pass: collect all references and used names
const matches = findAllReferences(content);
console.log('Found', matches.length, 'reference tags');
// Debug: Show first few matches
console.log('First 5 matches:', matches.slice(0, 5).map(m => m.substring(0, 100) + '...'));
matches.forEach(match => {
if (!isBlacklisted(match)) {
const normalizedRef = normalizeRef(match);
const existingName = extractExistingName(match);
if (existingName) {
usedNames.add(existingName);
// FIXED: Skip named references - they should never be deduplicated or modified
return;
}
// Only process unnamed references for deduplication
if (allRefs[normalizedRef]) {
allRefs[normalizedRef].count++;
allRefs[normalizedRef].instances.push(match);
} else {
allRefs[normalizedRef] = {
count: 1,
name: null, // Ensure unnamed refs don't get existing names
firstOccurrence: match,
instances: [match]
};
}
}
});
console.log('Reference analysis:', Object.keys(allRefs).map(key => ({
ref: key.substring(0, 50) + '...',
count: allRefs[key].count,
hasName: !!allRefs[key].name
})));
// Find duplicates (only among unnamed references)
const duplicates = Object.keys(allRefs).filter(key =>
allRefs[key].count > 1 && !allRefs[key].name // Only unnamed duplicates
);
console.log('Found', duplicates.length, 'duplicate unnamed reference groups');
// Second pass: replace duplicates with named references
let deduplicatedCount = 0;
let processedRefs = new Set();
console.log('Starting second pass - replacing duplicates...');
// Process each duplicate group (only unnamed references)
duplicates.forEach(normalizedRef => {
const refInfo = allRefs[normalizedRef];
// Double-check this is an unnamed reference group
if (refInfo.name) {
console.log('WARNING: Skipping named reference group:', refInfo.name);
return;
}
// Generate a name for this duplicate reference
const generatedName = generateUniqueName(refInfo.firstOccurrence);
if (generatedName && !blacklist.includes(generatedName)) {
refInfo.name = generatedName;
console.log('Generated name for duplicate group:', generatedName);
// Extract content for the first occurrence
const contentMatch = refInfo.firstOccurrence.match(/]*>([\s\S]*?)<\/ref>/i);
if (contentMatch) {
refInfo.firstOccurrence = createRefTag(generatedName, contentMatch[1]);
}
// Replace all instances
refInfo.instances.forEach((instance, index) => {
if (index === 0) {
// First occurrence - replace with named ref
content = content.replace(instance, refInfo.firstOccurrence);
console.log('Replaced first occurrence with named ref');
} else {
// Subsequent occurrences - replace with short ref
content = content.replace(instance, createRefTag(refInfo.name));
deduplicatedCount++;
console.log('Replaced duplicate with short ref:', createRefTag(refInfo.name));
}
});
}
});
console.log('Deduplication complete. Count:', deduplicatedCount);
// Update the textarea with the deduplicated content
if (deduplicatedCount > 0) {
editTextarea.value = content;
// Add edit summary
let currentSummary = summaryInput.value;
let deduplicationSummary = `Deduplicated ${deduplicatedCount} reference${deduplicatedCount > 1 ? 's' : ''} using DeduplicateReferences`;
summaryInput.value = currentSummary ? `${currentSummary} • ${deduplicationSummary}` : deduplicationSummary;
// Check minor edit if available
const minorEditCheckbox = document.querySelector('#wpMinoredit, input[name="wpMinoredit"]');
if (minorEditCheckbox) {
minorEditCheckbox.checked = true;
}
console.log('Successfully deduplicated', deduplicatedCount, 'references');
alert(`Successfully deduplicated ${deduplicatedCount} reference${deduplicatedCount > 1 ? 's' : ''}!`);
} else {
//console.log('No duplicates found to deduplicate');
//alert('No duplicate references found to deduplicate.');
}
}
// Function to check if we're editing the current version (not an old revision)
function isEditingCurrentVersion() {
// Check if there's an oldid parameter in the URL
const urlParams = new URLSearchParams(window.location.search);
const oldid = urlParams.get('oldid');
// If there's an oldid parameter, we're editing an old revision
if (oldid) {
console.log('Editing old revision (oldid=' + oldid + '), skipping deduplication');
return false;
}
// Check if we're in diff view
const diff = urlParams.get('diff');
if (diff) {
console.log('In diff view, skipping deduplication');
return false;
}
// Check for section editing of old revisions
const section = urlParams.get('section');
if (section && oldid) {
console.log('Editing section of old revision, skipping deduplication');
return false;
}
// Additional check: look for revision warning messages
const revisionWarning = document.querySelector('.mw-revision-warning, .mw-editnotice-base');
if (revisionWarning && revisionWarning.textContent.toLowerCase().includes('old revision')) {
console.log('Old revision warning detected, skipping deduplication');
return false;
}
console.log('Editing current version - deduplication allowed');
return true;
}
// Function to check if the edit textarea is ready
function isEditTextareaReady() {
const editTextarea = document.getElementById('wpTextbox1');
const summaryInput = document.getElementById('wpSummary');
return editTextarea && editTextarea.value && summaryInput;
}
// Function to run deduplication when everything is ready
function runDeduplicationWhenReady() {
console.log('Checking if ready...');
if (isEditTextareaReady()) {
console.log('Ready! Checking if editing current version...');
if (isEditingCurrentVersion()) {
console.log('Editing current version - running deduplication...');
deduplicateReferences();
} else {
console.log('Not editing current version - skipping deduplication');
}
} else {
console.log('Not ready yet, retrying...');
setTimeout(runDeduplicationWhenReady, 100);
}
}
// Add a button to manually trigger deduplication
function addDeduplicationButton() {
const toolbar = document.querySelector('.wikiEditor-ui-toolbar') ||
document.querySelector('#wpTextbox1').parentNode;
if (toolbar && !document.getElementById('dedupe-button')) {
const button = document.createElement('button');
button.id = 'dedupe-button';
button.type = 'button';
button.textContent = 'Deduplicate References';
button.style.margin = '5px';
// Only enable button if editing current version
if (isEditingCurrentVersion()) {
button.onclick = deduplicateReferences;
console.log('Added deduplication button (enabled)');
} else {
button.disabled = true;
button.title = 'Deduplication only works when editing the current version of an article';
button.onclick = () => alert('Deduplication only works when editing the current version of an article, not old revisions.');
console.log('Added deduplication button (disabled - old revision)');
}
toolbar.appendChild(button);
}
}
// Run the deduplication when the edit page is fully loaded
if (typeof mw !== 'undefined') {
const action = mw.config.get('wgAction');
const namespace = mw.config.get('wgNamespaceNumber');
if (action === 'edit' && namespace === 0) {
console.log('Article edit page detected, checking version and setting up deduplication...');
if (document.readyState === 'complete') {
runDeduplicationWhenReady();
addDeduplicationButton();
} else {
window.addEventListener('load', () => {
runDeduplicationWhenReady();
addDeduplicationButton();
});
}
} else {
console.log('Not on article edit page - action:', action, 'namespace:', namespace);
}
} else {
console.log('MediaWiki not available');
}
// Also add the button when the page is ready (only for article namespace)
setTimeout(() => {
if (typeof mw !== 'undefined') {
const action = mw.config.get('wgAction');
const namespace = mw.config.get('wgNamespaceNumber');
if (action === 'edit' && namespace === 0) {
addDeduplicationButton();
}
}
}, 2000);
//