lure-2026/public/csspageweaver/plugins/frenchTypoRegex/frenchTypoRegex.js
2026-01-10 18:33:22 +01:00

169 lines
4.2 KiB
JavaScript
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* @name French Typography Regex
* @file A collection of regex rules for French text normalization
* @description
* This file provides a set of regular expressions to clean and standardize
* French typography. It automatically fixes spacing, quotation marks,
* apostrophes, ellipses, and common orthotypographic conventions
* (non-breaking spaces before punctuation, French guillemets, etc.).
*
* @author Julie Blanc (contact@julie-blanc.fr)
* @created 2025-08-26
* @updated 2025-08-26
* @see {@link https://gitlab.com/csspageweaver/frenchTypoRegex/ }
*/
import { Handler } from '/csspageweaver/lib/paged.esm.js';
/**
 * Handler subclass that cleans up French typography in the content it is
 * given: it first discards some empty right-to-left spans, then runs the
 * regex rule list over the text nodes through `applyRegex`.
 */
export default class frenchTypoRegex extends Handler {
  /**
   * Forwards its three arguments unchanged to the `Handler` constructor.
   */
  constructor(chunker, polisher, caller) {
    super(chunker, polisher, caller);
  }

  /**
   * Hook named `beforeParsed` (presumably invoked by Paged.js before the
   * content is parsed -- confirm against the Handler API).
   *
   * @param {ParentNode} content Root whose descendants are cleaned in place.
   */
  beforeParsed(content) {
    // Project-specific cleanup: a <span dir="rtl"> that holds nothing, or only
    // a pair of straight double quotes, is swapped for an empty text node.
    for (const rtlSpan of content.querySelectorAll('span[dir="rtl"]')) {
      const trimmedText = rtlSpan.textContent.trim();
      const isDisposable = trimmedText === '' || trimmedText === '""';
      if (isDisposable) {
        rtlSpan.replaceWith(document.createTextNode(''));
      }
    }

    applyRegex(content);
  }
}
/**
 * Ordered list of French orthotypography rules.
 *
 * Each entry is `{ reg, repl }` and is meant to be applied with
 * `text.replace(reg, repl)`, one after the other, in array order. The order
 * matters: later rules rely on the output of earlier ones (for example the
 * rules that strip spaces before punctuation run before the rule that
 * inserts a narrow no-break space there).
 *
 * Invisible or easily confused characters are written as `\u` escapes:
 *   \u00A0 no-break space, \u202F narrow no-break space,
 *   \u2019 typographic apostrophe, \u2026 ellipsis, \u200B zero-width space.
 */
const arrayRegexFrenchTypo = [
  {
    // Collapse runs of two or more whitespace characters into one regular space.
    reg: /\s{2,}/g,
    repl: " ",
  },
  {
    // Roman-numeral ordinals: "XIème" -> "XIe".
    reg: /(X|I|V)ème/g,
    repl: "$1e",
  },
  {
    // Opening French guillemet. A straight quote only opens when it starts the
    // text or follows whitespace, an opening bracket, an apostrophe or a dash,
    // and is followed by a non-space character. The prefix is captured and put
    // back; the following character is only looked at, never consumed.
    reg: /(^|[\s(\[{'\u2019\u2013\u2014-])"(?=\S)/g,
    repl: "$1«",
  },
  {
    // Closing French guillemet: any remaining straight quote that directly
    // follows a non-space character. The captured character is re-emitted.
    reg: /(\S)"/g,
    repl: "$1»",
  },
  {
    // No-break space before ; : ? ! »
    reg: /\s+([;:?!»])/g,
    repl: "\u00A0$1",
  },
  {
    // No-break space after «
    reg: /(«)\s+/g,
    repl: "$1\u00A0",
  },
  {
    // Typographic apostrophe between two letters/digits. Unicode classes are
    // used because ASCII-only \w does not match accented letters ("l'été").
    // The right-hand side is a lookahead so chained elisions are all handled.
    reg: /([\p{L}\p{N}_])'(?=[\p{L}\p{N}_])/gu,
    repl: '$1\u2019',
  },
  {
    // Three or more dots become a real ellipsis character.
    reg: /\.{3,}/g,
    repl: '\u2026',
  },
  {
    // No-break space after some two-letter words. The lookbehind is a
    // Unicode-aware word boundary: ASCII \b would treat the "le" of "modèle"
    // as a separate word because "è" is not an ASCII word character.
    reg: /(?<![\p{L}\p{N}_])(le|la|ce|on|il|de|du|un|au|en)\s+/giu,
    repl: '$1\u00A0',
  },
  {
    // No-break space after one-letter words.
    reg: /\s+([a-zà])\s+/gi,
    repl: ' $1\u00A0',
  },
  {
    // No-break space after a short first word (2 to 4 letters) of a sentence.
    reg: /\.\s([A-ZÀ-Ö])([A-Za-zÀ-ÖØ-öø-ÿœŒ]{1,3})\s+/g,
    repl: '. $1$2\u00A0',
  },
  {
    // Delete all spaces before punctuation ! ? ; : » › ” % € $ ) ] . ,
    reg: /\s+([!?;:»›”%€$)\]\.\,])/g,
    repl: '$1',
  },
  {
    // Narrow no-break space before : ! ? ; » › % € $.
    // The first alternative swallows a whole URL (minus any trailing
    // punctuation from the set) and the callback returns it untouched, so
    // query strings, ports and percent-escapes are not split. The lookbehind
    // on the second alternative keeps a bare "http:" / "https:" intact.
    reg: /https?:\/\/\S*[^\s:!?;»›%€$]|(?<!https?)[:!?;»›%€$]/g,
    repl: (match) => (match.length > 1 ? match : `\u202F${match}`),
  },
  {
    // Delete all spaces after « ‹ “ [ (
    reg: /([«‹“\[(])\s+/g,
    repl: '$1',
  },
  {
    // Narrow no-break space after « ‹
    reg: /([«‹])/g,
    repl: '$1\u202F',
  },
  {
    // No-break space before "siècle" after a Roman-numeral ordinal.
    reg: /(X|I|V)(er|e)\s+siècle/g,
    repl: '$1$2\u00A0siècle',
  },
  {
    // Thousands separator (narrow no-break space) inside runs of digits.
    // URLs are matched first and returned unchanged so numeric path segments
    // and identifiers are not broken up.
    // NOTE(review): four-digit years are grouped as well ("2025" -> "2 025"),
    // as in the previous version of this rule.
    reg: /https?:\/\/\S+|(\d)(?=(\d{3})+(?!\d))/g,
    repl: (match, digit) => (digit === undefined ? match : `${digit}\u202F`),
  },
  {
    // No-break space between figure/table/plate/chapter/page/part/section/
    // volume (etc.) and the number that follows.
    reg: /(figures?|tables?|planches?|chapitres?|pages?|parties?|sections?|volumes?|vol\.)\s+(\d|I|X|V)/g,
    repl: '$1\u00A0$2',
  },
  {
    // No-break space after "p." or "pp." in bibliographic references.
    // NOTE(review): \s* also matches when there is no space at all, so a
    // "p.12" fragment inside a URL would be affected too.
    reg: /\b(pp?)\.\s*(\d+)/gi,
    repl: '$1.\u00A0$2',
  },
  {
    // Allow line breaks inside URLs: append a zero-width space after "-", "_",
    // "." and after every "/" that neither follows ":" nor precedes "/".
    reg: /(https?:\/\/[^\s]+)/g,
    repl: (match) => match.replace(/(?<!:)\/(?!\/)|[-_.]/g, "$&\u200B"),
  },
];
function applyRegex(content) {
// TREE WALKER
const walker = document.createTreeWalker(
content,
NodeFilter.SHOW_TEXT,
null,
false
);
let node;
while ((node = walker.nextNode())) {
// Exclure <code> ou <pre>
const codeParent = node.parentElement?.closest("code, pre");
if (codeParent) continue;
// APPLYREGEXFROMARRAY
for (let i = 0; i < arrayRegexFrenchTypo.length; i++) {
node.textContent = node.textContent.replace(arrayRegexFrenchTypo[i].reg, arrayRegexFrenchTypo[i].repl);
}
}
}