lure-2026/public/csspageweaver/plugins/frenchTypoRegex/frenchTypoRegex.js

170 lines
4.2 KiB
JavaScript
Raw Normal View History

2026-01-10 18:33:22 +01:00
/**
* @name French Typography Regex
* @file A collection of regex rules for French text normalization
* @description
* This file provides a set of regular expressions to clean and standardize
* French typography. It automatically fixes spacing, quotation marks,
* apostrophes, ellipses, and common orthotypographic conventions
* (non-breaking spaces before punctuation, French guillemets, etc.).
*
* @author Julie Blanc (contact@julie-blanc.fr)
* @created 2025-08-26
* @updated 2025-08-26
* @see {@link https://gitlab.com/csspageweaver/frenchTypoRegex/ }
*/
import { Handler } from '/csspageweaver/lib/paged.esm.js';
/**
 * Handler subclass (Handler comes from paged.esm.js) that cleans up the
 * content in its `beforeParsed` hook and then runs the French typography
 * rules over it.
 */
export default class frenchTypoRegex extends Handler {
  constructor(chunker, polisher, caller) {
    super(chunker, polisher, caller);
  }

  /**
   * Drop placeholder right-to-left spans, then apply the typography rules.
   *
   * @param {ParentNode} content - Root node whose descendants are processed in place.
   */
  beforeParsed(content) {
    // Project-specific cleanup: a `span[dir="rtl"]` whose trimmed text is
    // empty or exactly `""` is swapped for an empty text node.
    // NOTE(review): the original comment said "delete span with apostroph",
    // yet neither compared literal contains an apostrophe — confirm intent.
    for (const rtlSpan of content.querySelectorAll('span[dir="rtl"]')) {
      const trimmed = rtlSpan.textContent.trim();
      const isPlaceholder = trimmed === '' || trimmed === '""';
      if (isPlaceholder) {
        rtlSpan.replaceWith(document.createTextNode(''));
      }
    }

    applyRegex(content);
  }
}
/**
 * Ordered list of French typography rules.
 *
 * Each entry is `{ reg, repl }` and is meant to be passed to
 * `String.prototype.replace(reg, repl)`; `repl` is either a replacement
 * pattern string or a replacer function. Order matters: later rules work on
 * the output of earlier ones (for instance, the "delete spaces before
 * punctuation" rule runs before the one that inserts the narrow no-break
 * space, and the URL rules expect URLs to still be intact).
 *
 * Code points used: U+00A0 no-break space, U+202F narrow no-break space,
 * U+2019 typographic apostrophe, U+2026 ellipsis, U+200B zero-width space.
 */
const arrayRegexFrenchTypo = [
  {
    // Collapse runs of two or more whitespace characters into one plain space.
    reg: /\s{2,}/g,
    repl: " ",
  },
  {
    // Roman-numeral ordinals: XIème -> XIe.
    reg: /(X|I|V)ème/g,
    repl: "$1e",
  },
  {
    // Opening French guillemet. The character following the straight quote
    // is captured only to detect "opening" position, so it must be put back.
    reg: /"([^\s])/g,
    repl: "«$1",
  },
  {
    // Closing French guillemet. Same as above: keep the captured character.
    reg: /([^\s])"/g,
    repl: "$1»",
  },
  {
    // No-break space before ; : ? ! »
    reg: /\s+([;:?!»])/g,
    repl: "\u00A0$1",
  },
  {
    // No-break space after «
    reg: /(«)\s+/g,
    repl: "$1\u00A0",
  },
  {
    // Typographic apostrophe between two word characters. Unicode classes
    // are used so accented letters count too (l'été, d'Émile).
    reg: /([\p{L}\p{N}_])'([\p{L}\p{N}_])/gu,
    repl: "$1\u2019$2",
  },
  {
    // Three or more dots become a real ellipsis character.
    reg: /\.{3,}/g,
    repl: "\u2026",
  },
  {
    // No-break space after some two-letter words. A Unicode-aware lookbehind
    // replaces `\b`, which treats accented letters as non-word characters
    // and therefore matched the "la" inside "révéla".
    reg: /(?<![\p{L}\p{N}_])(le|la|ce|on|il|de|du|un|au|en)\s+/giu,
    repl: "$1\u00A0",
  },
  {
    // No-break space after one-letter words.
    reg: /\s+([a-zà])\s+/gi,
    repl: " $1\u00A0",
  },
  {
    // No-break space after a short (2-4 letter) first word of a sentence.
    reg: /\.\s([A-ZÀ-Ö])([A-Za-zÀ-ÖØ-öø-ÿœŒ]{1,3})\s+/g,
    repl: ". $1$2\u00A0",
  },
  {
    // Delete every space before the punctuation ! ? ; : » › ” % € $ ) ] . ,
    reg: /\s+([!?;:»›”%€$)\]\.\,])/g,
    repl: "$1",
  },
  {
    // Narrow no-break space before : ! ? ; » › % € $
    // The first alternative consumes a whole URL (minus trailing punctuation)
    // and returns it untouched, so query strings, ports and percent-escapes
    // are not altered. The lookbehind still protects a bare "http:"/"https:".
    reg: /(https?:\/\/\S*[^\s:!?;»›%€$.,)\]])|(?<!https?)[:!?;»›%€$]/g,
    repl: (match, url) => (url ? match : `\u202F${match}`),
  },
  {
    // Delete every space after « ‹ “ [ (
    reg: /([«‹“\[(])\s+/g,
    repl: "$1",
  },
  {
    // Narrow no-break space after « ‹
    reg: /([«‹])/g,
    repl: "$1\u202F",
  },
  {
    // No-break space before "siècle" after a Roman-numeral ordinal.
    reg: /(X|I|V)(er|e)\s+siècle/g,
    repl: "$1$2\u00A0siècle",
  },
  {
    // Thousands separator (narrow no-break space) inside digit runs.
    // URLs are matched first and returned untouched.
    // NOTE(review): four-digit numbers are grouped too, so a year such as
    // 2025 becomes "2 025" — confirm that this is intended.
    reg: /(https?:\/\/\S+)|(\d)(?=(?:\d{3})+(?!\d))/g,
    repl: (match, url) => (url ? match : `${match}\u202F`),
  },
  {
    // No-break space between figure/table/plate/chapter/page/... and its number.
    reg: /(figures?|tables?|planches?|chapitres?|pages?|parties?|sections?|volumes?|vol\.)\s+(\d|I|X|V)/g,
    repl: "$1\u00A0$2",
  },
  {
    // No-break space after "p." or "pp." in bibliographic references.
    reg: /\b(pp?)\.\s*(\d+)/gi,
    repl: "$1.\u00A0$2",
  },
  {
    // Allow line breaks inside URLs: add a zero-width space after "-", "_",
    // "." and after any "/" that is neither preceded by ":" nor followed by "/".
    reg: /(https?:\/\/[^\s]+)/g,
    repl: (match) => match.replace(/(?<!:)\/(?!\/)|[-_.]/g, "$&\u200B"),
  },
];
/**
 * Run every rule of `arrayRegexFrenchTypo`, in order, on each text node
 * found under `content`, rewriting the nodes in place.
 *
 * Rules are applied per text node, so a pattern cannot match across an
 * element boundary.
 *
 * @param {Node} content - Root of the subtree to process.
 */
function applyRegex(content) {
  // Text under these elements is source code, not prose: inserting no-break
  // spaces before ":" or ";" there would corrupt it.
  const skippedAncestors = "code, pre, script, style";

  const walker = document.createTreeWalker(content, NodeFilter.SHOW_TEXT);

  let node;
  while ((node = walker.nextNode())) {
    // parentElement is null for a text node that sits directly in a fragment.
    if (node.parentElement?.closest(skippedAncestors)) continue;

    const original = node.textContent;
    let text = original;
    for (const { reg, repl } of arrayRegexFrenchTypo) {
      text = text.replace(reg, repl);
    }

    // Touch the DOM once per node, and only when something actually changed.
    if (text !== original) {
      node.textContent = text;
    }
  }
}