initial commit
This commit is contained in:
commit
21711bd5dd
253 changed files with 78415 additions and 0 deletions
1
public/csspageweaver/plugins/frenchTypoRegex/.gitignore
vendored
Normal file
1
public/csspageweaver/plugins/frenchTypoRegex/.gitignore
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
.DS_Store
|
||||
10
public/csspageweaver/plugins/frenchTypoRegex/config.json
Normal file
10
public/csspageweaver/plugins/frenchTypoRegex/config.json
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
{
|
||||
"name": "French Typography Regex",
|
||||
"description": "A collection of regex rules for French text normalization. This file provides a set of regular expressions to clean and standardize French typography. It automatically fixes spacing, quotation marks, apostrophes, ellipses, and common orthotypographic conventions (non-breaking spaces before punctuation, French guillemets, etc.).",
|
||||
"author": ["Julie Blanc"],
|
||||
"license": "MIT License",
|
||||
"version": "1.0",
|
||||
"created": "2025-08-26",
|
||||
"updated": "2025-08-26",
|
||||
"hook": "frenchTypoRegex.js"
|
||||
}
|
||||
169
public/csspageweaver/plugins/frenchTypoRegex/frenchTypoRegex.js
Normal file
169
public/csspageweaver/plugins/frenchTypoRegex/frenchTypoRegex.js
Normal file
|
|
@ -0,0 +1,169 @@
|
|||
/**
|
||||
* @name French Typography Regex
|
||||
* @file A collection of regex rules for French text normalization
|
||||
* @description
|
||||
* This file provides a set of regular expressions to clean and standardize
|
||||
* French typography. It automatically fixes spacing, quotation marks,
|
||||
* apostrophes, ellipses, and common orthotypographic conventions
|
||||
* (non-breaking spaces before punctuation, French guillemets, etc.).
|
||||
*
|
||||
* @author Julie Blanc (contact@julie-blanc.fr)
|
||||
* @created 2025-08-26
|
||||
* @updated 2025-08-26
|
||||
* @see {@link https://gitlab.com/csspageweaver/frenchTypoRegex/ }
|
||||
*/
|
||||
|
||||
import { Handler } from '/csspageweaver/lib/paged.esm.js';
|
||||
|
||||
/**
 * Paged.js handler that normalizes French typography before the content is parsed.
 */
export default class frenchTypoRegex extends Handler {
  constructor(chunker, polisher, caller) {
    super(chunker, polisher, caller);
  }

  /**
   * Replaces right-to-left spans that only wrap an apostrophe with a plain
   * text node, then runs the typographic rules over `content`.
   *
   * @param {ParentNode} content - Root of the content about to be parsed.
   */
  beforeParsed(content) {
    // Project-specific cleanup: some apostrophes arrive wrapped in <span dir="rtl">.
    for (const rtlSpan of content.querySelectorAll('span[dir="rtl"]')) {
      const label = rtlSpan.textContent.trim();
      if (label !== '’' && label !== '"’"') {
        continue;
      }
      rtlSpan.replaceWith(document.createTextNode('’'));
    }

    applyRegex(content);
  }
}
|
||||
|
||||
|
||||
/**
 * Ordered list of typographic rules. Each entry has a `reg` pattern and a
 * `repl` replacement (string or function), both passed to `String.prototype.replace`.
 * Order matters: later rules operate on the output of earlier ones.
 */
const arrayRegexFrenchTypo = [
  {
    // Collapse runs of two or more whitespace characters into one normal space
    reg: /\s{2,}/g,
    repl: " ",
  },
  {
    // Roman-numeral ordinals: XIème -> XIe
    reg: /(X|I|V)ème/g,
    repl: "$1e",
  },
  {
    // Opening French guillemet: a straight quote followed by a non-space character.
    // The captured character must be re-emitted, otherwise it is deleted.
    reg: /"([^\s])/g,
    repl: "«$1",
  },
  {
    // Closing French guillemet: a straight quote preceded by a non-space character.
    // The captured character must be re-emitted, otherwise it is deleted.
    reg: /([^\s])"/g,
    repl: "$1»",
  },
  {
    // No-break space before ; : ? ! »
    reg: /\s+([;:?!»])/g,
    repl: "\u00A0$1",
  },
  {
    // No-break space after «
    reg: /(«)\s+/g,
    repl: "$1\u00A0",
  },
  {
    // Typographic apostrophe between two word characters.
    // Unicode classes are used instead of \w so accented letters (l'été) match too.
    reg: /([\p{L}\p{N}_])'([\p{L}\p{N}_])/gu,
    repl: '$1’$2',
  },
  {
    // Three or more dots -> single ellipsis character
    reg: /\.{3,}/g,
    repl: '\u2026',
  },
  {
    // No-break space after selected two-letter words
    reg: /\b(le|la|ce|on|il|de|du|un|au|en)\s+/gi,
    repl: '$1\u00A0',
  },
  {
    // No-break space after one-letter words
    reg: /\s+([a-zà])\s+/gi,
    repl: ' $1\u00A0',
  },
  {
    // No-break space after a short (2-4 letter) capitalized word that starts a sentence
    reg: /\.\s([A-ZÀ-Ö])([A-Za-zÀ-ÖØ-öø-ÿœŒ]{1,3})\s+/g,
    repl: '. $1$2\u00A0',
  },
  {
    // Delete all whitespace before punctuation ! ? ; : » › ” % € $ ) ] . ,
    reg: /\s+([!?;:»›”%€$)\]\.\,])/g,
    repl: '$1',
  },
  {
    // Narrow no-break space before : ! ? ; » › % € $ (except right after http / https)
    reg: /(?<!https?)[:!?;»›%€$]/g,
    repl: '\u202F$&',
  },
  {
    // Delete all whitespace after « ‹ “ [ (
    reg: /([«‹“\[(])\s+/g,
    repl: '$1',
  },
  {
    // Narrow no-break space after « ‹
    reg: /([«‹])/g,
    repl: '$1\u202F',
  },
  {
    // No-break space before 'siècle'
    reg: /(X|I|V)(er|e)\s+siècle/g,
    repl: '$1$2\u00A0siècle',
  },
  {
    // Narrow no-break space between groups of three digits in numbers
    reg: /(\d)(?=(\d{3})+(?!\d))/g,
    repl: '$1\u202F',
  },
  {
    // No-break space between figure/table/page/chapter etc. and the number that follows
    reg: /(figures?|tables?|planches?|chapitres?|pages?|parties?|sections?|volumes?|vol\.)\s+(\d|I|X|V)/g,
    repl: '$1\u00A0$2',
  },
  {
    // No-break space after "p." or "pp." (bibliographic page references)
    reg: /\b(pp?)\.\s*(\d+)/gi,
    repl: '$1.\u00A0$2',
  },
  {
    // Allow line breaks inside URLs: zero-width space after / - _ . (but not after the initial //)
    reg: /(https?:\/\/[^\s]+)/g,
    repl: (match) => match.replace(/(?<!:)\/(?!\/)|[-_.]/g, "$&\u200B"),
  },
];
|
||||
|
||||
/**
 * Applies every rule of `arrayRegexFrenchTypo`, in order, to each text node
 * found under `content`. Text nodes inside <code> or <pre> are left untouched.
 *
 * @param {Node} content - Root node whose descendant text nodes are rewritten in place.
 */
function applyRegex(content) {
  // Only text nodes are visited; no custom filter is needed.
  const walker = document.createTreeWalker(content, NodeFilter.SHOW_TEXT);

  let node;
  while ((node = walker.nextNode())) {
    // Skip text that lives inside <code> or <pre>
    if (node.parentElement?.closest("code, pre")) continue;

    // Run the whole rule chain on a local string so the DOM is written at
    // most once per text node, and not at all when nothing changed.
    const original = node.textContent;
    const fixed = arrayRegexFrenchTypo.reduce(
      (text, { reg, repl }) => text.replace(reg, repl),
      original
    );

    if (fixed !== original) {
      node.textContent = fixed;
    }
  }
}
|
||||
|
||||
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue