initial commit

2026-01-10 18:33:22 +01:00 · 2026-01-10 18:33:22 +01:00 · 21711bd5dd
commit 21711bd5dd
253 changed files with 78415 additions and 0 deletions
--- a/public/csspageweaver/plugins/frenchTypoRegex/.gitignore
+++ b/public/csspageweaver/plugins/frenchTypoRegex/.gitignore
@ -0,0 +1 @@
+.DS_Store
--- a/public/csspageweaver/plugins/frenchTypoRegex/config.json
+++ b/public/csspageweaver/plugins/frenchTypoRegex/config.json
@ -0,0 +1,10 @@
+{
+	"name": "French Typography Regex",
+	"description": "A collection of regex rules for French text normalization. This file provides a set of regular expressions to clean and standardize French typography. It automatically fixes spacing, quotation marks, apostrophes, ellipses, and common orthotypographic conventions (non-breaking spaces before punctuation, French guillemets, etc.).",
+	"author": ["Julie Blanc"],
+	"license": "MIT License",
+	"version": "1.0",
+	"created": "2025-08-26",
+	"updated": "2025-08-26",
+	"hook": "frenchTypoRegex.js"
+}
--- a/public/csspageweaver/plugins/frenchTypoRegex/frenchTypoRegex.js
+++ b/public/csspageweaver/plugins/frenchTypoRegex/frenchTypoRegex.js
@ -0,0 +1,169 @@
+/**
+ * @name French Typography Regex
+ * @file A collection of regex rules for French text normalization
+ * @description 
+ *  This file provides a set of regular expressions to clean and standardize 
+ *  French typography. It automatically fixes spacing, quotation marks, 
+ *  apostrophes, ellipses, and common orthotypographic conventions 
+ *  (non-breaking spaces before punctuation, French guillemets, etc.).
+ * 
+ * @author Julie Blanc (contact@julie-blanc.fr)
+ * @created 2025-08-26
+ * @updated 2025-08-26
+ * @see {@link https://gitlab.com/csspageweaver/frenchTypoRegex/ }
+ */
+
+import { Handler } from '/csspageweaver/lib/paged.esm.js';
+
+/**
+ * Handler subclass that cleans up apostrophe-only right-to-left spans and then
+ * runs the French typography rules over the content.
+ */
+export default class frenchTypoRegex extends Handler {
+
+  constructor(chunker, polisher, caller) {
+    super(chunker, polisher, caller);
+  }
+
+  /**
+   * Normalizes the typography of `content` in place.
+   * Presumably called by Paged.js before the content is parsed (hook name) —
+   * confirm against the Handler documentation.
+   *
+   * @param {ParentNode} content - Root node whose descendants are processed.
+   */
+  beforeParsed(content) {
+    // Source-specific cleanup: a <span dir="rtl"> whose trimmed text is only an
+    // apostrophe (bare or wrapped in straight double quotes) becomes a plain
+    // text node holding the typographic apostrophe.
+    const apostropheOnlyTexts = ['’', '"’"'];
+
+    for (const rtlSpan of content.querySelectorAll('span[dir="rtl"]')) {
+      const trimmedText = rtlSpan.textContent.trim();
+      if (apostropheOnlyTexts.includes(trimmedText)) {
+        rtlSpan.replaceWith(document.createTextNode('’'));
+      }
+    }
+
+    applyRegex(content);
+  }
+
+}
+
+
+/**
+ * Ordered French-typography substitution rules.
+ *
+ * Each entry holds `reg`, a global RegExp, and `repl`, the replacement string
+ * or replacer function handed to String.prototype.replace. The rules are meant
+ * to be applied one after another in array order: later rules build on the
+ * output of earlier ones (for instance, whitespace before punctuation is first
+ * removed, then re-inserted as a narrow no-break space).
+ */
+const arrayRegexFrenchTypo = [
+  {
+    // Collapse runs of two or more whitespace characters into one regular space
+    reg: /\s{2,}/g,
+    repl: " ",
+  },
+  {
+    // Roman-numeral ordinals: XIème -> XIe
+    reg: /(X|I|V)ème/g,
+    repl: "$1e",
+  },
+  {
+    // Opening French guillemet: a straight quote directly followed by a
+    // non-space character. The captured character must be re-emitted ($1),
+    // otherwise the first letter of the quotation is lost.
+    reg: /"([^\s])/g,
+    repl: "«$1"
+  },
+  {
+    // Closing French guillemet: a straight quote directly preceded by a
+    // non-space character, which is re-emitted ($1) for the same reason.
+    reg: /([^\s])"/g,
+    repl: "$1»"
+  },
+  {
+    // No-break space before ; : ? ! »
+    reg: /\s+([;:?!»])/g,
+    repl: "\u00A0$1",
+  },
+  {
+    // No-break space after «
+    reg: /(«)\s+/g,
+    repl: "$1\u00A0",
+  },
+  {
+    // Typographic apostrophe between two word characters. Unicode classes are
+    // used because \w is ASCII-only and would miss "l'été" or "qu'à".
+    reg: /([\p{L}\p{N}_])'([\p{L}\p{N}_])/gu,
+    repl: '$1’$2'
+  },
+  {
+    // Real ellipsis character instead of three or more dots
+    reg: /\.{3,}/g,
+    repl: '\u2026'
+  },
+  {
+    // No-break space after some two-letter words. The Unicode-aware lookbehind
+    // replaces \b, which treats accented letters as word boundaries and so
+    // matched the "le" of "modèle" or the "ce" of "pièce".
+    reg: /(?<![\p{L}\p{N}_])(le|la|ce|on|il|de|du|un|au|en)\s+/giu,
+    repl: '$1\u00A0'
+  },
+  {
+    // No-break space after one-letter words. The preceding whitespace is only
+    // asserted, not consumed, so a no-break space inserted by the previous rule
+    // is kept ("il y a") and consecutive one-letter words are all handled.
+    reg: /(?<=\s)([a-zà])\s+/gi,
+    repl: '$1\u00A0'
+  },
+  {
+    // No-break space after the first word (2 to 4 letters) of a sentence
+    reg: /\.\s([A-ZÀ-Ö])([A-Za-zÀ-ÖØ-öø-ÿœŒ]{1,3})\s+/g,
+    repl: '. $1$2\u00A0'
+  },
+  {
+    // Delete all whitespace before punctuation ! ? ; : » › ” % € $ ) ] . ,
+    reg: /\s+([!?;:»›”%€$)\]\.\,])/g,
+    repl: '$1'
+  },
+  {
+    // Narrow no-break space before : ! ? ; » › % € $ — the lookbehind leaves
+    // the colon of "http:" / "https:" alone.
+    // NOTE(review): other punctuation inside a URL (?, %, ;) is not protected.
+    reg: /(?<!https?)[:!?;»›%€$]/g,
+    repl: '\u202F$&'
+  },
+  {
+    // Delete all whitespace after « ‹ “ [ (
+    reg: /([«‹“\[(])\s+/g,
+    repl: '$1'
+  },
+  {
+    // Narrow no-break space after « ‹
+    reg: /([«‹])/g,
+    repl: '$1\u202F'
+  },
+  {
+    // No-break space before "siècle" after a Roman-numeral ordinal
+    reg: /(X|I|V)(er|e)\s+siècle/g,
+    repl: '$1$2\u00A0siècle'
+  },
+  {
+    // Thousands grouping: narrow no-break space between groups of three digits.
+    // NOTE(review): this also applies to four-digit years and to digit runs
+    // inside URLs.
+    reg: /(\d)(?=(\d{3})+(?!\d))/g,
+    repl: '$1\u202F'
+  },
+  {
+    // No-break space between figure/table/plate/chapter/page/etc. and its number
+    reg: /(figures?|tables?|planches?|chapitres?|pages?|parties?|sections?|volumes?|vol\.)\s+(\d|I|X|V)/g,
+    repl: '$1\u00A0$2'
+  },
+  {
+    // No-break space after "p." or "pp." in bibliographic references
+    reg: /\b(pp?)\.\s*(\d+)/gi,
+    repl: '$1.\u00A0$2'
+  },
+  {
+    // Break opportunities in URLs: a zero-width space is appended after each
+    // - _ . and after slashes, except that none is inserted between the two
+    // slashes that follow the scheme colon.
+    reg: /(https?:\/\/[^\s]+)/g,
+    repl: (match) => match.replace(/(?<!:)\/(?!\/)|[-_.]/g, "$&\u200B")
+  },
+];
+
+/**
+ * Applies every rule of `arrayRegexFrenchTypo`, in array order, to the text
+ * nodes found under `content`.
+ *
+ * Text located inside <code>, <pre>, <script>, <style> or <textarea> is left
+ * untouched: it is not prose, and the punctuation rules (spaces inserted before
+ * ":" and ";") would corrupt embedded CSS or JavaScript.
+ *
+ * @param {Node} content - Root of the subtree to process; its text nodes are
+ *   modified in place.
+ * @returns {void}
+ */
+function applyRegex(content) {
+  // Ancestors whose text must not be rewritten
+  const skippedAncestors = "code, pre, script, style, textarea";
+
+  const walker = document.createTreeWalker(content, NodeFilter.SHOW_TEXT);
+
+  let node;
+  while ((node = walker.nextNode())) {
+    if (node.parentElement?.closest(skippedAncestors)) {
+      continue;
+    }
+
+    // Run the whole rule chain on a local string first
+    const originalText = node.textContent;
+    let text = originalText;
+    for (const { reg, repl } of arrayRegexFrenchTypo) {
+      text = text.replace(reg, repl);
+    }
+
+    // One DOM write per node, and only when a rule actually changed the text
+    if (text !== originalText) {
+      node.textContent = text;
+    }
+  }
+}
+
+
+