lure-2026/public/csspageweaver/plugins/frenchTypoRegex/frenchTypoRegex.js

/**
 * @name French Typography Regex
 * @file A collection of regex rules for French text normalization
 * @description 
 *  This file provides a set of regular expressions to clean and standardize 
 *  French typography. It automatically fixes spacing, quotation marks, 
 *  apostrophes, ellipses, and common orthotypographic conventions 
 *  (non-breaking spaces before punctuation, French guillemets, etc.).
 * 
 * @author Julie Blanc (contact@julie-blanc.fr)
 * @created 2025-08-26
 * @updated 2025-08-26
 * @see {@link https://gitlab.com/csspageweaver/frenchTypoRegex/ }
 */

import { Handler } from '/csspageweaver/lib/paged.esm.js';

/**
 * Paged.js handler that normalizes French typography in the content
 * before it is parsed.
 */
export default class frenchTypoRegex extends Handler {

  constructor(chunker, polisher, caller) {
    super(chunker, polisher, caller);
  }

  /**
   * Unwraps right-to-left spans that only hold a typographic apostrophe,
   * then runs the typographic rules over the whole content.
   *
   * @param {DocumentFragment|Element} content - Root queried for spans and handed to applyRegex.
   */
  beforeParsed(content) {
    const apostrophe = '’';
    // Trimmed span texts that stand for a lone apostrophe.
    const apostropheForms = [apostrophe, '"’"'];

    // Document-specific clean-up: such spans are swapped for a plain text node.
    for (const rtlSpan of content.querySelectorAll('span[dir="rtl"]')) {
      const label = rtlSpan.textContent.trim();
      if (apostropheForms.includes(label)) {
        rtlSpan.replaceWith(document.createTextNode(apostrophe));
      }
    }

    applyRegex(content);
  }

}


/**
 * Ordered list of French typography rules.
 *
 * Each entry has a `reg` (global regular expression) and a `repl`
 * (replacement string or replacer function) meant to be passed to
 * `String.prototype.replace`. Rules are order-dependent: later rules
 * rework the output of earlier ones (for instance, spaces placed before
 * punctuation are stripped and re-inserted as narrow no-break spaces).
 *
 * Code points used: U+00A0 no-break space, U+202F narrow no-break space,
 * U+2026 horizontal ellipsis, U+200B zero-width space.
 */
const arrayRegexFrenchTypo = [
  {
    // Collapse runs of two or more whitespace characters into one normal space
    reg: /\s{2,}/g,
    repl: " ",
  },
  {
    // Roman ordinal: XIème -> XIe
    reg: /(X|I|V)ème/g,
    repl: "$1e",
  },
  {
    // Opening French guillemet: a straight quote followed by a non-space.
    // The captured character must be re-emitted, otherwise it is lost.
    reg: /"([^\s])/g,
    repl: "«$1"
  },
  {
    // Closing French guillemet: a straight quote preceded by a non-space.
    // The captured character must be re-emitted, otherwise it is lost.
    reg: /([^\s])"/g,
    repl: "$1»"
  },
  {
    // No-break space before ; : ? ! »
    reg: /\s+([;:?!»])/g,
    repl: "\u00A0$1",
  },
  {
    // No-break space after «
    reg: /(«)\s+/g,
    repl: "$1\u00A0",
  },
  {
    // Typographic apostrophe between two letters/digits.
    // Unicode-aware lookarounds so accented letters count (l'été, d'être)
    // and adjacent apostrophes sharing a letter are all converted.
    reg: /(?<=[\p{L}\p{N}_])'(?=[\p{L}\p{N}_])/gu,
    repl: '’'
  },
  {
    // Real ellipsis character for three or more dots
    reg: /\.{3,}/g,
    repl: '\u2026'
  },
  {
    // No-break space after some two-letter words.
    // The Unicode-aware lookbehind replaces \b, which treats accented
    // letters as boundaries and would match the "le" of "modèle".
    reg: /(?<![\p{L}\p{N}_])(le|la|ce|on|il|de|du|un|au|en)\s+/giu,
    repl: '$1\u00A0'
  },
  {
    // No-break space after one-letter words
    reg: /\s+([a-zà])\s+/gi,
    repl: ' $1\u00A0'
  },
  {
    // No-break space after a 2-4 letter capitalized word that starts a sentence
    reg: /\.\s([A-ZÀ-Ö])([A-Za-zÀ-ÖØ-öø-ÿœŒ]{1,3})\s+/g,
    repl: '. $1$2\u00A0'
  },
  {
    // Delete all spaces before punctuation !?;:»›”%€$)].,
    reg: /\s+([!?;:»›”%€$)\]\.\,])/g,
    repl: '$1'
  },
  {
    // Narrow no-break space before : ! ? ; » › % € $
    // (the colon directly following "http" or "https" is left alone)
    reg: /(?<!https?)[:!?;»›%€$]/g,
    repl: '\u202F$&'
  },
  {
    // Delete all spaces after « ‹ “ [ (
    reg: /([«‹“\[(])\s+/g,
    repl: '$1'
  },
  {
    // Narrow no-break space after « ‹
    reg: /([«‹])/g,
    repl: '$1\u202F'
  },
  {
    // No-break space before "siècle" after a Roman ordinal
    reg: /(X|I|V)(er|e)\s+siècle/g,
    repl: '$1$2\u00A0siècle'
  },
  {
    // Narrow no-break space as thousands separator in digit runs
    reg: /(\d)(?=(\d{3})+(?!\d))/g,
    repl: '$1\u202F'
  },
  {
    // No-break space between figure/table/page/chapter etc. and its number
    reg: /(figures?|tables?|planches?|chapitres?|pages?|parties?|sections?|volumes?|vol\.)\s+(\d|I|X|V)/g,
    repl: '$1\u00A0$2'
  },
  {
    // No-break space after "p." or "pp." followed by a number (bibliography)
    reg: /\b(pp?)\.\s*(\d+)/gi,
    repl: '$1.\u00A0$2'
  },
  {
    // Allow line breaks inside URLs: zero-width space after - _ . and after
    // any "/" that is neither directly after ":" nor directly before "/"
    reg: /(https?:\/\/[^\s]+)/g,
    repl: (match) => match.replace(/(?<!:)\/(?!\/)|[-_.]/g, "$&\u200B")
  },
];

/**
 * Applies every rule of `arrayRegexFrenchTypo`, in order, to each text node
 * found under `content`.
 *
 * Text nodes located inside <code>, <pre>, <script> or <style> are left
 * untouched, since rewriting punctuation and spaces there would corrupt
 * source code or style sheets.
 *
 * @param {Node} content - Root node whose descendant text nodes are rewritten in place.
 */
function applyRegex(content) {
  const walker = document.createTreeWalker(content, NodeFilter.SHOW_TEXT);

  let node;
  while ((node = walker.nextNode())) {
    // Skip text that is not prose.
    if (node.parentElement?.closest("code, pre, script, style")) continue;

    // Run the whole rule chain on a local string so the DOM is written at
    // most once per node, and only when something actually changed.
    const original = node.textContent;
    let text = original;
    for (const { reg, repl } of arrayRegexFrenchTypo) {
      text = text.replace(reg, repl);
    }

    if (text !== original) {
      node.textContent = text;
    }
  }
}