popnoire/site/OFF_plugins/typography/php-typography/class-php-typography.php
2026-02-12 15:22:46 +01:00

3386 lines
130 KiB
PHP

<?php
/**
* This file is part of wp-Typography.
*
* Copyright 2014-2016 Peter Putzer.
* Copyright 2012-2013 Marie Hogebrandt.
* Copyright 2009-2011 KINGdesk, LLC.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* ***
*
* @package wpTypography/PHPTypography
* @license http://www.gnu.org/licenses/gpl-2.0.html
*/
namespace PHP_Typography;
/**
* A few utility functions.
*/
require_once __DIR__ . '/php-typography-functions.php'; // @codeCoverageIgnore
/**
* HTML5-PHP - a DOM-based HTML5 parser
*/
require_once dirname( __DIR__ ) . '/vendor/Masterminds/HTML5.php'; // @codeCoverageIgnore
require_once dirname( __DIR__ ) . '/vendor/Masterminds/HTML5/autoload.php'; // @codeCoverageIgnore
/**
* Parses HTML5 (or plain text) and applies various typographic fixes to the text.
*
* If used with multibyte language, UTF-8 encoding is required.
*
* Portions of this code have been inspired by:
* - typogrify (https://code.google.com/p/typogrify/)
* - WordPress code for wptexturize (https://developer.wordpress.org/reference/functions/wptexturize/)
* - PHP SmartyPants Typographer (https://michelf.ca/projects/php-smartypants/typographer/)
*
* @author Jeffrey D. King <jeff@kingdesk.com>
* @author Peter Putzer <github@mundschenk.at>
*/
class PHP_Typography {
/**
* A hashmap for various special characters.
*
* @var array
*/
public $chr = array();
/**
* A hashmap of settings for the various typographic options.
*
* @var array
*/
public $settings = array();
/**
* A custom parser for \DOMText to separate words, whitespace etc. for HTML injection.
*
* @var Parse_Text
*/
private $text_parser;
/**
* A DOM-based HTML5 parser.
*
* @var \Masterminds\HTML5
*/
private $html5_parser;
/**
* An array containing all self-closing HTML5 tags.
*
* @var array
*/
private $self_closing_tags = array();
/**
* An array of tags we should never touch.
*
* @var array
*/
private $inappropriate_tags = array();
/**
* An array of ( $tag => true ) for quick checking with `isset`.
*
* @var array
*/
private $heading_tags = array( 'h1' => true, 'h2' => true, 'h3' => true, 'h4' => true, 'h5' => true, 'h6' => true );
/**
* An array of encodings in detection order.
*
* @var array
*/
private $encodings = array( 'ASCII', 'UTF-8' );
/**
* A hash map for string functions according to encoding.
*
* @var array $encoding => array( 'strlen' => $function_name, ... ).
*/
private $str_functions = array(
'UTF-8' => array(),
'ASCII' => array(),
false => array(),
);
/**
* An array of various regex components (not complete patterns).
*
* @var array $components
*/
private $components = array();
/**
* An array of regex patterns.
*
* @var array $regex
*/
private $regex = array();
/**
* An array in the form of [ '$style' => [ 'open' => $chr, 'close' => $chr ] ]
*
* @var array
*/
private $quote_styles = array();
/**
* An array in the form of [ '$style' => [ 'parenthetical' => $chr, 'interval' => $chr ] ]
*
* @var array
*/
private $dash_styles = array();
/**
* An array in the form of [ '$tag' => true ]
*
* @var array
*/
private $block_tags = array();
/**
* An array of CSS classes that are added for ampersands, numbers etc that can be overridden in a subclass.
*
* @var array
*/
protected $css_classes = array(
'caps' => 'caps',
'numbers' => 'numbers',
'amp' => 'amp',
'quo' => 'quo',
'dquo' => 'dquo',
'pull-single' => 'pull-single',
'pull-double' => 'pull-double',
'push-single' => 'push-single',
'push-double' => 'push-double',
'numerator' => 'numerator',
'denominator' => 'denominator',
'ordinal' => 'ordinal',
);
/**
* Set up a new PHP_Typography object.
*
* @param boolean $set_defaults If true, set default values for various properties. Defaults to true.
* @param string $init Flag to control initialization. Valid inputs are 'now' and 'lazy'. Optional. Default 'now'.
*/
function __construct( $set_defaults = true, $init = 'now' ) {
    // ASCII has to be first to have a chance at detection.
    mb_detect_order( $this->encodings );

    // Make UTF-8 the default encoding for all mb_* functions. The return value
    // (a success flag) is deliberately ignored - nothing here depends on it.
    mb_internal_encoding( 'UTF-8' );

    // Any value other than 'now' (e.g. 'lazy') leaves the object uninitialized;
    // the caller is then expected to call init() or load_state() itself.
    if ( 'now' === $init ) {
        $this->init( $set_defaults );
    }
}
/**
* Load the given state.
*
* @param array $state The state array. Has to contain 'block_tags', 'chr', 'quote_styles', 'dash_styles', 'str_functions',
* 'components', 'regex', 'self_closing_tags', 'inappropriate_tags', 'css_classes', 'settings'.
* @return boolean True if successful, false if $state is incomplete.
*/
function load_state( $state ) {
    // Every required key of the state array doubles as the name of the
    // property it restores. Keep this list in sync with save_state().
    $required_keys = array(
        'block_tags',
        'chr',
        'quote_styles',
        'dash_styles',
        'str_functions',
        'components',
        'regex',
        'self_closing_tags',
        'inappropriate_tags',
        'css_classes',
        'settings',
    );

    // Validate everything up front so that an incomplete state (including one
    // lacking 'dash_styles') never leaves the object half-restored.
    foreach ( $required_keys as $key ) {
        if ( ! isset( $state[ $key ] ) ) {
            return false;
        }
    }

    foreach ( $required_keys as $key ) {
        $this->$key = $state[ $key ];
    }

    return true;
}
/**
* Retrieves the current state of the PHP_Typography object for caching.
*
* @return array The state array.
*/
function save_state() {
    // Names of the properties that make up the cacheable state; each one is
    // exported under a key identical to the property name.
    $exported_properties = array(
        'block_tags',
        'chr',
        'quote_styles',
        'dash_styles',
        'str_functions',
        'components',
        'regex',
        'self_closing_tags',
        'inappropriate_tags',
        'css_classes',
        'settings',
    );

    $snapshot = array();
    foreach ( $exported_properties as $property ) {
        $snapshot[ $property ] = $this->$property;
    }

    return $snapshot;
}
/**
* Initialize the PHP_Typography object.
*
* @param boolean $set_defaults If true, set default values for various properties. Defaults to true.
*/
function init( $set_defaults = true ) {
    // Elements the HTML5 parser itself classifies as block-level.
    $parser_block_tags = array_filter(
        array_keys( \Masterminds\HTML5\Elements::$html5 ),
        function( $tag ) {
            return \Masterminds\HTML5\Elements::isA( $tag, \Masterminds\HTML5\Elements::BLOCK_TAG );
        }
    );

    // 'li', 'td' and 'dt' are not included as "block tags" in the current HTML5-PHP version.
    // array_merge() is required here: array_filter() preserves the numeric keys of
    // array_keys(), so an array union (+) would silently drop any of the three extra
    // tags whose index (0, 1 or 2) already exists in the filtered list.
    $this->block_tags = array_flip( array_merge( $parser_block_tags, array( 'li', 'td', 'dt' ) ) );

    // Special characters, keyed by name.
    $this->chr['noBreakSpace'] = uchr( 160 );
    $this->chr['noBreakNarrowSpace'] = uchr( 160 ); // used in unit spacing - can be changed to 8239 via set_true_no_break_narrow_space.
    $this->chr['copyright'] = uchr( 169 );
    $this->chr['guillemetOpen'] = uchr( 171 );
    $this->chr['softHyphen'] = uchr( 173 );
    $this->chr['registeredMark'] = uchr( 174 );
    $this->chr['guillemetClose'] = uchr( 187 );
    $this->chr['multiplication'] = uchr( 215 );
    $this->chr['division'] = uchr( 247 );
    $this->chr['figureSpace'] = uchr( 8199 );
    $this->chr['thinSpace'] = uchr( 8201 );
    $this->chr['hairSpace'] = uchr( 8202 );
    $this->chr['zeroWidthSpace'] = uchr( 8203 );
    $this->chr['hyphen'] = '-'; // should be uchr(8208), but IE6 chokes.
    $this->chr['noBreakHyphen'] = uchr( 8209 );
    $this->chr['enDash'] = uchr( 8211 );
    $this->chr['emDash'] = uchr( 8212 );
    $this->chr['parentheticalDash'] = uchr( 8212 ); // defined separate from emDash so it can be redefined in set_smart_dashes_style.
    $this->chr['intervalDash'] = uchr( 8211 ); // defined separate from enDash so it can be redefined in set_smart_dashes_style.
    $this->chr['parentheticalDashSpace'] = uchr( 8201 );
    $this->chr['intervalDashSpace'] = uchr( 8201 );
    $this->chr['singleQuoteOpen'] = uchr( 8216 );
    $this->chr['singleQuoteClose'] = uchr( 8217 );
    $this->chr['apostrophe'] = uchr( 8217 ); // defined separate from singleQuoteClose so quotes can be redefined in set_smart_quotes_language() without disrupting apostrophes.
    $this->chr['singleLow9Quote'] = uchr( 8218 );
    $this->chr['doubleQuoteOpen'] = uchr( 8220 );
    $this->chr['doubleQuoteClose'] = uchr( 8221 );
    $this->chr['doubleLow9Quote'] = uchr( 8222 );
    $this->chr['ellipses'] = uchr( 8230 );
    $this->chr['singlePrime'] = uchr( 8242 );
    $this->chr['doublePrime'] = uchr( 8243 );
    $this->chr['singleAngleQuoteOpen'] = uchr( 8249 );
    $this->chr['singleAngleQuoteClose'] = uchr( 8250 );
    $this->chr['fractionSlash'] = uchr( 8260 );
    $this->chr['soundCopyMark'] = uchr( 8471 );
    $this->chr['serviceMark'] = uchr( 8480 );
    $this->chr['tradeMark'] = uchr( 8482 );
    $this->chr['minus'] = uchr( 8722 );
    $this->chr['leftCornerBracket'] = uchr( 12300 );
    $this->chr['rightCornerBracket'] = uchr( 12301 );
    $this->chr['leftWhiteCornerBracket'] = uchr( 12302 );
    $this->chr['rightWhiteCornerBracket'] = uchr( 12303 );

    // Available quote styles. The French guillemet style captures the value that
    // chr['noBreakNarrowSpace'] holds at this point.
    $this->quote_styles = array(
        'doubleCurled' => array(
            'open' => uchr( 8220 ),
            'close' => uchr( 8221 ),
        ),
        'doubleCurledReversed' => array(
            'open' => uchr( 8221 ),
            'close' => uchr( 8221 ),
        ),
        'doubleLow9' => array(
            'open' => $this->chr['doubleLow9Quote'],
            'close' => uchr( 8221 ),
        ),
        'doubleLow9Reversed' => array(
            'open' => $this->chr['doubleLow9Quote'],
            'close' => uchr( 8220 ),
        ),
        'singleCurled' => array(
            'open' => uchr( 8216 ),
            'close' => uchr( 8217 ),
        ),
        'singleCurledReversed' => array(
            'open' => uchr( 8217 ),
            'close' => uchr( 8217 ),
        ),
        'singleLow9' => array(
            'open' => $this->chr['singleLow9Quote'],
            'close' => uchr( 8217 ),
        ),
        'singleLow9Reversed' => array(
            'open' => $this->chr['singleLow9Quote'],
            'close' => uchr( 8216 ),
        ),
        'doubleGuillemetsFrench' => array(
            'open' => $this->chr['guillemetOpen'] . $this->chr['noBreakNarrowSpace'],
            'close' => $this->chr['noBreakNarrowSpace'] . $this->chr['guillemetClose'],
        ),
        'doubleGuillemets' => array(
            'open' => $this->chr['guillemetOpen'],
            'close' => $this->chr['guillemetClose'],
        ),
        'doubleGuillemetsReversed' => array(
            'open' => $this->chr['guillemetClose'],
            'close' => $this->chr['guillemetOpen'],
        ),
        'singleGuillemets' => array(
            'open' => $this->chr['singleAngleQuoteOpen'],
            'close' => $this->chr['singleAngleQuoteClose'],
        ),
        'singleGuillemetsReversed' => array(
            'open' => $this->chr['singleAngleQuoteClose'],
            'close' => $this->chr['singleAngleQuoteOpen'],
        ),
        'cornerBrackets' => array(
            'open' => $this->chr['leftCornerBracket'],
            'close' => $this->chr['rightCornerBracket'],
        ),
        'whiteCornerBracket' => array(
            'open' => $this->chr['leftWhiteCornerBracket'],
            'close' => $this->chr['rightWhiteCornerBracket'],
        ),
    );

    // Available dash styles.
    $this->dash_styles = array(
        'traditionalUS' => array(
            'parenthetical' => $this->chr['emDash'],
            'interval' => $this->chr['enDash'],
            'parentheticalSpace' => $this->chr['thinSpace'],
            'intervalSpace' => $this->chr['thinSpace'],
        ),
        'international' => array(
            'parenthetical' => $this->chr['enDash'],
            'interval' => $this->chr['enDash'],
            'parentheticalSpace' => ' ',
            'intervalSpace' => $this->chr['hairSpace'],
        ),
    );

    // Set up both UTF-8 and ASCII string functions.
    // UTF-8 first.
    $this->str_functions['UTF-8']['strlen'] = 'mb_strlen';
    $this->str_functions['UTF-8']['str_split'] = __NAMESPACE__ . '\mb_str_split';
    $this->str_functions['UTF-8']['strtolower'] = 'mb_strtolower';
    $this->str_functions['UTF-8']['substr'] = 'mb_substr';
    $this->str_functions['UTF-8']['u'] = 'u'; // unicode flag for regex.

    // Now ASCII.
    $this->str_functions['ASCII']['strlen'] = 'strlen';
    $this->str_functions['ASCII']['str_split'] = 'str_split';
    $this->str_functions['ASCII']['strtolower'] = 'strtolower';
    $this->str_functions['ASCII']['substr'] = 'substr';
    $this->str_functions['ASCII']['u'] = ''; // no regex flag needed.

    // All other encodings get the empty array.
    // Set up regex patterns; the components have to exist before the patterns are built.
    $this->initialize_components();
    $this->initialize_patterns();

    // Set up some arrays for quick HTML5 introspection.
    $this->self_closing_tags = array_filter(
        array_keys( \Masterminds\HTML5\Elements::$html5 ),
        function( $tag ) {
            return \Masterminds\HTML5\Elements::isA( $tag, \Masterminds\HTML5\Elements::VOID_TAG );
        }
    );
    $this->inappropriate_tags = array( 'iframe', 'textarea', 'button', 'select', 'optgroup', 'option', 'map', 'style', 'head', 'title', 'script', 'applet', 'object', 'param' );

    if ( $set_defaults ) {
        $this->set_defaults();
    }
}
/**
* (Re)set various options to their default values.
*/
function set_defaults() {
    // Every setter is invoked without arguments so that it falls back to its own
    // default. The list order is the invocation order.
    $default_setters = array(
        // General attributes.
        'set_tags_to_ignore',
        'set_classes_to_ignore',
        'set_ids_to_ignore',

        // Smart characters.
        'set_smart_quotes',
        'set_smart_quotes_primary', // added in version 1.15.
        'set_smart_quotes_secondary', // added in version 1.15.
        'set_smart_dashes',
        'set_smart_dashes_style',
        'set_smart_ellipses',
        'set_smart_diacritics',
        'set_diacritic_language',
        'set_diacritic_custom_replacements',
        'set_smart_marks',
        'set_smart_ordinal_suffix',
        'set_smart_math',
        'set_smart_fractions',
        'set_smart_exponents',

        // Smart spacing.
        'set_single_character_word_spacing',
        'set_fraction_spacing',
        'set_unit_spacing',
        'set_french_punctuation_spacing',
        'set_units',
        'set_dash_spacing',
        'set_dewidow',
        'set_max_dewidow_length',
        'set_max_dewidow_pull',
        'set_wrap_hard_hyphens',
        'set_url_wrap',
        'set_email_wrap',
        'set_min_after_url_wrap',
        'set_space_collapse',
        'set_true_no_break_narrow_space',

        // Character styling.
        'set_style_ampersands',
        'set_style_caps',
        'set_style_initial_quotes',
        'set_style_numbers',
        'set_style_hanging_punctuation',
        'set_initial_quote_tags',

        // Hyphenation.
        'set_hyphenation',
        'set_hyphenation_language',
        'set_min_length_hyphenation',
        'set_min_before_hyphenation',
        'set_min_after_hyphenation',
        'set_hyphenate_headings',
        'set_hyphenate_all_caps',
        'set_hyphenate_title_case', // added in version 1.5.
        'set_hyphenate_compounds',
        'set_hyphenation_exceptions',
    );

    foreach ( $default_setters as $setter ) {
        $this->$setter();
    }
}
/**
* Set up our regex components for later use.
*
* Call before initialize_patterns().
*/
private function initialize_components() {
    // Various regex components (but not complete patterns).
    //
    // The multi-line string literals below are kept flush with the left margin on
    // purpose: their content (including line breaks) becomes part of the component.

    // ASCII alphanumerics plus the accented letters U+00C0-U+017F (without the
    // multiplication and division signs), written as one long alternation.
    // NOTE(review): initialize_patterns() interpolates this alternation inside
    // bracket expressions ("[\w|...]"); confirm the resulting character classes
    // really match what is intended.
    $this->components['nonEnglishWordCharacters'] = "
[0-9A-Za-z]|\x{00c0}|\x{00c1}|\x{00c2}|\x{00c3}|\x{00c4}|\x{00c5}|\x{00c6}|\x{00c7}|\x{00c8}|\x{00c9}|
\x{00ca}|\x{00cb}|\x{00cc}|\x{00cd}|\x{00ce}|\x{00cf}|\x{00d0}|\x{00d1}|\x{00d2}|\x{00d3}|\x{00d4}|
\x{00d5}|\x{00d6}|\x{00d8}|\x{00d9}|\x{00da}|\x{00db}|\x{00dc}|\x{00dd}|\x{00de}|\x{00df}|\x{00e0}|
\x{00e1}|\x{00e2}|\x{00e3}|\x{00e4}|\x{00e5}|\x{00e6}|\x{00e7}|\x{00e8}|\x{00e9}|\x{00ea}|\x{00eb}|
\x{00ec}|\x{00ed}|\x{00ee}|\x{00ef}|\x{00f0}|\x{00f1}|\x{00f2}|\x{00f3}|\x{00f4}|\x{00f5}|\x{00f6}|
\x{00f8}|\x{00f9}|\x{00fa}|\x{00fb}|\x{00fc}|\x{00fd}|\x{00fe}|\x{00ff}|\x{0100}|\x{0101}|\x{0102}|
\x{0103}|\x{0104}|\x{0105}|\x{0106}|\x{0107}|\x{0108}|\x{0109}|\x{010a}|\x{010b}|\x{010c}|\x{010d}|
\x{010e}|\x{010f}|\x{0110}|\x{0111}|\x{0112}|\x{0113}|\x{0114}|\x{0115}|\x{0116}|\x{0117}|\x{0118}|
\x{0119}|\x{011a}|\x{011b}|\x{011c}|\x{011d}|\x{011e}|\x{011f}|\x{0120}|\x{0121}|\x{0122}|\x{0123}|
\x{0124}|\x{0125}|\x{0126}|\x{0127}|\x{0128}|\x{0129}|\x{012a}|\x{012b}|\x{012c}|\x{012d}|\x{012e}|
\x{012f}|\x{0130}|\x{0131}|\x{0132}|\x{0133}|\x{0134}|\x{0135}|\x{0136}|\x{0137}|\x{0138}|\x{0139}|
\x{013a}|\x{013b}|\x{013c}|\x{013d}|\x{013e}|\x{013f}|\x{0140}|\x{0141}|\x{0142}|\x{0143}|\x{0144}|
\x{0145}|\x{0146}|\x{0147}|\x{0148}|\x{0149}|\x{014a}|\x{014b}|\x{014c}|\x{014d}|\x{014e}|\x{014f}|
\x{0150}|\x{0151}|\x{0152}|\x{0153}|\x{0154}|\x{0155}|\x{0156}|\x{0157}|\x{0158}|\x{0159}|\x{015a}|
\x{015b}|\x{015c}|\x{015d}|\x{015e}|\x{015f}|\x{0160}|\x{0161}|\x{0162}|\x{0163}|\x{0164}|\x{0165}|
\x{0166}|\x{0167}|\x{0168}|\x{0169}|\x{016a}|\x{016b}|\x{016c}|\x{016d}|\x{016e}|\x{016f}|\x{0170}|
\x{0171}|\x{0172}|\x{0173}|\x{0174}|\x{0175}|\x{0176}|\x{0177}|\x{0178}|\x{0179}|\x{017a}|\x{017b}|
\x{017c}|\x{017d}|\x{017e}|\x{017f}
";

    /**
     * Find the HTML character representation for the following characters:
     * tab | line feed | carriage return | space | non-breaking space | ethiopic wordspace
     * ogham space mark | en quad space | em quad space | en-space | three-per-em space
     * four-per-em space | six-per-em space | figure space | punctuation space | em-space
     * thin space | hair space | narrow no-break space
     * medium mathematical space | ideographic space
     * Some characters are used inside words:
     * zero-width-space ("&#8203;", "&#x200b;")
     * zero-width-non-joiner ("&#8204;", "&#x200c;", "&zwnj;")
     * zero-width-joiner ("&#8205;", "&#x200d;", "&zwj;")
     *
     * NOTE(review): this comment originally stated that the three zero-width characters
     * are not counted as spaces, yet the pattern below lists \x{200b}, \x{200c} and
     * \x{200d} (with the joiner/non-joiner labels swapped in its inline comments).
     * Confirm which behavior is intended before changing the pattern.
     */
    $this->components['htmlSpaces'] = '
\x{00a0} # no-break space
|
\x{1361} # ethiopic wordspace
|
\x{2000} # en quad-space
|
\x{2001} # em quad-space
|
\x{2002} # en space
|
\x{2003} # em space
|
\x{2004} # three-per-em space
|
\x{2005} # four-per-em space
|
\x{2006} # six-per-em space
|
\x{2007} # figure space
|
\x{2008} # punctuation space
|
\x{2009} # thin space
|
\x{200a} # hair space
|
\x{200b} # zero-width space
|
\x{200c} # zero-width joiner
|
\x{200d} # zero-width non-joiner
|
\x{202f} # narrow no-break space
|
\x{205f} # medium mathematical space
|
\x{3000} # ideographic space
'; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8).
    $this->components['normalSpaces'] = ' \f\n\r\t\v'; // equivalent to \s in non-Unicode mode.

    // Hanging punctuation.
    $this->components['doubleHangingPunctuation'] = "
\"
{$this->chr['doubleQuoteOpen']}
{$this->chr['doubleQuoteClose']}
{$this->chr['doubleLow9Quote']}
{$this->chr['doublePrime']}
{$this->quote_styles['doubleCurled']['open']}
{$this->quote_styles['doubleCurled']['close']}
"; // requires modifiers: x (multiline pattern) u (utf8).
    $this->components['singleHangingPunctuation'] = "
'
{$this->chr['singleQuoteOpen']}
{$this->chr['singleQuoteClose']}
{$this->chr['singleLow9Quote']}
{$this->chr['singlePrime']}
{$this->quote_styles['singleCurled']['open']}
{$this->quote_styles['singleCurled']['close']}
{$this->chr['apostrophe']}
"; // requires modifiers: x (multiline pattern) u (utf8).

    // Units recognized for unit spacing. Two details matter in this alternation:
    // every line has to end in "|" (extended mode discards the line break, so a
    // missing separator glues two alternatives together), and the dollar sign has
    // to be escaped because a bare "$" is an end-of-subject assertion.
    $this->components['unitSpacingStandardUnits'] = '
### Temporal units
(?:ms|s|secs?|mins?|hrs?)\.?|
milliseconds?|seconds?|minutes?|hours?|days?|years?|decades?|century|centuries|millennium|millennia|
### Imperial units
(?:in|ft|yd|mi)\.?|
(?:ac|ha|oz|pt|qt|gal|lb|st)\.?|
s\.f\.|sf|s\.i\.|si|square[ ]feet|square[ ]foot|
inch|inches|foot|feet|yards?|miles?|acres?|hectares?|ounces?|pints?|quarts?|gallons?|pounds?|stones?|
### Metric units (with prefixes)
(?:p|µ|[mcdhkMGT])?
(?:[mgstAKNJWCVFSTHBL]|mol|cd|rad|Hz|Pa|Wb|lm|lx|Bq|Gy|Sv|kat|Ω|Ohm|&Omega;|&\#0*937;|&\#[xX]0*3[Aa]9;)|
(?:nano|micro|milli|centi|deci|deka|hecto|kilo|mega|giga|tera)?
(?:liters?|meters?|grams?|newtons?|pascals?|watts?|joules?|amperes?)|
### Computers units (KB, Kb, TB, Kbps)
[kKMGT]?(?:[oBb]|[oBb]ps|flops)|
### Money
¢|M?(?:£|¥|€|\$)|
### Other units
°[CF]? |
%|pi|M?px|em|en|[NSEOW]|[NS][EOW]|mbar
'; // required modifiers: x (multiline pattern).

    $this->components['hyphensArray'] = array_unique( array( '-', $this->chr['hyphen'] ) );
    $this->components['hyphens'] = implode( '|', $this->components['hyphensArray'] );

    // A \p{Lu}-based variant of the following pattern would also cover non-English
    // capitals, but servers with PCRE compiled without "--enable-unicode-properties"
    // fail at \p{Lu} by returning an empty string (leaving the screen void of text),
    // so explicit character ranges are used instead.
    $this->components['styleCaps'] = '
(?<![\w\-_' . $this->chr['zeroWidthSpace'] . $this->chr['softHyphen'] . ']) # negative lookbehind assertion
(
(?: # CASE 1: " 9A "
[0-9]+ # starts with at least one number
[A-ZÀ-ÖØ-Ý] # must contain at least one capital letter
(?:[A-ZÀ-ÖØ-Ý]|[0-9]|\-|_|' . $this->chr['zeroWidthSpace'] . '|' . $this->chr['softHyphen'] . ')*
# may be followed by any number of numbers capital letters, hyphens, underscores, zero width spaces, or soft hyphens
)
|
(?: # CASE 2: " A9 "
[A-ZÀ-ÖØ-Ý] # starts with capital letter
(?:[A-ZÀ-ÖØ-Ý]|[0-9]) # must be followed a number or capital letter
(?:[A-ZÀ-ÖØ-Ý]|[0-9]|\-|_|' . $this->chr['zeroWidthSpace'] . '|' . $this->chr['softHyphen'] . ')*
# may be followed by any number of numbers capital letters, hyphens, underscores, zero width spaces, or soft hyphens
)
)
(?![\w\-_' . $this->chr['zeroWidthSpace'] . $this->chr['softHyphen'] . ']) # negative lookahead assertion
'; // required modifiers: x (multiline pattern) u (utf8).

    // Initialize valid top level domains from IANA list.
    $this->components['validTopLevelDomains'] = $this->get_top_level_domains_from_file( dirname( __DIR__ ) . '/vendor/IANA/tlds-alpha-by-domain.txt' );

    // Valid URL schemes.
    $this->components['urlScheme'] = '(?:https?|ftps?|file|nfs|feed|itms|itpc)';

    // Combined URL pattern.
    $this->components['urlPattern'] = "(?:
\A
(?<schema>{$this->components['urlScheme']}:\/\/)? # Subpattern 1: contains _http://_ if it exists
(?<domain> # Subpattern 2: contains subdomains.domain.tld
(?:
[a-z0-9] # first chr of (sub)domain can not be a hyphen
[a-z0-9\-]{0,61} # middle chrs of (sub)domain may be a hyphen;
# limit qty of middle chrs so total domain does not exceed 63 chrs
[a-z0-9] # last chr of (sub)domain can not be a hyphen
\. # dot separator
)+
(?:
{$this->components['validTopLevelDomains']} # validates top level domain
)
(?: # optional port numbers
:
(?:
[1-5]?[0-9]{1,4} | 6[0-4][0-9]{3} | 65[0-4][0-9]{2} | 655[0-2][0-9] | 6553[0-5]
)
)?
)
(?<path> # Subpattern 3: contains path following domain
(?:
\/ # marks nested directory
[a-z0-9\"\$\-_\.\+!\*\'\(\),;\?:@=&\#]+ # valid characters within directory structure
)*
[\/]? # trailing slash if any
)
\Z
)"; // required modifiers: x (multiline pattern) i (case insensitive).

    // Pattern for a complete e-mail address (local part, "@", domain with a valid TLD).
    $this->components['wrapEmailsEmailPattern'] = "(?:
\A
[a-z0-9\!\#\$\%\&\'\*\+\/\=\?\^\_\`\{\|\}\~\-]+
(?:
\.
[a-z0-9\!\#\$\%\&\'\*\+\/\=\?\^\_\`\{\|\}\~\-]+
)*
@
(?:
[a-z0-9]
[a-z0-9\-]{0,61}
[a-z0-9]
\.
)+
(?:
{$this->components['validTopLevelDomains']}
)
\Z
)"; // required modifiers: x (multiline pattern) i (case insensitive).

    // Words with a leading straight apostrophe, mapped to their typographic form.
    $this->components['smartQuotesApostropheExceptions'] = array(
        "'tain" . $this->chr['apostrophe'] . 't' => $this->chr['apostrophe'] . 'tain' . $this->chr['apostrophe'] . 't',
        "'twere" => $this->chr['apostrophe'] . 'twere',
        "'twas" => $this->chr['apostrophe'] . 'twas',
        "'tis" => $this->chr['apostrophe'] . 'tis',
        "'til" => $this->chr['apostrophe'] . 'til',
        "'bout" => $this->chr['apostrophe'] . 'bout',
        "'nuff" => $this->chr['apostrophe'] . 'nuff',
        "'round" => $this->chr['apostrophe'] . 'round',
        "'cause" => $this->chr['apostrophe'] . 'cause',
        "'splainin" => $this->chr['apostrophe'] . 'splainin',
    );
    $this->components['smartQuotesApostropheExceptionMatches'] = array_keys( $this->components['smartQuotesApostropheExceptions'] );
    $this->components['smartQuotesApostropheExceptionReplacements'] = array_values( $this->components['smartQuotesApostropheExceptions'] );

    // These patterns need to be updated whenever the quote style changes.
    $this->update_smart_quotes_brackets();

    // Marker for strings that should not be replaced.
    $this->components['escapeMarker'] = '_E_S_C_A_P_E_D_';
}
/**
* Update smartQuotesBrackets component after quote style change.
*/
private function update_smart_quotes_brackets() {
    $single_open  = $this->chr['singleQuoteOpen'];
    $single_close = $this->chr['singleQuoteClose'];
    $double_open  = $this->chr['doubleQuoteOpen'];
    $double_close = $this->chr['doubleQuoteClose'];

    // Straight quote character => array( opening replacement, closing replacement ).
    $quote_pairs = array(
        "'" => array( $single_open, $single_close ),
        '"' => array( $double_open, $double_close ),
    );

    $brackets = array();
    foreach ( $quote_pairs as $straight_quote => $curly ) {
        // A quote directly after an opening bracket starts a quotation ...
        foreach ( array( '[', '{', '(' ) as $opening_bracket ) {
            $brackets[ $opening_bracket . $straight_quote ] = $opening_bracket . $curly[0];
        }

        // ... and a quote directly before a closing bracket ends one.
        foreach ( array( ']', '}', ')' ) as $closing_bracket ) {
            $brackets[ $straight_quote . $closing_bracket ] = $curly[1] . $closing_bracket;
        }
    }

    // Quotes & quotes: a double quote followed by a single quote opens both levels,
    // the reverse order closes both.
    $brackets[ "\"'" ] = $double_open . $single_open;
    $brackets[ "'\"" ] = $single_close . $double_close;

    $this->components['smartQuotesBrackets']            = $brackets;
    $this->components['smartQuotesBracketMatches']      = array_keys( $brackets );
    $this->components['smartQuotesBracketReplacements'] = array_values( $brackets );
}
/**
* Load a list of top-level domains from a file.
*
* @param string $path The full path and filename.
* @return string A list of top-level domains concatenated with '|'.
*/
function get_top_level_domains_from_file( $path ) {
    $domains = array();

    // Only regular, readable files are opened: \SplFileObject throws for
    // directories and unreadable paths, and the hard-coded list below is the
    // intended fallback for every kind of unusable file.
    if ( is_file( $path ) && is_readable( $path ) ) {
        // Iterating (instead of an eof()/fgets() loop) never reads past the end of the file.
        foreach ( new \SplFileObject( $path ) as $line ) {
            // Strip the line terminator explicitly so that lists saved with
            // CRLF line endings are accepted, too.
            $candidate = rtrim( (string) $line, "\r\n" );

            // Lines that are not a plain domain label (comments, blank lines) are skipped.
            if ( preg_match( '#^[a-zA-Z0-9][a-zA-Z0-9-]*$#D', $candidate ) ) {
                $domains[] = strtolower( $candidate );
            }
        }
    }

    if ( count( $domains ) > 0 ) {
        return implode( '|', $domains );
    }

    // Fallback used when the file is missing or contains no valid entries.
    return 'ac|ad|aero|ae|af|ag|ai|al|am|an|ao|aq|arpa|ar|asia|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|biz|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|cat|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|com|coop|co|cr|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|edu|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gov|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|info|int|in|io|iq|ir|is|it|je|jm|jobs|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mil|mk|ml|mm|mn|mobi|mo|mp|mq|mr|ms|mt|museum|mu|mv|mw|mx|my|mz|name|na|nc|net|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|org|pa|pe|pf|pg|ph|pk|pl|pm|pn|pro|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tel|tf|tg|th|tj|tk|tl|tm|tn|to|tp|travel|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw';
}
/**
* Set up our regex patterns for later use.
*
* Call after initialize_components().
*/
private function initialize_patterns() {
// Actual regex patterns.
$this->regex['customDiacriticsDoubleQuoteKey'] = '/(?:")([^"]+)(?:"\s*=>)/';
$this->regex['customDiacriticsSingleQuoteKey'] = "/(?:')([^']+)(?:'\s*=>)/";
$this->regex['customDiacriticsDoubleQuoteValue'] = '/(?:=>\s*")([^"]+)(?:")/';
$this->regex['customDiacriticsSingleQuoteValue'] = "/(?:=>\s*')([^']+)(?:')/";
$this->regex['controlCharacters'] = '/\p{C}/Su';
$this->regex['smartQuotesSingleQuotedNumbers'] = "/(?<=\W|\A)'(\d+)'(?=\W|\Z)/u";
$this->regex['smartQuotesDoubleQuotedNumbers'] = '/(?<=\W|\A)"(\d+)"(?=\W|\Z)/u';
$this->regex['smartQuotesDoublePrime'] = "/(\b\d{1,3})''(?=\W|\Z)/u";
$this->regex['smartQuotesDoublePrimeCompound'] = "/(\b\d{1,3})''(?=-\w)/u";
$this->regex['smartQuotesDoublePrime1GlyphCompound'] = "/(\b\d{1,3})\"(?=-\w)/u";
$this->regex['smartQuotesSinglePrimeCompound'] = "/(\b\d{1,3})'(?=-\w)/u";
$this->regex['smartQuotesSingleDoublePrime'] = "/(\b\d{1,3})'(\s*)(\b\d+)''(?=\W|\Z)/u";
$this->regex['smartQuotesSingleDoublePrime1Glyph'] = "/(\b\d{1,3})'(\s*)(\b\d+)\"(?=\W|\Z)/u";
$this->regex['smartQuotesCommaQuote'] = '/(?<=\s|\A),(?=\S)/';
$this->regex['smartQuotesApostropheWords'] = "/(?<=[\w|{$this->components['nonEnglishWordCharacters']}])'(?=[\w|{$this->components['nonEnglishWordCharacters']}])/u";
$this->regex['smartQuotesApostropheDecades'] = "/'(\d\d\b)/";
$this->regex['smartQuotesSingleQuoteOpen'] = "/'(?=[\w|{$this->components['nonEnglishWordCharacters']}])/u";
$this->regex['smartQuotesSingleQuoteClose'] = "/(?<=[\w|{$this->components['nonEnglishWordCharacters']}])'/u";
$this->regex['smartQuotesSingleQuoteOpenSpecial'] = "/(?<=\s|\A)'(?=\S)/"; // like _'¿hola?'_.
$this->regex['smartQuotesSingleQuoteCloseSpecial'] = "/(?<=\S)'(?=\s|\Z)/";
$this->regex['smartQuotesDoubleQuoteOpen'] = "/\"(?=[\w|{$this->components['nonEnglishWordCharacters']}])/u";
$this->regex['smartQuotesDoubleQuoteClose'] = "/(?<=[\w|{$this->components['nonEnglishWordCharacters']}])\"/u";
$this->regex['smartQuotesDoubleQuoteOpenSpecial'] = '/(?<=\s|\A)"(?=\S)/';
$this->regex['smartQuotesDoubleQuoteCloseSpecial'] = '/(?<=\S)"(?=\s|\Z)/';
$this->regex['smartDashesParentheticalDoubleDash'] = "/(\s|{$this->components['htmlSpaces']})--(\s|{$this->components['htmlSpaces']})/xui"; // ' -- '.
$this->regex['smartDashesParentheticalSingleDash'] = "/(\s|{$this->components['htmlSpaces']})-(\s|{$this->components['htmlSpaces']})/xui"; // ' - '.
$this->regex['smartDashesEnDashAll'] = "/(\A|\s)\-([\w|{$this->components['nonEnglishWordCharacters']}])/u";
$this->regex['smartDashesEnDashWords'] = "/([\w|{$this->components['nonEnglishWordCharacters']}])\-(\Z|{$this->chr['thinSpace']}|{$this->chr['hairSpace']}|{$this->chr['noBreakNarrowSpace']})/u";
$this->regex['smartDashesEnDashNumbers'] = "/(\b\d+)\-(\d+\b)/";
$this->regex['smartDashesEnDashPhoneNumbers'] = "/(\b\d{3})" . $this->chr['enDash'] . "(\d{4}\b)/";
$this->regex['smartDashesYYYY-MM-DD'] = '/
(
(?<=\s|\A|' . $this->chr['noBreakSpace'] . ')
[12][0-9]{3}
)
[\-' . $this->chr['enDash'] . ']
(
(?:[0][1-9]|[1][0-2])
)
[\-' . $this->chr['enDash'] . "]
(
(?:[0][1-9]|[12][0-9]|[3][0-1])
(?=\s|\Z|\)|\]|\.|\,|\?|\;|\:|\'|\"|\!|" . $this->chr['noBreakSpace'] . ')
)
/xu';
$this->regex['smartDashesMM-DD-YYYY'] = '/
(?:
(?:
(
(?<=\s|\A|' . $this->chr['noBreakSpace'] . ')
(?:[0]?[1-9]|[1][0-2])
)
[\-' . $this->chr['enDash'] . ']
(
(?:[0]?[1-9]|[12][0-9]|[3][0-1])
)
)
|
(?:
(
(?<=\s|\A|' . $this->chr['noBreakSpace'] . ')
(?:[0]?[1-9]|[12][0-9]|[3][0-1])
)
[\-' . $this->chr['enDash'] . ']
(
(?:[0]?[1-9]|[1][0-2])
)
)
)
[\-' . $this->chr['enDash'] . "]
(
[12][0-9]{3}
(?=\s|\Z|\)|\]|\.|\,|\?|\;|\:|\'|\"|\!|" . $this->chr['noBreakSpace'] . ')
)
/xu';
$this->regex['smartDashesYYYY-MM'] = '/
(
(?<=\s|\A|' . $this->chr['noBreakSpace'] . ')
[12][0-9]{3}
)
[\-' . $this->chr['enDash'] . "]
(
(?:
(?:[0][1-9]|[1][0-2])
|
(?:[0][0-9][1-9]|[1-2][0-9]{2}|[3][0-5][0-9]|[3][6][0-6])
)
(?=\s|\Z|\)|\]|\.|\,|\?|\;|\:|\'|\"|\!|" . $this->chr['noBreakSpace'] . ')
)
/xu';
// Smart math.
// First, let's find math equations.
$this->regex['smartMathEquation'] = "/
(?<=\A|\s) # lookbehind assertion: proceeded by beginning of string or space
[\.,\'\"\¿\¡" . $this->chr['ellipses'] . $this->chr['singleQuoteOpen'] . $this->chr['doubleQuoteOpen'] . $this->chr['guillemetOpen'] . $this->chr['guillemetClose'] . $this->chr['singleLow9Quote'] . $this->chr['doubleLow9Quote'] . ']*
# allowed proceeding punctuation
[\-\(' . $this->chr['minus'] . ']* # optionally proceeded by dash, minus sign or open parenthesis
[0-9]+ # must begin with a number
(\.[0-9]+)? # optionally allow decimal values after first integer
( # followed by a math symbol and a number
[\/\*x\-+=\^' . $this->chr['minus'] . $this->chr['multiplication'] . $this->chr['division'] . ']
# allowed math symbols
[\-\(' . $this->chr['minus'] . ']* # opptionally preceeded by dash, minus sign or open parenthesis
[0-9]+ # must begin with a number
(\.[0-9]+)? # optionally allow decimal values after first integer
[\-\(\)' . $this->chr['minus'] . "]* # opptionally preceeded by dash, minus sign or parenthesis
)+
[\.,;:\'\"\?\!" . $this->chr['ellipses'] . $this->chr['singleQuoteClose'] . $this->chr['doubleQuoteClose'] . $this->chr['guillemetOpen'] . $this->chr['guillemetClose'] . ']*
# allowed trailing punctuation
(?=\Z|\s) # lookahead assertion: followed by end of string or space
/ux';
// Revert 4-4 to plain minus-hyphen so as to not mess with ranges of numbers (i.e. pp. 46-50).
$this->regex['smartMathRevertRange'] = '/
(
(?<=\s|\A|' . $this->chr['noBreakSpace'] . ')
\d+
)
[\-' . $this->chr['minus'] . "]
(
\d+
(?=\s|\Z|\)|\]|\.|\,|\?|\;|\:|\'|\"|\!|" . $this->chr['noBreakSpace'] . ')
)
/xu';
// Revert fractions to basic slash.
// We'll leave styling fractions to smart_fractions.
$this->regex['smartMathRevertFraction'] = "/
(
(?<=\s|\A|\'|\"|" . $this->chr['noBreakSpace'] . ')
\d+
)
' . $this->chr['division'] . "
(
\d+
(?:st|nd|rd|th)?
(?=\s|\Z|\)|\]|\.|\,|\?|\;|\:|\'|\"|\!|" . $this->chr['noBreakSpace'] . ')
)
/xu';
// Revert date back to original formats:
// YYYY-MM-DD.
$this->regex['smartMathRevertDateYYYY-MM-DD'] = '/
(
(?<=\s|\A|' . $this->chr['noBreakSpace'] . ')
[12][0-9]{3}
)
[\-' . $this->chr['minus'] . ']
(
(?:[0]?[1-9]|[1][0-2])
)
[\-' . $this->chr['minus'] . "]
(
(?:[0]?[1-9]|[12][0-9]|[3][0-1])
(?=\s|\Z|\)|\]|\.|\,|\?|\;|\:|\'|\"|\!|" . $this->chr['noBreakSpace'] . ')
)
/xu';
// MM-DD-YYYY or DD-MM-YYYY.
$this->regex['smartMathRevertDateMM-DD-YYYY'] = '/
(?:
(?:
(
(?<=\s|\A|' . $this->chr['noBreakSpace'] . ')
(?:[0]?[1-9]|[1][0-2])
)
[\-' . $this->chr['minus'] . ']
(
(?:[0]?[1-9]|[12][0-9]|[3][0-1])
)
)
|
(?:
(
(?<=\s|\A|' . $this->chr['noBreakSpace'] . ')
(?:[0]?[1-9]|[12][0-9]|[3][0-1])
)
[\-' . $this->chr['minus'] . ']
(
(?:[0]?[1-9]|[1][0-2])
)
)
)
[\-' . $this->chr['minus'] . "]
(
[12][0-9]{3}
(?=\s|\Z|\)|\]|\.|\,|\?|\;|\:|\'|\"|\!|" . $this->chr['noBreakSpace'] . ')
)
/xu';
// YYYY-MM or YYYY-DDD next.
$this->regex['smartMathRevertDateYYYY-MM'] = '/
(
(?<=\s|\A|' . $this->chr['noBreakSpace'] . ')
[12][0-9]{3}
)
[\-' . $this->chr['minus'] . "]
(
(?:
(?:[0][1-9]|[1][0-2])
|
(?:[0][0-9][1-9]|[1-2][0-9]{2}|[3][0-5][0-9]|[3][6][0-6])
)
(?=\s|\Z|\)|\]|\.|\,|\?|\;|\:|\'|\"|\!|" . $this->chr['noBreakSpace'] . ')
)
/xu';
// MM/DD/YYYY or DD/MM/YYYY.
$this->regex['smartMathRevertDateMM/DD/YYYY'] = '/
(?:
(?:
(
(?<=\s|\A|' . $this->chr['noBreakSpace'] . ')
(?:[0][1-9]|[1][0-2])
)
[\/' . $this->chr['division'] . ']
(
(?:[0][1-9]|[12][0-9]|[3][0-1])
)
)
|
(?:
(
(?<=\s|\A|' . $this->chr['noBreakSpace'] . ')
(?:[0][1-9]|[12][0-9]|[3][0-1])
)
[\/' . $this->chr['division'] . ']
(
(?:[0][1-9]|[1][0-2])
)
)
)
[\/' . $this->chr['division'] . "]
(
[12][0-9]{3}
(?=\s|\Z|\)|\]|\.|\,|\?|\;|\:|\'|\"|\!|" . $this->chr['noBreakSpace'] . ')
)
/xu';
// Handle exponents (ie. 4^2).
$this->regex['smartExponents'] = "/
\b
(\d+)
\^
(\w+)
\b
/xu";
$this->regex['smartFractionsSpacing'] = '/\b(\d+)\s(\d+\s?\/\s?\d+)\b/';
$this->regex['smartFractionsReplacement'] = "/
(?<=\A|\s|{$this->chr['noBreakSpace']}|{$this->chr['noBreakNarrowSpace']}) # lookbehind assertion: makes sure we are not messing up a url
(\d+)
(?:\s?\/\s?{$this->chr['zeroWidthSpace']}?) # strip out any zero-width spaces inserted by wrap_hard_hyphens
(\d+)
(
(?:\<sup\>(?:st|nd|rd|th)<\/sup\>)? # handle ordinals after fractions
(?:\Z|\s|{$this->chr['noBreakSpace']}|{$this->chr['noBreakNarrowSpace']}|\.|\!|\?|\)|\;|\:|\'|\") # makes sure we are not messing up a url
)
/xu";
$this->regex['smartFractionsEscapeMM/YYYY'] = "/
(?<=\A|\s|{$this->chr['noBreakSpace']}|{$this->chr['noBreakNarrowSpace']}) # lookbehind assertion: makes sure we are not messing up a url
(\d\d?)
(\s?\/\s?{$this->chr['zeroWidthSpace']}?) # capture any zero-width spaces inserted by wrap_hard_hyphens
(
(?:19\d\d)|(?:20\d\d) # handle 4-decimal years in the 20th and 21st centuries
)
(
(?:\Z|\s|{$this->chr['noBreakSpace']}|{$this->chr['noBreakNarrowSpace']}|\.|\!|\?|\)|\;|\:|\'|\") # makes sure we are not messing up a url
)
/xu";
$year_regex = array();
for ( $year = 1900; $year < 2100; ++$year ) {
$year_regex[] = "(?: ( $year ) (\s?\/\s?{$this->chr['zeroWidthSpace']}?) ( " . ( $year + 1 ) . ' ) )';
}
$this->regex['smartFractionsEscapeYYYY/YYYY'] = "/
(?<=\A|\s|{$this->chr['noBreakSpace']}|{$this->chr['noBreakNarrowSpace']}) # lookbehind assertion: makes sure we are not messing up a url
(?| " . implode( '|', $year_regex ) . " )
(
(?:\Z|\s|{$this->chr['noBreakSpace']}|{$this->chr['noBreakNarrowSpace']}|\.|\!|\?|\)|\;|\:|\'|\") # makes sure we are not messing up a url
)
/xu";
$this->regex['smartOrdinalSuffix'] = "/\b(\d+)(st|nd|rd|th)\b/"; // End smart math.
// Smart marks.
$this->regex['smartMarksEscape501(c)'] = '/\b(501\()(c)(\)\((?:[1-9]|[1-2][0-9])\))/u';
// Whitespace handling.
$this->regex['singleCharacterWordSpacing'] = "/
(?:
(\s)
(\w)
[{$this->components['normalSpaces']}]
(?=\w)
)
/xu";
$this->regex['dashSpacingEmDash'] = "/
(?:
\s
({$this->chr['emDash']})
\s
)
|
(?:
(?<=\S) # lookbehind assertion
({$this->chr['emDash']})
(?=\S) # lookahead assertion
)
/xu";
$this->regex['dashSpacingParentheticalDash'] = "/
(?:
\s
({$this->chr['enDash']})
\s
)
/xu";
$this->regex['dashSpacingIntervalDash'] = "/
(?:
(?<=\S) # lookbehind assertion
({$this->chr['enDash']})
(?=\S) # lookahead assertion
)
/xu";
$this->regex['spaceCollapseNormal'] = "/[{$this->components['normalSpaces']}]+/xu";
$this->regex['spaceCollapseNonBreakable'] = "/(?:[{$this->components['normalSpaces']}]|{$this->components['htmlSpaces']})*{$this->chr['noBreakSpace']}(?:[{$this->components['normalSpaces']}]|{$this->components['htmlSpaces']})*/xu";
$this->regex['spaceCollapseOther'] = "/(?:[{$this->components['normalSpaces']}])*({$this->components['htmlSpaces']})(?:[{$this->components['normalSpaces']}]|{$this->components['htmlSpaces']})*/xu";
$this->regex['spaceCollapseBlockStart'] = "/\A(?:[{$this->components['normalSpaces']}]|{$this->components['htmlSpaces']})+/xu";
// Unit spacing.
$this->regex['unitSpacingEscapeSpecialChars'] = '#([\[\\\^\$\.\|\?\*\+\(\)\{\}])#';
$this->update_unit_pattern( isset( $this->settings['units'] ) ? $this->settings['units'] : array() );
// French punctuation spacing.
$this->regex['frenchPunctuationSpacingNarrow'] = '/(\w+)(\s?)([?!»])(\s|\Z)/u';
$this->regex['frenchPunctuationSpacingFull'] = '/(\w+)(\s?)(:)(\s|\Z)/u';
$this->regex['frenchPunctuationSpacingSemicolon'] = '/(\w+)(\s?)((?<!&amp|&gt|&lt);)(\s|\Z)/u';
$this->regex['frenchPunctuationSpacingOpeningQuote'] = '/(\s|\A)(«)(\s?)(\w+)/u';
// Wrap hard hyphens.
$this->regex['wrapHardHyphensRemoveEndingSpace'] = "/({$this->components['hyphens']}){$this->chr['zeroWidthSpace']}\$/";
// Wrap emails.
$this->regex['wrapEmailsMatchEmails'] = "/{$this->components['wrapEmailsEmailPattern']}/xi";
$this->regex['wrapEmailsReplaceEmails'] = '/([^a-zA-Z])/';
// Wrap URLs.
$this->regex['wrapUrlsPattern'] = "`{$this->components['urlPattern']}`xi";
$this->regex['wrapUrlsDomainParts'] = '#(\-|\.)#';
// Style caps.
$this->regex['styleCaps'] = "/{$this->components['styleCaps']}/xu";
// Style numbers.
$this->regex['styleNumbers'] = '/([0-9]+)/u';
// Style hanging punctuation.
$this->regex['styleHangingPunctuationDouble'] = "/(\s)([{$this->components['doubleHangingPunctuation']}])(\w+)/u";
$this->regex['styleHangingPunctuationSingle'] = "/(\s)([{$this->components['singleHangingPunctuation']}])(\w+)/u";
$this->regex['styleHangingPunctuationInitialDouble'] = "/(?:\A)([{$this->components['doubleHangingPunctuation']}])(\w+)/u";
$this->regex['styleHangingPunctuationInitialSingle'] = "/(?:\A)([{$this->components['singleHangingPunctuation']}])(\w+)/u";
// Style ampersands.
$this->regex['styleAmpersands'] = '/(\&amp\;)/u';
// Dewidowing.
$this->regex['dewidow'] = "/
(?:
\A
|
(?:
(?<space_before> # subpattern 1: space before (note: ZWSP is not a space)
[\s{$this->chr['zeroWidthSpace']}{$this->chr['softHyphen']}]+
)
(?<neighbor> # subpattern 2: neighbors widow (short as possible)
[^\s{$this->chr['zeroWidthSpace']}{$this->chr['softHyphen']}]+?
)
)
)
(?<space_between> # subpattern 3: space between
[\s]+ # \s includes all special spaces (but not ZWSP) with the u flag
)
(?<widow> # subpattern 4: widow
[\w\pM\-]+? # \w includes all alphanumeric Unicode characters but not composed characters
)
(?<trailing> # subpattern 5: any trailing punctuation or spaces
[^\w\pM]*
)
\Z
/xu";
// Utility patterns for splitting string parameter lists into arrays.
$this->regex['parameterSplitting'] = '/[\s,]+/';
// Add the "study" flag to all our regular expressions.
foreach ( $this->regex as &$regex ) {
$regex .= 'S';
}
}
/**
* Enable usage of true "no-break narrow space" (&#8239;) instead of the normal no-break space (&nbsp;).
*
* @param boolean $on Optional. Default false.
*/
function set_true_no_break_narrow_space( $on = false ) {
	// Code point 8239 is the true narrow no-break space, 160 the regular no-break space used as fallback.
	$narrow_space = $on ? uchr( 8239 ) : uchr( 160 );

	$this->chr['noBreakNarrowSpace'] = $narrow_space;

	// The French guillemet style embeds this space character, so it has to be rebuilt as well.
	$this->quote_styles['doubleGuillemetsFrench'] = array(
		'open'  => $this->chr['guillemetOpen'] . $narrow_space,
		'close' => $narrow_space . $this->chr['guillemetClose'],
	);
}
/**
* Sets tags for which the typography of their children will be left untouched.
*
* @param string|array $tags A comma separated list or an array of tag names.
*/
function set_tags_to_ignore( $tags = array( 'code', 'head', 'kbd', 'object', 'option', 'pre', 'samp', 'script', 'noscript', 'noembed', 'select', 'style', 'textarea', 'title', 'var', 'math' ) ) {
	// A string argument is split into individual tag names first.
	$tag_list = is_array( $tags ) ? $tags : preg_split( $this->regex['parameterSplitting'], $tags, -1, PREG_SPLIT_NO_EMPTY );

	// XPath is only handed lower-case names; entries that are not purely alphanumeric are dropped.
	$lowercased = array_map( 'strtolower', $tag_list );
	$sanitized  = array_filter( $lowercased, 'ctype_alnum' );

	// Self-closing tags are removed from the list, the "inappropriate" tags are always appended.
	$without_self_closing = array_diff( $sanitized, $this->self_closing_tags );
	$combined             = array_merge( $without_self_closing, $this->inappropriate_tags );

	$this->settings['ignoreTags'] = array_unique( $combined );
}
/**
* Sets classes for which the typography of their children will be left untouched.
*
* @param string|array $classes A comma separated list or an array of class names.
*/
function set_classes_to_ignore( $classes = array( 'vcard', 'noTypo' ) ) {
	// Arrays are stored as given; strings are split with the shared parameter-splitting pattern.
	$this->settings['ignoreClasses'] = is_array( $classes )
		? $classes
		: preg_split( $this->regex['parameterSplitting'], $classes, -1, PREG_SPLIT_NO_EMPTY );
}
/**
* Sets IDs for which the typography of their children will be left untouched.
*
* @param string|array $ids A comma separated list or an array of IDs.
*/
function set_ids_to_ignore( $ids = array() ) {
	// An array is taken over unchanged.
	if ( is_array( $ids ) ) {
		$this->settings['ignoreIDs'] = $ids;
		return;
	}

	// Anything else is treated as a list string and split with the shared parameter-splitting pattern.
	$this->settings['ignoreIDs'] = preg_split( $this->regex['parameterSplitting'], $ids, -1, PREG_SPLIT_NO_EMPTY );
}
/**
* Enable/disable typographic quotes.
*
* @param boolean $on Optional. Default true.
*/
function set_smart_quotes( $on = true ) {
// Only records the switch under the 'smartQuotes' settings key; the value is stored as passed (no boolean cast).
$this->settings['smartQuotes'] = $on;
}
/**
* Set the style for primary ('double') quotemarks.
*
* Allowed values for $style:
* "doubleCurled" => "&ldquo;foo&rdquo;",
* "doubleCurledReversed" => "&rdquo;foo&rdquo;",
* "doubleLow9" => "&bdquo;foo&rdquo;",
* "doubleLow9Reversed" => "&bdquo;foo&ldquo;",
* "singleCurled" => "&lsquo;foo&rsquo;",
* "singleCurledReversed" => "&rsquo;foo&rsquo;",
* "singleLow9" => "&sbquo;foo&rsquo;",
* "singleLow9Reversed" => "&sbquo;foo&lsquo;",
* "doubleGuillemetsFrench" => "&laquo;&nbsp;foo&nbsp;&raquo;",
* "doubleGuillemets" => "&laquo;foo&raquo;",
* "doubleGuillemetsReversed" => "&raquo;foo&laquo;",
* "singleGuillemets" => "&lsaquo;foo&rsaquo;",
* "singleGuillemetsReversed" => "&rsaquo;foo&lsaquo;",
* "cornerBrackets" => "&#x300c;foo&#x300d;",
* "whiteCornerBracket" => "&#x300e;foo&#x300f;"
*
* @param string $style Defaults to 'doubleCurled'.
*/
function set_smart_quotes_primary( $style = 'doubleCurled' ) {
	// Unknown style names are reported with a warning and leave the current quotes untouched.
	if ( ! isset( $this->quote_styles[ $style ] ) ) {
		trigger_error( "Invalid quote style $style.", E_USER_WARNING ); // @codingStandardsIgnoreLine.
		return;
	}

	$quotes = $this->quote_styles[ $style ];

	// Each side is replaced only when the style provides a non-empty character for it.
	if ( ! empty( $quotes['open'] ) ) {
		$this->chr['doubleQuoteOpen'] = $quotes['open'];
	}
	if ( ! empty( $quotes['close'] ) ) {
		$this->chr['doubleQuoteClose'] = $quotes['close'];
	}

	// Update brackets component.
	$this->update_smart_quotes_brackets();
}
/**
* Set the style for secondary ('single') quotemarks.
*
* Allowed values for $style:
* "doubleCurled" => "&ldquo;foo&rdquo;",
* "doubleCurledReversed" => "&rdquo;foo&rdquo;",
* "doubleLow9" => "&bdquo;foo&rdquo;",
* "doubleLow9Reversed" => "&bdquo;foo&ldquo;",
* "singleCurled" => "&lsquo;foo&rsquo;",
* "singleCurledReversed" => "&rsquo;foo&rsquo;",
* "singleLow9" => "&sbquo;foo&rsquo;",
* "singleLow9Reversed" => "&sbquo;foo&lsquo;",
* "doubleGuillemetsFrench" => "&laquo;&nbsp;foo&nbsp;&raquo;",
* "doubleGuillemets" => "&laquo;foo&raquo;",
* "doubleGuillemetsReversed" => "&raquo;foo&laquo;",
* "singleGuillemets" => "&lsaquo;foo&rsaquo;",
* "singleGuillemetsReversed" => "&rsaquo;foo&lsaquo;",
* "cornerBrackets" => "&#x300c;foo&#x300d;",
* "whiteCornerBracket" => "&#x300e;foo&#x300f;"
*
* @param string $style Defaults to 'singleCurled'.
*/
function set_smart_quotes_secondary( $style = 'singleCurled' ) {
	// Unknown style names are reported with a warning and leave the current quotes untouched.
	if ( ! isset( $this->quote_styles[ $style ] ) ) {
		trigger_error( "Invalid quote style $style.", E_USER_WARNING ); // @codingStandardsIgnoreLine.
		return;
	}

	$quotes = $this->quote_styles[ $style ];

	// Each side is replaced only when the style provides a non-empty character for it.
	if ( ! empty( $quotes['open'] ) ) {
		$this->chr['singleQuoteOpen'] = $quotes['open'];
	}
	if ( ! empty( $quotes['close'] ) ) {
		$this->chr['singleQuoteClose'] = $quotes['close'];
	}

	// Update brackets component.
	$this->update_smart_quotes_brackets();
}
/**
* Enable/disable replacement of "a--a" with En Dash " -- " and "---" with Em Dash.
*
* @param boolean $on Optional. Default true.
*/
function set_smart_dashes( $on = true ) {
// Only records the switch under the 'smartDashes' settings key; the dash characters themselves are chosen in set_smart_dashes_style().
$this->settings['smartDashes'] = $on;
}
/**
* Sets the typographical conventions used by smart_dashes.
*
* Allowed values for $style:
* - "traditionalUS"
* - "international"
*
* @param string $style Optional. Default "traditionalUS".
*/
function set_smart_dashes_style( $style = 'traditionalUS' ) {
	// Unknown style names are reported with a warning and change nothing.
	if ( ! isset( $this->dash_styles[ $style ] ) ) {
		trigger_error( "Invalid dash style $style.", E_USER_WARNING ); // @codingStandardsIgnoreLine.
		return;
	}

	$dash_style = $this->dash_styles[ $style ];

	// Style key => character slot it feeds. Empty style entries keep the current character.
	$slots = array(
		'parenthetical'      => 'parentheticalDash',
		'interval'           => 'intervalDash',
		'parentheticalSpace' => 'parentheticalDashSpace',
		'intervalSpace'      => 'intervalDashSpace',
	);
	foreach ( $slots as $style_key => $chr_key ) {
		if ( ! empty( $dash_style[ $style_key ] ) ) {
			$this->chr[ $chr_key ] = $dash_style[ $style_key ];
		}
	}

	// The dash spacing patterns embed the dash characters, so they are rebuilt from the current values.
	$this->regex['dashSpacingParentheticalDash'] = "/
(?:
\s
({$this->chr['parentheticalDash']})
\s
)
/xu";
	$this->regex['dashSpacingIntervalDash'] = "/
(?:
(?<=\S) # lookbehind assertion
({$this->chr['intervalDash']})
(?=\S) # lookahead assertion
)
/xu";
}
/**
* Enable/disable replacement of "..." with "…".
*
* @param boolean $on Optional. Default true.
*/
function set_smart_ellipses( $on = true ) {
// Only records the switch under the 'smartEllipses' settings key; the value is stored as passed.
$this->settings['smartEllipses'] = $on;
}
/**
* Enable/disable replacement "creme brulee" with "crème brûlée".
*
* @param boolean $on Optional. Default true.
*/
function set_smart_diacritics( $on = true ) {
// Only records the switch under the 'smartDiacritics' settings key; the word lists are managed by the diacritic language/custom replacement setters.
$this->settings['smartDiacritics'] = $on;
}
/**
* Set the language used for diacritics replacements.
*
* @param string $lang Has to correspond to a filename in 'diacritics'. Optional. Default 'en-US'.
*/
function set_diacritic_language( $lang = 'en-US' ) {
	// Nothing to do when the requested language is already active.
	if ( isset( $this->settings['diacriticLanguage'] ) && $this->settings['diacriticLanguage'] === $lang ) {
		return;
	}

	$this->settings['diacriticLanguage'] = $lang;

	// Build the absolute path once and use it for both the existence check and
	// the include. A relative include path would be resolved through include_path
	// first and could therefore load a different file than the one just checked.
	$language_file = __DIR__ . '/diacritics/' . $lang . '.php';

	if ( file_exists( $language_file ) ) {
		// The language file is expected to define $diacritic_words in the local scope.
		include $language_file;
		$this->settings['diacriticWords'] = $diacritic_words;
	} else {
		// Unknown language: drop any word list left over from the previous language.
		unset( $this->settings['diacriticWords'] );
	}

	// Patterns and replacements are derived data and must follow the new word list.
	$this->update_diacritics_replacement_arrays();
}
/**
* Set up custom diacritics replacements.
*
* @param string|array $custom_replacements An array formatted array(needle=>replacement, needle=>replacement...),
* or a string formatted `"needle"=>"replacement","needle"=>"replacement",...`.
*/
function set_diacritic_custom_replacements( $custom_replacements = array() ) {
	// A string argument is split at commas into individual `"needle"=>"replacement"` entries.
	if ( ! is_array( $custom_replacements ) ) {
		$custom_replacements = preg_split( '/,/', $custom_replacements, -1, PREG_SPLIT_NO_EMPTY );
	}

	$replacements = array();
	foreach ( $custom_replacements as $custom_key => $custom_replacement ) {
		// Account for single and double quotes.
		preg_match( $this->regex['customDiacriticsDoubleQuoteKey'], $custom_replacement, $double_quote_key_match );
		preg_match( $this->regex['customDiacriticsSingleQuoteKey'], $custom_replacement, $single_quote_key_match );
		preg_match( $this->regex['customDiacriticsDoubleQuoteValue'], $custom_replacement, $double_quote_value_match );
		preg_match( $this->regex['customDiacriticsSingleQuoteValue'], $custom_replacement, $single_quote_value_match );

		// Prefer a quoted key found inside the entry, fall back to the array key.
		if ( ! empty( $double_quote_key_match[1] ) ) {
			$key = $double_quote_key_match[1];
		} elseif ( ! empty( $single_quote_key_match[1] ) ) {
			$key = $single_quote_key_match[1];
		} else {
			$key = $custom_key;
		}

		// Prefer a quoted value found inside the entry, fall back to the entry itself.
		if ( ! empty( $double_quote_value_match[1] ) ) {
			$value = $double_quote_value_match[1];
		} elseif ( ! empty( $single_quote_value_match[1] ) ) {
			$value = $single_quote_value_match[1];
		} else {
			$value = $custom_replacement;
		}

		if ( ! isset( $key ) || ! isset( $value ) ) {
			continue;
		}

		// A needle that is empty after sanitizing must not be stored: it would later be
		// turned into a pattern consisting only of word boundaries, which matches at
		// every word boundary of the text.
		$needle = strip_tags( trim( $key ) );
		if ( '' === $needle ) {
			continue;
		}

		$replacements[ $needle ] = strip_tags( trim( $value ) );
	}

	$this->settings['diacriticCustomReplacements'] = $replacements;

	// Patterns and replacements are derived data and must follow the new custom list.
	$this->update_diacritics_replacement_arrays();
}
/**
* Update the pattern and replacement arrays in $settings['diacriticReplacement'].
*
* Should be called whenever a new diacritics replacement language is selected or
* when the custom replacements are updated.
*/
private function update_diacritics_replacement_arrays() {
	$patterns     = array();
	$replacements = array();

	// Custom replacements are processed first, then the words of the selected language.
	// For a needle present in both lists the language entry therefore ends up in $replacements.
	foreach ( array( 'diacriticCustomReplacements', 'diacriticWords' ) as $source ) {
		if ( empty( $this->settings[ $source ] ) ) {
			continue;
		}

		foreach ( $this->settings[ $source ] as $needle => $replacement ) {
			// Needles are literal words, not regular expressions: quote them so that a
			// delimiter ("/") or a metacharacter inside a needle cannot break or alter the pattern.
			$patterns[]              = '/\b' . preg_quote( (string) $needle, '/' ) . '\b/u';
			$replacements[ $needle ] = $replacement;
		}
	}

	$this->settings['diacriticReplacement'] = array( 'patterns' => $patterns, 'replacements' => $replacements );
}
/**
* Enable/disable replacement of (r) (c) (tm) (sm) (p) (R) (C) (TM) (SM) (P) with ® © ™ ℠ ℗.
*
* @param boolean $on Optional. Default true.
*/
function set_smart_marks( $on = true ) {
// Only records the switch under the 'smartMarks' settings key; the value is stored as passed.
$this->settings['smartMarks'] = $on;
}
/**
* Enable/disable proper mathematical symbols.
*
* @param boolean $on Optional. Default true.
*/
function set_smart_math( $on = true ) {
// Only records the switch under the 'smartMath' settings key; the value is stored as passed.
$this->settings['smartMath'] = $on;
}
/**
* Enable/disable replacement of 2^2 with 2<sup>2</sup>
*
* @param boolean $on Optional. Default true.
*/
function set_smart_exponents( $on = true ) {
// Only records the switch under the 'smartExponents' settings key; the value is stored as passed.
$this->settings['smartExponents'] = $on;
}
/**
* Enable/disable replacement of 1/4 with <sup>1</sup>&#8260;<sub>4</sub>.
*
* @param boolean $on Optional. Default true.
*/
function set_smart_fractions( $on = true ) {
// Only records the switch under the 'smartFractions' settings key; spacing of fractions has its own switch ('fractionSpacing').
$this->settings['smartFractions'] = $on;
}
/**
* Enable/disable replacement of 1st with 1<sup>st</sup>.
*
* @param boolean $on Optional. Default true.
*/
function set_smart_ordinal_suffix( $on = true ) {
// Only records the switch under the 'smartOrdinalSuffix' settings key; the value is stored as passed.
$this->settings['smartOrdinalSuffix'] = $on;
}
/**
* Enable/disable forcing single character words to next line with the insertion of &nbsp;.
*
* @param boolean $on Optional. Default true.
*/
function set_single_character_word_spacing( $on = true ) {
// Only records the switch under the 'singleCharacterWordSpacing' settings key; the value is stored as passed.
$this->settings['singleCharacterWordSpacing'] = $on;
}
/**
* Enable/disable fraction spacing.
*
* @param boolean $on Optional. Default true.
*/
function set_fraction_spacing( $on = true ) {
// Only records the switch under the 'fractionSpacing' settings key; the value is stored as passed.
$this->settings['fractionSpacing'] = $on;
}
/**
* Enable/disable keeping units and values together with the insertion of &nbsp;.
*
* @param boolean $on Optional. Default true.
*/
function set_unit_spacing( $on = true ) {
// Only records the switch under the 'unitSpacing' settings key; the list of units is configured through set_units().
$this->settings['unitSpacing'] = $on;
}
/**
* Enable/disable extra whitespace before certain punctuation marks, as is the French custom.
*
* @param boolean $on Optional. Default true.
*/
function set_french_punctuation_spacing( $on = true ) {
// Only records the switch under the 'frenchPunctuationSpacing' settings key; the value is stored as passed.
$this->settings['frenchPunctuationSpacing'] = $on;
}
/**
* Set the list of units to keep together with their values.
*
* @param string|array $units A comma separated list or an array of units.
*/
function set_units( $units = array() ) {
	// Arrays are used as given; strings are split with the shared parameter-splitting pattern.
	$unit_list = is_array( $units )
		? $units
		: preg_split( $this->regex['parameterSplitting'], $units, -1, PREG_SPLIT_NO_EMPTY );

	$this->settings['units'] = $unit_list;

	// The unit matching pattern is derived from the list and has to be rebuilt.
	$this->update_unit_pattern( $unit_list );
}
/**
* Update components and pattern for matching both standard and custom units.
*
* @param array $units An array of unit names.
*/
private function update_unit_pattern( array $units ) {
	$escaped_units = array();

	foreach ( $units as $unit ) {
		$unit = (string) $unit;

		// An empty unit would add an empty alternative to the unit group, letting the
		// pattern match a number followed by a space and any word.
		if ( '' === $unit ) {
			continue;
		}

		// Escape regex special chars ('\\\\$1' yields a literal backslash followed by the matched character).
		$unit = preg_replace( $this->regex['unitSpacingEscapeSpecialChars'], '\\\\$1', $unit );

		// The pattern below uses "/" as delimiter and the "x" modifier, so "/" would end the
		// pattern early, "#" would start a comment and a plain space would be ignored.
		$escaped_units[] = addcslashes( $unit, '/# ' );
	}

	// Custom units go in front of the standard ones; the separator is only added when there are custom units.
	$custom_units = implode( '|', $escaped_units );
	if ( '' !== $custom_units ) {
		$custom_units .= '|';
	}

	$this->components['unitSpacingUnits']  = $custom_units . $this->components['unitSpacingStandardUnits'];
	$this->regex['unitSpacingUnitPattern'] = "/(\d\.?)\s({$this->components['unitSpacingUnits']})\b/x";
}
/**
* Enable/disable wrapping of Em and En dashes in thin spaces.
*
* @param boolean $on Optional. Default true.
*/
function set_dash_spacing( $on = true ) {
// Only records the switch under the 'dashSpacing' settings key; the value is stored as passed.
$this->settings['dashSpacing'] = $on;
}
/**
* Enable/disable removal of extra whitespace characters.
*
* @param boolean $on Optional. Default true.
*/
function set_space_collapse( $on = true ) {
// Only records the switch under the 'spaceCollapse' settings key; the value is stored as passed.
$this->settings['spaceCollapse'] = $on;
}
/**
* Enable/disable widow handling.
*
* @param boolean $on Optional. Default true.
*/
function set_dewidow( $on = true ) {
// Only records the switch under the 'dewidow' settings key; the length limits are set by set_max_dewidow_length() and set_max_dewidow_pull().
$this->settings['dewidow'] = $on;
}
/**
* Set the maximum length of widows that will be protected.
*
* @param number $length Defaults to 5. Trying to set the value to less than 2 resets the length to the default.
*/
function set_max_dewidow_length( $length = 5 ) {
	// Values that are not greater than 1 fall back to the default of 5.
	if ( $length > 1 ) {
		$this->settings['dewidowMaxLength'] = $length;
	} else {
		$this->settings['dewidowMaxLength'] = 5;
	}
}
/**
* Set the maximum length of pulled text to keep widows company.
*
* @param number $length Defaults to 5. Trying to set the value to less than 2 resets the length to the default.
*/
function set_max_dewidow_pull( $length = 5 ) {
	// Values that are not greater than 1 fall back to the default of 5.
	if ( $length > 1 ) {
		$this->settings['dewidowMaxPull'] = $length;
	} else {
		$this->settings['dewidowMaxPull'] = 5;
	}
}
/**
* Enable/disable wrapping at internal hard hyphens with the insertion of a zero-width-space.
*
* @param boolean $on Optional. Default true.
*/
function set_wrap_hard_hyphens( $on = true ) {
// Note the settings key differs from the method name: the switch is stored as 'hyphenHardWrap'.
$this->settings['hyphenHardWrap'] = $on;
}
/**
* Enable/disable wrapping of urls.
*
* @param boolean $on Optional. Default true.
*/
function set_url_wrap( $on = true ) {
// Only records the switch under the 'urlWrap' settings key; the minimum tail length is set by set_min_after_url_wrap().
$this->settings['urlWrap'] = $on;
}
/**
* Enable/disable wrapping of email addresses.
*
* @param boolean $on Optional. Default true.
*/
function set_email_wrap( $on = true ) {
// Only records the switch under the 'emailWrap' settings key; the value is stored as passed.
$this->settings['emailWrap'] = $on;
}
/**
* Set the minimum character requirement after an URL wrapping point.
*
* @param number $length Defaults to 5. Trying to set the value to less than 1 resets the length to the default.
*/
function set_min_after_url_wrap( $length = 5 ) {
	// Values that are not greater than 0 fall back to the default of 5.
	if ( $length > 0 ) {
		$this->settings['urlMinAfterWrap'] = $length;
	} else {
		$this->settings['urlMinAfterWrap'] = 5;
	}
}
/**
* Enable/disable wrapping of ampersands in <span class="amp">.
*
* @param boolean $on Optional. Default true.
*/
function set_style_ampersands( $on = true ) {
// Only records the switch under the 'styleAmpersands' settings key; the value is stored as passed.
$this->settings['styleAmpersands'] = $on;
}
/**
* Enable/disable wrapping caps in <span class="caps">.
*
* @param boolean $on Optional. Default true.
*/
function set_style_caps( $on = true ) {
// Only records the switch under the 'styleCaps' settings key; the value is stored as passed.
$this->settings['styleCaps'] = $on;
}
/**
* Enable/disable wrapping of initial quotes in <span class="quo"> or <span class="dquo">.
*
* @param boolean $on Optional. Default true.
*/
function set_style_initial_quotes( $on = true ) {
// Only records the switch under the 'styleInitialQuotes' settings key; the affected tags are set by set_initial_quote_tags().
$this->settings['styleInitialQuotes'] = $on;
}
/**
* Enable/disable wrapping of numbers in <span class="numbers">.
*
* @param boolean $on Optional. Default true.
*/
function set_style_numbers( $on = true ) {
// Only records the switch under the 'styleNumbers' settings key; the value is stored as passed.
$this->settings['styleNumbers'] = $on;
}
/**
* Enable/disable wrapping of punctuation and wide characters in <span class="pull-*">.
*
* @param boolean $on Optional. Default true.
*/
function set_style_hanging_punctuation( $on = true ) {
// Only records the switch under the 'styleHangingPunctuation' settings key; the value is stored as passed.
$this->settings['styleHangingPunctuation'] = $on;
}
/**
* Set the list of tags where initial quotes and guillemets should be styled.
*
* @param string|array $tags A comma separated list or an array of tag names.
*/
function set_initial_quote_tags( $tags = array( 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote', 'li', 'dd', 'dt' ) ) {
	// Make array if handed a list of tags as a string.
	if ( ! is_array( $tags ) ) {
		// Every run of non-alphanumeric characters separates two tag names. The match is
		// case-insensitive so that upper-case letters belong to a name instead of acting as
		// separators ("P,H1" must yield "P" and "H1", not just "1").
		$tags = preg_split( '/[^a-z0-9]+/i', $tags, -1, PREG_SPLIT_NO_EMPTY );
	}

	// Store the tag array inverted (with the lower-cased tag name as its index for faster lookup).
	$this->settings['initialQuoteTags'] = array_change_key_case( array_flip( $tags ), CASE_LOWER );
}
/**
* Enable/disable hyphenation.
*
* @param boolean $on Optional. Default true.
*/
function set_hyphenation( $on = true ) {
// Only records the switch under the 'hyphenation' settings key; language and limits have their own setters.
$this->settings['hyphenation'] = $on;
}
/**
* Set the hyphenation pattern language.
*
* @param string $lang Has to correspond to a filename in 'lang'. Optional. Default 'en-US'.
*/
function set_hyphenation_language( $lang = 'en-US' ) {
	if ( isset( $this->settings['hyphenLanguage'] ) && $this->settings['hyphenLanguage'] === $lang ) {
		return; // Bail out, no need to do anything.
	}

	$this->settings['hyphenLanguage'] = $lang;

	// Build the absolute path once and use it for both the existence check and
	// the include. A relative include path would be resolved through include_path
	// first and could therefore load a different file than the one just checked.
	$language_file = __DIR__ . '/lang/' . $lang . '.php';

	if ( file_exists( $language_file ) ) {
		// The language file is expected to define $patgen, $patgenMaxSeg and $patgenExceptions in the local scope.
		include $language_file;

		// @todo Fix variable naming in language files. @codingStandardsIgnoreStart.
		$this->settings['hyphenationPattern'] = $patgen;
		$this->settings['hyphenationPatternMaxSegment'] = $patgenMaxSeg;
		$this->settings['hyphenationPatternExceptions'] = $patgenExceptions; // @codingStandardsIgnoreEnd.
	} else {
		// Unknown language: drop the pattern data left over from the previous language.
		unset( $this->settings['hyphenationPattern'] );
		unset( $this->settings['hyphenationPatternMaxSegment'] );
		unset( $this->settings['hyphenationPatternExceptions'] );
	}

	// Make sure hyphenationExceptions is not set to force remerging of patgen and custom exceptions.
	unset( $this->settings['hyphenationExceptions'] );
}
/**
* Set the minimum length of a word that may be hyphenated.
*
* @param number $length Defaults to 5. Trying to set the value to less than 2 resets the length to the default.
*/
function set_min_length_hyphenation( $length = 5 ) {
	// Values that are not greater than 1 fall back to the default of 5.
	if ( $length > 1 ) {
		$this->settings['hyphenMinLength'] = $length;
	} else {
		$this->settings['hyphenMinLength'] = 5;
	}
}
/**
* Set the minimum character requirement before a hyphenation point.
*
* @param number $length Defaults to 3. Trying to set the value to less than 1 resets the length to the default.
*/
function set_min_before_hyphenation( $length = 3 ) {
	// Values that are not greater than 0 fall back to the default of 3.
	if ( $length > 0 ) {
		$this->settings['hyphenMinBefore'] = $length;
	} else {
		$this->settings['hyphenMinBefore'] = 3;
	}
}
/**
* Set the minimum character requirement after a hyphenation point.
*
* @param number $length Defaults to 2. Trying to set the value to less than 1 resets the length to the default.
*/
function set_min_after_hyphenation( $length = 2 ) {
	// Values that are not greater than 0 fall back to the default of 2.
	if ( $length > 0 ) {
		$this->settings['hyphenMinAfter'] = $length;
	} else {
		$this->settings['hyphenMinAfter'] = 2;
	}
}
/**
* Enable/disable hyphenation of titles and headings.
*
* @param boolean $on Optional. Default true.
*/
function set_hyphenate_headings( $on = true ) {
// Note the settings key differs from the method name: the switch is stored as 'hyphenateTitle'.
$this->settings['hyphenateTitle'] = $on;
}
/**
* Enable/disable hyphenation of words set completely in capital letters.
*
* @param boolean $on Optional. Default true.
*/
function set_hyphenate_all_caps( $on = true ) {
// Only records the switch under the 'hyphenateAllCaps' settings key; the value is stored as passed.
$this->settings['hyphenateAllCaps'] = $on;
}
/**
* Enable/disable hyphenation of words starting with a capital letter.
*
* @param boolean $on Optional. Default true.
*/
function set_hyphenate_title_case( $on = true ) {
// Only records the switch under the 'hyphenateTitleCase' settings key; the value is stored as passed.
$this->settings['hyphenateTitleCase'] = $on;
}
/**
* Enable/disable hyphenation of compound words (e.g. "editor-in-chief").
*
* @param boolean $on Optional. Default true.
*/
function set_hyphenate_compounds( $on = true ) {
// Only records the switch under the 'hyphenateCompounds' settings key; the value is stored as passed.
$this->settings['hyphenateCompounds'] = $on;
}
/**
* Sets custom word hyphenations.
*
* @param string|array $exceptions An array of words with all hyphenation points marked with a hard hyphen (or a string list of such words).
* In the latter case, only alphanumeric characters and hyphens are recognized. The default is empty.
*/
function set_hyphenation_exceptions( $exceptions = array() ) {
	// Arrays are used as given; strings are split with the shared parameter-splitting pattern.
	$words = is_array( $exceptions )
		? $exceptions
		: preg_split( $this->regex['parameterSplitting'], $exceptions, -1, PREG_SPLIT_NO_EMPTY );

	// Maps each lower-cased exception (with hyphens) to the same word without hyphens.
	$stripped_by_exception = array();

	foreach ( $words as $word ) {
		// Pick the string helper set matching the detected encoding of this word.
		$helpers = $this->str_functions[ mb_detect_encoding( $word, $this->encodings, true ) ];
		if ( empty( $helpers ) || empty( $helpers['strlen'] ) ) {
			continue; // No usable helpers for this encoding: skip the word.
		}

		$lowered                           = $helpers['strtolower']( $word );
		$stripped_by_exception[ $lowered ] = preg_replace( "#-#{$helpers['u']}", '', $lowered );
	}

	// Flipped, the list is indexed by the plain word and yields its hyphenated form.
	$this->settings['hyphenationCustomExceptions'] = array_flip( $stripped_by_exception );

	// Make sure hyphenationExceptions is not set to force remerging of patgen and custom exceptions.
	unset( $this->settings['hyphenationExceptions'] );
}
/**
* Modifies $html according to the defined settings.
*
* @param string $html A HTML fragment.
* @param boolean $is_title If the HTML fragment is a title. Optional. Default false.
* @return string The processed $html.
*/
function process( $html, $is_title = false ) {
// Titles are returned unchanged when h1 or h2 is on the list of ignored tags.
if ( isset( $this->settings['ignoreTags'] ) && $is_title && ( in_array( 'h1', $this->settings['ignoreTags'], true ) || in_array( 'h2', $this->settings['ignoreTags'], true ) ) ) {
return $html;
}
// Lazy-load our HTML parser.
$html5_parser = $this->get_html5_parser();
// Parse the HTML fragment into a DOM document.
$dom = $this->parse_html( $html5_parser, $html );
$xpath = new \DOMXPath( $dom );
// Query the body element, all text nodes, and the nodes whose content must be left alone.
$body_node = $xpath->query( '/html/body' )->item( 0 );
$all_textnodes = $xpath->query( '//text()', $body_node );
$tags_to_ignore = $this->query_tags_to_ignore( $xpath, $body_node );
// Start processing, one text node at a time.
foreach ( $all_textnodes as $textnode ) {
// Skip text that has an ignored node among its ancestors.
if ( arrays_intersect( get_ancestors( $textnode ), $tags_to_ignore ) ) {
continue;
}
// We won't be doing anything with spaces, so we can jump ship if that is all we have.
if ( $textnode->isWhitespaceInElementContent() ) {
continue;
}
// Re-encode < > & as HTML entities; the final step below inserts the text as HTML again.
$textnode->data = htmlspecialchars( $textnode->data, ENT_NOQUOTES, 'UTF-8' ); // returns < > & to encoded HTML characters (&lt; &gt; and &amp; respectively).
// Modify anything that requires adjacent text awareness here.
$this->smart_math( $textnode );
$this->smart_diacritics( $textnode );
$this->smart_quotes( $textnode );
$this->smart_dashes( $textnode );
$this->smart_ellipses( $textnode );
$this->smart_marks( $textnode );
// Keep spacing after smart character replacement.
$this->single_character_word_spacing( $textnode );
$this->dash_spacing( $textnode );
$this->unit_spacing( $textnode );
$this->french_punctuation_spacing( $textnode );
// Parse and process individual words.
$this->process_words( $textnode, $is_title );
// Some final space manipulation.
$this->dewidow( $textnode );
$this->space_collapse( $textnode );
// Everything that requires HTML injection occurs here (functions above assume tag-free content)
// pay careful attention to functions below for tolerance of injected tags.
$this->smart_ordinal_suffix( $textnode ); // call before "style_numbers" and "smart_fractions".
$this->smart_exponents( $textnode ); // call before "style_numbers".
$this->smart_fractions( $textnode ); // call before "style_numbers" and after "smart_ordinal_suffix".
// Each styling step below is skipped when has_class() reports the corresponding CSS class for the node.
if ( ! has_class( $textnode, $this->css_classes['caps'] ) ) {
// Call before "style_numbers".
$this->style_caps( $textnode );
}
if ( ! has_class( $textnode, $this->css_classes['numbers'] ) ) {
// Call after "smart_ordinal_suffix", "smart_exponents", "smart_fractions", and "style_caps".
$this->style_numbers( $textnode );
}
if ( ! has_class( $textnode, $this->css_classes['amp'] ) ) {
$this->style_ampersands( $textnode );
}
if ( ! has_class( $textnode, array( $this->css_classes['quo'], $this->css_classes['dquo'] ) ) ) {
$this->style_initial_quotes( $textnode, $is_title );
}
if ( ! has_class( $textnode, array( $this->css_classes['pull-single'], $this->css_classes['pull-double'] ) ) ) {
$this->style_hanging_punctuation( $textnode );
}
// Until now, we've only been working on a single textnode: HTMLify result.
$this->replace_node_with_html( $textnode, $textnode->data );
}
// Serialize only the children of <body>, i.e. the processed fragment.
return $html5_parser->saveHTML( $body_node->childNodes ); // @codingStandardsIgnoreLine.
}
/**
* Modifies $html according to the defined settings, in a way that is appropriate for RSS feeds
* (i.e. excluding processes that may not display well with limited character set intelligence).
*
* @param string $html A HTML fragment.
* @param boolean $is_title If the HTML fragment is a title. Optional. Default false.
* @return string The processed $html.
*/
function process_feed( $html, $is_title = false ) {
	// Titles are returned untouched when h1 or h2 is on the list of ignored tags.
	if ( $is_title && isset( $this->settings['ignoreTags'] ) ) {
		$ignored_tags = $this->settings['ignoreTags'];
		if ( in_array( 'h1', $ignored_tags, true ) || in_array( 'h2', $ignored_tags, true ) ) {
			return $html;
		}
	}

	// Build a DOM for the fragment (the HTML parser is created on demand).
	$parser   = $this->get_html5_parser();
	$document = $this->parse_html( $parser, $html );
	$xpath    = new \DOMXPath( $document );

	// Collect the body, every text node, and the nodes whose content must be left alone.
	$body          = $xpath->query( '/html/body' )->item( 0 );
	$text_nodes    = $xpath->query( '//text()', $body );
	$ignored_nodes = $this->query_tags_to_ignore( $xpath, $body );

	foreach ( $text_nodes as $node ) {
		// Skip text below an ignored node as well as whitespace-only nodes.
		if ( arrays_intersect( get_ancestors( $node ), $ignored_nodes ) || $node->isWhitespaceInElementContent() ) {
			continue;
		}

		// Re-encode < > & as entities; the text is inserted as HTML again below.
		$node->data = htmlspecialchars( $node->data, ENT_NOQUOTES, 'UTF-8' );

		// Feeds only receive the plain character replacements.
		$this->smart_quotes( $node );
		$this->smart_dashes( $node );
		$this->smart_ellipses( $node );
		$this->smart_marks( $node );

		// Swap the text node for the HTML it now contains.
		$this->replace_node_with_html( $node, $node->data );
	}

	// Serialize only the children of <body>, i.e. the processed fragment.
	return $parser->saveHTML( $body->childNodes ); // @codingStandardsIgnoreLine.
}
/**
* Tokenize the content of a textnode and process the individual words separately.
*
* Currently this function applies the following enhancements:
* - wrapping hard hyphens
* - hyphenation
* - wrapping URLs
* - wrapping email addresses
*
* @param \DOMText $textnode The textnode to process.
* @param boolean $is_title If the HTML fragment is a title. Defaults to false.
*/
function process_words( \DOMText $textnode, $is_title = false ) {
	// The text parser is created on demand.
	$tokenizer = $this->get_text_parser();

	// Select the caps filters; the "mixed" and "letters-only" categories always get opposite values.
	if ( empty( $this->settings['hyphenateAllCaps'] ) ) {
		$mixed_caps  = 'allow-all-caps';
		$letter_caps = 'no-all-caps';
	} else {
		$mixed_caps  = 'no-all-caps';
		$letter_caps = 'allow-all-caps';
	}

	// Same scheme for the compound filters.
	$with_compounds = ! empty( $this->settings['hyphenateCompounds'] );
	if ( $with_compounds ) {
		$mixed_compounds  = 'no-compounds';
		$letter_compounds = 'allow-compounds';
	} else {
		$mixed_compounds  = 'allow-compounds';
		$letter_compounds = 'no-compounds';
	}

	// Tokenize the text and fetch each token category.
	$tokenizer->load( $textnode->data );
	$mixed_tokens    = $tokenizer->get_words( 'no-all-letters', $mixed_caps, $mixed_compounds );
	$compound_tokens = array();
	if ( $with_compounds ) {
		$compound_tokens = $tokenizer->get_words( 'no-all-letters', $letter_caps, 'require-compounds' );
	}
	$letter_tokens = $tokenizer->get_words( 'require-all-letters', $letter_caps, $letter_compounds );
	$other_tokens  = $tokenizer->get_other();

	// Run the per-category processing.
	$mixed_tokens    = $this->wrap_hard_hyphens( $mixed_tokens );
	$compound_tokens = $this->hyphenate_compounds( $compound_tokens, $is_title, $textnode );
	$letter_tokens   = $this->hyphenate( $letter_tokens, $is_title, $textnode );
	$other_tokens    = $this->wrap_emails( $this->wrap_urls( $other_tokens ) );

	// Feed the changed tokens back and write the reassembled text to the node.
	$tokenizer->update( $mixed_tokens + $compound_tokens + $letter_tokens + $other_tokens );
	$textnode->data = $tokenizer->unload();
}
/**
* Parse HTML5 fragment while ignoring certain warnings for invalid HTML code (e.g. duplicate IDs).
*
* @param \Masterminds\HTML5 $parser An initialized parser object.
* @param string $html The HTML fragment to parse (not a complete document).
*
* @return \DOMDocument The encoding has already been set to UTF-8.
*/
function parse_html( \Masterminds\HTML5 $parser, $html ) {
	// Silence some parsing errors for invalid HTML.
	set_error_handler( array( $this, 'handle_parsing_errors' ) );
	$xml_error_handling = libxml_use_internal_errors( true );

	// Do the actual parsing. The fragment is wrapped in <body> so that it is parsed as body content.
	// Any exception is held back until the error handling has been restored, otherwise our
	// handler would stay installed (and libxml would stay in internal-error mode) for the caller.
	$dom     = null;
	$failure = null;
	try {
		$dom           = $parser->loadHTML( '<body>' . $html . '</body>' );
		$dom->encoding = 'UTF-8';
	} catch ( \Exception $e ) {
		$failure = $e;
	}

	// Restore original error handling.
	libxml_clear_errors();
	libxml_use_internal_errors( $xml_error_handling );
	restore_error_handler();

	if ( null !== $failure ) {
		throw $failure;
	}

	return $dom;
}
/**
* Silently handle certain HTML parsing errors.
*
* @param int $errno Error number.
* @param string $errstr Error message.
* @param string $errfile The file in which the error occurred.
* @param int $errline The line in which the error occurred.
* @param array $errcontext Calling context.
*
* @return boolean Returns true if the error was handled, false otherwise.
*/
public function handle_parsing_errors( $errno, $errstr, $errfile, $errline, array $errcontext = array() ) {
	// $errcontext must be optional: PHP 8.0 and later no longer pass a fifth argument to
	// error handlers, so a required parameter would raise an ArgumentCountError.

	// Errors excluded by the current error_reporting() level are reported as handled.
	if ( ! ( error_reporting() & $errno ) ) {
		return true; // not interesting.
	}

	// Swallow user-level warnings raised from a file whose name ends in "DOMTreeBuilder.php".
	if ( ( $errno & E_USER_WARNING ) && 0 === substr_compare( $errfile, 'DOMTreeBuilder.php', -18 ) ) {
		// Ignore warnings from parser.
		return true;
	}

	// Let PHP handle the rest.
	return false;
}
/**
* Retrieve an array of nodes that should be skipped during processing.
*
* @param \DOMXPath $xpath A valid XPath instance for the DOM to be queried.
* @param \DOMNode $initial_node The starting node of the XPath query.
* @return array An array of \DOMNode (can be empty).
*/
function query_tags_to_ignore( \DOMXPath $xpath, \DOMNode $initial_node ) {
	// One XPath expression per configured ignore list; they are OR-ed together with "|" below.
	$selectors = array();

	if ( ! empty( $this->settings['ignoreTags'] ) ) {
		$selectors[] = '//' . implode( ' | //', $this->settings['ignoreTags'] );
	}

	if ( ! empty( $this->settings['ignoreClasses'] ) ) {
		// Match the class name as a whole, space-delimited word of the class attribute.
		$class_tests = array();
		foreach ( $this->settings['ignoreClasses'] as $class ) {
			$class_tests[] = "contains(concat(' ', @class, ' '), ' " . $class . " ')";
		}
		$selectors[] = '//*[' . implode( ' or ', $class_tests ) . ']';
	}

	if ( ! empty( $this->settings['ignoreIDs'] ) ) {
		$id_tests = array();
		foreach ( $this->settings['ignoreIDs'] as $id ) {
			$id_tests[] = "@id='" . $id . "'";
		}
		$selectors[] = '//*[' . implode( ' or ', $id_tests ) . ']';
	}

	if ( empty( $selectors ) ) {
		return array();
	}

	// A failed query (false) is treated like an empty result.
	$matches = $xpath->query( implode( ' | ', $selectors ), $initial_node );

	return false !== $matches ? nodelist_to_array( $matches ) : array();
}
/**
* Retrieve the last character of the previous \DOMText sibling (if there is one).
*
* @param \DOMNode $element The content node.
* @return string A single character (or the empty string).
*/
function get_prev_chr( \DOMNode $element ) {
	$neighbour = $this->get_previous_textnode( $element );
	if ( ! isset( $neighbour ) || ! isset( $neighbour->data ) ) {
		return '';
	}

	// Pick the string functions matching the detected encoding of the neighbouring text.
	$str = $this->str_functions[ mb_detect_encoding( $neighbour->data, $this->encodings, true ) ];
	if ( empty( $str ) || empty( $str['substr'] ) ) {
		return '';
	}

	// Take the last character and strip it if it matches the control character pattern.
	$last_character = $str['substr']( $neighbour->data, - 1 );

	return preg_replace( $this->regex['controlCharacters'], '', $last_character );
}
/**
* Retrieve the first character of the next \DOMText sibling (if there is one).
*
* @param \DOMNode $element The content node.
* @return string A single character (or the empty string).
*/
function get_next_chr( \DOMNode $element ) {
	$neighbour = $this->get_next_textnode( $element );
	if ( ! isset( $neighbour ) || ! isset( $neighbour->data ) ) {
		return '';
	}

	// Pick the string functions matching the detected encoding of the neighbouring text.
	$str = $this->str_functions[ mb_detect_encoding( $neighbour->data, $this->encodings, true ) ];
	if ( empty( $str ) || empty( $str['substr'] ) ) {
		return '';
	}

	// Take the first character and strip it if it matches the control character pattern.
	$first_character = $str['substr']( $neighbour->data, 0, 1 );

	return preg_replace( $this->regex['controlCharacters'], '', $first_character );
}
/**
* Retrieve the previous \DOMText sibling (if there is one).
*
* @param \DOMNode $element The content node. Optional. Default null.
* @return \DOMText Null if $element is a block-level element or no text sibling exists.
*/
function get_previous_textnode( \DOMNode $element = null ) {
	if ( ! isset( $element ) ) {
		return null;
	}

	// The search does not continue past a block-level element.
	if ( $element instanceof \DOMElement && isset( $this->block_tags[ $element->tagName ] ) ) { // @codingStandardsIgnoreLine.
		return null;
	}

	// Walk backwards through the siblings and return the first text node found.
	for ( $sibling = $element->previousSibling; $sibling; $sibling = $sibling->previousSibling ) { // @codingStandardsIgnoreLine.
		$candidate = $this->get_last_textnode( $sibling );
		if ( ! empty( $candidate ) ) {
			return $candidate;
		}
	}

	// Nothing on this level: continue the search from the parent node.
	return $this->get_previous_textnode( $element->parentNode ); // @codingStandardsIgnoreLine.
}
/**
* Retrieve the next \DOMText sibling (if there is one).
*
* @param \DOMNode $element The content node. Optional. Default null.
* @return \DOMText Null if $element is a block-level element or no text sibling exists.
*/
function get_next_textnode( \DOMNode $element = null ) {
	if ( ! isset( $element ) ) {
		return null;
	}

	// The search does not continue past a block-level element.
	if ( $element instanceof \DOMElement && isset( $this->block_tags[ $element->tagName ] ) ) { // @codingStandardsIgnoreLine.
		return null;
	}

	// Walk forwards through the siblings and return the first text node found.
	for ( $sibling = $element->nextSibling; $sibling; $sibling = $sibling->nextSibling ) { // @codingStandardsIgnoreLine.
		$candidate = $this->get_first_textnode( $sibling );
		if ( ! empty( $candidate ) ) {
			return $candidate;
		}
	}

	// Nothing on this level: continue the search from the parent node.
	return $this->get_next_textnode( $element->parentNode ); // @codingStandardsIgnoreLine.
}
/**
* Retrieve the first \DOMText child of the element. Block-level child elements are ignored.
*
* @param \DOMNode $element Optional. Default null.
* @param boolean $recursive Should be set to true on recursive calls. Optional. Default false.
*
* @return \DOMNode The first child of type \DOMText, the element itself if it is of type \DOMText or null.
*/
function get_first_textnode( \DOMNode $element = null, $recursive = false ) {
	if ( ! isset( $element ) ) {
		return null;
	}

	// A text node is its own first text node.
	if ( $element instanceof \DOMText ) {
		return $element;
	}

	// Anything that is neither text nor an element cannot contain text nodes we care about.
	if ( ! $element instanceof \DOMElement ) {
		return null;
	}

	// Block-level elements are only skipped on recursive calls, i.e. when met as descendants.
	if ( $recursive && isset( $this->block_tags[ $element->tagName ] ) ) { // @codingStandardsIgnoreLine.
		return null;
	}

	// Depth-first search through the children, in document order.
	foreach ( $element->childNodes as $child ) { // @codingStandardsIgnoreLine.
		$found = $this->get_first_textnode( $child, true );
		if ( ! empty( $found ) ) {
			return $found;
		}
	}

	return null;
}
/**
* Retrieve the last \DOMText child of the element. Block-level child elements are ignored.
*
* @param \DOMNode $element Optional. Default null.
* @param boolean $recursive Should be set to true on recursive calls. Optional. Default false.
*
* @return \DOMNode The last child of type \DOMText, the element itself if it is of type \DOMText or null.
*/
function get_last_textnode( \DOMNode $element = null, $recursive = false ) {
	if ( ! isset( $element ) ) {
		return null;
	}

	// A text node is its own last text node.
	if ( $element instanceof \DOMText ) {
		return $element;
	}

	// Anything that is neither text nor an element cannot contain text nodes we care about.
	if ( ! $element instanceof \DOMElement ) {
		return null;
	}

	// Block-level elements are only skipped on recursive calls, i.e. when met as descendants.
	if ( $recursive && isset( $this->block_tags[ $element->tagName ] ) ) { // @codingStandardsIgnoreLine.
		return null;
	}

	// Depth-first search through the children, in reverse document order.
	$children = $element->childNodes; // @codingStandardsIgnoreLine.
	for ( $index = $children->length - 1; $index >= 0; $index-- ) {
		$found = $this->get_last_textnode( $children->item( $index ), true );
		if ( ! empty( $found ) ) {
			return $found;
		}
	}

	return null;
}
/**
* Apply smart quotes (if enabled).
*
* @param \DOMText $textnode The content node.
*/
function smart_quotes( \DOMText $textnode ) {
// Nothing to do unless the smartQuotes setting is enabled.
if ( empty( $this->settings['smartQuotes'] ) ) {
return;
}
// Need to get context of adjacent characters outside adjacent inline tags or HTML comment
// if we have adjacent characters add them to the text.
// They are temporarily prepended/appended here and stripped again at the end of this method.
$previous_character = $this->get_prev_chr( $textnode );
if ( '' !== $previous_character ) {
$textnode->data = $previous_character . $textnode->data;
}
$next_character = $this->get_next_chr( $textnode );
if ( '' !== $next_character ) {
$textnode->data = $textnode->data . $next_character;
}
// Before primes, handle quoted numbers.
$textnode->data = preg_replace( $this->regex['smartQuotesSingleQuotedNumbers'], $this->chr['singleQuoteOpen'] . '$1' . $this->chr['singleQuoteClose'], $textnode->data );
$textnode->data = preg_replace( $this->regex['smartQuotesDoubleQuotedNumbers'], $this->chr['doubleQuoteOpen'] . '$1' . $this->chr['doubleQuoteClose'], $textnode->data );
// Guillemets (both the raw and the entity-encoded spelling of << and >> are replaced).
$textnode->data = str_replace( '<<', $this->chr['guillemetOpen'], $textnode->data );
$textnode->data = str_replace( '&lt;&lt;', $this->chr['guillemetOpen'], $textnode->data );
$textnode->data = str_replace( '>>', $this->chr['guillemetClose'], $textnode->data );
$textnode->data = str_replace( '&gt;&gt;', $this->chr['guillemetClose'], $textnode->data );
// Primes.
$textnode->data = preg_replace( $this->regex['smartQuotesSingleDoublePrime'], '$1' . $this->chr['singlePrime'] . '$2$3' . $this->chr['doublePrime'], $textnode->data );
$textnode->data = preg_replace( $this->regex['smartQuotesSingleDoublePrime1Glyph'], '$1' . $this->chr['singlePrime'] . '$2$3' . $this->chr['doublePrime'], $textnode->data );
$textnode->data = preg_replace( $this->regex['smartQuotesDoublePrime'], '$1' . $this->chr['doublePrime'], $textnode->data ); // should not interfere with regular quote matching.
$textnode->data = preg_replace( $this->regex['smartQuotesSinglePrimeCompound'], '$1' . $this->chr['singlePrime'], $textnode->data );
$textnode->data = preg_replace( $this->regex['smartQuotesDoublePrimeCompound'], '$1' . $this->chr['doublePrime'], $textnode->data );
$textnode->data = preg_replace( $this->regex['smartQuotesDoublePrime1GlyphCompound'], '$1' . $this->chr['doublePrime'], $textnode->data );
// Backticks (the doubled backtick must be replaced before the single one).
$textnode->data = str_replace( '``', $this->chr['doubleQuoteOpen'], $textnode->data );
$textnode->data = str_replace( '`', $this->chr['singleQuoteOpen'], $textnode->data );
$textnode->data = str_replace( "''", $this->chr['doubleQuoteClose'], $textnode->data );
// Comma quotes.
$textnode->data = str_replace( ',,', $this->chr['doubleLow9Quote'], $textnode->data );
$textnode->data = preg_replace( $this->regex['smartQuotesCommaQuote'], $this->chr['singleLow9Quote'], $textnode->data ); // like _,¿hola?'_.
// Apostrophes.
$textnode->data = preg_replace( $this->regex['smartQuotesApostropheWords'], $this->chr['apostrophe'], $textnode->data );
$textnode->data = preg_replace( $this->regex['smartQuotesApostropheDecades'], $this->chr['apostrophe'] . '$1', $textnode->data ); // decades: '98.
$textnode->data = str_replace( $this->components['smartQuotesApostropheExceptionMatches'], $this->components['smartQuotesApostropheExceptionReplacements'], $textnode->data );
// Quotes.
$textnode->data = str_replace( $this->components['smartQuotesBracketMatches'], $this->components['smartQuotesBracketReplacements'], $textnode->data );
$textnode->data = preg_replace( $this->regex['smartQuotesSingleQuoteOpen'], $this->chr['singleQuoteOpen'], $textnode->data );
$textnode->data = preg_replace( $this->regex['smartQuotesSingleQuoteClose'], $this->chr['singleQuoteClose'], $textnode->data );
$textnode->data = preg_replace( $this->regex['smartQuotesSingleQuoteOpenSpecial'], $this->chr['singleQuoteOpen'], $textnode->data ); // like _'¿hola?'_.
$textnode->data = preg_replace( $this->regex['smartQuotesSingleQuoteCloseSpecial'], $this->chr['singleQuoteClose'], $textnode->data );
$textnode->data = preg_replace( $this->regex['smartQuotesDoubleQuoteOpen'], $this->chr['doubleQuoteOpen'], $textnode->data );
$textnode->data = preg_replace( $this->regex['smartQuotesDoubleQuoteClose'], $this->chr['doubleQuoteClose'], $textnode->data );
$textnode->data = preg_replace( $this->regex['smartQuotesDoubleQuoteOpenSpecial'], $this->chr['doubleQuoteOpen'], $textnode->data );
$textnode->data = preg_replace( $this->regex['smartQuotesDoubleQuoteCloseSpecial'], $this->chr['doubleQuoteClose'], $textnode->data );
// Quote catch-alls - assume left over quotes are closing - as this is often the most complicated position, thus most likely to be missed.
$textnode->data = str_replace( "'", $this->chr['singleQuoteClose'], $textnode->data );
$textnode->data = str_replace( '"', $this->chr['doubleQuoteClose'], $textnode->data );
// If we have adjacent characters remove them from the text.
// NOTE(review): unlike get_prev_chr()/get_next_chr(), $func is used here without an empty() check;
// this presumably relies on the encoding of the node text always being detectable - confirm.
$func = $this->str_functions[ mb_detect_encoding( $textnode->data, $this->encodings, true ) ];
if ( '' !== $previous_character ) {
// Drop the first character (the borrowed previous character).
$textnode->data = $func['substr']( $textnode->data, 1, $func['strlen']( $textnode->data ) );
}
if ( '' !== $next_character ) {
// Drop the last character (the borrowed next character).
$textnode->data = $func['substr']( $textnode->data, 0, $func['strlen']( $textnode->data ) - 1 );
}
}
/**
* Apply smart dashes (if enabled).
*
* @param \DOMText $textnode The content node.
*/
function smart_dashes( \DOMText $textnode ) {
	if ( empty( $this->settings['smartDashes'] ) ) {
		return;
	}

	// Replacement strings shared by several of the steps below.
	$en_dash       = $this->chr['enDash'];
	$parenthetical = '$1' . $this->chr['parentheticalDash'] . '$2';
	$en_dash_join  = '$1' . $en_dash . '$2';

	// Triple hyphens first, then double hyphens, then single-hyphen patterns.
	$textnode->data = str_replace( '---', $this->chr['emDash'], $textnode->data );
	$textnode->data = preg_replace( $this->regex['smartDashesParentheticalDoubleDash'], $parenthetical, $textnode->data );
	$textnode->data = str_replace( '--', $en_dash, $textnode->data );
	$textnode->data = preg_replace( $this->regex['smartDashesParentheticalSingleDash'], $parenthetical, $textnode->data );
	$textnode->data = preg_replace( $this->regex['smartDashesEnDashAll'], $en_dash_join, $textnode->data );
	$textnode->data = preg_replace( $this->regex['smartDashesEnDashWords'], $en_dash_join, $textnode->data );
	$textnode->data = preg_replace( $this->regex['smartDashesEnDashNumbers'], '$1' . $this->chr['intervalDash'] . '$2', $textnode->data );

	// Phone numbers get a no-break hyphen.
	$textnode->data = preg_replace( $this->regex['smartDashesEnDashPhoneNumbers'], '$1' . $this->chr['noBreakHyphen'] . '$2', $textnode->data );

	// Revert messed-up punycode.
	$textnode->data = str_replace( 'xn' . $en_dash, 'xn--', $textnode->data );

	// Revert dates back to their original formats, in this order:
	// YYYY-MM-DD, then MM-DD-YYYY or DD-MM-YYYY, then YYYY-MM or YYYY-DDDD.
	$date_reverts = array(
		'smartDashesYYYY-MM-DD' => '$1-$2-$3',
		'smartDashesMM-DD-YYYY' => '$1$3-$2$4-$5',
		'smartDashesYYYY-MM'    => '$1-$2',
	);
	foreach ( $date_reverts as $pattern => $replacement ) {
		$textnode->data = preg_replace( $this->regex[ $pattern ], $replacement, $textnode->data );
	}
}
/**
* Apply smart ellipses (if enabled).
*
* @param \DOMText $textnode The content node.
*/
function smart_ellipses( \DOMText $textnode ) {
	if ( ! empty( $this->settings['smartEllipses'] ) ) {
		$ellipsis = $this->chr['ellipses'];

		// Four dots become a period followed by an ellipsis; this must run before the three-dot case.
		$textnode->data = str_replace( array( '....', '. . . .' ), '.' . $ellipsis, $textnode->data );
		$textnode->data = str_replace( array( '...', '. . .' ), $ellipsis, $textnode->data );
	}
}
/**
* Apply smart diacritics (if enabled).
*
* @param \DOMText $textnode The content node.
*/
function smart_diacritics( \DOMText $textnode ) {
	if ( empty( $this->settings['smartDiacritics'] ) ) {
		return;
	}

	// Nothing to do unless both the patterns and the replacement table are available.
	if ( empty( $this->settings['diacriticReplacement'] )
		|| empty( $this->settings['diacriticReplacement']['patterns'] )
		|| empty( $this->settings['diacriticReplacement']['replacements'] ) ) {
		return;
	}

	$rules  = $this->settings['diacriticReplacement'];
	$lookup = $rules['replacements'];

	// Look each match up in the "word" => "replacement" table; unknown matches stay as they are.
	$substitute = function( $match ) use ( $lookup ) {
		return isset( $lookup[ $match[0] ] ) ? $lookup[ $match[0] ] : $match[0];
	};

	$textnode->data = preg_replace_callback( $rules['patterns'], $substitute, $textnode->data );
}
/**
* Apply smart marks (if enabled).
*
* @param \DOMText $textnode The content node.
*/
function smart_marks( \DOMText $textnode ) {
	if ( empty( $this->settings['smartMarks'] ) ) {
		return;
	}

	$marker = $this->components['escapeMarker'];

	// Protect "501(c)(1...29)" (US non-profit) by inserting escape markers into the match.
	$textnode->data = preg_replace( $this->regex['smartMarksEscape501(c)'], '$1' . $marker . '$2' . $marker . '$3', $textnode->data );

	// ASCII spellings (lower and upper case) keyed by the name of the replacement character.
	$marks = array(
		'copyright'      => array( '(c)', '(C)' ),
		'registeredMark' => array( '(r)', '(R)' ),
		'soundCopyMark'  => array( '(p)', '(P)' ),
		'serviceMark'    => array( '(sm)', '(SM)' ),
		'tradeMark'      => array( '(tm)', '(TM)' ),
	);
	foreach ( $marks as $character => $spellings ) {
		$textnode->data = str_replace( $spellings, $this->chr[ $character ], $textnode->data );
	}

	// Remove the escape markers again.
	$textnode->data = str_replace( $marker, '', $textnode->data );
}
/**
* Apply smart math (if enabled).
*
* @param \DOMText $textnode The content node.
*/
function smart_math( \DOMText $textnode ) {
	if ( empty( $this->settings['smartMath'] ) ) {
		return;
	}

	// Convert the operators inside everything matched by the equation pattern.
	$textnode->data = preg_replace_callback( $this->regex['smartMathEquation'], array( $this, '_smart_math_callback' ), $textnode->data );

	// Things that are not equations after all are reverted, in this order:
	// - number ranges (i.e. pp. 46-50) back to a plain hyphen,
	// - fractions back to a basic slash (styling them is left to smart_fractions),
	// - dates: YYYY-MM-DD, MM-DD-YYYY or DD-MM-YYYY, YYYY-MM or YYYY-DDD, MM/DD/YYYY or DD/MM/YYYY.
	$reverts = array(
		'smartMathRevertRange'          => '$1-$2',
		'smartMathRevertFraction'       => '$1/$2',
		'smartMathRevertDateYYYY-MM-DD' => '$1-$2-$3',
		'smartMathRevertDateMM-DD-YYYY' => '$1$3-$2$4-$5',
		'smartMathRevertDateYYYY-MM'    => '$1-$2',
		'smartMathRevertDateMM/DD/YYYY' => '$1$3/$2$4/$5',
	);
	foreach ( $reverts as $pattern => $replacement ) {
		$textnode->data = preg_replace( $this->regex[ $pattern ], $replacement, $textnode->data );
	}
}
/**
* Callback function for smart math.
*
* @param array $matches Regex matches.
*/
private function _smart_math_callback( array $matches ) {
	// ASCII operators and their typographic counterparts; str_replace() applies the pairs in order.
	$ascii_operators = array( '-', '/', 'x', '*' );
	$math_characters = array(
		$this->chr['minus'],
		$this->chr['division'],
		$this->chr['multiplication'],
		$this->chr['multiplication'],
	);

	// Returns the complete match with its operators replaced.
	return str_replace( $ascii_operators, $math_characters, $matches[0] );
}
/**
* Apply smart exponents (if enabled).
* Purposefully separated from smart_math because of HTML code injection.
*
* @param \DOMText $textnode The content node.
*/
function smart_exponents( \DOMText $textnode ) {
	if ( ! empty( $this->settings['smartExponents'] ) ) {
		// Handle exponents (ie. 4^2): the second capture group is wrapped in a <sup> element.
		$superscripted  = '$1<sup>$2</sup>';
		$textnode->data = preg_replace( $this->regex['smartExponents'], $superscripted, $textnode->data );
	}
}
/**
* Apply smart fractions (if enabled).
*
* Call before style_numbers, but after smart_ordinal_suffix.
* Purposefully separated from smart_math because of HTML code injection.
*
* @param \DOMText $textnode The content node.
*/
function smart_fractions( \DOMText $textnode ) {
	$style_fractions = ! empty( $this->settings['smartFractions'] );
	$fix_spacing     = ! empty( $this->settings['fractionSpacing'] );

	if ( ! $style_fractions && ! $fix_spacing ) {
		return;
	}

	if ( $fix_spacing ) {
		// A narrow no-break space is used when fractions get styled, a regular no-break space otherwise.
		$space          = $style_fractions ? $this->chr['noBreakNarrowSpace'] : $this->chr['noBreakSpace'];
		$textnode->data = preg_replace( $this->regex['smartFractionsSpacing'], '$1' . $space . '$2', $textnode->data );
	}

	if ( ! $style_fractions ) {
		return;
	}

	// Escape sequences we don't want fractionified (YYYY/YYYY and MM/YYYY).
	$marker         = $this->components['escapeMarker'];
	$escaped        = '$1' . $marker . '$2$3$4';
	$textnode->data = preg_replace( $this->regex['smartFractionsEscapeYYYY/YYYY'], $escaped, $textnode->data );
	$textnode->data = preg_replace( $this->regex['smartFractionsEscapeMM/YYYY'], $escaped, $textnode->data );

	// Wrap numerator and denominator in <sup>/<sub>, adding class attributes only when classes are configured.
	$numerator_attr   = empty( $this->css_classes['numerator'] ) ? '' : ' class="' . $this->css_classes['numerator'] . '"';
	$denominator_attr = empty( $this->css_classes['denominator'] ) ? '' : ' class="' . $this->css_classes['denominator'] . '"';
	$fraction         = '<sup' . $numerator_attr . '>$1</sup>' . $this->chr['fractionSlash'] . '<sub' . $denominator_attr . '>$2</sub>$3';
	$textnode->data   = preg_replace( $this->regex['smartFractionsReplacement'], $fraction, $textnode->data );

	// Remove the escape markers again.
	$textnode->data = str_replace( $marker, '', $textnode->data );
}
/**
* Apply smart ordinal suffix (if enabled).
*
* Call before style_numbers.
*
* @param \DOMText $textnode The content node.
*/
function smart_ordinal_suffix( \DOMText $textnode ) {
	if ( ! empty( $this->settings['smartOrdinalSuffix'] ) ) {
		// The class attribute is only emitted when an ordinal class is configured.
		$class_attr = '';
		if ( ! empty( $this->css_classes['ordinal'] ) ) {
			$class_attr = ' class="' . $this->css_classes['ordinal'] . '"';
		}

		// The second capture group is wrapped in a <sup> element.
		$textnode->data = preg_replace( $this->regex['smartOrdinalSuffix'], '$1<sup' . $class_attr . '>$2</sup>', $textnode->data );
	}
}
/**
* Prevent single character words from being alone (if enabled).
*
* @param \DOMText $textnode The content node.
*/
function single_character_word_spacing( \DOMText $textnode ) {
	if ( empty( $this->settings['singleCharacterWordSpacing'] ) ) {
		return;
	}

	// Temporarily pad the text with the neighbouring characters to give the pattern some context.
	$leading = $this->get_prev_chr( $textnode );
	if ( '' !== $leading ) {
		$textnode->data = $leading . $textnode->data;
	}
	$trailing = $this->get_next_chr( $textnode );
	if ( '' !== $trailing ) {
		$textnode->data = $textnode->data . $trailing;
	}

	// Append a no-break space to whatever the pattern captured in its first two groups.
	$textnode->data = preg_replace( $this->regex['singleCharacterWordSpacing'], '$1$2' . $this->chr['noBreakSpace'], $textnode->data );

	// Strip the padding again, using the string functions for the detected encoding.
	$str = $this->str_functions[ mb_detect_encoding( $textnode->data, $this->encodings, true ) ];
	if ( '' !== $leading ) {
		$length         = $str['strlen']( $textnode->data );
		$textnode->data = $str['substr']( $textnode->data, 1, $length );
	}
	if ( '' !== $trailing ) {
		$length         = $str['strlen']( $textnode->data );
		$textnode->data = $str['substr']( $textnode->data, 0, $length - 1 );
	}
}
/**
* Apply spacing around dashes (if enabled).
*
* @param \DOMText $textnode The content node.
*/
function dash_spacing( \DOMText $textnode ) {
	if ( empty( $this->settings['dashSpacing'] ) ) {
		return;
	}

	$interval_space      = $this->chr['intervalDashSpace'];
	$parenthetical_space = $this->chr['parentheticalDashSpace'];

	// Pattern name => space character placed on both sides of the match (applied in this order).
	$spacing = array(
		'dashSpacingEmDash'            => $interval_space,
		'dashSpacingParentheticalDash' => $parenthetical_space,
		'dashSpacingIntervalDash'      => $interval_space,
	);
	foreach ( $spacing as $pattern => $space ) {
		$textnode->data = preg_replace( $this->regex[ $pattern ], $space . '$1$2' . $space, $textnode->data );
	}
}
/**
* Collapse spaces (if enabled).
*
* @param \DOMText $textnode The content node.
*/
function space_collapse( \DOMText $textnode ) {
	if ( empty( $this->settings['spaceCollapse'] ) ) {
		return;
	}

	// Applied in order:
	// - normal spacing collapses to a single space,
	// - a run containing a non-breakable space collapses to a single non-breakable space,
	// - any other spacing is replaced with the first unusual space character that occurs.
	$steps = array(
		'spaceCollapseNormal'       => ' ',
		'spaceCollapseNonBreakable' => $this->chr['noBreakSpace'],
		'spaceCollapseOther'        => '$1',
	);
	foreach ( $steps as $pattern => $replacement ) {
		$textnode->data = preg_replace( $this->regex[ $pattern ], $replacement, $textnode->data );
	}

	// No preceding character is taken to mean this is the first text in a block-level element:
	// remove all spacing at its beginning.
	$starts_block = ( '' === $this->get_prev_chr( $textnode ) );
	if ( $starts_block ) {
		$textnode->data = preg_replace( $this->regex['spaceCollapseBlockStart'], '', $textnode->data );
	}
}
/**
* Prevent values being split from their units (if enabled).
*
* @param \DOMText $textnode The content node.
*/
function unit_spacing( \DOMText $textnode ) {
	if ( ! empty( $this->settings['unitSpacing'] ) ) {
		// Join the two captured parts with a narrow no-break space.
		$joined         = '$1' . $this->chr['noBreakNarrowSpace'] . '$2';
		$textnode->data = preg_replace( $this->regex['unitSpacingUnitPattern'], $joined, $textnode->data );
	}
}
/**
* Add a narrow no-break space before
* - exclamation mark (!)
* - question mark (?)
* - semicolon (;)
* - colon (:)
*
* If there already is a space there, it is replaced.
*
* @param \DOMText $textnode The content node.
*/
function french_punctuation_spacing( \DOMText $textnode ) {
	if ( empty( $this->settings['frenchPunctuationSpacing'] ) ) {
		return;
	}

	$narrow = $this->chr['noBreakNarrowSpace'];
	$full   = $this->chr['noBreakSpace'];

	// Ordered ( regex key, replacement ) pairs, applied one after the other.
	$spacing_rules = array(
		array( 'frenchPunctuationSpacingNarrow', '$1' . $narrow . '$3$4' ),
		array( 'frenchPunctuationSpacingFull', '$1' . $full . '$3$4' ),
		array( 'frenchPunctuationSpacingSemicolon', '$1' . $narrow . '$3$4' ),
		array( 'frenchPunctuationSpacingOpeningQuote', '$1$2' . $narrow . '$4' ),
	);

	foreach ( $spacing_rules as $rule ) {
		$textnode->data = preg_replace( $this->regex[ $rule[0] ], $rule[1], $textnode->data );
	}
}
/**
* Wrap hard hyphens with zero-width spaces (if enabled).
*
* @param array $parsed_text_tokens The tokenized content of a textnode.
*/
function wrap_hard_hyphens( array $parsed_text_tokens ) {
	// Evaluate both switches once; they do not change while iterating.
	$hard_wrap    = ! empty( $this->settings['hyphenHardWrap'] );
	$smart_dashes = ! empty( $this->settings['smartDashes'] );

	if ( ! $hard_wrap && ! $smart_dashes ) {
		return $parsed_text_tokens;
	}

	$zero_width_space = $this->chr['zeroWidthSpace'];

	// Iterate by key instead of by reference so that no dangling reference
	// to the last token survives the loop.
	foreach ( $parsed_text_tokens as $index => $text_token ) {
		$value = $text_token['value'];

		if ( $hard_wrap ) {
			// Allow a line break after every hyphen variant, underscore and slash.
			$value = str_replace( $this->components['hyphensArray'], '-' . $zero_width_space, $value );
			$value = str_replace( '_', '_' . $zero_width_space, $value );
			$value = str_replace( '/', '/' . $zero_width_space, $value );
			$value = preg_replace( $this->regex['wrapHardHyphensRemoveEndingSpace'], '$1', $value );
		}

		if ( $smart_dashes ) {
			// Handled here because we need to know we are inside a word and not a URL.
			$value = str_replace( '-', $this->chr['hyphen'], $value );
		}

		$parsed_text_tokens[ $index ]['value'] = $value;
	}

	return $parsed_text_tokens;
}
/**
* Prevent widows (if enabled).
*
* @param \DOMText $textnode The content node.
*/
function dewidow( \DOMText $textnode ) {
	// Intervening inline tags may interfere with widow identification, but that is the price of using the parser.
	// They only interfere if they separate the widow from the surrounding whitespace.
	$enabled = ! empty( $this->settings['dewidow'] )
		&& ! empty( $this->settings['dewidowMaxPull'] )
		&& ! empty( $this->settings['dewidowMaxLength'] );

	if ( ! $enabled ) {
		return;
	}

	// A non-empty next character means more text follows within the block,
	// so this node cannot hold the widow.
	if ( '' !== $this->get_next_chr( $textnode ) ) {
		return;
	}

	// This is the last text child of a block level element.
	$textnode->data = preg_replace_callback( $this->regex['dewidow'], array( $this, '_dewidow_callback' ), $textnode->data );
}
/**
* Callback function for de-widowing.
*
* @param array $widow Regex matching array.
* @return string
*/
private function _dewidow_callback( array $widow ) {
	$func = $this->str_functions[ mb_detect_encoding( $widow[0], $this->encodings, true ) ];

	$no_break_space = $this->chr['noBreakSpace'];
	$breakers       = array( $this->chr['zeroWidthSpace'], $this->chr['softHyphen'] );

	// If we are here, widows are being protected in some fashion. Widows should
	// therefore never be hyphenated or wrapped, so soft hyphens and zero-width
	// spaces are stripped from the widow and from whatever trails it.
	$widow['widow']    = str_replace( $breakers, '', $widow['widow'] ); // TODO: check if this can match here.
	$widow['trailing'] = preg_replace( "/\s+/{$func['u']}", $no_break_space, $widow['trailing'] );
	$widow['trailing'] = str_replace( $breakers, '', $widow['trailing'] );

	// Result used by every "eject" path: the cleaned-up parts with the
	// original space between neighbor and widow left untouched.
	$unprotected = $widow['space_before'] . $widow['neighbor'] . $widow['space_between'] . $widow['widow'] . $widow['trailing'];

	// Eject if the widow's neighbor is preceded by a no-break space (the pulled text would be too long).
	// The haystack is the captured spacing and the needle is the no-break space; the previous
	// strstr() call had the two swapped and so only detected spacing that was itself a
	// substring of the no-break space.
	if ( '' === $widow['space_before'] || false !== strpos( $widow['space_before'], $no_break_space ) ) {
		return $unprotected;
	}

	// Eject if the neighbor or the widow is longer than the configured maximum.
	if ( $func['strlen']( $widow['neighbor'] ) > $this->settings['dewidowMaxPull'] ||
		$func['strlen']( $widow['widow'] ) > $this->settings['dewidowMaxLength'] ) {
		return $unprotected;
	}

	// Never replace thin and hair spaces with a no-break space.
	if ( in_array( $widow['space_between'], array( $this->chr['thinSpace'], $this->chr['hairSpace'] ), true ) ) {
		return $unprotected;
	}

	// Let's protect some widows!
	return $widow['space_before'] . $widow['neighbor'] . $no_break_space . $widow['widow'] . $widow['trailing'];
}
/**
* Wrap URL parts with zero-width spaces (if enabled).
*
* @param array $parsed_text_tokens The tokenized content of a textnode.
*/
function wrap_urls( array $parsed_text_tokens ) {
	if ( empty( $this->settings['urlWrap'] ) || empty( $this->settings['urlMinAfterWrap'] ) ) {
		return $parsed_text_tokens;
	}

	$zero_width_space = $this->chr['zeroWidthSpace'];
	$min_after_wrap   = $this->settings['urlMinAfterWrap'];

	// Test for and parse URLs. Tokens are addressed by key rather than by
	// reference so that no dangling references survive the loops.
	foreach ( $parsed_text_tokens as $token_key => $text_token ) {
		if ( ! preg_match( $this->regex['wrapUrlsPattern'], $text_token['value'], $url_match ) ) {
			continue;
		}

		// $url_match['schema'] holds "http://".
		// $url_match['domain'] holds "subdomains.domain.tld".
		// $url_match['path'] holds the path after the domain.
		// NOTE(review): index 1 is presumably the same group as 'schema' - confirm against the pattern.
		$http = ( $url_match['schema'] ) ? $url_match[1] . $zero_width_space : '';

		// This is a hack, but it works: the domain parts are wrapped up like a
		// list of word tokens so that the hyphenator can process them.
		$domain_parts      = preg_split( $this->regex['wrapUrlsDomainParts'], $url_match['domain'], -1, PREG_SPLIT_DELIM_CAPTURE );
		$parsed_words_like = array();
		foreach ( $domain_parts as $part_key => $domain_part ) {
			$parsed_words_like[ $part_key ] = array( 'value' => $domain_part );
		}
		$parsed_words_like = $this->do_hyphenate( $parsed_words_like );

		// Recombine the domain, turning hyphenation points into break opportunities.
		$domain = '';
		foreach ( $parsed_words_like as $part_key => $parsed_word ) {
			// Swap out each soft hyphen for a zero-width space.
			$domain_part = str_replace( $this->chr['softHyphen'], $zero_width_space, $parsed_word['value'] );

			// Single-byte parts after the first also get a zero-width space in front
			// (the original comment names periods and hyphens as the intended targets).
			if ( $part_key > 0 && 1 === strlen( $domain_part ) ) {
				$domain_part = $zero_width_space . $domain_part;
			}

			$domain .= $domain_part;
		}

		// Break up the URL path into individual bytes and allow a break before each
		// one, except the first and the final few (governed by urlMinAfterWrap).
		// NOTE(review): str_split() is byte-wise; this assumes the URL pattern only
		// admits single-byte characters in the path - confirm.
		$path_parts = str_split( $url_match['path'], 1 );
		$path_count = count( $path_parts );
		$path       = '';
		foreach ( $path_parts as $index => $path_part ) {
			if ( 0 === $index || $path_count - $index < $min_after_wrap ) {
				$path .= $path_part;
			} else {
				$path .= $zero_width_space . $path_part;
			}
		}

		$parsed_text_tokens[ $token_key ]['value'] = $http . $domain . $path;
	}

	return $parsed_text_tokens;
}
/**
* Wrap email parts with zero-width spaces (if enabled).
*
* @param array $parsed_text_tokens The tokenized content of a textnode.
*/
function wrap_emails( array $parsed_text_tokens ) {
	if ( empty( $this->settings['emailWrap'] ) ) {
		return $parsed_text_tokens;
	}

	// The first capture group of the replace pattern is kept and followed by a zero-width space.
	$replacement = '$1' . $this->chr['zeroWidthSpace'];

	// Test for and process email addresses. Tokens are addressed by key rather
	// than by reference so that no dangling reference survives the loop.
	foreach ( $parsed_text_tokens as $index => $text_token ) {
		if ( preg_match( $this->regex['wrapEmailsMatchEmails'], $text_token['value'] ) ) {
			$parsed_text_tokens[ $index ]['value'] = preg_replace( $this->regex['wrapEmailsReplaceEmails'], $replacement, $text_token['value'] );
		}
	}

	return $parsed_text_tokens;
}
/**
* Wraps words of all caps (may include numbers) in <span class="caps"> if enabled.
*
* Call before style_numbers(). Only call if you are certain that no HTML tags have been
* injected containing capital letters.
*
* @param \DOMText $textnode The content node.
*/
function style_caps( \DOMText $textnode ) {
	if ( ! empty( $this->settings['styleCaps'] ) ) {
		// Wrap the first capture group of the caps pattern in the configured CSS class.
		$wrapped         = '<span class="' . $this->css_classes['caps'] . '">$1</span>';
		$textnode->data = preg_replace( $this->regex['styleCaps'], $wrapped, $textnode->data );
	}
}
/**
* Replace the given node with HTML content. Uses the HTML5 parser.
*
* @param \DOMNode $node The node to replace.
* @param string $content The HTML fragment used to replace the node.
*
* @return \DOMNode|array An array of \DOMNode containing the new nodes or the old \DOMNode if the replacement failed.
*/
function replace_node_with_html( \DOMNode $node, $content ) {
	$parent = $node->parentNode; // @codingStandardsIgnoreLine.
	if ( empty( $parent ) ) {
		return $node; // abort early to save cycles.
	}

	// Fall back to the untouched node unless the replacement succeeds.
	$result = $node;

	set_error_handler( array( $this, 'handle_parsing_errors' ) );

	try {
		$html_fragment = $this->get_html5_parser()->loadHTMLFragment( $content );

		if ( ! empty( $html_fragment ) ) {
			$imported_fragment = $node->ownerDocument->importNode( $html_fragment, true ); // @codingStandardsIgnoreLine.

			if ( ! empty( $imported_fragment ) ) {
				// Save the children of the imported DOMDocumentFragment before replacement.
				$children = nodelist_to_array( $imported_fragment->childNodes ); // @codingStandardsIgnoreLine.

				if ( false !== $parent->replaceChild( $imported_fragment, $node ) ) {
					// Success! We return the saved array of DOMNodes as
					// $imported_fragment is just an empty DOMDocumentFragment now.
					$result = $children;
				}
			}
		}
	} catch ( \Exception $e ) {
		// The parser and the DOM calls can throw; make sure the temporary error
		// handler does not stay installed before the exception propagates.
		// (Catch-and-rethrow is used instead of "finally" to stay compatible
		// with PHP versions that predate it.)
		restore_error_handler();
		throw $e;
	}

	restore_error_handler();

	return $result;
}
/**
* Wraps numbers in <span class="numbers"> (even numbers that appear inside a word,
* i.e. A9 becomes A<span class="numbers">9</span>), if enabled.
*
* Call after style_caps so A9 becomes <span class="caps">A<span class="numbers">9</span></span>.
* Call after smart_fractions and smart_ordinal_suffix.
* Only call if you are certain that no html tags have been injected containing numbers.
*
* @param \DOMText $textnode The content node.
*/
function style_numbers( \DOMText $textnode ) {
	if ( ! empty( $this->settings['styleNumbers'] ) ) {
		// Wrap the first capture group of the numbers pattern in the configured CSS class.
		$wrapped         = '<span class="' . $this->css_classes['numbers'] . '">$1</span>';
		$textnode->data = preg_replace( $this->regex['styleNumbers'], $wrapped, $textnode->data );
	}
}
/**
* Wraps hanging punctuation in <span class="pull-*"> and <span class="push-*">, if enabled.
*
* @param \DOMText $textnode The content node.
*/
function style_hanging_punctuation( \DOMText $textnode ) {
	if ( empty( $this->settings['styleHangingPunctuation'] ) ) {
		return;
	}

	// We need the block level parent and its first text node.
	$block     = $this->get_block_parent( $textnode );
	$firstnode = ! empty( $block ) ? $this->get_first_textnode( $block ) : null;

	// The character following this node is temporarily appended to give the
	// patterns context; it is stripped again at the end.
	$next_character = $this->get_next_chr( $textnode );
	$has_context    = ( '' !== $next_character );
	if ( $has_context ) {
		$textnode->data = $textnode->data . $next_character;
	}

	// Markup fragments shared by the replacements below.
	$push_double = '<span class="' . $this->css_classes['push-double'] . '"></span>' . $this->chr['zeroWidthSpace'];
	$pull_double = '<span class="' . $this->css_classes['pull-double'] . '">';
	$push_single = '<span class="' . $this->css_classes['push-single'] . '"></span>' . $this->chr['zeroWidthSpace'];
	$pull_single = '<span class="' . $this->css_classes['pull-single'] . '">';

	// Punctuation inside the text: push span, zero-width space, then pull span.
	$textnode->data = preg_replace( $this->regex['styleHangingPunctuationDouble'], '$1' . $push_double . $pull_double . '$2</span>$3', $textnode->data );
	$textnode->data = preg_replace( $this->regex['styleHangingPunctuationSingle'], '$1' . $push_single . $pull_single . '$2</span>$3', $textnode->data );

	// Initial punctuation: the push span is omitted when there is no block
	// parent or when this is the first text node of the block.
	$is_first       = empty( $block ) || $firstnode === $textnode;
	$initial_double = ( $is_first ? '' : $push_double ) . $pull_double . '$1</span>$2';
	$initial_single = ( $is_first ? '' : $push_single ) . $pull_single . '$1</span>$2';

	$textnode->data = preg_replace( $this->regex['styleHangingPunctuationInitialDouble'], $initial_double, $textnode->data );
	$textnode->data = preg_replace( $this->regex['styleHangingPunctuationInitialSingle'], $initial_single, $textnode->data );

	// Remove the context character again (the last character of the text).
	if ( $has_context ) {
		$func            = $this->str_functions[ mb_detect_encoding( $textnode->data, $this->encodings, true ) ];
		$textnode->data = $func['substr']( $textnode->data, 0, $func['strlen']( $textnode->data ) - 1 );
	}
}
/**
* Wraps ampersands in <span class="amp"> (i.e. H&amp;J becomes H<span class="amp">&amp;</span>J),
* if enabled.
*
* Call after style_caps so H&amp;J becomes <span class="caps">H<span class="amp">&amp;</span>J</span>.
* Note that all standalone ampersands were previously converted to &amp;.
* Only call if you are certain that no html tags have been injected containing "&amp;".
*
* @param \DOMText $textnode The content node.
*/
function style_ampersands( \DOMText $textnode ) {
	if ( ! empty( $this->settings['styleAmpersands'] ) ) {
		// Wrap the first capture group of the ampersand pattern in the configured CSS class.
		$wrapped         = '<span class="' . $this->css_classes['amp'] . '">$1</span>';
		$textnode->data = preg_replace( $this->regex['styleAmpersands'], $wrapped, $textnode->data );
	}
}
/**
* Styles initial quotes and guillemets (if enabled).
*
* @param \DOMText $textnode The content node.
* @param boolean $is_title Default false.
*/
function style_initial_quotes( \DOMText $textnode, $is_title = false ) {
	if ( empty( $this->settings['styleInitialQuotes'] ) || empty( $this->settings['initialQuoteTags'] ) ) {
		return;
	}

	// Only the first text in a block level element is of interest.
	if ( '' !== $this->get_prev_chr( $textnode ) ) {
		return;
	}

	$func            = $this->str_functions[ mb_detect_encoding( $textnode->data, $this->encodings, true ) ];
	$first_character = $func['substr']( $textnode->data, 0, 1 );

	// Characters styled as single quotes (a comma is treated like one) ...
	$single_quotes = array(
		"'",
		$this->chr['singleQuoteOpen'],
		$this->chr['singleLow9Quote'],
		',',
	);
	// ... and characters styled as double quotes or guillemets.
	$double_quotes = array(
		'"',
		$this->chr['doubleQuoteOpen'],
		$this->chr['guillemetOpen'],
		$this->chr['guillemetClose'],
		$this->chr['doubleLow9Quote'],
	);

	if ( in_array( $first_character, $single_quotes, true ) ) {
		$span_class = 'quo';
	} elseif ( in_array( $first_character, $double_quotes, true ) ) {
		$span_class = 'dquo';
	} else {
		return; // not a quote character.
	}

	$block_level_parent = $this->get_block_parent( $textnode );
	$block_level_parent = isset( $block_level_parent->tagName ) ? $block_level_parent->tagName : false; // @codingStandardsIgnoreLine.
	if ( $is_title ) {
		// Assume page title is h2.
		$block_level_parent = 'h2';
	}

	// Only style quotes inside tags that are enabled in the settings.
	if ( ! $block_level_parent || ! isset( $this->settings['initialQuoteTags'][ $block_level_parent ] ) ) {
		return;
	}

	$remainder       = $func['substr']( $textnode->data, 1, $func['strlen']( $textnode->data ) );
	$textnode->data = '<span class="' . $this->css_classes[ $span_class ] . '">' . $first_character . '</span>' . $remainder;
}
/**
* Inject the PatGen segments pattern into the PatGen words pattern.
*
* @param array $word_pattern Required.
* @param array $segment_pattern Required.
* @param number $segment_position Required.
* @param number $segment_length Required.
*/
function hyphenation_pattern_injection( array $word_pattern, array $segment_pattern, $segment_position, $segment_length ) {
	// A segment of N characters carries N + 1 pattern values, hence the inclusive upper bound.
	for ( $offset = 0; $offset <= $segment_length; $offset++ ) {
		$target    = $segment_position + $offset;
		$current   = $word_pattern[ $target ];
		$candidate = $segment_pattern[ $offset ];

		// Keep whichever value is numerically larger; ties keep the existing word pattern value.
		$word_pattern[ $target ] = ( intval( $candidate ) > intval( $current ) ) ? $candidate : $current;
	}

	return $word_pattern;
}
/**
* Hyphenate given text fragment (if enabled).
*
* Actual work is done in do_hyphenate().
*
* @param array $parsed_text_tokens Filtered to words.
* @param boolean $is_title Flag to indicate title fragments. Optional. Default false.
* @param \DOMText $textnode The textnode corresponding to the $parsed_text_tokens. Optional. Default null.
*/
function hyphenate( $parsed_text_tokens, $is_title = false, \DOMText $textnode = null ) {
	if ( empty( $this->settings['hyphenation'] ) ) {
		return $parsed_text_tokens; // abort.
	}

	// Find out whether the text node lives inside a heading tag.
	$is_heading = false;
	if ( ! empty( $textnode ) && ! empty( $textnode->parentNode ) ) { // @codingStandardsIgnoreLine.
		$block    = $this->get_block_parent( $textnode );
		$tag_name = isset( $block->tagName ) ? $block->tagName : false; // @codingStandardsIgnoreLine.

		$is_heading = $tag_name && isset( $this->heading_tags[ $tag_name ] );
	}

	// Titles and headings are left alone unless title hyphenation is enabled.
	$skip_titles = empty( $this->settings['hyphenateTitle'] );
	if ( $skip_titles && ( $is_title || $is_heading ) ) {
		return $parsed_text_tokens; // abort.
	}

	// The actual work lives in a separate method so that it can run without the
	// check for settings['hyphenation'] - such as with URL wrapping.
	return $this->do_hyphenate( $parsed_text_tokens );
}
/**
* Hyphenate hyphenated compound words (if enabled).
*
* Calls hyphenate() on the component words.
*
* @param array $parsed_text_tokens Filtered to compound words.
* @param boolean $is_title Flag to indicate title fragments. Optional. Default false.
* @param \DOMText $textnode The textnode corresponding to the $parsed_text_tokens. Optional. Default null.
*/
function hyphenate_compounds( array $parsed_text_tokens, $is_title = false, \DOMText $textnode = null ) {
	if ( empty( $this->settings['hyphenateCompounds'] ) ) {
		return $parsed_text_tokens; // abort.
	}

	// Hyphenate compound words.
	foreach ( $parsed_text_tokens as $key => $word_token ) {
		// Split at hyphens, keeping the hyphens themselves as components so that
		// the word can be reassembled verbatim afterwards.
		$component_words = array();
		foreach ( preg_split( '/(-)/', $word_token['value'], -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE ) as $word_part ) {
			$component_words[] = array( 'value' => $word_part );
		}

		// Reassemble starting from an empty string, so that a token without any
		// components yields '' (a reduce without an initial value produced null).
		$hyphenated = '';
		foreach ( $this->hyphenate( $component_words, $is_title, $textnode ) as $component ) {
			$hyphenated .= $component['value'];
		}

		$parsed_text_tokens[ $key ]['value'] = $hyphenated;
	}

	return $parsed_text_tokens;
}
/**
* Really hyphenate given text fragment.
*
* @param array $parsed_text_tokens Filtered to words.
* @return array The hyphenated text token.
*/
function do_hyphenate( array $parsed_text_tokens ) {
	// Without these settings there is nothing sensible to do.
	if ( empty( $this->settings['hyphenMinLength'] ) ||
		empty( $this->settings['hyphenMinBefore'] ) ||
		! isset( $this->settings['hyphenationPatternMaxSegment'] ) ||
		! isset( $this->settings['hyphenationPatternExceptions'] ) ||
		! isset( $this->settings['hyphenationPattern'] ) ) {
		return $parsed_text_tokens;
	}

	// Make sure we have the full exceptions list; it is built once and cached in the settings.
	if ( ! isset( $this->settings['hyphenationExceptions'] ) ) {
		$exceptions = array();
		if ( $this->settings['hyphenationPatternExceptions'] || ! empty( $this->settings['hyphenationCustomExceptions'] ) ) {
			if ( isset( $this->settings['hyphenationCustomExceptions'] ) ) {
				// Merges custom and language specific word hyphenations.
				$exceptions = array_merge( $this->settings['hyphenationCustomExceptions'], $this->settings['hyphenationPatternExceptions'] );
			} else {
				$exceptions = $this->settings['hyphenationPatternExceptions'];
			}
		}
		$this->settings['hyphenationExceptions'] = $exceptions;
	}

	// Loop-invariant settings. 'hyphenMinAfter' is not covered by the guard above,
	// so a missing value is treated as 0 (what the arithmetic below did implicitly)
	// instead of raising a notice for every character.
	$min_length  = $this->settings['hyphenMinLength'];
	$min_before  = $this->settings['hyphenMinBefore'];
	$min_after   = isset( $this->settings['hyphenMinAfter'] ) ? $this->settings['hyphenMinAfter'] : 0;
	$max_segment = $this->settings['hyphenationPatternMaxSegment'];
	$patterns    = $this->settings['hyphenationPattern'];

	// Tokens are addressed by key rather than by reference so that no dangling
	// reference survives the loop.
	foreach ( $parsed_text_tokens as $token_key => $text_token ) {
		$word = $text_token['value'];

		// Quickly reference string functions according to encoding.
		$func = $this->str_functions[ mb_detect_encoding( $word, $this->encodings, true ) ];
		if ( empty( $func ) || empty( $func['strlen'] ) ) {
			continue; // unknown encoding, abort.
		}

		$word_length = $func['strlen']( $word );
		if ( $word_length < $min_length ) {
			continue;
		}

		$the_key = $func['strtolower']( $word );

		// If this is a capitalized word, and settings do not allow hyphenation of such, abort!
		// Note: This is different than uppercase words, where we are looking for title case.
		if ( empty( $this->settings['hyphenateTitleCase'] ) && $func['substr']( $the_key, 0, 1 ) !== $func['substr']( $word, 0, 1 ) ) {
			continue;
		}

		$word_pattern = null;

		// Give exceptions preference.
		if ( isset( $this->settings['hyphenationExceptions'][ $the_key ] ) ) {
			// Derive the pattern from the hyphenated exception: a '9' marks a break
			// before the character that follows a hyphen, everything else is '0'.
			// Working on the pattern (not the word) keeps any contextually important capitalization.
			$lowercase_hyphened_word        = $this->settings['hyphenationExceptions'][ $the_key ];
			$lowercase_hyphened_word_parts  = $func['str_split']( $lowercase_hyphened_word, 1 );
			$lowercase_hyphened_word_length = $func['strlen']( $lowercase_hyphened_word );

			$word_pattern = array();
			for ( $i = 0; $i < $lowercase_hyphened_word_length; $i++ ) {
				if ( '-' === $lowercase_hyphened_word_parts[ $i ] ) {
					$word_pattern[] = '9';
					$i++; // the '9' stands in for the character after the hyphen.
				} else {
					$word_pattern[] = '0';
				}
			}
			$word_pattern[] = '0'; // For consistent length with the other word patterns.
		}

		if ( null === $word_pattern ) {
			// Start with a series of zeros one entry longer than the word.
			$word_pattern = array();
			for ( $i = 0; $i < $word_length + 1; $i++ ) {
				$word_pattern[] = '0';
			}

			// Grab all possible segments of the word of length 1 through the maximum
			// pattern segment length and merge in every matching pattern.
			for ( $segment_length = 1; ( $segment_length <= $word_length ) && ( $segment_length <= $max_segment ); $segment_length++ ) {
				for ( $segment_position = 0; $segment_position + $segment_length <= $word_length; $segment_position++ ) {
					$segment = $func['strtolower']( $func['substr']( $word, $segment_position, $segment_length ) );

					// Patterns anchored at the beginning of the word.
					if ( 0 === $segment_position && isset( $patterns['begin'][ $segment ] ) ) {
						$segment_pattern = $func['str_split']( $patterns['begin'][ $segment ], 1 );
						$word_pattern    = $this->hyphenation_pattern_injection( $word_pattern, $segment_pattern, $segment_position, $segment_length );
					}

					// Patterns anchored at the end of the word.
					if ( $segment_position + $segment_length === $word_length && isset( $patterns['end'][ $segment ] ) ) {
						$segment_pattern = $func['str_split']( $patterns['end'][ $segment ], 1 );
						$word_pattern    = $this->hyphenation_pattern_injection( $word_pattern, $segment_pattern, $segment_position, $segment_length );
					}

					// Patterns that apply anywhere in the word.
					if ( isset( $patterns['all'][ $segment ] ) ) {
						$segment_pattern = $func['str_split']( $patterns['all'][ $segment ], 1 );
						$word_pattern    = $this->hyphenation_pattern_injection( $word_pattern, $segment_pattern, $segment_position, $segment_length );
					}
				}
			}
		}

		// Add soft hyphens based on $word_pattern: an odd value allows a break before
		// the character, provided enough characters remain on both sides.
		$word_parts      = $func['str_split']( $word, 1 );
		$hyphenated_word = '';
		for ( $i = 0; $i < $word_length; $i++ ) {
			if ( is_odd( intval( $word_pattern[ $i ] ) ) && ( $i >= $min_before ) && ( $i < $word_length - $min_after ) ) {
				$hyphenated_word .= $this->chr['softHyphen'];
			}
			$hyphenated_word .= $word_parts[ $i ];
		}

		$parsed_text_tokens[ $token_key ]['value'] = $hyphenated_word;
	}

	return $parsed_text_tokens;
}
/**
* Returns the nearest block-level parent.
*
* @param \DOMNode $element The node to get the containing block-level tag.
*
* @return \DOMElement
*/
function get_block_parent( \DOMNode $element ) {
	$candidate = $element->parentNode; // @codingStandardsIgnoreLine.

	// Walk up the tree until a block level tag is found or there is no
	// further element to climb to.
	while ( isset( $candidate->tagName ) ) { // @codingStandardsIgnoreLine.
		if ( isset( $this->block_tags[ $candidate->tagName ] ) ) { // @codingStandardsIgnoreLine.
			break; // found a block level tag.
		}

		$ancestor = $candidate->parentNode; // @codingStandardsIgnoreLine.
		if ( empty( $ancestor ) || ! ( $ancestor instanceof \DOMElement ) ) {
			break; // top of the element tree reached.
		}

		$candidate = $ancestor;
	}

	return $candidate;
}
/**
* Retrieve a unique hash value for the current settings.
*
* @param number $max_length The maximum number of bytes returned.
* @return string A binary hash value for the current settings limited to $max_length.
*/
public function get_settings_hash( $max_length = 8 ) {
	// Raw (binary) MD5 digest of the JSON-serialized settings.
	$serialized = json_encode( $this->settings );
	$digest     = md5( $serialized, true );

	// Truncate only when the digest is longer than requested.
	return ( $max_length < strlen( $digest ) ) ? substr( $digest, 0, $max_length ) : $digest;
}
/**
* Retrieve the HTML5 parser instance.
*
* @return \Masterminds\HTML5
*/
public function get_html5_parser() {
	// Reuse the parser if it has been created already.
	if ( isset( $this->html5_parser ) ) {
		return $this->html5_parser;
	}

	// First use: create the parser and keep it on the instance.
	$this->html5_parser = new \Masterminds\HTML5( array( 'disable_html_ns' => true ) );

	return $this->html5_parser;
}
/**
* Retrieve the text parser instance.
*
* @return \PHP_Typography\Parse_Text
*/
public function get_text_parser() {
	// Reuse the parser if it has been created already.
	if ( isset( $this->text_parser ) ) {
		return $this->text_parser;
	}

	// First use: create the parser with the configured encodings and keep it on the instance.
	$this->text_parser = new Parse_Text( $this->encodings );

	return $this->text_parser;
}
/**
* Retrieve the list of valid hyphenation languages.
* The language names are translation-ready but not translated yet.
*
* @return array An array in the form of ( LANG_CODE => LANGUAGE ).
*/
public static function get_hyphenation_languages() {
	// Look the languages up in the "lang" directory next to this file.
	$language_directory = __DIR__ . '/lang/';

	return \PHP_Typography\get_language_plugin_list( $language_directory, 'patgenLanguage' );
}
/**
* Retrieve the list of valid diacritic replacement languages.
* The language names are translation-ready but not translated yet.
*
* @return array An array in the form of ( LANG_CODE => LANGUAGE ).
*/
public static function get_diacritic_languages() {
	// Look the languages up in the "diacritics" directory next to this file.
	$language_directory = __DIR__ . '/diacritics/';

	return \PHP_Typography\get_language_plugin_list( $language_directory, 'diacriticLanguage' );
}
}