3386 lines
130 KiB
PHP
3386 lines
130 KiB
PHP
<?php
|
|
/**
|
|
* This file is part of wp-Typography.
|
|
*
|
|
* Copyright 2014-2016 Peter Putzer.
|
|
* Copyright 2012-2013 Marie Hogebrandt.
|
|
* Copyright 2009-2011 KINGdesk, LLC.
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License
|
|
* as published by the Free Software Foundation; either version 2
|
|
* of the License, or (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
|
*
|
|
* ***
|
|
*
|
|
* @package wpTypography/PHPTypography
|
|
* @license http://www.gnu.org/licenses/gpl-2.0.html
|
|
*/
|
|
|
|
namespace PHP_Typography;
|
|
|
|
/**
|
|
* A few utility functions.
|
|
*/
|
|
require_once __DIR__ . '/php-typography-functions.php'; // @codeCoverageIgnore
|
|
|
|
/**
|
|
* HTML5-PHP - a DOM-based HTML5 parser
|
|
*/
|
|
require_once dirname( __DIR__ ) . '/vendor/Masterminds/HTML5.php'; // @codeCoverageIgnore
|
|
require_once dirname( __DIR__ ) . '/vendor/Masterminds/HTML5/autoload.php'; // @codeCoverageIgnore
|
|
|
|
/**
|
|
* Parses HTML5 (or plain text) and applies various typographic fixes to the text.
|
|
*
|
|
* If used with multibyte language, UTF-8 encoding is required.
|
|
*
|
|
* Portions of this code have been inspired by:
|
|
* - typogrify (https://code.google.com/p/typogrify/)
|
|
* - WordPress code for wptexturize (https://developer.wordpress.org/reference/functions/wptexturize/)
|
|
* - PHP SmartyPants Typographer (https://michelf.ca/projects/php-smartypants/typographer/)
|
|
*
|
|
* @author Jeffrey D. King <jeff@kingdesk.com>
|
|
* @author Peter Putzer <github@mundschenk.at>
|
|
*/
|
|
class PHP_Typography {
|
|
|
|
/**
|
|
* A hashmap for various special characters.
|
|
*
|
|
* @var array
|
|
*/
|
|
public $chr = array();
|
|
|
|
/**
|
|
* A hashmap of settings for the various typographic options.
|
|
*
|
|
* @var array
|
|
*/
|
|
public $settings = array();
|
|
|
|
/**
|
|
* A custom parser for \DOMText to separate words, whitespace etc. for HTML injection.
|
|
*
|
|
* @var Parse_Text
|
|
*/
|
|
private $text_parser;
|
|
|
|
/**
|
|
* A DOM-based HTML5 parser.
|
|
*
|
|
* @var \Masterminds\HTML5
|
|
*/
|
|
private $html5_parser;
|
|
|
|
/**
|
|
* An array containing all self-closing HTML5 tags.
|
|
*
|
|
* @var array
|
|
*/
|
|
private $self_closing_tags = array();
|
|
|
|
/**
|
|
* An array of tags we should never touch.
|
|
*
|
|
* @var array
|
|
*/
|
|
private $inappropriate_tags = array();
|
|
|
|
/**
|
|
* An array of ( $tag => true ) for quick checking with `isset`.
|
|
*
|
|
* @var array
|
|
*/
|
|
private $heading_tags = array( 'h1' => true, 'h2' => true, 'h3' => true, 'h4' => true, 'h5' => true, 'h6' => true );
|
|
|
|
/**
|
|
* An array of encodings in detection order.
|
|
*
|
|
* @var array
|
|
*/
|
|
private $encodings = array( 'ASCII', 'UTF-8' );
|
|
|
|
/**
|
|
* A hash map for string functions according to encoding.
|
|
*
|
|
* @var array $encoding => array( 'strlen' => $function_name, ... ).
|
|
*/
|
|
private $str_functions = array(
|
|
'UTF-8' => array(),
|
|
'ASCII' => array(),
|
|
false => array(),
|
|
);
|
|
|
|
/**
|
|
* An array of various regex components (not complete patterns).
|
|
*
|
|
* @var array $components
|
|
*/
|
|
private $components = array();
|
|
|
|
/**
|
|
* An array of regex patterns.
|
|
*
|
|
* @var array $regex
|
|
*/
|
|
private $regex = array();
|
|
|
|
/**
|
|
* An array in the form of [ '$style' => [ 'open' => $chr, 'close' => $chr ] ]
|
|
*
|
|
* @var array
|
|
*/
|
|
private $quote_styles = array();
|
|
|
|
/**
|
|
* An array in the form of [ '$style' => [ 'parenthetical' => $chr, 'interval' => $chr ] ]
|
|
*
|
|
* @var array
|
|
*/
|
|
private $dash_styles = array();
|
|
|
|
/**
|
|
* An array in the form of [ '$tag' => true ]
|
|
*
|
|
* @var array
|
|
*/
|
|
private $block_tags = array();
|
|
|
|
/**
|
|
* An array of CSS classes that are added for ampersands, numbers etc that can be overridden in a subclass.
|
|
*
|
|
* @var array
|
|
*/
|
|
protected $css_classes = array(
|
|
'caps' => 'caps',
|
|
'numbers' => 'numbers',
|
|
'amp' => 'amp',
|
|
'quo' => 'quo',
|
|
'dquo' => 'dquo',
|
|
'pull-single' => 'pull-single',
|
|
'pull-double' => 'pull-double',
|
|
'push-single' => 'push-single',
|
|
'push-double' => 'push-double',
|
|
'numerator' => 'numerator',
|
|
'denominator' => 'denominator',
|
|
'ordinal' => 'ordinal',
|
|
);
|
|
|
|
/**
 * Set up a new PHP_Typography object.
 *
 * Configures the mbstring detection order and internal encoding and, unless
 * initialization is deferred, calls init().
 *
 * @param boolean $set_defaults If true, set default values for various properties. Defaults to true.
 * @param string  $init         Flag to control initialization. Valid inputs are 'now' and 'lazy'. Optional. Default 'now'.
 *                              Any value other than 'now' skips the call to init().
 */
function __construct( $set_defaults = true, $init = 'now' ) {

	// ASCII has to be first to have chance at detection.
	mb_detect_order( $this->encodings );

	// Used as the default encoding for mb_* functions.
	// Not sure if this is necessary - but error_log seems to have problems with the strings.
	mb_internal_encoding( 'UTF-8' );

	if ( 'now' === $init ) {
		$this->init( $set_defaults );
	}
}
|
|
|
|
/**
 * Load the given state.
 *
 * Nothing is modified unless every required key is present, so a partial
 * (e.g. stale or truncated) cache entry can never leave the object half-loaded.
 *
 * @param array $state The state array. Has to contain 'block_tags', 'chr', 'quote_styles', 'dash_styles', 'str_functions',
 *                     'components', 'regex', 'self_closing_tags', 'inappropriate_tags', 'css_classes', 'settings'.
 * @return boolean True if successful, false if $state is incomplete.
 */
function load_state( $state ) {
	// Each key is also the name of the property it is restored into.
	$required_keys = array(
		'block_tags',
		'chr',
		'quote_styles',
		'dash_styles',
		'str_functions',
		'components',
		'regex',
		'self_closing_tags',
		'inappropriate_tags',
		'css_classes',
		'settings',
	);

	// Validate everything first ('dash_styles' included) before touching any property.
	foreach ( $required_keys as $key ) {
		if ( ! isset( $state[ $key ] ) ) {
			return false;
		}
	}

	foreach ( $required_keys as $key ) {
		$this->$key = $state[ $key ];
	}

	return true;
}
|
|
|
|
/**
 * Retrieves the current state of the PHP_Typography object for caching.
 *
 * The returned array uses the property names as keys.
 *
 * @return array The state array.
 */
function save_state() {
	$snapshot   = array();
	$properties = array(
		'block_tags',
		'chr',
		'quote_styles',
		'dash_styles',
		'str_functions',
		'components',
		'regex',
		'self_closing_tags',
		'inappropriate_tags',
		'css_classes',
		'settings',
	);

	foreach ( $properties as $property ) {
		$snapshot[ $property ] = $this->$property;
	}

	return $snapshot;
}
|
|
|
|
/**
 * Initialize the PHP_Typography object.
 *
 * Fills the block tag lookup, the special character map, the predefined quote
 * and dash styles and the per-encoding string function tables, builds the regex
 * components and patterns, collects the HTML5 void tags and the tags that are
 * never processed, and optionally applies the default settings.
 *
 * @param boolean $set_defaults If true, set default values for various properties. Defaults to true.
 */
function init( $set_defaults = true ) {
	$html5_block_tags = array_filter(
		array_keys( \Masterminds\HTML5\Elements::$html5 ),
		function( $tag ) {
			return \Masterminds\HTML5\Elements::isA( $tag, \Masterminds\HTML5\Elements::BLOCK_TAG );
		}
	);

	// array_filter() preserves the numeric keys, so the additional tags have to be appended
	// with array_merge(): the array union operator (+) would silently drop every extra tag
	// whose index (0, 1 or 2) is still present in the filtered list.
	$this->block_tags = array_fill_keys(
		array_merge( $html5_block_tags, array( 'li', 'td', 'dt' ) ), // not included as "block tags" in current HTML5-PHP version.
		true
	);

	$this->chr['noBreakSpace']            = uchr( 160 );
	$this->chr['noBreakNarrowSpace']      = uchr( 160 ); // used in unit spacing - can be changed to 8239 via set_true_no_break_narrow_space.
	$this->chr['copyright']               = uchr( 169 );
	$this->chr['guillemetOpen']           = uchr( 171 );
	$this->chr['softHyphen']              = uchr( 173 );
	$this->chr['registeredMark']          = uchr( 174 );
	$this->chr['guillemetClose']          = uchr( 187 );
	$this->chr['multiplication']          = uchr( 215 );
	$this->chr['division']                = uchr( 247 );
	$this->chr['figureSpace']             = uchr( 8199 );
	$this->chr['thinSpace']               = uchr( 8201 );
	$this->chr['hairSpace']               = uchr( 8202 );
	$this->chr['zeroWidthSpace']          = uchr( 8203 );
	$this->chr['hyphen']                  = '-'; // should be uchr(8208), but IE6 chokes.
	$this->chr['noBreakHyphen']           = uchr( 8209 );
	$this->chr['enDash']                  = uchr( 8211 );
	$this->chr['emDash']                  = uchr( 8212 );
	$this->chr['parentheticalDash']       = uchr( 8212 ); // defined separate from emDash so it can be redefined in set_smart_dashes_style.
	$this->chr['intervalDash']            = uchr( 8211 ); // defined separate from enDash so it can be redefined in set_smart_dashes_style.
	$this->chr['parentheticalDashSpace']  = uchr( 8201 );
	$this->chr['intervalDashSpace']       = uchr( 8201 );
	$this->chr['singleQuoteOpen']         = uchr( 8216 );
	$this->chr['singleQuoteClose']        = uchr( 8217 );
	$this->chr['apostrophe']              = uchr( 8217 ); // defined separate from singleQuoteClose so quotes can be redefined in set_smart_quotes_language() without disrupting apostrophes.
	$this->chr['singleLow9Quote']         = uchr( 8218 );
	$this->chr['doubleQuoteOpen']         = uchr( 8220 );
	$this->chr['doubleQuoteClose']        = uchr( 8221 );
	$this->chr['doubleLow9Quote']         = uchr( 8222 );
	$this->chr['ellipses']                = uchr( 8230 );
	$this->chr['singlePrime']             = uchr( 8242 );
	$this->chr['doublePrime']             = uchr( 8243 );
	$this->chr['singleAngleQuoteOpen']    = uchr( 8249 );
	$this->chr['singleAngleQuoteClose']   = uchr( 8250 );
	$this->chr['fractionSlash']           = uchr( 8260 );
	$this->chr['soundCopyMark']           = uchr( 8471 );
	$this->chr['serviceMark']             = uchr( 8480 );
	$this->chr['tradeMark']               = uchr( 8482 );
	$this->chr['minus']                   = uchr( 8722 );
	$this->chr['leftCornerBracket']       = uchr( 12300 );
	$this->chr['rightCornerBracket']      = uchr( 12301 );
	$this->chr['leftWhiteCornerBracket']  = uchr( 12302 );
	$this->chr['rightWhiteCornerBracket'] = uchr( 12303 );

	// Predefined quote styles: [ $style => [ 'open' => $chr, 'close' => $chr ] ].
	$this->quote_styles = array(
		'doubleCurled'             => array(
			'open'  => uchr( 8220 ),
			'close' => uchr( 8221 ),
		),
		'doubleCurledReversed'     => array(
			'open'  => uchr( 8221 ),
			'close' => uchr( 8221 ),
		),
		'doubleLow9'               => array(
			'open'  => $this->chr['doubleLow9Quote'],
			'close' => uchr( 8221 ),
		),
		'doubleLow9Reversed'       => array(
			'open'  => $this->chr['doubleLow9Quote'],
			'close' => uchr( 8220 ),
		),
		'singleCurled'             => array(
			'open'  => uchr( 8216 ),
			'close' => uchr( 8217 ),
		),
		'singleCurledReversed'     => array(
			'open'  => uchr( 8217 ),
			'close' => uchr( 8217 ),
		),
		'singleLow9'               => array(
			'open'  => $this->chr['singleLow9Quote'],
			'close' => uchr( 8217 ),
		),
		'singleLow9Reversed'       => array(
			'open'  => $this->chr['singleLow9Quote'],
			'close' => uchr( 8216 ),
		),
		'doubleGuillemetsFrench'   => array(
			'open'  => $this->chr['guillemetOpen'] . $this->chr['noBreakNarrowSpace'],
			'close' => $this->chr['noBreakNarrowSpace'] . $this->chr['guillemetClose'],
		),
		'doubleGuillemets'         => array(
			'open'  => $this->chr['guillemetOpen'],
			'close' => $this->chr['guillemetClose'],
		),
		'doubleGuillemetsReversed' => array(
			'open'  => $this->chr['guillemetClose'],
			'close' => $this->chr['guillemetOpen'],
		),
		'singleGuillemets'         => array(
			'open'  => $this->chr['singleAngleQuoteOpen'],
			'close' => $this->chr['singleAngleQuoteClose'],
		),
		'singleGuillemetsReversed' => array(
			'open'  => $this->chr['singleAngleQuoteClose'],
			'close' => $this->chr['singleAngleQuoteOpen'],
		),
		'cornerBrackets'           => array(
			'open'  => $this->chr['leftCornerBracket'],
			'close' => $this->chr['rightCornerBracket'],
		),
		'whiteCornerBracket'       => array(
			'open'  => $this->chr['leftWhiteCornerBracket'],
			'close' => $this->chr['rightWhiteCornerBracket'],
		),
	);

	// Predefined dash styles, including the space character placed around each kind of dash.
	$this->dash_styles = array(
		'traditionalUS' => array(
			'parenthetical'      => $this->chr['emDash'],
			'interval'           => $this->chr['enDash'],
			'parentheticalSpace' => $this->chr['thinSpace'],
			'intervalSpace'      => $this->chr['thinSpace'],
		),
		'international' => array(
			'parenthetical'      => $this->chr['enDash'],
			'interval'           => $this->chr['enDash'],
			'parentheticalSpace' => ' ',
			'intervalSpace'      => $this->chr['hairSpace'],
		),
	);

	// Set up both UTF-8 and ASCII string functions.
	// UTF-8 first.
	$this->str_functions['UTF-8']['strlen']     = 'mb_strlen';
	$this->str_functions['UTF-8']['str_split']  = __NAMESPACE__ . '\mb_str_split';
	$this->str_functions['UTF-8']['strtolower'] = 'mb_strtolower';
	$this->str_functions['UTF-8']['substr']     = 'mb_substr';
	$this->str_functions['UTF-8']['u']          = 'u'; // unicode flag for regex.
	// Now ASCII.
	$this->str_functions['ASCII']['strlen']     = 'strlen';
	$this->str_functions['ASCII']['str_split']  = 'str_split';
	$this->str_functions['ASCII']['strtolower'] = 'strtolower';
	$this->str_functions['ASCII']['substr']     = 'substr';
	$this->str_functions['ASCII']['u']          = ''; // no regex flag needed.
	// All other encodings get the empty array.

	// Set up regex patterns (the components have to exist before the patterns are built).
	$this->initialize_components();
	$this->initialize_patterns();

	// Set up some arrays for quick HTML5 introspection.
	$this->self_closing_tags  = array_filter(
		array_keys( \Masterminds\HTML5\Elements::$html5 ),
		function( $tag ) {
			return \Masterminds\HTML5\Elements::isA( $tag, \Masterminds\HTML5\Elements::VOID_TAG );
		}
	);
	$this->inappropriate_tags = array( 'iframe', 'textarea', 'button', 'select', 'optgroup', 'option', 'map', 'style', 'head', 'title', 'script', 'applet', 'object', 'param' );

	if ( $set_defaults ) {
		$this->set_defaults();
	}
}
|
|
|
|
/**
 * (Re)set various options to their default values.
 *
 * Every setter listed below is invoked without arguments, in the listed order.
 */
function set_defaults() {
	$setters_by_group = array(
		// General attributes.
		'general'           => array(
			'set_tags_to_ignore',
			'set_classes_to_ignore',
			'set_ids_to_ignore',
		),

		// Smart characters.
		'smart_characters'  => array(
			'set_smart_quotes',
			'set_smart_quotes_primary',   // added in version 1.15.
			'set_smart_quotes_secondary', // added in version 1.15.
			'set_smart_dashes',
			'set_smart_dashes_style',
			'set_smart_ellipses',
			'set_smart_diacritics',
			'set_diacritic_language',
			'set_diacritic_custom_replacements',
			'set_smart_marks',
			'set_smart_ordinal_suffix',
			'set_smart_math',
			'set_smart_fractions',
			'set_smart_exponents',
		),

		// Smart spacing.
		'smart_spacing'     => array(
			'set_single_character_word_spacing',
			'set_fraction_spacing',
			'set_unit_spacing',
			'set_french_punctuation_spacing',
			'set_units',
			'set_dash_spacing',
			'set_dewidow',
			'set_max_dewidow_length',
			'set_max_dewidow_pull',
			'set_wrap_hard_hyphens',
			'set_url_wrap',
			'set_email_wrap',
			'set_min_after_url_wrap',
			'set_space_collapse',
			'set_true_no_break_narrow_space',
		),

		// Character styling.
		'character_styling' => array(
			'set_style_ampersands',
			'set_style_caps',
			'set_style_initial_quotes',
			'set_style_numbers',
			'set_style_hanging_punctuation',
			'set_initial_quote_tags',
		),

		// Hyphenation.
		'hyphenation'       => array(
			'set_hyphenation',
			'set_hyphenation_language',
			'set_min_length_hyphenation',
			'set_min_before_hyphenation',
			'set_min_after_hyphenation',
			'set_hyphenate_headings',
			'set_hyphenate_all_caps',
			'set_hyphenate_title_case', // added in version 1.5.
			'set_hyphenate_compounds',
			'set_hyphenation_exceptions',
		),
	);

	foreach ( $setters_by_group as $setters ) {
		foreach ( $setters as $setter ) {
			$this->$setter();
		}
	}
}
|
|
|
|
/**
 * Set up our regex components for later use.
 *
 * Call before initialize_patterns(). The components are pattern fragments, not
 * complete patterns; the modifiers a fragment requires are noted next to it.
 */
private function initialize_components() {
	// Various regex components (but not complete patterns).
	$this->components['nonEnglishWordCharacters'] = "
				[0-9A-Za-z]|\x{00c0}|\x{00c1}|\x{00c2}|\x{00c3}|\x{00c4}|\x{00c5}|\x{00c6}|\x{00c7}|\x{00c8}|\x{00c9}|
				\x{00ca}|\x{00cb}|\x{00cc}|\x{00cd}|\x{00ce}|\x{00cf}|\x{00d0}|\x{00d1}|\x{00d2}|\x{00d3}|\x{00d4}|
				\x{00d5}|\x{00d6}|\x{00d8}|\x{00d9}|\x{00da}|\x{00db}|\x{00dc}|\x{00dd}|\x{00de}|\x{00df}|\x{00e0}|
				\x{00e1}|\x{00e2}|\x{00e3}|\x{00e4}|\x{00e5}|\x{00e6}|\x{00e7}|\x{00e8}|\x{00e9}|\x{00ea}|\x{00eb}|
				\x{00ec}|\x{00ed}|\x{00ee}|\x{00ef}|\x{00f0}|\x{00f1}|\x{00f2}|\x{00f3}|\x{00f4}|\x{00f5}|\x{00f6}|
				\x{00f8}|\x{00f9}|\x{00fa}|\x{00fb}|\x{00fc}|\x{00fd}|\x{00fe}|\x{00ff}|\x{0100}|\x{0101}|\x{0102}|
				\x{0103}|\x{0104}|\x{0105}|\x{0106}|\x{0107}|\x{0108}|\x{0109}|\x{010a}|\x{010b}|\x{010c}|\x{010d}|
				\x{010e}|\x{010f}|\x{0110}|\x{0111}|\x{0112}|\x{0113}|\x{0114}|\x{0115}|\x{0116}|\x{0117}|\x{0118}|
				\x{0119}|\x{011a}|\x{011b}|\x{011c}|\x{011d}|\x{011e}|\x{011f}|\x{0120}|\x{0121}|\x{0122}|\x{0123}|
				\x{0124}|\x{0125}|\x{0126}|\x{0127}|\x{0128}|\x{0129}|\x{012a}|\x{012b}|\x{012c}|\x{012d}|\x{012e}|
				\x{012f}|\x{0130}|\x{0131}|\x{0132}|\x{0133}|\x{0134}|\x{0135}|\x{0136}|\x{0137}|\x{0138}|\x{0139}|
				\x{013a}|\x{013b}|\x{013c}|\x{013d}|\x{013e}|\x{013f}|\x{0140}|\x{0141}|\x{0142}|\x{0143}|\x{0144}|
				\x{0145}|\x{0146}|\x{0147}|\x{0148}|\x{0149}|\x{014a}|\x{014b}|\x{014c}|\x{014d}|\x{014e}|\x{014f}|
				\x{0150}|\x{0151}|\x{0152}|\x{0153}|\x{0154}|\x{0155}|\x{0156}|\x{0157}|\x{0158}|\x{0159}|\x{015a}|
				\x{015b}|\x{015c}|\x{015d}|\x{015e}|\x{015f}|\x{0160}|\x{0161}|\x{0162}|\x{0163}|\x{0164}|\x{0165}|
				\x{0166}|\x{0167}|\x{0168}|\x{0169}|\x{016a}|\x{016b}|\x{016c}|\x{016d}|\x{016e}|\x{016f}|\x{0170}|
				\x{0171}|\x{0172}|\x{0173}|\x{0174}|\x{0175}|\x{0176}|\x{0177}|\x{0178}|\x{0179}|\x{017a}|\x{017b}|
				\x{017c}|\x{017d}|\x{017e}|\x{017f}
				";

	/**
	 * Space characters (other than the "normal" ASCII whitespace) that may occur in HTML text:
	 *    no-break space | ethiopic wordspace | en quad | em quad | en space | em space |
	 *    three-per-em space | four-per-em space | six-per-em space | figure space |
	 *    punctuation space | thin space | hair space | narrow no-break space |
	 *    medium mathematical space | ideographic space
	 *
	 * The alternation below additionally lists the zero-width space (U+200B), the zero-width
	 * non-joiner (U+200C) and the zero-width joiner (U+200D).
	 * NOTE(review): the previous comment stated that these three characters are used inside
	 * words and are not counted as spaces when looking for word boundaries - confirm whether
	 * they are meant to be part of this component.
	 */
	$this->components['htmlSpaces'] = '
		\x{00a0}		# no-break space
		|
		\x{1361}		# ethiopic wordspace
		|
		\x{2000}		# en quad-space
		|
		\x{2001}		# em quad-space
		|
		\x{2002}		# en space
		|
		\x{2003}		# em space
		|
		\x{2004}		# three-per-em space
		|
		\x{2005}		# four-per-em space
		|
		\x{2006}		# six-per-em space
		|
		\x{2007}		# figure space
		|
		\x{2008}		# punctuation space
		|
		\x{2009}		# thin space
		|
		\x{200a}		# hair space
		|
		\x{200b}		# zero-width space
		|
		\x{200c}		# zero-width non-joiner
		|
		\x{200d}		# zero-width joiner
		|
		\x{202f}		# narrow no-break space
		|
		\x{205f}		# medium mathematical space
		|
		\x{3000}		# ideographic space
		'; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8).
	$this->components['normalSpaces'] = ' \f\n\r\t\v'; // equivalent to \s in non-Unicode mode.

	// Hanging punctuation.
	$this->components['doubleHangingPunctuation'] = "
		\"
		{$this->chr['doubleQuoteOpen']}
		{$this->chr['doubleQuoteClose']}
		{$this->chr['doubleLow9Quote']}
		{$this->chr['doublePrime']}
		{$this->quote_styles['doubleCurled']['open']}
		{$this->quote_styles['doubleCurled']['close']}

		"; // requires modifiers: x (multiline pattern) u (utf8).
	$this->components['singleHangingPunctuation'] = "
		'
		{$this->chr['singleQuoteOpen']}
		{$this->chr['singleQuoteClose']}
		{$this->chr['singleLow9Quote']}
		{$this->chr['singlePrime']}
		{$this->quote_styles['singleCurled']['open']}
		{$this->quote_styles['singleCurled']['close']}
		{$this->chr['apostrophe']}

		"; // requires modifiers: x (multiline pattern) u (utf8).

	// Every alternative has to end with "|" (except the very last one); in the money group the
	// dollar sign is escaped because a bare "$" is the end-of-subject anchor, not a literal.
	$this->components['unitSpacingStandardUnits'] = '
		### Temporal units
		(?:ms|s|secs?|mins?|hrs?)\.?|
		milliseconds?|seconds?|minutes?|hours?|days?|years?|decades?|century|centuries|millennium|millennia|

		### Imperial units
		(?:in|ft|yd|mi)\.?|
		(?:ac|ha|oz|pt|qt|gal|lb|st)\.?|
		s\.f\.|sf|s\.i\.|si|square[ ]feet|square[ ]foot|
		inch|inches|foot|feet|yards?|miles?|acres?|hectares?|ounces?|pints?|quarts?|gallons?|pounds?|stones?|

		### Metric units (with prefixes)
		(?:p|µ|[mcdhkMGT])?
		(?:[mgstAKNJWCVFSTHBL]|mol|cd|rad|Hz|Pa|Wb|lm|lx|Bq|Gy|Sv|kat|Ω|Ohm|Ω|&\#0*937;|&\#[xX]0*3[Aa]9;)|
		(?:nano|micro|milli|centi|deci|deka|hecto|kilo|mega|giga|tera)?
		(?:liters?|meters?|grams?|newtons?|pascals?|watts?|joules?|amperes?)|

		### Computers units (KB, Kb, TB, Kbps)
		[kKMGT]?(?:[oBb]|[oBb]ps|flops)|

		### Money
		¢|M?(?:£|¥|€|\$)|

		### Other units
		°[CF]? |
		%|pi|M?px|em|en|[NSEOW]|[NS][EOW]|mbar
		'; // required modifiers: x (multiline pattern).

	$this->components['hyphensArray'] = array_unique( array( '-', $this->chr['hyphen'] ) );
	$this->components['hyphens']      = implode( '|', $this->components['hyphensArray'] );

	// \p{Lu} (upper case letters, including non-English ones) would be the natural choice for
	// the capital letter class, but servers with PCRE compiled without "--enable-unicode-properties"
	// fail at \p{Lu} by returning an empty string (thus leaving the screen void of text).
	// An explicit character range is used as an alternative.
	$this->components['styleCaps'] = '
		(?<![\w\-_' . $this->chr['zeroWidthSpace'] . $this->chr['softHyphen'] . ']) # negative lookbehind assertion
		(
			(?:							# CASE 1: " 9A "
				[0-9]+					# starts with at least one number
				[A-ZÀ-ÖØ-Ý]				# must contain at least one capital letter
				(?:[A-ZÀ-ÖØ-Ý]|[0-9]|\-|_|' . $this->chr['zeroWidthSpace'] . '|' . $this->chr['softHyphen'] . ')*
										# may be followed by any number of numbers capital letters, hyphens, underscores, zero width spaces, or soft hyphens
			)
			|
			(?:							# CASE 2: " A9 "
				[A-ZÀ-ÖØ-Ý]				# starts with capital letter
				(?:[A-ZÀ-ÖØ-Ý]|[0-9])	# must be followed a number or capital letter
				(?:[A-ZÀ-ÖØ-Ý]|[0-9]|\-|_|' . $this->chr['zeroWidthSpace'] . '|' . $this->chr['softHyphen'] . ')*
										# may be followed by any number of numbers capital letters, hyphens, underscores, zero width spaces, or soft hyphens

			)
		)
		(?![\w\-_' . $this->chr['zeroWidthSpace'] . $this->chr['softHyphen'] . ']) # negative lookahead assertion
		'; // required modifiers: x (multiline pattern) u (utf8).

	// Initialize valid top level domains from IANA list.
	$this->components['validTopLevelDomains'] = $this->get_top_level_domains_from_file( dirname( __DIR__ ) . '/vendor/IANA/tlds-alpha-by-domain.txt' );
	// Valid URL schemes.
	$this->components['urlScheme'] = '(?:https?|ftps?|file|nfs|feed|itms|itpc)';
	// Combined URL pattern.
	$this->components['urlPattern'] = "(?:
		\A
		(?<schema>{$this->components['urlScheme']}:\/\/)?	# Subpattern 1: contains _http://_ if it exists
		(?<domain>											# Subpattern 2: contains subdomains.domain.tld
			(?:
				[a-z0-9]									# first chr of (sub)domain can not be a hyphen
				[a-z0-9\-]{0,61}							# middle chrs of (sub)domain may be a hyphen;
															# limit qty of middle chrs so total domain does not exceed 63 chrs
				[a-z0-9]									# last chr of (sub)domain can not be a hyphen
				\.											# dot separator
			)+
			(?:
				{$this->components['validTopLevelDomains']}	# validates top level domain
			)
			(?:												# optional port numbers
				:
				(?:
					[1-5]?[0-9]{1,4} | 6[0-4][0-9]{3} | 65[0-4][0-9]{2} | 655[0-2][0-9] | 6553[0-5]
				)
			)?
		)
		(?<path>											# Subpattern 3: contains path following domain
			(?:
				\/											# marks nested directory
				[a-z0-9\"\$\-_\.\+!\*\'\(\),;\?:@=&\#]+		# valid characters within directory structure
			)*
			[\/]?											# trailing slash if any
		)
		\Z
	)"; // required modifiers: x (multiline pattern) i (case insensitive).

	$this->components['wrapEmailsEmailPattern'] = "(?:
		\A
		[a-z0-9\!\#\$\%\&\'\*\+\/\=\?\^\_\`\{\|\}\~\-]+
		(?:
			\.
			[a-z0-9\!\#\$\%\&\'\*\+\/\=\?\^\_\`\{\|\}\~\-]+
		)*
		@
		(?:
			[a-z0-9]
			[a-z0-9\-]{0,61}
			[a-z0-9]
			\.
		)+
		(?:
			{$this->components['validTopLevelDomains']}
		)
		\Z
	)"; // required modifiers: x (multiline pattern) i (case insensitive).

	// Words starting with an apostrophe (not an opening single quote), mapped to their replacement.
	$this->components['smartQuotesApostropheExceptions'] = array(
		"'tain" . $this->chr['apostrophe'] . 't' => $this->chr['apostrophe'] . 'tain' . $this->chr['apostrophe'] . 't',
		"'twere"                                 => $this->chr['apostrophe'] . 'twere',
		"'twas"                                  => $this->chr['apostrophe'] . 'twas',
		"'tis"                                   => $this->chr['apostrophe'] . 'tis',
		"'til"                                   => $this->chr['apostrophe'] . 'til',
		"'bout"                                  => $this->chr['apostrophe'] . 'bout',
		"'nuff"                                  => $this->chr['apostrophe'] . 'nuff',
		"'round"                                 => $this->chr['apostrophe'] . 'round',
		"'cause"                                 => $this->chr['apostrophe'] . 'cause',
		"'splainin"                              => $this->chr['apostrophe'] . 'splainin',
	);
	$this->components['smartQuotesApostropheExceptionMatches']      = array_keys( $this->components['smartQuotesApostropheExceptions'] );
	$this->components['smartQuotesApostropheExceptionReplacements'] = array_values( $this->components['smartQuotesApostropheExceptions'] );

	// These patterns need to be updated whenever the quote style changes.
	$this->update_smart_quotes_brackets();

	// Marker for strings that should not be replaced.
	$this->components['escapeMarker'] = '_E_S_C_A_P_E_D_';
}
|
|
|
|
/**
 * Update smartQuotesBrackets component after quote style change.
 *
 * Rebuilds the map of straight quotes adjacent to brackets (and to each other)
 * from the current quote characters in $this->chr, plus the derived lists of
 * match strings and replacement strings.
 */
private function update_smart_quotes_brackets() {
	// Straight quote => ( opening character, closing character ); single quotes first.
	$quote_pairs = array(
		"'" => array( $this->chr['singleQuoteOpen'], $this->chr['singleQuoteClose'] ),
		'"' => array( $this->chr['doubleQuoteOpen'], $this->chr['doubleQuoteClose'] ),
	);

	$map = array();
	foreach ( $quote_pairs as $straight => $pair ) {
		// Quote directly after an opening bracket.
		foreach ( array( '[', '{', '(' ) as $bracket ) {
			$map[ $bracket . $straight ] = $bracket . $pair[0];
		}

		// Quote directly before a closing bracket.
		foreach ( array( ']', '}', ')' ) as $bracket ) {
			$map[ $straight . $bracket ] = $pair[1] . $bracket;
		}
	}

	// Quotes & quotes.
	$map[ "\"'" ] = $this->chr['doubleQuoteOpen'] . $this->chr['singleQuoteOpen'];
	$map[ "'\"" ] = $this->chr['singleQuoteClose'] . $this->chr['doubleQuoteClose'];

	$this->components['smartQuotesBrackets']            = $map;
	$this->components['smartQuotesBracketMatches']      = array_keys( $map );
	$this->components['smartQuotesBracketReplacements'] = array_values( $map );
}
|
|
|
|
/**
 * Load a list of top-level domains from a file.
 *
 * Every line consisting solely of letters, digits and hyphens (not starting with a
 * hyphen) is taken as a domain and lower-cased; all other lines (e.g. "# ..." comment
 * lines) are skipped. Surrounding whitespace - including the "\r" of Windows line
 * endings - is ignored. If the path is not a readable regular file, or no domain is
 * found in it, a built-in default list is returned instead.
 *
 * @param string $path The full path and filename.
 * @return string A list of top-level domains concatenated with '|'.
 */
function get_top_level_domains_from_file( $path ) {
	$domains = array();

	// is_file() rules out directories, is_readable() rules out permission problems.
	if ( is_file( $path ) && is_readable( $path ) ) {
		$lines = file( $path, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES );

		if ( false !== $lines ) {
			foreach ( $lines as $line ) {
				// Remove stray whitespace and carriage returns before matching.
				$line = trim( $line );

				if ( preg_match( '#^[a-zA-Z0-9][a-zA-Z0-9-]*$#', $line ) ) {
					$domains[] = strtolower( $line );
				}
			}
		}
	}

	if ( count( $domains ) > 0 ) {
		return implode( '|', $domains );
	}

	// Fallback list used when the file is missing, unreadable or empty.
	return 'ac|ad|aero|ae|af|ag|ai|al|am|an|ao|aq|arpa|ar|asia|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|biz|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|cat|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|com|coop|co|cr|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|edu|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gov|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|info|int|in|io|iq|ir|is|it|je|jm|jobs|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mil|mk|ml|mm|mn|mobi|mo|mp|mq|mr|ms|mt|museum|mu|mv|mw|mx|my|mz|name|na|nc|net|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|org|pa|pe|pf|pg|ph|pk|pl|pm|pn|pro|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tel|tf|tg|th|tj|tk|tl|tm|tn|to|tp|travel|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw';
}
|
|
|
|
/**
|
|
* Set up our regex patterns for later use.
|
|
*
|
|
* Call after initialize_components().
|
|
*/
|
|
private function initialize_patterns() {
|
|
// Actual regex patterns.
|
|
$this->regex['customDiacriticsDoubleQuoteKey'] = '/(?:")([^"]+)(?:"\s*=>)/';
|
|
$this->regex['customDiacriticsSingleQuoteKey'] = "/(?:')([^']+)(?:'\s*=>)/";
|
|
$this->regex['customDiacriticsDoubleQuoteValue'] = '/(?:=>\s*")([^"]+)(?:")/';
|
|
$this->regex['customDiacriticsSingleQuoteValue'] = "/(?:=>\s*')([^']+)(?:')/";
|
|
|
|
$this->regex['controlCharacters'] = '/\p{C}/Su';
|
|
|
|
$this->regex['smartQuotesSingleQuotedNumbers'] = "/(?<=\W|\A)'(\d+)'(?=\W|\Z)/u";
|
|
$this->regex['smartQuotesDoubleQuotedNumbers'] = '/(?<=\W|\A)"(\d+)"(?=\W|\Z)/u';
|
|
$this->regex['smartQuotesDoublePrime'] = "/(\b\d{1,3})''(?=\W|\Z)/u";
|
|
$this->regex['smartQuotesDoublePrimeCompound'] = "/(\b\d{1,3})''(?=-\w)/u";
|
|
$this->regex['smartQuotesDoublePrime1GlyphCompound'] = "/(\b\d{1,3})\"(?=-\w)/u";
|
|
$this->regex['smartQuotesSinglePrimeCompound'] = "/(\b\d{1,3})'(?=-\w)/u";
|
|
$this->regex['smartQuotesSingleDoublePrime'] = "/(\b\d{1,3})'(\s*)(\b\d+)''(?=\W|\Z)/u";
|
|
$this->regex['smartQuotesSingleDoublePrime1Glyph'] = "/(\b\d{1,3})'(\s*)(\b\d+)\"(?=\W|\Z)/u";
|
|
$this->regex['smartQuotesCommaQuote'] = '/(?<=\s|\A),(?=\S)/';
|
|
$this->regex['smartQuotesApostropheWords'] = "/(?<=[\w|{$this->components['nonEnglishWordCharacters']}])'(?=[\w|{$this->components['nonEnglishWordCharacters']}])/u";
|
|
$this->regex['smartQuotesApostropheDecades'] = "/'(\d\d\b)/";
|
|
$this->regex['smartQuotesSingleQuoteOpen'] = "/'(?=[\w|{$this->components['nonEnglishWordCharacters']}])/u";
|
|
$this->regex['smartQuotesSingleQuoteClose'] = "/(?<=[\w|{$this->components['nonEnglishWordCharacters']}])'/u";
|
|
$this->regex['smartQuotesSingleQuoteOpenSpecial'] = "/(?<=\s|\A)'(?=\S)/"; // like _'¿hola?'_.
|
|
$this->regex['smartQuotesSingleQuoteCloseSpecial'] = "/(?<=\S)'(?=\s|\Z)/";
|
|
$this->regex['smartQuotesDoubleQuoteOpen'] = "/\"(?=[\w|{$this->components['nonEnglishWordCharacters']}])/u";
|
|
$this->regex['smartQuotesDoubleQuoteClose'] = "/(?<=[\w|{$this->components['nonEnglishWordCharacters']}])\"/u";
|
|
$this->regex['smartQuotesDoubleQuoteOpenSpecial'] = '/(?<=\s|\A)"(?=\S)/';
|
|
$this->regex['smartQuotesDoubleQuoteCloseSpecial'] = '/(?<=\S)"(?=\s|\Z)/';
|
|
|
|
$this->regex['smartDashesParentheticalDoubleDash'] = "/(\s|{$this->components['htmlSpaces']})--(\s|{$this->components['htmlSpaces']})/xui"; // ' -- '.
|
|
$this->regex['smartDashesParentheticalSingleDash'] = "/(\s|{$this->components['htmlSpaces']})-(\s|{$this->components['htmlSpaces']})/xui"; // ' - '.
|
|
$this->regex['smartDashesEnDashAll'] = "/(\A|\s)\-([\w|{$this->components['nonEnglishWordCharacters']}])/u";
|
|
$this->regex['smartDashesEnDashWords'] = "/([\w|{$this->components['nonEnglishWordCharacters']}])\-(\Z|{$this->chr['thinSpace']}|{$this->chr['hairSpace']}|{$this->chr['noBreakNarrowSpace']})/u";
|
|
$this->regex['smartDashesEnDashNumbers'] = "/(\b\d+)\-(\d+\b)/";
|
|
$this->regex['smartDashesEnDashPhoneNumbers'] = "/(\b\d{3})" . $this->chr['enDash'] . "(\d{4}\b)/";
|
|
$this->regex['smartDashesYYYY-MM-DD'] = '/
|
|
(
|
|
(?<=\s|\A|' . $this->chr['noBreakSpace'] . ')
|
|
[12][0-9]{3}
|
|
)
|
|
[\-' . $this->chr['enDash'] . ']
|
|
(
|
|
(?:[0][1-9]|[1][0-2])
|
|
)
|
|
[\-' . $this->chr['enDash'] . "]
|
|
(
|
|
(?:[0][1-9]|[12][0-9]|[3][0-1])
|
|
(?=\s|\Z|\)|\]|\.|\,|\?|\;|\:|\'|\"|\!|" . $this->chr['noBreakSpace'] . ')
|
|
)
|
|
/xu';
|
|
|
|
$this->regex['smartDashesMM-DD-YYYY'] = '/
|
|
(?:
|
|
(?:
|
|
(
|
|
(?<=\s|\A|' . $this->chr['noBreakSpace'] . ')
|
|
(?:[0]?[1-9]|[1][0-2])
|
|
)
|
|
[\-' . $this->chr['enDash'] . ']
|
|
(
|
|
(?:[0]?[1-9]|[12][0-9]|[3][0-1])
|
|
)
|
|
)
|
|
|
|
|
(?:
|
|
(
|
|
(?<=\s|\A|' . $this->chr['noBreakSpace'] . ')
|
|
(?:[0]?[1-9]|[12][0-9]|[3][0-1])
|
|
)
|
|
[\-' . $this->chr['enDash'] . ']
|
|
(
|
|
(?:[0]?[1-9]|[1][0-2])
|
|
)
|
|
)
|
|
)
|
|
[\-' . $this->chr['enDash'] . "]
|
|
(
|
|
[12][0-9]{3}
|
|
(?=\s|\Z|\)|\]|\.|\,|\?|\;|\:|\'|\"|\!|" . $this->chr['noBreakSpace'] . ')
|
|
)
|
|
/xu';
|
|
$this->regex['smartDashesYYYY-MM'] = '/
|
|
(
|
|
(?<=\s|\A|' . $this->chr['noBreakSpace'] . ')
|
|
[12][0-9]{3}
|
|
)
|
|
[\-' . $this->chr['enDash'] . "]
|
|
(
|
|
(?:
|
|
(?:[0][1-9]|[1][0-2])
|
|
|
|
|
(?:[0][0-9][1-9]|[1-2][0-9]{2}|[3][0-5][0-9]|[3][6][0-6])
|
|
)
|
|
(?=\s|\Z|\)|\]|\.|\,|\?|\;|\:|\'|\"|\!|" . $this->chr['noBreakSpace'] . ')
|
|
)
|
|
/xu';
|
|
|
|
// Smart math.
|
|
// First, let's find math equations.
|
|
$this->regex['smartMathEquation'] = "/
|
|
(?<=\A|\s) # lookbehind assertion: proceeded by beginning of string or space
|
|
[\.,\'\"\¿\¡" . $this->chr['ellipses'] . $this->chr['singleQuoteOpen'] . $this->chr['doubleQuoteOpen'] . $this->chr['guillemetOpen'] . $this->chr['guillemetClose'] . $this->chr['singleLow9Quote'] . $this->chr['doubleLow9Quote'] . ']*
|
|
# allowed proceeding punctuation
|
|
[\-\(' . $this->chr['minus'] . ']* # optionally proceeded by dash, minus sign or open parenthesis
|
|
[0-9]+ # must begin with a number
|
|
(\.[0-9]+)? # optionally allow decimal values after first integer
|
|
( # followed by a math symbol and a number
|
|
[\/\*x\-+=\^' . $this->chr['minus'] . $this->chr['multiplication'] . $this->chr['division'] . ']
|
|
# allowed math symbols
|
|
[\-\(' . $this->chr['minus'] . ']* # opptionally preceeded by dash, minus sign or open parenthesis
|
|
[0-9]+ # must begin with a number
|
|
(\.[0-9]+)? # optionally allow decimal values after first integer
|
|
[\-\(\)' . $this->chr['minus'] . "]* # opptionally preceeded by dash, minus sign or parenthesis
|
|
)+
|
|
[\.,;:\'\"\?\!" . $this->chr['ellipses'] . $this->chr['singleQuoteClose'] . $this->chr['doubleQuoteClose'] . $this->chr['guillemetOpen'] . $this->chr['guillemetClose'] . ']*
|
|
# allowed trailing punctuation
|
|
(?=\Z|\s) # lookahead assertion: followed by end of string or space
|
|
/ux';
|
|
// Revert 4-4 to plain minus-hyphen so as to not mess with ranges of numbers (i.e. pp. 46-50).
|
|
$this->regex['smartMathRevertRange'] = '/
|
|
(
|
|
(?<=\s|\A|' . $this->chr['noBreakSpace'] . ')
|
|
\d+
|
|
)
|
|
[\-' . $this->chr['minus'] . "]
|
|
(
|
|
\d+
|
|
(?=\s|\Z|\)|\]|\.|\,|\?|\;|\:|\'|\"|\!|" . $this->chr['noBreakSpace'] . ')
|
|
)
|
|
/xu';
|
|
// Revert fractions to basic slash.
|
|
// We'll leave styling fractions to smart_fractions.
|
|
$this->regex['smartMathRevertFraction'] = "/
|
|
(
|
|
(?<=\s|\A|\'|\"|" . $this->chr['noBreakSpace'] . ')
|
|
\d+
|
|
)
|
|
' . $this->chr['division'] . "
|
|
(
|
|
\d+
|
|
(?:st|nd|rd|th)?
|
|
(?=\s|\Z|\)|\]|\.|\,|\?|\;|\:|\'|\"|\!|" . $this->chr['noBreakSpace'] . ')
|
|
)
|
|
/xu';
|
|
// Revert date back to original formats:
|
|
// YYYY-MM-DD.
|
|
$this->regex['smartMathRevertDateYYYY-MM-DD'] = '/
|
|
(
|
|
(?<=\s|\A|' . $this->chr['noBreakSpace'] . ')
|
|
[12][0-9]{3}
|
|
)
|
|
[\-' . $this->chr['minus'] . ']
|
|
(
|
|
(?:[0]?[1-9]|[1][0-2])
|
|
)
|
|
[\-' . $this->chr['minus'] . "]
|
|
(
|
|
(?:[0]?[1-9]|[12][0-9]|[3][0-1])
|
|
(?=\s|\Z|\)|\]|\.|\,|\?|\;|\:|\'|\"|\!|" . $this->chr['noBreakSpace'] . ')
|
|
)
|
|
/xu';
|
|
// MM-DD-YYYY or DD-MM-YYYY.
|
|
$this->regex['smartMathRevertDateMM-DD-YYYY'] = '/
|
|
(?:
|
|
(?:
|
|
(
|
|
(?<=\s|\A|' . $this->chr['noBreakSpace'] . ')
|
|
(?:[0]?[1-9]|[1][0-2])
|
|
)
|
|
[\-' . $this->chr['minus'] . ']
|
|
(
|
|
(?:[0]?[1-9]|[12][0-9]|[3][0-1])
|
|
)
|
|
)
|
|
|
|
|
(?:
|
|
(
|
|
(?<=\s|\A|' . $this->chr['noBreakSpace'] . ')
|
|
(?:[0]?[1-9]|[12][0-9]|[3][0-1])
|
|
)
|
|
[\-' . $this->chr['minus'] . ']
|
|
(
|
|
(?:[0]?[1-9]|[1][0-2])
|
|
)
|
|
)
|
|
)
|
|
[\-' . $this->chr['minus'] . "]
|
|
(
|
|
[12][0-9]{3}
|
|
(?=\s|\Z|\)|\]|\.|\,|\?|\;|\:|\'|\"|\!|" . $this->chr['noBreakSpace'] . ')
|
|
)
|
|
/xu';
|
|
// YYYY-MM or YYYY-DDD next.
|
|
$this->regex['smartMathRevertDateYYYY-MM'] = '/
|
|
(
|
|
(?<=\s|\A|' . $this->chr['noBreakSpace'] . ')
|
|
[12][0-9]{3}
|
|
)
|
|
[\-' . $this->chr['minus'] . "]
|
|
(
|
|
(?:
|
|
(?:[0][1-9]|[1][0-2])
|
|
|
|
|
(?:[0][0-9][1-9]|[1-2][0-9]{2}|[3][0-5][0-9]|[3][6][0-6])
|
|
)
|
|
(?=\s|\Z|\)|\]|\.|\,|\?|\;|\:|\'|\"|\!|" . $this->chr['noBreakSpace'] . ')
|
|
)
|
|
/xu';
|
|
|
|
// MM/DD/YYYY or DD/MM/YYYY.
|
|
$this->regex['smartMathRevertDateMM/DD/YYYY'] = '/
|
|
(?:
|
|
(?:
|
|
(
|
|
(?<=\s|\A|' . $this->chr['noBreakSpace'] . ')
|
|
(?:[0][1-9]|[1][0-2])
|
|
)
|
|
[\/' . $this->chr['division'] . ']
|
|
(
|
|
(?:[0][1-9]|[12][0-9]|[3][0-1])
|
|
)
|
|
)
|
|
|
|
|
(?:
|
|
(
|
|
(?<=\s|\A|' . $this->chr['noBreakSpace'] . ')
|
|
(?:[0][1-9]|[12][0-9]|[3][0-1])
|
|
)
|
|
[\/' . $this->chr['division'] . ']
|
|
(
|
|
(?:[0][1-9]|[1][0-2])
|
|
)
|
|
)
|
|
)
|
|
[\/' . $this->chr['division'] . "]
|
|
(
|
|
[12][0-9]{3}
|
|
(?=\s|\Z|\)|\]|\.|\,|\?|\;|\:|\'|\"|\!|" . $this->chr['noBreakSpace'] . ')
|
|
)
|
|
/xu';
|
|
|
|
// Handle exponents (ie. 4^2).
|
|
$this->regex['smartExponents'] = "/
|
|
\b
|
|
(\d+)
|
|
\^
|
|
(\w+)
|
|
\b
|
|
/xu";
|
|
|
|
$this->regex['smartFractionsSpacing'] = '/\b(\d+)\s(\d+\s?\/\s?\d+)\b/';
|
|
$this->regex['smartFractionsReplacement'] = "/
|
|
(?<=\A|\s|{$this->chr['noBreakSpace']}|{$this->chr['noBreakNarrowSpace']}) # lookbehind assertion: makes sure we are not messing up a url
|
|
(\d+)
|
|
(?:\s?\/\s?{$this->chr['zeroWidthSpace']}?) # strip out any zero-width spaces inserted by wrap_hard_hyphens
|
|
(\d+)
|
|
(
|
|
(?:\<sup\>(?:st|nd|rd|th)<\/sup\>)? # handle ordinals after fractions
|
|
(?:\Z|\s|{$this->chr['noBreakSpace']}|{$this->chr['noBreakNarrowSpace']}|\.|\!|\?|\)|\;|\:|\'|\") # makes sure we are not messing up a url
|
|
)
|
|
/xu";
|
|
$this->regex['smartFractionsEscapeMM/YYYY'] = "/
|
|
(?<=\A|\s|{$this->chr['noBreakSpace']}|{$this->chr['noBreakNarrowSpace']}) # lookbehind assertion: makes sure we are not messing up a url
|
|
(\d\d?)
|
|
(\s?\/\s?{$this->chr['zeroWidthSpace']}?) # capture any zero-width spaces inserted by wrap_hard_hyphens
|
|
(
|
|
(?:19\d\d)|(?:20\d\d) # handle 4-decimal years in the 20th and 21st centuries
|
|
)
|
|
(
|
|
(?:\Z|\s|{$this->chr['noBreakSpace']}|{$this->chr['noBreakNarrowSpace']}|\.|\!|\?|\)|\;|\:|\'|\") # makes sure we are not messing up a url
|
|
)
|
|
/xu";
|
|
|
|
$year_regex = array();
|
|
for ( $year = 1900; $year < 2100; ++$year ) {
|
|
$year_regex[] = "(?: ( $year ) (\s?\/\s?{$this->chr['zeroWidthSpace']}?) ( " . ( $year + 1 ) . ' ) )';
|
|
}
|
|
$this->regex['smartFractionsEscapeYYYY/YYYY'] = "/
|
|
(?<=\A|\s|{$this->chr['noBreakSpace']}|{$this->chr['noBreakNarrowSpace']}) # lookbehind assertion: makes sure we are not messing up a url
|
|
(?| " . implode( '|', $year_regex ) . " )
|
|
(
|
|
(?:\Z|\s|{$this->chr['noBreakSpace']}|{$this->chr['noBreakNarrowSpace']}|\.|\!|\?|\)|\;|\:|\'|\") # makes sure we are not messing up a url
|
|
)
|
|
/xu";
|
|
|
|
$this->regex['smartOrdinalSuffix'] = "/\b(\d+)(st|nd|rd|th)\b/"; // End smart math.
|
|
|
|
// Smart marks.
|
|
$this->regex['smartMarksEscape501(c)'] = '/\b(501\()(c)(\)\((?:[1-9]|[1-2][0-9])\))/u';
|
|
|
|
// Whitespace handling.
|
|
$this->regex['singleCharacterWordSpacing'] = "/
|
|
(?:
|
|
(\s)
|
|
(\w)
|
|
[{$this->components['normalSpaces']}]
|
|
(?=\w)
|
|
)
|
|
/xu";
|
|
|
|
$this->regex['dashSpacingEmDash'] = "/
|
|
(?:
|
|
\s
|
|
({$this->chr['emDash']})
|
|
\s
|
|
)
|
|
|
|
|
(?:
|
|
(?<=\S) # lookbehind assertion
|
|
({$this->chr['emDash']})
|
|
(?=\S) # lookahead assertion
|
|
)
|
|
/xu";
|
|
$this->regex['dashSpacingParentheticalDash'] = "/
|
|
(?:
|
|
\s
|
|
({$this->chr['enDash']})
|
|
\s
|
|
)
|
|
/xu";
|
|
$this->regex['dashSpacingIntervalDash'] = "/
|
|
(?:
|
|
(?<=\S) # lookbehind assertion
|
|
({$this->chr['enDash']})
|
|
(?=\S) # lookahead assertion
|
|
)
|
|
/xu";
|
|
|
|
$this->regex['spaceCollapseNormal'] = "/[{$this->components['normalSpaces']}]+/xu";
|
|
$this->regex['spaceCollapseNonBreakable'] = "/(?:[{$this->components['normalSpaces']}]|{$this->components['htmlSpaces']})*{$this->chr['noBreakSpace']}(?:[{$this->components['normalSpaces']}]|{$this->components['htmlSpaces']})*/xu";
|
|
$this->regex['spaceCollapseOther'] = "/(?:[{$this->components['normalSpaces']}])*({$this->components['htmlSpaces']})(?:[{$this->components['normalSpaces']}]|{$this->components['htmlSpaces']})*/xu";
|
|
$this->regex['spaceCollapseBlockStart'] = "/\A(?:[{$this->components['normalSpaces']}]|{$this->components['htmlSpaces']})+/xu";
|
|
|
|
// Unit spacing.
|
|
$this->regex['unitSpacingEscapeSpecialChars'] = '#([\[\\\^\$\.\|\?\*\+\(\)\{\}])#';
|
|
$this->update_unit_pattern( isset( $this->settings['units'] ) ? $this->settings['units'] : array() );
|
|
|
|
// French punctuation spacing.
|
|
$this->regex['frenchPunctuationSpacingNarrow'] = '/(\w+)(\s?)([?!»])(\s|\Z)/u';
|
|
$this->regex['frenchPunctuationSpacingFull'] = '/(\w+)(\s?)(:)(\s|\Z)/u';
|
|
$this->regex['frenchPunctuationSpacingSemicolon'] = '/(\w+)(\s?)((?<!&|>|<);)(\s|\Z)/u';
|
|
$this->regex['frenchPunctuationSpacingOpeningQuote'] = '/(\s|\A)(«)(\s?)(\w+)/u';
|
|
|
|
// Wrap hard hyphens.
|
|
$this->regex['wrapHardHyphensRemoveEndingSpace'] = "/({$this->components['hyphens']}){$this->chr['zeroWidthSpace']}\$/";
|
|
|
|
// Wrap emails.
|
|
$this->regex['wrapEmailsMatchEmails'] = "/{$this->components['wrapEmailsEmailPattern']}/xi";
|
|
$this->regex['wrapEmailsReplaceEmails'] = '/([^a-zA-Z])/';
|
|
|
|
// Wrap URLs.
|
|
$this->regex['wrapUrlsPattern'] = "`{$this->components['urlPattern']}`xi";
|
|
$this->regex['wrapUrlsDomainParts'] = '#(\-|\.)#';
|
|
|
|
// Style caps.
|
|
$this->regex['styleCaps'] = "/{$this->components['styleCaps']}/xu";
|
|
|
|
// Style numbers.
|
|
$this->regex['styleNumbers'] = '/([0-9]+)/u';
|
|
|
|
// Style hanging punctuation.
|
|
$this->regex['styleHangingPunctuationDouble'] = "/(\s)([{$this->components['doubleHangingPunctuation']}])(\w+)/u";
|
|
$this->regex['styleHangingPunctuationSingle'] = "/(\s)([{$this->components['singleHangingPunctuation']}])(\w+)/u";
|
|
$this->regex['styleHangingPunctuationInitialDouble'] = "/(?:\A)([{$this->components['doubleHangingPunctuation']}])(\w+)/u";
|
|
$this->regex['styleHangingPunctuationInitialSingle'] = "/(?:\A)([{$this->components['singleHangingPunctuation']}])(\w+)/u";
|
|
|
|
// Style ampersands.
|
|
$this->regex['styleAmpersands'] = '/(\&\;)/u';
|
|
|
|
// Dewidowing.
|
|
$this->regex['dewidow'] = "/
|
|
(?:
|
|
\A
|
|
|
|
|
(?:
|
|
(?<space_before> # subpattern 1: space before (note: ZWSP is not a space)
|
|
[\s{$this->chr['zeroWidthSpace']}{$this->chr['softHyphen']}]+
|
|
)
|
|
(?<neighbor> # subpattern 2: neighbors widow (short as possible)
|
|
[^\s{$this->chr['zeroWidthSpace']}{$this->chr['softHyphen']}]+?
|
|
)
|
|
)
|
|
)
|
|
(?<space_between> # subpattern 3: space between
|
|
[\s]+ # \s includes all special spaces (but not ZWSP) with the u flag
|
|
)
|
|
(?<widow> # subpattern 4: widow
|
|
[\w\pM\-]+? # \w includes all alphanumeric Unicode characters but not composed characters
|
|
)
|
|
(?<trailing> # subpattern 5: any trailing punctuation or spaces
|
|
[^\w\pM]*
|
|
)
|
|
\Z
|
|
/xu";
|
|
|
|
// Utility patterns for splitting string parameter lists into arrays.
|
|
$this->regex['parameterSplitting'] = '/[\s,]+/';
|
|
|
|
// Add the "study" flag to all our regular expressions.
|
|
foreach ( $this->regex as &$regex ) {
|
|
$regex .= 'S';
|
|
}
|
|
}
|
|
|
|
/**
 * Choose which character is used as the "no-break narrow space".
 *
 * When enabled, the true no-break narrow space (code point 8239, U+202F) is
 * used; otherwise the normal no-break space (code point 160, U+00A0) takes
 * its place. The French guillemet quote style contains this space, so it is
 * rebuilt afterwards.
 *
 * @param boolean $on Optional. Default false.
 */
function set_true_no_break_narrow_space( $on = false ) {
	// uchr() is defined outside this class - presumably it returns the character for a code point.
	$narrow_space = $on ? uchr( 8239 ) : uchr( 160 );
	$this->chr['noBreakNarrowSpace'] = $narrow_space;

	// Update French guillemets.
	$this->quote_styles['doubleGuillemetsFrench'] = array(
		'open'  => $this->chr['guillemetOpen'] . $narrow_space,
		'close' => $narrow_space . $this->chr['guillemetClose'],
	);
}
|
|
|
|
/**
 * Define the tags whose children are excluded from typographic processing.
 *
 * Tag names are lower-cased, names that are not purely alphanumeric are
 * dropped, self-closing tags are removed and the tags listed in
 * $this->inappropriate_tags are always added.
 *
 * @param string|array $tags A comma separated list or an array of tag names.
 */
function set_tags_to_ignore( $tags = array( 'code', 'head', 'kbd', 'object', 'option', 'pre', 'samp', 'script', 'noscript', 'noembed', 'select', 'style', 'textarea', 'title', 'var', 'math' ) ) {
	if ( ! is_array( $tags ) ) {
		$tags = preg_split( $this->regex['parameterSplitting'], $tags, -1, PREG_SPLIT_NO_EMPTY );
	}

	// Only lower-case, alphanumeric tag names may be passed on to XPath.
	$lowercased = array_map( 'strtolower', $tags );
	$valid_tags = array_filter( $lowercased, 'ctype_alnum' );

	// Self closing tags have no children, so they do not belong in the list.
	$container_tags = array_diff( $valid_tags, $this->self_closing_tags );

	$this->settings['ignoreTags'] = array_unique( array_merge( $container_tags, $this->inappropriate_tags ) );
}
|
|
|
|
/**
 * Define the CSS classes whose children are excluded from typographic processing.
 *
 * @param string|array $classes A comma separated list or an array of class names.
 */
function set_classes_to_ignore( $classes = array( 'vcard', 'noTypo' ) ) {
	// Arrays are stored as given, strings are split on whitespace and commas first.
	$this->settings['ignoreClasses'] = is_array( $classes )
		? $classes
		: preg_split( $this->regex['parameterSplitting'], $classes, -1, PREG_SPLIT_NO_EMPTY );
}
|
|
|
|
/**
 * Define the element IDs whose children are excluded from typographic processing.
 *
 * @param string|array $ids A comma separated list or an array of IDs.
 */
function set_ids_to_ignore( $ids = array() ) {
	// Arrays are stored as given, strings are split on whitespace and commas first.
	$this->settings['ignoreIDs'] = is_array( $ids )
		? $ids
		: preg_split( $this->regex['parameterSplitting'], $ids, -1, PREG_SPLIT_NO_EMPTY );
}
|
|
|
|
/**
 * Switch typographic ("smart") quotes on or off.
 *
 * Only stores the flag in the settings.
 *
 * @param boolean $on Optional. Default true.
 */
function set_smart_quotes( $on = true ) {
	$this->settings['smartQuotes'] = $on;
}
|
|
|
|
/**
 * Select the quote style used for primary ('double') quotemarks.
 *
 * An unknown style triggers an E_USER_WARNING and changes nothing. For a
 * known style, only its non-empty 'open'/'close' parts replace the current
 * characters; afterwards the smart quotes bracket component is refreshed.
 *
 * Allowed values for $style:
 * "doubleCurled" => "“foo”",
 * "doubleCurledReversed" => "”foo”",
 * "doubleLow9" => "„foo”",
 * "doubleLow9Reversed" => "„foo“",
 * "singleCurled" => "‘foo’",
 * "singleCurledReversed" => "’foo’",
 * "singleLow9" => "‚foo’",
 * "singleLow9Reversed" => "‚foo‘",
 * "doubleGuillemetsFrench" => "« foo »",
 * "doubleGuillemets" => "«foo»",
 * "doubleGuillemetsReversed" => "»foo«",
 * "singleGuillemets" => "‹foo›",
 * "singleGuillemetsReversed" => "›foo‹",
 * "cornerBrackets" => "「foo」",
 * "whiteCornerBracket" => "『foo』"
 *
 * @param string $style Defaults to 'doubleCurled'.
 */
function set_smart_quotes_primary( $style = 'doubleCurled' ) {
	if ( ! isset( $this->quote_styles[ $style ] ) ) {
		trigger_error( "Invalid quote style $style.", E_USER_WARNING ); // @codingStandardsIgnoreLine.
		return;
	}

	$quotes = $this->quote_styles[ $style ];

	if ( ! empty( $quotes['open'] ) ) {
		$this->chr['doubleQuoteOpen'] = $quotes['open'];
	}
	if ( ! empty( $quotes['close'] ) ) {
		$this->chr['doubleQuoteClose'] = $quotes['close'];
	}

	// Update brackets component.
	$this->update_smart_quotes_brackets();
}
|
|
|
|
/**
 * Select the quote style used for secondary ('single') quotemarks.
 *
 * An unknown style triggers an E_USER_WARNING and changes nothing. For a
 * known style, only its non-empty 'open'/'close' parts replace the current
 * characters; afterwards the smart quotes bracket component is refreshed.
 *
 * Allowed values for $style:
 * "doubleCurled" => "“foo”",
 * "doubleCurledReversed" => "”foo”",
 * "doubleLow9" => "„foo”",
 * "doubleLow9Reversed" => "„foo“",
 * "singleCurled" => "‘foo’",
 * "singleCurledReversed" => "’foo’",
 * "singleLow9" => "‚foo’",
 * "singleLow9Reversed" => "‚foo‘",
 * "doubleGuillemetsFrench" => "« foo »",
 * "doubleGuillemets" => "«foo»",
 * "doubleGuillemetsReversed" => "»foo«",
 * "singleGuillemets" => "‹foo›",
 * "singleGuillemetsReversed" => "›foo‹",
 * "cornerBrackets" => "「foo」",
 * "whiteCornerBracket" => "『foo』"
 *
 * @param string $style Defaults to 'singleCurled'.
 */
function set_smart_quotes_secondary( $style = 'singleCurled' ) {
	if ( ! isset( $this->quote_styles[ $style ] ) ) {
		trigger_error( "Invalid quote style $style.", E_USER_WARNING ); // @codingStandardsIgnoreLine.
		return;
	}

	$quotes = $this->quote_styles[ $style ];

	if ( ! empty( $quotes['open'] ) ) {
		$this->chr['singleQuoteOpen'] = $quotes['open'];
	}
	if ( ! empty( $quotes['close'] ) ) {
		$this->chr['singleQuoteClose'] = $quotes['close'];
	}

	// Update brackets component.
	$this->update_smart_quotes_brackets();
}
|
|
|
|
/**
 * Switch smart dashes on or off, i.e. the replacement of "a--a" with En Dash
 * " -- " and "---" with Em Dash.
 *
 * Only stores the flag in the settings.
 *
 * @param boolean $on Optional. Default true.
 */
function set_smart_dashes( $on = true ) {
	$this->settings['smartDashes'] = $on;
}
|
|
|
|
/**
 * Select the typographical conventions used by smart_dashes.
 *
 * An unknown style triggers an E_USER_WARNING and changes nothing. For a
 * known style, every non-empty component (parenthetical/interval dash and the
 * matching spaces) replaces the current character, and the two dash spacing
 * patterns are rebuilt from the resulting dash characters.
 *
 * Allowed values for $style:
 * - "traditionalUS"
 * - "international"
 *
 * @param string $style Optional. Default "traditionalUS".
 */
function set_smart_dashes_style( $style = 'traditionalUS' ) {
	if ( ! isset( $this->dash_styles[ $style ] ) ) {
		trigger_error( "Invalid dash style $style.", E_USER_WARNING ); // @codingStandardsIgnoreLine.
		return;
	}

	// Style component => character slot it configures.
	$targets = array(
		'parenthetical'      => 'parentheticalDash',
		'interval'           => 'intervalDash',
		'parentheticalSpace' => 'parentheticalDashSpace',
		'intervalSpace'      => 'intervalDashSpace',
	);
	foreach ( $targets as $component => $chr_key ) {
		if ( ! empty( $this->dash_styles[ $style ][ $component ] ) ) {
			$this->chr[ $chr_key ] = $this->dash_styles[ $style ][ $component ];
		}
	}

	// Update dash spacing regex.
	$this->regex['dashSpacingParentheticalDash'] = "/
		(?:
			\s
			({$this->chr['parentheticalDash']})
			\s
		)
	/xu";
	$this->regex['dashSpacingIntervalDash'] = "/
		(?:
			(?<=\S)							# lookbehind assertion
			({$this->chr['intervalDash']})
			(?=\S)							# lookahead assertion
		)
	/xu";
}
|
|
|
|
/**
 * Switch the replacement of "..." with "…" on or off.
 *
 * Only stores the flag in the settings.
 *
 * @param boolean $on Optional. Default true.
 */
function set_smart_ellipses( $on = true ) {
	$this->settings['smartEllipses'] = $on;
}
|
|
|
|
/**
 * Switch diacritics replacement (e.g. "creme brulee" to "crème brûlée") on or off.
 *
 * Only stores the flag in the settings.
 *
 * @param boolean $on Optional. Default true.
 */
function set_smart_diacritics( $on = true ) {
	$this->settings['smartDiacritics'] = $on;
}
|
|
|
|
/**
 * Set the language used for diacritics replacements.
 *
 * Does nothing if $lang is already the current language. Otherwise the word
 * list is loaded from "diacritics/<lang>.php" next to this file; the included
 * file is expected to define $diacritic_words. If the file does not exist (or
 * does not define the variable), the language word list is cleared. In both
 * cases the diacritics replacement arrays are rebuilt.
 *
 * @param string $lang Has to correspond to a filename in 'diacritics'. Optional. Default 'en-US'.
 */
function set_diacritic_language( $lang = 'en-US' ) {
	if ( isset( $this->settings['diacriticLanguage'] ) && $this->settings['diacriticLanguage'] === $lang ) {
		return;
	}

	$this->settings['diacriticLanguage'] = $lang;

	// Start from a clean slate; the list is only restored if loading succeeds.
	unset( $this->settings['diacriticWords'] );

	// Language names are plain file names: refuse anything that could leave the
	// 'diacritics' directory (path separators, NUL bytes).
	$is_plain_name = ( false === strpbrk( (string) $lang, "/\\\0" ) );

	// Use the very same absolute path for the existence check and the include, so
	// the include can never be resolved via include_path or the working directory.
	$language_file = __DIR__ . '/diacritics/' . $lang . '.php';

	if ( $is_plain_name && is_file( $language_file ) ) {
		$diacritic_words = null;
		include $language_file;

		if ( isset( $diacritic_words ) ) {
			$this->settings['diacriticWords'] = $diacritic_words;
		}
	}

	$this->update_diacritics_replacement_arrays();
}
|
|
|
|
/**
 * Set up custom diacritics replacements.
 *
 * String input is split on commas and every piece has to look like
 * `"needle"=>"replacement"` (single or double quotes); pieces that cannot be
 * parsed are ignored. Array input is used as needle => replacement pairs, but
 * values formatted like the string pieces are parsed the same way. Needles and
 * replacements are trimmed and stripped of tags; entries with an empty needle
 * are dropped. Afterwards the diacritics replacement arrays are rebuilt.
 *
 * @param string|array $custom_replacements An array formatted array(needle=>replacement, needle=>replacement...),
 *                                          or a string formatted `"needle"=>"replacement","needle"=>"replacement",...
 */
function set_diacritic_custom_replacements( $custom_replacements = array() ) {
	$from_string = ! is_array( $custom_replacements );
	if ( $from_string ) {
		$custom_replacements = preg_split( '/,/', $custom_replacements, -1, PREG_SPLIT_NO_EMPTY );
	}

	$replacements = array();
	foreach ( $custom_replacements as $custom_key => $custom_replacement ) {
		if ( ! is_scalar( $custom_replacement ) ) {
			continue; // Nothing sensible can be parsed from arrays, objects or null.
		}
		$custom_replacement = (string) $custom_replacement;

		// Account for single and double quotes (double quotes take precedence).
		$has_key = preg_match( $this->regex['customDiacriticsDoubleQuoteKey'], $custom_replacement, $key_match )
			|| preg_match( $this->regex['customDiacriticsSingleQuoteKey'], $custom_replacement, $key_match );
		$has_value = preg_match( $this->regex['customDiacriticsDoubleQuoteValue'], $custom_replacement, $value_match )
			|| preg_match( $this->regex['customDiacriticsSingleQuoteValue'], $custom_replacement, $value_match );

		if ( $from_string && ! ( $has_key && $has_value ) ) {
			// A malformed piece would otherwise be stored under its numeric index
			// and turn digits in the text into garbage.
			continue;
		}

		// For array input, fall back to the array key/value when nothing was parsed.
		$key   = $has_key ? $key_match[1] : $custom_key;
		$value = $has_value ? $value_match[1] : $custom_replacement;

		$key = strip_tags( trim( (string) $key ) );
		if ( '' === $key ) {
			continue; // An empty needle would match at every word boundary.
		}

		$replacements[ $key ] = strip_tags( trim( $value ) );
	}

	$this->settings['diacriticCustomReplacements'] = $replacements;
	$this->update_diacritics_replacement_arrays();
}
|
|
|
|
/**
 * Update the pattern and replacement arrays in $settings['diacriticReplacement'].
 *
 * Should be called whenever a new diacritics replacement language is selected or
 * when the custom replacements are updated.
 *
 * For every needle of the custom replacements and then of the language word
 * list, a word-boundary pattern is added to 'patterns' and the replacement is
 * stored in 'replacements' under the literal needle (so a language entry
 * overrides a custom entry with the same needle). Needles are treated as
 * literal text and quoted for use in the pattern.
 */
private function update_diacritics_replacement_arrays() {
	$patterns = array();
	$replacements = array();

	// Order matters: custom replacements first, then the language word list.
	foreach ( array( 'diacriticCustomReplacements', 'diacriticWords' ) as $source ) {
		if ( empty( $this->settings[ $source ] ) ) {
			continue;
		}

		foreach ( $this->settings[ $source ] as $needle => $replacement ) {
			$needle = (string) $needle; // Numeric array keys arrive as integers.
			if ( '' === $needle ) {
				continue; // "/\b\b/u" would match at every word boundary.
			}

			// Quote the needle so that "/" or regex metacharacters cannot break the pattern.
			$patterns[] = '/\b' . preg_quote( $needle, '/' ) . '\b/u';
			$replacements[ $needle ] = $replacement;
		}
	}

	$this->settings['diacriticReplacement'] = array( 'patterns' => $patterns, 'replacements' => $replacements );
}
|
|
|
|
/**
 * Switch the replacement of (r) (c) (tm) (sm) (p) (R) (C) (TM) (SM) (P) with ® © ™ ℠ ℗ on or off.
 *
 * Only stores the flag in the settings.
 *
 * @param boolean $on Optional. Default true.
 */
function set_smart_marks( $on = true ) {
	$this->settings['smartMarks'] = $on;
}
|
|
|
|
/**
 * Switch proper mathematical symbols on or off.
 *
 * Only stores the flag in the settings.
 *
 * @param boolean $on Optional. Default true.
 */
function set_smart_math( $on = true ) {
	$this->settings['smartMath'] = $on;
}
|
|
|
|
/**
 * Switch the replacement of 2^2 with 2<sup>2</sup> on or off.
 *
 * Only stores the flag in the settings.
 *
 * @param boolean $on Optional. Default true.
 */
function set_smart_exponents( $on = true ) {
	$this->settings['smartExponents'] = $on;
}
|
|
|
|
/**
 * Switch the replacement of 1/4 with <sup>1</sup>⁄<sub>4</sub> on or off.
 *
 * Only stores the flag in the settings.
 *
 * @param boolean $on Optional. Default true.
 */
function set_smart_fractions( $on = true ) {
	$this->settings['smartFractions'] = $on;
}
|
|
|
|
/**
 * Switch the replacement of 1st with 1<sup>st</sup> on or off.
 *
 * Only stores the flag in the settings.
 *
 * @param boolean $on Optional. Default true.
 */
function set_smart_ordinal_suffix( $on = true ) {
	$this->settings['smartOrdinalSuffix'] = $on;
}
|
|
|
|
/**
 * Switch single character word spacing on or off, i.e. forcing single
 * character words to the next line with the insertion of a no-break space.
 *
 * Only stores the flag in the settings.
 *
 * @param boolean $on Optional. Default true.
 */
function set_single_character_word_spacing( $on = true ) {
	$this->settings['singleCharacterWordSpacing'] = $on;
}
|
|
|
|
/**
 * Switch fraction spacing on or off.
 *
 * Only stores the flag in the settings.
 *
 * @param boolean $on Optional. Default true.
 */
function set_fraction_spacing( $on = true ) {
	$this->settings['fractionSpacing'] = $on;
}
|
|
|
|
/**
 * Switch unit spacing on or off, i.e. keeping units and values together with
 * the insertion of a no-break space.
 *
 * Only stores the flag in the settings.
 *
 * @param boolean $on Optional. Default true.
 */
function set_unit_spacing( $on = true ) {
	$this->settings['unitSpacing'] = $on;
}
|
|
|
|
/**
 * Switch extra whitespace before certain punctuation marks (as is the French
 * custom) on or off.
 *
 * Only stores the flag in the settings.
 *
 * @param boolean $on Optional. Default true.
 */
function set_french_punctuation_spacing( $on = true ) {
	$this->settings['frenchPunctuationSpacing'] = $on;
}
|
|
|
|
/**
 * Define the custom units that are kept together with their values.
 *
 * The list is stored in the settings and the unit matching pattern is
 * rebuilt from it.
 *
 * @param string|array $units A comma separated list or an array of units.
 */
function set_units( $units = array() ) {
	// Strings are split on whitespace and commas, arrays are taken as given.
	$unit_list = is_array( $units )
		? $units
		: preg_split( $this->regex['parameterSplitting'], $units, -1, PREG_SPLIT_NO_EMPTY );

	$this->settings['units'] = $unit_list;
	$this->update_unit_pattern( $unit_list );
}
|
|
|
|
/**
 * Update components and pattern for matching both standard and custom units.
 *
 * The custom units are escaped, joined with '|' and placed in front of the
 * standard units; the result is stored in $this->components['unitSpacingUnits']
 * and used to build $this->regex['unitSpacingUnitPattern'].
 *
 * @param array $units An array of unit names.
 */
private function update_unit_pattern( array $units ) {
	$escaped_units = array();

	foreach ( $units as $unit ) {
		$unit = (string) $unit;
		if ( '' === $unit ) {
			continue; // An empty alternative would make the pattern match without any unit.
		}

		// Escape every ASCII character that is not alphanumeric: in PCRE a backslash
		// followed by a non-alphanumeric character is always a literal. This covers
		// the "/" delimiter as well as "#" and whitespace, which are significant
		// because the pattern below uses the x modifier.
		$escaped_units[] = preg_replace( '/[^A-Za-z0-9_\x80-\xFF]/', '\\\\$0', $unit );
	}

	$custom_units = implode( '|', $escaped_units );
	if ( '' !== $custom_units ) {
		$custom_units .= '|';
	}

	$this->components['unitSpacingUnits'] = $custom_units . $this->components['unitSpacingStandardUnits'];
	$this->regex['unitSpacingUnitPattern'] = "/(\d\.?)\s({$this->components['unitSpacingUnits']})\b/x";
}
|
|
|
|
/**
 * Switch the wrapping of Em and En dashes in thin spaces on or off.
 *
 * Only stores the flag in the settings.
 *
 * @param boolean $on Optional. Default true.
 */
function set_dash_spacing( $on = true ) {
	$this->settings['dashSpacing'] = $on;
}
|
|
|
|
/**
 * Switch the removal of extra whitespace characters on or off.
 *
 * Only stores the flag in the settings.
 *
 * @param boolean $on Optional. Default true.
 */
function set_space_collapse( $on = true ) {
	$this->settings['spaceCollapse'] = $on;
}
|
|
|
|
/**
 * Switch widow handling on or off.
 *
 * Only stores the flag in the settings.
 *
 * @param boolean $on Optional. Default true.
 */
function set_dewidow( $on = true ) {
	$this->settings['dewidow'] = $on;
}
|
|
|
|
/**
 * Set the maximum length of widows that will be protected.
 *
 * @param number $length Defaults to 5. Trying to set the value to less than 2 resets the length to the default.
 */
function set_max_dewidow_length( $length = 5 ) {
	// Anything that is not greater than 1 falls back to the default of 5.
	if ( ! ( $length > 1 ) ) {
		$length = 5;
	}

	$this->settings['dewidowMaxLength'] = $length;
}
|
|
|
|
/**
 * Set the maximum length of pulled text to keep widows company.
 *
 * @param number $length Defaults to 5. Trying to set the value to less than 2 resets the length to the default.
 */
function set_max_dewidow_pull( $length = 5 ) {
	// Anything that is not greater than 1 falls back to the default of 5.
	if ( ! ( $length > 1 ) ) {
		$length = 5;
	}

	$this->settings['dewidowMaxPull'] = $length;
}
|
|
|
|
/**
 * Switch wrapping at internal hard hyphens (by inserting a zero-width-space) on or off.
 *
 * Only stores the flag in the settings.
 *
 * @param boolean $on Optional. Default true.
 */
function set_wrap_hard_hyphens( $on = true ) {
	$this->settings['hyphenHardWrap'] = $on;
}
|
|
|
|
/**
 * Switch the wrapping of URLs on or off.
 *
 * Only stores the flag in the settings.
 *
 * @param boolean $on Optional. Default true.
 */
function set_url_wrap( $on = true ) {
	$this->settings['urlWrap'] = $on;
}
|
|
|
|
/**
 * Switch the wrapping of email addresses on or off.
 *
 * Only stores the flag in the settings.
 *
 * @param boolean $on Optional. Default true.
 */
function set_email_wrap( $on = true ) {
	$this->settings['emailWrap'] = $on;
}
|
|
|
|
/**
|
|
* Set the minimum character requirement after an URL wrapping point.
|
|
*
|
|
* @param number $length Defaults to 5. Trying to set the value to less than 1 resets the length to the default.
|
|
*/
|
|
function set_min_after_url_wrap( $length = 5 ) {
	// Any value that is not greater than 0 falls back to the default of 5.
	if ( ! ( $length > 0 ) ) {
		$length = 5;
	}

	$this->settings['urlMinAfterWrap'] = $length;
}
|
|
|
|
/**
|
|
* Enable/disable wrapping of ampersands in <span class="amp">.
|
|
*
|
|
* @param boolean $on Optional. Default true.
|
|
*/
|
|
function set_style_ampersands( $on = true ) {
	// Record the caller's choice under the 'styleAmpersands' settings key.
	$settings_key = 'styleAmpersands';

	$this->settings[ $settings_key ] = $on;
}
|
|
|
|
/**
|
|
* Enable/disable wrapping caps in <span class="caps">.
|
|
*
|
|
* @param boolean $on Optional. Default true.
|
|
*/
|
|
function set_style_caps( $on = true ) {
	// Record the caller's choice under the 'styleCaps' settings key.
	$settings_key = 'styleCaps';

	$this->settings[ $settings_key ] = $on;
}
|
|
|
|
/**
|
|
* Enable/disable wrapping of initial quotes in <span class="quo"> or <span class="dquo">.
|
|
*
|
|
* @param boolean $on Optional. Default true.
|
|
*/
|
|
function set_style_initial_quotes( $on = true ) {
	// Record the caller's choice under the 'styleInitialQuotes' settings key.
	$settings_key = 'styleInitialQuotes';

	$this->settings[ $settings_key ] = $on;
}
|
|
|
|
/**
|
|
* Enable/disable wrapping of numbers in <span class="numbers">.
|
|
*
|
|
* @param boolean $on Optional. Default true.
|
|
*/
|
|
function set_style_numbers( $on = true ) {
	// Record the caller's choice under the 'styleNumbers' settings key.
	$settings_key = 'styleNumbers';

	$this->settings[ $settings_key ] = $on;
}
|
|
|
|
/**
|
|
* Enable/disable wrapping of punctuation and wide characters in <span class="pull-*">.
|
|
*
|
|
* @param boolean $on Optional. Default true.
|
|
*/
|
|
function set_style_hanging_punctuation( $on = true ) {
	// Record the caller's choice under the 'styleHangingPunctuation' settings key.
	$settings_key = 'styleHangingPunctuation';

	$this->settings[ $settings_key ] = $on;
}
|
|
|
|
/**
|
|
* Set the list of tags where initial quotes and guillemets should be styled.
|
|
*
|
|
* @param string|array $tags A comma separated list or an array of tag names.
|
|
*/
|
|
function set_initial_quote_tags( $tags = array( 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote', 'li', 'dd', 'dt' ) ) {
	// A string list is split on every run of non-alphanumeric characters. The
	// match is case-insensitive so that upper-case tag names (e.g. "H1, P") stay
	// intact instead of being consumed as separators; they are lower-cased below.
	if ( ! is_array( $tags ) ) {
		$tags = preg_split( '/[^a-z0-9]+/i', $tags, -1, PREG_SPLIT_NO_EMPTY );
	}

	// Store the tag array inverted (with the lower-cased tag name as its index for faster lookup).
	$this->settings['initialQuoteTags'] = array_change_key_case( array_flip( $tags ), CASE_LOWER );
}
|
|
|
|
/**
|
|
* Enable/disable hyphenation.
|
|
*
|
|
* @param boolean $on Optional. Default true.
|
|
*/
|
|
function set_hyphenation( $on = true ) {
	// Record the caller's choice under the 'hyphenation' settings key.
	$settings_key = 'hyphenation';

	$this->settings[ $settings_key ] = $on;
}
|
|
|
|
/**
|
|
* Set the hyphenation pattern language.
|
|
*
|
|
* @param string $lang Has to correspond to a filename in 'lang'. Optional. Default 'en-US'.
|
|
*/
|
|
function set_hyphenation_language( $lang = 'en-US' ) {
	// Nothing to do if the requested language is already active.
	if ( isset( $this->settings['hyphenLanguage'] ) && $this->settings['hyphenLanguage'] === $lang ) {
		return;
	}

	$this->settings['hyphenLanguage'] = $lang;

	// Resolve the pattern file once, relative to this source file, and use the
	// very same absolute path for the existence check and the include. A relative
	// include would be resolved through include_path and the working directory
	// and could therefore load a different file than the one that was checked.
	$language_file = __DIR__ . '/lang/' . $lang . '.php';

	if ( file_exists( $language_file ) ) {
		// NOTE(review): the language file is expected to define $patgen,
		// $patgenMaxSeg and $patgenExceptions in the including scope - confirm for new language files.
		include( $language_file );

		// @todo Fix variable naming in language files. @codingStandardsIgnoreStart.
		$this->settings['hyphenationPattern']           = $patgen;
		$this->settings['hyphenationPatternMaxSegment'] = $patgenMaxSeg;
		$this->settings['hyphenationPatternExceptions'] = $patgenExceptions; // @codingStandardsIgnoreEnd.
	} else {
		// No pattern file for this language: drop any previously loaded patterns.
		unset( $this->settings['hyphenationPattern'] );
		unset( $this->settings['hyphenationPatternMaxSegment'] );
		unset( $this->settings['hyphenationPatternExceptions'] );
	}

	// Make sure hyphenationExceptions is not set to force remerging of patgen and custom exceptions.
	unset( $this->settings['hyphenationExceptions'] );
}
|
|
|
|
/**
|
|
* Set the minimum length of a word that may be hyphenated.
|
|
*
|
|
* @param number $length Defaults to 5. Trying to set the value to less than 2 resets the length to the default.
|
|
*/
|
|
function set_min_length_hyphenation( $length = 5 ) {
	// Any value that is not greater than 1 falls back to the default of 5.
	if ( ! ( $length > 1 ) ) {
		$length = 5;
	}

	$this->settings['hyphenMinLength'] = $length;
}
|
|
|
|
/**
|
|
* Set the minimum character requirement before a hyphenation point.
|
|
*
|
|
* @param number $length Defaults to 3. Trying to set the value to less than 1 resets the length to the default.
|
|
*/
|
|
function set_min_before_hyphenation( $length = 3 ) {
	// Any value that is not greater than 0 falls back to the default of 3.
	if ( ! ( $length > 0 ) ) {
		$length = 3;
	}

	$this->settings['hyphenMinBefore'] = $length;
}
|
|
|
|
/**
|
|
* Set the minimum character requirement after a hyphenation point.
|
|
*
|
|
* @param number $length Defaults to 2. Trying to set the value to less than 1 resets the length to the default.
|
|
*/
|
|
function set_min_after_hyphenation( $length = 2 ) {
	// Any value that is not greater than 0 falls back to the default of 2.
	if ( ! ( $length > 0 ) ) {
		$length = 2;
	}

	$this->settings['hyphenMinAfter'] = $length;
}
|
|
|
|
/**
|
|
* Enable/disable hyphenation of titles and headings.
|
|
*
|
|
* @param boolean $on Optional. Default true.
|
|
*/
|
|
function set_hyphenate_headings( $on = true ) {
	// Note the key name: the heading flag is stored as 'hyphenateTitle'.
	$settings_key = 'hyphenateTitle';

	$this->settings[ $settings_key ] = $on;
}
|
|
|
|
/**
|
|
* Enable/disable hyphenation of words set completely in capital letters.
|
|
*
|
|
* @param boolean $on Optional. Default true.
|
|
*/
|
|
function set_hyphenate_all_caps( $on = true ) {
	// Record the caller's choice under the 'hyphenateAllCaps' settings key.
	$settings_key = 'hyphenateAllCaps';

	$this->settings[ $settings_key ] = $on;
}
|
|
|
|
/**
|
|
* Enable/disable hyphenation of words starting with a capital letter.
|
|
*
|
|
* @param boolean $on Optional. Default true.
|
|
*/
|
|
function set_hyphenate_title_case( $on = true ) {
	// Record the caller's choice under the 'hyphenateTitleCase' settings key.
	$settings_key = 'hyphenateTitleCase';

	$this->settings[ $settings_key ] = $on;
}
|
|
|
|
/**
|
|
* Enable/disable hyphenation of compound words (e.g. "editor-in-chief").
|
|
*
|
|
* @param boolean $on Optional. Default true.
|
|
*/
|
|
function set_hyphenate_compounds( $on = true ) {
	// Record the caller's choice under the 'hyphenateCompounds' settings key.
	$settings_key = 'hyphenateCompounds';

	$this->settings[ $settings_key ] = $on;
}
|
|
|
|
/**
|
|
* Sets custom word hyphenations.
|
|
*
|
|
* @param string|array $exceptions An array of words with all hyphenation points marked with a hard hyphen (or a string list of such words).
|
|
* In the latter case, only alphanumeric characters and hyphens are recognized. The default is empty.
|
|
*/
|
|
function set_hyphenation_exceptions( $exceptions = array() ) {
	// A string list is tokenized with the shared parameter-splitting pattern.
	$words = is_array( $exceptions ) ? $exceptions : preg_split( $this->regex['parameterSplitting'], $exceptions, -1, PREG_SPLIT_NO_EMPTY );

	// Maps each lower-cased, hyphenated exception to its hyphen-free form.
	$unhyphenated = array();

	foreach ( $words as $word ) {
		$string_functions = $this->str_functions[ mb_detect_encoding( $word, $this->encodings, true ) ];

		// Skip words for which no usable set of string functions is registered.
		if ( empty( $string_functions ) || empty( $string_functions['strlen'] ) ) {
			continue;
		}

		$lowercase                  = $string_functions['strtolower']( $word );
		$unhyphenated[ $lowercase ] = preg_replace( "#-#{$string_functions['u']}", '', $lowercase );
	}

	// Flip the map so that the hyphen-free word becomes the lookup key.
	$this->settings['hyphenationCustomExceptions'] = array_flip( $unhyphenated );

	// Make sure hyphenationExceptions is not set to force remerging of patgen and custom exceptions.
	unset( $this->settings['hyphenationExceptions'] );
}
|
|
|
|
/**
|
|
* Modifies $html according to the defined settings.
|
|
*
|
|
* @param string $html A HTML fragment.
|
|
* @param boolean $is_title If the HTML fragment is a title. Optional. Default false.
|
|
* @return string The processed $html.
|
|
*/
|
|
function process( $html, $is_title = false ) {
	// Titles are returned untouched when <h1> or <h2> is on the list of ignored tags.
	if ( $is_title && isset( $this->settings['ignoreTags'] ) ) {
		$ignored_tags = $this->settings['ignoreTags'];

		if ( in_array( 'h1', $ignored_tags, true ) || in_array( 'h2', $ignored_tags, true ) ) {
			return $html;
		}
	}

	// Fetch the HTML parser and build a DOM (plus XPath helper) from the fragment.
	$parser   = $this->get_html5_parser();
	$document = $this->parse_html( $parser, $html );
	$xpath    = new \DOMXPath( $document );

	// Collect the body, every text node and the nodes that must be skipped.
	$body          = $xpath->query( '/html/body' )->item( 0 );
	$text_nodes    = $xpath->query( '//text()', $body );
	$ignored_nodes = $this->query_tags_to_ignore( $xpath, $body );

	foreach ( $text_nodes as $textnode ) {
		// Skip text inside ignored elements, and nodes that consist of whitespace only.
		if ( arrays_intersect( get_ancestors( $textnode ), $ignored_nodes ) || $textnode->isWhitespaceInElementContent() ) {
			continue;
		}

		// Re-encode <, > and & as HTML entities (&lt;, &gt; and &amp;); quotes are left alone.
		$textnode->data = htmlspecialchars( $textnode->data, ENT_NOQUOTES, 'UTF-8' );

		// Modify anything that requires adjacent text awareness here.
		$this->smart_math( $textnode );
		$this->smart_diacritics( $textnode );
		$this->smart_quotes( $textnode );
		$this->smart_dashes( $textnode );
		$this->smart_ellipses( $textnode );
		$this->smart_marks( $textnode );

		// Keep spacing after smart character replacement.
		$this->single_character_word_spacing( $textnode );
		$this->dash_spacing( $textnode );
		$this->unit_spacing( $textnode );
		$this->french_punctuation_spacing( $textnode );

		// Parse and process individual words.
		$this->process_words( $textnode, $is_title );

		// Some final space manipulation.
		$this->dewidow( $textnode );
		$this->space_collapse( $textnode );

		// Everything that requires HTML injection occurs from here on (the steps above
		// assume tag-free content), so the order of the following calls matters.
		$this->smart_ordinal_suffix( $textnode ); // before "style_numbers" and "smart_fractions".
		$this->smart_exponents( $textnode );      // before "style_numbers".
		$this->smart_fractions( $textnode );      // before "style_numbers", after "smart_ordinal_suffix".

		// Each styling step is skipped when has_class() reports the corresponding class for the node.
		if ( ! has_class( $textnode, $this->css_classes['caps'] ) ) {
			// Call before "style_numbers".
			$this->style_caps( $textnode );
		}

		if ( ! has_class( $textnode, $this->css_classes['numbers'] ) ) {
			// Call after "smart_ordinal_suffix", "smart_exponents", "smart_fractions", and "style_caps".
			$this->style_numbers( $textnode );
		}

		if ( ! has_class( $textnode, $this->css_classes['amp'] ) ) {
			$this->style_ampersands( $textnode );
		}

		if ( ! has_class( $textnode, array( $this->css_classes['quo'], $this->css_classes['dquo'] ) ) ) {
			$this->style_initial_quotes( $textnode, $is_title );
		}

		if ( ! has_class( $textnode, array( $this->css_classes['pull-single'], $this->css_classes['pull-double'] ) ) ) {
			$this->style_hanging_punctuation( $textnode );
		}

		// Until now only the text of a single node was edited: turn the result into real HTML nodes.
		$this->replace_node_with_html( $textnode, $textnode->data );
	}

	return $parser->saveHTML( $body->childNodes ); // @codingStandardsIgnoreLine.
}
|
|
|
|
/**
|
|
* Modifies $html according to the defined settings, in a way that is appropriate for RSS feeds
|
|
* (i.e. excluding processes that may not display well with limited character set intelligence).
|
|
*
|
|
* @param string $html A HTML fragment.
|
|
* @param boolean $is_title If the HTML fragment is a title. Optional. Default false.
|
|
* @return string The processed $html.
|
|
*/
|
|
function process_feed( $html, $is_title = false ) {
	// Titles are returned untouched when <h1> or <h2> is on the list of ignored tags.
	if ( $is_title && isset( $this->settings['ignoreTags'] ) ) {
		$ignored_tags = $this->settings['ignoreTags'];

		if ( in_array( 'h1', $ignored_tags, true ) || in_array( 'h2', $ignored_tags, true ) ) {
			return $html;
		}
	}

	// Only the HTML parser is needed here (the text parser is not used for feeds).
	$parser   = $this->get_html5_parser();
	$document = $this->parse_html( $parser, $html );
	$xpath    = new \DOMXPath( $document );

	// Collect the body, every text node and the nodes that must be skipped.
	$body          = $xpath->query( '/html/body' )->item( 0 );
	$text_nodes    = $xpath->query( '//text()', $body );
	$ignored_nodes = $this->query_tags_to_ignore( $xpath, $body );

	foreach ( $text_nodes as $textnode ) {
		// Skip text inside ignored elements, and nodes that consist of whitespace only.
		if ( arrays_intersect( get_ancestors( $textnode ), $ignored_nodes ) || $textnode->isWhitespaceInElementContent() ) {
			continue;
		}

		// Re-encode <, > and & as HTML entities (&lt;, &gt; and &amp;); quotes are left alone.
		$textnode->data = htmlspecialchars( $textnode->data, ENT_NOQUOTES, 'UTF-8' );

		// Only the smart character replacements are applied for feeds.
		$this->smart_quotes( $textnode );
		$this->smart_dashes( $textnode );
		$this->smart_ellipses( $textnode );
		$this->smart_marks( $textnode );

		// Until now only the text of a single node was edited: turn the result into real HTML nodes.
		$this->replace_node_with_html( $textnode, $textnode->data );
	}

	return $parser->saveHTML( $body->childNodes ); // @codingStandardsIgnoreLine.
}
|
|
|
|
/**
|
|
* Tokenize the content of a textnode and process the individual words separately.
|
|
*
|
|
* Currently this function applies the following enhancements:
|
|
* - wrapping hard hyphens
|
|
* - hyphenation
|
|
* - wrapping URLs
|
|
* - wrapping email addresses
|
|
*
|
|
* @param \DOMText $textnode The textnode to process.
|
|
* @param boolean $is_title If the HTML fragment is a title. Defaults to false.
|
|
*/
|
|
function process_words( \DOMText $textnode, $is_title = false ) {
	// Fetch the text parser.
	$parser = $this->get_text_parser();

	// The two settings below decide which word category receives all-caps words and compounds.
	$hyphenate_caps      = ! empty( $this->settings['hyphenateAllCaps'] );
	$hyphenate_compounds = ! empty( $this->settings['hyphenateCompounds'] );

	$mixed_caps       = $hyphenate_caps ? 'no-all-caps' : 'allow-all-caps';
	$letter_caps      = $hyphenate_caps ? 'allow-all-caps' : 'no-all-caps';
	$mixed_compounds  = $hyphenate_compounds ? 'no-compounds' : 'allow-compounds';
	$letter_compounds = $hyphenate_compounds ? 'allow-compounds' : 'no-compounds';

	// Tokenize the text and pull out the individual word categories.
	$parser->load( $textnode->data );

	// Words that are not letters-only.
	$mixed_words = $parser->get_words( 'no-all-letters', $mixed_caps, $mixed_compounds );

	// Compound words are only fetched separately when compound hyphenation is enabled.
	$compound_words = array();
	if ( $hyphenate_compounds ) {
		$compound_words = $parser->get_words( 'no-all-letters', $letter_caps, 'require-compounds' );
	}

	// Letters-only words; caps and compounds are handled inversely to the mixed words.
	$plain_words = $parser->get_words( 'require-all-letters', $letter_caps, $letter_compounds );
	$other_parts = $parser->get_other();

	// Process the individual categories.
	$mixed_words    = $this->wrap_hard_hyphens( $mixed_words );
	$compound_words = $this->hyphenate_compounds( $compound_words, $is_title, $textnode );
	$plain_words    = $this->hyphenate( $plain_words, $is_title, $textnode );
	$other_parts    = $this->wrap_emails( $this->wrap_urls( $other_parts ) );

	// Merge the results (array union) back into the parser and write the text to the node.
	$parser->update( $mixed_words + $compound_words + $plain_words + $other_parts );
	$textnode->data = $parser->unload();
}
|
|
|
|
/**
|
|
* Parse HTML5 fragment while ignoring certain warnings for invalid HTML code (e.g. duplicate IDs).
|
|
*
|
|
* @param \Masterminds\HTML5 $parser An initialized parser object.
|
|
* @param string $html The HTML fragment to parse (not a complete document).
|
|
*
|
|
* @return \DOMDocument The encoding has already been set to UTF-8.
|
|
*/
|
|
function parse_html( \Masterminds\HTML5 $parser, $html ) {
	// Route PHP errors through our own handler and keep libxml errors internal while parsing.
	$error_callback = array( $this, 'handle_parsing_errors' );
	set_error_handler( $error_callback );
	$previous_libxml_mode = libxml_use_internal_errors( true );

	// The fragment is wrapped in a <body> element before it is handed to the parser.
	$document           = $parser->loadHTML( "<body>{$html}</body>" );
	$document->encoding = 'UTF-8';

	// Discard collected libxml errors and put both error mechanisms back the way they were.
	libxml_clear_errors();
	libxml_use_internal_errors( $previous_libxml_mode );
	restore_error_handler();

	return $document;
}
|
|
|
|
/**
|
|
* Silently handle certain HTML parsing errors.
|
|
*
|
|
* @param int $errno Error number.
|
|
* @param string $errstr Error message.
|
|
* @param string $errfile The file in which the error occurred.
|
|
* @param int $errline The line in which the error occurred.
|
|
* @param array $errcontext Calling context.
|
|
*
|
|
* @return boolean Returns true if the error was handled, false otherwise.
|
|
*/
|
|
public function handle_parsing_errors( $errno, $errstr, $errfile, $errline, array $errcontext = array() ) {
	// $errcontext is unused and has a default value because PHP 8 no longer passes
	// a fifth argument to error handlers; a required parameter would make every
	// invocation fail with a "too few arguments" error.
	if ( ! ( error_reporting() & $errno ) ) {
		return true; // not interesting: this error level is not being reported.
	}

	// Swallow user-level warnings raised from a file whose name ends in "DOMTreeBuilder.php".
	if ( ( $errno & E_USER_WARNING ) && 0 === substr_compare( $errfile, 'DOMTreeBuilder.php', -18 ) ) {
		return true;
	}

	// Let PHP handle the rest.
	return false;
}
|
|
|
|
/**
|
|
* Retrieve an array of nodes that should be skipped during processing.
|
|
*
|
|
* @param \DOMXPath $xpath A valid XPath instance for the DOM to be queried.
|
|
* @param \DOMNode $initial_node The starting node of the XPath query.
|
|
* @return array An array of \DOMNode (can be empty).
|
|
*/
|
|
function query_tags_to_ignore( \DOMXPath $xpath, \DOMNode $initial_node ) {
	$query_parts = array();

	// Ignored tag names: "//a | //b".
	if ( ! empty( $this->settings['ignoreTags'] ) ) {
		$query_parts[] = '//' . implode( ' | //', $this->settings['ignoreTags'] );
	}

	// Ignored classes: one whitespace-delimited "contains" test per class, joined with "or".
	if ( ! empty( $this->settings['ignoreClasses'] ) ) {
		$class_tests = array();
		foreach ( $this->settings['ignoreClasses'] as $class_name ) {
			$class_tests[] = "contains(concat(' ', @class, ' '), ' " . $class_name . " ')";
		}

		$query_parts[] = '//*[' . implode( ' or ', $class_tests ) . ']';
	}

	// Ignored IDs: one equality test per ID, joined with "or".
	if ( ! empty( $this->settings['ignoreIDs'] ) ) {
		$id_tests = array();
		foreach ( $this->settings['ignoreIDs'] as $id ) {
			$id_tests[] = "@id='" . $id . "'";
		}

		$query_parts[] = '//*[' . implode( ' or ', $id_tests ) . ']';
	}

	// Without any ignore settings there is nothing to query.
	if ( empty( $query_parts ) ) {
		return array();
	}

	$nodelist = $xpath->query( implode( ' | ', $query_parts ), $initial_node );

	// A failed query yields an empty result instead of an error.
	if ( false === $nodelist ) {
		return array();
	}

	return nodelist_to_array( $nodelist );
}
|
|
|
|
/**
|
|
* Retrieve the last character of the previous \DOMText sibling (if there is one).
|
|
*
|
|
* @param \DOMNode $element The content node.
|
|
* @return string A single character (or the empty string).
|
|
*/
|
|
function get_prev_chr( \DOMNode $element ) {
	$neighbor = $this->get_previous_textnode( $element );

	// No preceding text node (or one without data): nothing to return.
	if ( ! isset( $neighbor ) || ! isset( $neighbor->data ) ) {
		return '';
	}

	// Pick the string functions registered for the detected encoding.
	$string_functions = $this->str_functions[ mb_detect_encoding( $neighbor->data, $this->encodings, true ) ];

	if ( empty( $string_functions ) || empty( $string_functions['substr'] ) ) {
		return ''; // @codeCoverageIgnore
	}

	// Take the last character and strip anything matched by the control character pattern.
	$last_character = $string_functions['substr']( $neighbor->data, - 1 );

	return preg_replace( $this->regex['controlCharacters'], '', $last_character );
}
|
|
|
|
/**
|
|
* Retrieve the first character of the next \DOMText sibling (if there is one).
|
|
*
|
|
* @param \DOMNode $element The content node.
|
|
* @return string A single character (or the empty string).
|
|
*/
|
|
function get_next_chr( \DOMNode $element ) {
	$neighbor = $this->get_next_textnode( $element );

	// No following text node (or one without data): nothing to return.
	if ( ! isset( $neighbor ) || ! isset( $neighbor->data ) ) {
		return '';
	}

	// Pick the string functions registered for the detected encoding.
	$string_functions = $this->str_functions[ mb_detect_encoding( $neighbor->data, $this->encodings, true ) ];

	if ( empty( $string_functions ) || empty( $string_functions['substr'] ) ) {
		return ''; // @codeCoverageIgnore
	}

	// Take the first character and strip anything matched by the control character pattern.
	$first_character = $string_functions['substr']( $neighbor->data, 0, 1 );

	return preg_replace( $this->regex['controlCharacters'], '', $first_character );
}
|
|
|
|
/**
|
|
* Retrieve the previous \DOMText sibling (if there is one).
|
|
*
|
|
* @param \DOMNode $element The content node. Optional. Default null.
|
|
* @return \DOMText Null if $element is a block-level element or no text sibling exists.
|
|
*/
|
|
function get_previous_textnode( \DOMNode $element = null ) {
	if ( ! isset( $element ) ) {
		return null;
	}

	// The search never crosses the boundary of a block-level element.
	if ( $element instanceof \DOMElement && isset( $this->block_tags[ $element->tagName ] ) ) { // @codingStandardsIgnoreLine.
		return null;
	}

	// Walk backwards through the preceding siblings until one of them yields a text node.
	$found = null;
	for ( $sibling = $element->previousSibling; $sibling && empty( $found ); $sibling = $sibling->previousSibling ) { // @codingStandardsIgnoreLine.
		$found = $this->get_last_textnode( $sibling );
	}

	// Nothing on this level: continue the search from the parent node.
	if ( ! $found ) {
		$found = $this->get_previous_textnode( $element->parentNode ); // @codingStandardsIgnoreLine.
	}

	return $found;
}
|
|
|
|
/**
|
|
* Retrieve the next \DOMText sibling (if there is one).
|
|
*
|
|
* @param \DOMNode $element The content node. Optional. Default null.
|
|
* @return \DOMText Null if $element is a block-level element or no text sibling exists.
|
|
*/
|
|
function get_next_textnode( \DOMNode $element = null ) {
	if ( ! isset( $element ) ) {
		return null;
	}

	// The search never crosses the boundary of a block-level element.
	if ( $element instanceof \DOMElement && isset( $this->block_tags[ $element->tagName ] ) ) { // @codingStandardsIgnoreLine.
		return null;
	}

	// Walk forwards through the following siblings until one of them yields a text node.
	$found = null;
	for ( $sibling = $element->nextSibling; $sibling && empty( $found ); $sibling = $sibling->nextSibling ) { // @codingStandardsIgnoreLine.
		$found = $this->get_first_textnode( $sibling );
	}

	// Nothing on this level: continue the search from the parent node.
	if ( ! $found ) {
		$found = $this->get_next_textnode( $element->parentNode ); // @codingStandardsIgnoreLine.
	}

	return $found;
}
|
|
|
|
/**
|
|
* Retrieve the first \DOMText child of the element. Block-level child elements are ignored.
|
|
*
|
|
* @param \DOMNode $element Optional. Default null.
|
|
* @param boolean $recursive Should be set to true on recursive calls. Optional. Default false.
|
|
*
|
|
* @return \DOMNode The first child of type \DOMText, the element itself if it is of type \DOMText or null.
|
|
*/
|
|
function get_first_textnode( \DOMNode $element = null, $recursive = false ) {
	if ( ! isset( $element ) ) {
		return null;
	}

	// A text node is its own first text node.
	if ( $element instanceof \DOMText ) {
		return $element;
	}

	// Anything that is neither \DOMText nor \DOMElement cannot contain text nodes.
	if ( ! $element instanceof \DOMElement ) {
		return null;
	}

	// Block-level elements are only skipped on recursive calls, i.e. when they are children.
	if ( $recursive && isset( $this->block_tags[ $element->tagName ] ) ) { // @codingStandardsIgnoreLine.
		return null;
	}

	if ( ! $element->hasChildNodes() ) {
		return null;
	}

	// Depth-first search through the children, front to back; the first hit wins.
	$children = $element->childNodes; // @codingStandardsIgnoreLine.
	for ( $index = 0; $index < $children->length; $index++ ) {
		$candidate = $this->get_first_textnode( $children->item( $index ), true );

		if ( ! empty( $candidate ) ) {
			return $candidate;
		}
	}

	return null;
}
|
|
|
|
/**
|
|
* Retrieve the last \DOMText child of the element. Block-level child elements are ignored.
|
|
*
|
|
* @param \DOMNode $element Optional. Default null.
|
|
* @param boolean $recursive Should be set to true on recursive calls. Optional. Default false.
|
|
*
|
|
* @return \DOMNode The last child of type \DOMText, the element itself if it is of type \DOMText or null.
|
|
*/
|
|
function get_last_textnode( \DOMNode $element = null, $recursive = false ) {
	if ( ! isset( $element ) ) {
		return null;
	}

	// A text node is its own last text node.
	if ( $element instanceof \DOMText ) {
		return $element;
	}

	// Anything that is neither \DOMText nor \DOMElement cannot contain text nodes.
	if ( ! $element instanceof \DOMElement ) {
		return null;
	}

	// Block-level elements are only skipped on recursive calls, i.e. when they are children.
	if ( $recursive && isset( $this->block_tags[ $element->tagName ] ) ) { // @codingStandardsIgnoreLine.
		return null;
	}

	if ( ! $element->hasChildNodes() ) {
		return null;
	}

	// Depth-first search through the children, back to front; the first hit wins.
	$children = $element->childNodes; // @codingStandardsIgnoreLine.
	for ( $index = $children->length - 1; $index >= 0; $index-- ) {
		$candidate = $this->get_last_textnode( $children->item( $index ), true );

		if ( ! empty( $candidate ) ) {
			return $candidate;
		}
	}

	return null;
}
|
|
|
|
/**
|
|
* Apply smart quotes (if enabled).
|
|
*
|
|
* @param \DOMText $textnode The content node.
|
|
*/
|
|
function smart_quotes( \DOMText $textnode ) {
	if ( empty( $this->settings['smartQuotes'] ) ) {
		return;
	}

	// The quote patterns need the characters adjacent to this text node (across inline
	// tags or HTML comments). If such characters exist, they are temporarily added to
	// the text and removed again at the end of this method.
	$previous_character = $this->get_prev_chr( $textnode );
	if ( '' !== $previous_character ) {
		$textnode->data = $previous_character . $textnode->data;
	}
	$next_character = $this->get_next_chr( $textnode );
	if ( '' !== $next_character ) {
		$textnode->data = $textnode->data . $next_character;
	}

	// Before primes, handle quoted numbers.
	$textnode->data = preg_replace( $this->regex['smartQuotesSingleQuotedNumbers'], $this->chr['singleQuoteOpen'] . '$1' . $this->chr['singleQuoteClose'], $textnode->data );
	$textnode->data = preg_replace( $this->regex['smartQuotesDoubleQuotedNumbers'], $this->chr['doubleQuoteOpen'] . '$1' . $this->chr['doubleQuoteClose'], $textnode->data );

	// Guillemets. process() and process_feed() entity-encode "<" and ">" before calling
	// this method, so the encoded forms have to be matched as well as the literal ones.
	$textnode->data = str_replace( '<<',       $this->chr['guillemetOpen'],  $textnode->data );
	$textnode->data = str_replace( '&lt;&lt;', $this->chr['guillemetOpen'],  $textnode->data );
	$textnode->data = str_replace( '>>',       $this->chr['guillemetClose'], $textnode->data );
	$textnode->data = str_replace( '&gt;&gt;', $this->chr['guillemetClose'], $textnode->data );

	// Primes.
	$textnode->data = preg_replace( $this->regex['smartQuotesSingleDoublePrime'], '$1' . $this->chr['singlePrime'] . '$2$3' . $this->chr['doublePrime'], $textnode->data );
	$textnode->data = preg_replace( $this->regex['smartQuotesSingleDoublePrime1Glyph'], '$1' . $this->chr['singlePrime'] . '$2$3' . $this->chr['doublePrime'], $textnode->data );
	$textnode->data = preg_replace( $this->regex['smartQuotesDoublePrime'], '$1' . $this->chr['doublePrime'], $textnode->data ); // should not interfere with regular quote matching.
	$textnode->data = preg_replace( $this->regex['smartQuotesSinglePrimeCompound'], '$1' . $this->chr['singlePrime'], $textnode->data );
	$textnode->data = preg_replace( $this->regex['smartQuotesDoublePrimeCompound'], '$1' . $this->chr['doublePrime'], $textnode->data );
	$textnode->data = preg_replace( $this->regex['smartQuotesDoublePrime1GlyphCompound'], '$1' . $this->chr['doublePrime'], $textnode->data );

	// Backticks (the double backtick has to be replaced before the single one).
	$textnode->data = str_replace( '``', $this->chr['doubleQuoteOpen'], $textnode->data );
	$textnode->data = str_replace( '`', $this->chr['singleQuoteOpen'], $textnode->data );
	$textnode->data = str_replace( "''", $this->chr['doubleQuoteClose'], $textnode->data );

	// Comma quotes.
	$textnode->data = str_replace( ',,', $this->chr['doubleLow9Quote'], $textnode->data );
	$textnode->data = preg_replace( $this->regex['smartQuotesCommaQuote'], $this->chr['singleLow9Quote'], $textnode->data ); // like _,¿hola?'_.

	// Apostrophes.
	$textnode->data = preg_replace( $this->regex['smartQuotesApostropheWords'], $this->chr['apostrophe'], $textnode->data );
	$textnode->data = preg_replace( $this->regex['smartQuotesApostropheDecades'], $this->chr['apostrophe'] . '$1', $textnode->data ); // decades: '98.
	$textnode->data = str_replace( $this->components['smartQuotesApostropheExceptionMatches'], $this->components['smartQuotesApostropheExceptionReplacements'], $textnode->data );

	// Quotes.
	$textnode->data = str_replace( $this->components['smartQuotesBracketMatches'], $this->components['smartQuotesBracketReplacements'], $textnode->data );
	$textnode->data = preg_replace( $this->regex['smartQuotesSingleQuoteOpen'], $this->chr['singleQuoteOpen'], $textnode->data );
	$textnode->data = preg_replace( $this->regex['smartQuotesSingleQuoteClose'], $this->chr['singleQuoteClose'], $textnode->data );
	$textnode->data = preg_replace( $this->regex['smartQuotesSingleQuoteOpenSpecial'], $this->chr['singleQuoteOpen'], $textnode->data ); // like _'¿hola?'_.
	$textnode->data = preg_replace( $this->regex['smartQuotesSingleQuoteCloseSpecial'], $this->chr['singleQuoteClose'], $textnode->data );
	$textnode->data = preg_replace( $this->regex['smartQuotesDoubleQuoteOpen'], $this->chr['doubleQuoteOpen'], $textnode->data );
	$textnode->data = preg_replace( $this->regex['smartQuotesDoubleQuoteClose'], $this->chr['doubleQuoteClose'], $textnode->data );
	$textnode->data = preg_replace( $this->regex['smartQuotesDoubleQuoteOpenSpecial'], $this->chr['doubleQuoteOpen'], $textnode->data );
	$textnode->data = preg_replace( $this->regex['smartQuotesDoubleQuoteCloseSpecial'], $this->chr['doubleQuoteClose'], $textnode->data );

	// Quote catch-alls - assume left over quotes are closing - as this is often the most complicated position, thus most likely to be missed.
	$textnode->data = str_replace( "'", $this->chr['singleQuoteClose'], $textnode->data );
	$textnode->data = str_replace( '"', $this->chr['doubleQuoteClose'], $textnode->data );

	// Remove the temporarily added adjacent characters again. The encoding is detected
	// on the final text so that the character-wise substr/strlen variants are used.
	$func = $this->str_functions[ mb_detect_encoding( $textnode->data, $this->encodings, true ) ];

	if ( '' !== $previous_character ) {
		$textnode->data = $func['substr']( $textnode->data, 1, $func['strlen']( $textnode->data ) );
	}
	if ( '' !== $next_character ) {
		$textnode->data = $func['substr']( $textnode->data, 0, $func['strlen']( $textnode->data ) - 1 );
	}
}
|
|
|
|
/**
|
|
* Apply smart dashes (if enabled).
|
|
*
|
|
* @param \DOMText $textnode The content node.
|
|
*/
|
|
function smart_dashes( \DOMText $textnode ) {
	if ( empty( $this->settings['smartDashes'] ) ) {
		return;
	}

	$en_dash       = $this->chr['enDash'];
	$parenthetical = '$1' . $this->chr['parentheticalDash'] . '$2';

	// Triple and double hyphens first; parenthetical dashes are matched after each step.
	$textnode->data = str_replace( '---', $this->chr['emDash'], $textnode->data );
	$textnode->data = preg_replace( $this->regex['smartDashesParentheticalDoubleDash'], $parenthetical, $textnode->data );
	$textnode->data = str_replace( '--', $en_dash, $textnode->data );
	$textnode->data = preg_replace( $this->regex['smartDashesParentheticalSingleDash'], $parenthetical, $textnode->data );

	// En dash variants; the patterns are applied one after the other, in the order listed.
	$textnode->data = preg_replace(
		array(
			$this->regex['smartDashesEnDashAll'],
			$this->regex['smartDashesEnDashWords'],
			$this->regex['smartDashesEnDashNumbers'],
			$this->regex['smartDashesEnDashPhoneNumbers'], // phone numbers.
		),
		array(
			'$1' . $en_dash . '$2',
			'$1' . $en_dash . '$2',
			'$1' . $this->chr['intervalDash'] . '$2',
			'$1' . $this->chr['noBreakHyphen'] . '$2',
		),
		$textnode->data
	);

	// Revert messed-up punycode.
	$textnode->data = str_replace( 'xn' . $en_dash, 'xn--', $textnode->data );

	// Revert dates back to their original formats, again in the order listed:
	// YYYY-MM-DD, then MM-DD-YYYY or DD-MM-YYYY, then YYYY-MM or YYYY-DDDD.
	$textnode->data = preg_replace(
		array(
			$this->regex['smartDashesYYYY-MM-DD'],
			$this->regex['smartDashesMM-DD-YYYY'],
			$this->regex['smartDashesYYYY-MM'],
		),
		array(
			'$1-$2-$3',
			'$1$3-$2$4-$5',
			'$1-$2',
		),
		$textnode->data
	);
}
|
|
|
|
/**
 * Replaces runs of three or four periods with an ellipsis character (if enabled).
 *
 * Four-period runs (with or without separating spaces) become a period
 * followed by the ellipsis; they are handled first so that the three-period
 * rule does not consume part of them.
 *
 * @param \DOMText $textnode The content node; its data is rewritten in place.
 */
function smart_ellipses( \DOMText $textnode ) {
	if ( empty( $this->settings['smartEllipses'] ) ) {
		return;
	}

	$ellipsis = $this->chr['ellipses'];

	$text           = str_replace( array( '....', '. . . .' ), '.' . $ellipsis, $textnode->data );
	$textnode->data = str_replace( array( '...', '. . .' ), $ellipsis, $text );
}
|
|
|
|
/**
 * Restores diacritics on known words (if enabled).
 *
 * Requires the 'smartDiacritics' setting plus non-empty 'patterns' and
 * 'replacements' entries under the 'diacriticReplacement' setting. Every
 * pattern match is looked up in the replacements map; matches without an
 * entry are left as they are.
 *
 * @param \DOMText $textnode The content node; its data is rewritten in place.
 */
function smart_diacritics( \DOMText $textnode ) {
	if ( empty( $this->settings['smartDiacritics'] ) ) {
		return; // abort.
	}

	if ( empty( $this->settings['diacriticReplacement']['patterns'] ) ||
		 empty( $this->settings['diacriticReplacement']['replacements'] ) ) {
		return; // nothing to replace with.
	}

	// "word" => "replacement" pairs, consulted for every regex match.
	$patterns = $this->settings['diacriticReplacement']['patterns'];
	$map      = $this->settings['diacriticReplacement']['replacements'];

	$textnode->data = preg_replace_callback( $patterns, function( $hit ) use ( $map ) {
		return isset( $map[ $hit[0] ] ) ? $map[ $hit[0] ] : $hit[0];
	}, $textnode->data );
}
|
|
|
|
/**
 * Replaces ASCII mark notations such as (c), (r) and (tm) with their symbols (if enabled).
 *
 * Occurrences matched by the "501(c)" regex are protected with the escape
 * marker before the replacement and released again afterwards.
 *
 * @param \DOMText $textnode The content node; its data is rewritten in place.
 */
function smart_marks( \DOMText $textnode ) {
	if ( empty( $this->settings['smartMarks'] ) ) {
		return;
	}

	$marker = $this->components['escapeMarker'];

	// Escape usage of "501(c)(1...29)" (US non-profit).
	$text = preg_replace( $this->regex['smartMarksEscape501(c)'], '$1' . $marker . '$2' . $marker . '$3', $textnode->data );

	// Replace marks. str_replace() works through the pairs in list order.
	$notations = array( '(c)', '(C)', '(r)', '(R)', '(p)', '(P)', '(sm)', '(SM)', '(tm)', '(TM)' );
	$symbols   = array(
		$this->chr['copyright'],
		$this->chr['copyright'],
		$this->chr['registeredMark'],
		$this->chr['registeredMark'],
		$this->chr['soundCopyMark'],
		$this->chr['soundCopyMark'],
		$this->chr['serviceMark'],
		$this->chr['serviceMark'],
		$this->chr['tradeMark'],
		$this->chr['tradeMark'],
	);
	$text = str_replace( $notations, $symbols, $text );

	// Un-escape escaped sequences.
	$textnode->data = str_replace( $marker, '', $text );
}
|
|
|
|
/**
 * Converts math operators inside equations to typographic symbols (if enabled).
 *
 * Equations are found with the 'smartMathEquation' regex and rewritten by
 * _smart_math_callback(). Afterwards a series of "revert" regexes restores
 * number ranges, fractions and dates that were converted along the way.
 *
 * @param \DOMText $textnode The content node; its data is rewritten in place.
 */
function smart_math( \DOMText $textnode ) {
	if ( empty( $this->settings['smartMath'] ) ) {
		return;
	}

	// First, let's find math equations.
	$text = preg_replace_callback( $this->regex['smartMathEquation'], array( $this, '_smart_math_callback' ), $textnode->data );

	// Then undo the conversion where it was not wanted (applied in this order).
	$reverts = array(
		// 4-4 goes back to a plain hyphen-minus so ranges of numbers (i.e. pp. 46-50) stay intact.
		'smartMathRevertRange'          => '$1-$2',
		// Fractions go back to a basic slash; styling them is left to smart_fractions.
		'smartMathRevertFraction'       => '$1/$2',
		// YYYY-MM-DD.
		'smartMathRevertDateYYYY-MM-DD' => '$1-$2-$3',
		// MM-DD-YYYY or DD-MM-YYYY.
		'smartMathRevertDateMM-DD-YYYY' => '$1$3-$2$4-$5',
		// YYYY-MM or YYYY-DDD.
		'smartMathRevertDateYYYY-MM'    => '$1-$2',
		// MM/DD/YYYY or DD/MM/YYYY.
		'smartMathRevertDateMM/DD/YYYY' => '$1$3/$2$4/$5',
	);
	foreach ( $reverts as $regex_key => $replacement ) {
		$text = preg_replace( $this->regex[ $regex_key ], $replacement, $text );
	}

	$textnode->data = $text;
}
|
|
|
|
/**
 * Callback function for smart math.
 *
 * Swaps the ASCII operators in the complete match for their typographic
 * counterparts: '-' becomes the minus sign, '/' the division sign, and both
 * 'x' and '*' the multiplication sign.
 *
 * @param array $matches Regex matches; only the full match (index 0) is used.
 * @return string The full match with its operators replaced.
 */
private function _smart_math_callback( array $matches ) {
	$ascii_operators = array( '-', '/', 'x', '*' );
	$math_symbols    = array(
		$this->chr['minus'],
		$this->chr['division'],
		$this->chr['multiplication'],
		$this->chr['multiplication'],
	);

	// str_replace() processes the pairs one after another, in list order.
	return str_replace( $ascii_operators, $math_symbols, $matches[0] );
}
|
|
|
|
/**
 * Wraps exponents in <sup> markup (if enabled).
 *
 * Purposefully separated from smart_math because of HTML code injection.
 *
 * @param \DOMText $textnode The content node; its data is rewritten in place.
 */
function smart_exponents( \DOMText $textnode ) {
	if ( ! empty( $this->settings['smartExponents'] ) ) {
		// Handle exponents (i.e. 4^2).
		$superscripted  = '$1<sup>$2</sup>';
		$textnode->data = preg_replace( $this->regex['smartExponents'], $superscripted, $textnode->data );
	}
}
|
|
|
|
/**
 * Applies fraction spacing and fraction markup (if enabled).
 *
 * Two settings are involved: 'fractionSpacing' inserts a no-break space using
 * the 'smartFractionsSpacing' regex (a narrow one when 'smartFractions' is
 * also on), and 'smartFractions' wraps numerator and denominator in
 * <sup>/<sub> markup around a fraction slash.
 *
 * Call before style_numbers, but after smart_ordinal_suffix.
 * Purposefully separated from smart_math because of HTML code injection.
 *
 * @param \DOMText $textnode The content node; its data is rewritten in place.
 */
function smart_fractions( \DOMText $textnode ) {
	$with_markup  = ! empty( $this->settings['smartFractions'] );
	$with_spacing = ! empty( $this->settings['fractionSpacing'] );

	if ( ! $with_markup && ! $with_spacing ) {
		return;
	}

	$text = $textnode->data;

	if ( $with_spacing ) {
		// The narrow variant is used together with fraction markup, the regular one otherwise.
		$space = $with_markup ? $this->chr['noBreakNarrowSpace'] : $this->chr['noBreakSpace'];
		$text  = preg_replace( $this->regex['smartFractionsSpacing'], '$1' . $space . '$2', $text );
	}

	if ( $with_markup ) {
		$marker = $this->components['escapeMarker'];

		// Escape sequences we don't want fractionified.
		$escaped = '$1' . $marker . '$2$3$4';
		$text    = preg_replace( $this->regex['smartFractionsEscapeYYYY/YYYY'], $escaped, $text );
		$text    = preg_replace( $this->regex['smartFractionsEscapeMM/YYYY'], $escaped, $text );

		// Replace fractions; the class attributes are omitted when no class is configured.
		$numerator_class   = '';
		$denominator_class = '';
		if ( ! empty( $this->css_classes['numerator'] ) ) {
			$numerator_class = ' class="' . $this->css_classes['numerator'] . '"';
		}
		if ( ! empty( $this->css_classes['denominator'] ) ) {
			$denominator_class = ' class="' . $this->css_classes['denominator'] . '"';
		}

		$fraction = '<sup' . $numerator_class . '>$1</sup>' . $this->chr['fractionSlash'] . '<sub' . $denominator_class . '>$2</sub>$3';
		$text     = preg_replace( $this->regex['smartFractionsReplacement'], $fraction, $text );

		// Unescape escaped sequences.
		$text = str_replace( $marker, '', $text );
	}

	$textnode->data = $text;
}
|
|
|
|
/**
 * Wraps ordinal suffixes in <sup> markup (if enabled).
 *
 * The <sup> element carries the configured 'ordinal' CSS class, or no class
 * attribute at all when that class is empty.
 *
 * Call before style_numbers.
 *
 * @param \DOMText $textnode The content node; its data is rewritten in place.
 */
function smart_ordinal_suffix( \DOMText $textnode ) {
	if ( empty( $this->settings['smartOrdinalSuffix'] ) ) {
		return;
	}

	$class_attribute = '';
	if ( ! empty( $this->css_classes['ordinal'] ) ) {
		$class_attribute = ' class="' . $this->css_classes['ordinal'] . '"';
	}

	$textnode->data = preg_replace( $this->regex['smartOrdinalSuffix'], '$1<sup' . $class_attribute . '>$2</sup>', $textnode->data );
}
|
|
|
|
/**
 * Prevent single character words from being alone (if enabled).
 *
 * The characters adjacent to the node (as reported by get_prev_chr() and
 * get_next_chr()) are temporarily attached to the text so the regex can see
 * them as context, and are stripped again after the replacement.
 *
 * If the encoding of the processed text cannot be mapped to a set of string
 * functions, the context characters cannot be stripped safely; in that case
 * the node is restored to its original text instead of failing.
 *
 * @param \DOMText $textnode The content node; its data is rewritten in place.
 */
function single_character_word_spacing( \DOMText $textnode ) {
	if ( empty( $this->settings['singleCharacterWordSpacing'] ) ) {
		return;
	}

	// Kept so the node can be restored if the context cannot be removed again.
	$original_text = $textnode->data;

	// Add $next_character and $previous_character for context.
	$previous_character = $this->get_prev_chr( $textnode );
	if ( '' !== $previous_character ) {
		$textnode->data = $previous_character . $textnode->data;
	}

	$next_character = $this->get_next_chr( $textnode );
	if ( '' !== $next_character ) {
		$textnode->data = $textnode->data . $next_character;
	}

	$textnode->data = preg_replace( $this->regex['singleCharacterWordSpacing'], '$1$2' . $this->chr['noBreakSpace'], $textnode->data );

	if ( '' === $previous_character && '' === $next_character ) {
		return; // no context was added, so there is nothing to strip.
	}

	// mb_detect_encoding() returns false when none of the candidate encodings fits.
	$encoding = mb_detect_encoding( $textnode->data, $this->encodings, true );
	$func     = ( false !== $encoding && isset( $this->str_functions[ $encoding ] ) ) ? $this->str_functions[ $encoding ] : array();

	if ( empty( $func['substr'] ) || empty( $func['strlen'] ) ) {
		$textnode->data = $original_text;
		return;
	}

	// If we have adjacent characters remove them from the text.
	if ( '' !== $previous_character ) {
		$textnode->data = $func['substr']( $textnode->data, 1, $func['strlen']( $textnode->data ) );
	}
	if ( '' !== $next_character ) {
		$textnode->data = $func['substr']( $textnode->data, 0, $func['strlen']( $textnode->data ) - 1 );
	}
}
|
|
|
|
/**
 * Apply spacing around dashes (if enabled).
 *
 * Three regexes run in sequence (em dash, parenthetical dash, interval dash);
 * each match is re-emitted with the configured spacing character on both sides.
 *
 * @param \DOMText $textnode The content node; its data is rewritten in place.
 */
function dash_spacing( \DOMText $textnode ) {
	if ( empty( $this->settings['dashSpacing'] ) ) {
		return;
	}

	$interval_space      = $this->chr['intervalDashSpace'];
	$parenthetical_space = $this->chr['parentheticalDashSpace'];

	// Regex key => spacing character placed on both sides of the match.
	$spacing_rules = array(
		'dashSpacingEmDash'            => $interval_space,
		'dashSpacingParentheticalDash' => $parenthetical_space,
		'dashSpacingIntervalDash'      => $interval_space,
	);

	$text = $textnode->data;
	foreach ( $spacing_rules as $regex_key => $space ) {
		$text = preg_replace( $this->regex[ $regex_key ], $space . '$1$2' . $space, $text );
	}

	$textnode->data = $text;
}
|
|
|
|
/**
 * Collapse spaces (if enabled).
 *
 * Runs the three collapse regexes in order and, when get_prev_chr() reports
 * no preceding character, also removes the spacing matched by the
 * block-start regex.
 *
 * @param \DOMText $textnode The content node; its data is rewritten in place.
 */
function space_collapse( \DOMText $textnode ) {
	if ( empty( $this->settings['spaceCollapse'] ) ) {
		return;
	}

	// Normal spacing.
	$text = preg_replace( $this->regex['spaceCollapseNormal'], ' ', $textnode->data );

	// Non-breakable space gets priority. If a non-breakable space exists in a
	// string of spaces, it collapses to a single non-breakable space.
	$text = preg_replace( $this->regex['spaceCollapseNonBreakable'], $this->chr['noBreakSpace'], $text );

	// For any other spacing, replace with the first occurrence of an unusual space character.
	$textnode->data = preg_replace( $this->regex['spaceCollapseOther'], '$1', $text );

	// Remove all spacing at beginning of block level elements.
	$is_block_start = ( '' === $this->get_prev_chr( $textnode ) ); // we have the first text in a block level element.
	if ( $is_block_start ) {
		$textnode->data = preg_replace( $this->regex['spaceCollapseBlockStart'], '', $textnode->data );
	}
}
|
|
|
|
/**
 * Prevent values being split from their units (if enabled).
 *
 * Matches of the unit pattern are re-emitted with a narrow no-break space
 * between the two captured parts.
 *
 * @param \DOMText $textnode The content node; its data is rewritten in place.
 */
function unit_spacing( \DOMText $textnode ) {
	if ( ! empty( $this->settings['unitSpacing'] ) ) {
		$glued          = '$1' . $this->chr['noBreakNarrowSpace'] . '$2';
		$textnode->data = preg_replace( $this->regex['unitSpacingUnitPattern'], $glued, $textnode->data );
	}
}
|
|
|
|
/**
 * Applies French punctuation spacing (if enabled).
 *
 * Adds a narrow no-break space before
 * - exclamation mark (!)
 * - question mark (?)
 * - semicolon (;)
 * - colon (:)
 *
 * If there already is a space there, it is replaced.
 *
 * @param \DOMText $textnode The content node; its data is rewritten in place.
 */
function french_punctuation_spacing( \DOMText $textnode ) {
	if ( empty( $this->settings['frenchPunctuationSpacing'] ) ) {
		return;
	}

	$narrow_space = $this->chr['noBreakNarrowSpace'];
	$full_space   = $this->chr['noBreakSpace'];

	// Regex key => replacement, applied in this order.
	$spacing_rules = array(
		'frenchPunctuationSpacingNarrow'       => '$1' . $narrow_space . '$3$4',
		'frenchPunctuationSpacingFull'         => '$1' . $full_space . '$3$4',
		'frenchPunctuationSpacingSemicolon'    => '$1' . $narrow_space . '$3$4',
		'frenchPunctuationSpacingOpeningQuote' => '$1$2' . $narrow_space . '$4',
	);

	$text = $textnode->data;
	foreach ( $spacing_rules as $regex_key => $replacement ) {
		$text = preg_replace( $this->regex[ $regex_key ], $replacement, $text );
	}

	$textnode->data = $text;
}
|
|
|
|
/**
 * Wrap hard hyphens with zero-width spaces (if enabled).
 *
 * With 'hyphenHardWrap' enabled, a zero-width space is inserted after every
 * hyphen variant (normalised to '-'), underscore and slash; the
 * 'wrapHardHyphensRemoveEndingSpace' regex then cleans up the token.
 * With 'smartDashes' enabled, remaining '-' characters are replaced by the
 * typographic hyphen.
 *
 * @param array $parsed_text_tokens The tokenized content of a textnode.
 * @return array The tokens with their 'value' entries updated.
 */
function wrap_hard_hyphens( array $parsed_text_tokens ) {
	// The settings do not change while iterating, so evaluate them once.
	$hard_wrap    = ! empty( $this->settings['hyphenHardWrap'] );
	$smart_dashes = ! empty( $this->settings['smartDashes'] );

	if ( ! $hard_wrap && ! $smart_dashes ) {
		return $parsed_text_tokens;
	}

	foreach ( $parsed_text_tokens as &$text_token ) {
		if ( $hard_wrap ) {
			$text_token['value'] = str_replace( $this->components['hyphensArray'], '-' . $this->chr['zeroWidthSpace'], $text_token['value'] );
			$text_token['value'] = str_replace( '_', '_' . $this->chr['zeroWidthSpace'], $text_token['value'] );
			$text_token['value'] = str_replace( '/', '/' . $this->chr['zeroWidthSpace'], $text_token['value'] );

			$text_token['value'] = preg_replace( $this->regex['wrapHardHyphensRemoveEndingSpace'], '$1', $text_token['value'] );
		}

		if ( $smart_dashes ) {
			// Handled here because we need to know we are inside a word and not a URL.
			$text_token['value'] = str_replace( '-', $this->chr['hyphen'], $text_token['value'] );
		}
	}
	unset( $text_token ); // break the reference to the last token.

	return $parsed_text_tokens;
}
|
|
|
|
/**
 * Prevent widows (if enabled).
 *
 * Intervening inline tags may interfere with widow identification, but that
 * is a sacrifice of using the parser. Intervening tags will only interfere
 * if they separate the widow from previous or preceding whitespace.
 *
 * Requires the 'dewidow', 'dewidowMaxPull' and 'dewidowMaxLength' settings
 * to be non-empty. The actual decision is made in _dewidow_callback().
 *
 * @param \DOMText $textnode The content node; its data is rewritten in place.
 */
function dewidow( \DOMText $textnode ) {
	foreach ( array( 'dewidow', 'dewidowMaxPull', 'dewidowMaxLength' ) as $required_setting ) {
		if ( empty( $this->settings[ $required_setting ] ) ) {
			return;
		}
	}

	if ( '' !== $this->get_next_chr( $textnode ) ) {
		return; // only the last type "text" child of a block level element is processed.
	}

	$textnode->data = preg_replace_callback( $this->regex['dewidow'], array( $this, '_dewidow_callback' ), $textnode->data );
}
|
|
|
|
/**
 * Callback function for de-widowing.
 *
 * Uses the named groups 'space_before', 'neighbor', 'space_between',
 * 'widow' and 'trailing' of the match. Zero-width spaces and soft hyphens
 * are always stripped from the widow and the trailing part, and whitespace
 * in the trailing part becomes a no-break space. The space between neighbor
 * and widow is replaced by a no-break space only if none of the "eject"
 * conditions below applies.
 *
 * If the encoding of the match cannot be mapped to a set of string
 * functions, the match is returned untouched.
 *
 * @param array $widow Regex matching array.
 * @return string The replacement for the complete match.
 */
private function _dewidow_callback( array $widow ) {
	// mb_detect_encoding() returns false when none of the candidate encodings fits.
	$encoding = mb_detect_encoding( $widow[0], $this->encodings, true );
	$func     = ( false !== $encoding && isset( $this->str_functions[ $encoding ] ) ) ? $this->str_functions[ $encoding ] : array();

	if ( empty( $func['strlen'] ) || ! isset( $func['u'] ) ) {
		return $widow[0]; // unknown encoding, leave the text alone.
	}

	// If we are here, we know that widows are being protected in some fashion
	// with that, we will assert that widows should never be hyphenated or wrapped
	// as such, we will strip soft hyphens and zero-width-spaces.
	$unwanted = array( $this->chr['zeroWidthSpace'], $this->chr['softHyphen'] );

	$widow['widow']    = str_replace( $unwanted, '', $widow['widow'] ); // TODO: check if this can match here.
	$widow['trailing'] = preg_replace( "/\s+/{$func['u']}", $this->chr['noBreakSpace'], $widow['trailing'] );
	$widow['trailing'] = str_replace( $unwanted, '', $widow['trailing'] );

	// Result for all "eject" cases: the original space between neighbor and widow is kept.
	$unprotected = $widow['space_before'] . $widow['neighbor'] . $widow['space_between'] . $widow['widow'] . $widow['trailing'];

	// Eject if widows neighbor is preceded by a no break space (the pulled text would be too long).
	// NOTE(review): the no-break space is passed as the haystack and 'space_before' as the
	// needle, so this only triggers when 'space_before' is contained in the no-break space
	// string. The arguments look swapped - confirm against the 'dewidow' regex before changing.
	if ( '' === $widow['space_before'] || strstr( $this->chr['noBreakSpace'], $widow['space_before'] ) ) {
		return $unprotected;
	}

	// Eject if widows neighbor length exceeds the max allowed or widow length exceeds max allowed.
	if ( $func['strlen']( $widow['neighbor'] ) > $this->settings['dewidowMaxPull'] ||
		 $func['strlen']( $widow['widow'] ) > $this->settings['dewidowMaxLength'] ) {
		return $unprotected;
	}

	// Never replace thin and hair spaces with a no-break space.
	if ( in_array( $widow['space_between'], array( $this->chr['thinSpace'], $this->chr['hairSpace'] ), true ) ) {
		return $unprotected;
	}

	// Let's protect some widows!
	return $widow['space_before'] . $widow['neighbor'] . $this->chr['noBreakSpace'] . $widow['widow'] . $widow['trailing'];
}
|
|
|
|
/**
 * Wrap URL parts with zero-width spaces (if enabled).
 *
 * For every token matching the URL pattern:
 * - a zero-width space is added after the schema,
 * - the domain is split into parts, run through do_hyphenate(), and the
 *   resulting soft hyphens are turned into zero-width spaces; single-byte
 *   parts after the first (the captured delimiters) get a zero-width space
 *   in front,
 * - the path gets a zero-width space before each byte, except before the
 *   first one and within the trailing region defined by 'urlMinAfterWrap'.
 *
 * @param array $parsed_text_tokens The tokenized content of a textnode.
 * @return array The tokens with their 'value' entries updated.
 */
function wrap_urls( array $parsed_text_tokens ) {
	if ( empty( $this->settings['urlWrap'] ) || empty( $this->settings['urlMinAfterWrap'] ) ) {
		return $parsed_text_tokens;
	}

	$zero_width_space = $this->chr['zeroWidthSpace'];
	$min_after_wrap   = $this->settings['urlMinAfterWrap'];

	// Test for and parse urls.
	foreach ( $parsed_text_tokens as &$text_token ) {
		if ( ! preg_match( $this->regex['wrapUrlsPattern'], $text_token['value'], $url_match ) ) {
			continue;
		}

		// $url_match['schema'] holds "http://".
		// $url_match['domain'] holds "subdomains.domain.tld".
		// $url_match['path'] holds the path after the domain.
		$http = ! empty( $url_match['schema'] ) ? $url_match[1] . $zero_width_space : '';

		$domain_parts = preg_split( $this->regex['wrapUrlsDomainParts'], $url_match['domain'], -1, PREG_SPLIT_DELIM_CAPTURE );

		// This is a hack, but it works.
		// First, we hyphenate each part, we need it formatted like a group of words.
		$parsed_words_like = array();
		foreach ( $domain_parts as $key => $part ) {
			$parsed_words_like[ $key ] = array( 'value' => $part );
		}

		// Do the hyphenation.
		$parsed_words_like = $this->do_hyphenate( $parsed_words_like );

		// Restore format.
		foreach ( $parsed_words_like as $key => $parsed_word ) {
			$domain_parts[ $key ] = $parsed_word['value'];
		}

		// Iterating by value and writing back by key avoids a lingering reference.
		foreach ( $domain_parts as $key => $part ) {
			// Then we swap out each soft-hyphen with a zero-space.
			$part = str_replace( $this->chr['softHyphen'], $zero_width_space, $part );

			// We also insert zero-spaces before periods and hyphens.
			if ( $key > 0 && 1 === strlen( $part ) ) {
				$part = $zero_width_space . $part;
			}

			$domain_parts[ $key ] = $part;
		}

		// Lastly let's recombine.
		$domain = implode( $domain_parts );

		// Break up the URL path to individual characters.
		// A trailing group that did not take part in the match is absent from $url_match.
		$url_path = isset( $url_match['path'] ) ? $url_match['path'] : '';
		$path     = '';

		if ( '' !== $url_path ) {
			$path_parts = str_split( $url_path, 1 );
			$path_count = count( $path_parts );

			foreach ( $path_parts as $index => $path_part ) {
				if ( 0 === $index || $path_count - $index < $min_after_wrap ) {
					$path .= $path_part;
				} else {
					$path .= $zero_width_space . $path_part;
				}
			}
		}

		$text_token['value'] = $http . $domain . $path;
	}
	unset( $text_token ); // break the reference to the last token.

	return $parsed_text_tokens;
}
|
|
|
|
/**
 * Wrap email parts with zero-width spaces (if enabled).
 *
 * Tokens matching the 'wrapEmailsMatchEmails' regex get a zero-width space
 * appended to every match of the 'wrapEmailsReplaceEmails' regex.
 *
 * @param array $parsed_text_tokens The tokenized content of a textnode.
 * @return array The tokens with their 'value' entries updated.
 */
function wrap_emails( array $parsed_text_tokens ) {
	if ( empty( $this->settings['emailWrap'] ) ) {
		return $parsed_text_tokens;
	}

	$replacement = '$1' . $this->chr['zeroWidthSpace'];

	// Test for and parse emails.
	foreach ( $parsed_text_tokens as &$text_token ) {
		// Only the fact that the token matches is needed, not the captured groups.
		if ( preg_match( $this->regex['wrapEmailsMatchEmails'], $text_token['value'] ) ) {
			$text_token['value'] = preg_replace( $this->regex['wrapEmailsReplaceEmails'], $replacement, $text_token['value'] );
		}
	}
	unset( $text_token ); // break the reference to the last token.

	return $parsed_text_tokens;
}
|
|
|
|
/**
 * Wraps words of all caps (may include numbers) in <span class="caps"> if enabled.
 *
 * The class name is taken from the configured 'caps' CSS class.
 *
 * Call before style_numbers(). Only call if you are certain that no html tags
 * have been injected containing capital letters.
 *
 * @param \DOMText $textnode The content node; its data is rewritten in place.
 */
function style_caps( \DOMText $textnode ) {
	if ( ! empty( $this->settings['styleCaps'] ) ) {
		$wrapped        = '<span class="' . $this->css_classes['caps'] . '">$1</span>';
		$textnode->data = preg_replace( $this->regex['styleCaps'], $wrapped, $textnode->data );
	}
}
|
|
|
|
/**
 * Replace the given node with HTML content. Uses the HTML5 parser.
 *
 * The fragment is parsed, imported into the node's owner document and
 * swapped in for the node. While parsing and importing, handle_parsing_errors()
 * is installed as the PHP error handler; the previous handler is restored
 * before returning.
 *
 * @param \DOMNode $node    The node to replace.
 * @param string   $content The HTML fragment used to replace the node.
 *
 * @return \DOMNode|array An array of \DOMNode containing the new nodes or the old \DOMNode if the replacement failed.
 */
function replace_node_with_html( \DOMNode $node, $content ) {
	$parent = $node->parentNode; // @codingStandardsIgnoreLine.
	if ( empty( $parent ) ) {
		return $node; // abort early to save cycles.
	}

	// Until proven otherwise, the replacement counts as failed.
	$result = $node;

	set_error_handler( array( $this, 'handle_parsing_errors' ) );

	$imported_fragment = null;
	$html_fragment     = $this->get_html5_parser()->loadHTMLFragment( $content );
	if ( ! empty( $html_fragment ) ) {
		$imported_fragment = $node->ownerDocument->importNode( $html_fragment, true ); // @codingStandardsIgnoreLine.
	}

	if ( ! empty( $imported_fragment ) ) {
		// Save the children of the imported DOMDocumentFragment before replacement.
		$children = nodelist_to_array( $imported_fragment->childNodes ); // @codingStandardsIgnoreLine.

		// Success! We return the saved array of DOMNodes as
		// $imported_fragment is just an empty DOMDocumentFragment now.
		if ( false !== $parent->replaceChild( $imported_fragment, $node ) ) {
			$result = $children;
		}
	}

	restore_error_handler();

	return $result;
}
|
|
|
|
/**
 * Wraps numbers in <span class="numbers"> (even numbers that appear inside a word,
 * i.e. A9 becomes A<span class="numbers">9</span>), if enabled.
 *
 * The class name is taken from the configured 'numbers' CSS class.
 *
 * Call after style_caps so A9 becomes <span class="caps">A<span class="numbers">9</span></span>.
 * Call after smart_fractions and smart_ordinal_suffix.
 * Only call if you are certain that no html tags have been injected containing numbers.
 *
 * @param \DOMText $textnode The content node; its data is rewritten in place.
 */
function style_numbers( \DOMText $textnode ) {
	if ( ! empty( $this->settings['styleNumbers'] ) ) {
		$wrapped        = '<span class="' . $this->css_classes['numbers'] . '">$1</span>';
		$textnode->data = preg_replace( $this->regex['styleNumbers'], $wrapped, $textnode->data );
	}
}
|
|
|
|
/**
 * Wraps hanging punctuation in <span class="pull-*"> and <span class="push-*">, if enabled.
 *
 * The character following the node (as reported by get_next_chr()) is
 * temporarily appended so the regexes can see it as context, and is stripped
 * again afterwards. Initial punctuation only gets a "pull" span when the node
 * is the first text node of its block parent (or there is no block parent);
 * otherwise it gets the same push/pull pair as punctuation inside the text.
 *
 * If the encoding of the processed text cannot be mapped to a set of string
 * functions, the appended character cannot be stripped safely; in that case
 * the node is restored to its original text instead of failing.
 *
 * @param \DOMText $textnode The content node; its data is rewritten in place.
 */
function style_hanging_punctuation( \DOMText $textnode ) {
	if ( empty( $this->settings['styleHangingPunctuation'] ) ) {
		return;
	}

	// Kept so the node can be restored if the context cannot be removed again.
	$original_text = $textnode->data;

	// We need the parent.
	$block     = $this->get_block_parent( $textnode );
	$firstnode = ! empty( $block ) ? $this->get_first_textnode( $block ) : null;

	// Need to get context of adjacent characters outside adjacent inline tags or HTML comment
	// if we have adjacent characters add them to the text.
	$next_character = $this->get_next_chr( $textnode );
	if ( '' !== $next_character ) {
		$textnode->data = $textnode->data . $next_character;
	}

	// Markup building blocks shared by the replacements below.
	$push_double = '<span class="' . $this->css_classes['push-double'] . '"></span>' . $this->chr['zeroWidthSpace'];
	$push_single = '<span class="' . $this->css_classes['push-single'] . '"></span>' . $this->chr['zeroWidthSpace'];
	$pull_double = '<span class="' . $this->css_classes['pull-double'] . '">';
	$pull_single = '<span class="' . $this->css_classes['pull-single'] . '">';

	$textnode->data = preg_replace( $this->regex['styleHangingPunctuationDouble'], '$1' . $push_double . $pull_double . '$2</span>$3', $textnode->data );
	$textnode->data = preg_replace( $this->regex['styleHangingPunctuationSingle'], '$1' . $push_single . $pull_single . '$2</span>$3', $textnode->data );

	if ( empty( $block ) || $firstnode === $textnode ) {
		// At the very start of a block there is nothing to push away from.
		$textnode->data = preg_replace( $this->regex['styleHangingPunctuationInitialDouble'], $pull_double . '$1</span>$2', $textnode->data );
		$textnode->data = preg_replace( $this->regex['styleHangingPunctuationInitialSingle'], $pull_single . '$1</span>$2', $textnode->data );
	} else {
		$textnode->data = preg_replace( $this->regex['styleHangingPunctuationInitialDouble'], $push_double . $pull_double . '$1</span>$2', $textnode->data );
		$textnode->data = preg_replace( $this->regex['styleHangingPunctuationInitialSingle'], $push_single . $pull_single . '$1</span>$2', $textnode->data );
	}

	// Remove any added characters.
	if ( '' !== $next_character ) {
		// mb_detect_encoding() returns false when none of the candidate encodings fits.
		$encoding = mb_detect_encoding( $textnode->data, $this->encodings, true );
		$func     = ( false !== $encoding && isset( $this->str_functions[ $encoding ] ) ) ? $this->str_functions[ $encoding ] : array();

		if ( empty( $func['substr'] ) || empty( $func['strlen'] ) ) {
			$textnode->data = $original_text;
			return;
		}

		$textnode->data = $func['substr']( $textnode->data, 0, $func['strlen']( $textnode->data ) - 1 );
	}
}
|
|
|
|
/**
 * Wraps ampersands in <span class="amp"> (i.e. H&J becomes H<span class="amp">&</span>J),
 * if enabled.
 *
 * The class name is taken from the configured 'amp' CSS class.
 *
 * Call after style_caps so H&J becomes <span class="caps">H<span class="amp">&</span>J</span>.
 * Note that all standalone ampersands were previously converted to &.
 * Only call if you are certain that no html tags have been injected containing "&".
 *
 * @param \DOMText $textnode The content node; its data is rewritten in place.
 */
function style_ampersands( \DOMText $textnode ) {
	if ( ! empty( $this->settings['styleAmpersands'] ) ) {
		$wrapped        = '<span class="' . $this->css_classes['amp'] . '">$1</span>';
		$textnode->data = preg_replace( $this->regex['styleAmpersands'], $wrapped, $textnode->data );
	}
}
|
|
|
|
/**
 * Styles initial quotes and guillemets (if enabled).
 *
 * Only acts on the first text in a block level element (get_prev_chr()
 * reports no preceding character) and only if the tag name of the block
 * parent is listed in the 'initialQuoteTags' setting. The first character is
 * then wrapped in a span carrying the 'quo' class (single quotes and comma)
 * or the 'dquo' class (double quotes and guillemets).
 *
 * Nothing is changed if the encoding of the text cannot be mapped to a set
 * of string functions.
 *
 * @param \DOMText $textnode The content node; its data is rewritten in place.
 * @param boolean  $is_title Default false.
 */
function style_initial_quotes( \DOMText $textnode, $is_title = false ) {
	if ( empty( $this->settings['styleInitialQuotes'] ) || empty( $this->settings['initialQuoteTags'] ) ) {
		return;
	}

	if ( '' !== $this->get_prev_chr( $textnode ) ) {
		return; // not the first text in a block level element.
	}

	// mb_detect_encoding() returns false when none of the candidate encodings fits.
	$encoding = mb_detect_encoding( $textnode->data, $this->encodings, true );
	$func     = ( false !== $encoding && isset( $this->str_functions[ $encoding ] ) ) ? $this->str_functions[ $encoding ] : array();

	if ( empty( $func['substr'] ) || empty( $func['strlen'] ) ) {
		return; // unknown encoding, leave the text alone.
	}

	$first_character = $func['substr']( $textnode->data, 0, 1 );

	$single_quotes = array(
		"'",
		$this->chr['singleQuoteOpen'],
		$this->chr['singleLow9Quote'],
		',',
	);
	$double_quotes = array(
		'"',
		$this->chr['doubleQuoteOpen'],
		$this->chr['guillemetOpen'],
		$this->chr['guillemetClose'],
		$this->chr['doubleLow9Quote'],
	);

	if ( in_array( $first_character, $single_quotes, true ) ) {
		$span_class = 'quo';
	} elseif ( in_array( $first_character, $double_quotes, true ) ) {
		$span_class = 'dquo'; // double quotes or guillemets.
	} else {
		return; // the text does not start with a quote character.
	}

	$block_level_parent = $this->get_block_parent( $textnode );
	$block_level_parent = isset( $block_level_parent->tagName ) ? $block_level_parent->tagName : false; // @codingStandardsIgnoreLine.

	if ( $is_title ) {
		// Assume page title is h2.
		$block_level_parent = 'h2';
	}

	if ( $block_level_parent && isset( $this->settings['initialQuoteTags'][ $block_level_parent ] ) ) {
		$remaining_text = $func['substr']( $textnode->data, 1, $func['strlen']( $textnode->data ) );
		$textnode->data = '<span class="' . $this->css_classes[ $span_class ] . '">' . $first_character . '</span>' . $remaining_text;
	}
}
|
|
|
|
/**
 * Inject the PatGen segments pattern into the PatGen words pattern.
 *
 * For each position covered by the segment (from $segment_position up to and
 * including $segment_position + $segment_length) the word pattern keeps its
 * own entry unless the segment's entry has a larger integer value.
 *
 * @param array  $word_pattern     Required.
 * @param array  $segment_pattern  Required.
 * @param number $segment_position Required.
 * @param number $segment_length   Required.
 * @return array The merged word pattern.
 */
function hyphenation_pattern_injection( array $word_pattern, array $segment_pattern, $segment_position, $segment_length ) {
	$last_position = $segment_position + $segment_length;

	for ( $position = $segment_position; $position <= $last_position; $position++ ) {
		$current   = $word_pattern[ $position ];
		$candidate = $segment_pattern[ $position - $segment_position ];

		// The larger value wins; ties keep the existing word pattern entry.
		$word_pattern[ $position ] = ( intval( $current ) < intval( $candidate ) ) ? $candidate : $current;
	}

	return $word_pattern;
}
|
|
|
|
/**
 * Hyphenate given text fragment (if enabled).
 *
 * Checks the 'hyphenation' setting and, unless 'hyphenateTitle' is set,
 * skips titles and text whose block parent tag is listed in the heading tags.
 * Actual work is done in do_hyphenate().
 *
 * @param array    $parsed_text_tokens Filtered to words.
 * @param boolean  $is_title           Flag to indicate title fragments. Optional. Default false.
 * @param \DOMText $textnode           The textnode corresponding to the $parsed_text_tokens. Optional. Default null.
 * @return array The (possibly hyphenated) tokens.
 */
function hyphenate( $parsed_text_tokens, $is_title = false, \DOMText $textnode = null ) {
	if ( empty( $this->settings['hyphenation'] ) ) {
		return $parsed_text_tokens; // abort.
	}

	// Find out whether the text lives inside a heading.
	$is_heading = false;
	if ( ! empty( $textnode ) && ! empty( $textnode->parentNode ) ) { // @codingStandardsIgnoreLine.
		$block_parent = $this->get_block_parent( $textnode );
		$parent_tag   = isset( $block_parent->tagName ) ? $block_parent->tagName : false; // @codingStandardsIgnoreLine.
		$is_heading   = ( $parent_tag && isset( $this->heading_tags[ $parent_tag ] ) );
	}

	$is_title_like = ( $is_title || $is_heading );
	if ( $is_title_like && empty( $this->settings['hyphenateTitle'] ) ) {
		return $parsed_text_tokens; // abort.
	}

	// Call functionality as separate function so it can be run without test for setting['hyphenation'] - such as with url wrapping.
	return $this->do_hyphenate( $parsed_text_tokens );
}
|
|
|
|
/**
 * Hyphenate hyphenated compound words (if enabled).
 *
 * Each token is split at '-' (the hyphens are kept as separate components),
 * the components are passed to hyphenate(), and the results are joined back
 * into the token's value. A token without any components ends up with an
 * empty string as its value.
 *
 * @param array    $parsed_text_tokens Filtered to compound words.
 * @param boolean  $is_title           Flag to indicate title fragments. Optional. Default false.
 * @param \DOMText $textnode           The textnode corresponding to the $parsed_text_tokens. Optional. Default null.
 * @return array The tokens with their 'value' entries updated.
 */
function hyphenate_compounds( array $parsed_text_tokens, $is_title = false, \DOMText $textnode = null ) {
	if ( empty( $this->settings['hyphenateCompounds'] ) ) {
		return $parsed_text_tokens; // abort.
	}

	// Hyphenate compound words.
	foreach ( $parsed_text_tokens as $key => $word_token ) {
		$component_words = array();
		foreach ( preg_split( '/(-)/', $word_token['value'], -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE ) as $word_part ) {
			$component_words[] = array( 'value' => $word_part );
		}

		// Start from an empty string so the value stays a string even without components.
		$hyphenated = '';
		foreach ( $this->hyphenate( $component_words, $is_title, $textnode ) as $component ) {
			$hyphenated .= $component['value'];
		}

		$parsed_text_tokens[ $key ]['value'] = $hyphenated;
	}

	return $parsed_text_tokens;
}
|
|
|
|
/**
 * Really hyphenate given text fragment.
 *
 * Inserts the soft hyphen character into each word token. A word is hyphenated
 * according to its entry in the merged exceptions list (keyed by the lowercased
 * word) if there is one, and according to the hyphenation patterns otherwise.
 *
 * Tokens are left untouched when no string functions are available for their
 * detected encoding, when they are shorter than the `hyphenMinLength` setting,
 * or when their first character changes on lowercasing and the
 * `hyphenateTitleCase` setting is empty.
 *
 * As a side effect, the merged exceptions list is stored in
 * `$this->settings['hyphenationExceptions']` the first time it is needed.
 *
 * @param array $parsed_text_tokens Filtered to words.
 * @return array The hyphenated text tokens. The input is returned unchanged if
 *               the required hyphenation settings are missing.
 */
function do_hyphenate( array $parsed_text_tokens ) {

	if ( empty( $this->settings['hyphenMinLength'] ) ||
		empty( $this->settings['hyphenMinBefore'] ) ||
		! isset( $this->settings['hyphenationPatternMaxSegment'] ) ||
		! isset( $this->settings['hyphenationPatternExceptions'] ) ||
		! isset( $this->settings['hyphenationPattern'] ) ) {

		return $parsed_text_tokens;
	}

	// Make sure we have full exceptions list.
	if ( ! isset( $this->settings['hyphenationExceptions'] ) ) {
		$exceptions = array();

		if ( $this->settings['hyphenationPatternExceptions'] || ! empty( $this->settings['hyphenationCustomExceptions'] ) ) {
			if ( isset( $this->settings['hyphenationCustomExceptions'] ) ) {
				// Merges custom and language specific word hyphenations.
				$exceptions = array_merge( $this->settings['hyphenationCustomExceptions'], $this->settings['hyphenationPatternExceptions'] );
			} else {
				$exceptions = $this->settings['hyphenationPatternExceptions'];
			}
		}

		$this->settings['hyphenationExceptions'] = $exceptions;
	}

	// Settings that do not change while the tokens are processed.
	$exceptions           = $this->settings['hyphenationExceptions'];
	$patterns             = $this->settings['hyphenationPattern'];
	$max_segment          = $this->settings['hyphenationPatternMaxSegment'];
	$min_length           = $this->settings['hyphenMinLength'];
	$min_before           = $this->settings['hyphenMinBefore'];
	$hyphenate_title_case = ! empty( $this->settings['hyphenateTitleCase'] );

	// A missing 'hyphenMinAfter' is treated as 0, which is what the arithmetic
	// below would make of it anyway, but without raising an undefined index notice.
	$min_after = isset( $this->settings['hyphenMinAfter'] ) ? $this->settings['hyphenMinAfter'] : 0;

	// Iterate by key (not by reference) so that no dangling reference into the
	// token array is left behind after the loop.
	foreach ( $parsed_text_tokens as $index => $text_token ) {
		$word = $text_token['value'];

		// Quickly reference string functions according to encoding.
		$encoding = mb_detect_encoding( $word, $this->encodings, true );
		$func     = isset( $this->str_functions[ $encoding ] ) ? $this->str_functions[ $encoding ] : array();
		if ( empty( $func ) || empty( $func['strlen'] ) ) {
			continue; // unknown encoding, abort.
		}

		$word_length = $func['strlen']( $word );
		if ( $word_length < $min_length ) {
			continue;
		}

		$the_key = $func['strtolower']( $word );

		// If this is a capitalized word, and settings do not allow hyphenation of such, abort!
		// Note: This is different than uppercase words, where we are looking for title case.
		if ( ! $hyphenate_title_case && $func['substr']( $the_key, 0, 1 ) !== $func['substr']( $word, 0, 1 ) ) {
			continue;
		}

		if ( isset( $exceptions[ $the_key ] ) ) {
			// Give exceptions preference. Only the positions of the hyphens are
			// taken from the exception, so the capitalization of the actual
			// word is kept.
			$exception        = $exceptions[ $the_key ];
			$exception_parts  = $func['str_split']( $exception, 1 );
			$exception_length = $func['strlen']( $exception );

			$word_pattern = array();
			for ( $i = 0; $i < $exception_length; $i++ ) {
				if ( '-' === $exception_parts[ $i ] ) {
					// The '9' marks a break before the character following the
					// hyphen; that character is skipped as it shares this slot.
					$word_pattern[] = '9';
					$i++;
				} else {
					$word_pattern[] = '0';
				}
			}
			$word_pattern[] = '0'; // For consistent length with the other word patterns.
		} else {
			// Start with a series of zeros one character longer than the word.
			$word_pattern = array_fill( 0, $word_length + 1, '0' );

			// Grab all possible segments of the word of length 1 through 'hyphenationPatternMaxSegment'.
			for ( $segment_length = 1; ( $segment_length <= $word_length ) && ( $segment_length <= $max_segment ); $segment_length++ ) {
				for ( $segment_position = 0; $segment_position + $segment_length <= $word_length; $segment_position++ ) {
					$segment = $func['strtolower']( $func['substr']( $word, $segment_position, $segment_length ) );

					// Patterns that only apply at the beginning of a word.
					if ( 0 === $segment_position && isset( $patterns['begin'][ $segment ] ) ) {
						$segment_pattern = $func['str_split']( $patterns['begin'][ $segment ], 1 );
						$word_pattern    = $this->hyphenation_pattern_injection( $word_pattern, $segment_pattern, $segment_position, $segment_length );
					}

					// Patterns that only apply at the end of a word.
					if ( $segment_position + $segment_length === $word_length && isset( $patterns['end'][ $segment ] ) ) {
						$segment_pattern = $func['str_split']( $patterns['end'][ $segment ], 1 );
						$word_pattern    = $this->hyphenation_pattern_injection( $word_pattern, $segment_pattern, $segment_position, $segment_length );
					}

					// Patterns that apply anywhere in a word.
					if ( isset( $patterns['all'][ $segment ] ) ) {
						$segment_pattern = $func['str_split']( $patterns['all'][ $segment ], 1 );
						$word_pattern    = $this->hyphenation_pattern_injection( $word_pattern, $segment_pattern, $segment_position, $segment_length );
					}
				}
			}
		}

		// Add soft-hyphen based on $word_pattern: an odd value at position $i
		// allows a break before the character at position $i, provided enough
		// characters remain on both sides.
		$word_parts      = $func['str_split']( $word, 1 );
		$hyphenated_word = '';
		for ( $i = 0; $i < $word_length; $i++ ) {
			// The isset() guard covers exception entries shorter than the word.
			if ( isset( $word_pattern[ $i ] ) && is_odd( intval( $word_pattern[ $i ] ) ) && ( $i >= $min_before ) && ( $i < $word_length - $min_after ) ) {
				$hyphenated_word .= $this->chr['softHyphen'];
			}
			$hyphenated_word .= $word_parts[ $i ];
		}

		$parsed_text_tokens[ $index ]['value'] = $hyphenated_word;
	}

	return $parsed_text_tokens;
}
|
|
|
|
/**
 * Finds the nearest block-level ancestor of a node.
 *
 * Starting at the direct parent, the ancestor chain is climbed until a node is
 * reached that has no tag name, whose tag name is registered in
 * `$this->block_tags`, or whose own parent is not a \DOMElement.
 *
 * @param \DOMNode $element The node whose containing block-level tag is wanted.
 *
 * @return \DOMNode|null The ancestor at which the search stopped (null if
 *                       $element has no parent).
 */
function get_block_parent( \DOMNode $element ) {
	$ancestor = $element->parentNode; // @codingStandardsIgnoreLine.

	while ( isset( $ancestor->tagName ) && ! isset( $this->block_tags[ $ancestor->tagName ] ) ) { // @codingStandardsIgnoreLine.
		$next = $ancestor->parentNode; // @codingStandardsIgnoreLine.

		// Stop climbing once the next level up is not an element any more.
		if ( empty( $next ) || ! ( $next instanceof \DOMElement ) ) {
			break;
		}

		$ancestor = $next;
	}

	return $ancestor;
}
|
|
|
|
/**
 * Computes a hash value identifying the current settings.
 *
 * The settings are JSON-encoded and run through MD5 in raw (binary) mode; the
 * result is cut down to $max_length bytes if it is longer than that.
 *
 * @param int $max_length The maximum number of bytes returned. Optional. Default 8.
 * @return string A binary hash value for the current settings, limited to $max_length.
 */
public function get_settings_hash( $max_length = 8 ) {
	$digest = md5( json_encode( $this->settings ), true );

	return ( strlen( $digest ) > $max_length ) ? substr( $digest, 0, $max_length ) : $digest;
}
|
|
|
|
/**
 * Provides the HTML5 parser, creating it on first use.
 *
 * The instance is stored in `$this->html5_parser` and reused by later calls.
 *
 * @return \Masterminds\HTML5
 */
public function get_html5_parser() {
	if ( isset( $this->html5_parser ) ) {
		return $this->html5_parser;
	}

	// First request: build the parser now and keep it for later calls.
	$this->html5_parser = new \Masterminds\HTML5( array( 'disable_html_ns' => true ) );

	return $this->html5_parser;
}
|
|
|
|
/**
 * Provides the text parser, creating it on first use.
 *
 * The instance is constructed with `$this->encodings`, stored in
 * `$this->text_parser` and reused by later calls.
 *
 * @return \PHP_Typography\Parse_Text
 */
public function get_text_parser() {
	if ( isset( $this->text_parser ) ) {
		return $this->text_parser;
	}

	// First request: build the parser now and keep it for later calls.
	$this->text_parser = new Parse_Text( $this->encodings );

	return $this->text_parser;
}
|
|
|
|
/**
 * Lists the languages for which hyphenation is available.
 *
 * The list is built by get_language_plugin_list() from the `lang/` directory
 * next to this file. The language names are translation-ready but not
 * translated yet.
 *
 * @return array An array in the form of ( LANG_CODE => LANGUAGE ).
 */
public static function get_hyphenation_languages() {
	$language_directory = __DIR__ . '/lang/';

	return \PHP_Typography\get_language_plugin_list( $language_directory, 'patgenLanguage' );
}
|
|
|
|
/**
 * Lists the languages for which diacritic replacement is available.
 *
 * The list is built by get_language_plugin_list() from the `diacritics/`
 * directory next to this file. The language names are translation-ready but
 * not translated yet.
 *
 * @return array An array in the form of ( LANG_CODE => LANGUAGE ).
 */
public static function get_diacritic_languages() {
	$language_directory = __DIR__ . '/diacritics/';

	return \PHP_Typography\get_language_plugin_list( $language_directory, 'diacriticLanguage' );
}
|
|
}
|