本文整理汇总了PHP中UtfNormal::toNFD方法的典型用法代码示例。如果您正苦于以下问题：PHP UtfNormal::toNFD方法的具体用法？PHP UtfNormal::toNFD怎么用？PHP UtfNormal::toNFD使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类UtfNormal的用法示例。
在下文中一共展示了UtfNormal::toNFD方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的PHP代码示例。
示例1: gs_utf8_decompose_to_ascii
/**
 * Reduces a string to printable ASCII.
 *
 * The input is first passed through strtr() with the table returned by
 * _gs_utf8_get_map(), then decomposed with UtfNormal::toNFD(), and finally
 * every byte outside the range 0x20-0x7E is removed.
 *
 * @param string $str Text to convert.
 * @return string Text containing only bytes in the range 0x20-0x7E.
 */
function gs_utf8_decompose_to_ascii($str)
{
    // The replacement table is kept in a static local so later calls reuse it.
    static $map = null;
    if (!is_array($map)) {
        $map = _gs_utf8_get_map();
    }

    $decomposed = UtfNormal::toNFD(strtr($str, $map));

    // Return "safe" ASCII only: no control characters, newlines or
    // non-ASCII bytes survive this filter.
    return preg_replace('/[^\\x20-\\x7E]/', '', $decomposed);
}
示例2: normalize_form_d_php
/**
 * Thin wrapper that passes a value to UtfNormal::toNFD().
 *
 * NOTE(review): the second argument "php" presumably selects a pure-PHP
 * implementation - confirm against the UtfNormal version in use.
 *
 * @param mixed $c Value handed to UtfNormal::toNFD() unchanged.
 * @return mixed Whatever UtfNormal::toNFD() returns.
 */
function normalize_form_d_php($c)
{
    $normalized = UtfNormal::toNFD($c, "php");

    return $normalized;
}
示例3: ustringToNFD
/**
 * Wraps UtfNormal::toNFD() and returns its result in a one-element array.
 *
 * The argument is first handed to checkString(); presumably that call
 * rejects non-string input - verify against its definition.
 *
 * @param mixed $s Value to normalize.
 * @return array array(null) when checkEncoding() rejects $s, otherwise
 *   array(<result of UtfNormal::toNFD($s)>).
 */
public function ustringToNFD($s)
{
    $this->checkString('toNFD', $s, false);

    // Only normalize input that passes the encoding check.
    $normalized = $this->checkEncoding($s) ? UtfNormal::toNFD($s) : null;

    return array($normalized);
}
示例4: checkUnicodeString
/**
 * Checks a name for spoofing problems and reduces it to a canonical form.
 *
 * The name is validated, NFD-normalized, stripped of combining marks and
 * ASCII punctuation, passed through equivString() and mergePairs(), and
 * finally prefixed with the version tag "v2:".
 *
 * TODO: does too much in one routine, refactor...
 *
 * @param $testName string The name to check
 * @return array Two elements: array("OK", $canonicalName) on success, or
 *   array("ERROR", $message) where $message is produced by wfMsg()
 */
public static function checkUnicodeString($testName)
{
# Start with some sanity checking: reject non-strings, empty strings and
# anything containing a blacklisted character
if (!is_string($testName)) {
return array("ERROR", wfMsg('antispoof-badtype'));
}
if (strlen($testName) == 0) {
return array("ERROR", wfMsg('antispoof-empty'));
}
if (array_intersect(self::stringToList($testName), self::$character_blacklist)) {
return array("ERROR", wfMsg('antispoof-blacklisted'));
}
# Perform Unicode normalization
$testName = UtfNormal::toNFD($testName);
$testChars = self::stringToList($testName);
# Be paranoid: check again, just in case Unicode normalization code changes...
if (array_intersect($testChars, self::$character_blacklist)) {
return array("ERROR", wfMsg('antispoof-blacklisted'));
}
# Check for a leading combining mark: should not happen in any valid Unicode string
if (self::getScriptCode($testChars[0]) == "SCRIPT_COMBINING_MARKS") {
return array("ERROR", wfMsg('antispoof-combining'));
}
# Strip all combining characters in order to crudely strip accents
# Note: NFD normalization should have decomposed all accented chars earlier
$testChars = self::stripScript($testChars, "SCRIPT_COMBINING_MARKS");
# Collect the distinct script codes used by the remaining characters
$testScripts = array_unique(array_map(array('AntiSpoof', 'getScriptCode'), $testChars));
if (in_array("SCRIPT_UNASSIGNED", $testScripts) || in_array("SCRIPT_DEPRECATED", $testScripts)) {
return array("ERROR", wfMsg('antispoof-unassigned'));
}
# We don't mind ASCII punctuation or digits, so ignore them when counting scripts
$testScripts = array_diff($testScripts, array("SCRIPT_ASCII_PUNCTUATION", "SCRIPT_ASCII_DIGITS"));
# Nothing left means the name consisted only of punctuation and/or digits
if (!$testScripts) {
return array("ERROR", wfMsg('antispoof-noletters'));
}
# More than one script is only acceptable for explicitly allowed combinations
if (count($testScripts) > 1 && !self::isAllowedScriptCombination($testScripts)) {
return array("ERROR", wfMsg('antispoof-mixedscripts'));
}
# At this point, we should probably check for BiDi violations if they aren't
# caught above...
# Replace characters in confusables set with equivalence chars
$testChars = self::equivString($testChars);
# Do very simple sequence processing: "vv" -> "w", "rn" -> "m"...
# Not exhaustive, but ups the ante...
# Do this _after_ canonicalization: looks weird, but needed for consistency
$testChars = self::mergePairs($testChars, self::equivString(self::stringToList("VV")), self::equivString(self::stringToList("W")));
$testChars = self::mergePairs($testChars, self::equivString(self::stringToList("RN")), self::equivString(self::stringToList("M")));
# Squeeze out all punctuation chars
# TODO: almost the same code occurs twice, refactor into own routine
$testChars = self::stripScript($testChars, "SCRIPT_ASCII_PUNCTUATION");
$testName = self::listToString($testChars);
# Remove all remaining spaces, just in case any have snuck through...
# NOTE(review): hardjoin() presumably concatenates the pieces without a separator - verify
$testName = self::hardjoin(explode(" ", $testName));
# Reduce repeated char sequences to single character
# BUG: TODO: implement this (no such reduction is performed here yet)
if (strlen($testName) < 1) {
return array("ERROR", wfMsg('antispoof-tooshort'));
}
# Don't ASCIIfy: we assume we are UTF-8 capable on output
# Prepend version string, for futureproofing if this algorithm changes
$testName = "v2:" . $testName;
# And return the canonical version of the name
return array("OK", $testName);
}
示例5: forTransliteration
/**
 * Normalise the text so it can be used with strtr() safely
 *
 * 1. decodeCharReferences
 * 2. split into NFD codepoints or NFC fully combined
 * 3. add bookends on word boundaries
 *
 * Every codepoint (DECOMPOSE) or every extended grapheme cluster (default)
 * is followed by self::LETTER_END; self::WORD_START and self::WORD_END are
 * then inserted at transitions between letters and non-letters.
 *
 * @param $word String from user input
 * @param $flags may include self::DECOMPOSE, self::IGNORE_ENDINGS
 * @return String
 */
public static function forTransliteration( $word, $flags ) {
    // The boundary patterns depend only on class constants, so they are
    // built on the first call and reused afterwards.
    static $regexes = null;

    // NOTE: this is very slightly inconsistent with MediaWiki if an NFD code-point
    //       has been HTML escaped it will be converted to NFC if it passes through
    //       transliteration unchanged, I think that's a WONTFIX though.
    $word = Sanitizer::decodeCharReferences( $word );

    if ( $flags & self::DECOMPOSE ) {
        $word = UtfNormal::toNFD( $word );
        // The "s" (DOTALL) modifier is required: without it "." skips newline
        // characters, which would then carry no LETTER_END marker, unlike in
        // the \X branch below, and the boundary patterns would not see them
        // as marked non-letters.
        $word = preg_replace( '/./us', '$0' . self::LETTER_END, $word );
    } else {
        $word = preg_replace( '/\X/u', '$0' . self::LETTER_END, $word );
    }

    if ( $regexes === null ) {
        // A "letter" is a unicode letter followed by some combining characters
        // A "non-letter" is any other character followed by some combining characters
        // "end" is done first so it watches out for word-endings in "start"
        // If it should treat endings then the start and end of the string are non-letters
        // Otherwise it does not touch the start or end of the string, only internal transitions
        $combining = '(?:[\pM]*' . self::LETTER_END . ')';
        $nonletter = '[^\pL' . self::LETTER_END . self::WORD_END . '\pM]';

        $regexes = array(
            'endings' => array(
                'start' => "/(^$combining?|$nonletter$combining)([\pL])/u",
                'end' => "/([\pL]$combining)([^\pL]|$)/u",
            ),
            'ignore-endings' => array(
                'start' => "/($nonletter$combining)([\pL])/u",
                'end' => "/([\pL]$combining)([^\pL])/u",
            ),
        );
    }

    $regex = $regexes[( $flags & self::IGNORE_ENDINGS ) ? 'ignore-endings' : 'endings'];

    // Mark word ends before word starts (see the ordering note above).
    $word = preg_replace( $regex['end'], '$1' . self::WORD_END . '$2', $word );
    $word = preg_replace( $regex['start'], '$1' . self::WORD_START . '$2', $word );

    return $word;
}