本文整理汇总了 PHP 中 utf8ToCodepoint 函数的典型用法代码示例。如果您正苦于以下问题：PHP utf8ToCodepoint 函数的具体用法？PHP utf8ToCodepoint 怎么用？PHP utf8ToCodepoint 使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了 utf8ToCodepoint 函数的 12 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 PHP 代码示例。
示例1: utf8ToHexSequence
/**
 * Render a UTF-8 string as hexadecimal code points for debugging.
 *
 * The string is split into characters with a Unicode-aware regex, each
 * character is passed to utf8ToCodepoint(), and the results are printed
 * as zero-padded (at least four digit) lower-case hex numbers separated
 * by single spaces.
 *
 * @param string $str UTF-8 string.
 * @return string Space-separated hex code points, without trailing whitespace.
 * @private
 */
function utf8ToHexSequence($str)
{
    $characters = preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY);
    $sequence = '';
    foreach ($characters as $character) {
        $codepoint = utf8ToCodepoint($character);
        $sequence = $sequence . sprintf('%04x ', $codepoint);
    }
    // Every entry is written with a trailing space; drop the final one.
    return rtrim($sequence);
}
示例2: firstChar
/**
 * Return the leading UTF-8 character of a string.
 *
 * A leading Hangul syllable (U+AC00..U+D7A3) is not returned as-is;
 * instead it is bucketed by code point range and the jamo representing
 * that bucket is returned.
 *
 * @param $s string
 * @return string The first character, a jamo for Hangul syllables, or ''
 *   when the string does not start with a recognisable UTF-8 sequence.
 */
function firstChar($s)
{
    $found = array();
    preg_match('/^([\\x00-\\x7f]|[\\xc0-\\xdf][\\x80-\\xbf]|' . '[\\xe0-\\xef][\\x80-\\xbf]{2}|[\\xf0-\\xf7][\\x80-\\xbf]{3})/', $s, $found);
    if (!isset($found[1])) {
        return '';
    }
    $char = $found[1];
    // Hangul syllables are three bytes long in UTF-8; anything else is final.
    if (strlen($char) != 3) {
        return $char;
    }
    $code = utf8ToCodepoint($char);
    if ($code < 0xac00 || 0xd7a4 <= $code) {
        // Three bytes, but outside the Hangul syllable block.
        return $char;
    }
    // Exclusive upper bound of each bucket => jamo returned for that bucket.
    // Order matters: the first bound greater than the code point wins.
    $jamoByUpperBound = array(
        0xb098 => "ㄱ",
        0xb2e4 => "ㄴ",
        0xb77c => "ㄷ",
        0xb9c8 => "ㄹ",
        0xbc14 => "ㅁ",
        0xc0ac => "ㅂ",
        0xc544 => "ㅅ",
        0xc790 => "ㅇ",
        0xcc28 => "ㅈ",
        0xce74 => "ㅊ",
        0xd0c0 => "ㅋ",
        0xd30c => "ㅌ",
        0xd558 => "ㅍ",
    );
    foreach ($jamoByUpperBound as $upperBound => $jamo) {
        if ($code < $upperBound) {
            return $jamo;
        }
    }
    // Everything from 0xd558 up to the end of the block.
    return "ㅎ";
}
示例3: getItemCodepoint
/**
 * Resolve an item to a Unicode code point.
 *
 * Items written as "U+" (or "u+") followed by hex digits are parsed as a
 * hexadecimal number; any other item is handed to utf8ToCodepoint().
 *
 * @param string $a_item
 * @return int
 */
private function getItemCodepoint($a_item)
{
    if (!preg_match('/^[uU]\\+[0-9a-fA-F]+$/', $a_item)) {
        // Not in U+XXXX notation: take the code point of the first character.
        // The helper library is only loaded when this path is actually hit.
        require_once "include/Unicode/UtfNormalUtil.php";
        return (int) utf8ToCodepoint($a_item);
    }
    // Skip the two-character "U+" prefix and read the rest as hex.
    $hexDigits = substr($a_item, 2);
    return (int) hexdec($hexDigits);
}
示例4: normalizeCss
/**
 * Normalize CSS into a format we can easily search for hostile input
 *  - decode character references
 *  - decode escape sequences
 *  - convert characters that IE6 interprets into ascii
 *  - remove comments, unless the entire value is one single comment
 *
 * @param string $value the css string
 * @return string normalized css
 */
public static function normalizeCss($value)
{
    // Decode character references like &#123;
    $value = Sanitizer::decodeCharReferences($value);
    // Decode escape sequences and line continuation
    // See the grammar in the CSS 2 spec, appendix D.
    // This has to be done AFTER decoding character references.
    // This means it isn't possible for this function to return
    // unsanitized escape sequences. It is possible to manufacture
    // input that contains character references that decode to
    // escape sequences that decode to character references, but
    // it's OK for the return value to contain character references
    // because the caller is supposed to escape those anyway.
    static $decodeRegex;
    if (!$decodeRegex) {
        // Built once and cached in the static for later calls.
        $space = '[\\x20\\t\\r\\n\\f]';
        $nl = '(?:\\n|\\r\\n|\\r|\\f)';
        $backslash = '\\\\';
        $decodeRegex = "/ {$backslash}\n\t\t\t\t(?:\n\t\t\t\t\t({$nl}) | # 1. Line continuation\n\t\t\t\t\t([0-9A-Fa-f]{1,6}){$space}? | # 2. character number\n\t\t\t\t\t(.) | # 3. backslash cancelling special meaning\n\t\t\t\t\t() | # 4. backslash at end of string\n\t\t\t\t)/xu";
    }
    $value = preg_replace_callback($decodeRegex, array(__CLASS__, 'cssDecodeCallback'), $value);
    // Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii.
    // The class covers U+FF01..U+FF3B and U+FF3D..U+FF5A, written as explicit
    // code point escapes so the pattern survives re-encoding of this file.
    // U+FF3C (the fullwidth backslash) is deliberately left out, presumably so
    // that this step cannot introduce a new backslash after escapes were decoded.
    $value = preg_replace_callback('/[\\x{FF01}-\\x{FF3B}\\x{FF3D}-\\x{FF5A}]/u', function ($matches) {
        $cp = utf8ToCodepoint($matches[0]);
        if ($cp === false) {
            return '';
        }
        // 65248 (0xFEE0) is the offset between the fullwidth forms and ASCII,
        // so the result lands in the ASCII range \x21-\x7A
        return chr($cp - 65248);
    }, $value);
    // Convert more characters IE6 might treat as ascii
    // U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
    $value = str_replace(array('ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍'), array('r', 'n', 'n', 'l', 'i', '(', '('), $value);
    // Let the value through if it's nothing but a single comment, to
    // allow other functions which may reject it to pass some error
    // message through.
    if (!preg_match('! ^ \\s* /\\* [^*\\/]* \\*/ \\s* $ !x', $value)) {
        // Remove any comments; IE gets token splitting wrong
        // This must be done AFTER decoding character references and
        // escape sequences, because those steps can introduce comments
        // This step cannot introduce character references or escape
        // sequences, because it replaces comments with spaces rather
        // than removing them completely.
        $value = StringUtils::delimiterReplace('/*', '*/', ' ', $value);
        // Remove anything after a comment-start token, to guard against
        // incorrect client implementations.
        $commentPos = strpos($value, '/*');
        if ($commentPos !== false) {
            $value = substr($value, 0, $commentPos);
        }
    }
    // S followed by repeat, iteration, or prolonged sound marks,
    // which IE will treat as "ss".
    // The pattern uses the x flag, so its layout whitespace is not significant;
    // it has no u flag, so the marks are matched as raw UTF-8 byte sequences.
    $value = preg_replace('/s(?:
        \\xE3\\x80\\xB1 | # U+3031
        \\xE3\\x82\\x9D | # U+309D
        \\xE3\\x83\\xBC | # U+30FC
        \\xE3\\x83\\xBD | # U+30FD
        \\xEF\\xB9\\xBC | # U+FE7C
        \\xEF\\xB9\\xBD | # U+FE7D
        \\xEF\\xBD\\xB0 # U+FF70
        )/ix', 'ss', $value);
    return $value;
}
示例5: cssNormalizeUnicodeWidth
/**
 * Normalize Unicode U+FF01 to U+FF5A down to their ASCII counterparts.
 *
 * Shaped like a regex replacement callback: only element 0 of the
 * argument is read.
 *
 * @param array $matches Match data; $matches[0] is the character to convert
 *   (presumably supplied by preg_replace_callback() - confirm at the call site)
 * @return string Character in ASCII range \x21-\x7A, or '' when
 *   utf8ToCodepoint() returns false
 */
static function cssNormalizeUnicodeWidth($matches)
{
    $codepoint = utf8ToCodepoint($matches[0]);
    // 65248 is the distance between the U+FF01..U+FF5A forms and ASCII.
    return $codepoint === false ? '' : chr($codepoint - 65248);
}
示例6: array
}
# NOTE(review): this excerpt starts in the middle of a per-line loop; the
# brace above and the one after the "Add the left character" step close
# scopes opened outside the visible code.
# Find the set for the right character, add a new one if necessary
if (isset($setsByChar[$m['charright']])) {
$setName = $setsByChar[$m['charright']];
} else {
# New set: named after the right character, which is also its first member
# and maps to itself
$setName = $m['charright'];
$sets[$setName] = array($m['charright']);
$setsByChar[$setName] = $setName;
}
# Add the left character to the set
$sets[$setName][] = $m['charleft'];
$setsByChar[$m['charleft']] = $setName;
}
# Sets output: one line per set, members separated by a single space
foreach ($sets as $setName => $members) {
fwrite($setsFile, implode(' ', $members) . $endl);
}
# Map output: the character => set name array as PHP source, with newlines
# rewritten to $endl
$output = var_export($setsByChar, true);
$output = str_replace("\n", $endl, $output);
# The closing PHP tag is assembled from two pieces ("?" and ">"), presumably
# so the literal tag never appears inside this script itself
fwrite($outputFile, '$equivset = ' . "{$output}{$endl}?" . ">{$endl}");
# Serialized codepoint map: same mapping, with keys and values passed
# through utf8ToCodepoint()
$codepointMap = array();
foreach ($setsByChar as $char => $setName) {
$codepointMap[utf8ToCodepoint($char)] = utf8ToCodepoint($setName);
}
fwrite($serializedFile, serialize($codepointMap));
# Release all three output handles
fclose($setsFile);
fclose($outputFile);
fclose($serializedFile);
示例7: getFirstLetter
/**
 * Determine the letter under which a string is sorted.
 *
 * A leading non-ASCII character that isCjk() accepts is returned as it is.
 * Otherwise the string's primary sort key is located among the known
 * first letters with findLowerBound(), and the letter at the resulting
 * index is returned.
 *
 * @param mixed $string Value to examine; converted with strval() first
 * @return string The first letter, or '' for an empty string or one that
 *   sorts before the first known letter
 */
function getFirstLetter($string)
{
    $text = strval($string);
    if ($text === '') {
        return '';
    }

    // Check for CJK; the code point is only computed for non-ASCII lead bytes
    $leadingChar = mb_substr($text, 0, 1, 'UTF-8');
    if (ord($leadingChar) > 0x7f) {
        if (self::isCjk(utf8ToCodepoint($leadingChar))) {
            return $leadingChar;
        }
    }

    // Do a binary search to find the correct letter to sort under
    $sortKey = $this->getPrimarySortKey($text);
    $keyLookup = array($this, 'getSortKeyByLetterIndex');
    $letterCount = $this->getFirstLetterCount();
    $index = $this->findLowerBound($keyLookup, $letterCount, 'strcmp', $sortKey);

    // false means the key sorts before the first letter
    return $index === false ? '' : $this->getLetterByIndex($index);
}
示例8: execute
/**
 * Read equivset.in and generate the equivset output files next to it.
 *
 * Three files are opened for writing in this script's directory:
 * equivset.txt (human-readable sets), equivset.php (PHP map) and
 * equivset.ser (serialized map). Each non-comment input line must have
 * the form
 *
 *     HEX CHAR => [HEX] CHAR [# comment]
 *
 * where the hex numbers are checked against the characters beside them.
 * A line whose second-to-last byte is a null byte maps its left character
 * to the empty string. Invalid lines are reported through output(), set
 * the exit status to 1 and are skipped.
 *
 * NOTE(review): the remainder of this method is omitted in this excerpt.
 */
public function execute()
{
    $dir = __DIR__;
    $endl = "\n";
    $lines = file("{$dir}/equivset.in");
    if (!$lines) {
        // NOTE(review): the second argument presumably makes error() abort
        // the script - confirm against the base class.
        $this->error("Unable to open equivset.in\n", 1);
    }
    $setsFile = fopen("{$dir}/equivset.txt", 'w');
    if (!$setsFile) {
        $this->error("Unable to open equivset.txt for writing\n", 1);
    }
    fwrite($setsFile, <<<EOT
# This file is generated by generateEquivset.php
# It shows sets of equivalent characters, one set per line, with characters
# separated by whitespace. This file is not used by MediaWiki, rather it is
# intended as a human-readable version of equivset.php, for debugging and
# review purposes.
EOT
);
    $outputFile = fopen("{$dir}/equivset.php", 'w');
    if (!$outputFile) {
        $this->error("Unable to open equivset.php for writing\n", 1);
    }
    // The opening tag is split in two, presumably so it is never seen as a
    // real tag inside this script.
    fwrite($outputFile, "<?" . "php{$endl}" . <<<EOT
# This file is generated by generateEquivset.php
# It contains a map of characters, encoded in UTF-8, such that running strtr()
# on a string with this map will cause confusable characters to be reduced to
# a canonical representation. The same array is also available in serialized
# form, in equivset.ser.
EOT
);
    $serializedFile = fopen("{$dir}/equivset.ser", 'w');
    if (!$serializedFile) {
        $this->error("Unable to open equivset.ser for writing\n", 1);
    }
    # \s matches \xa0 in non-unicode mode, which is not what we want
    # So we need to make our own whitespace class
    $sp = '[\\ \\t]';
    $lineNum = 0;
    $setsByChar = array();
    $sets = array();
    $exitStatus = 0;
    foreach ($lines as $line) {
        ++$lineNum;
        # Whether the line ends with a null character. file() keeps the line
        # terminator, so the null byte is expected at the second-to-last
        # position. The needle must be "\0": an empty needle never finds it.
        $mapToEmpty = strpos($line, "\0") === strlen($line) - 2;
        $line = trim($line);
        # Filter comments
        if (!$line || $line[0] == '#') {
            continue;
        }
        # Process line
        if (!preg_match("/^(?P<hexleft> [A-F0-9]+) {$sp}+ (?P<charleft> .+?) {$sp}+ => {$sp}+ (?:(?P<hexright> [A-F0-9]+) {$sp}+|) (?P<charright> .+?) {$sp}* (?: \\#.*|) \$ /x", $line, $m)) {
            $this->output("Error: invalid entry at line {$lineNum}: {$line}\n");
            $exitStatus = 1;
            continue;
        }
        $error = false;
        if ($mapToEmpty) {
            $m['charright'] = '';
        } else {
            # The hex number on the left must encode to the left character
            if (codepointToUtf8(hexdec($m['hexleft'])) != $m['charleft']) {
                $actual = utf8ToCodepoint($m['charleft']);
                if ($actual === false) {
                    $this->output("Bytes: " . strlen($m['charleft']) . "\n");
                    $this->output(bin2hex($line) . "\n");
                    $hexForm = bin2hex($m['charleft']);
                    $this->output("Invalid UTF-8 character \"{$m['charleft']}\" ({$hexForm}) at line {$lineNum}: {$line}\n");
                } else {
                    $this->output("Error: left number ({$m['hexleft']}) does not match left character ({$actual}) " . "at line {$lineNum}: {$line}\n");
                }
                $error = true;
            }
            # The right-hand hex number is optional; check it only when present
            if (!empty($m['hexright']) && codepointToUtf8(hexdec($m['hexright'])) != $m['charright']) {
                $actual = utf8ToCodepoint($m['charright']);
                if ($actual === false) {
                    $hexForm = bin2hex($m['charright']);
                    # Report the right character here, matching $hexForm
                    $this->output("Invalid UTF-8 character \"{$m['charright']}\" ({$hexForm}) at line {$lineNum}: {$line}\n");
                } else {
                    $this->output("Error: right number ({$m['hexright']}) does not match right character ({$actual}) " . "at line {$lineNum}: {$line}\n");
                }
                $error = true;
            }
            if ($error) {
                $exitStatus = 1;
                continue;
            }
        }
        # Find the set for the right character, add a new one if necessary
        if (isset($setsByChar[$m['charright']])) {
            $setName = $setsByChar[$m['charright']];
        } else {
            # New set
            $setName = $m['charright'];
            $sets[$setName] = array($m['charright']);
            $setsByChar[$setName] = $setName;
            //......... part of the code is omitted here .........
示例9: stringToList
/**
 * Break a string into characters and convert each one with
 * utf8ToCodepoint().
 *
 * Characters are matched with a Unicode-aware, dot-matches-newline regex.
 *
 * @param $str
 * @return array List of utf8ToCodepoint() results in string order; empty
 *   when the regex matches nothing or fails
 */
public static function stringToList($str)
{
    $matches = array();
    $matchCount = preg_match_all('/./us', $str, $matches);
    if (!$matchCount) {
        return array();
    }
    $codepoints = array();
    // $matches[0] holds the full matches, indexed 0..n-1
    foreach ($matches[0] as $position => $character) {
        $codepoints[$position] = utf8ToCodepoint($character);
    }
    return $codepoints;
}
示例10: firstChar
/**
 * Return the leading UTF-8 character of a string.
 *
 * A leading Hangul syllable (U+AC00..U+D7A3) is replaced by the jamo
 * assigned to the code point range it falls into.
 *
 * @param $s string
 * @return string The first character, a jamo for Hangul syllables, or ''
 *   when the string does not start with a recognisable UTF-8 sequence.
 */
function firstChar( $s ) {
    $found = array();
    $pattern = '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
        '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/';
    preg_match( $pattern, $s, $found );
    if ( !isset( $found[1] ) ) {
        return '';
    }
    $char = $found[1];
    // Hangul syllables are three bytes long in UTF-8; anything else is final.
    if ( strlen( $char ) != 3 ) {
        return $char;
    }
    $code = utf8ToCodepoint( $char );
    if ( $code < 0xac00 || 0xd7a4 <= $code ) {
        // Three bytes, but outside the Hangul syllable block.
        return $char;
    }
    // Exclusive upper bound of each range => UTF-8 bytes of its jamo.
    // Order matters: the first bound greater than the code point wins.
    $jamoByUpperBound = array(
        0xb098 => "\xe3\x84\xb1",
        0xb2e4 => "\xe3\x84\xb4",
        0xb77c => "\xe3\x84\xb7",
        0xb9c8 => "\xe3\x84\xb9",
        0xbc14 => "\xe3\x85\x81",
        0xc0ac => "\xe3\x85\x82",
        0xc544 => "\xe3\x85\x85",
        0xc790 => "\xe3\x85\x87",
        0xcc28 => "\xe3\x85\x88",
        0xce74 => "\xe3\x85\x8a",
        0xd0c0 => "\xe3\x85\x8b",
        0xd30c => "\xe3\x85\x8c",
        0xd558 => "\xe3\x85\x8d",
    );
    foreach ( $jamoByUpperBound as $upperBound => $jamo ) {
        if ( $code < $upperBound ) {
            return $jamo;
        }
    }
    // Everything from 0xd558 up to the end of the block.
    return "\xe3\x85\x8e";
}
示例11: mangleName
/**
 * Turn an arbitrary name into one that is valid in XML.
 *
 * Names listed in $preserveKeys and names that already match the allowed
 * character classes are returned untouched. The empty name becomes '_'.
 * Any other name gets a '_' prefix, every '.' rewritten to '.2E.', and
 * every disallowed character rewritten to '.HEX.', where HEX is the
 * upper-case hex form of its utf8ToCodepoint() value.
 *
 * @param string $name
 * @param array $preserveKeys Names to not mangle
 * @return string Mangled name
 */
private static function mangleName($name, $preserveKeys = array())
{
    // Character class bodies, built on first use and kept across calls
    static $nsc = null;
    static $nc = null;

    if (in_array($name, $preserveKeys, true)) {
        return $name;
    }
    if ($name === '') {
        return '_';
    }

    if ($nsc === null) {
        // Note we omit ':' from $nsc and $nc because it's reserved for XML
        // namespacing, and we omit '_' from $nsc (but not $nc) because we
        // reserve it.
        $nsc = 'A-Za-z\\x{C0}-\\x{D6}\\x{D8}-\\x{F6}\\x{F8}-\\x{2FF}\\x{370}-\\x{37D}\\x{37F}-\\x{1FFF}' . '\\x{200C}-\\x{200D}\\x{2070}-\\x{218F}\\x{2C00}-\\x{2FEF}\\x{3001}-\\x{D7FF}' . '\\x{F900}-\\x{FDCF}\\x{FDF0}-\\x{FFFD}\\x{10000}-\\x{EFFFF}';
        $nc = $nsc . '_\\-.0-9\\x{B7}\\x{300}-\\x{36F}\\x{203F}-\\x{2040}';
    }

    $alreadyValid = preg_match("/^[{$nsc}][{$nc}]*\$/uS", $name);
    if ($alreadyValid) {
        return $name;
    }

    // Escape literal dots first so they cannot be confused with the
    // '.HEX.' markers added for disallowed characters.
    $dotEscaped = str_replace('.', '.2E.', $name);
    $mangled = preg_replace_callback("/[^{$nc}]/uS", function ($m) {
        return sprintf('.%X.', utf8ToCodepoint($m[0]));
    }, $dotEscaped);

    return '_' . $mangled;
}
示例12: onCategoryMultisortSortkeys_buildRadicalSortkey
/**
 * Build a sort key by concatenating, for every character of the string,
 * a radical-stroke field followed by the character itself.
 *
 * Both fields are space-padded on the right with str_pad(): the
 * radical-stroke field to 6 bytes and the character to 4 bytes.
 *
 * @param mixed $data Passed through to onCategoryMultisortSortkeys_getRadical()
 * @param string $str String to build the key for
 * @return string
 */
function onCategoryMultisortSortkeys_buildRadicalSortkey($data, $str)
{
    $sortkey = '';
    $characters = $this->onCategoryMultisortSortkeys_splitString($str);
    foreach ($characters as $character) {
        $codepoint = utf8ToCodepoint($character);
        # One radical-stroke entry always has 3 (radical) + 3 (stroke) = 6 bytes, or blank if unavailable.
        $radical = $this->onCategoryMultisortSortkeys_getRadical($data, $codepoint);
        $radicalField = str_pad($radical, 6);
        # One UTF-8 character can have 4 bytes max.
        $characterField = str_pad($character, 4);
        $sortkey = $sortkey . $radicalField . $characterField;
    }
    return $sortkey;
}