本文整理汇总了 C++ 中 U8_NEXT 宏(ICU 库 utf8.h 提供)的典型用法代码示例。如果您正苦于以下问题:C++ U8_NEXT 的具体用法?C++ U8_NEXT 怎么用?C++ U8_NEXT 使用的例子?那么,这里精选的代码示例或许可以为您提供帮助。
在下文中一共展示了 U8_NEXT 的 15 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 C++ 代码示例。
示例1: checkStringEnd
// Case-insensitive comparison of two UTF-8 strings.
//
// Both strings are decoded one code point at a time with U8_NEXT; each code
// point is lower-cased with u_tolower() before the comparison. Returns -1 or 1
// at the first pair of code points that differ after lower-casing. Otherwise
// the result is whatever checkStringEnd() reports as soon as it yields a value
// other than 2 (checkStringEnd is defined elsewhere; 2 is treated here as
// "keep comparing").
int ICUUnicodeSupport::_compareNoCase<1>(ConstStringHolder<1> _first, ConstStringHolder<1> _second)
{
    int32_t firstLen = _first.length();
    int32_t secondLen = _second.length();
    int32_t firstPos = 0;
    int32_t secondPos = 0;

    // Decide up front whether there is anything to compare at all.
    int status = checkStringEnd(firstPos, firstLen, secondPos, secondLen);
    if (status != 2)
        return status;

    const uint8_t* firstBytes = _first.c_str();
    const uint8_t* secondBytes = _second.c_str();

    do
    {
        UChar32 lhs, rhs;
        U8_NEXT(firstBytes, firstPos, firstLen, lhs);
        U8_NEXT(secondBytes, secondPos, secondLen, rhs);

        lhs = u_tolower(lhs);
        rhs = u_tolower(rhs);
        if (lhs != rhs)
            return (lhs < rhs) ? -1 : 1;

        // Re-evaluate the end condition after consuming one code point from each side.
        status = checkStringEnd(firstPos, firstLen, secondPos, secondLen);
    }
    while (status == 2);

    return status;
}
示例2: utf8_comparison
/*
 * Compare two UTF-8 buffers code point by code point.
 *
 * buf1, buf1size     first buffer and its length in bytes
 * buf2, buf2size     second buffer and its length in bytes
 * caseinsensitive    when non-zero, two ASCII letters that differ only in
 *                    case are treated as equal (only ASCII letters are folded)
 * genericwhitespace  when non-zero, two differing code points for which
 *                    utf8_codepoint_is_whitespace() is true are treated as equal
 *
 * Returns 0 when the buffers compare equal, a negative value when buf1 sorts
 * before buf2 and a positive value when it sorts after. For a mismatch inside
 * the common prefix the value is the difference of the two code points (or of
 * the two upper-cased ASCII letters). When one buffer is a proper prefix of
 * the other, the longer buffer always sorts after the shorter one.
 *
 * Ill-formed UTF-8 inside the common part trips the assert(c >= 0) checks.
 */
static int utf8_comparison(const unsigned char *buf1, size_t buf1size,
                           const unsigned char *buf2, size_t buf2size,
                           int caseinsensitive, int genericwhitespace) {
    UChar32 c1, c2;
    int i1 = 0;
    int i2 = 0;

    while (i1 < (int)buf1size && i2 < (int)buf2size) {
        U8_NEXT(buf1, i1, buf1size, c1);
        assert(c1 >= 0);
        U8_NEXT(buf2, i2, buf2size, c2);
        assert(c2 >= 0);
        if (c1 == c2) {
            continue;
        }

        if (caseinsensitive && c1 < 127 && c2 < 127) {
            // manual ASCII case-insensitive comparison
            char bytec1 = (char)c1;
            char bytec2 = (char)c2;
            if (((bytec1 >= 'a' && bytec1 <= 'z') ||
                 (bytec1 >= 'A' && bytec1 <= 'Z')) &&
                ((bytec2 >= 'a' && bytec2 <= 'z') ||
                 (bytec2 >= 'A' && bytec2 <= 'Z'))) {
                // both are letters: fold to upper case and compare again
                if (bytec1 >= 'a' && bytec1 <= 'z') {
                    bytec1 = bytec1 - ('a' - 'A');
                }
                if (bytec2 >= 'a' && bytec2 <= 'z') {
                    bytec2 = bytec2 - ('a' - 'A');
                }
                if (bytec1 == bytec2) {
                    continue;
                }
                return ((int)bytec1) - ((int)bytec2);
            }
        }

        if (genericwhitespace &&
            utf8_codepoint_is_whitespace(c1) &&
            utf8_codepoint_is_whitespace(c2)) {
            // any whitespace matches any other whitespace
            continue;
        }

        // not equal: return the difference
        return ((int)c1) - ((int)c2);
    }

    if (i1 < (int)buf1size) {
        // buf1 is longer. The result must be strictly positive, even when the
        // next code point is U+0000 (which would otherwise read as "equal")
        // or the sequence is ill-formed (U8_NEXT then sets c1 < 0).
        U8_NEXT(buf1, i1, buf1size, c1);
        return (c1 > 0) ? (int)c1 : 1;
    }
    if (i2 < (int)buf2size) {
        // buf2 is longer: mirror image of the case above, strictly negative.
        U8_NEXT(buf2, i2, buf2size, c2);
        return (c2 > 0) ? -((int)c2) : -1;
    }
    return 0;
}
示例3: _caseMap
/*
 * Case-maps the range [srcStart..srcLimit[ of src, while the context
 * iterator may take [0..srcLength[ into account.
 *
 * Every well-formed code point is passed to `map` and the outcome is handed
 * to appendResult(); an ill-formed sequence (U8_NEXT yields c<0) is forwarded
 * untouched through ByteSinkUtil::appendUnchanged(). The loop stops at
 * srcLimit or as soon as errorCode indicates a failure.
 */
static void
_caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map,
         const uint8_t *src, UCaseContext *csc,
         int32_t srcStart, int32_t srcLimit,
         icu::ByteSink &sink, icu::Edits *edits,
         UErrorCode &errorCode) {
    for (int32_t srcIndex = srcStart;
         U_SUCCESS(errorCode) && srcIndex < srcLimit; ) {
        // Remember where this code point begins; the context iterator reads
        // cpStart/cpLimit from csc.
        const int32_t cpStart = srcIndex;
        csc->cpStart = cpStart;

        UChar32 c;
        U8_NEXT(src, srcIndex, srcLimit, c);
        csc->cpLimit = srcIndex;

        if (c >= 0) {
            const UChar *s;
            c = map(c, utf8_caseContextIterator, csc, &s, caseLocale);
            appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
        } else {
            // Malformed UTF-8: pass the bytes through as they are.
            ByteSinkUtil::appendUnchanged(src + cpStart, srcIndex - cpStart,
                                          sink, options, edits, errorCode);
        }
    }
}
示例4: convert_cp
/*
 * Convert a PHP value into a Unicode code point.
 *
 * zcp may be an integer (used directly) or a string that must consist of
 * exactly one UTF-8 encoded code point. On success the code point is stored
 * in *pcp and SUCCESS is returned. On any problem (wrong type, over-long
 * string, string that is not exactly one code point, value outside
 * UCHAR_MIN_VALUE..UCHAR_MAX_VALUE) an intl error is recorded and FAILURE is
 * returned; *pcp is left untouched.
 */
static inline int convert_cp(UChar32* pcp, zval *zcp) {
    zend_long cp = -1;

    switch (Z_TYPE_P(zcp)) {
    case IS_LONG:
        cp = Z_LVAL_P(zcp);
        break;

    case IS_STRING: {
        int32_t i = 0;
        size_t zcp_len = Z_STRLEN_P(zcp);

        /* U8_NEXT works with int32_t offsets, so reject lengths that do not fit. */
        if (ZEND_SIZE_T_INT_OVFL(zcp_len)) {
            intl_error_set_code(NULL, U_ILLEGAL_ARGUMENT_ERROR);
            intl_error_set_custom_msg(NULL, "Input string is too long.", 0);
            return FAILURE;
        }

        U8_NEXT(Z_STRVAL_P(zcp), i, zcp_len, cp);

        /* After decoding one code point the whole string must be consumed. */
        if ((size_t)i != zcp_len) {
            intl_error_set_code(NULL, U_ILLEGAL_ARGUMENT_ERROR);
            intl_error_set_custom_msg(NULL, "Passing a UTF-8 character for codepoint requires a string which is exactly one UTF-8 codepoint long.", 0);
            return FAILURE;
        }
        break;
    }

    default:
        intl_error_set_code(NULL, U_ILLEGAL_ARGUMENT_ERROR);
        intl_error_set_custom_msg(NULL, "Invalid parameter for unicode point. Must be either integer or UTF-8 sequence.", 0);
        return FAILURE;
    }

    if ((cp < UCHAR_MIN_VALUE) || (cp > UCHAR_MAX_VALUE)) {
        intl_error_set_code(NULL, U_ILLEGAL_ARGUMENT_ERROR);
        intl_error_set_custom_msg(NULL, "Codepoint out of range", 0);
        return FAILURE;
    }

    *pcp = (UChar32)cp;
    return SUCCESS;
}
示例5: TestNextPrevNonCharacters
/*
 * Checks that U8_NEXT and U8_PREV decode Unicode non-characters: walks a
 * byte array made only of encoded non-characters forwards, then backwards,
 * and logs an error for every decoded value that U_IS_UNICODE_NONCHAR rejects.
 */
static void TestNextPrevNonCharacters() {
    /* UTF-8 encodings of several non-characters */
    static const uint8_t nonChars[]={
        0xef, 0xb7, 0x90,       /* U+fdd0 */
        0xef, 0xbf, 0xbf,       /* U+ffff */
        0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */
        0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */
        0xf4, 0x8f, 0xbf, 0xbe  /* U+10fffe */
    };

    UChar32 cp;
    int32_t pos;

    /* forward pass: U8_NEXT advances pos past each sequence */
    pos=0;
    while(pos<(int32_t)sizeof(nonChars)) {
        U8_NEXT(nonChars, pos, sizeof(nonChars), cp);
        if(!U_IS_UNICODE_NONCHAR(cp)) {
            log_err("U8_NEXT(before %d) failed to read a non-character\n", pos);
        }
    }

    /* backward pass: U8_PREV moves pos back to the start of each sequence */
    pos=(int32_t)sizeof(nonChars);
    while(pos>0) {
        U8_PREV(nonChars, 0, pos, cp);
        if(!U_IS_UNICODE_NONCHAR(cp)) {
            log_err("U8_PREV(at %d) failed to read a non-character\n", pos);
        }
    }
}
示例6: _caseMap
/*
 * Case-maps [srcStart..srcLimit[ but takes
 * context [0..srcLength[ into account.
 *
 * Each well-formed code point is passed to `map` and the outcome is written
 * with appendResult(). An ill-formed sequence (U8_NEXT yields c<0) is copied
 * to the destination unchanged instead of being handed to the mapping
 * function, which is only meant to see real code points.
 *
 * Returns the destination index reached after the last write. The index
 * keeps growing past destCapacity, and in that case *pErrorCode is set to
 * U_BUFFER_OVERFLOW_ERROR.
 */
static int32_t
_caseMap(const UCaseMap *csm, UCaseMapFull *map,
         uint8_t *dest, int32_t destCapacity,
         const uint8_t *src, UCaseContext *csc,
         int32_t srcStart, int32_t srcLimit,
         UErrorCode *pErrorCode) {
    const UChar *s;
    UChar32 c;
    int32_t srcIndex, destIndex;
    int32_t locCache;

    locCache=csm->locCache;

    /* case mapping loop */
    srcIndex=srcStart;
    destIndex=0;
    while(srcIndex<srcLimit) {
        csc->cpStart=srcIndex;
        U8_NEXT(src, srcIndex, srcLimit, c);
        csc->cpLimit=srcIndex;
        if(c<0) {
            /*
             * Malformed UTF-8: pass the bytes through as they are.
             * Keep counting beyond destCapacity so that the overflow check
             * below still sees the full required length.
             */
            int32_t i;
            for(i=csc->cpStart; i<srcIndex; ++i) {
                if(destIndex<destCapacity) {
                    dest[destIndex]=src[i];
                }
                ++destIndex;
            }
            continue;
        }
        c=map(csm->csp, c, utf8_caseContextIterator, csc, &s, csm->locale, &locCache);
        destIndex=appendResult(dest, destIndex, destCapacity, c, s);
    }

    if(destIndex>destCapacity) {
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }
    return destIndex;
}
示例7: utf8_caseContextIterator
/*
 * Context iterator callback over the UTF-8 text described by a UCaseContext.
 *
 * context  pointer to the UCaseContext
 * dir      <0: restart iterating backwards from csc->cpStart
 *          >0: restart iterating forwards from csc->cpLimit
 *           0: continue in the direction stored in csc->dir
 *
 * Returns the next code point in the chosen direction, or U_SENTINEL when
 * the iteration has reached csc->start (backwards) or csc->limit (forwards).
 */
static UChar32 U_CALLCONV
utf8_caseContextIterator(void *context, int8_t dir) {
    UCaseContext *csc=(UCaseContext *)context;
    UChar32 c;

    if(dir!=0) {
        /* reset: backward iteration starts at cpStart, forward at cpLimit */
        csc->index=(dir<0) ? csc->cpStart : csc->cpLimit;
        csc->dir=dir;
    } else {
        /* continue current iteration direction */
        dir=csc->dir;
    }

    if(dir<0) {
        if(csc->start<csc->index) {
            U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);
            return c;
        }
    } else if(csc->index<csc->limit) {
        U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);
        return c;
    }
    return U_SENTINEL;
}
示例8: stri_subset_charclass
/**
 * Select the strings in which a character class occurs
 *
 * For every (string, pattern) pair a flag is computed that tells whether any
 * code point of the string belongs to the pattern's UnicodeSet (inverted when
 * `negate` is set); the flags are then passed to stri__subset_by_logical().
 *
 * @param str character vector
 * @param pattern character vector
 * @param omit_na single logical value; if TRUE, NA pairs are flagged FALSE,
 *        otherwise they are flagged NA_LOGICAL and counted as results
 * @param negate single logical value
 * @return result of stri__subset_by_logical() for the computed flags
 *
 * @version 0.3-1 (Bartek Tartanus, 2014-07-25)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-10-17)
 *          using std::vector<int> to avoid mem-leaks
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *          Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-04)
 *          FR #122: omit_na arg added
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *          FR #216: `negate` arg added
 */
SEXP stri_subset_charclass(SEXP str, SEXP pattern, SEXP omit_na, SEXP negate)
{
    bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
    bool omit_na1 = stri__prepare_arg_logical_1_notNA(omit_na, "omit_na");
    PROTECT(str = stri_prepare_arg_string(str, "str"));
    PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
    R_len_t vectorize_length =
        stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

    STRI__ERROR_HANDLER_BEGIN(2)
    StriContainerUTF8 str_cont(str, vectorize_length);
    StriContainerCharClass pattern_cont(pattern, vectorize_length);

    // A random-access container is required: because the pattern is recycled,
    // i does not run 0,1,2,...,n but 0,pat_len,2*pat_len,1,pat_len+1 and so on.
    std::vector<int> which(vectorize_length);
    int result_counter = 0;

    R_len_t i = pattern_cont.vectorize_init();
    while (i != pattern_cont.vectorize_end())
    {
        if (str_cont.isNA(i) || pattern_cont.isNA(i)) {
            if (omit_na1) {
                which[i] = FALSE;
            }
            else {
                which[i] = NA_LOGICAL;
                result_counter++;
            }
        }
        else {
            const UnicodeSet* pattern_cur = &pattern_cont.get(i);
            R_len_t str_cur_n = str_cont.get(i).length();
            const char* str_cur_s = str_cont.get(i).c_str();

            // Scan the string until the first code point from the set.
            bool found = false;
            UChar32 chr = 0;
            R_len_t j = 0;
            while (!found && j < str_cur_n) {
                U8_NEXT(str_cur_s, j, str_cur_n, chr);
                if (chr < 0) // invalid utf-8 sequence
                    throw StriException(MSG__INVALID_UTF8);
                found = pattern_cur->contains(chr);
            }

            // `negate` flips the outcome; every selected element is counted.
            bool selected = (found != negate_1);
            which[i] = selected ? TRUE : FALSE;
            if (selected) result_counter++;
        }

        i = pattern_cont.vectorize_next(i);
    }

    SEXP ret;
    STRI__PROTECT(ret = stri__subset_by_logical(str_cont, which, result_counter));
    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
示例9: stri_enc_toutf32
/** Convert character vector to UTF-32
 *
 * Each non-NA string is decoded into its code points, which are returned as
 * an integer vector. NA strings, and strings containing an invalid UTF-8
 * byte sequence (a warning is issued for those), yield NULL.
 *
 * @param str character vector
 * @return list with integer vectors
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-26)
 *          use vector<UChar32> buf instead of R_alloc;
 *          warn and set NULL on improper UTF-8 byte sequences
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-12)
 *          Use UChar32* instead of vector<UChar32> as ::data is C++11
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *          Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_toutf32(SEXP str)
{
    PROTECT(str = stri_prepare_arg_string(str, "str"));
    R_len_t n = LENGTH(str);

    STRI__ERROR_HANDLER_BEGIN(1)
    StriContainerUTF8 str_cont(str, n);

    // One shared work buffer, sized for the longest string in bytes: a string
    // of that many bytes never decodes to more code points than it has bytes.
    R_len_t bufsize = 1; // to avoid allocating an empty buffer
    for (R_len_t i = 0; i < n; ++i) {
        if (!str_cont.isNA(i)) {
            R_len_t ni = str_cont.get(i).length();
            if (ni > bufsize) bufsize = ni;
        }
    }

    UChar32* buf = (UChar32*)R_alloc((size_t)bufsize, (int)sizeof(UChar32)); // at most bufsize UChars32 (bufsize/4 min.)
    if (!buf) throw StriException(MSG__MEM_ALLOC_ERROR);
    // deque<UChar32> was slower than using a common, over-sized buf

    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(VECSXP, n)); // all

    for (R_len_t i = 0; i < n; ++i) {
        if (str_cont.isNA(i)) {
            SET_VECTOR_ELT(ret, i, R_NilValue);
            continue;
        }

        const char* s = str_cont.get(i).c_str();
        R_len_t sn = str_cont.get(i).length();
        R_len_t pos = 0;    // byte offset into s
        R_len_t count = 0;  // number of values stored in buf
        UChar32 c = (UChar32)0;
        bool valid = true;

        // Decode until the end of the string or the first invalid sequence.
        while (valid && pos < sn) {
            U8_NEXT(s, pos, sn, c);
            buf[count++] = (int)c;
            valid = (c >= 0);
        }

        if (!valid) {
            Rf_warning(MSG__INVALID_UTF8);
            SET_VECTOR_ELT(ret, i, R_NilValue);
            continue;
        }

        SEXP conv;
        STRI__PROTECT(conv = Rf_allocVector(INTSXP, count));
        memcpy(INTEGER(conv), buf, (size_t)sizeof(int)*count);
        SET_VECTOR_ELT(ret, i, conv);
        STRI__UNPROTECT(1);
    }

    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END({ /* do nothing on error */ })
}
示例10: stri_subset_charclass_replacement
/**
 * Substitutes vector elements if a pattern occurs in a string
 *
 * An element of `str` is replaced by the next element of `value` (recycled)
 * when the string contains a code point from the pattern's character class,
 * or, with `negate`, when it contains none. Other elements are copied over
 * unchanged; NA strings or an NA pattern give NA.
 *
 * @param str character vector
 * @param pattern character vector
 * @param negate single logical value
 * @param value character vector
 * @return character vector
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *          FR#124
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *          FR #216: `negate` arg added
 */
SEXP stri_subset_charclass_replacement(SEXP str, SEXP pattern, SEXP negate, SEXP value)
{
    bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
    PROTECT(str = stri_prepare_arg_string(str, "str"));
    PROTECT(pattern = stri_prepare_arg_string_1(pattern, "pattern"));
    PROTECT(value = stri_prepare_arg_string(value, "value"));
    int vectorize_length = LENGTH(str);
    int value_length = LENGTH(value);
    if (value_length == 0)
        Rf_error(MSG__REPLACEMENT_ZERO);

    STRI__ERROR_HANDLER_BEGIN(3)
    StriContainerUTF8 str_cont(str, vectorize_length);
    StriContainerUTF8 value_cont(value, value_length);
    StriContainerCharClass pattern_cont(pattern, vectorize_length);

    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length));

    R_len_t next_value = 0; // running index into `value`, wrapped modulo its length
    R_len_t i = str_cont.vectorize_init();
    while (i != str_cont.vectorize_end())
    {
        if (str_cont.isNA(i) || pattern_cont.isNA(i)) {
            SET_STRING_ELT(ret, i, NA_STRING);
        }
        else {
            const UnicodeSet* pattern_cur = &pattern_cont.get(i);
            R_len_t str_cur_n = str_cont.get(i).length();
            const char* str_cur_s = str_cont.get(i).c_str();

            // Scan the string until the first code point from the set.
            bool found = false;
            UChar32 chr = 0;
            R_len_t j = 0;
            while (!found && j < str_cur_n) {
                U8_NEXT(str_cur_s, j, str_cur_n, chr);
                if (chr < 0) // invalid utf-8 sequence
                    throw StriException(MSG__INVALID_UTF8);
                found = pattern_cur->contains(chr);
            }

            // Replace when the match outcome differs from the negate flag.
            if (found != negate_1)
                SET_STRING_ELT(ret, i, value_cont.toR((next_value++)%value_length));
            else
                SET_STRING_ELT(ret, i, str_cont.toR(i));
        }

        i = str_cont.vectorize_next(i);
    }

    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
示例11: getResultsManually
/*
 * For each of the given encodings, determine by brute force whether every
 * well-formed code point of the UTF-8 text is contained in the converter's
 * Unicode set (plus excludedCodePoints, if any).
 *
 * encodings, num_encodings  converter names to test
 * utf8, length              the UTF-8 text and its length in bytes
 * excludedCodePoints        optional set added to each converter's set; may be NULL
 * whichSet                  selector passed to ucnv_getUnicodeSet()
 *
 * Returns an array of gCountAvailable flags allocated with uprv_malloc(),
 * indexed by findIndex(encoding name); entries for encodings that were not
 * tested stay 0. Returns NULL if the allocation fails.
 * NOTE(review): the caller presumably releases the array - confirm.
 */
static UBool *
getResultsManually(const char** encodings, int32_t num_encodings,
                   const char *utf8, int32_t length,
                   const USet* excludedCodePoints, const UConverterUnicodeSet whichSet) {
    UBool* resultsManually;
    int32_t i;

    resultsManually = (UBool*) uprv_malloc(gCountAvailable);
    if (resultsManually == NULL) {
        /* out of memory: do not hand a null pointer to uprv_memset() */
        return NULL;
    }
    uprv_memset(resultsManually, 0, gCountAvailable);

    for(i = 0 ; i < num_encodings ; i++) {
        UErrorCode status = U_ZERO_ERROR;
        /* get unicode set for that converter */
        USet* set;
        UConverter* test_converter;
        UChar32 cp;
        int32_t encIndex, offset;

        set = uset_openEmpty();
        test_converter = ucnv_open(encodings[i], &status);
        ucnv_getUnicodeSet(test_converter, set,
                           whichSet, &status);
        if (excludedCodePoints != NULL) {
            uset_addAll(set, excludedCodePoints);
        }
        uset_freeze(set);
        offset = 0;
        cp = 0;

        /* NOTE(review): encIndex is used unchecked as an array index - confirm findIndex() cannot fail here. */
        encIndex = findIndex(encodings[i]);
        /*
         * The following is almost, but not entirely, the same as
         * resultsManually[encIndex] =
         *   (UBool)(uset_spanUTF8(set, utf8, length, USET_SPAN_SIMPLE) == length);
         * They might be different if the set contains strings,
         * or if the utf8 string contains an illegal sequence.
         *
         * The UConverterSelector does not currently handle strings that can be
         * converted, and it treats an illegal sequence as convertible
         * while uset_spanUTF8() treats it like U+FFFD which may not be convertible.
         */
        resultsManually[encIndex] = TRUE;
        while(offset<length) {
            U8_NEXT(utf8, offset, length, cp);
            /* cp<0 marks an illegal sequence, which is treated as convertible */
            if (cp >= 0 && !uset_contains(set, cp)) {
                resultsManually[encIndex] = FALSE;
                break;
            }
        }
        uset_close(set);
        ucnv_close(test_converter);
    }
    return resultsManually;
}
示例12: utf8_length
/*
 * Count the code points in a UTF-8 buffer.
 *
 * buf, bufsize: the buffer and its length in bytes.
 * Returns the number of code points decoded by U8_NEXT. Ill-formed input
 * trips the assert.
 */
static int utf8_length(const unsigned char *buf, size_t bufsize) {
    size_t count = 0;
    int pos;

    /* U8_NEXT advances pos by one whole code point per iteration. */
    for (pos = 0; pos < (int)bufsize; count++) {
        UChar32 cp;
        U8_NEXT(buf, pos, bufsize, cp);
        assert(cp >= 0);
    }
    return count;
}
示例13: stri__extract_firstlast_charclass
/**
 * Extract first or last occurrences of a character class in each string
 *
 * For every (string, pattern) pair the string is scanned forwards
 * (first == true) or backwards (first == false); the first code point
 * accepted by the pattern's CharClass is returned as a one-code-point UTF-8
 * string. Elements with no match, or with NA input, are NA.
 *
 * @param str character vector
 * @param pattern character vector
 * @param first true to look for the first occurrence, false for the last one
 * @return character vector
 *
 * @version 0.1 (Marek Gagolewski, 2013-06-08)
 * @version 0.2 (Marek Gagolewski, 2013-06-15) Use StrContainerCharClass
 * @version 0.3 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 */
SEXP stri__extract_firstlast_charclass(SEXP str, SEXP pattern, bool first)
{
    // The prepared arguments may be freshly allocated objects, so they must be
    // PROTECTed: otherwise a later allocation (preparing `pattern`, building
    // the containers, allocating `ret`) may garbage-collect them.
    // These protections are dropped together with `ret` by UNPROTECT(3) below.
    // NOTE(review): on the error path this relies on R unwinding the
    // protection stack when an error is raised - confirm for this handler macro.
    PROTECT(str = stri_prepare_arg_string(str, "str"));
    PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
    R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

    STRI__ERROR_HANDLER_BEGIN
    StriContainerUTF8 str_cont(str, vectorize_length);
    StriContainerCharClass pattern_cont(pattern, vectorize_length);

    SEXP ret;
    PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length));

    for (R_len_t i = pattern_cont.vectorize_init();
            i != pattern_cont.vectorize_end();
            i = pattern_cont.vectorize_next(i))
    {
        // NA unless a match is found below
        SET_STRING_ELT(ret, i, NA_STRING);
        if (str_cont.isNA(i) || pattern_cont.isNA(i))
            continue;

        CharClass pattern_cur = pattern_cont.get(i);
        R_len_t str_cur_n = str_cont.get(i).length();
        const char* str_cur_s = str_cont.get(i).c_str();
        R_len_t j, jlast;
        UChar32 chr;

        if (first) {
            // forwards: [jlast, j) are the bytes of the code point just read
            for (jlast=j=0; j<str_cur_n; ) {
                U8_NEXT(str_cur_s, j, str_cur_n, chr);
                if (pattern_cur.test(chr)) {
                    SET_STRING_ELT(ret, i, Rf_mkCharLenCE(str_cur_s+jlast, j-jlast, CE_UTF8));
                    break; // that's enough for first
                }
                jlast = j;
            }
        }
        else {
            // backwards: [j, jlast) are the bytes of the code point just read
            for (jlast=j=str_cur_n; j>0; ) {
                U8_PREV(str_cur_s, 0, j, chr); // go backwards
                if (pattern_cur.test(chr)) {
                    SET_STRING_ELT(ret, i, Rf_mkCharLenCE(str_cur_s+j, jlast-j, CE_UTF8));
                    break; // that's enough for last
                }
                jlast = j;
            }
        }
    }

    UNPROTECT(3); // str, pattern, ret
    return ret;
    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
示例14: stri_enc_toascii
/** Convert character vector to ASCII
 *
 * All charcodes > 127 are replaced with subst chars (0x1A).
 * In UTF-8 strings every code point above ASCII_MAXCHARCODE, and every
 * ill-formed byte sequence, becomes one substitute character; in strings in
 * other (8-bit) encodings every byte that is not a single-byte UTF-8 unit is
 * substituted. NA and ASCII-only strings are passed through as they are.
 *
 * @param str character vector
 * @return character vector
 *
 * @version 0.1 (Marek Gagolewski)
 * @version 0.2 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 */
SEXP stri_enc_toascii(SEXP str)
{
    // The prepared argument may be a freshly allocated object, so it must be
    // PROTECTed before `ret` is allocated; released by UNPROTECT(2) below.
    // NOTE(review): on the error path this relies on R unwinding the
    // protection stack when an error is raised - confirm for this handler macro.
    PROTECT(str = stri_prepare_arg_string(str, "str"));
    R_len_t n = LENGTH(str);

    STRI__ERROR_HANDLER_BEGIN
    SEXP ret;
    PROTECT(ret = Rf_allocVector(STRSXP, n));

    for (R_len_t i=0; i<n; ++i) {
        SEXP curs = STRING_ELT(str, i);
        if (curs == NA_STRING) {
            SET_STRING_ELT(ret, i, NA_STRING);
            continue;
        }
        else if (IS_ASCII(curs)) {
            SET_STRING_ELT(ret, i, curs);
        }
        else if (IS_UTF8(curs)) {
            R_len_t curn = LENGTH(curs);
            const char* curs_tab = CHAR(curs);
            // TODO: buffer reuse....
            String8 buf(curn+1); // this may be 4 times too much
            R_len_t k = 0;
            UChar32 c;
            for (int j=0; j<curn; ) {
                U8_NEXT(curs_tab, j, curn, c);
                // c < 0 marks an ill-formed sequence: casting it to char would
                // emit a non-ASCII byte, so substitute it as well
                if (c < 0 || c > ASCII_MAXCHARCODE)
                    buf.data()[k++] = ASCII_SUBSTITUTE;
                else
                    buf.data()[k++] = (char)c;
            }
            SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), k, CE_UTF8)); // will be marked as ASCII anyway by mkCharLenCE
        }
        else { // some 8-bit encoding
            R_len_t curn = LENGTH(curs);
            const char* curs_tab = CHAR(curs);
            // TODO: buffer reuse....
            String8 buf(curn+1);
            R_len_t k = 0;
            for (R_len_t j=0; j<curn; ++j) {
                if (U8_IS_SINGLE(curs_tab[j]))
                    buf.data()[k++] = curs_tab[j];
                else {
                    buf.data()[k++] = (char)ASCII_SUBSTITUTE; // subst char in ascii
                }
            }
            SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), k, CE_UTF8)); // will be marked as ASCII anyway by mkCharLenCE
        }
    }

    UNPROTECT(2); // str, ret
    return ret;
    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
示例15: utf8_iterate_codepoints
/*
 * Call do_something() for every code point in a UTF-8 buffer, in order.
 *
 * buf, bufsize:  the buffer and its length in bytes
 * do_something:  callback; a zero return value stops the iteration
 *
 * Returns 1 when the whole buffer was visited, 0 when the callback stopped
 * the iteration early. Ill-formed input trips the assert.
 */
static int utf8_iterate_codepoints(const unsigned char *buf, size_t bufsize,
                                   int (*do_something)(UChar32 c)) {
    int pos;

    /* U8_NEXT advances pos by one whole code point per iteration. */
    for (pos = 0; pos < (int)bufsize; ) {
        UChar32 cp;
        U8_NEXT(buf, pos, bufsize, cp);
        assert(cp >= 0);
        if (do_something(cp) == 0) {
            return 0;
        }
    }
    return 1;
}