本文整理汇总了Java中com.ibm.icu.text.UnicodeSet类的典型用法代码示例。如果您正苦于以下问题:Java UnicodeSet类的具体用法?Java UnicodeSet怎么用?Java UnicodeSet使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。
UnicodeSet类属于com.ibm.icu.text包,在下文中一共展示了UnicodeSet类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: create
import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * Wraps the given token stream in an ICU folding filter, optionally restricted
 * to the characters selected by {@code unicodeSetFilter}.
 *
 * @param tokenStream the stream to wrap
 * @return a plain {@code ICUFoldingFilter} when no filter is configured, otherwise an
 *         {@code ICUNormalizer2Filter} driven by a filtered "utr30" normalizer
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    // Without a filter the stock folding filter is used directly.
    if (unicodeSetFilter == null) {
        return new ICUFoldingFilter(tokenStream);
    }

    // ICUFoldingFilter is in fact implemented as an ICUNormalizer2Filter, but it
    // lacks a constructor that accepts a filter, so the equivalent is built here.
    Normalizer2 utr30 = Normalizer2.getInstance(
            ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"),
            "utr30",
            Normalizer2.Mode.COMPOSE);
    UnicodeSet allowed = new UnicodeSet(unicodeSetFilter).freeze();
    Normalizer2 restricted = new FilteredNormalizer2(utr30, allowed);
    return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, restricted);
}
示例2: addPropertyStarts
import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * Adds to {@code set} the start code point of each same-value range of the trie,
 * stopping at the first range flagged as a lead-surrogate range.
 *
 * @param set the set that receives the range start code points
 */
public final void addPropertyStarts(UnicodeSet set) {
    for (Iterator<Trie2.Range> ranges = trie.iterator(); ranges.hasNext(); ) {
        Trie2.Range current = ranges.next();
        if (current.leadSurrogate) {
            // Ranges from here on are not code point ranges; stop collecting.
            break;
        }
        set.add(current.startCodePoint);
    }

    // Code points with hardcoded properties (plus the ones following them) would be
    // added here; there are none right now.
    //
    // Code points with hardcoded specialcasing properties are omitted because
    // no property UnicodeSets are built for them right now.
}
示例3: enumNorm16PropertyStartsRange
import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * Adds the start of a same-norm16-value range to {@code set}, and, for ranges with
 * algorithmic decompositions, every code point at which the FCD16 value changes.
 *
 * @param start first code point of the range
 * @param end   last code point of the range (inclusive)
 * @param value the norm16 value shared by the range
 * @param set   the set that receives the boundary code points
 */
private void enumNorm16PropertyStartsRange(int start, int end, int value, UnicodeSet set) {
    // The range start is always a boundary.
    set.add(start);

    if (start == end || !isAlgorithmicNoNo(value)) {
        return;
    }

    // Code points in this range share a norm16 value but might still have
    // different non-zero FCD16 values, so split wherever FCD16 changes.
    int lastFCD16 = getFCD16(start);
    for (int cp = start + 1; cp <= end; cp++) {
        int currentFCD16 = getFCD16(cp);
        if (currentFCD16 != lastFCD16) {
            set.add(cp);
            lastFCD16 = currentFCD16;
        }
    }
}
示例4: getCanonStartSet
import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * Reports whether any characters have a decomposition that starts with c.
 * When there are such characters, the given set is cleared and then filled with them.
 * <b>{@link #ensureCanonIterData()} must have been called before this method,
 * or else this method will crash.</b>
 * @param c A Unicode code point.
 * @param set A UnicodeSet that receives the characters whose decompositions
 *        start with c, if there are any; it is left untouched otherwise.
 * @return true if there are characters whose decomposition starts with c.
 */
public boolean getCanonStartSet(int c, UnicodeSet set) {
    // The "not a segment starter" flag is irrelevant here, so mask it out.
    final int data = canonIterData.get(c) & ~CANON_NOT_SEGMENT_STARTER;
    if (data == 0) {
        return false;
    }

    set.clear();

    final int payload = data & CANON_VALUE_MASK;
    final boolean payloadIsSetIndex = (data & CANON_HAS_SET) != 0;
    if (payloadIsSetIndex) {
        // The payload indexes a precomputed set of start characters.
        set.addAll(canonStartSets.get(payload));
    } else if (payload != 0) {
        // The payload is itself the single start character.
        set.add(payload);
    }

    if ((data & CANON_HAS_COMPOSITIONS) == 0) {
        return true;
    }

    final int norm16 = getNorm16(c);
    if (norm16 == JAMO_L) {
        // A leading Jamo contributes the whole block of syllables that start with it.
        final int firstSyllable =
                Hangul.HANGUL_BASE + (c - Hangul.JAMO_L_BASE) * Hangul.JAMO_VT_COUNT;
        final int lastSyllable = firstSyllable + Hangul.JAMO_VT_COUNT - 1;
        set.add(firstSyllable, lastSyllable);
    } else {
        addComposites(getCompositionsList(norm16), set);
    }
    return true;
}
示例5: addComposites
import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
* Walks a compositions list stored in {@code maybeYesCompositions} and adds every
* composite found there to {@code set}, recursing into the compositions list of any
* composite that is flagged as combining forward.
*
* @param list some character's compositions list (an index into
*        {@code maybeYesCompositions}; advanced locally while walking the entries)
* @param set recursively receives the composites from these compositions
*/
private void addComposites(int list, UnicodeSet set) {
int firstUnit, compositeAndFwd;
do {
// Each entry starts with a unit whose flag bits describe the entry's layout.
firstUnit=maybeYesCompositions.charAt(list);
if((firstUnit&COMP_1_TRIPLE)==0) {
// Two-unit entry: the second unit holds the composite plus the forward flag.
compositeAndFwd=maybeYesCompositions.charAt(list+1);
list+=2;
} else {
// Three-unit entry: the value is assembled from the second unit (with the
// COMP_2_TRAIL_MASK bits cleared) as the high half and the third unit as the low half.
compositeAndFwd=((maybeYesCompositions.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)|
maybeYesCompositions.charAt(list+2);
list+=3;
}
// Bit 0 is the forward flag; the remaining bits are the composite code point.
int composite=compositeAndFwd>>1;
if((compositeAndFwd&1)!=0) {
// The composite has compositions of its own: collect those as well.
addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
}
set.add(composite);
// The entry whose first unit carries COMP_1_LAST_TUPLE terminates the list.
} while((firstUnit&COMP_1_LAST_TUPLE)==0);
}
示例6: suppressContractions
import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * Removes context-dependent mappings for the code points in {@code set}, so that
 * each of them maps to a CE32 without context, and marks this builder as modified.
 * Does nothing at all when the set is empty. Iteration stops at the first string
 * element of the set.
 *
 * @param set the code points whose contractions are to be suppressed
 */
void suppressContractions(UnicodeSet set) {
    if (set.isEmpty()) {
        return;
    }

    UnicodeSetIterator elements = new UnicodeSetIterator(set);
    while (elements.next()) {
        int cp = elements.codepoint;
        if (cp == UnicodeSetIterator.IS_STRING) {
            // Only single code points are handled; stop once strings begin.
            break;
        }

        int current = trie.get(cp);
        if (current == Collation.FALLBACK_CE32) {
            // Nothing of our own yet: look at what the base data provides.
            int inherited = base.getFinalCE32(base.getCE32(cp));
            if (Collation.ce32HasContext(inherited)) {
                int copied = copyFromBaseCE32(cp, inherited, false /* without context */);
                trie.set(cp, copied);
            }
        } else if (isBuilderContextCE32(current)) {
            // Simply abandon the list of ConditionalCE32.
            // The caller will copy this builder in the end,
            // eliminating unreachable data.
            int contextFree = getConditionalCE32ForCE32(current).ce32;
            trie.set(cp, contextFree);
            contextChars.remove(cp);
        }
    }

    modified = true;
}
示例7: setDigitTags
import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * Re-tags the trie entry of every decimal digit ({@code [:Nd:]}) that has a real
 * mapping with a {@code DIGIT_TAG} CE32 that carries the digit's numeric value
 * and the index of its previous CE32.
 *
 * @throws IndexOutOfBoundsException if storing a CE32 yields an index above
 *         {@code Collation.MAX_INDEX}
 */
protected void setDigitTags() {
    UnicodeSetIterator digitIter = new UnicodeSetIterator(new UnicodeSet("[:Nd:]"));
    while (digitIter.next()) {
        assert(digitIter.codepoint != UnicodeSetIterator.IS_STRING);
        int digit = digitIter.codepoint;
        int existing = trie.get(digit);

        // Digits without a mapping of their own are left alone.
        if (existing == Collation.FALLBACK_CE32 || existing == Collation.UNASSIGNED_CE32) {
            continue;
        }

        int index = addCE32(existing);
        if (index > Collation.MAX_INDEX) {
            // BufferOverflowException is a better fit
            // but cannot be constructed with a message string.
            throw new IndexOutOfBoundsException("too many mappings");
        }

        int tagged = Collation.makeCE32FromTagIndexAndLength(
                Collation.DIGIT_TAG, index, UCharacter.digit(digit)); // u_charDigitValue(c)
        trie.set(digit, tagged);
    }
}
示例8: run
import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * Annotates {@code row} once for every configured character set that contains at
 * least one non-whitespace, non-digit character of {@code value}.
 *
 * @param value         the text to inspect
 * @param row           the row to annotate
 * @param distinctCount the count passed on to the annotation factory
 */
public synchronized void run(final String value, final InputRow row, final int distinctCount) {
    // Character sets that have not matched any character of this value yet.
    final List<Entry<String, UnicodeSet>> pending = new ArrayList<>(_unicodeSets.entrySet());

    for (final CharIterator chars = new CharIterator(value); chars.hasNext(); ) {
        final Character c = chars.next();
        if (chars.isWhitespace() || chars.isDigit()) {
            logger.debug("Skipping whitespace/digit char: {}", c);
            continue;
        }

        for (final Iterator<Entry<String, UnicodeSet>> candidates = pending.iterator(); candidates.hasNext(); ) {
            final Entry<String, UnicodeSet> candidate = candidates.next();
            if (!candidate.getValue().contains(c)) {
                continue;
            }
            final RowAnnotation annotation = _annotations.get(candidate.getKey());
            _annotationFactory.annotate(row, distinctCount, annotation);
            // A set is annotated at most once per value, so drop it from the
            // remaining checks on this value.
            candidates.remove();
        }
    }
}
示例9: testCreateFilters
import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * Verifies the names of the generated character sets and spot-checks membership
 * for the Arabic, ASCII Latin and non-ASCII Latin sets.
 */
public void testCreateFilters() throws Exception {
    final Map<String, UnicodeSet> unicodeSets = CharacterSetDistributionAnalyzer.createUnicodeSets();
    assertEquals(CHARSET_NAMES, unicodeSets.keySet().toString());

    // Arabic: rejects Latin letters, accepts an Arabic word.
    final UnicodeSet arabic = unicodeSets.get("Arabic");
    assertFalse(arabic.contains('a'));
    assertTrue(arabic.containsAll("البيانات"));

    // ASCII Latin: plain letters only, no accented letters or ligatures.
    final UnicodeSet asciiLatin = unicodeSets.get("Latin, ASCII");
    assertTrue(asciiLatin.contains('a'));
    assertTrue(asciiLatin.contains('z'));
    assertFalse(asciiLatin.contains('ä'));
    assertFalse(asciiLatin.contains('æ'));

    // Non-ASCII Latin: the exact complement for the same sample characters.
    final UnicodeSet extendedLatin = unicodeSets.get("Latin, non-ASCII");
    assertFalse(extendedLatin.contains('a'));
    assertFalse(extendedLatin.contains('z'));
    assertTrue(extendedLatin.contains('ä'));
    assertTrue(extendedLatin.contains('æ'));
}
示例10: run
import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * Inspects every character of {@code value} and annotates {@code row} for each
 * configured character set that contains one of them. Whitespace and digit
 * characters are skipped, and each character set is annotated at most once per call.
 *
 * @param value         the text to inspect
 * @param row           the row to annotate
 * @param distinctCount the count passed on to the annotation factory
 */
public synchronized void run(String value, InputRow row, int distinctCount) {
    final List<Entry<String, UnicodeSet>> unmatchedSets =
            new ArrayList<Entry<String, UnicodeSet>>(_unicodeSets.entrySet());

    CharIterator characters = new CharIterator(value);
    while (characters.hasNext()) {
        Character c = characters.next();

        boolean skippable = characters.isWhitespace() || characters.isDigit();
        if (skippable) {
            logger.debug("Skipping whitespace/digit char: {}", c);
            continue;
        }

        Iterator<Entry<String, UnicodeSet>> setIterator = unmatchedSets.iterator();
        while (setIterator.hasNext()) {
            Entry<String, UnicodeSet> entry = setIterator.next();
            if (!entry.getValue().contains(c)) {
                continue;
            }
            RowAnnotation annotation = _annotations.get(entry.getKey());
            _annotationFactory.annotate(row, distinctCount, annotation);
            // This set has matched: no need to check it again for this value.
            setIterator.remove();
        }
    }
}
开发者ID:datacleaner,项目名称:AnalyzerBeans,代码行数:27,代码来源:CharacterSetDistributionAnalyzerColumnDelegate.java
示例11: testCreateFilters
import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * Checks that the expected character sets are created and that a few sample
 * characters fall into (or outside of) the Arabic and Latin sets.
 */
public void testCreateFilters() throws Exception {
    Map<String, UnicodeSet> setsByName = CharacterSetDistributionAnalyzer.createUnicodeSets();
    assertEquals(CHARSET_NAMES, setsByName.keySet().toString());

    UnicodeSet arabicSet = setsByName.get("Arabic");
    assertFalse(arabicSet.contains('a'));
    assertTrue(arabicSet.containsAll("البيانات"));

    // Unaccented letters belong to the ASCII set only ...
    UnicodeSet latinAscii = setsByName.get("Latin, ASCII");
    assertTrue(latinAscii.contains('a'));
    assertTrue(latinAscii.contains('z'));
    assertFalse(latinAscii.contains('ä'));
    assertFalse(latinAscii.contains('æ'));

    // ... while accented letters and ligatures belong to the non-ASCII set only.
    UnicodeSet latinNonAscii = setsByName.get("Latin, non-ASCII");
    assertFalse(latinNonAscii.contains('a'));
    assertFalse(latinNonAscii.contains('z'));
    assertTrue(latinNonAscii.contains('ä'));
    assertTrue(latinNonAscii.contains('æ'));
}
示例12: dovec
import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * dovec - fill in arcs for each element of a cvec.
 * All kinds of MCCE complexity removed.
 *
 * @param set the character set whose ranges are turned into arcs
 * @param lp  the state the arcs start from
 * @param rp  the state the arcs lead to
 * @throws RegexException declared by this method; NOTE(review): presumably raised by
 *         the NFA or color map calls below -- confirm against their signatures
 */
private void dovec(UnicodeSet set, State lp, State rp) throws RegexException {
    final int total = set.getRangeCount();
    for (int i = 0; i < total; i++) {
        // Note: ICU operates in UTF-32 here, and the ColorMap is happy to play along.
        final int first = set.getRangeStart(i);
        final int last = set.getRangeEnd(i);

        if (LOG.isDebugEnabled() && IS_DEBUG) {
            final String message = String.format("%s %d %4x %4x", set, i, first, last);
            LOG.debug(message);
        }

        //TODO: this arc is probably redundant.
        final boolean singleCodePoint = first == last;
        if (singleCodePoint) {
            nfa.newarc(PLAIN, cm.subcolor(first), lp, rp);
        }
        cm.subrange(first, last, lp, rp);
    }
}
示例13: expandSingleRule
import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * Expands one {@code leftHandSide > rightHandSide} rule into mapping lines
 * appended to {@code builder}.
 * <p>
 * When the right-hand side matches {@code NUMERIC_VALUE_PATTERN}, one line is written
 * per code point, mapping it to {@code 0x30} plus its numeric value and adding the
 * character name as a trailing comment. Otherwise one line is written per range,
 * mapping it to the right-hand side verbatim. String elements of the set are
 * reported through the logger and otherwise ignored.
 *
 * @param builder       receives the generated lines
 * @param leftHandSide  a UnicodeSet pattern (parsed with {@code IGNORE_SPACE})
 * @param rightHandSide the mapping target
 */
private static void expandSingleRule(StringBuilder builder, String leftHandSide, String rightHandSide) {
    UnicodeSet sources = new UnicodeSet(leftHandSide, UnicodeSet.IGNORE_SPACE);
    boolean mapToNumericValue = NUMERIC_VALUE_PATTERN.matcher(rightHandSide).matches();

    UnicodeSetIterator ranges = new UnicodeSetIterator(sources);
    while (ranges.nextRange()) {
        if (ranges.codepoint == UnicodeSetIterator.IS_STRING) {
            logger.error("ERROR: String '" + ranges.getString() + "' found in UnicodeSet");
            continue;
        }

        if (mapToNumericValue) {
            // Numeric targets differ per code point, so the range is written out one by one.
            for (int cp = ranges.codepoint; cp <= ranges.codepointEnd; ++cp) {
                builder.append(String.format(Locale.ROOT, "%04X", cp))
                        .append('>')
                        .append(String.format(Locale.ROOT, "%04X", 0x30 + UCharacter.getNumericValue(cp)))
                        .append(" # ")
                        .append(UCharacter.getName(cp))
                        .append("\n");
            }
            continue;
        }

        // Same target for the whole range: emit "START[..END]>target".
        builder.append(String.format(Locale.ROOT, "%04X", ranges.codepoint));
        if (ranges.codepointEnd > ranges.codepoint) {
            builder.append("..");
            builder.append(String.format(Locale.ROOT, "%04X", ranges.codepointEnd));
        }
        builder.append('>');
        builder.append(rightHandSide);
        builder.append("\n");
    }
}
示例14: StringTokenizer
import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * {@icu} Constructs a string tokenizer for the specified string. Every
 * character in the delim argument acts as a delimiter that separates
 * tokens; a null delim is replaced by the empty delimiter set.
 * <p>If the returnDelims flag is false, the delimiter characters are
 * skipped and only serve as separators between tokens.
 * <p>If the returnDelims flag is true, then the delimiter characters
 * are also returned as tokens. If coalescedelims is true, one token
 * is returned for each run of delimiter characters, otherwise one
 * token is returned per delimiter. Since surrogate pairs can be
 * delimiters, the returned token might be two chars in length.
 * @param str a string to be parsed.
 * @param delim the delimiters.
 * @param returndelims flag indicating whether to return the delimiters
 *        as tokens.
 * @param coalescedelims flag indicating whether to return a run of
 *        delimiters as a single token or as one token per delimiter.
 *        This only takes effect if returndelims is true.
 * @exception NullPointerException if str is null
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public StringTokenizer(String str, UnicodeSet delim, boolean returndelims, boolean coalescedelims)
{
    m_source_ = str;
    m_length_ = str.length();
    m_delimiters_ = (delim == null) ? EMPTY_DELIMITER_ : delim;
    m_returnDelimiters_ = returndelims;
    m_coalesceDelimiters_ = coalescedelims;

    // No token has been produced yet.
    m_tokenOffset_ = -1;
    m_tokenSize_ = -1;

    if (m_length_ == 0) {
        // string length 0, no tokens
        m_nextOffset_ = -1;
        return;
    }

    m_nextOffset_ = 0;
    if (!returndelims) {
        // Delimiters are not reported, so start at the first non-delimiter.
        m_nextOffset_ = getNextNonDelimiter(0);
    }
}
示例15: getExemplarSet
import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * Returns the set of exemplar characters for a locale.
 *
 * @param options   Bitmask for options to apply to the exemplar pattern.
 *                  Specify zero to retrieve the exemplar set as it is
 *                  defined in the locale data. Specify
 *                  UnicodeSet.CASE to retrieve a case-folded exemplar
 *                  set. See {@link UnicodeSet#applyPattern(String,
 *                  int)} for a complete list of valid options. The
 *                  IGNORE_SPACE bit is always set, regardless of the
 *                  value of 'options'.
 * @param extype    The type of exemplar set to be retrieved,
 *                  ES_STANDARD, ES_INDEX, ES_AUXILIARY, or ES_PUNCTUATION.
 *                  ES_CURRENCY is accepted but no longer has data behind it.
 * @return          The set of exemplar characters for the given locale.
 *                  If there is nothing available for the locale,
 *                  then null is returned if {@link #getNoSubstitute()} is true, otherwise the
 *                  root value is returned (which may be UnicodeSet.EMPTY).
 * @exception       RuntimeException if the extype is invalid.
 * @stable ICU 3.4
 */
public UnicodeSet getExemplarSet(int options, int extype) {
    // Resource keys, indexed by the ES_* constant passed as extype.
    final String[] resourceKeys = {
            "ExemplarCharacters",
            "AuxExemplarCharacters",
            "ExemplarCharactersIndex",
            "ExemplarCharactersCurrency",
            "ExemplarCharactersPunctuation"
    };

    if (extype == ES_CURRENCY) {
        // currency symbol exemplar is no longer available
        if (noSubstitute) {
            return null;
        }
        return UnicodeSet.EMPTY;
    }

    try {
        // An invalid extype surfaces here as an out-of-bounds exception.
        final String resourceKey = resourceKeys[extype];
        final ICUResourceBundle exemplars = (ICUResourceBundle) bundle.get(resourceKey);

        // With substitution disabled, data inherited from root does not count.
        if (noSubstitute && !bundle.isRoot() && exemplars.isRoot()) {
            return null;
        }

        final String pattern = exemplars.getString();
        return new UnicodeSet(pattern, UnicodeSet.IGNORE_SPACE | options);
    } catch (ArrayIndexOutOfBoundsException aiooe) {
        throw new IllegalArgumentException(aiooe);
    } catch (Exception ex) {
        // Anything else is treated as "no data available" for this locale.
        return noSubstitute ? null : UnicodeSet.EMPTY;
    }
}