当前位置: 首页>>代码示例>>Java>>正文


Java UnicodeSet类代码示例

本文整理汇总了Java中com.ibm.icu.text.UnicodeSet的典型用法代码示例。如果您正苦于以下问题：Java UnicodeSet类的具体用法？Java UnicodeSet怎么用？Java UnicodeSet使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。


UnicodeSet类属于com.ibm.icu.text包，在下文中一共展示了UnicodeSet类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: create

import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * Wraps the given token stream in an ICU folding filter.
 *
 * <p>When no {@code unicodeSetFilter} is configured a plain {@link ICUFoldingFilter} is
 * returned. Otherwise the "utr30" normalizer is restricted to the configured set and the
 * stream is wrapped in an ICUNormalizer2Filter built from that filtered normalizer.
 *
 * @param tokenStream the stream to wrap
 * @return the wrapping filter
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    if (unicodeSetFilter == null) {
        return new ICUFoldingFilter(tokenStream);
    }

    // ICUFoldingFilter is in fact implemented as an ICUNormalizer2Filter, but it has no
    // constructor that accepts a filter set, so the filtered variant is assembled here.
    Normalizer2 utr30 = Normalizer2.getInstance(
            ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"),
            "utr30", Normalizer2.Mode.COMPOSE);

    UnicodeSet filterSet = new UnicodeSet(unicodeSetFilter);
    filterSet.freeze();

    return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(
            tokenStream, new FilteredNormalizer2(utr30, filterSet));
}
 
开发者ID:justor,项目名称:elasticsearch_my,代码行数:20,代码来源:IcuFoldingTokenFilterFactory.java

示例2: addPropertyStarts

import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * Adds to {@code set} the start code point of each range produced by the trie iterator,
 * stopping at the first range that is flagged as a lead-surrogate range.
 *
 * @param set the set that receives the range start code points
 */
public final void addPropertyStarts(UnicodeSet set) {
    // Record where each same-value range of the trie begins.
    Iterator<Trie2.Range> ranges = trie.iterator();
    while (ranges.hasNext()) {
        Trie2.Range current = ranges.next();
        if (current.leadSurrogate) {
            break;
        }
        set.add(current.startCodePoint);
    }

    /*
     * Code points with hardcoded properties (plus the ones following them) would be
     * added here; there are none right now.
     *
     * Code points with hardcoded special-casing properties are omitted
     * because no property UnicodeSets are built for them right now.
     */
}
 
开发者ID:abhijitvalluri,项目名称:fitnotifications,代码行数:18,代码来源:UCaseProps.java

示例3: enumNorm16PropertyStartsRange

import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * Adds the property-start code points of one same-norm16-value range to {@code set}.
 *
 * <p>The range start is always added. If the range holds more than one code point and
 * {@code value} is an algorithmic no-no value, every code point whose FCD16 value differs
 * from its predecessor's is added as well.
 *
 * @param start first code point of the range
 * @param end   last code point of the range (inclusive)
 * @param value the norm16 value shared by the range
 * @param set   the set that receives the start code points
 */
private void enumNorm16PropertyStartsRange(int start, int end, int value, UnicodeSet set) {
    set.add(start);
    if (start == end || !isAlgorithmicNoNo(value)) {
        return;
    }

    // The code points share an algorithmic decomposition norm16 value,
    // but they might still have different non-zero FCD16 values.
    int lastFCD16 = getFCD16(start);
    for (int c = start + 1; c <= end; ++c) {
        int currentFCD16 = getFCD16(c);
        if (currentFCD16 != lastFCD16) {
            set.add(c);
            lastFCD16 = currentFCD16;
        }
    }
}
 
开发者ID:abhijitvalluri,项目名称:fitnotifications,代码行数:17,代码来源:Normalizer2Impl.java

示例4: getCanonStartSet

import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * Reports whether any characters have a decomposition that starts with {@code c} and,
 * if so, replaces the contents of {@code set} with those characters.
 * <b>{@link #ensureCanonIterData()} must have been called before this method,
 * or else this method will crash.</b>
 *
 * @param c   a Unicode code point
 * @param set a UnicodeSet that is cleared and then filled with the characters whose
 *            decompositions start with {@code c}; left untouched when there are none
 * @return true if there are characters whose decomposition starts with {@code c}
 */
public boolean getCanonStartSet(int c, UnicodeSet set) {
    final int canonValue = canonIterData.get(c) & ~CANON_NOT_SEGMENT_STARTER;
    if (canonValue == 0) {
        return false;
    }

    set.clear();

    // The masked value is either an index into canonStartSets or a single code point.
    final int value = canonValue & CANON_VALUE_MASK;
    final boolean hasSet = (canonValue & CANON_HAS_SET) != 0;
    if (hasSet) {
        set.addAll(canonStartSets.get(value));
    } else if (value != 0) {
        set.add(value);
    }

    final boolean hasCompositions = (canonValue & CANON_HAS_COMPOSITIONS) != 0;
    if (!hasCompositions) {
        return true;
    }

    final int norm16 = getNorm16(c);
    if (norm16 == JAMO_L) {
        // A leading Jamo contributes its whole block of JAMO_VT_COUNT Hangul syllables.
        final int firstSyllable =
                Hangul.HANGUL_BASE + (c - Hangul.JAMO_L_BASE) * Hangul.JAMO_VT_COUNT;
        set.add(firstSyllable, firstSyllable + Hangul.JAMO_VT_COUNT - 1);
    } else {
        addComposites(getCompositionsList(norm16), set);
    }
    return true;
}
 
开发者ID:abhijitvalluri,项目名称:fitnotifications,代码行数:34,代码来源:Normalizer2Impl.java

示例5: addComposites

import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * Walks a compositions list stored in {@code maybeYesCompositions} and adds every
 * composite found in it to {@code set}. When an entry's low "forward" bit is set, the
 * compositions list of that composite is walked recursively as well.
 *
 * @param list some character's compositions list (an index into {@code maybeYesCompositions})
 * @param set recursively receives the composites from these compositions
 */
private void addComposites(int list, UnicodeSet set) {
    int firstUnit, compositeAndFwd;
    do {
        // Each entry starts with a unit holding flag bits (COMP_1_TRIPLE, COMP_1_LAST_TUPLE).
        firstUnit=maybeYesCompositions.charAt(list);
        if((firstUnit&COMP_1_TRIPLE)==0) {
            // Two-unit entry: the second unit is the whole composite-and-forward value.
            compositeAndFwd=maybeYesCompositions.charAt(list+1);
            list+=2;
        } else {
            // Three-unit entry: the second unit (with the COMP_2_TRAIL_MASK bits cleared)
            // supplies the high 16 bits, the third unit supplies the low 16 bits.
            compositeAndFwd=((maybeYesCompositions.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)|
                            maybeYesCompositions.charAt(list+2);
            list+=3;
        }
        // The composite code point is the value without its lowest bit.
        int composite=compositeAndFwd>>1;
        if((compositeAndFwd&1)!=0) {
            // Lowest bit set: also collect the composites from this composite's own list.
            // NOTE(review): presumably this bit means the composite combines forward — confirm.
            addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
        }
        set.add(composite);
        // The entry whose first unit carries COMP_1_LAST_TUPLE is the last one in the list.
    } while((firstUnit&COMP_1_LAST_TUPLE)==0);
}
 
开发者ID:abhijitvalluri,项目名称:fitnotifications,代码行数:24,代码来源:Normalizer2Impl.java

示例6: suppressContractions

import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * Replaces context-carrying mappings with context-free ones for the code points in
 * {@code set}. Iteration stops at the first string element of the set. The builder is
 * marked as modified whenever the set is non-empty.
 *
 * @param set the code points whose contextual mappings are to be suppressed
 */
void suppressContractions(UnicodeSet set) {
    if (set.isEmpty()) {
        return;
    }

    UnicodeSetIterator it = new UnicodeSetIterator(set);
    while (it.next()) {
        int cp = it.codepoint;
        if (cp == UnicodeSetIterator.IS_STRING) {
            break;
        }

        int mapping = trie.get(cp);
        if (mapping == Collation.FALLBACK_CE32) {
            // No mapping in this builder yet: look at what the base data provides.
            int baseCE32 = base.getFinalCE32(base.getCE32(cp));
            if (Collation.ce32HasContext(baseCE32)) {
                int copied = copyFromBaseCE32(cp, baseCE32, false /* without context */);
                trie.set(cp, copied);
            }
        } else if (isBuilderContextCE32(mapping)) {
            // The list of ConditionalCE32 is simply abandoned here.
            // The caller will copy this builder in the end,
            // which eliminates the unreachable data.
            int plain = getConditionalCE32ForCE32(mapping).ce32;
            trie.set(cp, plain);
            contextChars.remove(cp);
        }
    }
    modified = true;
}
 
开发者ID:abhijitvalluri,项目名称:fitnotifications,代码行数:24,代码来源:CollationDataBuilder.java

示例7: setDigitTags

import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * Re-tags the mappings of all decimal digits ({@code [:Nd:]}).
 *
 * <p>For each digit whose trie value is neither the fallback nor the unassigned CE32,
 * the current CE32 is stored via {@code addCE32} and the trie entry is replaced by a
 * DIGIT_TAG CE32 that carries the stored index and the digit's value.
 *
 * @throws IndexOutOfBoundsException if the stored index exceeds {@code Collation.MAX_INDEX}
 */
protected void setDigitTags() {
    UnicodeSetIterator it = new UnicodeSetIterator(new UnicodeSet("[:Nd:]"));
    while (it.next()) {
        assert(it.codepoint != UnicodeSetIterator.IS_STRING);
        int digit = it.codepoint;
        int current = trie.get(digit);
        if (current == Collation.FALLBACK_CE32 || current == Collation.UNASSIGNED_CE32) {
            continue;
        }

        int index = addCE32(current);
        if (index > Collation.MAX_INDEX) {
            // BufferOverflowException would be a better fit,
            // but it cannot be constructed with a message string.
            throw new IndexOutOfBoundsException("too many mappings");
        }

        int tagged = Collation.makeCE32FromTagIndexAndLength(
                Collation.DIGIT_TAG, index, UCharacter.digit(digit));  // u_charDigitValue(c)
        trie.set(digit, tagged);
    }
}
 
开发者ID:abhijitvalluri,项目名称:fitnotifications,代码行数:21,代码来源:CollationDataBuilder.java

示例8: run

import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * Annotates {@code row} once for every configured unicode set that contains at least one
 * character of {@code value}. Whitespace and digit characters are skipped.
 *
 * @param value         the string whose characters are inspected
 * @param row           the row to annotate
 * @param distinctCount the count passed on to the annotation factory
 */
public synchronized void run(final String value, final InputRow row, final int distinctCount) {
    final List<Entry<String, UnicodeSet>> pending = new ArrayList<>(_unicodeSets.entrySet());
    final CharIterator chars = new CharIterator(value);
    while (chars.hasNext()) {
        final Character ch = chars.next();
        if (chars.isWhitespace() || chars.isDigit()) {
            logger.debug("Skipping whitespace/digit char: {}", ch);
            continue;
        }

        for (final Iterator<Entry<String, UnicodeSet>> it = pending.iterator(); it.hasNext();) {
            final Entry<String, UnicodeSet> candidate = it.next();
            if (!candidate.getValue().contains(ch)) {
                continue;
            }
            final RowAnnotation annotation = _annotations.get(candidate.getKey());
            _annotationFactory.annotate(row, distinctCount, annotation);

            // A set that has matched once is dropped from the remaining
            // checks on this value.
            it.remove();
        }
    }
}
 
开发者ID:datacleaner,项目名称:DataCleaner,代码行数:26,代码来源:CharacterSetDistributionAnalyzerColumnDelegate.java

示例9: testCreateFilters

import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * Checks the names of the sets returned by
 * {@code CharacterSetDistributionAnalyzer.createUnicodeSets()} and spot-checks the
 * membership of the "Arabic", "Latin, ASCII" and "Latin, non-ASCII" sets.
 */
public void testCreateFilters() throws Exception {
    final Map<String, UnicodeSet> setsByName = CharacterSetDistributionAnalyzer.createUnicodeSets();
    assertEquals(CHARSET_NAMES, setsByName.keySet().toString());

    final UnicodeSet arabic = setsByName.get("Arabic");
    assertFalse(arabic.contains('a'));
    assertTrue(arabic.containsAll("البيانات"));

    final UnicodeSet asciiLatin = setsByName.get("Latin, ASCII");
    assertTrue(asciiLatin.contains('a'));
    assertTrue(asciiLatin.contains('z'));
    assertFalse(asciiLatin.contains('ä'));
    assertFalse(asciiLatin.contains('æ'));

    final UnicodeSet nonAsciiLatin = setsByName.get("Latin, non-ASCII");
    assertFalse(nonAsciiLatin.contains('a'));
    assertFalse(nonAsciiLatin.contains('z'));
    assertTrue(nonAsciiLatin.contains('ä'));
    assertTrue(nonAsciiLatin.contains('æ'));
}
 
开发者ID:datacleaner,项目名称:DataCleaner,代码行数:22,代码来源:CharacterSetDistributionAnalyzerTest.java

示例10: run

import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * Annotates {@code row} once for every configured unicode set that contains at least one
 * character of {@code value}. Whitespace and digit characters are skipped.
 *
 * @param value         the string whose characters are inspected
 * @param row           the row to annotate
 * @param distinctCount the count passed on to the annotation factory
 */
public synchronized void run(String value, InputRow row, int distinctCount) {
	final List<Entry<String, UnicodeSet>> pending = new ArrayList<Entry<String, UnicodeSet>>(
			_unicodeSets.entrySet());
	CharIterator chars = new CharIterator(value);
	while (chars.hasNext()) {
		Character ch = chars.next();
		if (chars.isWhitespace() || chars.isDigit()) {
			logger.debug("Skipping whitespace/digit char: {}", ch);
			continue;
		}

		for (Iterator<Entry<String, UnicodeSet>> it = pending.iterator(); it.hasNext();) {
			Entry<String, UnicodeSet> candidate = it.next();
			if (!candidate.getValue().contains(ch)) {
				continue;
			}
			RowAnnotation annotation = _annotations.get(candidate.getKey());
			_annotationFactory.annotate(row, distinctCount, annotation);

			// A set that has matched once is dropped from the remaining
			// checks on this value.
			it.remove();
		}
	}
}
 
开发者ID:datacleaner,项目名称:AnalyzerBeans,代码行数:27,代码来源:CharacterSetDistributionAnalyzerColumnDelegate.java

示例11: testCreateFilters

import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * Checks the names of the sets returned by
 * {@code CharacterSetDistributionAnalyzer.createUnicodeSets()} and spot-checks the
 * membership of the "Arabic", "Latin, ASCII" and "Latin, non-ASCII" sets.
 */
public void testCreateFilters() throws Exception {
	Map<String, UnicodeSet> setsByName = CharacterSetDistributionAnalyzer.createUnicodeSets();
	assertEquals(CHARSET_NAMES, setsByName.keySet().toString());

	UnicodeSet arabic = setsByName.get("Arabic");
	assertFalse(arabic.contains('a'));
	assertTrue(arabic.containsAll("البيانات"));

	UnicodeSet asciiLatin = setsByName.get("Latin, ASCII");
	assertTrue(asciiLatin.contains('a'));
	assertTrue(asciiLatin.contains('z'));
	assertFalse(asciiLatin.contains('ä'));
	assertFalse(asciiLatin.contains('æ'));

	UnicodeSet nonAsciiLatin = setsByName.get("Latin, non-ASCII");
	assertFalse(nonAsciiLatin.contains('a'));
	assertFalse(nonAsciiLatin.contains('z'));
	assertTrue(nonAsciiLatin.contains('ä'));
	assertTrue(nonAsciiLatin.contains('æ'));
}
 
开发者ID:datacleaner,项目名称:AnalyzerBeans,代码行数:22,代码来源:CharacterSetDistributionAnalyzerTest.java

示例12: dovec

import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * dovec - fills in arcs from {@code lp} to {@code rp} for each range of the given set.
 * All kinds of MCCE complexity have been removed.
 *
 * @param set the code point ranges to add arcs for
 * @param lp  the state the arcs start from
 * @param rp  the state the arcs lead to
 * @throws RegexException declared by this method; which callee raises it is not visible here
 */
private void dovec(UnicodeSet set, State lp, State rp) throws RegexException {

    final int total = set.getRangeCount();
    for (int index = 0; index < total; index++) {
        final int first = set.getRangeStart(index);
        final int last = set.getRangeEnd(index);

        // Note: ICU operates in UTF-32 here, and the ColorMap is happy to play along.
        if (LOG.isDebugEnabled() && IS_DEBUG) {
            LOG.debug(String.format("%s %d %4x %4x", set, index, first, last));
        }

        //TODO: this arc is probably redundant.
        final boolean singleCodePoint = first == last;
        if (singleCodePoint) {
            nfa.newarc(PLAIN, cm.subcolor(first), lp, rp);
        }
        cm.subrange(first, last, lp, rp);
    }
}
 
开发者ID:basis-technology-corp,项目名称:tcl-regex-java,代码行数:24,代码来源:Compiler.java

示例13: expandSingleRule

import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * Expands one rule into mapping lines appended to {@code builder}.
 *
 * <p>The left-hand side is parsed as a UnicodeSet pattern. If the right-hand side matches
 * {@code NUMERIC_VALUE_PATTERN}, one line is written per code point, mapping it to
 * {@code 0x30} plus its numeric value and followed by the character name as a comment.
 * Otherwise one line is written per range, mapping it to the right-hand side verbatim.
 * String elements of the set are reported through the logger and skipped.
 *
 * @param builder       receives the generated lines
 * @param leftHandSide  UnicodeSet pattern selecting the source code points
 * @param rightHandSide the mapping target, or a numeric-value marker
 */
private static void expandSingleRule(StringBuilder builder, String leftHandSide, String rightHandSide) {
    UnicodeSet sources = new UnicodeSet(leftHandSide, UnicodeSet.IGNORE_SPACE);
    boolean mapToNumericValue = NUMERIC_VALUE_PATTERN.matcher(rightHandSide).matches();

    UnicodeSetIterator ranges = new UnicodeSetIterator(sources);
    while (ranges.nextRange()) {
        if (ranges.codepoint == UnicodeSetIterator.IS_STRING) {
            logger.error("ERROR: String '" + ranges.getString() + "' found in UnicodeSet");
            continue;
        }

        if (mapToNumericValue) {
            // One line per code point: "XXXX>YYYY   # NAME"
            for (int cp = ranges.codepoint; cp <= ranges.codepointEnd; ++cp) {
                builder.append(String.format(Locale.ROOT, "%04X", cp))
                        .append('>')
                        .append(String.format(Locale.ROOT, "%04X", 0x30 + UCharacter.getNumericValue(cp)))
                        .append("   # ")
                        .append(UCharacter.getName(cp))
                        .append("\n");
            }
            continue;
        }

        // One line per range: "XXXX[..YYYY]>rightHandSide"
        builder.append(String.format(Locale.ROOT, "%04X", ranges.codepoint));
        if (ranges.codepointEnd > ranges.codepoint) {
            builder.append("..").append(String.format(Locale.ROOT, "%04X", ranges.codepointEnd));
        }
        builder.append('>').append(rightHandSide).append("\n");
    }
}
 
开发者ID:jprante,项目名称:elasticsearch-plugin-bundle,代码行数:25,代码来源:UTR30DataFileGenerator.java

示例14: StringTokenizer

import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * {@icu} Creates a string tokenizer over the given string, using every character
 * of {@code delim} as a token delimiter.
 * <p>With {@code returndelims} false, delimiter characters are skipped and act
 * purely as separators between tokens.
 * <p>With {@code returndelims} true, delimiter characters are returned as tokens
 * too. In that case {@code coalescedelims} decides whether each run of delimiter
 * characters yields a single token (true) or one token per delimiter (false).
 * Because surrogate pairs can be delimiters, a returned token may be two chars long.
 * @param str a string to be parsed.
 * @param delim the delimiters; when null, {@code EMPTY_DELIMITER_} is used instead.
 * @param returndelims flag indicating whether to return the delimiters
 *        as tokens.
 * @param coalescedelims flag indicating whether to return a run of
 *        delimiters as a single token or as one token per delimiter.
 *        This only takes effect if returndelims is true.
 * @exception NullPointerException if str is null
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public StringTokenizer(String str, UnicodeSet delim, boolean returndelims, boolean coalescedelims)
{
    m_source_ = str;
    m_length_ = str.length();
    m_delimiters_ = (delim != null) ? delim : EMPTY_DELIMITER_;
    m_returnDelimiters_ = returndelims;
    m_coalesceDelimiters_ = coalescedelims;
    m_tokenOffset_ = -1;
    m_tokenSize_ = -1;

    if (m_length_ != 0) {
        m_nextOffset_ = 0;
        if (!returndelims) {
            // Delimiters are not returned, so start at the first non-delimiter.
            m_nextOffset_ = getNextNonDelimiter(0);
        }
    }
    else {
        // string length 0, no tokens
        m_nextOffset_ = -1;
    }
}
 
开发者ID:abhijitvalluri,项目名称:fitnotifications,代码行数:49,代码来源:StringTokenizer.java

示例15: getExemplarSet

import com.ibm.icu.text.UnicodeSet; //导入依赖的package包/类
/**
 * Returns the set of exemplar characters for a locale.
 *
 * @param options   Bitmask for options to apply to the exemplar pattern.
 *                  Specify zero to retrieve the exemplar set as it is
 *                  defined in the locale data.  Specify
 *                  UnicodeSet.CASE to retrieve a case-folded exemplar
 *                  set.  See {@link UnicodeSet#applyPattern(String,
 *                  int)} for a complete list of valid options.  The
 *                  IGNORE_SPACE bit is always set, regardless of the
 *                  value of 'options'.
 * @param extype    The type of exemplar set to be retrieved,
 *                  ES_STANDARD, ES_INDEX, ES_AUXILIARY, or ES_PUNCTUATION
 * @return          The set of exemplar characters for the given locale.
 *                  If there is nothing available for the locale,
 *                  then null is returned if {@link #getNoSubstitute()} is true, otherwise the
 *                  root value is returned (which may be UnicodeSet.EMPTY).
 * @exception       IllegalArgumentException if the extype is invalid.
 * @stable ICU 3.4
 */
public UnicodeSet getExemplarSet(int options, int extype) {
    // Resource keys, indexed by the exemplar set type constant.
    String [] exemplarSetTypes = {
            "ExemplarCharacters",
            "AuxExemplarCharacters",
            "ExemplarCharactersIndex",
            "ExemplarCharactersCurrency",
            "ExemplarCharactersPunctuation"
    };

    if (extype == ES_CURRENCY) {
        // currency symbol exemplar is no longer available
        return noSubstitute ? null : UnicodeSet.EMPTY;
    }

    // Validate the type explicitly rather than relying on an
    // ArrayIndexOutOfBoundsException from the lookup: that way an unrelated
    // index error raised while loading or parsing the data can no longer be
    // misreported as an invalid argument.
    if (extype < 0 || extype >= exemplarSetTypes.length) {
        throw new IllegalArgumentException("Invalid exemplar set type: " + extype);
    }

    try {
        final String aKey = exemplarSetTypes[extype];
        ICUResourceBundle stringBundle = (ICUResourceBundle) bundle.get(aKey);

        // The data was inherited from root although this locale is not root:
        // with noSubstitute that counts as "nothing available".
        if (noSubstitute && !bundle.isRoot() && stringBundle.isRoot()) {
            return null;
        }
        String unicodeSetPattern = stringBundle.getString();
        return new UnicodeSet(unicodeSetPattern, UnicodeSet.IGNORE_SPACE | options);
    } catch (Exception ex) {
        // Deliberately best-effort: any failure to load or parse the exemplar
        // data is reported as "nothing available" instead of propagating.
        return noSubstitute ? null : UnicodeSet.EMPTY;
    }
}
 
开发者ID:abhijitvalluri,项目名称:fitnotifications,代码行数:50,代码来源:LocaleData.java


注:本文中的com.ibm.icu.text.UnicodeSet类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。