当前位置: 首页>>代码示例>>Java>>正文


Java UnicodeUtil类代码示例

本文整理汇总了Java中org.apache.lucene.util.UnicodeUtil的典型用法代码示例。如果您正苦于以下问题:Java UnicodeUtil类的具体用法?Java UnicodeUtil怎么用?Java UnicodeUtil使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。


UnicodeUtil类属于org.apache.lucene.util包,在下文中一共展示了UnicodeUtil类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: testSortMetaField

import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
public void testSortMetaField() throws Exception {
    createIndex("test");
    ensureGreen();

    // Index a random number of empty documents with sequential string ids.
    final int docCount = randomIntBetween(10, 20);
    IndexRequestBuilder[] requests = new IndexRequestBuilder[docCount];
    for (int doc = 0; doc < docCount; ++doc) {
        requests[doc] = client().prepareIndex("test", "type", Integer.toString(doc))
                .setSource();
    }
    indexRandom(true, requests);

    // Search everything, sorted on the synthetic _uid meta field in a random direction.
    SortOrder direction = randomFrom(SortOrder.values());
    SearchResponse response = client().prepareSearch()
            .setQuery(matchAllQuery())
            .setSize(randomIntBetween(1, docCount + 5))
            .addSort("_uid", direction)
            .execute().actionGet();
    assertNoFailures(response);

    // Verify that hits come back strictly ordered by uid. Seed the comparison with
    // the smallest (empty) or largest (BIG_TERM) possible term, per direction.
    BytesRef last = direction == SortOrder.ASC ? new BytesRef() : UnicodeUtil.BIG_TERM;
    for (SearchHit hit : response.getHits().getHits()) {
        final BytesRef uid = new BytesRef(Uid.createUid(hit.getType(), hit.getId()));
        assertThat(last, direction == SortOrder.ASC ? lessThan(uid) : greaterThan(uid));
        last = uid;
    }
}
 
开发者ID:justor,项目名称:elasticsearch_my,代码行数:27,代码来源:FieldSortIT.java

示例2: initAutomata

import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
/** Initialize Levenshtein DFAs up to {@code maxDistance}, if possible. */
private List<CompiledAutomaton> initAutomata(int maxDistance) {
  final List<CompiledAutomaton> cached = dfaAtt.automata();
  // Nothing to build when enough DFAs are already cached, or when the requested
  // distance exceeds what LevenshteinAutomata supports.
  if (cached.size() > maxDistance
      || maxDistance > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
    return cached;
  }
  // Build DFAs over the term suffix past the constant prefix, anchored on the prefix.
  final String suffix =
      UnicodeUtil.newString(termText, realPrefixLength, termText.length - realPrefixLength);
  final LevenshteinAutomata factory = new LevenshteinAutomata(suffix, transpositions);
  final String prefix = UnicodeUtil.newString(termText, 0, realPrefixLength);
  for (int distance = cached.size(); distance <= maxDistance; distance++) {
    cached.add(new CompiledAutomaton(factory.toAutomaton(distance, prefix), true, false));
  }
  return cached;
}
 
开发者ID:lamsfoundation,项目名称:lams,代码行数:19,代码来源:FuzzyTermsEnum.java

示例3: build

import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
/**
 * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing
 * strings in UTF-8. These strings must be binary-sorted.
 */
public static Automaton build(Collection<BytesRef> input) {
  final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();

  // Reusable UTF-16 conversion buffer, grown as needed for each input term.
  char[] buffer = new char[0];
  final CharsRef scratch = new CharsRef();
  for (BytesRef term : input) {
    buffer = ArrayUtil.grow(buffer, term.length);
    scratch.chars = buffer;
    scratch.length = UnicodeUtil.UTF8toUTF16(term, buffer);
    builder.add(scratch);
  }

  // Convert the completed intermediate state machine into a Lucene Automaton.
  final Automaton.Builder result = new Automaton.Builder();
  convert(result,
      builder.complete(),
      new IdentityHashMap<State,Integer>());
  return result.finish();
}
 
开发者ID:lamsfoundation,项目名称:lams,代码行数:25,代码来源:DaciukMihovAutomatonBuilder.java

示例4: evaluate

import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
@Override
public BytesRef evaluate(Input<Object>... args) {
    // SQL semantics: NULL input yields NULL output.
    final Object value = args[0].value();
    if (value == null) {
        return null;
    }

    final BytesRef utf8 = BytesRefs.toBytesRef(value);

    // Decode UTF-8 -> UTF-16 (char count is never larger than the byte count),
    // lowercase in place, then re-encode to UTF-8.
    final char[] chars = new char[utf8.length];
    final int charLen = UnicodeUtil.UTF8toUTF16(utf8.bytes, utf8.offset, utf8.length, chars);
    charUtils.toLowerCase(chars, 0, charLen);

    // Worst-case UTF-8 size per char bounds the output buffer.
    final byte[] encoded = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * charLen];
    final int byteLen = UnicodeUtil.UTF16toUTF8(chars, 0, charLen, encoded);
    return new BytesRef(encoded, 0, byteLen);
}
 
开发者ID:baidu,项目名称:Elasticsearch,代码行数:18,代码来源:LowerFunction.java

示例5: evaluate

import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
@Override
public BytesRef evaluate(Input<Object>... args) {
    // SQL semantics: NULL input yields NULL output.
    final Object value = args[0].value();
    if (value == null) {
        return null;
    }

    final BytesRef utf8 = BytesRefs.toBytesRef(value);

    // Decode UTF-8 -> UTF-16 (char count is never larger than the byte count),
    // uppercase in place, then re-encode to UTF-8.
    final char[] chars = new char[utf8.length];
    final int charLen = UnicodeUtil.UTF8toUTF16(utf8.bytes, utf8.offset, utf8.length, chars);
    charUtils.toUpperCase(chars, 0, charLen);

    // Worst-case UTF-8 size per char bounds the output buffer.
    final byte[] encoded = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * charLen];
    final int byteLen = UnicodeUtil.UTF16toUTF8(chars, 0, charLen, encoded);
    return new BytesRef(encoded, 0, byteLen);
}
 
开发者ID:baidu,项目名称:Elasticsearch,代码行数:18,代码来源:UpperFunction.java

示例6: decrementKey

import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
@Override
public TermDocIndexKey decrementKey(TermDocIndexKey previousKey) {
  // Walk backwards from previousKey: first exhaust earlier docIds within the
  // current term, then step to the previous term, restarting the doc scan from
  // the maximal sentinel (BIG_TERM), until an accepted doc is found.
  int termIndex = previousKey.index;
  BytesRef docId = previousKey.docId;
  do {
    while ((docId = decrementDocId(termIndex, docId)) != null) {
      int docIndex = acceptDoc(termIndex, docId);
      if (docIndex >= 0) {
        // Found an acceptable doc: record its local index and publish the new key.
        localDocIndex = docIndex;
        return termDocIndexKey = new TermDocIndexKey(termIndex, docId);
      }
    }
    // Docs for this term are exhausted; reset docId to the upper-bound sentinel so
    // the previous term's scan starts from its last doc.
    docId = UnicodeUtil.BIG_TERM;
  } while ((termIndex = decrementTermIndex(termIndex)) >= 0);
  // No earlier key exists: clear cursor state and signal end of iteration.
  localDocIndex = -1;
  return termDocIndexKey = null;
}
 
开发者ID:upenn-libraries,项目名称:solrplugins,代码行数:18,代码来源:DocBasedFacetResponseBuilder.java

示例7: targetKeyInit

import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
@Override
public TermDocIndexKey targetKeyInit(boolean ascending) throws IOException {
  final int termIndex = getTargetKeyIndexInit(ascending);
  if (termIndex < 0) {
    return null;
  }
  // Choose the starting doc bound depending on whether the resolved term index
  // lies after, before, or exactly at the raw target index.
  final int rawIndex = getTargetKeyIndex();
  final BytesRef startDoc;
  if (rawIndex < termIndex) {
    startDoc = null;
  } else if (rawIndex > termIndex) {
    startDoc = UnicodeUtil.BIG_TERM;
  } else {
    startDoc = targetDoc;
  }
  final TermDocIndexKey candidate = new TermDocIndexKey(termIndex, startDoc);
  final int docIndex = acceptDoc(termIndex, startDoc);
  if (docIndex >= 0) {
    localDocIndex = docIndex;
    return termDocIndexKey = candidate;
  }
  // Candidate rejected: advance in the requested direction to the nearest valid key.
  return ascending ? incrementKey(candidate) : decrementKey(candidate);
}
 
开发者ID:upenn-libraries,项目名称:solrplugins,代码行数:25,代码来源:DocBasedFacetResponseBuilder.java

示例8: testSurrogates2

import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
/** Random test ensuring we don't ever split supplementary characters. */
public void testSurrogates2() throws IOException {
  final int iterations = atLeast(10000);
  for (int iter = 0; iter < iterations; iter++) {
    if (VERBOSE) {
      System.out.println("\nTEST: iter=" + iter);
    }
    final String input = TestUtil.randomUnicodeString(random(), 100);
    final TokenStream stream = analyzer.tokenStream("foo", input);
    try {
      final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        // Every emitted token must be well-formed UTF-16 (no orphan surrogates).
        assertTrue(UnicodeUtil.validUTF16String(term));
      }
      stream.end();
    } finally {
      IOUtils.closeWhileHandlingException(stream);
    }
  }
}
 
开发者ID:europeana,项目名称:search,代码行数:22,代码来源:TestJapaneseTokenizer.java

示例9: testSurrogates2

import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
/** Random test ensuring we don't ever split supplementary characters. */
public void testSurrogates2() throws IOException {
  final int iterations = atLeast(1000);
  for (int iter = 0; iter < iterations; iter++) {
    final String input = TestUtil.randomUnicodeString(random(), 100);
    final TokenStream stream = analyzer.tokenStream("foo", input);
    try {
      final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        // Every emitted token must be well-formed UTF-16 (no orphan surrogates).
        assertTrue(UnicodeUtil.validUTF16String(term));
      }
      stream.end();
    } finally {
      IOUtils.closeWhileHandlingException(stream);
    }
  }
}
 
开发者ID:europeana,项目名称:search,代码行数:19,代码来源:TestExtendedMode.java

示例10: setUp

import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
@Override
public void setUp() throws Exception {
  super.setUp();
  // Populate a NormalizeCharMap with mappings that shrink, keep, and grow the input.
  NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();

  // Shrinking mappings.
  builder.add( "aa", "a" );
  builder.add( "bbb", "b" );
  builder.add( "cccc", "cc" );

  // Same-length and growing mappings.
  builder.add( "h", "i" );
  builder.add( "j", "jj" );
  builder.add( "k", "kkk" );
  builder.add( "ll", "llll" );

  // Mapping to the empty string removes the input entirely.
  builder.add( "empty", "" );

  // Supplementary code point (encoded as a surrogate pair in UTF-16):
  // U+1D122 — NOTE(review): the original comment said "BMP", but code points
  // above U+FFFF lie outside the Basic Multilingual Plane.
  builder.add(UnicodeUtil.newString(new int[] {0x1D122}, 0, 1), "fclef");

  builder.add("\uff01", "full-width-exclamation");

  normMap = builder.build();
}
 
开发者ID:europeana,项目名称:search,代码行数:24,代码来源:TestMappingCharFilter.java

示例11: build

import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
@Override
public void build(TermFreqIterator tfit) throws IOException {
  // The trie does not need sorted input; strip any sort order so insertion
  // order is shuffled. Note this may itself trigger another sorted iteration.
  if (tfit.getComparator() != null) {
    tfit = new UnsortedTermFreqIteratorWrapper(tfit);
  }
  trie = new JaspellTernarySearchTrie();
  trie.setMatchAlmostDiff(editDistance);

  // Reusable UTF-16 scratch buffer for key conversion.
  final CharsRef utf16 = new CharsRef();
  BytesRef term;
  while ((term = tfit.next()) != null) {
    final long weight = tfit.weight();
    if (term.length == 0) {
      continue; // empty terms cannot be stored in the trie
    }
    utf16.grow(term.length);
    UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
    trie.put(utf16.toString(), Long.valueOf(weight));
  }
}
 
开发者ID:pkarmstr,项目名称:NYBC,代码行数:23,代码来源:JaspellLookup.java

示例12: build

import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
@Override
public void build(TermFreqIterator tfit) throws IOException {
  root = new TernaryTreeNode();
  // The balanced-tree construction below requires terms in UTF-16 sort order;
  // wrap the iterator with a sorting adapter when that is not already the case.
  if (tfit.getComparator() != BytesRef.getUTF8SortedAsUTF16Comparator()) {
    tfit = new SortedTermFreqIteratorWrapper(tfit, BytesRef.getUTF8SortedAsUTF16Comparator());
  }

  // Buffer all terms and their weights up front.
  final ArrayList<String> terms = new ArrayList<String>();
  final ArrayList<Number> weights = new ArrayList<Number>();
  final CharsRef utf16 = new CharsRef();
  BytesRef term;
  while ((term = tfit.next()) != null) {
    utf16.grow(term.length);
    UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
    terms.add(utf16.toString());
    weights.add(Long.valueOf(tfit.weight()));
  }
  autocomplete.balancedTree(terms.toArray(), weights.toArray(), 0, terms.size() - 1, root);
}
 
开发者ID:pkarmstr,项目名称:NYBC,代码行数:22,代码来源:TSTLookup.java

示例13: testSurrogates2

import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
/** Random test ensuring we don't ever split supplementary characters. */
public void testSurrogates2() throws IOException {
  int numIterations = atLeast(10000);
  for (int i = 0; i < numIterations; i++) {
    String s = _TestUtil.randomUnicodeString(random(), 100);
    TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
    try {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // Every emitted token must be well-formed UTF-16 (no orphan surrogates).
        assertTrue(UnicodeUtil.validUTF16String(termAtt));
      }
      // Honor the TokenStream contract: end() after full consumption.
      ts.end();
    } finally {
      // BUG FIX: the stream was never closed before, leaking the reader and
      // violating the reset()/incrementToken()/end()/close() workflow.
      ts.close();
    }
  }
}
 
开发者ID:pkarmstr,项目名称:NYBC,代码行数:17,代码来源:TestJapaneseTokenizer.java

示例14: initAutomata

import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
/** Initialize Levenshtein DFAs up to {@code maxDistance}, if possible. */
private List<CompiledAutomaton> initAutomata(int maxDistance) {
  final List<CompiledAutomaton> cached = dfaAtt.automata();
  // Nothing to build when enough DFAs are already cached, or when the requested
  // distance exceeds what LevenshteinAutomata supports.
  if (cached.size() > maxDistance
      || maxDistance > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
    return cached;
  }
  // Build DFAs over the term suffix past the constant prefix.
  final LevenshteinAutomata factory = new LevenshteinAutomata(
      UnicodeUtil.newString(termText, realPrefixLength, termText.length - realPrefixLength),
      transpositions);
  for (int distance = cached.size(); distance <= maxDistance; distance++) {
    Automaton dfa = factory.toAutomaton(distance);
    // Prepend the constant prefix, if any, via concatenation.
    if (realPrefixLength > 0) {
      Automaton prefix = BasicAutomata.makeString(
        UnicodeUtil.newString(termText, 0, realPrefixLength));
      dfa = BasicOperations.concatenate(prefix, dfa);
    }
    cached.add(new CompiledAutomaton(dfa, true, false));
  }
  return cached;
}
 
开发者ID:yintaoxue,项目名称:read-open-source-code,代码行数:24,代码来源:FuzzyTermsEnum.java

示例15: build

import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
/**
 * Returns an {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter}
 * @return an {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter}
 * @throws IOException if an {@link IOException} occurs;
 */
public StemmerOverrideMap build() throws IOException {
  final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
  final org.apache.lucene.util.fst.Builder<BytesRef> fstBuilder =
      new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);
  // Feed the FST in unicode sort order, converting each key to UTF-32 code points.
  final int[] sortedIds = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
  final IntsRef codePoints = new IntsRef();
  final int count = hash.size();
  for (int i = 0; i < count; i++) {
    final int id = sortedIds[i];
    final BytesRef key = hash.get(id, spare);
    UnicodeUtil.UTF8toUTF32(key, codePoints);
    fstBuilder.add(codePoints, new BytesRef(outputValues.get(id)));
  }
  return new StemmerOverrideMap(fstBuilder.finish(), ignoreCase);
}
 
开发者ID:yintaoxue,项目名称:read-open-source-code,代码行数:21,代码来源:StemmerOverrideFilter.java


注:本文中的org.apache.lucene.util.UnicodeUtil类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。