本文整理汇总了Java中org.apache.lucene.util.UnicodeUtil类的典型用法代码示例。如果您正苦于以下问题:Java UnicodeUtil类的具体用法?Java UnicodeUtil怎么用?Java UnicodeUtil使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。
UnicodeUtil类属于org.apache.lucene.util包,在下文中一共展示了UnicodeUtil类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: testSortMetaField
import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
/** Verifies that hits come back correctly ordered when sorting on the _uid meta field. */
public void testSortMetaField() throws Exception {
    createIndex("test");
    ensureGreen();
    // Index a random handful of empty documents with sequential ids.
    final int docCount = randomIntBetween(10, 20);
    final IndexRequestBuilder[] requests = new IndexRequestBuilder[docCount];
    for (int doc = 0; doc < docCount; doc++) {
        requests[doc] = client().prepareIndex("test", "type", Integer.toString(doc))
                .setSource();
    }
    indexRandom(true, requests);
    final SortOrder order = randomFrom(SortOrder.values());
    final SearchResponse response = client().prepareSearch()
            .setQuery(matchAllQuery())
            .setSize(randomIntBetween(1, docCount + 5))
            .addSort("_uid", order)
            .execute().actionGet();
    assertNoFailures(response);
    // Walk the hits and check each _uid is strictly beyond its predecessor in the
    // requested direction, seeding with the smallest / largest possible term.
    BytesRef last = order == SortOrder.ASC ? new BytesRef() : UnicodeUtil.BIG_TERM;
    for (SearchHit hit : response.getHits().getHits()) {
        final BytesRef uid = new BytesRef(Uid.createUid(hit.getType(), hit.getId()));
        assertThat(last, order == SortOrder.ASC ? lessThan(uid) : greaterThan(uid));
        last = uid;
    }
}
示例2: initAutomata
import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
/** initialize levenshtein DFAs up to maxDistance, if possible */
private List<CompiledAutomaton> initAutomata(int maxDistance) {
    final List<CompiledAutomaton> cached = dfaAtt.automata();
    //System.out.println("cached automata size: " + cached.size());
    // Nothing to do when the cache already covers maxDistance, or the distance is unsupported.
    if (cached.size() > maxDistance
            || maxDistance > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
        return cached;
    }
    final String suffix =
            UnicodeUtil.newString(termText, realPrefixLength, termText.length - realPrefixLength);
    final LevenshteinAutomata factory = new LevenshteinAutomata(suffix, transpositions);
    final String prefix = UnicodeUtil.newString(termText, 0, realPrefixLength);
    // Extend the cache with one compiled DFA per missing edit distance.
    for (int distance = cached.size(); distance <= maxDistance; distance++) {
        final Automaton dfa = factory.toAutomaton(distance, prefix);
        //System.out.println("compute automaton n=" + distance);
        cached.add(new CompiledAutomaton(dfa, true, false));
    }
    return cached;
}
示例3: build
import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
/**
 * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing
 * strings in UTF-8. These strings must be binary-sorted.
 */
public static Automaton build(Collection<BytesRef> input) {
    final DaciukMihovAutomatonBuilder minimizer = new DaciukMihovAutomatonBuilder();
    // A single UTF-16 conversion buffer, grown on demand and shared across terms.
    final CharsRef scratch = new CharsRef();
    char[] buffer = new char[0];
    for (BytesRef utf8 : input) {
        buffer = ArrayUtil.grow(buffer, utf8.length);
        scratch.chars = buffer;
        scratch.length = UnicodeUtil.UTF8toUTF16(utf8, buffer);
        minimizer.add(scratch);
    }
    final Automaton.Builder result = new Automaton.Builder();
    convert(result, minimizer.complete(), new IdentityHashMap<State,Integer>());
    return result.finish();
}
示例4: evaluate
import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
/**
 * Lower-cases the first argument's string value.
 *
 * @return the lower-cased value as UTF-8 bytes, or {@code null} when the input is null
 */
@Override
public BytesRef evaluate(Input<Object>... args) {
    final Object value = args[0].value();
    if (value == null) {
        return null;
    }
    final BytesRef utf8 = BytesRefs.toBytesRef(value);
    // Decode UTF-8 -> UTF-16, fold case in place, then re-encode to UTF-8.
    final char[] utf16 = new char[utf8.length];
    final int charLen = UnicodeUtil.UTF8toUTF16(utf8.bytes, utf8.offset, utf8.length, utf16);
    charUtils.toLowerCase(utf16, 0, charLen);
    final byte[] encoded = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * charLen];
    final int byteLen = UnicodeUtil.UTF16toUTF8(utf16, 0, charLen, encoded);
    return new BytesRef(encoded, 0, byteLen);
}
示例5: evaluate
import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
/**
 * Upper-cases the first argument's string value.
 *
 * @return the upper-cased value as UTF-8 bytes, or {@code null} when the input is null
 */
@Override
public BytesRef evaluate(Input<Object>... args) {
    final Object value = args[0].value();
    if (value == null) {
        return null;
    }
    final BytesRef utf8 = BytesRefs.toBytesRef(value);
    // Decode UTF-8 -> UTF-16, fold case in place, then re-encode to UTF-8.
    final char[] utf16 = new char[utf8.length];
    final int charLen = UnicodeUtil.UTF8toUTF16(utf8.bytes, utf8.offset, utf8.length, utf16);
    charUtils.toUpperCase(utf16, 0, charLen);
    final byte[] encoded = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * charLen];
    final int byteLen = UnicodeUtil.UTF16toUTF8(utf16, 0, charLen, encoded);
    return new BytesRef(encoded, 0, byteLen);
}
示例6: decrementKey
import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
/**
 * Returns the greatest accepted key strictly before {@code previousKey}, or {@code null}
 * when none exists. Side effects: sets {@code localDocIndex} to the accepted doc's index
 * (or -1 on exhaustion) and caches the result in {@code termDocIndexKey}.
 */
@Override
public TermDocIndexKey decrementKey(TermDocIndexKey previousKey) {
    int termIndex = previousKey.index;
    BytesRef docId = previousKey.docId;
    do {
        // Walk doc ids backwards within the current term until one is accepted.
        while ((docId = decrementDocId(termIndex, docId)) != null) {
            int docIndex = acceptDoc(termIndex, docId);
            if (docIndex >= 0) {
                localDocIndex = docIndex;
                return termDocIndexKey = new TermDocIndexKey(termIndex, docId);
            }
        }
        // Term exhausted: restart from the maximum possible doc id for the previous term.
        docId = UnicodeUtil.BIG_TERM;
    } while ((termIndex = decrementTermIndex(termIndex)) >= 0);
    // No earlier accepted key exists.
    localDocIndex = -1;
    return termDocIndexKey = null;
}
示例7: targetKeyInit
import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
/**
 * Seeds iteration at the requested target position: resolves the starting term index for
 * the given direction, pairs it with an appropriate doc-id bound, and either accepts that
 * key directly or steps once in the iteration direction to reach the first valid key.
 * Side effect: sets {@code localDocIndex}/{@code termDocIndexKey} on direct acceptance.
 */
@Override
public TermDocIndexKey targetKeyInit(boolean ascending) throws IOException {
    int termIndex = getTargetKeyIndexInit(ascending);
    if (termIndex < 0) {
        // No term at or beyond the target in this direction.
        return null;
    }
    int rawTargetIdx = getTargetKeyIndex();
    BytesRef initTargetDoc = targetDoc;
    if (rawTargetIdx < termIndex) {
        // Landed past the raw target term: start from the lowest doc id bound.
        initTargetDoc = null;
    } else if (rawTargetIdx > termIndex) {
        // Landed before the raw target term: start from the highest possible doc id.
        initTargetDoc = UnicodeUtil.BIG_TERM;
    }
    TermDocIndexKey ret = new TermDocIndexKey(termIndex, initTargetDoc);
    int docIndex = acceptDoc(termIndex, initTargetDoc);
    if (docIndex >= 0) {
        localDocIndex = docIndex;
        return termDocIndexKey = ret;
    } else if (ascending) {
        // Target doc itself rejected; advance to the first accepted key in direction.
        return incrementKey(ret);
    } else {
        return decrementKey(ret);
    }
}
示例8: testSurrogates2
import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
/** random test ensuring we don't ever split supplementaries */
public void testSurrogates2() throws IOException {
    final int iterations = atLeast(10000);
    for (int iter = 0; iter < iterations; iter++) {
        if (VERBOSE) {
            System.out.println("\nTEST: iter=" + iter);
        }
        final String input = TestUtil.randomUnicodeString(random(), 100);
        final TokenStream stream = analyzer.tokenStream("foo", input);
        try {
            final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // Every emitted term must be well-formed UTF-16 (no lone surrogates).
                assertTrue(UnicodeUtil.validUTF16String(term));
            }
            stream.end();
        } finally {
            IOUtils.closeWhileHandlingException(stream);
        }
    }
}
示例9: testSurrogates2
import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
/** random test ensuring we don't ever split supplementaries */
public void testSurrogates2() throws IOException {
    final int iterations = atLeast(1000);
    for (int iter = 0; iter < iterations; iter++) {
        final String input = TestUtil.randomUnicodeString(random(), 100);
        final TokenStream stream = analyzer.tokenStream("foo", input);
        try {
            final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // Every emitted term must be well-formed UTF-16 (no lone surrogates).
                assertTrue(UnicodeUtil.validUTF16String(term));
            }
            stream.end();
        } finally {
            IOUtils.closeWhileHandlingException(stream);
        }
    }
}
示例10: setUp
import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
/**
 * Builds the normalization char map used by the tests: replacements that shrink, grow,
 * and delete, plus a supplementary codepoint and a full-width character.
 */
@Override
public void setUp() throws Exception {
    super.setUp();
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add( "aa", "a" );
    builder.add( "bbb", "b" );
    builder.add( "cccc", "cc" );
    builder.add( "h", "i" );
    builder.add( "j", "jj" );
    builder.add( "k", "kkk" );
    builder.add( "ll", "llll" );
    builder.add( "empty", "" );
    // non-BMP codepoint U+1D122 (MUSICAL SYMBOL F CLEF), encoded as a surrogate pair in UTF-16:
    builder.add(UnicodeUtil.newString(new int[] {0x1D122}, 0, 1), "fclef");
    // full-width exclamation mark:
    builder.add("\uff01", "full-width-exclamation");
    normMap = builder.build();
}
示例11: build
import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
/**
 * Feeds every (term, weight) pair from the iterator into a fresh Jaspell ternary
 * search trie, skipping empty terms.
 */
@Override
public void build(TermFreqIterator tfit) throws IOException {
    if (tfit.getComparator() != null) {
        // make sure it's unsorted
        // WTF - this could result in yet another sorted iteration....
        tfit = new UnsortedTermFreqIteratorWrapper(tfit);
    }
    trie = new JaspellTernarySearchTrie();
    trie.setMatchAlmostDiff(editDistance);
    final CharsRef utf16 = new CharsRef();
    for (BytesRef term = tfit.next(); term != null; term = tfit.next()) {
        // Read the weight before skipping, mirroring the iterator's next()/weight() pairing.
        final long weight = tfit.weight();
        if (term.length == 0) {
            continue;
        }
        utf16.grow(term.length);
        UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
        trie.put(utf16.toString(), Long.valueOf(weight));
    }
}
示例12: build
import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
/**
 * Buffers all (term, weight) pairs in UTF-16 sort order, then bulk-loads them into a
 * balanced ternary tree rooted at {@code root}.
 */
@Override
public void build(TermFreqIterator tfit) throws IOException {
    root = new TernaryTreeNode();
    // buffer first
    if (tfit.getComparator() != BytesRef.getUTF8SortedAsUTF16Comparator()) {
        // make sure it's sorted and the comparator uses UTF16 sort order
        tfit = new SortedTermFreqIteratorWrapper(tfit, BytesRef.getUTF8SortedAsUTF16Comparator());
    }
    final ArrayList<String> terms = new ArrayList<String>();
    final ArrayList<Number> weights = new ArrayList<Number>();
    final CharsRef utf16 = new CharsRef();
    for (BytesRef term = tfit.next(); term != null; term = tfit.next()) {
        utf16.grow(term.length);
        UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
        terms.add(utf16.toString());
        weights.add(Long.valueOf(tfit.weight()));
    }
    autocomplete.balancedTree(terms.toArray(), weights.toArray(), 0, terms.size() - 1, root);
}
示例13: testSurrogates2
import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
/** random test ensuring we don't ever split supplementaries */
public void testSurrogates2() throws IOException {
    int numIterations = atLeast(10000);
    for (int i = 0; i < numIterations; i++) {
        if (VERBOSE) {
            System.out.println("\nTEST: iter=" + i);
        }
        String s = _TestUtil.randomUnicodeString(random(), 100);
        TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
        try {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // Every emitted term must be well-formed UTF-16 (no lone surrogates).
                assertTrue(UnicodeUtil.validUTF16String(termAtt));
            }
            // Complete the TokenStream contract (reset/incrementToken/end/close); the
            // original leaked the stream each iteration and never called end().
            ts.end();
        } finally {
            ts.close();
        }
    }
}
示例14: initAutomata
import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
/** initialize levenshtein DFAs up to maxDistance, if possible */
private List<CompiledAutomaton> initAutomata(int maxDistance) {
    final List<CompiledAutomaton> cached = dfaAtt.automata();
    //System.out.println("cached automata size: " + cached.size());
    // Nothing to do when the cache already covers maxDistance, or the distance is unsupported.
    if (cached.size() > maxDistance
            || maxDistance > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
        return cached;
    }
    final String suffix =
            UnicodeUtil.newString(termText, realPrefixLength, termText.length - realPrefixLength);
    final LevenshteinAutomata factory = new LevenshteinAutomata(suffix, transpositions);
    for (int distance = cached.size(); distance <= maxDistance; distance++) {
        Automaton dfa = factory.toAutomaton(distance);
        //System.out.println("compute automaton n=" + distance);
        // constant prefix
        if (realPrefixLength > 0) {
            final Automaton prefix = BasicAutomata.makeString(
                    UnicodeUtil.newString(termText, 0, realPrefixLength));
            dfa = BasicOperations.concatenate(prefix, dfa);
        }
        cached.add(new CompiledAutomaton(dfa, true, false));
    }
    return cached;
}
示例15: build
import org.apache.lucene.util.UnicodeUtil; //导入依赖的package包/类
/**
 * Returns an {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter}
 * @return an {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter}
 * @throws IOException if an {@link IOException} occurs;
 */
public StemmerOverrideMap build() throws IOException {
    final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
    final org.apache.lucene.util.fst.Builder<BytesRef> fstBuilder =
        new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);
    // Feed keys to the FST in unicode order, pairing each with its stored override output.
    final int[] sortedIds = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
    final IntsRef codePoints = new IntsRef();
    final int size = hash.size();
    for (int ord = 0; ord < size; ord++) {
        final int id = sortedIds[ord];
        final BytesRef key = hash.get(id, spare);
        UnicodeUtil.UTF8toUTF32(key, codePoints);
        fstBuilder.add(codePoints, new BytesRef(outputValues.get(id)));
    }
    return new StemmerOverrideMap(fstBuilder.finish(), ignoreCase);
}