This article collects typical usage examples of the Java class org.apache.lucene.analysis.ngram.NGramTokenizer. If you have been wondering what exactly NGramTokenizer does and how to use it, the hand-picked class examples below should help.
NGramTokenizer belongs to the org.apache.lucene.analysis.ngram package. Fifteen code examples are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Java code examples.
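Before the individual examples, here is a minimal, self-contained sketch of the consumption pattern most of them share. It assumes Lucene 5 or later, where NGramTokenizer is constructed without a Reader and the input is supplied through setReader; the class name, sample text, and gram sizes are arbitrary.
import java.io.StringReader;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class NGramTokenizerDemo {
    public static void main(String[] args) throws Exception {
        // emit every 2- and 3-character gram of the input
        try (NGramTokenizer tokenizer = new NGramTokenizer(2, 3)) {
            tokenizer.setReader(new StringReader("lucene"));
            CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
            tokenizer.reset();                      // required before the first incrementToken()
            while (tokenizer.incrementToken()) {
                System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
            }
            tokenizer.end();                        // records the offset of the end of input
        }
    }
}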
Example 1: reusableTokenStream
import org.apache.lucene.analysis.ngram.NGramTokenizer; // import the required package/class
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();
streams.source = new NGramTokenizer(reader, 1, 30);
streams.result = new LowerCaseFilter(streams.source);
streams.result = new PorterStemFilter(streams.result);
streams.result = new StopFilter(false, streams.result, stopwords, true);
setPreviousTokenStream(streams);
} else {
streams.source.reset(reader);
}
return streams.result;
}
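The SavedStreams caching seen here is the pre-Lucene-4 reuse idiom: the tokenizer and filter chain is built once, stored with setPreviousTokenStream, and later calls only reset the tokenizer's reader instead of reallocating the whole chain.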
Example 2: create
import org.apache.lucene.analysis.ngram.NGramTokenizer; // import the required package/class
@Override
public Tokenizer create() {
if (matcher == null) {
return new NGramTokenizer(minGram, maxGram);
} else {
return new NGramTokenizer(minGram, maxGram) {
@Override
protected boolean isTokenChar(int chr) {
return matcher.isTokenChar(chr);
}
};
}
}
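Overriding isTokenChar is how the configured character-class matcher takes effect: any character the matcher rejects becomes a token boundary, so the generated n-grams never span, say, whitespace or punctuation when only letter and digit classes are allowed.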
Example 3: EdgeNGramTokenizerFactory
import org.apache.lucene.analysis.ngram.NGramTokenizer; // import the required package/class
public EdgeNGramTokenizerFactory(Index index, Settings indexSettings, String name, Settings settings) {
super(index, indexSettings, name, settings);
this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
this.side = Lucene43EdgeNGramTokenizer.Side.getSide(settings.get("side", Lucene43EdgeNGramTokenizer.DEFAULT_SIDE.getLabel()));
this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
this.esVersion = org.elasticsearch.Version.indexCreated(indexSettings);
}
Example 4: NGramTokenizerFactory
import org.apache.lucene.analysis.ngram.NGramTokenizer; // import the required package/class
NGramTokenizerFactory(Index index, Settings indexSettings, String name, Settings settings) {
super(index, indexSettings, name, settings);
this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
this.esVersion = org.elasticsearch.Version.indexCreated(indexSettings);
}
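Both Elasticsearch factories fall back to NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE (1) and DEFAULT_MAX_NGRAM_SIZE (2) when min_gram and max_gram are not configured, and parseTokenChars converts the token_chars setting (character classes such as letter or digit) into the matcher used in Example 2.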
Example 5: NGramLuceneQuery
import org.apache.lucene.analysis.ngram.NGramTokenizer; // import the required package/class
public NGramLuceneQuery(String fieldName, String fieldValue, int gramSize) {
super(gramSize);
Preconditions.checkArgument(fieldValue.length()>=gramSize);
try (NGramTokenizer tokenizer = new NGramTokenizer(new StringReader(fieldValue.toLowerCase()), gramSize, gramSize)) {
tokenizer.reset();
while (tokenizer.incrementToken()) {
add(new Term(fieldName,
tokenizer.getAttribute(CharTermAttribute.class).toString()));
}
} catch (IOException e) {
throw new RuntimeException(e);
}
}
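Because every gram of the lower-cased field value is added as a Term on the same field, the resulting query only matches documents whose indexed n-grams contain all of those grams; the Preconditions check ensures the value is long enough to produce at least one gram.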
Example 6: docToMinHashes
import org.apache.lucene.analysis.ngram.NGramTokenizer; // import the required package/class
public int[] docToMinHashes(String raw_html) throws Exception {
HashSet<Integer> doc = new HashSet<Integer>();
int count = 0;
NGramTokenizer gramTokenizer = new NGramTokenizer(factory, gram_length, gram_length);
gramTokenizer.setReader(new StringReader(raw_html));
CharTermAttribute cattr = gramTokenizer.addAttribute(CharTermAttribute.class);
gramTokenizer.reset();
while (gramTokenizer.incrementToken()) {
count++;
if ((count % skip_interval) == 0)
doc.add(murmur.hashString(cattr.toString(), Charsets.UTF_8).asInt());
}
gramTokenizer.close();
if (hasher == null)
hasher = new MinHasher(num_hashes);
return hasher.hash(doc);
}
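The design here is classic MinHash document fingerprinting: the raw HTML is shingled into fixed-length character n-grams, only every skip_interval-th gram is kept to bound the set size, each kept gram is reduced to a 32-bit MurmurHash value, and the resulting set is compressed into num_hashes MinHash signatures for near-duplicate detection.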
Example 7: createAnalzyer
import org.apache.lucene.analysis.ngram.NGramTokenizer; // import the required package/class
private Analyzer createAnalzyer(final int length) {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
final Tokenizer tokenizer = new NGramTokenizer(1, 1);
final AlphaNumWordFilter filter = new AlphaNumWordFilter(tokenizer);
filter.setMaxTokenLength(length);
return new TokenStreamComponents(tokenizer, filter);
}
};
return analyzer;
}
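With NGramTokenizer(1, 1) the tokenizer emits one token per character; AlphaNumWordFilter then post-processes that stream, and setMaxTokenLength(length) presumably caps the length of the tokens it reassembles.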
Example 8: testTokenStream2
import org.apache.lucene.analysis.ngram.NGramTokenizer; // import the required package/class
public void testTokenStream2() throws IOException {
// '㌰', '゙', '5', '℃', '№', '㈱', '㌘', 'サ', '゙', 'ソ', '゙'
String input = "㌰゙5℃№㈱㌘ザゾ";
CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
Tokenizer tokenStream = new NGramTokenizer(newAttributeFactory(), reader, 1, 1);
assertTokenStreamContents(tokenStream,
new String[] {"ピ", "ゴ", "5", "°", "c", "n", "o", "(", "株", ")", "グ", "ラ", "ム", "ザ", "ゾ"},
new int[]{0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9},
new int[]{1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9, 11},
input.length()
);
}
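The start and end offsets reported by the tokenizer are mapped back through the ICUNormalizer2CharFilter to positions in the original, un-normalized input, which is why several output tokens (for example the ones expanded from '㈱' or '㌘') share the same offsets.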
Example 9: init
import org.apache.lucene.analysis.ngram.NGramTokenizer; // import the required package/class
/** Initializes the n-gram min and max sizes and the side from which one should start tokenizing. */
@Override
public void init(Map<String, String> args) {
super.init(args);
String maxArg = args.get("maxGramSize");
maxGramSize = (maxArg != null ? Integer.parseInt(maxArg) : NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
String minArg = args.get("minGramSize");
minGramSize = (minArg != null ? Integer.parseInt(minArg) : NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
}
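This is the Solr-style factory initialization: the minGramSize and maxGramSize arguments come from the field type definition, and when they are omitted the factory falls back to the NGramTokenizer defaults of 1 and 2.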
Example 10: tokenStream
import org.apache.lucene.analysis.ngram.NGramTokenizer; // import the required package/class
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream stream = new NGramTokenizer(reader, 1, 30);
stream = new LowerCaseFilter(stream);
stream = new PorterStemFilter(stream);
stream = new StopFilter(false, stream, stopwords, true);
return stream;
}
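This is the older single-use Analyzer.tokenStream override: a fresh tokenizer and filter chain is created on every call, in contrast to the cached chain of Example 1, which builds the same pipeline once and then only resets the reader.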
Example 11: tokenStream
import org.apache.lucene.analysis.ngram.NGramTokenizer; // import the required package/class
/**
 * Builds a single-use chain of 1-30 character n-grams, lower-cased.
 *
 * @param fieldName the name of the field being analyzed
 * @param reader    the character stream to tokenize
 * @return the resulting token stream
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream stream = new NGramTokenizer(reader, 1, 30);
stream = new LowerCaseFilter(stream);
return stream;
}
Example 12: reusableTokenStream
import org.apache.lucene.analysis.ngram.NGramTokenizer; // import the required package/class
/**
 * Returns a cached chain of 1-30 character n-grams, lower-cased, resetting it
 * to the given reader when the chain already exists.
 *
 * @param fieldName the name of the field being analyzed
 * @param reader    the character stream to tokenize
 * @return the reusable token stream
 * @throws IOException if the cached tokenizer cannot be reset
 */
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();
streams.source = new NGramTokenizer(reader, 1, 30);
streams.result = new LowerCaseFilter(streams.source);
setPreviousTokenStream(streams);
} else {
streams.source.reset(reader);
}
return streams.result;
}
Example 13: EdgeNGramTokenizerFactory
import org.apache.lucene.analysis.ngram.NGramTokenizer; // import the required package/class
public EdgeNGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
}
Example 14: NGramTokenizerFactory
import org.apache.lucene.analysis.ngram.NGramTokenizer; // import the required package/class
public NGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
}
Example 15: createComponents
import org.apache.lucene.analysis.ngram.NGramTokenizer; // import the required package/class
@Override
protected TokenStreamComponents createComponents(String paramString) {
Tokenizer source = new NGramTokenizer(n, n);
TokenStream result = new StandardFilter(source);
return new TokenStreamComponents(source, result);
}