当前位置: 首页>>代码示例>>Java>>正文


Java NGramTokenizer类代码示例

本文整理汇总了Java中org.apache.lucene.analysis.ngram.NGramTokenizer的典型用法代码示例。如果您正苦于以下问题:Java NGramTokenizer类的具体用法?Java NGramTokenizer怎么用?Java NGramTokenizer使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。


NGramTokenizer类属于org.apache.lucene.analysis.ngram包,在下文中一共展示了NGramTokenizer类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: reusableTokenStream

import org.apache.lucene.analysis.ngram.NGramTokenizer; //导入依赖的package包/类
/**
 * Returns a per-thread cached analysis chain: 1..30-gram tokenizer,
 * lower-casing, Porter stemming, then stop-word removal.
 *
 * <p>Bug fix: the original wrapped {@code streams.source} when building
 * each filter, so {@code LowerCaseFilter} and {@code PorterStemFilter}
 * were constructed and then silently discarded — only the StopFilter over
 * the raw tokenizer survived. Each stage must wrap the previous result.
 *
 * @param fieldName ignored; the same chain is used for every field
 * @param reader    character source to tokenize
 * @return the (possibly reused) token stream
 * @throws IOException if resetting the cached tokenizer fails
 */
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws
        IOException {
    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        streams.source = new NGramTokenizer(reader, 1, 30);
        // Chain each filter onto the previous stage's output.
        streams.result = new LowerCaseFilter(streams.source);
        streams.result = new PorterStemFilter(streams.result);
        streams.result = new StopFilter(false, streams.result, stopwords, true);

        setPreviousTokenStream(streams);
    } else {
        // Reuse the cached chain; only the tokenizer needs the new reader.
        streams.source.reset(reader);
    }
    return streams.result;
}
 
开发者ID:jcrcano,项目名称:DrakkarKeel,代码行数:18,代码来源:NGramAnalyzer.java

示例2: create

import org.apache.lucene.analysis.ngram.NGramTokenizer; //导入依赖的package包/类
/**
 * Creates a new n-gram tokenizer with the configured gram bounds.
 *
 * <p>When no {@code matcher} is configured the stock tokenizer is returned
 * (its default token-character policy applies); otherwise an anonymous
 * subclass delegates the per-code-point token test to the matcher.
 *
 * @return a fresh {@link org.apache.lucene.analysis.ngram.NGramTokenizer}
 */
@Override
public Tokenizer create() {
    // Guard clause: no custom token-character matcher configured.
    if (matcher == null) {
        return new NGramTokenizer(minGram, maxGram);
    }
    return new NGramTokenizer(minGram, maxGram) {
        @Override
        protected boolean isTokenChar(int chr) {
            return matcher.isTokenChar(chr);
        }
    };
}
 
开发者ID:justor,项目名称:elasticsearch_my,代码行数:14,代码来源:NGramTokenizerFactory.java

示例3: EdgeNGramTokenizerFactory

import org.apache.lucene.analysis.ngram.NGramTokenizer; //导入依赖的package包/类
public EdgeNGramTokenizerFactory(Index index, Settings indexSettings, String name, Settings settings) {
    super(index, indexSettings, name, settings);
    // Fall back to Lucene's NGramTokenizer defaults when min_gram/max_gram
    // are absent from the index settings.
    this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    // "side" selects which edge the n-grams are taken from; defaults to the
    // Lucene 4.3 legacy tokenizer's default side label.
    this.side = Lucene43EdgeNGramTokenizer.Side.getSide(settings.get("side", Lucene43EdgeNGramTokenizer.DEFAULT_SIDE.getLabel()));
    // "token_chars" restricts which character classes count as token chars.
    this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
    // Version the index was created with; presumably used for
    // backward-compatible tokenizer selection elsewhere — TODO confirm.
    this.esVersion = org.elasticsearch.Version.indexCreated(indexSettings);
}
 
开发者ID:baidu,项目名称:Elasticsearch,代码行数:9,代码来源:EdgeNGramTokenizerFactory.java

示例4: NGramTokenizerFactory

import org.apache.lucene.analysis.ngram.NGramTokenizer; //导入依赖的package包/类
NGramTokenizerFactory(Index index, Settings indexSettings, String name, Settings settings) {
    super(index, indexSettings, name, settings);
    // Gram bounds default to Lucene's NGramTokenizer constants when the
    // index settings omit min_gram/max_gram.
    this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    // Optional "token_chars" setting narrows the accepted character classes.
    this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
    // Index-creation version, kept for version-dependent behaviour elsewhere.
    this.esVersion = org.elasticsearch.Version.indexCreated(indexSettings);
}
 
开发者ID:baidu,项目名称:Elasticsearch,代码行数:8,代码来源:NGramTokenizerFactory.java

示例5: NGramLuceneQuery

import org.apache.lucene.analysis.ngram.NGramTokenizer; //导入依赖的package包/类
/**
 * Builds a phrase-style query from the fixed-size n-grams of the
 * lower-cased field value, adding one {@link Term} per emitted gram.
 *
 * <p>Fix: Lucene's TokenStream workflow contract requires {@code end()}
 * to be called after {@code incrementToken()} returns {@code false} and
 * before {@code close()}; the original skipped it.
 *
 * @param fieldName  index field the grams are terms of
 * @param fieldValue text to gram; must be at least {@code gramSize} long
 * @param gramSize   both the minimum and maximum gram length
 * @throws RuntimeException wrapping any IOException from tokenization
 */
public NGramLuceneQuery(String fieldName, String fieldValue, int gramSize) {
	super(gramSize);
	
	Preconditions.checkArgument(fieldValue.length()>=gramSize);
	
	// NOTE(review): toLowerCase() is default-locale dependent — presumably
	// indexing uses the same locale; consider Locale.ROOT for determinism.
	try (NGramTokenizer tokenizer = new NGramTokenizer(new StringReader(fieldValue.toLowerCase()), gramSize, gramSize)) {
		tokenizer.reset();
		while (tokenizer.incrementToken()) { 
			add(new Term(fieldName, 
					tokenizer.getAttribute(CharTermAttribute.class).toString()));
		}
		// Required by the TokenStream contract before close().
		tokenizer.end();
	} catch (IOException e) {
		throw new RuntimeException(e);
	}
}
 
开发者ID:jmfgdev,项目名称:gitplex-mit,代码行数:16,代码来源:NGramLuceneQuery.java

示例6: docToMinHashes

import org.apache.lucene.analysis.ngram.NGramTokenizer; //导入依赖的package包/类
/**
 * Converts a document to its MinHash signature: every {@code skip_interval}-th
 * character n-gram is murmur-hashed into a set, which is then min-hashed.
 *
 * <p>Fix: the tokenizer is now closed via try-with-resources so it is
 * released even when {@code reset()}/{@code incrementToken()} throws — the
 * original leaked it on any exception. {@code end()} is also called, as the
 * TokenStream contract requires before {@code close()}.
 *
 * @param raw_html document text (raw HTML) to shingle
 * @return the MinHash signature of the sampled gram set
 * @throws Exception on tokenization failure
 */
public int[] docToMinHashes(String raw_html) throws Exception {
    HashSet<Integer> doc = new HashSet<Integer>();
    int count = 0;

    try (NGramTokenizer gramTokenizer = new NGramTokenizer(factory, gram_length, gram_length)) {
        gramTokenizer.setReader(new StringReader(raw_html));
        CharTermAttribute cattr = gramTokenizer.addAttribute(CharTermAttribute.class);
        gramTokenizer.reset();

        while (gramTokenizer.incrementToken()) {
            count++;
            // Sample every skip_interval-th gram to keep the set small.
            if ((count % skip_interval) == 0)
                doc.add(murmur.hashString(cattr.toString(), Charsets.UTF_8).asInt());
        }
        gramTokenizer.end();
    }
    // Lazily build the hasher on first use.
    if (hasher == null)
        hasher = new MinHasher(num_hashes);
    return hasher.hash(doc);

}
 
开发者ID:isoboroff,项目名称:crawl-eval,代码行数:21,代码来源:MinHashDupesByCharNgram.java

示例7: createAnalzyer

import org.apache.lucene.analysis.ngram.NGramTokenizer; //导入依赖的package包/类
/**
 * Builds an analyzer whose chain is a unigram (1,1) tokenizer feeding an
 * {@link AlphaNumWordFilter} capped at the given maximum token length.
 *
 * @param length maximum token length for the filter
 * @return the configured analyzer
 */
private Analyzer createAnalzyer(final int length) {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer unigrams = new NGramTokenizer(1, 1);
            final AlphaNumWordFilter capped = new AlphaNumWordFilter(unigrams);
            capped.setMaxTokenLength(length);
            return new TokenStreamComponents(unigrams, capped);
        }
    };
}
 
开发者ID:codelibs,项目名称:analyzers-ja,代码行数:13,代码来源:AlphaNumWordFilterTest.java

示例8: testTokenStream2

import org.apache.lucene.analysis.ngram.NGramTokenizer; //导入依赖的package包/类
// Verifies that the NFKC case-folding char filter expands compatibility
// characters (half-width katakana with dakuten, squared/circled forms like
// ㈱ and ㌘) before a unigram tokenizer, and that the resulting tokens keep
// offsets into the ORIGINAL input — note several tokens sharing the same
// start/end offset where one source char expanded to multiple code points.
public void testTokenStream2() throws IOException {
  String input = "㌰゙5℃№㈱㌘ザゾ";

  CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
    Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));

  Tokenizer tokenStream = new NGramTokenizer(newAttributeFactory(), reader, 1, 1);

  // Expected tokens, then their start offsets, then end offsets, then the
  // final offset (length of the un-normalized input).
  assertTokenStreamContents(tokenStream,
    new String[] {"ピ", "ゴ", "5", "°", "c", "n", "o", "(", "株", ")", "グ", "ラ", "ム", "ザ", "ゾ"},
    new int[]{0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9},
    new int[]{1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9, 11},
    input.length()
  );
}
 
开发者ID:europeana,项目名称:search,代码行数:17,代码来源:TestICUNormalizer2CharFilter.java

示例9: init

import org.apache.lucene.analysis.ngram.NGramTokenizer; //导入依赖的package包/类
/**
 * Reads the n-gram size bounds from the factory arguments, falling back to
 * Lucene's {@code NGramTokenizer} defaults when an argument is absent.
 */
@Override
public void init(Map<String, String> args) {
  super.init(args);
  final String min = args.get("minGramSize");
  final String max = args.get("maxGramSize");
  minGramSize = (min == null) ? NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE : Integer.parseInt(min);
  maxGramSize = (max == null) ? NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE : Integer.parseInt(max);
}
 
开发者ID:pkarmstr,项目名称:NYBC,代码行数:11,代码来源:NGramTokenizerFactory.java

示例10: tokenStream

import org.apache.lucene.analysis.ngram.NGramTokenizer; //导入依赖的package包/类
/**
 * Builds a fresh analysis chain over the reader: 1..30-gram tokenizer,
 * lower-casing, Porter stemming, then stop-word removal.
 *
 * @param fieldName ignored; every field gets the same chain
 * @param reader    character source to tokenize
 * @return the fully-wrapped token stream
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    return new StopFilter(false,
            new PorterStemFilter(
                    new LowerCaseFilter(
                            new NGramTokenizer(reader, 1, 30))),
            stopwords, true);
}
 
开发者ID:jcrcano,项目名称:DrakkarKeel,代码行数:10,代码来源:NGramAnalyzer.java

示例11: tokenStream

import org.apache.lucene.analysis.ngram.NGramTokenizer; //导入依赖的package包/类
/**
 * Builds a lower-cased 1..30-gram token stream over the reader.
 *
 * @param fieldName ignored; analysis is identical for every field
 * @param reader    character source to tokenize
 * @return the analysis chain
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    return new LowerCaseFilter(new NGramTokenizer(reader, 1, 30));
}
 
开发者ID:jcrcano,项目名称:DrakkarKeel,代码行数:14,代码来源:NGramAnalyzer.java

示例12: reusableTokenStream

import org.apache.lucene.analysis.ngram.NGramTokenizer; //导入依赖的package包/类
/**
 * Returns a per-thread cached analysis chain (1..30-gram tokenizer plus
 * lower-casing). An existing cached tokenizer is simply reset to the new
 * reader; otherwise the chain is built and cached.
 *
 * @param fieldName ignored; every field gets the same chain
 * @param reader    character source to tokenize
 * @return the cached (or newly built) token stream
 * @throws IOException if resetting the cached tokenizer fails
 */
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    SavedStreams cached = (SavedStreams) getPreviousTokenStream();
    if (cached != null) {
        // Reuse: only the tokenizer needs to see the new reader.
        cached.source.reset(reader);
        return cached.result;
    }
    cached = new SavedStreams();
    cached.source = new NGramTokenizer(reader, 1, 30);
    cached.result = new LowerCaseFilter(cached.source);
    setPreviousTokenStream(cached);
    return cached.result;
}
 
开发者ID:jcrcano,项目名称:DrakkarKeel,代码行数:21,代码来源:NGramAnalyzer.java

示例13: EdgeNGramTokenizerFactory

import org.apache.lucene.analysis.ngram.NGramTokenizer; //导入依赖的package包/类
public EdgeNGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
    super(indexSettings, name, settings);
    // Gram bounds default to Lucene's NGramTokenizer constants when the
    // settings omit min_gram/max_gram.
    this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    // Optional "token_chars" setting narrows which characters start tokens.
    this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
}
 
开发者ID:justor,项目名称:elasticsearch_my,代码行数:7,代码来源:EdgeNGramTokenizerFactory.java

示例14: NGramTokenizerFactory

import org.apache.lucene.analysis.ngram.NGramTokenizer; //导入依赖的package包/类
public NGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
    super(indexSettings, name, settings);
    // Gram bounds default to Lucene's NGramTokenizer constants when the
    // settings omit min_gram/max_gram.
    this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    // Optional "token_chars" setting narrows which characters count as token chars.
    this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
}
 
开发者ID:justor,项目名称:elasticsearch_my,代码行数:7,代码来源:NGramTokenizerFactory.java

示例15: createComponents

import org.apache.lucene.analysis.ngram.NGramTokenizer; //导入依赖的package包/类
/**
 * Builds the analysis chain: fixed-size (n, n) gram tokenizer passed
 * through a {@link StandardFilter}.
 *
 * @param paramString field name; unused, the chain is field-independent
 * @return the tokenizer/filter pair
 */
@Override
protected TokenStreamComponents createComponents(String paramString) {
	final Tokenizer ngrams = new NGramTokenizer(n, n);
	return new TokenStreamComponents(ngrams, new StandardFilter(ngrams));
}
 
开发者ID:ksgwr,项目名称:LuceneDB,代码行数:7,代码来源:NgramAnalyzer.java


注:本文中的org.apache.lucene.analysis.ngram.NGramTokenizer类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。