

Java Analyzer.tokenStream Method Code Examples

This article collects typical usage examples of the Java method org.apache.lucene.analysis.Analyzer.tokenStream. If you are unsure what Analyzer.tokenStream does or how to use it, the curated examples below should help. You can also explore further usage examples of org.apache.lucene.analysis.Analyzer, the class this method belongs to.


The following presents 15 code examples of Analyzer.tokenStream, sorted by popularity by default.
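
Before diving in, here is a minimal, self-contained sketch of the consume loop that nearly all of the examples below follow: obtain the stream, reset() it, iterate with incrementToken(), then end() it and let try-with-resources close it. StandardAnalyzer and the field name "body" are illustrative assumptions, not taken from any example on this page.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamDemo {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream ts = analyzer.tokenStream("body", "Hello, Lucene token streams!")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                    // mandatory before the first incrementToken()
            while (ts.incrementToken()) {  // advance to the next token
                System.out.println(term.toString());
            }
            ts.end();                      // record end-of-stream state (e.g. the final offset)
        }                                  // try-with-resources closes the stream
    }
}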

Example 1: findGoodEndForNoHighlightExcerpt

import org.apache.lucene.analysis.Analyzer; // import the class this method depends on
private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, Analyzer analyzer, String fieldName, String contents)
        throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream(fieldName, contents)) {
        if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
            // Can't split on term boundaries without offsets
            return -1;
        }
        int end = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
            if (attr.endOffset() >= noMatchSize) {
                // Jump to the end of this token if it wouldn't put us past the boundary
                if (attr.endOffset() == noMatchSize) {
                    end = noMatchSize;
                }
                return end;
            }
            end = attr.endOffset();
        }
        tokenStream.end();
        // We've exhausted the token stream so we should just highlight everything.
        return end;
    }
}
 
Author: justor | Project: elasticsearch_my | Lines: 26 | Source: PlainHighlighter.java

Example 2: parseQueryString

import org.apache.lucene.analysis.Analyzer; // import the class this method depends on
private static Query parseQueryString(ExtendedCommonTermsQuery query, Object queryString, String field, Analyzer analyzer,
                                     String lowFreqMinimumShouldMatch, String highFreqMinimumShouldMatch) throws IOException {
    // Logic similar to QueryParser#getFieldQuery
    try (TokenStream source = analyzer.tokenStream(field, queryString.toString())) {
        source.reset();
        CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
        BytesRefBuilder builder = new BytesRefBuilder();
        while (source.incrementToken()) {
            // UTF-8
            builder.copyChars(termAtt);
            query.add(new Term(field, builder.toBytesRef()));
        }
    }

    query.setLowFreqMinimumNumberShouldMatch(lowFreqMinimumShouldMatch);
    query.setHighFreqMinimumNumberShouldMatch(highFreqMinimumShouldMatch);
    return query;
}
 
Author: justor | Project: elasticsearch_my | Lines: 19 | Source: CommonTermsQueryBuilder.java

Example 3: tokenStream

import org.apache.lucene.analysis.Analyzer; // import the class this method depends on
@Override
public TokenStream tokenStream(Analyzer analyzer, TokenStream previous) {
    try {
        allEntries.reset(); // reset the all entries, just in case it was read already
        if (allEntries.customBoost() && fieldType().indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
            // TODO: we should be able to reuse "previous" if its instanceof AllTokenStream?
            // but we need to be careful this optimization is safe (and tested)...
            
            // AllTokenStream maps boost to 4-byte payloads, so we only need to use it if any field had a non-default (!= 1.0f) boost and if
            // positions are indexed:
            return AllTokenStream.allTokenStream(name, allEntries, analyzer);
        } else {
            return analyzer.tokenStream(name, allEntries);
        }
    } catch (IOException e) {
        throw new ElasticsearchException("Failed to create token stream", e);
    }
}
 
Author: baidu | Project: Elasticsearch | Lines: 19 | Source: AllField.java

Example 4: analyze

import org.apache.lucene.analysis.Analyzer; // import the class this method depends on
private Set<String> analyze(String text) throws IOException {
    Set<String> result = new HashSet<String>();
    Analyzer analyzer = configuration.getAnalyzer();
    try (TokenStream ts = analyzer.tokenStream("", text)) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            int length = termAtt.length();
            if (length == 0) {
                throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
            }
            if (posIncAtt.getPositionIncrement() != 1) {
                throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
            }

            result.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }

        ts.end();
        return result;
    }
}
 
Author: ginobefun | Project: elasticsearch-dynamic-synonym | Lines: 24 | Source: SimpleSynonymMap.java

Example 5: testSimple

import org.apache.lucene.analysis.Analyzer; // import the class this method depends on
public void testSimple() throws IOException {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(t, new UniqueTokenFilter(t));
        }
    };

    TokenStream test = analyzer.tokenStream("test", "this test with test");
    test.reset();
    CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class);
    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("this"));

    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("test"));

    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("with"));

    assertThat(test.incrementToken(), equalTo(false));
}
 
Author: justor | Project: elasticsearch_my | Lines: 24 | Source: UniqueTokenFilterTests.java

Example 6: analyzeMultitermTerm

import org.apache.lucene.analysis.Analyzer; // import the class this method depends on
protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) {
  if (analyzerIn == null) analyzerIn = getAnalyzer();

  TokenStream source = null;
  try {
    source = analyzerIn.tokenStream(field, part);
    source.reset();
    
    TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
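    // Note: getAttribute(...) plus a per-token fillBytesRef() call is the older Lucene
    // TermToBytesRefAttribute API; later Lucene versions removed fillBytesRef() in favor
    // of re-reading getBytesRef() on each token.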
    BytesRef bytes = termAtt.getBytesRef();

    if (!source.incrementToken())
      throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part);
    termAtt.fillBytesRef();
    if (source.incrementToken())
      throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part);
    source.end();
    return BytesRef.deepCopyOf(bytes);
  } catch (IOException e) {
    throw new RuntimeException("Error analyzing multiTerm term: " + part, e);
  } finally {
    IOUtils.closeWhileHandlingException(source);
  }
}
 
Author: lamsfoundation | Project: lams | Lines: 25 | Source: QueryParserBase.java

Example 7: findGoodEndForNoHighlightExcerpt

import org.apache.lucene.analysis.Analyzer; // import the class this method depends on
private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, Analyzer analyzer, String fieldName, String contents) throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream(fieldName, contents)) {
        if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
            // Can't split on term boundaries without offsets
            return -1;
        }
        int end = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
            if (attr.endOffset() >= noMatchSize) {
                // Jump to the end of this token if it wouldn't put us past the boundary
                if (attr.endOffset() == noMatchSize) {
                    end = noMatchSize;
                }
                return end;
            }
            end = attr.endOffset();
        }
        tokenStream.end();
        // We've exhausted the token stream so we should just highlight everything.
        return end;
    }
}
 
Author: baidu | Project: Elasticsearch | Lines: 25 | Source: PlainHighlighter.java

Example 8: testMetaphonePhrases

import org.apache.lucene.analysis.Analyzer; // import the class this method depends on
@Test
public void testMetaphonePhrases() throws Exception {
    Index index = new Index("test", "_na_");
    Settings settings = Settings.builder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("index.analysis.analyzer.myAnalyzer.type", "custom")
            .put("index.analysis.analyzer.myAnalyzer.tokenizer", "standard")
            .put("index.analysis.analyzer.myAnalyzer.filter", "br_metaphone")
            .build();

    AnalysisService analysisService = createAnalysisService(index, settings, new AnalysisMetaphonePlugin());

    Analyzer analyzer = analysisService.analyzer("myAnalyzer");
    
    Map<String,List<String>> phrases = buildPhraseList();
    
    for (String phrase : phrases.keySet()) {
        List<String> outputWords = phrases.get(phrase);

        TokenStream ts = analyzer.tokenStream("test", phrase);
        CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
        ts.reset();

        for (String expected : outputWords) {
            assertThat(ts.incrementToken(), equalTo(true));
            assertThat(term1.toString(), equalTo(expected));
        }
        ts.close();
    }
}
 
Author: anaelcarvalho | Project: elasticsearch-analysis-metaphone_ptBR | Lines: 33 | Source: MetaphoneTokenFilterTests.java

Example 9: analyze

import org.apache.lucene.analysis.Analyzer; // import the class this method depends on
public static int analyze(Analyzer analyzer, BytesRef toAnalyze, String field, TokenConsumer consumer, CharsRefBuilder spare)
        throws IOException {
    spare.copyUTF8Bytes(toAnalyze);
    CharsRef charsRef = spare.get();
    try (TokenStream ts = analyzer.tokenStream(
                              field, new FastCharArrayReader(charsRef.chars, charsRef.offset, charsRef.length))) {
         return analyze(ts, consumer);
    }
}
 
Author: justor | Project: elasticsearch_my | Lines: 10 | Source: DirectCandidateGenerator.java

Example 10: countPositions

import org.apache.lucene.analysis.Analyzer; // import the class this method depends on
/**
 * Count position increments in a token stream.  Package private for testing.
 * @param analyzer analyzer to create token stream
 * @param fieldName field name to pass to analyzer
 * @param fieldValue field value to pass to analyzer
 * @return number of position increments in a token stream
 * @throws IOException if tokenStream throws it
 */
static int countPositions(Analyzer analyzer, String fieldName, String fieldValue) throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream(fieldName, fieldValue)) {
        int count = 0;
        PositionIncrementAttribute position = tokenStream.addAttribute(PositionIncrementAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            count += position.getPositionIncrement();
        }
        tokenStream.end();
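        // end() updates the attributes with end-of-stream state, so the final
        // position increment (e.g. trailing position gaps) is counted as well.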
        count += position.getPositionIncrement();
        return count;
    }
}
 
Author: baidu | Project: Elasticsearch | Lines: 22 | Source: TokenCountFieldMapper.java

Example 11: tokenStream

import org.apache.lucene.analysis.Analyzer; // import the class this method depends on
@Override
public TokenStream tokenStream(Analyzer analyzer, TokenStream previous) {
    TokenStream ts = analyzer.tokenStream(name(), stringValue());
    if (boost != 1.0f && fieldType().indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
        // TODO: we should be able to reuse "previous" if its instanceof AllTokenStream?
        // but we need to be careful this optimization is safe (and tested)...

        // AllTokenStream maps boost to 4-byte payloads, so we only need to use it if any field had a non-default (!= 1.0f) boost and if
        // positions are indexed:
        return new AllTokenStream(ts, boost);
    }
    return ts;
}
 
Author: justor | Project: elasticsearch_my | Lines: 14 | Source: AllField.java

Example 12: assertLuceneAnalyzersAreNotClosed

import org.apache.lucene.analysis.Analyzer; // import the class this method depends on
private void assertLuceneAnalyzersAreNotClosed(Map<PreBuiltAnalyzers, List<Version>> loadedAnalyzers) throws IOException {
    for (Map.Entry<PreBuiltAnalyzers, List<Version>> preBuiltAnalyzerEntry : loadedAnalyzers.entrySet()) {
        for (Version version : preBuiltAnalyzerEntry.getValue()) {
            Analyzer analyzer = preBuiltAnalyzerEntry.getKey().getCache().get(version);
            try (TokenStream stream = analyzer.tokenStream("foo", "bar")) {
                stream.reset();
                while (stream.incrementToken()) {
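                    // drain the stream to verify the analyzer is still usable; a closed
                    // analyzer would typically have failed earlier, when tokenStream(...)
                    // was called, with an AlreadyClosedException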
                }
                stream.end();
            }
        }
    }
}
 
Author: justor | Project: elasticsearch_my | Lines: 14 | Source: PreBuiltAnalyzerIntegrationIT.java

Example 13: assertAnalyzes

import org.apache.lucene.analysis.Analyzer; // import the class this method depends on
private void assertAnalyzes(Analyzer analyzer, String field, String output) throws IOException {
    try (TokenStream tok = analyzer.tokenStream(field, new StringReader(""))) {
        CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
        assertTrue(tok.incrementToken());
        assertEquals(output, term.toString());
    }
}
 
Author: justor | Project: elasticsearch_my | Lines: 8 | Source: DocumentFieldMapperTests.java

Example 14: testAnalyzer

import org.apache.lucene.analysis.Analyzer; // import the class this method depends on
private void testAnalyzer(String source, String... expected_terms) throws IOException {
    TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), Settings.EMPTY, new AnalysisStempelPlugin());

    Analyzer analyzer = analysis.indexAnalyzers.get("polish").analyzer();

    TokenStream ts = analyzer.tokenStream("test", source);

    CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
    ts.reset();

    for (String expected : expected_terms) {
        assertThat(ts.incrementToken(), equalTo(true));
        assertThat(term1.toString(), equalTo(expected));
    }
}
 
Author: justor | Project: elasticsearch_my | Lines: 16 | Source: SimplePolishTokenFilterTests.java

Example 15: testAnalyzer

import org.apache.lucene.analysis.Analyzer; // import the class this method depends on
private static void testAnalyzer(String source, String... expected_terms) throws IOException {
    TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), Settings.EMPTY, new AnalysisUkrainianPlugin());
    Analyzer analyzer = analysis.indexAnalyzers.get("ukrainian").analyzer();
    TokenStream ts = analyzer.tokenStream("test", source);
    CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    for (String expected : expected_terms) {
        assertThat(ts.incrementToken(), equalTo(true));
        assertThat(term1.toString(), equalTo(expected));
    }
    assertThat(ts.incrementToken(), equalTo(false));
}
 
Author: justor | Project: elasticsearch_my | Lines: 13 | Source: SimpleUkrainianAnalyzerTests.java


Note: The org.apache.lucene.analysis.Analyzer.tokenStream method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers, and copyright in the source code remains with its original authors. Refer to each project's license before distributing or using the code; do not republish without permission.