This article collects typical usage examples of the Java method org.apache.lucene.analysis.Analyzer.tokenStream: what the method does, how to call it, and how it is used in real projects. For more background, see the containing class, org.apache.lucene.analysis.Analyzer.
The following 14 code examples of Analyzer.tokenStream are shown, ordered by popularity.
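All of the examples below follow the same TokenStream contract: obtain the stream from Analyzer.tokenStream, call reset(), iterate with incrementToken(), read token data through attributes, then call end() and close(). A minimal, self-contained sketch of that pattern (assuming Lucene 5+, where StandardAnalyzer has a no-argument constructor; the field name and sample text are arbitrary):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamBasics {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream ts = analyzer.tokenStream("body", "Hello Lucene Token Streams")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                              // required before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term.toString()); // prints: hello, lucene, token, streams
            }
            ts.end();                                // records end-of-stream state (e.g. final offset)
        }                                            // try-with-resources closes the stream
    }
}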
Example 1: findGoodEndForNoHighlightExcerpt
import org.apache.lucene.analysis.Analyzer; // import the package/class this method depends on
private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, Analyzer analyzer, String fieldName, String contents)
        throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream(fieldName, contents)) {
        if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
            // Can't split on term boundaries without offsets
            return -1;
        }
        int end = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
            if (attr.endOffset() >= noMatchSize) {
                // Jump to the end of this token if it wouldn't put us past the boundary
                if (attr.endOffset() == noMatchSize) {
                    end = noMatchSize;
                }
                return end;
            }
            end = attr.endOffset();
        }
        tokenStream.end();
        // We've exhausted the token stream so we should just highlight everything.
        return end;
    }
}
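The excerpt logic above leans entirely on OffsetAttribute. A hypothetical driver that prints the offsets it walks over; WhitespaceAnalyzer (Lucene 5+, no-argument constructor) is just an arbitrary stand-in for the analyzer argument:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class OffsetDemo {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new WhitespaceAnalyzer();
             TokenStream ts = analyzer.tokenStream("f", "the quick brown fox")) {
            OffsetAttribute off = ts.addAttribute(OffsetAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // endOffset() is what findGoodEndForNoHighlightExcerpt compares
                // against noMatchSize: 3, 9, 15, 19 for this input
                System.out.println(off.startOffset() + "-" + off.endOffset());
            }
            ts.end();
        }
    }
}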
Example 2: parseQueryString
import org.apache.lucene.analysis.Analyzer; // import the package/class this method depends on
private static Query parseQueryString(ExtendedCommonTermsQuery query, Object queryString, String field, Analyzer analyzer,
        String lowFreqMinimumShouldMatch, String highFreqMinimumShouldMatch) throws IOException {
    // Logic similar to QueryParser#getFieldQuery
    try (TokenStream source = analyzer.tokenStream(field, queryString.toString())) {
        source.reset();
        CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
        BytesRefBuilder builder = new BytesRefBuilder();
        while (source.incrementToken()) {
            // UTF-8
            builder.copyChars(termAtt);
            query.add(new Term(field, builder.toBytesRef()));
        }
    }
    query.setLowFreqMinimumNumberShouldMatch(lowFreqMinimumShouldMatch);
    query.setHighFreqMinimumNumberShouldMatch(highFreqMinimumShouldMatch);
    return query;
}
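ExtendedCommonTermsQuery is Elasticsearch-internal, but the analyze-then-add-terms loop is generic. A hypothetical standalone variant (the helper name toBooleanQuery is mine) that builds a plain Lucene BooleanQuery of SHOULD clauses from the analyzed tokens, mirroring the QueryParser#getFieldQuery logic the comment mentions; assumes Lucene 5.3+, where BooleanQuery.Builder exists:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

static Query toBooleanQuery(Analyzer analyzer, String field, String queryText) throws IOException {
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    try (TokenStream source = analyzer.tokenStream(field, queryText)) {
        CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
        source.reset();
        while (source.incrementToken()) {
            // one optional (SHOULD) clause per analyzed token
            builder.add(new TermQuery(new Term(field, termAtt.toString())), BooleanClause.Occur.SHOULD);
        }
        source.end();
    }
    return builder.build();
}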
Example 3: tokenStream
import org.apache.lucene.analysis.Analyzer; // import the package/class this method depends on
@Override
public TokenStream tokenStream(Analyzer analyzer, TokenStream previous) {
    try {
        allEntries.reset(); // reset the all entries, just in case it was read already
        if (allEntries.customBoost() && fieldType().indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
            // TODO: we should be able to reuse "previous" if it's instanceof AllTokenStream?
            // but we need to be careful this optimization is safe (and tested)...
            // AllTokenStream maps boost to 4-byte payloads, so we only need to use it if any field had
            // non-default (!= 1.0f) boost and if positions are indexed:
            return AllTokenStream.allTokenStream(name, allEntries, analyzer);
        } else {
            return analyzer.tokenStream(name, allEntries);
        }
    } catch (IOException e) {
        throw new ElasticsearchException("Failed to create token stream", e); // keep the cause
    }
}
Example 4: analyze
import org.apache.lucene.analysis.Analyzer; // import the package/class this method depends on
private Set<String> analyze(String text) throws IOException {
    Set<String> result = new HashSet<String>();
    Analyzer analyzer = configuration.getAnalyzer();
    try (TokenStream ts = analyzer.tokenStream("", text)) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            int length = termAtt.length();
            if (length == 0) {
                throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
            }
            if (posIncAtt.getPositionIncrement() != 1) {
                throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
            }
            result.add(new String(termAtt.buffer(), 0, length));
        }
        ts.end();
        return result;
    }
}
Example 5: testSimple
import org.apache.lucene.analysis.Analyzer; // import the package/class this method depends on
public void testSimple() throws IOException {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(t, new UniqueTokenFilter(t));
        }
    };
    TokenStream test = analyzer.tokenStream("test", "this test with test");
    test.reset();
    CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class);
    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("this"));
    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("test"));
    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("with"));
    // the second "test" is removed by UniqueTokenFilter, so the stream is exhausted
    assertThat(test.incrementToken(), equalTo(false));
}
Example 6: analyzeMultitermTerm
import org.apache.lucene.analysis.Analyzer; // import the package/class this method depends on
protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) {
    if (analyzerIn == null) analyzerIn = getAnalyzer();
    TokenStream source = null;
    try {
        source = analyzerIn.tokenStream(field, part);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();
        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part);
        termAtt.fillBytesRef();
        if (source.incrementToken())
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part);
        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing multiTerm term: " + part, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}
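The fillBytesRef() call marks this as the legacy Lucene 4.x attribute API; in later releases (fillBytesRef() is gone as of Lucene 7) TermToBytesRefAttribute#getBytesRef() is refreshed on every incrementToken(). A sketch of the same normalization against the newer API, assuming Lucene 7+:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.BytesRef;

protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzer) {
    try (TokenStream source = analyzer.tokenStream(field, part)) {
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        if (!source.incrementToken()) {
            throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part);
        }
        // getBytesRef() now reflects the current token; copy it before advancing
        BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef());
        if (source.incrementToken()) {
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part);
        }
        source.end();
        return bytes;
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing multiTerm term: " + part, e);
    }
}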
Example 7: testMetaphonePhrases
import org.apache.lucene.analysis.Analyzer; // import the package/class this method depends on
@Test
public void testMetaphonePhrases() throws Exception {
    Index index = new Index("test", "_na_");
    Settings settings = Settings.builder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("index.analysis.analyzer.myAnalyzer.type", "custom")
            .put("index.analysis.analyzer.myAnalyzer.tokenizer", "standard")
            .put("index.analysis.analyzer.myAnalyzer.filter", "br_metaphone")
            .build();
    AnalysisService analysisService = createAnalysisService(index, settings, new AnalysisMetaphonePlugin());
    Analyzer analyzer = analysisService.analyzer("myAnalyzer");
    Map<String, List<String>> phrases = buildPhraseList();
    for (String phrase : phrases.keySet()) {
        List<String> outputWords = phrases.get(phrase);
        TokenStream ts = analyzer.tokenStream("test", phrase);
        CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        for (String expected : outputWords) {
            assertThat(ts.incrementToken(), equalTo(true));
            assertThat(term1.toString(), equalTo(expected));
        }
        ts.close();
    }
}
Author: anaelcarvalho, project: elasticsearch-analysis-metaphone_ptBR, source: MetaphoneTokenFilterTests.java
Example 8: analyze
import org.apache.lucene.analysis.Analyzer; // import the package/class this method depends on
public static int analyze(Analyzer analyzer, BytesRef toAnalyze, String field, TokenConsumer consumer, CharsRefBuilder spare)
        throws IOException {
    spare.copyUTF8Bytes(toAnalyze);
    CharsRef charsRef = spare.get();
    try (TokenStream ts = analyzer.tokenStream(
            field, new FastCharArrayReader(charsRef.chars, charsRef.offset, charsRef.length))) {
        return analyze(ts, consumer);
    }
}
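Note the overload being used here: tokenStream(String field, Reader reader) rather than tokenStream(String field, String text). FastCharArrayReader is Elasticsearch-internal; a minimal sketch of the same call with plain java.io.StringReader (analyzer is assumed to be in scope):

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;

try (TokenStream ts = analyzer.tokenStream("body", new StringReader("text to analyze"))) {
    ts.reset();
    while (ts.incrementToken()) {
        // consume attributes here
    }
    ts.end();
}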
Example 9: countPositions
import org.apache.lucene.analysis.Analyzer; // import the package/class this method depends on
/**
 * Count position increments in a token stream. Package private for testing.
 * @param analyzer analyzer to create token stream
 * @param fieldName field name to pass to analyzer
 * @param fieldValue field value to pass to analyzer
 * @return number of position increments in a token stream
 * @throws IOException if tokenStream throws it
 */
static int countPositions(Analyzer analyzer, String fieldName, String fieldValue) throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream(fieldName, fieldValue)) {
        int count = 0;
        PositionIncrementAttribute position = tokenStream.addAttribute(PositionIncrementAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            count += position.getPositionIncrement();
        }
        tokenStream.end();
        count += position.getPositionIncrement(); // end() exposes increments of trailing removed tokens
        return count;
    }
}
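Why read getPositionIncrement() once more after end()? Filters such as StopFilter report positions skipped at the very end of the stream there, so trailing removed tokens are still counted. A hypothetical usage sketch, assuming Lucene 5+ where StandardAnalyzer accepts a CharArraySet of stop words:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

// "the" is removed from the stream but still consumes a position
// ("quick" then carries an increment of 2), so all four positions count.
Analyzer analyzer = new StandardAnalyzer(EnglishAnalyzer.getDefaultStopSet());
int positions = countPositions(analyzer, "body", "the quick brown fox"); // expected: 4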
Example 10: tokenStream
import org.apache.lucene.analysis.Analyzer; // import the package/class this method depends on
@Override
public TokenStream tokenStream(Analyzer analyzer, TokenStream previous) {
    TokenStream ts = analyzer.tokenStream(name(), stringValue());
    if (boost != 1.0f && fieldType().indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
        // TODO: we should be able to reuse "previous" if it's instanceof AllTokenStream?
        // but we need to be careful this optimization is safe (and tested)...
        // AllTokenStream maps boost to 4-byte payloads, so we only need to use it if any field had
        // non-default (!= 1.0f) boost and if positions are indexed:
        return new AllTokenStream(ts, boost);
    }
    return ts;
}
Example 11: assertLuceneAnalyzersAreNotClosed
import org.apache.lucene.analysis.Analyzer; // import the package/class this method depends on
private void assertLuceneAnalyzersAreNotClosed(Map<PreBuiltAnalyzers, List<Version>> loadedAnalyzers) throws IOException {
    for (Map.Entry<PreBuiltAnalyzers, List<Version>> preBuiltAnalyzerEntry : loadedAnalyzers.entrySet()) {
        for (Version version : preBuiltAnalyzerEntry.getValue()) {
            Analyzer analyzer = preBuiltAnalyzerEntry.getKey().getCache().get(version);
            try (TokenStream stream = analyzer.tokenStream("foo", "bar")) {
                stream.reset();
                while (stream.incrementToken()) {
                    // drain the stream; a closed analyzer would have thrown by now
                }
                stream.end();
            }
        }
    }
}
Example 12: assertAnalyzes
import org.apache.lucene.analysis.Analyzer; // import the package/class this method depends on
private void assertAnalyzes(Analyzer analyzer, String field, String output) throws IOException {
    try (TokenStream tok = analyzer.tokenStream(field, new StringReader(""))) {
        CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
        tok.reset(); // the TokenStream contract requires reset() before incrementToken()
        assertTrue(tok.incrementToken());
        assertEquals(output, term.toString());
    }
}
Example 13: testAnalyzer
import org.apache.lucene.analysis.Analyzer; // import the package/class this method depends on
private void testAnalyzer(String source, String... expected_terms) throws IOException {
    TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), Settings.EMPTY, new AnalysisStempelPlugin());
    Analyzer analyzer = analysis.indexAnalyzers.get("polish").analyzer();
    TokenStream ts = analyzer.tokenStream("test", source);
    CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    for (String expected : expected_terms) {
        assertThat(ts.incrementToken(), equalTo(true));
        assertThat(term1.toString(), equalTo(expected));
    }
}
Example 14: testAnalyzer
import org.apache.lucene.analysis.Analyzer; // import the package/class this method depends on
private static void testAnalyzer(String source, String... expected_terms) throws IOException {
    TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), Settings.EMPTY, new AnalysisUkrainianPlugin());
    Analyzer analyzer = analysis.indexAnalyzers.get("ukrainian").analyzer();
    TokenStream ts = analyzer.tokenStream("test", source);
    CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    for (String expected : expected_terms) {
        assertThat(ts.incrementToken(), equalTo(true));
        assertThat(term1.toString(), equalTo(expected));
    }
    assertThat(ts.incrementToken(), equalTo(false));
}