This page collects typical usage examples of the Java method org.apache.lucene.analysis.TokenStream.incrementToken, drawn from open-source projects. If you are unsure what TokenStream.incrementToken does or how to call it, the curated examples below should help; you can also consult the documentation of the enclosing class, org.apache.lucene.analysis.TokenStream.
The sections below show 15 code examples of TokenStream.incrementToken, ordered roughly by popularity.
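Before reading the project excerpts, it helps to keep the general consumption contract in mind: obtain a stream from an analyzer, register the attributes you need, call reset(), loop on incrementToken() until it returns false, then call end() and close(). The following is a minimal, self-contained sketch of that contract, not taken from any of the projects below; StandardAnalyzer, the field name "body", and the sample text are arbitrary choices, and the no-argument analyzer constructor assumes a reasonably recent Lucene version (5.x or later).

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenDemo {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream stream = analyzer.tokenStream("body", "Hello Lucene token streams")) {
            // Attributes must be registered before the stream is consumed.
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                   // mandatory before the first incrementToken()
            while (stream.incrementToken()) { // returns false once the stream is exhausted
                System.out.println(term.toString());
            }
            stream.end();                     // records the final offset state
        }                                     // try-with-resources closes the stream and the analyzer
    }
}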
Example 1: lemmatize
import org.apache.lucene.analysis.TokenStream; // import the package/class this method depends on
protected String lemmatize(String query) {
    ItalianAnalyzer analyzer = new ItalianAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("label", query);
    StringBuilder sb = new StringBuilder();
    CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
        // Join the stemmed tokens back into a single space-separated string.
        while (tokenStream.incrementToken()) {
            if (sb.length() > 0) {
                sb.append(" ");
            }
            sb.append(token.toString());
        }
        tokenStream.end();
        tokenStream.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
    analyzer.close();
    return sb.toString();
}
Example 2: splitByTokenizer
import org.apache.lucene.analysis.TokenStream; // import the package/class this method depends on
private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory) throws IOException {
    StringReader reader = new StringReader(source);
    TokenStream ts = loadTokenizer(tokFactory, reader);
    List<String> tokList = new ArrayList<>();
    try {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            if (termAtt.length() > 0)
                tokList.add(termAtt.toString());
        }
    } finally {
        reader.close();
    }
    return tokList;
}
Example 3: getFilter
import org.apache.lucene.analysis.TokenStream; // import the package/class this method depends on
@Override
public Filter getFilter(Element e) throws ParserException {
    List<BytesRef> terms = new ArrayList<>();
    String text = DOMUtils.getNonBlankTextOrFail(e);
    String fieldName = DOMUtils.getAttributeWithInheritanceOrFail(e, "fieldName");

    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream(fieldName, text);
        TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();
        ts.reset();
        while (ts.incrementToken()) {
            termAtt.fillBytesRef();
            terms.add(BytesRef.deepCopyOf(bytes));
        }
        ts.end();
    } catch (IOException ioe) {
        throw new RuntimeException("Error constructing terms from index:" + ioe);
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
    return new TermsFilter(fieldName, terms);
}
Example 4: assertOffsets
import org.apache.lucene.analysis.TokenStream; // import the package/class this method depends on
static private void assertOffsets(String inputStr, TokenStream tokenStream, List<String> expected) {
    try {
        List<String> termList = new ArrayList<String>();
        // CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttr = tokenStream.addAttribute(OffsetAttribute.class);
        while (tokenStream.incrementToken()) {
            int start = offsetAttr.startOffset();
            int end = offsetAttr.endOffset();
            termList.add(inputStr.substring(start, end));
        }
        System.out.println(String.join(" ", termList));
        assertThat(termList, is(expected));
    } catch (IOException e) {
        assertTrue(false);
    }
}
Example 5: analyze
import org.apache.lucene.analysis.TokenStream; // import the package/class this method depends on
/** NOTE: this method closes the TokenStream, even on exception, which is awkward
 *  because really the caller who called {@link Analyzer#tokenStream} should close it,
 *  but when trying that there are recursion issues when we try to use the same
 *  TokenStream twice in the same recursion... */
public static int analyze(TokenStream stream, TokenConsumer consumer) throws IOException {
    int numTokens = 0;
    boolean success = false;
    try {
        stream.reset();
        consumer.reset(stream);
        while (stream.incrementToken()) {
            consumer.nextToken();
            numTokens++;
        }
        consumer.end();
        success = true; // only reached when the stream was fully consumed without error
    } finally {
        if (success) {
            stream.close();
        } else {
            IOUtils.closeWhileHandlingException(stream);
        }
    }
    return numTokens;
}
Example 6: test
import org.apache.lucene.analysis.TokenStream; // import the package/class this method depends on
@Test
public void test() throws IOException {
    TokenStream ts = analyzer.tokenStream("field", new StringReader("大大大战争"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // required before the first call to incrementToken()
    while (ts.incrementToken()) {
        // print the term text rather than the stream object
        System.out.println("token : " + term);
    }
    ts.end();
    ts.close();
}
Example 7: analysisResult
import org.apache.lucene.analysis.TokenStream; // import the package/class this method depends on
/**
 * Collects the tokenization result produced by the given analyzer.
 *
 * @param analyzer the analyzer to use
 * @param keyWord  the text to analyze
 * @throws Exception
 */
private static List<String> analysisResult(Analyzer analyzer, String keyWord)
        throws Exception {
    TokenStream tokenStream = analyzer.tokenStream("content",
            new StringReader(keyWord));
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    List<String> stringList = new ArrayList<String>();
    tokenStream.reset(); // the stream must be reset before the first incrementToken()
    while (tokenStream.incrementToken()) {
        stringList.add(charTermAttribute.toString());
    }
    tokenStream.end();
    tokenStream.close();
    return stringList;
}
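As a hedged illustration of how the helper above might be invoked (this call site is not part of the original project; StandardAnalyzer, its no-argument constructor, and the sample sentence are assumptions):

public static void main(String[] args) throws Exception {
    // Hypothetical caller in the same class; the exact output depends on the Lucene version and stop-word set.
    try (Analyzer analyzer = new StandardAnalyzer()) {
        List<String> tokens = analysisResult(analyzer, "Lucene makes full-text search easy");
        System.out.println(tokens); // roughly: [lucene, makes, full, text, search, easy]
    }
}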
Example 8: analyze
import org.apache.lucene.analysis.TokenStream; // import the package/class this method depends on
private void analyze(TokenStream stream, Analyzer analyzer, String field, Set<String> includeAttributes) {
    try {
        stream.reset();
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        TypeAttribute type = stream.addAttribute(TypeAttribute.class);
        PositionLengthAttribute posLen = stream.addAttribute(PositionLengthAttribute.class);

        while (stream.incrementToken()) {
            int increment = posIncr.getPositionIncrement();
            if (increment > 0) {
                lastPosition = lastPosition + increment;
            }
            tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(),
                lastOffset + offset.endOffset(), posLen.getPositionLength(), type.type(), extractExtendedAttributes(stream, includeAttributes)));
        }
        stream.end();
        lastOffset += offset.endOffset();
        lastPosition += posIncr.getPositionIncrement();

        lastPosition += analyzer.getPositionIncrementGap(field);
        lastOffset += analyzer.getOffsetGap(field);
    } catch (IOException e) {
        throw new ElasticsearchException("failed to analyze", e);
    } finally {
        IOUtils.closeWhileHandlingException(stream);
    }
}
Example 9: termsFromTokenStream
import org.apache.lucene.analysis.TokenStream; // import the package/class this method depends on
private String[] termsFromTokenStream(TokenStream stream) throws IOException {
    List<String> outputTemp = new ArrayList<>();
    CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        outputTemp.add(charTermAttribute.toString());
    }
    stream.end();
    stream.close();
    return outputTemp.toArray(new String[0]);
}
Source: sebastian-hofstaetter/ir-generalized-translation-models, SimilarityApiParser.java
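A hedged usage sketch for the helper above (not taken from the project; EnglishAnalyzer, the field name, and the sample text are assumptions):

// Hypothetical call site inside the same class; assumes a recent Lucene version with a no-argument EnglishAnalyzer.
Analyzer analyzer = new EnglishAnalyzer();
String[] terms = termsFromTokenStream(analyzer.tokenStream("body", "translation models for retrieval"));
// roughly: ["translat", "model", "retriev"] after lower-casing, stop-word removal and stemming
analyzer.close();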
Example 10: assertSimpleTSOutput
import org.apache.lucene.analysis.TokenStream; // import the package/class this method depends on
public static void assertSimpleTSOutput(TokenStream stream,
        String[] expected) throws IOException {
    stream.reset();
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    assertThat(termAttr, notNullValue());
    int i = 0;
    while (stream.incrementToken()) {
        assertThat(expected.length, greaterThan(i));
        assertThat("expected different term at index " + i, expected[i++], equalTo(termAttr.toString()));
    }
    assertThat("not all tokens produced", i, equalTo(expected.length));
}
Example 11: getSpanQuery
import org.apache.lucene.analysis.TokenStream; // import the package/class this method depends on
@Override
public SpanQuery getSpanQuery(Element e) throws ParserException {
    String fieldName = DOMUtils.getAttributeWithInheritanceOrFail(e, "fieldName");
    String value = DOMUtils.getNonBlankTextOrFail(e);

    List<SpanQuery> clausesList = new ArrayList<>();

    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream(fieldName, value);
        TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();
        ts.reset();
        while (ts.incrementToken()) {
            termAtt.fillBytesRef();
            SpanTermQuery stq = new SpanTermQuery(new Term(fieldName, BytesRef.deepCopyOf(bytes)));
            clausesList.add(stq);
        }
        ts.end();
        SpanOrQuery soq = new SpanOrQuery(clausesList.toArray(new SpanQuery[clausesList.size()]));
        soq.setBoost(DOMUtils.getAttribute(e, "boost", 1.0f));
        return soq;
    } catch (IOException ioe) {
        throw new ParserException("IOException parsing value:" + value);
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}
Example 12: analyze
import org.apache.lucene.analysis.TokenStream; // import the package/class this method depends on
private void analyze(TokenStream stream, Analyzer analyzer, String field, Set<String> includeAttributes) {
    try {
        stream.reset();
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        TypeAttribute type = stream.addAttribute(TypeAttribute.class);

        while (stream.incrementToken()) {
            int increment = posIncr.getPositionIncrement();
            if (increment > 0) {
                lastPosition = lastPosition + increment;
            }
            tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(),
                lastOffset + offset.endOffset(), type.type(), extractExtendedAttributes(stream, includeAttributes)));
        }
        stream.end();
        lastOffset += offset.endOffset();
        lastPosition += posIncr.getPositionIncrement();

        lastPosition += analyzer.getPositionIncrementGap(field);
        lastOffset += analyzer.getOffsetGap(field);
    } catch (IOException e) {
        throw new ElasticsearchException("failed to analyze", e);
    } finally {
        IOUtils.closeWhileHandlingException(stream);
    }
}
Example 13: lemmatize
import org.apache.lucene.analysis.TokenStream; // import the package/class this method depends on
public static String lemmatize(String query) {
    StringBuilder sb = new StringBuilder();
    ItalianAnalyzer analyzer = new ItalianAnalyzer(Version.LUCENE_44);
    TokenStream tokenStream;
    try {
        tokenStream = analyzer.tokenStream("label", query);
        CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            if (sb.length() > 0) {
                sb.append(" ");
            }
            sb.append(token.toString());
        }
        analyzer.close();
    } catch (IOException e) {
        log.error(e.getMessage(), e);
        sb = new StringBuilder();
        sb.append(query);
    }
    return sb.toString();
}
Example 14: analyzeSingleChunk
import org.apache.lucene.analysis.TokenStream; // import the package/class this method depends on
/**
 * Returns the analyzed form for the given chunk.
 *
 * If the analyzer produces more than one output token from the given chunk,
 * a ParseException is thrown.
 *
 * @param field The target field
 * @param termStr The full term from which the given chunk is excerpted
 * @param chunk The portion of the given termStr to be analyzed
 * @return The result of analyzing the given chunk
 * @throws ParseException when analysis returns other than one output token
 */
protected String analyzeSingleChunk(String field, String termStr, String chunk) throws ParseException {
    String analyzed = null;
    TokenStream stream = null;
    try {
        stream = getAnalyzer().tokenStream(field, chunk);
        stream.reset();
        CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
        // get first and hopefully only output token
        if (stream.incrementToken()) {
            analyzed = termAtt.toString();

            // try to increment again, there should only be one output token
            StringBuilder multipleOutputs = null;
            while (stream.incrementToken()) {
                if (null == multipleOutputs) {
                    multipleOutputs = new StringBuilder();
                    multipleOutputs.append('"');
                    multipleOutputs.append(analyzed);
                    multipleOutputs.append('"');
                }
                multipleOutputs.append(',');
                multipleOutputs.append('"');
                multipleOutputs.append(termAtt.toString());
                multipleOutputs.append('"');
            }
            stream.end();
            if (null != multipleOutputs) {
                throw new ParseException(
                    String.format(getLocale(),
                        "Analyzer created multiple terms for \"%s\": %s", chunk, multipleOutputs.toString()));
            }
        } else {
            // nothing returned by analyzer. Was it a stop word and the user accidentally
            // used an analyzer with stop words?
            stream.end();
            throw new ParseException(String.format(getLocale(), "Analyzer returned nothing for \"%s\"", chunk));
        }
    } catch (IOException e) {
        throw new ParseException(
            String.format(getLocale(), "IO error while trying to analyze single term: \"%s\"", termStr));
    } finally {
        IOUtils.closeWhileHandlingException(stream);
    }
    return analyzed;
}
Example 15: getQuery
import org.apache.lucene.analysis.TokenStream; // import the package/class this method depends on
@Override
public Query getQuery(Element e) throws ParserException {
    String fieldsList = e.getAttribute("fieldNames"); // a comma-delimited list of fields
    String fields[] = defaultFieldNames;
    if ((fieldsList != null) && (fieldsList.trim().length() > 0)) {
        fields = fieldsList.trim().split(",");
        // trim the fieldnames
        for (int i = 0; i < fields.length; i++) {
            fields[i] = fields[i].trim();
        }
    }

    // Parse any "stopWords" attribute
    // TODO MoreLikeThis needs to ideally have per-field stopWords lists - until then
    // I use all analyzers/fields to generate multi-field compatible stop list
    String stopWords = e.getAttribute("stopWords");
    Set<String> stopWordsSet = null;
    if ((stopWords != null) && (fields != null)) {
        stopWordsSet = new HashSet<>();
        for (String field : fields) {
            TokenStream ts = null;
            try {
                ts = analyzer.tokenStream(field, stopWords);
                CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
                ts.reset();
                while (ts.incrementToken()) {
                    stopWordsSet.add(termAtt.toString());
                }
                ts.end();
            } catch (IOException ioe) {
                throw new ParserException("IoException parsing stop words list in "
                    + getClass().getName() + ":" + ioe.getLocalizedMessage());
            } finally {
                IOUtils.closeWhileHandlingException(ts);
            }
        }
    }

    MoreLikeThisQuery mlt = new MoreLikeThisQuery(DOMUtils.getText(e), fields, analyzer, fields[0]);
    mlt.setMaxQueryTerms(DOMUtils.getAttribute(e, "maxQueryTerms", DEFAULT_MAX_QUERY_TERMS));
    mlt.setMinTermFrequency(DOMUtils.getAttribute(e, "minTermFrequency", DEFAULT_MIN_TERM_FREQUENCY));
    mlt.setPercentTermsToMatch(DOMUtils.getAttribute(e, "percentTermsToMatch", DEFAULT_PERCENT_TERMS_TO_MATCH) / 100);
    mlt.setStopWords(stopWordsSet);
    int minDocFreq = DOMUtils.getAttribute(e, "minDocFreq", -1);
    if (minDocFreq >= 0) {
        mlt.setMinDocFreq(minDocFreq);
    }
    mlt.setBoost(DOMUtils.getAttribute(e, "boost", 1.0f));
    return mlt;
}