This article collects typical usage examples of the Java method org.apache.lucene.index.Terms.iterator. If you are wondering what Terms.iterator does, how to use it, or where it is used in practice, the curated code samples below may help. You can also explore further usage examples of the enclosing class, org.apache.lucene.index.Terms.
The following presents 15 code examples of the Terms.iterator method, sorted by popularity by default.
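Before the examples, a minimal self-contained sketch of the typical call pattern may help. Note that newer Lucene releases expose a no-argument Terms.iterator(), while older releases (Lucene 4.x/5.0) use iterator(TermsEnum reuse); both styles appear in the examples below. The index path and the field name "body" here are placeholders, not taken from any example:

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class TermsIteratorSketch {
    public static void main(String[] args) throws Exception {
        // Open an existing index; the path is a placeholder.
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            // Terms are stored per segment, so walk each leaf reader individually.
            for (LeafReaderContext leaf : reader.leaves()) {
                Terms terms = leaf.reader().terms("body"); // null if the field has no terms
                if (terms == null) {
                    continue;
                }
                TermsEnum te = terms.iterator(); // newer Lucene; older versions: terms.iterator(null)
                BytesRef term;
                while ((term = te.next()) != null) { // next() returns null once exhausted
                    System.out.println(term.utf8ToString() + " docFreq=" + te.docFreq());
                }
            }
        }
    }
}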
Example 1: DfsOnlyRequest
import org.apache.lucene.index.Terms; // import the class required by this method
public DfsOnlyRequest(Fields termVectorsFields, String[] indices, String[] types, Set<String> selectedFields) throws IOException {
    super(indices);
    // build a search request with a query of all the terms
    final BoolQueryBuilder boolBuilder = boolQuery();
    for (String fieldName : termVectorsFields) {
        if ((selectedFields != null) && (!selectedFields.contains(fieldName))) {
            continue;
        }
        Terms terms = termVectorsFields.terms(fieldName);
        TermsEnum iterator = terms.iterator();
        while (iterator.next() != null) {
            String text = iterator.term().utf8ToString();
            boolBuilder.should(QueryBuilders.termQuery(fieldName, text));
        }
    }
    // wrap a search request object
    this.searchRequest = new SearchRequest(indices).types(types).source(new SearchSourceBuilder().query(boolBuilder));
}
Example 2: DirectCandidateGenerator
import org.apache.lucene.index.Terms; // import the class required by this method
public DirectCandidateGenerator(DirectSpellChecker spellchecker, String field, SuggestMode suggestMode, IndexReader reader,
        double nonErrorLikelihood, int numCandidates, Analyzer preFilter, Analyzer postFilter, Terms terms) throws IOException {
    if (terms == null) {
        throw new IllegalArgumentException("generator field [" + field + "] doesn't exist");
    }
    this.spellchecker = spellchecker;
    this.field = field;
    this.numCandidates = numCandidates;
    this.suggestMode = suggestMode;
    this.reader = reader;
    final long dictSize = terms.getSumTotalTermFreq();
    this.useTotalTermFrequency = dictSize != -1;
    this.dictSize = dictSize == -1 ? reader.maxDoc() : dictSize;
    this.preFilter = preFilter;
    this.postFilter = postFilter;
    this.nonErrorLikelihood = nonErrorLikelihood;
    float thresholdFrequency = spellchecker.getThresholdFrequency();
    this.frequencyPlateau = thresholdFrequency >= 1.0f ? (int) thresholdFrequency : (int) (dictSize * thresholdFrequency);
    termsEnum = terms.iterator();
}
Example 3: QueryAutoStopWordAnalyzer
import org.apache.lucene.index.Terms; // import the class required by this method
/**
 * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
 * given selection of fields from terms with a document frequency greater than
 * the given maxDocFreq
 *
 * @param delegate Analyzer whose TokenStream will be filtered
 * @param indexReader IndexReader to identify the stopwords from
 * @param fields Selection of fields to calculate stopwords for
 * @param maxDocFreq Document frequency terms should be above in order to be stopwords
 * @throws IOException Can be thrown while reading from the IndexReader
 */
public QueryAutoStopWordAnalyzer(
        Analyzer delegate,
        IndexReader indexReader,
        Collection<String> fields,
        int maxDocFreq) throws IOException {
    super(delegate.getReuseStrategy());
    this.delegate = delegate;
    for (String field : fields) {
        Set<String> stopWords = new HashSet<>();
        Terms terms = MultiFields.getTerms(indexReader, field);
        CharsRefBuilder spare = new CharsRefBuilder();
        if (terms != null) {
            TermsEnum te = terms.iterator(null);
            BytesRef text;
            while ((text = te.next()) != null) {
                if (te.docFreq() > maxDocFreq) {
                    spare.copyUTF8Bytes(text);
                    stopWords.add(spare.toString());
                }
            }
        }
        stopWordsPerField.put(field, stopWords);
    }
}
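As a follow-up, here is a hedged usage sketch for the constructor above (not from the original source; the Directory argument, the field name "body", and the maxDocFreq threshold of 100 are illustrative assumptions):

import java.util.Collections;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.query.QueryAutoStopWordAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;

public class StopWordAnalyzerUsage {
    public static Analyzer build(Directory directory) throws Exception {
        // Treat any "body" term that occurs in more than 100 documents as a query-time stop word.
        try (IndexReader reader = DirectoryReader.open(directory)) {
            // The constructor reads the stop words eagerly (see above), so the
            // reader may be closed once the analyzer has been constructed.
            return new QueryAutoStopWordAnalyzer(
                    new StandardAnalyzer(), reader, Collections.singleton("body"), 100);
        }
    }
}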
Example 4: getTermsEnum
import org.apache.lucene.index.Terms; // import the class required by this method
/** Return a {@link TermsEnum} intersecting the provided {@link Terms}
 * with the terms accepted by this automaton. */
public TermsEnum getTermsEnum(Terms terms) throws IOException {
    switch (type) {
        case NONE:
            return TermsEnum.EMPTY;
        case ALL:
            return terms.iterator(null);
        case SINGLE:
            return new SingleTermsEnum(terms.iterator(null), term);
        case PREFIX:
            // TODO: this is very likely faster than .intersect,
            // but we should test and maybe cutover
            return new PrefixTermsEnum(terms.iterator(null), term);
        case NORMAL:
            return terms.intersect(this, null);
        default:
            // unreachable
            throw new RuntimeException("unhandled case");
    }
}
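A hedged usage sketch for this dispatch method (the IndexReader variable and the field name "body" are assumptions; iterator(null) in the example above indicates the Lucene 4.x/5.0 API, and MultiFields.getTerms belongs to the same era):

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RegExp;

public class AutomatonTermsUsage {
    public static void printMatchingTerms(IndexReader reader) throws Exception {
        // Compile "fo.*"; CompiledAutomaton classifies the automaton (PREFIX, here) on its own.
        CompiledAutomaton compiled = new CompiledAutomaton(new RegExp("fo.*").toAutomaton());
        Terms terms = MultiFields.getTerms(reader, "body");
        if (terms != null) {
            TermsEnum matching = compiled.getTermsEnum(terms); // dispatches as in the switch above
            BytesRef term;
            while ((term = matching.next()) != null) {
                System.out.println(term.utf8ToString());
            }
        }
    }
}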
Example 5: createCandidateQuery
import org.apache.lucene.index.Terms; // import the class required by this method
Query createCandidateQuery(IndexReader indexReader) throws IOException {
    List<BytesRef> extractedTerms = new ArrayList<>();
    LeafReader reader = indexReader.leaves().get(0).reader();
    Fields fields = reader.fields();
    for (String field : fields) {
        Terms terms = fields.terms(field);
        if (terms == null) {
            continue;
        }
        BytesRef fieldBr = new BytesRef(field);
        TermsEnum tenum = terms.iterator();
        for (BytesRef term = tenum.next(); term != null; term = tenum.next()) {
            BytesRefBuilder builder = new BytesRefBuilder();
            builder.append(fieldBr);
            builder.append(FIELD_VALUE_SEPARATOR);
            builder.append(term);
            extractedTerms.add(builder.toBytesRef());
        }
    }
    Query extractionSuccess = new TermInSetQuery(queryTermsField.name(), extractedTerms);
    // include extractionResultField:failed, because docs with this term have no extractedTermsField
    // and otherwise we would fail to return these docs. Docs that failed query term extraction
    // always need to be verified by MemoryIndex:
    Query extractionFailure = new TermQuery(new Term(extractionResultField.name(), EXTRACTION_FAILED));
    return new BooleanQuery.Builder()
            .add(extractionSuccess, Occur.SHOULD)
            .add(extractionFailure, Occur.SHOULD)
            .build();
}
Example 6: beforeLoad
import org.apache.lucene.index.Terms; // import the class required by this method
/**
 * Determine whether the BlockTreeTermsReader.FieldReader can be used
 * for estimating the field data, adding the estimate to the circuit
 * breaker if it can, otherwise wrapping the terms in a
 * RamAccountingTermsEnum to be estimated on a per-term basis.
 *
 * @param terms terms to be estimated
 * @return A possibly wrapped TermsEnum for the terms
 */
@Override
public TermsEnum beforeLoad(Terms terms) throws IOException {
    LeafReader reader = context.reader();
    TermsEnum iterator = terms.iterator();
    TermsEnum filteredIterator = filter(terms, iterator, reader);
    final boolean filtered = iterator != filteredIterator;
    iterator = filteredIterator;
    if (filtered) {
        if (logger.isTraceEnabled()) {
            logger.trace("Filter exists, can't circuit break normally, using RamAccountingTermsEnum");
        }
        return new RamAccountingTermsEnum(iterator, breaker, this, this.fieldName);
    } else {
        estimatedBytes = this.estimateStringFieldData();
        // If we weren't able to estimate, wrap in the RamAccountingTermsEnum
        if (estimatedBytes == 0) {
            iterator = new RamAccountingTermsEnum(iterator, breaker, this, this.fieldName);
        } else {
            breaker.addEstimateBytesAndMaybeBreak(estimatedBytes, fieldName);
        }
        return iterator;
    }
}
Example 7: getTermsEnum
import org.apache.lucene.index.Terms; // import the class required by this method
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
    TermsEnum tenum = terms.iterator(null);
    if (prefix.bytes().length == 0) {
        // no prefix -- match all terms for this field:
        return tenum;
    }
    return new PrefixTermsEnum(tenum, prefix.bytes());
}
Example 8: getPrefixTerms
import org.apache.lucene.index.Terms; // import the class required by this method
private void getPrefixTerms(ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader) throws IOException {
    // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment into one terms
    // instance, which is very expensive. Therefore I think it is better to iterate over each leaf individually.
    List<LeafReaderContext> leaves = reader.leaves();
    for (LeafReaderContext leaf : leaves) {
        Terms _terms = leaf.reader().terms(field);
        if (_terms == null) {
            continue;
        }
        TermsEnum termsEnum = _terms.iterator();
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes());
        if (TermsEnum.SeekStatus.END == seekStatus) {
            continue;
        }
        for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) {
            if (!StringHelper.startsWith(term, prefix.bytes())) {
                break;
            }
            terms.add(new Term(field, BytesRef.deepCopyOf(term)));
            if (terms.size() >= maxExpansions) {
                return;
            }
        }
    }
}
Example 9: buildField
import org.apache.lucene.index.Terms; // import the class required by this method
private void buildField(XContentBuilder builder, final CharsRefBuilder spare, Fields theFields, Iterator<String> fieldIter) throws IOException {
    String fieldName = fieldIter.next();
    builder.startObject(fieldName);
    Terms curTerms = theFields.terms(fieldName);
    // write field statistics
    buildFieldStatistics(builder, curTerms);
    builder.startObject(FieldStrings.TERMS);
    TermsEnum termIter = curTerms.iterator();
    BoostAttribute boostAtt = termIter.attributes().addAttribute(BoostAttribute.class);
    for (int i = 0; i < curTerms.size(); i++) {
        buildTerm(builder, spare, curTerms, termIter, boostAtt);
    }
    builder.endObject();
    builder.endObject();
}
Example 10: checkBrownFoxTermVector
import org.apache.lucene.index.Terms; // import the class required by this method
private void checkBrownFoxTermVector(Fields fields, String fieldName, boolean withPayloads) throws IOException {
    String[] values = {"brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the"};
    int[] freq = {1, 1, 1, 1, 1, 1, 1, 2};
    int[][] pos = {{2}, {8}, {3}, {4}, {7}, {5}, {1}, {0, 6}};
    int[][] startOffset = {{10}, {40}, {16}, {20}, {35}, {26}, {4}, {0, 31}};
    int[][] endOffset = {{15}, {43}, {19}, {25}, {39}, {30}, {9}, {3, 34}};
    Terms terms = fields.terms(fieldName);
    assertThat(terms.size(), equalTo(8L));
    TermsEnum iterator = terms.iterator();
    for (int j = 0; j < values.length; j++) {
        String string = values[j];
        BytesRef next = iterator.next();
        assertThat(next, notNullValue());
        assertThat("expected " + string, string, equalTo(next.utf8ToString()));
        // do not test ttf or doc frequency, because here we have many
        // shards and do not know how documents are distributed
        PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        assertThat(freq[j], equalTo(docsAndPositions.freq()));
        int[] termPos = pos[j];
        int[] termStartOffset = startOffset[j];
        int[] termEndOffset = endOffset[j];
        assertThat(termPos.length, equalTo(freq[j]));
        assertThat(termStartOffset.length, equalTo(freq[j]));
        assertThat(termEndOffset.length, equalTo(freq[j]));
        for (int k = 0; k < freq[j]; k++) {
            int nextPosition = docsAndPositions.nextPosition();
            assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
            assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
            assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
            if (withPayloads) {
                assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
            }
        }
    }
    assertThat(iterator.next(), nullValue());
}
Example 11: testArtificialNoDoc
import org.apache.lucene.index.Terms; // import the class required by this method
public void testArtificialNoDoc() throws IOException {
    // setup indices
    Settings.Builder settings = Settings.builder()
            .put(indexSettings())
            .put("index.analysis.analyzer", "standard");
    assertAcked(prepareCreate("test")
            .setSettings(settings)
            .addMapping("type1", "field1", "type=text"));
    ensureGreen();
    // request tvs from artificial document
    String text = "the quick brown fox jumps over the lazy dog";
    TermVectorsResponse resp = client().prepareTermVectors()
            .setIndex("test")
            .setType("type1")
            .setDoc(jsonBuilder()
                    .startObject()
                    .field("field1", text)
                    .endObject())
            .setOffsets(true)
            .setPositions(true)
            .setFieldStatistics(true)
            .setTermStatistics(true)
            .get();
    assertThat(resp.isExists(), equalTo(true));
    checkBrownFoxTermVector(resp.getFields(), "field1", false);
    // Since the index is empty, all of the artificial document's "term_statistics" should be 0/absent
    Terms terms = resp.getFields().terms("field1");
    assertEquals("sumDocFreq should be 0 for a non-existing field!", 0, terms.getSumDocFreq());
    assertEquals("sumTotalTermFreq should be 0 for a non-existing field!", 0, terms.getSumTotalTermFreq());
    TermsEnum termsEnum = terms.iterator(); // we're guaranteed to receive terms for that field
    while (termsEnum.next() != null) {
        String term = termsEnum.term().utf8ToString();
        assertEquals("term [" + term + "] does not exist in the index; ttf should be 0!", 0, termsEnum.totalTermFreq());
    }
}
Example 12: visitMatchingTerms
import org.apache.lucene.index.Terms; // import the class required by this method
@Override
public void visitMatchingTerms(
        IndexReader reader,
        String fieldName,
        MatchingTermVisitor mtv) throws IOException {
    int prefixLength = prefix.length();
    Terms terms = MultiFields.getTerms(reader, fieldName);
    if (terms != null) {
        Matcher matcher = pattern.matcher("");
        try {
            TermsEnum termsEnum = terms.iterator(null);
            TermsEnum.SeekStatus status = termsEnum.seekCeil(prefixRef);
            BytesRef text;
            if (status == TermsEnum.SeekStatus.FOUND) {
                text = prefixRef;
            } else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
                text = termsEnum.term();
            } else {
                text = null;
            }
            while (text != null) {
                if (StringHelper.startsWith(text, prefixRef)) {
                    String textString = text.utf8ToString();
                    matcher.reset(textString.substring(prefixLength));
                    if (matcher.matches()) {
                        mtv.visitMatchingTerm(new Term(fieldName, textString));
                    }
                } else {
                    break;
                }
                text = termsEnum.next();
            }
        } finally {
            matcher.reset();
        }
    }
}
Example 13: getTermsEnum
import org.apache.lucene.index.Terms; // import the class required by this method
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
    if (maxEdits == 0 || prefixLength >= term.text().length()) { // can only match if it's exact
        return new SingleTermsEnum(terms.iterator(null), term.bytes());
    }
    return new FuzzyTermsEnum(terms, atts, getTerm(), maxEdits, prefixLength, transpositions);
}
Example 14: termsEnum
import org.apache.lucene.index.Terms; // import the class required by this method
@Override
public TermsEnum termsEnum(Terms terms) throws IOException {
    return terms.iterator(null);
}
Example 15: checkTermTexts
import org.apache.lucene.index.Terms; // import the class required by this method
private void checkTermTexts(Terms terms, String[] expectedTexts) throws IOException {
    final TermsEnum termsEnum = terms.iterator();
    for (String expectedText : expectedTexts) {
        assertThat(termsEnum.next().utf8ToString(), equalTo(expectedText));
    }
}