This article collects typical usage examples of the Java method org.apache.lucene.index.MultiFields.getTermPositionsEnum. If you have been wondering what MultiFields.getTermPositionsEnum does, how to call it, and what real-world usages look like, the hand-picked code examples below should help. You can also read more about its containing class, org.apache.lucene.index.MultiFields.
The sections below show 6 code examples of MultiFields.getTermPositionsEnum, ordered by popularity.
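Before the examples, here is a minimal, self-contained sketch of the typical call pattern. It assumes a Lucene 5.x-style API, in which MultiFields.getTermPositionsEnum(IndexReader, String, BytesRef) returns a PostingsEnum that exposes positions, or null when the field or term does not exist in the index; the field name "content" and the term "lucene" are placeholders, and the method is shown without an enclosing class, as in the examples below.

import java.io.IOException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;

// Print every position of the term "lucene" in the field "content".
static void printPositions(Directory dir) throws IOException {
  try (IndexReader reader = DirectoryReader.open(dir)) {
    PostingsEnum pe = MultiFields.getTermPositionsEnum(reader, "content", new BytesRef("lucene"));
    if (pe == null) {
      return; // the field or term is not present in the index
    }
    while (pe.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      int freq = pe.freq();
      for (int i = 0; i < freq; i++) {
        System.out.printf("doc=%d pos=%d%n", pe.docID(), pe.nextPosition());
      }
    }
  }
}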
Example 1: getFeatures
import org.apache.lucene.index.MultiFields; // import of the package/class this method depends on
static double[] getFeatures(IndexReader ir, String fieldName, BytesRef rawPhrase, int docId, int docSize, int numDocs, boolean inc)
    throws IOException {
  PostingsEnum de = MultiFields.getTermDocsEnum(ir, fieldName, rawPhrase);
  int ret = de.advance(docId);
  if(ret == PostingsEnum.NO_MORE_DOCS){
    throw new RuntimeException("no more docs...");
  }
  else{
    int freq = de.freq();
    if(freq < 2) return null;   // discard phrases that occur only once in the document
    PostingsEnum pe = MultiFields.getTermPositionsEnum(ir, fieldName, rawPhrase);
    int ret2 = pe.advance(docId);
    if(ret2 == PostingsEnum.NO_MORE_DOCS){
      throw new RuntimeException("no more docs...");
    }
    else{
      double[] features = new double[2];
      int pos = pe.nextPosition();   // first position of the phrase in this document
      int docFreq = ir.docFreq(new Term(fieldName, rawPhrase));
      if(inc){
        docFreq++;
        numDocs++;
      }
      features[0] = Commons.calcTfIdf(freq, docSize, docFreq, numDocs);   // TF-IDF feature
      features[1] = Commons.calcFirstOccurrence(pos, docSize);            // first-occurrence feature
      return features;
    }
  }
}
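Note that de and pe are dereferenced without a null check here. With the Lucene 5.x-style API used in this example, MultiFields.getTermDocsEnum and MultiFields.getTermPositionsEnum return null when the term does not occur in the field, so callers that cannot guarantee the term exists should check for null first, as in the sketch at the top of this article.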
Example 2: initParents
import org.apache.lucene.index.MultiFields; // import of the package/class this method depends on
private void initParents(IndexReader reader, int first) throws IOException {
  if (reader.maxDoc() == first) {
    return;
  }

  // it's ok to use MultiFields because we only iterate on one posting list.
  // breaking it to loop over the leaves() only complicates code for no
  // apparent gain.
  DocsAndPositionsEnum positions = MultiFields.getTermPositionsEnum(reader, null,
      Consts.FIELD_PAYLOADS, Consts.PAYLOAD_PARENT_BYTES_REF,
      DocsAndPositionsEnum.FLAG_PAYLOADS);

  // shouldn't really happen, if it does, something's wrong
  if (positions == null || positions.advance(first) == DocIdSetIterator.NO_MORE_DOCS) {
    throw new CorruptIndexException("Missing parent data for category " + first);
  }

  int num = reader.maxDoc();
  for (int i = first; i < num; i++) {
    if (positions.docID() == i) {
      if (positions.freq() == 0) { // shouldn't happen
        throw new CorruptIndexException("Missing parent data for category " + i);
      }

      parents[i] = positions.nextPosition();

      if (positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
        if (i + 1 < num) {
          throw new CorruptIndexException("Missing parent data for category " + (i + 1));
        }
        break;
      }
    } else { // this shouldn't happen
      throw new CorruptIndexException("Missing parent data for category " + i);
    }
  }
}
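This example comes from Lucene's taxonomy (facet) index: each category document carries a single payload term (Consts.PAYLOAD_PARENT_BYTES_REF in the Consts.FIELD_PAYLOADS field), and the ordinal of the category's parent appears to be encoded as that term's position within the document. That is why positions.nextPosition() is stored directly into parents[i], and why a missing posting for any document is treated as index corruption.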
Example 3: buildModel
import org.apache.lucene.index.MultiFields; // import of the package/class this method depends on
static KEAModel buildModel(Map<String, Set<String>> knownKeyphrases) throws IOException {
  Directory indexDir = Commons.getLuceneDirectory(LUCENE_INDEX_DIR);
  IndexReader ir = DirectoryReader.open(indexDir);

  KEAModel model = new KEAModel(ir, knownKeyphrases);

  try{
    for(int n = 1; n <= 3; n++){
      System.out.printf("%s : building %d-gram model\n", new Date().toString(), n);
      String fieldName = Commons.getFieldName(FIELD_NAME, n);
      Terms terms = MultiFields.getTerms(ir, fieldName);
      TermsEnum te = terms.iterator();
      for(BytesRef rawPhrase = te.next(); rawPhrase != null; rawPhrase = te.next()){
        String phrase = rawPhrase.utf8ToString();
        // use KEAStopFilter instead
        //if(stopWords(phrase, n)) continue;
        //System.out.printf("%s ", phrase);
        PostingsEnum de = MultiFields.getTermDocsEnum(ir, fieldName, rawPhrase);
        while(de.nextDoc() != PostingsEnum.NO_MORE_DOCS){
          int docId = de.docID();
          int freq = de.freq();
          // Consider only terms that occur more than once in the document.
          // The KEA paper says: "To reduce the size of the training set, we discard any phrase that occurs only once in the document."
          if(freq > 1){
            PostingsEnum pe = MultiFields.getTermPositionsEnum(ir, fieldName, rawPhrase);
            int ret = pe.advance(docId);
            if(ret == PostingsEnum.NO_MORE_DOCS){
              System.out.printf("(NO_MORE_DOCS) %d\n", docId);
            }
            else{
              // get first position of the term in the doc (first occurrence)
              int pos = pe.nextPosition();
              model.add(docId, fieldName, phrase, freq, pos);
            }
          }
        }
      }
    }
  }
  finally{
    IOUtils.closeWhileHandlingException(ir);
  }
  return model;
}
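For orientation, a call site for buildModel might look like the following sketch. The map keys and keyphrase strings are made-up values, and apart from the buildModel signature shown above everything here is an assumption about how the surrounding project uses it.

Map<String, Set<String>> knownKeyphrases = new HashMap<>();
knownKeyphrases.put("doc-001", new HashSet<>(Arrays.asList("keyphrase extraction", "lucene")));
knownKeyphrases.put("doc-002", new HashSet<>(Arrays.asList("tf-idf")));
KEAModel model = buildModel(knownKeyphrases);   // builds 1- to 3-gram models over the index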
Example 4: testWickedLongTerm
import org.apache.lucene.index.MultiFields; // import of the package/class this method depends on
/**
 * Make sure we skip wicked long terms.
 */
public void testWickedLongTerm() throws IOException {
  RAMDirectory dir = new RAMDirectory();
  IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new ClassicAnalyzer()));

  char[] chars = new char[IndexWriter.MAX_TERM_LENGTH];
  Arrays.fill(chars, 'x');
  Document doc = new Document();
  final String bigTerm = new String(chars);

  // This produces a too-long term:
  String contents = "abc xyz x" + bigTerm + " another term";
  doc.add(new TextField("content", contents, Field.Store.NO));
  writer.addDocument(doc);

  // Make sure we can add another normal document
  doc = new Document();
  doc.add(new TextField("content", "abc bbb ccc", Field.Store.NO));
  writer.addDocument(doc);
  writer.close();

  IndexReader reader = IndexReader.open(dir);

  // Make sure all terms < max size were indexed
  assertEquals(2, reader.docFreq(new Term("content", "abc")));
  assertEquals(1, reader.docFreq(new Term("content", "bbb")));
  assertEquals(1, reader.docFreq(new Term("content", "term")));
  assertEquals(1, reader.docFreq(new Term("content", "another")));

  // Make sure position is still incremented when
  // massive term is skipped:
  DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(reader,
      MultiFields.getLiveDocs(reader),
      "content",
      new BytesRef("another"));
  assertTrue(tps.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(1, tps.freq());
  assertEquals(3, tps.nextPosition());

  // Make sure the doc that has the massive term is in
  // the index:
  assertEquals("document with wicked long term should is not in the index!", 2, reader.numDocs());
  reader.close();

  // Make sure we can add a document with exactly the
  // maximum length term, and search on that term:
  doc = new Document();
  doc.add(new TextField("content", bigTerm, Field.Store.NO));
  ClassicAnalyzer sa = new ClassicAnalyzer();
  sa.setMaxTokenLength(100000);
  writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, sa));
  writer.addDocument(doc);
  writer.close();

  reader = IndexReader.open(dir);
  assertEquals(1, reader.docFreq(new Term("content", bigTerm)));
  reader.close();

  dir.close();
}
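The position assertion works out because the first document tokenizes as abc (position 0), xyz (1), the oversized x…x token (2, dropped by IndexWriter because it exceeds MAX_TERM_LENGTH, but still consuming a position), and another (3), which is why tps.nextPosition() returns 3 for "another".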
Example 5: testCaching
import org.apache.lucene.index.MultiFields; // import of the package/class this method depends on
public void testCaching() throws IOException {
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  TokenStream stream = new TokenStream() {
    private int index = 0;
    private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    @Override
    public boolean incrementToken() {
      if (index == tokens.length) {
        return false;
      } else {
        clearAttributes();
        termAtt.append(tokens[index++]);
        offsetAtt.setOffset(0,0);
        return true;
      }
    }
  };

  stream = new CachingTokenFilter(stream);

  doc.add(new TextField("preanalyzed", stream));

  // 1) we consume all tokens twice before we add the doc to the index
  checkTokens(stream);
  stream.reset();
  checkTokens(stream);

  // 2) now add the document to the index and verify if all tokens are indexed
  //    don't reset the stream here, the DocumentWriter should do that implicitly
  writer.addDocument(doc);

  IndexReader reader = writer.getReader();
  DocsAndPositionsEnum termPositions = MultiFields.getTermPositionsEnum(reader,
      MultiFields.getLiveDocs(reader),
      "preanalyzed",
      new BytesRef("term1"));
  assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(1, termPositions.freq());
  assertEquals(0, termPositions.nextPosition());

  termPositions = MultiFields.getTermPositionsEnum(reader,
      MultiFields.getLiveDocs(reader),
      "preanalyzed",
      new BytesRef("term2"));
  assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(2, termPositions.freq());
  assertEquals(1, termPositions.nextPosition());
  assertEquals(3, termPositions.nextPosition());

  termPositions = MultiFields.getTermPositionsEnum(reader,
      MultiFields.getLiveDocs(reader),
      "preanalyzed",
      new BytesRef("term3"));
  assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(1, termPositions.freq());
  assertEquals(2, termPositions.nextPosition());

  reader.close();
  writer.close();

  // 3) reset stream and consume tokens again
  stream.reset();
  checkTokens(stream);

  dir.close();
}
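The tokens array consumed by the anonymous TokenStream is defined elsewhere in the test class; judging from the asserted frequencies and positions it evidently holds the sequence term1, term2, term3, term2 (so term2 appears at positions 1 and 3). The point of the test is that CachingTokenFilter replays exactly that cached token sequence both to checkTokens() and to the indexer, which is what the position checks via MultiFields.getTermPositionsEnum verify.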
Example 6: testWickedLongTerm
import org.apache.lucene.index.MultiFields; // import of the package/class this method depends on
/**
 * Make sure we skip wicked long terms.
 */
public void testWickedLongTerm() throws IOException {
  RAMDirectory dir = new RAMDirectory();
  IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
      TEST_VERSION_CURRENT, new ClassicAnalyzer(TEST_VERSION_CURRENT)));

  char[] chars = new char[IndexWriter.MAX_TERM_LENGTH];
  Arrays.fill(chars, 'x');
  Document doc = new Document();
  final String bigTerm = new String(chars);

  // This produces a too-long term:
  String contents = "abc xyz x" + bigTerm + " another term";
  doc.add(new TextField("content", contents, Field.Store.NO));
  writer.addDocument(doc);

  // Make sure we can add another normal document
  doc = new Document();
  doc.add(new TextField("content", "abc bbb ccc", Field.Store.NO));
  writer.addDocument(doc);
  writer.close();

  IndexReader reader = IndexReader.open(dir);

  // Make sure all terms < max size were indexed
  assertEquals(2, reader.docFreq(new Term("content", "abc")));
  assertEquals(1, reader.docFreq(new Term("content", "bbb")));
  assertEquals(1, reader.docFreq(new Term("content", "term")));
  assertEquals(1, reader.docFreq(new Term("content", "another")));

  // Make sure position is still incremented when
  // massive term is skipped:
  DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(reader,
      MultiFields.getLiveDocs(reader),
      "content",
      new BytesRef("another"));
  assertTrue(tps.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(1, tps.freq());
  assertEquals(3, tps.nextPosition());

  // Make sure the doc that has the massive term is in
  // the index:
  assertEquals("document with wicked long term should is not in the index!", 2, reader.numDocs());
  reader.close();

  // Make sure we can add a document with exactly the
  // maximum length term, and search on that term:
  doc = new Document();
  doc.add(new TextField("content", bigTerm, Field.Store.NO));
  ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
  sa.setMaxTokenLength(100000);
  writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, sa));
  writer.addDocument(doc);
  writer.close();

  reader = IndexReader.open(dir);
  assertEquals(1, reader.docFreq(new Term("content", bigTerm)));
  reader.close();

  dir.close();
}