

Java MultiFields.getTermPositionsEnum Method Code Examples

This article collects typical usage examples of the Java method org.apache.lucene.index.MultiFields.getTermPositionsEnum. If you are wondering what MultiFields.getTermPositionsEnum does, how to call it, or what real-world uses look like, the curated examples below should help. You can also explore further usage examples of the containing class, org.apache.lucene.index.MultiFields.


Six code examples of MultiFields.getTermPositionsEnum are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code samples.
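Before the collected examples, here is a minimal self-contained sketch of the typical call pattern. It is an illustrative assumption rather than code from the projects below: it uses the three-argument overload MultiFields.getTermPositionsEnum(IndexReader, String, BytesRef) that returns a PostingsEnum, as seen in Examples 1 and 3, and the index path, field name, and term ("/path/to/index", "content", "lucene") are hypothetical placeholders.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class TermPositionsDemo {
  public static void main(String[] args) throws IOException {
    // "/path/to/index", "content" and "lucene" are hypothetical placeholders.
    try (Directory dir = FSDirectory.open(Paths.get("/path/to/index"));
         IndexReader reader = DirectoryReader.open(dir)) {
      // Obtain a positions-capable PostingsEnum spanning all segments of the reader.
      PostingsEnum positions = MultiFields.getTermPositionsEnum(
          reader, "content", new BytesRef("lucene"));
      if (positions == null) {
        return; // the field or term does not exist in the index
      }
      // Walk every matching document and print each position of the term.
      while (positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        int freq = positions.freq();
        System.out.printf("doc=%d freq=%d positions:", positions.docID(), freq);
        for (int i = 0; i < freq; i++) {
          System.out.printf(" %d", positions.nextPosition());
        }
        System.out.println();
      }
    }
  }
}

Note that Examples 2, 4, 5, and 6 below use the older overload from earlier Lucene versions, which additionally takes the reader's live-docs Bits (and optionally a flags argument such as DocsAndPositionsEnum.FLAG_PAYLOADS) and returns a DocsAndPositionsEnum; the three-argument form above replaced it when DocsAndPositionsEnum was merged into PostingsEnum.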

Example 1: getFeatures

import org.apache.lucene.index.MultiFields; // import the package/class this method depends on
static double[] getFeatures(IndexReader ir, String fieldName, BytesRef rawPhrase, int docId, int docSize, int numDocs, boolean inc)
    throws IOException {
  PostingsEnum de = MultiFields.getTermDocsEnum(ir, fieldName, rawPhrase);
  int ret = de.advance(docId);
  if(ret == PostingsEnum.NO_MORE_DOCS){
    throw new RuntimeException("no more docs...");
  }
  else{
    int freq = de.freq();
    if(freq < 2) return null;
    
    PostingsEnum pe = MultiFields.getTermPositionsEnum(ir, fieldName, rawPhrase);
    int ret2 = pe.advance(docId);
    if(ret2 == PostingsEnum.NO_MORE_DOCS){
      throw new RuntimeException("no more docs...");
    }
    else{
      double[] features = new double[2];
      int pos = pe.nextPosition();
      int docFreq = ir.docFreq(new Term(fieldName, rawPhrase));
      if(inc){
        docFreq++;
        numDocs++;
      }
      features[0] = Commons.calcTfIdf(freq, docSize, docFreq, numDocs);
      features[1] = Commons.calcFirstOccurrence(pos, docSize);
      
      return features;
    }
  }
}
 
Developer ID: kojisekig, Project: KEA-lucene, Lines of code: 32, Source: KeyphraseExtractor2.java

Example 2: initParents

import org.apache.lucene.index.MultiFields; // import the package/class this method depends on
private void initParents(IndexReader reader, int first) throws IOException {
  if (reader.maxDoc() == first) {
    return;
  }
  
  // it's ok to use MultiFields because we only iterate on one posting list.
  // breaking it to loop over the leaves() only complicates code for no
  // apparent gain.
  DocsAndPositionsEnum positions = MultiFields.getTermPositionsEnum(reader, null,
      Consts.FIELD_PAYLOADS, Consts.PAYLOAD_PARENT_BYTES_REF,
      DocsAndPositionsEnum.FLAG_PAYLOADS);

  // shouldn't really happen, if it does, something's wrong
  if (positions == null || positions.advance(first) == DocIdSetIterator.NO_MORE_DOCS) {
    throw new CorruptIndexException("Missing parent data for category " + first);
  }
  
  int num = reader.maxDoc();
  for (int i = first; i < num; i++) {
    if (positions.docID() == i) {
      if (positions.freq() == 0) { // shouldn't happen
        throw new CorruptIndexException("Missing parent data for category " + i);
      }
      
      parents[i] = positions.nextPosition();
      
      if (positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
        if (i + 1 < num) {
          throw new CorruptIndexException("Missing parent data for category "+ (i + 1));
        }
        break;
      }
    } else { // this shouldn't happen
      throw new CorruptIndexException("Missing parent data for category " + i);
    }
  }
}
 
Developer ID: europeana, Project: search, Lines of code: 38, Source: TaxonomyIndexArrays.java

Example 3: buildModel

import org.apache.lucene.index.MultiFields; // import the package/class this method depends on
static KEAModel buildModel(Map<String, Set<String>> knownKeyphrases) throws IOException {
  
  Directory indexDir = Commons.getLuceneDirectory(LUCENE_INDEX_DIR);
  IndexReader ir = DirectoryReader.open(indexDir);
  KEAModel model = new KEAModel(ir, knownKeyphrases);

  try{
    for(int n = 1; n <= 3; n++){
      System.out.printf("%s : building %d-gram model\n", new Date().toString(), n);
      String fieldName = Commons.getFieldName(FIELD_NAME, n);
      Terms terms = MultiFields.getTerms(ir, fieldName);
      TermsEnum te = terms.iterator();
      for(BytesRef rawPhrase = te.next(); rawPhrase != null; rawPhrase = te.next()){
        String phrase = rawPhrase.utf8ToString();
        // use KEAStopFilter instead
        //if(stopWords(phrase, n)) continue;

        //System.out.printf("%s ", phrase);
        PostingsEnum de = MultiFields.getTermDocsEnum(ir, fieldName, rawPhrase);
        while(de.nextDoc() != PostingsEnum.NO_MORE_DOCS){
          int docId = de.docID();
          int freq = de.freq();
          // Consider only terms that occur more than once in the document.
          // The KEA paper says: "To reduce the size of the training set, we discard any phrase that occurs only once in the document."
          if(freq > 1){
            PostingsEnum pe = MultiFields.getTermPositionsEnum(ir, fieldName, rawPhrase);
            int ret = pe.advance(docId);
            if(ret == PostingsEnum.NO_MORE_DOCS){
              System.out.printf("(NO_MORE_DOCS) %d\n", docId);
            }
            else{
              // get first position of the term in the doc (first occurrence)
              int pos = pe.nextPosition();
              model.add(docId, fieldName, phrase, freq, pos);
            }
          }
        }
      }
    }
  }
  finally{
    IOUtils.closeWhileHandlingException(ir);
  }
  
  return model;
}
 
Developer ID: kojisekig, Project: KEA-lucene, Lines of code: 47, Source: KEAModelBuilder.java

Example 4: testWickedLongTerm

import org.apache.lucene.index.MultiFields; // import the package/class this method depends on
/**
 * Make sure we skip wicked long terms.
 */
public void testWickedLongTerm() throws IOException {
  RAMDirectory dir = new RAMDirectory();
  IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new ClassicAnalyzer()));

  char[] chars = new char[IndexWriter.MAX_TERM_LENGTH];
  Arrays.fill(chars, 'x');
  Document doc = new Document();
  final String bigTerm = new String(chars);

  // This produces a too-long term:
  String contents = "abc xyz x" + bigTerm + " another term";
  doc.add(new TextField("content", contents, Field.Store.NO));
  writer.addDocument(doc);

  // Make sure we can add another normal document
  doc = new Document();
  doc.add(new TextField("content", "abc bbb ccc", Field.Store.NO));
  writer.addDocument(doc);
  writer.close();

  IndexReader reader = IndexReader.open(dir);

  // Make sure all terms < max size were indexed
  assertEquals(2, reader.docFreq(new Term("content", "abc")));
  assertEquals(1, reader.docFreq(new Term("content", "bbb")));
  assertEquals(1, reader.docFreq(new Term("content", "term")));
  assertEquals(1, reader.docFreq(new Term("content", "another")));

  // Make sure position is still incremented when
  // massive term is skipped:
  DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(reader,
                                                              MultiFields.getLiveDocs(reader),
                                                              "content",
                                                              new BytesRef("another"));
  assertTrue(tps.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(1, tps.freq());
  assertEquals(3, tps.nextPosition());

  // Make sure the doc that has the massive term is in
  // the index:
  assertEquals("document with wicked long term should is not in the index!", 2, reader.numDocs());

  reader.close();

  // Make sure we can add a document with exactly the
  // maximum length term, and search on that term:
  doc = new Document();
  doc.add(new TextField("content", bigTerm, Field.Store.NO));
  ClassicAnalyzer sa = new ClassicAnalyzer();
  sa.setMaxTokenLength(100000);
  writer  = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, sa));
  writer.addDocument(doc);
  writer.close();
  reader = IndexReader.open(dir);
  assertEquals(1, reader.docFreq(new Term("content", bigTerm)));
  reader.close();

  dir.close();
}
 
Developer ID: europeana, Project: search, Lines of code: 63, Source: TestClassicAnalyzer.java

Example 5: testCaching

import org.apache.lucene.index.MultiFields; // import the package/class this method depends on
public void testCaching() throws IOException {
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  TokenStream stream = new TokenStream() {
    private int index = 0;
    private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    
    @Override
    public boolean incrementToken() {
      if (index == tokens.length) {
        return false;
      } else {
        clearAttributes();
        termAtt.append(tokens[index++]);
        offsetAtt.setOffset(0,0);
        return true;
      }        
    }
    
  };
  
  stream = new CachingTokenFilter(stream);
  
  doc.add(new TextField("preanalyzed", stream));
  
  // 1) we consume all tokens twice before we add the doc to the index
  checkTokens(stream);
  stream.reset();  
  checkTokens(stream);
  
  // 2) now add the document to the index and verify if all tokens are indexed
  //    don't reset the stream here, the DocumentWriter should do that implicitly
  writer.addDocument(doc);
  
  IndexReader reader = writer.getReader();
  DocsAndPositionsEnum termPositions = MultiFields.getTermPositionsEnum(reader,
                                                                        MultiFields.getLiveDocs(reader),
                                                                        "preanalyzed",
                                                                        new BytesRef("term1"));
  assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(1, termPositions.freq());
  assertEquals(0, termPositions.nextPosition());

  termPositions = MultiFields.getTermPositionsEnum(reader,
                                                   MultiFields.getLiveDocs(reader),
                                                   "preanalyzed",
                                                   new BytesRef("term2"));
  assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(2, termPositions.freq());
  assertEquals(1, termPositions.nextPosition());
  assertEquals(3, termPositions.nextPosition());
  
  termPositions = MultiFields.getTermPositionsEnum(reader,
                                                   MultiFields.getLiveDocs(reader),
                                                   "preanalyzed",
                                                   new BytesRef("term3"));
  assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(1, termPositions.freq());
  assertEquals(2, termPositions.nextPosition());
  reader.close();
  writer.close();
  // 3) reset stream and consume tokens again
  stream.reset();
  checkTokens(stream);
  dir.close();
}
 
Developer ID: europeana, Project: search, Lines of code: 69, Source: TestCachingTokenFilter.java

Example 6: testWickedLongTerm

import org.apache.lucene.index.MultiFields; // import the package/class this method depends on
/**
 * Make sure we skip wicked long terms.
 */
public void testWickedLongTerm() throws IOException {
  RAMDirectory dir = new RAMDirectory();
  IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
    TEST_VERSION_CURRENT, new ClassicAnalyzer(TEST_VERSION_CURRENT)));

  char[] chars = new char[IndexWriter.MAX_TERM_LENGTH];
  Arrays.fill(chars, 'x');
  Document doc = new Document();
  final String bigTerm = new String(chars);

  // This produces a too-long term:
  String contents = "abc xyz x" + bigTerm + " another term";
  doc.add(new TextField("content", contents, Field.Store.NO));
  writer.addDocument(doc);

  // Make sure we can add another normal document
  doc = new Document();
  doc.add(new TextField("content", "abc bbb ccc", Field.Store.NO));
  writer.addDocument(doc);
  writer.close();

  IndexReader reader = IndexReader.open(dir);

  // Make sure all terms < max size were indexed
  assertEquals(2, reader.docFreq(new Term("content", "abc")));
  assertEquals(1, reader.docFreq(new Term("content", "bbb")));
  assertEquals(1, reader.docFreq(new Term("content", "term")));
  assertEquals(1, reader.docFreq(new Term("content", "another")));

  // Make sure position is still incremented when
  // massive term is skipped:
  DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(reader,
                                                              MultiFields.getLiveDocs(reader),
                                                              "content",
                                                              new BytesRef("another"));
  assertTrue(tps.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(1, tps.freq());
  assertEquals(3, tps.nextPosition());

  // Make sure the doc that has the massive term is in
  // the index:
  assertEquals("document with wicked long term should is not in the index!", 2, reader.numDocs());

  reader.close();

  // Make sure we can add a document with exactly the
  // maximum length term, and search on that term:
  doc = new Document();
  doc.add(new TextField("content", bigTerm, Field.Store.NO));
  ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
  sa.setMaxTokenLength(100000);
  writer  = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, sa));
  writer.addDocument(doc);
  writer.close();
  reader = IndexReader.open(dir);
  assertEquals(1, reader.docFreq(new Term("content", bigTerm)));
  reader.close();

  dir.close();
}
 
Developer ID: pkarmstr, Project: NYBC, Lines of code: 64, Source: TestClassicAnalyzer.java


Note: The org.apache.lucene.index.MultiFields.getTermPositionsEnum method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by their respective authors, and copyright remains with the original authors. For distribution and use, please refer to the license of the corresponding project; do not reproduce without permission.