Java MultiFields.getTermDocsEnum方法代码示例

本文整理汇总了Java中org.apache.lucene.index.MultiFields.getTermDocsEnum方法的典型用法代码示例。如果您正苦于以下问题：Java MultiFields.getTermDocsEnum方法的具体用法？Java MultiFields.getTermDocsEnum怎么用？Java MultiFields.getTermDocsEnum使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.lucene.index.MultiFields的用法示例。


示例1: getFeatures

import org.apache.lucene.index.MultiFields; //导入方法依赖的package包/类
static double[] getFeatures(IndexReader ir, String fieldName, BytesRef rawPhrase, int docId, int docSize, int numDocs, boolean inc)
    throws IOException {
  PostingsEnum de = MultiFields.getTermDocsEnum(ir, fieldName, rawPhrase);
  int ret = de.advance(docId);
  if(ret == PostingsEnum.NO_MORE_DOCS){
    throw new RuntimeException("no more docs...");
    int freq = de.freq();
    if(freq < 2) return null;
    PostingsEnum pe = MultiFields.getTermPositionsEnum(ir, fieldName, rawPhrase);
    int ret2 = pe.advance(docId);
    if(ret2 == PostingsEnum.NO_MORE_DOCS){
      throw new RuntimeException("no more docs...");
      double[] features = new double[2];
      int pos = pe.nextPosition();
      int docFreq = ir.docFreq(new Term(fieldName, rawPhrase));
      features[0] = Commons.calcTfIdf(freq, docSize, docFreq, numDocs);
      features[1] = Commons.calcFirstOccurrence(pos, docSize);
      return features;

示例2: getDoc

import org.apache.lucene.index.MultiFields; //导入方法依赖的package包/类
private Document getDoc(String s, IndexReader reader) throws IOException {
    //TODO: normalize s?
    BytesRef bytesRef = new BytesRef(s);

    PostingsEnum docsEnum = MultiFields.getTermDocsEnum(reader,
            SyntacticSynsConfig.getSynsTargetFieldName(), bytesRef);
    if (docsEnum == null) {
        //couldn't find search term
        return null;

    int i = 0;
    int tmpDocID = docsEnum.nextDoc();
    int docID = -1;
    while (tmpDocID != PostingsEnum.NO_MORE_DOCS) {
        docID = tmpDocID;
        tmpDocID = docsEnum.nextDoc();
    if (i > 1) {
        //TODO: log or do something "there should only be one key term!"
    if (docID > -1) {
        return reader.document(docID);
    return null;

示例3: buildModel

import org.apache.lucene.index.MultiFields; //导入方法依赖的package包/类
static KEAModel buildModel(Map<String, Set<String>> knownKeyphrases) throws IOException {
  Directory indexDir = Commons.getLuceneDirectory(LUCENE_INDEX_DIR);
  IndexReader ir = DirectoryReader.open(indexDir);
  KEAModel model = new KEAModel(ir, knownKeyphrases);

    for(int n = 1; n <= 3; n++){
      System.out.printf("%s : building %d-gram model\n", new Date().toString(), n);
      String fieldName = Commons.getFieldName(FIELD_NAME, n);
      Terms terms = MultiFields.getTerms(ir, fieldName);
      TermsEnum te = terms.iterator();
      for(BytesRef rawPhrase = te.next(); rawPhrase != null; rawPhrase = te.next()){
        String phrase = rawPhrase.utf8ToString();
        // use KEAStopFilter instead
        //if(stopWords(phrase, n)) continue;

        //System.out.printf("%s ", phrase);
        PostingsEnum de = MultiFields.getTermDocsEnum(ir, fieldName, rawPhrase);
        while(de.nextDoc() != PostingsEnum.NO_MORE_DOCS){
          int docId = de.docID();
          int freq = de.freq();
          // Let's consider only terms that occurs more than one time in the document
          // KEA papers said "To reduce the size of the training set, we discard any phrase that occurs only once in the document."
          if(freq > 1){
            PostingsEnum pe = MultiFields.getTermPositionsEnum(ir, fieldName, rawPhrase);
            int ret = pe.advance(docId);
            if(ret == PostingsEnum.NO_MORE_DOCS){
              System.out.printf("(NO_MORE_DOCS) %d\n", docId);
              // get first position of the term in the doc (first occurrence)
              int pos = pe.nextPosition();
              model.add(docId, fieldName, phrase, freq, pos);
  return model;

示例4: getOrdinal

import org.apache.lucene.index.MultiFields; //导入方法依赖的package包/类
public int getOrdinal(FacetLabel cp) throws IOException {
  if (cp.length == 0) {
    return ROOT_ORDINAL;

  // First try to find the answer in the LRU cache:
  synchronized (ordinalCache) {
    Integer res = ordinalCache.get(cp);
    if (res != null) {
      if (res.intValue() < indexReader.maxDoc()) {
        // Since the cache is shared with DTR instances allocated from
        // doOpenIfChanged, we need to ensure that the ordinal is one that
        // this DTR instance recognizes.
        return res.intValue();
      } else {
        // if we get here, it means that the category was found in the cache,
        // but is not recognized by this TR instance. Therefore there's no
        // need to continue search for the path on disk, because we won't find
        // it there too.
        return TaxonomyReader.INVALID_ORDINAL;

  // If we're still here, we have a cache miss. We need to fetch the
  // value from disk, and then also put it in the cache:
  int ret = TaxonomyReader.INVALID_ORDINAL;
  DocsEnum docs = MultiFields.getTermDocsEnum(indexReader, null, Consts.FULL, new BytesRef(FacetsConfig.pathToString(cp.components, cp.length)), 0);
  if (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
    ret = docs.docID();
    // we only store the fact that a category exists, not its inexistence.
    // This is required because the caches are shared with new DTR instances
    // that are allocated from doOpenIfChanged. Therefore, if we only store
    // information about found categories, we cannot accidently tell a new
    // generation of DTR that a category does not exist.
    synchronized (ordinalCache) {
      ordinalCache.put(cp, Integer.valueOf(ret));

  return ret;

示例5: getOrdinal

import org.apache.lucene.index.MultiFields; //导入方法依赖的package包/类
public int getOrdinal(CategoryPath cp) throws IOException {
  if (cp.length == 0) {
    return ROOT_ORDINAL;

  // First try to find the answer in the LRU cache:
  synchronized (ordinalCache) {
    Integer res = ordinalCache.get(cp);
    if (res != null) {
      if (res.intValue() < indexReader.maxDoc()) {
        // Since the cache is shared with DTR instances allocated from
        // doOpenIfChanged, we need to ensure that the ordinal is one that
        // this DTR instance recognizes.
        return res.intValue();
      } else {
        // if we get here, it means that the category was found in the cache,
        // but is not recognized by this TR instance. Therefore there's no
        // need to continue search for the path on disk, because we won't find
        // it there too.
        return TaxonomyReader.INVALID_ORDINAL;

  // If we're still here, we have a cache miss. We need to fetch the
  // value from disk, and then also put it in the cache:
  int ret = TaxonomyReader.INVALID_ORDINAL;
  DocsEnum docs = MultiFields.getTermDocsEnum(indexReader, null, Consts.FULL, new BytesRef(cp.toString(delimiter)), 0);
  if (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
    ret = docs.docID();
    // we only store the fact that a category exists, not its inexistence.
    // This is required because the caches are shared with new DTR instances
    // that are allocated from doOpenIfChanged. Therefore, if we only store
    // information about found categories, we cannot accidently tell a new
    // generation of DTR that a category does not exist.
    synchronized (ordinalCache) {
      ordinalCache.put(cp, Integer.valueOf(ret));

  return ret;
