当前位置: 首页>>代码示例>>Java>>正文


Java CorpusReader类代码示例

本文整理汇总了Java中org.lemurproject.galago.core.index.corpus.CorpusReader的典型用法代码示例。如果您正苦于以下问题:Java CorpusReader类的具体用法?Java CorpusReader怎么用?Java CorpusReader使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。


CorpusReader类属于org.lemurproject.galago.core.index.corpus包,在下文中一共展示了CorpusReader类的12个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: getDocument

import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入依赖的package包/类
/**
 * Fetches a single document by name from the "corpus" index part.
 *
 * @param document name of the document to look up
 * @param p components handed through to the corpus reader
 * @return the document; {@code null} when {@code parts} has no "corpus" key
 *         or when an IOException was caught and logged
 * @throws IllegalArgumentException when the "corpus" key is present but maps to null
 */
@Override
public Document getDocument(String document, DocumentComponents p) throws IOException {
  if (!parts.containsKey("corpus")) {
    return null;
  }

  Document found = null;
  try {
    CorpusReader reader = (CorpusReader) parts.get("corpus");
    if (reader == null) {
      throw new IllegalArgumentException("Attempted to pull a document from index without a corpus");
    }

    long id = getIdentifier(document);
    found = reader.getDocument(id, p);
  } catch (IOException e) {
    // Best effort: the failure is logged and the caller receives null.
    logger.log(Level.SEVERE, "IOException while pulling document: " + document, e);
  }
  return found;
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:22,代码来源:DiskIndex.java

示例2: updateIndex

import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入依赖的package包/类
/**
 * This function is called after each index flush
 *  and after each index merge operation.
 *
 * It ensures the set of retrievals are updated to reflect the flush/merge op
 *  and the collection statistics used for retrieval are maintained correctly.
 *
 * @throws IOException if a shard corpus cannot be opened or closed, or if
 *         creating or saving the checkpoint fails
 */
private void updateIndex() throws IOException {
  // maintain the document store (corpus) - if there is one
  if (currentMemoryIndex.containsPart("corpus")) {
    for (String path : geometricParts.getAllShards().getBinPaths()) {
      // Nothing in this method keeps the reader, so it is closed right away
      // instead of being collected into a list that was never consumed.
      // Opening it still surfaces an IOException for an unreadable shard corpus.
      CorpusReader shardCorpus = new CorpusReader(path + File.separator + "corpus");
      shardCorpus.close();
    }
  }
  // finally write new checkpointing data (checkpoints the disk indexes)
  Parameters checkpoint = createCheckpoint();
  this.checkpointer.saveCheckpoint(checkpoint);
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:24,代码来源:GeometricIndex.java

示例3: CorpusSelectiveSplitParser

import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入依赖的package包/类
/**
 * Opens the corpus named by the "corpus" parameter and loads the document
 * ids listed in the split file into a sorted array.
 *
 * @param split split whose {@code fileName} points at the id list (one id per line)
 * @param p parameters; must provide the "corpus" path
 * @throws NumberFormatException if an entry of the id list is not a valid long
 */
public CorpusSelectiveSplitParser(DocumentSplit split, Parameters p) throws FileNotFoundException, IOException {
  super(split, p);
  // check that the corpus is an actual corpus
  reader = new CorpusReader(p.getString("corpus"));

  // The id file must be a simple list of strings, one per line.
  Set<String> rawIds = Utility.readFileToStringSet(new File(split.fileName));
  long[] parsedIds = new long[rawIds.size()];
  int next = 0;
  for (String raw : rawIds) {
    parsedIds[next++] = Long.parseLong(raw);
  }

  // Keep the ids in increasing order.
  Arrays.sort(parsedIds);
  docIds = parsedIds;

  idx = 0;
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:22,代码来源:CorpusSelectiveSplitParser.java

示例4: run

import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入依赖的package包/类
/**
 * Dumps every document of the corpus at parameter "path" to {@code output}:
 * identifier, name, metadata key/value pairs and text.
 *
 * The reader is closed on every exit path, including the empty-corpus
 * early return and any exception raised while iterating.
 *
 * @param p parameters; "path" names the corpus to open
 * @param output stream that receives the dump
 * @throws Exception if the corpus cannot be opened or read
 */
@Override
public void run(Parameters p, PrintStream output) throws Exception {
  CorpusReader reader = new CorpusReader(p.getString("path"));
  try {
    if (reader.getManifest().get("emptyIndexFile", false)) {
      output.println("Empty Corpus.");
      return;
    }

    DocumentReader.DocumentIterator iterator = reader.getIterator();
    DocumentComponents dc = new DocumentComponents(p);

    while (!iterator.isDone()) {
      output.println("#IDENTIFIER: " + iterator.getKeyString());
      Document document = iterator.getDocument(dc);
      output.println("#NAME: " + document.name);
      output.println("#METADATA");
      for (Map.Entry<String, String> entry : document.metadata.entrySet()) {
        output.println(entry.getKey() + "," + entry.getValue());
      }
      output.println("#TEXT");
      output.println(document.text);
      iterator.nextKey();
    }
  } finally {
    reader.close();
  }
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:26,代码来源:DumpCorpusFn.java

示例5: getDocuments

import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入依赖的package包/类
/**
 * Fetches several documents by name from the "corpus" index part.
 *
 * Names are translated to identifiers and sorted before the corpus
 * iterator is asked to find each one in turn. Documents whose key is not
 * found, or whose retrieval raises an IOException (which is logged), are
 * left out of the result.
 *
 * @param documents names of the documents to fetch
 * @param p components handed through to the corpus iterator
 * @return map from document name to document for every document retrieved
 * @throws IllegalArgumentException when the "corpus" part is null
 */
@Override
public Map<String, Document> getDocuments(List<String> documents, DocumentComponents p) throws IOException {
  HashMap<String, Document> found = new HashMap<>();

  // Translate names to identifiers, then visit them in sorted order.
  ArrayList<Long> sortedIds = new ArrayList<>();
  for (String docName : documents) {
    sortedIds.add(getIdentifier(docName));
  }
  Collections.sort(sortedIds);

  CorpusReader corpus = (CorpusReader) parts.get("corpus");
  if (corpus == null) {
    throw new IllegalArgumentException("Attempted to pull documents from index without a corpus");
  }

  CorpusReader.KeyIterator cursor = corpus.getIterator();
  for (long id : sortedIds) {
    if (!cursor.findKey(Utility.fromLong(id))) {
      continue;
    }
    try {
      Document doc = cursor.getDocument(p);
      if (doc != null) {
        found.put(doc.name, doc);
      }
    } catch (IOException e) {
      // Best effort: log the failure and carry on with the remaining ids.
      Logger.getLogger(this.getClass().getName()).log(Level.SEVERE,
              "Failed to get document: {0}\n{1}",
              new Object[]{id, e.toString()});
    }
  }

  return found;
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:37,代码来源:DiskIndex.java

示例6: CorpusSplitParser

import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入依赖的package包/类
/**
 * Opens the corpus file named by the split and positions its iterator at
 * the split's start key when one is given.
 *
 * @param split split describing the corpus file and optional start key
 * @param p parameters forwarded to the superclass
 * @throws IOException if the corpus cannot be opened or the skip fails
 */
public CorpusSplitParser(DocumentSplit split, Parameters p) throws IOException {
  super(split, p);
  this.split = split;
  this.extractionParameters = new DocumentComponents(true, true, false);

  this.reader = new CorpusReader(split.fileName);
  this.iterator = (DocumentIterator) this.reader.getIterator();
  if (split.startKey != null) {
    this.iterator.skipToKey(split.startKey);
  }
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:12,代码来源:CorpusSplitParser.java

示例7: buildIndex

import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入依赖的package包/类
/**
 * Writes a one-document corpus into a fresh temporary directory by wiring
 * a CorpusFolderWriter to a Sorter and a SplitBTreeKeyWriter.
 */
@Test
public void buildIndex() throws IOException, IncompatibleProcessorException {
  // createTemporary() yields a path that is deleted and re-created as a directory.
  File corpusDir = FileUtility.createTemporary();
  assertTrue(corpusDir.delete());
  assertTrue(corpusDir.mkdirs());
  temporaryName = corpusDir.getAbsolutePath();

  // The single document that gets encoded into the corpus.
  document = new Document();
  document.identifier = 10;
  document.name = "doc-identifier";
  document.text = "This is the text part.";
  document.metadata.put("Key", "Value");
  document.metadata.put("Something", "Else");

  Parameters writerParams = Parameters.create();
  writerParams.set("readerClass", CorpusReader.class.getName());
  writerParams.set("writerClass", CorpusFolderWriter.class.getName());
  writerParams.set("filename", corpusDir.getAbsolutePath());

  // Pipeline: value writer -> sorter -> key writer.
  CorpusFolderWriter valueStage = new CorpusFolderWriter(new FakeParameters(writerParams.clone()));
  Sorter sortStage = new Sorter(new KeyValuePair.KeyOrder());
  SplitBTreeKeyWriter keyStage = new SplitBTreeKeyWriter(new FakeParameters(writerParams.clone()));
  valueStage.setProcessor(sortStage);
  sortStage.setProcessor(keyStage);

  valueStage.process(document);
  valueStage.close();
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:31,代码来源:IndexReaderSplitParserTest.java

示例8: testBuildIndexSpecific

import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入依赖的package包/类
/**
 * Builds an index with WebDocumentSerializer configured as the corpus
 * serializer, then checks the corpus manifest and the first document.
 *
 * The corpus reader is closed before the temporary directory is deleted,
 * so no open reader is left behind on files that are about to be removed.
 */
@Test
public void testBuildIndexSpecific() throws Exception {
    File tmpDir = FileUtility.createTemporaryDirectory();
    try {
        File inputTxt = new File(tmpDir, "input.txt");
        File testIndex = new File(tmpDir, "test.galago");
        StreamUtil.copyStringToFile("this is a document of some kind", inputTxt);
        BuildIndex.execute(
                Parameters.parseArray(
                        "inputPath", inputTxt,
                        "indexPath", testIndex,
                        "corpusParameters", Parameters.parseArray(
                                "documentSerializerClass", WebDocumentSerializer.class.getName())),
                System.out);

        CorpusReader reader = new CorpusReader(new File(testIndex, "corpus").getAbsolutePath());
        try {
            assertEquals(WebDocumentSerializer.class.getName(), reader.getManifest().getString("documentSerializerClass"));
            System.out.println(reader.serializer.getClass());
            Document document = reader.getIterator().getDocument(Document.DocumentComponents.JustTerms);
            assertNotNull(document);
            assertNotNull(document.text);
            assertNotNull(document.terms);
            assertEquals(7, document.terms.size());
            assertEquals("this", document.terms.get(0));
        } finally {
            // Close even when an assertion fails, before the directory is removed.
            reader.close();
        }
    } finally {
        FSUtil.deleteDirectory(tmpDir);
    }
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:29,代码来源:WebDocumentSerializerTest.java

示例9: serializerClass

import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入依赖的package包/类
/**
 * Builds an index with TokenizedDocumentSerializer configured as the corpus
 * serializer, then checks the corpus manifest and the first document
 * (terms present, text absent).
 *
 * The corpus reader is closed before the temporary directory is deleted,
 * so no open reader is left behind on files that are about to be removed.
 */
@Test
public void serializerClass() throws Exception {
  File tmpDir = FileUtility.createTemporaryDirectory();
  try {
    File inputTxt = new File(tmpDir, "input.txt");
    File testIndex = new File(tmpDir, "test.galago");
    StreamUtil.copyStringToFile("this is a document of some kind", inputTxt);
    BuildIndex.execute(
        Parameters.parseArray(
            "inputPath", inputTxt,
            "indexPath", testIndex,
            "corpusParameters", Parameters.parseArray(
                "documentSerializerClass", TokenizedDocumentSerializer.class.getName())),
        System.out);

    CorpusReader reader = new CorpusReader(new File(testIndex, "corpus").getAbsolutePath());
    try {
      assertEquals(TokenizedDocumentSerializer.class.getName(), reader.getManifest().getString("documentSerializerClass"));
      System.out.println(reader.serializer.getClass());
      Document document = reader.getIterator().getDocument(Document.DocumentComponents.JustTerms);
      assertNotNull(document);
      assertNull(document.text);
      assertNotNull(document.terms);
      assertEquals(7, document.terms.size());
      assertEquals("this", document.terms.get(0));
    } finally {
      // Close even when an assertion fails, before the directory is removed.
      reader.close();
    }
  } finally {
    FSUtil.deleteDirectory(tmpDir);
  }
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:29,代码来源:TokenizedDocumentSerializerTest.java

示例10: run

import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入依赖的package包/类
/**
 * Walks the corpus of the index at parameter "index" and writes a
 * document-name to publication-year entry into the disk map at parameter
 * "output" for every document whose "date" metadata yields a year.
 *
 * Documents with a missing or blank date, or a date from which no year can
 * be extracted (for example values containing "?", "--", "n.d" or "s.d"),
 * are skipped.
 *
 * @param argp parameters providing "index" and "output"
 */
@Override
public void run(Parameters argp) throws Exception {
  File input = new File(argp.getString("index"));
  DiskIndex index = new DiskIndex(input.getAbsolutePath());
  File output = new File(argp.getString("output"));
  DiskMapBuilder dmb = new DiskMapBuilder(output.getAbsolutePath());

  CorpusReader corpusReader = (CorpusReader) index.getIndexPart("corpus");

  CorpusReader.KeyIterator cursor = corpusReader.getIterator();
  while (!cursor.isDone()) {
    Document doc = cursor.getDocument(Document.DocumentComponents.JustMetadata);
    String date = doc.metadata.get("date");

    if (date != null && !date.trim().isEmpty()) {
      Integer year = DateRecognition.tryExtractMetadataYear(date);
      // A null year means the date could not be interpreted; skip the document.
      if (year != null) {
        dmb.put(ByteUtil.fromString(doc.name), Utility.fromInt(year));
      }
    }
    cursor.nextKey();
  }
  dmb.close();
  System.out.println("## DONE");
}
 
开发者ID:jjfiv,项目名称:ecir2015timebooks,代码行数:33,代码来源:CollectPubDates.java

示例11: run

import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入依赖的package包/类
/**
 * Walks the corpus of the index at parameter "index" and, for each document
 * whose text contains a DATE tag, extracts the year of the first timex found
 * in that tag's contents. Entries (document name to year) are written to the
 * disk map at parameter "output" when the year matches the configured
 * dataset ("dataset" parameter, default "robust04").
 *
 * @param argp parameters providing "index", "output" and optionally "dataset"
 */
@Override
public void run(Parameters argp) throws Exception {
  File input = new File(argp.getString("index"));
  DiskIndex index = new DiskIndex(input.getAbsolutePath());
  CorpusReader corpusReader = (CorpusReader) index.getIndexPart("corpus");

  StanfordCoreNLP nlp = NLP.instance(argp);

  File output = new File(argp.getString("output"));
  DiskMapBuilder dmb = new DiskMapBuilder(output.getAbsolutePath());

  for (CorpusReader.KeyIterator cursor = corpusReader.getIterator(); !cursor.isDone(); cursor.nextKey()) {
    Document doc = cursor.getDocument(Document.DocumentComponents.JustText);
    String name = doc.name;
    String body = doc.text;

    // Only documents carrying a DATE tag are considered.
    if (!body.contains("<DATE>")) {
      continue;
    }

    String innerDate = DateUtil.fixRobustDates(SGML.getTagContents(body, "DATE"));
    List<ExtractTimexSentences.SentenceInfo> sentences =
        ExtractTimexSentences.extractFromSinglePage(nlp, innerDate);
    if (sentences.isEmpty()) {
      continue;
    }

    // The first timex found is taken as the publication date.
    Integer year = DateRecognition.getYear(sentences.get(0).timexValue);
    if (year == null) {
      continue;
    }
    if (!DataSet.yearMatches(year, argp.get("dataset","robust04"))) {
      continue;
    }

    dmb.put(ByteUtil.fromString(name), Utility.fromInt(year));
  }
  dmb.close();
  System.out.println("## DONE");
}
 
开发者ID:jjfiv,项目名称:ecir2015timebooks,代码行数:36,代码来源:RobustCollectPubDates.java

示例12: documentIterable

import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入依赖的package包/类
/**
 * Exposes the documents of an index's "corpus" part as an {@link Iterable}.
 *
 * @param index index whose "corpus" part is read
 * @param opts components passed to the corpus iterator's source
 * @return iterable produced by {@code asIterable} over the corpus source
 * @throws IOException if the corpus iterator or its source cannot be obtained
 */
public static Iterable<Document> documentIterable(DiskIndex index, Document.DocumentComponents opts) throws IOException {
  CorpusReader corpusPart = (CorpusReader) index.getIndexPart("corpus");
  CorpusReader.KeyIterator cursor = corpusPart.getIterator();
  CorpusReaderSource documents = cursor.getSource(opts);
  return asIterable(documents);
}
 
开发者ID:jjfiv,项目名称:ecir2015timebooks,代码行数:7,代码来源:GalagoUtil.java


注:本文中的org.lemurproject.galago.core.index.corpus.CorpusReader类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。