本文整理汇总了Java中org.lemurproject.galago.core.index.corpus.CorpusReader类的典型用法代码示例。如果您正苦于以下问题：Java CorpusReader类的具体用法？Java CorpusReader怎么用？Java CorpusReader使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。
CorpusReader类属于org.lemurproject.galago.core.index.corpus包，在下文中一共展示了CorpusReader类的12个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: getDocument
import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入依赖的package包/类
/**
 * Fetches one document, addressed by name, from the "corpus" part of this index.
 *
 * @param document name of the document; translated to a numeric id with {@code getIdentifier}
 * @param p component selection handed unchanged to the corpus reader
 * @return the document produced by the corpus reader, or {@code null} when no "corpus" key is
 *     registered in {@code parts} or when an {@link IOException} occurs during the lookup
 * @throws IllegalArgumentException if the "corpus" key is registered but maps to {@code null}
 * @throws IOException declared by the signature; IOExceptions raised inside the body are logged
 *     at SEVERE level and not rethrown
 */
@Override
public Document getDocument(String document, DocumentComponents p) throws IOException {
  // Without a registered corpus part there is nothing to read from.
  if (!parts.containsKey("corpus")) {
    return null;
  }

  Document found = null;
  try {
    CorpusReader corpus = (CorpusReader) parts.get("corpus");
    if (corpus == null) {
      // Not an IOException, so this escapes the catch below and reaches the caller.
      throw new IllegalArgumentException("Attempted to pull a document from index without a corpus");
    }
    long docId = getIdentifier(document);
    found = corpus.getDocument(docId, p);
  } catch (IOException e) {
    // Best effort: record the failure (with stack trace) and fall through to return null.
    logger.log(Level.SEVERE, "IOException while pulling document: " + document, e);
  }
  return found;
}
示例2: updateIndex
import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入依赖的package包/类
/**
 * This function is called after each index flush
 * and after each index merge operation.
 *
 * When the in-memory index has a "corpus" part, the corpus of every on-disk shard is opened
 * once and closed again straight away; afterwards a fresh checkpoint is created and saved.
 *
 * NOTE(review): earlier revisions collected these readers (plus the in-memory corpus part) in a
 * local list that was never handed to anything, so every call leaked one open reader per shard.
 * If a document store is supposed to receive these readers, wire it in here and transfer
 * ownership instead of closing them.
 *
 * @throws IOException if a shard corpus cannot be opened or closed, or if creating/saving the
 *     checkpoint fails
 */
private void updateIndex() throws IOException {
  // maintain the document store (corpus) - if there is one
  if (currentMemoryIndex.containsPart("corpus")) {
    for (String path : geometricParts.getAllShards().getBinPaths()) {
      // Opening still surfaces an unreadable shard corpus as an IOException, exactly as before;
      // the reader is released immediately because nothing in this method keeps it.
      CorpusReader shardCorpus = new CorpusReader(path + File.separator + "corpus");
      shardCorpus.close();
    }
  }

  // finally write new checkpointing data (checkpoints the disk indexes)
  Parameters checkpoint = createCheckpoint();
  this.checkpointer.saveCheckpoint(checkpoint);
}
示例3: CorpusSelectiveSplitParser
import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入依赖的package包/类
/**
 * Opens the corpus named by the "corpus" parameter and loads the numeric document ids listed
 * in the split's file.
 *
 * @param split split whose {@code fileName} points at the id list (one id per line, per the
 *     original author's note)
 * @param p parameters; the "corpus" string is the path passed to {@link CorpusReader}
 * @throws FileNotFoundException declared by the signature
 * @throws IOException if the corpus or the id file cannot be read
 * @throws NumberFormatException if an entry of the id file is not a parsable long
 */
public CorpusSelectiveSplitParser(DocumentSplit split, Parameters p) throws FileNotFoundException, IOException {
  super(split, p);

  // Constructing the reader doubles as the check that the path really is a corpus.
  reader = new CorpusReader(p.getString("corpus"));

  Set<String> documentIds = Utility.readFileToStringSet(new File(split.fileName));
  docIds = new long[documentIds.size()];
  int next = 0;
  for (String rawId : documentIds) {
    docIds[next++] = Long.parseLong(rawId);
  }

  // Ids are kept in ascending order; the cursor starts at the first one.
  Arrays.sort(docIds);
  idx = 0;
}
示例4: run
import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入依赖的package包/类
/**
 * Dumps every document of the corpus at parameter "path" to {@code output}.
 *
 * For each document the identifier key, name, metadata entries (as {@code key,value} lines) and
 * text are printed under {@code #IDENTIFIER}, {@code #NAME}, {@code #METADATA} and {@code #TEXT}
 * markers. If the corpus manifest flags "emptyIndexFile", only "Empty Corpus." is printed.
 *
 * The reader is closed on every exit path, including the empty-corpus early return and any
 * exception thrown while iterating.
 *
 * @param p parameters; "path" locates the corpus and {@code p} also configures which document
 *     components are loaded
 * @param output stream receiving the dump
 * @throws Exception if the corpus cannot be opened, read or closed
 */
@Override
public void run(Parameters p, PrintStream output) throws Exception {
  CorpusReader reader = new CorpusReader(p.getString("path"));
  try {
    if (reader.getManifest().get("emptyIndexFile", false)) {
      output.println("Empty Corpus.");
      return;
    }

    DocumentReader.DocumentIterator iterator = reader.getIterator();
    DocumentComponents dc = new DocumentComponents(p);
    while (!iterator.isDone()) {
      output.println("#IDENTIFIER: " + iterator.getKeyString());
      Document document = iterator.getDocument(dc);
      output.println("#NAME: " + document.name);
      output.println("#METADATA");
      for (Map.Entry<String, String> entry : document.metadata.entrySet()) {
        output.println(entry.getKey() + "," + entry.getValue());
      }
      output.println("#TEXT");
      output.println(document.text);
      iterator.nextKey();
    }
  } finally {
    // Previously skipped on the early return above and on any failure inside the loop.
    reader.close();
  }
}
示例5: getDocuments
import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入依赖的package包/类
/**
 * Fetches several documents by name from the "corpus" part in a single pass of one iterator.
 *
 * Names are first resolved to numeric ids and sorted ascending, then each id is looked up with
 * {@code findKey}. Ids that are not found, documents that come back {@code null}, and documents
 * whose load raises an {@link IOException} (logged at SEVERE) are left out of the result.
 *
 * @param documents names of the documents to fetch
 * @param p component selection handed to the iterator's {@code getDocument}
 * @return map from each loaded document's {@code name} to the document
 * @throws IllegalArgumentException if {@code parts} has no corpus reader under "corpus"
 * @throws IOException if resolving an identifier or positioning the iterator fails
 */
@Override
public Map<String, Document> getDocuments(List<String> documents, DocumentComponents p) throws IOException {
  // Resolve every name up front so the ids can be visited in ascending order.
  ArrayList<Long> sortedIds = new ArrayList<>();
  for (String name : documents) {
    sortedIds.add(getIdentifier(name));
  }
  Collections.sort(sortedIds);

  CorpusReader corpus = (CorpusReader) parts.get("corpus");
  if (corpus == null) {
    throw new IllegalArgumentException("Attempted to pull documents from index without a corpus");
  }

  HashMap<String, Document> results = new HashMap<>();
  CorpusReader.KeyIterator iter = corpus.getIterator();
  for (long id : sortedIds) {
    if (!iter.findKey(Utility.fromLong(id))) {
      continue;
    }
    try {
      Document loaded = iter.getDocument(p);
      if (loaded != null) {
        results.put(loaded.name, loaded);
      }
    } catch (IOException e) {
      // A single unreadable document is reported and skipped; the remaining ids are still tried.
      Logger.getLogger(this.getClass().getName()).log(Level.SEVERE,
          "Failed to get document: {0}\n{1}",
          new Object[]{id, e.toString()});
    }
  }
  return results;
}
示例6: CorpusSplitParser
import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入依赖的package包/类
/**
 * Opens the corpus file named by the split and positions its iterator at the split's start key.
 *
 * @param split split to parse; {@code fileName} is the corpus path and {@code startKey}, when
 *     non-null, is the key the iterator is skipped to
 * @param p parameters forwarded to the superclass constructor
 * @throws IOException if the corpus cannot be opened or the iterator cannot be positioned
 */
public CorpusSplitParser(DocumentSplit split, Parameters p) throws IOException {
  super(split, p);
  this.split = split;
  extractionParameters = new DocumentComponents(true, true, false);

  this.reader = new CorpusReader(split.fileName);
  this.iterator = (DocumentIterator) this.reader.getIterator();
  // A null start key leaves the iterator wherever getIterator() put it.
  if (split.startKey != null) {
    this.iterator.skipToKey(split.startKey);
  }
}
示例7: buildIndex
import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入依赖的package包/类
/**
 * Writes a one-document corpus into a fresh temporary directory.
 *
 * The directory path is stored in {@code temporaryName} and the document in {@code document}.
 * The document is pushed through a CorpusFolderWriter -> Sorter -> SplitBTreeKeyWriter chain,
 * and the chain is closed from its head.
 */
@Test
public void buildIndex() throws IOException, IncompatibleProcessorException {
  // createTemporary() hands back a file; swap it for a directory at the same path.
  File temporary = FileUtility.createTemporary();
  assertTrue(temporary.delete());
  assertTrue(temporary.mkdirs());
  String corpusPath = temporary.getAbsolutePath();
  temporaryName = corpusPath;

  // Build an encoded document:
  document = new Document();
  document.identifier = 10;
  document.name = "doc-identifier";
  document.text = "This is the text part.";
  document.metadata.put("Key", "Value");
  document.metadata.put("Something", "Else");

  Parameters corpusWriterParameters = Parameters.create();
  corpusWriterParameters.set("readerClass", CorpusReader.class.getName());
  corpusWriterParameters.set("writerClass", CorpusFolderWriter.class.getName());
  corpusWriterParameters.set("filename", corpusPath);

  // Each stage receives its own clone of the parameters.
  CorpusFolderWriter valueWriter =
      new CorpusFolderWriter(new FakeParameters(corpusWriterParameters.clone()));
  Sorter sorter = new Sorter(new KeyValuePair.KeyOrder());
  SplitBTreeKeyWriter keyWriter =
      new SplitBTreeKeyWriter(new FakeParameters(corpusWriterParameters.clone()));

  valueWriter.setProcessor(sorter);
  sorter.setProcessor(keyWriter);

  valueWriter.process(document);
  valueWriter.close();
}
示例8: testBuildIndexSpecific
import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入依赖的package包/类
/**
 * Builds an index whose corpus uses WebDocumentSerializer and checks that the corpus manifest
 * records that serializer and that a document read back with JustTerms has non-null text and
 * the seven expected terms.
 *
 * The corpus reader is closed before the temporary directory is removed, so no open reader
 * remains on files that are about to be deleted.
 */
@Test
public void testBuildIndexSpecific() throws Exception {
  File tmpDir = FileUtility.createTemporaryDirectory();
  try {
    File inputTxt = new File(tmpDir, "input.txt");
    File testIndex = new File(tmpDir, "test.galago");
    StreamUtil.copyStringToFile("this is a document of some kind", inputTxt);
    BuildIndex.execute(
        Parameters.parseArray(
            "inputPath", inputTxt,
            "indexPath", testIndex,
            "corpusParameters", Parameters.parseArray(
                "documentSerializerClass", WebDocumentSerializer.class.getName())),
        System.out);

    CorpusReader reader = new CorpusReader(new File(testIndex, "corpus").getAbsolutePath());
    try {
      assertEquals(WebDocumentSerializer.class.getName(), reader.getManifest().getString("documentSerializerClass"));
      System.out.println(reader.serializer.getClass());
      Document document = reader.getIterator().getDocument(Document.DocumentComponents.JustTerms);
      assertNotNull(document);
      assertNotNull(document.text);
      assertNotNull(document.terms);
      assertEquals(7, document.terms.size());
      assertEquals("this", document.terms.get(0));
    } finally {
      // Release the reader even when an assertion fails; the outer finally then deletes tmpDir.
      reader.close();
    }
  } finally {
    FSUtil.deleteDirectory(tmpDir);
  }
}
示例9: serializerClass
import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入依赖的package包/类
/**
 * Builds an index whose corpus uses TokenizedDocumentSerializer and checks that the corpus
 * manifest records that serializer and that a document read back with JustTerms has null text
 * but the seven expected terms.
 *
 * The corpus reader is closed before the temporary directory is removed, so no open reader
 * remains on files that are about to be deleted.
 */
@Test
public void serializerClass() throws Exception {
  File tmpDir = FileUtility.createTemporaryDirectory();
  try {
    File inputTxt = new File(tmpDir, "input.txt");
    File testIndex = new File(tmpDir, "test.galago");
    StreamUtil.copyStringToFile("this is a document of some kind", inputTxt);
    BuildIndex.execute(
        Parameters.parseArray(
            "inputPath", inputTxt,
            "indexPath", testIndex,
            "corpusParameters", Parameters.parseArray(
                "documentSerializerClass", TokenizedDocumentSerializer.class.getName())),
        System.out);

    CorpusReader reader = new CorpusReader(new File(testIndex, "corpus").getAbsolutePath());
    try {
      assertEquals(TokenizedDocumentSerializer.class.getName(), reader.getManifest().getString("documentSerializerClass"));
      System.out.println(reader.serializer.getClass());
      Document document = reader.getIterator().getDocument(Document.DocumentComponents.JustTerms);
      assertNotNull(document);
      assertNull(document.text);
      assertNotNull(document.terms);
      assertEquals(7, document.terms.size());
      assertEquals("this", document.terms.get(0));
    } finally {
      // Release the reader even when an assertion fails; the outer finally then deletes tmpDir.
      reader.close();
    }
  } finally {
    FSUtil.deleteDirectory(tmpDir);
  }
}
示例10: run
import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入依赖的package包/类
/**
 * Builds a disk map from document name to publication year using each document's "date"
 * metadata field.
 *
 * Every document in the "corpus" part of the index at parameter "index" is visited with only
 * its metadata loaded. Documents with a missing or blank "date", or a date from which
 * {@code DateRecognition.tryExtractMetadataYear} extracts no year, are skipped. The map is
 * written to the path given by parameter "output".
 *
 * @param argp parameters supplying "index" and "output"
 * @throws Exception if the index, corpus or output map cannot be read or written
 */
@Override
public void run(Parameters argp) throws Exception {
  DiskIndex index = new DiskIndex(new File(argp.getString("index")).getAbsolutePath());
  DiskMapBuilder dmb = new DiskMapBuilder(new File(argp.getString("output")).getAbsolutePath());

  CorpusReader corpusReader = (CorpusReader) index.getIndexPart("corpus");
  for (CorpusReader.KeyIterator cursor = corpusReader.getIterator(); !cursor.isDone(); cursor.nextKey()) {
    Document doc = cursor.getDocument(Document.DocumentComponents.JustMetadata);
    String docName = doc.name;
    String date = doc.metadata.get("date");

    boolean hasDate = date != null && !date.trim().isEmpty();
    if (hasDate) {
      // Any date that yields no year is skipped, whatever it contains ("?", "--", "n.d", "s.d", ...).
      Integer year = DateRecognition.tryExtractMetadataYear(date);
      if (year != null) {
        dmb.put(ByteUtil.fromString(docName), Utility.fromInt(year));
      }
    }
  }

  dmb.close();
  System.out.println("## DONE");
}
示例11: run
import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入依赖的package包/类
/**
 * Builds a disk map from document name to year using the first time expression found inside
 * each document's {@code <DATE>} tag.
 *
 * Every document in the "corpus" part of the index at parameter "index" is visited with only
 * its text loaded. A document contributes an entry only when its text contains {@code <DATE>},
 * at least one timex sentence is extracted from the tag contents, a year can be derived from
 * the first one, and that year matches the dataset named by parameter "dataset" (default
 * "robust04"). The map is written to the path given by parameter "output".
 *
 * Documents whose text is {@code null} are skipped instead of aborting the run with a
 * NullPointerException.
 *
 * @param argp parameters supplying "index", "output" and optionally "dataset"; also passed to
 *     {@code NLP.instance}
 * @throws Exception if the index, corpus or output map cannot be read or written
 */
@Override
public void run(Parameters argp) throws Exception {
  File input = new File(argp.getString("index"));
  DiskIndex index = new DiskIndex(input.getAbsolutePath());
  CorpusReader corpusReader = (CorpusReader) index.getIndexPart("corpus");

  StanfordCoreNLP nlp = NLP.instance(argp);

  File output = new File(argp.getString("output"));
  DiskMapBuilder dmb = new DiskMapBuilder(output.getAbsolutePath());

  // The dataset name does not change between documents, so look it up once.
  String dataset = argp.get("dataset", "robust04");

  for (CorpusReader.KeyIterator iterator = corpusReader.getIterator(); !iterator.isDone(); iterator.nextKey()) {
    Document doc = iterator.getDocument(Document.DocumentComponents.JustText);
    String docName = doc.name;
    String text = doc.text;

    // Some corpora store no raw text at all; such documents cannot carry a <DATE> tag.
    if (text == null || !text.contains("<DATE>")) {
      continue;
    }

    String innerDate = DateUtil.fixRobustDates(SGML.getTagContents(text, "DATE"));
    List<ExtractTimexSentences.SentenceInfo> sentences = ExtractTimexSentences.extractFromSinglePage(nlp, innerDate);
    if (sentences.isEmpty()) {
      continue;
    }

    String firstTimex = sentences.get(0).timexValue;
    Integer year = DateRecognition.getYear(firstTimex);
    if (year == null) {
      continue;
    }
    if (!DataSet.yearMatches(year, dataset)) {
      continue;
    }

    dmb.put(ByteUtil.fromString(docName), Utility.fromInt(year));
  }

  dmb.close();
  System.out.println("## DONE");
}
示例12: documentIterable
import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入依赖的package包/类
/**
 * Exposes the documents of an index's "corpus" part as an {@link Iterable}.
 *
 * @param index index whose "corpus" part is read; the part is cast to {@link CorpusReader}
 * @param opts component selection passed to the corpus iterator's {@code getSource}
 * @return the result of {@code asIterable} applied to the corpus source
 * @throws IOException if the corpus part or its iterator cannot be obtained
 */
public static Iterable<Document> documentIterable(DiskIndex index, Document.DocumentComponents opts) throws IOException {
  CorpusReader.KeyIterator keys = ((CorpusReader) index.getIndexPart("corpus")).getIterator();
  CorpusReaderSource source = keys.getSource(opts);
  return asIterable(source);
}