本文整理汇总了Java中org.lemurproject.galago.core.index.corpus.CorpusReader.getIterator方法的典型用法代码示例。如果您正苦于以下问题:Java CorpusReader.getIterator方法的具体用法?Java CorpusReader.getIterator怎么用?Java CorpusReader.getIterator使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.lemurproject.galago.core.index.corpus.CorpusReader的用法示例。
在下文中一共展示了CorpusReader.getIterator方法的4个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: run
import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入方法依赖的package包/类
/**
 * Dumps every document of the corpus named by the {@code "path"} parameter to
 * {@code output}.
 *
 * <p>For each document the identifier key, name, metadata entries (one
 * {@code key,value} line each) and text are printed under the
 * {@code #IDENTIFIER}, {@code #NAME}, {@code #METADATA} and {@code #TEXT}
 * markers. If the corpus manifest flags {@code emptyIndexFile}, only
 * {@code "Empty Corpus."} is printed.
 *
 * @param p      parameters; {@code "path"} locates the corpus, and the whole
 *               object is also used to build the {@code DocumentComponents}
 * @param output stream the dump is written to
 * @throws Exception if opening or reading the corpus fails
 */
@Override
public void run(Parameters p, PrintStream output) throws Exception {
    CorpusReader reader = new CorpusReader(p.getString("path"));
    // try/finally so the reader is released on the early return below and on
    // any failure while iterating, not only on normal completion.
    try {
        if (reader.getManifest().get("emptyIndexFile", false)) {
            output.println("Empty Corpus.");
            return;
        }

        DocumentReader.DocumentIterator iterator = reader.getIterator();
        DocumentComponents dc = new DocumentComponents(p);

        while (!iterator.isDone()) {
            output.println("#IDENTIFIER: " + iterator.getKeyString());
            Document document = iterator.getDocument(dc);
            output.println("#NAME: " + document.name);

            output.println("#METADATA");
            for (Map.Entry<String, String> entry : document.metadata.entrySet()) {
                output.println(entry.getKey() + "," + entry.getValue());
            }

            output.println("#TEXT");
            output.println(document.text);
            iterator.nextKey();
        }
    } finally {
        reader.close();
    }
}
示例2: getDocuments
import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入方法依赖的package包/类
/**
 * Fetches the requested documents from the {@code "corpus"} index part.
 *
 * <p>Each name is resolved to an identifier via {@code getIdentifier}, the
 * identifiers are sorted, and the corpus iterator is asked to find each one in
 * that order. Identifiers that are not found, or whose document fails to load,
 * are skipped, so the result may contain fewer entries than were requested.
 *
 * @param documents names of the documents to fetch
 * @param p         which document components to load
 * @return map from document name (as stored on the loaded document) to document
 * @throws IOException              if resolving an identifier or seeking in the corpus fails
 * @throws IllegalArgumentException if the index has no {@code "corpus"} part
 */
@Override
public Map<String, Document> getDocuments(List<String> documents, DocumentComponents p) throws IOException {
    Map<String, Document> results = new HashMap<>();
    List<Long> docIds = new ArrayList<>();

    // should get a names iterator + sort requested documents
    for (String name : documents) {
        docIds.add(getIdentifier(name));
    }
    // NOTE(review): sorting presumably lets the key iterator move forward only; confirm against findKey.
    Collections.sort(docIds);

    CorpusReader corpus = (CorpusReader) parts.get("corpus");
    if (corpus == null) {
        throw new IllegalArgumentException("Attempted to pull documents from index without a corpus");
    }

    // loop over documents and pull them as requested
    CorpusReader.KeyIterator iter = corpus.getIterator();
    for (long id : docIds) {
        if (!iter.findKey(Utility.fromLong(id))) {
            continue;
        }
        try {
            Document doc = iter.getDocument(p);
            if (doc != null) {
                results.put(doc.name, doc);
            }
        } catch (IOException e) {
            // Best effort: skip this document but keep going. The exception is
            // handed to the logger so the stack trace is not lost.
            Logger.getLogger(this.getClass().getName()).log(Level.SEVERE,
                    "Failed to get document: " + id, e);
        }
    }
    return results;
}
示例3: run
import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入方法依赖的package包/类
/**
 * Builds a disk map from document name to publication year, using the
 * {@code "date"} metadata field of every document in the index's corpus.
 *
 * <p>Reads the index at parameter {@code "index"} and writes the map to
 * parameter {@code "output"}. Documents with a missing or blank date, or a
 * date from which no year can be extracted, are left out of the map.
 *
 * @param argp parameters supplying {@code "index"} and {@code "output"} paths
 * @throws Exception if the index cannot be read or the map cannot be written
 */
@Override
public void run(Parameters argp) throws Exception {
    String indexPath = new File(argp.getString("index")).getAbsolutePath();
    DiskIndex index = new DiskIndex(indexPath);

    String outputPath = new File(argp.getString("output")).getAbsolutePath();
    DiskMapBuilder dmb = new DiskMapBuilder(outputPath);

    CorpusReader corpusReader = (CorpusReader) index.getIndexPart("corpus");
    CorpusReader.KeyIterator iterator = corpusReader.getIterator();

    while (!iterator.isDone()) {
        Document doc = iterator.getDocument(Document.DocumentComponents.JustMetadata);
        String name = doc.name;
        String date = doc.metadata.get("date");

        if (date != null && !date.trim().isEmpty()) {
            // Dates with no extractable year (e.g. containing "?", "--",
            // "n.d" or "s.d") are skipped without being reported.
            Integer year = DateRecognition.tryExtractMetadataYear(date);
            if (year != null) {
                dmb.put(ByteUtil.fromString(name), Utility.fromInt(year));
            }
        }

        iterator.nextKey();
    }

    dmb.close();
    System.out.println("## DONE");
}
示例4: run
import org.lemurproject.galago.core.index.corpus.CorpusReader; //导入方法依赖的package包/类
/**
 * Builds a disk map from document name to year, taking the year from the
 * first time expression found in each document's {@code <DATE>} tag.
 *
 * <p>Reads the index at parameter {@code "index"} and writes the map to
 * parameter {@code "output"}. A document is recorded only when its text
 * contains {@code <DATE>}, at least one sentence is extracted from the tag
 * contents, a year can be derived from the first sentence's timex value, and
 * that year matches the {@code "dataset"} parameter (default
 * {@code "robust04"}).
 *
 * @param argp parameters supplying {@code "index"}, {@code "output"} and
 *             optionally {@code "dataset"}; also passed to {@code NLP.instance}
 * @throws Exception if the index cannot be read or the map cannot be written
 */
@Override
public void run(Parameters argp) throws Exception {
    String indexPath = new File(argp.getString("index")).getAbsolutePath();
    DiskIndex index = new DiskIndex(indexPath);
    CorpusReader corpusReader = (CorpusReader) index.getIndexPart("corpus");

    StanfordCoreNLP nlp = NLP.instance(argp);

    String outputPath = new File(argp.getString("output")).getAbsolutePath();
    DiskMapBuilder dmb = new DiskMapBuilder(outputPath);

    CorpusReader.KeyIterator iterator = corpusReader.getIterator();
    while (!iterator.isDone()) {
        Document doc = iterator.getDocument(Document.DocumentComponents.JustText);
        String name = doc.name;
        String body = doc.text;

        if (body.contains("<DATE>")) {
            String dateText = DateUtil.fixRobustDates(SGML.getTagContents(body, "DATE"));
            List<ExtractTimexSentences.SentenceInfo> found =
                    ExtractTimexSentences.extractFromSinglePage(nlp, dateText);

            if (!found.isEmpty()) {
                // Only the first extracted time expression is considered.
                Integer year = DateRecognition.getYear(found.get(0).timexValue);
                if (year != null && DataSet.yearMatches(year, argp.get("dataset","robust04"))) {
                    dmb.put(ByteUtil.fromString(name), Utility.fromInt(year));
                }
            }
        }

        iterator.nextKey();
    }

    dmb.close();
    System.out.println("## DONE");
}