本文整理汇总了Java中org.lemurproject.galago.core.parse.DocumentStreamParser类的典型用法代码示例。如果您正苦于以下问题:Java DocumentStreamParser类的具体用法?Java DocumentStreamParser怎么用?Java DocumentStreamParser使用的例子?那么，这里精选的类代码示例或许可以为您提供帮助。
DocumentStreamParser类属于org.lemurproject.galago.core.parse包,在下文中一共展示了DocumentStreamParser类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: process
import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Reads a delimited text split line by line and forwards one {@code WordCount}
 * per line to the downstream {@code processor}.
 *
 * <p>Each line is split on {@code delim}. Column 0 is the term; columns 1, 2 and 3,
 * when present, are parsed as {@code long} values (named {@code cf}, {@code dc} and
 * {@code mxdf} here) and each defaults to 1 when its column is missing.
 *
 * @param doc the split whose contents are read
 * @throws IOException if the split cannot be opened or read
 * @throws NumberFormatException if a present count column is not a valid long
 */
@Override
public void process(DocumentSplit doc) throws IOException {
  // try-with-resources so the reader is released even when a line fails to parse.
  try (BufferedReader reader = DocumentStreamParser.getBufferedReader(doc)) {
    for (String str = reader.readLine(); str != null; str = reader.readLine()) {
      String[] parts = str.split(delim);
      // split() can return an empty array (e.g. a line made only of delimiters).
      if (parts.length > 0) {
        String t = parts[0];
        long cf = 1;
        long dc = 1;
        long mxdf = 1;
        if (parts.length >= 2) {
          cf = Long.parseLong(parts[1]);
        }
        if (parts.length >= 3) {
          dc = Long.parseLong(parts[2]);
        }
        if (parts.length >= 4) {
          // Fourth column lives at index 3; index 4 is out of bounds for a
          // four-column line and would throw ArrayIndexOutOfBoundsException.
          mxdf = Long.parseLong(parts[3]);
        }
        WordCount wc = new WordCount(ByteUtil.fromString(t), cf, dc, mxdf);
        processor.process(wc);
      }
    }
  }
}
示例2: forEachDocument
import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Hands every document produced by {@code parser} to {@code action}, in order,
 * stopping at the first {@code null} document. The parser is closed afterwards,
 * whether iteration finished normally or an exception escaped.
 *
 * @param parser source of documents; closed by this method
 * @param action callback invoked once per document
 * @throws IOException if reading, processing or closing fails
 */
public static void forEachDocument(DocumentStreamParser parser, Operation<Document> action) throws IOException {
  try {
    for (Document next = parser.nextDocument(); next != null; next = parser.nextDocument()) {
      action.process(next);
    }
  } finally {
    parser.close();
  }
}
示例3: readEach
import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Opens a buffered reader for each split in turn and passes it to {@code action}.
 * Each reader is closed as soon as its callback returns or throws.
 *
 * @param splits the splits to open, visited in list order
 * @param action callback that consumes each open reader
 * @throws IOException if a split cannot be opened, processed or closed
 */
public static void readEach(List<DocumentSplit> splits, Operation<BufferedReader> action) throws IOException {
  for (DocumentSplit current : splits) {
    try (BufferedReader opened = DocumentStreamParser.getBufferedReader(current)) {
      action.process(opened);
    }
  }
}
示例4: documentsStreamIterable
import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Adapts a parser to {@link Iterable} so it can be used in a for-each loop.
 *
 * <p>Every call to {@code iterator()} builds a new {@code DocumentStreamIterator}
 * over the same {@code parser} instance. An {@link IOException} raised while
 * building it is rethrown wrapped in a {@link RuntimeException}.
 *
 * @param parser the parser to iterate over
 * @return an iterable view backed by {@code parser}
 */
public static Iterable<Document> documentsStreamIterable(final DocumentStreamParser parser) {
  final Iterable<Document> view = new Iterable<Document>() {
    @Override
    public Iterator<Document> iterator() {
      Iterator<Document> cursor;
      try {
        cursor = new DocumentStreamIterator(parser);
      } catch (IOException ioe) {
        throw new RuntimeException(ioe);
      }
      return cursor;
    }
  };
  return view;
}
示例5: run
import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Parses every input file page by page with {@code MBTEIPageParser} and passes
 * each page to {@code handlePage} together with the NLP pipeline and the output writer.
 *
 * <p>Parameters read from {@code argp}: {@code "output"} (path of the writer),
 * {@code "input"} (list of paths) and {@code "listFile"} (when true, the inputs are
 * passed through {@code Util.collectLines} first). A failure while handling one file
 * is reported on stderr and the loop moves on to the next file.
 *
 * @param argp tool arguments
 * @throws Exception if setup fails or a page parser cannot be closed
 */
@Override
public void run(Parameters argp) throws Exception {
  DocumentStreamParser.addExternalParsers(argp);

  // try-with-resources: the writer used to be closed only on the success path, so
  // it leaked if path expansion, NLP setup or pageParser.close() threw.
  try (PrintWriter output = IO.printWriter(argp.getString("output"))) {
    List<File> files = Util.checkAndExpandPaths(argp.getAsList("input", String.class));

    boolean isListFile = argp.getBoolean("listFile");
    if (isListFile) {
      files = Util.collectLines(files);
    }

    final StanfordCoreNLP nlp = NLP.instance(Parameters.parseArray("annotators",
        Arrays.asList("tokenize", "cleanxml", "ssplit")));

    for (File file : files) {
      System.err.println("# File: " + file.getAbsolutePath());
      MBTEIPageParser pageParser = null;
      try {
        DocumentSplit split = new DocumentSplit();
        split.fileName = file.getCanonicalPath();
        pageParser = new MBTEIPageParser(split, argp);

        while (true) {
          Document page = pageParser.nextDocument();
          if (page == null) break;
          handlePage(page, nlp, output);
        }
      } catch (Exception ex) {
        // Best effort: report the failing file and continue with the next one.
        System.err.println("# Fail: "+file);
        ex.printStackTrace();
      } finally {
        // Flush after every file so finished work survives a later failure.
        output.flush();
        if (pageParser != null) pageParser.close();
      }
    }
  }
}
示例6: selectsDatedSentenceParser
import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Checks that {@code DocumentStreamParser.instance} returns a {@code DatedSentenceParser}
 * when the {@code "filetype"} parameter holds that class name, and that the
 * {@code "dataset"} parameter is visible through the parser's {@code conf}.
 */
@Test
public void selectsDatedSentenceParser() throws IOException {
  File scratch = null;
  try {
    scratch = FileUtility.createTemporary();

    final String parserClassName = DatedSentenceParser.class.getName();
    final Parameters config = Parameters.parseArray("filetype", parserClassName, "dataset", "none");
    final DocumentSplit split = DocumentSplitFactory.file(scratch);

    final DocumentStreamParser chosen = DocumentStreamParser.instance(split, config);
    // The cast itself is part of the check: a different parser type fails here.
    final DatedSentenceParser dated = (DatedSentenceParser) chosen;

    assertEquals("none", dated.conf.getString("dataset"));
  } finally {
    if (scratch != null) {
      assertTrue(scratch.delete());
    }
  }
}
示例7: simpleParse
import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Parses a three-line DBpedia abstracts file and checks the documents produced.
 *
 * <p>The first record has an empty abstract; the assertions expect the parser to
 * yield only the Autism and Achilles records, and to keep returning {@code null}
 * once the input is exhausted.
 */
@Test
public void simpleParse() throws IOException {
  String data = "<http://dbpedia.org/resource/Anarchism> <http://dbpedia.org/ontology/abstract> \"\"@en .\n" +
      "<http://dbpedia.org/resource/Autism> <http://dbpedia.org/ontology/abstract> \"Autism is a disorder of neural development characterized by impaired social interaction and communication, and by restricted and repetitive behavior.\"@en .\n" +
      "<http://dbpedia.org/resource/Achilles> <http://dbpedia.org/ontology/abstract> \"In Greek mythology, Achilles was a Greek hero of the Trojan War and the central character and greatest warrior of Homer's Iliad.\"@en .\n";

  File tmp = File.createTempFile("fake-dbpedia-abstracts", ".ttl");
  try {
    Utility.copyStringToFile(data, tmp);
    // Close the parser before the file is deleted; an open handle leaks and can
    // make the delete below fail on platforms that lock open files.
    try (DocumentStreamParser ps = new DbpediaAbstractParser(DocumentSplitFactory.file(tmp), Parameters.instance())) {
      Document autism = ps.nextDocument();
      assertNotNull(autism);
      Document achilles = ps.nextDocument();
      assertNotNull(achilles);
      // Asking again after the end must keep returning null.
      assertNull(ps.nextDocument());
      assertNull(ps.nextDocument());

      assertEquals("Autism", autism.name);
      assertEquals("Achilles", achilles.name);
      assertEquals("<title>Achilles</title>\n<body>In Greek mythology, Achilles was a Greek hero of the Trojan War and the central character and greatest warrior of Homer's Iliad.</body>", achilles.text);
    }
  } finally {
    assertTrue(tmp.delete());
  }
}
示例8: BufferedReaderParser
import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Creates a parser over {@code split}: keeps the parameters in {@code conf} and
 * opens a buffered reader on the split.
 *
 * @param split the split to read
 * @param p parser parameters, also passed to the superclass constructor
 * @throws IOException if the reader cannot be opened
 */
public BufferedReaderParser(DocumentSplit split, Parameters p) throws IOException {
  super(split, p);
  conf = p;
  reader = DocumentStreamParser.getBufferedReader(split);
}
示例9: DbpediaAbstractParser
import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Creates a parser over {@code split} and opens a buffered reader on it.
 *
 * @param split the split to read
 * @param p parser parameters, passed to the superclass constructor
 * @throws IOException if the reader cannot be opened
 */
public DbpediaAbstractParser(DocumentSplit split, Parameters p) throws IOException {
  super(split, p);
  reader = DocumentStreamParser.getBufferedReader(split);
}
示例10: run
import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Extracts sentences from every input document via
 * {@code ExtractTimexSentences.extractFromSinglePage} and writes them to the output
 * file as tab-separated {@code name, timexValue, sentence} lines.
 *
 * <p>After each document a marker line {@code "#" + name} is written. When the
 * optional {@code "previous"} parameter lists earlier output files, documents whose
 * marker appears there are skipped. A failure on one document is reported on stderr
 * and processing continues with the next one.
 *
 * @param argp tool arguments; reads {@code "previous"}, {@code "input"} and {@code "output"}
 * @throws IllegalArgumentException if no input documents are found
 * @throws Exception if inputs cannot be resolved or the output cannot be opened
 */
@Override
public void run(Parameters argp) throws Exception {
  // Names of documents already completed by earlier runs.
  final Set<String> finished = new HashSet<>();
  if (argp.containsKey("previous")) {
    List<String> previousOutput = argp.getAsList("previous", String.class);
    for (String prev : previousOutput) {
      IO.forEachLine(IO.file(prev), new IO.StringFunctor() {
        @Override
        public void process(String input) {
          if (input.startsWith("#")) {
            finished.add(StrUtil.removeFront(input, "#"));
          }
        }
      });
    }
  }

  List<DocumentSplit> inputs = GalagoUtil.getDocumentSplits(argp.getAsList("input", String.class), argp);

  // Validate before opening the output: creating the writer truncates the file,
  // so an empty input set must not get that far.
  if (inputs.isEmpty()) {
    throw new IllegalArgumentException("Input set is empty!");
  }
  System.err.println("# found "+inputs.size()+" input documents!");

  // try-with-resources so the writer is flushed and closed on every exit path.
  try (PrintWriter out = new PrintWriter(argp.getString("output"))) {
    StanfordCoreNLP nlp = NLP.instance(argp);

    for (DocumentSplit split : inputs) {
      System.err.println("# "+split.fileName);
      try (DocumentStreamParser parser = DocumentStreamParser.instance(split, argp)) {
        for (Document doc : GalagoUtil.documentsStreamIterable(parser)) {
          if (finished.contains(doc.name)) continue;

          System.err.println("# " + doc.name);
          try {
            String text = doc.text;
            // hack for silly \\N documents when dealing with wex index
            if (text.length() <= 3) continue;

            List<ExtractTimexSentences.SentenceInfo> sentences = ExtractTimexSentences.extractFromSinglePage(nlp, text);

            for (ExtractTimexSentences.SentenceInfo sinfo : sentences) {
              out.printf("%s\t%s\t%s\n",
                  doc.name,
                  sinfo.timexValue,
                  sinfo.sentence);
            }
            out.println("#" + doc.name); // finish-document marker read back via "previous"
          } catch (Exception ex) {
            // Best effort: report the failing document and keep going.
            System.err.println("# Fail: " + split.fileName + "/" + doc.name);
            ex.printStackTrace(System.err);
          }
        }
      }
    }
  }
}
示例11: DocumentStreamIterator
import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
public DocumentStreamIterator(DocumentStreamParser parser) throws IOException {
this.parser = parser;
this.current = parser.nextDocument();
}
示例12: testSimpleData
import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Runs the {@code doc-date-lm-collector} tool with {@code --what=books} on four
 * tab-separated input lines and checks the documents read back from its output
 * with {@code DocDateSketchParser}: three documents, keyed by book and year.
 */
@Test
public void testSimpleData() throws Exception {
  File inF = null;
  File outF = null;
  try {
    inF = File.createTempFile("asd", "jkl");
    outF = File.createTempFile("asd", "jkl");

    Utility.copyStringToFile(
        "doc0\t0\t0\t1982\tThis is the way things are, here in 1982.\n" +
        "doc0\t0\t0\t1981\tThis is the way things were, last year, in 1981.\n" +
        "doc0\t0\t0\t1982\tBut 1982 wasn't always this good.\n" +
        "doc1\t0\t0\t1783\t1783 was a year that I keep using for examples.\n",
        inF
    );

    Main.main(new String[]{
        "--tool=doc-date-lm-collector",
        "--dataset=none",
        "--what=books",
        "--input=" + inF.getAbsolutePath(),
        "--output=" + outF.getAbsolutePath()
    });

    DocumentSplit written = DocumentSplitFactory.file(outF);
    List<Document> docs = new ArrayList<Document>();
    // Close the parser before the output file is deleted in the finally block.
    try (DocumentStreamParser docsSP = new DocDateSketchParser(written, Parameters.instance())) {
      while (true) {
        Document d = docsSP.nextDocument();
        if (d == null) break;
        docs.add(d);
      }
    }

    Assert.assertEquals(3, docs.size());
    Assert.assertEquals("doc0", docs.get(0).metadata.get("book"));
    Assert.assertEquals("doc0", docs.get(1).metadata.get("book"));
    Assert.assertEquals("doc1", docs.get(2).metadata.get("book"));
    Assert.assertEquals("1783", docs.get(2).metadata.get("year"));

    // The two doc0 entries may come back in either year order.
    String year0 = docs.get(0).metadata.get("year");
    String year1 = docs.get(1).metadata.get("year");
    Assert.assertTrue(("1981".equals(year0) && "1982".equals(year1)) || ("1982".equals(year0) && "1981".equals(year1)));
  } finally {
    // Guard instead of asserting non-null: if temp-file creation threw, an
    // assertion here would replace the real failure with a misleading one.
    if (inF != null) {
      Assert.assertTrue(inF.delete());
    }
    if (outF != null) {
      Assert.assertTrue(outF.delete());
    }
  }
}
示例13: testByPages
import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Runs the {@code doc-date-lm-collector} tool with {@code --what=pages} and checks
 * the documents read back from its output: three documents whose {@code "book"}
 * metadata combines the first two input columns ({@code doc0_0}, {@code doc0_1}).
 */
@Test
public void testByPages() throws Exception {
  File inF = null;
  File outF = null;
  try {
    inF = File.createTempFile("asdf", "jkl");
    outF = File.createTempFile("asdf", "jkl");

    Utility.copyStringToFile(
        "doc0\t0\t0\t1982\tThis is the way things are, here in 1982.\n" +
        "doc0\t0\t0\t1981\tThis is the way things were, last year, in 1981.\n" +
        "doc0\t0\t0\t1982\tBut 1982 wasn't always this good.\n" +
        "doc0\t1\t0\t1783\t1783 was a year that I keep using for examples.\n",
        inF
    );

    Main.main(new String[]{
        "--tool=doc-date-lm-collector",
        "--dataset=none",
        "--what=pages",
        "--input=" + inF.getAbsolutePath(),
        "--output=" + outF.getAbsolutePath()
    });

    DocumentSplit written = DocumentSplitFactory.file(outF);
    List<Document> docs = new ArrayList<Document>();
    // Close the parser before the output file is deleted in the finally block.
    try (DocumentStreamParser docsSP = new DocDateSketchParser(written, Parameters.instance())) {
      while (true) {
        Document d = docsSP.nextDocument();
        if (d == null) break;
        docs.add(d);
      }
    }

    Assert.assertEquals(3, docs.size());
    Assert.assertEquals("doc0_0", docs.get(0).metadata.get("book"));
    Assert.assertEquals("doc0_0", docs.get(1).metadata.get("book"));
    Assert.assertEquals("doc0_1", docs.get(2).metadata.get("book"));
    Assert.assertEquals("1783", docs.get(2).metadata.get("year"));

    // The two doc0_0 entries may come back in either year order.
    String year0 = docs.get(0).metadata.get("year");
    String year1 = docs.get(1).metadata.get("year");
    Assert.assertTrue(("1981".equals(year0) && "1982".equals(year1)) || ("1982".equals(year0) && "1981".equals(year1)));
  } finally {
    if (inF != null) {
      Assert.assertTrue(inF.delete());
    }
    if (outF != null) {
      Assert.assertTrue(outF.delete());
    }
  }
}
示例14: testFilterYears
import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Runs the {@code doc-date-lm-collector} tool with {@code --dataset=books} and
 * checks that only one document (book {@code doc1}, year {@code 1783}) is read
 * back from its output.
 */
@Test
public void testFilterYears() throws Exception {
  File inF = null;
  File outF = null;
  try {
    inF = File.createTempFile("asdf", "jkl");
    outF = File.createTempFile("asdf", "jkl");

    Utility.copyStringToFile(
        "doc0\t0\t0\t1982\tThis is the way things are, here in 1982.\n" +
        "doc0\t0\t0\t1981\tThis is the way things were, last year, in 1981.\n" +
        "doc0\t0\t0\t1982\tBut 1982 wasn't always this good.\n" +
        "doc1\t0\t0\t1783\t1783 was a year that I keep using for examples.\n",
        inF
    );

    Main.main(new String[]{
        "--tool=doc-date-lm-collector",
        "--dataset=books",
        "--what=books",
        "--input=" + inF.getAbsolutePath(),
        "--output=" + outF.getAbsolutePath()
    });

    DocumentSplit written = DocumentSplitFactory.file(outF);
    List<Document> docs = new ArrayList<Document>();
    // Close the parser before the output file is deleted in the finally block.
    try (DocumentStreamParser docsSP = new DocDateSketchParser(written, Parameters.instance())) {
      while (true) {
        Document d = docsSP.nextDocument();
        if (d == null) break;
        docs.add(d);
      }
    }

    Assert.assertEquals(1, docs.size());
    Assert.assertEquals("doc1", docs.get(0).metadata.get("book"));
    Assert.assertEquals("1783", docs.get(0).metadata.get("year"));
  } finally {
    if (inF != null) {
      Assert.assertTrue(inF.delete());
    }
    if (outF != null) {
      Assert.assertTrue(outF.delete());
    }
  }
}
示例15: SignalMediaJSONParser
import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Constructor with the signature expected by UniversalParser; every implementing
 * class must provide one. Opens a buffered reader on the given split.
 *
 * @param split the split to read
 * @param p parser parameters, passed to the superclass constructor
 * @throws IOException if the reader cannot be opened
 */
public SignalMediaJSONParser(DocumentSplit split, Parameters p) throws IOException {
  super(split, p);
  reader = DocumentStreamParser.getBufferedReader(split);
}