Java DocumentStreamParser类代码示例

本文整理汇总了Java中org.lemurproject.galago.core.parse.DocumentStreamParser类的典型用法代码示例。如果您正苦于以下问题：Java DocumentStreamParser类的具体用法？Java DocumentStreamParser怎么用？Java DocumentStreamParser使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。

DocumentStreamParser类属于org.lemurproject.galago.core.parse包，在下文中一共展示了DocumentStreamParser类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: process

import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Reads a delimited word-count listing from the given split and hands one
 * {@code WordCount} per parsed line to {@code processor}.
 *
 * <p>Each line is split on {@code delim}. Field 0 is the term; fields 1, 2 and 3,
 * when present, are parsed as longs and passed as the {@code cf}, {@code dc} and
 * {@code mxdf} constructor arguments of {@code WordCount}. Any missing count
 * defaults to 1. Lines that split into zero fields are skipped.
 *
 * @param doc the split whose contents are read line by line
 * @throws IOException if reading from the split fails, or if one is propagated
 *         by the calls made here
 * @throws NumberFormatException if a count field that is present is not a valid long
 */
@Override
public void process(DocumentSplit doc) throws IOException {
    // try-with-resources: the reader is released even when a malformed line or
    // the downstream processor throws part-way through the file.
    try (BufferedReader reader = DocumentStreamParser.getBufferedReader(doc)) {
        for (String str = reader.readLine();
                str != null;
                str = reader.readLine()) {

            String[] parts = str.split(delim);
            if (parts.length == 0) {
                // split() drops trailing empty fields, so a delimiter-only line yields nothing
                continue;
            }

            String t = parts[0];
            long cf = 1;
            long dc = 1;
            long mxdf = 1;
            if (parts.length >= 2) {
                cf = Long.parseLong(parts[1]);
            }
            if (parts.length >= 3) {
                dc = Long.parseLong(parts[2]);
            }
            if (parts.length >= 4) {
                // The fourth field lives at index 3; index 4 would be out of
                // bounds for a line with exactly four fields.
                mxdf = Long.parseLong(parts[3]);
            }

            WordCount wc = new WordCount(ByteUtil.fromString(t), cf, dc, mxdf);
            processor.process(wc);
        }
    }
}

开发者ID:teanalab，项目名称:demidovii，代码行数:32，代码来源:ParseWordCountString.java

示例2: forEachDocument

import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Passes every document produced by {@code parser} to {@code action}, in the
 * order the parser yields them, stopping at the first {@code null} returned by
 * {@code nextDocument()}.
 *
 * <p>The parser is closed before this method returns, whether it finishes
 * normally or an exception escapes.
 *
 * @param parser source of documents; closed by this method
 * @param action callback invoked once per document
 * @throws IOException declared for the parser and callback calls made here
 */
public static void forEachDocument(DocumentStreamParser parser, Operation<Document> action) throws IOException {
  try {
    for (Document next = parser.nextDocument(); next != null; next = parser.nextDocument()) {
      action.process(next);
    }
  } finally {
    parser.close();
  }
}

开发者ID:jjfiv，项目名称:ecir2015timebooks，代码行数:12，代码来源:GalagoUtil.java

示例3: readEach

import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Opens a {@code BufferedReader} for each split in turn and passes it to
 * {@code action}. Each reader is closed as soon as the callback for that split
 * returns or throws, before the next split is opened.
 *
 * @param splits the splits to open, visited in iteration order
 * @param action callback that consumes one reader per split
 * @throws IOException declared for opening, processing and closing the readers
 */
public static void readEach(List<DocumentSplit> splits, Operation<BufferedReader> action) throws IOException {
  for (DocumentSplit current : splits) {
    try (BufferedReader in = DocumentStreamParser.getBufferedReader(current)) {
      action.process(in);
    }
  }
}

开发者ID:jjfiv，项目名称:ecir2015timebooks，代码行数:8，代码来源:GalagoUtil.java

示例4: documentsStreamIterable

import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Adapts a {@code DocumentStreamParser} to {@code Iterable<Document>} so it can
 * be used in a for-each loop.
 *
 * <p>Every call to {@code iterator()} builds a new {@code DocumentStreamIterator}
 * around the same {@code parser} instance. An {@code IOException} thrown while
 * building that iterator is rethrown wrapped in a {@code RuntimeException},
 * because {@code Iterable.iterator()} cannot declare checked exceptions.
 *
 * @param parser the parser to iterate over; this method does not close it
 * @return an iterable view over the parser's documents
 */
public static Iterable<Document> documentsStreamIterable(final DocumentStreamParser parser) {
  final Iterable<Document> view = new Iterable<Document>() {
    @Override
    public Iterator<Document> iterator() {
      final Iterator<Document> it;
      try {
        it = new DocumentStreamIterator(parser);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      return it;
    }
  };
  return view;
}

开发者ID:jjfiv，项目名称:ecir2015timebooks，代码行数:13，代码来源:GalagoUtil.java

示例5: run

import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Parses each input file with {@code MBTEIPageParser} and passes every page,
 * together with a CoreNLP pipeline and the output writer, to {@code handlePage}.
 *
 * <p>Parameters read from {@code argp}: {@code output} (destination path),
 * {@code input} (list of paths, expanded by {@code Util.checkAndExpandPaths}),
 * and {@code listFile} (when true, the inputs are replaced by
 * {@code Util.collectLines(files)}).
 *
 * <p>A failure while parsing or handling one file is reported on stderr and
 * processing moves on to the next file. The output is flushed after every file.
 *
 * @param argp tool parameters
 * @throws Exception if setup fails or closing a page parser fails
 */
@Override
public void run(Parameters argp) throws Exception {
  DocumentStreamParser.addExternalParsers(argp);
  // try-with-resources: the writer is closed even when path expansion, NLP
  // setup, or closing a page parser throws, so buffered output is not lost
  // and the file handle is not leaked.
  try (PrintWriter output = IO.printWriter(argp.getString("output"))) {
    List<File> files = Util.checkAndExpandPaths(argp.getAsList("input", String.class));
    boolean isListFile = argp.getBoolean("listFile");
    if (isListFile) {
      files = Util.collectLines(files);
    }
    final StanfordCoreNLP nlp = NLP.instance(Parameters.parseArray("annotators",
      Arrays.asList("tokenize", "cleanxml", "ssplit")));

    for (File file : files) {
      System.err.println("# File: " + file.getAbsolutePath());

      MBTEIPageParser pageParser = null;
      try {
        DocumentSplit split = new DocumentSplit();
        split.fileName = file.getCanonicalPath();
        pageParser = new MBTEIPageParser(split, argp);
        while (true) {
          Document page = pageParser.nextDocument();
          if (page == null) break;
          handlePage(page, nlp, output);
        }
      } catch (Exception ex) {
        // Best effort: one bad file must not abort the whole batch.
        System.err.println("# Fail: "+file);
        ex.printStackTrace();
      } finally {
        // Flush per file so finished work reaches disk even if a later file fails.
        output.flush();
        if (pageParser != null) pageParser.close();
      }
    }
  }
}

开发者ID:jjfiv，项目名称:ecir2015timebooks，代码行数:37，代码来源:PageToSentenceTSV.java

示例6: selectsDatedSentenceParser

import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Checks that {@code DocumentStreamParser.instance} returns a
 * {@code DatedSentenceParser} when the {@code filetype} parameter names that
 * class, and that the parser's {@code conf} carries the {@code dataset} value
 * that was passed in.
 */
@Test
public void selectsDatedSentenceParser() throws IOException {
  File tmp = null;
  try {
    tmp = FileUtility.createTemporary();
    Parameters buildP = Parameters.parseArray("filetype", DatedSentenceParser.class.getName(), "dataset", "none");
    DocumentSplit fakeSplit = DocumentSplitFactory.file(tmp);
    // Close the parser before the temp file is deleted: it is built over that
    // file, and leaving it open leaks whatever it holds on it.
    try (DocumentStreamParser parser = DocumentStreamParser.instance(fakeSplit, buildP)) {
      DatedSentenceParser dsp = (DatedSentenceParser) parser;
      assertEquals("none", dsp.conf.getString("dataset"));
    }
  } finally {
    if(tmp != null) assertTrue(tmp.delete());
  }
}

开发者ID:jjfiv，项目名称:ecir2015timebooks，代码行数:14，代码来源:DatedSentenceParserTest.java

示例7: simpleParse

import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Feeds three DBpedia abstract triples to {@code DbpediaAbstractParser} and
 * checks the result. The first triple has an empty abstract, and the test
 * expects exactly two documents (Autism, Achilles) followed by repeated
 * {@code null}s. It also checks the document names and the generated
 * title/body text for Achilles.
 */
@Test
public void simpleParse() throws IOException {
  String data = "<http://dbpedia.org/resource/Anarchism> <http://dbpedia.org/ontology/abstract> \"\"@en .\n" +
      "<http://dbpedia.org/resource/Autism> <http://dbpedia.org/ontology/abstract> \"Autism is a disorder of neural development characterized by impaired social interaction and communication, and by restricted and repetitive behavior.\"@en .\n" +
      "<http://dbpedia.org/resource/Achilles> <http://dbpedia.org/ontology/abstract> \"In Greek mythology, Achilles was a Greek hero of the Trojan War and the central character and greatest warrior of Homer's Iliad.\"@en .\n";

  File tmp = File.createTempFile("fake-dbpedia-abstracts", ".ttl");
  try {
    Utility.copyStringToFile(data, tmp);
    // try-with-resources: the parser is closed before the temp file is
    // deleted, so no handle on the file outlives the test.
    try (DocumentStreamParser ps = new DbpediaAbstractParser(DocumentSplitFactory.file(tmp), Parameters.instance())) {

      Document autism = ps.nextDocument();
      assertNotNull(autism);
      Document achilles = ps.nextDocument();
      assertNotNull(achilles);
      // Asking again after exhaustion must keep returning null.
      assertNull(ps.nextDocument());
      assertNull(ps.nextDocument());

      assertEquals("Autism", autism.name);
      assertEquals("Achilles", achilles.name);
      assertEquals("<title>Achilles</title>\n<body>In Greek mythology, Achilles was a Greek hero of the Trojan War and the central character and greatest warrior of Homer's Iliad.</body>", achilles.text);
    }

  } finally {
    assertTrue(tmp.delete());
  }

}

开发者ID:jjfiv，项目名称:ecir2015timebooks，代码行数:28，代码来源:DbpediaAbstractParserTest.java

示例8: BufferedReaderParser

import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Creates a parser over {@code split}: passes both arguments to the superclass
 * constructor, opens a reader for the split through
 * {@code DocumentStreamParser.getBufferedReader}, and keeps {@code p} in
 * {@code conf}.
 *
 * @param split the split to read from
 * @param p parameters, forwarded to the superclass and stored in {@code conf}
 * @throws IOException declared by this constructor; NOTE(review): presumably
 *         raised when the split cannot be opened — confirm
 */
public BufferedReaderParser(DocumentSplit split, Parameters p) throws IOException {
  super(split, p);
  this.reader = DocumentStreamParser.getBufferedReader(split);
  this.conf = p;
}

开发者ID:jjfiv，项目名称:ecir2015timebooks，代码行数:6，代码来源:BufferedReaderParser.java

示例9: DbpediaAbstractParser

import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Creates a parser over {@code split}: passes both arguments to the superclass
 * constructor and opens a reader for the split through
 * {@code DocumentStreamParser.getBufferedReader}, stored in {@code reader}.
 *
 * @param split the split to read from
 * @param p parameters, forwarded to the superclass constructor
 * @throws IOException declared by this constructor; NOTE(review): presumably
 *         raised when the split cannot be opened — confirm
 */
public DbpediaAbstractParser(DocumentSplit split, Parameters p) throws IOException {
  super(split, p);
  this.reader = DocumentStreamParser.getBufferedReader(split);
}

开发者ID:jjfiv，项目名称:ecir2015timebooks，代码行数:5，代码来源:DbpediaAbstractParser.java

示例10: run

import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Extracts dated sentences from every document of every input split and writes
 * them as tab-separated lines (document name, timex value, sentence) to the
 * output file.
 *
 * <p>After a document's sentences are written, a marker line
 * {@code "#" + doc.name} is written. When the optional {@code previous}
 * parameter lists earlier output files, every line in them that starts with
 * {@code "#"} is treated as such a marker, and documents already marked are
 * skipped.
 *
 * <p>A failure while processing one document is reported on stderr and the run
 * continues with the next document.
 *
 * @param argp tool parameters; reads {@code previous} (optional), {@code input}
 *        and {@code output}, and is also passed on to the split factory, the
 *        NLP pipeline and the parsers
 * @throws IllegalArgumentException if no input splits are found
 * @throws Exception if setup or reading a split fails
 */
@Override
public void run(Parameters argp) throws Exception {

  final Set<String> finished = new HashSet<>();
  if (argp.containsKey("previous")) {
    List<String> previousOutput = argp.getAsList("previous", String.class);
    for (String prev : previousOutput) {
      IO.forEachLine(IO.file(prev), new IO.StringFunctor() {
        @Override
        public void process(String input) {
          // Marker lines written by an earlier run name the finished documents.
          if (input.startsWith("#")) {
            finished.add(StrUtil.removeFront(input, "#"));
          }
        }
      });
    }
  }

  List<DocumentSplit> inputs = GalagoUtil.getDocumentSplits(argp.getAsList("input", String.class), argp);

  // try-with-resources: the writer is closed (and so flushed) even when the
  // empty-input check below throws or a parser fails mid-run.
  try (PrintWriter out = new PrintWriter(argp.getString("output"))) {

    if (inputs.isEmpty()) {
      throw new IllegalArgumentException("Input set is empty!");
    }
    System.err.println("# found "+inputs.size()+" input documents!");

    StanfordCoreNLP nlp = NLP.instance(argp);

    for (DocumentSplit split : inputs) {
      System.err.println("# "+split.fileName);
      try (DocumentStreamParser parser = DocumentStreamParser.instance(split, argp)) {
        for (Document doc : GalagoUtil.documentsStreamIterable(parser)) {
          if (finished.contains(doc.name)) continue;

          System.err.println("# " + doc.name);
          try {
            String text = doc.text;
            // hack for silly \\N documents when dealing with wex index
            if (text.length() <= 3) continue;

            List<ExtractTimexSentences.SentenceInfo> sentences = ExtractTimexSentences.extractFromSinglePage(nlp, text);

            for (ExtractTimexSentences.SentenceInfo sinfo : sentences) {
              out.printf("%s\t%s\t%s\n",
                doc.name,
                sinfo.timexValue,
                sinfo.sentence);
            }
            out.println("#" + doc.name); // finish-document marker
          } catch (Exception ex) {
            // Best effort: one bad document must not abort the whole run.
            System.err.println("# Fail: " + split.fileName + "/" + doc.name);
            ex.printStackTrace(System.err);
          }
        }
      }
    }
  }
}

开发者ID:jjfiv，项目名称:ecir2015timebooks，代码行数:59，代码来源:ExtractDatedSentences.java

示例11: DocumentStreamIterator

import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Wraps {@code parser} and immediately reads its first document into
 * {@code current}, so construction itself consumes one document from the parser.
 *
 * @param parser the parser to pull documents from
 * @throws IOException if reading the first document fails
 */
public DocumentStreamIterator(DocumentStreamParser parser) throws IOException {
  this.parser = parser;
  // Read one document ahead at construction time.
  this.current = parser.nextDocument();
}

开发者ID:jjfiv，项目名称:ecir2015timebooks，代码行数:5，代码来源:GalagoUtil.java

示例12: testSimpleData

import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Runs the {@code doc-date-lm-collector} tool with {@code --what=books} and
 * {@code --dataset=none} on four tab-separated input lines, then reads the
 * output back through {@code DocDateSketchParser}.
 *
 * <p>Expects three documents: two for {@code doc0} (years 1981 and 1982, in
 * either order) and one for {@code doc1} with year 1783.
 */
@Test
public void testSimpleData() throws Exception {
  File inF = null;
  File outF = null;
  try {
    inF = File.createTempFile("asd", "jkl");
    outF = File.createTempFile("asd", "jkl");

    Utility.copyStringToFile(
      "doc0\t0\t0\t1982\tThis is the way things are, here in 1982.\n" +
      "doc0\t0\t0\t1981\tThis is the way things were, last year, in 1981.\n" +
      "doc0\t0\t0\t1982\tBut 1982 wasn't always this good.\n" +
      "doc1\t0\t0\t1783\t1783 was a year that I keep using for examples.\n",
      inF
    );

    Main.main(new String[]{
      "--tool=doc-date-lm-collector",
      "--dataset=none",
      "--what=books",
      "--input=" + inF.getAbsolutePath(),
      "--output=" + outF.getAbsolutePath()
    });

    DocumentSplit written = DocumentSplitFactory.file(outF);

    List<Document> docs = new ArrayList<Document>();
    // Close the parser before the output file is deleted in the finally block.
    try (DocumentStreamParser docsSP = new DocDateSketchParser(written, Parameters.instance())) {
      while (true) {
        Document d = docsSP.nextDocument();
        if (d == null) break;
        docs.add(d);
      }
    }

    Assert.assertEquals(3, docs.size());
    Assert.assertEquals("doc0", docs.get(0).metadata.get("book"));
    Assert.assertEquals("doc0", docs.get(1).metadata.get("book"));
    Assert.assertEquals("doc1", docs.get(2).metadata.get("book"));
    Assert.assertEquals("1783", docs.get(2).metadata.get("year"));
    String year0 = docs.get(0).metadata.get("year");
    String year1 = docs.get(1).metadata.get("year");
    // The two doc0 entries may come back in either order.
    Assert.assertTrue(("1981".equals(year0) && "1982".equals(year1)) || ("1982".equals(year0) && "1981".equals(year1)));

  } finally {
    // Only clean up what was actually created; asserting non-null here would
    // replace the real failure with an unrelated AssertionError when
    // temp-file creation itself failed.
    if (inF != null) {
      Assert.assertTrue(inF.delete());
    }
    if (outF != null) {
      Assert.assertTrue(outF.delete());
    }
  }

}

开发者ID:jjfiv，项目名称:ecir2015timebooks，代码行数:53，代码来源:DocDateLMCollectorTest.java

示例13: testByPages

import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Runs the {@code doc-date-lm-collector} tool with {@code --what=pages} and
 * {@code --dataset=none} on four tab-separated input lines, then reads the
 * output back through {@code DocDateSketchParser}.
 *
 * <p>Expects three documents: two keyed {@code doc0_0} (years 1981 and 1982,
 * in either order) and one keyed {@code doc0_1} with year 1783.
 */
@Test
public void testByPages() throws Exception {
  File inF = null;
  File outF = null;
  try {
    inF = File.createTempFile("asdf", "jkl");
    outF = File.createTempFile("asdf", "jkl");

    Utility.copyStringToFile(
      "doc0\t0\t0\t1982\tThis is the way things are, here in 1982.\n" +
      "doc0\t0\t0\t1981\tThis is the way things were, last year, in 1981.\n" +
      "doc0\t0\t0\t1982\tBut 1982 wasn't always this good.\n" +
      "doc0\t1\t0\t1783\t1783 was a year that I keep using for examples.\n",
      inF
    );

    Main.main(new String[]{
      "--tool=doc-date-lm-collector",
      "--dataset=none",
      "--what=pages",
      "--input=" + inF.getAbsolutePath(),
      "--output=" + outF.getAbsolutePath()
    });

    DocumentSplit written = DocumentSplitFactory.file(outF);

    List<Document> docs = new ArrayList<Document>();
    // Close the parser before the output file is deleted in the finally block.
    try (DocumentStreamParser docsSP = new DocDateSketchParser(written, Parameters.instance())) {
      while (true) {
        Document d = docsSP.nextDocument();
        if (d == null) break;
        docs.add(d);
      }
    }

    Assert.assertEquals(3, docs.size());
    Assert.assertEquals("doc0_0", docs.get(0).metadata.get("book"));
    Assert.assertEquals("doc0_0", docs.get(1).metadata.get("book"));
    Assert.assertEquals("doc0_1", docs.get(2).metadata.get("book"));
    Assert.assertEquals("1783", docs.get(2).metadata.get("year"));
    String year0 = docs.get(0).metadata.get("year");
    String year1 = docs.get(1).metadata.get("year");
    // The two doc0_0 entries may come back in either order.
    Assert.assertTrue(("1981".equals(year0) && "1982".equals(year1)) || ("1982".equals(year0) && "1981".equals(year1)));

  } finally {
    if (inF != null) {
      Assert.assertTrue(inF.delete());
    }
    if (outF != null) {
      Assert.assertTrue(outF.delete());
    }
  }

}

开发者ID:jjfiv，项目名称:ecir2015timebooks，代码行数:55，代码来源:DocDateLMCollectorTest.java

示例14: testFilterYears

import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * Runs the {@code doc-date-lm-collector} tool with {@code --what=books} and
 * {@code --dataset=books} on four tab-separated input lines, then reads the
 * output back through {@code DocDateSketchParser}.
 *
 * <p>Expects a single document: {@code doc1} with year 1783.
 * NOTE(review): presumably the {@code books} dataset setting filters out the
 * 1981/1982 entries — confirm against the collector.
 */
@Test
public void testFilterYears() throws Exception {
  File inF = null;
  File outF = null;
  try {
    inF = File.createTempFile("asdf", "jkl");
    outF = File.createTempFile("asdf", "jkl");

    Utility.copyStringToFile(
      "doc0\t0\t0\t1982\tThis is the way things are, here in 1982.\n" +
      "doc0\t0\t0\t1981\tThis is the way things were, last year, in 1981.\n" +
      "doc0\t0\t0\t1982\tBut 1982 wasn't always this good.\n" +
      "doc1\t0\t0\t1783\t1783 was a year that I keep using for examples.\n",
      inF
    );

    Main.main(new String[]{
      "--tool=doc-date-lm-collector",
      "--dataset=books",
      "--what=books",
      "--input=" + inF.getAbsolutePath(),
      "--output=" + outF.getAbsolutePath()
    });

    DocumentSplit written = DocumentSplitFactory.file(outF);

    List<Document> docs = new ArrayList<Document>();
    // Close the parser before the output file is deleted in the finally block.
    try (DocumentStreamParser docsSP = new DocDateSketchParser(written, Parameters.instance())) {
      while (true) {
        Document d = docsSP.nextDocument();
        if (d == null) break;
        docs.add(d);
      }
    }

    Assert.assertEquals(1, docs.size());
    Assert.assertEquals("doc1", docs.get(0).metadata.get("book"));
    Assert.assertEquals("1783", docs.get(0).metadata.get("year"));
  } finally {
    if (inF != null) {
      Assert.assertTrue(inF.delete());
    }
    if (outF != null) {
      Assert.assertTrue(outF.delete());
    }
  }

}

开发者ID:jjfiv，项目名称:ecir2015timebooks，代码行数:48，代码来源:DocDateLMCollectorTest.java

示例15: SignalMediaJSONParser

import org.lemurproject.galago.core.parse.DocumentStreamParser; //导入依赖的package包/类
/**
 * This is the constructor expected by UniversalParser.
 * It must be implemented in each implementing class.
 *
 * <p>Passes both arguments to the superclass constructor and opens a reader for
 * the split through {@code DocumentStreamParser.getBufferedReader}, stored in
 * {@code reader}.
 *
 * @param split the split to read from
 * @param p parameters, forwarded to the superclass constructor
 * @throws IOException declared by this constructor; NOTE(review): presumably
 *         raised when the split cannot be opened — confirm
 */
public SignalMediaJSONParser(DocumentSplit split, Parameters p) throws IOException {
  super(split, p);
  this.reader = DocumentStreamParser.getBufferedReader(split);
}

开发者ID:teanalab，项目名称:demidovii，代码行数:12，代码来源:SignalMediaJSONParser.java

注：本文中的org.lemurproject.galago.core.parse.DocumentStreamParser类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。