当前位置: 首页>>代码示例>>Java>>正文


Java DocumentSplitFactory.file方法代码示例

本文整理汇总了Java中org.lemurproject.galago.core.util.DocumentSplitFactory.file方法的典型用法代码示例。如果您正苦于以下问题：Java DocumentSplitFactory.file方法的具体用法？Java DocumentSplitFactory.file怎么用？Java DocumentSplitFactory.file使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.lemurproject.galago.core.util.DocumentSplitFactory的用法示例。


在下文中一共展示了DocumentSplitFactory.file方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: run

import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/**
 * Reads every file listed under the {@code inputPath} parameter line by line and forwards
 * each non-comment line to {@code processor}.
 *
 * <p>Every line read increments the {@code lines} counter. Lines that start with {@code #}
 * are counted but not forwarded. After all files are consumed the processor is closed.
 *
 * @throws IOException propagated from opening or reading an input file, or from the processor
 */
@Override
public void run() throws IOException {
  for (String f : p.getAsList("inputPath", String.class)) {
    DocumentSplit split = DocumentSplitFactory.file(f);
    // try-with-resources guarantees the reader is closed even when readLine() or
    // processor.process() throws; the previous explicit close() was skipped on failure.
    try (BufferedReader reader = DocumentStreamParser.getBufferedReader(split)) {
      String line;
      while ((line = reader.readLine()) != null) {
        lines.increment();

        if (line.startsWith("#")) {
          continue;
        }
        processor.process(line);
      }
    }
  }
  processor.close();
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:20,代码来源:FileLineParser.java

示例2: processZipFile

import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/**
 * Produces one {@link DocumentSplit} for every entry name listed in the given zip file.
 *
 * <p>When the {@code filetype} parameter is set it is used for every entry. Otherwise the
 * entry's extension is used if a parser is registered for it, and {@code detectTrecTextOrWeb}
 * is consulted as a fallback.
 *
 * @param fp the zip file to enumerate
 * @param conf configuration; only the optional {@code filetype} key is read here
 * @return the splits, one per zip entry, each pointing at {@code fp} with {@code innerName} set
 * @throws IOException if the archive cannot be opened or inspected
 */
public static List<DocumentSplit> processZipFile(File fp, Parameters conf) throws IOException {
  final String forcedType = conf.get("filetype", (String) null);
  final ArrayList<DocumentSplit> result = new ArrayList<>();

  try (ZipFile archive = ZipUtil.open(fp)) {
    for (String entryName : ZipUtil.listZipFile(archive)) {
      String entryType = forcedType;
      if (entryType == null) {
        // No forced type: prefer a registered extension, otherwise sniff the entry content.
        String ext = FSUtil.getExtension(new File(entryName));
        entryType = DocumentStreamParser.hasParserForExtension(ext)
            ? ext
            : detectTrecTextOrWeb(
                ZipUtil.streamZipEntry(archive, entryName),
                fp.getAbsolutePath() + "!" + entryName);
      }

      DocumentSplit entrySplit = DocumentSplitFactory.file(fp);
      entrySplit.fileType = entryType;
      entrySplit.innerName = entryName;
      result.add(entrySplit);
    }
  }
  return result;
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:26,代码来源:DocumentSource.java

示例3: testAppTestGenDoc

import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/**
 * Writes a document generated by {@code AppTest.trecDocument} to a temporary file and checks
 * that {@code TrecTextParser} yields exactly that one document.
 */
@Test
public void testAppTestGenDoc() throws IOException {
    String trecContent = AppTest.trecDocument("CACM-0001", "This is some text in a document.\n");

    File tmpFile = FileUtility.createTemporary();
    try {
        StreamUtil.copyStringToFile(trecContent, tmpFile);
        TrecTextParser trecParser =
                new TrecTextParser(DocumentSplitFactory.file(tmpFile, "trectext"), Parameters.create());

        Document first = trecParser.nextDocument();
        assertNotNull(first);
        assertEquals("CACM-0001", first.name);
        assertEquals("<TEXT>\nThis is some text in a document.\n</TEXT>\n", first.text);

        // The stream must be exhausted after the single document.
        assertNull(trecParser.nextDocument());
    } finally {
        tmpFile.delete();
    }
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:22,代码来源:TrecTextParserTest.java

示例4: testDocumentStreamParser

import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/**
 * Checks that {@code DocumentStreamParser.create} with a {@code trectext} split returns a
 * parser that yields the single generated document and then {@code null}.
 */
@Test
public void testDocumentStreamParser() throws IOException {
    String trecContent = AppTest.trecDocument("CACM-0001", "This is some text in a document.\n");

    File tmpFile = FileUtility.createTemporary();
    try {
        StreamUtil.copyStringToFile(trecContent, tmpFile);
        DocumentStreamParser streamParser = DocumentStreamParser.create(
                DocumentSplitFactory.file(tmpFile, "trectext"), Parameters.create());

        Document first = streamParser.nextDocument();
        assertNotNull(first);
        assertEquals("CACM-0001", first.name);
        assertEquals("<TEXT>\nThis is some text in a document.\n</TEXT>\n", first.text);

        // Nothing else should follow the single document.
        assertNull(streamParser.nextDocument());
    } finally {
        tmpFile.delete();
    }
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:22,代码来源:TrecTextParserTest.java

示例5: testParseOneDocument

import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/**
 * Parses a hand-written single-document TREC text file and verifies the document name and
 * text, then that no further document is returned.
 */
@Test
public void testParseOneDocument() throws IOException {
    String trecContent =
            "<DOC>\n" +
            "<DOCNO>CACM-0001</DOCNO>\n" +
            "<TEXT>\n" +
            "This is some text in a document.\n" +
            "</TEXT>\n" +
            "</DOC>\n";

    File tmpFile = FileUtility.createTemporary();
    try {
        StreamUtil.copyStringToFile(trecContent, tmpFile);
        TrecTextParser trecParser =
                new TrecTextParser(DocumentSplitFactory.file(tmpFile), Parameters.create());

        Document first = trecParser.nextDocument();
        assertNotNull(first);
        assertEquals("CACM-0001", first.name);
        assertEquals("<TEXT>\nThis is some text in a document.\n</TEXT>\n", first.text);

        // Only one <DOC> element was written, so the next read must be null.
        assertNull(trecParser.nextDocument());
    } finally {
        tmpFile.delete();
    }
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:27,代码来源:TrecTextParserTest.java

示例6: testParseNothing

import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/** Verifies that {@code TrecWebParser} returns {@code null} immediately for an empty file. */
@Test
public void testParseNothing() throws IOException {
  File emptyFile = FileUtility.createTemporary();
  emptyFile.createNewFile();

  try {
    TrecWebParser webParser =
        new TrecWebParser(DocumentSplitFactory.file(emptyFile), Parameters.create());

    assertNull(webParser.nextDocument());
  } finally {
    emptyFile.delete();
  }
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:16,代码来源:TrecWebParserTest.java

示例7: testParseOneDocument

import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/**
 * Parses a single TREC web document and checks its name, the {@code url} metadata value
 * expected from the {@code DOCHDR} section, and the body text.
 */
@Test
public void testParseOneDocument() throws IOException {
  String trecWebContent =
      "<DOC>\n" +
      "<DOCNO>CACM-0001</DOCNO>\n" +
      "<DOCHDR>\n" +
      "http://www.yahoo.com:80 some extra text here\n" +
      "even more text in this part\n" +
      "</DOCHDR>\n" +
      "This is some text in a document.\n" +
      "</DOC>\n";

  File tmpFile = FileUtility.createTemporary();
  try {
    StreamUtil.copyStringToFile(trecWebContent, tmpFile);
    TrecWebParser webParser =
        new TrecWebParser(DocumentSplitFactory.file(tmpFile), Parameters.create());

    Document first = webParser.nextDocument();
    assertNotNull(first);
    assertEquals("CACM-0001", first.name);
    assertEquals("http://www.yahoo.com", first.metadata.get("url"));
    assertEquals("This is some text in a document.\n", first.text);

    // Only one <DOC> element was written, so the next read must be null.
    assertNull(webParser.nextDocument());
  } finally {
    tmpFile.delete();
  }
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:31,代码来源:TrecWebParserTest.java

示例8: testParseNothing

import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/** Verifies that {@code TrecTextParser} returns {@code null} immediately for an empty file. */
@Test
public void testParseNothing() throws IOException {
    File emptyFile = FileUtility.createTemporary();
    emptyFile.createNewFile();
    try {
        TrecTextParser trecParser = new TrecTextParser(
                DocumentSplitFactory.file(emptyFile, "trectext"), Parameters.create());

        assertNull(trecParser.nextDocument());
    } finally {
        emptyFile.delete();
    }
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:15,代码来源:TrecTextParserTest.java

示例9: testParseTwoDocuments

import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/**
 * Parses a TREC text file containing two documents and verifies that they are returned in
 * order with the expected names and text, followed by {@code null}.
 */
@Test
public void testParseTwoDocuments() throws IOException {
    String trecContent =
            "<DOC>\n" +
            "<DOCNO>CACM-0001</DOCNO>\n" +
            "<TEXT>\n" +
            "This is some text in a document.\n" +
            "</TEXT>\n" +
            "</DOC>\n" +
            "<DOC>\n" +
            "<DOCNO>CACM-0002</DOCNO>\n" +
            "<TEXT>\n" +
            "This is some text in a document.\n" +
            "</TEXT>\n" +
            "</DOC>\n";

    File tmpFile = FileUtility.createTemporary();
    try {
        StreamUtil.copyStringToFile(trecContent, tmpFile);
        TrecTextParser trecParser =
                new TrecTextParser(DocumentSplitFactory.file(tmpFile), Parameters.create());

        // Both documents share the same body; only the name differs.
        for (String expectedName : new String[] {"CACM-0001", "CACM-0002"}) {
            Document current = trecParser.nextDocument();
            assertNotNull(current);
            assertEquals(expectedName, current.name);
            assertEquals("<TEXT>\nThis is some text in a document.\n</TEXT>\n", current.text);
        }

        assertNull(trecParser.nextDocument());
    } finally {
        tmpFile.delete();
    }
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:38,代码来源:TrecTextParserTest.java

示例10: testExtensions

import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/**
 * Registers external parsers for the file types {@code qqe}, {@code qwe} and {@code trecweb}
 * and verifies that they are recognised and that a {@code qwe} split is handled by
 * {@code TrecWebParser}.
 */
@Test
public void testExtensions() throws IOException {
    File tmp = FileUtility.createTemporary();
    try {
        Parameters p = Parameters.create();
        p.set("parser", Parameters.create());

        List<Parameters> kinds = new ArrayList<Parameters>();
        kinds.add(Parameters.parseArray("filetype", "qqe",
                "class", TrecTextParser.class.getName()));
        kinds.add(Parameters.parseArray("filetype", "qwe",
                "class", TrecWebParser.class.getName()));
        kinds.add(Parameters.parseArray("filetype", "trecweb",
                "class", TrecWebParser.class.getName()));
        p.getMap("parser").put("externalParsers", kinds);

        // Registration is invoked with both the nested "parser" map and the outer parameters,
        // exactly as the original test did.
        DocumentStreamParser.addExternalParsers(p.getMap("parser"));

        DocumentStreamParser.addExternalParsers(p);
        assertTrue(DocumentStreamParser.hasParserForExtension("qwe"));
        assertTrue(DocumentStreamParser.hasParserForExtension("qqe"));
        assertTrue(DocumentStreamParser.hasParserForExtension("trecweb"));

        DocumentSplit split = DocumentSplitFactory.file(tmp, "qwe");
        DocumentStreamParser parser = DocumentStreamParser.create(split, Parameters.create());
        assertTrue(parser instanceof TrecWebParser);
    } finally {
        // Delete the temporary file even when an assertion or parser call above fails;
        // previously a failure skipped the cleanup and leaked the file.
        tmp.delete();
    }
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:30,代码来源:UniversalParserTest.java

示例11: selectsDatedSentenceParser

import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/**
 * Checks that {@code DocumentStreamParser.instance} returns a {@code DatedSentenceParser} when
 * the {@code filetype} parameter names that class, and that the parser sees the
 * {@code dataset} parameter.
 */
@Test
public void selectsDatedSentenceParser() throws IOException {
  File scratch = null;
  try {
    scratch = FileUtility.createTemporary();
    Parameters config =
        Parameters.parseArray("filetype", DatedSentenceParser.class.getName(), "dataset", "none");
    DatedSentenceParser chosen = (DatedSentenceParser)
        DocumentStreamParser.instance(DocumentSplitFactory.file(scratch), config);
    assertEquals("none", chosen.conf.getString("dataset"));
  } finally {
    // Only attempt cleanup when the temporary file was actually created.
    if (scratch != null) {
      assertTrue(scratch.delete());
    }
  }
}
 
开发者ID:jjfiv,项目名称:ecir2015timebooks,代码行数:14,代码来源:DatedSentenceParserTest.java

示例12: simpleParse

import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
@Test
public void simpleParse() throws IOException {
  String data = "<http://dbpedia.org/resource/Anarchism> <http://dbpedia.org/ontology/abstract> \"\"@en .\n" +
      "<http://dbpedia.org/resource/Autism> <http://dbpedia.org/ontology/abstract> \"Autism is a disorder of neural development characterized by impaired social interaction and communication, and by restricted and repetitive behavior.\"@en .\n" +
      "<http://dbpedia.org/resource/Achilles> <http://dbpedia.org/ontology/abstract> \"In Greek mythology, Achilles was a Greek hero of the Trojan War and the central character and greatest warrior of Homer's Iliad.\"@en .\n";

  File tmp = File.createTempFile("fake-dbpedia-abstracts", ".ttl");
  try {
    Utility.copyStringToFile(data, tmp);
    DocumentStreamParser ps = new DbpediaAbstractParser(DocumentSplitFactory.file(tmp), Parameters.instance());

    Document autism = ps.nextDocument();
    assertNotNull(autism);
    Document achilles = ps.nextDocument();
    assertNotNull(achilles);
    assertNull(ps.nextDocument());
    assertNull(ps.nextDocument());

    assertEquals("Autism", autism.name);
    assertEquals("Achilles", achilles.name);
    assertEquals("<title>Achilles</title>\n<body>In Greek mythology, Achilles was a Greek hero of the Trojan War and the central character and greatest warrior of Homer's Iliad.</body>", achilles.text);

  } finally {
    assertTrue(tmp.delete());
  }

}
 
开发者ID:jjfiv,项目名称:ecir2015timebooks,代码行数:28,代码来源:DbpediaAbstractParserTest.java

示例13: processFile

import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/**
 * Turns a single path into the list of {@link DocumentSplit}s that should be parsed for it.
 *
 * <p>Dispatch order, as implemented below:
 * <ol>
 *   <li>directories are delegated to {@code processDirectory};</li>
 *   <li>a missing file is handled according to the {@code inputPolicy} parameter
 *       (default {@code "require"});</li>
 *   <li>uncompressed {@code .zip} files go to {@code processZipFile} and {@code .list} files
 *       to {@code processListFile}, regardless of any forced {@code filetype};</li>
 *   <li>if no {@code filetype} is forced: {@code .subcoll} files go to
 *       {@code processSubCollectionFile}, a registered extension is used as the file type,
 *       an uncompressed file named {@code corpus} or recognised by
 *       {@code BTreeFactory.isBTree} goes to {@code processCorpusFile}, and anything else is
 *       passed to {@code detectTrecTextOrWeb};</li>
 *   <li>a resolved file type yields a single split; otherwise a warning is logged and an
 *       empty list is returned.</li>
 * </ol>
 *
 * @param fp the file or directory to process
 * @param conf configuration; the keys {@code inputPolicy} and {@code filetype} are read here
 *     and {@code conf} is passed on to the delegated helpers
 * @return the splits for {@code fp}; empty when the file is skipped or no parser is found
 * @throws IOException if the file is missing under the {@code "require"} policy, or
 *     propagated from the delegated helpers
 * @throws IllegalArgumentException if the file is missing and {@code inputPolicy} is neither
 *     {@code "require"} nor {@code "warn"}
 */
public static List<DocumentSplit> processFile(File fp, Parameters conf) throws IOException {
  // Be smart here, so we delegate to processDirectory as needed.
  if(fp.isDirectory()) {
    return processDirectory(fp, conf);
  }

  // Policy for missing input files; defaults to failing hard.
  String inputPolicy = conf.get("inputPolicy", "require");

  // Optional user-supplied file type that overrides detection (null when absent).
  String forceFileType = conf.get("filetype", (String) null);

  ArrayList<DocumentSplit> documents = new ArrayList<>();


  // First, make sure this file exists. If not, whine about it.
  if (!fp.exists()) {
    switch (inputPolicy) {
      case "require":
        throw new IOException(String.format("File %s was not found. Exiting.\n", fp));
      case "warn":
        logger.warning(String.format("File %s was not found. Skipping.\n", fp));
        return Collections.emptyList();
      default:
        throw new IllegalArgumentException("No such inputPolicy=" + inputPolicy);
    }
  }

  // Now try to detect what kind of file this is:
  boolean isCompressed = StreamCreator.isCompressed(fp.getName());
  String fileType = forceFileType;
  String extension = FSUtil.getExtension(fp);

  // don't allow forcing of filetype on zip files;
  // expect that the "force" applies to the inside
  // only process zip files; don't process zip files somebody has re-compressed
  if (!isCompressed && extension.equals("zip")) {
    documents.addAll(processZipFile(fp, conf));
    return documents;
  }

  // don't allow forcing of filetype on list files:
  // expect that the "force" applies to the inside
  if (extension.equals("list")) {
    documents.addAll(processListFile(fp, conf));
    return documents; // now considered processed
  }

  // We'll try to detect by extension first, so we don't have to open the file
  if (fileType == null) {
    if (extension.equals("subcoll")) {
      documents.addAll(processSubCollectionFile(fp, conf));
      return documents; // now considered processed
    }

    if (DocumentStreamParser.hasParserForExtension(extension)) {
      fileType = extension;

    } else if (!isCompressed && (fp.getName().equals("corpus") || (BTreeFactory.isBTree(fp)))) {
      // perhaps the user has renamed the corpus index, but not if they compressed it
      // we need random access and even bz2 is dumb. just (b|g)?unzip it.
      documents.addAll(processCorpusFile(fp, conf));
      return documents; // done now;

    } else {
      // finally try to be 'clever'...
      // NOTE(review): the stream opened here is handed to detectTrecTextOrWeb; whether that
      // helper closes it is not visible from this method - confirm.
      fileType = detectTrecTextOrWeb(StreamCreator.openInputStream(fp), fp.getAbsolutePath());
    }
  }

  // Eventually it'd be nice to do more format detection here.

  if (fileType != null) {
    DocumentSplit split = DocumentSplitFactory.file(fp, fileType);
    return Collections.singletonList(split);
  }else {
      logger.warning(String.format("No parser found for file extension: %s.\n", extension));
  }

  return Collections.emptyList();
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:80,代码来源:DocumentSource.java

示例14: split

import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/**
 * Shorthand for {@code DocumentSplitFactory.file(path)}.
 *
 * @param path path handed unchanged to the factory
 * @return the split created by the factory
 */
public static DocumentSplit split(String path) {
  DocumentSplit created = DocumentSplitFactory.file(path);
  return created;
}
 
开发者ID:jjfiv,项目名称:ecir2015timebooks,代码行数:4,代码来源:GalagoUtil.java

示例15: testSimpleData

import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
@Test
public void testSimpleData() throws Exception {
  File inF = null;
  File outF = null;
  try {
    inF = File.createTempFile("asd", "jkl");
	outF = File.createTempFile("asd", "jkl");

    Utility.copyStringToFile(
      "doc0\t0\t0\t1982\tThis is the way things are, here in 1982.\n" +
      "doc0\t0\t0\t1981\tThis is the way things were, last year, in 1981.\n" +
      "doc0\t0\t0\t1982\tBut 1982 wasn't always this good.\n" +
      "doc1\t0\t0\t1783\t1783 was a year that I keep using for examples.\n",
      inF
    );

    Main.main(new String[]{
      "--tool=doc-date-lm-collector",
      "--dataset=none",
      "--what=books",
      "--input=" + inF.getAbsolutePath(),
      "--output=" + outF.getAbsolutePath()
    });

    DocumentSplit written = DocumentSplitFactory.file(outF);
    DocumentStreamParser docsSP = new DocDateSketchParser(written, Parameters.instance());

    List<Document> docs = new ArrayList<Document>();
    while(true) {
      Document d = docsSP.nextDocument();
      if(d == null) break;
      docs.add(d);
    }

    Assert.assertEquals(3, docs.size());
    Assert.assertEquals("doc0", docs.get(0).metadata.get("book"));
    Assert.assertEquals("doc0", docs.get(1).metadata.get("book"));
    Assert.assertEquals("doc1", docs.get(2).metadata.get("book"));
    Assert.assertEquals("1783", docs.get(2).metadata.get("year"));
    String year0 = docs.get(0).metadata.get("year");
    String year1 = docs.get(1).metadata.get("year");
    Assert.assertTrue(("1981".equals(year0) && "1982".equals(year1)) || ("1982".equals(year0) && "1981".equals(year1)));


  } finally {
    Assert.assertNotNull(inF);
    Assert.assertTrue(inF.delete());
    Assert.assertNotNull(outF);
    Assert.assertTrue(outF.delete());
  }

}
 
开发者ID:jjfiv,项目名称:ecir2015timebooks,代码行数:53,代码来源:DocDateLMCollectorTest.java


注:本文中的org.lemurproject.galago.core.util.DocumentSplitFactory.file方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。