本文整理汇总了Java中org.lemurproject.galago.core.util.DocumentSplitFactory.file方法的典型用法代码示例。如果您正苦于以下问题:Java DocumentSplitFactory.file方法的具体用法?Java DocumentSplitFactory.file怎么用?Java DocumentSplitFactory.file使用的例子?那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.lemurproject.galago.core.util.DocumentSplitFactory的用法示例。
在下文中一共展示了DocumentSplitFactory.file方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: run
import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/**
 * Streams every file listed under the "inputPath" parameter through the processor.
 *
 * <p>Each line read increments the {@code lines} counter; lines starting with '#' are
 * counted but not forwarded. After all inputs have been read, the processor is closed.
 *
 * @throws IOException if reading an input or processing a line fails
 */
@Override
public void run() throws IOException {
  for (String f : p.getAsList("inputPath", String.class)) {
    DocumentSplit split = DocumentSplitFactory.file(f);
    // try-with-resources guarantees the reader is closed even when readLine()
    // or processor.process() throws part-way through a file.
    try (BufferedReader reader = DocumentStreamParser.getBufferedReader(split)) {
      String line;
      while ((line = reader.readLine()) != null) {
        lines.increment();
        if (line.startsWith("#")) {
          continue;
        }
        processor.process(line);
      }
    }
  }
  processor.close();
}
示例2: processZipFile
import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/**
 * Produces one {@link DocumentSplit} for every entry name listed in the given zip archive.
 *
 * <p>If the configuration carries a "filetype" value it is used for every entry. Otherwise
 * the entry's file extension is used when a parser is registered for it, and as a last
 * resort the entry's stream is handed to {@code detectTrecTextOrWeb}.
 *
 * @param fp the zip archive on disk
 * @param conf configuration; only the optional "filetype" key is read here
 * @return the splits, each pointing at {@code fp} with {@code innerName} set to the entry name
 * @throws IOException if the archive cannot be opened or read
 */
public static List<DocumentSplit> processZipFile(File fp, Parameters conf) throws IOException {
  final String forcedType = conf.get("filetype", (String) null);
  ArrayList<DocumentSplit> result = new ArrayList<>();
  try (ZipFile archive = ZipUtil.open(fp)) {
    for (String entryName : ZipUtil.listZipFile(archive)) {
      String entryType;
      if (forcedType != null) {
        entryType = forcedType;
      } else {
        String ext = FSUtil.getExtension(new File(entryName));
        entryType = DocumentStreamParser.hasParserForExtension(ext)
            ? ext
            : detectTrecTextOrWeb(
                ZipUtil.streamZipEntry(archive, entryName),
                fp.getAbsolutePath() + "!" + entryName);
      }
      // Every entry shares the archive path; innerName distinguishes them.
      DocumentSplit entrySplit = DocumentSplitFactory.file(fp);
      entrySplit.fileType = entryType;
      entrySplit.innerName = entryName;
      result.add(entrySplit);
    }
  }
  return result;
}
示例3: testAppTestGenDoc
import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/**
 * Checks that a document generated by {@code AppTest.trecDocument} is read back by
 * {@code TrecTextParser} as exactly one document with the expected name and text.
 */
@Test
public void testAppTestGenDoc() throws IOException {
  final String trecContent =
      AppTest.trecDocument("CACM-0001", "This is some text in a document.\n");
  final File tempFile = FileUtility.createTemporary();
  try {
    StreamUtil.copyStringToFile(trecContent, tempFile);
    DocumentSplit trecSplit = DocumentSplitFactory.file(tempFile, "trectext");
    TrecTextParser trecParser = new TrecTextParser(trecSplit, Parameters.create());

    Document first = trecParser.nextDocument();
    assertNotNull(first);
    assertEquals("CACM-0001", first.name);
    assertEquals("<TEXT>\nThis is some text in a document.\n</TEXT>\n", first.text);

    // The file holds a single document, so the next read must report the end.
    Document afterEnd = trecParser.nextDocument();
    assertNull(afterEnd);
  } finally {
    tempFile.delete();
  }
}
示例4: testDocumentStreamParser
import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/**
 * Checks that {@code DocumentStreamParser.create} yields a parser for a "trectext" split
 * that reads back exactly one document with the expected name and text.
 */
@Test
public void testDocumentStreamParser() throws IOException {
  final String trecContent =
      AppTest.trecDocument("CACM-0001", "This is some text in a document.\n");
  final File tempFile = FileUtility.createTemporary();
  try {
    StreamUtil.copyStringToFile(trecContent, tempFile);
    DocumentSplit trecSplit = DocumentSplitFactory.file(tempFile, "trectext");
    DocumentStreamParser streamParser =
        DocumentStreamParser.create(trecSplit, Parameters.create());

    Document first = streamParser.nextDocument();
    assertNotNull(first);
    assertEquals("CACM-0001", first.name);
    assertEquals("<TEXT>\nThis is some text in a document.\n</TEXT>\n", first.text);

    // Only one document was written, so the stream must now be exhausted.
    Document afterEnd = streamParser.nextDocument();
    assertNull(afterEnd);
  } finally {
    tempFile.delete();
  }
}
示例5: testParseOneDocument
import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/**
 * Parses a hand-written single-document TREC text file with {@code TrecTextParser} and
 * checks the document name, the retained TEXT section, and the end-of-input signal.
 */
@Test
public void testParseOneDocument() throws IOException {
  final String trecContent =
      "<DOC>\n" +
      "<DOCNO>CACM-0001</DOCNO>\n" +
      "<TEXT>\n" +
      "This is some text in a document.\n" +
      "</TEXT>\n" +
      "</DOC>\n";
  final File tempFile = FileUtility.createTemporary();
  try {
    StreamUtil.copyStringToFile(trecContent, tempFile);
    DocumentSplit plainSplit = DocumentSplitFactory.file(tempFile);
    TrecTextParser trecParser = new TrecTextParser(plainSplit, Parameters.create());

    Document first = trecParser.nextDocument();
    assertNotNull(first);
    assertEquals("CACM-0001", first.name);
    assertEquals("<TEXT>\nThis is some text in a document.\n</TEXT>\n", first.text);

    // Nothing follows the single document.
    Document afterEnd = trecParser.nextDocument();
    assertNull(afterEnd);
  } finally {
    tempFile.delete();
  }
}
示例6: testParseNothing
import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/**
 * Checks that {@code TrecWebParser} reports no document when pointed at the temporary
 * file without any content having been written to it.
 */
@Test
public void testParseNothing() throws IOException {
  final File emptyFile = FileUtility.createTemporary();
  emptyFile.createNewFile();
  try {
    TrecWebParser webParser =
        new TrecWebParser(DocumentSplitFactory.file(emptyFile), Parameters.create());
    assertNull(webParser.nextDocument());
  } finally {
    emptyFile.delete();
  }
}
示例7: testParseOneDocument
import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/**
 * Parses a hand-written single-document TREC web file with {@code TrecWebParser} and
 * checks the document name, the "url" metadata value, the body text, and the
 * end-of-input signal.
 */
@Test
public void testParseOneDocument() throws IOException {
  final String trecWebContent =
      "<DOC>\n" +
      "<DOCNO>CACM-0001</DOCNO>\n" +
      "<DOCHDR>\n" +
      "http://www.yahoo.com:80 some extra text here\n" +
      "even more text in this part\n" +
      "</DOCHDR>\n" +
      "This is some text in a document.\n" +
      "</DOC>\n";
  final File tempFile = FileUtility.createTemporary();
  try {
    StreamUtil.copyStringToFile(trecWebContent, tempFile);
    DocumentSplit plainSplit = DocumentSplitFactory.file(tempFile);
    TrecWebParser webParser = new TrecWebParser(plainSplit, Parameters.create());

    Document first = webParser.nextDocument();
    assertNotNull(first);
    assertEquals("CACM-0001", first.name);
    // The expected url carries neither the ":80" port nor the trailing header text.
    assertEquals("http://www.yahoo.com", first.metadata.get("url"));
    assertEquals("This is some text in a document.\n", first.text);

    Document afterEnd = webParser.nextDocument();
    assertNull(afterEnd);
  } finally {
    tempFile.delete();
  }
}
示例8: testParseNothing
import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/**
 * Checks that {@code TrecTextParser} reports no document when pointed at the temporary
 * file without any content having been written to it.
 */
@Test
public void testParseNothing() throws IOException {
  final File emptyFile = FileUtility.createTemporary();
  emptyFile.createNewFile();
  try {
    TrecTextParser trecParser =
        new TrecTextParser(DocumentSplitFactory.file(emptyFile, "trectext"), Parameters.create());
    assertNull(trecParser.nextDocument());
  } finally {
    emptyFile.delete();
  }
}
示例9: testParseTwoDocuments
import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/**
 * Parses a TREC text file holding two documents and checks that {@code TrecTextParser}
 * returns them in file order with the expected names and text, then signals the end.
 */
@Test
public void testParseTwoDocuments() throws IOException {
  final String trecContent =
      "<DOC>\n" +
      "<DOCNO>CACM-0001</DOCNO>\n" +
      "<TEXT>\n" +
      "This is some text in a document.\n" +
      "</TEXT>\n" +
      "</DOC>\n" +
      "<DOC>\n" +
      "<DOCNO>CACM-0002</DOCNO>\n" +
      "<TEXT>\n" +
      "This is some text in a document.\n" +
      "</TEXT>\n" +
      "</DOC>\n";
  final File tempFile = FileUtility.createTemporary();
  try {
    StreamUtil.copyStringToFile(trecContent, tempFile);
    DocumentSplit plainSplit = DocumentSplitFactory.file(tempFile);
    TrecTextParser trecParser = new TrecTextParser(plainSplit, Parameters.create());

    // Both documents share the same body; only the name differs.
    for (String expectedName : new String[] {"CACM-0001", "CACM-0002"}) {
      Document current = trecParser.nextDocument();
      assertNotNull(current);
      assertEquals(expectedName, current.name);
      assertEquals("<TEXT>\nThis is some text in a document.\n</TEXT>\n", current.text);
    }

    Document afterEnd = trecParser.nextDocument();
    assertNull(afterEnd);
  } finally {
    tempFile.delete();
  }
}
示例10: testExtensions
import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/**
 * Registers three external parser mappings ("qqe", "qwe", "trecweb") and checks that
 * {@code DocumentStreamParser} reports a parser for each of them and creates a
 * {@code TrecWebParser} for a split whose file type is "qwe".
 *
 * <p>The temporary file is removed in a {@code finally} block so that a failing
 * assertion or parser construction does not leave it behind.
 */
@Test
public void testExtensions() throws IOException {
  File tmp = FileUtility.createTemporary();
  try {
    Parameters p = Parameters.create();
    p.set("parser", Parameters.create());
    List<Parameters> kinds = new ArrayList<>();
    kinds.add(Parameters.parseArray("filetype", "qqe",
        "class", TrecTextParser.class.getName()));
    kinds.add(Parameters.parseArray("filetype", "qwe",
        "class", TrecWebParser.class.getName()));
    kinds.add(Parameters.parseArray("filetype", "trecweb",
        "class", TrecWebParser.class.getName()));
    p.getMap("parser").put("externalParsers", kinds);

    // Registration is exercised with both the nested "parser" map and the top-level parameters.
    DocumentStreamParser.addExternalParsers(p.getMap("parser"));
    DocumentStreamParser.addExternalParsers(p);

    assertTrue(DocumentStreamParser.hasParserForExtension("qwe"));
    assertTrue(DocumentStreamParser.hasParserForExtension("qqe"));
    assertTrue(DocumentStreamParser.hasParserForExtension("trecweb"));

    DocumentSplit split = DocumentSplitFactory.file(tmp, "qwe");
    DocumentStreamParser parser = DocumentStreamParser.create(split, Parameters.create());
    assertTrue(parser instanceof TrecWebParser);
  } finally {
    // Clean up even when one of the assertions above fails.
    tmp.delete();
  }
}
示例11: selectsDatedSentenceParser
import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/**
 * Checks that passing the {@code DatedSentenceParser} class name as "filetype" makes
 * {@code DocumentStreamParser.instance} return a {@code DatedSentenceParser} whose
 * configuration carries the supplied "dataset" value.
 */
@Test
public void selectsDatedSentenceParser() throws IOException {
  File scratch = null;
  try {
    scratch = FileUtility.createTemporary();
    final String parserClassName = DatedSentenceParser.class.getName();
    Parameters buildParams = Parameters.parseArray("filetype", parserClassName, "dataset", "none");
    DocumentSplit placeholderSplit = DocumentSplitFactory.file(scratch);
    DocumentStreamParser created = DocumentStreamParser.instance(placeholderSplit, buildParams);
    DatedSentenceParser datedParser = (DatedSentenceParser) created;
    assertEquals("none", datedParser.conf.getString("dataset"));
  } finally {
    // scratch stays null when creating the temporary file itself failed.
    if (scratch != null) {
      assertTrue(scratch.delete());
    }
  }
}
示例12: simpleParse
import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/**
 * Feeds three DBpedia abstract triples to {@code DbpediaAbstractParser} and expects two
 * documents back (Autism, Achilles); the first triple has an empty abstract literal.
 * Also checks that the parser keeps returning null once exhausted.
 */
@Test
public void simpleParse() throws IOException {
  final String triples =
      "<http://dbpedia.org/resource/Anarchism> <http://dbpedia.org/ontology/abstract> \"\"@en .\n"
      + "<http://dbpedia.org/resource/Autism> <http://dbpedia.org/ontology/abstract> \"Autism is a disorder of neural development characterized by impaired social interaction and communication, and by restricted and repetitive behavior.\"@en .\n"
      + "<http://dbpedia.org/resource/Achilles> <http://dbpedia.org/ontology/abstract> \"In Greek mythology, Achilles was a Greek hero of the Trojan War and the central character and greatest warrior of Homer's Iliad.\"@en .\n";
  final File ttlFile = File.createTempFile("fake-dbpedia-abstracts", ".ttl");
  try {
    Utility.copyStringToFile(triples, ttlFile);
    DocumentSplit ttlSplit = DocumentSplitFactory.file(ttlFile);
    DocumentStreamParser abstractParser = new DbpediaAbstractParser(ttlSplit, Parameters.instance());

    Document firstDoc = abstractParser.nextDocument();
    assertNotNull(firstDoc);
    Document secondDoc = abstractParser.nextDocument();
    assertNotNull(secondDoc);

    // Asking twice confirms the parser stays exhausted.
    assertNull(abstractParser.nextDocument());
    assertNull(abstractParser.nextDocument());

    assertEquals("Autism", firstDoc.name);
    assertEquals("Achilles", secondDoc.name);
    assertEquals("<title>Achilles</title>\n<body>In Greek mythology, Achilles was a Greek hero of the Trojan War and the central character and greatest warrior of Homer's Iliad.</body>", secondDoc.text);
  } finally {
    assertTrue(ttlFile.delete());
  }
}
示例13: processFile
import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/**
 * Turns a single input path into the list of {@link DocumentSplit}s it contributes.
 *
 * <p>The checks run in this order, and the first one that matches decides the result:
 * <ol>
 *   <li>directories are delegated to {@code processDirectory};</li>
 *   <li>a missing file is handled according to the "inputPolicy" parameter;</li>
 *   <li>uncompressed ".zip" files are delegated to {@code processZipFile};</li>
 *   <li>".list" files are delegated to {@code processListFile};</li>
 *   <li>if no "filetype" is forced: ".subcoll" files go to {@code processSubCollectionFile},
 *       a registered extension becomes the file type, an uncompressed file named "corpus"
 *       or recognised by {@code BTreeFactory.isBTree} goes to {@code processCorpusFile},
 *       and anything else is passed to {@code detectTrecTextOrWeb}.</li>
 * </ol>
 *
 * @param fp the file or directory to inspect
 * @param conf configuration; reads "inputPolicy" (default "require") and the optional "filetype"
 * @return the splits for this path; an empty list when the file is missing under the "warn"
 *     policy or when no file type could be determined
 * @throws IOException if the file does not exist and the input policy is "require"
 * @throws IllegalArgumentException if the file does not exist and the input policy is
 *     neither "require" nor "warn"
 */
public static List<DocumentSplit> processFile(File fp, Parameters conf) throws IOException {
// Be smart here, so we delegate to processDirectory as needed.
if(fp.isDirectory()) {
return processDirectory(fp, conf);
}
String inputPolicy = conf.get("inputPolicy", "require");
String forceFileType = conf.get("filetype", (String) null);
ArrayList<DocumentSplit> documents = new ArrayList<>();
// First, make sure this file exists. If not, whine about it.
// The policy is only consulted for missing files, so an unknown policy value
// goes unnoticed as long as every input exists.
if (!fp.exists()) {
switch (inputPolicy) {
case "require":
throw new IOException(String.format("File %s was not found. Exiting.\n", fp));
case "warn":
logger.warning(String.format("File %s was not found. Skipping.\n", fp));
return Collections.emptyList();
default:
throw new IllegalArgumentException("No such inputPolicy=" + inputPolicy);
}
}
// Now try to detect what kind of file this is:
boolean isCompressed = StreamCreator.isCompressed(fp.getName());
String fileType = forceFileType;
String extension = FSUtil.getExtension(fp);
// don't allow forcing of filetype on zip files;
// expect that the "force" applies to the inside
// only process zip files; don't process zip files somebody has re-compressed
if (!isCompressed && extension.equals("zip")) {
documents.addAll(processZipFile(fp, conf));
return documents;
}
// don't allow forcing of filetype on list files:
// expect that the "force" applies to the inside
if (extension.equals("list")) {
documents.addAll(processListFile(fp, conf));
return documents; // now considered processed
}
// We'll try to detect by extension first, so we don't have to open the file
if (fileType == null) {
if (extension.equals("subcoll")) {
documents.addAll(processSubCollectionFile(fp, conf));
return documents; // now considered processed
}
if (DocumentStreamParser.hasParserForExtension(extension)) {
fileType = extension;
} else if (!isCompressed && (fp.getName().equals("corpus") || (BTreeFactory.isBTree(fp)))) {
// perhaps the user has renamed the corpus index, but not if they compressed it
// we need random access and even bz2 is dumb. just (b|g)?unzip it.
documents.addAll(processCorpusFile(fp, conf));
return documents; // done now;
} else {
// finally try to be 'clever'...
fileType = detectTrecTextOrWeb(StreamCreator.openInputStream(fp), fp.getAbsolutePath());
}
}
// Eventually it'd be nice to do more format detection here.
// fileType can only still be null here when content detection above returned null.
if (fileType != null) {
DocumentSplit split = DocumentSplitFactory.file(fp, fileType);
return Collections.singletonList(split);
}else {
logger.warning(String.format("No parser found for file extension: %s.\n", extension));
}
return Collections.emptyList();
}
示例14: split
import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/**
 * Convenience shorthand that builds a {@link DocumentSplit} for a path by delegating to
 * {@code DocumentSplitFactory.file}.
 *
 * @param path the path handed unchanged to the factory
 * @return the split produced by the factory
 */
public static DocumentSplit split(String path) {
return DocumentSplitFactory.file(path);
}
示例15: testSimpleData
import org.lemurproject.galago.core.util.DocumentSplitFactory; //导入方法依赖的package包/类
/**
 * Runs the "doc-date-lm-collector" tool over four tab-separated input rows and reads
 * its output back with {@code DocDateSketchParser}.
 *
 * <p>Three documents are expected: two for "doc0" (years 1981 and 1982, in either order)
 * and one for "doc1" (year 1783).
 */
@Test
public void testSimpleData() throws Exception {
  File inputFile = null;
  File outputFile = null;
  try {
    inputFile = File.createTempFile("asd", "jkl");
    outputFile = File.createTempFile("asd", "jkl");

    final String rows =
        "doc0\t0\t0\t1982\tThis is the way things are, here in 1982.\n"
        + "doc0\t0\t0\t1981\tThis is the way things were, last year, in 1981.\n"
        + "doc0\t0\t0\t1982\tBut 1982 wasn't always this good.\n"
        + "doc1\t0\t0\t1783\t1783 was a year that I keep using for examples.\n";
    Utility.copyStringToFile(rows, inputFile);

    final String[] toolArgs = {
      "--tool=doc-date-lm-collector",
      "--dataset=none",
      "--what=books",
      "--input=" + inputFile.getAbsolutePath(),
      "--output=" + outputFile.getAbsolutePath()
    };
    Main.main(toolArgs);

    DocumentSplit writtenSplit = DocumentSplitFactory.file(outputFile);
    DocumentStreamParser sketchParser = new DocDateSketchParser(writtenSplit, Parameters.instance());

    // Drain the parser; a null document marks the end of the output.
    List<Document> parsed = new ArrayList<>();
    for (Document next = sketchParser.nextDocument(); next != null; next = sketchParser.nextDocument()) {
      parsed.add(next);
    }

    Assert.assertEquals(3, parsed.size());
    Assert.assertEquals("doc0", parsed.get(0).metadata.get("book"));
    Assert.assertEquals("doc0", parsed.get(1).metadata.get("book"));
    Assert.assertEquals("doc1", parsed.get(2).metadata.get("book"));
    Assert.assertEquals("1783", parsed.get(2).metadata.get("year"));

    // The two doc0 entries may arrive in either year order.
    String firstYear = parsed.get(0).metadata.get("year");
    String secondYear = parsed.get(1).metadata.get("year");
    boolean ascending = "1981".equals(firstYear) && "1982".equals(secondYear);
    boolean descending = "1982".equals(firstYear) && "1981".equals(secondYear);
    Assert.assertTrue(ascending || descending);
  } finally {
    Assert.assertNotNull(inputFile);
    Assert.assertTrue(inputFile.delete());
    Assert.assertNotNull(outputFile);
    Assert.assertTrue(outputFile.delete());
  }
}