This article collects typical usage examples of the Java class org.lemurproject.galago.core.util.DocumentSplitFactory. If you have been wondering what exactly DocumentSplitFactory does, how to use it, or where to find usage examples, the curated code samples below may help.
The DocumentSplitFactory class belongs to the org.lemurproject.galago.core.util package. Fifteen code examples of the class are presented below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Java code examples.
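Before the examples, here is a minimal orientation sketch covering the three factory methods exercised below. The file paths are invented, and the import location of DocumentSplit is assumed from the usual Galago source layout; treat this as an illustration pieced together from the examples, not official documentation.
import java.io.File;
import org.lemurproject.galago.core.types.DocumentSplit;
import org.lemurproject.galago.core.util.DocumentSplitFactory;

public class DocumentSplitFactorySketch {
  public static void main(String[] args) {
    // From a path string; the file type is left for later detection (Example 1).
    DocumentSplit byPath = DocumentSplitFactory.file("/tmp/docs.trectext");
    // From a File with an explicit file type (Examples 3, 4, and 9).
    DocumentSplit typed = DocumentSplitFactory.file(new File("/tmp/docs.txt"), "trectext");
    // A numbered split carrying an ordinal position and a total count (Examples 6 and 15).
    DocumentSplit numbered = DocumentSplitFactory.numberedFile("/tmp/part-0", 0, 3);
    // fileType is a public field; Example 2 assigns it directly.
    System.out.println(typed.fileType);
  }
}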
Example 1: run
import org.lemurproject.galago.core.util.DocumentSplitFactory; // import the required package/class
@Override
public void run() throws IOException {
  BufferedReader reader;
  for (String f : p.getAsList("inputPath", String.class)) {
    DocumentSplit split = DocumentSplitFactory.file(f);
    reader = DocumentStreamParser.getBufferedReader(split);
    String line;
    while (null != (line = reader.readLine())) {
      lines.increment();
      // skip comment lines
      if (line.startsWith("#")) {
        continue;
      }
      processor.process(line);
    }
    reader.close();
  }
  processor.close();
}
Example 2: processZipFile
import org.lemurproject.galago.core.util.DocumentSplitFactory; // import the required package/class
public static List<DocumentSplit> processZipFile(File fp, Parameters conf) throws IOException {
  String forceFileType = conf.get("filetype", (String) null);
  ArrayList<DocumentSplit> splits = new ArrayList<>();
  try (ZipFile zipF = ZipUtil.open(fp)) {
    List<String> names = ZipUtil.listZipFile(zipF);
    for (String name : names) {
      String fileType = forceFileType;
      if (fileType == null) {
        File inside = new File(name);
        String extension = FSUtil.getExtension(inside);
        if (DocumentStreamParser.hasParserForExtension(extension)) {
          fileType = extension;
        } else {
          // no parser registered for this extension: sniff the entry's contents
          fileType = detectTrecTextOrWeb(ZipUtil.streamZipEntry(zipF, name), fp.getAbsolutePath() + "!" + name);
        }
      }
      DocumentSplit split = DocumentSplitFactory.file(fp);
      split.fileType = fileType;
      split.innerName = name;
      splits.add(split);
    }
  }
  return splits;
}
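A hypothetical invocation of this helper (archive name invented, and assuming the call is made from the class that defines it) could look like:
List<DocumentSplit> zipSplits = processZipFile(new File("corpus.zip"), Parameters.create());
for (DocumentSplit s : zipSplits) {
  // each split carries the archive path plus the entry name and detected type
  System.out.println(s.innerName + " -> " + s.fileType);
}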
Example 3: testAppTestGenDoc
import org.lemurproject.galago.core.util.DocumentSplitFactory; // import the required package/class
@Test
public void testAppTestGenDoc() throws IOException {
  String fileText = AppTest.trecDocument("CACM-0001", "This is some text in a document.\n");
  File f = FileUtility.createTemporary();
  try {
    StreamUtil.copyStringToFile(fileText, f);
    DocumentSplit split = DocumentSplitFactory.file(f, "trectext");
    TrecTextParser parser = new TrecTextParser(split, Parameters.create());
    Document document = parser.nextDocument();
    assertNotNull(document);
    assertEquals("CACM-0001", document.name);
    assertEquals("<TEXT>\nThis is some text in a document.\n</TEXT>\n", document.text);
    document = parser.nextDocument();
    assertNull(document);
  } finally {
    f.delete();
  }
}
Example 4: testDocumentStreamParser
import org.lemurproject.galago.core.util.DocumentSplitFactory; // import the required package/class
@Test
public void testDocumentStreamParser() throws IOException {
  String fileText = AppTest.trecDocument("CACM-0001", "This is some text in a document.\n");
  File f = FileUtility.createTemporary();
  try {
    StreamUtil.copyStringToFile(fileText, f);
    DocumentSplit split = DocumentSplitFactory.file(f, "trectext");
    DocumentStreamParser parser = DocumentStreamParser.create(split, Parameters.create());
    Document document = parser.nextDocument();
    assertNotNull(document);
    assertEquals("CACM-0001", document.name);
    assertEquals("<TEXT>\nThis is some text in a document.\n</TEXT>\n", document.text);
    document = parser.nextDocument();
    assertNull(document);
  } finally {
    f.delete();
  }
}
Example 5: testParseOneDocument
import org.lemurproject.galago.core.util.DocumentSplitFactory; // import the required package/class
@Test
public void testParseOneDocument() throws IOException {
  String fileText
      = "<DOC>\n"
      + "<DOCNO>CACM-0001</DOCNO>\n"
      + "<TEXT>\n"
      + "This is some text in a document.\n"
      + "</TEXT>\n"
      + "</DOC>\n";
  File f = FileUtility.createTemporary();
  try {
    StreamUtil.copyStringToFile(fileText, f);
    DocumentSplit split = DocumentSplitFactory.file(f);
    TrecTextParser parser = new TrecTextParser(split, Parameters.create());
    Document document = parser.nextDocument();
    assertNotNull(document);
    assertEquals("CACM-0001", document.name);
    assertEquals("<TEXT>\nThis is some text in a document.\n</TEXT>\n", document.text);
    document = parser.nextDocument();
    assertNull(document);
  } finally {
    f.delete();
  }
}
Example 6: run
import org.lemurproject.galago.core.util.DocumentSplitFactory; // import the required package/class
public void run() throws IOException {
  int i = 0;
  int total = parameters.getJSON().getList("inputPath").size();
  for (String inputIndex : parameters.getJSON().getList("inputPath", String.class)) {
    // number each split with its position and the total input count
    DocumentSplit split = DocumentSplitFactory.numberedFile(inputIndex, i, total);
    processor.process(split);
    i++;
  }
  processor.close();
}
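Here numberedFile(path, i, total) records each input's ordinal position along with the total count (compare Example 15, which passes indices 0, 1, 2 with a total of 3). A minimal hypothetical call, with an invented path:
DocumentSplit part = DocumentSplitFactory.numberedFile("/tmp/index-part-1", 1, 3);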
Example 7: testParseNothing
import org.lemurproject.galago.core.util.DocumentSplitFactory; // import the required package/class
@Test
public void testParseNothing() throws IOException {
  File f = FileUtility.createTemporary();
  f.createNewFile();
  try {
    DocumentSplit split = DocumentSplitFactory.file(f);
    TrecWebParser parser = new TrecWebParser(split, Parameters.create());
    Document document = parser.nextDocument();
    assertNull(document);
  } finally {
    f.delete();
  }
}
Example 8: testParseOneDocument
import org.lemurproject.galago.core.util.DocumentSplitFactory; // import the required package/class
@Test
public void testParseOneDocument() throws IOException {
  String fileText =
      "<DOC>\n"
      + "<DOCNO>CACM-0001</DOCNO>\n"
      + "<DOCHDR>\n"
      + "http://www.yahoo.com:80 some extra text here\n"
      + "even more text in this part\n"
      + "</DOCHDR>\n"
      + "This is some text in a document.\n"
      + "</DOC>\n";
  File f = FileUtility.createTemporary();
  try {
    StreamUtil.copyStringToFile(fileText, f);
    DocumentSplit split = DocumentSplitFactory.file(f);
    TrecWebParser parser = new TrecWebParser(split, Parameters.create());
    Document document = parser.nextDocument();
    assertNotNull(document);
    assertEquals("CACM-0001", document.name);
    assertEquals("http://www.yahoo.com", document.metadata.get("url"));
    assertEquals("This is some text in a document.\n", document.text);
    document = parser.nextDocument();
    assertNull(document);
  } finally {
    f.delete();
  }
}
Example 9: testParseNothing
import org.lemurproject.galago.core.util.DocumentSplitFactory; // import the required package/class
@Test
public void testParseNothing() throws IOException {
  File f = FileUtility.createTemporary();
  f.createNewFile();
  try {
    DocumentSplit split = DocumentSplitFactory.file(f, "trectext");
    TrecTextParser parser = new TrecTextParser(split, Parameters.create());
    Document document = parser.nextDocument();
    assertNull(document);
  } finally {
    f.delete();
  }
}
Example 10: testParseTwoDocuments
import org.lemurproject.galago.core.util.DocumentSplitFactory; // import the required package/class
@Test
public void testParseTwoDocuments() throws IOException {
  String fileText
      = "<DOC>\n"
      + "<DOCNO>CACM-0001</DOCNO>\n"
      + "<TEXT>\n"
      + "This is some text in a document.\n"
      + "</TEXT>\n"
      + "</DOC>\n"
      + "<DOC>\n"
      + "<DOCNO>CACM-0002</DOCNO>\n"
      + "<TEXT>\n"
      + "This is some text in a document.\n"
      + "</TEXT>\n"
      + "</DOC>\n";
  File f = FileUtility.createTemporary();
  try {
    StreamUtil.copyStringToFile(fileText, f);
    DocumentSplit split = DocumentSplitFactory.file(f);
    TrecTextParser parser = new TrecTextParser(split, Parameters.create());
    Document document = parser.nextDocument();
    assertNotNull(document);
    assertEquals("CACM-0001", document.name);
    assertEquals("<TEXT>\nThis is some text in a document.\n</TEXT>\n", document.text);
    document = parser.nextDocument();
    assertNotNull(document);
    assertEquals("CACM-0002", document.name);
    assertEquals("<TEXT>\nThis is some text in a document.\n</TEXT>\n", document.text);
    document = parser.nextDocument();
    assertNull(document);
  } finally {
    f.delete();
  }
}
Example 11: testExtensions
import org.lemurproject.galago.core.util.DocumentSplitFactory; // import the required package/class
@Test
public void testExtensions() throws IOException {
  File tmp = FileUtility.createTemporary();
  Parameters p = Parameters.create();
  p.set("parser", Parameters.create());
  // map two made-up extensions (and one standard one) onto existing parser classes
  List<Parameters> kinds = new ArrayList<Parameters>();
  kinds.add(Parameters.parseArray("filetype", "qqe",
      "class", TrecTextParser.class.getName()));
  kinds.add(Parameters.parseArray("filetype", "qwe",
      "class", TrecWebParser.class.getName()));
  kinds.add(Parameters.parseArray("filetype", "trecweb",
      "class", TrecWebParser.class.getName()));
  p.getMap("parser").put("externalParsers", kinds);
  DocumentStreamParser.addExternalParsers(p.getMap("parser"));
  DocumentStreamParser.addExternalParsers(p);
  assertTrue(DocumentStreamParser.hasParserForExtension("qwe"));
  assertTrue(DocumentStreamParser.hasParserForExtension("qqe"));
  assertTrue(DocumentStreamParser.hasParserForExtension("trecweb"));
  DocumentSplit split = DocumentSplitFactory.file(tmp, "qwe");
  DocumentStreamParser parser = DocumentStreamParser.create(split, Parameters.create());
  assertTrue(parser instanceof TrecWebParser);
  tmp.delete();
}
Example 12: selectsDatedSentenceParser
import org.lemurproject.galago.core.util.DocumentSplitFactory; // import the required package/class
@Test
public void selectsDatedSentenceParser() throws IOException {
  File tmp = null;
  try {
    tmp = FileUtility.createTemporary();
    Parameters buildP = Parameters.parseArray("filetype", DatedSentenceParser.class.getName(), "dataset", "none");
    DocumentSplit fakeSplit = DocumentSplitFactory.file(tmp);
    DatedSentenceParser dsp = (DatedSentenceParser) DocumentStreamParser.instance(fakeSplit, buildP);
    assertEquals("none", dsp.conf.getString("dataset"));
  } finally {
    if (tmp != null) assertTrue(tmp.delete());
  }
}
Example 13: simpleParse
import org.lemurproject.galago.core.util.DocumentSplitFactory; // import the required package/class
@Test
public void simpleParse() throws IOException {
  String data = "<http://dbpedia.org/resource/Anarchism> <http://dbpedia.org/ontology/abstract> \"\"@en .\n" +
      "<http://dbpedia.org/resource/Autism> <http://dbpedia.org/ontology/abstract> \"Autism is a disorder of neural development characterized by impaired social interaction and communication, and by restricted and repetitive behavior.\"@en .\n" +
      "<http://dbpedia.org/resource/Achilles> <http://dbpedia.org/ontology/abstract> \"In Greek mythology, Achilles was a Greek hero of the Trojan War and the central character and greatest warrior of Homer's Iliad.\"@en .\n";
  File tmp = File.createTempFile("fake-dbpedia-abstracts", ".ttl");
  try {
    Utility.copyStringToFile(data, tmp);
    DocumentStreamParser ps = new DbpediaAbstractParser(DocumentSplitFactory.file(tmp), Parameters.instance());
    // note: the first record (Anarchism) has an empty abstract and never surfaces
    Document autism = ps.nextDocument();
    assertNotNull(autism);
    Document achilles = ps.nextDocument();
    assertNotNull(achilles);
    assertNull(ps.nextDocument());
    assertNull(ps.nextDocument());
    assertEquals("Autism", autism.name);
    assertEquals("Achilles", achilles.name);
    assertEquals("<title>Achilles</title>\n<body>In Greek mythology, Achilles was a Greek hero of the Trojan War and the central character and greatest warrior of Homer's Iliad.</body>", achilles.text);
  } finally {
    assertTrue(tmp.delete());
  }
}
Example 14: processFile
import org.lemurproject.galago.core.util.DocumentSplitFactory; // import the required package/class
public static List<DocumentSplit> processFile(File fp, Parameters conf) throws IOException {
  // Be smart here, so we delegate to processDirectory as needed.
  if (fp.isDirectory()) {
    return processDirectory(fp, conf);
  }
  String inputPolicy = conf.get("inputPolicy", "require");
  String forceFileType = conf.get("filetype", (String) null);
  ArrayList<DocumentSplit> documents = new ArrayList<>();
  // First, make sure this file exists. If not, whine about it.
  if (!fp.exists()) {
    switch (inputPolicy) {
      case "require":
        throw new IOException(String.format("File %s was not found. Exiting.\n", fp));
      case "warn":
        logger.warning(String.format("File %s was not found. Skipping.\n", fp));
        return Collections.emptyList();
      default:
        throw new IllegalArgumentException("No such inputPolicy=" + inputPolicy);
    }
  }
  // Now try to detect what kind of file this is:
  boolean isCompressed = StreamCreator.isCompressed(fp.getName());
  String fileType = forceFileType;
  String extension = FSUtil.getExtension(fp);
  // Don't allow forcing of filetype on zip files;
  // expect that the "force" applies to the inside.
  // Only process zip files; don't process zip files somebody has re-compressed.
  if (!isCompressed && extension.equals("zip")) {
    documents.addAll(processZipFile(fp, conf));
    return documents;
  }
  // Don't allow forcing of filetype on list files:
  // expect that the "force" applies to the inside.
  if (extension.equals("list")) {
    documents.addAll(processListFile(fp, conf));
    return documents; // now considered processed
  }
  // We'll try to detect by extension first, so we don't have to open the file.
  if (fileType == null) {
    if (extension.equals("subcoll")) {
      documents.addAll(processSubCollectionFile(fp, conf));
      return documents; // now considered processed
    }
    if (DocumentStreamParser.hasParserForExtension(extension)) {
      fileType = extension;
    } else if (!isCompressed && (fp.getName().equals("corpus") || (BTreeFactory.isBTree(fp)))) {
      // Perhaps the user has renamed the corpus index, but not if they compressed it;
      // we need random access and even bz2 is dumb. Just (b|g)?unzip it.
      documents.addAll(processCorpusFile(fp, conf));
      return documents; // done now
    } else {
      // finally try to be 'clever'...
      fileType = detectTrecTextOrWeb(StreamCreator.openInputStream(fp), fp.getAbsolutePath());
    }
  }
  // Eventually it'd be nice to do more format detection here.
  if (fileType != null) {
    DocumentSplit split = DocumentSplitFactory.file(fp, fileType);
    return Collections.singletonList(split);
  } else {
    logger.warning(String.format("No parser found for file extension: %s.\n", extension));
  }
  return Collections.emptyList();
}
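Given the inputPolicy handling above, a caller can opt into skipping missing inputs instead of failing. A hypothetical call (path invented, assuming the call is made from the defining class):
Parameters conf = Parameters.parseArray("inputPolicy", "warn");
List<DocumentSplit> found = processFile(new File("/data/maybe-missing.trectext"), conf);
// with inputPolicy=warn, a missing file logs a warning and yields an empty list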
Example 15: testDocumentMappingCreator
import org.lemurproject.galago.core.util.DocumentSplitFactory; // import the required package/class
@Test
public void testDocumentMappingCreator() throws Exception {
  File index1 = null;
  File index2 = null;
  File index3 = null;
  try {
    index1 = FileUtility.createTemporaryDirectory();
    index2 = FileUtility.createTemporaryDirectory();
    index3 = FileUtility.createTemporaryDirectory();
    // three 10 document indexes (0 -> 9)
    makeNamesIndex(9, index1);
    makeNamesIndex(9, index2);
    makeNamesIndex(9, index3);
    Catcher<DocumentMappingData> catcher = new Catcher<DocumentMappingData>();
    DocumentNumberMapper mapper = new DocumentNumberMapper();
    mapper.setProcessor(catcher);
    mapper.process(DocumentSplitFactory.numberedFile(index1.getAbsolutePath(), 0, 3));
    mapper.process(DocumentSplitFactory.numberedFile(index2.getAbsolutePath(), 1, 3));
    mapper.process(DocumentSplitFactory.numberedFile(index3.getAbsolutePath(), 2, 3));
    mapper.close();
    assertEquals(0, catcher.data.get(0).indexId);
    assertEquals(0, catcher.data.get(0).docNumIncrement);
    assertEquals(1, catcher.data.get(1).indexId);
    assertEquals(10, catcher.data.get(1).docNumIncrement);
    assertEquals(2, catcher.data.get(2).indexId);
    assertEquals(20, catcher.data.get(2).docNumIncrement);
  } finally {
    if (index1 != null) {
      FSUtil.deleteDirectory(index1);
    }
    if (index2 != null) {
      FSUtil.deleteDirectory(index2);
    }
    if (index3 != null) {
      FSUtil.deleteDirectory(index3);
    }
  }
}
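The assertions suggest how the mapping is meant to be applied: with three 10-document indexes, documents from index 1 are offset by 10 and those from index 2 by 20, so a merged identifier is presumably the local document number plus docNumIncrement. For instance:
// hypothetical: local document 4 of index 2 maps to 4 + 20 = 24 in the merged numbering
long globalDocId = 4 + catcher.data.get(2).docNumIncrement;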