本文整理汇总了Java中org.lemurproject.galago.core.tokenize.Tokenizer类的典型用法代码示例。如果您正苦于以下问题:Java Tokenizer类的具体用法?Java Tokenizer怎么用?Java Tokenizer使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。
Tokenizer类属于org.lemurproject.galago.core.tokenize包,在下文中一共展示了Tokenizer类的12个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: Indexer
import org.lemurproject.galago.core.tokenize.Tokenizer; //导入依赖的package包/类
public Indexer(/*Parameters p ? */) throws Exception {
  // Pipeline overview:
  //   UniversalParser -> Tokenizer -> SequentialDocumentNumberer -> GeometricIndex
  // The universal parser is handed a file path and extracts a stream of
  // documents (possibly just one); the tokenizer splits each document into
  // word tokens; the numberer assigns a sequential document number; the
  // fully formed document is then given to the index.
  indexer = new UniversalParser(new FakeParameters(Parameters.create()));
  Tokenizer tokenizer = Tokenizer.create(Parameters.create());
  SequentialDocumentNumberer numberer = new SequentialDocumentNumberer();

  Parameters indexParams = Parameters.create();
  indexParams.set("shardDirectory", "/path/to/store/output/");
  indexParams.set("indexBlockSize", 100);
  indexParams.set("radix", 2);
  indexParams.set("mergeMode", "local");
  indexParams.set("stemming", true);
  indexParams.set("makecorpus", false);

  index = new GeometricIndex(new FakeParameters(indexParams));
  retrieval = new LocalRetrieval(index);

  // Wire the pipeline stages together in processing order.
  indexer.setProcessor(tokenizer);
  tokenizer.setProcessor(numberer);
  numberer.setProcessor(index);
}
示例2: fromStream
import org.lemurproject.galago.core.tokenize.Tokenizer; //导入依赖的package包/类
@Override
public Document fromStream(DataInputStream stream, Document.DocumentComponents selection) throws IOException {
  SerializerCommon.ByteBuf buffer = new SerializerCommon.ByteBuf();
  DataInputStream input = new DataInputStream(new SnappyInputStream(stream));
  // try/finally guarantees the decompressing stream is closed even when a
  // read fails partway through (the original leaked it on exception).
  try {
    Document d = new Document();
    int metadataSize = input.readInt();
    int textSize = input.readInt(); // present in the stream format, but not needed here
    // identifier
    d.identifier = input.readLong();
    // name
    d.name = buffer.readString(input);
    if (selection.metadata) {
      d.metadata = SerializerCommon.readMetadata(input, buffer);
      // only bother skipping past the metadata if we will read the text after it
    } else if (selection.text || selection.tokenize) {
      // InputStream.skip may skip fewer bytes than requested, so loop until
      // the whole metadata section has been consumed; a short skip here would
      // desynchronize every read that follows.
      long remaining = metadataSize;
      while (remaining > 0) {
        long skipped = input.skip(remaining);
        if (skipped <= 0) {
          throw new IOException("unexpected end of stream while skipping " + remaining + " metadata bytes");
        }
        remaining -= skipped;
      }
    }
    // can't get tokens without text in this case...
    if (selection.text || selection.tokenize) {
      d.text = SerializerCommon.readText(input, selection, buffer);
    }
    // give back terms & tags
    if (selection.tokenize) {
      // Tokenizer is *not* threadsafe, so we must make a copy of it for each use in case of threads.
      Tokenizer tokenizer = getTokenizer();
      tokenizer.tokenize(d);
    }
    return d;
  } finally {
    input.close();
  }
}
示例3: run
import org.lemurproject.galago.core.tokenize.Tokenizer; //导入依赖的package包/类
@Override
public void run(Parameters p, PrintStream output) throws Exception {
  // Open a retrieval over the configured index and tokenize the input file.
  Retrieval retrieval = RetrievalFactory.create(p);
  Tokenizer tokenizer = retrieval.getTokenizer();
  String rawText = StreamUtil.copyStreamToString(StreamCreator.openInputStream(p.getString("input")));
  Document tokenized = tokenizer.tokenize(rawText);

  // Gather per-term index statistics for every unique term in the document.
  HashSet<String> uniqueTerms = new HashSet<>(tokenized.terms);
  List<Parameters> termInfos = new ArrayList<>();
  for (String term : uniqueTerms) {
    NodeStatistics counts = retrieval.getNodeStatistics(new Node("counts", term));
    Parameters termStats = Parameters.create();
    termStats.set("term", term);
    termStats.set("cf", counts.nodeFrequency);
    termStats.set("maxTF", counts.maximumCount);
    termStats.set("df", counts.nodeDocumentCount);
    termInfos.add(termStats);
  }

  // Attach collection-level statistics and emit the report.
  FieldStatistics lengths = retrieval.getCollectionStatistics(new Node("lengths"));
  Parameters overall = Parameters.create();
  overall.put("clen", lengths.collectionLength);
  overall.put("terms", termInfos);
  if (p.get("pretty", true)) {
    output.println(overall.toPrettyString());
  } else {
    output.println(overall);
  }
}
示例4: getTokenizer
import org.lemurproject.galago.core.tokenize.Tokenizer; //导入依赖的package包/类
/**
 * {@inheritDoc}
 * Delegates to the first of the wrapped retrievals.
 */
@Override
public Tokenizer getTokenizer() {
  return retrievals.get(0).getTokenizer();
}
示例5: getTokenizer
import org.lemurproject.galago.core.tokenize.Tokenizer; //导入依赖的package包/类
/**
 * {@inheritDoc}
 * Delegates to the default retrieval group.
 */
@Override
public Tokenizer getTokenizer() {
  return this.groups.get(defGroup).getTokenizer();
}
示例6: getTokenizer
import org.lemurproject.galago.core.tokenize.Tokenizer; //导入依赖的package包/类
/**
 * {@inheritDoc}
 * Builds a fresh tokenizer from the index manifest, so its configuration
 * matches the parameters the index was created with.
 */
@Override
public Tokenizer getTokenizer() {
  Parameters manifest = this.index.getManifest();
  return Tokenizer.create(manifest);
}
示例7: AnchorTextDocumentCreator
import org.lemurproject.galago.core.tokenize.Tokenizer; //导入依赖的package包/类
public AnchorTextDocumentCreator(TupleFlowParameters tp) {
  // Configure the tokenizer from this TupleFlow step's JSON parameters.
  this.tokenizer = Tokenizer.create(tp.getJSON());
}
示例8: getTokenizerStep
import org.lemurproject.galago.core.tokenize.Tokenizer; //导入依赖的package包/类
/**
 * Builds the tokenizer pipeline step for the given parameters.
 * Convenience overload: resolves the tokenizer class from {@code p} via
 * {@code Tokenizer.getTokenizerClass} and delegates to the two-argument form.
 */
public static StepInformation getTokenizerStep(Parameters p) {
return getTokenizerStep(p, Tokenizer.getTokenizerClass(p));
}
示例9: testMergeFlushedSequentialIndexes
import org.lemurproject.galago.core.tokenize.Tokenizer; //导入依赖的package包/类
/**
 * Builds two in-memory indexes over disjoint document-number ranges, flushes
 * each to disk, merges them with the "merge-index" app, and verifies that the
 * merged index's statistics are the combination of the two inputs.
 */
@Test
public void testMergeFlushedSequentialIndexes() throws Exception {
File index1 = null;
File index2 = null;
File indexmerged = null;
try {
// Offset the second index's document numbers so the two ranges don't overlap
// (0-99 vs. 1000-1099); merging runs with renumberDocuments=false below.
Parameters p1 = Parameters.parseString("{\"documentNumberOffset\":0}");
Parameters p2 = Parameters.parseString("{\"documentNumberOffset\":1000}");
MemoryIndex mi1 = new MemoryIndex(new FakeParameters(p1));
MemoryIndex mi2 = new MemoryIndex(new FakeParameters(p2));
Tokenizer tok = Tokenizer.create(p1);
// 100 five-term docs in mi1 and 100 six-term docs in mi2.
for (int i = 0; i < 100; i++) {
Document d1 = new Document("DOCS1-" + i, "this is sample document " + i);
Document d2 = new Document("DOCS2-" + i, "this is a different document " + i);
tok.tokenize(d1);
tok.tokenize(d2);
mi1.process(d1);
mi2.process(d2);
}
// Flush both memory indexes to temporary on-disk indexes.
index1 = FileUtility.createTemporaryDirectory();
FlushToDisk.flushMemoryIndex(mi1, index1.getAbsolutePath());
index2 = FileUtility.createTemporaryDirectory();
FlushToDisk.flushMemoryIndex(mi2, index2.getAbsolutePath());
AppTest.verifyIndexStructures(index1);
AppTest.verifyIndexStructures(index2);
// Merge the two on-disk indexes without renumbering, preserving the
// original (offset) document numbers.
indexmerged = FileUtility.createTemporaryDirectory();
Parameters mergeParams = Parameters.create();
mergeParams.set("indexPath", indexmerged.getAbsolutePath());
ArrayList<String> inputs = new ArrayList<String>();
inputs.add(index1.getAbsolutePath());
inputs.add(index2.getAbsolutePath());
mergeParams.set("inputPath", inputs);
mergeParams.set("renumberDocuments", false);
App.run("merge-index", mergeParams, System.out);
AppTest.verifyIndexStructures(indexmerged);
// Merged statistics should be the combination of the two inputs:
// collection lengths add (500 + 600 = 1100); vocabularies overlap, so the
// merged vocab count is less than the sum; per-document counts add.
DiskIndex di_index1 = new DiskIndex(index1.getAbsolutePath());
DiskIndex di_index2 = new DiskIndex(index2.getAbsolutePath());
DiskIndex di_merged = new DiskIndex(indexmerged.getAbsolutePath());
assertEquals(di_index1.getIndexPartStatistics("postings").collectionLength, 500);
assertEquals(di_index2.getIndexPartStatistics("postings").collectionLength, 600);
assertEquals(di_merged.getIndexPartStatistics("postings").collectionLength, 1100);
assertEquals(di_index1.getIndexPartStatistics("postings").vocabCount, 104);
assertEquals(di_index2.getIndexPartStatistics("postings").vocabCount, 105);
assertEquals(di_merged.getIndexPartStatistics("postings").vocabCount, 106);
assertEquals(di_index1.getIndexPartStatistics("postings").highestDocumentCount, 100);
assertEquals(di_index2.getIndexPartStatistics("postings").highestDocumentCount, 100);
assertEquals(di_merged.getIndexPartStatistics("postings").highestDocumentCount, 200);
assertEquals(di_index1.getIndexPartStatistics("postings").highestFrequency, 100);
assertEquals(di_index2.getIndexPartStatistics("postings").highestFrequency, 100);
assertEquals(di_merged.getIndexPartStatistics("postings").highestFrequency, 200);
// Document names must survive the merge at their original numbers.
assertEquals(di_merged.getName(50), mi1.getName(50));
assertEquals(di_merged.getName(1050), mi2.getName(1050));
} finally {
// Always clean up the temporary index directories.
if (index1 != null) {
FSUtil.deleteDirectory(index1);
}
if (index2 != null) {
FSUtil.deleteDirectory(index2);
}
if (indexmerged != null) {
FSUtil.deleteDirectory(indexmerged);
}
}
}
示例10: getTerms
import org.lemurproject.galago.core.tokenize.Tokenizer; //导入依赖的package包/类
/**
 * Tokenizes this object's text and returns the filtered term list.
 * En-dashes are normalized to plain hyphens before tokenization.
 */
public List<String> getTerms(Parameters config) {
  Tokenizer tokenizer = Tokenizer.instance(config);
  String normalized = text.replace('\u2013', '-');
  List<String> terms = tokenizer.tokenize(normalized).terms;
  return QueryUtil.filterTerms(config, terms);
}
示例11: getTokenizer
import org.lemurproject.galago.core.tokenize.Tokenizer; //导入依赖的package包/类
/**
 * Returns a {@link Tokenizer} configured according to how this index was built.
 *
 * @return a tokenizer matching this index's build-time configuration.
 */
Tokenizer getTokenizer();
示例12: getTokenizer
import org.lemurproject.galago.core.tokenize.Tokenizer; //导入依赖的package包/类
/**
 * Allocates a fresh {@link Tokenizer} from this object's parameter settings.
 * Tokenizer is not assumed to be threadsafe, so a new instance is created
 * for every request rather than sharing one.
 *
 * @return a newly created Tokenizer instance.
 */
public Tokenizer getTokenizer() {
  return Tokenizer.create(this.opts);
}