本文整理汇总了Java中org.lemurproject.galago.core.parse.Document类的典型用法代码示例。如果您正苦于以下问题:Java Document类的具体用法?Java Document怎么用?Java Document使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。
Document类属于org.lemurproject.galago.core.parse包,在下文中一共展示了Document类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: useRelevantDocuments
import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Builds a "combine" node (with "norm" set to false) that has one child per query term.
 * Each child is produced by {@code createTermFieldNodes} from weights estimated by a
 * {@link FieldLanguageModel} over the documents named in the "documents" parameter.
 *
 * @return the combine node holding one sub-node per query term
 * @throws Exception propagated from document retrieval or node construction
 */
private Node useRelevantDocuments() throws Exception {
  List<String> docnames = p.getList("documents", String.class);
  List<Document> docs = getDocuments(docnames);
  FieldLanguageModel flm = new FieldLanguageModel();
  for (Document d : docs) {
    // Documents without any terms contribute nothing to the model.
    if (d.terms != null && d.terms.size() > 0) {
      flm.addDocument(d);
    }
  }
  Node termNodes = new Node("combine", new NodeParameters(), new ArrayList<Node>(), 0);
  termNodes.getNodeParameters().set("norm", false);
  // Now get the sub-model for each term.
  TObjectDoubleHashMap<String> weights = new TObjectDoubleHashMap<String>();
  for (String term : queryTerms) {
    weights.clear();
    for (String field : fields) {
      // Key by field. Keying by term made every field overwrite the same entry,
      // so only the last field's probability reached createTermFieldNodes.
      // NOTE(review): assumes createTermFieldNodes looks weights up by field name - confirm.
      weights.put(field, flm.getFieldProbGivenTerm(field, term));
    }
    termNodes.addChild(createTermFieldNodes(term, weights));
  }
  return termNodes;
}
示例2: nextDocument
import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Reads every remaining line from {@code reader} and returns them, each terminated by
 * a newline, as the text of a single document named {@code identifier}.
 *
 * @return the document, or {@code null} when there is no reader, the identifier is empty,
 *         or no line could be read
 * @throws IOException if reading from the underlying reader fails
 */
public Document nextDocument() throws IOException {
  if (reader == null || identifier.length() == 0) {
    return null;
  }
  StringBuilder contents = new StringBuilder();
  int lineCount = 0;
  for (String current = reader.readLine(); current != null; current = reader.readLine()) {
    contents.append(current).append('\n');
    lineCount++;
  }
  // An exhausted reader yields no document at all.
  if (lineCount == 0) {
    return null;
  }
  return new Document(identifier, contents.toString());
}
示例3: nextDocument
import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Reads one line from {@code reader}, parses it with {@code Parameters.parseString}, and
 * converts it into a document: "source", "title", "media-type" and "published" become
 * metadata, "id" becomes the name, and the text is the title wrapped in a
 * {@code <title>} tag followed by "content".
 *
 * @return the next document, or {@code null} when the reader has no more lines
 * @throws IOException if reading or parsing the line fails
 */
@Override
public Document nextDocument() throws IOException {
  final String rawLine = reader.readLine();
  if (rawLine == null) {
    return null;
  }
  final Document result = new Document();
  final Parameters json = Parameters.parseString(rawLine);
  // "media-type" is either News or Blog.
  for (String key : new String[] {"source", "title", "media-type", "published"}) {
    result.metadata.put(key, json.getString(key));
  }
  result.name = json.getString("id");
  result.text = "<title>" + json.getString("title") + "</title>" + json.getString("content");
  return result;
}
示例4: run
import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Dumps every document of the corpus at parameter "path" to {@code output}: the key,
 * the name, each metadata entry as {@code key,value}, and finally the text.
 * Prints "Empty Corpus." and stops when the manifest flags "emptyIndexFile".
 *
 * @param p      parameters; "path" locates the corpus and the whole object configures
 *               the {@code DocumentComponents} used to load each document
 * @param output stream that receives the dump
 * @throws Exception propagated from opening, iterating or closing the corpus
 */
@Override
public void run(Parameters p, PrintStream output) throws Exception {
  CorpusReader reader = new CorpusReader(p.getString("path"));
  // try/finally guarantees the reader is closed on the empty-corpus early return
  // and when iteration fails; previously both paths leaked it.
  try {
    if (reader.getManifest().get("emptyIndexFile", false)) {
      output.println("Empty Corpus.");
      return;
    }
    DocumentReader.DocumentIterator iterator = reader.getIterator();
    DocumentComponents dc = new DocumentComponents(p);
    while (!iterator.isDone()) {
      output.println("#IDENTIFIER: " + iterator.getKeyString());
      Document document = iterator.getDocument(dc);
      output.println("#NAME: " + document.name);
      output.println("#METADATA");
      for (Map.Entry<String, String> entry : document.metadata.entrySet()) {
        output.println(entry.getKey() + "," + entry.getValue());
      }
      output.println("#TEXT");
      output.println(document.text);
      iterator.nextKey();
    }
  } finally {
    reader.close();
  }
}
示例5: addDocument
import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Records the lengths of a document: its total term count under the {@code document}
 * key, plus, for every tag name, the summed span ({@code end - begin}) of all tags
 * carrying that name.
 *
 * @param doc document whose terms and tags are measured
 * @throws IOException declared by the overridden method
 */
@Override
public void addDocument(Document doc) throws IOException {
  // Whole-document length first.
  lengths.get(document).add(doc.identifier, doc.terms.size());

  // Accumulate the per-field totals for this document.
  TObjectIntHashMap<Bytes> perFieldTotals = new TObjectIntHashMap<>(doc.tags.size());
  for (Tag tag : doc.tags) {
    final Bytes fieldKey = new Bytes(ByteUtil.fromString(tag.name));
    final int span = tag.end - tag.begin;
    perFieldTotals.adjustOrPutValue(fieldKey, span, span);
  }

  // Push each total into its field's list, creating the list on first sight.
  for (Bytes fieldKey : perFieldTotals.keySet()) {
    if (!lengths.containsKey(fieldKey)) {
      lengths.put(fieldKey, new FieldLengthList(fieldKey));
    }
    lengths.get(fieldKey).add(doc.identifier, perFieldTotals.get(fieldKey));
  }
}
示例6: addDocument
import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Registers a document's name under its identifier and updates the document and term
 * counters. The first document added fixes {@code offset}; any gap between the last
 * stored slot and this identifier is filled with {@code null} entries.
 *
 * @param doc document to register; its identifier must not precede the next free slot
 */
@Override
public void addDocument(Document doc) {
  if (names.isEmpty()) {
    offset = doc.identifier;
  }
  assert (names.size() + offset <= doc.identifier);
  // Pad with nulls so that index (identifier - offset) lines up with this document.
  for (long slot = names.size() + offset; slot < doc.identifier; slot++) {
    names.add(null);
  }
  names.add(doc.name);
  namesRev.put(doc.name, doc.identifier);
  docCount++;
  termCount += doc.terms.size();
}
示例7: addDocument
import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Indexes the terms of a document that fall inside tags named {@code fieldName}.
 * Each covered position is passed through {@code stemAsRequired}; non-null results are
 * added as postings. Collection-level counters are refreshed afterwards.
 *
 * @param doc document whose tagged terms are indexed
 * @throws IOException propagated from posting insertion
 */
@Override
public void addDocument(Document doc) throws IOException {
  int added = 0;
  for (Tag tag : doc.tags) {
    if (tag.name.equals(fieldName)) {
      for (int position = tag.begin; position < tag.end; position++) {
        String stemmed = stemAsRequired(doc.terms.get(position));
        if (stemmed == null) {
          continue;
        }
        addPosting(ByteUtil.fromString(stemmed), doc.identifier, position);
        added++;
      }
    }
  }
  collectionDocumentCount++;
  collectionPostingsCount += added;
  vocabCount = postings.size();
}
示例8: toBytes
import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Serializes a document into a Snappy-compressed byte array. The compressed payload
 * is laid out as: metadata size (int), text size (int), header (identifier as long,
 * then name), metadata bytes, text bytes.
 *
 * @param doc document to serialize
 * @return the compressed serialized form
 * @throws IOException if writing to any of the intermediate streams fails
 */
@Override
public byte[] toBytes(Document doc) throws IOException {
  // Header section: identifier followed by name.
  ByteArrayOutputStream headerBytes = new ByteArrayOutputStream();
  DataOutputStream headerOut = new DataOutputStream(headerBytes);
  headerOut.writeLong(doc.identifier);
  SerializerCommon.writeString(headerOut, doc.name);

  // Metadata and text sections are produced by the shared helpers.
  ByteArrayOutputStream metadataBytes = SerializerCommon.writeMetadata(doc);
  ByteArrayOutputStream textBytes = SerializerCommon.writeText(doc);

  // Assemble everything behind the two section sizes, compressed with Snappy.
  ByteArrayOutputStream compressedBytes = new ByteArrayOutputStream();
  DataOutputStream docOut = new DataOutputStream(new SnappyOutputStream(compressedBytes));
  docOut.writeInt(metadataBytes.size());
  docOut.writeInt(textBytes.size());
  headerBytes.writeTo(docOut);
  metadataBytes.writeTo(docOut);
  textBytes.writeTo(docOut);
  docOut.close();
  return compressedBytes.toByteArray();
}
示例9: getDocument
import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Looks a document up by name in the "corpus" part of this index.
 *
 * @param document name of the document to fetch
 * @param p        components of the document to load
 * @return the document, or {@code null} when the index has no "corpus" part or an
 *         {@link IOException} occurred during the lookup (it is logged, not rethrown)
 * @throws IllegalArgumentException if the "corpus" part is registered but null
 * @throws IOException declared by the overridden method
 */
@Override
public Document getDocument(String document, DocumentComponents p) throws IOException {
  if (!parts.containsKey("corpus")) {
    return null;
  }
  try {
    CorpusReader corpus = (CorpusReader) parts.get("corpus");
    if (corpus == null) {
      throw new IllegalArgumentException("Attempted to pull a document from index without a corpus");
    }
    long docId = getIdentifier(document);
    return corpus.getDocument(docId, p);
  } catch (IOException e) {
    // Deliberately swallowed after logging: callers receive null instead.
    logger.log(Level.SEVERE,"IOException while pulling document: "+document,e);
    return null;
  }
}
示例10: process
import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Java-serializes a document (through Snappy when {@code compressed} is set) and hands
 * it to the downstream processor as a key/value pair keyed by the document identifier.
 *
 * @param document document to serialize and forward
 * @throws IOException if serialization or the downstream processor fails
 */
@Override
public void process(Document document) throws IOException {
  ByteArrayOutputStream serialized = new ByteArrayOutputStream();
  ObjectOutputStream objectOut = compressed
      ? new ObjectOutputStream(new SnappyOutputStream(serialized))
      : new ObjectOutputStream(serialized);
  objectOut.writeObject(document);
  objectOut.close();

  byte[] key = Utility.fromLong(document.identifier);
  processor.process(new KeyValuePair(key, serialized.toByteArray()));
}
示例11: testProcess
import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Feeds one document through {@code DocumentToKeyValuePair} and checks that the emitted
 * pair carries the identifier as its key and a Java-serialized copy of the document
 * (identifier, text, name, metadata) as its value.
 */
@Test
public void testProcess() throws Exception {
  // Wire the converter to a processor that captures the emitted pair.
  KeyValuePairProcessor capture = new KeyValuePairProcessor();
  DocumentToKeyValuePair converter = new DocumentToKeyValuePair();
  converter.setProcessor(capture);

  Document original = new Document();
  original.identifier = 1;
  original.name = "DOC2";
  original.text = "This is text.";
  original.metadata.put("this", "that");

  converter.process(original);

  KeyValuePair emitted = capture.pair;
  assertEquals(Utility.toLong(emitted.key), 1);

  // The value must deserialize back into an equivalent document.
  ObjectInputStream objectIn = new ObjectInputStream(new ByteArrayInputStream(emitted.value));
  Document restored = (Document) objectIn.readObject();
  assertEquals(restored.identifier, original.identifier);
  assertEquals(restored.text, original.text);
  assertEquals(restored.name, original.name);
  assertEquals(restored.metadata.size(), original.metadata.size());
  assertEquals(restored.metadata.get("this"), "that");
}
示例12: parseAsDocument
import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Wraps raw text in a document, tokenizes it with a fresh {@code TagTokenizer}, and
 * stems it when {@code stemming} is enabled.
 *
 * @param text      raw text to parse
 * @param positions if non-null, receives the tokenizer's token positions
 * @return the tokenized (and possibly stemmed) document
 * @throws IOException propagated from tokenization or stemming
 */
private Document parseAsDocument(String text, ArrayList<IntSpan> positions) throws IOException {
  Document parsed = new Document();
  parsed.text = text;

  TagTokenizer tagTokenizer = new TagTokenizer();
  tagTokenizer.process(parsed);
  if (positions != null) {
    positions.addAll(tagTokenizer.getTokenPositions());
  }

  return stemming ? stemmer.stem(parsed) : parsed;
}
示例13: testSerializeDocument
import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Round-trips a tokenized document through {@code WebDocumentSerializer} and checks
 * that name, text, metadata, terms and tags survive.
 */
@Test
public void testSerializeDocument() throws IOException {
  WebDocumentSerializer serializer = new WebDocumentSerializer(Parameters.create());

  Document doc = new Document();
  doc.metadata.put("meta-key", "value");
  doc.metadata.put("null-meta-key", null);
  doc.name = "doc-name";
  doc.text = "doc text goes here\nand <tag>continues. This is weird</tag>";
  serializer.getTokenizer().tokenize(doc);

  byte[] serialized = serializer.toBytes(doc);
  assertNotNull(serialized);

  Document doc2 = serializer.fromBytes(serialized, Document.DocumentComponents.All);
  assertEquals(doc.name, doc2.name);
  assertEquals(doc.text, doc2.text);
  assertNotNull(doc2.metadata);
  assertEquals(doc.metadata.get("meta-key"), doc2.metadata.get("meta-key"));
  // NOTE(review): the next two assertions inspect the input document, not the
  // deserialized one, so they cannot fail; they were probably meant for doc2 - confirm.
  assertTrue(doc.metadata.containsKey("null-meta-key"));
  assertNull(doc.metadata.get("null-meta-key"));
  assertEquals(doc.terms, doc2.terms);
  assertEquals(doc.tags, doc2.tags);
}
示例14: writeMetadata
import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Serializes a document's metadata map into a fresh byte stream: the entry count as an
 * int, then each key and value via {@code writeString}. A null map is encoded as the
 * single int {@code -1}.
 *
 * @param doc document whose metadata is written
 * @return the stream holding the encoded metadata
 * @throws IOException if writing fails
 */
public static ByteArrayOutputStream writeMetadata(Document doc) throws IOException {
  ByteArrayOutputStream encoded = new ByteArrayOutputStream();
  DataOutputStream dataOut = new DataOutputStream(encoded);
  if (doc.metadata != null) {
    dataOut.writeInt(doc.metadata.size());
    for (String name : doc.metadata.keySet()) {
      writeString(dataOut, name);
      writeString(dataOut, doc.metadata.get(name));
    }
  } else {
    // -1 marks "no metadata map at all", as opposed to an empty one (0).
    dataOut.writeInt(-1);
  }
  dataOut.close();
  return encoded;
}
示例15: readText
import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Reads a length-prefixed text section from {@code input}, optionally restricted to
 * the sub-range described by {@code selection}.
 *
 * @param input     stream positioned at the int length prefix of the text section
 * @param selection supplies {@code subTextStart} (negative values are treated as 0) and
 *                  {@code subTextLen} (only applied when positive)
 * @param buffer    helper used to read the selected range
 * @return {@code null} when the stored length is negative, {@code ""} when the
 *         requested start lies beyond the text, otherwise the selected range
 * @throws IOException if reading from the stream fails
 */
public static String readText(DataInputStream input, Document.DocumentComponents selection, ByteBuf buffer) throws IOException {
  final int textLen = input.readInt();
  if (textLen < 0) {
    return null;
  }

  // handle offset into text
  final int start = Math.max(0, selection.subTextStart);
  if (start > textLen) {
    // Still consume the section so the stream ends up just past the text.
    input.skipBytes(textLen);
    return "";
  }
  int len = textLen - start;
  if (selection.subTextLen > 0) {
    len = Math.min(len, selection.subTextLen);
  }

  // skip the part before the requested range
  if (start > 0) {
    input.skipBytes(start);
  }
  // NOTE(review): assumes readString consumes exactly len units of input - confirm.
  String output = buffer.readString(input, len);

  // Move past whatever follows the requested range. The old code skipped
  // (len - textLen), which is negative, so nothing was ever skipped.
  final int remaining = textLen - start - len;
  if (remaining > 0) {
    input.skipBytes(remaining);
  }
  return output;
}