本文整理汇总了Java中org.lemurproject.galago.core.parse.Document.DocumentComponents类的典型用法代码示例。如果您正苦于以下问题:Java Document.DocumentComponents类的具体用法?Java Document.DocumentComponents怎么用?Java Document.DocumentComponents使用的例子?那么,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解其外围类org.lemurproject.galago.core.parse.Document的用法示例。
在下文中一共展示了Document.DocumentComponents类的6个代码示例,这些示例默认按受欢迎程度排序。您可以为喜欢或者觉得有用的代码点赞,您的评价将有助于系统推荐出更好的Java代码示例。
示例1: readText
import org.lemurproject.galago.core.parse.Document; //导入方法依赖的package包/类
/**
 * Reads the text field of a serialized document, optionally restricted to the
 * window described by {@code selection.subTextStart} / {@code selection.subTextLen}.
 *
 * <p>The field is laid out as an int length followed by the text payload. Unless the
 * length is negative, the whole payload is consumed from {@code input} even when only
 * a window of it is returned, so whatever follows the text can be read next.
 *
 * <p>NOTE(review): this assumes the length prefix and the sub-text offsets are in the
 * same unit that {@code buffer.readString(input, len)} consumes (presumably bytes) --
 * confirm against the writer.
 *
 * @param input stream positioned at the text length prefix
 * @param selection supplies the sub-text window; a negative start is treated as 0 and a
 *     non-positive length means "to the end of the text"
 * @param buffer scratch buffer used to decode the string
 * @return the selected text, {@code ""} if the window starts beyond the end of the text,
 *     or {@code null} if the stored length is negative
 * @throws IOException if the stream fails or ends before the field is fully consumed
 */
public static String readText(DataInputStream input, Document.DocumentComponents selection, ByteBuf buffer) throws IOException {
  final int textLen = input.readInt();
  if (textLen < 0) {
    return null;
  }

  // handle offset into text
  final int start = Math.max(0, selection.subTextStart);
  if (start > textLen) {
    // Window lies past the end: still consume the payload so the stream stays aligned.
    skipFully(input, textLen);
    return "";
  }

  int len = textLen - start;
  if (selection.subTextLen > 0) {
    len = Math.min(len, selection.subTextLen);
  }

  // skip the part before the window
  skipFully(input, start);
  final String output = buffer.readString(input, len);

  // move past the rest; the previous code passed a negative count here, which skips nothing
  skipFully(input, (long) textLen - start - len);
  return output;
}

/**
 * Skips exactly {@code count} bytes; a non-positive count is a no-op.
 * {@link java.io.InputStream#skip(long)} may skip fewer bytes than requested, so this loops,
 * and falls back to {@code readByte()} (which throws at end of stream) when no progress is made.
 */
private static void skipFully(DataInputStream input, long count) throws IOException {
  long remaining = count;
  while (remaining > 0) {
    final long skipped = input.skip(remaining);
    if (skipped > 0) {
      remaining -= skipped;
    } else {
      input.readByte();
      remaining--;
    }
  }
}
示例2: fromStream
import org.lemurproject.galago.core.parse.Document; //导入方法依赖的package包/类
/**
 * Decodes a Snappy-compressed serialized document, loading only the parts requested
 * by {@code selection}.
 *
 * <p>The stream is read in this order: metadata size, text size, identifier, name,
 * metadata, text. Terms are not read from the stream here; when {@code selection.tokenize}
 * is set, the text is always loaded and tokenized afterwards.
 *
 * <p>The wrapping stream (and therefore {@code stream}) is closed before returning,
 * including when decoding fails.
 *
 * @param stream source of the compressed document
 * @param selection which components (metadata, text, tokens) to populate
 * @return the document with identifier, name and the requested components filled in
 * @throws IOException if the stream cannot be read or ends prematurely
 */
@Override
public Document fromStream(DataInputStream stream, Document.DocumentComponents selection) throws IOException {
  SerializerCommon.ByteBuf buffer = new SerializerCommon.ByteBuf();
  Document d = new Document();

  // try-with-resources: the stream used to stay open if any read below threw
  try (DataInputStream input = new DataInputStream(new SnappyInputStream(stream))) {
    int metadataSize = input.readInt();
    input.readInt(); // text size: must be consumed to advance the stream, value is not needed

    // identifier
    d.identifier = input.readLong();
    // name
    d.name = buffer.readString(input);

    final boolean needsText = selection.text || selection.tokenize;
    if (selection.metadata) {
      d.metadata = SerializerCommon.readMetadata(input, buffer);
    } else if (needsText) {
      // only bother skipping the metadata if something after it will be read
      skipFully(input, metadataSize);
    }

    // can't get tokens without text in this case...
    if (needsText) {
      d.text = SerializerCommon.readText(input, selection, buffer);
    }
  }

  // give back terms & tags
  if (selection.tokenize) {
    // Tokenizer is *not* threadsafe, so we must make a copy of it for each use in case of threads.
    Tokenizer tokenizer = getTokenizer();
    tokenizer.tokenize(d);
  }
  return d;
}

/**
 * Skips exactly {@code count} bytes; a non-positive count is a no-op.
 * {@link java.io.InputStream#skip(long)} may skip fewer bytes than requested, so this loops,
 * and falls back to {@code readByte()} (which throws at end of stream) when no progress is made.
 */
private static void skipFully(DataInputStream input, long count) throws IOException {
  long remaining = count;
  while (remaining > 0) {
    final long skipped = input.skip(remaining);
    if (skipped > 0) {
      remaining -= skipped;
    } else {
      input.readByte();
      remaining--;
    }
  }
}
示例3: fromStream
import org.lemurproject.galago.core.parse.Document; //导入方法依赖的package包/类
/**
 * Decodes a Snappy-compressed serialized document whose terms are stored in the stream,
 * loading only the parts requested by {@code selection}.
 *
 * <p>The stream is read in this order: identifier, name, then (only if any component is
 * requested) metadata size, text size, terms size, metadata, text, term count and terms.
 * Sections that are not requested are skipped using their recorded sizes.
 *
 * <p>The wrapping stream (and therefore {@code stream}) is closed before returning on
 * every path, including the early return and decoding failures.
 *
 * @param stream source of the compressed document
 * @param selection which components (metadata, text, terms) to populate
 * @return the document with identifier, name and the requested components filled in
 * @throws IOException if the stream cannot be read or ends prematurely
 */
@Override
public Document fromStream(DataInputStream stream, Document.DocumentComponents selection) throws IOException {
  SerializerCommon.ByteBuf buffer = new SerializerCommon.ByteBuf();
  Document d = new Document();

  // try-with-resources: the early return below used to leave the stream open,
  // as did any exception thrown while reading
  try (DataInputStream input = new DataInputStream(new SnappyInputStream(stream))) {
    // identifier
    d.identifier = input.readLong();
    // name
    d.name = buffer.readString(input);

    // exit with no parts
    if (!selection.metadata && !selection.text && !selection.tokenize) {
      return d;
    }

    int metadataSize = input.readInt();
    int textSize = input.readInt();
    input.readInt(); // terms size: must be consumed to advance the stream, value is not needed

    if (selection.metadata) {
      d.metadata = SerializerCommon.readMetadata(input, buffer);
    } else {
      skipFully(input, metadataSize);
    }

    if (selection.text) {
      d.text = SerializerCommon.readText(input, selection, buffer);
    } else {
      skipFully(input, textSize);
    }

    // give back terms
    if (selection.tokenize) {
      int count = input.readInt();
      ArrayList<String> terms = new ArrayList<String>(count);
      for (int i = 0; i < count; i++) {
        terms.add(buffer.readString(input));
      }
      d.terms = terms;
    }
  }
  return d;
}

/**
 * Skips exactly {@code count} bytes; a non-positive count is a no-op.
 * {@link java.io.InputStream#skip(long)} may skip fewer bytes than requested, which would
 * misalign the reads that follow, so this loops and falls back to {@code readByte()}
 * (which throws at end of stream) when no progress is made.
 */
private static void skipFully(DataInputStream input, long count) throws IOException {
  long remaining = count;
  while (remaining > 0) {
    final long skipped = input.skip(remaining);
    if (skipped > 0) {
      remaining -= skipped;
    } else {
      input.readByte();
      remaining--;
    }
  }
}
示例4: fromBytes
import org.lemurproject.galago.core.parse.Document; //导入方法依赖的package包/类
/**
 * Deserializes a Document from an in-memory byte array by wrapping the array in a
 * stream and delegating to {@code fromStream}.
 *
 * @param data the serialized document bytes
 * @param selection which document components to load; passed through unchanged
 * @return the Document produced by {@code fromStream}
 * @throws IOException if {@code fromStream} fails to decode the data
 */
public Document fromBytes(byte[] data, Document.DocumentComponents selection) throws IOException {
  final DataInputStream input = new DataInputStream(new ByteArrayInputStream(data));
  return fromStream(input, selection);
}
示例5: pullDocuments
import org.lemurproject.galago.core.parse.Document; //导入方法依赖的package包/类
/**
 * Fetches the Document for every element of this ranked list of results.
 *
 * <p>The result set is copied into a fresh list, which is handed to the retrieval
 * together with the requested components.
 *
 * @param what the document-components object; tells whether you want text, terms, or metadata.
 * @return a mapping of Document identifier to Document objects
 * @throws IOException if the corpus doesn't exist or is corrupted somehow.
 */
@Nonnull
public Map<String, Document> pullDocuments(@Nonnull Document.DocumentComponents what) throws IOException {
  return retrieval.getDocuments(
      new ArrayList<>(resultSet()),
      what);
}
示例6: fromStream
import org.lemurproject.galago.core.parse.Document; //导入方法依赖的package包/类
/**
 * Converts an input stream into a Document.
 *
 * <p>This is the lowest-level deserialization call and is left abstract so that each
 * serialization method supplies its own implementation.
 *
 * @param stream the input stream to read the serialized document from.
 * @param components describes which parts of the serialized document to load.
 * @return the Galago Document object.
 * @throws IOException if the implementation fails to read the document from the stream.
 */
public abstract Document fromStream(DataInputStream stream, Document.DocumentComponents components) throws IOException;