当前位置: 首页>>代码示例>>Java>>正文


Java Document类代码示例

本文整理汇总了Java中org.lemurproject.galago.core.parse.Document的典型用法代码示例。如果您正苦于以下问题：Java Document类的具体用法？Java Document怎么用？Java Document使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。


Document类属于org.lemurproject.galago.core.parse包,在下文中一共展示了Document类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: useRelevantDocuments

import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Builds a {@code "combine"} node (with {@code norm} set to {@code false}) that has one child
 * per query term. Each child is created from per-field weights estimated by a
 * {@link FieldLanguageModel} over the documents named in the {@code "documents"} parameter.
 *
 * @return the combine node holding one child per entry of {@code queryTerms}
 * @throws Exception propagated from the document lookup or node construction helpers
 */
private Node useRelevantDocuments() throws Exception {
  List<String> docnames = p.getList("documents", String.class);
  List<Document> docs = getDocuments(docnames);
  FieldLanguageModel flm = new FieldLanguageModel();
  for (Document d : docs) {
    // Only documents that actually carry terms contribute to the model.
    if (d.terms != null && d.terms.size() > 0) {
      flm.addDocument(d);
    }
  }

  Node termNodes = new Node("combine", new NodeParameters(), new ArrayList<Node>(), 0);
  termNodes.getNodeParameters().set("norm", false);

  // Now get the sub-model for each term
  TObjectDoubleHashMap<String> weights = new TObjectDoubleHashMap<String>();
  for (String term : queryTerms) {
    weights.clear();
    for (String field : fields) {
      // Key by field: keying by term would overwrite a single entry on every iteration,
      // leaving only the last field's probability in the map.
      weights.put(field, flm.getFieldProbGivenTerm(field, term));
    }
    termNodes.addChild(createTermFieldNodes(term, weights));
  }
  return termNodes;
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:25,代码来源:FieldRelevanceModelTraversal.java

示例2: nextDocument

import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Reads every remaining line from {@code reader} and wraps the result in a single
 * {@link Document} named by {@code identifier}. Each line is terminated with {@code '\n'}.
 *
 * @return the document, or {@code null} when there is no reader, the identifier is empty,
 *     or the reader yields no lines
 * @throws IOException if reading a line fails
 */
public Document nextDocument() throws IOException {
  if (reader == null || identifier.length() == 0) {
    return null;
  }

  StringBuilder contents = new StringBuilder();
  int lineCount = 0;
  for (String current = reader.readLine(); current != null; current = reader.readLine()) {
    contents.append(current).append('\n');
    lineCount++;
  }

  // Nothing was read, so there is no document to hand back.
  if (lineCount == 0) {
    return null;
  }

  return new Document(identifier, contents.toString());
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:26,代码来源:SingleFileParser.java

示例3: nextDocument

import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Reads one line from {@code reader}, parses it with {@code Parameters.parseString}, and
 * converts it into a {@link Document}.
 *
 * <p>The {@code source}, {@code title}, {@code media-type} and {@code published} values are
 * copied into the metadata, {@code id} becomes the document name, and the text is the title
 * wrapped in a {@code <title>} element followed by {@code content}.
 *
 * @return the parsed document, or {@code null} when the reader has no more lines
 * @throws IOException if reading or parsing the line fails
 */
@Override
public Document nextDocument() throws IOException {
  final String rawLine = reader.readLine();
  if (rawLine == null) {
    return null;
  }

  final Document parsed = new Document();
  final Parameters record = Parameters.parseString(rawLine);

  // "media-type" is either News or Blog.
  final String[] metadataKeys = {"source", "title", "media-type", "published"};
  for (String key : metadataKeys) {
    parsed.metadata.put(key, record.getString(key));
  }

  parsed.name = record.getString("id");

  final String titleElement = "<title>" + record.getString("title") + "</title>";
  parsed.text = titleElement + record.getString("content");
  return parsed;
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:17,代码来源:SignalMediaJSONParser.java

示例4: run

import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Dumps every document of the corpus found at parameter {@code "path"} to {@code output}.
 *
 * <p>For each document the iterator key, name, metadata ({@code key,value} per line) and text
 * are printed under {@code #IDENTIFIER}, {@code #NAME}, {@code #METADATA} and {@code #TEXT}
 * headers. If the manifest flags {@code emptyIndexFile}, only {@code "Empty Corpus."} is
 * printed.
 *
 * @param p parameters supplying {@code "path"} and the document component selection
 * @param output stream that receives the dump
 * @throws Exception if the corpus cannot be opened or read
 */
@Override
public void run(Parameters p, PrintStream output) throws Exception {
  CorpusReader reader = new CorpusReader(p.getString("path"));
  // try/finally so the reader is released on the early return and on failures too.
  try {
    if (reader.getManifest().get("emptyIndexFile", false)) {
      output.println("Empty Corpus.");
      return;
    }

    DocumentReader.DocumentIterator iterator = reader.getIterator();
    DocumentComponents dc = new DocumentComponents(p);

    while (!iterator.isDone()) {
      output.println("#IDENTIFIER: " + iterator.getKeyString());
      Document document = iterator.getDocument(dc);
      output.println("#NAME: " + document.name);
      output.println("#METADATA");
      for (Map.Entry<String, String> entry : document.metadata.entrySet()) {
        output.println(entry.getKey() + "," + entry.getValue());
      }
      output.println("#TEXT");
      output.println(document.text);
      iterator.nextKey();
    }
  } finally {
    reader.close();
  }
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:26,代码来源:DumpCorpusFn.java

示例5: addDocument

import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Records the length of {@code doc} and the summed tag-span length of each of its fields.
 *
 * @param doc document whose {@code terms} and {@code tags} are measured
 * @throws IOException if a length list cannot be created or updated
 */
@Override
public void addDocument(Document doc) throws IOException {
  // The whole-document length lives under the `document` key.
  lengths.get(document).add(doc.identifier, doc.terms.size());

  // Accumulate (end - begin) for every tag, keyed by the bytes of the tag name.
  TObjectIntHashMap<Bytes> spanTotals = new TObjectIntHashMap<>(doc.tags.size());
  for (Tag t : doc.tags) {
    final Bytes fieldKey = new Bytes(ByteUtil.fromString(t.name));
    final int span = t.end - t.begin;
    spanTotals.adjustOrPutValue(fieldKey, span, span);
  }

  // Push each per-field total into its length list, creating the list on first use.
  for (Bytes fieldKey : spanTotals.keySet()) {
    final boolean unseenField = !lengths.containsKey(fieldKey);
    if (unseenField) {
      lengths.put(fieldKey, new FieldLengthList(fieldKey));
    }
    lengths.get(fieldKey).add(doc.identifier, spanTotals.get(fieldKey));
  }
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:20,代码来源:MemoryDocumentLengths.java

示例6: addDocument

import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Registers the name of {@code doc} in both lookup directions and updates the counters.
 *
 * <p>The first document added fixes {@code offset}. Gaps between the previous identifier and
 * this one are filled with {@code null} entries, so the list position of a name is its
 * identifier minus {@code offset}.
 *
 * @param doc document to register; its identifier must not precede already stored entries
 */
@Override
public void addDocument(Document doc) {
  final boolean firstDocument = names.isEmpty();
  if (firstDocument) {
    offset = doc.identifier;
  }

  assert (names.size() + offset <= doc.identifier);

  // Pad with nulls until the next free slot corresponds to this identifier.
  for (long nextSlot = names.size() + offset; nextSlot < doc.identifier; nextSlot++) {
    names.add(null);
  }

  names.add(doc.name);
  namesRev.put(doc.name, doc.identifier);

  docCount++;
  termCount += doc.terms.size();
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:17,代码来源:MemoryDocumentNames.java

示例7: addDocument

import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Indexes the terms of {@code doc} that fall inside tags named {@code fieldName}.
 *
 * <p>Every position covered by a matching tag is stemmed via {@code stemAsRequired}; non-null
 * stems are added as postings. Collection statistics are updated afterwards.
 *
 * @param doc tokenized document to index
 * @throws IOException if adding a posting fails
 */
@Override
public void addDocument(Document doc) throws IOException {
  int added = 0;

  for (Tag span : doc.tags) {
    if (span.name.equals(fieldName)) {
      for (int pos = span.begin; pos < span.end; pos++) {
        String stemmed = stemAsRequired(doc.terms.get(pos));
        if (stemmed == null) {
          continue;
        }
        addPosting(ByteUtil.fromString(stemmed), doc.identifier, pos);
        added++;
      }
    }
  }

  collectionDocumentCount++;
  collectionPostingsCount += added;
  vocabCount = postings.size();
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:21,代码来源:MemoryPositionalFieldIndex.java

示例8: toBytes

import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Serializes {@code doc} into a Snappy-compressed byte array.
 *
 * <p>Layout inside the compressed stream: metadata size (int), text size (int), header
 * (identifier as long, then name), metadata bytes, text bytes.
 *
 * @param doc document to serialize
 * @return the compressed serialized form
 * @throws IOException if any part cannot be written
 */
@Override
public byte[] toBytes(Document doc) throws IOException {
  // Header section: identifier followed by name.
  ByteArrayOutputStream headerBytes = new ByteArrayOutputStream();
  DataOutputStream headerOut = new DataOutputStream(headerBytes);
  headerOut.writeLong(doc.identifier);
  SerializerCommon.writeString(headerOut, doc.name);

  ByteArrayOutputStream metadataBytes = SerializerCommon.writeMetadata(doc);
  ByteArrayOutputStream textBytes = SerializerCommon.writeText(doc);

  ByteArrayOutputStream compressedBytes = new ByteArrayOutputStream();
  DataOutputStream payload = new DataOutputStream(new SnappyOutputStream(compressedBytes));

  // Section sizes come first, then the sections themselves.
  payload.writeInt(metadataBytes.size());
  payload.writeInt(textBytes.size());

  headerBytes.writeTo(payload);
  metadataBytes.writeTo(payload);
  textBytes.writeTo(payload);

  // Closing finishes the compressed stream before the bytes are collected.
  payload.close();

  return compressedBytes.toByteArray();
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:27,代码来源:WebDocumentSerializer.java

示例9: getDocument

import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Looks up a document by name in the {@code "corpus"} part of this index.
 *
 * @param document name of the document to fetch
 * @param p selection of document components to load
 * @return the document, or {@code null} when there is no corpus part or an
 *     {@link IOException} occurred while reading (the exception is logged, not rethrown)
 * @throws IllegalArgumentException if the corpus part is registered but {@code null}
 */
@Override
public Document getDocument(String document, DocumentComponents p) throws IOException {
  if (!parts.containsKey("corpus")) {
    return null;
  }

  try {
    CorpusReader corpusPart = (CorpusReader) parts.get("corpus");
    if (corpusPart == null) {
      throw new IllegalArgumentException("Attempted to pull a document from index without a corpus");
    }

    long internalId = getIdentifier(document);
    return corpusPart.getDocument(internalId, p);
  } catch (IOException e) {
    // Read failures are logged and reported to the caller as a missing document.
    logger.log(Level.SEVERE,"IOException while pulling document: "+document,e);
    return null;
  }
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:22,代码来源:DiskIndex.java

示例10: process

import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Java-serializes {@code document}, optionally through Snappy compression, and forwards it
 * to {@code processor} as a key/value pair keyed by the document identifier.
 *
 * @param document document to serialize
 * @throws IOException if serialization or downstream processing fails
 */
@Override
public void process(Document document) throws IOException {
  ByteArrayOutputStream sink = new ByteArrayOutputStream();
  ObjectOutputStream writer = compressed
      ? new ObjectOutputStream(new SnappyOutputStream(sink))
      : new ObjectOutputStream(sink);

  writer.writeObject(document);
  writer.close();

  byte[] keyBytes = Utility.fromLong(document.identifier);
  byte[] valueBytes = sink.toByteArray();
  processor.process(new KeyValuePair(keyBytes, valueBytes));
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:20,代码来源:DocumentToKeyValuePair.java

示例11: testProcess

import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Verifies that {@code DocumentToKeyValuePair} emits a pair whose key is the document
 * identifier and whose value deserializes back to an equivalent {@link Document}.
 *
 * @throws Exception if processing or deserialization fails
 */
@Test
public void testProcess() throws Exception {
  DocumentToKeyValuePair dkvp = new DocumentToKeyValuePair();
  KeyValuePairProcessor kvpProcessor = new KeyValuePairProcessor();
  dkvp.setProcessor(kvpProcessor);

  Document document = new Document();
  document.identifier = 1;
  document.text = "This is text.";
  document.name = "DOC2";
  document.metadata.put("this", "that");
  dkvp.process(document);

  KeyValuePair pair = kvpProcessor.pair;
  // assertEquals takes (expected, actual); keeping that order makes failure messages accurate.
  assertEquals(1, Utility.toLong(pair.key));

  Document result;
  // try-with-resources so the stream is closed even when deserialization throws.
  try (ObjectInputStream input = new ObjectInputStream(new ByteArrayInputStream(pair.value))) {
    result = (Document) input.readObject();
  }

  assertEquals(document.identifier, result.identifier);
  assertEquals(document.text, result.text);
  assertEquals(document.name, result.name);
  assertEquals(document.metadata.size(), result.metadata.size());
  assertEquals("that", result.metadata.get("this"));
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:27,代码来源:DocumentToKeyValuePairTest.java

示例12: parseAsDocument

import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Wraps {@code text} in a {@link Document}, tokenizes it with a {@link TagTokenizer}, and
 * stems it when {@code stemming} is enabled.
 *
 * @param text raw text to parse
 * @param positions if non-null, receives the tokenizer's token positions
 * @return the tokenized (and possibly stemmed) document
 * @throws IOException if tokenizing fails
 */
private Document parseAsDocument(String text, ArrayList<IntSpan> positions) throws IOException {
  Document parsed = new Document();
  parsed.text = text;

  TagTokenizer tagTokenizer = new TagTokenizer();
  tagTokenizer.process(parsed);

  // Callers that do not need token offsets pass null.
  if (positions != null) {
    positions.addAll(tagTokenizer.getTokenPositions());
  }

  return stemming ? stemmer.stem(parsed) : parsed;
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:18,代码来源:SnippetGenerator.java

示例13: testSerializeDocument

import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Round-trips a tokenized document through {@code WebDocumentSerializer} and checks that
 * name, text, metadata, terms and tags survive.
 *
 * @throws IOException if serialization or deserialization fails
 */
@Test
public void testSerializeDocument() throws IOException {
    WebDocumentSerializer serializer = new WebDocumentSerializer(Parameters.create());

    Document original = new Document();
    original.metadata.put("meta-key", "value");
    original.metadata.put("null-meta-key", null);
    original.name = "doc-name";
    original.text = "doc text goes here\nand <tag>continues. This is weird</tag>";
    serializer.getTokenizer().tokenize(original);

    byte[] serialized = serializer.toBytes(original);
    assertNotNull(serialized);

    Document restored = serializer.fromBytes(serialized, Document.DocumentComponents.All);
    assertEquals(original.name, restored.name);
    assertEquals(original.text, restored.text);
    assertNotNull(restored.metadata);
    assertEquals(original.metadata.get("meta-key"), restored.metadata.get("meta-key"));
    // NOTE(review): these two assertions inspect the original document, not the restored
    // one - confirm whether the round-tripped metadata was meant to be checked here.
    assertTrue(original.metadata.containsKey("null-meta-key"));
    assertNull(original.metadata.get("null-meta-key"));
    assertEquals(original.terms, restored.terms);
    assertEquals(original.tags, restored.tags);
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:25,代码来源:WebDocumentSerializerTest.java

示例14: writeMetadata

import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Serializes the metadata of {@code doc} into a fresh byte stream.
 *
 * <p>Writes the entry count as an int followed by each key and value via
 * {@code writeString}; a {@code null} metadata map is written as the single int {@code -1}.
 *
 * @param doc document whose metadata is written
 * @return the stream holding the serialized metadata
 * @throws IOException if writing fails
 */
public static ByteArrayOutputStream writeMetadata(Document doc) throws IOException {
  ByteArrayOutputStream bytes = new ByteArrayOutputStream();
  DataOutputStream out = new DataOutputStream(bytes);

  if (doc.metadata != null) {
    out.writeInt(doc.metadata.size());
    for (String name : doc.metadata.keySet()) {
      writeString(out, name);
      writeString(out, doc.metadata.get(name));
    }
  } else {
    // -1 marks "no metadata map at all".
    out.writeInt(-1);
  }

  out.close();
  return bytes;
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:18,代码来源:SerializerCommon.java

示例15: readText

import org.lemurproject.galago.core.parse.Document; //导入依赖的package包/类
/**
 * Reads the text section of a serialized document, optionally restricted to a sub-range.
 *
 * <p>The section starts with its length as an int; a negative length yields {@code null}.
 * {@code selection.subTextStart} (clamped to at least 0) is the offset into the text, and a
 * positive {@code selection.subTextLen} caps how much is read.
 *
 * @param input stream positioned at the text length prefix
 * @param selection supplies {@code subTextStart} and {@code subTextLen}
 * @param buffer helper used to decode the string from {@code input}
 * @return the selected text, {@code ""} when the start offset lies beyond the text, or
 *     {@code null} when the stored length is negative
 * @throws IOException if reading from {@code input} fails
 */
public static String readText(DataInputStream input, Document.DocumentComponents selection, ByteBuf buffer) throws IOException {
  final int textLen = input.readInt();
  if (textLen < 0) {
    return null;
  }

  // handle offset into text
  final int start = Math.max(0, selection.subTextStart);
  // NOTE(review): this path returns without consuming the stored text - confirm that
  // callers do not read further sections from the same stream afterwards.
  if (start > textLen) {
    return "";
  }

  int len = textLen - start;
  if (selection.subTextLen > 0) {
    len = Math.min(len, selection.subTextLen);
  }

  // skip the part before the requested range
  if (start > 0) {
    input.skip(start);
  }

  String output = buffer.readString(input, len);

  // Move past whatever follows the requested range. The remainder must account for the
  // leading offset too; the previous (len - textLen) was negative, so nothing was skipped.
  final int remaining = textLen - start - len;
  if (remaining > 0) {
    input.skip(remaining);
  }

  return output;
}
 
开发者ID:teanalab,项目名称:demidovii,代码行数:26,代码来源:SerializerCommon.java


注:本文中的org.lemurproject.galago.core.parse.Document类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。