This page collects typical usage examples of the Java method org.apache.lucene.index.DocsEnum.freq. If you are wondering what DocsEnum.freq does, how to use it, or where to find sample code, the curated method examples below may help. You can also explore further usage examples of the containing class, org.apache.lucene.index.DocsEnum.
Below are 6 code examples of the DocsEnum.freq method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code samples.
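Before the examples, a minimal sketch of the typical call pattern may help. It is written against the Lucene 4.x API that these examples use; the index path, the field name "body", and the term "lucene" are placeholder assumptions, not from any of the examples below.

import java.io.File;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class FreqDemo {
  public static void main(String[] args) throws Exception {
    Directory dir = FSDirectory.open(new File("/path/to/index")); // placeholder path
    IndexReader reader = DirectoryReader.open(dir);
    // Postings for the term "lucene" in field "body"; passing liveDocs skips deleted docs.
    DocsEnum de = MultiFields.getTermDocsEnum(
        reader, MultiFields.getLiveDocs(reader), "body", new BytesRef("lucene"));
    if (de != null) { // null if the term does not occur in the field
      while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        // freq() returns the number of occurrences of the term within the current document
        System.out.println("doc=" + de.docID() + " freq=" + de.freq());
      }
    }
    reader.close();
    dir.close();
  }
}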
Example 1: getTotalTermFreq
import org.apache.lucene.index.DocsEnum; // import the required package/class for this method
public static long getTotalTermFreq(IndexReader reader, Term term) throws Exception {
  long totalTF = 0L;
  for (final AtomicReaderContext ctx : reader.leaves()) {
    AtomicReader r = ctx.reader();
    if (!r.hasDeletions()) {
      // TODO: we could do this up front, during the scan
      // (next()), instead of after-the-fact here w/ seek,
      // if the codec supports it and there are no del
      // docs...
      final long totTF = r.totalTermFreq(term);
      if (totTF != -1) {
        totalTF += totTF;
        continue;
      } // otherwise we fall-through
    }
    // note: what should we do if field omits freqs? currently it counts as 1...
    DocsEnum de = r.termDocsEnum(term);
    if (de != null) {
      while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS)
        totalTF += de.freq();
    }
  }
  return totalTF;
}
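A hypothetical call site for this helper might look like the following; the reader variable, field name, and term are illustrative and do not come from the source.

  // Hypothetical usage: total occurrences of "lucene" in field "body" across the whole index.
  long tf = getTotalTermFreq(reader, new Term("body", "lucene"));
  System.out.println("totalTermFreq=" + tf);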
Example 2: addToWordcloud
import org.apache.lucene.index.DocsEnum; // import the required package/class for this method
private void addToWordcloud(ArrayList<String> ids2) throws IOException {
  MultiTermVectorsRequestBuilder mtr = client.prepareMultiTermVectors();
  for (String id : ids2) {
    // newTermVectorRequest() is presumably a private helper (not shown in the source)
    // that creates a request with the index and type already set.
    mtr.add(newTermVectorRequest().id(id));
  }
  MultiTermVectorsResponse r = mtr.execute().actionGet();
  TermsEnum e = null;
  DocsEnum docsenum = null;
  for (MultiTermVectorsItemResponse a : r) {
    TermVectorResponse t = a.getResponse();
    Fields fields = t.getFields();
    for (String f : fields) {
      Terms terms = fields.terms(f);
      TermsEnum it = terms.iterator(e);
      while (it.next() != null) {
        String term = it.term().utf8ToString();
        if (term.length() < 2) { continue; }
        DocsEnum docsit = it.docs(new Bits.MatchAllBits(ids2.size()), docsenum);
        docsit.nextDoc(); // advance to the single synthetic doc before reading freq()
        int freq = docsit.freq();
        WordData data = words.get(term);
        if (data == null)
          words.put(term, new WordData(term, freq));
        else
          data.count += freq;
      }
    }
  }
}
Example 3: addSingle
import org.apache.lucene.index.DocsEnum; // import the required package/class for this method
private void addSingle(String id) throws IOException {
  TermVectorRequestBuilder tr = client.prepareTermVector("kb", "doc", id)
      .setSelectedFields("text_content", "article_dc_title")
      .setPositions(false)
      .setOffsets(false)
      .setPayloads(false)
      .setFieldStatistics(false)
      .setTermStatistics(false);
  TermVectorResponse t = tr.execute().actionGet();
  TermsEnum e = null;
  DocsEnum docsenum = null;
  Fields fields = t.getFields();
  for (String f : fields) {
    Terms terms = fields.terms(f);
    TermsEnum it = terms.iterator(e);
    while (it.next() != null) {
      String term = it.term().utf8ToString();
      if (term.length() < 2) { continue; }
      DocsEnum docsit = it.docs(new Bits.MatchAllBits(1), docsenum);
      docsit.nextDoc(); // advance to the single synthetic doc before reading freq()
      int freq = docsit.freq();
      WordData data = words.get(term);
      if (data == null)
        words.put(term, new WordData(term, freq));
      else
        data.count += freq;
    }
  }
}
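Examples 2 and 3 both refer to a words map and a WordData class that the source does not show. A minimal sketch of what they plausibly look like follows; this is a hypothetical reconstruction, not the original code.

import java.util.HashMap;
import java.util.Map;

// Hypothetical reconstruction; the real class is not shown in the source.
class WordData {
  final String term;
  int count; // running total of the term's frequency across documents

  WordData(String term, int count) {
    this.term = term;
    this.count = count;
  }
}

// ...and, in the enclosing class, presumably a field along the lines of:
// private final Map<String, WordData> words = new HashMap<String, WordData>();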
Example 4: testReadTokens
import org.apache.lucene.index.DocsEnum; // import the required package/class for this method
/**
 * Test ReadTokensTask
 */
public void testReadTokens() throws Exception {
  // We will call ReadTokens on this many docs
  final int NUM_DOCS = 20;
  // Read tokens from first NUM_DOCS docs from Reuters and
  // then build index from the same docs
  String algLines1[] = {
    "# ----- properties ",
    "analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer",
    "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
    "docs.file=" + getReuters20LinesFile(),
    "# ----- alg ",
    "{ReadTokens}: " + NUM_DOCS,
    "ResetSystemErase",
    "CreateIndex",
    "{AddDoc}: " + NUM_DOCS,
    "CloseIndex",
  };
  // Run algo
  Benchmark benchmark = execBenchmark(algLines1);
  List<TaskStats> stats = benchmark.getRunData().getPoints().taskStats();
  // Count how many tokens all ReadTokens saw
  int totalTokenCount1 = 0;
  for (final TaskStats stat : stats) {
    if (stat.getTask().getName().equals("ReadTokens")) {
      totalTokenCount1 += stat.getCount();
    }
  }
  // Separately count how many tokens are actually in the index:
  IndexReader reader = DirectoryReader.open(benchmark.getRunData().getDirectory());
  assertEquals(NUM_DOCS, reader.numDocs());
  int totalTokenCount2 = 0;
  Fields fields = MultiFields.getFields(reader);
  for (String fieldName : fields) {
    if (fieldName.equals(DocMaker.ID_FIELD) || fieldName.equals(DocMaker.DATE_MSEC_FIELD) || fieldName.equals(DocMaker.TIME_SEC_FIELD)) {
      continue;
    }
    Terms terms = fields.terms(fieldName);
    if (terms == null) {
      continue;
    }
    TermsEnum termsEnum = terms.iterator(null);
    DocsEnum docs = null;
    while (termsEnum.next() != null) {
      docs = TestUtil.docs(random(), termsEnum, MultiFields.getLiveDocs(reader), docs, DocsEnum.FLAG_FREQS);
      while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        totalTokenCount2 += docs.freq();
      }
    }
  }
  reader.close();
  // Make sure they are the same
  assertEquals(totalTokenCount1, totalTokenCount2);
}
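One detail worth noting in this test: the DocsEnum.FLAG_FREQS flag asks for postings with per-document frequencies decoded; without it, freq() is not guaranteed to be meaningful. Outside the test framework, the TestUtil.docs(...) call above corresponds roughly to the following direct call (an approximation of what the helper resolves to, not its exact implementation):

  DocsEnum docs = termsEnum.docs(MultiFields.getLiveDocs(reader), null, DocsEnum.FLAG_FREQS);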
Example 5: SortingDocsEnum
import org.apache.lucene.index.DocsEnum; // import the required package/class for this method
SortingDocsEnum(int maxDoc, SortingDocsEnum reuse, final DocsEnum in, boolean withFreqs, final Sorter.DocMap docMap) throws IOException {
  super(in);
  this.maxDoc = maxDoc;
  this.withFreqs = withFreqs;
  if (reuse != null) {
    if (reuse.maxDoc == maxDoc) {
      sorter = reuse.sorter;
    } else {
      sorter = new DocFreqSorter(maxDoc);
    }
    docs = reuse.docs;
    freqs = reuse.freqs; // maybe null
  } else {
    docs = new int[64];
    sorter = new DocFreqSorter(maxDoc);
  }
  docIt = -1;
  int i = 0;
  int doc;
  if (withFreqs) {
    if (freqs == null || freqs.length < docs.length) {
      freqs = new int[docs.length];
    }
    while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      if (i >= docs.length) {
        docs = ArrayUtil.grow(docs, docs.length + 1);
        freqs = ArrayUtil.grow(freqs, freqs.length + 1);
      }
      docs[i] = docMap.oldToNew(doc);
      freqs[i] = in.freq();
      ++i;
    }
  } else {
    freqs = null;
    while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      if (i >= docs.length) {
        docs = ArrayUtil.grow(docs, docs.length + 1);
      }
      docs[i++] = docMap.oldToNew(doc);
    }
  }
  // TimSort can save much time compared to other sorts in case of
  // reverse sorting, or when sorting a concatenation of sorted readers
  sorter.reset(docs, freqs);
  sorter.sort(0, i);
  upto = i;
}
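The Sorter.DocMap argument defines the permutation: oldToNew(d) gives the position of old document d in the sorted order. As a rough illustration of the contract (hypothetical, not from the source), an identity map that leaves document order unchanged could be written like this:

  Sorter.DocMap identity = new Sorter.DocMap() {
    @Override public int oldToNew(int docID) { return docID; } // sorted position of an old doc ID
    @Override public int newToOld(int docID) { return docID; } // inverse mapping
    @Override public int size() { return maxDoc; }             // number of documents
  };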
Example 6: testReadTokens (a near-identical variant of Example 4 from an older Lucene version, using _TestUtil instead of TestUtil)
import org.apache.lucene.index.DocsEnum; // import the required package/class for this method
/**
 * Test ReadTokensTask
 */
public void testReadTokens() throws Exception {
  // We will call ReadTokens on this many docs
  final int NUM_DOCS = 20;
  // Read tokens from first NUM_DOCS docs from Reuters and
  // then build index from the same docs
  String algLines1[] = {
    "# ----- properties ",
    "analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer",
    "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
    "docs.file=" + getReuters20LinesFile(),
    "# ----- alg ",
    "{ReadTokens}: " + NUM_DOCS,
    "ResetSystemErase",
    "CreateIndex",
    "{AddDoc}: " + NUM_DOCS,
    "CloseIndex",
  };
  // Run algo
  Benchmark benchmark = execBenchmark(algLines1);
  List<TaskStats> stats = benchmark.getRunData().getPoints().taskStats();
  // Count how many tokens all ReadTokens saw
  int totalTokenCount1 = 0;
  for (final TaskStats stat : stats) {
    if (stat.getTask().getName().equals("ReadTokens")) {
      totalTokenCount1 += stat.getCount();
    }
  }
  // Separately count how many tokens are actually in the index:
  IndexReader reader = DirectoryReader.open(benchmark.getRunData().getDirectory());
  assertEquals(NUM_DOCS, reader.numDocs());
  int totalTokenCount2 = 0;
  Fields fields = MultiFields.getFields(reader);
  for (String fieldName : fields) {
    if (fieldName.equals(DocMaker.ID_FIELD) || fieldName.equals(DocMaker.DATE_MSEC_FIELD) || fieldName.equals(DocMaker.TIME_SEC_FIELD)) {
      continue;
    }
    Terms terms = fields.terms(fieldName);
    if (terms == null) {
      continue;
    }
    TermsEnum termsEnum = terms.iterator(null);
    DocsEnum docs = null;
    while (termsEnum.next() != null) {
      docs = _TestUtil.docs(random(), termsEnum, MultiFields.getLiveDocs(reader), docs, DocsEnum.FLAG_FREQS);
      while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        totalTokenCount2 += docs.freq();
      }
    }
  }
  reader.close();
  // Make sure they are the same
  assertEquals(totalTokenCount1, totalTokenCount2);
}