This article collects typical usage examples of the Java method org.apache.lucene.index.PostingsEnum.freq. If you are asking yourself what PostingsEnum.freq does, how to use it, or where to find examples of it, the curated code samples below may help. You can also read further about the containing class, org.apache.lucene.index.PostingsEnum.
The following presents 15 code examples of the PostingsEnum.freq method, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Java code examples.
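As background before the examples: freq() returns how many times the current term occurs in the document the enum is positioned on, and it is only defined after nextDoc() or advance() has positioned the enum. Below is a minimal, self-contained sketch of that call pattern; the index path, field name, and term are illustrative, not taken from the examples that follow.

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.FSDirectory;

public class FreqDemo {
    public static void main(String[] args) throws Exception {
        // open an existing index; the path is a placeholder
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/tmp/index")))) {
            Term term = new Term("body", "lucene");
            PostingsEnum postings = MultiFields.getTermDocsEnum(reader, term.field(), term.bytes());
            if (postings == null) {
                return; // term does not occur in this field
            }
            while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                // freq() is valid only once the enum is positioned on a document
                System.out.println("doc=" + postings.docID() + " freq=" + postings.freq());
            }
        }
    }
}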
Example 1: buildTerm
import org.apache.lucene.index.PostingsEnum; // import the package/class this method depends on
private void buildTerm(XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms, TermsEnum termIter, BoostAttribute boostAtt) throws IOException {
    // start term, optimized writing
    BytesRef term = termIter.next();
    spare.copyUTF8Bytes(term);
    builder.startObject(spare.toString());
    buildTermStatistics(builder, termIter);
    // finally write the term vectors
    PostingsEnum posEnum = termIter.postings(null, PostingsEnum.ALL);
    int termFreq = posEnum.freq();
    builder.field(FieldStrings.TERM_FREQ, termFreq);
    initMemory(curTerms, termFreq);
    initValues(curTerms, posEnum, termFreq);
    buildValues(builder, curTerms, termFreq);
    buildScore(builder, boostAtt);
    builder.endObject();
}
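The second argument to postings() above, PostingsEnum.ALL, asks Lucene to decode everything: frequencies, positions, offsets, and payloads. When only freq() is needed, a cheaper flag suffices. A small sketch of the flag levels (the wrapper class and method are illustrative; the flags are real Lucene constants):

import java.io.IOException;

import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermsEnum;

final class PostingsFlagsDemo {
    static void show(TermsEnum termsEnum) throws IOException {
        // doc ids only; freq() is not meaningful under this flag
        PostingsEnum docsOnly = termsEnum.postings(null, PostingsEnum.NONE);
        // doc ids plus term frequency: the minimum required for freq()
        PostingsEnum withFreqs = termsEnum.postings(null, PostingsEnum.FREQS);
        // positions, offsets and payloads on top of frequencies
        PostingsEnum everything = termsEnum.postings(null, PostingsEnum.ALL);
    }
}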
Example 2: buildEntryValue
import org.apache.lucene.index.PostingsEnum; // import the package/class this method depends on
private NamedList<Object> buildEntryValue(long count, Term t, List<Entry<LeafReader, Bits>> leaves) throws IOException {
    NamedList<Object> entry = new NamedList<>();
    entry.add("count", count);
    int i = -1;
    for (Entry<LeafReader, Bits> e : leaves) {
        PostingsEnum postings = e.getKey().postings(t, PostingsEnum.PAYLOADS);
        Bits liveDocs = e.getValue();
        while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            if (!liveDocs.get(postings.docID())) {
                continue;
            }
            i++;
            NamedList<Object> documentEntry = new NamedList<>();
            entry.add("doc" + i, documentEntry);
            for (int j = 0; j < postings.freq(); j++) {
                postings.nextPosition();
                String extra = postings.getPayload().utf8ToString();
                documentEntry.add("position" + j, extra);
            }
        }
    }
    return entry;
}
Example 3: getEntropy
import org.apache.lucene.index.PostingsEnum; // import the package/class this method depends on
/**
 * Gets the 1 - entropy (i.e. 1 + plogp) of a term,
 * a function that favors terms that are focally distributed.
 * We use the definition of log-entropy weighting provided in
 * Martin and Berry (2007):
 * Entropy = 1 + sum ((Pij log2(Pij)) / log2(n))
 * where Pij = frequency of term i in doc j / global frequency of term i
 *       n = number of documents in collection
 * @param term whose entropy you want
 * Thanks to Vidya Vasuki for adding the hash table to
 * eliminate redundant calculation
 */
private float getEntropy(Term term) {
    if (termEntropy.containsKey(term))
        return termEntropy.get(term);
    int gf = getGlobalTermFreq(term);
    double entropy = 0;
    try {
        PostingsEnum docsEnum = this.getDocsForTerm(term);
        while ((docsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
            double p = docsEnum.freq(); // frequency in this document
            p = p / gf;                 // normalize by the term's global frequency
            entropy += p * (Math.log(p) / Math.log(2)); // sum of p * log2(p)
        }
        int n = this.getNumDocs();
        double log2n = Math.log(n) / Math.log(2);
        entropy = entropy / log2n;
    } catch (IOException e) {
        logger.info("Couldn't get term entropy for term " + term.text());
    }
    termEntropy.put(term, 1 + (float) entropy);
    return (float) (1 + entropy);
}
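To make the formula concrete, here is a tiny standalone check with made-up frequencies (not part of the original source): a term occurring 5, 3, and 2 times in three of ten documents gets a weight of about 0.55, whereas a term concentrated in a single document would get exactly 1.0.

public class EntropyWeightToy {
    public static void main(String[] args) {
        int[] tf = {5, 3, 2}; // hypothetical per-document frequencies of one term
        int n = 10;           // collection size; the term is absent from the other 7 docs
        int gf = 0;
        for (int f : tf) gf += f; // global frequency = 10
        double sum = 0;
        for (int f : tf) {
            double p = (double) f / gf;
            sum += p * (Math.log(p) / Math.log(2)); // p * log2(p), always <= 0
        }
        double weight = 1 + sum / (Math.log(n) / Math.log(2));
        System.out.println(weight); // ~0.55
    }
}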
Example 4: addTermFrequencies
import org.apache.lucene.index.PostingsEnum; // import the package/class this method depends on
/**
 * Adds terms and frequencies found in vector into the Map termFreqMap
 *
 * @param termFreqMap a Map of terms and their frequencies
 * @param vector List of terms and their frequencies for a doc/field
 * @param fieldName Optional field name of the terms for skip terms
 */
private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector, @Nullable String fieldName) throws IOException {
    final TermsEnum termsEnum = vector.iterator();
    final CharsRefBuilder spare = new CharsRefBuilder();
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
        spare.copyUTF8Bytes(text);
        final String term = spare.toString();
        if (isNoiseWord(term)) {
            continue;
        }
        if (isSkipTerm(fieldName, term)) {
            continue;
        }
        final PostingsEnum docs = termsEnum.postings(null);
        int freq = 0;
        while (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            freq += docs.freq();
        }
        // increment frequency
        Int cnt = termFreqMap.get(term);
        if (cnt == null) {
            cnt = new Int();
            termFreqMap.put(term, cnt);
            cnt.x = freq;
        } else {
            cnt.x += freq;
        }
    }
}
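Int here is a small mutable counter class from the surrounding MoreLikeThis-style code, used to avoid re-boxing Integers on every increment. With a plain Map<String, Integer> the same accumulation can be written with merge; a sketch, not the source's API:

import java.util.HashMap;
import java.util.Map;

final class TermCounts {
    private final Map<String, Integer> counts = new HashMap<>();

    void add(String term, int freq) {
        counts.merge(term, freq, Integer::sum); // insert if absent, otherwise accumulate
    }
}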
Example 5: compareTermVectors
import org.apache.lucene.index.PostingsEnum; // import the package/class this method depends on
private void compareTermVectors(String fieldName, Fields fields0, Fields fields1) throws IOException {
    Terms terms0 = fields0.terms(fieldName);
    Terms terms1 = fields1.terms(fieldName);
    assertThat(terms0, notNullValue());
    assertThat(terms1, notNullValue());
    assertThat(terms0.size(), equalTo(terms1.size()));
    TermsEnum iter0 = terms0.iterator();
    TermsEnum iter1 = terms1.iterator();
    for (int i = 0; i < terms0.size(); i++) {
        BytesRef next0 = iter0.next();
        assertThat(next0, notNullValue());
        BytesRef next1 = iter1.next();
        assertThat(next1, notNullValue());
        // compare field value
        String string0 = next0.utf8ToString();
        String string1 = next1.utf8ToString();
        assertThat("expected: " + string0, string0, equalTo(string1));
        // compare df and ttf
        assertThat("term: " + string0, iter0.docFreq(), equalTo(iter1.docFreq()));
        assertThat("term: " + string0, iter0.totalTermFreq(), equalTo(iter1.totalTermFreq()));
        // compare freq and docs
        PostingsEnum docsAndPositions0 = iter0.postings(null, PostingsEnum.ALL);
        PostingsEnum docsAndPositions1 = iter1.postings(null, PostingsEnum.ALL);
        assertThat("term: " + string0, docsAndPositions0.nextDoc(), equalTo(docsAndPositions1.nextDoc()));
        assertThat("term: " + string0, docsAndPositions0.freq(), equalTo(docsAndPositions1.freq()));
        // compare position, start offsets and end offsets
        for (int j = 0; j < docsAndPositions0.freq(); j++) {
            assertThat("term: " + string0, docsAndPositions0.nextPosition(), equalTo(docsAndPositions1.nextPosition()));
            assertThat("term: " + string0, docsAndPositions0.startOffset(), equalTo(docsAndPositions1.startOffset()));
            assertThat("term: " + string0, docsAndPositions0.endOffset(), equalTo(docsAndPositions1.endOffset()));
        }
    }
    assertThat(iter0.next(), nullValue());
    assertThat(iter1.next(), nullValue());
}
Example 6: getFeatures
import org.apache.lucene.index.PostingsEnum; // import the package/class this method depends on
static double[] getFeatures(IndexReader ir, String fieldName, BytesRef rawPhrase, int docId, int docSize, int numDocs, boolean inc)
        throws IOException {
    PostingsEnum de = MultiFields.getTermDocsEnum(ir, fieldName, rawPhrase);
    int ret = de.advance(docId);
    if (ret == PostingsEnum.NO_MORE_DOCS) {
        throw new RuntimeException("no more docs...");
    } else {
        int freq = de.freq();
        if (freq < 2) return null;
        PostingsEnum pe = MultiFields.getTermPositionsEnum(ir, fieldName, rawPhrase);
        int ret2 = pe.advance(docId);
        if (ret2 == PostingsEnum.NO_MORE_DOCS) {
            throw new RuntimeException("no more docs...");
        } else {
            double[] features = new double[2];
            int pos = pe.nextPosition();
            int docFreq = ir.docFreq(new Term(fieldName, rawPhrase));
            if (inc) {
                docFreq++;
                numDocs++;
            }
            features[0] = Commons.calcTfIdf(freq, docSize, docFreq, numDocs);
            features[1] = Commons.calcFirstOccurrence(pos, docSize);
            return features;
        }
    }
}
Example 7: printFieldTermsWithInfo
import org.apache.lucene.index.PostingsEnum; // import the package/class this method depends on
/** Prints the terms indexed under the given fields with full postings information. */
public static void printFieldTermsWithInfo(LeafReader reader, String... fields) throws IOException {
    for (final String field : fields) {
        System.out.println(format("Terms for field [%s], with positional info:", field));
        final TermsEnum te = reader.terms(field).iterator();
        BytesRef scratch;
        PostingsEnum postings = null;
        while ((scratch = te.next()) != null) {
            System.out.println(format("  %s", scratch.utf8ToString()));
            postings = te.postings(postings, PostingsEnum.ALL);
            for (postings.nextDoc(); postings.docID() != DocIdSetIterator.NO_MORE_DOCS; postings.nextDoc()) {
                final Map<Integer, BytesRef> positions = Maps.newTreeMap();
                boolean addedPayload = false;
                for (int i = 0; i < postings.freq(); i++) {
                    final int pos = postings.nextPosition();
                    final BytesRef payload = postings.getPayload();
                    if (payload != null) {
                        positions.put(pos, BytesRef.deepCopyOf(payload));
                        addedPayload = true;
                    } else {
                        positions.put(pos, null);
                    }
                }
                if (addedPayload) {
                    // positions are printed one per line below, so they are not part of this format string
                    System.out.println(format("    doc=%d, freq=%d", postings.docID(), postings.freq()));
                    for (final Entry<Integer, BytesRef> e : positions.entrySet()) {
                        System.out.println(format("      pos=%d, payload=%s", e.getKey(), e.getValue()));
                    }
                } else {
                    System.out.println(format("    doc=%d, freq=%d, pos=%s", postings.docID(), postings.freq(),
                            positions.keySet()));
                }
            }
        }
    }
}
Example 8: printAnnotations
import org.apache.lucene.index.PostingsEnum; // import the package/class this method depends on
public static void printAnnotations(LeafReader reader, Term term) throws IOException {
    System.out.println("Annotations for " + term);
    final ByteArrayDataInput in = new ByteArrayDataInput();
    final PostingsEnum postings = reader.postings(term, PostingsEnum.PAYLOADS);
    for (int docID = postings.nextDoc(); docID != DocIdSetIterator.NO_MORE_DOCS; docID = postings.nextDoc()) {
        final int freq = postings.freq();
        System.out.println("  doc=" + docID + ", freq=" + freq);
        for (int i = 0; i < freq; i++) {
            postings.nextPosition();
            final BytesRef payload = postings.getPayload();
            in.reset(payload.bytes, payload.offset, payload.length);
            System.out.println("    start=" + in.readVInt() + ", length=" + in.readVInt());
        }
    }
}
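The payloads decoded here are assumed to hold two VInts (start, length). The source does not show the write side; below is a sketch of how such payloads could be attached at analysis time with a custom TokenFilter. The class and the two-VInt scheme are hypothetical, but the Lucene APIs (ByteArrayDataOutput, PayloadAttribute) are real.

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.BytesRef;

/** Attaches (start, length) of each token as two VInts, matching the decoder above. */
final class SpanPayloadFilter extends TokenFilter {
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
    private final byte[] scratch = new byte[10]; // two VInts need at most 10 bytes

    SpanPayloadFilter(TokenStream in) {
        super(in);
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false;
        }
        ByteArrayDataOutput out = new ByteArrayDataOutput(scratch);
        out.writeVInt(offsetAtt.startOffset());
        out.writeVInt(offsetAtt.endOffset() - offsetAtt.startOffset());
        // the indexing chain copies payload bytes per token, so reusing scratch is safe
        payloadAtt.setPayload(new BytesRef(scratch, 0, out.getPosition()));
        return true;
    }
}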
Example 9: getTermVectorWithException
import org.apache.lucene.index.PostingsEnum; // import the package/class this method depends on
private Map<Integer, String> getTermVectorWithException(String field, String id) throws IOException {
    TermVectorsResponse response = client.prepareTermVector(indexName, documentType, id)
            .setOffsets(false).setPositions(true).setFieldStatistics(false)
            .setTermStatistics(false)
            .setSelectedFields(field)
            .execute().actionGet();
    Map<Integer, String> map = new HashMap<>();
    Terms terms = response.getFields().terms(field);
    if (terms == null) {
        return map;
    }
    TermsEnum iterator = terms.iterator();
    PostingsEnum postings = null;
    for (BytesRef termBytes = null; (termBytes = iterator.next()) != null; ) {
        String term = termBytes.utf8ToString();
        postings = iterator.postings(postings, PostingsEnum.ALL);
        // there can only be one doc since we fetched by id; get the doc and the positions
        postings.nextDoc();
        int tf = postings.freq();
        for (int i = 0; i < tf; i++) {
            int pos = postings.nextPosition();
            map.put(pos, term);
        }
    }
    return map;
}
Example 10: collectTermOffsets
import org.apache.lucene.index.PostingsEnum; // import the package/class this method depends on
private List<MWESentenceContext> collectTermOffsets(Terms termVectorLookup) throws IOException {
    List<MWESentenceContext> result = new ArrayList<>();
    TermsEnum tiRef = termVectorLookup.iterator();
    BytesRef luceneTerm = tiRef.next();
    while (luceneTerm != null) {
        if (luceneTerm.length == 0) {
            luceneTerm = tiRef.next();
            continue;
        }
        String tString = luceneTerm.utf8ToString();
        if (!allCandidates.contains(tString)) {
            luceneTerm = tiRef.next();
            continue;
        }
        PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
        //PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.OFFSETS);
        int doc = postingsEnum.nextDoc(); // this should be just 1 doc, i.e., the constraint for getting this TV
        if (doc != PostingsEnum.NO_MORE_DOCS) {
            int totalOccurrence = postingsEnum.freq();
            for (int i = 0; i < totalOccurrence; i++) {
                postingsEnum.nextPosition();
                int start = postingsEnum.startOffset();
                int end = postingsEnum.endOffset();
                BytesRef payload = postingsEnum.getPayload();
                int sentenceId = -1;
                if (payload != null) {
                    sentenceId = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString())).getSentenceId();
                }
                result.add(new MWESentenceContext(tString, sentenceId, start, end));
            }
        }
        luceneTerm = tiRef.next();
    }
    Collections.sort(result);
    return result;
}
Example 11: executeNeedleTests
import org.apache.lucene.index.PostingsEnum; // import the package/class this method depends on
private void executeNeedleTests(Analyzer analyzer) throws Exception {
    String needle = getNeedle(analyzer);
    int numFieldValues = 23;
    Directory directory = buildNeedleIndex(needle, analyzer, numFieldValues);
    IndexReader reader = DirectoryReader.open(directory);
    LeafReaderContext ctx = reader.leaves().get(0);
    LeafReader r = ctx.reader();
    PostingsEnum dpe = r.postings(new Term(FIELD, needle), PostingsEnum.ALL);
    int numTests = 0;
    try {
        while (dpe.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            int frq = dpe.freq();
            int advanced = 0;
            String[] fieldValues = r.document(dpe.docID()).getValues(FIELD);
            while (++advanced < frq) {
                dpe.nextPosition();
                String rebuilt = SimpleAnalyzerUtil.substringFromMultiValuedFields(dpe.startOffset(),
                        dpe.endOffset(), fieldValues, analyzer.getOffsetGap(FIELD), " | ");
                assertEquals(needle, rebuilt);
                numTests++;
            }
        }
    } finally {
        reader.close();
        directory.close();
    }
    assertEquals("number of tests", numFieldValues - 1, numTests);
}
Example 12: seekExact
import org.apache.lucene.index.PostingsEnum; // import the package/class this method depends on
@Override
public boolean seekExact(BytesRef text) throws IOException {
    int docFreq = 0;
    long totalTermFreq = 0;
    for (Holder anEnum : enums) {
        if (anEnum.termsEnum.seekExact(text)) {
            if (anEnum.bits == null) {
                docFreq += anEnum.termsEnum.docFreq();
                if (docsEnumFlag == PostingsEnum.FREQS) {
                    long leafTotalTermFreq = anEnum.termsEnum.totalTermFreq();
                    if (totalTermFreq == -1 || leafTotalTermFreq == -1) {
                        totalTermFreq = -1;
                        continue;
                    }
                    totalTermFreq += leafTotalTermFreq;
                }
            } else {
                final PostingsEnum docsEnum = anEnum.docsEnum = anEnum.termsEnum.postings(anEnum.docsEnum, docsEnumFlag);
                // two choices for performing the same heavy loop: one attempts to calculate totalTermFreq, the other does not
                if (docsEnumFlag == PostingsEnum.FREQS) {
                    for (int docId = docsEnum.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
                        if (anEnum.bits != null && anEnum.bits.get(docId) == false) {
                            continue;
                        }
                        docFreq++;
                        // docsEnum.freq() returns 1 if the doc was indexed with IndexOptions.DOCS_ONLY, so there is
                        // no way of knowing whether the value is really 1 or unrecorded when filtering like this
                        totalTermFreq += docsEnum.freq();
                    }
                } else {
                    for (int docId = docsEnum.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
                        if (anEnum.bits != null && anEnum.bits.get(docId) == false) {
                            continue;
                        }
                        // docsEnum.freq() behaviour is undefined if docsEnumFlag == PostingsEnum.NONE, so don't bother with the call
                        docFreq++;
                    }
                }
            }
        }
    }
    if (docFreq > 0) {
        currentDocFreq = docFreq;
        currentTotalTermFreq = totalTermFreq;
        current = text;
        return true;
    } else {
        currentDocFreq = NOT_FOUND;
        currentTotalTermFreq = NOT_FOUND;
        current = null;
        return false;
    }
}
Example 13: getTermFreq
import org.apache.lucene.index.PostingsEnum; // import the package/class this method depends on
private int getTermFreq(TermsEnum termsEnum, PostingsEnum docsEnum) throws IOException {
    docsEnum = termsEnum.postings(docsEnum);
    docsEnum.nextDoc();
    return docsEnum.freq();
}
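Note that this helper positions the enum on the first document only, which matches its intended use against a single-document term vector. Against a whole index, freq() would have to be summed over all matching documents; a sketch of that variant (not part of the source):

private int getTotalTermFreq(TermsEnum termsEnum, PostingsEnum reuse) throws IOException {
    PostingsEnum docsEnum = termsEnum.postings(reuse, PostingsEnum.FREQS);
    int total = 0;
    while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        total += docsEnum.freq(); // per-document frequency of the current term
    }
    return total;
}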
Example 14: validateResponse
import org.apache.lucene.index.PostingsEnum; // import the package/class this method depends on
protected void validateResponse(TermVectorsResponse esResponse, Fields luceneFields, TestConfig testConfig) throws IOException {
    assertThat(esResponse.getIndex(), equalTo(testConfig.doc.index));
    TestDoc testDoc = testConfig.doc;
    HashSet<String> selectedFields = testConfig.selectedFields == null ? null : new HashSet<>(
            Arrays.asList(testConfig.selectedFields));
    Fields esTermVectorFields = esResponse.getFields();
    for (TestFieldSetting field : testDoc.fieldSettings) {
        Terms esTerms = esTermVectorFields.terms(field.name);
        if (selectedFields != null && !selectedFields.contains(field.name)) {
            assertNull(esTerms);
            continue;
        }
        assertNotNull(esTerms);
        Terms luceneTerms = luceneFields.terms(field.name);
        TermsEnum esTermEnum = esTerms.iterator();
        TermsEnum luceneTermEnum = luceneTerms.iterator();
        while (esTermEnum.next() != null) {
            assertNotNull(luceneTermEnum.next());
            assertThat(esTermEnum.totalTermFreq(), equalTo(luceneTermEnum.totalTermFreq()));
            PostingsEnum esDocsPosEnum = esTermEnum.postings(null, PostingsEnum.POSITIONS);
            PostingsEnum luceneDocsPosEnum = luceneTermEnum.postings(null, PostingsEnum.POSITIONS);
            if (luceneDocsPosEnum == null) {
                // test we expect that...
                assertFalse(field.storedOffset);
                assertFalse(field.storedPayloads);
                assertFalse(field.storedPositions);
                continue;
            }
            String currentTerm = esTermEnum.term().utf8ToString();
            assertThat("Token mismatch for field: " + field.name, currentTerm, equalTo(luceneTermEnum.term().utf8ToString()));
            esDocsPosEnum.nextDoc();
            luceneDocsPosEnum.nextDoc();
            int freq = esDocsPosEnum.freq();
            assertThat(freq, equalTo(luceneDocsPosEnum.freq()));
            for (int i = 0; i < freq; i++) {
                String failDesc = " (field:" + field.name + " term:" + currentTerm + ")";
                int lucenePos = luceneDocsPosEnum.nextPosition();
                int esPos = esDocsPosEnum.nextPosition();
                if (field.storedPositions && testConfig.requestPositions) {
                    assertThat("Position test failed" + failDesc, lucenePos, equalTo(esPos));
                } else {
                    assertThat("Missing position test failed" + failDesc, esPos, equalTo(-1));
                }
                if (field.storedOffset && testConfig.requestOffsets) {
                    assertThat("Offset test failed" + failDesc, luceneDocsPosEnum.startOffset(), equalTo(esDocsPosEnum.startOffset()));
                    assertThat("Offset test failed" + failDesc, luceneDocsPosEnum.endOffset(), equalTo(esDocsPosEnum.endOffset()));
                } else {
                    assertThat("Missing offset test failed" + failDesc, esDocsPosEnum.startOffset(), equalTo(-1));
                    assertThat("Missing offset test failed" + failDesc, esDocsPosEnum.endOffset(), equalTo(-1));
                }
                if (field.storedPayloads && testConfig.requestPayloads) {
                    assertThat("Payload test failed" + failDesc, luceneDocsPosEnum.getPayload(), equalTo(esDocsPosEnum.getPayload()));
                } else {
                    assertThat("Missing payload test failed" + failDesc, esDocsPosEnum.getPayload(), equalTo(null));
                }
            }
        }
        assertNull("Es returned terms are done but lucene isn't", luceneTermEnum.next());
    }
}
Example 15: load
import org.apache.lucene.index.PostingsEnum; // import the package/class this method depends on
public void load(String filename) throws Exception {
    BufferedReader in = new BufferedReader(new FileReader(filename));
    String line = null;
    QueryParser qps = new QueryParser(FreebaseTools.FIELD_NAME_TEXT, tools.getIndexAnalyzer());
    IndexSearcher searcher = tools.getIndexSearcher();
    IndexReader reader = tools.getIndexReader();
    while ((line = in.readLine()) != null) {
        String[] fields = line.split("\t");
        System.out.println("[Query: " + fields[0] + "] [KBid: " + fields[1] + "] [type: " + fields[2] + "]");
        try {
            // execute a Lucene query for the entity, get back 10 docs
            Query q = qps.parse(fields[0]);
            TopDocs results = searcher.search(q, 10);
            ScoreDoc[] hits = results.scoreDocs;
            boolean found = false;
            long Ndocs = reader.numDocs();
            for (ScoreDoc sd : hits) {
                // - if d is the relevant doc, then found=true, this one's relevant.
                boolean rel = false;
                Document d = tools.getDocumentInMode(sd.doc);
                String kbid = d.get("subject");
                if (kbid.equals(fields[1])) {
                    found = true;
                    rel = true;
                }
                // - get its term vector
                Fields docfields = reader.getTermVectors(sd.doc);
                // - make it into what jforests wants
                for (String f : docfields) {
                    TermsEnum t = docfields.terms(f).iterator();
                    BytesRef tstring;
                    while ((tstring = t.next()) != null) {
                        PostingsEnum pe = t.postings(null);
                        int i;
                        int df = t.docFreq();
                        while ((i = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                            int freq = pe.freq();
                            double idf = Math.log((double) Ndocs / df); // cast avoids integer division
                            double tf = 1 + Math.log(freq);
                            double tfidf = tf * idf;
                            // and that's the weight.
                        }
                    }
                }
            }
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }
    in.close(); // release the input file
}
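The weight computed in the inner loop is the classic log tf-idf built directly from PostingsEnum.freq() and TermsEnum.docFreq(). A tiny standalone check with made-up numbers (not from the source):

public class TfIdfToy {
    public static void main(String[] args) {
        long nDocs = 1000; // hypothetical collection size
        int df = 10;       // documents containing the term
        int freq = 4;      // occurrences in the current document (PostingsEnum.freq())
        double idf = Math.log((double) nDocs / df); // ln(100) ~ 4.61
        double tf = 1 + Math.log(freq);             // 1 + ln(4) ~ 2.39
        System.out.println(tf * idf);               // ~11.0
    }
}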