本文整理汇总了Python中lexicon.Lexicon.get_vocabulary方法的典型用法代码示例。如果您正苦于以下问题:Python Lexicon.get_vocabulary方法的具体用法?Python Lexicon.get_vocabulary怎么用?Python Lexicon.get_vocabulary使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类lexicon.Lexicon的用法示例。
在下文中一共展示了Lexicon.get_vocabulary方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: InvertedIndex
# Required import: from lexicon import Lexicon
# Method under demonstration: lexicon.Lexicon.get_vocabulary
class InvertedIndex():
def __init__(self):
self.invertedindex = {}
self.lexicon = Lexicon()
self.tokenizer = Tokenizer()
self.doc_reader = DocReader()
self.build_index()
def build_index(self):
#comments?
cache = self.doc_reader.get_cache()
docs = self.doc_reader.read_docs(cache)
print "\nINVERTEDINDEX : Indexing %d documents..\n" % len(docs)
for d in range(len(docs)):
print "Indexing document '%s'" % (settings.PATH_DOCS + str(d))
self.add_document(docs[d], d)
print "Indexed total %d unique terms" % self.lexicon.size()
def get_postinglist(self, lex_id):
return self.invertedindex[lex_id]
def add_document(self, doc, document_id):
"""FIXME:
-Needs doc
-Too slow?
-Remove stop words
-Reduce number of tokens
"""
tokens = self.tokenizer.tokenize(doc)
for t in tokens:
lex_id = self.lexicon.lookup(t.get_value())
if(lex_id == settings.INVALID):
lex_id = self.lexicon.add_value(t.get_value())
pl = PostingList()
pl.append_posting(Posting(document_id, t.get_position()))
self.invertedindex[lex_id] = pl
else:
pl = self.get_postinglist(lex_id)
if pl.get_last_posting().get_document_id() != document_id:
pl.append_posting(Posting(document_id, t.get_position()))
else:
p = pl.get_last_posting()
p.append_position(t.get_position())
def size(self):
return len(self.invertedindex)
def debugprint(self):
voc = self.lexicon.get_vocabulary()
for v in voc:
lid = self.lexicon.lookup(v)
pl = self.get_postinglist(lid)
print "[%s]" % v
pl.info()