本文整理汇总了Python中vocabulary.Vocabulary.load方法的典型用法代码示例。如果您正苦于以下问题:Python Vocabulary.load方法的具体用法?Python Vocabulary.load怎么用?Python Vocabulary.load使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类vocabulary.Vocabulary
的用法示例。
在下文中一共展示了Vocabulary.load方法的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: VocabularyTest
# 需要导入模块: from vocabulary import Vocabulary [as 别名]
# 或者: from vocabulary.Vocabulary import load [as 别名]
class VocabularyTest(unittest.TestCase):
    """Tests Vocabulary.load with a base dictionary plus a custom-word file."""

    def setUp(self):
        # Load the base vocabulary and the user-supplied custom words.
        self.vocabulary = Vocabulary()
        self.vocabulary.load('testdata/vocabulary.dat', 'testdata/custom_words')
        # Dump internal state for visual inspection when the test runs.
        pprint.pprint(self.vocabulary.trie)
        pprint.pprint(self.vocabulary.words)

    def test_vocabulary(self):
        # Membership is tested against the dict itself instead of
        # `.keys()`: identical result, but avoids materializing an
        # intermediate key list under Python 2.
        self.assertIn(u'英雄三国', self.vocabulary.words)
        self.assertIn(u'魔鬼代言人', self.vocabulary.words)
        self.assertIn(u'黄河水利委员会', self.vocabulary.words)
        self.assertNotIn(u'十大伪歌手', self.vocabulary.words)
        self.assertNotIn(u'走路太牛', self.vocabulary.words)
        # Known words carry their part-of-speech tag; unknown words map to 'UNK'.
        self.assertEqual('n', self.vocabulary.get_pos(u'英雄三国'))
        self.assertEqual('n', self.vocabulary.get_pos(u'魔鬼代言人'))
        self.assertEqual('nt', self.vocabulary.get_pos(u'黄河水利委员会'))
        self.assertEqual('UNK', self.vocabulary.get_pos(u'十大伪歌手'))
        self.assertEqual('UNK', self.vocabulary.get_pos(u'走路太牛'))

    def test_gen_DAG(self):
        # Smoke test: only checks that DAG generation does not raise.
        pprint.pprint(self.vocabulary.gen_DAG(
            u'《英雄三国》是由网易历时四年自主研发运营的一款英雄对战竞技网游。'))
示例2: VocabularyTest
# 需要导入模块: from vocabulary import Vocabulary [as 别名]
# 或者: from vocabulary.Vocabulary import load [as 别名]
class VocabularyTest(unittest.TestCase):
    """Tests word lookup, indexing and size of a loaded Vocabulary."""

    def setUp(self):
        self.vocabulary = Vocabulary()
        self.vocabulary.load("../testdata/vocabulary.dat")

    def test_has_word(self):
        # Words present in ../testdata/vocabulary.dat ...
        for known in ('ipad', 'iphone', 'macbook'):
            self.assertTrue(self.vocabulary.has_word(known))
        # ... and words deliberately absent from it.
        for unknown in ('nokia', 'thinkpad'):
            self.assertFalse(self.vocabulary.has_word(unknown))

    def test_word_index(self):
        # Known words map to their position; unknown words map to -1.
        cases = [('ipad', 0), ('iphone', 1), ('macbook', 2),
                 ('nokia', -1), ('thinkpad', -1)]
        for word, index in cases:
            self.assertEqual(index, self.vocabulary.word_index(word))

    def test_word(self):
        # word() is the inverse of word_index() for in-vocabulary ids.
        for index, expected in enumerate(['ipad', 'iphone', 'macbook']):
            self.assertEqual(expected, self.vocabulary.word(index))

    def test_size(self):
        self.assertEqual(17, self.vocabulary.size())
示例3: MaxProbSegmenterTest
# 需要导入模块: from vocabulary import Vocabulary [as 别名]
# 或者: from vocabulary.Vocabulary import load [as 别名]
class MaxProbSegmenterTest(unittest.TestCase):
def setUp(self):
self.vocabulary = Vocabulary()
self.vocabulary.load('../data/vocabulary.dat')
self.hmm_segmenter = HMMSegmenter()
self.hmm_segmenter.load('../data/hmm_segment_model')
self.max_prob_segmenter = MaxProbSegmenter(
self.vocabulary, self.hmm_segmenter)
def call_segment(self, text):
for word in self.max_prob_segmenter.segment(text):
print word + '/\t',
print ''
def test_segment(self):
fp = open('testdata/document.dat', 'rb')
for text in fp.readlines():
self.call_segment(text.strip())
fp.close()
示例4: len
# 需要导入模块: from vocabulary import Vocabulary [as 别名]
# 或者: from vocabulary.Vocabulary import load [as 别名]
elif os.path.isfile(_dir):
file_list.append(_dir)
if __name__ == '__main__':
if len(sys.argv) <= 3:
print >> sys.stderr, '%s [stop word file] [output name] ' \
'[doc file] ...' % sys.argv[0]
sys.exit(1)
file_list = []
for _dir in sys.argv[3:]:
collect_files(file_list, _dir)
stop_word = Vocabulary()
stop_word.load(sys.argv[1])
vocab = Vocabulary()
articles = []
for filename in file_list:
article = stem_file(filename, vocab, stop_word)
articles.append(article)
# random.shuffle(articles)
vocab.sort()
vocab.save(sys.argv[2] + '-vocab')
fp = open(sys.argv[2] + '-train', 'w')
for article in articles:
sb = ''
for word in article:
示例5: DocumentTest
# 需要导入模块: from vocabulary import Vocabulary [as 别名]
# 或者: from vocabulary.Vocabulary import load [as 别名]
class DocumentTest(unittest.TestCase):
    """Tests Document parsing, serialization and topic-count updates."""

    def setUp(self):
        self.document = Document(20)
        self.vocabulary = Vocabulary()
        self.vocabulary.load("../testdata/vocabulary.dat")
        self.model = Model(20)
        self.model.load('../testdata/lda_model')
        self.doc_tokens = ['macbook', 'ipad',  # exist in vocabulary and model
                           'mac os x', 'chrome',  # only exist in vocabulary
                           'nokia', 'null']  # inexistent

    def _assert_counts_non_increasing(self, topic_hist):
        # The sparse histogram must stay sorted by descending count.
        for i in xrange(len(topic_hist.non_zeros) - 1):
            self.assertGreaterEqual(topic_hist.non_zeros[i].count,
                                    topic_hist.non_zeros[i + 1].count)

    def test_parse_from_tokens(self):
        # Initialization path used during LDA training: words only need
        # to exist in the vocabulary.
        self.document.parse_from_tokens(
            self.doc_tokens, random, self.vocabulary)
        self.assertEqual(4, self.document.num_words())
        topic_hist = self.document.doc_topic_hist
        self._assert_counts_non_increasing(topic_hist)
        logging.info(str(self.document))

        # Initialization path used during LDA inference: words must also
        # exist in the model. Note: deliberately re-checks the histogram
        # object obtained above, as the original test did.
        self.document.parse_from_tokens(
            self.doc_tokens, random, self.vocabulary, self.model)
        self.assertEqual(2, self.document.num_words())
        self._assert_counts_non_increasing(topic_hist)

    def test_serialize_and_parse(self):
        # A serialize/parse round trip must preserve the document.
        self.document.parse_from_tokens(
            self.doc_tokens, random, self.vocabulary)
        restored = Document(20)
        restored.parse_from_string(self.document.serialize_to_string())
        self.assertEqual(self.document.num_words(), restored.num_words())
        self.assertEqual(str(self.document), str(restored))

    def test_increase_decrease_topic(self):
        self.document.parse_from_tokens(
            self.doc_tokens, random, self.vocabulary, self.model)
        self.document.increase_topic(0, 5)
        self.document.increase_topic(4, 5)
        self.document.increase_topic(9, 5)
        topic_hist = self.document.doc_topic_hist
        self._assert_counts_non_increasing(topic_hist)
        self.document.decrease_topic(4, 4)
        self.document.decrease_topic(9, 3)
        self._assert_counts_non_increasing(topic_hist)
示例6: save_topic_word_prob_readable
# 需要导入模块: from vocabulary import Vocabulary [as 别名]
# 或者: from vocabulary.Vocabulary import load [as 别名]
fp.close()
return topic_prob_list_map
def save_topic_word_prob_readable(filename, topic_prob_list_map, vocab, threshold=None):
    """Write a human-readable topic -> word-probability report.

    Args:
        filename: output path; the file is overwritten.
        topic_prob_list_map: dict mapping topic id -> list of
            (word_id, prob) pairs; prob may be a float or a numeric string.
        vocab: object exposing get_token_from_id(word_id) -> token string.
        threshold: minimum probability to keep a word; defaults to 1e-6.
    """
    if threshold is None:
        threshold = 1e-6
    # `with` guarantees the handle is closed even if a vocab lookup or
    # float conversion raises mid-write (the original leaked it on error).
    with open(filename, "w") as fp:
        for topic, prob_list in topic_prob_list_map.items():
            fp.write("topic %d\n" % topic)
            # Drop low-probability words, then sort by descending probability.
            kept = [(_id, float(p)) for _id, p in prob_list
                    if float(p) >= threshold]
            kept.sort(key=lambda tup: tup[1], reverse=True)
            for _id, p in kept:
                fp.write("%24.24s:%8.8f\n" % (vocab.get_token_from_id(_id), p))
            fp.write("--------------------\n")
if __name__ == "__main__":
if len(sys.argv) < 3:
print >>sys.stderr, "%s [vocabulary] [xxx-topic-word]" % sys.argv[0]
sys.exit(1)
vocab = Vocabulary()
vocab.load(sys.argv[1])
topic_prob_list_map = load_topic_word_prob_file(sys.argv[2])
save_topic_word_prob_readable(sys.argv[2] + "-readable", topic_prob_list_map, vocab)
save_topic_word_prob_readable(sys.argv[2] + "-readable.0.005", topic_prob_list_map, vocab, threshold=0.005)