本文整理汇总了Python中vocabulary.Vocabulary.load方法的典型用法代码示例。如果您正苦于以下问题:Python Vocabulary.load方法的具体用法?Python Vocabulary.load怎么用?Python Vocabulary.load使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类vocabulary.Vocabulary
的用法示例。
在下文中一共展示了Vocabulary.load方法的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: VocabularyTest
# 需要导入模块: from vocabulary import Vocabulary [as 别名]
# 或者: from vocabulary.Vocabulary import load [as 别名]
class VocabularyTest(unittest.TestCase):
    """Tests Vocabulary.load with a base dictionary plus a custom-word file."""

    def setUp(self):
        # Load the base vocabulary and the user-supplied custom words.
        self.vocabulary = Vocabulary()
        self.vocabulary.load('testdata/vocabulary.dat', 'testdata/custom_words')
        # Dump internal state for visual inspection when the test runs.
        pprint.pprint(self.vocabulary.trie)
        pprint.pprint(self.vocabulary.words)

    def test_vocabulary(self):
        # Membership is tested against the dict itself instead of
        # `.keys()`: identical result, but avoids materializing an
        # intermediate key list under Python 2.
        self.assertIn(u'英雄三国', self.vocabulary.words)
        self.assertIn(u'魔鬼代言人', self.vocabulary.words)
        self.assertIn(u'黄河水利委员会', self.vocabulary.words)
        self.assertNotIn(u'十大伪歌手', self.vocabulary.words)
        self.assertNotIn(u'走路太牛', self.vocabulary.words)
        # Known words carry their part-of-speech tag; unknown words map to 'UNK'.
        self.assertEqual('n', self.vocabulary.get_pos(u'英雄三国'))
        self.assertEqual('n', self.vocabulary.get_pos(u'魔鬼代言人'))
        self.assertEqual('nt', self.vocabulary.get_pos(u'黄河水利委员会'))
        self.assertEqual('UNK', self.vocabulary.get_pos(u'十大伪歌手'))
        self.assertEqual('UNK', self.vocabulary.get_pos(u'走路太牛'))

    def test_gen_DAG(self):
        # Smoke test: only checks that DAG generation does not raise.
        pprint.pprint(self.vocabulary.gen_DAG(
            u'《英雄三国》是由网易历时四年自主研发运营的一款英雄对战竞技网游。'))
示例2: VocabularyTest
# 需要导入模块: from vocabulary import Vocabulary [as 别名]
# 或者: from vocabulary.Vocabulary import load [as 别名]
class VocabularyTest(unittest.TestCase):
    """Tests word lookup, indexing and size of a loaded Vocabulary."""

    def setUp(self):
        self.vocabulary = Vocabulary()
        self.vocabulary.load("../testdata/vocabulary.dat")

    def test_has_word(self):
        # Words present in ../testdata/vocabulary.dat ...
        for known in ('ipad', 'iphone', 'macbook'):
            self.assertTrue(self.vocabulary.has_word(known))
        # ... and words deliberately absent from it.
        for unknown in ('nokia', 'thinkpad'):
            self.assertFalse(self.vocabulary.has_word(unknown))

    def test_word_index(self):
        # Known words map to their position; unknown words map to -1.
        cases = [('ipad', 0), ('iphone', 1), ('macbook', 2),
                 ('nokia', -1), ('thinkpad', -1)]
        for word, index in cases:
            self.assertEqual(index, self.vocabulary.word_index(word))

    def test_word(self):
        # word() is the inverse of word_index() for in-vocabulary ids.
        for index, expected in enumerate(['ipad', 'iphone', 'macbook']):
            self.assertEqual(expected, self.vocabulary.word(index))

    def test_size(self):
        self.assertEqual(17, self.vocabulary.size())
示例3: MaxProbSegmenterTest
# 需要导入模块: from vocabulary import Vocabulary [as 别名]
# 或者: from vocabulary.Vocabulary import load [as 别名]
class MaxProbSegmenterTest(unittest.TestCase):
def setUp(self):
self.vocabulary = Vocabulary()
self.vocabulary.load('../data/vocabulary.dat')
self.hmm_segmenter = HMMSegmenter()
self.hmm_segmenter.load('../data/hmm_segment_model')
self.max_prob_segmenter = MaxProbSegmenter(
self.vocabulary, self.hmm_segmenter)
def call_segment(self, text):
for word in self.max_prob_segmenter.segment(text):
print word + '/\t',
print ''
def test_segment(self):
fp = open('testdata/document.dat', 'rb')
for text in fp.readlines():
self.call_segment(text.strip())
fp.close()
示例4: len
# 需要导入模块: from vocabulary import Vocabulary [as 别名]
# 或者: from vocabulary.Vocabulary import load [as 别名]
elif os.path.isfile(_dir):
file_list.append(_dir)
if __name__ == '__main__':
if len(sys.argv) <= 3:
print >> sys.stderr, '%s [stop word file] [output name] ' \
'[doc file] ...' % sys.argv[0]
sys.exit(1)
file_list = []
for _dir in sys.argv[3:]:
collect_files(file_list, _dir)
stop_word = Vocabulary()
stop_word.load(sys.argv[1])
vocab = Vocabulary()
articles = []
for filename in file_list:
article = stem_file(filename, vocab, stop_word)
articles.append(article)
# random.shuffle(articles)
vocab.sort()
vocab.save(sys.argv[2] + '-vocab')
fp = open(sys.argv[2] + '-train', 'w')
for article in articles:
sb = ''
for word in article:
示例5: DocumentTest
# 需要导入模块: from vocabulary import Vocabulary [as 别名]
# 或者: from vocabulary.Vocabulary import load [as 别名]
class DocumentTest(unittest.TestCase):
    """Tests Document parsing, serialization and topic-count updates."""

    def setUp(self):
        self.document = Document(20)
        self.vocabulary = Vocabulary()
        self.vocabulary.load("../testdata/vocabulary.dat")
        self.model = Model(20)
        self.model.load('../testdata/lda_model')
        self.doc_tokens = ['macbook', 'ipad',  # exist in vocabulary and model
                           'mac os x', 'chrome',  # only exist in vocabulary
                           'nokia', 'null']  # inexistent

    def _assert_counts_non_increasing(self, topic_hist):
        # The sparse histogram must stay sorted by descending count.
        for i in xrange(len(topic_hist.non_zeros) - 1):
            self.assertGreaterEqual(topic_hist.non_zeros[i].count,
                                    topic_hist.non_zeros[i + 1].count)

    def test_parse_from_tokens(self):
        # Initialization path used during LDA training: words only need
        # to exist in the vocabulary.
        self.document.parse_from_tokens(
            self.doc_tokens, random, self.vocabulary)
        self.assertEqual(4, self.document.num_words())
        topic_hist = self.document.doc_topic_hist
        self._assert_counts_non_increasing(topic_hist)
        logging.info(str(self.document))

        # Initialization path used during LDA inference: words must also
        # exist in the model. Note: deliberately re-checks the histogram
        # object obtained above, as the original test did.
        self.document.parse_from_tokens(
            self.doc_tokens, random, self.vocabulary, self.model)
        self.assertEqual(2, self.document.num_words())
        self._assert_counts_non_increasing(topic_hist)

    def test_serialize_and_parse(self):
        # A serialize/parse round trip must preserve the document.
        self.document.parse_from_tokens(
            self.doc_tokens, random, self.vocabulary)
        restored = Document(20)
        restored.parse_from_string(self.document.serialize_to_string())
        self.assertEqual(self.document.num_words(), restored.num_words())
        self.assertEqual(str(self.document), str(restored))

    def test_increase_decrease_topic(self):
        self.document.parse_from_tokens(
            self.doc_tokens, random, self.vocabulary, self.model)
        self.document.increase_topic(0, 5)
        self.document.increase_topic(4, 5)
        self.document.increase_topic(9, 5)
        topic_hist = self.document.doc_topic_hist
        self._assert_counts_non_increasing(topic_hist)
        self.document.decrease_topic(4, 4)
        self.document.decrease_topic(9, 3)
        self._assert_counts_non_increasing(topic_hist)
示例6: save_topic_word_prob_readable
# 需要导入模块: from vocabulary import Vocabulary [as 别名]
# 或者: from vocabulary.Vocabulary import load [as 别名]
fp.close()
return topic_prob_list_map
def save_topic_word_prob_readable(filename, topic_prob_list_map, vocab, threshold=None):
    """Write a human-readable topic -> word-probability report.

    Args:
        filename: output path; the file is overwritten.
        topic_prob_list_map: dict mapping topic id -> list of
            (word_id, prob) pairs; prob may be a float or a numeric string.
        vocab: object exposing get_token_from_id(word_id) -> token string.
        threshold: minimum probability to keep a word; defaults to 1e-6.
    """
    if threshold is None:
        threshold = 1e-6
    # `with` guarantees the handle is closed even if a vocab lookup or
    # float conversion raises mid-write (the original leaked it on error).
    with open(filename, "w") as fp:
        for topic, prob_list in topic_prob_list_map.items():
            fp.write("topic %d\n" % topic)
            # Drop low-probability words, then sort by descending probability.
            kept = [(_id, float(p)) for _id, p in prob_list
                    if float(p) >= threshold]
            kept.sort(key=lambda tup: tup[1], reverse=True)
            for _id, p in kept:
                fp.write("%24.24s:%8.8f\n" % (vocab.get_token_from_id(_id), p))
            fp.write("--------------------\n")
if __name__ == "__main__":
if len(sys.argv) < 3:
print >>sys.stderr, "%s [vocabulary] [xxx-topic-word]" % sys.argv[0]
sys.exit(1)
vocab = Vocabulary()
vocab.load(sys.argv[1])
topic_prob_list_map = load_topic_word_prob_file(sys.argv[2])
save_topic_word_prob_readable(sys.argv[2] + "-readable", topic_prob_list_map, vocab)
save_topic_word_prob_readable(sys.argv[2] + "-readable.0.005", topic_prob_list_map, vocab, threshold=0.005)