当前位置: 首页>>代码示例>>Python>>正文


Python jieba.load_userdict方法代码示例

本文整理汇总了Python中jieba.load_userdict方法的典型用法代码示例。如果您正苦于以下问题:Python jieba.load_userdict方法的具体用法?Python jieba.load_userdict怎么用?Python jieba.load_userdict使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块jieba的用法示例。


在下文中一共展示了jieba.load_userdict方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: process_data

# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def process_data(train_file, user_dict=None, stop_dict=None):
    """Segment a text file line by line with jieba and drop stop words.

    Args:
        train_file: Path to a UTF-8 text file; every line is segmented
            on its own.
        user_dict: Optional path handed to ``jieba.load_userdict``; the
            file must follow jieba's user-dictionary format.
        stop_dict: Optional path to a UTF-8 stop-word file containing
            one stop word per line.

    Returns:
        A list with one entry per input line. Each entry is the list of
        tokens that are neither stop words nor whitespace-only.
    """
    # Load the custom dictionary before any segmentation takes place.
    if user_dict:
        jieba.load_userdict(user_dict)

    # A set keeps the per-token membership test O(1); the original list
    # made filtering cost grow with the size of the stop-word file.
    stop_words = set()
    if stop_dict:
        with open(stop_dict, 'r', encoding='utf-8') as file:
            stop_words = {line.strip() for line in file}

    # Segment each line, then discard stop words and blank tokens.
    with open(train_file, 'r', encoding='utf-8') as file:
        sentences = [
            [token for token in jieba.lcut(line.strip())
             if token not in stop_words and token.strip() != '']
            for line in file
        ]

    return sentences
开发者ID:msgi,项目名称:nlp-journey,代码行数:20,代码来源:pre_process.py

示例2: load_dict

# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def load_dict(path):
    """
    Load the jieba user dictionary and three auxiliary word tables.

    ``path`` is used as a plain string prefix (the file names are
    appended with ``+``), so it must already end with a path separator.

    :param path: directory prefix containing ``default.dic``,
        ``stop.dic``, ``single.dic`` and ``synonym.dic``
    :return: tuple ``(stop_dic, single_dic, synonym_dic)``; the first two
        map every stripped line to ``None``, the third maps the text
        before the first space of each line to the text after it
    :raises ValueError: if a line of ``synonym.dic`` contains no space
    """
    def read_stripped_lines(file_name):
        # Read through a context manager so the handle is closed right
        # away instead of being left to the garbage collector.
        with open(path + file_name, 'r', encoding='utf-8') as handle:
            return [line.strip() for line in handle]

    jieba.load_userdict(path + 'default.dic')

    stop_dic = dict.fromkeys(read_stripped_lines('stop.dic'))
    single_dic = dict.fromkeys(read_stripped_lines('single.dic'))
    synonym_dic = dict(
        line.split(" ", 1) for line in read_stripped_lines('synonym.dic')
    )

    return stop_dic, single_dic, synonym_dic
开发者ID:YCG09,项目名称:tf-text-classification,代码行数:18,代码来源:seg_words.py

示例3: __init__

# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def __init__(self,
                 component_config=None,
                 ent_tagger=None,
                 session=None,
                 char_to_id=None,
                 id_to_tag=None):
        """Store the tagging resources and optionally load a jieba user dictionary.

        ``ent_tagger`` is the already-trained model. ``session``,
        ``char_to_id`` and ``id_to_tag`` are stored unchanged on the
        instance. When the component configuration carries a non-empty
        ``dictionary_path`` it is passed to ``jieba.load_userdict``.
        """
        super(BilstmCRFEntityExtractor, self).__init__(component_config)

        # Keep references to the trained model and its lookup tables.
        self.ent_tagger, self.session = ent_tagger, session
        self.char_to_id, self.id_to_tag = char_to_id, id_to_tag

        user_dict_path = self.component_config.get('dictionary_path')
        if user_dict_path:
            jieba.load_userdict(user_dict_path)

        # The jieba module itself serves as the segmenter.
        self.seg = jieba
开发者ID:GaoQ1,项目名称:rasa_nlu_gq,代码行数:20,代码来源:bilstm_crf_entity_extractor.py

示例4: do_text_analyze

# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def do_text_analyze(text):
    """Build an ``Article`` from ``text``, score it and return it.

    The terminology dictionary is loaded into jieba first. The article
    is then split into sentences, each wrapped in a ``Sentence`` and
    attached to the article, before the article's caching and scoring
    steps run in sequence. The total score is printed before the cache
    is cleaned up.
    """
    load_dictionary_to_cache()
    jieba.load_userdict("./resources/dict_terminology.txt")

    article = domain.article.Article(text)
    for fragment in article.split_into_sentences(text):
        article.sentences.append(
            domain.sentence.Sentence(article.article_id, fragment)
        )

    # Scoring pipeline; the steps run in this fixed order.
    article.cache_raw_seg()
    article.generate_sentence_brief()
    article.generate_sentence_score()
    article.generate_article_score()

    score_text = str(article.total_score)
    print("Article total score:" + score_text)

    article.clean_up_cache()
    return article
开发者ID:galaxyyao,项目名称:public-opinion-analysis,代码行数:19,代码来源:text_analyzer.py

示例5: jieba_tokenize

# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def jieba_tokenize(self, documents):
        '''Segment every document into a list of words.

        Words found in the collection returned by ``self.getchnSTW()``
        are dropped, as are tab and single-space tokens.

        # Arguments:
            documents: List of news(articles).

        # Returns:
            One list of kept words per document, in input order.
        '''
        stop_words = self.getchnSTW()
        jieba.load_userdict(self.finance_dict)

        blank_tokens = ('\t', ' ')
        return [
            [token for token in jieba.cut(text)
             if token not in stop_words and token not in blank_tokens]
            for text in documents
        ]
开发者ID:DemonDamon,项目名称:Listed-company-news-crawl-and-text-analysis,代码行数:20,代码来源:text_processing.py

示例6: cal_and_show_job_impression_hot_words

# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def cal_and_show_job_impression_hot_words(self, interviewee_comments_dir='../spider/impression'):
        """
        Calculate and show hot words of Job Impression.

        For every entry of ``interviewee_comments_dir`` the text returned
        by ``self.concat_all_text`` is reduced to its top 30 keywords with
        ``jieba.analyse.extract_tags``; the keyword weights are printed
        and rendered as a circular word cloud.

        :param interviewee_comments_dir: directory holding one entry per job
        :return: None
        """
        if not os.path.exists(interviewee_comments_dir) or len(os.listdir(interviewee_comments_dir)) == 0:
            print('Error! No valid content in {0}'.format(interviewee_comments_dir))
            # NOTE(review): exits with status 0 even though this is an
            # error path; kept unchanged for existing callers.
            sys.exit(0)

        import matplotlib.pyplot as plt

        # Loop-invariant setup: the stop words, the user dictionary and
        # the mask do not depend on the job, so they are prepared once
        # instead of on every iteration.
        jieba.analyse.set_stop_words(STOPWORDS_PATH)
        jieba.load_userdict(USER_CORPUS)

        # Circular mask: 255 outside a radius-130 disc centred at (150, 150).
        x, y = np.ogrid[:300, :300]
        mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
        mask = 255 * mask.astype(int)

        for entry in os.listdir(interviewee_comments_dir):
            text = self.concat_all_text(os.path.join(interviewee_comments_dir, entry))
            hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())

            frequencies = {word: weight for word, weight in hot_words_with_weights}

            print(frequencies)

            wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300, background_color="white",
                                  repeat=False,
                                  mask=mask)
            wordcloud.generate_from_frequencies(frequencies)

            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis("off")
            plt.show()
开发者ID:lucasxlu,项目名称:LagouJob,代码行数:37,代码来源:hot_words_generator.py

示例7: cal_and_show_jd_hot_words

# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def cal_and_show_jd_hot_words(self, jd_dir='../spider/jd'):
        """
        Calculate and show hot words of Job Description (JD).

        Every file in ``jd_dir`` is read with ``pandas.read_excel``; the
        '详情描述' column is concatenated, reduced to its top 30 keywords
        with ``jieba.analyse.extract_tags``, printed and rendered as a
        circular word cloud.

        :param jd_dir: directory holding the Excel files to analyse
        :return: None
        """
        if not os.path.exists(jd_dir) or len(os.listdir(jd_dir)) == 0:
            print('Error! No valid content in {0}'.format(jd_dir))
            # NOTE(review): exits with status 0 even though this is an
            # error path; kept unchanged for existing callers.
            sys.exit(0)

        import matplotlib.pyplot as plt

        # Keyed by the file name up to the first dot, so files sharing
        # that prefix collapse to a single entry (as in the original).
        jd_and_dir = {_.split('.')[0]: os.path.join(jd_dir, _) for _ in os.listdir(jd_dir)}

        # Loop-invariant setup: the stop words, the user dictionary and
        # the mask do not depend on the file, so they are prepared once
        # instead of on every iteration.
        jieba.analyse.set_stop_words(STOPWORDS_PATH)
        jieba.load_userdict(USER_CORPUS)

        # Circular mask: 255 outside a radius-130 disc centred at (150, 150).
        x, y = np.ogrid[:300, :300]
        mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
        mask = 255 * mask.astype(int)

        for excel_path in jd_and_dir.values():
            # Empty cells are read as NaN (a float) and would make
            # str.join raise TypeError; drop them and coerce the rest
            # to str before joining.
            descriptions = pd.read_excel(excel_path)['详情描述'].dropna().astype(str)
            text = "".join(descriptions)
            hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())

            frequencies = {word: weight for word, weight in hot_words_with_weights}

            print(frequencies)

            wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300, background_color="white",
                                  repeat=False,
                                  mask=mask)
            wordcloud.generate_from_frequencies(frequencies)

            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis("off")
            plt.show()
开发者ID:lucasxlu,项目名称:LagouJob,代码行数:37,代码来源:hot_words_generator.py

示例8: init_jieba

# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def init_jieba(self, seg_dic, userdic):

        """
        jieba custom setting.

        Selects ``seg_dic`` as jieba's main dictionary, loads ``userdic``
        as the user dictionary, and then calls ``jieba.suggest_freq`` on
        every non-empty line of ``userdic`` with tuning enabled.

        :param seg_dic: path of the main segmentation dictionary
        :param userdic: path of a UTF-8 user dictionary file
        """

        # The main dictionary has to be chosen first: set_dictionary
        # marks the tokenizer as uninitialised, so anything added by an
        # earlier load_userdict call is thrown away when the frequency
        # table is rebuilt from the new dictionary.
        jieba.set_dictionary(seg_dic)
        jieba.load_userdict(userdic)

        with open(userdic, 'r', encoding='utf-8') as user_dict_file:
            for line in user_dict_file:
                word = line.strip('\n')
                # Skip blank lines so an empty string is never registered.
                if word:
                    jieba.suggest_freq(word, True)
开发者ID:zake7749,项目名称:Chatbot,代码行数:14,代码来源:console.py

示例9: __init__

# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def __init__(self, file_corpus,file_userdict):
        """Remember the corpus path and load the jieba user dictionary.

        :param file_corpus: stored unchanged as ``self.file_corpus``
        :param file_userdict: passed to ``jieba.load_userdict``
        """
        self.file_corpus = file_corpus
        # The user dictionary is loaded into jieba's module-level
        # tokenizer; it is not stored on the instance.
        jieba.load_userdict(file_userdict)
开发者ID:cjymz886,项目名称:sentence-similarity,代码行数:5,代码来源:train_word2vec.py

示例10: __init__

# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def __init__(self):
        """Load the vocabulary vectors, the IDF table and the jieba user dictionary.

        Both loading steps are timed and the elapsed seconds are printed.
        ``file_voc``, ``file_idf`` and ``file_userdict`` are looked up
        outside this method.
        """
        started = time.time()
        self.voc = load_voc(file_voc)
        elapsed = time.time() - started
        print("Loading  word2vec vector cost %.3f seconds...\n" % elapsed)

        started = time.time()
        self.idf = load_idf(file_idf)
        elapsed = time.time() - started
        print("Loading  idf data cost %.3f seconds...\n" % elapsed)

        jieba.load_userdict(file_userdict)
开发者ID:cjymz886,项目名称:sentence-similarity,代码行数:10,代码来源:similarity.py

示例11: __init__

# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def __init__(self, stopwords_path="", userdict_path="", *args, **kwargs):
        """Initialise the parent class, optionally load a user dictionary, then reset.

        :param stopwords_path: forwarded to ``self.reset``
        :param userdict_path: path passed to ``jieba.load_userdict`` when
            it is non-empty and exists on disk
        :param args: forwarded to the parent initialiser
        :param kwargs: forwarded to the parent initialiser
        """
        super().__init__(*args, **kwargs)
        # A missing or empty dictionary path is skipped silently.
        if userdict_path and os.path.exists(userdict_path):
            # str() converts the path to a plain string before it is
            # handed to jieba.
            jieba.load_userdict(str(userdict_path))
        self.reset(stopwords_path)
开发者ID:hscspring,项目名称:Multi-Label-Text-Classification-for-Chinese,代码行数:7,代码来源:chinese.py

示例12: tf_idf

# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def tf_idf(texts):
    """Extract up to 15 keywords per text with jieba's TF-IDF extractor.

    The user dictionary, IDF table and stop-word list are loaded from
    fixed relative paths under ``./model/`` on every call.

    :param texts: iterable of strings to analyse
    :return: list with one ``filter(...)`` result per input text
    """
    jieba.load_userdict("./model/dict.txt")
    jieba.analyse.set_idf_path("./model/idf.txt")
    jieba.analyse.set_stop_words("./model/chinese_stopwords.txt")
    # Requests jieba's parallel mode with 8 worker processes.
    jieba.enable_parallel(8)

    # NOTE(review): `filter` is called with a single argument, but the
    # builtin filter() requires two. Presumably this refers to a
    # project-level helper defined elsewhere in the module; confirm,
    # otherwise this line raises TypeError.
    corpus = [filter(jieba.analyse.extract_tags(s, topK = 15)) for s in texts]
    return corpus
开发者ID:03pie,项目名称:SMPCUP2017,代码行数:10,代码来源:tf_idf.py

示例13: __init__

# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def __init__(self, dict_file, stop_word_file, sentences_folder, process_num):
        """Load the dictionaries and collect every file below ``sentences_folder``.

        :param dict_file: passed to ``jieba.load_userdict``
        :param stop_word_file: read with ``read_txt`` into ``self.stop_word``
        :param sentences_folder: root directory walked recursively for sentence files
        :param process_num: stored unchanged as ``self.process_num``
        """
        self.process_num = process_num
        self.stop_word = read_txt(stop_word_file)
        jieba.load_userdict(dict_file)

        # Full paths of all files found anywhere under the sentences folder.
        self.sentence_files = [
            os.path.join(folder, file_name)
            for folder, _, file_names in os.walk(sentences_folder)
            for file_name in file_names
        ]
开发者ID:xiaolalala,项目名称:Distant-Supervised-Chinese-Relation-Extraction,代码行数:11,代码来源:SentenceSegment.py

示例14: addDictionary

# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def addDictionary(self, dict_list):
        """
        Load every user-defined dictionary in ``dict_list`` into jieba.

        :param dict_list: iterable of values accepted by ``jieba.load_userdict``
        """
        # An explicit loop is required: in Python 3 map() is lazy, so the
        # former `map(lambda ...)` call was never consumed and loaded
        # nothing.
        for user_dict in dict_list:
            jieba.load_userdict(user_dict)
开发者ID:FesonX,项目名称:cn-text-classifier,代码行数:7,代码来源:cutWord.py

示例15: __init__

# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def __init__(self, component_config=None):
        # type: (Optional[Dict[Text, Text]]) -> None
        """Initialise the component and optionally load a jieba user dictionary.

        When the component configuration carries a non-empty
        ``dictionary_path`` it is passed to ``jieba.load_userdict``.
        """
        super(JiebaPsegExtractor, self).__init__(component_config)
        dictionary_path = self.component_config.get('dictionary_path')

        # Truthiness check rather than `is not None`: an empty-string
        # path would otherwise reach jieba and fail when it is opened.
        # This matches the check in the sibling BiLSTM-CRF extractor.
        if dictionary_path:
            jieba.load_userdict(dictionary_path)
开发者ID:GaoQ1,项目名称:rasa_nlu_gq,代码行数:9,代码来源:jieba_pseg_extractor.py


注:本文中的jieba.load_userdict方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。