當前位置: 首頁>>代碼示例>>Python>>正文


Python jieba.load_userdict方法代碼示例

本文整理匯總了Python中jieba.load_userdict方法的典型用法代碼示例。如果您正苦於以下問題:Python jieba.load_userdict方法的具體用法?Python jieba.load_userdict怎麼用?Python jieba.load_userdict使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在jieba的用法示例。


在下文中一共展示了jieba.load_userdict方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: process_data

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import load_userdict [as 別名]
def process_data(train_file, user_dict=None, stop_dict=None):
    """Tokenize a text file with jieba, producing one token list per line.

    Args:
        train_file: Path to a UTF-8 text file; each line is segmented
            separately.
        user_dict: Optional path to a custom dictionary handed to
            ``jieba.load_userdict`` (must follow jieba's user-dictionary
            format).
        stop_dict: Optional path to a UTF-8 stop-word file with one stop
            word per line.

    Returns:
        A list with one entry per line of ``train_file``; each entry is the
        list of tokens that remain after dropping stop words and
        whitespace-only tokens.
    """
    if user_dict:
        jieba.load_userdict(user_dict)

    # A set keeps the per-token stop-word lookup O(1) instead of scanning a
    # list for every token.
    stop_words = set()
    if stop_dict:
        with open(stop_dict, 'r', encoding='utf-8') as file:
            stop_words = {line.strip() for line in file}

    # Segment line by line; the file is iterated directly instead of being
    # read into an intermediate list first.
    with open(train_file, 'r', encoding='utf-8') as file:
        sentences = [jieba.lcut(line.strip()) for line in file]

    return [
        [token for token in sentence
         if token not in stop_words and token.strip() != '']
        for sentence in sentences
    ]
開發者ID:msgi,項目名稱:nlp-journey,代碼行數:20,代碼來源:pre_process.py

示例2: load_dict

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import load_userdict [as 別名]
def load_dict(path):
    """
    Load the jieba user dictionary and the auxiliary word tables.

    ``path`` is used as a plain string prefix: the file names are appended
    to it directly, so it normally has to end with a path separator.

    Args:
        path: Prefix of the directory holding ``default.dic``, ``stop.dic``,
            ``single.dic`` and ``synonym.dic``.

    Returns:
        A ``(stop_dic, single_dic, synonym_dic)`` tuple. The first two map
        every stripped line of their file to ``None``; ``synonym_dic`` maps
        the text before the first space of each line to the text after it.

    Raises:
        ValueError: If a line of ``synonym.dic`` contains no space (it
            cannot be split into a key/value pair).
    """
    jieba.load_userdict(path + 'default.dic')

    # Each file is read inside a ``with`` block so its handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(path + 'stop.dic', 'r', encoding='utf-8') as stop_file:
        stop_dic = dict.fromkeys(line.strip() for line in stop_file)

    with open(path + 'single.dic', 'r', encoding='utf-8') as single_file:
        single_dic = dict.fromkeys(line.strip() for line in single_file)

    with open(path + 'synonym.dic', 'r', encoding='UTF-8') as synonym_file:
        synonym_dic = dict(line.strip().split(" ", 1) for line in synonym_file)

    return stop_dic, single_dic, synonym_dic
開發者ID:YCG09,項目名稱:tf-text-classification,代碼行數:18,代碼來源:seg_words.py

示例3: __init__

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import load_userdict [as 別名]
def __init__(self,
                 component_config=None,
                 ent_tagger=None,
                 session=None,
                 char_to_id=None,
                 id_to_tag=None):
        """Store the tagging resources and set up jieba as the segmenter.

        Args:
            component_config: Component configuration forwarded to the
                parent initializer; its ``dictionary_path`` entry, when
                truthy, is loaded as a jieba user dictionary.
            ent_tagger: The already-trained tagging model.
            session: Stored as ``self.session``.
            char_to_id: Stored as ``self.char_to_id``.
            id_to_tag: Stored as ``self.id_to_tag``.
        """
        super(BilstmCRFEntityExtractor, self).__init__(component_config)

        # ent_tagger is the trained model.
        self.ent_tagger, self.session = ent_tagger, session
        self.char_to_id, self.id_to_tag = char_to_id, id_to_tag

        user_dict_path = self.component_config.get('dictionary_path')
        if user_dict_path:
            jieba.load_userdict(user_dict_path)

        # The jieba module itself serves as the segmenter object.
        self.seg = jieba
開發者ID:GaoQ1,項目名稱:rasa_nlu_gq,代碼行數:20,代碼來源:bilstm_crf_entity_extractor.py

示例4: do_text_analyze

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import load_userdict [as 別名]
def do_text_analyze(text):
    """Build an ``Article`` from raw text and run its scoring steps.

    Loads the dictionary cache and the terminology user dictionary, splits
    the text into sentences, attaches one ``Sentence`` object per raw
    sentence, then runs the article's cache, brief and scoring methods in
    order and prints the total score.

    Args:
        text: Raw article text to analyze.

    Returns:
        The populated ``domain.article.Article`` instance.
    """
    load_dictionary_to_cache()
    # NOTE(review): the user dictionary is re-loaded on every call and the
    # path is relative to the current working directory.
    jieba.load_userdict("./resources/dict_terminology.txt")

    article = domain.article.Article(text)
    raw_sentences = article.split_into_sentences(text)
    for raw_sentence in raw_sentences:
        # Each sentence is tied to its article through the article id.
        sentence = domain.sentence.Sentence(article.article_id, raw_sentence)
        article.sentences.append(sentence)

    # The steps below run in this fixed order; presumably each one relies on
    # the results of the previous ones — confirm before reordering.
    article.cache_raw_seg()
    article.generate_sentence_brief()
    article.generate_sentence_score()
    article.generate_article_score()
    print("Article total score:" + str(article.total_score))
    article.clean_up_cache()
    return article 
開發者ID:galaxyyao,項目名稱:public-opinion-analysis,代碼行數:19,代碼來源:text_analyzer.py

示例5: jieba_tokenize

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import load_userdict [as 別名]
def jieba_tokenize(self, documents):
        '''Segment every document into a list of words.

        Stop words (from ``self.getchnSTW()``) as well as tab and
        single-space tokens are dropped from the result.

        # Arguments:
            documents: List of news(articles).

        # Returns:
            One list of kept words per input document, in input order.
        '''
        stop_words = self.getchnSTW()
        jieba.load_userdict(self.finance_dict)

        # Separator tokens that never count as words.
        separators = ('\t', ' ')
        return [
            [token for token in jieba.cut(item_text)
             if token not in stop_words and token not in separators]
            for item_text in documents
        ]
開發者ID:DemonDamon,項目名稱:Listed-company-news-crawl-and-text-analysis,代碼行數:20,代碼來源:text_processing.py

示例6: cal_and_show_job_impression_hot_words

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import load_userdict [as 別名]
def cal_and_show_job_impression_hot_words(self, interviewee_comments_dir='../spider/impression'):
        """
        Calculate and show hot words of Job Impression.

        For every entry of ``interviewee_comments_dir`` the text is gathered
        with ``self.concat_all_text``, the top 30 keywords are extracted
        with jieba, the frequencies are printed and a circular word cloud
        is displayed.

        :param interviewee_comments_dir: directory with one entry per job
        :return: None; the process exits (status 0) when the directory is
            missing or empty
        """
        # List the directory once instead of once for the check and once
        # more for the iteration.
        entries = os.listdir(interviewee_comments_dir) if os.path.exists(interviewee_comments_dir) else []
        if not entries:
            print('Error! No valid content in {0}'.format(interviewee_comments_dir))
            # NOTE(review): exit status 0 on an error path is kept for
            # backward compatibility; a non-zero status may be intended.
            sys.exit(0)

        import matplotlib.pyplot as plt

        # The jieba configuration and the mask do not depend on the entry
        # being processed, so they are set up once before the loop.
        jieba.analyse.set_stop_words(STOPWORDS_PATH)
        jieba.load_userdict(USER_CORPUS)

        # Circular mask: 255 outside a radius-130 circle centred at
        # (150, 150), 0 inside it.
        x, y = np.ogrid[:300, :300]
        mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
        mask = 255 * mask.astype(int)

        for entry in entries:
            text = self.concat_all_text(os.path.join(interviewee_comments_dir, entry))
            hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())

            frequencies = {word: weight for word, weight in hot_words_with_weights}

            print(frequencies)

            wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300, background_color="white",
                                  repeat=False,
                                  mask=mask)
            wordcloud.generate_from_frequencies(frequencies)

            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis("off")
            plt.show()
開發者ID:lucasxlu,項目名稱:LagouJob,代碼行數:37,代碼來源:hot_words_generator.py

示例7: cal_and_show_jd_hot_words

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import load_userdict [as 別名]
def cal_and_show_jd_hot_words(self, jd_dir='../spider/jd'):
        """
        Calculate and show hot words of Job Description (JD).

        Every file in ``jd_dir`` is read with ``pandas.read_excel``; the
        description column is concatenated, the top 30 keywords are
        extracted with jieba, the frequencies are printed and a circular
        word cloud is displayed.

        :param jd_dir: directory containing the JD spreadsheets
        :return: None; the process exits (status 0) when the directory is
            missing or empty
        """
        # List the directory once instead of once for the check and once
        # more for the iteration.
        entries = os.listdir(jd_dir) if os.path.exists(jd_dir) else []
        if not entries:
            print('Error! No valid content in {0}'.format(jd_dir))
            # NOTE(review): exit status 0 on an error path is kept for
            # backward compatibility; a non-zero status may be intended.
            sys.exit(0)

        import matplotlib.pyplot as plt

        # Keyed by the file name up to its first dot, so files sharing that
        # stem collapse to the last one listed (same as before).
        jd_and_dir = {name.split('.')[0]: os.path.join(jd_dir, name) for name in entries}

        # The jieba configuration and the mask do not depend on the file
        # being processed, so they are set up once before the loop.
        jieba.analyse.set_stop_words(STOPWORDS_PATH)
        jieba.load_userdict(USER_CORPUS)

        # Circular mask: 255 outside a radius-130 circle centred at
        # (150, 150), 0 inside it.
        x, y = np.ogrid[:300, :300]
        mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
        mask = 255 * mask.astype(int)

        for jd_path in jd_and_dir.values():
            # NOTE(review): confirm this column name matches the header in
            # the spreadsheets exactly (including character variant).
            text = "".join(pd.read_excel(jd_path)['詳情描述'])
            hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())

            frequencies = {word: weight for word, weight in hot_words_with_weights}

            print(frequencies)

            wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300, background_color="white",
                                  repeat=False,
                                  mask=mask)
            wordcloud.generate_from_frequencies(frequencies)

            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis("off")
            plt.show()
開發者ID:lucasxlu,項目名稱:LagouJob,代碼行數:37,代碼來源:hot_words_generator.py

示例8: init_jieba

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import load_userdict [as 別名]
def init_jieba(self, seg_dic, userdic):

        """
        jieba custom setting.

        Selects ``seg_dic`` as jieba's main dictionary, loads ``userdic`` as
        a user dictionary and then tunes the frequency of every non-blank
        line of ``userdic`` so that it is kept as one token.

        :param seg_dic: path of the main segmentation dictionary
        :param userdic: path of the UTF-8 user dictionary, one entry per line
        """

        # The main dictionary has to be chosen first: set_dictionary() marks
        # the tokenizer as uninitialized, and the following initialization
        # rebuilds the frequency table from the main dictionary, which would
        # throw away any user words loaded before it.
        jieba.set_dictionary(seg_dic)
        jieba.load_userdict(userdic)
        with open(userdic, 'r', encoding='utf-8') as user_file:
            for line in user_file:
                word = line.strip('\n')
                # Blank lines carry no word and must not be registered.
                if not word.strip():
                    continue
                jieba.suggest_freq(word, True)
開發者ID:zake7749,項目名稱:Chatbot,代碼行數:14,代碼來源:console.py

示例9: __init__

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import load_userdict [as 別名]
def __init__(self, file_corpus,file_userdict):
        """Remember the corpus path and load the jieba user dictionary.

        Args:
            file_corpus: Stored as ``self.file_corpus`` for later use.
            file_userdict: Path passed to ``jieba.load_userdict``.
        """
        self.file_corpus = file_corpus
        # Loading the user dictionary changes jieba's module-level state.
        jieba.load_userdict(file_userdict) 
開發者ID:cjymz886,項目名稱:sentence-similarity,代碼行數:5,代碼來源:train_word2vec.py

示例10: __init__

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import load_userdict [as 別名]
def __init__(self):
        """Load the vocabulary vectors, the IDF table and the user dictionary.

        The paths come from the module-level names ``file_voc``,
        ``file_idf`` and ``file_userdict``; the time spent on each of the
        first two loads is printed.
        """
        started = time.time()
        self.voc = load_voc(file_voc)
        elapsed = time.time() - started
        print("Loading  word2vec vector cost %.3f seconds...\n" % elapsed)

        started = time.time()
        self.idf = load_idf(file_idf)
        elapsed = time.time() - started
        print("Loading  idf data cost %.3f seconds...\n" % elapsed)

        jieba.load_userdict(file_userdict)
開發者ID:cjymz886,項目名稱:sentence-similarity,代碼行數:10,代碼來源:similarity.py

示例11: __init__

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import load_userdict [as 別名]
def __init__(self, stopwords_path="", userdict_path="", *args, **kwargs):
        """Initialize the parent, optionally load a user dictionary, then reset.

        Args:
            stopwords_path: Passed on to ``self.reset``.
            userdict_path: Path of a jieba user dictionary; it is loaded
                only when the value is truthy and the path exists.
            *args: Forwarded to the parent initializer.
            **kwargs: Forwarded to the parent initializer.
        """
        super().__init__(*args, **kwargs)

        # A missing or empty path is skipped silently.
        user_dict_available = bool(userdict_path) and os.path.exists(userdict_path)
        if user_dict_available:
            jieba.load_userdict(str(userdict_path))

        self.reset(stopwords_path)
開發者ID:hscspring,項目名稱:Multi-Label-Text-Classification-for-Chinese,代碼行數:7,代碼來源:chinese.py

示例12: tf_idf

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import load_userdict [as 別名]
def tf_idf(texts):
    """Extract up to 15 keywords per text with jieba's TF-IDF extractor.

    Configures jieba with the user dictionary, IDF table and stop-word list
    from ``./model`` before extracting.

    Args:
        texts: Iterable of strings to extract keywords from.

    Returns:
        A list with one entry per text: the result of applying ``filter``
        to that text's extracted tags.
    """
    jieba.load_userdict("./model/dict.txt")
    jieba.analyse.set_idf_path("./model/idf.txt")
    jieba.analyse.set_stop_words("./model/chinese_stopwords.txt")
    # NOTE(review): the configuration above and this call are repeated on
    # every invocation; 8 is presumably the number of worker processes.
    jieba.enable_parallel(8)

    # NOTE(review): the built-in ``filter`` needs two arguments, so this
    # one-argument call only works if ``filter`` is a project helper defined
    # or imported elsewhere in the module — confirm.
    corpus = [filter(jieba.analyse.extract_tags(s, topK = 15)) for s in texts]
    return corpus 
開發者ID:03pie,項目名稱:SMPCUP2017,代碼行數:10,代碼來源:tf_idf.py

示例13: __init__

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import load_userdict [as 別名]
def __init__(self, dict_file, stop_word_file, sentences_folder, process_num):
        """Load the word resources and collect every sentence file.

        Args:
            dict_file: Path passed to ``jieba.load_userdict``.
            stop_word_file: Path read with ``read_txt`` into
                ``self.stop_word``.
            sentences_folder: Directory walked recursively; every file found
                is recorded in ``self.sentence_files``.
            process_num: Stored as ``self.process_num``.
        """
        self.process_num = process_num
        self.stop_word = read_txt(stop_word_file)
        jieba.load_userdict(dict_file)

        # Full paths of all files below sentences_folder, in walk order.
        self.sentence_files = [
            os.path.join(folder, file_name)
            for folder, _subdirs, file_names in os.walk(sentences_folder)
            for file_name in file_names
        ]
開發者ID:xiaolalala,項目名稱:Distant-Supervised-Chinese-Relation-Extraction,代碼行數:11,代碼來源:SentenceSegment.py

示例14: addDictionary

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import load_userdict [as 別名]
def addDictionary(self, dict_list):
        """
        Load every user-defined dictionary in ``dict_list`` into jieba.

        :param dict_list: iterable of user-dictionary paths
        :return: None
        """
        # An explicit loop is required: in Python 3 ``map`` is lazy, so a
        # discarded ``map(...)`` call never invokes load_userdict at all.
        for dict_path in dict_list:
            jieba.load_userdict(dict_path)
開發者ID:FesonX,項目名稱:cn-text-classifier,代碼行數:7,代碼來源:cutWord.py

示例15: __init__

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import load_userdict [as 別名]
def __init__(self, component_config=None):
        # type: (Optional[Dict[Text, Text]]) -> None
        """Initialize the component and load an optional jieba user dictionary.

        The ``dictionary_path`` entry of the component configuration is
        passed to ``jieba.load_userdict`` whenever it is not ``None``.
        """
        super(JiebaPsegExtractor, self).__init__(component_config)
        dictionary_path = self.component_config.get('dictionary_path')

        # NOTE(review): only None is skipped here, so an empty-string path
        # is still handed to jieba — confirm that is intended.
        if dictionary_path is not None:
            jieba.load_userdict(dictionary_path) 
開發者ID:GaoQ1,項目名稱:rasa_nlu_gq,代碼行數:9,代碼來源:jieba_pseg_extractor.py


注:本文中的jieba.load_userdict方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。