本文整理汇总了Python中jieba.load_userdict方法的典型用法代码示例。如果您正苦于以下问题:Python jieba.load_userdict方法的具体用法?Python jieba.load_userdict怎么用?Python jieba.load_userdict使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块jieba的用法示例。
在下文中一共展示了jieba.load_userdict方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: process_data
# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def process_data(train_file, user_dict=None, stop_dict=None):
    """Segment every line of a text file with jieba and drop stop words.

    Args:
        train_file: Path to a UTF-8 text file; each line is segmented on its own.
        user_dict: Optional path to a jieba user dictionary (it has to follow
            jieba's user-dictionary format). Skipped when falsy.
        stop_dict: Optional path to a UTF-8 stop-word file with one stop word
            per line. Skipped when falsy.

    Returns:
        A list holding one token list per input line, with stop words and
        whitespace-only tokens removed.
    """
    if user_dict:
        jieba.load_userdict(user_dict)

    # A set keeps the per-token membership test below O(1) instead of O(n).
    stop_words = set()
    if stop_dict:
        with open(stop_dict, 'r', encoding='utf-8') as stop_file:
            stop_words = {line.strip() for line in stop_file}

    # Stream the corpus line by line rather than materialising it first.
    with open(train_file, 'r', encoding='utf-8') as corpus_file:
        sentences = [jieba.lcut(line.strip()) for line in corpus_file]

    return [
        [token for token in sentence if token.strip() and token not in stop_words]
        for sentence in sentences
    ]
示例2: load_dict
# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def load_dict(path):
    """
    Load the jieba user dictionary and the auxiliary word tables.

    ``path`` is used as a plain string prefix: the file names ``default.dic``,
    ``stop.dic``, ``single.dic`` and ``synonym.dic`` are appended with ``+``,
    so a directory path needs its trailing separator.

    Returns a tuple ``(stop_dic, single_dic, synonym_dic)``. The first two map
    every stripped line of their file to ``None``; ``synonym_dic`` maps the
    text before the first space of each line to the remainder of that line.
    """
    def _read_stripped(name):
        # ``with`` guarantees the handle is closed; the original left all
        # three files open.
        with open(path + name, 'r', encoding='utf-8') as handle:
            return [line.strip() for line in handle]

    jieba.load_userdict(path + 'default.dic')
    stop_dic = dict.fromkeys(_read_stripped('stop.dic'))
    single_dic = dict.fromkeys(_read_stripped('single.dic'))
    # Blank lines (e.g. a trailing newline at the end of the file) cannot be
    # split into a key/value pair and would make dict() raise ValueError.
    synonym_dic = dict(
        entry.split(" ", 1) for entry in _read_stripped('synonym.dic') if entry
    )
    return stop_dic, single_dic, synonym_dic
示例3: __init__
# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def __init__(self,
             component_config=None,
             ent_tagger=None,
             session=None,
             char_to_id=None,
             id_to_tag=None):
    """Store the tagger state and set up jieba as the segmenter.

    The base class is initialised with ``component_config``. When the
    resulting ``self.component_config`` has a truthy ``'dictionary_path'``
    entry, that file is loaded as a jieba user dictionary.
    """
    super(BilstmCRFEntityExtractor, self).__init__(component_config)

    # ent_tagger refers to the already trained model.
    self.ent_tagger = ent_tagger
    self.session = session
    self.char_to_id = char_to_id
    self.id_to_tag = id_to_tag

    user_dict_path = self.component_config.get('dictionary_path')
    if user_dict_path:
        jieba.load_userdict(user_dict_path)

    # The jieba module itself serves as the segmenter object.
    self.seg = jieba
示例4: do_text_analyze
# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def do_text_analyze(text):
    """Build an ``Article`` from ``text``, score it and return it.

    The dictionary cache and the jieba terminology dictionary are loaded
    first. The text is then split into sentences, each one is wrapped in a
    ``Sentence`` and attached to the article, and the article's
    brief/score generation steps run in order. The total score is printed
    before the article's cache is cleaned up.
    """
    load_dictionary_to_cache()
    jieba.load_userdict("./resources/dict_terminology.txt")

    article = domain.article.Article(text)
    article.sentences.extend(
        domain.sentence.Sentence(article.article_id, raw)
        for raw in article.split_into_sentences(text)
    )

    article.cache_raw_seg()
    article.generate_sentence_brief()
    article.generate_sentence_score()
    article.generate_article_score()

    score_text = str(article.total_score)
    print("Article total score:" + score_text)

    article.clean_up_cache()
    return article
示例5: jieba_tokenize
# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def jieba_tokenize(self,documents):
    '''Cut the documents into a sequence of independent words.

    Tokens found in the collection returned by ``self.getchnSTW()``, as well
    as tab and single-space tokens, are dropped.

    # Arguments:
        documents: List of news(articles).

    # Returns:
        One list of kept tokens per document, in input order.
    '''
    stop_words = self.getchnSTW()
    jieba.load_userdict(self.finance_dict)

    blanks = ('\t', ' ')
    return [
        [
            token
            for token in jieba.cut(document)
            if token not in stop_words and token not in blanks
        ]
        for document in documents
    ]
示例6: cal_and_show_job_impression_hot_words
# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def cal_and_show_job_impression_hot_words(self, interviewee_comments_dir='../spider/impression'):
    """
    calculate and show hot words of Job Impression

    One word cloud is displayed per entry of ``interviewee_comments_dir``.
    The process exits (status 0, after printing an error message) when the
    directory is missing or empty.

    :param interviewee_comments_dir: directory whose entries are each passed,
        joined with the directory path, to ``self.concat_all_text``
    :return: None
    """
    # List the directory once; the original listed it up to three times.
    entries = os.listdir(interviewee_comments_dir) if os.path.exists(interviewee_comments_dir) else []
    if not entries:
        print('Error! No valid content in {0}'.format(interviewee_comments_dir))
        sys.exit(0)

    import matplotlib.pyplot as plt

    # Loop-invariant setup: the stop words and user corpus only need to be
    # registered once, not once per job.
    jieba.analyse.set_stop_words(STOPWORDS_PATH)
    jieba.load_userdict(USER_CORPUS)

    # 300x300 mask: 255 outside a radius-130 circle centred at (150, 150),
    # 0 inside it.
    x, y = np.ogrid[:300, :300]
    mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
    mask = 255 * mask.astype(int)

    for entry in entries:
        text = self.concat_all_text(os.path.join(interviewee_comments_dir, entry))
        hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())
        frequencies = {word: weight for word, weight in hot_words_with_weights}
        print(frequencies)

        wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300, background_color="white",
                              repeat=False,
                              mask=mask)
        wordcloud.generate_from_frequencies(frequencies)

        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()
示例7: cal_and_show_jd_hot_words
# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def cal_and_show_jd_hot_words(self, jd_dir='../spider/jd'):
    """
    calculate and show hot words of Job Description (JD)

    One word cloud is displayed per distinct file stem (the part of the file
    name before the first ``.``) found in ``jd_dir``. The process exits
    (status 0, after printing an error message) when the directory is
    missing or empty.

    :param jd_dir: directory of Excel files; the '详情描述' column of each file
        is concatenated into the text to analyse
    :return: None
    """
    # List the directory once; the original listed it up to three times.
    entries = os.listdir(jd_dir) if os.path.exists(jd_dir) else []
    if not entries:
        print('Error! No valid content in {0}'.format(jd_dir))
        sys.exit(0)

    import matplotlib.pyplot as plt

    # Loop-invariant setup: the stop words and user corpus only need to be
    # registered once, not once per file.
    jieba.analyse.set_stop_words(STOPWORDS_PATH)
    jieba.load_userdict(USER_CORPUS)

    # 300x300 mask: 255 outside a radius-130 circle centred at (150, 150),
    # 0 inside it.
    x, y = np.ogrid[:300, :300]
    mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
    mask = 255 * mask.astype(int)

    # Keyed by file stem, so files sharing a stem collapse to the last one
    # listed (same as before).
    jd_and_dir = {name.split('.')[0]: os.path.join(jd_dir, name) for name in entries}
    for jd_path in jd_and_dir.values():
        text = "".join(pd.read_excel(jd_path)['详情描述'])
        hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())
        frequencies = {word: weight for word, weight in hot_words_with_weights}
        print(frequencies)

        wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300, background_color="white",
                              repeat=False,
                              mask=mask)
        wordcloud.generate_from_frequencies(frequencies)

        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()
示例8: init_jieba
# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def init_jieba(self, seg_dic, userdic):
    """
    jieba custom setting.

    Selects ``seg_dic`` as jieba's main dictionary, loads ``userdic`` as a
    user dictionary, then passes every non-blank line of ``userdic`` to
    ``jieba.suggest_freq(..., True)``.

    :param seg_dic: path of the main dictionary file
    :param userdic: path of the UTF-8 user dictionary file
    """
    # set_dictionary() must come first: it resets jieba's initialized state,
    # so user words loaded before it are thrown away when the main
    # dictionary is (re)loaded.
    jieba.set_dictionary(seg_dic)
    jieba.load_userdict(userdic)

    with open(userdic, 'r', encoding='utf-8') as dict_file:
        for line in dict_file:
            word = line.strip('\n')
            # Skip blank lines so an empty "word" is never registered.
            if not word.strip():
                continue
            jieba.suggest_freq(word, True)
示例9: __init__
# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def __init__(self, file_corpus,file_userdict):
    """Remember the corpus file and load the jieba user dictionary.

    ``file_corpus`` is only stored on the instance here; ``file_userdict``
    is handed straight to ``jieba.load_userdict``.
    """
    self.file_corpus = file_corpus

    # Registers the custom words with jieba's module-level tokenizer.
    jieba.load_userdict(file_userdict)
示例10: __init__
# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def __init__(self):
    """Load the vocabulary and idf data, then the jieba user dictionary.

    Each of the two loads is timed and the elapsed time is printed. The
    file locations come from the module-level names ``file_voc``,
    ``file_idf`` and ``file_userdict``.
    """
    started = time.time()
    self.voc=load_voc(file_voc)
    elapsed = time.time() - started
    print("Loading word2vec vector cost %.3f seconds...\n" % elapsed)

    started = time.time()
    self.idf=load_idf(file_idf)
    elapsed = time.time() - started
    print("Loading idf data cost %.3f seconds...\n" % elapsed)

    jieba.load_userdict(file_userdict)
示例11: __init__
# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def __init__(self, stopwords_path="", userdict_path="", *args, **kwargs):
    """Initialise the base class, optionally load a user dictionary, reset.

    The user dictionary is loaded only when ``userdict_path`` is truthy and
    exists on disk. ``stopwords_path`` is forwarded to ``self.reset``.
    Remaining positional and keyword arguments go to the base class.
    """
    super().__init__(*args, **kwargs)

    if userdict_path:
        if os.path.exists(userdict_path):
            # str() so that path-like objects are accepted as well.
            jieba.load_userdict(str(userdict_path))

    self.reset(stopwords_path)
示例12: tf_idf
# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def tf_idf(texts):
    """Extract up to 15 keywords per text with jieba's TF-IDF extractor.

    jieba is configured with the user dictionary, idf table and stop-word
    list found under ``./model/``, and parallel mode is enabled with 8
    processes. Returns one ``filter(...)`` result per input text.
    """
    jieba.load_userdict("./model/dict.txt")
    jieba.analyse.set_idf_path("./model/idf.txt")
    jieba.analyse.set_stop_words("./model/chinese_stopwords.txt")
    jieba.enable_parallel(8)

    corpus = []
    for text in texts:
        keywords = jieba.analyse.extract_tags(text, topK = 15)
        # NOTE(review): `filter` is called with a single argument, so it is
        # presumably a project-level helper shadowing the builtin (the
        # builtin would raise TypeError here) -- confirm.
        corpus.append(filter(keywords))
    return corpus
示例13: __init__
# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def __init__(self, dict_file, stop_word_file, sentences_folder, process_num):
    """Load the dictionaries and collect every file under a folder.

    ``stop_word_file`` is read through ``read_txt`` and ``dict_file`` is
    loaded as a jieba user dictionary. ``sentences_folder`` is walked
    recursively and the path of every file found is stored in
    ``self.sentence_files``.
    """
    self.process_num = process_num
    self.stop_word = read_txt(stop_word_file)
    jieba.load_userdict(dict_file)

    # sentence files: every file below sentences_folder, in os.walk order
    self.sentence_files = [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(sentences_folder)
        for name in names
    ]
开发者ID:xiaolalala,项目名称:Distant-Supervised-Chinese-Relation-Extraction,代码行数:11,代码来源:SentenceSegment.py
示例14: addDictionary
# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def addDictionary(self, dict_list):
    """
    Add a list of user-defined dictionaries.

    Every path in ``dict_list`` is loaded with ``jieba.load_userdict``.
    Returns ``None``.
    """
    # An explicit loop is required: in Python 3 ``map`` is lazy, so the
    # original ``map(lambda ...)`` call never loaded any dictionary.
    for dict_path in dict_list:
        jieba.load_userdict(dict_path)
示例15: __init__
# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import load_userdict [as 别名]
def __init__(self, component_config=None):
    # type: (Optional[Dict[Text, Text]]) -> None
    """Initialise the base component and load an optional user dictionary.

    The dictionary is loaded whenever ``self.component_config`` holds a
    ``'dictionary_path'`` entry that is not ``None``.
    """
    super(JiebaPsegExtractor, self).__init__(component_config)

    user_dict_path = self.component_config.get('dictionary_path')
    if user_dict_path is None:
        return
    jieba.load_userdict(user_dict_path)