This article collects typical usage examples of the jieba.analyse module in Python. If you are wondering how jieba.analyse is used in practice, the curated code examples below may help; you can also explore further usage examples from the jieba package it belongs to.
The 13 code examples below all use jieba.analyse and are sorted by popularity by default.
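Before the individual examples, here is a minimal, self-contained sketch of the two entry points most of them rely on: TF-IDF based jieba.analyse.extract_tags and TextRank based jieba.analyse.textrank. The sample sentence is arbitrary and only for illustration.
import jieba.analyse

text = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造"

# TF-IDF keyword extraction: top 5 keywords with their weights
print(jieba.analyse.extract_tags(text, topK=5, withWeight=True))

# TextRank keyword extraction: by default restricted to nouns and verbs
print(jieba.analyse.textrank(text, topK=5, withWeight=True))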
Example 1: get_tag
# Required import: import jieba [as alias]
# Or: from jieba import analyse [as alias]
def get_tag(sentence, config):
    """Get the semantic tag of a sentence."""
    iquestion = sentence.format(**config)
    try:
        keywords = analyse.extract_tags(iquestion, topK=1)
        keyword = keywords[0]
    except IndexError:
        keyword = iquestion
    tags = synonym_cut(keyword, 'wf')  # list of (word, flag) tuples
    if tags:
        tag = tags[0][1]
        if not tag:
            tag = keyword
    else:
        tag = keyword
    return tag
Example 2: get_keywords
# Required import: import jieba [as alias]
# Or: from jieba import analyse [as alias]
def get_keywords(cls, text, size=3):
    """Return the top `size` keywords of `text` using TextRank."""
    return jieba.analyse.textrank(text, topK=size)
Example 3: cal_and_show_job_impression_hot_words
# Required import: import jieba [as alias]
# Or: from jieba import analyse [as alias]
def cal_and_show_job_impression_hot_words(self, interviewee_comments_dir='../spider/impression'):
    """
    Calculate and show hot words of job impressions.
    :param interviewee_comments_dir: directory containing interviewee comment files
    :return:
    """
    if not os.path.exists(interviewee_comments_dir) or len(os.listdir(interviewee_comments_dir)) == 0:
        print('Error! No valid content in {0}'.format(interviewee_comments_dir))
        sys.exit(0)
    else:
        job_and_dir = {_: os.path.join(interviewee_comments_dir, _) for _ in os.listdir(interviewee_comments_dir)}

        for k, v in job_and_dir.items():
            text = self.concat_all_text(v)
            jieba.analyse.set_stop_words(STOPWORDS_PATH)
            jieba.load_userdict(USER_CORPUS)
            hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())
            frequencies = {_[0]: _[1] for _ in hot_words_with_weights}
            print(frequencies)
            # circular mask for the word cloud
            x, y = np.ogrid[:300, :300]
            mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
            mask = 255 * mask.astype(int)
            wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300, background_color="white",
                                  repeat=False, mask=mask)
            wordcloud.generate_from_frequencies(frequencies)
            import matplotlib.pyplot as plt
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis("off")
            plt.show()
Example 4: cal_and_show_jd_hot_words
# Required import: import jieba [as alias]
# Or: from jieba import analyse [as alias]
def cal_and_show_jd_hot_words(self, jd_dir='../spider/jd'):
    """
    Calculate and show hot words of job descriptions (JD).
    :param jd_dir: directory containing JD spreadsheets
    :return:
    """
    if not os.path.exists(jd_dir) or len(os.listdir(jd_dir)) == 0:
        print('Error! No valid content in {0}'.format(jd_dir))
        sys.exit(0)
    else:
        jd_and_dir = {_.split('.')[0]: os.path.join(jd_dir, _) for _ in os.listdir(jd_dir)}

        for k, v in jd_and_dir.items():
            text = "".join(pd.read_excel(v)['详情描述'])
            jieba.analyse.set_stop_words(STOPWORDS_PATH)
            jieba.load_userdict(USER_CORPUS)
            hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())
            frequencies = {_[0]: _[1] for _ in hot_words_with_weights}
            print(frequencies)
            # circular mask for the word cloud
            x, y = np.ogrid[:300, :300]
            mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
            mask = 255 * mask.astype(int)
            wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300, background_color="white",
                                  repeat=False, mask=mask)
            wordcloud.generate_from_frequencies(frequencies)
            import matplotlib.pyplot as plt
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis("off")
            plt.show()
Example 5: jieba_textrank
# Required import: import jieba [as alias]
# Or: from jieba import analyse [as alias]
def jieba_textrank(self):
    """
    Use TextRank in jieba to extract keywords from a sentence.
    """
    speech = input('Input a sentence: ')
    return jieba.analyse.textrank(speech, withWeight=True, topK=20)
Example 6: jieba_tf_idf
# Required import: import jieba [as alias]
# Or: from jieba import analyse [as alias]
def jieba_tf_idf(self):
    """
    Use TF-IDF in jieba to extract keywords from a sentence.
    """
    speech = input('Input a sentence: ')
    return jieba.analyse.extract_tags(speech, topK=20, withWeight=True)
Example 7: tfidf
# Required import: import jieba [as alias]
# Or: from jieba import analyse [as alias]
def tfidf(self) -> list:
    kw_with_weight = jieba.analyse.extract_tags(
        self.text, allowPOS=ALLOW_POS, withWeight=True)
    return self.standardize(kw_with_weight)
Example 8: textrank
# Required import: import jieba [as alias]
# Or: from jieba import analyse [as alias]
def textrank(self) -> list:
    kw_with_weight = jieba.analyse.textrank(
        self.text, allowPOS=ALLOW_POS, withWeight=True)
    return self.standardize(kw_with_weight)
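Examples 7 and 8 use an ALLOW_POS constant and a self.standardize helper that are project-specific and not shown on this page. The sketch below is only a hypothetical stand-in (min-max rescaling of the weights), not the original implementation.
# Hypothetical helpers assumed by Examples 7 and 8; names and behavior are guesses.
ALLOW_POS = ('ns', 'n', 'vn', 'v')  # assumed POS whitelist for keyword extraction

def standardize(kw_with_weight):
    """Rescale (keyword, weight) pairs so that the weights fall into [0, 1]."""
    if not kw_with_weight:
        return []
    weights = [w for _, w in kw_with_weight]
    low, high = min(weights), max(weights)
    span = (high - low) or 1.0  # avoid division by zero when all weights are equal
    return [(kw, (w - low) / span) for kw, w in kw_with_weight]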
Example 9: tf_idf
# Required import: import jieba [as alias]
# Or: from jieba import analyse [as alias]
def tf_idf(texts):
    jieba.load_userdict("./model/dict.txt")
    jieba.analyse.set_idf_path("./model/idf.txt")
    jieba.analyse.set_stop_words("./model/chinese_stopwords.txt")
    jieba.enable_parallel(8)
    # `filter` here is presumably a project-specific post-filter applied to the extracted tags
    corpus = [filter(jieba.analyse.extract_tags(s, topK=15)) for s in texts]
    return corpus
Example 10: segment_text
# Required import: import jieba [as alias]
# Or: from jieba import analyse [as alias]
def segment_text(text):
    # load user dict
    jieba.load_userdict(user_dict)
    # set stop words
    jieba.analyse.set_stop_words(stop_words)
    tags = jieba.analyse.extract_tags(text, topK=20, withWeight=True, allowPOS=())
    for tag in tags:
        print(str(tag[0]) + "\t" + str(tag[1]))
Example 11: sim_hash
# Required import: import jieba [as alias]
# Or: from jieba import analyse [as alias]
def sim_hash(content):
    seg = jieba.cut(content)
    keyWord = jieba.analyse.extract_tags('|'.join(seg), topK=20, withWeight=True, allowPOS=())
    # sorted by weight first, then by word
    keyList = []
    # print(keyWord)
    for feature, weight in keyWord:
        weight = int(weight * 20)
        feature = string_hash(feature)
        temp = []
        for f in feature:
            if f == '1':
                temp.append(weight)
            else:
                temp.append(-weight)
        keyList.append(temp)
    content_list = np.sum(np.array(keyList), axis=0)
    # nothing could be extracted (e.g. unreadable encoding)
    if len(keyList) == 0:
        return '00'
    simhash = ''
    for c in content_list:
        if c > 0:
            simhash = simhash + '1'
        else:
            simhash = simhash + '0'
    return simhash
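Example 11 relies on a string_hash helper that is not shown here. A common 64-bit variant that matches the `f == '1'` check above is sketched below; this is an assumption, not the original code.
# Hypothetical string_hash: maps a token to a fixed-length bit string,
# mimicking Python 2's string hashing as used in many simhash write-ups.
def string_hash(source, bits=64):
    if source == "":
        return '0' * bits
    mask = 2 ** bits - 1
    x = (ord(source[0]) << 7) & mask
    for ch in source:
        x = ((x * 1000003) ^ ord(ch)) & mask
    x ^= len(source)
    return bin(x)[2:].zfill(bits)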
Example 12: get_top_words
# Required import: import jieba [as alias]
# Or: from jieba import analyse [as alias]
def get_top_words(top, filename):
    topK = top
    content = open(filename, 'rb').read()
    tags = jieba.analyse.extract_tags(content, topK=topK)
    # items = str(tags).replace('u\'', '\'').decode("unicode-escape")
    return tags
Example 13: synonym_cut
# Required import: import jieba [as alias]
# Or: from jieba import analyse [as alias]
def synonym_cut(sentence, pattern="wf"):
    """Cut the sentence into a synonym vector tag.
    If a word in this sentence is not found in the synonym dictionary,
    it is marked with the default POS flag of the word segmentation tool.
    Args:
        pattern: 'w' - word segmentation, 'k' - single top keyword, 't' - keyword list,
            'wf' - (word, POS flag) pairs, 'tf' - (keyword, POS flag) pairs.
    """
    # strip trailing punctuation
    sentence = sentence.rstrip(''.join(punctuation_all))
    # strip trailing tone (modal) words
    sentence = sentence.rstrip(tone_words)
    synonym_vector = []
    if pattern == "w":
        synonym_vector = [item for item in jieba.cut(sentence) if item not in filter_characters]
    elif pattern == "k":
        synonym_vector = analyse.extract_tags(sentence, topK=1)
    elif pattern == "t":
        synonym_vector = analyse.extract_tags(sentence, topK=10)
    elif pattern == "wf":
        result = posseg.cut(sentence)
        # synonym_vector = [(item.word, item.flag) for item in result \
        #     if item.word not in filter_characters]
        # Modified on 2017-04-27
        for item in result:
            if item.word not in filter_characters:
                if len(item.flag) < 4:
                    item.flag = list(posseg.cut(item.word))[0].flag
                synonym_vector.append((item.word, item.flag))
    elif pattern == "tf":
        result = posseg.cut(sentence)
        tags = analyse.extract_tags(sentence, topK=10)
        for item in result:
            if item.word in tags:
                synonym_vector.append((item.word, item.flag))
    return synonym_vector
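A quick usage sketch for synonym_cut, assuming the surrounding module already defines punctuation_all, tone_words and filter_characters and imports analyse and posseg from jieba; the sample sentence is arbitrary.
sentence = "今天北京的天气怎么样呢"
print(synonym_cut(sentence, "w"))   # plain segmentation, filtered
print(synonym_cut(sentence, "k"))   # single top keyword
print(synonym_cut(sentence, "wf"))  # (word, POS flag) pairs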