

Python jieba.analyse Code Examples

This article collects typical usage examples of jieba.analyse in Python (strictly speaking, analyse is a submodule of jieba rather than a method). If you are wondering what jieba.analyse does, how to use it, or want ready-made examples, the curated code snippets below may help. You can also explore further usage examples of the jieba package from which they are drawn.


Below are 13 code examples of jieba.analyse, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
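
Before the examples, a minimal self-contained quickstart (not taken from any project below; the demo sentence is the one used in jieba's own README) showing the two keyword extractors that jieba.analyse provides:

import jieba.analyse

text = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造"

# TF-IDF based extraction: topK limits how many keywords are returned.
print(jieba.analyse.extract_tags(text, topK=5, withWeight=True))

# TextRank based extraction: by default only nouns and verbs
# (allowPOS=('ns', 'n', 'vn', 'v')) enter the ranking graph.
print(jieba.analyse.textrank(text, topK=5, withWeight=True))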

Example 1: get_tag

# Required import: import jieba [as alias]
# Or: from jieba import analyse [as alias]
def get_tag(sentence, config):
    """Get the semantic tag of a sentence."""
    iquestion = sentence.format(**config)
    try:
        keywords = analyse.extract_tags(iquestion, topK=1)
        keyword = keywords[0]
    except IndexError:
        # No keyword could be extracted; fall back to the whole question.
        keyword = iquestion
    tags = synonym_cut(keyword, 'wf')  # list of (word, flag) tuples
    if tags:
        tag = tags[0][1]
        if not tag:
            tag = keyword
    else:
        tag = keyword
    return tag
Author: Decalogue | Project: chat | Code lines: 19 | Source: semantic.py

Example 2: get_keywords

# Required import: import jieba [as alias]
# Or: from jieba import analyse [as alias]
def get_keywords(cls, text, size=3):
    return jieba.analyse.textrank(text, topK=size)
Author: shibing624 | Project: dialogbot | Code lines: 4 | Source: tokenizer.py

Example 3: cal_and_show_job_impression_hot_words

# Required import: import jieba [as alias]
# Or: from jieba import analyse [as alias]
def cal_and_show_job_impression_hot_words(self, interviewee_comments_dir='../spider/impression'):
    """
    Calculate and show hot words of job impression comments.
    :param interviewee_comments_dir:
    :return:
    """
    if not os.path.exists(interviewee_comments_dir) or len(os.listdir(interviewee_comments_dir)) == 0:
        print('Error! No valid content in {0}'.format(interviewee_comments_dir))
        sys.exit(0)
    else:
        job_and_dir = {_: os.path.join(interviewee_comments_dir, _) for _ in os.listdir(interviewee_comments_dir)}

        for k, v in job_and_dir.items():
            text = self.concat_all_text(v)
            jieba.analyse.set_stop_words(STOPWORDS_PATH)
            jieba.load_userdict(USER_CORPUS)
            hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())

            frequencies = {_[0]: _[1] for _ in hot_words_with_weights}

            print(frequencies)

            # Mask out pixels farther than 130 px from the centre
            # (150, 150), so the cloud is drawn inside a circle.
            x, y = np.ogrid[:300, :300]
            mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
            mask = 255 * mask.astype(int)

            wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300, background_color="white",
                                  repeat=False,
                                  mask=mask)
            wordcloud.generate_from_frequencies(frequencies)

            import matplotlib.pyplot as plt
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis("off")
            plt.show()
Author: lucasxlu | Project: LagouJob | Code lines: 37 | Source: hot_words_generator.py
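
The two mask lines above deserve a note: np.ogrid yields open row/column index grids that broadcast against each other, and the inequality marks every pixel farther than 130 px from the centre (150, 150). WordCloud skips non-zero mask pixels, so the words are confined to a circle. A standalone sketch of just that technique:

import numpy as np

# x has shape (300, 1), y has shape (1, 300); broadcasting covers
# every (row, col) pair without building a full meshgrid.
x, y = np.ogrid[:300, :300]

# True outside a circle of radius 130 centred at (150, 150).
outside = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2

# 255 (masked out) outside the circle, 0 (drawable) inside.
mask = 255 * outside.astype(int)
print(mask.shape, mask.min(), mask.max())  # (300, 300) 0 255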

Example 4: cal_and_show_jd_hot_words

# Required import: import jieba [as alias]
# Or: from jieba import analyse [as alias]
def cal_and_show_jd_hot_words(self, jd_dir='../spider/jd'):
    """
    Calculate and show hot words of Job Description (JD).
    :param jd_dir:
    :return:
    """
    if not os.path.exists(jd_dir) or len(os.listdir(jd_dir)) == 0:
        print('Error! No valid content in {0}'.format(jd_dir))
        sys.exit(0)
    else:
        jd_and_dir = {_.split('.')[0]: os.path.join(jd_dir, _) for _ in os.listdir(jd_dir)}

        for k, v in jd_and_dir.items():
            # Concatenate the '详情描述' (job description detail) column.
            text = "".join(pd.read_excel(v)['详情描述'])
            jieba.analyse.set_stop_words(STOPWORDS_PATH)
            jieba.load_userdict(USER_CORPUS)
            hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())

            frequencies = {_[0]: _[1] for _ in hot_words_with_weights}

            print(frequencies)

            # Same circular mask as in Example 3.
            x, y = np.ogrid[:300, :300]
            mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
            mask = 255 * mask.astype(int)

            wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300, background_color="white",
                                  repeat=False,
                                  mask=mask)
            wordcloud.generate_from_frequencies(frequencies)

            import matplotlib.pyplot as plt
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis("off")
            plt.show()
Author: lucasxlu | Project: LagouJob | Code lines: 37 | Source: hot_words_generator.py

Example 5: jieba_textrank

# Required import: import jieba [as alias]
# Or: from jieba import analyse [as alias]
def jieba_textrank(self):
    """
    Use TextRank in jieba to extract keywords from a sentence.
    """
    speech = input('Input a sentence: ')
    return jieba.analyse.textrank(speech, withWeight=True, topK=20)
Author: zake7749 | Project: Chatbot | Code lines: 10 | Source: console.py

Example 6: jieba_tf_idf

# Required import: import jieba [as alias]
# Or: from jieba import analyse [as alias]
def jieba_tf_idf(self):
    """
    Use TF-IDF in jieba to extract keywords from a sentence.
    """
    speech = input('Input a sentence: ')
    return jieba.analyse.extract_tags(speech, topK=20, withWeight=True)
Author: zake7749 | Project: Chatbot | Code lines: 10 | Source: console.py
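
Examples 5 and 6 differ only in the extractor, and the difference matters: extract_tags ranks words by TF-IDF against jieba's bundled IDF table (no part-of-speech filter by default), while textrank builds a word co-occurrence graph and by default only admits words tagged ('ns', 'n', 'vn', 'v'). A side-by-side sketch on a sentence of our own choosing:

import jieba.analyse

speech = '深度学习正在改变自然语言处理的研究方法'

# TF-IDF: any part of speech may appear in the result.
print(jieba.analyse.extract_tags(speech, topK=5, withWeight=True))

# TextRank: nouns and verbs only, unless allowPOS is widened.
print(jieba.analyse.textrank(speech, topK=5, withWeight=True))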

Example 7: tfidf

# Required import: import jieba [as alias]
# Or: from jieba import analyse [as alias]
def tfidf(self) -> list:
    kw_with_weight = jieba.analyse.extract_tags(
        self.text, allowPOS=ALLOW_POS, withWeight=True)
    return self.standardize(kw_with_weight)
Author: hscspring | Project: Multi-Label-Text-Classification-for-Chinese | Code lines: 6 | Source: engineering.py

Example 8: textrank

# Required import: import jieba [as alias]
# Or: from jieba import analyse [as alias]
def textrank(self) -> list:
    kw_with_weight = jieba.analyse.textrank(
        self.text, allowPOS=ALLOW_POS, withWeight=True)
    return self.standardize(kw_with_weight)
Author: hscspring | Project: Multi-Label-Text-Classification-for-Chinese | Code lines: 6 | Source: engineering.py

Example 9: tf_idf

# Required import: import jieba [as alias]
# Or: from jieba import analyse [as alias]
def tf_idf(texts):
    jieba.load_userdict("./model/dict.txt")
    jieba.analyse.set_idf_path("./model/idf.txt")
    jieba.analyse.set_stop_words("./model/chinese_stopwords.txt")
    # Parallel segmentation over 8 processes (POSIX only; jieba's
    # parallel mode does not work on Windows).
    jieba.enable_parallel(8)

    # The original wrapped extract_tags in a bare filter(...) call,
    # which raises TypeError in Python 3 (filter needs two arguments);
    # a plain keyword list per document is presumably what was meant.
    corpus = [jieba.analyse.extract_tags(s, topK=15) for s in texts]
    return corpus
Author: 03pie | Project: SMPCUP2017 | Code lines: 10 | Source: tf_idf.py

Example 10: segment_text

# Required import: import jieba [as alias]
# Or: from jieba import analyse [as alias]
def segment_text(text):
    # load user dict
    jieba.load_userdict(user_dict)
    # set stop words
    jieba.analyse.set_stop_words(stop_words)
    tags = jieba.analyse.extract_tags(text, topK=20, withWeight=True, allowPOS=())
    for tag in tags:
        print(str(tag[0]) + "\t" + str(tag[1])) 
Author: lucasxlu | Project: JiaYuan | Code lines: 10 | Source: text_segment.py

Example 11: sim_hash

# Required import: import jieba [as alias]
# Or: from jieba import analyse [as alias]
def sim_hash(content):
    seg = jieba.cut(content)
    # extract_tags returns keywords already ranked by weight.
    keyWord = jieba.analyse.extract_tags('|'.join(seg), topK=20, withWeight=True, allowPOS=())
    keyList = []
    # print(keyWord)
    for feature, weight in keyWord:
        weight = int(weight * 20)
        # string_hash (defined elsewhere in this module) maps the word
        # to a fixed-length bit string.
        feature = string_hash(feature)
        temp = []
        for f in feature:
            if f == '1':
                temp.append(weight)
            else:
                temp.append(-weight)
        keyList.append(temp)
    # No keywords extracted (e.g. empty or undecodable content):
    # return a degenerate hash.
    if len(keyList) == 0:
        return '00'
    content_list = np.sum(np.array(keyList), axis=0)
    simhash = ''
    for c in content_list:
        simhash += '1' if c > 0 else '0'
    return simhash
Author: yongzhuo | Project: nlp_xiaojiang | Code lines: 29 | Source: distance_text_or_vec.py
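
Simhash fingerprints are compared by Hamming distance (the number of bit positions that differ); texts whose 64-bit simhashes differ in at most about 3 bits are conventionally treated as near-duplicates. A small companion sketch, assuming the sim_hash above (note its output length depends on what string_hash produces, so the threshold must be scaled accordingly):

def hamming_distance(hash1, hash2):
    # Count differing bit positions of two equal-length bit strings.
    return sum(b1 != b2 for b1, b2 in zip(hash1, hash2))

d = hamming_distance(sim_hash('今天天气不错'), sim_hash('今天天气很好'))
print(d)  # a small distance suggests near-duplicate texts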

Example 12: get_top_words

# Required import: import jieba [as alias]
# Or: from jieba import analyse [as alias]
def get_top_words(top, filename):
    # jieba decodes the raw bytes itself (UTF-8, with a GBK fallback).
    content = open(filename, 'rb').read()
    tags = jieba.analyse.extract_tags(content, topK=top)
    # items = str(tags).replace('u\'', '\'').decode("unicode-escape")  # Python 2 leftover
    return tags
Author: h-j-13 | Project: Malicious_Domain_Whois | Code lines: 8 | Source: tfidf_top.py

Example 13: synonym_cut

# Required import: import jieba [as alias]
# Or: from jieba import analyse [as alias]
def synonym_cut(sentence, pattern="wf"):
    """Cut the sentence into a synonym vector tag.

    If a word in the sentence is not found in the synonym dictionary,
    it is tagged with the default flag of the word segmentation tool.

    Args:
        pattern: 'w' - word segmentation, 'k' - single top keyword,
            't' - keyword list, 'wf' - (word, flag) pairs,
            'tf' - (keyword, flag) pairs.
    """
    # Strip trailing punctuation.
    sentence = sentence.rstrip(''.join(punctuation_all))
    # Strip trailing modal (tone) words.
    sentence = sentence.rstrip(tone_words)
    synonym_vector = []
    if pattern == "w":
        synonym_vector = [item for item in jieba.cut(sentence) if item not in filter_characters]
    elif pattern == "k":
        synonym_vector = analyse.extract_tags(sentence, topK=1)
    elif pattern == "t":
        synonym_vector = analyse.extract_tags(sentence, topK=10)
    elif pattern == "wf":
        result = posseg.cut(sentence)
        # synonym_vector = [(item.word, item.flag) for item in result \
        # if item.word not in filter_characters]
        # Modified 2017-04-27
        for item in result:
            if item.word not in filter_characters:
                # Re-tag short flags by segmenting the word on its own.
                if len(item.flag) < 4:
                    item.flag = list(posseg.cut(item.word))[0].flag
                synonym_vector.append((item.word, item.flag))
    elif pattern == "tf":
        result = posseg.cut(sentence)
        tags = analyse.extract_tags(sentence, topK=10)
        for item in result:
            if item.word in tags:
                synonym_vector.append((item.word, item.flag))
    return synonym_vector
Author: Decalogue | Project: chat | Code lines: 41 | Source: semantic.py
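
A hedged usage sketch for synonym_cut; punctuation_all, tone_words, and filter_characters are module-level globals in the original semantic.py, so these hypothetical calls only work inside (or after importing) that module:

# Hypothetical calls, assuming the module globals are loaded:
print(synonym_cut('今天天气怎么样?', pattern='w'))   # plain word list
print(synonym_cut('今天天气怎么样?', pattern='k'))   # single top keyword
print(synonym_cut('今天天气怎么样?', pattern='wf'))  # (word, flag) pairs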


Note: The jieba.analyse examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects by their contributing authors, and copyright of the source code remains with the original authors. Follow each project's license when distributing or using the code; do not reproduce this article without permission.