當前位置: 首頁>>代碼示例>>Python>>正文


Python jieba.posseg方法代碼示例

本文整理匯總了Python中jieba.posseg模塊的典型用法代碼示例。如果您正苦於以下問題:Python jieba.posseg的具體用法?Python jieba.posseg怎麼用?Python jieba.posseg使用的例子?那麼,這裡精選的代碼示例或許可以為您提供幫助。您也可以進一步了解該模塊所在jieba的用法示例。


在下文中一共展示了jieba.posseg模塊的14個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: cutfunc

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import posseg [as 別名]
def cutfunc(sentence, _, HMM=True):
    """Lazily yield every POS-tagged token of ``sentence`` as one string.

    Each yielded value is the word, the module-level ``posdelim`` and the
    POS flag concatenated in that order.

    Args:
        sentence: Text handed to ``jieba.posseg.cut``.
        _: Accepted but never used.
        HMM: Forwarded positionally as the second argument of
            ``jieba.posseg.cut``.
    """
    for token, tag in jieba.posseg.cut(sentence, HMM):
        yield token + posdelim + tag
開發者ID:deepcs233,項目名稱:jieba_fast,代碼行數:5,代碼來源:__main__.py

示例2: __init__

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import posseg [as 別名]
def __init__(self, idf_path=None):
    """Wire up the tokenizers, the stop-word collection and the IDF data.

    Args:
        idf_path: Value passed to ``IDFLoader``; when it is falsy
            (``None``, empty string) ``DEFAULT_IDF`` is used instead.
    """
    self.tokenizer = jieba.dt
    self.postokenizer = jieba.posseg.dt
    # Work on a copy of STOP_WORDS rather than the attribute itself
    # (presumably a class-level default shared between instances).
    self.stop_words = self.STOP_WORDS.copy()
    loader = IDFLoader(idf_path or DEFAULT_IDF)
    self.idf_loader = loader
    self.idf_freq, self.median_idf = loader.get_idf()
開發者ID:deepcs233,項目名稱:jieba_fast,代碼行數:8,代碼來源:tfidf.py

示例3: testPosseg

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import posseg [as 別名]
def testPosseg(self):
    """Run ``jieba.posseg.cut`` over every entry of ``test_contents``.

    For each entry, checks that ``cut`` returns a generator, materializes
    it, and writes the ``word / flag`` pairs to stderr. The test name is
    written to stderr once all entries have been processed.
    """
    import jieba.posseg as pseg
    for content in test_contents:
        tagged = pseg.cut(content)
        assert isinstance(tagged, types.GeneratorType), "Test Posseg Generator error"
        tagged = list(tagged)
        assert isinstance(tagged, list), "Test Posseg error on content: %s" % content
        line = " , ".join([pair.word + " / " + pair.flag for pair in tagged])
        print(line, file=sys.stderr)
    print("testPosseg", file=sys.stderr)
開發者ID:deepcs233,項目名稱:jieba_fast,代碼行數:11,代碼來源:jieba_test.py

示例4: testPosseg_NOHMM

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import posseg [as 別名]
def testPosseg_NOHMM(self):
    """Run ``jieba.posseg.cut`` with ``HMM=False`` over ``test_contents``.

    For each entry, checks that ``cut`` returns a generator, materializes
    it, and writes the ``word / flag`` pairs to stderr. The test name is
    written to stderr once all entries have been processed.
    """
    import jieba.posseg as pseg
    for content in test_contents:
        tagged = pseg.cut(content, HMM=False)
        assert isinstance(tagged, types.GeneratorType), "Test Posseg Generator error"
        tagged = list(tagged)
        assert isinstance(tagged, list), "Test Posseg error on content: %s" % content
        line = " , ".join([pair.word + " / " + pair.flag for pair in tagged])
        print(line, file=sys.stderr)
    print("testPosseg_NOHMM", file=sys.stderr)
開發者ID:deepcs233,項目名稱:jieba_fast,代碼行數:11,代碼來源:jieba_test.py

示例5: text2ner

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import posseg [as 別名]
def text2ner(text):
    """Segment ``text`` with jieba and derive B/I/O labels from ``<...>`` markup.

    Every token from a ``<`` up to and including the next ``>`` is treated
    as markup: it is dropped from the output and only steers the label
    given to the tokens that follow. A ``<`` directly followed by ``/`` is
    a closing tag; any other ``<`` is an opening tag.

    A ``<`` that is the very last token (nothing after it to inspect) is
    handled as an opening tag instead of raising ``IndexError``; since no
    tokens follow it, it does not affect the returned lists.

    Args:
        text: String to segment, optionally containing ``<tag>...</tag>``
            style markup.

    Returns:
        A tuple ``(seq, pos, label)`` of three equally long lists: the kept
        words, their POS flags, and their ``'B'``/``'I'``/``'O'`` labels.
    """
    words, flags = [], []
    for seg in jieba.posseg.cut(text):
        words.append(seg.word)
        flags.append(seg.flag)

    seq, pos, label = [], [], []
    total = len(words)
    i = 0
    tag = 'O'
    pre = 0   # nesting depth: opening tags seen minus closing tags seen
    sign = 0  # 1 once the 'B' label of the current outermost tag was emitted
    while i < total:
        if words[i] != '<':
            seq.append(words[i])
            pos.append(flags[i])
            label.append(tag)
            if tag == 'B':
                # Only the first token inside a tag is 'B'; the rest are 'I'.
                tag = 'I'
                sign = 1
            i += 1
        else:
            # Bounds check first: a trailing '<' has no following token.
            if i + 1 < total and words[i + 1] == '/':
                pre -= 1
                if pre == 0:
                    tag = 'O'
                else:
                    tag = 'I'
            else:
                pre += 1
                if pre == 1:
                    tag = 'B'
                    sign = 0
                elif sign == 1:
                    tag = 'I'
            # Skip the markup itself: everything up to and including '>'.
            while i < total and words[i] != '>':
                i += 1
            i += 1
    return seq, pos, label
開發者ID:baiyyang,項目名稱:medical-entity-recognition,代碼行數:40,代碼來源:predata.py

示例6: test_segment

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import posseg [as 別名]
def test_segment():
    """Print segmentation results for a sentence with a misspelled drug name.

    Output only; nothing is asserted.
    """
    # Misspelled on purpose; the correct name is 奧美沙坦酯片.
    sentence = '這個新藥奧美砂坦脂片能治療心絞痛,效果還可以'
    print(sentence)
    print(segment(sentence))
    import jieba
    print(list(jieba.tokenize(sentence)))
    import jieba.posseg as pseg
    tagged = pseg.lcut("我愛北京天安門")  # jieba default mode
    print('old:', tagged)
    # Paddle mode, supported from jieba 0.40 on (not in earlier versions):
    # jieba.enable_paddle()
    # words = pseg.cut("我愛北京天安門", use_paddle=True)
    # for word, flag in words:
    #     print('new:','%s %s' % (word, flag))
開發者ID:shibing624,項目名稱:pycorrector,代碼行數:16,代碼來源:tokenizer_test.py

示例7: posseg_cut_examples

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import posseg [as 別名]
def posseg_cut_examples(self, example):
    """Append POS-based entities found in ``example.text`` to its entity list.

    Args:
        example: Object exposing ``get("entities", default)`` and ``text``.

    Returns:
        The list obtained from ``example.get("entities", [])`` with one dict
        (``start``, ``end``, ``value``, ``entity``) added for every word
        whose POS flag is listed in
        ``self.component_config["part_of_speech"]``. When the example
        already holds an entity list, that same list object is extended.
    """
    raw_entities = example.get("entities", [])
    for tagged_words, start, end in self.posseg(example.text):
        # Read the allowed flags per segment, so nothing is looked up
        # when there are no segments at all.
        part_of_speech = self.component_config["part_of_speech"]
        raw_entities.extend(
            {
                'start': start,
                'end': end,
                'value': word,
                'entity': flag,
            }
            for word, flag in tagged_words
            if flag in part_of_speech
        )
    return raw_entities
開發者ID:GaoQ1,項目名稱:rasa_nlu_gq,代碼行數:17,代碼來源:jieba_pseg_extractor.py

示例8: posseg

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import posseg [as 別名]
def posseg(text):
    # type: (Text) -> List[Token]
    """POS-tag every token that ``jieba.tokenize`` finds in ``text``.

    Returns:
        A list of ``(pairs, start, end)`` tuples, one per token, where
        ``pairs`` is the list of ``(word, flag)`` tuples produced by
        running ``pseg.cut`` on that token and ``start``/``end`` are the
        offsets reported by ``jieba.tokenize``.

    NOTE(review): the type comment declares ``List[Token]`` but the code
    builds plain tuples -- confirm which one callers expect.
    """
    return [
        ([(token, tag) for (token, tag) in pseg.cut(word)], start, end)
        for (word, start, end) in jieba.tokenize(text)
    ]
開發者ID:GaoQ1,項目名稱:rasa_nlu_gq,代碼行數:10,代碼來源:jieba_pseg_extractor.py

示例9: posseg_cut_examples

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import posseg [as 別名]
def posseg_cut_examples(self, example):
    """Collect entities for words whose POS flag is configured as relevant.

    Args:
        example: Object exposing ``get("entities", default)`` and ``text``.

    Returns:
        The list obtained from ``example.get("entities", [])``, extended
        in place with a ``start``/``end``/``value``/``entity`` dict for
        each word whose flag appears in
        ``self.component_config["part_of_speech"]``.
    """
    raw_entities = example.get("entities", [])
    segments = self.posseg(example.text)
    for tagged_words, start, end in segments:
        part_of_speech = self.component_config["part_of_speech"]
        for word, flag in tagged_words:
            if flag not in part_of_speech:
                continue
            entity = {
                'start': start,
                'end': end,
                'value': word,
                'entity': flag,
            }
            raw_entities.append(entity)
    return raw_entities
開發者ID:weizhenzhao,項目名稱:rasa_nlu,代碼行數:16,代碼來源:jieba_pseg_extractor.py

示例10: posseg

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import posseg [as 別名]
def posseg(text):
    # type: (Text) -> List[Token]
    """POS-tag every token that ``jieba.tokenize`` finds in ``text``.

    jieba is imported inside the function, so the dependency is only
    loaded when the function is actually called.

    Returns:
        A list of ``(pairs, start, end)`` tuples, one per token, where
        ``pairs`` is the list of ``(word, flag)`` tuples produced by
        running ``pseg.cut`` on that token and ``start``/``end`` are the
        offsets reported by ``jieba.tokenize``.

    NOTE(review): the type comment declares ``List[Token]`` but the code
    builds plain tuples -- confirm which one callers expect.
    """
    import jieba
    import jieba.posseg as pseg

    return [
        ([(token, tag) for (token, tag) in pseg.cut(word)], start, end)
        for (word, start, end) in jieba.tokenize(text)
    ]
開發者ID:weizhenzhao,項目名稱:rasa_nlu,代碼行數:14,代碼來源:jieba_pseg_extractor.py

示例11: get_n

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import posseg [as 別名]
def get_n(sentence):
    """Return the set of words in ``sentence`` whose POS flag contains 'n'.

    The test is a substring check, so any flag with an ``n`` anywhere in
    it matches, not only flags that start with ``n``.
    """
    return {
        word
        for word, flag in jieba.posseg.cut(sentence)
        # The explicit 'vn' comparison is already covered by the substring
        # test; it is kept to mirror the stated intent.
        if 'n' in flag or flag == 'vn'
    }
開發者ID:SeanLee97,項目名稱:chinese_reading_comprehension,代碼行數:9,代碼來源:predict.py

示例12: posseg

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import posseg [as 別名]
def posseg(self, sent, standard_name=False, stopwords=None):
    """POS-tag ``sent``, substituting linked entities on the non-English path.

    Args:
        sent: Sentence to tag.
        standard_name: Non-English path only. When true, a recognized
            entity is output under its linked name; otherwise the original
            span of ``sent`` is used. Also stored on
            ``self.standard_name``.
        stopwords: Optional collection of words to drop.

    Returns:
        For ``self.language == 'en'``: the result of nltk's ``pos_tag``
        with the universal tagset over the non-stopword tokens.
        Otherwise: a list of ``(word, flag)`` tuples.
    """
    if self.language == 'en':
        from nltk import word_tokenize, pos_tag
        excluded = set() if stopwords is None else stopwords
        kept = [tok for tok in word_tokenize(sent) if tok not in excluded]
        return pos_tag(kept, tagset='universal')

    self.standard_name = standard_name
    entities_info = self.entity_linking(sent)
    resolved = self.decoref(sent, entities_info)
    tagged = []
    next_entity = 0  # index of the next unused entry in entities_info
    for word, flag in pseg.cut(resolved):
        if word in self.entity_types:
            # Presumably decoref replaced each entity mention by its type
            # token, so the n-th such token maps to the n-th linked entity
            # -- TODO confirm against decoref.
            info = entities_info[next_entity]
            if self.standard_name:
                word = info[1][0]  # use the linked entity
            else:
                left, right = info[0]  # or use the original text
                word = sent[left:right]
            # Drop the first and last character of the type string.
            flag = info[1][1][1:-1]
            next_entity += 1
        elif stopwords and word in stopwords:
            continue
        tagged.append((word, flag))
    return tagged
開發者ID:blmoistawinde,項目名稱:HarvestText,代碼行數:28,代碼來源:harvesttext.py

示例13: synonym_cut

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import posseg [as 別名]
def synonym_cut(sentence, pattern="wf"):
    """Cut the sentence into a synonym vector.

    Trailing characters found in ``punctuation_all`` and then trailing
    characters found in ``tone_words`` are stripped before cutting.

    Args:
        sentence: Text to cut.
        pattern: Output mode:
            'w'  - segmented words, minus ``filter_characters``;
            'k'  - the single top keyword;
            't'  - up to 10 keywords;
            'wf' - ``(word, flag)`` tuples for the segmented words;
            'tf' - ``(word, flag)`` tuples for words among the keywords.

    Returns:
        A list in the shape selected by ``pattern``; an empty list for
        any other ``pattern`` value.
    """
    # Strip sentence-final punctuation, then sentence-final tone words.
    sentence = sentence.rstrip(''.join(punctuation_all))
    sentence = sentence.rstrip(tone_words)

    if pattern == "w":
        return [item for item in jieba.cut(sentence) if item not in filter_characters]
    if pattern == "k":
        return analyse.extract_tags(sentence, topK=1)
    if pattern == "t":
        return analyse.extract_tags(sentence, topK=10)
    if pattern == "wf":
        tagged = []
        for item in posseg.cut(sentence):
            if item.word in filter_characters:
                continue
            if len(item.flag) < 4:
                # Short flags are replaced by the flag the word gets when
                # cut on its own (presumably longer flags come from the
                # synonym dictionary and are kept -- verify).
                item.flag = list(posseg.cut(item.word))[0].flag
            tagged.append((item.word, item.flag))
        return tagged
    if pattern == "tf":
        result = posseg.cut(sentence)
        tags = analyse.extract_tags(sentence, topK=10)
        return [(item.word, item.flag) for item in result if item.word in tags]
    return []
開發者ID:Decalogue,項目名稱:chat,代碼行數:41,代碼來源:semantic.py

示例14: extract_tags

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import posseg [as 別名]
def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=(), withFlag=False):
    """Rank the keywords of ``sentence`` by TF-IDF weight.

    Tokens whose stripped text is shorter than 2 characters, or whose
    lowercase form is in ``self.stop_words``, are ignored. A keyword's
    weight is its count times its IDF (``self.median_idf`` when the word
    is not in ``self.idf_freq``) divided by the total count of kept tokens.

    Args:
        sentence: Text to analyse.
        topK: Maximum number of keywords to return; a falsy value
            (``None``, ``0``) returns all of them.
        withWeight: If true, return ``(keyword, weight)`` tuples instead
            of bare keywords.
        allowPOS: POS flags to keep, e.g. ``['ns', 'n', 'vn', 'v', 'nr']``.
            When non-empty, ``self.postokenizer`` is used and tokens with
            other flags are dropped; when empty, ``self.tokenizer`` is
            used and no POS filtering happens.
        withFlag: Only has an effect when ``allowPOS`` is non-empty. If
            true, keywords are the tagged tokens themselves; otherwise
            they are plain words.

    Returns:
        Keywords sorted by descending weight, truncated to ``topK``.
    """
    if allowPOS:
        allowPOS = frozenset(allowPOS)
        tokens = self.postokenizer.cut(sentence)
    else:
        tokens = self.tokenizer.cut(sentence)

    # True when the dictionary keys are tagged tokens rather than strings.
    keyed_by_pair = bool(allowPOS and withFlag)

    freq = {}
    for token in tokens:
        if allowPOS:
            if token.flag not in allowPOS:
                continue
            if not withFlag:
                token = token.word
        text = token.word if keyed_by_pair else token
        if len(text.strip()) < 2 or text.lower() in self.stop_words:
            continue
        freq[token] = freq.get(token, 0.0) + 1.0

    total = sum(freq.values())
    for key in freq:
        text = key.word if keyed_by_pair else key
        freq[key] *= self.idf_freq.get(text, self.median_idf) / total

    if withWeight:
        ranked = sorted(freq.items(), key=itemgetter(1), reverse=True)
    else:
        ranked = sorted(freq, key=freq.__getitem__, reverse=True)
    return ranked[:topK] if topK else ranked
開發者ID:deepcs233,項目名稱:jieba_fast,代碼行數:44,代碼來源:tfidf.py


注:本文中的jieba.posseg方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。