

Python posseg.cut Method Code Examples

This article collects and summarizes typical usage examples of the jieba.posseg.cut method in Python. If you are wondering how to call posseg.cut, how it behaves, or what it looks like in real code, the curated examples below may help. You can also explore further usage examples from the jieba.posseg module.


Fifteen code examples of posseg.cut are shown below, sorted by popularity by default.
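Before the project-specific examples, a minimal sketch of the basic call may help: posseg.cut returns a generator of pair objects that unpack as (word, flag) tuples, where flag is the part-of-speech tag. The sample sentence is illustrative, and the exact tags depend on jieba's version and dictionary.

import jieba.posseg as pseg

# each item has .word and .flag attributes and unpacks as a (word, flag) pair
for word, flag in pseg.cut("我愛北京天安門"):
    print(word, flag)
# typical output (may vary with jieba's version and dictionary):
# 我 r
# 愛 v
# 北京 ns
# 天安門 ns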

Example 1: clean_entity

# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def clean_entity(self, text):
        wps = pseg.cut(text)
        res = []
        for w, pos in wps:
            # person name
            if pos == "nr":
                res.append("P")
            # place name
            elif pos == "ns":
                res.append("L")
            # organization name
            elif pos == "nt":
                res.append("O")
            else:
                res.append(w)
        return "".join(res) 
Developer: hscspring, Project: Multi-Label-Text-Classification-for-Chinese, Lines: 18, Source: chinese.py
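The same idea as a standalone function, for trying outside the original class. This is a minimal sketch; the tags jieba assigns to the sample text are assumptions and may differ across versions.

import jieba.posseg as pseg

def mask_entities(text):
    # replace person (nr), place (ns), and organization (nt) names with
    # single-letter placeholders; keep all other tokens verbatim
    mapping = {"nr": "P", "ns": "L", "nt": "O"}
    return "".join(mapping.get(flag, word) for word, flag in pseg.cut(text))

# mask_entities("張三在北京工作") would return "P在L工作"
# if jieba tags 張三 as nr and 北京 as ns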

Example 2: place_recognize

# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def place_recognize(cls, text):
        # single characters whose presence marks a false "ns" (place name) tag;
        # behavior-equivalent to the original chain of `and "X" not in w` checks
        bad_chars = "哈之本中嫩大鮮國上確牙壺阿入哥顆的聯哇"
        places = [w for w, flag in posseg.cut(text)
                  if "ns" in flag
                  and len(w) >= 2
                  and w not in cls.not_place_set
                  and not any(ch in w for ch in bad_chars)]

        return places 
Developer: shibing624, Project: dialogbot, Lines: 27, Source: tokenizer.py

Example 3: cut

# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def cut(dataset):
    store_file = '分詞後文件\\'
    if not os.path.exists(store_file):
        os.mkdir(store_file)

    store_data = open(store_file + dataset+'_分詞.txt', 'w', encoding='utf-8')
    data = open('數據\\'+dataset+'.txt', encoding='utf-8').read()

    # Stop words are not removed by hand, because function words, conjunctions,
    # and even punctuation may all be useful in later analysis
    words = psg.cut(data, HMM=True)

    words_dict = {}
    for word, flag in words:
        if word in words_dict.keys():
            words_dict[word][1] += 1
        else:
            words_dict[word] = [flag, 1]

    words_dict = sorted(words_dict.items(), key=lambda item: item[1][1], reverse=True)
    store_data.write(str(words_dict)) 
Developer: ZubinGou, Project: AI_Poet_Totoro, Lines: 22, Source: 詞性分詞.py
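The counting loop above can be expressed more compactly with collections.Counter when only frequencies (not flags) are needed; a minimal sketch:

from collections import Counter
import jieba.posseg as psg

words = psg.cut("一段待統計詞頻的示例文本,示例文本僅作演示", HMM=True)
counts = Counter(word for word, flag in words)
print(counts.most_common(10))  # (word, count) pairs, most frequent first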

Example 4: MatchItem

# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def MatchItem(self, input, start, end, muststart, mode=None):
        self.LogIn(input, start, end)
        pos = start
        sword = None  # guard: originally unbound if no word passed the POS filter
        if end is None:
            end = len(input)
        seg_list = pseg.cut(input[start:end] if self.Len == -1 else input[start:start + self.Len])
        for word, flag in seg_list:
            if self.Pos is None:
                sword = word
                break
            else:
                if flag in self.Pos:
                    sword = word
                    break
            pos += len(word)
        if sword is None or pos < 0 or (muststart and pos != start):
            self.LogOut(None)
            return start + self.Len if self.Len < 0 else tnpy.int_max
        self.LogOut(sword)
        m = tnpy.MatchResult(self, sword, pos)
        m.rstr = sword
        return m
Developer: ferventdesert, Project: tnpy, Lines: 24, Source: tnnlp.py

Example 5: _create_examples

# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
      guid = "%s-%s" % (set_type, i)
      text_a = tokenization.convert_to_unicode(line[1])
      text_b = ''
      for x in psg.cut(text_a):
          text_b = text_b + x.flag + ' '
      text_b = text_b.strip()
      print(text_a)
      print(text_b)
      # text_b = tokenization.convert_to_unicode(line[1])
      if(set_type == 'test'):
          label = '0'
      else:
          label = tokenization.convert_to_unicode(line[2])
          # print(text_a)
      # print(label)
      examples.append(
          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples 
Developer: tracy-talent, Project: curriculum, Lines: 24, Source: run_classifier.py
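The text_b loop above concatenates each token's POS flag with a trailing space and then strips the result; the same construction in one line, as a small sketch:

import jieba.posseg as psg

text_a = "我愛北京"
text_b = " ".join(x.flag for x in psg.cut(text_a))
print(text_b)  # e.g. "r v ns" (tags depend on jieba's dictionary)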

Example 6: word_flag_cut

# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def word_flag_cut(sentence):
    """
        Segment a sentence with jieba and collect each word's POS tag.
    :param sentence: input text
    :return: (word_list, flag_list)
    """
    sentence = sentence.replace('\n', '').replace(',', '').replace('"', '').\
                        replace(' ', '').replace('\t', '').upper().strip()
    word_list = []
    flag_list = []
    try:
        sentence_cut =  ''.join(jieba.lcut(sentence, cut_all=False, HMM=False))
        words = jieba_seg.cut(sentence_cut)
        for word in words:
            word_list.append(word.word)
            flag_list.append(word.flag)
    except Exception as e:
        word_list = [sentence]
        flag_list = ['nt']
    return word_list, flag_list 
Developer: yongzhuo, Project: nlp_xiaojiang, Lines: 22, Source: chatbot_sentence_vec_by_word.py

Example 7: _entity_recheck

# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def _entity_recheck(self, sent, entities_info, pinyin_tolerance, char_tolerance):
        sent2 = self.decoref(sent, entities_info)
        for word, flag in pseg.cut(sent2):
            if flag.startswith("n"):  # for nouns, also check for other mentions matching within the tolerance
                entity0, type0 = None, None
                mention_cands = []
                if pinyin_tolerance is not None:
                    mention_cands += self.get_pinyin_correct_candidates(word, pinyin_tolerance)
                if char_tolerance is not None:
                    mention_cands += self.search_word_trie(word, char_tolerance)

                if len(mention_cands) > 0:
                    entity0, type0 = self.choose_from_multi_mentions(mention_cands, sent)
                if entity0:
                    l = sent.find(word)
                    entities_info.append([(l,l+len(word)),(entity0, type0)]) 
Developer: blmoistawinde, Project: HarvestText, Lines: 18, Source: harvesttext.py

Example 8: entity_linking

# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def entity_linking(self, sent, pinyin_tolerance=None, char_tolerance=None, keep_all=False, with_ch_pos=False):
        '''

        :param sent: sentence/text
        :param pinyin_tolerance: {None, 0, 1} link candidate words whose pinyin is identical (0) or differs by only one (1) to existing entities; disabled by default (None)
        :param char_tolerance: {None, 1} link candidate words that differ by only one character (1) to existing entities; disabled by default (None)
        :param keep_all: if True, keep all the possibilities of linked entities
        :param with_ch_pos: if True, also returns ch_pos
        :return: entities_info: a list of linked-mention records (a list of lists).
            if not keep_all: [([l, r], (entity, type)) for each linked mention m]
            else: [( [l, r], set((entity, type) for each possible entity of m) ) for each linked mention m]
            ch_pos: the POS tag of the word each character belongs to (registered entities are ignored; useful for filtering entities, e.g. removing ones composed entirely of nouns, which may be erroneous links)

        '''
        self.check_prepared()
        entities_info = self._entity_linking(sent, pinyin_tolerance, char_tolerance, keep_all)
        if (not keep_all) and (pinyin_tolerance is not None or char_tolerance is not None):
            self._entity_recheck(sent, entities_info, pinyin_tolerance, char_tolerance)
        if with_ch_pos:
            ch_pos = []
            for word, pos in pseg.cut(sent):
                ch_pos.extend([pos] * len(word))
            return entities_info, ch_pos
        else:
            return entities_info 
Developer: blmoistawinde, Project: HarvestText, Lines: 27, Source: harvesttext.py
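For context, a hedged usage sketch of entity_linking based on the HarvestText README; the add_entities signature and the exact return format are assumptions here and should be verified against the installed version.

from harvesttext import HarvestText

ht = HarvestText()
# register an entity and the mentions that may refer to it
ht.add_entities(entity_mention_dict={"武磊": ["武磊", "武球王"]},
                entity_type_dict={"武磊": "人名"})
print(ht.entity_linking("武球王進球了"))
# expected shape: a list of ([l, r], (entity, type)) records;
# the exact type-string formatting may differ by version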

Example 9: get_linking_mention_candidates

# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def get_linking_mention_candidates(self, sent, pinyin_tolerance=None, char_tolerance=None):
        mention_cands = defaultdict(list)
        cut_result = []
        self.check_prepared()
        entities_info = self._entity_linking(sent, pinyin_tolerance, char_tolerance)
        sent2 = self.decoref(sent, entities_info)
        l = 0
        i = 0
        for word, flag in pseg.cut(sent2):
            if word in self.entity_types:
                word = entities_info[i][1][0]  # use the linked entity
                i += 1
            cut_result.append(word)
            if flag.startswith("n"):  # for nouns, also check for other mentions matching within the tolerance
                cands = []
                if pinyin_tolerance:
                    cands += self.get_pinyin_correct_candidates(word)
                if char_tolerance:
                    cands += self.search_word_trie(word)
                if len(cands) > 0:
                    mention_cands[(l, l + len(word))] = set(cands)
            l += len(word)
        sent2 =  "".join(cut_result)
        return sent2, mention_cands 
Developer: blmoistawinde, Project: HarvestText, Lines: 26, Source: harvesttext.py

Example 10: extract_words

# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def extract_words(comment_df):
    stop_words = set(line.strip() for line in open('../common/stopwords.txt', encoding='utf-8'))
    news_list = []
    for item in comment_df.itertuples(index=False):
        comment = item.comment.replace(' ','')
        if comment.isspace():
            continue
        p = re.compile("n[a-z0-9]{0,2}")
        word_list = pseg.cut(comment)
        for word, flag in word_list:
            if word not in stop_words and p.search(flag) is not None:
                news_list.append(word)
    content = {}
    for item in news_list:
        content[item] = content.get(item, 0) + 1
    return content 
Developer: keejo125, Project: web_scraping_and_data_analysis, Lines: 18, Source: analysis.py

Example 11: extract_words

# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def extract_words():
    with open('hot_news.txt', 'r', encoding='utf-8') as f:
        news_subjects = f.readlines()

    stop_words = set(line.strip() for line in open('stopwords.txt', encoding='utf-8'))

    news_list = []

    for subject in news_subjects:
        if subject.isspace():
            continue

        p = re.compile("n[a-z0-9]{0,2}")
        word_list = pseg.cut(subject)
        for word, flag in word_list:
            if word not in stop_words and p.search(flag) is not None:
                news_list.append(word)

    content = {}
    for item in news_list:
        content[item] = content.get(item, 0) + 1
    return content 
Developer: keejo125, Project: web_scraping_and_data_analysis, Lines: 24, Source: news_hot.py
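A side note on the POS filter shared by Examples 10 and 11: re.search matches the pattern anywhere inside the flag, so "n[a-z0-9]{0,2}" keeps not only the noun family (n, nr, ns, nt, nz, ...) but any flag that merely contains an n, such as vn or eng. A quick check (the flag list is illustrative):

import re

p = re.compile("n[a-z0-9]{0,2}")
for flag in ["n", "ns", "nr", "v", "vn", "an", "eng"]:
    print(flag, p.search(flag) is not None)
# n/ns/nr True and v False, but vn/an/eng are also True;
# re.fullmatch would restrict the filter to the noun family proper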

Example 12: segment

# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def segment(self, text, lower = True, use_stop_words = True, use_speech_tags_filter = False):
        """對一段文本進行分詞,返回list類型的分詞結果

        Keyword arguments:
        lower                  -- 是否將單詞小寫(針對英文)
        use_stop_words         -- 若為True,則利用停止詞集合來過濾(去掉停止詞)
        use_speech_tags_filter -- 是否基於詞性進行過濾。若為True,則使用self.default_speech_tag_filter過濾。否則,不過濾。    
        """
        text = util.as_text(text)
        jieba_result = pseg.cut(text)
        
        if use_speech_tags_filter == True:
            jieba_result = [w for w in jieba_result if w.flag in self.default_speech_tag_filter]
        else:
            jieba_result = [w for w in jieba_result]

        # drop special symbols (jieba tags punctuation and other non-words as 'x')
        word_list = [w.word.strip() for w in jieba_result if w.flag!='x']
        word_list = [word for word in word_list if len(word)>0]
        
        if lower:
            word_list = [word.lower() for word in word_list]

        if use_stop_words:
            word_list = [word.strip() for word in word_list if word.strip() not in self.stop_words]

        return word_list 
Developer: ouprince, Project: text-rank, Lines: 29, Source: Segmentation.py

Example 13: textrank

# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
        """
        Extract keywords from sentence using TextRank algorithm.
        Parameter:
            - topK: return how many top keywords. `None` for all possible words.
            - withWeight: if True, return a list of (word, weight);
                          if False, return a list of words.
            - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
                        if the POS of w is not in this list, it will be filtered.
            - withFlag: if True, return a list of pair(word, weight) like posseg.cut
                        if False, return a list of words
        """
        self.pos_filt = frozenset(allowPOS)
        g = UndirectWeightedGraph()
        cm = defaultdict(int)
        words = tuple(self.tokenizer.cut(sentence))
        for i, wp in enumerate(words):
            if self.pairfilter(wp):
                for j in xrange(i + 1, i + self.span):
                    if j >= len(words):
                        break
                    if not self.pairfilter(words[j]):
                        continue
                    if allowPOS and withFlag:
                        cm[(wp, words[j])] += 1
                    else:
                        cm[(wp.word, words[j].word)] += 1

        for terms, w in cm.items():
            g.addEdge(terms[0], terms[1], w)
        nodes_rank = g.rank()
        if withWeight:
            tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
        else:
            tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)

        if topK:
            return tags[:topK]
        else:
            return tags 
Developer: deepcs233, Project: jieba_fast, Lines: 42, Source: textrank.py
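This is jieba's internal TextRank implementation (the jieba_fast fork mirrors it). In application code it is normally reached through the jieba.analyse facade rather than by instantiating the class directly:

import jieba.analyse

sentence = "此外,公司擬對全資子公司吉林歐亞置業有限公司增資4.3億元"
# top keywords only
print(jieba.analyse.textrank(sentence, topK=5))
# (word, weight) pairs, restricted to the given POS whitelist
print(jieba.analyse.textrank(sentence, topK=5, withWeight=True,
                             allowPOS=('ns', 'n', 'vn', 'v')))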

Example 14: testDefaultCut

# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def testDefaultCut(self):
        for content in test_contents:
            result = jieba.cut(content)
            assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
            result = list(result)
            assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
            print(" , ".join(result), file=sys.stderr)
        print("testDefaultCut", file=sys.stderr) 
Developer: deepcs233, Project: jieba_fast, Lines: 10, Source: jieba_test.py

Example 15: testCutAll

# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def testCutAll(self):
        for content in test_contents:
            result = jieba.cut(content, cut_all=True)
            assert isinstance(result, types.GeneratorType), "Test CutAll Generator error"
            result = list(result)
            assert isinstance(result, list), "Test CutAll error on content: %s" % content
            print(" , ".join(result), file=sys.stderr)
        print("testCutAll", file=sys.stderr) 
Developer: deepcs233, Project: jieba_fast, Lines: 10, Source: jieba_test.py
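As the two tests assert, jieba.cut returns a generator in both precise and full mode; jieba also provides lcut and lcut_for_search variants that return lists directly:

import jieba

print(jieba.lcut("我來到北京清華大學"))                # list, precise mode
print(jieba.lcut("我來到北京清華大學", cut_all=True))  # list, full mode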


Note: the jieba.posseg.cut method examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by their developers, and copyright of the source code remains with the original authors. Consult the corresponding project's license before distributing or using the code; do not republish without permission.