This article collects typical usage examples of the Python method jieba.posseg.cut. If you have been wondering what exactly posseg.cut does and how to use it, the curated code samples below may help. You can also explore further usage of the module it belongs to, jieba.posseg.
The following presents 15 code examples of posseg.cut, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code samples.
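Before the examples, here is a minimal, self-contained sketch of what posseg.cut yields (the sample sentence is arbitrary; exact segmentation and tags depend on your jieba version and dictionary):

import jieba.posseg as pseg

# posseg.cut yields pair objects that expose .word and .flag (the POS tag);
# a pair also unpacks as a (word, flag) tuple, as most examples below do.
for word, flag in pseg.cut("我爱北京天安门"):
    print(word, flag)  # e.g. 北京 ns, 天安门 ns ("ns" marks place names)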
Example 1: clean_entity
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def clean_entity(self, text):
    wps = pseg.cut(text)
    res = []
    for w, pos in wps:
        # person name
        if pos == "nr":
            res.append("P")
        # place name
        elif pos == "ns":
            res.append("L")
        # organization name
        elif pos == "nt":
            res.append("O")
        else:
            res.append(w)
    return "".join(res)
Example 2: place_recognize
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def place_recognize(cls, text):
    # Characters that tend to produce false positives for place names
    blacklist = "哈之本中嫩大鲜国上确牙壶阿入哥颗的联哇"
    places = [w for w, flag in posseg.cut(text)
              if "ns" in flag
              and len(w) >= 2
              and w not in cls.not_place_set
              and not any(ch in w for ch in blacklist)]
    return places
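A quick way to try this filter outside its class (a minimal sketch; not_place_set is stubbed as empty here, which is an assumption):

import jieba.posseg as posseg

not_place_set = set()  # stub: the original class attribute is not shown
blacklist = "哈之本中嫩大鲜国上确牙壶阿入哥颗的联哇"

def place_recognize(text):
    return [w for w, flag in posseg.cut(text)
            if "ns" in flag and len(w) >= 2
            and w not in not_place_set
            and not any(ch in w for ch in blacklist)]

print(place_recognize("我去了巴黎和伦敦"))  # e.g. ['巴黎', '伦敦'], subject to jieba's dictionary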
Example 3: cut
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def cut(dataset):
    store_file = '分词后文件\\'
    if not os.path.exists(store_file):
        os.mkdir(store_file)
    data = open('数据\\' + dataset + '.txt', encoding='utf-8').read()
    # Stop words are not removed manually, because function words, conjunctions,
    # and even punctuation may all be needed in later analysis
    words = psg.cut(data, HMM=True)
    words_dict = {}
    for word, flag in words:
        if word in words_dict:
            words_dict[word][1] += 1
        else:
            words_dict[word] = [flag, 1]
    words_dict = sorted(words_dict.items(), key=lambda item: item[1][1], reverse=True)
    with open(store_file + dataset + '_分词.txt', 'w', encoding='utf-8') as store_data:
        store_data.write(str(words_dict))
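The manual dictionary counting above can also be written with collections.Counter; a behavior-similar sketch that counts (word, flag) pairs instead of keeping only the first flag seen per word (a slight simplification):

from collections import Counter
import jieba.posseg as psg

def count_words(data):
    # Count (word, POS-flag) pairs, most frequent first
    counts = Counter((word, flag) for word, flag in psg.cut(data, HMM=True))
    return counts.most_common()

print(count_words("今天天气不错,今天心情也不错")[:5])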
Example 4: MatchItem
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def MatchItem(self, input, start, end, muststart, mode=None):
    self.LogIn(input, start, end)
    pos = start
    if end is None:
        end = len(input)
    seg_list = pseg.cut(input[start:end] if self.Len == -1 else input[start:start + self.Len])
    sword = None  # the matched word, if any
    for word, flag in seg_list:
        if self.Pos is None or flag in self.Pos:
            sword = word
            break
        pos += len(word)
    # fail if nothing matched, or a match was required at the start position
    if sword is None or (muststart and pos != start):
        self.LogOut(None)
        return start + self.Len if self.Len < 0 else tnpy.int_max
    self.LogOut(sword)
    m = tnpy.MatchResult(self, sword, pos)
    m.rstr = sword
    return m
Example 5: _create_examples
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        guid = "%s-%s" % (set_type, i)
        text_a = tokenization.convert_to_unicode(line[1])
        # text_b is the space-separated POS-tag sequence of text_a
        text_b = ' '.join(x.flag for x in psg.cut(text_a))
        print(text_a)
        print(text_b)
        if set_type == 'test':
            label = '0'
        else:
            label = tokenization.convert_to_unicode(line[2])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
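To see what text_b looks like on its own, a minimal sketch of the flag-sequence construction (independent of the BERT-style pipeline around it; the sample text is arbitrary):

import jieba.posseg as psg

text_a = "春天的花开秋天的风"
text_b = " ".join(x.flag for x in psg.cut(text_a))
print(text_b)  # a space-separated POS sequence; exact tags depend on segmentation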
Example 6: word_flag_cut
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def word_flag_cut(sentence):
    """
    Segment a sentence with jieba and return words with their POS tags.
    :param sentence: raw input text
    :return: (word_list, flag_list)
    """
    sentence = sentence.replace('\n', '').replace(',', '').replace('"', '').\
        replace(' ', '').replace('\t', '').upper().strip()
    word_list = []
    flag_list = []
    try:
        # first cut without HMM, then POS-tag the re-joined text
        sentence_cut = ''.join(jieba.lcut(sentence, cut_all=False, HMM=False))
        words = jieba_seg.cut(sentence_cut)
        for word in words:
            word_list.append(word.word)
            flag_list.append(word.flag)
    except Exception:
        # fall back to treating the whole sentence as a single token
        word_list = [sentence]
        flag_list = ['nt']
    return word_list, flag_list
Example 7: _entity_recheck
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def _entity_recheck(self, sent, entities_info, pinyin_tolerance, char_tolerance):
    sent2 = self.decoref(sent, entities_info)
    for word, flag in pseg.cut(sent2):
        if flag.startswith("n"):  # for nouns, also look for other mentions matching within tolerance
            entity0, type0 = None, None
            mention_cands = []
            if pinyin_tolerance is not None:
                mention_cands += self.get_pinyin_correct_candidates(word, pinyin_tolerance)
            if char_tolerance is not None:
                mention_cands += self.search_word_trie(word, char_tolerance)
            if len(mention_cands) > 0:
                entity0, type0 = self.choose_from_multi_mentions(mention_cands, sent)
            if entity0:
                l = sent.find(word)
                entities_info.append([(l, l + len(word)), (entity0, type0)])
Example 8: entity_linking
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def entity_linking(self, sent, pinyin_tolerance=None, char_tolerance=None, keep_all=False, with_ch_pos=False):
    '''
    :param sent: the sentence/text to process
    :param pinyin_tolerance: {None, 0, 1} link candidate words whose pinyin is identical (0)
        or differs by at most one (1) to known entities; disabled by default (None)
    :param char_tolerance: {None, 1} link candidate words differing by at most one character (1)
        to known entities; disabled by default (None)
    :param keep_all: if True, keep all the possibilities of linked entities
    :param with_ch_pos: if True, also returns ch_pos
    :return: entities_info: a list of lists.
        if not keep_all: [([l, r], (entity, type)) for each linked mention m]
        else: [( [l, r], set((entity, type) for each possible entity of m) ) for each linked mention m]
        ch_pos: the POS tag of the word each character belongs to (registered entities are not
        considered; useful for filtering entities, e.g. dropping all-noun spans that may be
        erroneous links)
    '''
    self.check_prepared()
    entities_info = self._entity_linking(sent, pinyin_tolerance, char_tolerance, keep_all)
    if (not keep_all) and (pinyin_tolerance is not None or char_tolerance is not None):
        self._entity_recheck(sent, entities_info, pinyin_tolerance, char_tolerance)
    if with_ch_pos:
        ch_pos = []
        for word, pos in pseg.cut(sent):
            ch_pos.extend([pos] * len(word))
        return entities_info, ch_pos
    else:
        return entities_info
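The ch_pos expansion above maps every character to the POS tag of the word containing it; a standalone sketch of just that step (sample sentence arbitrary):

import jieba.posseg as pseg

sent = "今天上海下雨"
ch_pos = []
for word, pos in pseg.cut(sent):
    ch_pos.extend([pos] * len(word))
print(list(zip(sent, ch_pos)))  # one POS tag per character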
Example 9: get_linking_mention_candidates
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def get_linking_mention_candidates(self, sent, pinyin_tolerance=None, char_tolerance=None):
    mention_cands = defaultdict(list)
    cut_result = []
    self.check_prepared()
    entities_info = self._entity_linking(sent, pinyin_tolerance, char_tolerance)
    sent2 = self.decoref(sent, entities_info)
    l = 0
    i = 0
    for word, flag in pseg.cut(sent2):
        if word in self.entity_types:
            word = entities_info[i][1][0]  # use the linked entity
            i += 1
        cut_result.append(word)
        if flag.startswith("n"):  # for nouns, also look for other mentions matching within tolerance
            cands = []
            if pinyin_tolerance:
                cands += self.get_pinyin_correct_candidates(word)
            if char_tolerance:
                cands += self.search_word_trie(word)
            if len(cands) > 0:
                mention_cands[(l, l + len(word))] = set(cands)
        l += len(word)
    sent2 = "".join(cut_result)
    return sent2, mention_cands
Example 10: extract_words
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def extract_words(comment_df):
    stop_words = set(line.strip() for line in open('../common/stopwords.txt', encoding='utf-8'))
    noun_pattern = re.compile("n[a-z0-9]{0,2}")  # noun-like POS tags, compiled once outside the loop
    news_list = []
    for item in comment_df.itertuples(index=False):
        comment = item.comment.replace(' ', '')
        if comment.isspace():
            continue
        for word, flag in pseg.cut(comment):
            if word not in stop_words and noun_pattern.search(flag) is not None:
                news_list.append(word)
    content = {}
    for item in news_list:
        content[item] = content.get(item, 0) + 1
    return content
Example 11: extract_words
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def extract_words():
    with open('hot_news.txt', 'r', encoding='utf-8') as f:
        news_subjects = f.readlines()
    stop_words = set(line.strip() for line in open('stopwords.txt', encoding='utf-8'))
    noun_pattern = re.compile("n[a-z0-9]{0,2}")  # noun-like POS tags, compiled once outside the loop
    news_list = []
    for subject in news_subjects:
        if subject.isspace():
            continue
        for word, flag in pseg.cut(subject):
            if word not in stop_words and noun_pattern.search(flag) is not None:
                news_list.append(word)
    content = {}
    for item in news_list:
        content[item] = content.get(item, 0) + 1
    return content
Example 12: segment
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def segment(self, text, lower=True, use_stop_words=True, use_speech_tags_filter=False):
    """Segment a piece of text and return the result as a list of words.

    Keyword arguments:
    lower                  -- whether to lowercase words (for English text)
    use_stop_words         -- if True, filter out words found in the stop-word set
    use_speech_tags_filter -- whether to filter by POS tag. If True, keep only words whose
                              tag is in self.default_speech_tag_filter; otherwise keep all.
    """
    text = util.as_text(text)
    jieba_result = pseg.cut(text)
    if use_speech_tags_filter:
        jieba_result = [w for w in jieba_result if w.flag in self.default_speech_tag_filter]
    else:
        jieba_result = list(jieba_result)
    # drop special symbols (jieba tags punctuation and other non-words as 'x')
    word_list = [w.word.strip() for w in jieba_result if w.flag != 'x']
    word_list = [word for word in word_list if len(word) > 0]
    if lower:
        word_list = [word.lower() for word in word_list]
    if use_stop_words:
        word_list = [word.strip() for word in word_list if word.strip() not in self.stop_words]
    return word_list
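The flag != 'x' filter above is what strips punctuation; a quick check of how jieba tags symbols (sample text arbitrary; tags may vary by version):

import jieba.posseg as pseg

print([(w.word, w.flag) for w in pseg.cut("你好,世界!")])
# punctuation such as "," and "!" is tagged 'x'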
Example 13: textrank
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
    """
    Extract keywords from sentence using TextRank algorithm.
    Parameters:
    - topK: return how many top keywords. `None` for all possible words.
    - withWeight: if True, return a list of (word, weight);
      if False, return a list of words.
    - allowPOS: the allowed POS list, e.g. ['ns', 'n', 'vn', 'v'].
      If the POS of w is not in this list, it will be filtered.
    - withFlag: if True, return a list of pair(word, weight) like posseg.cut;
      if False, return a list of words.
    """
self.pos_filt = frozenset(allowPOS)
g = UndirectWeightedGraph()
cm = defaultdict(int)
words = tuple(self.tokenizer.cut(sentence))
for i, wp in enumerate(words):
if self.pairfilter(wp):
for j in xrange(i + 1, i + self.span):
if j >= len(words):
break
if not self.pairfilter(words[j]):
continue
if allowPOS and withFlag:
cm[(wp, words[j])] += 1
else:
cm[(wp.word, words[j].word)] += 1
for terms, w in cm.items():
g.addEdge(terms[0], terms[1], w)
nodes_rank = g.rank()
if withWeight:
tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
else:
tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
if topK:
return tags[:topK]
else:
return tags
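In jieba itself this method is exposed through jieba.analyse; a minimal usage sketch (sample text arbitrary):

import jieba.analyse

text = "程序员写程序,程序员改需求,需求改完改程序"
for kw, weight in jieba.analyse.textrank(text, topK=5, withWeight=True,
                                         allowPOS=('ns', 'n', 'vn', 'v')):
    print(kw, round(weight, 3))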
Example 14: testDefaultCut
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def testDefaultCut(self):
for content in test_contents:
result = jieba.cut(content)
assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
result = list(result)
assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
print(" , ".join(result), file=sys.stderr)
print("testDefaultCut", file=sys.stderr)
Example 15: testCutAll
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def testCutAll(self):
for content in test_contents:
result = jieba.cut(content, cut_all=True)
assert isinstance(result, types.GeneratorType), "Test CutAll Generator error"
result = list(result)
assert isinstance(result, list), "Test CutAll error on content: %s" % content
print(" , ".join(result), file=sys.stderr)
print("testCutAll", file=sys.stderr)