This article collects typical usage examples of the Python method jieba.posseg.cut. If you have been wondering what exactly posseg.cut does and how to use it, the curated code samples below may help. You can also explore further usage of the module it belongs to, jieba.posseg.
The following presents 15 code examples of posseg.cut, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code samples.
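Before the examples, here is a minimal, self-contained sketch of what posseg.cut yields (the sample sentence is arbitrary; exact segmentation and tags depend on your jieba version and dictionary):

import jieba.posseg as pseg

# posseg.cut yields pair objects that expose .word and .flag (the POS tag);
# a pair also unpacks as a (word, flag) tuple, as most examples below do.
for word, flag in pseg.cut("我爱北京天安门"):
    print(word, flag)  # e.g. 北京 ns, 天安门 ns ("ns" marks place names)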
Example 1: clean_entity
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def clean_entity(self, text):
    wps = pseg.cut(text)
    res = []
    for w, pos in wps:
        # person name
        if pos == "nr":
            res.append("P")
        # place name
        elif pos == "ns":
            res.append("L")
        # organization name
        elif pos == "nt":
            res.append("O")
        else:
            res.append(w)
    return "".join(res)
Example 2: place_recognize
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def place_recognize(cls, text):
    # Characters that tend to produce false positives for place names
    blacklist = "哈之本中嫩大鲜国上确牙壶阿入哥颗的联哇"
    places = [w for w, flag in posseg.cut(text)
              if "ns" in flag
              and len(w) >= 2
              and w not in cls.not_place_set
              and not any(ch in w for ch in blacklist)]
    return places
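A quick way to try this filter outside its class (a minimal sketch; not_place_set is stubbed as empty here, which is an assumption):

import jieba.posseg as posseg

not_place_set = set()  # stub: the original class attribute is not shown
blacklist = "哈之本中嫩大鲜国上确牙壶阿入哥颗的联哇"

def place_recognize(text):
    return [w for w, flag in posseg.cut(text)
            if "ns" in flag and len(w) >= 2
            and w not in not_place_set
            and not any(ch in w for ch in blacklist)]

print(place_recognize("我去了巴黎和伦敦"))  # e.g. ['巴黎', '伦敦'], subject to jieba's dictionary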
Example 3: cut
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def cut(dataset):
    store_file = '分词后文件\\'
    if not os.path.exists(store_file):
        os.mkdir(store_file)
    data = open('数据\\' + dataset + '.txt', encoding='utf-8').read()
    # Stop words are not removed manually, because function words, conjunctions,
    # and even punctuation may all be needed in later analysis
    words = psg.cut(data, HMM=True)
    words_dict = {}
    for word, flag in words:
        if word in words_dict:
            words_dict[word][1] += 1
        else:
            words_dict[word] = [flag, 1]
    words_dict = sorted(words_dict.items(), key=lambda item: item[1][1], reverse=True)
    with open(store_file + dataset + '_分词.txt', 'w', encoding='utf-8') as store_data:
        store_data.write(str(words_dict))
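The manual dictionary counting above can also be written with collections.Counter; a behavior-similar sketch that counts (word, flag) pairs instead of keeping only the first flag seen per word (a slight simplification):

from collections import Counter
import jieba.posseg as psg

def count_words(data):
    # Count (word, POS-flag) pairs, most frequent first
    counts = Counter((word, flag) for word, flag in psg.cut(data, HMM=True))
    return counts.most_common()

print(count_words("今天天气不错,今天心情也不错")[:5])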
Example 4: MatchItem
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def MatchItem(self, input, start, end, muststart, mode=None):
    self.LogIn(input, start, end)
    pos = start
    if end is None:
        end = len(input)
    seg_list = pseg.cut(input[start:end] if self.Len == -1 else input[start:start + self.Len])
    sword = None  # the matched word, if any
    for word, flag in seg_list:
        if self.Pos is None or flag in self.Pos:
            sword = word
            break
        pos += len(word)
    # fail if nothing matched, or a match was required at the start position
    if sword is None or (muststart and pos != start):
        self.LogOut(None)
        return start + self.Len if self.Len < 0 else tnpy.int_max
    self.LogOut(sword)
    m = tnpy.MatchResult(self, sword, pos)
    m.rstr = sword
    return m
Example 5: _create_examples
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        guid = "%s-%s" % (set_type, i)
        text_a = tokenization.convert_to_unicode(line[1])
        # text_b is the space-separated POS-tag sequence of text_a
        text_b = ' '.join(x.flag for x in psg.cut(text_a))
        print(text_a)
        print(text_b)
        if set_type == 'test':
            label = '0'
        else:
            label = tokenization.convert_to_unicode(line[2])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
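To see what text_b looks like on its own, a minimal sketch of the flag-sequence construction (independent of the BERT-style pipeline around it; the sample text is arbitrary):

import jieba.posseg as psg

text_a = "春天的花开秋天的风"
text_b = " ".join(x.flag for x in psg.cut(text_a))
print(text_b)  # a space-separated POS sequence; exact tags depend on segmentation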
Example 6: word_flag_cut
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def word_flag_cut(sentence):
    """
    Segment a sentence with jieba and return words with their POS tags.
    :param sentence: raw input text
    :return: (word_list, flag_list)
    """
    sentence = sentence.replace('\n', '').replace(',', '').replace('"', '').\
        replace(' ', '').replace('\t', '').upper().strip()
    word_list = []
    flag_list = []
    try:
        # first cut without HMM, then POS-tag the re-joined text
        sentence_cut = ''.join(jieba.lcut(sentence, cut_all=False, HMM=False))
        words = jieba_seg.cut(sentence_cut)
        for word in words:
            word_list.append(word.word)
            flag_list.append(word.flag)
    except Exception:
        # fall back to treating the whole sentence as a single token
        word_list = [sentence]
        flag_list = ['nt']
    return word_list, flag_list
Example 7: _entity_recheck
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def _entity_recheck(self, sent, entities_info, pinyin_tolerance, char_tolerance):
    sent2 = self.decoref(sent, entities_info)
    for word, flag in pseg.cut(sent2):
        if flag.startswith("n"):  # for nouns, also look for other mentions matching within tolerance
            entity0, type0 = None, None
            mention_cands = []
            if pinyin_tolerance is not None:
                mention_cands += self.get_pinyin_correct_candidates(word, pinyin_tolerance)
            if char_tolerance is not None:
                mention_cands += self.search_word_trie(word, char_tolerance)
            if len(mention_cands) > 0:
                entity0, type0 = self.choose_from_multi_mentions(mention_cands, sent)
            if entity0:
                l = sent.find(word)
                entities_info.append([(l, l + len(word)), (entity0, type0)])
Example 8: entity_linking
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def entity_linking(self, sent, pinyin_tolerance=None, char_tolerance=None, keep_all=False, with_ch_pos=False):
    '''
    :param sent: the sentence/text to process
    :param pinyin_tolerance: {None, 0, 1} link candidate words whose pinyin is identical (0)
        or differs by at most one (1) to known entities; disabled by default (None)
    :param char_tolerance: {None, 1} link candidate words differing by at most one character (1)
        to known entities; disabled by default (None)
    :param keep_all: if True, keep all the possibilities of linked entities
    :param with_ch_pos: if True, also returns ch_pos
    :return: entities_info: a list of lists.
        if not keep_all: [([l, r], (entity, type)) for each linked mention m]
        else: [( [l, r], set((entity, type) for each possible entity of m) ) for each linked mention m]
        ch_pos: the POS tag of the word each character belongs to (registered entities are not
        considered; useful for filtering entities, e.g. dropping all-noun spans that may be
        erroneous links)
    '''
    self.check_prepared()
    entities_info = self._entity_linking(sent, pinyin_tolerance, char_tolerance, keep_all)
    if (not keep_all) and (pinyin_tolerance is not None or char_tolerance is not None):
        self._entity_recheck(sent, entities_info, pinyin_tolerance, char_tolerance)
    if with_ch_pos:
        ch_pos = []
        for word, pos in pseg.cut(sent):
            ch_pos.extend([pos] * len(word))
        return entities_info, ch_pos
    else:
        return entities_info
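The ch_pos expansion above maps every character to the POS tag of the word containing it; a standalone sketch of just that step (sample sentence arbitrary):

import jieba.posseg as pseg

sent = "今天上海下雨"
ch_pos = []
for word, pos in pseg.cut(sent):
    ch_pos.extend([pos] * len(word))
print(list(zip(sent, ch_pos)))  # one POS tag per character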
Example 9: get_linking_mention_candidates
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def get_linking_mention_candidates(self, sent, pinyin_tolerance=None, char_tolerance=None):
    mention_cands = defaultdict(list)
    cut_result = []
    self.check_prepared()
    entities_info = self._entity_linking(sent, pinyin_tolerance, char_tolerance)
    sent2 = self.decoref(sent, entities_info)
    l = 0
    i = 0
    for word, flag in pseg.cut(sent2):
        if word in self.entity_types:
            word = entities_info[i][1][0]  # use the linked entity
            i += 1
        cut_result.append(word)
        if flag.startswith("n"):  # for nouns, also look for other mentions matching within tolerance
            cands = []
            if pinyin_tolerance:
                cands += self.get_pinyin_correct_candidates(word)
            if char_tolerance:
                cands += self.search_word_trie(word)
            if len(cands) > 0:
                mention_cands[(l, l + len(word))] = set(cands)
        l += len(word)
    sent2 = "".join(cut_result)
    return sent2, mention_cands
Example 10: extract_words
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def extract_words(comment_df):
    stop_words = set(line.strip() for line in open('../common/stopwords.txt', encoding='utf-8'))
    noun_pattern = re.compile("n[a-z0-9]{0,2}")  # noun-like POS tags, compiled once outside the loop
    news_list = []
    for item in comment_df.itertuples(index=False):
        comment = item.comment.replace(' ', '')
        if comment.isspace():
            continue
        for word, flag in pseg.cut(comment):
            if word not in stop_words and noun_pattern.search(flag) is not None:
                news_list.append(word)
    content = {}
    for item in news_list:
        content[item] = content.get(item, 0) + 1
    return content
Example 11: extract_words
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def extract_words():
    with open('hot_news.txt', 'r', encoding='utf-8') as f:
        news_subjects = f.readlines()
    stop_words = set(line.strip() for line in open('stopwords.txt', encoding='utf-8'))
    noun_pattern = re.compile("n[a-z0-9]{0,2}")  # noun-like POS tags, compiled once outside the loop
    news_list = []
    for subject in news_subjects:
        if subject.isspace():
            continue
        for word, flag in pseg.cut(subject):
            if word not in stop_words and noun_pattern.search(flag) is not None:
                news_list.append(word)
    content = {}
    for item in news_list:
        content[item] = content.get(item, 0) + 1
    return content
Example 12: segment
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def segment(self, text, lower=True, use_stop_words=True, use_speech_tags_filter=False):
    """Segment a piece of text and return the result as a list of words.

    Keyword arguments:
    lower                  -- whether to lowercase words (for English text)
    use_stop_words         -- if True, filter out words found in the stop-word set
    use_speech_tags_filter -- whether to filter by POS tag. If True, keep only words whose
                              tag is in self.default_speech_tag_filter; otherwise keep all.
    """
    text = util.as_text(text)
    jieba_result = pseg.cut(text)
    if use_speech_tags_filter:
        jieba_result = [w for w in jieba_result if w.flag in self.default_speech_tag_filter]
    else:
        jieba_result = list(jieba_result)
    # drop special symbols (jieba tags punctuation and other non-words as 'x')
    word_list = [w.word.strip() for w in jieba_result if w.flag != 'x']
    word_list = [word for word in word_list if len(word) > 0]
    if lower:
        word_list = [word.lower() for word in word_list]
    if use_stop_words:
        word_list = [word.strip() for word in word_list if word.strip() not in self.stop_words]
    return word_list
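The flag != 'x' filter above is what strips punctuation; a quick check of how jieba tags symbols (sample text arbitrary; tags may vary by version):

import jieba.posseg as pseg

print([(w.word, w.flag) for w in pseg.cut("你好,世界!")])
# punctuation such as "," and "!" is tagged 'x'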
Example 13: textrank
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
    """
    Extract keywords from sentence using TextRank algorithm.
    Parameters:
    - topK: return how many top keywords. `None` for all possible words.
    - withWeight: if True, return a list of (word, weight);
      if False, return a list of words.
    - allowPOS: the allowed POS list, e.g. ['ns', 'n', 'vn', 'v'].
      If the POS of w is not in this list, it will be filtered.
    - withFlag: if True, return a list of pair(word, weight) like posseg.cut;
      if False, return a list of words.
    """
self.pos_filt = frozenset(allowPOS)
g = UndirectWeightedGraph()
cm = defaultdict(int)
words = tuple(self.tokenizer.cut(sentence))
for i, wp in enumerate(words):
if self.pairfilter(wp):
for j in xrange(i + 1, i + self.span):
if j >= len(words):
break
if not self.pairfilter(words[j]):
continue
if allowPOS and withFlag:
cm[(wp, words[j])] += 1
else:
cm[(wp.word, words[j].word)] += 1
for terms, w in cm.items():
g.addEdge(terms[0], terms[1], w)
nodes_rank = g.rank()
if withWeight:
tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
else:
tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
if topK:
return tags[:topK]
else:
return tags
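In jieba itself this method is exposed through jieba.analyse; a minimal usage sketch (sample text arbitrary):

import jieba.analyse

text = "程序员写程序,程序员改需求,需求改完改程序"
for kw, weight in jieba.analyse.textrank(text, topK=5, withWeight=True,
                                         allowPOS=('ns', 'n', 'vn', 'v')):
    print(kw, round(weight, 3))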
Example 14: testDefaultCut
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def testDefaultCut(self):
for content in test_contents:
result = jieba.cut(content)
assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
result = list(result)
assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
print(" , ".join(result), file=sys.stderr)
print("testDefaultCut", file=sys.stderr)
Example 15: testCutAll
# Required import: from jieba import posseg [as alias]
# Or: from jieba.posseg import cut [as alias]
def testCutAll(self):
for content in test_contents:
result = jieba.cut(content, cut_all=True)
assert isinstance(result, types.GeneratorType), "Test CutAll Generator error"
result = list(result)
assert isinstance(result, list), "Test CutAll error on content: %s" % content
print(" , ".join(result), file=sys.stderr)
print("testCutAll", file=sys.stderr)