This article collects typical usage examples of the jieba.posseg module in Python. If you have been wondering what exactly jieba.posseg does, how to use it, or where to find examples of it in practice, the curated code samples below may help. You can also explore the jieba package that the module belongs to.
The following shows 14 code examples of jieba.posseg, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
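As a quick orientation before the examples: jieba.posseg segments Chinese text and tags each token with a part-of-speech flag. A minimal sketch of its basic use, mirroring the standard jieba documentation:

import jieba.posseg as pseg

for word, flag in pseg.cut('我爱北京天安门'):
    print('%s %s' % (word, flag))  # e.g. 我 r / 爱 v / 北京 ns / 天安门 ns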
Example 1: cutfunc
# Required module: import jieba [as alias]
# Or: from jieba import posseg [as alias]
def cutfunc(sentence, _, HMM=True):
    for w, f in jieba.posseg.cut(sentence, HMM):
        yield w + posdelim + f
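Here posdelim is a module-level delimiter string defined elsewhere in the source (in jieba's command-line front end it appears to come from a CLI option). A hypothetical driver, assuming import jieba.posseg has been done and posdelim = '/':

posdelim = '/'  # assumed delimiter; in the original it is set outside this function
print(' '.join(cutfunc('我爱北京天安门', None)))  # e.g. 我/r 爱/v 北京/ns 天安门/ns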
Example 2: __init__
# Required module: import jieba [as alias]
# Or: from jieba import posseg [as alias]
def __init__(self, idf_path=None):
    self.tokenizer = jieba.dt            # default word tokenizer
    self.postokenizer = jieba.posseg.dt  # default POS tokenizer
    self.stop_words = self.STOP_WORDS.copy()
    self.idf_loader = IDFLoader(idf_path or DEFAULT_IDF)
    self.idf_freq, self.median_idf = self.idf_loader.get_idf()
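This __init__ matches the TFIDF keyword extractor in jieba.analyse, where IDFLoader, DEFAULT_IDF, and STOP_WORDS are defined. In everyday use you rarely instantiate it directly; the module-level helper builds the default instance for you:

import jieba.analyse

# extract_tags() goes through the default TFIDF instance built by the __init__ above
print(jieba.analyse.extract_tags('我爱自然语言处理和机器学习', topK=3))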
Example 3: testPosseg
# Required module: import jieba [as alias]
# Or: from jieba import posseg [as alias]
def testPosseg(self):
    import jieba.posseg as pseg
    for content in test_contents:
        result = pseg.cut(content)
        assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Posseg error on content: %s" % content
        print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
    print("testPosseg", file=sys.stderr)
Example 4: testPosseg_NOHMM
# Required module: import jieba [as alias]
# Or: from jieba import posseg [as alias]
def testPosseg_NOHMM(self):
    import jieba.posseg as pseg
    for content in test_contents:
        result = pseg.cut(content, HMM=False)
        assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Posseg error on content: %s" % content
        print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
    print("testPosseg_NOHMM", file=sys.stderr)
Example 5: text2ner
# Required module: import jieba [as alias]
# Or: from jieba import posseg [as alias]
def text2ner(text):
    seq, pos, label = [], [], []
    segment = jieba.posseg.cut(text)
    words, flags = [], []
    for seg in segment:
        words.append(seg.word)
        flags.append(seg.flag)
    i = 0
    tag = 'O'
    pre = 0   # nesting depth of the preceding <...> markers
    sign = 0  # set when several consecutive <...> spans occur
    while i < len(words):
        if words[i] != '<':
            seq.append(words[i])
            pos.append(flags[i])
            label.append(tag)
            if tag == 'B':
                tag = 'I'
                sign = 1
            i += 1
        else:
            if words[i+1] == '/':  # closing marker </...>
                pre -= 1
                if pre == 0:
                    tag = 'O'
                else:
                    tag = 'I'
            else:                  # opening marker <...>
                pre += 1
                if pre == 1:
                    tag = 'B'
                    sign = 0
                elif sign == 1:
                    tag = 'I'
            # skip ahead past the closing '>'
            while i < len(words) and words[i] != '>':
                i += 1
            i += 1
    return seq, pos, label
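text2ner expects text in which entities are wrapped in angle-bracket markers (an opening <...> and a closing </...>) and emits parallel token, POS, and BIO label sequences. A hypothetical invocation, assuming jieba splits the markers into separate '<', '/', ..., '>' tokens as the loop requires:

# Hypothetical markup; the tag name 'dis' is invented for illustration.
seq, pos, label = text2ner('患者出现<dis>头痛</dis>症状')
print(list(zip(seq, pos, label)))  # tokens inside the markers should be labeled B/I, the rest O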
Example 6: test_segment
# Required module: import jieba [as alias]
# Or: from jieba import posseg [as alias]
def test_segment():
    """Test disease-name correction."""
    error_sentence_1 = '这个新药奥美砂坦脂片能治疗心绞痛,效果还可以'  # correct drug name: 奥美沙坦酯片
    print(error_sentence_1)
    print(segment(error_sentence_1))
    import jieba
    print(list(jieba.tokenize(error_sentence_1)))
    import jieba.posseg as pseg
    words = pseg.lcut("我爱北京天安门")  # jieba default mode
    print('old:', words)
    # jieba.enable_paddle()  # enable paddle mode (supported since v0.40; earlier versions do not)
    # words = pseg.cut("我爱北京天安门", use_paddle=True)  # paddle mode
    # for word, flag in words:
    #     print('new:', '%s %s' % (word, flag))
Example 7: posseg_cut_examples
# Required module: import jieba [as alias]
# Or: from jieba import posseg [as alias]
def posseg_cut_examples(self, example):
    raw_entities = example.get("entities", [])
    example_posseg = self.posseg(example.text)
    for (item_posseg, start, end) in example_posseg:
        part_of_speech = self.component_config["part_of_speech"]
        for (word_posseg, flag_posseg) in item_posseg:
            if flag_posseg in part_of_speech:
                raw_entities.append({
                    'start': start,
                    'end': end,
                    'value': word_posseg,
                    'entity': flag_posseg
                })
    return raw_entities
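This reads like a Rasa-style entity extractor: self.posseg (see Examples 8 and 10) tokenizes the message text, and every sub-token whose POS flag appears in the component's part_of_speech setting is appended as an entity. A hypothetical config illustrating the shape the method expects (the key is inferred from the body, not from any official schema):

component_config = {
    'part_of_speech': ['nr', 'ns', 'nt']  # e.g. person, place, organization flags
}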
Example 8: posseg
# Required module: import jieba [as alias]
# Or: from jieba import posseg [as alias]
def posseg(text):
    # type: (Text) -> List[Token]
    result = []
    for (word, start, end) in jieba.tokenize(text):
        pseg_data = [(w, f) for (w, f) in pseg.cut(word)]
        result.append((pseg_data, start, end))
    return result
Example 9: posseg_cut_examples
# Required module: import jieba [as alias]
# Or: from jieba import posseg [as alias]
def posseg_cut_examples(self, example):
    raw_entities = example.get("entities", [])
    example_posseg = self.posseg(example.text)
    for (item_posseg, start, end) in example_posseg:
        part_of_speech = self.component_config["part_of_speech"]
        for (word_posseg, flag_posseg) in item_posseg:
            if flag_posseg in part_of_speech:
                raw_entities.append({
                    'start': start,
                    'end': end,
                    'value': word_posseg,
                    'entity': flag_posseg
                })
    return raw_entities
Example 10: posseg
# Required module: import jieba [as alias]
# Or: from jieba import posseg [as alias]
def posseg(text):
    # type: (Text) -> List[Token]
    import jieba
    import jieba.posseg as pseg
    result = []
    for (word, start, end) in jieba.tokenize(text):
        pseg_data = [(w, f) for (w, f) in pseg.cut(word)]
        result.append((pseg_data, start, end))
    return result
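Given the definitions above, posseg returns a list of (pseg_data, start, end) triples, one per jieba.tokenize span, where pseg_data is the (word, flag) breakdown of that span. A quick check:

for pseg_data, start, end in posseg('我爱北京天安门'):
    print(start, end, pseg_data)  # e.g. 2 4 [('北京', 'ns')]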
Example 11: get_n
# Required module: import jieba [as alias]
# Or: from jieba import posseg [as alias]
def get_n(sentence):
    words = jieba.posseg.cut(sentence)
    word_list = []
    for word, flag in words:
        if 'n' in flag or flag in ['vn']:
            word_list.append(word)
    return set(word_list)
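get_n keeps tokens whose POS flag contains 'n' (covering noun flags such as n, ns, nr, and also vn, which makes the explicit ['vn'] check redundant but harmless). For instance:

print(get_n('我爱北京天安门'))  # expected: {'北京', '天安门'}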
Example 12: posseg
# Required module: import jieba [as alias]
# Or: from jieba import posseg [as alias]
def posseg(self, sent, standard_name=False, stopwords=None):
    if self.language == 'en':
        from nltk import word_tokenize, pos_tag
        stopwords = set() if stopwords is None else stopwords
        tokens = [word for word in word_tokenize(sent) if word not in stopwords]
        return pos_tag(tokens, tagset='universal')
    else:
        self.standard_name = standard_name
        entities_info = self.entity_linking(sent)
        sent2 = self.decoref(sent, entities_info)
        result = []
        i = 0
        for word, flag in pseg.cut(sent2):
            if word in self.entity_types:
                if self.standard_name:
                    word = entities_info[i][1][0]  # use the linked entity
                else:
                    l, r = entities_info[i][0]  # or use the original text
                    word = sent[l:r]
                flag = entities_info[i][1][1][1:-1]
                i += 1
            else:
                if stopwords and word in stopwords:
                    continue
            result.append((word, flag))
        return result
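Judging from the indexing in the loop, each entities_info entry appears to have the shape ((l, r), (standard_name, '#Type#')): the character span into the original sentence, the linked standard name, and a type flag wrapped in '#' marks that the [1:-1] slice strips off. A hypothetical entry for illustration:

entry = ((0, 3), ('北京市', '#地名#'))  # invented values; shape inferred from the code
l, r = entry[0]
flag = entry[1][1][1:-1]  # -> '地名'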
Example 13: synonym_cut
# Required module: import jieba [as alias]
# Or: from jieba import posseg [as alias]
def synonym_cut(sentence, pattern="wf"):
    """Cut the sentence into a synonym vector tag.
    If a word in this sentence was not found in the synonym dictionary,
    it is marked with the segmentation tool's default POS tag.
    Args:
        pattern: 'w' - words, 'k' - single top keyword, 't' - keyword list,
            'wf' - (word, flag) pairs, 'tf' - keyword (word, flag) pairs.
    """
    # Strip trailing punctuation
    sentence = sentence.rstrip(''.join(punctuation_all))
    # Strip trailing modal (tone) words
    sentence = sentence.rstrip(tone_words)
    synonym_vector = []
    if pattern == "w":
        synonym_vector = [item for item in jieba.cut(sentence) if item not in filter_characters]
    elif pattern == "k":
        synonym_vector = analyse.extract_tags(sentence, topK=1)
    elif pattern == "t":
        synonym_vector = analyse.extract_tags(sentence, topK=10)
    elif pattern == "wf":
        result = posseg.cut(sentence)
        # synonym_vector = [(item.word, item.flag) for item in result
        #                   if item.word not in filter_characters]
        # Modified 2017-04-27
        for item in result:
            if item.word not in filter_characters:
                if len(item.flag) < 4:
                    item.flag = list(posseg.cut(item.word))[0].flag
                synonym_vector.append((item.word, item.flag))
    elif pattern == "tf":
        result = posseg.cut(sentence)
        tags = analyse.extract_tags(sentence, topK=10)
        for item in result:
            if item.word in tags:
                synonym_vector.append((item.word, item.flag))
    return synonym_vector
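A hypothetical call, assuming punctuation_all, tone_words, filter_characters, and the jieba/posseg/analyse imports are all defined in the surrounding module:

print(synonym_cut('今天天气怎么样', pattern='wf'))  # -> [(word, flag), ...]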
Example 14: extract_tags
# Required module: import jieba [as alias]
# Or: from jieba import posseg [as alias]
def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=(), withFlag=False):
    """
    Extract keywords from sentence using the TF-IDF algorithm.
    Parameters:
        - topK: return this many top keywords. `None` for all possible words.
        - withWeight: if True, return a list of (word, weight) pairs;
          if False, return a list of words.
        - allowPOS: the allowed POS list, e.g. ['ns', 'n', 'vn', 'v', 'nr'].
          Words whose POS is not in this list are filtered out.
        - withFlag: only works when allowPOS is not empty.
          If True, return a list of pair(word, weight) like posseg.cut;
          if False, return a list of words.
    """
    if allowPOS:
        allowPOS = frozenset(allowPOS)
        words = self.postokenizer.cut(sentence)
    else:
        words = self.tokenizer.cut(sentence)
    freq = {}
    for w in words:
        if allowPOS:
            if w.flag not in allowPOS:
                continue
            elif not withFlag:
                w = w.word
        wc = w.word if allowPOS and withFlag else w
        if len(wc.strip()) < 2 or wc.lower() in self.stop_words:
            continue
        freq[w] = freq.get(w, 0.0) + 1.0
    total = sum(freq.values())
    for k in freq:
        kw = k.word if allowPOS and withFlag else k
        freq[k] *= self.idf_freq.get(kw, self.median_idf) / total
    if withWeight:
        tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
    else:
        tags = sorted(freq, key=freq.__getitem__, reverse=True)
    if topK:
        return tags[:topK]
    else:
        return tags
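This is essentially jieba's own TFIDF.extract_tags (compare the __init__ in Example 2) and is normally reached through jieba.analyse:

import jieba.analyse

sentence = '此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元'
for word, weight in jieba.analyse.extract_tags(sentence, topK=5, withWeight=True):
    print(word, weight)
# restrict to nouns/verbs and keep the POS flag on each returned pair:
print(jieba.analyse.extract_tags(sentence, topK=5, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=True))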