This article collects typical usage examples of the jieba.lcut method in Python. If you are wondering what jieba.lcut does or how to use it, the curated code examples below may help. You can also explore further usage examples of the jieba module that this method belongs to.
The sections below show 15 code examples of jieba.lcut, sorted by popularity by default.
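Before going through the examples, the snippet below is a minimal, self-contained sketch of the call itself: jieba.lcut segments a Chinese string and returns the tokens as a plain Python list (unlike jieba.cut, which returns a generator). The sample sentence and the printed output are only illustrative.

import jieba

tokens = jieba.lcut("我爱自然语言处理")              # default (accurate) mode
print(tokens)                                        # e.g. ['我', '爱', '自然语言', '处理']
print(jieba.lcut("我爱自然语言处理", cut_all=True))  # full mode: every possible word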
Example 1: process_data

# Required module: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def process_data(train_file, user_dict=None, stop_dict=None):
    # Load a custom user dictionary into jieba (must follow jieba's user-dict format)
    if user_dict:
        jieba.load_userdict(user_dict)
    # Load the stop-word list (one stop word per line)
    stop_words = []
    if stop_dict:
        with open(stop_dict, 'r', encoding='utf-8') as file:
            stop_words = [stop_word.strip() for stop_word in file.readlines()]
    # Read the training file, segment each line, and drop stop words
    with open(train_file, 'r', encoding='utf-8') as file:
        sentences = file.readlines()
    sentences = [jieba.lcut(sentence.strip()) for sentence in sentences]
    sentences = [[s for s in sentence if s not in stop_words and s.strip() != ''] for sentence in sentences]
    return sentences
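A possible way to call process_data, with placeholder file paths (corpus.txt, user_dict.txt and stopwords.txt are illustrative names, not files from the original project):

# Hypothetical paths, for illustration only.
sentences = process_data('corpus.txt',
                         user_dict='user_dict.txt',
                         stop_dict='stopwords.txt')
print(sentences[0])  # the first corpus line as a list of tokens with stop words removed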
Example 2: __call__

# Required module: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def __call__(self, sent):
    sent = ptxt.Text(sent, "whi").clean
    sent = self.clean_linkpic(sent)
    sent = self.clean_english(sent)
    sent = self.clean_date(sent)
    sent = self.clean_time(sent)
    sent = self.clean_money(sent)
    sent = self.clean_weight(sent)
    sent = self.clean_concentration(sent)
    sent = self.clean_entity(sent)
    sent = self.clean_nums(sent)
    wlist = jieba.lcut(sent)
    sent = self.clean_stopwords(wlist)
    sent = self.clean_punctuation(sent)
    return sent
Example 3: cut_texts

# Required module: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def cut_texts(texts=None, need_cut=True, word_len=1):
    '''
    Use jieba to cut texts.
    :param texts: list of texts
    :param need_cut: whether the texts still need to be segmented
    :param word_len: minimum length of words to keep, used to drop stop words
    :return: list of token lists
    '''
    if need_cut:
        if word_len > 1:
            texts_cut = [[word for word in jieba.lcut(text) if len(word) >= word_len] for text in texts]
        else:
            texts_cut = [jieba.lcut(one_text) for one_text in texts]
    else:
        if word_len > 1:
            texts_cut = [[word for word in text if len(word) >= word_len] for text in texts]
        else:
            texts_cut = texts
    return texts_cut
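A short usage sketch of cut_texts with made-up sample texts; setting word_len=2 keeps only tokens of at least two characters:

texts = ["今天天气很好", "我喜欢机器学习"]
print(cut_texts(texts=texts, need_cut=True, word_len=2))  # single-character tokens are dropped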
Example 4: segment

# Required module: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def segment(self, sentence, entity_postag=dict()):
    """Segment a sentence with jieba (the original NLPIR calls are kept as comments).
    Args:
        sentence: string, the sentence to segment
        entity_postag: dict, entity/POS dictionary; empty by default, built while
            analysing the structured text of each case
    Returns:
        lemmas: list, the segmentation result
    """
    # Add entity words to the dictionary
    if entity_postag:
        for entity in entity_postag:
            # pynlpir.nlpir.AddUserWord(c_char_p(entity.encode()))
            jieba.add_word(entity)
    # pynlpir.nlpir.AddUserWord(c_char_p('前任'.encode()))  # example of adding a single user word
    # pynlpir.nlpir.AddUserWord(c_char_p('习近平'.encode()))  # example of adding a single user word
    # Segment without POS tagging
    # lemmas = pynlpir.segment(sentence, pos_tagging=False)
    lemmas = jieba.lcut(sentence)
    # pynlpir.close()  # release resources
    return lemmas
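The key call in this example is jieba.add_word, which injects a term into jieba's in-memory dictionary at runtime so the segmenter is less likely to split it apart. A standalone sketch (the term is arbitrary and the before/after output depends on jieba's default dictionary):

import jieba

sentence = "结构化文本分析"
print(jieba.lcut(sentence))    # the term may be split into smaller pieces
jieba.add_word("结构化文本")   # register the term at runtime
print(jieba.lcut(sentence))    # the registered term is now more likely to stay intact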
Example 5: M_idf

# Required module: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def M_idf(self, s1, s2):
    v1, v2 = [], []
    s1_list = jieba.lcut(s1)
    s2_list = jieba.lcut(s2)
    for s in s1_list:
        idf_v = self.idf.get(s, 1)
        if s in self.voc:
            v1.append(1.0 * idf_v * self.voc[s])
    for s in s2_list:
        idf_v = self.idf.get(s, 1)
        if s in self.voc:
            v2.append(1.0 * idf_v * self.voc[s])
    v1 = np.array(v1).sum(axis=0)
    v2 = np.array(v2).sum(axis=0)
    sim = 1 - spatial.distance.cosine(v1, v2)
    return sim
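M_idf builds, for each sentence, the IDF-weighted sum of the word vectors found in self.voc and then compares the two sums with cosine similarity. Below is a standalone sketch of the same idea with toy dictionaries; idf, voc and the vector size are invented for illustration and are not part of the original class:

import numpy as np
import jieba
from scipy import spatial

idf = {"今天": 1.5, "天气": 2.0, "好": 1.2}                   # toy IDF weights
voc = {w: np.random.rand(8) for w in ["今天", "天气", "好"]}  # toy word vectors

def idf_weighted_sim(s1, s2):
    def embed(sentence):
        # IDF-weighted sum of the known word vectors in the sentence
        vecs = [idf.get(w, 1) * voc[w] for w in jieba.lcut(sentence) if w in voc]
        return np.sum(vecs, axis=0)
    return 1 - spatial.distance.cosine(embed(s1), embed(s2))

print(idf_weighted_sim("今天天气好", "今天天气很好"))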
Example 6: _gen_sxhy_dict

# Required module: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def _gen_sxhy_dict():
    print("Parsing shixuehanying dictionary ...")
    words = set()
    with open(_rawsxhy_path, 'r') as fin:
        for line in fin.readlines():
            if line[0] == '<':
                continue
            for phrase in line.strip().split()[1:]:
                if not is_cn_sentence(phrase):
                    continue
                idx = 0
                while idx + 4 <= len(phrase):
                    # Cut 2 chars each time.
                    words.add(phrase[idx : idx + 2])
                    idx += 2
                # Use jieba to cut the remaining 1-3 chars.
                if idx < len(phrase):
                    for word in jieba.lcut(phrase[idx:]):
                        words.add(word)
    with open(sxhy_path, 'w') as fout:
        fout.write(' '.join(words))
Example 7: segment

# Required module: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def segment(self, sentence):
    # TODO: try CRF-based segmentation.
    toks = []
    idx = 0
    while idx + 4 <= len(sentence):
        # Cut 2 chars each time.
        if sentence[idx : idx + 2] in self.sxhy_dict:
            toks.append(sentence[idx : idx + 2])
        else:
            for tok in jieba.lcut(sentence[idx : idx + 2]):
                toks.append(tok)
        idx += 2
    # Cut the remaining 1-3 chars.
    if idx < len(sentence):
        if sentence[idx:] in self.sxhy_dict:
            toks.append(sentence[idx:])
        else:
            for tok in jieba.lcut(sentence[idx:]):
                toks.append(tok)
    return toks

# For testing purposes.
Example 8: _prepare_data

# Required module: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def _prepare_data(self, temp_data):
    cans = temp_data["candidates"]
    cans = [self.vocab.tran2id(each, True) for each in cans]
    for text in temp_data["content"]:
        content = re.split(r'(#idiom\d+#)', text)
        doc = []
        loc = []
        labs = []
        tags = []
        for i, segment in enumerate(content):
            if re.match(r'#idiom\d+#', segment) is not None:
                tags.append(segment)
                if segment in self.ans:
                    labs.append(self.ans[segment])
                loc.append(len(doc))
                doc.append(self.vocab.tran2id('#idiom#'))
            else:
                doc += [self.vocab.tran2id(each) for each in jieba.lcut(segment)]
        yield doc, cans, labs, loc, tags
Example 9: train

# Required module: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def train():
    """
    Train the model and save it.
    """
    print('Loading Data...')
    inputTexts, labels = load_data()
    print(inputTexts.shape, labels.shape)
    print('segment...')
    # seg_data = [jieba.lcut(document.replace('\n', '')) for document in inputTexts]
    # print('word2vec...')
    # index_dict, word_vectors, data = word2vec_train(seg_data)
    # n_symbols = len(index_dict) + 1
    # x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.15)
    # print(x_train.shape, y_train.shape)
    # train_model(n_symbols, x_train, y_train, x_test, y_test)
    word_index, data = train_wordtoVect(inputTexts)
    input_dim = len(word_index) + 1
    x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.15)
    print(x_train.shape, y_train.shape)
    train_model(input_dim, x_train, y_train, x_test, y_test)
Example 10: train_wordtoVect

# Required module: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def train_wordtoVect(train_inputTexts):
    """
    Segment the texts and build the word index and padded index sequences.
    """
    texts = []
    for doc in train_inputTexts:
        seg_doc = jieba.lcut(doc.replace('\n', ''))
        d = " ".join(seg_doc)
        texts.append(d)
    tokenizer = text.Tokenizer()  # tokenizer; MAX_NB_WORDS would cap the vocabulary here
    tokenizer.fit_on_texts(texts)
    text_sequences = tokenizer.texts_to_sequences(texts)  # affected by num_words
    word_index = tokenizer.word_index  # word-to-index mapping
    data = sequence.pad_sequences(text_sequences, maxlen=MAX_SEQUENCE_LENGTH)
    return word_index, data
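This example relies on keras.preprocessing (text.Tokenizer and sequence.pad_sequences) plus a module-level MAX_SEQUENCE_LENGTH constant. Below is a self-contained sketch of the same pipeline; the import path, the constant value and the sample texts are assumptions and may differ from the original project (older code typically imports from keras.preprocessing rather than tensorflow.keras.preprocessing):

import jieba
from tensorflow.keras.preprocessing import text, sequence

MAX_SEQUENCE_LENGTH = 20  # assumed value
docs = ["今天天气很好", "我喜欢自然语言处理"]

texts = [" ".join(jieba.lcut(doc)) for doc in docs]
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(texts)
seqs = tokenizer.texts_to_sequences(texts)
data = sequence.pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)
print(tokenizer.word_index)
print(data.shape)  # (2, 20)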
Example 11: word_flag_cut

# Required module: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def word_flag_cut(sentence):
    """
    Segment a sentence with jieba and return the words and their POS flags.
    :param sentence:
    :return:
    """
    sentence = sentence.replace('\n', '').replace(',', '').replace('"', '').\
        replace(' ', '').replace('\t', '').upper().strip()
    word_list = []
    flag_list = []
    try:
        sentence_cut = ''.join(jieba.lcut(sentence, cut_all=False, HMM=False))
        words = jieba_seg.cut(sentence_cut)
        for word in words:
            word_list.append(word.word)
            flag_list.append(word.flag)
    except Exception as e:
        word_list = [sentence]
        flag_list = ['nt']
    return word_list, flag_list
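The jieba_seg object used above behaves like a jieba.posseg segmenter: jieba.posseg yields items exposing .word and .flag attributes. A minimal sketch, assuming jieba_seg simply refers to jieba.posseg:

import jieba.posseg as jieba_seg  # assumption: jieba_seg refers to jieba.posseg

for pair in jieba_seg.cut("我爱自然语言处理"):
    print(pair.word, pair.flag)  # token and its part-of-speech flag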
Example 12: cut_td_idf

# Required module: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def cut_td_idf(sources_path, target_path):
    """
    Segment Chinese text with jieba.
    :param sources_path: path of the source corpus
    :param target_path: path of the segmented output
    :return:
    """
    print("cut_td_idf start! ")
    corpus = txtRead(sources_path)
    governments = []
    for corpus_one in corpus:
        corpus_one_clear = corpus_one.replace('	', '').strip()
        ques_q2b = strQ2B(corpus_one_clear.strip())
        ques_q2b_syboml = get_syboml(ques_q2b)
        governments.append(ques_q2b_syboml.strip())
    government_ques = list(map(lambda x: ' '.join(jieba.lcut(x)), governments))
    topic_ques_all = []
    for topic_ques_one in government_ques:
        top_ques_aqlq = topic_ques_one.replace('  ', ' ').replace('  ', ' ').strip() + '\n'
        topic_ques_all.append(top_ques_aqlq)
    txtWrite(topic_ques_all, target_path)
    print("cut_td_idf ok! " + sources_path)
Example 13: __iter__

# Required module: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def __iter__(self):
    for filename in self.filenames:
        with codecs.open(filename, 'r', encoding='utf-8') as f:
            for _, line in enumerate(f):
                try:
                    line = line.strip()
                    line = line.split('\t')
                    assert len(line) == 2
                    blocks = re_han.split(line[1])
                    word = []
                    for blk in blocks:
                        if re_han.match(blk):
                            word.extend(jieba.lcut(blk))
                    yield word
                except:
                    pass
Example 14: sentence_cut

# Required module: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def sentence_cut(sentences):
    """
    Args:
        sentences: a list of texts to segment
    Returns:
        seglist: a list of sentences segmented by jieba
    """
    re_han = re.compile(u"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)")  # keep only CJK/alphanumeric runs, i.e. split on punctuation
    seglist = []
    for sentence in sentences:
        words = []
        blocks = re_han.split(sentence)
        for blk in blocks:
            if re_han.match(blk):
                words.extend(jieba.lcut(blk))
        seglist.append(words)
    return seglist
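The re_han pattern keeps only runs of CJK characters, ASCII letters, digits and a few symbols, so splitting on it and re-matching each block effectively strips punctuation before segmentation. A small sketch of that split-then-cut step on a made-up sentence:

import re
import jieba

re_han = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&._%]+)")
sentence = "你好,世界!NLP2024"
words = []
for blk in re_han.split(sentence):
    if re_han.match(blk):
        words.extend(jieba.lcut(blk))
print(words)  # punctuation such as ',' and '!' is dropped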
Example 15: sentence_cut

# Required module: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def sentence_cut(sentences):
    """
    Args:
        sentences: a list of texts to segment
    Returns:
        contents: a list of sentences segmented by jieba, with stop words removed
    """
    re_han = re.compile(u"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)")  # keep only CJK/alphanumeric runs, i.e. split on punctuation
    with codecs.open('./data/stopwords.txt', 'r', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f.readlines()]
    contents = []
    for sentence in sentences:
        words = []
        blocks = re_han.split(sentence)
        for blk in blocks:
            if re_han.match(blk):
                seglist = jieba.lcut(blk)
                words.extend([w for w in seglist if w not in stopwords])
        contents.append(words)
    return contents