This article collects typical usage examples of the jieba.cut method in Python. If you are wondering what jieba.cut does, how to call it, or what its usage looks like in practice, the curated code examples below should help. You can also explore the jieba module in which the method is defined.
The following shows 15 code examples of jieba.cut, listed roughly by popularity.
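Before the collected examples, here is a minimal sketch of jieba.cut itself (the sample sentence is only illustrative): jieba.cut returns a generator of tokens, and cut_all switches between precise and full segmentation mode.

import jieba

text = "我来到北京清华大学"  # illustrative sample sentence
# Precise mode (default): each character ends up in exactly one token
print("/".join(jieba.cut(text, cut_all=False)))
# Full mode: every word the dictionary can find, overlaps included
print("/".join(jieba.cut(text, cut_all=True)))
# jieba.cut yields a generator, so wrap it in list() if the tokens are needed more than once
tokens = list(jieba.cut(text))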
Example 1: calculate_similarity
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
# Also needed here: import math, and from collections import Counter
def calculate_similarity(text1, text2):
    # Tokenize both texts and turn the token streams into word-frequency counters
    raw1 = jieba.cut(text1)
    raw2 = jieba.cut(text2)
    raw1 = Counter(raw1)
    raw2 = Counter(raw2)
    same_words = set(raw1) & set(raw2)
    if (math.sqrt(len(raw1)) * math.sqrt(len(raw2))) != 0:
        dot_product = 0
        mod1 = 0
        mod2 = 0
        for word in same_words:
            dot_product += raw1[word] * raw2[word]
        for word in raw1:
            mod1 += math.pow(raw1[word], 2)
        for word in raw2:
            mod2 += math.pow(raw2[word], 2)
        # Cosine similarity of the two bag-of-words vectors
        cos = dot_product / math.sqrt(mod1 * mod2)
    else:
        cos = 0
    return cos
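A quick usage sketch for Example 1 (the two sentences are made up for illustration): the function returns the cosine similarity of the two bag-of-words vectors, a value between 0 and 1.

score = calculate_similarity("今天天气很好", "今天的天气不错")
print(round(score, 3))  # closer to 1 the more tokens the two texts share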
Example 2: title2wordbag
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
# zh_symbol and number are module-level collections of Chinese punctuation marks and digit characters
def title2wordbag(self, title, remove_stopwords=True):
    words = jieba.cut(title, cut_all=False)
    str_cut = ' '.join(words)
    for sym in zh_symbol:  # remove Chinese punctuation
        str_cut = str_cut.replace(sym, '')
    for sym in number:  # remove digits
        str_cut = str_cut.replace(sym, '')
    strlist_cut = str_cut.split(' ')
    strlist_new = []
    for word in strlist_cut:  # drop empty tokens, stop words, and non-Chinese tokens
        if (not len(word)) or (word in self.stop_words):
            continue
        elif (word[0] >= 'A' and word[0] <= 'Z') or (word[0] >= 'a' and word[0] <= 'z'):
            continue  # token starts with an English letter
        elif ord(word[0]) < 1024:
            continue  # other non-Chinese characters (low code points)
        else:
            strlist_new.append(word)
    return strlist_new
Example 3: sentenceToIndex
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
# Also needed here: import re
def sentenceToIndex(sentence, word2idx, maxLen):
    """
    Tokenize a sentence and convert it into a list of indices into the embedding matrix.
    :param sentence: the input sentence
    :param word2idx: word-to-index mapping
    :param maxLen: maximum sentence length
    :return: the sentence as a list of word-vector indices
    """
    unknown = word2idx.get("UNKNOWN", 0)
    num = word2idx.get("NUM", len(word2idx))
    index = [unknown] * maxLen  # pad with the UNKNOWN index up to maxLen
    i = 0
    for word in jieba.cut(sentence):
        if word in word2idx:
            index[i] = word2idx[word]
        else:
            if re.match(r"\d+", word):
                index[i] = num  # map number tokens to the NUM index
            else:
                index[i] = unknown
        if i >= maxLen - 1:
            break
        i += 1
    return index
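A hedged usage sketch for Example 3 (the toy word2idx below is invented for illustration): known words map to their own index, digit tokens to NUM, everything else to UNKNOWN, and the result is padded with UNKNOWN up to maxLen.

word2idx = {"UNKNOWN": 0, "NUM": 1, "我": 2, "喜欢": 3}  # toy vocabulary, illustration only
print(sentenceToIndex("我喜欢2023年的北京", word2idx, maxLen=8))
# e.g. [2, 3, 1, 0, 0, 0, 0, 0], assuming jieba segments "2023" as a single token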
Example 4: loadDataSet
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
# Also needed here: import re; stop is a module-level list of stop words
def loadDataSet(path):  # return the tokens and the label of each Weibo post
    line_cut = []
    label = []
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            temp = line.strip()
            try:
                sentence = temp[2:].lstrip()  # the post text
                label.append(int(temp[:2]))  # the label
                word_list = []
                sentence = str(sentence).replace('\u200b', '')
                for word in jieba.cut(sentence.strip()):
                    # Note: re.L with a str pattern only works on Python 2; on Python 3 it
                    # raises ValueError (see Example 5 for a bytes-based variant)
                    p = re.compile(r'\w', re.L)
                    result = p.sub("", word)
                    if not result or result == ' ':  # empty string
                        continue
                    word_list.append(word)
                word_list = list(set(word_list) - set(stop) - set('\u200b')
                                 - set(' ') - set('\u3000') - set('️'))
                line_cut.append(word_list)
            except Exception:
                continue  # skip malformed lines
    return line_cut, label  # tokens and label for each post
Example 5: loadDataSet
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
# Same as Example 4, but the locale-aware regex works on bytes, so it also runs on Python 3
def loadDataSet(path):  # return the tokens and the label of each Weibo post
    line_cut = []
    label = []
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            temp = line.strip()
            try:
                sentence = temp[2:].lstrip()  # the post text
                label.append(int(temp[:2]))  # the label
                word_list = []
                sentence = str(sentence).replace('\u200b', '')
                for word in jieba.cut(sentence.strip()):
                    p = re.compile(rb'\w', re.L)
                    result = p.sub(b"", bytes(word, encoding="utf-8")).decode("utf-8")
                    if not result or result == ' ':  # empty string
                        continue
                    word_list.append(word)
                word_list = list(set(word_list) - set(stop) - set('\u200b')
                                 - set(' ') - set('\u3000') - set('️'))
                line_cut.append(word_list)
            except Exception:
                continue  # skip malformed lines
    return line_cut, label  # tokens and label for each post
Example 6: read_data
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
# Also needed here: import codecs; reg_sep is a module-level compiled regex of separators
def read_data(fin):
    poem_words = list()
    title_flag = False
    title = ''
    fd = codecs.open(fin, 'r', 'utf-8')
    for line in fd:
        line = line.strip()
        line = reg_sep.sub(' ', line)
        title_flag = not title_flag  # the input alternates between title lines and body lines
        if title_flag:
            title = line
        else:
            words = ' '.join(jieba.cut(title + line))
            poem_words.append(words)
    fd.close()
    print('Read data done.')
    return poem_words
Example 7: analyze_zh
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
def analyze_zh():
    translation_path = os.path.join(train_translation_folder, train_translation_zh_filename)
    with open(translation_path, 'r') as f:
        sentences = f.readlines()
    sent_lengths = []
    for sentence in tqdm(sentences):
        seg_list = list(jieba.cut(sentence.strip()))
        # Record the sentence length in jieba tokens
        sent_lengths.append(len(seg_list))
    num_bins = 100
    n, bins, patches = plt.hist(sent_lengths, num_bins, facecolor='blue', alpha=0.5)
    title = 'Chinese Sentence Lengths Distribution'
    plt.title(title)
    plt.show()
Example 8: analyze_en
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
def analyze_en():
    translation_path = os.path.join(train_translation_folder, train_translation_en_filename)
    with open(translation_path, 'r') as f:
        sentences = f.readlines()
    sent_lengths = []
    for sentence in tqdm(sentences):
        sentence_en = sentence.strip().lower()
        tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en)]
        seg_list = list(jieba.cut(sentence.strip()))
        # Record the sentence length as the number of jieba tokens
        # (note that the nltk `tokens` list computed above is not used here)
        sent_lengths.append(len(seg_list))
    num_bins = 100
    n, bins, patches = plt.hist(sent_lengths, num_bins, facecolor='blue', alpha=0.5)
    title = 'English Sentence Lengths Distribution'
    plt.title(title)
    plt.show()
Example 9: WordBeark
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
def WordBeark():
    logger.info("running Word Beark in " + path + data)
    inputfile = path + data + ".zhs"
    outputfile = path + data + ".wordbreak"
    i = 0
    output = open(outputfile, 'w')
    input = open(inputfile, 'r')
    for line in input.readlines():
        seg_list = jieba.cut(line)
        output.write(u' '.join(seg_list))  # write the segmented line, tokens separated by spaces
        i = i + 1
        if (i % 10000 == 0):
            logger.info("Cut " + str(i) + " articles")
    output.close()
    logger.info("Finished Saved " + str(i) + " articles in " + outputfile)
Example 10: build_key_word
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
def build_key_word(path):
    """
    Build a keyword list from word frequencies.
    :param path: path to a UTF-8 text file
    :return: the most frequent words
    """
    d = {}
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            for word in jieba.cut(line.strip()):
                if len(word) > 1:  # skip single characters, which mostly add noise to the counts
                    d[word] = d.get(word, 0) + 1
    kw_list = sorted(d, key=lambda x: d[x], reverse=True)
    # Keep the top 20% most frequent words
    size = int(len(kw_list) * 0.2)
    return kw_list[:size]
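A usage sketch for Example 10 (the file path is hypothetical): the returned high-frequency words can serve as a small keyword vocabulary, for instance to build a bag-of-keywords vector for a single post.

keywords = build_key_word("train_corpus.txt")  # hypothetical path to a UTF-8 text file
post_tokens = set(jieba.cut("今天天气很好"))
features = [1 if kw in post_tokens else 0 for kw in keywords]  # one indicator per keyword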
Example 11: _zh_split
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
def _zh_split(s):
    """
    Split text for length counting, with special handling for Chinese.
    """
    import jieba
    try:
        s.encode('ascii')
        has_zh = False
    except ValueError:  # UnicodeEncodeError is a subclass of ValueError
        has_zh = True
    if has_zh:
        return list(jieba.cut(s))
    else:
        return pofile.WORD_SEP.split(s)

# code modified from babel.messages.pofile (hash 359ecffca479dfe032d0f7210d5cd8160599c816)
Example 12: _asian_tokenization
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
# punct_re, Tok, Sentence and Doc are defined elsewhere in the same module
def _asian_tokenization(doc, entity_type, tag_type, tokenizer):
    sents = []
    for paragraph in doc.split('\n'):
        # Split on sentence-final punctuation; the capture group keeps the delimiters,
        # which are re-attached to the preceding fragment below
        sent_splits = iter(re.split(r'(?|。|」|!)+', paragraph, flags=re.MULTILINE))
        for partial_sent in sent_splits:
            sent = partial_sent + next(sent_splits, '')
            if sent.strip() == '':
                continue
            toks = []
            # for tok in jieba.cut(sent, ):
            for tok in tokenizer(sent):
                pos = 'WORD'
                if tok.strip() == '':
                    pos = 'SPACE'
                elif punct_re.match(tok):
                    pos = 'PUNCT'
                toks.append(Tok(pos,
                                tok[:2].lower(),
                                tok.lower(),
                                tok,
                                ent_type='' if entity_type is None else entity_type.get(tok, ''),
                                tag='' if tag_type is None else tag_type.get(tok, '')))
            sents.append(Sentence(toks, sent))
    return Doc(sents, doc)
Example 13: __init__
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
def __init__(self, data_file, sequence_length, word2idx, char_level=True):
    self.word2idx = word2idx
    self.seq_len = sequence_length
    x1, x2, y = [], [], []
    for line in open(data_file, 'r'):
        _, s1, s2, label = line.strip().split('\t')
        s1, s2 = map(self._clean_text, [s1, s2])
        if not char_level:
            # Word-level mode: segment both sentences with jieba
            s1 = list(jieba.cut(s1))
            s2 = list(jieba.cut(s2))
        x1.append(s1)
        x2.append(s2)
        y.append(1 if label == '1' else 0)
    self.x1 = x1
    self.x2 = x2
    self.y = y
Example 14: _load_data
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
def _load_data(self, data_file):
    """Load the original training data and do text pre-processing (conversion and cleaning).
    Returns:
        A generator yielding
        if self.is_training:
            sentence pairs and labels (s1, s2, y);
        else:
            sentence pairs and None (s1, s2, None).
    """
    for line in open(data_file):
        line = line.strip().decode('utf-8').split('\t')  # Python 2 style: decode the raw bytes
        s1, s2 = map(self._clean_text, map(self._tradition2simple, line[1:3]))
        if not self.char_level:
            s1 = list(jieba.cut(s1))
            s2 = list(jieba.cut(s2))
        if self.is_training:
            y = int(line[-1])  # 1 or [1]
            yield s1, s2, y
        else:
            yield s1, s2, None  # for consistency
Example 15: extract_bow
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
# Also needed here: import pandas as pd; from sklearn.feature_extraction.text import TfidfVectorizer;
# preprocess is defined elsewhere in the project
def extract_bow(train_file="../data/train.csv", test_file="../data/test.csv", analyzer='char',
                ngram_range=(1, 1), stop_words=[], min_df=1, max_features=10000,
                use_idf=True, to_preprocess=True):
    """Return TF-IDF matrices for train title1/title2 and test title1/title2, plus the fitted vectorizer."""
    df_train = pd.read_csv(train_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df_test = pd.read_csv(test_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df = pd.DataFrame()
    # Fit the vectorizer on the set of unique titles from both train and test
    df['text'] = pd.Series(df_train['title1_zh'].tolist() + df_train['title2_zh'].tolist() +
                           df_test['title1_zh'].tolist() + df_test['title2_zh'].tolist()).unique()
    if to_preprocess:
        df['text'] = df['text'].map(lambda x: preprocess(x))
        df_train['title1_zh'] = df_train['title1_zh'].apply(preprocess)
        df_train['title2_zh'] = df_train['title2_zh'].apply(preprocess)
        df_test['title1_zh'] = df_test['title1_zh'].apply(preprocess)
        df_test['title2_zh'] = df_test['title2_zh'].apply(preprocess)
    if analyzer == 'char':
        vect = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, stop_words=stop_words,
                               min_df=min_df, max_features=max_features, use_idf=use_idf)
    else:
        # Word-level analysis: jieba.cut serves as the tokenizer
        vect = TfidfVectorizer(analyzer=analyzer, tokenizer=jieba.cut, ngram_range=ngram_range,
                               stop_words=stop_words, min_df=min_df, max_features=max_features,
                               use_idf=use_idf)
    vect.fit(df["text"].tolist())
    return (vect.transform(df_train.title1_zh), vect.transform(df_train.title2_zh),
            vect.transform(df_test.title1_zh), vect.transform(df_test.title2_zh), vect)
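Finally, a hedged usage sketch for Example 15 (paths and column names follow the defaults above): because TfidfVectorizer L2-normalizes its rows by default, the row-wise dot product of the two title matrices is their cosine similarity, which makes a simple pairwise feature.

import numpy as np

X1_tr, X2_tr, X1_te, X2_te, vect = extract_bow(analyzer='word')  # word-level, tokenized by jieba.cut
train_sim = np.asarray(X1_tr.multiply(X2_tr).sum(axis=1)).ravel()  # one cosine score per train pair
test_sim = np.asarray(X1_te.multiply(X2_te).sum(axis=1)).ravel()   # and per test pair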