This article collects typical usage examples of the jieba.enable_parallel method in Python. If you have been wondering what jieba.enable_parallel does, how to call it, or what real code using it looks like, the curated examples below may help. You can also explore other usages of the jieba module.
The following shows 7 code examples of jieba.enable_parallel, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
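Before the examples, here is a minimal sketch of how parallel segmentation is typically switched on and off around a batch run; the sentences and the worker count are illustrative only, and jieba's parallel mode is not available on Windows.

# Minimal illustrative sketch (not one of the examples below).
import jieba

jieba.enable_parallel(4)                  # start 4 worker processes (illustrative value)
texts = ["我来到北京清华大学", "小明硕士毕业于中国科学院计算所"]
tokens = [jieba.lcut(t) for t in texts]   # segmentation now runs in parallel mode
jieba.disable_parallel()                  # restore the default single-process mode
print(tokens)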
Example 1: word_segmentation
# Required module: import jieba [as alias]
# Or: from jieba import enable_parallel [as alias]
def word_segmentation(content, stop_words):
    # Tokenize the text with jieba
    jieba.enable_parallel()
    seg_list = jieba.cut(content, cut_all=False)
    seg_list = list(seg_list)
    # Remove stop words
    word_list = []
    for word in seg_list:
        if word not in stop_words:
            word_list.append(word)
    # Filter out leftover tokens and spaces
    user_dict = [' ', '哒']
    filter_space = lambda w: w not in user_dict
    word_list = list(filter(filter_space, word_list))
    return word_list

# Word frequency statistics
# Return the top_N most frequent values; if top_N is not given, return all values
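A small usage sketch for the function above; the sample sentence and the stop-word set are made up for illustration, and import jieba is assumed at module level.

# Hypothetical call to Example 1's word_segmentation; stop_words is an illustrative set.
stop_words = {"的", "了", "是"}
content = "今天的天气真的很不错哒"
print(word_segmentation(content, stop_words))
# -> token list with stop words, spaces, and the filler character "哒" removed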
Example 2: segment_file
# Required module: import jieba [as alias]
# Or: from jieba import enable_parallel [as alias]
# This example also requires: from jieba import posseg
def segment_file(in_file, out_file, word_sep=' ', pos_sep='/', is_pos=True):
    """
    Segment the input file and write the result to the output file.
    :param in_file: path of the input text file
    :param out_file: path of the output file
    :param word_sep: separator inserted between words
    :param pos_sep: separator between a word and its POS tag
    :param is_pos: whether to include part-of-speech tags
    :return:
    """
    jieba.enable_parallel()
    with open(in_file, 'r', encoding='utf-8') as fin, open(out_file, 'w', encoding='utf-8') as fout:
        count = 0
        for line in fin:
            in_line = line.strip()
            seg_line = ''
            if is_pos:
                words = posseg.lcut(in_line)
                for word, pos in words:
                    seg_line += word + pos_sep + pos + word_sep
            else:
                words = jieba.lcut(in_line)
                for word in words:
                    seg_line += word + word_sep
            fout.write(seg_line + "\n")
            count += 1
        print("segment ok. input file count:", count)
Example 3: tf_idf
# Required module: import jieba [as alias]
# Or: from jieba import enable_parallel [as alias]
# This example also requires: import jieba.analyse
def tf_idf(texts):
    jieba.load_userdict("./model/dict.txt")
    jieba.analyse.set_idf_path("./model/idf.txt")
    jieba.analyse.set_stop_words("./model/chinese_stopwords.txt")
    jieba.enable_parallel(8)
    # extract_tags returns the topK keywords of each text as a list of strings
    corpus = [jieba.analyse.extract_tags(s, topK=15) for s in texts]
    return corpus
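A usage sketch for tf_idf, assuming the dictionary, IDF, and stop-word files under ./model/ exist; the input sentences are illustrative.

# Hypothetical call to Example 3's tf_idf: one keyword list per input text.
docs = ["机器学习是人工智能的一个分支", "深度学习推动了自然语言处理的发展"]
print(tf_idf(docs))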
Example 4: word_segmentation
# Required module: import jieba [as alias]
# Or: from jieba import enable_parallel [as alias]
def word_segmentation(content, stop_words):
    # Tokenize the text with jieba
    jieba.enable_parallel()
    seg_list = jieba.cut(content)
    seg_list = list(seg_list)
    # Remove stop words, stray spaces, and filler tokens
    user_dict = [' ', '哒']
    filter_space = lambda w: w not in stop_words and w not in user_dict
    word_list = list(filter(filter_space, seg_list))
    return word_list

# Convert the Weibo posts stored in the database into a single string
Example 5: load_chinese_data
# Required module: import jieba [as alias]
# Or: from jieba import enable_parallel [as alias]
# This example also requires: import os, re; import pandas as pd;
# from sklearn import preprocessing; from sklearn.model_selection import train_test_split
def load_chinese_data(file_path, save_path, test_size=0.1, verbose=True):
    if os.path.exists(save_path):
        data = pd.read_csv(save_path, sep=",", header=0)
    else:
        data = pd.read_excel(file_path, sheet_name="sheet1")
        data = data.rename(index=str, columns={"分类": "label", "正文": "text"})
        # tokenization
        jieba.enable_parallel(16)
        data["tokens"] = data["text"].apply(lambda x: jieba.cut(x.strip()))
        data["tokens"] = [" ".join(x) for x in data["tokens"]]
        data["tokens"] = data["tokens"].apply(
            lambda x: re.sub(" +", " ", x.strip().replace("\n", " ").replace("\t", " ")))
        data.to_csv(save_path, sep=",", header=True, index=False, na_rep="")

    label_encoder = preprocessing.LabelEncoder()
    labels = label_encoder.fit_transform(data.label.values)
    x_train, x_test, y_train, y_test = train_test_split(data.tokens.values, labels, stratify=labels,
                                                        random_state=1234, test_size=test_size, shuffle=True)
    if verbose:
        print("sample tokenized text: {}".format(data["tokens"].values[0]), flush=True)
        print("labels: {}".format(data.label.unique()), flush=True)
        print("train set shape: {}, test set shape: {}".format(x_train.shape, x_test.shape))

    return x_train, x_test, y_train, y_test
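A usage sketch for load_chinese_data with placeholder file paths; the Excel file is assumed to contain the 分类 (label) and 正文 (text) columns that the function renames.

# Hypothetical call to Example 5's load_chinese_data.
x_train, x_test, y_train, y_test = load_chinese_data(
    "labeled_posts.xlsx",            # placeholder path to the raw Excel file
    "labeled_posts_tokenized.csv",   # placeholder path for the cached, tokenized CSV
    test_size=0.1, verbose=True)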
Example 6: cut
# Required module: import jieba [as alias]
# Or: from jieba import enable_parallel [as alias]
def cut(text, custom_words=['FLOAT', 'TIME', 'DATE', 'EOS']):
    jieba.enable_parallel(32)
    # Register placeholder tokens so jieba keeps them as single words
    for word in custom_words:
        jieba.add_word(word)
    words = jieba.lcut(text)
    return words
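A usage sketch for cut; the input assumes numbers and dates have already been replaced by the FLOAT/DATE placeholder tokens.

# Hypothetical call to Example 6's cut: the placeholders survive as single tokens.
print(cut("股价上涨了FLOAT个百分点，截止DATE收盘"))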
Example 7: jieba_initialize
# Required module: import jieba [as alias]
# Or: from jieba import enable_parallel [as alias]
# This example also requires: import platform, multiprocessing
def jieba_initialize():
    # jieba's parallel mode is not supported on Windows
    if not platform.system().upper().startswith("WINDOWS"):
        jieba.enable_parallel(multiprocessing.cpu_count())
    jieba.load_userdict('resources/QAattrdic.txt')
    jieba.initialize()
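A usage sketch for jieba_initialize, assuming resources/QAattrdic.txt exists; the function is typically called once at application startup.

# Hypothetical startup call to Example 7's jieba_initialize.
jieba_initialize()
print(jieba.lcut("初始化完成后即可正常分词"))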