

Python jieba.enable_parallel Method Code Examples

This article collects typical code examples of the jieba.enable_parallel method in Python. If you are struggling with questions such as: What exactly does jieba.enable_parallel do? How is it called? What does it look like in real code? Then the curated examples below should help. You can also explore further usage examples from the jieba package, where this method lives.


The following shows 7 code examples of the jieba.enable_parallel method, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
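Before the individual examples, here is a minimal, self-contained sketch of what jieba.enable_parallel does: it switches jieba to multiprocessing-based segmentation, which can speed up tokenization of large texts but relies on os.fork and is therefore not available on Windows. The sample text and process count below are illustrative assumptions.

import jieba

# Enable parallel segmentation with 4 worker processes;
# calling jieba.enable_parallel() with no argument uses all CPU cores.
jieba.enable_parallel(4)

text = "自然语言处理是人工智能的一个重要方向。" * 1000  # illustrative large input
words = jieba.lcut(text)
print(len(words))

# Switch back to single-process segmentation when finished.
jieba.disable_parallel()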

Example 1: word_segmentation

# Required module: import jieba [as alias]
# Or: from jieba import enable_parallel [as alias]
def word_segmentation(content, stop_words):

    # Tokenize the text with jieba
    jieba.enable_parallel()
    seg_list = jieba.cut(content, cut_all=False)

    seg_list = list(seg_list)

    # Remove stop words
    word_list = []
    for word in seg_list:
        if word not in stop_words:
            word_list.append(word)

    # Filter out remaining noise words and spaces
    user_dict = [' ', '哒']
    filter_space = lambda w: w not in user_dict
    word_list = list(filter(filter_space, word_list))

    return word_list

# Word frequency statistics
# Return the top top_N values; if top_N is not specified, return all values
Author: GreatV, Project: CloudMusic-Crawler, Lines of code: 25, Source: text_mining.py
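A brief usage sketch for this example; the sample sentence and stop-word set below are assumptions for illustration:

# Illustrative call; in practice the stop words would be loaded from a file.
stop_words = {'的', '了', '是', '我们'}
content = "今天的天气真好,我们一起去公园散步吧"
print(word_segmentation(content, stop_words))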

Example 2: segment_file

# Required module: import jieba [as alias]
# Or: from jieba import enable_parallel [as alias]
# This example also needs: from jieba import posseg
def segment_file(in_file, out_file, word_sep=' ', pos_sep='/', is_pos=True):
    """
    segment input file to output file
    :param in_file:
    :param out_file:
    :param word_sep:
    :param pos_sep:
    :param is_pos: whether to perform part-of-speech tagging
    :return:
    """
    jieba.enable_parallel()
    with open(in_file, 'r', encoding='utf-8') as fin, open(out_file, 'w', encoding='utf-8') as fout:
        count = 0
        for line in fin:
            in_line = line.strip()
            seg_line = ''
            if is_pos:
                words = posseg.lcut(in_line)
                for word, pos in words:
                    seg_line += word + pos_sep + pos + word_sep
            else:
                words = jieba.lcut(in_line)
                for word in words:
                    seg_line += word + word_sep
            fout.write(seg_line + "\n")
            count += 1
    print("segment ok. input file count:", count) 
Author: shibing624, Project: dialogbot, Lines of code: 29, Source: tokenizer.py
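A brief usage sketch, assuming hypothetical input and output paths; note that the part-of-speech branch requires from jieba import posseg:

# Illustrative call: segments corpus.txt line by line and writes
# word/POS pairs such as "自然语言/l" to corpus_seg.txt.
segment_file('corpus.txt', 'corpus_seg.txt', word_sep=' ', pos_sep='/', is_pos=True)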

Example 3: tf_idf

# Required module: import jieba [as alias]
# Or: from jieba import enable_parallel [as alias]
# This example also needs: import jieba.analyse
def tf_idf(texts):
    jieba.load_userdict("./model/dict.txt")
    jieba.analyse.set_idf_path("./model/idf.txt")
    jieba.analyse.set_stop_words("./model/chinese_stopwords.txt")
    jieba.enable_parallel(8)

    # Note: `filter` is called here with a single argument, so it appears to be a
    # project-specific helper rather than Python's built-in filter, which requires
    # a predicate as its first argument.
    corpus = [filter(jieba.analyse.extract_tags(s, topK=15)) for s in texts]
    return corpus 
Author: 03pie, Project: SMPCUP2017, Lines of code: 10, Source: tf_idf.py
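For comparison, a minimal standalone sketch of the same keyword-extraction idea without the project-specific dictionary files and filter helper; the sample texts are assumptions:

import jieba
import jieba.analyse

jieba.enable_parallel(4)  # optional; only speeds up segmentation on POSIX systems

texts = ["机器学习是人工智能的一个分支", "深度学习推动了自然语言处理的发展"]
# extract_tags returns each text's topK keywords ranked by TF-IDF weight.
corpus = [jieba.analyse.extract_tags(s, topK=15) for s in texts]
print(corpus)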

Example 4: word_segmentation

# Required module: import jieba [as alias]
# Or: from jieba import enable_parallel [as alias]
def word_segmentation(content, stop_words):
    # Tokenize the text with jieba
    jieba.enable_parallel()
    seg_list = jieba.cut(content)

    seg_list = list(seg_list)

    # Remove stop words
    user_dict = [' ', '哒']
    filter_space = lambda w: w not in stop_words and w not in user_dict
    word_list = list(filter(filter_space, seg_list))

    return word_list

# Convert the Weibo posts stored in the database into a string
Author: starFalll, Project: Spider, Lines of code: 17, Source: Data_analysis.py

Example 5: load_chinese_data

# Required module: import jieba [as alias]
# Or: from jieba import enable_parallel [as alias]
# This example also needs: os, re, pandas as pd, from sklearn import preprocessing, from sklearn.model_selection import train_test_split
def load_chinese_data(file_path, save_path, test_size=0.1, verbose=True):
    if os.path.exists(save_path):
        data = pd.read_csv(save_path, sep=",", header=0)
    else:
        data = pd.read_excel(file_path, sheet_name="sheet1")
        data = data.rename(index=str, columns={"分类": "label", "正文": "text"})

        # tokenization
        jieba.enable_parallel(16)
        data["tokens"] = data["text"].apply(lambda x: jieba.cut(x.strip()))
        data["tokens"] = [" ".join(x) for x in data["tokens"]]
        data["tokens"] = data["tokens"].apply(
            lambda x: re.sub(" +", " ", x.strip().replace("\n", " ").replace("\t", " ")))
        data.to_csv(save_path, sep=",", header=True, index=False, na_rep="")

    label_encoder = preprocessing.LabelEncoder()
    labels = label_encoder.fit_transform(data.label.values)

    x_train, x_test, y_train, y_test = train_test_split(data.tokens.values, labels, stratify=labels, random_state=1234,
                                                        test_size=test_size, shuffle=True)

    if verbose:
        print("sample tokenized text: {}".format(data["tokens"].values[0]), flush=True)
        print("labels: {}".format(data.label.unique()), flush=True)
        print("train set shape: {}, test set shape: {}".format(x_train.shape, x_test.shape))

    return x_train, x_test, y_train, y_test 
Author: IsaacChanghau, Project: AmusingPythonCodes, Lines of code: 29, Source: data_helper.py
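A brief usage sketch, assuming a hypothetical Excel corpus whose sheet1 contains 分类 and 正文 columns:

# Illustrative call; both file paths are assumptions.
x_train, x_test, y_train, y_test = load_chinese_data(
    "data/corpus.xlsx", "data/corpus_tokenized.csv", test_size=0.1, verbose=True)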

Example 6: cut

# Required module: import jieba [as alias]
# Or: from jieba import enable_parallel [as alias]
def cut(text, custom_words=['FLOAT', 'TIME', 'DATE', 'EOS']):
    jieba.enable_parallel(32)
    # Register placeholder tokens so jieba keeps them as single words.
    for word in custom_words:
        jieba.add_word(word)
    words = jieba.lcut(text)
    return words 
Author: QuantumLiu, Project: Neural-Headline-Generator-CN, Lines of code: 8, Source: data_preprocess.py
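A brief usage sketch; the input sentence containing the placeholder tokens is an assumption:

# DATE and FLOAT stay intact as single tokens because they were added via jieba.add_word.
print(cut("DATE 股价上涨了 FLOAT 个百分点"))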

Example 7: jieba_initialize

# Required module: import jieba [as alias]
# Or: from jieba import enable_parallel [as alias]
# This example also needs: platform, multiprocessing
def jieba_initialize():
    # Parallel segmentation relies on os.fork, so skip it on Windows.
    if not platform.system().upper().startswith("WINDOWS"):
        jieba.enable_parallel(multiprocessing.cpu_count())
    jieba.load_userdict('resources/QAattrdic.txt')
    jieba.initialize() 
Author: smileboywtu, Project: MillionHeroAssistant, Lines of code: 7, Source: crawl.py


Note: The jieba.enable_parallel examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers, and copyright remains with the original authors. Please consult the corresponding project's license before redistributing or reusing the code; do not republish without permission.