

Python jieba.enable_parallel Method Code Examples

This article collects typical code examples of the jieba.enable_parallel method in Python. If you are struggling with questions such as: What exactly does jieba.enable_parallel do? How is it called? What does it look like in real code? Then the curated examples below should help. You can also explore further usage examples from the jieba package, where this method lives.


The following shows 7 code examples of the jieba.enable_parallel method, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
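Before the individual examples, here is a minimal, self-contained sketch of what jieba.enable_parallel does: it switches jieba to multiprocessing-based segmentation, which can speed up tokenization of large texts but relies on os.fork and is therefore not available on Windows. The sample text and process count below are illustrative assumptions.

import jieba

# Enable parallel segmentation with 4 worker processes;
# calling jieba.enable_parallel() with no argument uses all CPU cores.
jieba.enable_parallel(4)

text = "自然语言处理是人工智能的一个重要方向。" * 1000  # illustrative large input
words = jieba.lcut(text)
print(len(words))

# Switch back to single-process segmentation when finished.
jieba.disable_parallel()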

Example 1: word_segmentation

# Required module: import jieba [as alias]
# Or: from jieba import enable_parallel [as alias]
def word_segmentation(content, stop_words):

    # Tokenize the text with jieba
    jieba.enable_parallel()
    seg_list = jieba.cut(content, cut_all=False)

    seg_list = list(seg_list)

    # Remove stop words
    word_list = []
    for word in seg_list:
        if word not in stop_words:
            word_list.append(word)

    # Filter out remaining noise words and spaces
    user_dict = [' ', '哒']
    filter_space = lambda w: w not in user_dict
    word_list = list(filter(filter_space, word_list))

    return word_list

# Word frequency statistics
# Return the top top_N values; if top_N is not specified, return all values
Author: GreatV, Project: CloudMusic-Crawler, Lines of code: 25, Source: text_mining.py
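A brief usage sketch for this example; the sample sentence and stop-word set below are assumptions for illustration:

# Illustrative call; in practice the stop words would be loaded from a file.
stop_words = {'的', '了', '是', '我们'}
content = "今天的天气真好,我们一起去公园散步吧"
print(word_segmentation(content, stop_words))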

Example 2: segment_file

# Required module: import jieba [as alias]
# Or: from jieba import enable_parallel [as alias]
# This example also needs: from jieba import posseg
def segment_file(in_file, out_file, word_sep=' ', pos_sep='/', is_pos=True):
    """
    segment input file to output file
    :param in_file:
    :param out_file:
    :param word_sep:
    :param pos_sep:
    :param is_pos: whether to perform part-of-speech tagging
    :return:
    """
    jieba.enable_parallel()
    with open(in_file, 'r', encoding='utf-8') as fin, open(out_file, 'w', encoding='utf-8') as fout:
        count = 0
        for line in fin:
            in_line = line.strip()
            seg_line = ''
            if is_pos:
                words = posseg.lcut(in_line)
                for word, pos in words:
                    seg_line += word + pos_sep + pos + word_sep
            else:
                words = jieba.lcut(in_line)
                for word in words:
                    seg_line += word + word_sep
            fout.write(seg_line + "\n")
            count += 1
    print("segment ok. input file count:", count) 
Author: shibing624, Project: dialogbot, Lines of code: 29, Source: tokenizer.py
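A brief usage sketch, assuming hypothetical input and output paths; note that the part-of-speech branch requires from jieba import posseg:

# Illustrative call: segments corpus.txt line by line and writes
# word/POS pairs such as "自然语言/l" to corpus_seg.txt.
segment_file('corpus.txt', 'corpus_seg.txt', word_sep=' ', pos_sep='/', is_pos=True)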

Example 3: tf_idf

# Required module: import jieba [as alias]
# Or: from jieba import enable_parallel [as alias]
# This example also needs: import jieba.analyse
def tf_idf(texts):
    jieba.load_userdict("./model/dict.txt")
    jieba.analyse.set_idf_path("./model/idf.txt")
    jieba.analyse.set_stop_words("./model/chinese_stopwords.txt")
    jieba.enable_parallel(8)

    # Note: `filter` is called here with a single argument, so it appears to be a
    # project-specific helper rather than Python's built-in filter, which requires
    # a predicate as its first argument.
    corpus = [filter(jieba.analyse.extract_tags(s, topK=15)) for s in texts]
    return corpus 
Author: 03pie, Project: SMPCUP2017, Lines of code: 10, Source: tf_idf.py
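For comparison, a minimal standalone sketch of the same keyword-extraction idea without the project-specific dictionary files and filter helper; the sample texts are assumptions:

import jieba
import jieba.analyse

jieba.enable_parallel(4)  # optional; only speeds up segmentation on POSIX systems

texts = ["机器学习是人工智能的一个分支", "深度学习推动了自然语言处理的发展"]
# extract_tags returns each text's topK keywords ranked by TF-IDF weight.
corpus = [jieba.analyse.extract_tags(s, topK=15) for s in texts]
print(corpus)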

Example 4: word_segmentation

# Required module: import jieba [as alias]
# Or: from jieba import enable_parallel [as alias]
def word_segmentation(content, stop_words):
    # Tokenize the text with jieba
    jieba.enable_parallel()
    seg_list = jieba.cut(content)

    seg_list = list(seg_list)

    # Remove stop words
    user_dict = [' ', '哒']
    filter_space = lambda w: w not in stop_words and w not in user_dict
    word_list = list(filter(filter_space, seg_list))

    return word_list

# Convert the Weibo posts stored in the database into a string
Author: starFalll, Project: Spider, Lines of code: 17, Source: Data_analysis.py

Example 5: load_chinese_data

# Required module: import jieba [as alias]
# Or: from jieba import enable_parallel [as alias]
# This example also needs: os, re, pandas as pd, from sklearn import preprocessing, from sklearn.model_selection import train_test_split
def load_chinese_data(file_path, save_path, test_size=0.1, verbose=True):
    if os.path.exists(save_path):
        data = pd.read_csv(save_path, sep=",", header=0)
    else:
        data = pd.read_excel(file_path, sheet_name="sheet1")
        data = data.rename(index=str, columns={"分类": "label", "正文": "text"})

        # tokenization
        jieba.enable_parallel(16)
        data["tokens"] = data["text"].apply(lambda x: jieba.cut(x.strip()))
        data["tokens"] = [" ".join(x) for x in data["tokens"]]
        data["tokens"] = data["tokens"].apply(
            lambda x: re.sub(" +", " ", x.strip().replace("\n", " ").replace("\t", " ")))
        data.to_csv(save_path, sep=",", header=True, index=False, na_rep="")

    label_encoder = preprocessing.LabelEncoder()
    labels = label_encoder.fit_transform(data.label.values)

    x_train, x_test, y_train, y_test = train_test_split(data.tokens.values, labels, stratify=labels, random_state=1234,
                                                        test_size=test_size, shuffle=True)

    if verbose:
        print("sample tokenized text: {}".format(data["tokens"].values[0]), flush=True)
        print("labels: {}".format(data.label.unique()), flush=True)
        print("train set shape: {}, test set shape: {}".format(x_train.shape, x_test.shape))

    return x_train, x_test, y_train, y_test 
Author: IsaacChanghau, Project: AmusingPythonCodes, Lines of code: 29, Source: data_helper.py
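A brief usage sketch, assuming a hypothetical Excel corpus whose sheet1 contains 分类 and 正文 columns:

# Illustrative call; both file paths are assumptions.
x_train, x_test, y_train, y_test = load_chinese_data(
    "data/corpus.xlsx", "data/corpus_tokenized.csv", test_size=0.1, verbose=True)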

Example 6: cut

# Required module: import jieba [as alias]
# Or: from jieba import enable_parallel [as alias]
def cut(text, custom_words=['FLOAT', 'TIME', 'DATE', 'EOS']):
    jieba.enable_parallel(32)
    # Register placeholder tokens so jieba keeps them as single words.
    for word in custom_words:
        jieba.add_word(word)
    words = jieba.lcut(text)
    return words 
Author: QuantumLiu, Project: Neural-Headline-Generator-CN, Lines of code: 8, Source: data_preprocess.py
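A brief usage sketch; the input sentence containing the placeholder tokens is an assumption:

# DATE and FLOAT stay intact as single tokens because they were added via jieba.add_word.
print(cut("DATE 股价上涨了 FLOAT 个百分点"))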

Example 7: jieba_initialize

# Required module: import jieba [as alias]
# Or: from jieba import enable_parallel [as alias]
# This example also needs: platform, multiprocessing
def jieba_initialize():
    # Parallel segmentation relies on os.fork, so skip it on Windows.
    if not platform.system().upper().startswith("WINDOWS"):
        jieba.enable_parallel(multiprocessing.cpu_count())
    jieba.load_userdict('resources/QAattrdic.txt')
    jieba.initialize() 
Author: smileboywtu, Project: MillionHeroAssistant, Lines of code: 7, Source: crawl.py


Note: The jieba.enable_parallel examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers, and copyright remains with the original authors. Please consult the corresponding project's license before redistributing or reusing the code; do not republish without permission.