當前位置: 首頁>>代碼示例>>Python>>正文


Python jieba.enable_parallel方法代碼示例

本文整理匯總了Python中jieba.enable_parallel方法的典型用法代碼示例。如果您正苦於以下問題:Python jieba.enable_parallel方法的具體用法?Python jieba.enable_parallel怎麼用?Python jieba.enable_parallel使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在模塊jieba的用法示例。


在下文中一共展示了jieba.enable_parallel方法的7個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: word_segmentation

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import enable_parallel [as 別名]
def word_segmentation(content, stop_words):
    """Segment ``content`` with jieba and drop unwanted tokens.

    Tokens contained in ``stop_words`` are removed, and so are a single
    space and the filler character '噠'.

    :param content: text handed to ``jieba.cut``
    :param stop_words: container tested with ``in`` for every token
    :return: list of the remaining tokens, in segmentation order
    """
    # NOTE(review): parallel mode is switched on again on every call —
    # confirm this is intended rather than a one-time setup step.
    jieba.enable_parallel()
    tokens = list(jieba.cut(content, cut_all=False))

    # Extra tokens the stop-word list misses: whitespace and a filler word.
    leftovers = [' ', '噠']

    return [
        token
        for token in tokens
        if token not in stop_words and token not in leftovers
    ]

# 詞頻統計
# 返回前 top_N 個值,如果不指定則返回所有值 
開發者ID:GreatV,項目名稱:CloudMusic-Crawler,代碼行數:25,代碼來源:text_mining.py

示例2: segment_file

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import enable_parallel [as 別名]
def segment_file(in_file, out_file, word_sep=' ', pos_sep='/', is_pos=True):
    """
    Segment a UTF-8 text file line by line and write the result to another file.

    Every input line is stripped and tokenized, then written as one output
    line in which each token is followed by ``word_sep`` (so a trailing
    separator is kept before the newline).
    :param in_file: path of the file to read
    :param out_file: path of the file to write (opened with mode 'w')
    :param word_sep: text appended after every token
    :param pos_sep: text placed between a word and its tag when is_pos is true
    :param is_pos: when true, tokenize with posseg.lcut and emit word/tag
        pairs; otherwise tokenize with jieba.lcut and emit bare words
    :return: None; the number of processed lines is printed
    """
    jieba.enable_parallel()
    count = 0  # stays 0 when the input file has no lines
    with open(in_file, 'r', encoding='utf-8') as fin, open(out_file, 'w', encoding='utf-8') as fout:
        for count, line in enumerate(fin, start=1):
            text = line.strip()
            if is_pos:
                pieces = [word + pos_sep + pos + word_sep for word, pos in posseg.lcut(text)]
            else:
                pieces = [word + word_sep for word in jieba.lcut(text)]
            fout.write(''.join(pieces) + "\n")
    print("segment ok. input file count:", count)
開發者ID:shibing624,項目名稱:dialogbot,代碼行數:29,代碼來源:tokenizer.py

示例3: tf_idf

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import enable_parallel [as 別名]
def tf_idf(texts):
    """Extract up to 15 keywords per text with jieba's tag extractor.

    Loads the user dictionary, IDF table and stop-word list from
    ``./model/`` and enables jieba parallel mode with 8 processes before
    extracting.

    :param texts: iterable of texts passed one by one to
        ``jieba.analyse.extract_tags``
    :return: list with one entry per text, each being the result of
        ``filter`` applied to that text's extracted tags
    """
    jieba.load_userdict("./model/dict.txt")
    jieba.analyse.set_idf_path("./model/idf.txt")
    jieba.analyse.set_stop_words("./model/chinese_stopwords.txt")
    jieba.enable_parallel(8)

    corpus = []
    for text in texts:
        keywords = jieba.analyse.extract_tags(text, topK=15)
        # NOTE(review): `filter` receives a single argument, so it presumably
        # is a helper defined elsewhere in the file, not the builtin — confirm.
        corpus.append(filter(keywords))
    return corpus
開發者ID:03pie,項目名稱:SMPCUP2017,代碼行數:10,代碼來源:tf_idf.py

示例4: word_segmentation

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import enable_parallel [as 別名]
def word_segmentation(content, stop_words):
    """Cut ``content`` into words with jieba and filter the result.

    :param content: text handed to ``jieba.cut``
    :param stop_words: container tested with ``in`` for every word
    :return: list of words that are neither in ``stop_words`` nor one of
        the extra excluded tokens (a single space and '噠')
    """
    jieba.enable_parallel()

    # Tokens dropped in addition to the caller-supplied stop words.
    always_dropped = [' ', '噠']

    word_list = []
    for token in jieba.cut(content):
        if token in stop_words or token in always_dropped:
            continue
        word_list.append(token)
    return word_list

#將數據庫中的微博動態轉化為字符串 
開發者ID:starFalll,項目名稱:Spider,代碼行數:17,代碼來源:Data_analysis.py

示例5: load_chinese_data

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import enable_parallel [as 別名]
def load_chinese_data(file_path, save_path, test_size=0.1, verbose=True):
    """Load a labelled Chinese text dataset and split it into train/test sets.

    If ``save_path`` exists it is read as a CSV cache. Otherwise the sheet
    "sheet1" of the Excel file at ``file_path`` is read, its "分類" / "正文"
    columns are renamed to "label" / "text", a space-separated "tokens"
    column is built with jieba, and the frame is written to ``save_path``.

    :param file_path: Excel file read when no cache exists
    :param save_path: CSV cache path (read if present, written otherwise)
    :param test_size: forwarded to ``train_test_split``
    :param verbose: when true, print a sample, the labels and the split shapes
    :return: ``(x_train, x_test, y_train, y_test)`` as produced by
        ``train_test_split`` on the token strings and the encoded labels
    """
    def _tokenize(text):
        # Join jieba tokens with spaces, turn newlines/tabs into spaces and
        # squeeze runs of spaces down to one.
        joined = " ".join(jieba.cut(text.strip()))
        flattened = joined.strip().replace("\n", " ").replace("\t", " ")
        return re.sub(" +", " ", flattened)

    if not os.path.exists(save_path):
        data = pd.read_excel(file_path, sheet_name="sheet1")
        data = data.rename(index=str, columns={"分類": "label", "正文": "text"})

        # tokenization
        jieba.enable_parallel(16)
        data["tokens"] = data["text"].apply(_tokenize)
        data.to_csv(save_path, sep=",", header=True, index=False, na_rep="")
    else:
        data = pd.read_csv(save_path, sep=",", header=0)

    encoder = preprocessing.LabelEncoder()
    labels = encoder.fit_transform(data.label.values)

    # Stratified split with a fixed seed.
    split = train_test_split(
        data.tokens.values,
        labels,
        stratify=labels,
        random_state=1234,
        test_size=test_size,
        shuffle=True,
    )
    x_train, x_test, y_train, y_test = split

    if verbose:
        print("sample tokenized text: {}".format(data["tokens"].values[0]), flush=True)
        print("labels: {}".format(data.label.unique()), flush=True)
        print("train set shape: {}, test set shape: {}".format(x_train.shape, x_test.shape))

    return x_train, x_test, y_train, y_test
開發者ID:IsaacChanghau,項目名稱:AmusingPythonCodes,代碼行數:29,代碼來源:data_helper.py

示例6: cut

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import enable_parallel [as 別名]
def cut(text, custom_words=('FLOAT', 'TIME', 'DATE', 'EOS')):
    """Tokenize ``text`` with jieba after registering extra words.

    :param text: text to segment
    :param custom_words: iterable of words registered through
        ``jieba.add_word`` before segmentation; defaults to the
        placeholders FLOAT, TIME, DATE and EOS (an immutable tuple, so the
        default cannot be modified between calls)
    :return: list of tokens returned by ``jieba.lcut``
    """
    # Start the 32-worker pool only when parallel mode is not active yet.
    # An unconditional enable_parallel() builds a new process pool on every
    # call without closing the previous one.
    if getattr(jieba, 'pool', None) is None:
        jieba.enable_parallel(32)
    for word in custom_words:
        jieba.add_word(word)
    return jieba.lcut(text)
開發者ID:QuantumLiu,項目名稱:Neural-Headline-Generator-CN,代碼行數:8,代碼來源:data_preprocess.py

示例7: jieba_initialize

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import enable_parallel [as 別名]
def jieba_initialize():
    """Set up jieba: parallel mode, user dictionary, then initialization.

    Parallel mode is enabled with one process per CPU, except when
    ``platform.system()`` reports a name starting with "WINDOWS"
    (case-insensitive), in which case it is skipped.
    """
    system_name = platform.system().upper()
    if not system_name.startswith("WINDOWS"):
        worker_count = multiprocessing.cpu_count()
        jieba.enable_parallel(worker_count)
    jieba.load_userdict('resources/QAattrdic.txt')
    jieba.initialize()
開發者ID:smileboywtu,項目名稱:MillionHeroAssistant,代碼行數:7,代碼來源:crawl.py


注:本文中的jieba.enable_parallel方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。