當前位置: 首頁>>代碼示例>>Python>>正文


Python jieba.lcut方法代碼示例

本文整理匯總了Python中jieba.lcut方法的典型用法代碼示例。如果您正苦於以下問題:Python jieba.lcut方法的具體用法?Python jieba.lcut怎麼用?Python jieba.lcut使用的例子?那麼,這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在模塊jieba的用法示例。


在下文中一共展示了jieba.lcut方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: process_data

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import lcut [as 別名]
def process_data(train_file, user_dict=None, stop_dict=None):
    """Tokenize every line of a text file with jieba and drop stop words.

    Args:
        train_file: Path of a UTF-8 text file; each line is treated as one
            sentence.
        user_dict: Optional path of a custom dictionary passed to
            ``jieba.load_userdict`` (it must follow jieba's user-dictionary
            format).
        stop_dict: Optional path of a UTF-8 stop-word file, one stop word
            per line.

    Returns:
        A list with one entry per line of ``train_file``; each entry is the
        list of tokens of that line with stop words and whitespace-only
        tokens removed.
    """
    if user_dict:
        jieba.load_userdict(user_dict)

    # A set gives O(1) membership tests; the token filter below runs once
    # per token, so a list here would make the whole pass quadratic.
    stop_words = set()
    if stop_dict:
        with open(stop_dict, 'r', encoding='utf-8') as file:
            stop_words = {line.strip() for line in file}

    sentences = []
    with open(train_file, 'r', encoding='utf-8') as file:
        for line in file:
            tokens = jieba.lcut(line.strip())
            sentences.append(
                [tok for tok in tokens
                 if tok not in stop_words and tok.strip() != '']
            )

    return sentences
開發者ID:msgi,項目名稱:nlp-journey,代碼行數:20,代碼來源:pre_process.py

示例2: __call__

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import lcut [as 別名]
def __call__(self, sent):
        """Clean a raw sentence, segment it with jieba and filter the tokens.

        The text is first passed through ``ptxt.Text(sent, "whi").clean``
        (the meaning of the "whi" mode is defined by that library), then
        through this object's ``clean_*`` passes in a fixed order, then cut
        with ``jieba.lcut``.

        Returns:
            Whatever ``self.clean_punctuation`` returns for the
            stop-word-filtered token list.
        """
        sent = ptxt.Text(sent, "whi").clean

        # String-level cleaning passes; the order is significant and mirrors
        # the sequence in which they must be applied.
        pass_names = (
            "clean_linkpic",
            "clean_english",
            "clean_date",
            "clean_time",
            "clean_money",
            "clean_weight",
            "clean_concentration",
            "clean_entity",
            "clean_nums",
        )
        for pass_name in pass_names:
            sent = getattr(self, pass_name)(sent)

        # Token-level passes operate on the segmented word list.
        tokens = jieba.lcut(sent)
        filtered = self.clean_stopwords(tokens)
        return self.clean_punctuation(filtered)
開發者ID:hscspring,項目名稱:Multi-Label-Text-Classification-for-Chinese,代碼行數:24,代碼來源:chinese.py

示例3: cut_texts

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import lcut [as 別名]
def cut_texts(texts=None, need_cut=True, word_len=1):
    '''
    Use jieba to cut texts and optionally drop short words.

    :param texts: when ``need_cut`` is true, a list of strings to segment;
        otherwise a list of already-segmented word lists. ``None`` is
        treated as "no texts" and yields an empty list.
    :param need_cut: whether the texts still need to be segmented with jieba
    :param word_len: minimum length of the words to keep (a cheap way to
        drop short stop words); values <= 1 keep every word
    :return: list of word lists, one per input text. When ``need_cut`` is
        false and ``word_len`` <= 1 the input object itself is returned.
    '''
    # The signature advertises texts=None as the default, so it must not
    # blow up when iterated.
    if texts is None:
        return []

    if need_cut:
        token_lists = (jieba.lcut(one_text) for one_text in texts)
    elif word_len > 1:
        token_lists = texts
    else:
        # Nothing to cut and nothing to filter: hand the input back as-is.
        return texts

    if word_len > 1:
        return [[word for word in words if len(word) >= word_len]
                for words in token_lists]
    return list(token_lists)
開發者ID:renjunxiang,項目名稱:chatbot_by_similarity,代碼行數:23,代碼來源:cut_text.py

示例4: segment

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import lcut [as 別名]
def segment(self, sentence, entity_postag=None):
        """Segment a sentence into words with jieba (no POS tagging).

        Args:
            sentence: string, the sentence to segment.
            entity_postag: optional dict of entity words (entity -> POS tag
                according to the original documentation); every key is
                registered with ``jieba.add_word`` before segmenting.
                Defaults to ``None``, meaning no extra entities.
        Returns:
            lemmas: list, the segmentation result of ``jieba.lcut``.
        """
        # None instead of a shared dict() default: a mutable default object
        # is created once and reused by every call.
        if entity_postag:
            for entity in entity_postag:
                jieba.add_word(entity)
        lemmas = jieba.lcut(sentence)
        return lemmas
開發者ID:lemonhu,項目名稱:open-entity-relation-extraction,代碼行數:22,代碼來源:nlp.py

示例5: M_idf

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import lcut [as 別名]
def M_idf(self,s1, s2):
        """Return the cosine similarity of two sentences' IDF-weighted sums.

        Each sentence is cut with jieba; every token found in ``self.voc``
        contributes ``idf * self.voc[token]``, where the IDF weight comes
        from ``self.idf`` and defaults to 1 for unknown tokens. The
        contributions are summed along axis 0 and the two sums are compared
        with ``1 - spatial.distance.cosine``.

        NOTE(review): presumably ``self.voc`` maps tokens to embedding
        vectors -- confirm against the class that owns this method.
        """
        def weighted_sum(tokens):
            # Tokens missing from the vocabulary are skipped entirely.
            contributions = [
                1.0 * self.idf.get(tok, 1) * self.voc[tok]
                for tok in tokens
                if tok in self.voc
            ]
            return np.array(contributions).sum(axis=0)

        tokens_a = jieba.lcut(s1)
        tokens_b = jieba.lcut(s2)

        vec_a = weighted_sum(tokens_a)
        vec_b = weighted_sum(tokens_b)

        return 1 - spatial.distance.cosine(vec_a, vec_b)
開發者ID:cjymz886,項目名稱:sentence-similarity,代碼行數:23,代碼來源:similarity.py

示例6: _gen_sxhy_dict

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import lcut [as 別名]
def _gen_sxhy_dict():
    """Build the word set from the raw shixuehanying file and save it.

    Lines starting with '<' are skipped. On every other line the first
    whitespace-separated field is ignored and each remaining phrase that
    passes ``is_cn_sentence`` is split into 2-character words from the
    left; the remaining 2 or 3 trailing characters are cut with jieba.
    The collected words are written to ``sxhy_path`` separated by spaces.

    NOTE(review): both files are opened with the platform default
    encoding -- confirm that this matches how ``sxhy_path`` is read back.
    """
    print("Parsing shixuehanying dictionary ...")
    vocab = set()
    with open(_rawsxhy_path, 'r') as fin:
        for line in fin:
            if line[0] == '<':
                continue
            for phrase in line.strip().split()[1:]:
                if not is_cn_sentence(phrase):
                    continue
                # Take 2-char words while at least 4 chars remain, so the
                # tail handed to jieba is never shorter than 2 chars.
                tail_start = 0
                for start in range(0, len(phrase) - 3, 2):
                    vocab.add(phrase[start:start + 2])
                    tail_start = start + 2
                tail = phrase[tail_start:]
                if tail:
                    vocab.update(jieba.lcut(tail))
    with open(sxhy_path, 'w') as fout:
        fout.write(' '.join(vocab))
開發者ID:DevinZ1993,項目名稱:Chinese-Poetry-Generation,代碼行數:23,代碼來源:segment.py

示例7: segment

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import lcut [as 別名]
def segment(self, sentence):
        """Split a sentence into tokens, preferring entries of sxhy_dict.

        The sentence is consumed two characters at a time while at least
        four characters remain; the final 2-3 characters form one last
        chunk. A chunk found in ``self.sxhy_dict`` is kept whole, any other
        chunk is cut with ``jieba.lcut``.

        Returns:
            The list of tokens in sentence order (empty for an empty
            sentence).
        """
        # TODO: try CRF-based segmentation.
        def tokens_of(chunk):
            # Dictionary hit: keep the chunk; otherwise let jieba decide.
            if chunk in self.sxhy_dict:
                return [chunk]
            return jieba.lcut(chunk)

        toks = []
        tail_start = 0
        for start in range(0, len(sentence) - 3, 2):
            toks.extend(tokens_of(sentence[start:start + 2]))
            tail_start = start + 2

        tail = sentence[tail_start:]
        if tail:
            toks.extend(tokens_of(tail))
        return toks


# For testing purpose. 
開發者ID:DevinZ1993,項目名稱:Chinese-Poetry-Generation,代碼行數:25,代碼來源:segment.py

示例8: _prepare_data

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import lcut [as 別名]
def _prepare_data(self, temp_data):
        """Yield one encoded example per text in ``temp_data["content"]``.

        Each text is split on ``#idiom<digits>#`` placeholders. Placeholders
        are recorded and replaced by the id of ``'#idiom#'``; all other
        pieces are cut with jieba and mapped through ``self.vocab.tran2id``.

        Yields:
            (doc, cans, labs, loc, tags) where ``doc`` is the list of token
            ids, ``cans`` the encoded ``temp_data["candidates"]`` (the same
            list object for every example), ``labs`` the ``self.ans``
            entries of the placeholders that have one, ``loc`` the
            positions of the placeholders inside ``doc`` and ``tags`` the
            placeholder strings.
        """
        cans = [self.vocab.tran2id(each, True) for each in temp_data["candidates"]]

        for text in temp_data["content"]:
            doc, loc, labs, tags = [], [], [], []

            for piece in re.split(r'(#idiom\d+#)', text):
                if re.match(r'#idiom\d+#', piece) is None:
                    # Ordinary text: segment it and append the word ids.
                    doc.extend([self.vocab.tran2id(each) for each in jieba.lcut(piece)])
                    continue

                tags.append(piece)
                if piece in self.ans:
                    labs.append(self.ans[piece])
                # Remember where the blank sits before inserting it.
                loc.append(len(doc))
                doc.append(self.vocab.tran2id('#idiom#'))

            yield doc, cans, labs, loc, tags
開發者ID:chujiezheng,項目名稱:ChID-Dataset,代碼行數:25,代碼來源:DataManager.py

示例9: train

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import lcut [as 別名]
def train():
    """
    Train the model and save it.

    Loads the data with ``load_data``, converts the texts to padded index
    sequences with ``train_wordtoVect``, holds out 15% of the samples for
    testing and hands everything to ``train_model``.

    NOTE(review): saving is presumably done inside ``train_model`` -- it is
    not visible in this function.
    """
    print('Loading Data...')
    inputTexts, labels = load_data()
    print(inputTexts.shape, labels.shape)

    print('segment...')

    # An earlier word2vec-based pipeline (word2vec_train on jieba.lcut
    # output) was replaced by the tokenizer-based path below.
    word_index, data = train_wordtoVect(inputTexts)
    # +1 on top of the number of indexed words, as in the original code.
    vocab_size = len(word_index) + 1
    splits = train_test_split(data, labels, test_size=0.15)
    x_train, x_test, y_train, y_test = splits
    print(x_train.shape, y_train.shape)

    train_model(vocab_size, x_train, y_train, x_test, y_test)
開發者ID:jarvisqi,項目名稱:deep_learning,代碼行數:27,代碼來源:textAnalysis.py

示例10: train_wordtoVect

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import lcut [as 別名]
def train_wordtoVect(train_inputTexts):
    """
    Turn raw documents into padded sequences of word indices.

    Every document has its newlines removed, is cut with jieba and
    re-joined with single spaces so that ``text.Tokenizer`` can split it
    on whitespace.

    Returns:
        (word_index, data): the tokenizer's word -> index mapping and the
        sequences padded to ``MAX_SEQUENCE_LENGTH``.
    """
    texts = [
        " ".join(jieba.lcut(doc.replace('\n', '')))
        for doc in train_inputTexts
    ]

    tokenizer = text.Tokenizer()
    tokenizer.fit_on_texts(texts)
    encoded = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    padded = sequence.pad_sequences(encoded, maxlen=MAX_SEQUENCE_LENGTH)

    return word_index, padded
開發者ID:jarvisqi,項目名稱:deep_learning,代碼行數:18,代碼來源:textAnalysis.py

示例11: word_flag_cut

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import lcut [as 別名]
def word_flag_cut(sentence):
    """
        Segment a sentence with jieba and return words with their POS flags.

        Newlines, ASCII commas, double quotes, spaces and tabs are removed
        and the text is upper-cased before segmentation.
    :param sentence: str, the raw sentence
    :return: (word_list, flag_list); if segmentation raises, the whole
        normalised sentence is returned as a single word flagged 'nt'
    """
    # One pass that deletes the same characters the chained replaces did.
    drop_chars = str.maketrans('', '', '\n," \t')
    sentence = sentence.translate(drop_chars).upper().strip()

    try:
        sentence_cut = ''.join(jieba.lcut(sentence, cut_all=False, HMM=False))
        pairs = [(item.word, item.flag) for item in jieba_seg.cut(sentence_cut)]
    except Exception:
        # Best-effort fallback: treat the sentence as one token.
        return [sentence], ['nt']

    word_list = [word for word, _ in pairs]
    flag_list = [flag for _, flag in pairs]
    return word_list, flag_list
開發者ID:yongzhuo,項目名稱:nlp_xiaojiang,代碼行數:22,代碼來源:chatbot_sentence_vec_by_word.py

示例12: cut_td_idf

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import lcut [as 別名]
def cut_td_idf(sources_path, target_path):
    """
    Segment a Chinese corpus with jieba and write it out line by line.

    Each line read by ``txtRead`` has its spaces removed and is passed
    through ``strQ2B`` and ``get_syboml`` (presumably full-width to
    half-width conversion and symbol filtering -- confirm in their
    definitions), then cut with jieba and re-joined with single spaces.
    :param sources_path: path of the corpus to read
    :param target_path: path the segmented lines are written to
    :return: None
    """
    print("cut_td_idf start! ")

    governments = []
    for raw_line in txtRead(sources_path):
        compact = raw_line.replace(' ', '').strip()
        normalized = get_syboml(strQ2B(compact.strip()))
        governments.append(normalized.strip())

    topic_ques_all = []
    for question in governments:
        spaced = ' '.join(jieba.lcut(question))
        # Collapse the runs of blanks produced by joining blank tokens.
        collapsed = spaced.replace('   ', ' ').replace('  ', ' ').strip()
        topic_ques_all.append(collapsed + '\n')

    txtWrite(topic_ques_all, target_path)
    print("cut_td_idf ok! " + sources_path)

示例13: __iter__

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import lcut [as 別名]
def __iter__(self):
        """Yield the jieba tokens of every well-formed line of every file.

        Each file in ``self.filenames`` is read as UTF-8. A line must
        consist of exactly two tab-separated fields; the second field is
        split with ``re_han`` and every block matching ``re_han`` is cut
        with jieba. Lines with a different number of fields are skipped.

        Yields:
            list of str: the words of one line (possibly empty).
        """
        for filename in self.filenames:
            with codecs.open(filename, 'r', encoding='utf-8') as f:
                for line in f:
                    fields = line.strip().split('\t')
                    # Explicit check instead of assert + bare except: a
                    # bare except around a yield also swallows
                    # GeneratorExit/KeyboardInterrupt, which breaks closing
                    # the generator early.
                    if len(fields) != 2:
                        continue
                    words = []
                    for blk in re_han.split(fields[1]):
                        if re_han.match(blk):
                            words.extend(jieba.lcut(blk))
                    yield words
開發者ID:cjymz886,項目名稱:text-cnn,代碼行數:18,代碼來源:train_word2vec.py

示例14: sentence_cut

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import lcut [as 別名]
def sentence_cut(sentences):
    """
    Segment each sentence with jieba, ignoring punctuation.

    Args:
        sentences: a list of text strings to segment
    Returns:
        seglist:  a list with one word list per input sentence

    """
    # Runs of CJK characters, ASCII letters/digits and + # & . _ % are the
    # blocks worth segmenting; everything else acts as a separator.
    # Raw string: the former "\." was an invalid string escape.
    re_han = re.compile(r"([\u4E00-\u9FD5a-zA-Z0-9+#&._%]+)")
    seglist = []
    for sentence in sentences:
        words = []
        for blk in re_han.split(sentence):
            if re_han.match(blk):
                words.extend(jieba.lcut(blk))
        seglist.append(words)
    return seglist
開發者ID:cjymz886,項目名稱:text-cnn,代碼行數:20,代碼來源:text_predict.py

示例15: sentence_cut

# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import lcut [as 別名]
def sentence_cut(sentences):
    """
    Segment each sentence with jieba, dropping punctuation and stop words.

    Stop words are read from './data/stopwords.txt' (UTF-8, one per line)
    on every call.

    Args:
        sentences: a list of text strings to segment
    Returns:
        contents:  a list with one word list per input sentence

    """
    # Runs of CJK characters, ASCII letters/digits and + # & . _ % are the
    # blocks worth segmenting; everything else acts as a separator.
    # Raw string: the former "\." was an invalid string escape.
    re_han = re.compile(r"([\u4E00-\u9FD5a-zA-Z0-9+#&._%]+)")
    with codecs.open('./data/stopwords.txt', 'r', encoding='utf-8') as f:
        # A set keeps the per-word membership test below O(1).
        stopwords = {line.strip() for line in f}
    contents = []
    for sentence in sentences:
        words = []
        for blk in re_han.split(sentence):
            if re_han.match(blk):
                words.extend(w for w in jieba.lcut(blk) if w not in stopwords)
        contents.append(words)
    return contents
開發者ID:cjymz886,項目名稱:text_rnn_attention,代碼行數:23,代碼來源:text_predict.py


注:本文中的jieba.lcut方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。