

Python jieba.lcut Method Code Examples

This article collects typical usage examples of Python's jieba.lcut method. If you are wondering what jieba.lcut does, how to call it, or what real-world usage looks like, the curated code examples below may help. You can also explore further usage examples from the jieba package.


The following presents 15 code examples of the jieba.lcut method, sorted by popularity by default.
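Before the examples, a minimal, self-contained demonstration of the call itself may help: jieba.lcut returns a plain Python list of tokens, whereas jieba.cut returns a generator. The sentence below is the stock example from jieba's own documentation.

import jieba

tokens = jieba.lcut("我来到北京清华大学")
print(tokens)  # expected: ['我', '来到', '北京', '清华大学']

# cut_all=True switches to full mode and emits every word the dictionary can find.
print(jieba.lcut("我来到北京清华大学", cut_all=True))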

Example 1: process_data

# Required import: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def process_data(train_file, user_dict=None, stop_dict=None):
    # Load a user dictionary into jieba (must follow jieba's user-dictionary format)
    if user_dict:
        jieba.load_userdict(user_dict)

    # Load the stop-word list (one stop word per line)
    stop_words = []
    if stop_dict:
        with open(stop_dict, 'r', encoding='utf-8') as file:
            stop_words = [stop_word.strip() for stop_word in file.readlines()]

    # Read the file, segment each line, and drop stop words
    with open(train_file, 'r', encoding='utf-8') as file:
        sentences = file.readlines()
        sentences = [jieba.lcut(sentence.strip()) for sentence in sentences]
        sentences = [[s for s in sentence if s not in stop_words and s.strip() != ''] for sentence in sentences]

    return sentences 
Developer: msgi, Project: nlp-journey, Lines of code: 20, Source file: pre_process.py
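A short usage sketch for process_data; the file paths and the assumed file layout (one raw sentence per line in the corpus, one stop word per line in the stop list) are placeholders for illustration, not part of the original project:

# Hypothetical call; adjust the paths to your own data.
sentences = process_data(train_file='data/train.txt',
                         user_dict='data/user_dict.txt',
                         stop_dict='data/stopwords.txt')
print(sentences[0])  # token list for the first line of the corpus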

Example 2: __call__

# Required import: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def __call__(self, sent):
        sent = ptxt.Text(sent, "whi").clean
        sent = self.clean_linkpic(sent)

        sent = self.clean_english(sent)

        sent = self.clean_date(sent)
        sent = self.clean_time(sent)

        sent = self.clean_money(sent)
        sent = self.clean_weight(sent)
        sent = self.clean_concentration(sent)

        sent = self.clean_entity(sent)

        sent = self.clean_nums(sent)

        wlist = jieba.lcut(sent)
        sent = self.clean_stopwords(wlist)
        sent = self.clean_punctuation(sent)

        return sent 
Developer: hscspring, Project: Multi-Label-Text-Classification-for-Chinese, Lines of code: 24, Source file: chinese.py

Example 3: cut_texts

# Required import: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def cut_texts(texts=None, need_cut=True, word_len=1):
    '''
    Use jieba to cut texts.
    :param texts: list of texts
    :param need_cut: whether the texts still need to be cut
    :param word_len: minimum word length to keep (used to drop short/stop words)
    :return: list of token lists
    '''
    if need_cut:
        if word_len > 1:
            texts_cut = [[word for word in jieba.lcut(text) if len(word) >= word_len] for text in texts]
        else:
            texts_cut = [jieba.lcut(one_text) for one_text in texts]
    else:
        if word_len > 1:
            texts_cut = [[word for word in text if len(word) >= word_len] for text in texts]
        else:
            texts_cut = texts

    return texts_cut 
Developer: renjunxiang, Project: chatbot_by_similarity, Lines of code: 23, Source file: cut_text.py
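A quick usage sketch for cut_texts; the sample sentences are invented for illustration:

import jieba

texts = ['今天天气不错', '我喜欢自然语言处理']
# Cut and keep only words of length >= 2, which drops single-character tokens.
print(cut_texts(texts, need_cut=True, word_len=2))
# Already-segmented input passes through, optionally filtered by word_len.
print(cut_texts([['今天', '天气', '不错']], need_cut=False, word_len=1))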

Example 4: segment

# Required import: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def segment(self, sentence, entity_postag=dict()):
        """采用NLPIR进行分词处理
        Args:
            sentence: string,句子
            entity_postag: dict,实体词性词典,默认为空集合,分析每一个案例的结构化文本时产生
        Returns:
            lemmas: list,分词结果
        """
        # Add the entity dictionary
        if entity_postag:
            for entity in entity_postag:
                # pynlpir.nlpir.AddUserWord(c_char_p(entity.encode()))
                jieba.add_word(entity)
        # pynlpir.nlpir.AddUserWord(c_char_p('前任'.encode()))  # example: add a single user word
        # pynlpir.nlpir.AddUserWord(c_char_p('习近平'.encode()))  # example: add a single user word
        # Segment without POS tagging
        # lemmas = pynlpir.segment(sentence, pos_tagging=False)
        lemmas = jieba.lcut(sentence)
        # pynlpir.close()  # release NLPIR resources
        return lemmas 
Developer: lemonhu, Project: open-entity-relation-extraction, Lines of code: 22, Source file: nlp.py
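The two jieba calls this method relies on, jieba.add_word and jieba.lcut, can be tried in isolation; the sentence and the user word below are made up for illustration:

import jieba

sentence = "设备搭载了量子光刻机组件"   # invented example sentence
print(jieba.lcut(sentence))            # "量子光刻机" is likely split by the default dictionary
jieba.add_word("量子光刻机")            # register it as a single user word
print(jieba.lcut(sentence))            # the term is now kept as one token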

Example 5: M_idf

# Required import: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def M_idf(self,s1, s2):
        v1, v2 = [], []
        s1_list = jieba.lcut(s1)
        s2_list = jieba.lcut(s2)

        for s in s1_list:
            idf_v = self.idf.get(s, 1)
            if s in self.voc:
                v1.append(1.0 * idf_v * self.voc[s])

        for s in s2_list:
            idf_v = self.idf.get(s, 1)
            if s in self.voc:
                v2.append(1.0 * idf_v * self.voc[s])

        v1 = np.array(v1).sum(axis=0)
        v2 = np.array(v2).sum(axis=0)

        sim = 1 - spatial.distance.cosine(v1, v2)

        return sim 
Developer: cjymz886, Project: sentence-similarity, Lines of code: 23, Source file: similarity.py
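M_idf depends on self.idf and self.voc, which are built elsewhere in the class. The standalone sketch below reproduces the same idea, an idf-weighted sum of word vectors compared by cosine similarity, with tiny hand-made dictionaries; every name and value is illustrative:

import numpy as np
from scipy import spatial
import jieba

# Toy idf weights and 2-dimensional word vectors; real ones come from a corpus.
idf = {'天气': 1.5, '不错': 1.2, '很好': 1.3}
voc = {'天气': np.array([1.0, 0.1]),
       '不错': np.array([0.2, 0.9]),
       '很好': np.array([0.3, 0.8])}

def idf_weighted_sim(s1, s2):
    def embed(sent):
        vecs = [idf.get(w, 1) * voc[w] for w in jieba.lcut(sent) if w in voc]
        return np.sum(vecs, axis=0)
    return 1 - spatial.distance.cosine(embed(s1), embed(s2))

print(idf_weighted_sim('今天天气不错', '今天天气很好'))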

Example 6: _gen_sxhy_dict

# Required import: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def _gen_sxhy_dict():
    print("Parsing shixuehanying dictionary ...")
    words = set()
    with open(_rawsxhy_path, 'r') as fin:
        for line in fin.readlines():
            if line[0] == '<':
                continue
            for phrase in line.strip().split()[1:]:
                if not is_cn_sentence(phrase):
                    continue
                idx = 0
                while idx + 4 <= len(phrase):
                    # Cut 2 chars each time.
                    words.add(phrase[idx : idx + 2])
                    idx += 2
                # Use jieba to cut the last 3 chars.
                if idx < len(phrase):
                    for word in jieba.lcut(phrase[idx:]):
                        words.add(word)
    with open(sxhy_path, 'w') as fout:
        fout.write(' '.join(words)) 
Developer: DevinZ1993, Project: Chinese-Poetry-Generation, Lines of code: 23, Source file: segment.py

Example 7: segment

# Required import: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def segment(self, sentence):
        # TODO: try CRF-based segmentation.
        toks = []
        idx = 0
        while idx + 4 <= len(sentence):
            # Cut 2 chars each time.
            if sentence[idx : idx + 2] in self.sxhy_dict:
                toks.append(sentence[idx : idx + 2])
            else:
                for tok in jieba.lcut(sentence[idx : idx + 2]):
                    toks.append(tok)
            idx += 2
        # Cut last 3 chars.
        if idx < len(sentence):
            if sentence[idx : ] in self.sxhy_dict:
                toks.append(sentence[idx : ])
            else:
                for tok in jieba.lcut(sentence[idx : ]):
                    toks.append(tok)
        return toks


# For testing purposes. 
Developer: DevinZ1993, Project: Chinese-Poetry-Generation, Lines of code: 25, Source file: segment.py
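The segmenter walks the sentence two characters at a time, preferring entries from the Shixuehanying dictionary and falling back to jieba for anything else. Below is a standalone sketch of the same logic with the dictionary passed in explicitly; the toy dictionary entry is invented:

import jieba

def segment_with_dict(sentence, sxhy_dict):
    toks, idx = [], 0
    while idx + 4 <= len(sentence):
        chunk = sentence[idx:idx + 2]
        toks.extend([chunk] if chunk in sxhy_dict else jieba.lcut(chunk))
        idx += 2
    if idx < len(sentence):                 # cut the last 1-3 characters
        tail = sentence[idx:]
        toks.extend([tail] if tail in sxhy_dict else jieba.lcut(tail))
    return toks

print(segment_with_dict('明月松间照', {'明月'}))  # '明月' comes from the toy dictionary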

Example 8: _prepare_data

# Required import: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def _prepare_data(self, temp_data):
        cans = temp_data["candidates"]
        cans = [self.vocab.tran2id(each, True) for each in cans]

        for text in temp_data["content"]:
            content = re.split(r'(#idiom\d+#)', text)

            doc = []
            loc = []
            labs = []
            tags = []

            for i, segment in enumerate(content):
                if re.match(r'#idiom\d+#', segment) is not None:
                    tags.append(segment)
                    if segment in self.ans:
                        labs.append(self.ans[segment])
                    loc.append(len(doc))
                    doc.append(self.vocab.tran2id('#idiom#'))
                else:
                    doc += [self.vocab.tran2id(each) for each in jieba.lcut(segment)]

            yield doc, cans, labs, loc, tags 
Developer: chujiezheng, Project: ChID-Dataset, Lines of code: 25, Source file: DataManager.py
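The key trick in _prepare_data is re.split with a capturing group, which keeps the #idiomN# placeholders in the resulting list so their positions can be recorded. A small standalone check of that behaviour (the sample text is invented):

import re

text = "他做事#idiom0#,从不拖泥带水,真是#idiom1#。"
parts = re.split(r'(#idiom\d+#)', text)
print(parts)
# ['他做事', '#idiom0#', ',从不拖泥带水,真是', '#idiom1#', '。']

# Placeholders are then detected segment by segment, as in the method above.
print([p for p in parts if re.match(r'#idiom\d+#', p)])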

Example 9: train

# Required import: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def train():
    """
    训练模型,并保存

    """
    print('Loading Data...')
    inputTexts, labels = load_data()
    print(inputTexts.shape, labels.shape)

    print('segment...')

    # seg_data = [jieba.lcut(document.replace('\n', ''))for document in inputTexts]
    # print('word2vec...')
    # index_dict, word_vectors, data = word2vec_train(seg_data)
    # n_symbols = len(index_dict) + 1   
    # x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.15)
    # print(x_train.shape, y_train.shape)
    # train_model(n_symbols, x_train, y_train, x_test, y_test)

    word_index, data = train_wordtoVect(inputTexts)
    input_dim=len(word_index) + 1
    x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.15)
    print(x_train.shape, y_train.shape)

    train_model(input_dim, x_train, y_train, x_test, y_test) 
Developer: jarvisqi, Project: deep_learning, Lines of code: 27, Source file: textAnalysis.py

Example 10: train_wordtoVect

# Required import: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def train_wordtoVect(train_inputTexts):
    """
    训练词向量函数
    """
    texts=[]
    for doc in train_inputTexts:
        seg_doc = jieba.lcut(doc.replace('\n', ''))
        d =" ".join(seg_doc)
        texts.append(d)
    tokenizer = text.Tokenizer()                            # tokenizer (MAX_NB_WORDS not passed here)
    tokenizer.fit_on_texts(texts)
    text_sequences = tokenizer.texts_to_sequences(texts)    # limited by num_words if it were set
    word_index = tokenizer.word_index                       # word-to-index mapping
    data = sequence.pad_sequences(text_sequences, maxlen=MAX_SEQUENCE_LENGTH)

    return word_index, data 
Developer: jarvisqi, Project: deep_learning, Lines of code: 18, Source file: textAnalysis.py
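A self-contained sketch of the Keras preprocessing utilities used above; the tensorflow 2.x import paths, the toy documents, and the maxlen value stand in for the module-level text/sequence imports and MAX_SEQUENCE_LENGTH constant of the original project:

import jieba
from tensorflow.keras.preprocessing import text, sequence

docs = ['今天天气不错', '我喜欢自然语言处理']
texts = [' '.join(jieba.lcut(d)) for d in docs]      # space-joined tokens, as in train_wordtoVect

tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(texts)
seqs = tokenizer.texts_to_sequences(texts)
data = sequence.pad_sequences(seqs, maxlen=10)       # 10 plays the role of MAX_SEQUENCE_LENGTH
print(tokenizer.word_index)
print(data)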

Example 11: word_flag_cut

# Required import: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def word_flag_cut(sentence):
    """
        jieba切词词性
    :param sentence: 
    :return: 
    """
    sentence = sentence.replace('\n', '').replace(',', '').replace('"', '').\
                        replace(' ', '').replace('\t', '').upper().strip()
    word_list = []
    flag_list = []
    try:
        sentence_cut =  ''.join(jieba.lcut(sentence, cut_all=False, HMM=False))
        words = jieba_seg.cut(sentence_cut)
        for word in words:
            word_list.append(word.word)
            flag_list.append(word.flag)
    except Exception as e:
        word_list = [sentence]
        flag_list = ['nt']
    return word_list, flag_list 
Developer: yongzhuo, Project: nlp_xiaojiang, Lines of code: 22, Source file: chatbot_sentence_vec_by_word.py
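jieba_seg is not defined inside the snippet; it is presumably jieba.posseg, whose pairs expose the .word and .flag attributes used above. A minimal check of that API:

import jieba.posseg as pseg

for pair in pseg.cut("我爱北京天安门"):
    print(pair.word, pair.flag)   # token and its part-of-speech tag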

Example 12: cut_td_idf

# Required import: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def cut_td_idf(sources_path, target_path):
    """
    结巴切词,汉语
    :param path: 
    :return: 
    """
    print("cut_td_idf start! ")
    corpus = txtRead(sources_path)
    governments = []
    for corpus_one in corpus:
        corpus_one_clear = corpus_one.replace(' ', '').strip()
        ques_q2b = strQ2B(corpus_one_clear.strip())
        ques_q2b_syboml = get_syboml(ques_q2b)
        governments.append(ques_q2b_syboml.strip())

    government_ques = list(map(lambda x: ' '.join(jieba.lcut(x)), governments))

    topic_ques_all = []
    for topic_ques_one in government_ques:
        top_ques_aqlq = topic_ques_one.replace('   ', ' ').replace('  ', ' ').strip() + '\n'
        topic_ques_all.append(top_ques_aqlq)

    txtWrite(topic_ques_all, target_path)
    print("cut_td_idf ok! " + sources_path) 
Developer: yongzhuo, Project: nlp_xiaojiang, Lines of code: 26, Source file: cut_td_idf.py

Example 13: __iter__

# Required import: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def __iter__(self):
        for filename in self.filenames:
            with codecs.open(filename, 'r', encoding='utf-8') as f:
                for _,line in enumerate(f):
                    try:
                        line=line.strip()
                        line=line.split('\t')
                        assert len(line)==2
                        blocks=re_han.split(line[1])
                        word=[]
                        for blk in blocks:
                            if re_han.match(blk):
                                word.extend(jieba.lcut(blk))
                        yield word
                    except:
                        pass 
Developer: cjymz886, Project: text-cnn, Lines of code: 18, Source file: train_word2vec.py
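Only __iter__ is shown, but the class is clearly a streaming corpus for word2vec training: any iterable that yields token lists can be fed to gensim. A hedged, self-contained sketch with a toy in-memory corpus, using the gensim 4.x keyword names:

from gensim.models import Word2Vec

corpus = [['我', '喜欢', '自然', '语言', '处理'],
          ['我', '喜欢', '机器', '学习']]
model = Word2Vec(sentences=corpus, vector_size=50, window=3, min_count=1)
print(model.wv.most_similar('喜欢'))   # nearest neighbours in the toy embedding space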

Example 14: sentence_cut

# Required import: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def sentence_cut(sentences):
    """
    Args:
        sentence: a list of text need to segment
    Returns:
        seglist:  a list of sentence cut by jieba 

    """
    re_han = re.compile(u"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)")  # the method of cutting text by punctuation
    seglist=[]
    for sentence in sentences:
        words=[]
        blocks = re_han.split(sentence)
        for blk in blocks:
            if re_han.match(blk):
                words.extend(jieba.lcut(blk))
        seglist.append(words)
    return  seglist 
Developer: cjymz886, Project: text-cnn, Lines of code: 20, Source file: text_predict.py
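A quick usage sketch for sentence_cut (the enclosing module also needs import re and import jieba); the sample sentences are invented:

import re
import jieba

samples = ['今天天气不错,适合出门!', 'NLP模型在2023年取得了很大进步。']
for tokens in sentence_cut(samples):
    print(tokens)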

Example 15: sentence_cut

# Required import: import jieba [as alias]
# Or: from jieba import lcut [as alias]
def sentence_cut(sentences):
    """
    Args:
        sentence: a list of text need to segment
    Returns:
        seglist:  a list of sentence cut by jieba

    """
    re_han = re.compile(u"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)")  # the method of cutting text by punctuation
    with codecs.open('./data/stopwords.txt','r',encoding='utf-8') as f:
            stopwords=[line.strip() for line in f.readlines()]
    contents=[]
    for sentence in sentences:
        words=[]
        blocks = re_han.split(sentence)
        for blk in blocks:
            if re_han.match(blk):
                seglist = jieba.lcut(blk)
                words.extend([w for w in seglist if w not in stopwords])
        contents.append(words)
    return  contents 
Developer: cjymz886, Project: text_rnn_attention, Lines of code: 23, Source file: text_predict.py


Note: The jieba.lcut method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by their respective developers; copyright remains with the original authors. Please consult each project's License before distributing or using the code, and do not reproduce this article without permission.