This article collects typical usage examples of the jieba.cut method in Python. If you are wondering what jieba.cut does, how to call it, or what its usage looks like in practice, the curated code examples below should help. You can also explore the jieba module in which the method is defined.
The following shows 15 code examples of jieba.cut, listed roughly by popularity.
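Before the collected examples, here is a minimal sketch of jieba.cut itself (the sample sentence is only illustrative): jieba.cut returns a generator of tokens, and cut_all switches between precise and full segmentation mode.

import jieba

text = "我来到北京清华大学"  # illustrative sample sentence
# Precise mode (default): each character ends up in exactly one token
print("/".join(jieba.cut(text, cut_all=False)))
# Full mode: every word the dictionary can find, overlaps included
print("/".join(jieba.cut(text, cut_all=True)))
# jieba.cut yields a generator, so wrap it in list() if the tokens are needed more than once
tokens = list(jieba.cut(text))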
Example 1: calculate_similarity
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
# Also needed here: import math, and from collections import Counter
def calculate_similarity(text1, text2):
    # Tokenize both texts and turn the token streams into word-frequency counters
    raw1 = jieba.cut(text1)
    raw2 = jieba.cut(text2)
    raw1 = Counter(raw1)
    raw2 = Counter(raw2)
    same_words = set(raw1) & set(raw2)
    if (math.sqrt(len(raw1)) * math.sqrt(len(raw2))) != 0:
        dot_product = 0
        mod1 = 0
        mod2 = 0
        for word in same_words:
            dot_product += raw1[word] * raw2[word]
        for word in raw1:
            mod1 += math.pow(raw1[word], 2)
        for word in raw2:
            mod2 += math.pow(raw2[word], 2)
        # Cosine similarity of the two bag-of-words vectors
        cos = dot_product / math.sqrt(mod1 * mod2)
    else:
        cos = 0
    return cos
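A quick usage sketch for Example 1 (the two sentences are made up for illustration): the function returns the cosine similarity of the two bag-of-words vectors, a value between 0 and 1.

score = calculate_similarity("今天天气很好", "今天的天气不错")
print(round(score, 3))  # closer to 1 the more tokens the two texts share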
Example 2: title2wordbag
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
# zh_symbol and number are module-level collections of Chinese punctuation marks and digit characters
def title2wordbag(self, title, remove_stopwords=True):
    words = jieba.cut(title, cut_all=False)
    str_cut = ' '.join(words)
    for sym in zh_symbol:  # remove Chinese punctuation
        str_cut = str_cut.replace(sym, '')
    for sym in number:  # remove digits
        str_cut = str_cut.replace(sym, '')
    strlist_cut = str_cut.split(' ')
    strlist_new = []
    for word in strlist_cut:  # drop empty tokens, stop words, and non-Chinese tokens
        if (not len(word)) or (word in self.stop_words):
            continue
        elif (word[0] >= 'A' and word[0] <= 'Z') or (word[0] >= 'a' and word[0] <= 'z'):
            continue  # token starts with an English letter
        elif ord(word[0]) < 1024:
            continue  # other non-Chinese characters (low code points)
        else:
            strlist_new.append(word)
    return strlist_new
Example 3: sentenceToIndex
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
# Also needed here: import re
def sentenceToIndex(sentence, word2idx, maxLen):
    """
    Tokenize a sentence and convert it into a list of indices into the embedding matrix.
    :param sentence: the input sentence
    :param word2idx: word-to-index mapping
    :param maxLen: maximum sentence length
    :return: the sentence as a list of word-vector indices
    """
    unknown = word2idx.get("UNKNOWN", 0)
    num = word2idx.get("NUM", len(word2idx))
    index = [unknown] * maxLen  # pad with the UNKNOWN index up to maxLen
    i = 0
    for word in jieba.cut(sentence):
        if word in word2idx:
            index[i] = word2idx[word]
        else:
            if re.match(r"\d+", word):
                index[i] = num  # map number tokens to the NUM index
            else:
                index[i] = unknown
        if i >= maxLen - 1:
            break
        i += 1
    return index
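A hedged usage sketch for Example 3 (the toy word2idx below is invented for illustration): known words map to their own index, digit tokens to NUM, everything else to UNKNOWN, and the result is padded with UNKNOWN up to maxLen.

word2idx = {"UNKNOWN": 0, "NUM": 1, "我": 2, "喜欢": 3}  # toy vocabulary, illustration only
print(sentenceToIndex("我喜欢2023年的北京", word2idx, maxLen=8))
# e.g. [2, 3, 1, 0, 0, 0, 0, 0], assuming jieba segments "2023" as a single token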
Example 4: loadDataSet
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
# Also needed here: import re; stop is a module-level list of stop words
def loadDataSet(path):  # return the tokens and the label of each Weibo post
    line_cut = []
    label = []
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            temp = line.strip()
            try:
                sentence = temp[2:].lstrip()  # the post text
                label.append(int(temp[:2]))  # the label
                word_list = []
                sentence = str(sentence).replace('\u200b', '')
                for word in jieba.cut(sentence.strip()):
                    # Note: re.L with a str pattern only works on Python 2; on Python 3 it
                    # raises ValueError (see Example 5 for a bytes-based variant)
                    p = re.compile(r'\w', re.L)
                    result = p.sub("", word)
                    if not result or result == ' ':  # empty string
                        continue
                    word_list.append(word)
                word_list = list(set(word_list) - set(stop) - set('\u200b')
                                 - set(' ') - set('\u3000') - set('️'))
                line_cut.append(word_list)
            except Exception:
                continue  # skip malformed lines
    return line_cut, label  # tokens and label for each post
Example 5: loadDataSet
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
# Same as Example 4, but the locale-aware regex works on bytes, so it also runs on Python 3
def loadDataSet(path):  # return the tokens and the label of each Weibo post
    line_cut = []
    label = []
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            temp = line.strip()
            try:
                sentence = temp[2:].lstrip()  # the post text
                label.append(int(temp[:2]))  # the label
                word_list = []
                sentence = str(sentence).replace('\u200b', '')
                for word in jieba.cut(sentence.strip()):
                    p = re.compile(rb'\w', re.L)
                    result = p.sub(b"", bytes(word, encoding="utf-8")).decode("utf-8")
                    if not result or result == ' ':  # empty string
                        continue
                    word_list.append(word)
                word_list = list(set(word_list) - set(stop) - set('\u200b')
                                 - set(' ') - set('\u3000') - set('️'))
                line_cut.append(word_list)
            except Exception:
                continue  # skip malformed lines
    return line_cut, label  # tokens and label for each post
Example 6: read_data
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
# Also needed here: import codecs; reg_sep is a module-level compiled regex of separators
def read_data(fin):
    poem_words = list()
    title_flag = False
    title = ''
    fd = codecs.open(fin, 'r', 'utf-8')
    for line in fd:
        line = line.strip()
        line = reg_sep.sub(' ', line)
        title_flag = not title_flag  # the input alternates between title lines and body lines
        if title_flag:
            title = line
        else:
            words = ' '.join(jieba.cut(title + line))
            poem_words.append(words)
    fd.close()
    print('Read data done.')
    return poem_words
Example 7: analyze_zh
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
def analyze_zh():
    translation_path = os.path.join(train_translation_folder, train_translation_zh_filename)
    with open(translation_path, 'r') as f:
        sentences = f.readlines()
    sent_lengths = []
    for sentence in tqdm(sentences):
        seg_list = list(jieba.cut(sentence.strip()))
        # Record the sentence length in jieba tokens
        sent_lengths.append(len(seg_list))
    num_bins = 100
    n, bins, patches = plt.hist(sent_lengths, num_bins, facecolor='blue', alpha=0.5)
    title = 'Chinese Sentence Lengths Distribution'
    plt.title(title)
    plt.show()
Example 8: analyze_en
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
def analyze_en():
    translation_path = os.path.join(train_translation_folder, train_translation_en_filename)
    with open(translation_path, 'r') as f:
        sentences = f.readlines()
    sent_lengths = []
    for sentence in tqdm(sentences):
        sentence_en = sentence.strip().lower()
        tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en)]
        seg_list = list(jieba.cut(sentence.strip()))
        # Record the sentence length as the number of jieba tokens
        # (note that the nltk `tokens` list computed above is not used here)
        sent_lengths.append(len(seg_list))
    num_bins = 100
    n, bins, patches = plt.hist(sent_lengths, num_bins, facecolor='blue', alpha=0.5)
    title = 'English Sentence Lengths Distribution'
    plt.title(title)
    plt.show()
Example 9: WordBeark
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
def WordBeark():
    logger.info("running Word Beark in " + path + data)
    inputfile = path + data + ".zhs"
    outputfile = path + data + ".wordbreak"
    i = 0
    output = open(outputfile, 'w')
    input = open(inputfile, 'r')
    for line in input.readlines():
        seg_list = jieba.cut(line)
        output.write(u' '.join(seg_list))  # write the segmented line, tokens separated by spaces
        i = i + 1
        if (i % 10000 == 0):
            logger.info("Cut " + str(i) + " articles")
    output.close()
    logger.info("Finished Saved " + str(i) + " articles in " + outputfile)
Example 10: build_key_word
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
def build_key_word(path):
    """
    Build a keyword list from word frequencies.
    :param path: path to a UTF-8 text file
    :return: the most frequent words
    """
    d = {}
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            for word in jieba.cut(line.strip()):
                if len(word) > 1:  # skip single characters, which mostly add noise to the counts
                    d[word] = d.get(word, 0) + 1
    kw_list = sorted(d, key=lambda x: d[x], reverse=True)
    # Keep the top 20% most frequent words
    size = int(len(kw_list) * 0.2)
    return kw_list[:size]
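A usage sketch for Example 10 (the file path is hypothetical): the returned high-frequency words can serve as a small keyword vocabulary, for instance to build a bag-of-keywords vector for a single post.

keywords = build_key_word("train_corpus.txt")  # hypothetical path to a UTF-8 text file
post_tokens = set(jieba.cut("今天天气很好"))
features = [1 if kw in post_tokens else 0 for kw in keywords]  # one indicator per keyword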
Example 11: _zh_split
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
def _zh_split(s):
    """
    Split text for length counting, with special handling for Chinese.
    """
    import jieba
    try:
        s.encode('ascii')
        has_zh = False
    except ValueError:  # UnicodeEncodeError is a subclass of ValueError
        has_zh = True
    if has_zh:
        return list(jieba.cut(s))
    else:
        return pofile.WORD_SEP.split(s)

# code modified from babel.messages.pofile (hash 359ecffca479dfe032d0f7210d5cd8160599c816)
Example 12: _asian_tokenization
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
# punct_re, Tok, Sentence and Doc are defined elsewhere in the same module
def _asian_tokenization(doc, entity_type, tag_type, tokenizer):
    sents = []
    for paragraph in doc.split('\n'):
        # Split on sentence-final punctuation; the capture group keeps the delimiters,
        # which are re-attached to the preceding fragment below
        sent_splits = iter(re.split(r'(?|。|」|!)+', paragraph, flags=re.MULTILINE))
        for partial_sent in sent_splits:
            sent = partial_sent + next(sent_splits, '')
            if sent.strip() == '':
                continue
            toks = []
            # for tok in jieba.cut(sent, ):
            for tok in tokenizer(sent):
                pos = 'WORD'
                if tok.strip() == '':
                    pos = 'SPACE'
                elif punct_re.match(tok):
                    pos = 'PUNCT'
                toks.append(Tok(pos,
                                tok[:2].lower(),
                                tok.lower(),
                                tok,
                                ent_type='' if entity_type is None else entity_type.get(tok, ''),
                                tag='' if tag_type is None else tag_type.get(tok, '')))
            sents.append(Sentence(toks, sent))
    return Doc(sents, doc)
Example 13: __init__
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
def __init__(self, data_file, sequence_length, word2idx, char_level=True):
    self.word2idx = word2idx
    self.seq_len = sequence_length
    x1, x2, y = [], [], []
    for line in open(data_file, 'r'):
        _, s1, s2, label = line.strip().split('\t')
        s1, s2 = map(self._clean_text, [s1, s2])
        if not char_level:
            # Word-level mode: segment both sentences with jieba
            s1 = list(jieba.cut(s1))
            s2 = list(jieba.cut(s2))
        x1.append(s1)
        x2.append(s2)
        y.append(1 if label == '1' else 0)
    self.x1 = x1
    self.x2 = x2
    self.y = y
Example 14: _load_data
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
def _load_data(self, data_file):
    """Load the original training data and do text pre-processing (conversion and cleaning).
    Returns:
        A generator yielding
        if self.is_training:
            sentence pairs and labels (s1, s2, y);
        else:
            sentence pairs and None (s1, s2, None).
    """
    for line in open(data_file):
        line = line.strip().decode('utf-8').split('\t')  # Python 2 style: decode the raw bytes
        s1, s2 = map(self._clean_text, map(self._tradition2simple, line[1:3]))
        if not self.char_level:
            s1 = list(jieba.cut(s1))
            s2 = list(jieba.cut(s2))
        if self.is_training:
            y = int(line[-1])  # 1 or [1]
            yield s1, s2, y
        else:
            yield s1, s2, None  # for consistency
Example 15: extract_bow
# Required module: import jieba [as alias]
# Or: from jieba import cut [as alias]
# Also needed here: import pandas as pd; from sklearn.feature_extraction.text import TfidfVectorizer;
# preprocess is defined elsewhere in the project
def extract_bow(train_file="../data/train.csv", test_file="../data/test.csv", analyzer='char',
                ngram_range=(1, 1), stop_words=[], min_df=1, max_features=10000,
                use_idf=True, to_preprocess=True):
    """Return TF-IDF matrices for train title1/title2 and test title1/title2, plus the fitted vectorizer."""
    df_train = pd.read_csv(train_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df_test = pd.read_csv(test_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df = pd.DataFrame()
    # Fit the vectorizer on the set of unique titles from both train and test
    df['text'] = pd.Series(df_train['title1_zh'].tolist() + df_train['title2_zh'].tolist() +
                           df_test['title1_zh'].tolist() + df_test['title2_zh'].tolist()).unique()
    if to_preprocess:
        df['text'] = df['text'].map(lambda x: preprocess(x))
        df_train['title1_zh'] = df_train['title1_zh'].apply(preprocess)
        df_train['title2_zh'] = df_train['title2_zh'].apply(preprocess)
        df_test['title1_zh'] = df_test['title1_zh'].apply(preprocess)
        df_test['title2_zh'] = df_test['title2_zh'].apply(preprocess)
    if analyzer == 'char':
        vect = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, stop_words=stop_words,
                               min_df=min_df, max_features=max_features, use_idf=use_idf)
    else:
        # Word-level analysis: jieba.cut serves as the tokenizer
        vect = TfidfVectorizer(analyzer=analyzer, tokenizer=jieba.cut, ngram_range=ngram_range,
                               stop_words=stop_words, min_df=min_df, max_features=max_features,
                               use_idf=use_idf)
    vect.fit(df["text"].tolist())
    return (vect.transform(df_train.title1_zh), vect.transform(df_train.title2_zh),
            vect.transform(df_test.title1_zh), vect.transform(df_test.title2_zh), vect)
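Finally, a hedged usage sketch for Example 15 (paths and column names follow the defaults above): because TfidfVectorizer L2-normalizes its rows by default, the row-wise dot product of the two title matrices is their cosine similarity, which makes a simple pairwise feature.

import numpy as np

X1_tr, X2_tr, X1_te, X2_te, vect = extract_bow(analyzer='word')  # word-level, tokenized by jieba.cut
train_sim = np.asarray(X1_tr.multiply(X2_tr).sum(axis=1)).ravel()  # one cosine score per train pair
test_sim = np.asarray(X1_te.multiply(X2_te).sum(axis=1)).ravel()   # and per test pair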