当前位置: 首页>>代码示例>>Python>>正文


Python text.TfidfTransformer方法代码示例

本文整理汇总了Python中sklearn.feature_extraction.text.TfidfTransformer方法的典型用法代码示例。如果您正苦于以下问题:Python text.TfidfTransformer方法的具体用法?Python text.TfidfTransformer怎么用?Python text.TfidfTransformer使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在sklearn.feature_extraction.text的用法示例。


在下文中一共展示了text.TfidfTransformer方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: write_topics

# 需要导入模块: from sklearn.feature_extraction import text [as 别名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 别名]
def write_topics(ftopics, fwords, ftopics_words, poem_words, n_topic, n_topic_words):
    count_matrix = count_vect.fit_transform(poem_words)
    tfidf = TfidfTransformer().fit_transform(count_matrix)
    nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
    feature_names = count_vect.get_feature_names()
    fw = codecs.open(ftopics, 'w', 'utf-8')
    for topic in nmf.components_:
        fw.write(' '.join([feature_names[i] for i in topic.argsort()[:-n_topic_words - 1:-1]]) + '\n')
    fw.close()
    print('Write topics done.')
    fw = codecs.open(fwords, 'wb')
    pickle.dump(feature_names, fw)
    fw.close()
    print('Write words done.')
    fw = codecs.open(ftopics_words, 'wb')
    pickle.dump(nmf.components_, fw)
    fw.close()
    print('Write topic_words done.') 
开发者ID:lijiancheng0614,项目名称:poem_generator,代码行数:20,代码来源:get_topic.py

示例2: get_logistic_regression_coefs_l2

# 需要导入模块: from sklearn.feature_extraction import text [as 别名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 别名]
def get_logistic_regression_coefs_l2(self, category,
                                         clf=RidgeClassifierCV()):
        ''' Computes l2-penalized logistic regression score.
        Parameters
        ----------
        category : str
            category name to score

        category : str
            category name to score
        Returns
        -------
            (coefficient array, accuracy, majority class baseline accuracy)
        '''
        try:
            from sklearn.cross_validation import cross_val_predict
        except:
            from sklearn.model_selection import cross_val_predict
        y = self._get_mask_from_category(category)
        X = TfidfTransformer().fit_transform(self._X)
        clf.fit(X, y)
        y_hat = cross_val_predict(clf, X, y)
        acc, baseline = self._get_accuracy_and_baseline_accuracy(y, y_hat)
        return clf.coef_[0], acc, baseline 
开发者ID:JasonKessler,项目名称:scattertext,代码行数:26,代码来源:TermDocMatrix.py

示例3: test_main

# 需要导入模块: from sklearn.feature_extraction import text [as 别名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 别名]
def test_main(self):
		categories, documents = get_docs_categories()
		clean_function = lambda text: '' if text.startswith('[') else text
		entity_types = set(['GPE'])
		term_doc_mat = (
			TermDocMatrixFactory(
				category_text_iter=zip(categories, documents),
				clean_function=clean_function,
				nlp=_testing_nlp,
				feats_from_spacy_doc=FeatsFromSpacyDoc(entity_types_to_censor=entity_types)
			).build()
		)
		clf = PassiveAggressiveClassifier()
		fdc = FeatsFromDoc(term_doc_mat._term_idx_store,
		                   clean_function=clean_function,
		                   feats_from_spacy_doc=FeatsFromSpacyDoc(
			                   entity_types_to_censor=entity_types)).set_nlp(_testing_nlp)
		tfidf = TfidfTransformer(norm='l1')
		X = tfidf.fit_transform(term_doc_mat._X)
		clf.fit(X, term_doc_mat._y)
		X_to_predict = fdc.feats_from_doc('Did sometimes march UNKNOWNWORD')
		pred = clf.predict(tfidf.transform(X_to_predict))
		dec = clf.decision_function(X_to_predict) 
开发者ID:JasonKessler,项目名称:scattertext,代码行数:25,代码来源:test_termDocMatrixFactory.py

示例4: tdidf_sim

# 需要导入模块: from sklearn.feature_extraction import text [as 别名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 别名]
def tdidf_sim(sentences):
    """
       tfidf相似度
    :param sentences: 
    :return: 
    """
    # tfidf计算
    model = TfidfVectorizer(tokenizer=jieba.cut,
                            ngram_range=(1, 2), # 3,5
                            stop_words=[' ', '\t', '\n'],  # 停用词
                            max_features=10000,
                            token_pattern=r"(?u)\b\w+\b",  # 过滤停用词
                            min_df=1,
                            max_df=0.9,
                            use_idf=1,  # 光滑
                            smooth_idf=1,  # 光滑
                            sublinear_tf=1, )  # 光滑
    matrix = model.fit_transform(sentences)
    matrix_norm = TfidfTransformer().fit_transform(matrix)
    return matrix_norm 
开发者ID:yongzhuo,项目名称:nlg-yongzhuo,代码行数:22,代码来源:textrank_sklearn.py

示例5: test_tf_idf_smoothing

# 需要导入模块: from sklearn.feature_extraction import text [as 别名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 别名]
def test_tf_idf_smoothing():
    X = [[1, 1, 1],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert (tfidf >= 0).all()

    # check normalization
    assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1., 1., 1.])

    # this is robust to features with only zeros
    X = [[1, 1, 0],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert (tfidf >= 0).all() 
开发者ID:PacktPublishing,项目名称:Mastering-Elasticsearch-7.0,代码行数:20,代码来源:test_text.py

示例6: extract_characters

# 需要导入模块: from sklearn.feature_extraction import text [as 别名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 别名]
def extract_characters(sentences: list, dimension: int):
    """
    vertorizer
    :param sentences: list
    :param dimension: int
    :return: weight, training_data
    """
    print("Vetorizier...")
    # Transfer into frequency matrix a[i][j], word j in text class i frequency
    vertorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.46)
    # vertorizer = CountVectorizer()
    # collect tf-idf weight
    transformer = TfidfTransformer()
    # outer transform for calculate tf-idf, second for transform into matrix
    tfidf = transformer.fit_transform(vertorizer.fit_transform(sentences))
    # get all words in BOW
    words_bag = vertorizer.get_feature_names()
    # w[i][j] represents word j's weight in text class i
    weight = tfidf.toarray()
    print('Features length:' + str(len(words_bag)))
    pca = PCA(n_components=dimension)
    training_data = pca.fit_transform(weight)
    return weight, training_data 
开发者ID:FesonX,项目名称:cn-text-classifier,代码行数:25,代码来源:preprocess.py

示例7: build_vectorization_pipeline

# 需要导入模块: from sklearn.feature_extraction import text [as 别名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 别名]
def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
        """
        Build SKLearn vectorization pipeline for this field.
        This is used in field-based machine learning when we calculate value of one field based on the
        values of other fields of this document.

        We are able to detect only choice fields this way at the moment.

        To reach this we need to build a feature vector of all dependencies of the field being detected.
        This feature vector is built as a union of feature vectors of each dependency.

        See how the whole pipeline is built in FieldBasedMLOnlyFieldDetectionStrategy.build_pipeline(..)

        :return: Tuple of: 1. List of vectorization steps - to be added to a Pipeline()
                           2. List of str feature names or a function returning list of str feature names.
        """

        vect = CountVectorizer(strip_accents='unicode', analyzer='word',
                               stop_words=self._build_stop_words())
        return [('clean', vectorizers.ReplaceNoneTransformer('')),
                ('vect', vect),
                ('tfidf', TfidfTransformer())], self._wrap_get_feature_names(vect) 
开发者ID:LexPredict,项目名称:lexpredict-contraxsuite,代码行数:24,代码来源:field_types.py

示例8: get_topic_idf

# 需要导入模块: from sklearn.feature_extraction import text [as 别名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 别名]
def get_topic_idf(self, sentences):
        vectorizer = CountVectorizer()
        sent_word_matrix = vectorizer.fit_transform(sentences)

        transformer = TfidfTransformer(norm=None, sublinear_tf=False, smooth_idf=False)
        tfidf = transformer.fit_transform(sent_word_matrix)
        tfidf = tfidf.toarray()

        centroid_vector = tfidf.sum(0)
        centroid_vector = np.divide(centroid_vector, centroid_vector.max())
        # print(centroid_vector.max())

        feature_names = vectorizer.get_feature_names()

        relevant_vector_indices = np.where(centroid_vector > self.topic_threshold)[0]

        word_list = list(np.array(feature_names)[relevant_vector_indices])
        return word_list 
开发者ID:gaetangate,项目名称:text-summarizer,代码行数:20,代码来源:centroid_word_embeddings.py

示例9: get_keyword_from_tf

# 需要导入模块: from sklearn.feature_extraction import text [as 别名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 别名]
def get_keyword_from_tf(sentences, p):
    """
      获取某个类型下语料的热词
    :param sentences: list, cuted sentences, filter by " "
    :param p: float, rate, 0 < p < 1
    :return: list, words
    """
    sentence_cut_list = [" ".join(list(jieba.cut(text.strip(), cut_all=False, HMM=True))) for text in sentences]
    # token_pattern指定统计词频的模式, 不指定, 默认如英文, 不统计单字
    vectorizer = CountVectorizer(token_pattern='\\b\\w+\\b')
    # norm=None对词频结果不归一化
    # use_idf=False, 因为使用的是计算tfidf的函数, 所以要忽略idf的计算
    transformer = TfidfTransformer(norm=None, use_idf=False)
    vectorizer.fit_transform(sentence_cut_list)
    # tf = transformer.fit_transform(vectorizer.fit_transform(sentence_cut_list))
    word = vectorizer.get_feature_names()
    # weight = tf.toarray()
    return word[-int(len(word) * p):] 
开发者ID:yongzhuo,项目名称:nlp_xiaojiang,代码行数:20,代码来源:enhance_marko.py

示例10: load_data_ssl

# 需要导入模块: from sklearn.feature_extraction import text [as 别名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 别名]
def load_data_ssl(data_name):
    adj_csr, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(data_name)
    adj_mat = np.asarray(adj_csr.toarray(), dtype=np_float_type)
    x_tr = np.reshape(np.arange(len(train_mask))[train_mask], (-1, 1))
    x_val = np.reshape(np.arange(len(val_mask))[val_mask], (-1, 1))
    x_test = np.reshape(np.arange(len(test_mask))[test_mask], (-1, 1))
    y_tr = np.asarray(y_train[train_mask], dtype=np.int32)
    y_tr = np.reshape(np.sum(np.tile(np.arange(y_tr.shape[1]), (np.sum(train_mask), 1)) * y_tr, axis=1), (-1, 1))
    y_val = np.asarray(y_val[val_mask], dtype=np.int32)
    y_val = np.reshape(np.sum(np.tile(np.arange(y_val.shape[1]), (np.sum(val_mask), 1)) * y_val, axis=1), (-1, 1))
    y_test = np.asarray(y_test[test_mask], dtype=np.int32)
    y_test = np.reshape(np.sum(np.tile(np.arange(y_test.shape[1]), (np.sum(test_mask), 1)) * y_test, axis=1), (-1, 1))
    node_features = features.toarray()
    if data_name.lower() != 'pubmed': #pubmed already comes with tf-idf
        transformer = TfidfTransformer(smooth_idf=True)
        node_features = transformer.fit_transform(node_features).toarray()
    return adj_mat, node_features, x_tr, y_tr, x_val, y_val, x_test, y_test 
开发者ID:yincheng,项目名称:GGP,代码行数:19,代码来源:utils.py

示例11: test_nlp_not_padded_invalid

# 需要导入模块: from sklearn.feature_extraction import text [as 别名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 别名]
def test_nlp_not_padded_invalid(self):
        num_words = 1024
        (x_train, y_train), (_, _) = TestUtil.get_random_variable_length_dataset(max_value=num_words)

        explained_model = RandomForestClassifier(n_estimators=64, max_depth=5, random_state=1)

        counter = CountVectoriser(num_words)
        tfidf_transformer = TfidfTransformer()

        explained_model = Pipeline([('counts', counter),
                                    ('tfidf', tfidf_transformer),
                                    ('model', explained_model)])
        explained_model.fit(x_train, y_train)

        model_builder = RNNModelBuilder(embedding_size=num_words, with_embedding=True,
                                        num_layers=2, num_units=32, activation="relu", p_dropout=0.2, verbose=0,
                                        batch_size=32, learning_rate=0.001, num_epochs=2, early_stopping_patience=128)
        masking_operation = WordDropMasking()
        loss = binary_crossentropy
        explainer = CXPlain(explained_model, model_builder, masking_operation, loss)

        with self.assertRaises(ValueError):
            explainer.fit(x_train, y_train) 
开发者ID:d909b,项目名称:cxplain,代码行数:25,代码来源:test_explanation_model.py

示例12: transform_bag_of_words

# 需要导入模块: from sklearn.feature_extraction import text [as 别名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 别名]
def transform_bag_of_words(filename, n_dimensions, out_fn):
    import gzip
    from scipy.sparse import lil_matrix
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn import random_projection
    with gzip.open(filename, 'rb') as f:
        file_content = f.readlines()
        entries = int(file_content[0])
        words = int(file_content[1])
        file_content = file_content[3:]  # strip first three entries
        print("building matrix...")
        A = lil_matrix((entries, words))
        for e in file_content:
            doc, word, cnt = [int(v) for v in e.strip().split()]
            A[doc - 1, word - 1] = cnt
        print("normalizing matrix entries with tfidf...")
        B = TfidfTransformer().fit_transform(A)
        print("reducing dimensionality...")
        C = random_projection.GaussianRandomProjection(
            n_components=n_dimensions).fit_transform(B)
        X_train, X_test = train_test_split(C)
        write_output(numpy.array(X_train), numpy.array(
            X_test), out_fn, 'angular') 
开发者ID:erikbern,项目名称:ann-benchmarks,代码行数:25,代码来源:datasets.py

示例13: question_classifier

# 需要导入模块: from sklearn.feature_extraction import text [as 别名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 别名]
def question_classifier(data):
  questions = [i[1] for i in data]
  sql_type = [i[2].split(' ')[1] for i in data]
  sql_type_set = set(sql_type)
  sql_classes = dict([(type, i) for i, type in enumerate(sql_type_set)])

  target = np.array([sql_classes[i] for i in sql_type])
  sql_type_to_indices = {}
  for type in sql_type_set:
    sql_type_to_indices[type] = [idx for idx, i in enumerate(sql_type) if i == type]

  # Build classifier
  # TODO better ones 
  text_clf = Pipeline([('vect', CountVectorizer()),
                       ('tfidf', TfidfTransformer()),
                       ('clf', svm.LinearSVC())])
  text_clf.fit(questions, target)
  predicted = text_clf.predict(questions)
  print('Training Acc.: %f' %(np.mean(predicted == target)))

  return sql_type_to_indices, text_clf 
开发者ID:microsoft,项目名称:PointerSQL,代码行数:23,代码来源:relevance.py

示例14: removeSimilarSentences

# 需要导入模块: from sklearn.feature_extraction import text [as 别名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 别名]
def removeSimilarSentences(generatedSentences, originalSentences,  stopwords,threshold=0.80,):
    docs=[]
    for sent, sim in generatedSentences:
        docs.append(sent)
    docs.extend(originalSentences)
    
    bow_matrix = StemmedTfidfVectorizer(stop_words=stopwords).fit_transform(docs)
    normalized = TfidfTransformer().fit_transform(bow_matrix)
    #simMatrix = (normalized[0:] * normalized[0:].T).A
    simindices=[]
    #print 'Num original, ', len(originalSentences)
    for i in xrange(len(generatedSentences)):
        simGeneratedScores = linear_kernel(normalized[i], normalized[len(generatedSentences):]).flatten()
        if(max(simGeneratedScores) >= threshold):
            simindices.append(i)
    
    #print simindices
    finalGen=[sentence for k,sentence in enumerate(generatedSentences) if k not in simindices]
    #print len(generatedSentences), len(finalGen)
    return finalGen 
开发者ID:StevenLOL,项目名称:AbTextSumm,代码行数:22,代码来源:WGGraph.py

示例15: __init__

# 需要导入模块: from sklearn.feature_extraction import text [as 别名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 别名]
def __init__(self, min_df=1, max_df=0.9, tokenizer=LemmaTokenizer, hash=False):
        """
        `min_df` is set to filter out extremely rare words,
        since we don't want those to dominate the distance metric.

        `max_df` is set to filter out extremely common words,
        since they don't convey much information.
        """

        # Wrap the specified tokenizer
        t = Tokenizer(tokenizer())

        if hash:
            vectr = HashingVectorizer(input='content', stop_words='english', lowercase=True, tokenizer=t)
        else:
            vectr = CountVectorizer(input='content', stop_words='english', lowercase=True, tokenizer=t, min_df=min_df, max_df=max_df)

        args = [
            ('vectorizer', vectr),
            ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
            ('normalizer', Normalizer(copy=False))
        ]

        self.pipeline = Pipeline(args)
        self.trained = False 
开发者ID:frnsys,项目名称:broca,代码行数:27,代码来源:bow.py


注:本文中的sklearn.feature_extraction.text.TfidfTransformer方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。