This article compiles typical usage examples of Python's sklearn.feature_extraction.text.CountVectorizer. If you are wondering what exactly text.CountVectorizer does, how to use it, or simply want to see it in real code, the curated examples below may help. You can also explore further usage examples from the module it belongs to, sklearn.feature_extraction.text.
The following shows 15 code examples of text.CountVectorizer, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
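Before diving into the examples, here is a minimal, self-contained sketch of the basic CountVectorizer workflow. It is not taken from any of the projects below, and the toy corpus is invented purely for illustration:

from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus, purely for illustration
docs = ["the cat sat on the mat", "the dog chased the cat"]

vectorizer = CountVectorizer()          # default word-level tokenization
X = vectorizer.fit_transform(docs)      # sparse document-term count matrix

# Learned vocabulary and per-document counts
# (get_feature_names_out requires scikit-learn >= 1.0; older releases use get_feature_names)
print(vectorizer.get_feature_names_out())
print(X.toarray())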
Example 1: test_validate_sklearn_sgd_with_text_cv
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def test_validate_sklearn_sgd_with_text_cv(self):
    categories = ['alt.atheism', 'talk.religion.misc']
    data = fetch_20newsgroups(subset='train', categories=categories)
    X = data.data[:4]
    Y = data.target[:4]
    features = ['input']
    target = 'output'
    model = SGDClassifier(loss="log")
    file_name = model.__class__.__name__ + '_CountVec_.pmml'
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('clf', model)
    ])
    pipeline.fit(X, Y)
    skl_to_pmml(pipeline, features, target, file_name)
    self.assertEqual(self.schema.is_valid(file_name), True)
Example 2: read_relations
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def read_relations(file_name):
    bow = []
    count_vec = CountVectorizer()
    d = {}
    file = open(file_name)
    for line in file:
        index, name = line.strip().split('\t')
        d[name] = int(index)
        if args.feature_type == 'bow' and not os.path.exists('../data/' + args.dataset + '/bow.npy'):
            tokens = re.findall('[a-z]{2,}', name)
            bow.append(' '.join(tokens))
    file.close()
    if args.feature_type == 'bow' and not os.path.exists('../data/' + args.dataset + '/bow.npy'):
        bow = count_vec.fit_transform(bow)
        np.save('../data/' + args.dataset + '/bow.npy', bow.toarray())
    return d
Example 3: test_from_texts
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def test_from_texts():
    cal = PMICalculator(doc2word_vectorizer=CountVectorizer(min_df=0),
                        doc2label_vectorizer=LabelCountVectorizer())
    actual = cal.from_texts(docs, labels)
    assert_equal(actual.shape[1], 4)
    assert_equal(actual.shape[0], 9)
    assert_equal(cal.index2word_, {0: u'information',
                                   1: u'language',
                                   2: u'learning',
                                   3: u'machine',
                                   4: u'mining',
                                   5: u'natural',
                                   6: u'processing',
                                   7: u'retrieval',
                                   8: u'text'})
    assert_equal(cal.index2label_, {0: 'information retrieval'.split(),
                                    1: 'machine learning'.split(),
                                    2: 'natural language processing'.split(),
                                    3: 'text mining'.split()})
Example 4: _te_ss_t_build
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def _te_ss_t_build(self):
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.feature_extraction.text import CountVectorizer
    newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
    count_vectorizer = CountVectorizer()
    X_counts = count_vectorizer.fit_transform(newsgroups_train.data)
    corpus = CorpusFromScikit(
        X=X_counts,
        y=newsgroups_train.target,
        feature_vocabulary=count_vectorizer.vocabulary_,
        category_names=newsgroups_train.target_names,
        raw_texts=newsgroups_train.data
    ).build()
    self.assertEqual(corpus.get_categories()[:2], ['alt.atheism', 'comp.graphics'])
    self.assertEqual(corpus
                     .get_term_freq_df()
                     .assign(score=corpus.get_scaled_f_scores('alt.atheism'))
                     .sort_values(by='score', ascending=False).index.tolist()[:5],
                     ['atheism', 'atheists', 'islam', 'atheist', 'belief'])
    self.assertGreater(len(corpus.get_texts()[0]), 5)
Example 5: test_build
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def test_build(self):
    from sklearn.feature_extraction.text import CountVectorizer
    categories, docs = get_docs_categories_semiotic()
    idx_store = IndexStore()
    y = np.array([idx_store.getidx(c) for c in categories])
    count_vectorizer = CountVectorizer()
    X_counts = count_vectorizer.fit_transform(docs)
    term_doc_mat = TermDocMatrixFromScikit(
        X=X_counts,
        y=y,
        feature_vocabulary=count_vectorizer.vocabulary_,
        category_names=idx_store.values()).build()
    self.assertEqual(term_doc_mat.get_categories()[:2], ['hamlet', 'jay-z/r. kelly'])
    self.assertEqual(term_doc_mat
                     .get_term_freq_df()
                     .assign(score=term_doc_mat.get_scaled_f_scores('hamlet'))
                     .sort_values(by='score', ascending=False).index.tolist()[:5],
                     ['that', 'march', 'did', 'majesty', 'sometimes'])
Example 6: __init__
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def __init__(
    self,
    anonymize=True,
    trim_window=5,
    lowercase=True,
    drop_stopwords=True,
    stem=True,
    ngram_range=(1, 3),
    **vectorizer_kwargs,
):
    self.anonymize = anonymize
    self.lowercase = lowercase
    self.drop_stopwords = drop_stopwords
    if drop_stopwords:
        nltk.download("stopwords")
        self.stopwords = set(nltk.corpus.stopwords.words("english"))
    self.trim_window = trim_window
    self.stem = stem
    if stem:
        self.porter = nltk.PorterStemmer()
    self.vectorizer = CountVectorizer(
        ngram_range=ngram_range, binary=True, **vectorizer_kwargs
    )
Example 7: _n_grams
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def _n_grams():
    """
    Calculates various statistical features over the 1-, 2- and 3-grams of the suffix- and dot-free domain
    :return:
    """
    global __unigram
    feature = []
    for i in range(1, 4):
        ngram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(i, i))
        counts = ngram_vectorizer.build_analyzer()(__joined_dot_split_suffix_free)
        npa = numpy.array(list(Counter(counts).values()), dtype=int)
        if i == 1:
            __unigram = npa
        feature += __stats_over_n_grams(npa)
    return feature
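The function above relies on build_analyzer() to obtain the raw character n-grams before counting them with a Counter. A hedged, standalone sketch of that step (the sample string is hypothetical):

from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

# Extract character bigrams without building a full document-term matrix
analyzer = CountVectorizer(analyzer='char', ngram_range=(2, 2)).build_analyzer()
bigrams = analyzer("examplegoogle")     # e.g. ['ex', 'xa', 'am', ...]
print(Counter(bigrams).most_common(5))  # most frequent character bigrams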
Example 8: __init__
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def __init__(self, language=None):
    """ Initialize the stoplist builder with an option for language-specific parameters
    :type language: str
    :param language: language of the texts for which the stoplist is built
    """
    if language:
        self.language = language.lower()
    self.numpy_installed = True  # Write utility for common import traps?
    self.sklearn_installed = True
    try:
        import numpy as np
        self.np = np
    except ImportError:
        self.numpy_installed = False
    try:
        from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
        # self.vectorizer = CountVectorizer(input='content')  # Set df?
        # self.tfidf_vectorizer = TfidfVectorizer()
    except ImportError:
        self.sklearn_installed = False
Example 9: tf_word_feature
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def tf_word_feature(self, data_set):
    """
    Get the TF feature by word
    :param data_set:
    :return:
    """
    data_set = get_word_segment_data(data_set)
    if self.is_infer:
        self.vectorizer = load_pkl(self.feature_vec_path)
        data_feature = self.vectorizer.transform(data_set)
    else:
        self.vectorizer = CountVectorizer(vocabulary=self.word_vocab)
        data_feature = self.vectorizer.fit_transform(data_set)
    vocab = self.vectorizer.vocabulary_
    logger.info('Vocab size:%d' % len(vocab))
    feature_names = self.vectorizer.get_feature_names()
    logger.info('feature_names:%s' % feature_names[:20])
    logger.info(data_feature.shape)
    if not self.is_infer:
        save_pkl(self.vectorizer, self.feature_vec_path, overwrite=True)
    return data_feature
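The method above keeps the training and inference feature spaces aligned by passing a fixed vocabulary to CountVectorizer. A minimal sketch of that behaviour, with an invented vocabulary and text:

from sklearn.feature_extraction.text import CountVectorizer

vocab = ['cat', 'dog', 'mat']                       # fixed, pre-built word vocabulary
vec = CountVectorizer(vocabulary=vocab)
X = vec.fit_transform(["the cat sat on the mat"])   # columns follow the vocab order
print(X.toarray())                                  # [[1 0 1]] -- out-of-vocabulary words are ignored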
Example 10: test_04_lgbm_regressor
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def test_04_lgbm_regressor(self):
    print("\ntest 04 (lgbm regressor with preprocessing)\n")
    auto = pd.read_csv('nyoka/tests/auto-mpg.csv')
    X = auto.drop(['mpg'], axis=1)
    y = auto['mpg']
    feature_names = [name for name in auto.columns if name not in ('mpg',)]
    target_name = 'mpg'
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)
    pd.DataFrame(data=x_test, columns=feature_names).to_csv("test.csv", index=False)
    pipeline_obj = Pipeline([
        ('mapper', DataFrameMapper([
            ('car name', CountVectorizer()),
            (['displacement'], [StandardScaler()])
        ])),
        ('lgbmr', LGBMRegressor())
    ])
    pipeline_obj.fit(x_train, y_train)
    file_name = "test04lgbm.pmml"
    lgb_to_pmml(pipeline_obj, feature_names, 'mpg', file_name)
    model_name = self.adapa_utility.upload_to_zserver(file_name)
    predictions, _ = self.adapa_utility.score_in_zserver(model_name, "test.csv")
    predictions = numpy.array(predictions)
    model_pred = pipeline_obj.predict(x_test)
    self.assertEqual(self.adapa_utility.compare_predictions(predictions, model_pred), True)
Example 11: word2vec
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def word2vec(word_list, n_features=1000, topics=5):
    tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                    max_features=n_features,
                                    # stop_words='english',
                                    max_df=0.5,
                                    min_df=10)
    tf = tf_vectorizer.fit_transform(word_list)
    lda = LatentDirichletAllocation(n_components=topics,  # number of topics
                                    learning_method='batch',  # with a small corpus used only for exploration, 'batch' works well and leaves fewer hyperparameters to tune
                                    )
    # Train the model with variational Bayes
    lda.fit(tf)
    # Output the keyword list of each topic in turn
    tf_feature_names = tf_vectorizer.get_feature_names()
    return lda, tf, tf_feature_names, tf_vectorizer

# Present the topics as a visualization
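A possible follow-up (not part of the original project) is to read the top keywords of each topic from lda.components_; this assumes word_list is a sufficiently large list of whitespace-segmented documents so that min_df=10 can be satisfied:

lda, tf, tf_feature_names, tf_vectorizer = word2vec(word_list, n_features=1000, topics=5)
for topic_idx, topic in enumerate(lda.components_):
    top_words = [tf_feature_names[i] for i in topic.argsort()[:-11:-1]]  # ten highest-weighted words
    print("Topic %d: %s" % (topic_idx, ' '.join(top_words)))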
Example 12: build_vectorization_pipeline
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
    """
    Build the scikit-learn vectorization pipeline for this field.
    This is used in field-based machine learning when we calculate the value of one field based on the
    values of other fields of this document.
    We are able to detect only choice fields this way at the moment.
    To achieve this we need to build a feature vector of all dependencies of the field being detected.
    This feature vector is built as a union of the feature vectors of each dependency.
    See how the whole pipeline is built in FieldBasedMLOnlyFieldDetectionStrategy.build_pipeline(..)
    :return: Tuple of: 1. List of vectorization steps - to be added to a Pipeline()
                       2. List of str feature names or a function returning a list of str feature names.
    """
    vect = CountVectorizer(strip_accents='unicode', analyzer='word',
                           stop_words=self._build_stop_words())
    return [('clean', vectorizers.ReplaceNoneTransformer('')),
            ('vect', vect),
            ('tfidf', TfidfTransformer())], self._wrap_get_feature_names(vect)
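The count-then-tfidf idea in the pipeline above can be reproduced in isolation; here is a small sketch with invented texts (the real method additionally cleans None values and injects field-specific stop words):

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

texts = ["first document about contracts", "second document about invoices"]
pipe = Pipeline([
    ('vect', CountVectorizer(strip_accents='unicode', analyzer='word')),
    ('tfidf', TfidfTransformer()),
])
X = pipe.fit_transform(texts)   # sparse tf-idf matrix, one row per text
print(X.shape)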
Example 13: __init__
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def __init__(self):
    """Initializes the Encoder object and sets the internal tokenizer,
    labelEncoder and vectorizer using predefined objects.
    """
    self.tokenizer = BOWTokenizer(
        English()
    )  # the tokenizer must have a tokenize() and parse() function.
    self.labelEncoder = LabelEncoder()
    self.vectorizer = CountVectorizer(
        tokenizer=self.tokenizer.tokenize, ngram_range=(1, 1)
    )
    self.decode_params = {}
    # The keep_ids flag is used by explain local in the explainer to decode
    # importances over raw features.
Example 14: create_random_forest_vectorizer
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def create_random_forest_vectorizer():
    vectorizer = CountVectorizer(lowercase=False, min_df=0.0, binary=True)
    rf = RandomForestClassifier(n_estimators=500, random_state=777)
    return Pipeline([("vectorizer", vectorizer), ("rf", rf)])
Example 15: create_logistic_vectorizer
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def create_logistic_vectorizer():
    vectorizer = CountVectorizer(lowercase=False, min_df=0.0, binary=True)
    lr = LogisticRegression(random_state=777)
    return Pipeline([("vectorizer", vectorizer), ("lr", lr)])
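As a hedged usage sketch (the toy texts and labels are invented), the pipeline returned by create_logistic_vectorizer() behaves like any other scikit-learn estimator:

pipeline = create_logistic_vectorizer()
texts = ["good movie", "bad movie", "great film", "terrible film"]
labels = [1, 0, 1, 0]
pipeline.fit(texts, labels)
print(pipeline.predict(["good film"]))   # e.g. array([1])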