当前位置: 首页>>代码示例>>Python>>正文


Python Vectorizer.transform方法代码示例

本文整理汇总了Python中sklearn.feature_extraction.text.Vectorizer.transform方法的典型用法代码示例。如果您正苦于以下问题:Python Vectorizer.transform方法的具体用法?Python Vectorizer.transform怎么用?Python Vectorizer.transform使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在sklearn.feature_extraction.text.Vectorizer的用法示例。


在下文中一共展示了Vectorizer.transform方法的8个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: load_mlcomp

# 需要导入模块: from sklearn.feature_extraction.text import Vectorizer [as 别名]
# 或者: from sklearn.feature_extraction.text.Vectorizer import transform [as 别名]
print "n_samples: %d, n_features: %d" % X_train.shape
assert sp.issparse(X_train)
y_train = news_train.target

print "Loading 20 newsgroups test set... "
news_test = load_mlcomp('20news-18828', 'test')
t0 = time()
print "done in %fs" % (time() - t0)

print "Predicting the labels of the test set..."
print "%d documents" % len(news_test.filenames)
print "%d categories" % len(news_test.target_names)

print "Extracting features from the dataset using the same vectorizer"
t0 = time()
X_test = vectorizer.transform((open(f).read() for f in news_test.filenames))
y_test = news_test.target
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_test.shape


###############################################################################
# Benchmark classifiers
def benchmark(clf_class, params, name):
    print "parameters:", params
    t0 = time()
    clf = clf_class(**params).fit(X_train, y_train)
    print "done in %fs" % (time() - t0)

    if hasattr(clf, 'coef_'):
        print "Percentage of non zeros coef: %f" % (
开发者ID:c0ldlimit,项目名称:scikit-learn,代码行数:33,代码来源:mlcomp_sparse_document_classification.py

示例2: time

# 需要导入模块: from sklearn.feature_extraction.text import Vectorizer [as 别名]
# 或者: from sklearn.feature_extraction.text.Vectorizer import transform [as 别名]
print

# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

# Fit the TF-IDF vectorizer on the training documents only; the test split
# is transformed below with the same fitted vocabulary (no refitting).
print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform(data_train.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
print

print "Extracting features from the test dataset using the same vectorizer"
t0 = time()
X_test = vectorizer.transform(data_test.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_test.shape
print

# Optional supervised feature selection: keep the k features with the
# highest chi-squared score w.r.t. the training labels.  The selector is
# fit on the training split only and merely applied to the test split,
# which avoids leaking test information into feature selection.
if opts.select_chi2:
    print ("Extracting %d best features by a chi-squared test" %
           opts.select_chi2)
    t0 = time()
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    print "done in %fs" % (time() - t0)
    print
vocabulary = np.array([t for t, i in sorted(vectorizer.vocabulary.iteritems(),
开发者ID:joshbohde,项目名称:scikit-learn,代码行数:33,代码来源:document_classification_20newsgroups.py

示例3: for

# 需要导入模块: from sklearn.feature_extraction.text import Vectorizer [as 别名]
# 或者: from sklearn.feature_extraction.text.Vectorizer import transform [as 别名]
            charset='utf-8', 
            stop_words=set(['a', 'an', 'and', 'in', 'is', 'of', 'on', 'the', 'to']),
            )
        )
    # Fit one vectorizer per text field (title, domain) on the rows selected
    # by the boolean train_index mask; zip+filter keeps only masked rows.
    title_train = title_vectorizer.fit_transform([ x for (x, y) in zip(all_data['title'], train_index) if y ])

    domain_vectorizer = extract.SimpleVectorizer()
    domain_train = domain_vectorizer.fit_transform([ x for (x, y) in zip(all_data['domain'], train_index) if y ])
    # NOTE(review): X_train is the *title* features here, but X_test below is
    # set to the *domain* features — looks inconsistent; confirm which
    # feature set is actually intended for training vs. evaluation.
    X_train = title_train
    print "done in %fs" % (time() - t0)
    print "n_samples: %d, n_features: %d" % X_train.shape
    print

    # Transform the test rows with the vectorizers fitted above (no refit),
    # so train and test share the same vocabularies.
    print "Extracting features from the test dataset using the same vectorizer"
    t0 = time()
    title_test = title_vectorizer.transform([ x for (x, y) in zip(all_data['title'], test_index) if y ])
    domain_test = domain_vectorizer.transform([ x for (x, y) in zip(all_data['domain'], test_index) if y ])
    X_test = domain_test
    print "done in %fs" % (time() - t0)
    print "n_samples: %d, n_features: %d" % X_test.shape
    print

    # Optional chi-squared selection of the k best title-word features,
    # fitted on the training titles and applied to the test titles.
    if opts.words_chi2:
        print ("Extracting %d best word features by a chi-squared test" %
               opts.words_chi2)
        t0 = time()
        ch2 = SelectKBest(chi2, k=opts.words_chi2)
        title_train = ch2.fit_transform(title_train, y_train)
        title_test = ch2.transform(title_test)
        print "done in %fs" % (time() - t0)
        print
开发者ID:smoreinis,项目名称:classificator,代码行数:33,代码来源:gold.py

示例4: test_vectorizer

# 需要导入模块: from sklearn.feature_extraction.text import Vectorizer [as 别名]
# 或者: from sklearn.feature_extraction.text.Vectorizer import transform [as 别名]
def test_vectorizer():
    """End-to-end check of CountVectorizer / TfidfTransformer / Vectorizer.

    Verifies, on the ALL_FOOD_DOCS fixture, that:
    - a vectorizer fitted without a vocabulary and one built from that
      fitted vocabulary produce identical transforms;
    - fixed-list and max_df-derived stop words are excluded;
    - tf-idf and tf-only transforms have the expected shapes and norms;
    - the combined Vectorizer matches CountVectorizer + TfidfTransformer;
    - transforming with an empty vocabulary raises ValueError.
    """
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    # Normalize to CSR so 2-D indexing below works on sparse output.
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert_equal(counts_train[0, v1.vocabulary[u"pizza"]], 2)

    # build a vectorizer v1 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary)

    # compare that the two vectorizer give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()

        assert_equal(counts_test[0, v.vocabulary[u"salad"]], 1)
        assert_equal(counts_test[0, v.vocabulary[u"tomato"]], 1)
        assert_equal(counts_test[0, v.vocabulary[u"water"]], 1)

        # stop word from the fixed list
        assert_false(u"the" in v.vocabulary)

        # stop word found automatically by the vectorizer DF thresholding
        # words that are high frequent across the complete corpus are likely
        # to be not informative (either real stop words of extraction
        # artifacts)
        assert_false(u"copyright" in v.vocabulary)

        # not present in the sample
        assert_equal(counts_test[0, v.vocabulary[u"coke"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"burger"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"beer"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"pizza"]], 0)

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = toarray(t1.fit(counts_train).transform(counts_train))
    assert_equal(len(t1.idf_), len(v1.vocabulary))
    assert_equal(tfidf.shape, (n_train, len(v1.vocabulary)))

    # test tf-idf with new data
    # (counts_test deliberately leaks out of the for-loop above; it holds the
    # transform from the last vectorizer, v2)
    tfidf_test = toarray(t1.transform(counts_test))
    assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary)))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = toarray(t2.fit(counts_train).transform(counts_train))
    # with use_idf=False no idf vector is computed
    assert_equal(t2.idf_, None)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    # train_data iterator was consumed above, so rebuild it
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = Vectorizer(norm='l1')
    tv.tc.max_df = v1.max_df
    tfidf2 = toarray(tv.fit_transform(train_data))
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = toarray(tv.transform(test_data))
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    assert_raises(ValueError, v3.transform, train_data)
开发者ID:aravindgd,项目名称:scikit-learn,代码行数:76,代码来源:test_text.py

示例5: time

# 需要导入模块: from sklearn.feature_extraction.text import Vectorizer [as 别名]
# 或者: from sklearn.feature_extraction.text.Vectorizer import transform [as 别名]
y_L2_cookies = cookies_train.target
y_L2_share = share_train.target


# Extract features
# Fit the sparse vectorizer on the Layer-1 training corpus; all Layer-2
# corpora below are transformed with this same fitted vocabulary so every
# layer lives in one shared feature space.
print "Extracting features from Layer 1 training set using a sparse vectorizer..."
t0 = time()
vectorizer = Vectorizer()
X_L1 = vectorizer.fit_transform(data_train.data)
print "Done in %0.3fs" % (time() - t0)
print "L1:      n_samples: %d, n_features: %d" % X_L1.shape
print

# Transform (no refit) each Layer-2 training set with the L1 vocabulary.
print "Extracting features from Layer 2 training sets using the same vectorizer..."
t0 = time()
X_L2_ca = vectorizer.transform(ca_train.data)
X_L2_collect = vectorizer.transform(collect_train.data)
X_L2_cookies = vectorizer.transform(cookies_train.data)
X_L2_share = vectorizer.transform(share_train.data)
print "Done in %0.3fs" % (time() - t0)
print "CA:      n_samples: %d, n_features: %d" % X_L2_ca.shape
print "Collect: n_samples: %d, n_features: %d" % X_L2_collect.shape
print "Cookies: n_samples: %d, n_features: %d" % X_L2_cookies.shape
print "Share:   n_samples: %d, n_features: %d" % X_L2_share.shape
print


# # Feature selection for the L1 dataset
# select_chi2 = 1000
# print ("Extracting %d best features by a chi-squared test" % select_chi2)
# t0 = time()
开发者ID:YuanhaoSun,项目名称:WebClassifier,代码行数:33,代码来源:classifier.py

示例6: len

# 需要导入模块: from sklearn.feature_extraction.text import Vectorizer [as 别名]
# 或者: from sklearn.feature_extraction.text.Vectorizer import transform [as 别名]
# print "%d categories" % len(data_set.target_names)
print

# load unlabeled data
data_set_unlabel = load_files('Privacypolicy/unlabeled', shuffle = True, random_state = 30)


# Extract features
# Fit a vocabulary-capped TF-IDF vectorizer on the labeled corpus, then
# reuse it (transform only) on the unlabeled corpus so both share features.
print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform(data_set.data)
# L2-normalize rows in place, then densify for the downstream estimator.
X = Normalizer(norm="l2", copy=False).transform(X)
X = X.toarray()

# NOTE(review): X_unlabel is densified but, unlike X, is *not* passed
# through the L2 Normalizer — possibly unintended; confirm with the caller.
X_unlabel = vectorizer.transform(data_set_unlabel.data)
X_unlabel = X_unlabel.toarray()

y = data_set.target

n_samples, n_features = X.shape
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % (n_samples, n_features)
print


def _test_semi(NaiveBayes):


    clf = MultinomialNB(alpha=.01)
    clf.fit(X_train, y_train)
开发者ID:YuanhaoSun,项目名称:PPLearn,代码行数:33,代码来源:18_semiNB_apply.py

示例7: number

# 需要导入模块: from sklearn.feature_extraction.text import Vectorizer [as 别名]
# 或者: from sklearn.feature_extraction.text.Vectorizer import transform [as 别名]
# subjectivity disagreement itens
# Hand-picked privacy-policy passages used as ad-hoc prediction inputs;
# transformed below with the vectorizer fitted earlier in the script.
docs_new = [
"When you register for account credentials, Ning collects certain Personal Information, including your name, email address, and a password that you select. In addition, Network Creators must provide their credit card or other payment information and telephone number (for customer support purposes). Ning or its third party payment providers use this Personal Information related to your billing information solely to administer your services on the Ning Platform and to process your transactions including your purchase of Ning Product Plans, Support Services, and upgrades",
"From time to time, EA employs third party contractors to collect personal information on our behalf to provide email delivery, product, prize or promotional fulfillment, contest administration, credit card processing, shipping or other services on our sites. When requesting these services, you may be asked to supply your name, mailing address, telephone number and email address to our contractors. We ask some third party contractors, such as credit agencies, data analytics or market research firms, to supplement personal information that you provide to us for our own marketing and demographic studies, so that we can consistently improve our sites and related advertising to better meet our visitors' needs and preferences. To enrich our understanding of individual customers, we tie this information to the personal information you provide to us",
"Like many other websites, we also collect information through cookies and other automated means. Cookies are commonly used by websites to save data on your computer. The information we collect from cookies may include your IP address, browser and device characteristics, referring URLs, and a record of your interactions with our websites. We use cookies to create a more personalized shopping experience on our websites",
"The Network Advertising Initiative (NAI) is a self-regulatory cooperative of online marketing and analytics companies. The NAI provides educational content and opt-out tools to help Internet users learn about and address online behavioral marketing practices. Through the NAI's online options, you may opt out of particular NAI network members' behavioral advertising programs or you may opt out of all NAI network members' programs. Opting out will prevent the given network from which you opted out from using your Web preferences and usage patterns to deliver targeted online ads. The NAI opt-out only works with participating third party advertising networks that use cookies and Web beacons to execute their advertising initiatives. If you would like additional information about online behavioral marketing and your options regarding these standard Internet practices, please visit the NAI website",
"""Third Party Information and Content. If you access a FOX Service through a third party connection or log-in, your user submitted information may also include your user ID and/or user name associated with that third party service, any information/content you have permitted the third party to share with FOX, and any information you have made public in connection with that third party service (collectively, "Third Party Information and Content"). Third Party Information and Content obtained in this manner will be governed by this Privacy Policy, any applicable policy of the third party and the terms of use for the FOX Service""",
"Like most web-based services, Ning automatically receives and records information on our server logs from your browser when you use the Ning Platform. We may use a variety of methods, including clear GIFs (also known as web beacons), and cookies to collect this information. The information that we collect with these automated methods may include, for example, your IP address, Ning cookie information, a unique device or user ID, browser type, system type, the content and pages that you access on the Ning Platform, and the referring URL (i.e., the page from which you navigated to the Ning Platform).",
"Other Information We Receive and Store : When you register to use MailChimp, we store 'cookies,' which are strings of code, on your computer. We also use electronic images known as Web beacons. With those cookies, we are aware of and collect information concerning when you visit our Website, when you use MailChimp, your browser type and version, your operating system and platform and other similar information. With Web beacons, we can determine when you open email we send you, and collect other data. You may turn off all cookies that have been placed on your computer by following the instructions on your browser on how to block cookies that have been placed on your computer. However, if you block our cookies it will be more difficult, and maybe impossible, to use the Services",
"EMC strives to keep your personal information accurate. We have implemented technology, management processes and policies to maintain data integrity. We will provide you with access to your information when reasonable, or in accordance with relevant laws, including making reasonable effort to provide you with online access and the opportunity to change your information. To protect your privacy and security, we will take steps to verify your identity before granting access or making changes to your personal information. To access and/or correct information, you can do so online or notify us via the appropriate method below depending on which site is at issue",
"Your information to our service providers. We use service providers who help us to provide you with our services. We give relevant persons working for some of these providers access to your information, but only to the extent necessary for them to perform their services for us. We also implement reasonable contractual and technical protections to ensure the confidentiality of your personal information and data is maintained, used only for the provision of their services to us, and handled in accordance with this privacy policy. Examples of service providers include payment processors, email service providers, and web traffic analytics tools",
"Some Microsoft sites allow you to choose to share your personal information with select Microsoft partners so that they can contact you about their products, services or offers. Other sites, such as MSN instead may give you a separate choice as to whether you wish to receive communications from Microsoft about a partner's particular offering (without transferring your personal information to the third party). See the Communication Preferences section below for more information.",
]

# Project the new documents into the feature space of the fitted vectorizer.
X_new = vectorizer.transform(docs_new)


# Train classifiers
# Instantiate the candidate classifiers compared by this script.
# (Old scikit-learn API: LinearSVC(loss='l2') and SGDClassifier(n_iter=...)
# are the parameter names of the sklearn version this example targets.)
print "Training Classifiers..."
t0 = time()

clf_nb = MultinomialNB()
clf_lsvc = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
# probability=True enables predict_proba on the RBF SVC (slower to fit).
clf_svc = SVC(C=1024, kernel='rbf', degree=3, gamma=0.001, probability=True)
clf_rdg = RidgeClassifier(tol=1e-1)
clf_sgd = SGDClassifier(alpha=.0001, n_iter=50, penalty="l2")

# Logistic regression requires OneVsRestClassifier which hides
# its methods such as decision_function
# It will require extra implementation efforts to use it as a candidate
开发者ID:YuanhaoSun,项目名称:PPLearn,代码行数:32,代码来源:05_multilabel.py

示例8: load_files

# 需要导入模块: from sklearn.feature_extraction.text import Vectorizer [as 别名]
# 或者: from sklearn.feature_extraction.text.Vectorizer import transform [as 别名]
data_test = load_files('./Test/Unlabeled', categories = test_category,
                        shuffle = True, random_state = 42)
print 'data loaded'
print len(data_train.data)
print len(data_test.data)
print

# Extract features
print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()

# NOTE(review): the vectorizer is *fitted on the unlabeled test split* and
# only applied (transform) to the training split below — the reverse of the
# usual fit-on-train convention; confirm this is intentional for the
# one-class-SVM detection setup.
vectorizer = Vectorizer(max_features=10000)
X_test = vectorizer.fit_transform(data_test.data)
X_test = Normalizer(norm="l2", copy=False).transform(X_test)

X = vectorizer.transform(data_train.data)
X = Normalizer(norm="l2", copy=False).transform(X)

# Densify both matrices for the downstream estimator.
X = X.toarray()
X_test = X_test.toarray()

n_samples, n_features = X.shape
test_samples, test_features = X_test.shape
print "done in %fs" % (time() - t0)
print "Train set - n_samples: %d, n_features: %d" % (n_samples, n_features)
print "Test set  - n_samples: %d, n_features: %d" % (test_samples, test_features)
print


# fit the model
# when nu=0.01, gamma=0.0034607 is the smallest to generate >0 result
开发者ID:YuanhaoSun,项目名称:PPLearn,代码行数:33,代码来源:detect_oneclass_svm.py


注:本文中的sklearn.feature_extraction.text.Vectorizer.transform方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。