This article collects typical usage examples of the Python class sklearn.feature_extraction.text.TfidfVectorizer. If you are wondering what TfidfVectorizer does, how to use it, or what it looks like in real code, the selected examples below may help.
Fifteen code examples of the TfidfVectorizer class are shown below, ordered by popularity by default.
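Before the collected examples, here is a minimal, self-contained sketch of the basic TfidfVectorizer workflow. The corpus is made up for illustration; get_feature_names_out is the vocabulary accessor in recent scikit-learn releases (older versions expose get_feature_names instead).

from sklearn.feature_extraction.text import TfidfVectorizer

# A toy corpus, purely for illustration.
corpus = [
    "the cat sat on the mat",
    "the dog chased the cat",
    "dogs and cats are pets",
]

vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
tfidf_matrix = vectorizer.fit_transform(corpus)    # sparse matrix, shape (n_docs, n_terms)

print(tfidf_matrix.shape)
print(vectorizer.get_feature_names_out())          # vocabulary learned from the corpus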
Example 1: genfeature
def genfeature(self, ls_x):
    '''
    a. Shallow features
    1. number of words in the sentence (normalized)
    2. average number of characters in the words
    3. percentage of stop words
    4. minimum, maximum and average inverse document frequency
    :param ls_x: sentences X without labels
    :return: list of feature vectors, one per sentence
    '''
    vectorizer = TfidfVectorizer(stop_words='english', smooth_idf=True, sublinear_tf=False,
                                 use_idf=True)
    tfidf = vectorizer.fit_transform(ls_x)
    array = tfidf.toarray()
    X = []
    append = X.append
    maxtoken = 0
    for idx, l in enumerate(ls_x):
        ws = l.split()
        maxtoken = max(len(ws), maxtoken)
        stops = 0.0
        try:
            # fraction of tokens that are stop words (on Python 3, reduce must be imported from functools)
            stops = round(reduce(lambda x, y: x + 1 if y in self.tweetmanager.stop else x, ws, 0) / (len(ws) + 1e-10), 2)
        except Exception:
            pass
        append([len(ws), self.avgch(ws), stops,
                min(array[idx]), max(array[idx]), sum(array[idx]) / len(array[idx])])
    # normalize the word count by the longest sentence seen
    return [[round(x[0] * 1.0 / maxtoken, 2)] + x[1:] for x in X]
Example 2: get_data
def get_data(self, abstract=False):
    data = self.mongo.get_all(order_by='id_doc')
    data = [doc for doc in data]
    if abstract:
        only_text = self.get_data_with_abstract(data)
    else:
        only_text = [doc['text'] for doc in data]
    only_labels = [doc['label'] for doc in data]
    tfidf_vectorizer = TfidfVectorizer(max_df=0.5,
                                       max_features=200000,
                                       min_df=2,
                                       stop_words='english',
                                       strip_accents='unicode',
                                       use_idf=True,
                                       ngram_range=(1, 1),
                                       norm='l2',
                                       tokenizer=TextUtils.tokenize_and_stem)
    tfidf_matrix = tfidf_vectorizer.fit_transform(only_text)
    print 'After tfidf vectorizer: found %s documents and %s terms' \
        % (tfidf_matrix.shape[0], tfidf_matrix.shape[1])
    dict_out = {}
    for l in sorted(set(only_labels)):
        dict_out[l] = {
            'docs': [],
            'fscore': ''
        }
    for doc in data:
        dict_out[doc['label']]['docs'].append(doc['id_doc'])
    return tfidf_matrix, dict_out
Example 3: tfidf_ize
def tfidf_ize(train, test, node_info):
    vectorizer = TfidfVectorizer(ngram_range=(1, 1))
    vectorizer.fit(node_info.abstract.as_matrix())
    for table in [train, test]:
        table_tfidf_abstract_1 = vectorizer.transform(table.abstract_1.fillna(''))
        table_tfidf_abstract_2 = vectorizer.transform(table.abstract_2.fillna(''))
        table_tfidf_title_1 = vectorizer.transform(table.title_1.fillna(''))
        table_tfidf_title_2 = vectorizer.transform(table.title_2.fillna(''))
        #table['temp27'] = table_tfidf_abstract_1.multiply(table_tfidf_abstract_2).sum(1)
        table.loc[:, 'temp22'] = table_tfidf_abstract_1.minimum(table_tfidf_abstract_2).sum(1)  # Intersection kernel
        table.loc[:, 'temp23'] = table_tfidf_title_1.minimum(table_tfidf_title_2).sum(1)
        table.loc[:, 'temp24'] = table_tfidf_abstract_1.minimum(table_tfidf_title_2).sum(1) \
            + table_tfidf_abstract_2.minimum(table_tfidf_title_1).sum(1)
    vectorizer = TfidfVectorizer(ngram_range=(2, 2))
    vectorizer.fit(node_info.abstract.as_matrix())
    for table in [train, test]:
        table_tfidf_abstract_1 = vectorizer.transform(table.abstract_1.fillna(''))
        table_tfidf_abstract_2 = vectorizer.transform(table.abstract_2.fillna(''))
        table_tfidf_title_1 = vectorizer.transform(table.title_1.fillna(''))
        table_tfidf_title_2 = vectorizer.transform(table.title_2.fillna(''))
        #table['temp27'] = table_tfidf_abstract_1.multiply(table_tfidf_abstract_2).sum(1)
        table.loc[:, 'temp27'] = table_tfidf_abstract_1.minimum(table_tfidf_abstract_2).sum(1)  # Intersection kernel
        table.loc[:, 'temp28'] = table_tfidf_title_1.minimum(table_tfidf_title_2).sum(1)
        table.loc[:, 'temp29'] = table_tfidf_abstract_1.minimum(table_tfidf_title_2).sum(1) \
            + table_tfidf_abstract_2.minimum(table_tfidf_title_1).sum(1)
    return train, test
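The "intersection kernel" used above compares two TF-IDF vectors by summing their element-wise minimums; large values mean the two texts share heavily weighted terms. A small sketch of the same idea on two made-up abstracts (scipy sparse matrices provide the .minimum method used here):

from sklearn.feature_extraction.text import TfidfVectorizer

# Hypothetical abstracts, for illustration only.
docs = ["graph neural networks for citation prediction",
        "citation prediction with graph embeddings"]

vec = TfidfVectorizer(ngram_range=(1, 1)).fit(docs)
a, b = vec.transform([docs[0]]), vec.transform([docs[1]])

# element-wise minimum of the two sparse rows, summed into a single similarity score
intersection = a.minimum(b).sum()
print(intersection)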
Example 4: t_test_accuracy
def t_test_accuracy(topic_id, n_runs, estimator_params_votes_per_doc_tuples):
    """ Test if accuracy for estimators with given parameters is
    significantly better than that of the first estimator in the tuple
    """
    texts, vote_lists, truths = texts_vote_lists_truths_by_topic_id[topic_id]
    vectorizer = TfidfVectorizer()
    text_similarity = cosine_similarity(vectorizer.fit_transform(texts))
    accuracy_arrays = []
    for estimator, args, votes_per_doc in estimator_params_votes_per_doc_tuples:
        stop_idx = votes_per_doc * len(texts)
        # Now get n_runs accuracies and put them into numpy arrays
        accuracies = Parallel(n_jobs=4)(delayed(get_accuracy_sequence)(estimator, stop_idx, texts,
            vote_lists, truths, text_similarity, idx, True, *args) for idx in xrange(n_runs))
        accuracy_arrays.append(np.array(filter(lambda x: x is not None, accuracies)))
    # Baseline
    result_row = []
    result_row.append("%0.2f" % np.mean(accuracy_arrays[0]))
    # T-tests
    for accuracy_array in accuracy_arrays[1:]:
        _, pval = ttest_ind(accuracy_array, accuracy_arrays[0], equal_var=False)
        significance_indicator = lambda p: "*" if p < 0.01 else " "
        is_better = "$" if np.mean(accuracy_array) > np.mean(accuracy_arrays[0]) else " "
        result_row.append("%0.2f %s %s" % (np.mean(accuracy_array), significance_indicator(pval), is_better))
    return "|".join(result_row)
Example 5: tfidf
def tfidf(synopses):
    tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000, min_df=0.2,
                                       stop_words='english', use_idf=True,
                                       tokenizer=tokenize_and_stem, ngram_range=(1, 3))
    tfidf_matrix = tfidf_vectorizer.fit_transform(synopses)
    terms = tfidf_vectorizer.get_feature_names()
    print("terms:", terms)
    print(tfidf_matrix.shape)
    return terms, tfidf_matrix  # return the vocabulary and the TF-IDF matrix
Example 6: read_examples
def read_examples(filename, sparm):
    """Parses an input file into an example sequence."""
    # This reads example files of the type read by SVM^multiclass.
    examples = []
    text = []
    count = 0
    # Open the file and read each example.
    for line in file(filename):
        # Get rid of comments.
        if line.find('#'): line = line[:line.find('#')]
        target, tokens = line.split('::')[0], line.split('::')[1:]
        # If the line is empty, who cares?
        if not tokens: continue
        # Get the target.
        text.append(target)
        # Get the features.
        tokens = [t.split(':') for t in tokens]
        features = [(0, 1)] + [(int(k), float(v)) for k, v in tokens]
        # Add the example to the list
        examples.append((svmapi.Sparse(features), count))
        count += 1
    # Print out some very useful statistics.
    vectorizer = TfidfVectorizer(stop_words='english')
    global tf_idf_transformed_matrix
    tf_idf_transformed_matrix = vectorizer.fit_transform(text)
    print len(examples), 'examples read'
    return examples
Example 7: get_features
def get_features(vocab):
    vectorizer_head = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
    X_train_head = vectorizer_head.fit_transform(headlines)
    vectorizer_body = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
    X_train_body = vectorizer_body.fit_transform(bodies)
    # Calculates the n most important topics of the bodies. Each topic contains all words, ordered by
    # importance. The more important topic words of a certain topic a body contains, the higher its
    # value for that topic.
    lda_body = LatentDirichletAllocation(n_topics=n_topics, learning_method='online', random_state=0, n_jobs=3)
    print("latent_dirichlet_allocation_cos: fit and transform body")
    t0 = time()
    lda_body_matrix = lda_body.fit_transform(X_train_body)
    print("done in %0.3fs." % (time() - t0))
    print("latent_dirichlet_allocation_cos: transform head")
    # apply the LDA trained on body topics to the headlines => if the headlines and bodies share topics,
    # their vectors should be similar
    lda_head_matrix = lda_body.transform(X_train_head)
    #print_top_words(lda_body, vectorizer_body.get_feature_names(), 100)
    print('latent_dirichlet_allocation_cos: calculating cosine distance between head and body')
    # calculate cosine distance between the body and head
    X = []
    for i in range(len(lda_head_matrix)):
        X_head_vector = np.array(lda_head_matrix[i]).reshape((1, -1))  # 1d array is deprecated
        X_body_vector = np.array(lda_body_matrix[i]).reshape((1, -1))
        cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten()
        X.append(cos_dist.tolist())
    return X
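The function above depends on module-level names (headlines, bodies, n_topics) and on the older n_topics argument, which newer scikit-learn releases spell n_components. Below is a minimal, self-contained sketch of the same head/body topic-similarity idea, using made-up data:

import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances

# Hypothetical toy data: one headline paired with one body.
headlines = ["stocks rally after earnings report"]
bodies = ["the company reported strong earnings and its stock rallied sharply"]

vectorizer = TfidfVectorizer(use_idf=False, norm='l2')
X_body = vectorizer.fit_transform(bodies)           # learn the vocabulary on the bodies
X_head = vectorizer.transform(headlines)            # reuse it for the headlines

lda = LatentDirichletAllocation(n_components=5, learning_method='online', random_state=0)
body_topics = lda.fit_transform(X_body)             # topic distribution per body
head_topics = lda.transform(X_head)                 # same topic space for the headlines

# One cosine distance per headline/body pair; small values suggest shared topics.
dists = [cosine_distances(h.reshape(1, -1), b.reshape(1, -1))[0, 0]
         for h, b in zip(head_topics, body_topics)]
print(dists)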
Example 8: gen_data
def gen_data(self, fname):
    """
    :fname : input file, one data point per line
    :rtype : List[List[float]]: data matrix
    """
    lines = [self.langConvHandler.convert(line.strip().lower())
             for line in codecs.open(fname, "rb", "utf-8") if len(line) > 6]
    # lines = list(set(lines))  # remove duplicates
    logging.info("number of data %d " % len(lines))
    cut_lines = [" ".join(jieba.cut(line)) for line in lines]
    # transform to a TF-IDF matrix
    tfidfVec = TfidfVectorizer(max_features=3000)
    tfidf_data = tfidfVec.fit_transform(cut_lines)
    tfidf_data = tfidf_data.toarray()
    # save the original text
    with open("./output/origin_lines.txt", "wb") as fw:
        json.dump(lines, fw)
    # save the vectorized data
    np.save("./output/tfidf.corpus.npy", tfidf_data)
    self.lines = lines
    self.tfidf_data = tfidf_data
Example 9: simple_tfidf_alldocs
def simple_tfidf_alldocs():
    qs = Posts.objects.all()
    docs, post_index_map = vectorize_docs(n_samples=n_samples, log_batch_size=log_batch_size, qs=qs)  # get the doc bodies
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=n_features_init,
                                       ngram_range=(1, n_gram), max_df=0.8)
    tfidf_matrix_raw = tfidf_vectorizer.fit_transform(docs)  # docs x n-gram features
    tfidf_matrix_scaled = scale(tfidf_matrix_raw, with_mean=False)  # can't use sparse matrices unless with_mean=False
    return tfidf_matrix_scaled, post_index_map
Example 10: train_classifier
def train_classifier(download=True, parameters=None, ngram_range=(1, 1)):
    """Train the intent classifier."""
    if download:
        download_wiki()
    path = os.path.join(l.TOPDIR, 'train.json')
    training_set = json.load(open(path))
    path = os.path.join(l.TOPDIR, 'wiki.json')
    wiki_set = json.load(open(path))
    target_names = list(set([i['unit'] for i in training_set + wiki_set]))
    train_data, train_target = [], []
    for example in training_set + wiki_set:
        train_data.append(clean_text(example['text']))
        train_target.append(target_names.index(example['unit']))
    tfidf_model = TfidfVectorizer(sublinear_tf=True,
                                  ngram_range=ngram_range,
                                  stop_words='english')
    matrix = tfidf_model.fit_transform(train_data)
    if parameters is None:
        parameters = {'loss': 'log', 'penalty': 'l2', 'n_iter': 50,
                      'alpha': 0.00001, 'fit_intercept': True}
    clf = SGDClassifier(**parameters).fit(matrix, train_target)
    obj = {'tfidf_model': tfidf_model,
           'clf': clf,
           'target_names': target_names}
    path = os.path.join(l.TOPDIR, 'clf.pickle')
    pickle.dump(obj, open(path, 'w'))
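A rough sketch of how the pickled object produced above might be loaded and used for prediction. The l.TOPDIR path, the clean_text helper, and the pickle layout are assumed to match the function above:

import os
import pickle

# Assumed: same directory layout and module context as train_classifier above.
path = os.path.join(l.TOPDIR, 'clf.pickle')
with open(path, 'rb') as f:
    obj = pickle.load(f)

def predict_unit(text):
    """Return the predicted unit name for a piece of text (assumed pickle layout)."""
    features = obj['tfidf_model'].transform([clean_text(text)])
    label_index = obj['clf'].predict(features)[0]
    return obj['target_names'][label_index]

print(predict_unit("the river is three miles long"))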
Example 11: get_top_terms
def get_top_terms(self, stops=STOPS):
    # vectorize using 1- to 3-grams
    vectorizer = TfidfVectorizer(stop_words=stops, ngram_range=(1, 3))
    tfidf = vectorizer.fit_transform(self.docs)
    # enumerate feature names, i.e. the actual words
    self.feature_names = vectorizer.get_feature_names()
    # convert to a dense array
    dense = tfidf.todense()
    # container for top terms per doc
    self.features = []
    for doc in dense:
        doc = doc.tolist()[0]
        # creates a list of tuples, (term_id, score)
        phrase_scores = [pair for pair in zip(range(0, len(doc)), doc) if pair[1] > 0]
        # feature_ids = sorted(phrase_scores, key=lambda t: t[1] * -1)
        doc_features = []
        for f_ in phrase_scores:
            fname = self.feature_names[f_[0]]
            fscore = f_[1]
            doc_features.append((fscore, fname))
        top_terms = sorted(doc_features, reverse=True)  # [:n_terms]
        # top_terms = ",".join([x[1] for x in top_terms])
        self.features.append(top_terms)
Example 12: classify
def classify(clf, chapter_contents_train, y_train, chapter_contents_test, k=20):
    # convert the training data text to features using TF-IDF vectorization
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    X_train = vectorizer.fit_transform(chapter_contents_train)
    # X_train_array = X_train.toarray()
    # print "tfidf vector length: ", len(X_train_array)  # dbg
    # print "X_train_array[0] length: ", len(X_train_array[0])  # dbg
    # use only the best k features according to chi-squared selection
    ch2 = SelectKBest(chi2, k=k)
    X_train = ch2.fit_transform(X_train, y_train)
    # determine the actual features used after best-k selection
    feature_names = np.asarray(vectorizer.get_feature_names())
    chisq_mask = ch2.get_support()
    features_masks = zip(feature_names, chisq_mask)
    selected_features = [z[0] for z in features_masks if z[1]]
    # train the classifier
    clf.fit(X_train, y_train)
    # convert the test data text into features using the same vectorizer as for training
    X_test = vectorizer.transform(chapter_contents_test)
    X_test = ch2.transform(X_test)
    # obtain binary class predictions for the test set
    preds = clf.predict(X_test)
    return preds, selected_features, clf
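A hypothetical call to the function above, with a multinomial Naive Bayes classifier and made-up chapter texts; it assumes a scikit-learn release where vectorizer.get_feature_names() (used inside classify) is still available:

from sklearn.naive_bayes import MultinomialNB

# Made-up training/test data for illustration only.
train_texts = ["the hero meets the wizard", "a battle rages in the valley",
               "the wizard casts a spell", "armies clash at dawn"]
y_train = [0, 1, 0, 1]          # e.g. 0 = magic chapter, 1 = battle chapter
test_texts = ["the spell fails", "the siege begins"]

preds, selected_features, model = classify(MultinomialNB(), train_texts, y_train, test_texts, k=5)
print(preds, selected_features)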
Example 13: tfidf_covariance
def tfidf_covariance(texts, savepath):
    if not savepath.endswith("/"):
        savepath = savepath + "/"
    if os.path.exists(savepath + "__linkage_average.npy"):
        Z = np.load(savepath + "__linkage_average.npy")
    else:
        if not os.path.exists(savepath):
            os.makedirs(savepath)
        from sklearn.feature_extraction.text import TfidfVectorizer
        vectorizer = TfidfVectorizer(input=str,
                                     strip_accents='ascii',
                                     analyzer='word',
                                     max_features=5000)
        y = vectorizer.fit_transform(" ".join(text) for text in texts)
        Z = linkage(y.todense(), method='average', metric='euclidean')
        np.save(savepath + "__linkage_average.npy", Z)
    if os.path.exists(savepath + "__covariance__.npy"):
        Cov = np.load(savepath + "__covariance__.npy")
        observables = HierarchicalObservation(Cov)
    else:
        root, nodes = to_tree(Z, rd=True)
        assign_parents(root)
        adj_mat = get_adjacency_matrix(nodes)
        deg_mat = get_degree_matrix(nodes)
        sigma = 5
        laplacian = np.diag(deg_mat) - adj_mat + 1 / (sigma ** 2) * np.eye(len(deg_mat))
        Cov = np.linalg.inv(laplacian)[:len(texts), :len(texts)]
        np.save(savepath + "__covariance__.npy", Cov)
        observables = HierarchicalObservation(Cov)
    return observables
Example 14: fit
def fit(self, docs, clean=False):
    '''
    pipeline: clean, tokenize, tfidf, nmf, kmeans
    '''
    if clean:
        print 'cleaning raw docs ......'
        clean_docs = self.clean(docs)
    else:
        clean_docs = docs
    print 'running tfidf ......'
    if 'tokenizer' not in self.kw_tfidf:
        self.tfidf = TfidfVectorizer(tokenizer=self.tokenize,
                                     **self.kw_tfidf)
    else:
        self.tfidf = TfidfVectorizer(**self.kw_tfidf)
    X = self.tfidf.fit_transform(clean_docs)
    print 'running NMF ......'
    self.nmf = NMF(**self.kw_nmf)
    H = self.nmf.fit_transform(X)
    W = self.nmf.components_
    print 'fetching top 50 words for each topic ......'
    self.top_n_words(50, W)
    return X, H, W
Example 15: MedicalKeywordTfIdf
class MedicalKeywordTfIdf(BaseEstimator, TransformerMixin):
    MEDICAL_KEYWORDS = ["Medical_Keyword_" + str(i) for i in range(1, 49)]

    def __init__(self):
        self._vec = TfidfVectorizer(max_df=0.95, min_df=2)

    def get_feature_names(self):
        return [x + "_TFIDF" for x in self._vec.get_feature_names()]

    def get_data_array(self, df):
        # join the names of the keyword columns set to 1 into one pseudo-document per row
        return df[self.MEDICAL_KEYWORDS] \
            .apply(lambda x: " ".join(x[x == 1].index), axis=1).values

    def fit(self, df, y=None):
        data_arr = self.get_data_array(df)
        self._vec.fit(data_arr)
        return self

    def transform(self, df):
        data_arr = self.get_data_array(df)
        return self._vec.transform(data_arr).toarray()
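A minimal usage sketch for this transformer, assuming a pandas DataFrame with binary Medical_Keyword_1 … Medical_Keyword_48 columns. The data below is made up; min_df=2 means a keyword must occur in at least two rows to survive, and the get_feature_names call mirrors the class above, which relies on an older scikit-learn release (newer ones expose get_feature_names_out):

import numpy as np
import pandas as pd

# Hypothetical toy frame: 4 rows, 48 binary keyword columns.
rng = np.random.RandomState(0)
columns = ["Medical_Keyword_" + str(i) for i in range(1, 49)]
df = pd.DataFrame(rng.randint(0, 2, size=(4, 48)), columns=columns)

transformer = MedicalKeywordTfIdf()
features = transformer.fit(df).transform(df)    # dense TF-IDF matrix over keyword names
print(features.shape)
print(transformer.get_feature_names()[:5])      # e.g. ['medical_keyword_10_TFIDF', ...]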