

Python TfidfVectorizer.inverse_transform Method Code Examples

This article collects typical code examples of the Python method sklearn.feature_extraction.text.TfidfVectorizer.inverse_transform. If you are wondering what exactly TfidfVectorizer.inverse_transform does, how to call it, or what real-world usage looks like, the curated examples below may help. You can also explore further usage examples of the class this method belongs to, sklearn.feature_extraction.text.TfidfVectorizer.


The following shows 7 code examples of TfidfVectorizer.inverse_transform, sorted by popularity by default.
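Before the examples, here is a minimal orientation sketch (hypothetical two-document corpus): given a document-term matrix, or any array with one column per vocabulary term, inverse_transform returns for each row the array of terms that have a non-zero entry.

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat on the mat", "the dog chased the cat"]
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(docs)              # sparse matrix of shape (n_docs, n_terms)
terms_per_doc = vectorizer.inverse_transform(X)
# terms_per_doc holds one array per document, containing the terms whose tf-idf
# weight is non-zero in that document (order within each array is not guaranteed).
for i, terms in enumerate(terms_per_doc):
    print(i, sorted(terms))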

Example 1: buildVectorizer

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import inverse_transform [as alias]
def buildVectorizer(bio):
    # tag() is an external part-of-speech tagger (e.g. pattern.en.tag or a similar
    # wrapper around nltk.pos_tag), assumed to be imported elsewhere in the module.
    nounlist = []
    for doc in bio:
        st = ""
        for (word, pos) in tag(doc):
            if pos in ["JJ", "NNS", "NN", "NNP"]:
                st = st + word + " "
            else:
                if st != "":
                    st = st[0:-1] + " "
                    #print "got one"
        nounlist.append(st)
    sciencestopwords = set([u'model','according', 'data', u'models', 'function', 'properties', 'approach', 'parameters', 
                    'systems', 'number', 'order', u'data', 'analysis', u'information', u'journal',
                    'results','using','research', 'consumers', 'scientists', 'model', 'models', 'journal',
                    'researchers','paper','new','study','time','case', 'simulation', u'simulation', 'equation',
                    'based','years','better', 'theory', 'particular','many','due','much','set', 'studies', 'systems',
                    'simple', 'example','work','non','experiments', 'large', 'small', 'experiment', u'experiments',
                    'provide', 'analysis', 'problem', 'method', 'used', 'methods'])
    # Build the tf-idf vectorizer; nltk is assumed to be imported at module level
    # for the English stop-word list.
    from sklearn.feature_extraction.text import TfidfVectorizer
    english = nltk.corpus.stopwords.words('english')
    newstop = english + list(sciencestopwords)
    vectorizer = TfidfVectorizer(min_df=1, max_df=.5, stop_words=newstop, decode_error='ignore')
    X = vectorizer.fit_transform(nounlist)
    Xinv = vectorizer.inverse_transform(X)
    # X is a sparse matrix of docs x vocab size (7638), so X[doc_num] is the sparse
    # tf-idf vector of that doc's words, with each row L2-normalized (||X[doc_num]|| = 1).
    # Here there are 7638 unique words and 755 docs, with 38888 non-zeros in total.
    # Xinv[doc_num] is the list of words in the doc.

    return nounlist, vectorizer, X, Xinv
Developer: dbgannon, Project: sciml, Lines: 34, Source file: doc_analysis_final.py
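As a quick, hypothetical usage check of the statistics quoted in the comments above (assuming bio is the list of documents this function was written for), the vocabulary size, document count and number of non-zero tf-idf entries can be read directly off X:

nounlist, vectorizer, X, Xinv = buildVectorizer(bio)
n_docs, n_terms = X.shape      # e.g. 755 documents x 7638 vocabulary terms
print(n_docs, n_terms, X.nnz)  # X.nnz counts the non-zero tf-idf entries (38888 above)
print(Xinv[0])                 # the words appearing in the first document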

Example 2: test_add_hashtag_bow_to_graph

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import inverse_transform [as alias]
    def test_add_hashtag_bow_to_graph(self):
        g = IU.add_hastag_bow_to_graph(self.g_undecom)
        tfidf = TfidfVectorizer(preprocessor=None,
                                tokenizer=lambda s: s.split(),
                                stop_words=None)
        tfidf.fit([' '.join(g.node[n]['hashtags'])
                   for n in g.nodes_iter()])

        for n in g.nodes_iter():
            assert_true(issparse(g.node[n]['hashtag_bow']))
            assert_equal(
                sorted(g.node[n]['hashtags']),
                sorted(
                    tfidf.inverse_transform(
                        g.node[n]['hashtag_bow']
                    )[0].tolist()
                )
            )
Developer: xiaohan2012, Project: lst, Lines: 20, Source file: test_interactions.py

Example 3: main

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import inverse_transform [as alias]
def main(args):
    logger.debug("Arguments: %r", args)
    tfidf_vect = TfidfVectorizer(
        preprocessor=get_preprocessor(args.fields),
        analyzer='word',  # maybe callable
        token_pattern=r'\b[a-z]\w+\b',
        ngram_range=(args.min_ngrams, args.max_ngrams),
        max_df=args.max_df,
        max_features=args.max_features,
        sublinear_tf=args.sublinear_tf,
        stop_words=STOP_WORDS,
        norm=args.norm,
    )

    with LogRuntime("Loaded input data in {elapsed} seconds", logger):
        data = get_data(args)
    if data:
        logger.debug("Corpus size: {0}".format(len(data)))
    else:
        logger.error("Empty data")
        return

    with LogRuntime("Fitted in {0.elapsed} seconds", logger):
        X = tfidf_vect.fit_transform(data)

    logger.debug("Vocabulary size: {}".format(len(tfidf_vect.vocabulary_)))
    logger.debug("Max DF stop words size: {}".format(
        len(tfidf_vect.stop_words_)))
    logger.debug("Stop words size: {}".format(len(tfidf_vect.stop_words)))

    if args.clusters:
        true_k = args.clusters
    else:
        # ref: http://en.wikipedia.org/wiki/Determining_the_number_of_clusters_in_a_data_set#Finding_Number_of_Clusters_in_Text_Databases
        m_docs, n_terms = X.shape
        t_nonzeros = len(X.nonzero()[0])
        true_k = (m_docs * n_terms) / t_nonzeros
        logger.debug("Calculated number of clusters: {}".format(true_k))

    if args.minibatch:
        km = MiniBatchKMeans(
            n_clusters=true_k,
            init='k-means++',
            n_init=10,
            init_size=1000,
            batch_size=1000,
            verbose=-1)
    else:
        km = KMeans(
            n_clusters=true_k,  # use the computed/requested cluster count, not the raw arg
            init='random',
            max_iter=100,
            n_init=10,
            verbose=1,
            n_jobs=-1)

    with LogRuntime("KMeans Fitted in {0.elapsed} seconds", logger):
        km.fit(X)

    if args.sample_random and args.sample_size:
        sample = [
            data[i]
            # randint's upper bound is exclusive, so indices stay within range
            for i in np.random.randint(0, len(data), args.sample_size)
        ]
    elif args.sample_size:
        sample = data[args.sample_skip:args.sample_size]
    else:
        sample = data

    Y = tfidf_vect.transform(sample)
    sample_terms = tfidf_vect.inverse_transform(Y)

    labels = km.predict(Y)
    distances = km.transform(Y)
    center_terms = tfidf_vect.inverse_transform(km.cluster_centers_)

    clusters = defaultdict(list)
    vocabulary = tfidf_vect.vocabulary_

    for i, doc in enumerate(sample):
        clusters[labels[i]].append((i, doc))

    truncate = lambda t: t[:100] + '...' if len(t) > 100 else t

    for label, result in sorted(clusters.iteritems()):
        # skip single results
        if len(result) < args.cluster_minsize:
            continue
        terms_joined = ', '.join(
            sorted(
                center_terms[label],
                reverse=True,
                key=lambda t: km.cluster_centers_[label, vocabulary[t]]))
        print '=' * 79
        print '=' * 79
        print '=' * 79
        print '-> ' + truncate(terms_joined) + '\n\n'
        result = sorted(
            result,
            key=lambda (i, _): distances[i, label],
#......... part of the code omitted here .........
Developer: rolando-archive, Project: yatiri, Lines: 103, Source file: run_clustering.py
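A note on the cluster-centre trick used above: km.cluster_centers_ has the same column layout as the tf-idf matrix, so passing it to inverse_transform yields, for each cluster, the vocabulary terms with non-zero weight in its centroid, which is a cheap way to label clusters. A condensed sketch with hypothetical data:

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["python sklearn tfidf text", "kmeans clustering text mining",
        "tfidf vectorizer python sklearn", "kmeans clustering mining"]
vect = TfidfVectorizer()
X = vect.fit_transform(docs)

km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X)
# cluster_centers_ is a dense (n_clusters, n_terms) array, so inverse_transform
# returns the terms that have non-zero weight in each centroid.
for label, terms in enumerate(vect.inverse_transform(km.cluster_centers_)):
    print(label, sorted(terms))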

Example 4: DSOM

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import inverse_transform [as alias]
class DSOM(object):
    
    def __init__(self, inputFile=None, fileType=None, widthOfMap=2, useGPU=True):
        self.inputFile = inputFile
        self.fileType = fileType
        self.widthOfMap = widthOfMap
        self.useGPU = useGPU
        self.arrayTrain = []
        self.Y = None
        self.vectorizer = None
        self.nodeHolder = dict()
        self.text = ""
        self.dataset = ""
        
        
        
    def readDocument(self):
        if(self.fileType == 'pdf'):
            self.text = readPDF.pdfparser(self.inputFile)
        else:   
            self.text = open(self.inputFile, "r").read()    
        self.dataset = self.text.split("\n\n")         
        
    def train(self, inputFile=None):
        ###############################################################################
        #clean_file = open("data/paragraph_vector_output (copy).txt")
        #dataset = clean_file.read().split("\n\n")
    #     print(dataset)
    #     print("%d Paragraphs " % len(dataset))
    #     print()
    #     print("Extracting features from the dataset using a sparse vectorizer")
        #t0 = time()
        self.vectorizer = TfidfVectorizer(max_df=0.5, max_features=1000,
                                         min_df=2, stop_words='english',
                                         use_idf=True, sublinear_tf=True)
        self.Y = self.vectorizer.fit_transform(self.dataset)
        
        #arrayTrain = X.toarray()
        svd = TruncatedSVD(n_components=100, random_state=42)
        X = svd.fit_transform(self.Y)
        self.arrayTrain = X
        #print("done in %fs" % (time() - t0))
        #print("n_samples: %d, n_features: %d" % X.shape)
        #print()
        ###############################################################################
        ## SOM
        #For plotting the images
        
        #Train a 20x30 SOM with 400 iterations
        #print("<-- Starting SOM -- >")
        mapSide = self.widthOfMap
        som = SOM.SOM(DATA=self.arrayTrain, num_units=mapSide*mapSide, width=mapSide, height=mapSide)
        #print("<-- Training SOM -- >")
        #t0 = time()
        if(self.useGPU == True):
            try:
                import theano.sandbox.cuda
                theano.sandbox.cuda.use('gpu')
            except: 
                print("Switching to GPU didn't work, will fallback to CPU.")
            som.train_batch_theano(verbose=False)
        else:
            som.train_batch(verbose=False)
        #print("<-- Done Training SOM %fs -- >" %(time()-t0))
        #Get output grid
        #print("<-- Testing SOM -- >")
        #print("<-- Begin Output -- >")
        #np.set_printoptions(threshold='nan')
        clusters = som.ins_unit_assign
        #print(clusters)
        
        
        for i in range(mapSide*mapSide):
                self.nodeHolder[i] = []
                
        for i, m in enumerate(clusters):
            if (m) in self.nodeHolder:
                self.nodeHolder[m].append(i)
            else:
                self.nodeHolder[m] = [i]
    
    def getClusters(self):
        return self.nodeHolder
    
    def getDataset(self):
        return self.dataset
    
    def tfIDFArray(self):
        inverse = self.vectorizer.inverse_transform(self.Y)
        outList = []
        for x in inverse:
            outList.append([y.encode('UTF8') for y in x])
        return outList
Developer: Glavin001, Project: textFlowAnalysis, Lines: 95, Source file: DocumentRSOM.py
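A small aside on tfIDFArray above: the encode('UTF8') call reflects Python 2, where inverse_transform returns unicode terms. Under Python 3 the terms are already str, so a hypothetical equivalent reduces to:

def tfidf_array(vectorizer, Y):
    # Each row of Y maps back to the array of terms with non-zero tf-idf weight.
    return [list(terms) for terms in vectorizer.inverse_transform(Y)]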

Example 5: summarize_cisco_support_forum_texts

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import inverse_transform [as alias]
def summarize_cisco_support_forum_texts():
    # cisco_plain_text = LazyCorpusLoader(
    #    'content', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin_1')
    cisco_plain_text = LazyCorpusLoader(
        "cisco_forum_subset", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin_1"
    )
    token_dict = {}
    for article in cisco_plain_text.fileids():
        token_dict[article] = cisco_plain_text.raw(article)

    tfidf = TfidfVectorizer(tokenizer=tokenize_and_stem, stop_words="english", decode_error="ignore")

    sys.stdout.flush()

    # creates Compressed Sparse Row format numpy matrix
    tdm = tfidf.fit_transform(token_dict.values())
    feature_names = tfidf.get_feature_names()

    # problem_statement_#1 - summarize support_forum articles automatically
    for article_id in range(0, tdm.shape[0] - 2):
        article_text = cisco_plain_text.raw(cisco_plain_text.fileids()[article_id])
        sent_scores = []
        for sentence in nltk.sent_tokenize(article_text):
            score = 0
            sent_tokens = tokenize_and_stem(sentence)
            for token in (t for t in sent_tokens if t in feature_names):
                score += tdm[article_id, feature_names.index(token)]
            sent_scores.append((score / len(sent_tokens), sentence))
        summary_length = int(math.ceil(len(sent_scores) / 5))
        sent_scores.sort(key=lambda sent: sent[0])
        print "\n*** SUMMARY ***"
        for summary_sentence in sent_scores[:summary_length]:
            print summary_sentence[1]
        print "\n*** ORIGINAL ***"
        print article_text

    # problem_statement_#2 - automatically categorize forum posts by tags into various groups
    reduce_dimensionality_and_cluster_docs(tfidf, tdm, num_features=200)

    # problem_statement_#3 - find similar documents to a current document (that user is reading) automatically
    # eg - quora: find similar questions, find similar answers
    cosine_similarity(tdm[0:1], tdm)
    """
    output looks like this
    array([[ 1.        ,  0.22185251,  0.0215558 ,  0.03805012,  0.04796646,
         0.05069365,  0.05507056,  0.03374501,  0.03643342,  0.05308392,
         0.06002623,  0.0298806 ,  0.04177088,  0.0844478 ,  0.07951179,
         0.02822186,  0.03036787,  0.11022385,  0.0535391 ,  0.10009412,
         0.07432719,  0.03753424,  0.06596462,  0.01256566,  0.02135591,
         0.13931643,  0.03062681,  0.02595649,  0.04897851,  0.06276997,
         0.03173952,  0.01822134,  0.04043555,  0.06629454,  0.05436211,
         0.0549144 ,  0.04400169,  0.05157118,  0.05409632,  0.09541703,
         0.02473209,  0.05646599,  0.05728387,  0.04672681,  0.04519217,
         0.04126276,  0.06289187,  0.03116767,  0.04828476,  0.04745193,
         0.01404426,  0.04201325,  0.023492  ,  0.07138136,  0.03778315,
         0.03677206,  0.02553581]])
    The first document is compared to the rest, with the most similar to it being itself with score of 1, next most similar to it is document with score 0.22185251
    """

    cosine_similarities = linear_kernel(tdm[0:1], tdm).flatten()

    # mapping back to document_name space
    related_docs_indices = cosine_similarities.argsort()
    """
    document_ids
    array([23, 50, 31, 24,  2, 52, 40, 56, 27, 15, 11, 16, 26, 47, 30,  7,  8,
       55, 21, 54,  3, 32, 45, 12, 51, 36, 44, 43, 49,  4, 48, 28,  5, 37,
        9, 18, 38, 34, 35,  6, 41, 42, 10, 29, 46, 22, 33, 53, 20, 14, 13,
       39, 19, 17, 25,  1,  0])

       docs 0 and 1 are very similar which are the following posts (last 2 array elements above when sorted)
        https://supportforums.cisco.com/discussion/11469881/aniserver-failed-run-lms-40
        and
        supportforums.cisco.com/discussion/11469606/eos-lms-31-support-quest
    """

    cosine_similarities[related_docs_indices]
    for key, value in token_dict.iteritems():
        print key, value
    # find the actual posts which are the most similar
    tfidf.inverse_transform(tdm)[0]
    tfidf.inverse_transform(tdm)[1]
Developer: lelakshm, Project: texata2015-hackathon, Lines: 84, Source file: suhas_satish_solution.py
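The similar-document lookup at the end of this example condenses to a few lines: compute cosine similarities of one document against the whole matrix, take the highest-scoring other document, and use inverse_transform to inspect its terms. A hypothetical sketch:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

docs = ["lms 4.0 aniserver failed to run", "eos for lms 3.1 support question",
        "configuring vlan trunking on a switch"]
tfidf = TfidfVectorizer(stop_words="english")
tdm = tfidf.fit_transform(docs)

# linear_kernel on L2-normalised tf-idf rows equals cosine similarity.
sims = linear_kernel(tdm[0:1], tdm).flatten()
most_similar = sims.argsort()[-2]   # [-1] is the document itself (similarity 1.0)
print(most_similar, tfidf.inverse_transform(tdm[most_similar])[0])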

Example 6: CommentsAnalyzer

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import inverse_transform [as alias]
class CommentsAnalyzer(pmlutil.Configurable):
    
    def configTypes(self):
        return dict(amount=int, min_ngram=int, max_ngram=int, min_df=int, max_df=float, use_idf=int, alpha=readArray, l1_ratio=readArray, n_folds=int)

    def _loadData(self):
        logging.info("loading data")
        self.data = []
        count = 0
        for fn in os.listdir(self._datafolder):
            if not self._amount < 1 and count >= self._amount:
                break
            if fn.endswith(self._metaextension):
                mfn = self._datafolder + "/" + fn
                ddm = pml.Datum(mfn,None)
                if len(ddm.meta()['comments'])>0:
                    self.data.append(ddm)
                    count +=1
        logging.info("loaded %d data" % count)

    def __init__(self):
        self.data=[]

    def _aggregateComments(self, subset):
        allcomments = []
        for datum in subset:
            comments = []
            for comment in datum.meta()['comments']:
                comments.append(comment['text'])
            allcomments.append(" ".join(comments))
        return np.array(allcomments)

    def _buildDictionary(self, allcomments):
        print allcomments
        self.vectorizer = TfidfVectorizer(analyzer=self._analyzer, ngram_range=(self._min_ngram,self._max_ngram),
                                     min_df=self._min_df, max_df=self._max_df, norm='l2', smooth_idf=True, use_idf=bool(self._use_idf))
        self.vectorizer.fit(allcomments)

    def run(self):
        allcomments = self._aggregateComments(self.data)
        self._buildDictionary(allcomments)

        # create representation of documents
        tfidfArray = self.vectorizer.transform(allcomments)

        # create labelling
        labels = []
        for datum in self.data:
            labels.append(len(datum.meta()['favorites']))
        labels = np.array(labels)

        print self.vectorizer.get_params()
        print self.vectorizer.get_feature_names()

        # training
        self.elasticNet = ElasticNetCV(alphas=self._alpha, l1_ratio=self._l1_ratio, fit_intercept=True, normalize=False, precompute='auto', max_iter=1000, copy_X=True, tol=0.0001, rho=None, cv=self._n_folds)
        self.elasticNet.fit(tfidfArray,labels)

        for i,l1_ratio in enumerate(self._l1_ratio):
            for j,alpha in enumerate(self._alpha):
                print "alpha: %f, l1_ratio: %f --> %f" % (alpha,l1_ratio,np.mean(self.elasticNet.mse_path_[i,j,:]))

        print self.vectorizer.inverse_transform(self.elasticNet.coef_)
Developer: yk, Project: pml14publish, Lines: 65, Source file: commentsshell.py
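The final print above passes the ElasticNet coefficient vector to inverse_transform to see which n-gram features received non-zero weights. Newer scikit-learn releases expect a 2-D input here, so the 1-D coef_ vector should be reshaped to a single row; a minimal sketch with hypothetical data:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import ElasticNet

comments = ["great photo love it", "nice colours", "boring shot", "love the light"]
favorites = np.array([10, 4, 0, 7])   # regression targets, one per document

vect = TfidfVectorizer()
X = vect.fit_transform(comments)

enet = ElasticNet(alpha=0.01, l1_ratio=0.5).fit(X, favorites)
# coef_ is 1-D (one weight per vocabulary term); reshape it to a single row so
# that inverse_transform returns the terms whose coefficient is non-zero.
selected = vect.inverse_transform(enet.coef_.reshape(1, -1))[0]
print(selected)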

Example 7: main

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import inverse_transform [as alias]

#......... part of the code omitted here .........
                article_info.popitem()
            else:
                article_info[article['newid']]['topic'].append(topic_list)


        '''
        Extracting the dictionary of features into a .csv file
        Format :
            Article ID,Topic,Place, Label
            20057,[u'south-korea'],[],TEST
        '''

    # with open('dictionary.csv', 'wb') as f:
    #     f.write('Article ID,Topic,Place,Label')
    #     f.write('\n')
    #     for key, value in article_info.iteritems():
    #         f.write(key)
    #         f.write(',')
    #         for inner_key,inner_value in value.items():
    #             f.write(str(inner_value))
    #             f.write(',')
    #         f.write('\n')

    print 'No of valid articles = {}'.format(len(article_list))
    # Create a global list of topics, used during tokenization (in the tokenize function) to make sure feature words do not belong to the topic list
    global topics_list
    topics_list = list()
    topics = getTopics(article_info,[])

    for topic_article in topics:
        if topic_article:
            for topic in topic_article:
                if topic:
                    for top in topic:
                        topics_list.append(top)


    with open('topic_labels', 'wb') as outfile:
        pickle.dump(topics, outfile, pickle.HIGHEST_PROTOCOL)

        # with open('initial_word_count.txt', 'wb') as ini:
        #     sum =0
        #     for word in article_list:
        #         sum += len(word.split())
        #  ini.write('Total words in body tag of all the 21578 documents initially :'+str(sum))


    vectorizer = TfidfVectorizer(min_df= 0.001,max_df=0.9, tokenizer=tokenize, strip_accents='unicode', smooth_idf=True)

    feature_vector = vectorizer.fit_transform(article_list)

    feature_list = vectorizer.get_feature_names()

    with open('feature_vector', 'wb') as outfile:
        pickle.dump(feature_vector, outfile, pickle.HIGHEST_PROTOCOL)

    with open('features_list', 'wb') as f:
        pickle.dump(feature_list, f, pickle.HIGHEST_PROTOCOL)

# with open('feature_list.csv','wb') as feature:
#     for value in feature_list:
#         feature.write(str(value)+'\n')

    counter_vectorizer = CountVectorizer(vocabulary=vectorizer.vocabulary_, strip_accents='unicode')

    # for the word frequency counts
    data_matrix = counter_vectorizer.fit_transform(article_list)  # data matrix
    transaction_matrix = vectorizer.inverse_transform(feature_vector)  # transaction matrix
    # terms = counter_vectorizer.get_feature_names()
    # freqs = data_matrix
    # result = dict(zip(terms, freqs))
    # print result
    # print(len(result))



## Un-comment from here to generate data_matrix and transaction_matrix

# with open('data_matrix.dat', 'wb') as outfile:
#     pickle.dump(data_matrix, outfile, pickle.HIGHEST_PROTOCOL)
#
# with open('transaction_matrix.dat', 'wb') as outfile:
#     pickle.dump(transaction_matrix, outfile, pickle.HIGHEST_PROTOCOL)

# with open('unigram_word_count.txt','wb') as ini:
#         sum = len(vectorizer.get_feature_names())
#         ini.write('Total words in body tag remaining after stemming , removing stop words and computing tf-idf counts :'+str(sum))

    bigram_vectorizer = TfidfVectorizer(min_df=0.001, tokenizer=tokenize, ngram_range=(2,2), strip_accents='unicode', max_df=0.9, smooth_idf=True)

    bigram_feature_vector = bigram_vectorizer.fit_transform(article_list)

    indices = np.argsort(bigram_vectorizer.idf_)[::-1]
    features = bigram_vectorizer.get_feature_names()
    top_n = 20
    top_features = [features[i] for i in indices[:top_n]]
    print top_features
    # with open('top_20_bigrams.txt','wb') as ini:
    #          ini.write(str(top_features))
    print("Done in %0.3fs" % (time() - t0))
Developer: anirban1992, Project: mining-project, Lines: 104, Source file: data_parser.py


Note: The sklearn.feature_extraction.text.TfidfVectorizer.inverse_transform examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers, and copyright in the source code remains with the original authors. Please refer to the corresponding project's license before distributing or using the code; do not republish without permission.