当前位置: 首页>>代码示例>>Python>>正文


Python LatentDirichletAllocation.transform方法代码示例

本文整理汇总了Python中sklearn.decomposition.LatentDirichletAllocation.transform方法的典型用法代码示例。如果您正苦于以下问题:Python LatentDirichletAllocation.transform方法的具体用法?Python LatentDirichletAllocation.transform怎么用?Python LatentDirichletAllocation.transform使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在sklearn.decomposition.LatentDirichletAllocation的用法示例。


在下文中一共展示了LatentDirichletAllocation.transform方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: score_lda

# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import transform [as 别名]
def score_lda(src, dst):
    """Score tab-separated sentence pairs by LDA topic similarity.

    Each line of ``src`` must hold at least two tab-separated fields; the
    first two are treated as a sentence pair.  An LDA model is fitted on all
    sentences, every sentence is mapped to its topic distribution, and each
    pair is scored as ``5 * cosine_similarity`` rounded to two decimals.
    One score per line is written to ``dst``.

    Relies on the module-level ``n_topics`` for the number of topics.

    Raises:
        IndexError: if a line of ``src`` contains no tab character.
    """
    # Read the sentence pairs into two parallel lists.
    b1 = []
    b2 = []
    with open(src) as p:
        for line in p:
            # Strip only the line terminator.  Slicing off the last character
            # unconditionally would eat a real character whenever the final
            # line has no trailing newline or the row has extra fields.
            s = line.rstrip('\r\n').split('\t')
            b1.append(s[0])
            b2.append(s[1])

    vectorizer = CountVectorizer()
    vectors = vectorizer.fit_transform(b1 + b2)

    # NOTE(review): newer scikit-learn releases spell this keyword
    # `n_components` -- confirm against the version this project pins.
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online', learning_offset=50.,
                                    random_state=0)
    X = lda.fit_transform(vectors)
    print(X.shape)
    b1_vecs = lda.transform(vectorizer.transform(b1))
    b2_vecs = lda.transform(vectorizer.transform(b2))

    # cosine() is a distance, so 1 - distance is the similarity; scale to 0..5.
    res = [round(5 * (1 - spatial.distance.cosine(v1, v2)), 2)
           for v1, v2 in zip(b1_vecs, b2_vecs)]
    with open(dst, 'w') as thefile:
        thefile.write("\n".join(str(score) for score in res))
开发者ID:wintor12,项目名称:SemEval2015,代码行数:30,代码来源:run.py

示例2: applyLDA2

# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import transform [as 别名]
    def applyLDA2(self, number_of_clusters, country_specific_tweets):
        """Fit LDA on the tweets' features and derive one label per document.

        Features come from ``self.extractFeatures``; the fitted topics and the
        per-document topic distribution are handed to
        ``self.printTopicCluster`` to obtain the top terms and labels.
        Progress and the silhouette score are printed only when
        ``self.results`` is truthy.

        Returns:
            tuple: ``(name, parameters, top10, labels)`` where ``name`` is the
            string ``"lda"``, ``parameters`` the estimator's ``get_params()``
            and ``labels`` a numpy array.
        """
        name = "lda"
        train, feature_names = self.extractFeatures(country_specific_tweets, False)

        if self.results:
            print("Fitting LDA model with tfidf", end= " - ")
        started = time()
        lda_options = dict(n_topics=number_of_clusters, max_iter=5,
                           learning_method='online', learning_offset=50.,
                           random_state=0)
        lda = LatentDirichletAllocation(**lda_options)
        lda.fit(train)

        if self.results:
            print("done in %0.3fs." % (time() - started))

        parameters = lda.get_params()
        top10, raw_labels = self.printTopicCluster(
            lda.components_, lda.transform(train), feature_names)
        labels = numpy.asarray(raw_labels)

        if self.results:
            score = metrics.silhouette_score(train, labels)
            print("Silhouette Coefficient {0}: {1}".format(name, score))

        return name, parameters, top10, labels
开发者ID:michaelprummer,项目名称:datascience,代码行数:28,代码来源:clustering.py

示例3: get_features

# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import transform [as 别名]
    def get_features(vocab):
        """Return the LDA-space cosine distance between each headline and body.

        Headlines and bodies (taken from the enclosing scope) are vectorised
        over ``vocab`` with L2-normalised term frequencies.  An LDA model is
        fitted on the bodies only and then applied to the headlines, so a
        headline that shares topics with its body ends up with a similar
        topic vector.

        Returns:
            list: one single-element list ``[cosine_distance]`` per headline.
        """
        def make_vectorizer():
            return TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')

        vectorizer_head = make_vectorizer()
        X_train_head = vectorizer_head.fit_transform(headlines)

        vectorizer_body = make_vectorizer()
        X_train_body = vectorizer_body.fit_transform(bodies)

        # The topics are learned from the bodies; every topic weights all
        # words, and a body scores higher on a topic the more of that topic's
        # important words it contains.
        lda_body = LatentDirichletAllocation(n_topics=n_topics, learning_method='online', random_state=0, n_jobs=3)

        print("latent_dirichlet_allocation_cos: fit and transform body")
        started = time()
        lda_body_matrix = lda_body.fit_transform(X_train_body)
        print("done in %0.3fs." % (time() - started))

        print("latent_dirichlet_allocation_cos: transform head")
        # Project the headlines into the topic space learned from the bodies.
        lda_head_matrix = lda_body.transform(X_train_head)

        print('latent_dirichlet_allocation_cos: calculating cosine distance between head and body')
        X = []
        for row, head_topics in enumerate(lda_head_matrix):
            # cosine_distances expects 2-D input, hence the (1, -1) reshape.
            head_vec = np.array(head_topics).reshape((1, -1))
            body_vec = np.array(lda_body_matrix[row]).reshape((1, -1))
            distance = cosine_distances(head_vec, body_vec).flatten()
            X.append(distance.tolist())
        return X
开发者ID:paris5020,项目名称:athene_system,代码行数:34,代码来源:topic_models.py

示例4: topicmodel

# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import transform [as 别名]
def topicmodel( comments ):

    _texts = []
    texts = []

    for c in comments:

        c = c['text']
        _texts.append( c )
        texts.append( c )



    tf_vectorizer = CountVectorizer(
                max_df=.20,
                min_df=10,
                stop_words = stopwords )
    texts = tf_vectorizer.fit_transform( texts )

    ## test between 2 and 20 topics
    topics = {}

    for k in range(2, 10):

        print "Testing", k

        model = LatentDirichletAllocation(
                    n_topics= k ,
                    max_iter=5,
                    learning_method='batch',
                    learning_offset=50.,
                    random_state=0
                )
        model.fit( texts )
        ll = model.score( texts )
        topics[ ll ] = model

    topic = max( topics.keys() )

    ret = collections.defaultdict( list )

    ## ugly, rewrite some day
    model = topics[ topic ]

    ## for debug pront chosen models' names
    feature_names = tf_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print "Topic #%d:" % topic_idx
        print " ".join( [feature_names[i].encode('utf8') for i in topic.argsort()[:-5 - 1:-1]])
        print

    for i, topic in enumerate( model.transform( texts ) ):

        topic = numpy.argmax( topic )
        text = _texts[ i ].encode('utf8')

        ret[ topic ].append( text )

    return ret
开发者ID:matnel,项目名称:hs-comments-visu,代码行数:61,代码来源:main.py

示例5: LDA

# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import transform [as 别名]
def LDA(matrix,preserve,n_topics=100):
    """Fit LDA on ``matrix[preserve]`` and return topic weights for all rows.

    The model is seeded with ``randint(1,100)``, so repeated calls can yield
    different topic models.

    Args:
        matrix: document-term matrix; every row is transformed.
        preserve: index or mask selecting the rows used for fitting.
        n_topics: number of topics to learn.

    Returns:
        The result of ``lda.transform(matrix)``.
    """
    seed = randint(1,100)
    model = LatentDirichletAllocation(n_topics=n_topics, max_iter=10,
                                      learning_method='online', learning_offset=50.,
                                      random_state=seed)
    training_rows = matrix[preserve]
    model.fit(training_rows)
    return model.transform(matrix)
开发者ID:azhe825,项目名称:CSC510,代码行数:11,代码来源:func_GUI.py

示例6: find_topics

# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import transform [as 别名]
def find_topics(df_train, df_test, n_topics):
    """Add LDA topic features built from character counts of ``text_read``.

    A character-level document-term matrix is built over the combined
    train/test vocabulary, LDA is fitted on the training rows only, and the
    resulting topic weights are written into new columns ``t1`` .. ``tN`` of
    both frames (in place, via ``.loc``).  Both frames are then passed to
    ``normalize_features``.

    Args:
        df_train: training frame with a ``text_read`` column.
        df_test: test frame with a ``text_read`` column.
        n_topics: number of LDA topics / feature columns to create.

    Returns:
        The normalised training frame.  NOTE(review): the normalised test
        frame is computed but not returned -- confirm whether callers rely on
        the in-place columns of ``df_test`` only.
    """
    #http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html
    import pandas as pd
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation

    # Use tf (raw term count) features for LDA.
    print("Extracting character frequency features for topic modeling...")

    # Build one document-term matrix so train and test share the same columns.
    # pd.concat replaces DataFrame.append (removed in pandas 2.0) and already
    # returns a new frame, so the inputs need no defensive copies.
    n_train = df_train.shape[0]
    df_combined = pd.concat([df_train, df_test])
    vectorizer = CountVectorizer(decode_error = 'strict', analyzer = 'char')
    corpus_combined = df_combined.loc[:,'text_read']
    dtm_combined = vectorizer.fit_transform(corpus_combined)

    # Split the rows again so the topic model is learned from training
    # documents only.
    dtm_train = dtm_combined[:n_train,:]
    dtm_test = dtm_combined[n_train:,:]

    print("Fitting LDA models with character frequency features...")
    # This requires sklearn.__version__ to be 0.17.X or greater.
    # NOTE(review): newer scikit-learn releases spell this keyword
    # `n_components` -- confirm against the version this project pins.
    lda = LatentDirichletAllocation(n_topics=n_topics, learning_method='batch',
                                    random_state=0)
    lda.fit(dtm_train)

    # Create the topic columns ('t1', 't2', ...) before filling them.
    topic_names = ['t' + str(i + 1) for i in range(n_topics)]
    for name in topic_names:
        df_train.loc[:, name] = 0.0
        df_test.loc[:, name] = 0.0

    df_train.loc[:, topic_names] = lda.transform(dtm_train)
    df_test.loc[:, topic_names] = lda.transform(dtm_test)

    # Normalize these topic features.
    df_train = normalize_features(df_train, topic_names)
    df_test = normalize_features(df_test, topic_names)

    return df_train
开发者ID:jedisom,项目名称:Machine-Learning-Engineer-Nanodegree,代码行数:46,代码来源:feature_creation.py

示例7: test_lda_fit_transform

# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import transform [as 别名]
def test_lda_fit_transform(method):
    """Check that ``fit_transform(X)`` and a later ``transform(X)`` agree.

    ``method`` is forwarded as the estimator's ``learning_method``; the two
    outputs must match to 4 decimal places.
    """
    rng = np.random.RandomState(0)
    counts = rng.randint(10, size=(50, 20))
    estimator = LatentDirichletAllocation(
        n_components=5,
        learning_method=method,
        random_state=rng,
    )
    from_fit = estimator.fit_transform(counts)
    from_transform = estimator.transform(counts)
    assert_array_almost_equal(from_fit, from_transform, 4)
开发者ID:AlexisMignon,项目名称:scikit-learn,代码行数:12,代码来源:test_online_lda.py

示例8: LDA

# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import transform [as 别名]
class LDA():
    """LDA topic model over bag-of-words documents with a paired vectorizer.

    Documents are lists of tokens; they are joined with ``;`` and split again
    by the vectorizer's ``[^;]+`` token pattern, so tokens may contain spaces.
    """

    def __init__(self, args=None, from_file=None):
        """Create the model from ``args`` or restore it from a pickle file.

        When both are given, the file wins.  ``args`` must expose ``alpha``,
        ``beta`` and ``ntopics``.  A restored instance only has ``model`` and
        ``vectorizer`` set.
        """
        assert args or from_file, 'Improper initialization of LDA model'
        if from_file is not None:
            # NOTE: pickle.load executes arbitrary code from the file -- only
            # load files from a trusted source.
            with open(from_file, 'rb') as handle:
                restored = pickle.load(handle, encoding='latin1')
            self.model, self.vectorizer = restored
            return

        # Training for the first time.
        self.vectorizer = TfidfVectorizer(lowercase=False, token_pattern=u'[^;]+')
        self.alpha = args.alpha
        self.beta = args.beta
        self.ntopics = args.ntopics
        self.model = None

    def top_words(self, n):
        """Return, per topic, an OrderedDict of its ``n`` heaviest words.

        Keys are feature names and values the topic's weight for that word,
        ordered from the heaviest word down.
        """
        vocabulary = self.vectorizer.get_feature_names()
        ranked = []
        for weights in self.model.components_:
            heaviest_first = weights.argsort()[:-n - 1:-1]
            ranked.append(OrderedDict([(vocabulary[idx], weights[idx])
                                       for idx in heaviest_first]))
        return ranked

    def train(self, docs):
        """Fit the vectorizer and the LDA model on ``docs``.

        Priors left as ``None`` default to ``50 / ntopics`` (alpha) and
        ``200 / vocabulary size`` (beta).
        """
        joined = [';'.join(bow) for bow in docs]
        vect = self.vectorizer.fit_transform(joined)
        vocab_size = len(self.vectorizer.vocabulary_)
        if self.alpha is None:
            self.alpha = 50./self.ntopics
        if self.beta is None:
            self.beta = 200./vocab_size
        print('{} words in vocabulary'.format(vocab_size))
        print('Training LDA with {} topics, {} alpha, {} beta'.format(self.ntopics, self.alpha, self.beta))
        self.model = LatentDirichletAllocation(self.ntopics,
                                               doc_topic_prior=self.alpha, topic_word_prior=self.beta,
                                               learning_method='batch', max_iter=100,
                                               verbose=1, evaluate_every=1,
                                               max_doc_update_iter=100, mean_change_tol=1e-5)
        self.model.fit(vect)
        # Normalizing does not change subsequent inference, provided no
        # further training is done.
        row_totals = self.model.components_.sum(axis=1)
        self.model.components_ /= row_totals[:, np.newaxis]

    def infer(self, docs):
        """Return one topic-distribution list per document in ``docs``.

        A document with no non-zero vectorized entries yields an all-zero
        vector instead of the model's output.
        """
        joined = [';'.join(bow) for bow in docs]
        vect = self.vectorizer.transform(joined)
        dist = self.model.transform(vect)
        assert vect.shape[0] == dist.shape[0]

        samples = []
        for row, doc_topic_dist in zip(vect, dist):
            if row.nnz > 0:
                samples.append(list(doc_topic_dist))
            else:
                samples.append([0.] * self.model.n_components)
        return samples
开发者ID:FoxxyMoxxy,项目名称:Vision,代码行数:50,代码来源:model.py

示例9: get_topics

# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import transform [as 别名]
def get_topics(n_topics):

    t0 = time()
    tf = np.genfromtxt('tf.txt', delimiter=',')
    print "feature laoded in %0.3fs." % (time() - t0)

    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=10,
                                    learning_method='online', learning_offset=50.,
                                    random_state=0)
    t0 = time()
    lda.fit(tf.T)
    doc_topic = lda.transform(tf.T)
    print "lda done in %0.3fs." % (time() - t0)

    #tfidf = np.genfromtxt('tfidf.txt', delimiter=',')
    #nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
    #doc_topic = nmf.transform(tfidf)
    plt.imshow(doc_topic, cmap='hot', interpolation='nearest')
    plt.show()
开发者ID:Thunder1989,项目名称:SDB,代码行数:21,代码来源:lda.py

示例10: save_clusters

# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import transform [as 别名]
        # Save the clusters: one CSV row per item yielded by save_clusters(),
        # written through the `writer` opened by the enclosing block.
        tf_feature_names = tf_vectorizer.get_feature_names()
        for x in save_clusters(lda, tf_feature_names, n_top_words):
            writer.writerow([str(x[0]),str(x[1])])

        # # You can also print out the clusters if you want to see them
        # print_clusters(lda, tf_feature_names, n_top_words)

    # Now match up the records with the best fit clusters & corresponding keywords
    # NOTE(review): 'wb' is the Python 2 way to open a file for csv.writer;
    # under Python 3 this would need text mode -- confirm the target version.
    with open('records_to_ldaclusters_v2.csv', 'wb') as f2:
        writer = csv.writer(f2)
        writer.writerow(["record_index","record_text","five_best_clusters","suggested_keywords"])

        # Restart the clock
        t0 = time()
        print("Finding the best keywords for each record and writing up results...")

        # One row of topic weights per record in `tf`.
        results = lda.transform(tf)
        for i in range(len(results)):
            try:
                # Negate before argsort so the five heaviest topics come first.
                best_results = (-results[i]).argsort()[:5]
                keywords = []
                for x in np.nditer(best_results):
                    keywords.extend(get_words(tf_feature_names, x))
                flattened = " ".join(keywords)
                writer.writerow([i, noaa_samples[i], best_results, flattened])
            #TODO => need to figure out the Unicode Error
            # Records whose row cannot be encoded are silently left out of
            # the CSV.
            except UnicodeEncodeError: pass

        print("done in %0.3fs." % (time() - t0))
开发者ID:danhammer,项目名称:recordtagger,代码行数:32,代码来源:lda_tag.py

示例11:

# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import transform [as 别名]
 #topic matrix W: K x V
 #components[i,j]: topic i, word j
 topics = lda_vb.components_
     
 f = plt.figure()
 plt.matshow(topics, cmap = 'gray')   
 plt.gca().set_aspect('auto')
 plt.title('learned topic matrix')
 plt.ylabel('topics')
 plt.xlabel('dictionary')
 plt.show()
 f.savefig('./figures/topic.png')
  
 #topic proportions matrix: D x K
 #note: np.sum(H, axis=1) is not 1
 H = lda_vb.transform(A_tfidf_sp)
 
 f = plt.figure()
 plt.matshow(H, cmap = 'gray')   
 plt.gca().set_aspect('auto')
 plt.show()
 plt.title('topic proportions')
 plt.xlabel('topics')
 plt.ylabel('documents')
 f.savefig('./figures/proportions.png')
             
 #compute perplexity
 print "perplexity: %.2f" % lda_vb.perplexity(A_tfidf_sp)    
 plot_perplexity_iter(A_tfidf_sp, num_topics)
 plot_perplexity_topics(A_tfidf_sp)
 plot_perplexity_batch(A_tfidf_sp, A_tfidf_sp.shape[0])
开发者ID:vsmolyakov,项目名称:ml,代码行数:33,代码来源:lda_vb.py

示例12: main

# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import transform [as 别名]
def main(trace_fpath, leaveout=0.3):
    """Compare plain LDA with a topic-transition variant on a trace file.

    The trace is a tab-separated file with columns ``dt, u, s, d``.  The first
    ``1 - leaveout`` fraction of rows is used for training and the rest for
    testing.  Per ``u``, the ``s`` values form one document and the ``d``
    values another; LDA is fitted on the ``s`` documents.  A topic-to-topic
    matrix is then estimated by least squares, and the summed and per-row log
    probabilities of both models over the test rows are printed.

    Args:
        trace_fpath: path of the tab-separated trace file.
        leaveout: fraction of trailing rows held out for testing.
    """
    leaveout = float(leaveout)
    df = pd.read_csv(trace_fpath, sep="\t", names=["dt", "u", "s", "d"])

    # Chronological split: the last `leaveout` fraction of rows is the test set.
    num_lines = len(df)
    to = int(num_lines - num_lines * leaveout)

    df_train = df[:to]
    df_test = df[to:]

    # Per-u token lists: "right" collects the s values, "left" the d values.
    documents_train_right = OrderedDict()
    documents_train_left = OrderedDict()
    tokens_train = set()
    for _, u, s, d in df_train.values:
        u = str(u)
        s = str(s)
        d = str(d)
        if u not in documents_train_right:
            documents_train_right[u] = []
            documents_train_left[u] = []

        documents_train_right[u].append(s)
        documents_train_left[u].append(d)
        tokens_train.add(s)
        tokens_train.add(d)

    # Join each token list into one tab-separated document string.
    for u in documents_train_right:
        documents_train_right[u] = "\t".join(documents_train_right[u])
        documents_train_left[u] = "\t".join(documents_train_left[u])

    # The tokenizer splits on the same tab used to join the documents above.
    vectorizer = CountVectorizer(tokenizer=lambda x: x.split("\t"), vocabulary=tokens_train)
    X_train_counts = vectorizer.fit_transform(documents_train_right.values())
    Y_train_counts = vectorizer.transform(documents_train_left.values())

    lda_model = LatentDirichletAllocation(n_topics=10, n_jobs=-1)
    lda_model.fit(X_train_counts)

    # Theta_zh: topics x documents; ph: token count per document;
    # pz: topic weights aggregated over documents.
    Theta_zh = lda_model.transform(X_train_counts).T
    ph = X_train_counts.sum(axis=1)
    pz = np.asarray(Theta_zh.dot(ph))[:, 0]

    # Psi_oz: vocabulary x topics; Psi_zo: topics x vocabulary, weighted by pz.
    Psi_oz = lda_model.components_.T
    pz = pz / pz.sum()
    Psi_zo = (Psi_oz * pz).T

    # Normalize matrices
    Psi_oz = Psi_oz / Psi_oz.sum(axis=0)
    Psi_zo = Psi_zo / Psi_zo.sum(axis=0)

    # For every training row, collect the topic column of its s and d tokens.
    X_train_probs = []
    Y_train_probs = []
    for _, u, s, d in df_train.values:
        if str(s) in vectorizer.vocabulary_ and str(d) in vectorizer.vocabulary_:
            id_s = vectorizer.vocabulary_.get(str(s))
            id_d = vectorizer.vocabulary_.get(str(d))
            X_train_probs.append(Psi_zo[:, id_s])
            Y_train_probs.append(Psi_zo[:, id_d])

    # Least-squares fit of the topic-to-topic matrix mapping s columns to d columns.
    X_train_probs = np.array(X_train_probs)
    Y_train_probs = np.array(Y_train_probs)
    P_zz = lstsq(X_train_probs, Y_train_probs)[0].T

    # numerical errors, expected as in paper.
    P_zz[P_zz < 0] = 0

    I = Psi_oz.dot(P_zz)
    I = I / I.sum(axis=0)

    # Caches keyed by (id_d, id_s) so each pair is computed once.
    probs_tmlda = {}
    probs_lda = {}

    ll_tmlda = 0.0
    ll_lda = 0.0
    n = 0
    for _, u, s, d in df_test.values:
        u = str(u)
        s = str(s)
        d = str(d)
        if s in vectorizer.vocabulary_ and d in vectorizer.vocabulary_:
            id_s = vectorizer.vocabulary_.get(s)
            id_d = vectorizer.vocabulary_.get(d)
            if (id_d, id_s) not in probs_tmlda:
                # NOTE(review): both rows below are indexed with id_s; id_d
                # only appears in the cache key -- confirm this is intended.
                probs_tmlda[id_d, id_s] = (Psi_zo[:, id_s] * I[id_s]).sum()
                probs_lda[id_d, id_s] = (Psi_zo[:, id_s] * Psi_oz[id_s]).sum()

            # Zero probabilities are skipped in the sum, but the row still
            # counts towards n.
            if probs_tmlda[id_d, id_s] != 0:
                ll_tmlda += np.log(probs_tmlda[id_d, id_s])
            if probs_lda[id_d, id_s] != 0:
                ll_lda += np.log(probs_lda[id_d, id_s])
            n += 1

    print(ll_tmlda, ll_lda)
    # n stays 0 when no test row has both tokens in the vocabulary, in which
    # case the division below raises ZeroDivisionError.
    print(ll_tmlda / n, ll_lda / n)
    print(n)
开发者ID:flaviovdf,项目名称:tribeflow,代码行数:96,代码来源:tmlda.py

示例13: __init__

# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import transform [as 别名]
class BuildLda:
    def __init__(self, print_list=True):
        """Set up the text-cleaning dictionary and the 30 topic display names.

        Args:
            print_list: when truthy, the build steps print progress output.
        """
        self.print_list = print_list
        self.dictionary = Dictionary()
        # Display names 'Topic 1' .. 'Topic 30'.
        self.topics = ['Topic {}'.format(number) for number in range(1, 31)]

    def build_object(self):
        """Run the full pipeline: fit LDA, transform the corpus, index neighbours."""
        pipeline = (
            self.build_model,
            self.transform_set,
            self.build_nearest_neighbours,
        )
        for step in pipeline:
            step()

    def build_model(self):
        """Load the job descriptions, de-duplicate them and fit the LDA model.

        Sets ``self.data_samples`` (url + cleaned text), ``self.tf_vectorizer``
        and ``self.lda``; topic words are passed to ``create_word_topics`` and,
        when ``self.print_list`` is truthy, printed.
        """
        if self.print_list:
            print('Building LDA')

        # Keep one sample per distinct body text.
        samples = []
        seen_bodies = set()
        for record in JobDescription.objects.values('url', 'body'):
            body = record['body']
            if body in seen_bodies:
                continue
            seen_bodies.add(body)
            samples.append({'url': record['url'],
                            'string': self.dictionary.clean_string(body)})

        self.data_samples = DataFrame(samples)

        n_features = 10000
        n_topics = 15
        n_top_words = 10
        max_iter = 40

        self.tf_vectorizer = CountVectorizer(max_features=n_features,
                                             stop_words='english')
        term_counts = self.tf_vectorizer.fit_transform(self.data_samples['string'])

        self.lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=max_iter,
                                             learning_method='online')
        self.lda.fit(term_counts)

        if self.print_list:
            print()
            print("\nTopics in LDA model:")
        tf_feature_names = self.tf_vectorizer.get_feature_names()
        self.create_word_topics(self.lda, tf_feature_names)
        if self.print_list:
            self.print_top_words(self.lda, tf_feature_names, n_top_words)

    def test_single_doc(self, string):
        """Run one document through the fitted model and rank its topics.

        Returns:
            dict: ``'lda_result'`` holds the raw ``lda.transform`` output and
            ``'tags'`` a list of ``{'tag', 'position', 'score'}`` dicts ordered
            from the highest-scoring topic down.
        """
        frame = DataFrame([{'string': self.dictionary.clean_string(string)}])
        counts = self.tf_vectorizer.transform(frame['string'])
        lda_result = self.lda.transform(counts)

        scores = lda_result[0]
        # Topic indices ordered by descending score.
        ranked = sorted(range(len(scores)), key=lambda i: lda_result[0][i], reverse=True)

        return_value = {'lda_result': lda_result, 'tags': []}
        for position, index in enumerate(ranked):
            return_value['tags'].append({'tag': self.topics[index], 'position': position, 'score': lda_result[0][index]})
        return return_value

    def transform_set(self):
        """Store the topic distribution of every stored sample in ``self.results``."""
        if self.print_list:
            print('Getting LDA Transformation')
        sample_counts = self.tf_vectorizer.transform(self.data_samples['string'])
        self.results = self.lda.transform(sample_counts)

    def build_nearest_neighbours(self):
        """Fit a 10-neighbour ball-tree index on ``self.results`` as ``self.nbrs``."""
        if self.print_list:
            print('Build Nearest Neighbours')
        index = NearestNeighbors(n_neighbors=10, algorithm='ball_tree')
        self.nbrs = index.fit(self.results)

    def get_neighbours(self, string, print=False):
        """Return the topic tags of ``string`` and its nearest stored jobs.

        Args:
            string: raw document text.
            print: when truthy, also print the neighbours' URLs.  (The name
                shadows the builtin inside this method; it is kept because it
                is part of the signature.)

        Returns:
            dict: ``{'tags': ..., 'neighbours': ...}``.
        """
        doc = self.test_single_doc(string)
        distances, indices = self.nbrs.kneighbors(doc['lda_result'])
        doc['distances'] = distances
        doc['indices'] = indices

        if print:
            self.print_neighbours(doc['indices'][0])
        doc['neighbours'] = self.return_neighbours(doc['indices'][0], doc['distances'][0])

        return {'tags': doc['tags'], 'neighbours': doc['neighbours']}

    def print_neighbours(self, indices):
        """Print the seek.com.au URL of each sample whose row label is in ``indices``."""
        print('Closest 10 jobs:')
        for indice in indices:
            # DataFrame.get_value was removed in pandas 1.0; .at is the
            # scalar label-based accessor that replaces it.
            url = self.data_samples.at[indice, 'url']
            print('http://www.seek.com.au%s' % url)

    def return_neighbours(self, indices, distances):
        """Pair each neighbour's full URL with its distance.

        Args:
            indices: row labels into ``self.data_samples``.
            distances: distances aligned with ``indices``.

        Returns:
            list: ``{'url': ..., 'distance': ...}`` dicts in input order.
        """
        return_value = []
        for indice, distance in zip(indices, distances):
            # DataFrame.get_value was removed in pandas 1.0; .at is the
            # scalar label-based accessor that replaces it.
            url = self.data_samples.at[indice, 'url']
            return_value.append({'url': 'http://www.seek.com.au{}'.format(url), 'distance': distance})
        return return_value

    def print_top_words(self, model, feature_names, n_top_words):
        for topic_idx, topic in enumerate(model.components_):
            print(self.topics[topic_idx]+": "+" ".join([feature_names[i]
#.........这里部分代码省略.........
开发者ID:Grungnie,项目名称:jobseek,代码行数:103,代码来源:buildlda.py

示例14: enumerate

# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import transform [as 别名]
# Keep only the samples that carry a non-blank tag at TAG_LEVEL.
# (Author's note: 5889 samples at classification level one, 5887 at level two.)
for i, each in enumerate(data):
    if each[TAG_LEVEL].strip() != '':
        rawdialogue.append(rawdata[i])
        content.append(each[0])
        tag.append(each[TAG_LEVEL])
total_acc = 0
for i in range(TIMES):
    train_content, train_tag, train_raw, test_content, test_tag, test_raw = divideData(rawdialogue, content, tag, 0.2)
    # 得到单词-文档共现矩阵
    vectorizer = CountVectorizer(encoding='unicode', stop_words='english', max_features=N_FEATURES)

    train_data = vectorizer.fit_transform(train_content)
    test_data = vectorizer.fit_transform(test_content)  # [n_samples, n_features]

    model = LDA(n_topics=N_TOPICS, batch_size=64)
    model.fit(train_data)

    dt_matrix = model.transform(train_data)
    test_dt_matrix = model.transform(test_data)
    svc = SVC(C=0.99, kernel='linear')

    svc = svc.fit(dt_matrix, train_tag)
    pred = svc.predict(test_dt_matrix)
    acc = np.round(np.mean(pred == test_tag), 4)
    total_acc += acc
    print 'LDA分类器的准确率: %.4f' % acc
print 'average accuary: ', total_acc / TIMES
开发者ID:Samurais,项目名称:DialogueClassifier,代码行数:31,代码来源:lda_svmClassifier.py

示例15: range

# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import transform [as 别名]
for i in range(TIMES):
    train_content, train_tag, train_raw, test_content, test_tag, test_raw = divideData(rawdialogue, content, tag, 0.2)
    # 得到单词-文档共现矩阵
    vectorizer = CountVectorizer(encoding='unicode', stop_words='english', max_features=N_FEATURES)

    train_data = vectorizer.fit_transform(train_content)

    train_tag = np.array(train_tag)

    test_data = vectorizer.fit_transform(test_content)  # [n_samples, n_features]

    model = LDA(n_topics=N_TOPICS, max_iter=5, batch_size=128)
    model.fit(train_data)

    train_data_distr = model.transform(train_data)
    pred_tag = train_data_distr.argmax(axis=1)

    # 投票
    id2class = dict()
    for idx in range(N_TOPICS):
        idxs = np.where(pred_tag == idx)[0]
        # print Counter(train_tag[idxs])
        id2class[idx] = Counter(train_tag[idxs]).most_common(1)[0][0]
    print id2class
    doc_topic_distr = model.transform(test_data)  # [n_samples, n_topics]
    class_id = doc_topic_distr.argmax(axis=1)
    pred = [id2class[each] for each in class_id]
    pred=np.array(pred)
    test_tag=np.array(test_tag)
    acc=np.mean(pred==test_tag)
开发者ID:Samurais,项目名称:DialogueClassifier,代码行数:32,代码来源:ldaClassifier.py


注:本文中的sklearn.decomposition.LatentDirichletAllocation.transform方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。