

Python MiniBatchKMeans.transform Method Code Examples

This article collects typical usage examples of the Python method sklearn.cluster.MiniBatchKMeans.transform. If you are wondering what MiniBatchKMeans.transform does, how to call it, or what real-world uses look like, the hand-picked code examples below should help. You can also browse further usage examples of the containing class, sklearn.cluster.MiniBatchKMeans.


The following sections show 15 code examples of MiniBatchKMeans.transform, sorted by popularity by default. You can upvote the examples you find useful; your ratings help the system recommend better Python code examples.
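
Before the examples, a minimal self-contained sketch (on made-up toy data) of what MiniBatchKMeans.transform returns may be useful: it yields an (n_samples, n_clusters) array holding the distance from each sample to each cluster center.

import numpy as np
from sklearn.cluster import MiniBatchKMeans

rng = np.random.RandomState(0)
X = rng.rand(200, 16)                       # toy data: 200 samples, 16 features

km = MiniBatchKMeans(n_clusters=5, random_state=0)
km.fit(X)

distances = km.transform(X)                 # shape (200, 5): sample-to-center distances
print(distances.shape)
print(distances[0].argmin() == km.labels_[0])   # the nearest center is the assigned cluster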

Example 1: KMeansFeatureTransformer

# Module to import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Alternatively: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
class KMeansFeatureTransformer(object):
    def __init__(self, patches, k=1500, model_path=None):
        self.mean_distances = None
        if model_path is None:
            self.k_means = MiniBatchKMeans(n_clusters=k, compute_labels=False,
                                           reassignment_ratio=0, max_no_improvement=10, batch_size=10000,
                                           verbose=2)
            self.k_means.fit(patches)
            # update mean distances
            self.compute_mean_distances(patches)
        else:
            self.load(model_path)

    def transform(self, patches):
        return self.k_means.transform(patches)

    def predict(self, patches):
        return self.k_means.predict(patches)

    def compute_mean_distances(self, patches):
        self.mean_distances = np.mean(self.k_means.transform(patches), axis=0)

    def save(self, file_path='model/k_means_model'):
        joblib.dump(self.k_means, file_path)

    def load(self, file_path):
        self.k_means = joblib.load(file_path)
Author: EDFward | Project: 10601-playground | Lines: 29 | Source: feats_repr.py
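
A hypothetical usage sketch for the KMeansFeatureTransformer class above; the patch array and the small k are made up for illustration, and the class's module is assumed to import numpy as np and joblib, which the snippet relies on.

import numpy as np

patches = np.random.rand(5000, 64)              # e.g. 5000 flattened 8x8 image patches
transformer = KMeansFeatureTransformer(patches, k=50)

features = transformer.transform(patches)       # (5000, 50) distances to the 50 centroids
print(features.shape, transformer.mean_distances.shape)

# transformer.save('model/k_means_model')       # persists the estimator (directory must exist)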

Example 2: Embedder

# Module to import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Alternatively: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
class Embedder(object):

    """Transform a set of sparse high dimensional vectors to a set of low dimensional dense vectors.

    Under the hood sparse random projection and simplex volume maximization factorization is used.
    """

    def __init__(self, complexity=10, n_kmeans=None, random_state=1):
        self.complexity = complexity
        self.n_kmeans = n_kmeans
        self.transformer = None
        self.matrix_factorizer = None
        self.kmeans = None
        self.random_state = random_state

    def fit(self, data_matrix):
        n_rows, n_cols = data_matrix.shape
        if n_rows <= n_cols:
            n_components = n_rows
        elif n_cols < 5000:
            n_components = n_cols
        else:
            n_components = 'auto'
        self.transformer = random_projection.SparseRandomProjection(n_components=n_components,
                                                                    dense_output=True,
                                                                    random_state=self.random_state)
        data_matrix_new = self.transformer.fit_transform(data_matrix)
        self.matrix_factorizer = pymf.SIVM(data_matrix_new.T, num_bases=self.complexity)
        self.matrix_factorizer.factorize()
        if self.n_kmeans:
            self.kmeans = MiniBatchKMeans(n_clusters=self.n_kmeans)
            self.kmeans.fit(self.matrix_factorizer.H.T)

    def fit_transform(self, data_matrix):
        self.fit(data_matrix)
        if self.n_kmeans:
            return self.kmeans.transform(self.matrix_factorizer.H.T)
        else:
            return self.matrix_factorizer.H.T

    def transform(self, data_matrix):
        basis_data_matrix = self.matrix_factorizer.W
        data_matrix_new = self.transformer.transform(data_matrix)
        self.matrix_factorizer = pymf.SIVM(data_matrix_new.T, num_bases=self.complexity)
        self.matrix_factorizer.W = basis_data_matrix
        self.matrix_factorizer.factorize(compute_w=False)
        if self.n_kmeans:
            return self.kmeans.transform(self.matrix_factorizer.H.T)
        else:
            return self.matrix_factorizer.H.T
Author: gianlucacorrado | Project: EDeN | Lines: 52 | Source: embedding.py
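
A reduced, self-contained sketch of the same idea using only the scikit-learn parts (SparseRandomProjection followed by MiniBatchKMeans.transform); the pymf.SIVM factorization step is omitted and the matrix sizes are made up, so this is not a drop-in replacement for the Embedder class above.

from scipy.sparse import random as sparse_random
from sklearn import random_projection
from sklearn.cluster import MiniBatchKMeans

data_matrix = sparse_random(500, 2000, density=0.01, format='csr', random_state=1)

transformer = random_projection.SparseRandomProjection(
    n_components=100, dense_output=True, random_state=1)
low_dim = transformer.fit_transform(data_matrix)     # dense (500, 100) matrix

kmeans = MiniBatchKMeans(n_clusters=10, random_state=1).fit(low_dim)
embedding = kmeans.transform(low_dim)                # (500, 10) distances to the centers
print(embedding.shape)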

Example 3: make_cluster

# Module to import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Alternatively: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
def make_cluster(datasets):
    num_clusters = 5
    lsa_dim = 500
    max_df = 0.8
    max_features = 10000
    minibatch = True
    print("datasets are %(datasets)s" % locals())

    km = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++',
                         batch_size=1000, n_init=10, max_no_improvement=10, verbose=True)
    km.fit(datasets)
    labels = km.labels_

    transformed = km.transform(datasets)
    dists = np.zeros(labels.shape)
    for i in range(len(labels)):
        dists[i] = transformed[i, labels[i]]

    clusters = []
    for i in range(num_clusters):
        cluster = []
        ii = np.where(labels == i)[0]
        dd = dists[ii]
        di = np.vstack([dd, ii]).transpose().tolist()
        di.sort()
        for d, j in di:
            cluster.append(datasets[int(j)])
        clusters.append(cluster)

    return clusters
Author: id774 | Project: sandbox | Lines: 32 | Source: cluster_with_dataset.py
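
The per-row loop above, which looks up each sample's distance to the center of its own cluster, can also be written as a single NumPy indexing expression; a small self-contained sketch on toy data:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

X = np.random.rand(300, 8)
km = MiniBatchKMeans(n_clusters=5, random_state=0).fit(X)

transformed = km.transform(X)
labels = km.labels_

# one-line equivalent of the for-loop used in the example
dists = transformed[np.arange(len(labels)), labels]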

Example 4: clustering

# Module to import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Alternatively: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
    def clustering(self, X, NUM_CLUSTERS, MINIBATCH):
        '''
        Partition the data into classes with k-means.
        '''
        
        if MINIBATCH:
            km = MiniBatchKMeans(n_clusters = NUM_CLUSTERS,
                                 init='k-means++', batch_size=1000,
                                 n_init=10, max_no_improvement=10)
        else:
            km = KMeans(n_clusters=NUM_CLUSTERS, init='k-means++', n_init=1)
        
        km.fit(X)
        transformed = km.transform(X)  # distance from each item to every cluster center
        labels = km.labels_
        
        dists = []
        for i in range(len(labels)):
            dists.append(transformed[i, labels[i]])  # distance to the center of the item's own cluster

        labels = DataFrame(labels)
        dists = DataFrame(dists)
        labels.columns = ['label']
        dists.columns = ['dists']
        self.data = pd.concat([labels, dists, self.data], axis=1)  # append the labels and distances to the original data
        
        return km
Author: takeru-nitta | Project: auction | Lines: 29 | Source: clustering.py
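
A self-contained sketch of the same label/distance bookkeeping with pandas, on a made-up feature matrix rather than the auction data used above:

import numpy as np
import pandas as pd
from sklearn.cluster import MiniBatchKMeans

X = np.random.rand(200, 6)
km = MiniBatchKMeans(n_clusters=4, init='k-means++', batch_size=1000,
                     n_init=10, max_no_improvement=10).fit(X)

labels = km.labels_
dists = km.transform(X)[np.arange(len(labels)), labels]    # distance to own cluster center

df = pd.DataFrame({'label': labels, 'dists': dists})
print(df.groupby('label')['dists'].mean())                 # mean center distance per cluster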

Example 5: make_cluster

# Module to import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Alternatively: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
  def make_cluster(self):
    texts = self._read_from_file()
    # print "texts are %(texts)s" %locals()

    # Build the TF-IDF vectors
    vectorizer = TfidfVectorizer(
      max_df       = self.max_df,
      max_features = self.max_features,
      stop_words   = 'english'
      )
    X = vectorizer.fit_transform(texts)
    # The values here were identical across runs
    # print "X values are %(X)s" %locals()

    # Create a KMeans instance and run the clustering
    # Choose parameters appropriate to the size and characteristics of the data
    if self.minibatch:
      km = MiniBatchKMeans(
        n_clusters         = self.num_clusters,
        init               = 'k-means++',
        batch_size         = 1000,
        n_init             = 10,
        max_no_improvement = 10,
        verbose            = True
        )
    else:
      km = KMeans(
        n_clusters = self.num_clusters,
        init       = 'k-means++',
        n_init     = 1,
        verbose    = True
        )
    km.fit(X)
    labels = km.labels_

    transformed = km.transform(X)
    dists       = np.zeros(labels.shape)
    for i in range(len(labels)):
      dists[i] = transformed[i, labels[i]]

    clusters = []
    for i in range(self.num_clusters):
      cluster = []
      ii      = np.where(labels==i)[0]
      dd      = dists[ii]
      di      = np.vstack([dd,ii]).transpose().tolist()
      di.sort()
      for d, j in di:
        cluster.append(texts[int(j)])
      clusters.append(cluster)

    return clusters
Author: Aipakazuma | Project: study_clustering | Lines: 54 | Source: clustering.py

Example 6: applyKmeans

# Module to import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Alternatively: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
def applyKmeans():

    rng = RandomState(0)
    components = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
#     components = [10]
    '''
    Lets work on stepName only.
    '''
    sIdMtx = LoadSparseMatrix(ROOTDIR+"train_sId.txt")
    sectionMtx = LoadSparseMatrix(ROOTDIR+"train_section.txt")
    problemMtx = LoadSparseMatrix(ROOTDIR+"train_problem.txt")
    stepMtx = LoadSparseMatrix(ROOTDIR+"train_step.txt")
    
    label = np.load(ROOTDIR+"label_train.npy")
    
    rdata = hstack((sIdMtx, sectionMtx), format='csr')
    rdata = hstack((rdata, problemMtx), format='csr')
    rdata = hstack((rdata, stepMtx), format='csr')
    
    kcMtx = LoadSparseMatrix(ROOTDIR + "train_kc.txt")
    
    print('starting to run kmeans++..')
    
    for i in components:
        km = MiniBatchKMeans(n_clusters=i, tol=1e-3, batch_size=20, max_iter=60, random_state=rng)
        km.fit(kcMtx)
        objscore = km.score(kcMtx)
        
        print('With ' + str(i) + ' components, the object score is ' + str(objscore))
        nkcMtx = km.transform(kcMtx) 
        
#         io.mmwrite(ROOTDIR+"train_step_kmeans_"+str(i)+".txt", nkcMtx)
        
        data = hstack((rdata, nkcMtx), format='csr')
#         io.mmwrite(ROOTDIR+"TRAIN_KMEANS_"+str(i)+".txt", data)
        
        # now train it!
        data = scale(data, with_mean=False)
        lrmodel = linear_model.LogisticRegression(max_iter=1000, penalty='l2', multi_class='ovr', verbose=0)
        
        lrmodel.fit(data, label)
        print('Training done!')
        scr = lrmodel.score(data, label)
        print('accuracy on the training set is: ' + str(scr))
    
        predLabel = lrmodel.predict(data)
        calcualteRMSE(label, predLabel)
        
        print('************************')
Author: cyinv | Project: 10601Project-KDD2010 | Lines: 51 | Source: Preprocessing.py

Example 7: process_vec_info

# Module to import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Alternatively: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
def process_vec_info(g, n_clusters=8):
    """process_vec_info."""
    # extract node vec information and make np data matrix
    data_matrix = np.array([g.node[u]['vec'] for u in g.nodes()])
    # cluster with kmeans
    clu = MiniBatchKMeans(n_clusters=n_clusters, n_init=10)
    clu.fit(data_matrix)
    preds = clu.predict(data_matrix)
    vecs = clu.transform(data_matrix)
    vecs = 1 / (1 + vecs)
    # replace node information
    graph = g.copy()
    for u in graph.nodes():
        graph.node[u]['label'] = str(preds[u])
        graph.node[u]['vec'] = list(vecs[u])
    return graph
Author: fabriziocosta | Project: EDeN | Lines: 18 | Source: estimator_utils.py
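
For reference, a small standalone sketch of the distance-to-similarity conversion used above: 1 / (1 + d) maps a distance of 0 to 1.0 and shrinks toward 0 as the distance grows. The data matrix here is random and purely illustrative.

import numpy as np
from sklearn.cluster import MiniBatchKMeans

data_matrix = np.random.rand(100, 10)
clu = MiniBatchKMeans(n_clusters=8, n_init=10, random_state=0).fit(data_matrix)

dists = clu.transform(data_matrix)      # (100, 8) distances to the cluster centers
sims = 1.0 / (1.0 + dists)              # (100, 8) similarities in (0, 1]
print(sims.max(axis=1)[:5])             # strongest cluster affinity of the first few rows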

Example 8: make_cluster

# Module to import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Alternatively: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
    def make_cluster(self):
        texts = self.texts
        print("texts are %(texts)s" %locals())

        vectorizer = TfidfVectorizer(
            analyzer=mecab_util.extractNoun,
            max_df=self.max_df,
            max_features=self.max_features
            )
        X = vectorizer.fit_transform(texts)
        print("X values are %(X)s" %locals())

        if self.minibatch:
            km = MiniBatchKMeans(
                n_clusters=self.num_clusters,
                init='k-means++', batch_size=1000,
                n_init=10, max_no_improvement=10,
                verbose=True
                )
        else:
            km = KMeans(
                n_clusters=self.num_clusters,
                init='k-means++',
                n_init=10,
                verbose=True
                )
        km.fit(X)
        labels = km.labels_

        transformed = km.transform(X)
        dists = np.zeros(labels.shape)
        for i in range(len(labels)):
            dists[i] = transformed[i, labels[i]]

        clusters = []
        for i in range(self.num_clusters):
            cluster = []
            ii = np.where(labels == i)[0]
            dd = dists[ii]
            di = np.vstack([dd, ii]).transpose().tolist()
            di.sort()
            for d, j in di:
                cluster.append(texts[int(j)])
            clusters.append(cluster)
        return clusters
Author: shotat | Project: submodular-summarization | Lines: 47 | Source: clustering.py

Example 9: auto_label

# Module to import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Alternatively: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
def auto_label(graphs, n_clusters=16, **opts):
    """Label nodes with cluster id.

    Cluster nodes using as features the output of vertex_vectorize.
    """
    data_list = Vectorizer(**opts).vertex_transform(graphs)
    data_matrix = vstack(data_list)
    clu = MiniBatchKMeans(n_clusters=n_clusters, n_init=10)
    clu.fit(data_matrix)
    preds = clu.predict(data_matrix)
    vecs = clu.transform(data_matrix)
    sizes = [m.shape[0] for m in data_list]
    label_list = []
    vecs_list = []
    pointer = 0
    for size in sizes:
        label_list.append(preds[pointer: pointer + size])
        vecs_list.append(vecs[pointer: pointer + size])
        pointer += size
    return label_list, vecs_list
Author: fabriziocosta | Project: EDeN | Lines: 22 | Source: graph.py

Example 10: main

# Module to import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Alternatively: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
def main(filename):
    # load tweets
    tweets = get_tweets_from_csv(filename)
    # print tweets
 
    # feature extraction
    vectorizer = TfidfVectorizer(analyzer=analyzer, max_df=MAX_DF)
    vectorizer.max_features = MAX_FEATURES
    X = vectorizer.fit_transform(tweets)
    # dimensionality reduction by LSA
    lsa = TruncatedSVD(LSA_DIM)
    X = lsa.fit_transform(X)
    X = Normalizer(copy=False).fit_transform(X)
 
    # clustering by KMeans
    if MINIBATCH:
        km = MiniBatchKMeans(n_clusters=NUM_CLUSTERS, init='k-means++', batch_size=1000, n_init=10, max_no_improvement=10, verbose=True)
    else:
        km = KMeans(n_clusters=NUM_CLUSTERS, init='k-means++', n_init=1, verbose=True)
    km.fit(X)
    labels = km.labels_
 
    transformed = km.transform(X)
    dists = np.zeros(labels.shape)
    for i in range(len(labels)):
        dists[i] = transformed[i, labels[i]]
 
    # sort by distance
    clusters = []
    for i in range(NUM_CLUSTERS):
        cluster = []
        ii = np.where(labels==i)[0]
        dd = dists[ii]
        di = np.vstack([dd,ii]).transpose().tolist()
        di.sort()
        for d, j in di:
            cluster.append(tweets[int(j)])
        clusters.append(cluster)
 
    return clusters
Author: TakeumiYamamura | Project: gci | Lines: 42 | Source: lessons_clustering.py
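
A condensed sketch of the same TF-IDF, LSA, normalization and clustering pipeline on a tiny made-up corpus (the texts and constants are illustrative only):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.cluster import MiniBatchKMeans

tweets = ["rain expected all week", "sunny and warm this weekend",
          "new phone released today", "latest laptop reviews are in"]

X = TfidfVectorizer().fit_transform(tweets)
X = TruncatedSVD(n_components=2).fit_transform(X)      # LSA dimensionality reduction
X = Normalizer(copy=False).fit_transform(X)

km = MiniBatchKMeans(n_clusters=2, random_state=0).fit(X)
print(km.transform(X).round(3))                        # distance of each tweet to each center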

Example 11: new_clustered_sortind

# Module to import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Alternatively: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
def new_clustered_sortind(x, k=10, row_key=None, cluster_key=None):
    """
    Uses MiniBatch k-means clustering to cluster matrix into groups.

    Each cluster of rows is then sorted by `scorefunc` -- by default, the max
    peak height when all rows in a cluster are averaged, or
    cluster.mean(axis=0).max().

    Returns the index that will sort the rows of `x` and a list of "breaks".
    `breaks` is essentially a cumulative row count for each cluster boundary.
    In other words, after plotting the array you can use axhline on each
    "break" to plot the cluster boundary.

    If `k` is a list or tuple, iteratively try each one and select the best
    with the lowest mean distance from cluster centers.

    :param x: Matrix whose rows are to be clustered
    :param k: Number of clusters to create or a list of potential clusters; the
        optimum will be chosen from the list
    :param row_key:
        Optional function to act as a sort key for sorting rows within
        clusters.  Signature should be `scorefunc(a)` where `a` is a 1-D NumPy
        array.
    :param cluster_key:
        Optional function for sorting clusters.  Signature is `clusterfunc(a)`
        where `a` is a NumPy array containing all rows of `x` for cluster `i`.
        It must return a single value.
    """
    try:
        from sklearn.cluster import MiniBatchKMeans
    except ImportError:
        raise ImportError('please install scikits.learn for '
                          'clustering.')

    # If integer, do it once and we're done
    if isinstance(k, int):
        best_k = k

    else:
        mean_dists = {}
        for _k in k:
            mbk = MiniBatchKMeans(init='k-means++', n_clusters=_k)
            mbk.fit(x)
            mean_dists[_k] = mbk.transform(x).mean()
        best_k = sorted(mean_dists.items(), key=lambda x: x[1])[-1][0]

    mbk = MiniBatchKMeans(init='k-means++', n_clusters=best_k)
    mbk.fit(x)
    k = best_k
    labels = mbk.labels_
    scores = np.zeros(labels.shape, dtype=float)

    if cluster_key:
        # It's easier for calling code to provide something that operates on
        # a cluster level, but here it's converted to work on a label level
        # that looks in to the array `x`.
        def _cluster_key(i):
            return cluster_key(x[labels == i, :])
        sorted_labels = sorted(range(k), key=_cluster_key)
    else:
        # Otherwise just use them as-is.
        sorted_labels = range(k)

    if row_key:
        # Again, easier to provide a function to operate on a row.  But here we
        # need it to accept an index
        def _row_key(i):
            return row_key(x[i, :])

    final_ind = []
    breaks = []
    pos = 0
    for label in sorted_labels:
        # which rows in `x` have this label
        label_inds = np.nonzero(labels == label)[0]
        if row_key:
            label_sort_ind = sorted(label_inds, key=_row_key)
        else:
            label_sort_ind = label_inds
        for li in label_sort_ind:
            final_ind.append(li)
        pos += len(label_inds)
        breaks.append(pos)

    return np.array(final_ind), np.array(breaks)
Author: rbeagrie | Project: metaseq | Lines: 87 | Source: plotutils.py
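
A hypothetical call to new_clustered_sortind on synthetic data; the matrix, the candidate k list and both key functions are made up for illustration:

import numpy as np

x = np.random.rand(500, 40)                      # rows to cluster and sort
ind, breaks = new_clustered_sortind(
    x, k=[4, 6, 8],
    row_key=lambda row: row.max(),               # sort rows within a cluster by peak height
    cluster_key=lambda block: block.mean())      # sort clusters by their mean signal

sorted_x = x[ind]                                # rows reordered cluster by cluster
print(breaks)                                    # cumulative row counts at cluster boundaries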

Example 12: clustered_sortind

# Module to import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Alternatively: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
def clustered_sortind(x, k=10, scorefunc=None):
    """
    Uses MiniBatch k-means clustering to cluster matrix into groups.

    Each cluster of rows is then sorted by `scorefunc` -- by default, the max
    peak height when all rows in a cluster are averaged, or
    cluster.mean(axis=0).max().

    Returns the index that will sort the rows of `x` and a list of "breaks".
    `breaks` is essentially a cumulative row count for each cluster boundary.
    In other words, after plotting the array you can use axhline on each
    "break" to plot the cluster boundary.

    If `k` is a list or tuple, iteratively try each one and select the best
    with the lowest mean distance from cluster centers.

    :param x: Matrix whose rows are to be clustered
    :param k: Number of clusters to create or a list of potential clusters; the
        optimum will be chosen from the list
    :param scorefunc: Optional function for sorting rows within clusters.  Must
        accept a single argument of a NumPy array.
    """
    try:
        from sklearn.cluster import MiniBatchKMeans
    except ImportError:
        raise ImportError('please install scikits.learn for '
                          'clustering.')

    # If integer, do it once and we're done
    if isinstance(k, int):
        best_k = k

    else:
        mean_dists = {}
        for _k in k:
            mbk = MiniBatchKMeans(init='k-means++', n_clusters=_k)
            mbk.fit(x)
            mean_dists[_k] = mbk.transform(x).mean()
        best_k = sorted(mean_dists.items(), key=lambda x: x[1])[-1][0]

    mbk = MiniBatchKMeans(init='k-means++', n_clusters=best_k)
    mbk.fit(x)
    k = best_k
    labels = mbk.labels_
    scores = np.zeros(labels.shape, dtype=float)

    if not scorefunc:
        def scorefunc(x):
            return x.mean(axis=0).max()

    for label in range(k):
        ind = labels == label
        score = scorefunc(x[ind, :])
        scores[ind] = score

    pos = 0
    breaks = []
    ind = np.argsort(scores)
    for k, g in groupby(labels[ind]):
        pos += len(list(g))
        breaks.append(pos)

    return ind, breaks
Author: rbeagrie | Project: metaseq | Lines: 65 | Source: plotutils.py

Example 13: main

# Module to import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Alternatively: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
def main(args):
    logger.debug("Arguments: %r", args)
    tfidf_vect = default_vectorizer()
    tfidf_vect.set_params(
        ngram_range=(args.min_ngrams, args.max_ngrams),
        max_df=args.max_df,
        max_features=args.max_features,
        sublinear_tf=args.sublinear_tf,
        norm=args.norm,
    )

    with LogRuntime("Loaded input data in {elapsed} seconds", logger):
        data = get_data(args)
    if data:
        logger.debug("Corpus size: {0}".format(len(data)))
    else:
        logger.error("Empty data")
        return

    with LogRuntime("Fitted in {0.elapsed} seconds", logger):
        X = tfidf_vect.fit_transform(data)

    logger.debug("Vocabulary size: {}".format(len(tfidf_vect.vocabulary_)))
    logger.debug("Max DF stop words size: {}".format(len(tfidf_vect.stop_words_)))
    logger.debug("Stop words size: {}".format(len(tfidf_vect.stop_words)))

    if args.clusters:
        true_k = args.clusters
    else:
        # ref: http://en.wikipedia.org/wiki/Determining_the_number_of_clusters_in_a_data_set#Finding_Number_of_Clusters_in_Text_Databases
        m_docs, n_terms = X.shape
        t_nonzeros = len(X.nonzero()[0])
        true_k = (m_docs * n_terms) / t_nonzeros
        logger.debug("Calculated number of clusters: {}".format(true_k))

    if args.minibatch:
        km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=10,
                             init_size=1000, batch_size=1000, verbose=-1)
    else:
        km = KMeans(n_clusters=args.clusters, init='random', max_iter=100,
                    n_init=10, verbose=1, n_jobs=-1)

    with LogRuntime("KMeans Fitted in {0.elapsed} seconds", logger):
        km.fit(X)

    if args.sample_random and args.sample_size:
        sample = [
            data[i] for i in np.random.random_integers(0, len(data), args.sample_size)
        ]
    elif args.sample_size:
        sample = data[args.sample_skip:args.sample_size]
    else:
        sample = data

    Y = tfidf_vect.transform(sample)
    sample_terms = tfidf_vect.inverse_transform(Y)

    labels = km.predict(Y)
    distances = km.transform(Y)
    center_terms = tfidf_vect.inverse_transform(km.cluster_centers_)

    clusters = defaultdict(list)
    vocabulary = tfidf_vect.vocabulary_

    for i, doc in enumerate(sample):
        clusters[labels[i]].append((i, doc))

    truncate = lambda t: t[:100] + '...' if len(t) > 100 else t

    for label, result in sorted(clusters.items()):
        # skip single results
        if len(result) < args.cluster_minsize:
            continue
        terms_joined = ', '.join(sorted(
            center_terms[label], reverse=True,
            key=lambda t: km.cluster_centers_[label, vocabulary[t]]
        ))
        print('=' * 79)
        print('=' * 79)
        print('=' * 79)
        print('-> ' + truncate(terms_joined) + '\n\n')
        result = sorted(
            result,
            key=lambda pair: distances[pair[0], label],
        )

        j = 0
        for i, doc in result:
            j += 1
            doc_terms = ', '.join(sorted(
                sample_terms[i], reverse=True,
                key=lambda t: Y[i, vocabulary[t]],
            ))
            print(doc['headline'])
            print(get_corpus_key(doc))
            print(doc['url'])
            print('distance:', distances[i, label])
            print(truncate(doc_terms))
            print()
            if j > 10:
#......... (remaining code omitted) .........
Author: rolando-archive | Project: yatiri | Lines: 103 | Source: categories_clustering.py
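
The cluster-count heuristic referenced in the example above, true_k = (m_docs * n_terms) / t_nonzeros, can be computed directly from the shape and sparsity of the TF-IDF matrix; a small sketch on a made-up corpus:

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["the cat sat on the mat",
          "dogs and cats make friendly pets",
          "stock markets fell sharply today",
          "investors sold shares during the sell-off"]

X = TfidfVectorizer().fit_transform(corpus)
m_docs, n_terms = X.shape
t_nonzeros = X.nnz                         # same count as len(X.nonzero()[0])
true_k = (m_docs * n_terms) // t_nonzeros
print(m_docs, n_terms, t_nonzeros, true_k)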

Example 14: getClustersOfpapers

# Module to import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Alternatively: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
 def getClustersOfpapers(self,papers_list,cluster_num = 10):
     corpus = list()
     for paperID in papers_list:
         corpus.append(self.lda_corpus[paperID])
     #print len(corpus)
     clusterModel = MiniBatchKMeans(init='k-means++', n_clusters=cluster_num, n_init=100)
     clusterModel.fit(corpus)
     
     clusterTopics = dict()
     clusterPapers = dict()
     clusterYear = dict()
     clusterSize = dict()
     
     
     
     for paperID in papers_list:
         cluster = int(clusterModel.predict(self.lda_corpus[paperID])[0])
         
         clusterSize[cluster] = clusterSize.get(cluster,0)
         clusterSize[cluster] += 1
         
         if cluster in clusterTopics:
             clusterTopics[cluster] += np.array(self.lda_corpus[paperID])
         else:
             clusterTopics[cluster] = np.array(self.lda_corpus[paperID])
         
         clusterPapers[cluster] = clusterPapers.get(cluster,{})
         clusterPapers[cluster][paperID] = self.lda_corpus[paperID]
         
         
         clusterYear[cluster] = clusterYear.get(cluster,dict())
         year = self.abstracts[paperID]['year']
         if year:
             clusterYear[cluster][year] = clusterYear[cluster].get(year,0)
             clusterYear[cluster][year]+=1
     
     
     clusterCords = getPCAonDict(clusterTopics)
     cluster_result = list()
     paper_result = list()
     for cluster in clusterPapers:
         topPapers = list()
         clusterPapers[cluster] = getPCAonDict(clusterPapers[cluster])
         for paperID in clusterPapers[cluster]:
             x = clusterPapers[cluster][paperID][0]
             y = clusterPapers[cluster][paperID][1]
             distance = clusterModel.transform(self.lda_corpus[paperID])[0][cluster]
             #print 'papreID', cluster, distance, clusterModel.transform(self.lda_corpus[paperID])
             topPapers.append([distance, paperID])
             
             paper_result.append('%d,%d,%.3f,%.3f,%.3f,%s,%d,%s,%s,%s'%(cluster,
                                                      paperID,
                                                      x,
                                                      y,
                                                      distance,
                                                      str(self.abstracts[paperID]['year']),
                                                      self.abstracts[paperID]['pages'],
                                                      self.abstracts[paperID]['path'],
                                                      self.abstracts[paperID]['header'],
                                                      self.abstracts[paperID]['abstract']
                                                      ))
         
             
         topPapers.sort()
         topNames = [self.abstracts[paperID[1]]['header'].replace(',','') for paperID in topPapers[:5]]
         topNames_str = '"%s"'%(','.join(topNames))
         
         
         
         
         topics = list(clusterTopics[cluster])
         #print topics, topics.index(max(topics)),topics[topics.index(max(topics))]
         #print len(self.topics_names),self.topics_names[topics.index(max(topics))]
         name = '"'+','.join(self.topics_names[topics.index(max(topics))])+'"'
         x = clusterCords[cluster][0]
         y = clusterCords[cluster][1]
         years = [[clusterYear[cluster][year],year] for year in clusterYear[cluster]]
         if len(years) == 0:
             year = 'None'
         else:
             years.sort(reverse=True)
             year = years[0][1]
         
         cluster_result.append('%s,%d,%.3f,%.3f,%d,%d,%s,%s'%(str(name),
                                       cluster,
                                       x,
                                       y,
                                       clusterSize[cluster],
                                       cluster,
                                       year,
                                       topNames_str))
         
         
     
 
     return paper_result, cluster_result
Author: kobauman | Project: 8Q_code | Lines: 98 | Source: searcher.py

Example 15: main

# Module to import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Alternatively: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
def main(tcu_fpath):
    data = tcu_io.load_untreated_csv_to_numpy(tcu_fpath)
    data = data[data['Situacao'] == 'Aceito e Habilitado']
    
    desc_column = data['Descricao']
    des_cmp_column = data['DescricaoComplementar']
    unidade_column = data['UnidadeFornecimento']
    qtd_column = [str(qtd) for qtd in data['Quantidade']]
    
    #Transforms descriptions to base strings
    as_docs = []
    for as_text in zip(desc_column, des_cmp_column, unidade_column, qtd_column):
        doc = " ".join(as_text)
        as_docs.append(doc)

    #Vectorizes to TF-IDF
    vectorizer = Vectorizer()
    doc_sparse_matrix = vectorizer.fit_transform(as_docs)
    
    #Compute clusters
    inter = {}
    intra = {}
    n_runs = 20
    k_vals = range(2, 16)
    for i in range(n_runs):
        for k in k_vals:
            #Each K has n_runs clusterings
            inter_array = inter.setdefault(k, np.zeros(n_runs))
            intra_array = intra.setdefault(k, np.zeros(n_runs))
            
            #Run K-Means
            mbkm = MiniBatchKMeans(k, init = 'random')
            mbkm.fit(doc_sparse_matrix)
            
            centers = mbkm.cluster_centers_
            labels = mbkm.labels_
            
            #Inter distance. We use min because the ideia is to maximize this.
            #Min serves as a penalty for worse case.
            dist_centers = pairwise.euclidean_distances(centers)
            min_dist_between_centers = \
                np.min(dist_centers[dist_centers > 0])
            inter_array[i] = min_dist_between_centers

            #Intra distance
            dist_all_centers = mbkm.transform(doc_sparse_matrix)
            intra_dists = []
            for doc_id, cluster in enumerate(labels):
                dist = dist_all_centers[doc_id, cluster]
                intra_dists.append(dist)
            intra_array[i] = np.mean(intra_dists)
            
            #Prints num elements per cluster
            print('Run %d ; k = %d' %(i, k))
            counter = Counter(labels)
            for cluster, population in counter.items():
                print('\tK = %d; Pop = %d' %(cluster, population))
            print()
    
    x = sorted(inter.keys())
    y = []
    c = []
    for k in x:
        div = inter[k] / intra[k]
        y.append(np.mean(div))
        c.append(half_confidence_interval_size(div, 0.90))
    
    #hack for the zero to apper
    x = [0] + x
    y = [0] + y
    c = [0] + c
    
    ax = plt.gca()
    ax.set_yscale('log')
    ax.set_xticks(range(0, 16))
    plt.ylabel('InterCluster/IntraCluster Ratio')
    plt.xlabel('Number of clusters')
    plt.errorbar(x, y, yerr=c, fmt='bo', markersize=8, elinewidth=2)
    plt.show()
Author: flaviovdf | Project: data-mining | Lines: 81 | Source: cluster_text.py
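
A compact sketch of the inter/intra ratio evaluated in the example above, computed for a single k on random toy data (the data and parameters are illustrative only):

import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import pairwise

X = np.random.rand(400, 20)
mbkm = MiniBatchKMeans(n_clusters=8, init='random', random_state=0).fit(X)

centers = mbkm.cluster_centers_
dist_centers = pairwise.euclidean_distances(centers)
inter = np.min(dist_centers[dist_centers > 0])                 # closest pair of distinct centers

dist_all_centers = mbkm.transform(X)
intra = dist_all_centers[np.arange(len(X)), mbkm.labels_].mean()

print(inter / intra)                                           # larger is better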


Note: the sklearn.cluster.MiniBatchKMeans.transform examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are taken from open-source projects contributed by their respective authors; copyright of the source code remains with those authors, and any distribution or use must follow the corresponding project's license. Do not reproduce without permission.