

Python numpy.argpartition Function Code Examples

This article collects typical usage examples of Python's numpy.argpartition function. If you have been wondering what argpartition does, how to call it, or what real-world usage looks like, the curated examples below should help.


Fifteen code examples of the argpartition function follow, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
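
Before working through the examples, it helps to recall what np.argpartition does: it returns the indices that would partially sort an array so that the element at position kth lands in its final sorted place, with everything smaller before it and everything larger after it, in O(n) average time (argsort, by contrast, costs O(n log n)). The following minimal, self-contained sketch shows the two patterns that recur throughout the examples below: selecting the k smallest and the k largest elements.

import numpy as np

a = np.array([9, 4, 7, 1, 8, 2, 6])
k = 3

# Indices of the k smallest values; order within the block is arbitrary
smallest = np.argpartition(a, k)[:k]

# Indices of the k largest values; order within the block is arbitrary
largest = np.argpartition(a, -k)[-k:]

print(a[smallest])  # the values 1, 2, 4 in some order
print(a[largest])   # the values 7, 8, 9 in some order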

Example 1: predict_variance_inf_phase1

def predict_variance_inf_phase1(budget, hum_train_means, temp_train_means, hum_train_vars, temp_train_vars):
    """Method to make predictions based on max-variance active inference."""         
    start_hum = 0
    window_hum = None
    window_temp = None

    hum_preds = np.ones((50, 96))
    temp_preds = np.ones((50, 96))

    for i, t in enumerate(global_times):
        if budget > 0:
            window_hum = np.argpartition(hum_train_vars[t], -budget)[-budget:]
            window_temp = np.argpartition(temp_train_vars[t], -budget)[-budget:]
        else:
            window_hum = np.array([])
            window_temp = np.array([])

        hum_pred, temp_pred = makePreds_phase1(window_hum, window_temp, hum_train_means, temp_train_means, i, t)

        hum_preds[:, i] = copy.deepcopy(hum_pred)
        temp_preds[:, i] = copy.deepcopy(temp_pred)
        

    hum_mean_err = mean_absolute_error(hum_test, hum_preds)
    temp_mean_err = mean_absolute_error(temp_test, temp_preds)

    return hum_preds, temp_preds, hum_mean_err, temp_mean_err
Developer: ironhide23586; project: Sensor-Network-CS583; lines: 29; source file: Phase3_00.py

Example 2: doKNN

def doKNN(k):
	dm = cdist(teXf, trXf, 'euclidean')
	cfm = np.zeros((10, 10), dtype=int)
	for a in range(len(dm)):
		knn = np.argpartition(dm[a], k)[:k]
		preds = trY[knn]
		counts = np.bincount(preds)
		pred = -1
		if len(counts) >= 2:
			top2 = np.argpartition(-counts, 1)
			if counts[top2[0]] == counts[top2[1]]:
				# Tie between the two most frequent labels: fall back to
				# the label of the closest of the k neighbours.
				d = np.inf
				for i in range(len(knn)):
					val = dm[a][knn[i]]  # distance from test point a to its i-th neighbour
					if val < d:
						d = val
						pred = trY[knn[i]]
			else:
				pred = top2[0]
		else:
			pred = 0
		# print(pred)
		# mnist.visualize(teX[a])
		cfm[teY[a]][pred] += 1
	# print(cfm)
	# print("ER: ", 1 - np.sum(np.diagonal(cfm)) / np.sum(cfm))

	return cfm
Developer: ealiasannila; project: iml; lines: 28; source file: p3.py

Example 3: precision_test_function

		def precision_test_function(theano_inputs):
			k = 10
			scores1, scores2, c_select, n_used_items = theano_test_function(*theano_inputs)
			ids1 = np.argpartition(-scores1, range(k), axis=-1)[0, :k]
			ids2 = np.argpartition(-scores2, range(k), axis=-1)[0, :k]
			
			return ids1, ids2, c_select, n_used_items
Developer: yang-tradelab; project: sequence-based-recommendations; lines: 7; source file: fism_cluster.py
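
Example 3 exploits a less obvious feature of argpartition: when kth is a sequence such as range(k), every listed position is placed at its exact sorted location, so the first k indices come back fully sorted rather than merely partitioned. Below is a minimal sketch of that trick, with illustrative data in place of the example's Theano scores.

import numpy as np

scores = np.array([0.1, 0.9, 0.4, 0.7, 0.3])
k = 3

# Negate the scores and pass range(k) as kth: the first k positions are
# fully sorted, yielding the top-k indices in descending score order.
top_k = np.argpartition(-scores, range(k))[:k]
print(top_k)  # [1 3 2]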

Example 4: similarity_matrix

 def similarity_matrix(self):
     """ Calculate the similarity matrix given all samples used for GTM map training
     :return: similarity_matrix: Matrix assessing the similarity between samples used for GTM map training
     """
     print "Calculating similarity matrix..."
     # Find one tenth of the highest and lowest probability distribution values for each sample in the latent space
     sim_size = int(round(self.latent_space_size/10))
     responsibility_indexes = np.zeros((sim_size * 2, self.input_data.shape[0]))
     corr_input = np.zeros((sim_size * 2, self.input_data.shape[0]))
     for i in range(0, self.input_data.shape[0]):
         responsibility_indexes[0:sim_size, i] = np.argpartition(self.gtm_responsibility[:, i],
                                                                 -sim_size)[-sim_size:]
         responsibility_indexes[sim_size:, i] = np.argpartition(self.gtm_responsibility[:, i], sim_size)[0:sim_size]
     responsibility_indexes = responsibility_indexes.astype(int)
     # Create correlation input matrix for similarity assessment
     for i in range(0, self.input_data.shape[0]):
         corr_input[:, i] = self.gtm_responsibility[responsibility_indexes[:, i], i]
     # Calculate correlation between all samples and build similarity matrix
     similarity_matrix = np.corrcoef(np.transpose(corr_input))
     # Plot heat map of the similarity matrix accordingly
     [x, y] = np.meshgrid(np.linspace(1, self.input_data.shape[0], self.input_data.shape[0]),
                          np.linspace(1, self.input_data.shape[0], self.input_data.shape[0]))
     x = np.ravel(x)
     y = np.ravel(y)
     sim_lat = np.array([x, y])
     print "Plotting color mesh image..."
     plt.pcolormesh(np.reshape(sim_lat[0, :], (self.input_data.shape[0], self.input_data.shape[0])),
                np.reshape(sim_lat[1, :], (self.input_data.shape[0], self.input_data.shape[0])), similarity_matrix,
                cmap='magma', vmin=0, vmax=1)
     plt.colorbar()
     plt.axis([x.min(), x.max(), y.min(), y.max()])
     plt.gca().invert_yaxis()
     return similarity_matrix
Developer: mattoescobar; project: Machine-Learning; lines: 33; source file: GTM.py

Example 5: local_kmeans_class

def local_kmeans_class(I, L, x, k):
    from scipy.spatial.distance import cdist

    sizex = len(np.atleast_2d(x))
    label = np.zeros((sizex,k))
    for rowsx in range(0, sizex):
        tic()
        dists = cdist(I, np.atleast_2d(x[rowsx]), metric='euclidean')
        toc()
        center = np.zeros((10,k,28*28))
        label_order = np.unique(L)
        l=0
        tic()
        thing = np.zeros((k,28*28))
        for labs in np.unique(L):
            indices = L == labs
            k_smallest = np.argpartition(dists[indices], tuple(range(1, k)), axis=None)
            M = I[indices]
            for i in range(0, k):
                # Running sum of the i+1 nearest members of this class;
                # dividing by i+1 below gives the mean of the i+1 nearest.
                if i == 0:
                    thing[i] = M[k_smallest[i]]
                else:
                    thing[i] = thing[i - 1] + M[k_smallest[i]]
            center[l, :, :] = thing / np.arange(1, k + 1).reshape(k, 1)
            l += 1
        toc()
        for i in range(k):
            #print(cdist(center[:,i,:], np.atleast_2d(x[rowsx]), metric='euclidean'))
            dists2center = cdist(center[:,i,:], np.atleast_2d(x[rowsx]), metric='euclidean')
            k_smallest = np.argpartition(dists2center,tuple(range(1)),axis=None)
            label[rowsx,i] = label_order[k_smallest[0]]
    return label
Developer: AndrewZastovnik; project: Math-285-hw2; lines: 33; source file: KNN.py

Example 6: branch_to_nodes

    def branch_to_nodes(self, wt, completion):
        """
        Decide which nodes to branch to next
        """
        missing_edges = HGT.get_missing_edges(completion) # Obtain the missing edge sparse list

        nb = self.strat.node_brancher
        
        # Determine if there is a maximum count
        count_max = min(self.strat.max_node_branch, self.num_nodes)
        
        if nb is None or not 'name' in nb: # Default
            # Gets nodes that contribute to missing edge
            edge = missing_edges.indices[0] # Grab any next edge
            node_indices = self.H[:, edge].indices
        elif nb['name'] == 'greedy' or nb['name'] == 'long':
            # Gets the nodes that overlap the most(least) with what's missing
            overlap = self.H.dot(missing_edges.T)
            # k = min(count_max + wt.nnz, overlap.nnz)
            k = min(count_max, overlap.nnz)
            if k >= self.num_nodes or k == overlap.nnz:
                if nb['name'] == 'greedy':
                    alg_slice = np.argsort(overlap.data)[::-1]
                else: # long
                    alg_slice = np.argsort(overlap.data)
            else: # Else be smart, don't perform O(nlogn) operations, perform O(k) operations
                if nb['name'] == 'greedy':
                    alg_slice = np.argpartition(overlap.data, -k)[-k:]
                else: #long
                    alg_slice = np.argpartition(overlap.data, k)[:k]
            node_indices = overlap.indices[alg_slice]
        elif nb['name'] == 'random':
            # Gets nodes that contribute to random missing edge
            edge = np.random.choice(missing_edges.indices) # Grab any next edge
            node_indices = self.H[:, edge].indices
        elif nb['name'] == 'diverse':
            # Diversify the kinds of transversals that have been found
            if wt.nnz == 0: # Just starting out
                node_indices = np.arange(self.num_nodes) # Branch to everything
            else: # Otherwise be greedy up to one
                # Overlap of each node with the currently missing edges
                overlap = self.H.dot(missing_edges.T)
                # Scale by squared node weight so lighter nodes win ties
                scaled_overlap = overlap.data / (self.node_weights[overlap.indices]**2)
                node_indices = overlap.indices[np.where(np.max(scaled_overlap) == scaled_overlap)]
        else:
            raise ValueError("Invalid strat.node_brancher: {0}".format(self.strat.node_brancher))
        
        if nb is not None and bool(nb.get('shuffle', False)):
            np.random.shuffle(node_indices)
        
        count = 0
        for i in node_indices:
            if count >= count_max:
                break
            if not wt[i, 0] > 0: # not already part of working transversal
                self.log('Branching to node:', i)
                count += 1
                yield i
Developer: tcfraser; project: quantum_tools; lines: 60; source file: hypergraph_transversals.py

Example 7: construct_initial_solution

 def construct_initial_solution(self):
   ind = np.argpartition(self.collaboration_coo.data, -len(self.villains_team))[-len(self.villains_team):]
   inc = 1
   while len(np.unique(self.collaboration_coo.row[ind])) < len(self.villains_team):
     ind = np.argpartition(self.collaboration_coo.data, -(len(self.villains_team) + inc))[-(len(self.villains_team) + inc):]
     inc += 1
   heroes_team = self.heroes.loc[self.heroes[CHARACTER_ID].isin(self.collaboration_coo.row[ind])]
   return heroes_team
Developer: brunogsa; project: tabu; lines: 8; source file: marvel_tabu.py

Example 8: _get_k_max_elements_indices_and_scores

 def _get_k_max_elements_indices_and_scores(vec, k, mask=None):
     if mask is None:
         # We use argpartition here instead of argsort to achieve linear-time performance.
         max_elements_indices = np.argpartition(-vec, k - 1)[:k]
     else:
         masked_vec = vec.copy()  # To avoid side-effects
         masked_vec[~mask] = -np.inf
         max_elements_indices = np.argpartition(-masked_vec, k - 1)[:k]
     return max_elements_indices, vec[max_elements_indices]
Developer: Allensmile; project: cakechat; lines: 9; source file: beamsearch.py

Example 9: similarityPlot

def similarityPlot():
	import matplotlib.pyplot as plt
	from matplotlib import rcParams
	tfidf_vectorizer = TfidfVectorizer(min_df=1)
	names = friendsAboveMinNumMessages(200) + [me]
	data = []
	words = [] #ordering of words in tf_idf matrix
	wordsSet = set() #for faster lookup
	nameSet = set()
	for person in personDict:
		for name in person.split():
			nameSet.add(name)
			nameSet.add(name.lower())
	for i in range(len(names)):
		data.append(getAllMessagesAsString(names[i], False))
	tfidf_matrix = tfidf_vectorizer.fit_transform(data)
	featureNames = tfidf_vectorizer.get_feature_names()
	tfidf_arr = tfidf_matrix.toarray()
	for j in range(len(tfidf_arr[0])):
		word = featureNames[j]  # the feature (word) for column j, not its weight
		if word not in wordsSet:
			words.append(word)
			wordsSet.add(word)
	#nmds = manifold.MDS(metric = True, n_components = N_DISTINGUISHING_FEATURES) 
	#npos = nmds.fit_transform(tfidf_matrix.toarray())
	clf = PCA(n_components=2)
	npos = clf.fit_transform(tfidf_arr)
	plt.scatter(npos[:, 0], npos[:, 1], marker = 'o', c = 'b', cmap = plt.get_cmap('Spectral')) #change colors
	for name, x, y in zip(names, npos[:, 0], npos[:, 1]):
		plt.annotate(
			name, 
			xy = (x, y), xytext = (-20, 20),
			textcoords = 'offset points', ha = 'right', va = 'bottom',
			bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5),
			arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))
	fig, ax = plt.subplots()
	ax2 = ax.twinx()
	xAxisP = [featureNames[i] for i in np.argpartition(clf.components_[0], -50)[-50:] if featureNames[i] not in nameSet]
	yAxisP = [featureNames[i] for i in np.argpartition(clf.components_[1], -50)[-50:] if featureNames[i] not in nameSet]
	xAxisN = [featureNames[i] for i in np.argpartition(-clf.components_[0], -50)[-50:] if featureNames[i] not in nameSet]
	yAxisN = [featureNames[i] for i in np.argpartition(-clf.components_[1], -50)[-50:] if featureNames[i] not in nameSet]
	ax.set_xlabel("Most Postively influential words along x axis:\n" + ", ".join(xAxisP), fontsize=18)
	ax.set_ylabel("Most Postively influential words along y axis:\n" + ", ".join(yAxisP), fontsize=18)
	ax2.set_xlabel("Most Negatively influential words along x axis:\n" + ", ".join(xAxisN), fontsize=18)
	ax2.set_ylabel("Most Negatively influential words along y axis:\n" + ", ".join(yAxisN), fontsize=18)
	# xAxis = [featureNames[i] for i in np.argpartition(np.absolute(clf.components_[0]), -50)[-50:] if featureNames[i] not in nameSet]
	# yAxis = [featureNames[i] for i in np.argpartition(np.absolute(clf.components_[1]), -50)[-50:] if featureNames[i] not in nameSet]
	# for i in range(1, max(len(xAxis), len(yAxis)) ):
	# 	if i % 20 == 0 and i < len(xAxis):
	# 		xAxis[i] += "\n"
	# 	if i % 15 == 0 and i < len(yAxis):
	# 		yAxis[i] += "\n"
	# plt.xlabel("Most influential words along x axis:\n" + ", ".join(xAxis), fontsize=18)
	# plt.ylabel("Most influential words along y axis:\n" + ", ".join(yAxis), fontsize=18)
	rcParams.update({'figure.autolayout': True})
	plt.suptitle("Word-Usage Similarity Scatterplot", fontsize = 24, fontweight = 'bold')
	plt.show()
Developer: ctbrennan; project: cross-platform-message-analytics; lines: 57; source file: parse_analyze.py

Example 10: _phase2

	def _phase2(self):
		"""
		Execute phase 2 of the SP region. This phase is used to compute the
		active columns.
		
		Note - This should only be called after phase 1 has been called and
		after the inhibition radius and neighborhood have been updated.
		"""
		
		# Shift the outputs
		self.y[:, 1:] = self.y[:, :-1]
		self.y[:, 0] = 0
		
		# Calculate k
		#   - For a column to be active its overlap must be at least as large
		#     as the overlap of the k-th largest column in its neighborhood.
		k = self._get_num_cols()
		
		if self.global_inhibition:
			# The neighborhood is all columns, thus the set of active columns
			# is simply columns that have an overlap >= the k-th largest in the
			# entire region
			
			# Compute the winning column indexes
			if self.learn:				
				# Randomly break ties
				ix = np.argpartition(-self.overlap[:, 0] -
					self.prng.uniform(.1, .2, self.ncolumns), k - 1)[:k]
			else:
				# Choose the same set of columns each time
				ix = np.argpartition(-self.overlap[:, 0], k - 1)[:k]
			
			# Set the active columns
			self.y[ix, 0] = self.overlap[ix, 0] > 0
		else:
			# The neighborhood is bounded by the inhibition radius, therefore
			# each column's neighborhood must be considered
			
			for i in range(self.ncolumns):
				# Get the neighbors
				ix = np.where(self.neighbors[i])[0]
				
				# Compute the minimum top overlap
				if ix.shape[0] <= k:
					# Desired number of candidates is at or below the desired
					# activity level, so find the overall min
					m = max(bn.nanmin(self.overlap[ix, 0]), 1)
				else:
					# Desired number of candidates is above the desired
					# activity level, so find the k-th largest
					m = max(-np.partition(-self.overlap[ix, 0], k - 1)[k - 1],
						1)
				
				# Set the column activity
				if self.overlap[i, 0] >= m: self.y[i, 0] = True
Developer: johnrobinsn; project: mHTM; lines: 55; source file: region.py
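
One detail of example 10 worth isolating: during learning, a small random jitter is added to the overlaps before partitioning, which randomizes how ties between equally overlapping columns are broken. Here is a standalone sketch of the idea; the array contents and jitter range are illustrative, not taken from the project.

import numpy as np

rng = np.random.default_rng(0)
overlap = np.array([5.0, 3.0, 5.0, 5.0, 1.0, 3.0])
k = 2

# The jitter is smaller than the gap between distinct overlap values,
# so it only reshuffles the tied entries (indices 0, 2 and 3 here).
jitter = rng.uniform(0.1, 0.2, overlap.size)
ix = np.argpartition(-(overlap + jitter), k - 1)[:k]
print(ix)  # two indices drawn from the tied set {0, 2, 3}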

Example 11: _build_recursive

        def _build_recursive(indices, level=0, split_index=0):
            """
            Descend recursively into tree to build it, setting splits and
            returning indices for leaves

            :param indices: The current set of indices before partitioning
            :param level: The level in the tree
            :param split_index: The index of the split to set

            :return: A list of arrays representing leaf membership
            :rtype: list[np.ndarray]
            """
            # If we're at the bottom, no split, just return the set
            if level == self._depth:
                return [indices]

            n = indices.size
            # If we literally don't have enough to populate the leaf, make it
            # empty
            if n < 1:
                return []

            # Get the random projections for these indices at this level
            # NB: Recall that the projection matrix has shape (levels, N)
            level_proj = proj[indices, level]

            # Split at the median if even, put median in upper half if not
            n_split = n // 2
            if n % 2 == 0:
                part_indices = np.argpartition(
                    level_proj, (n_split - 1, n_split))
                split_val = level_proj[part_indices[n_split - 1]]
                split_val += level_proj[part_indices[n_split]]
                split_val /= 2.0
            else:
                part_indices = np.argpartition(level_proj, n_split)
                split_val = level_proj[part_indices[n_split]]

            splits[split_index] = split_val

            # part_indices is relative to this block of values, recover
            # main indices
            left_indices = indices[part_indices[:n_split]]
            right_indices = indices[part_indices[n_split:]]

            # Descend into each split and get sub-splits
            left_out = _build_recursive(left_indices, level=level + 1,
                                        split_index=2 * split_index + 1)
            right_out = _build_recursive(right_indices, level=level + 1,
                                         split_index=2 * split_index + 2)

            # Assemble index set
            left_out.extend(right_out)
            return left_out
Developer: Kitware; project: SMQTK; lines: 54; source file: mrpt.py
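
The even-length branch of example 11 shows another useful kth form: passing the tuple (n_split - 1, n_split) places both elements straddling the midpoint at their sorted positions in a single linear-time pass, so the median is simply their average. A minimal sketch of the same computation on its own:

import numpy as np

x = np.array([7.0, 1.0, 5.0, 3.0])
half = x.size // 2

# Both middle elements land at their sorted positions in one pass
part = np.argpartition(x, (half - 1, half))
median = (x[part[half - 1]] + x[part[half]]) / 2.0
print(median)  # 4.0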

Example 12: fitOneLinearRegression

def fitOneLinearRegression(thetaLinear, IntensityLinear, tiltanglesArray, options):
	if (len(tiltanglesArray)%2 == 1):
		halfN = int(len(tiltanglesArray)/2) + 1
		xLeft, yLeft = thetaLinear[0:halfN], IntensityLinear[0:halfN]
		xRight, yRight = thetaLinear[halfN-1:], IntensityLinear[halfN-1:]
	else:
		halfN = int(len(tiltanglesArray)/2)
		xLeft, yLeft = thetaLinear[0:halfN], IntensityLinear[0:halfN]
		xRight, yRight = thetaLinear[halfN:], IntensityLinear[halfN:]

	slopeLeft, interceptLeft, r2Left = linearRegression(xLeft, yLeft)
	slopeRight, interceptRight, r2Right = linearRegression(xRight, yRight)

	assert(len(xLeft) == len(xRight))

	fitLeft = slopeLeft*xLeft + interceptLeft
	fitRight = slopeRight*xRight + interceptRight

	# Relative residuals of each branch
	resLeft = (yLeft - fitLeft) / fitLeft
	resRight = (yRight - fitRight) / fitRight

	# Sums of squared residuals
	fresLeft = sum(resLeft**2)
	fresRight = sum(resRight**2)
	fres = [fresLeft*1000000, fresRight*1000000]

	# Find the points with the largest N residuals in the left and right
	# branches, using numpy.argpartition
	#N = options.largestNRes
	N = 3
	negN = (-1)*N
	indexLargeLeft = np.argpartition(resLeft**2, negN)[negN:]
	indexLargeRight = np.argpartition(resRight**2, negN)[negN:]

	# ... and the points with the smallest M residuals
	#M = options.smallestNRes
	M = 3
	posM = M
	indexSmallLeft = np.argpartition(resLeft**2, posM)[:posM]
	indexSmallRight = np.argpartition(resRight**2, posM)[:posM]

	# MSE: under the assumption that the population error term has constant
	# variance, its estimate is the mean square error. The denominator is the
	# sample size reduced by the number of model parameters estimated from the
	# same data: (n-p) for p regressors, or (n-p-1) if an intercept is used.
	# Here p=1, so the denominator is n-2 (hence ddof=2).
	stdResLeft = np.std(resLeft, ddof=2)
	stdResRight = np.std(resRight, ddof=2)
	stdRes = [stdResLeft*1000, stdResRight*1000]

	ret = fres, stdRes, xLeft, yLeft, fitLeft, xRight, yRight, fitRight, indexLargeLeft, indexLargeRight, indexSmallLeft, indexSmallRight, resLeft, resRight, slopeLeft, interceptLeft, slopeRight, interceptRight
	return ret

Example 13: define_toplogy

    def define_toplogy(self, num_input, num_hidden,  num_output, density):
        """
        Defines the topology of the OpenBrain network.
        :param num_input:
        :param num_hidden:
        :param num_output:
        :param density:
        :return:
        """
        topo = networkx.DiGraph(networkx.watts_strogatz_graph(self.num_neurons, 5, density, seed=None)).to_directed()
        adjacency_list = topo.adjacency_list()


        # Pick the output neurons to be those with highest in degree
        in_deg = np.array([topo.in_degree(x) for x,_ in enumerate(adjacency_list)])
        self.output_neurons = np.argpartition(in_deg, -num_output)[-num_output:]
        print(self.output_neurons)
        print([topo.in_degree(x) for x in self.output_neurons])

        # Pick the input neurons to be those with highest out degree
        out_deg = np.array([topo.out_degree(x) if x not in self.output_neurons else -1
                            for x,_ in enumerate(adjacency_list)])
        self.input_neurons = np.argpartition(out_deg, -num_input)[-num_input:]

        # Output neurons do not fire out.
        for adjacent_neurons in adjacency_list:
            for out_neuron in self.output_neurons:
                if out_neuron in adjacent_neurons:
                    adjacent_neurons.remove(out_neuron)

        # Disconnect input -> output
        for out in self.output_neurons:
            for inp in self.input_neurons:
                if out in adjacency_list[inp]: adjacency_list[inp].remove(out)
                if inp in adjacency_list[out]: adjacency_list[out].remove(inp)


        for i, adjacent in enumerate(adjacency_list):
            if i not in self.input_neurons and i not in self.output_neurons:
                for n in adjacent:
                    if i in adjacency_list[n]:
                        if np.random.rand(1)>0.5:
                            adjacent.remove(n)
                        else:
                            adjacency_list[n].remove(i)

        # Let nothing enter the input neurons
        for inp in self.input_neurons:
            adjacency_list[inp] = []

        return adjacency_list
Developer: mlberkeley; project: openbrain; lines: 51; source file: brain.py

Example 14: sort_by_relative_entropy

def sort_by_relative_entropy(corpus, topicct, stemmer):
    # get the right file names for the corpus and count
    stemmed_weights = ['wordweights/' + fname for fname in os.listdir('wordweights')
            if fname.startswith('{}-{}-{}'.format(corpus, stemmer, topicct))]
    unstemmed_weights = ['wordweights/' + fname for fname in os.listdir('wordweights')
            if fname.startswith('{}-{}-{}'.format(corpus, UNSTEMMED_NAME, topicct))]
    stemmed_corpus_file = 'corpora/{}-train-{}-stopped.txt'.format(corpus, stemmer)
    unstemmed_corpus_file = 'corpora/{}-train-{}-stopped.txt'.format(corpus, UNSTEMMED_NAME)

    # get the mapping from unstemmed to stemmed words
    stemmed_to_unstemmed = defaultdict(set)
    unstemmed_counts = Counter()
    with open(stemmed_corpus_file) as f, open(unstemmed_corpus_file) as g:
        for stemmed_line in f:
            stemmed_words = stemmed_line.split()[3:]
            unstemmed_words = g.readline().split()[3:]
            assert(len(stemmed_words) == len(unstemmed_words))
            for uword, sword in zip(unstemmed_words, stemmed_words):
                stemmed_to_unstemmed[sword].add(uword)
                unstemmed_counts[uword] += 1

    # for each file; for each word; get the entropy
    stemmed_entropies = defaultdict(list)
    unstemmed_entropies = defaultdict(list)
    for file in stemmed_weights:
        entropy_dict = get_stemmed_entropy_per_word(file)
        for k, v in entropy_dict.items():
            stemmed_entropies[k].append(v)
    for file in unstemmed_weights:
        entropy_dict = get_unstemmed_entropy_per_word(file, stemmed_to_unstemmed, int(topicct))
        for k, v in entropy_dict.items():
            unstemmed_entropies[k].append(v)

    # compute difference of average entropies
    stemmed_vocab = [sword for sword, uwords in stemmed_to_unstemmed.items() if len(uwords) > 1]
    entropy_diffs = np.zeros(len(stemmed_vocab))
    for i, sword in enumerate(stemmed_vocab):
        entropy_diffs[i] = np.mean(stemmed_entropies[sword]) - np.mean(unstemmed_entropies[sword])

    # find top 50 maximum and minimum entropies
    min_indices = np.argpartition(entropy_diffs, 50)[:50]
    max_indices = np.argpartition(entropy_diffs, -50)[-50:]
    with open('wordlists/{}-{}-{}.txt'.format(corpus, stemmer, topicct), 'w') as wf:
        wf.write('Lowest entropy differences (stemmed is better)\n')
        for i in min_indices:
            wf.write('{}\t{}\t{}\n'.format(entropy_diffs[i], stemmed_vocab[i], ' '.join(stemmed_to_unstemmed[stemmed_vocab[i]])))
        wf.write('Highest entropy differences (unstemmed is better)\n')
        for i in max_indices:
            wf.write('{}\t{}\t{}\n'.format(entropy_diffs[i], stemmed_vocab[i], ' '.join(stemmed_to_unstemmed[stemmed_vocab[i]])))
Developer: heraldicsandfox; project: stemmers; lines: 49; source file: word_entropy.py

Example 15: computeRanks

def computeRanks(composedSpace, observedSpace):
    """Ranks all the representations in the composed space with respect to 
    the representations in the observed space. Cut-off value: 1000.
    """
    ranks = {}
    rankList = []

    composedWords = set(composedSpace.get_id2row())
    observedWords = observedSpace.get_id2row()
    neighbours = 1000

    for w_idx, word in enumerate(composedWords):
        vector = composedSpace.get_row(word)
        Y = 1 - cdist(vector.mat, observedSpace.get_cooccurrence_matrix().mat, 'cosine')
        nearest = Y.argmax()
        nearest_k_indices = np.argpartition(Y, tuple([-p for p in range(neighbours)]), axis=None)[-neighbours:]
        # pp([(observedWords[idx], Y[0][idx]) for idx in reversed(nearest_k_indices)])
        words = [observedWords[idx] for idx in reversed(nearest_k_indices)]
        wordRanks = {word:index+1 for index,word in enumerate(words)}
        # print(wordRanks)

        if (word in wordRanks):
            r = wordRanks[word]
            ranks[word] = r
            rankList.append(r)

        else:
            ranks[word] = 1000
            rankList.append(1000)

        if ((w_idx > 0) and (w_idx % 100 == 0)):
            print(w_idx)

    return rankList, ranks
Developer: corinadima; project: gWordcomp; lines: 34; source file: composition_eval.py


Note: The numpy.argpartition examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their respective authors; copyright remains with the original authors, and any use or redistribution should follow the corresponding project's license. Please do not reproduce without permission.