

Java FeatureSequence.getLength Method Code Examples

This article collects typical usage examples of the Java method cc.mallet.types.FeatureSequence.getLength. If you are wondering what FeatureSequence.getLength does, how to call it, or how it is used in practice, the curated examples below should help. You can also explore further usage examples of the enclosing class, cc.mallet.types.FeatureSequence.


The following 13 code examples of the FeatureSequence.getLength method are shown below, sorted by popularity by default. You can upvote the examples you find useful; ratings help the site recommend better Java code examples.
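
Before the collected examples, here is a minimal sketch of the method itself: FeatureSequence.getLength() returns the number of token positions in the sequence and is typically used as the loop bound when walking over a document's tokens. The pipeline setup (CharSequence2TokenSequence followed by TokenSequence2FeatureSequence), the class name GetLengthDemo, and the sample text are assumptions chosen for illustration; they are not taken from the projects quoted below.

import cc.mallet.pipe.CharSequence2TokenSequence;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.pipe.TokenSequence2FeatureSequence;
import cc.mallet.types.FeatureSequence;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;

public class GetLengthDemo {
	public static void main(String[] args) {
		// text -> token sequence -> feature sequence (alphabet indices)
		Pipe pipe = new SerialPipes(new Pipe[] {
			new CharSequence2TokenSequence(),
			new TokenSequence2FeatureSequence()
		});
		InstanceList instances = new InstanceList(pipe);
		instances.addThruPipe(new Instance("the quick brown fox", null, "doc0", null));

		// The data of each piped instance is a FeatureSequence.
		FeatureSequence fs = (FeatureSequence) instances.get(0).getData();
		int len = fs.getLength();                    // number of token positions
		for (int pos = 0; pos < len; pos++) {
			int type = fs.getIndexAtPosition(pos);   // alphabet index at this position
			System.out.println(pos + "\t" + type + "\t" + fs.getObjectAtPosition(pos));
		}
	}
}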

Example 1: convolution

import cc.mallet.types.FeatureSequence; // import the package/class required by this method
/**
 * Construct word co-occurrence features from the original sequence
 * (combinatorial: n choose 2; could be extended to n choose 3).
 
public void convolution() {
	int fi = -1;
	int pre = -1;
	int i,j;
	int curLen = length;
	for(i = 0; i < curLen-1; i++) {
		for(j = i + 1; j < curLen; j++) {
			pre = features[i];
			fi = features[j];
			Object preO = dictionary.lookupObject(pre);
			Object curO = dictionary.lookupObject(fi);
			Object coO = preO.toString() + "_" + curO.toString();
			add(coO);
		}
	}
}*/

public Instance pipe (Instance carrier)
{
	FeatureSequence fseq = (FeatureSequence) carrier.getData();
	FeatureSequence ret =
		new FeatureSequence ((Alphabet)getDataAlphabet());
	int i, j, curLen;
	curLen = fseq.getLength();
	// first, copy the original feature sequence into ret
	for(i = 0; i < curLen; i++) {
		ret.add(fseq.getObjectAtPosition(i));
	}
	// second, add a co-occurrence feature for every pair of positions
	int pre, cur;
	Object coO;
	for(i = 0; i < curLen-1; i++) {
		for(j = i + 1; j < curLen; j++) {
			pre = fseq.getIndexAtPosition(i);
			cur = fseq.getIndexAtPosition(j);
			coO = pre + "_" + cur;
			ret.add(coO);
		}
	}
	if(carrier.isLocked()) {
		carrier.unLock();
	}
	carrier.setData(ret);
	return carrier;
}
 
Developer ID: kostagiolasn, Project: NucleosomePatternClassifier, Lines of code: 50, Source file: FeatureSequenceConvolution.java
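
For orientation, the sketch below shows one plausible way to exercise such a convolution pipe directly. It is not code from the quoted project: the class name ConvolutionDemo, the toy three-token document, and a no-argument FeatureSequenceConvolution constructor are assumptions (the excerpt above does not show the constructor), and the import path for FeatureSequenceConvolution depends on where your copy of the class lives.

import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureSequence;
import cc.mallet.types.Instance;
// plus an import for FeatureSequenceConvolution from wherever the class above lives
// (in stock MALLET a class of this name sits under cc.mallet.pipe)

public class ConvolutionDemo {
	public static void main(String[] args) {
		// Build a tiny three-token FeatureSequence by hand.
		Alphabet words = new Alphabet();
		FeatureSequence input = new FeatureSequence(words);
		input.add("alpha");
		input.add("beta");
		input.add("gamma");

		// Push it through the pipe method shown above; a no-argument constructor is assumed.
		FeatureSequenceConvolution convolution = new FeatureSequenceConvolution();
		Instance piped = convolution.pipe(new Instance(input, null, "doc0", null));

		// Output: the 3 original tokens plus the 3 index-pair features "0_1", "0_2", "1_2".
		FeatureSequence out = (FeatureSequence) piped.getData();
		System.out.println("convolved length = " + out.getLength());  // expected 6
	}
}

With a document of n tokens the pipe appends n-choose-2 pairwise index features after the original unigrams, so getLength() on the output grows quadratically with the input length.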

Example 2: trimTopics

import cc.mallet.types.FeatureSequence; // import the package/class required by this method
private void trimTopics() {

		 //System.out.println("start trim");
        int[] new_nk = IndexQuickSort.sort(nk);
        IndexQuickSort.reverse(new_nk);

        //remove empty topic
        IndexQuickSort.reorder(nk, new_nk);
        IndexQuickSort.reorder(nkt, new_nk);

        for (int i = 0; i < kgaps.size(); i++) {
            nk.remove(nk.size() - 1);
            nkt.remove(nkt.size() - 1);
        }

        for (int m = 0; m < numDocuments; m++) {
            IndexQuickSort.reorder(nmk[m], new_nk);
            //remove gaps
            for (int i = 0; i < kgaps.size(); i++) {
                nmk[m].remove(nmk[m].size() - 1);
            }
        }
        //clean kgaps
        kgaps.clear();
        int[] k2knew = IndexQuickSort.inverse(new_nk);

        //rewrite topic
        for (int i = 0; i < K; i++) {
            kactive.set(i, k2knew[kactive.get(i)]);
        }

        for (int m = 0; m < numDocuments; m++) {
            FeatureSequence fs = (FeatureSequence) instances.get(m).getData();

            for (int n = 0; n < fs.getLength(); n++) {
                z[m][n] = k2knew[z[m][n]];
            }
        }

    }
 
Developer ID: hmetaxa, Project: MixLDA, Lines of code: 41, Source file: HDP.java

Example 3: getPreplexity

import cc.mallet.types.FeatureSequence; // import the package/class required by this method
public double getPreplexity(){
 
 //TODO
 double preplexity=0.0;
 double logLik=0.0;
 double[][] theta = new double[numDocs][K];
 
 //calculate theta
 for(int m=0 ; m<numDocs ; m++){
	 for(int k=0 ; k< K ; k++){
		 theta[m][k]= ((double)nmk[m][k]+alpha)/(effectiveDocLength[m]+K*alpha);
	 }
 }
 
 //calculate LL
 for(int m=0 ; m<numDocs ; m++){
	 
	 FeatureSequence fs = (FeatureSequence) testInstances.get(m).getData();
	 int seqLen = fs.getLength();
	 int type, token;
	 
	 for(token=0 ; token < seqLen ; token++){
		 
		 type = fs.getIndexAtPosition(token);
		 // only consider types that exist in the training vocabulary
		 if(type < numTypes){
			 double sum =0.0;
			 for(int k=0 ; k<K ; k++){
				 sum += (theta[m][k]*phi[k][type]);
			 }//end k
			 logLik += Math.log(sum);
		 }//end if
	}//end token
 }//end m
 
 preplexity = Math.exp((-1) * logLik / totalTokens);
 
 return preplexity;
}
 
Developer ID: hmetaxa, Project: MixLDA, Lines of code: 40, Source file: HDPInferencer.java
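
For reference, the quantity computed above is the usual held-out perplexity under the point estimates θ and φ. Writing w_{m,n} for the type at position n of document m and L_m for effectiveDocLength[m] (notation introduced here, not in the source), the code corresponds to:

\[
\theta_{m,k} = \frac{n_{m,k} + \alpha}{L_m + K\alpha},
\qquad
\log \mathcal{L} = \sum_{m}\;\sum_{n\,:\,w_{m,n} < V} \log \sum_{k=1}^{K} \theta_{m,k}\,\phi_{k,\,w_{m,n}},
\qquad
\mathrm{perplexity} = \exp\!\left(-\frac{\log \mathcal{L}}{\mathrm{totalTokens}}\right)
\]

Here n_{m,k} is nmk[m][k] and V is numTypes; positions whose type index falls outside the training vocabulary are skipped, exactly as the if (type < numTypes) guard does.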

Example 4: sampleTopicsForOneTestDocAll

import cc.mallet.types.FeatureSequence; // import the package/class required by this method
private void sampleTopicsForOneTestDocAll(FeatureSequence tokenSequence,
		LabelSequence topicSequence) {
	// TODO Auto-generated method stub
	int[] oneDocTopics = topicSequence.getFeatures();

	TIntIntHashMap currentTypeTopicCounts;
	int type, oldTopic, newTopic;
	double tw;
	double[] topicWeights = new double[numTopics];
	double topicWeightsSum;
	int docLength = tokenSequence.getLength();

	//		populate topic counts
	int[] localTopicCounts = new int[numTopics];
	for (int ti = 0; ti < numTopics; ti++){
		localTopicCounts[ti] = 0;
	}
	for (int position = 0; position < docLength; position++) {
		localTopicCounts[oneDocTopics[position]] ++;
	}

	// Iterate over the positions (words) in the document
	for (int si = 0; si < docLength; si++) {
		type = tokenSequence.getIndexAtPosition(si);
		oldTopic = oneDocTopics[si];

		// Remove this token from all counts
		localTopicCounts[oldTopic] --;

		currentTypeTopicCounts = typeTopicCounts[type];
		assert(currentTypeTopicCounts.get(oldTopic) >= 0);

		if (currentTypeTopicCounts.get(oldTopic) == 1) {
			currentTypeTopicCounts.remove(oldTopic);
		}
		else {
			currentTypeTopicCounts.adjustValue(oldTopic, -1);
		}
		tokensPerTopic[oldTopic]--;

		// Build a distribution over topics for this token
		Arrays.fill (topicWeights, 0.0);
		topicWeightsSum = 0;

		for (int ti = 0; ti < numTopics; ti++) {
			tw = ((currentTypeTopicCounts.get(ti) + beta) / (tokensPerTopic[ti] + betaSum))
			      * ((localTopicCounts[ti] + alpha[ti])); // (/docLen-1+tAlpha); is constant across all topics
			topicWeightsSum += tw;
			topicWeights[ti] = tw;
		}
		// Sample a topic assignment from this distribution
		newTopic = random.nextDiscrete (topicWeights, topicWeightsSum);

		// Put that new topic into the counts
		oneDocTopics[si] = newTopic;
		currentTypeTopicCounts.adjustOrPutValue(newTopic, 1, 1);
		localTopicCounts[newTopic] ++;
		tokensPerTopic[newTopic]++;
	}
}
 
Developer ID: kostagiolasn, Project: NucleosomePatternClassifier, Lines of code: 61, Source file: LDAStream.java
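
Examples 4 through 12 below are variants of the same collapsed Gibbs sampling step for held-out documents; they differ mainly in the primitive int-int hash map class used for the type-topic counts. The unnormalized weight tw computed in the inner loop corresponds to the standard LDA conditional, with the document-length denominator dropped because it is constant across topics (as the inline comment notes):

\[
P(z_i = k \mid \mathbf{z}_{-i},\, w_i = w)
\;\propto\;
\frac{n^{-i}_{w,k} + \beta}{\,n^{-i}_{k} + \beta_{\mathrm{sum}}\,}
\left(n^{-i}_{m,k} + \alpha_k\right)
\]

Here n_{w,k} is currentTypeTopicCounts.get(k), n_k is tokensPerTopic[k], and n_{m,k} is localTopicCounts[k], each with the current token already decremented (the -i superscript); betaSum is presumably Vβ for vocabulary size V, although its initialization is not shown in the excerpt.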

Example 5: sampleTopicsForOneTestDoc

import cc.mallet.types.FeatureSequence; // import the package/class required by this method
private void sampleTopicsForOneTestDoc(FeatureSequence tokenSequence,
		LabelSequence topicSequence) {
	// TODO Auto-generated method stub
	int[] oneDocTopics = topicSequence.getFeatures();

	TIntIntHashMap currentTypeTopicCounts;
	int type, oldTopic, newTopic;
	double tw;
	double[] topicWeights = new double[numTopics];
	double topicWeightsSum;
	int docLength = tokenSequence.getLength();

	//		populate topic counts
	int[] localTopicCounts = new int[numTopics];
	for (int ti = 0; ti < numTopics; ti++){
		localTopicCounts[ti] = 0;
	}
	for (int position = 0; position < docLength; position++) {
		if(oneDocTopics[position] != -1) {
			localTopicCounts[oneDocTopics[position]] ++;
		}
	}

	// Iterate over the positions (words) in the document
	for (int si = 0; si < docLength; si++) {
		type = tokenSequence.getIndexAtPosition(si);
		oldTopic = oneDocTopics[si];
		if(oldTopic == -1) {
			continue;
		}

		// Remove this token from all counts
		localTopicCounts[oldTopic] --;
		currentTypeTopicCounts = typeTopicCounts[type];
		assert(currentTypeTopicCounts.get(oldTopic) >= 0);

		if (currentTypeTopicCounts.get(oldTopic) == 1) {
			currentTypeTopicCounts.remove(oldTopic);
		}
		else {
			currentTypeTopicCounts.adjustValue(oldTopic, -1);
		}
		tokensPerTopic[oldTopic]--;

		// Build a distribution over topics for this token
		Arrays.fill (topicWeights, 0.0);
		topicWeightsSum = 0;

		for (int ti = 0; ti < numTopics; ti++) {
			tw = ((currentTypeTopicCounts.get(ti) + beta) / (tokensPerTopic[ti] + betaSum))
			      * ((localTopicCounts[ti] + alpha[ti])); // (/docLen-1+tAlpha); is constant across all topics
			topicWeightsSum += tw;
			topicWeights[ti] = tw;
		}
		// Sample a topic assignment from this distribution
		newTopic = random.nextDiscrete (topicWeights, topicWeightsSum);

		// Put that new topic into the counts
		oneDocTopics[si] = newTopic;
		currentTypeTopicCounts.adjustOrPutValue(newTopic, 1, 1);
		localTopicCounts[newTopic] ++;
		tokensPerTopic[newTopic]++;
	}
}
 
Developer ID: kostagiolasn, Project: NucleosomePatternClassifier, Lines of code: 65, Source file: LDAStream.java

Example 6: sampleTopicsForOneDocWithTheta

import cc.mallet.types.FeatureSequence; // import the package/class required by this method
private void sampleTopicsForOneDocWithTheta(FeatureSequence tokenSequence,
		LabelSequence topicSequence, double[] topicDistribution) {
	// TODO Auto-generated method stub
	int[] oneDocTopics = topicSequence.getFeatures();

	TIntIntHashMap currentTypeTopicCounts;
	int type, oldTopic, newTopic;
	double tw;
	double[] topicWeights = new double[numTopics];
	double topicWeightsSum;
	int docLength = tokenSequence.getLength();
	
	// Iterate over the positions (words) in the document
	for (int si = 0; si < docLength; si++) {
		type = tokenSequence.getIndexAtPosition(si);
		oldTopic = oneDocTopics[si];
		if(oldTopic == -1) {
			continue;
		}

		currentTypeTopicCounts = typeTopicCounts[type];
		assert(currentTypeTopicCounts.get(oldTopic) >= 0);

		if (currentTypeTopicCounts.get(oldTopic) == 1) {
			currentTypeTopicCounts.remove(oldTopic);
		}
		else {
			currentTypeTopicCounts.adjustValue(oldTopic, -1);
		}
		tokensPerTopic[oldTopic]--;

		// Build a distribution over topics for this token
		Arrays.fill (topicWeights, 0.0);
		topicWeightsSum = 0;

		for (int ti = 0; ti < numTopics; ti++) {
			tw = ((currentTypeTopicCounts.get(ti) + beta) / (tokensPerTopic[ti] + betaSum))
			      * topicDistribution[ti]; // (/docLen-1+tAlpha); is constant across all topics
			topicWeightsSum += tw;
			topicWeights[ti] = tw;
		}
		// Sample a topic assignment from this distribution
		newTopic = random.nextDiscrete (topicWeights, topicWeightsSum);

		// Put that new topic into the counts
		oneDocTopics[si] = newTopic;
		currentTypeTopicCounts.adjustOrPutValue(newTopic, 1, 1);
		tokensPerTopic[newTopic]++;
	}
}
 
Developer ID: kostagiolasn, Project: NucleosomePatternClassifier, Lines of code: 51, Source file: LDAStream.java

Example 7: sampleTopicsForOneTestDocAll

import cc.mallet.types.FeatureSequence; // import the package/class required by this method
private void sampleTopicsForOneTestDocAll(FeatureSequence tokenSequence,
		LabelSequence topicSequence) {
	// TODO Auto-generated method stub
	int[] oneDocTopics = topicSequence.getFeatures();

	IntIntOpenHashMap currentTypeTopicCounts;
	int type, oldTopic, newTopic;
	double tw;
	double[] topicWeights = new double[numTopics];
	double topicWeightsSum;
	int docLength = tokenSequence.getLength();

	//		populate topic counts
	int[] localTopicCounts = new int[numTopics];
	for (int ti = 0; ti < numTopics; ti++){
		localTopicCounts[ti] = 0;
	}
	for (int position = 0; position < docLength; position++) {
		localTopicCounts[oneDocTopics[position]] ++;
	}

	// Iterate over the positions (words) in the document
	for (int si = 0; si < docLength; si++) {
		type = tokenSequence.getIndexAtPosition(si);
		oldTopic = oneDocTopics[si];

		// Remove this token from all counts
		localTopicCounts[oldTopic] --;

		currentTypeTopicCounts = typeTopicCounts[type];
		assert(currentTypeTopicCounts.get(oldTopic) >= 0);

		if (currentTypeTopicCounts.get(oldTopic) == 1) {
			currentTypeTopicCounts.remove(oldTopic);
		}
		else {
			currentTypeTopicCounts.addTo(oldTopic, -1);
		}
		tokensPerTopic[oldTopic]--;

		// Build a distribution over topics for this token
		Arrays.fill (topicWeights, 0.0);
		topicWeightsSum = 0;

		for (int ti = 0; ti < numTopics; ti++) {
			tw = ((currentTypeTopicCounts.get(ti) + beta) / (tokensPerTopic[ti] + betaSum))
			      * ((localTopicCounts[ti] + alpha[ti])); // (/docLen-1+tAlpha); is constant across all topics
			topicWeightsSum += tw;
			topicWeights[ti] = tw;
		}
		// Sample a topic assignment from this distribution
		newTopic = random.nextDiscrete (topicWeights, topicWeightsSum);

		// Put that new topic into the counts
		oneDocTopics[si] = newTopic;
		currentTypeTopicCounts.putOrAdd(newTopic, 1, 1);
		localTopicCounts[newTopic] ++;
		tokensPerTopic[newTopic]++;
	}
}
 
Developer ID: cmoen, Project: mallet, Lines of code: 61, Source file: LDAStream.java

Example 8: sampleTopicsForOneTestDoc

import cc.mallet.types.FeatureSequence; // import the package/class required by this method
private void sampleTopicsForOneTestDoc(FeatureSequence tokenSequence,
		LabelSequence topicSequence) {
	// TODO Auto-generated method stub
	int[] oneDocTopics = topicSequence.getFeatures();

	IntIntOpenHashMap currentTypeTopicCounts;
	int type, oldTopic, newTopic;
	double tw;
	double[] topicWeights = new double[numTopics];
	double topicWeightsSum;
	int docLength = tokenSequence.getLength();

	//		populate topic counts
	int[] localTopicCounts = new int[numTopics];
	for (int ti = 0; ti < numTopics; ti++){
		localTopicCounts[ti] = 0;
	}
	for (int position = 0; position < docLength; position++) {
		if(oneDocTopics[position] != -1) {
			localTopicCounts[oneDocTopics[position]] ++;
		}
	}

	// Iterate over the positions (words) in the document
	for (int si = 0; si < docLength; si++) {
		type = tokenSequence.getIndexAtPosition(si);
		oldTopic = oneDocTopics[si];
		if(oldTopic == -1) {
			continue;
		}

		// Remove this token from all counts
		localTopicCounts[oldTopic] --;
		currentTypeTopicCounts = typeTopicCounts[type];
		assert(currentTypeTopicCounts.get(oldTopic) >= 0);

		if (currentTypeTopicCounts.get(oldTopic) == 1) {
			currentTypeTopicCounts.remove(oldTopic);
		}
		else {
			currentTypeTopicCounts.addTo(oldTopic, -1);
		}
		tokensPerTopic[oldTopic]--;

		// Build a distribution over topics for this token
		Arrays.fill (topicWeights, 0.0);
		topicWeightsSum = 0;

		for (int ti = 0; ti < numTopics; ti++) {
			tw = ((currentTypeTopicCounts.get(ti) + beta) / (tokensPerTopic[ti] + betaSum))
			      * ((localTopicCounts[ti] + alpha[ti])); // (/docLen-1+tAlpha); is constant across all topics
			topicWeightsSum += tw;
			topicWeights[ti] = tw;
		}
		// Sample a topic assignment from this distribution
		newTopic = random.nextDiscrete (topicWeights, topicWeightsSum);

		// Put that new topic into the counts
		oneDocTopics[si] = newTopic;
		currentTypeTopicCounts.putOrAdd(newTopic, 1, 1);
		localTopicCounts[newTopic] ++;
		tokensPerTopic[newTopic]++;
	}
}
 
Developer ID: cmoen, Project: mallet, Lines of code: 65, Source file: LDAStream.java

Example 9: sampleTopicsForOneDocWithTheta

import cc.mallet.types.FeatureSequence; // import the package/class required by this method
private void sampleTopicsForOneDocWithTheta(FeatureSequence tokenSequence,
		LabelSequence topicSequence, double[] topicDistribution) {
	// TODO Auto-generated method stub
	int[] oneDocTopics = topicSequence.getFeatures();

	IntIntOpenHashMap currentTypeTopicCounts;
	int type, oldTopic, newTopic;
	double tw;
	double[] topicWeights = new double[numTopics];
	double topicWeightsSum;
	int docLength = tokenSequence.getLength();
	
	// Iterate over the positions (words) in the document
	for (int si = 0; si < docLength; si++) {
		type = tokenSequence.getIndexAtPosition(si);
		oldTopic = oneDocTopics[si];
		if(oldTopic == -1) {
			continue;
		}

		currentTypeTopicCounts = typeTopicCounts[type];
		assert(currentTypeTopicCounts.get(oldTopic) >= 0);

		if (currentTypeTopicCounts.get(oldTopic) == 1) {
			currentTypeTopicCounts.remove(oldTopic);
		}
		else {
			currentTypeTopicCounts.addTo(oldTopic, -1);
		}
		tokensPerTopic[oldTopic]--;

		// Build a distribution over topics for this token
		Arrays.fill (topicWeights, 0.0);
		topicWeightsSum = 0;

		for (int ti = 0; ti < numTopics; ti++) {
			tw = ((currentTypeTopicCounts.get(ti) + beta) / (tokensPerTopic[ti] + betaSum))
			      * topicDistribution[ti]; // (/docLen-1+tAlpha); is constant across all topics
			topicWeightsSum += tw;
			topicWeights[ti] = tw;
		}
		// Sample a topic assignment from this distribution
		newTopic = random.nextDiscrete (topicWeights, topicWeightsSum);

		// Put that new topic into the counts
		oneDocTopics[si] = newTopic;
		currentTypeTopicCounts.putOrAdd(newTopic, 1, 1);
		tokensPerTopic[newTopic]++;
	}
}
 
Developer ID: cmoen, Project: mallet, Lines of code: 51, Source file: LDAStream.java

Example 10: sampleTopicsForOneTestDocAll

import cc.mallet.types.FeatureSequence; // import the package/class required by this method
private void sampleTopicsForOneTestDocAll(FeatureSequence tokenSequence,
		LabelSequence topicSequence) {
	// TODO Auto-generated method stub
	int[] oneDocTopics = topicSequence.getFeatures();

	IntIntHashMap currentTypeTopicCounts;
	int type, oldTopic, newTopic;
	double tw;
	double[] topicWeights = new double[numTopics];
	double topicWeightsSum;
	int docLength = tokenSequence.getLength();

	//		populate topic counts
	int[] localTopicCounts = new int[numTopics];
	for (int ti = 0; ti < numTopics; ti++){
		localTopicCounts[ti] = 0;
	}
	for (int position = 0; position < docLength; position++) {
		localTopicCounts[oneDocTopics[position]] ++;
	}

	// Iterate over the positions (words) in the document
	for (int si = 0; si < docLength; si++) {
		type = tokenSequence.getIndexAtPosition(si);
		oldTopic = oneDocTopics[si];

		// Remove this token from all counts
		localTopicCounts[oldTopic] --;

		currentTypeTopicCounts = typeTopicCounts[type];
		assert(currentTypeTopicCounts.get(oldTopic) >= 0);

		if (currentTypeTopicCounts.get(oldTopic) == 1) {
			currentTypeTopicCounts.remove(oldTopic);
		}
		else {
			currentTypeTopicCounts.addTo(oldTopic, -1);
		}
		tokensPerTopic[oldTopic]--;

		// Build a distribution over topics for this token
		Arrays.fill (topicWeights, 0.0);
		topicWeightsSum = 0;

		for (int ti = 0; ti < numTopics; ti++) {
			tw = ((currentTypeTopicCounts.get(ti) + beta) / (tokensPerTopic[ti] + betaSum))
			      * ((localTopicCounts[ti] + alpha[ti])); // (/docLen-1+tAlpha); is constant across all topics
			topicWeightsSum += tw;
			topicWeights[ti] = tw;
		}
		// Sample a topic assignment from this distribution
		newTopic = random.nextDiscrete (topicWeights, topicWeightsSum);

		// Put that new topic into the counts
		oneDocTopics[si] = newTopic;
		currentTypeTopicCounts.putOrAdd(newTopic, 1, 1);
		localTopicCounts[newTopic] ++;
		tokensPerTopic[newTopic]++;
	}
}
 
Developer ID: mimno, Project: Mallet, Lines of code: 61, Source file: LDAStream.java

Example 11: sampleTopicsForOneTestDoc

import cc.mallet.types.FeatureSequence; // import the package/class required by this method
private void sampleTopicsForOneTestDoc(FeatureSequence tokenSequence,
		LabelSequence topicSequence) {
	// TODO Auto-generated method stub
	int[] oneDocTopics = topicSequence.getFeatures();

	IntIntHashMap currentTypeTopicCounts;
	int type, oldTopic, newTopic;
	double tw;
	double[] topicWeights = new double[numTopics];
	double topicWeightsSum;
	int docLength = tokenSequence.getLength();

	//		populate topic counts
	int[] localTopicCounts = new int[numTopics];
	for (int ti = 0; ti < numTopics; ti++){
		localTopicCounts[ti] = 0;
	}
	for (int position = 0; position < docLength; position++) {
		if(oneDocTopics[position] != -1) {
			localTopicCounts[oneDocTopics[position]] ++;
		}
	}

	// Iterate over the positions (words) in the document
	for (int si = 0; si < docLength; si++) {
		type = tokenSequence.getIndexAtPosition(si);
		oldTopic = oneDocTopics[si];
		if(oldTopic == -1) {
			continue;
		}

		// Remove this token from all counts
		localTopicCounts[oldTopic] --;
		currentTypeTopicCounts = typeTopicCounts[type];
		assert(currentTypeTopicCounts.get(oldTopic) >= 0);

		if (currentTypeTopicCounts.get(oldTopic) == 1) {
			currentTypeTopicCounts.remove(oldTopic);
		}
		else {
			currentTypeTopicCounts.addTo(oldTopic, -1);
		}
		tokensPerTopic[oldTopic]--;

		// Build a distribution over topics for this token
		Arrays.fill (topicWeights, 0.0);
		topicWeightsSum = 0;

		for (int ti = 0; ti < numTopics; ti++) {
			tw = ((currentTypeTopicCounts.get(ti) + beta) / (tokensPerTopic[ti] + betaSum))
			      * ((localTopicCounts[ti] + alpha[ti])); // (/docLen-1+tAlpha); is constant across all topics
			topicWeightsSum += tw;
			topicWeights[ti] = tw;
		}
		// Sample a topic assignment from this distribution
		newTopic = random.nextDiscrete (topicWeights, topicWeightsSum);

		// Put that new topic into the counts
		oneDocTopics[si] = newTopic;
		currentTypeTopicCounts.putOrAdd(newTopic, 1, 1);
		localTopicCounts[newTopic] ++;
		tokensPerTopic[newTopic]++;
	}
}
 
Developer ID: mimno, Project: Mallet, Lines of code: 65, Source file: LDAStream.java

Example 12: sampleTopicsForOneDocWithTheta

import cc.mallet.types.FeatureSequence; // import the package/class required by this method
private void sampleTopicsForOneDocWithTheta(FeatureSequence tokenSequence,
		LabelSequence topicSequence, double[] topicDistribution) {
	// TODO Auto-generated method stub
	int[] oneDocTopics = topicSequence.getFeatures();

	IntIntHashMap currentTypeTopicCounts;
	int type, oldTopic, newTopic;
	double tw;
	double[] topicWeights = new double[numTopics];
	double topicWeightsSum;
	int docLength = tokenSequence.getLength();
	
	// Iterate over the positions (words) in the document
	for (int si = 0; si < docLength; si++) {
		type = tokenSequence.getIndexAtPosition(si);
		oldTopic = oneDocTopics[si];
		if(oldTopic == -1) {
			continue;
		}

		currentTypeTopicCounts = typeTopicCounts[type];
		assert(currentTypeTopicCounts.get(oldTopic) >= 0);

		if (currentTypeTopicCounts.get(oldTopic) == 1) {
			currentTypeTopicCounts.remove(oldTopic);
		}
		else {
			currentTypeTopicCounts.addTo(oldTopic, -1);
		}
		tokensPerTopic[oldTopic]--;

		// Build a distribution over topics for this token
		Arrays.fill (topicWeights, 0.0);
		topicWeightsSum = 0;

		for (int ti = 0; ti < numTopics; ti++) {
			tw = ((currentTypeTopicCounts.get(ti) + beta) / (tokensPerTopic[ti] + betaSum))
			      * topicDistribution[ti]; // (/docLen-1+tAlpha); is constant across all topics
			topicWeightsSum += tw;
			topicWeights[ti] = tw;
		}
		// Sample a topic assignment from this distribution
		newTopic = random.nextDiscrete (topicWeights, topicWeightsSum);

		// Put that new topic into the counts
		oneDocTopics[si] = newTopic;
		currentTypeTopicCounts.putOrAdd(newTopic, 1, 1);
		tokensPerTopic[newTopic]++;
	}
}
 
Developer ID: mimno, Project: Mallet, Lines of code: 51, Source file: LDAStream.java

Example 13: randomAssignTopics

import cc.mallet.types.FeatureSequence; // import the package/class required by this method
private void randomAssignTopics() {

        //uniform multinomial distribution for initial assignment
        for (int kk = 0; kk < K; kk++) {
            //equal probability for each topic
            pp[kk] = 1.0 / K;
        }

        for (int m = 0; m < numDocuments; m++) {

            FeatureSequence fs = (FeatureSequence) instances.get(m).getData();
            int seqLen = fs.getLength();
            int type, token, k;
            double sum;

            for (token = 0; token < seqLen; token++) {

                type = fs.getIndexAtPosition(token);

                int u = rand.nextInt(K);
                //assign topics
                k = kactive.get(u);
                z[m][token] = k;
                //add z back
                nmk[m].set(k, nmk[m].get(k) + 1);
                nkt.get(k)[type]++;
                nk.set(k, nk.get(k) + 1);
            }
        }

        //remove empty topics if the number of topics is not fixed
        if (!fixedK) {
            for (int k = 0; k < nk.size(); k++) {
                if (nk.get(k) == 0) {
                    kactive.remove((Integer) k);
                    kgaps.add(k);
                    assert (Vectors.sum(nkt.get(k)) == 0);
                    K--;
                    updateTau();
                }
            }
        }
    }
 
Developer ID: hmetaxa, Project: MixLDA, Lines of code: 44, Source file: HDP.java


Note: The cc.mallet.types.FeatureSequence.getLength examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright of the source code remains with the original authors. When sharing or using the code, please follow the corresponding project's license; do not republish without permission.