本文整理汇总了 Java 中 cc.mallet.types.IDSorter 类的典型用法代码示例。如果您正苦于以下问题：Java IDSorter 类的具体用法？Java IDSorter 怎么用？Java IDSorter 使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
IDSorter 类属于 cc.mallet.types 包，在下文中一共展示了 IDSorter 类的 15 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Java 代码示例。
示例1: topicXMLReport
import cc.mallet.types.IDSorter; //导入依赖的package包/类
/**
 * Writes an XML report listing the leading words of every topic.
 *
 * <p>Each topic becomes a {@code <topic>} element carrying its id, its
 * {@code alpha} entry and its {@code tokensPerTopic} entry, followed by one
 * {@code <word>} element per reported word. Words are taken in the iteration
 * order of the sets returned by {@code getSortedWords()} and are ranked
 * starting from 1.
 *
 * @param out      destination writer; it is neither flushed nor closed here
 * @param numWords maximum number of words to report per topic
 */
public void topicXMLReport (PrintWriter out, int numWords) {
    ArrayList<TreeSet<IDSorter>> topicSortedWords = getSortedWords();
    out.println("<?xml version='1.0' ?>");
    out.println("<topicModel>");
    for (int topic = 0; topic < numTopics; topic++) {
        out.println(" <topic id='" + topic + "' alpha='" + alpha[topic] +
                "' totalTokens='" + tokensPerTopic[topic] + "'>");
        // Ranks are 1-based, so the bound has to be inclusive; the previous
        // strict comparison stopped one word short of numWords.
        int rank = 1;
        Iterator<IDSorter> iterator = topicSortedWords.get(topic).iterator();
        while (iterator.hasNext() && rank <= numWords) {
            IDSorter info = iterator.next();
            // NOTE(review): the word is written verbatim; if the alphabet can
            // contain '<' or '&' the output is not well-formed XML - confirm.
            out.println(" <word rank='" + rank + "'>" +
                    alphabet.lookupObject(info.getID()) +
                    "</word>");
            rank++;
        }
        out.println(" </topic>");
    }
    out.println("</topicModel>");
}
示例2: printTopicWords
import cc.mallet.types.IDSorter; //导入依赖的package包/类
/**
 * Prints up to ten leading words of one topic to standard output.
 *
 * <p>With {@code addTab} set, the line is tab-prefixed, includes the score,
 * and is printed only when {@code score <= thresholdSim}. Otherwise the line
 * is always printed with the {@code MainTopicId=} prefix.
 *
 * @param addTab           selects the indented "related topic" format
 * @param topicId          index into {@code topicSortedWords}
 * @param topicSortedWords per-topic word sets; words are taken in iteration order
 * @param dataAlphabet     maps word ids to the objects that are printed
 * @param score            score shown (and filtered on) in the indented format
 * @param thresholdSim     upper bound on {@code score} for the indented format
 */
public static void printTopicWords(boolean addTab,int topicId,ArrayList<TreeSet<IDSorter>> topicSortedWords,BigAlphabet dataAlphabet,double score,double thresholdSim){
    Iterator<IDSorter> iterator = topicSortedWords.get(topicId).iterator();
    // One formatter is enough; the original allocated a second one and
    // threw the first away unused.
    Formatter out = new Formatter(new StringBuilder(), Locale.US);
    int rank = 0;
    while (iterator.hasNext() && rank < 10) {
        IDSorter idCountPair = iterator.next();
        out.format("%s ", dataAlphabet.lookupObject(idCountPair.getID()));
        rank++;
    }
    if (addTab) {
        if (score <= thresholdSim) {
            System.out.println("\t TopicId=" +topicId+ ",Score="+ score+ "-"+out);
        }
    } else {
        System.out.println("MainTopicId=" +topicId+ " "+ out);
    }
}
示例3: main
import cc.mallet.types.IDSorter; //导入依赖的package包/类
/**
 * Entry point: feeds the sample "wikipedia" directory to a
 * {@code MalletLDAWrapper}, clusters it, and prints one tab-separated line
 * per result (id, name, then each topic as {@code id(weight)}).
 *
 * @param args command-line arguments (not read)
 * @throws Exception propagated from the wrapper calls
 */
public static void main(String[] args) throws Exception {
    // Configuration
    final String keysFile = Env.SAMPLE_DIR + "keys.txt";
    final String docsDir = Env.SAMPLE_DIR + "wikipedia";
    final int topicCount = 5;
    final int iterationCount = 10000;

    // Load the input documents
    System.out.println("-Input--------------------------------------------------------------------------");
    MalletLDAWrapper wrapper = new MalletLDAWrapper(keysFile);
    wrapper.setInputData(docsDir);

    // Run the clustering and report each result
    System.out.println("-Clustering---------------------------------------------------------------------");
    MalletLDAWrapper.DEFAULT_NUM_KEYWORDS = 50;
    for (LDAResult result : wrapper.cluster(topicCount, iterationCount)) {
        List<String> topicScores = new ArrayList<String>();
        for (IDSorter scoredTopic : result.outputs) {
            String formatted = String.format("%d(%.4f)", scoredTopic.getID(), scoredTopic.getWeight());
            topicScores.add(formatted);
        }
        // Keep only the last path segment of a "file:/..." name.
        result.name = result.name.replaceAll("file:/.+/(.+)", "$1");
        System.out.println(JString.join("\t", result.id, result.name, JString.join("\t", topicScores)));
    }
}
示例4: writeTermWeightList
import cc.mallet.types.IDSorter; //导入依赖的package包/类
/**
 * Writes every term of a {@code TermWeight} to a text file, one
 * {@code rank,term,weight} line per term.
 *
 * <p>Terms are placed in a {@code TreeSet<IDSorter>} and written in the
 * reverse of that set's ordering; the rank column counts up from 1.
 *
 * @param termWeight     supplies the type alphabet and the per-type weights
 * @param outputFileName path of the file to create or overwrite
 * @throws IOException if the file cannot be opened or a write fails
 */
public static void writeTermWeightList(TermWeight termWeight,String outputFileName) throws IOException {
    PrintStream out = new PrintStream(new File(outputFileName));
    try {
        Alphabet typeAlphabet = termWeight.typeAlphabet;
        double[] typeTermWeight = termWeight.typeWeight;
        int numTypes = typeAlphabet.size();
        TreeSet<IDSorter> sortedWords = new TreeSet<IDSorter>();
        for (int type = 0; type < numTypes; type++) {
            sortedWords.add(new IDSorter(type, typeTermWeight[type]));
        }
        int word = 1;
        Iterator<IDSorter> iterator = sortedWords.descendingIterator();
        while (iterator.hasNext()) {
            IDSorter info = iterator.next();
            out.println(word+","+typeAlphabet.lookupObject(info.getID()) + "," +
                    info.getWeight());
            word++;
        }
        // PrintStream never throws on write failure; surface it explicitly
        // so a truncated file is not mistaken for a complete one.
        if (out.checkError()) {
            throw new IOException("Failed writing term weights to " + outputFileName);
        }
    } finally {
        // Release the file handle even when the loop above throws.
        out.close();
    }
}
示例5: exportTopWords
import cc.mallet.types.IDSorter; //导入依赖的package包/类
/**
 * Writes the leading words of every topic to a text file, one
 * {@code topic,word,weight} line per word.
 *
 * <p>For each topic index in {@code nk}, all {@code numTypes} types are
 * wrapped in {@code IDSorter}s weighted by {@code nkt.get(k)[type]}, sorted
 * with {@code Arrays.sort}, and the first entries are written.
 *
 * @param outFileName path of the file to create or overwrite
 * @param topWords    maximum number of words to write per topic; values
 *                    larger than {@code numTypes} are capped
 * @throws Exception if the file cannot be opened or written
 */
public void exportTopWords(String outFileName, int topWords) throws Exception {
    FileWriter fstream = new FileWriter(outFileName);
    BufferedWriter out = new BufferedWriter(fstream);
    try {
        // Never index past the sorted array when fewer types exist than requested.
        int limit = Math.min(topWords, numTypes);
        for (int k = 0; k < nk.size(); k++) {
            IDSorter[] sortedTypes = new IDSorter[numTypes];
            // Rank the words of topic k.
            for (int type = 0; type < numTypes; type++) {
                sortedTypes[type] = new IDSorter(type, nkt.get(k)[type]);
            }
            Arrays.sort(sortedTypes);
            Alphabet alphabet = instances.getDataAlphabet();
            for (int i = 0; i < limit; i++) {
                out.append(k + "," + alphabet.lookupObject(sortedTypes[i].getID()) + ","
                        + sortedTypes[i].getWeight() + "\n");
            }
        }
    } finally {
        // Closing the buffered writer also flushes and closes the FileWriter.
        out.close();
    }
}
示例6: getTopWords
import cc.mallet.types.IDSorter; //导入依赖的package包/类
/**
 * Collects the leading words of every topic.
 *
 * <p>For each topic the words are read from the corresponding set returned
 * by {@code getSortedWords()}, in that set's iteration order, and looked up
 * in {@code alphabet}. The entries are returned as Objects, but will
 * probably be Strings.
 *
 * @param numWords The maximum length of each topic's array of words (may be less).
 * @return one array per topic; a topic's array is shorter than
 *         {@code numWords} when its sorted set holds fewer entries
 */
public Object[][] getTopWords(int numWords) {
    ArrayList<TreeSet<IDSorter>> sortedByTopic = getSortedWords();
    Object[][] topWords = new Object[ numTopics ][];

    for (int topic = 0; topic < numTopics; topic++) {
        TreeSet<IDSorter> ranked = sortedByTopic.get(topic);

        // A topic may hold fewer entries than were requested.
        int count = Math.min(numWords, ranked.size());
        Object[] words = new Object[count];

        Iterator<IDSorter> cursor = ranked.iterator();
        int slot = 0;
        while (slot < count) {
            words[slot] = alphabet.lookupObject(cursor.next().getID());
            slot++;
        }
        topWords[topic] = words;
    }

    return topWords;
}
示例7: printSimilarTopics
import cc.mallet.types.IDSorter; //导入依赖的package包/类
/**
 * Prints every topic in {@code sims} followed by its related topics.
 *
 * <p>Each key of {@code sims} is printed through
 * {@code printTopicWords(false, ...)}. Its related topics are then visited
 * in descending key order, and those mapped to a value greater than zero
 * are passed to {@code printTopicWords(true, ...)} together with that value
 * and {@code thresholdSim}.
 *
 * @param sims             topic id mapped to (related topic id mapped to similarity)
 * @param topicSortedWords per-topic word sets handed on to {@code printTopicWords}
 * @param dataAlphabet     alphabet handed on to {@code printTopicWords}
 * @param thresholdSim     threshold handed on for the related topics
 */
public static void printSimilarTopics(Map<Integer,SortedMap<Integer,Double>> sims,ArrayList<TreeSet<IDSorter>> topicSortedWords,BigAlphabet dataAlphabet,double thresholdSim){
    for (Map.Entry<Integer,SortedMap<Integer,Double>> entry : sims.entrySet()) {
        int topic = entry.getKey();
        printTopicWords(false,topic,topicSortedWords,dataAlphabet,2.0,0);

        // The declared value type is SortedMap, so a blind cast to TreeMap can
        // fail; copy into a TreeMap when the caller supplied something else.
        SortedMap<Integer,Double> related = entry.getValue();
        TreeMap<Integer,Double> relTopics = (related instanceof TreeMap)
                ? (TreeMap<Integer,Double>) related
                : new TreeMap<Integer,Double>(related);

        for (Integer relatedTopic : relTopics.descendingKeySet()) {
            // Look the similarity up once instead of once per use.
            double similarity = relTopics.get(relatedTopic);
            if (similarity > 0) {
                printTopicWords(true,relatedTopic,topicSortedWords,dataAlphabet,similarity,thresholdSim);
            }
        }
    }
}
示例8: getTopicWords
import cc.mallet.types.IDSorter; //导入依赖的package包/类
/**
 * Builds, for every topic of {@code _model}, a map from word to normalized
 * weight restricted to at most {@code w} words.
 *
 * <p>A word's normalized weight is its {@code IDSorter} weight divided by
 * the sum of all weights in that topic's sorted set. The words are ordered
 * by {@code Sorter.sort} and the first entries of that list are kept.
 *
 * @param w maximum number of words to keep per topic
 * @return topic index mapped to (word mapped to normalized weight); the
 *         inner maps are {@code HashMap}s and carry no ordering
 */
private Map<Integer, Map<String, Double>> getTopicWords(int w) {
    Map<Integer, Map<String, Double>> map = new HashMap<Integer, Map<String, Double>>();
    Alphabet topicAlphabet = _model.getAlphabet();
    ArrayList<TreeSet<IDSorter>> topicSortedWords = _model.getSortedWords();
    for (int topic = 0; topic < _model.numTopics; topic++) {
        TreeSet<IDSorter> set = topicSortedWords.get(topic);

        // Total weight of the topic, used as the normalizer below.
        double sum = 0.0;
        for (IDSorter s : set) {
            sum += s.getWeight();
        }

        Map<String, Double> words = new HashMap<String, Double>();
        for (IDSorter idSorter : set) {
            double weight = idSorter.getWeight() / sum;
            String word = (String) topicAlphabet.lookupObject(idSorter.getID());
            words.put(word, weight);
        }

        // Sort by weight (per the original comment; Sorter is defined elsewhere).
        List<Entry<String, Double>> sortedWords = Sorter.sort(words);

        // Clamp per topic in a local. Overwriting w here made one small topic
        // permanently shrink the limit for every topic processed after it.
        int limit = Math.min(w, sortedWords.size());
        Map<String, Double> temp = new HashMap<String, Double>();
        for (int i = 0; i < limit; i++) {
            Entry<String, Double> first = sortedWords.get(i);
            temp.put(first.getKey(), first.getValue());
        }
        map.put(topic, temp);
    }
    return map;
}
示例9: LDAResult
import cc.mallet.types.IDSorter; //导入依赖的package包/类
public LDAResult(int id, String name, FeatureSequence input, IDSorter[] outputs) {
this.id = id;
this.name = name;
this.input = input;
this.outputs = new ArrayList<IDSorter>();
for (IDSorter output : outputs)
this.outputs.add(new IDSorter(output.getID(), output.getWeight()));
}
示例10: testMalletLDAWrapper
import cc.mallet.types.IDSorter; //导入依赖的package包/类
/**
 * Runs {@code MalletLDAWrapper} over six short country descriptions.
 *
 * <p>Checks that all six entries are loaded, that clustering into two topics
 * yields six results, and that the keys file exists afterwards; then prints
 * each result with its topics formatted as {@code id(weight)}.
 *
 * @throws IOException propagated from the wrapper calls
 */
public void testMalletLDAWrapper() throws IOException {
    System.out.println("\n----- testMalletLDAWrapper() ------------------------------");

    // Sample documents
    String koreaText = "Korea called Hanguk in South Korea and Chosŏn in North Korea, is an East Asian territory that is divided into two distinct sovereign states, North Korea and South Korea. Located on the Korean Peninsula, Korea is bordered by China to the northwest and Russia to the northeast. It is separated from Japan to the east by the Korea Strait and the Sea of Japan (East Sea). The adoption of the Chinese writing system in the 2nd century BC and the introduction of Buddhism in the 4th century AD had profound effects on the Three Kingdoms of Korea, which was first united during the Silla (57 BC – AD 935) under the King Munmu. The united Silla fell to Goryeo in 935 at the end of the Later Three Kingdoms. Goryeo was a highly cultured state and created the Jikji in the 14th century. The invasions by the Mongolians in the 13th century, however, greatly weakened the nation, which was forced to become a tributary state. After the Mongol Empire's collapse, severe political strife followed. The Ming-allied Joseon emerged supreme in 1388.";
    String japanText = "Japan is an island nation in East Asia. Located in the Pacific Ocean, it lies to the east of the Sea of Japan, China, North Korea, South Korea and Russia, stretching from the Sea of Okhotsk in the north to the East China Sea and Taiwan in the south. The characters that make up Japan's name mean sun-origin, which is why Japan is often referred to as the Land of the Rising Sun.";
    String chinaText = "China is a sovereign state located in East Asia. It is the world's most populous country, with a population of over 1.35 billion. The PRC is a single-party state governed by the Communist Party, with its seat of government in the capital city of Beijing.[15] It exercises jurisdiction over 22 provinces, five autonomous regions, four direct-controlled municipalities (Beijing, Tianjin, Shanghai, and Chongqing), and two mostly self-governing special administrative regions (Hong Kong and Macau). The PRC also claims Taiwan – which is controlled by the Republic of China (ROC), a separate political entity – as its 23rd province, a claim which is controversial due to the complex political status of Taiwan.[16]";
    String italyText = "Italy is a unitary parliamentary republic in Southern Europe. To the north, Italy borders France, Switzerland, Austria, and Slovenia, and is approximately delimited by the Alpine watershed, enclosing the Po Valley and the Venetian Plain. To the south, it consists of the entirety of the Italian Peninsula and the two biggest Mediterranean islands of Sicily and Sardinia.";
    String franceText = "France is a sovereign country in Western Europe that includes several overseas regions and territories.[note 13] Metropolitan France extends from the Mediterranean Sea to the English Channel and the North Sea, and from the Rhine to the Atlantic Ocean. It is one of only three countries (with Morocco and Spain) to have both Atlantic and Mediterranean coastlines. Due to its shape, it is often referred to in French as l’Hexagone.";
    String englandText = "England is a country that is part of the United Kingdom.[2][3][4] It shares land borders with Scotland to the north and Wales to the west. The Irish Sea lies north west of England, whilst the Celtic Sea lies to the south west. The North Sea to the east and the English Channel to the south separate it from continental Europe. Most of England comprises the central and southern part of the island of Great Britain which lies in the North Atlantic. The country also includes over 100 smaller islands such as the Isles of Scilly, and the Isle of Wight.";

    JEntry[] documents = new JEntry[] {
        new JEntry("kr", koreaText),
        new JEntry("jp", japanText),
        new JEntry("cn", chinaText),
        new JEntry("it", italyText),
        new JEntry("fr", franceText),
        new JEntry("en", englandText)
    };

    // Configure and load the wrapper
    String keysPath = System.getProperty("user.home") + "/keys.txt";
    MalletLDAWrapper.DEFAULT_NUM_KEYWORDS = 20;
    MalletLDAWrapper wrapper = new MalletLDAWrapper(keysPath);
    wrapper.setInputData(documents);
    assertEquals(6, wrapper.data.size());

    // Cluster, then verify the side effects and the result count
    List<LDAResult> clustered = wrapper.cluster(2, 1000);
    assertEquals(true, keysPath == null || new File(keysPath).exists());
    assertEquals(6, clustered.size());

    for (LDAResult result : clustered) {
        List<String> topicScores = new ArrayList<String>();
        for (IDSorter scoredTopic : result.outputs) {
            topicScores.add(String.format("%d(%.4f)", scoredTopic.getID(), scoredTopic.getWeight()));
        }
        System.out.println(JString.join("\t", result.id, result.name, JString.join(", ", topicScores)));
    }
}
示例11: printDocumentTopics
import cc.mallet.types.IDSorter; //导入依赖的package包/类
/**
 * Writes one line per document: its index, its source (or
 * {@code null-source}), then topic/proportion pairs in the order produced by
 * {@code Arrays.sort} on the {@code IDSorter}s.
 *
 * <p>A topic's proportion is {@code (count + alpha[topic]) / (docLen + alphaSum)}.
 * Tokens whose topic is {@code -1} are not counted. {@code alphaSum} is
 * recomputed from {@code alpha} for every document. The writer is closed
 * before returning.
 *
 * @param dataset   documents together with their topic sequences
 * @param pw        A print writer; closed by this method
 * @param threshold Stop listing a document's topics at the first proportion below this number
 * @param max       Print no more than this many topics; a negative value or
 *                  one above {@code numTopics} means all topics
 */
public void printDocumentTopics (ArrayList<Topication> dataset, PrintWriter pw, double threshold, int max) {
    pw.print ("#doc source topic proportion ...\n");

    int[] countsPerTopic = new int[ numTopics ];
    IDSorter[] ranking = new IDSorter[ numTopics ];
    // Placeholder sorters; their values are overwritten for every document.
    for (int t = 0; t < numTopics; t++) {
        ranking[t] = new IDSorter(t, t);
    }

    if (max < 0 || max > numTopics) {
        max = numTopics;
    }

    int docIndex = 0;
    for (Topication entry : dataset) {
        LabelSequence sequence = entry.topicSequence;
        int[] assignments = sequence.getFeatures();

        pw.print (docIndex); pw.print (' ');
        if (entry.instance.getSource() == null) {
            pw.print ("null-source");
        }
        else {
            pw.print (entry.instance.getSource());
        }
        pw.print (' ');

        int docLen = assignments.length;

        // Tally tokens per topic, skipping unassigned (-1) tokens.
        int assigned = 0;
        for (int topicId : assignments) {
            if (topicId == -1) {
                continue;
            }
            countsPerTopic[topicId]++;
            assigned++;
        }
        assert(assigned == docLen);

        alphaSum=0.0;
        for (int t = 0; t < numTopics; t++) {
            alphaSum+=alpha[t];
        }

        // Normalize and smooth by the Dirichlet prior alpha.
        for (int t = 0; t < numTopics; t++) {
            ranking[t].set(t, (double) (countsPerTopic[t]+alpha[t]) / (docLen + alphaSum));
        }
        Arrays.sort(ranking);

        for (int i = 0; i < max; i++) {
            IDSorter ranked = ranking[i];
            if (ranked.getWeight() < threshold) { break; }
            pw.print (ranked.getID() + " " +
                    ranked.getWeight() + " ");
        }
        pw.print (" \n");

        // Reset the tallies for the next document.
        Arrays.fill(countsPerTopic, 0);
        docIndex++;
    }
    pw.close();
}
示例12: writeInferredDistributions
import cc.mallet.types.IDSorter; //导入依赖的package包/类
/**
 * Infer topics for the provided instances and
 * write distributions to the provided file.
 *
 * <p>One line is written per instance: a running index, the instance source
 * (or {@code null-source}), then topic/proportion pairs in the order
 * produced by {@code Arrays.sort} on the {@code IDSorter}s, stopping at the
 * first proportion below {@code threshold}.
 *
 * @param instances the instances to infer topics for
 * @param distributionsFile the file to create or overwrite
 * @param numIterations The total number of iterations of sampling per document
 * @param thinning The number of iterations between saved samples
 * @param burnIn The number of iterations before the first saved sample
 * @param threshold The minimum proportion of a given topic that will be written
 * @param max The total number of topics to report per document; a negative
 *            value or one above {@code numTopics} means all topics
 * @throws IOException if the file cannot be opened or a write fails
 */
public void writeInferredDistributions(BigInstanceList instances,
        File distributionsFile,
        int numIterations, int thinning, int burnIn,
        double threshold, int max) throws IOException {
    PrintWriter out = new PrintWriter(distributionsFile);
    try {
        out.print ("#doc source topic proportion ...\n");
        IDSorter[] sortedTopics = new IDSorter[ numTopics ];
        for (int topic = 0; topic < numTopics; topic++) {
            // Initialize the sorters with dummy values
            sortedTopics[topic] = new IDSorter(topic, topic);
        }
        if (max < 0 || max > numTopics) {
            max = numTopics;
        }
        int doc = 0;
        for (Instance instance: instances) {
            double[] topicDistribution =
                getSampledDistribution(instance, numIterations,
                        thinning, burnIn);
            out.print (doc); out.print (' ');
            // Print the Source field of the instance
            if (instance.getSource() != null) {
                out.print (instance.getSource());
            }
            else {
                out.print ("null-source");
            }
            out.print (' ');
            for (int topic = 0; topic < numTopics; topic++) {
                sortedTopics[topic].set(topic, topicDistribution[topic]);
            }
            Arrays.sort(sortedTopics);
            for (int i = 0; i < max; i++) {
                if (sortedTopics[i].getWeight() < threshold) { break; }
                out.print (sortedTopics[i].getID() + " " +
                        sortedTopics[i].getWeight() + " ");
            }
            out.print (" \n");
            doc++;
        }
        // PrintWriter never throws on write failure; surface it explicitly
        // so a truncated file is not mistaken for a complete one.
        if (out.checkError()) {
            throw new IOException("Failed writing distributions to " + distributionsFile);
        }
    } finally {
        // Release the file handle even when sampling or sorting throws.
        out.close();
    }
}
示例13: printDocumentTopics
import cc.mallet.types.IDSorter; //导入依赖的package包/类
/**
 * Writes one tab-separated line per document in {@code data}: its index, its
 * instance name (or {@code no-name}), then topic/proportion pairs in the
 * order produced by {@code Arrays.sort} on the {@code IDSorter}s.
 *
 * <p>A topic's proportion is {@code (alpha[topic] + count) / (docLen + alphaSum)}.
 * The writer is left open.
 *
 * @param out       A print writer
 * @param threshold Stop listing a document's topics at the first proportion below this number
 * @param max       Print no more than this many topics; a negative value or
 *                  one above {@code numTopics} means all topics
 */
public void printDocumentTopics (PrintWriter out, double threshold, int max) {
    out.print ("#doc name topic proportion ...\n");

    if (max < 0 || max > numTopics) {
        max = numTopics;
    }

    int[] countsPerTopic = new int[ numTopics ];
    IDSorter[] ranking = new IDSorter[ numTopics ];
    // Placeholder sorters; their values are overwritten for every document.
    for (int t = 0; t < numTopics; t++) {
        ranking[t] = new IDSorter(t, t);
    }

    for (int doc = 0; doc < data.size(); doc++) {
        TopicAssignment document = (TopicAssignment) data.get(doc);
        BigLabelSequence sequence = (BigLabelSequence) document.topicSequence;
        int[] assignments = sequence.getFeatures();

        StringBuilder line = new StringBuilder();
        line.append(doc).append("\t");
        if (document.instance.getName() == null) {
            line.append("no-name");
        }
        else {
            line.append(document.instance.getName());
        }
        line.append("\t");

        int docLen = assignments.length;

        // Tally tokens per topic.
        for (int topicId : assignments) {
            countsPerTopic[topicId]++;
        }

        // Normalize with the alpha smoothing term.
        for (int t = 0; t < numTopics; t++) {
            ranking[t].set(t, (alpha[t] + countsPerTopic[t]) / (docLen + alphaSum) );
        }
        Arrays.sort(ranking);

        for (int i = 0; i < max; i++) {
            IDSorter ranked = ranking[i];
            if (ranked.getWeight() < threshold) { break; }
            line.append(ranked.getID() + "\t" +
                    ranked.getWeight() + "\t");
        }
        out.println(line);

        // Reset the tallies for the next document.
        Arrays.fill(countsPerTopic, 0);
    }
}
示例14: printTopics
import cc.mallet.types.IDSorter; //导入依赖的package包/类
/**
 * Exports a topic model to four text files.
 *
 * <ul>
 *   <li>{@code writePathDocTopic}: a CSV with a {@code Class,Document,T0,...}
 *       header and one row of topic probabilities per entry of
 *       {@code textList}.</li>
 *   <li>{@code writePathTopicTerm}: one {@code TOPIC i: } line per topic
 *       listing every term of its sorted word set.</li>
 *   <li>A "short" sibling of the topic-term file, named by dropping the last
 *       four characters of its file name and appending
 *       {@code -T<maxCount>.txt}, holding only the leading terms.</li>
 *   <li>{@code writePathTopicTermMatrix}: like the topic-term file but with
 *       each term followed by its weight in parentheses.</li>
 * </ul>
 *
 * <p>All files are written as UTF-8 and every writer is closed even when an
 * export step fails.
 *
 * @param model                    the model to export
 * @param writePathDocTopic        path of the document-topic CSV
 * @param writePathTopicTerm       path of the topic-term file
 * @param writePathTopicTermMatrix path of the weighted topic-term file
 * @throws Exception if a file cannot be opened, written or closed
 */
public void printTopics(ParallelTopicModel model, String writePathDocTopic, String writePathTopicTerm, String writePathTopicTermMatrix) throws Exception {
    BufferedWriter writerDocTopic = null;
    BufferedWriter writerTopicTerm = null;
    BufferedWriter writerTopicTermShort = null;
    BufferedWriter writerTopicTermMatrix = null;
    try {
        writerDocTopic = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(writePathDocTopic), "UTF8"));
        writerTopicTerm = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(writePathTopicTerm), "UTF8"));
        File file = new File(writePathTopicTerm);
        // NOTE(review): assumes the file name ends in a 3-character extension;
        // shorter names make substring() throw - confirm against callers.
        String path = file.getName().substring(0, file.getName().length()-4) + "-T" + String.valueOf(maxCount) + ".txt";
        // Resolve to an absolute file first so a bare file name (no parent
        // component) yields the working directory instead of a null parent.
        File parentDir = file.getAbsoluteFile().getParentFile();
        // Same charset as the sibling writers; this one previously fell back
        // to the platform default.
        writerTopicTermShort = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(parentDir, path)), "UTF8"));
        writerTopicTermMatrix = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(writePathTopicTermMatrix), "UTF8"));

        /* Write header */
        writerDocTopic.write("Class,Document");
        for (int j = 0; j < model.numTopics; j++) {
            writerDocTopic.write(",T" + j);
        }
        writerDocTopic.newLine();

        /* Write document-topic probabilities to file */
        for (int i = 0; i < this.textList.size(); i++) {
            double[] topicProbs = model.getTopicProbabilities(i);
            String docName = this.idDocMapping.get(i);
            writerDocTopic.write(this.classDocMapping.get(docName) + ",");
            writerDocTopic.write(docName);
            for (int j = 0; j < topicProbs.length; j++) {
                writerDocTopic.write("," + topicProbs[j]);
            }
            writerDocTopic.newLine();
        }

        /* Write topic term associations */
        Alphabet alphabet = model.getAlphabet();
        // Fetch the sorted words once rather than on every loop test and body.
        ArrayList<TreeSet<IDSorter>> sortedWords = model.getSortedWords();
        for (int i = 0; i < sortedWords.size(); i++) {
            writerTopicTerm.write("TOPIC " + i + ": ");
            writerTopicTermShort.write("TOPIC " + i + ": ");
            writerTopicTermMatrix.write("TOPIC " + i + ": ");
            int count = 0;
            for (IDSorter s : sortedWords.get(i)) {
                Object term = alphabet.lookupObject(s.getID());
                // NOTE(review): the inclusive bound writes maxCount + 1 terms
                // to the short file; kept as-is - confirm whether intended.
                if (count <= maxCount) {
                    writerTopicTermShort.write(term + ", ");
                }
                count++;
                writerTopicTerm.write(term + ", ");
                writerTopicTermMatrix.write(term + " (" + s.getWeight() + "), ");
            }
            writerTopicTerm.newLine();
            writerTopicTermShort.newLine();
            writerTopicTermMatrix.newLine();
        }
    } finally {
        closeWriters(writerTopicTermMatrix, writerDocTopic, writerTopicTerm, writerTopicTermShort);
    }
}

/** Closes every non-null writer, then rethrows the first close failure, if any. */
private static void closeWriters(BufferedWriter... writers) throws java.io.IOException {
    java.io.IOException firstFailure = null;
    for (BufferedWriter writer : writers) {
        if (writer == null) {
            continue;
        }
        try {
            writer.close();
        } catch (java.io.IOException e) {
            // Keep closing the remaining writers; report the first problem afterwards.
            if (firstFailure == null) {
                firstFailure = e;
            }
        }
    }
    if (firstFailure != null) {
        throw firstFailure;
    }
}
示例15: printTopWord
import cc.mallet.types.IDSorter; //导入依赖的package包/类
/**
 * Prints a one-line summary of every topic to standard output, followed by
 * the total word count.
 *
 * <p>For a topic with a non-zero count in {@code nk} the line shows the
 * count, the matched topic from {@code kactive} (when the index is in
 * range), the count's share of {@code totalWord}, and the leading words
 * after {@code Arrays.sort} on the topic's {@code IDSorter}s. A topic with a
 * zero count is reported as matched or empty. Unless {@code fixedK} is set,
 * {@code trimTopics()} runs first.
 *
 * @param numWords maximum number of words to print per topic; values larger
 *                 than {@code numTypes} are capped
 */
public void printTopWord(int numWords) {
    //sort topic from largest to smallest
    if (!fixedK) {
        trimTopics();
    }
    // Never index past the sorted array when fewer types exist than requested.
    int limit = Math.min(numWords, numTypes);
    int wordCount = 0;
    for (int k = 0; k < nk.size(); k++) {
        if (nk.get(k) != 0) {
            int count = nk.get(k);
            // Accumulate for the total printed at the end.
            wordCount += count;
            IDSorter[] sortedTypes = new IDSorter[numTypes];
            // Rank the words of topic k.
            for (int type = 0; type < numTypes; type++) {
                sortedTypes[type] = new IDSorter(type, nkt.get(k)[type]);
            }
            Arrays.sort(sortedTypes);
            Alphabet alphabet = instances.getDataAlphabet();
            // The buffer is method-local, so the unsynchronized builder suffices.
            StringBuilder out = new StringBuilder();
            out.append("topic" + k + ": ");
            out.append("word:" + count + ", ");
            if (k < kactive.size()) {
                out.append("matched topic " + kactive.get(k) + ", ");
            }
            double prop = (double) count / totalWord;
            out.append(String.format("prop:%2.4f, ", prop));
            for (int i = 0; i < limit; i++) {
                out.append(alphabet.lookupObject(sortedTypes[i].getID()) + " ");
            }
            System.out.println(out);
        } else {
            if (k < kactive.size()) {
                System.out.println("Topic" + k + ": matched topic " + kactive.get(k));
            } else {
                System.out.println("Topic" + k + ": empty");
            }
        }
    }
    System.out.println("Total Word count: " + wordCount);
}