当前位置: 首页>>代码示例>>Java>>正文


Java LabelSequence类代码示例

本文整理汇总了Java中cc.mallet.types.LabelSequence的典型用法代码示例。如果您正苦于以下问题:Java LabelSequence类的具体用法?Java LabelSequence怎么用?Java LabelSequence使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。


LabelSequence类属于cc.mallet.types包,在下文中一共展示了LabelSequence类的14个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: printTheta

import cc.mallet.types.LabelSequence; //导入依赖的package包/类
/**
 * Writes the smoothed per-document topic proportions of every document in
 * {@code dataset} to the file {@code f}.
 *
 * <p>For each document the output contains the instance name on its own line,
 * then one line per topic of the form {@code topic<index>\t<probability>},
 * then an empty line. The probability is
 * {@code (n(t|d) + alpha[t]) / (docLen + alphaSum)}.
 *
 * @param dataset   documents whose topic assignments are printed
 * @param f         destination file; it is overwritten
 * @param threshold not used by this implementation
 * @param max       not used by this implementation
 * @throws IOException if the file cannot be opened for writing
 */
public void printTheta(ArrayList<Topication> dataset, File f, double threshold, int max) throws IOException{
	PrintWriter pw = new PrintWriter(new FileWriter(f));
	// try/finally guarantees the writer is released even if a document fails midway.
	try {
		int[] topicCounts = new int[ numTopics ];

		for (int di = 0; di < dataset.size(); di++) {
			Topication doc = dataset.get(di);
			int[] currentDocTopics = doc.topicSequence.getFeatures();
			int docLen = currentDocTopics.length;

			// Count how many tokens of this document are assigned to each topic.
			for (int token = 0; token < docLen; token++) {
				topicCounts[ currentDocTopics[token] ]++;
			}

			pw.println(doc.instance.getName());
			// n(t|d)+alpha(t) / docLen + alphaSum
			for (int topic = 0; topic < numTopics; topic++) {
				double prob = (double) (topicCounts[topic]+alpha[topic]) / (docLen + alphaSum);
				pw.println("topic"+ topic + "\t" + prob);
			}

			pw.println();
			// Reset the shared counter array before moving to the next document.
			Arrays.fill(topicCounts, 0);
		}
	} finally {
		pw.close();
	}
}
 
开发者ID:kostagiolasn,项目名称:NucleosomePatternClassifier,代码行数:25,代码来源:LDAStream.java

示例2: printState

import cc.mallet.types.LabelSequence; //导入依赖的package包/类
/**
 * Dumps the token-level topic assignments of {@code dataset} to {@code out}.
 *
 * <p>A header line is printed first, followed by one space-separated line per
 * token: document index, instance source (or {@code NA} when the source is
 * null), position, type index, the alphabet entry for that type, and the topic
 * index.
 *
 * @param dataset documents whose assignments are printed
 * @param out     stream receiving the dump
 */
public void printState (ArrayList<Topication> dataset, PrintStream out) {

	out.println ("#doc source pos typeindex type topic");

	final int numDocs = dataset.size();
	for (int docIndex = 0; docIndex < numDocs; docIndex++) {
		Topication doc = dataset.get(docIndex);
		FeatureSequence tokens = (FeatureSequence) doc.instance.getData();
		LabelSequence topics = doc.topicSequence;

		// Fall back to a placeholder when the instance carries no source.
		Object rawSource = doc.instance.getSource();
		String source = (rawSource != null) ? rawSource.toString() : "NA";

		final int length = topics.getLength();
		for (int position = 0; position < length; position++) {
			int type = tokens.getIndexAtPosition(position);
			int topic = topics.getIndexAtPosition(position);

			out.print(docIndex);
			out.print(' ');
			out.print(source);
			out.print(' ');
			out.print(position);
			out.print(' ');
			out.print(type);
			out.print(' ');
			out.print(alphabet.lookupObject(type));
			out.print(' ');
			out.print(topic);
			out.println();
		}
	}
}
 
开发者ID:kostagiolasn,项目名称:NucleosomePatternClassifier,代码行数:26,代码来源:LDAStream.java

示例3: testToXml

import cc.mallet.types.LabelSequence; //导入依赖的package包/类
/**
 * Builds a {@code DocumentExtraction} from a flat O/ANIMAL/VERB tagging of a
 * nine-token sentence and checks its XML rendering against the expected string.
 */
public void testToXml () {
  final String text = "the quick brown fox leapt over the lazy dog";
  LabelAlphabet labels = new LabelAlphabet ();
  StringTokenization tokens = new StringTokenization (text, new CharSequenceLexer ());

  // Labels are looked up in this order so the alphabet indices stay the same.
  Label other = labels.lookupLabel ("O");
  Label animal = labels.lookupLabel ("ANIMAL");
  Label verb = labels.lookupLabel ("VERB");

  Label[] perToken = { other, animal, animal, animal, verb, other, other, animal, animal };
  LabelSequence tagging = new LabelSequence (perToken);

  DocumentExtraction extraction = new DocumentExtraction ("Test", labels, tokens, tagging, "O");
  String produced = extraction.toXmlString();

  String wanted = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n"
          + "<doc>the <ANIMAL>quick brown fox </ANIMAL><VERB>leapt </VERB>over the <ANIMAL>lazy dog</ANIMAL></doc>\r\n";
  assertEquals (wanted, produced);
}
 
开发者ID:kostagiolasn,项目名称:NucleosomePatternClassifier,代码行数:17,代码来源:TestDocumentExtraction.java

示例4: testToXmlBIO

import cc.mallet.types.LabelSequence; //导入依赖的package包/类
/**
 * Builds a {@code DocumentExtraction} through a {@code BIOTokenizationFilter}
 * from a B-/I- style tagging and checks its XML rendering against the expected
 * string.
 */
public void testToXmlBIO () {
  final String text = "the quick brown fox leapt over the lazy dog";
  LabelAlphabet labels = new LabelAlphabet ();
  StringTokenization tokens = new StringTokenization (text, new CharSequenceLexer ());

  // Labels are looked up in this order so the alphabet indices stay the same.
  Label other = labels.lookupLabel ("O");
  Label beginAnimal = labels.lookupLabel ("B-ANIMAL");
  Label animal = labels.lookupLabel ("ANIMAL");
  Label beginVerb = labels.lookupLabel ("B-VERB");
  Label insideVerb = labels.lookupLabel ("I-VERB");

  Label[] perToken = {
    other, beginAnimal, animal, beginAnimal, beginVerb, insideVerb, other, animal, animal
  };
  LabelSequence tagging = new LabelSequence (perToken);

  DocumentExtraction extraction =
      new DocumentExtraction ("Test", labels, tokens, tagging, null, "O", new BIOTokenizationFilter());
  String produced = extraction.toXmlString();

  String wanted = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n"
          + "<doc>the <ANIMAL>quick brown </ANIMAL><ANIMAL>fox </ANIMAL><VERB>leapt over </VERB>the <ANIMAL>lazy dog</ANIMAL></doc>\r\n";
  assertEquals (wanted, produced);
}
 
开发者ID:kostagiolasn,项目名称:NucleosomePatternClassifier,代码行数:19,代码来源:TestDocumentExtraction.java

示例5: ignoretestToXml

import cc.mallet.types.LabelSequence; //导入依赖的package包/类
/**
 * Builds a {@code DocumentExtraction} from a flat O/ANIMAL/VERB tagging of a
 * nine-token sentence and compares its XML rendering with the expected string.
 */
public void ignoretestToXml () {
  final String sentence = "the quick brown fox leapt over the lazy dog";
  LabelAlphabet alphabet = new LabelAlphabet ();
  StringTokenization tokenization = new StringTokenization (sentence, new CharSequenceLexer ());

  // Lookup order is kept so each label receives the same alphabet index.
  Label outside = alphabet.lookupLabel ("O");
  Label animal = alphabet.lookupLabel ("ANIMAL");
  Label verb = alphabet.lookupLabel ("VERB");

  Label[] assignment = { outside, animal, animal, animal, verb, outside, outside, animal, animal };
  LabelSequence labelSequence = new LabelSequence (assignment);

  DocumentExtraction extraction =
      new DocumentExtraction ("Test", alphabet, tokenization, labelSequence, "O");
  String produced = extraction.toXmlString();

  String wanted = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n"
          + "<doc>the <ANIMAL>quick brown fox </ANIMAL><VERB>leapt </VERB>over the <ANIMAL>lazy dog</ANIMAL></doc>\r\n";
  assertEquals (wanted, produced);
}
 
开发者ID:cmoen,项目名称:mallet,代码行数:17,代码来源:TestDocumentExtraction.java

示例6: ignoretestToXmlBIO

import cc.mallet.types.LabelSequence; //导入依赖的package包/类
/**
 * Builds a {@code DocumentExtraction} through a {@code BIOTokenizationFilter}
 * from a B-/I- style tagging and compares its XML rendering with the expected
 * string.
 */
public void ignoretestToXmlBIO () {
  final String sentence = "the quick brown fox leapt over the lazy dog";
  LabelAlphabet alphabet = new LabelAlphabet ();
  StringTokenization tokenization = new StringTokenization (sentence, new CharSequenceLexer ());

  // Lookup order is kept so each label receives the same alphabet index.
  Label outside = alphabet.lookupLabel ("O");
  Label animalBegin = alphabet.lookupLabel ("B-ANIMAL");
  Label animal = alphabet.lookupLabel ("ANIMAL");
  Label verbBegin = alphabet.lookupLabel ("B-VERB");
  Label verbInside = alphabet.lookupLabel ("I-VERB");

  Label[] assignment = {
    outside, animalBegin, animal, animalBegin, verbBegin, verbInside, outside, animal, animal
  };
  LabelSequence labelSequence = new LabelSequence (assignment);

  DocumentExtraction extraction = new DocumentExtraction (
      "Test", alphabet, tokenization, labelSequence, null, "O", new BIOTokenizationFilter());
  String produced = extraction.toXmlString();

  String wanted = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n"
          + "<doc>the <ANIMAL>quick brown </ANIMAL><ANIMAL>fox </ANIMAL><VERB>leapt over </VERB>the <ANIMAL>lazy dog</ANIMAL></doc>\r\n";
  assertEquals (wanted, produced);
}
 
开发者ID:cmoen,项目名称:mallet,代码行数:19,代码来源:TestDocumentExtraction.java

示例7: pipe

import cc.mallet.types.LabelSequence; //导入依赖的package包/类
/**
 * Drops the single-space spans from the carrier's tokenization and labels each
 * remaining span {@code "start"} when it is the first span or follows a space
 * span, and {@code "notstart"} otherwise.
 *
 * <p>The carrier's data becomes the filtered tokenization, its source becomes
 * the concatenated text of the kept spans, and, when target processing is
 * enabled, its target becomes the label sequence.
 *
 * @param carrier instance whose data is a {@code StringTokenization}
 * @return the same instance, modified in place
 */
public Instance pipe(Instance carrier)
{
  StringTokenization original = (StringTokenization) carrier.getData();
  StringTokenization kept = new StringTokenization((CharSequence) original.getDocument ());
  final LabelAlphabet labels = (LabelAlphabet) getTargetAlphabet();
  LabelSequence targets = new LabelSequence(labels);

  // Both labels are looked up eagerly, in this order; the returned Label
  // objects themselves are not needed afterwards.
  labels.lookupLabel ("start");
  labels.lookupLabel ("notstart");

  boolean followsSpace = true;
  StringBuffer joined = new StringBuffer();
  for (int idx = 0; idx < original.size(); idx++) {
    StringSpan span = (StringSpan) original.getSpan(idx);
    String text = span.getText();

    if (text.equals(" ")) {
      followsSpace = true;
      continue;
    }

    joined.append(text);
    kept.add(span);
    if (followsSpace) {
      targets.add("start");
    } else {
      targets.add("notstart");
    }
    followsSpace = false;
  }

  if (isTargetProcessing()) {
    carrier.setTarget(targets);
  }
  carrier.setData(kept);
  carrier.setSource(joined.toString());
  return carrier;
}
 
开发者ID:kostagiolasn,项目名称:NucleosomePatternClassifier,代码行数:29,代码来源:TestMEMM.java

示例8: convert

import cc.mallet.types.LabelSequence; //导入依赖的package包/类
/**
 * Splits a sequence instance into one instance per position.
 *
 * <p>Each resulting instance is named
 * {@code <instance name>_@_POS_<1-based position>}; {@code NONAME} is used
 * when the input instance has no name. Every resulting instance shares the
 * source of the input instance.
 *
 * @param inst input instance, with FeatureVectorSequence as data and a
 * LabelSequence of the same length as target.
 * @param alphabetsPipe a Noop pipe containing the data and target alphabets for 
 * the resulting InstanceList and AugmentableFeatureVectors
 * @return list of instances, each with one AugmentableFeatureVector as data
 */
public static InstanceList convert(Instance inst, Noop alphabetsPipe)
{
	InstanceList ret = new InstanceList(alphabetsPipe);
	Object obj = inst.getData();
	assert(obj instanceof FeatureVectorSequence);

	FeatureVectorSequence fvs = (FeatureVectorSequence) obj;
	LabelSequence ls = (LabelSequence) inst.getTarget();
	assert(fvs.size() == ls.size());

	Object instName = (inst.getName() == null ? "NONAME" : inst.getName());
	
	for (int j = 0; j < fvs.size(); j++) {
		FeatureVector fv = fvs.getFeatureVector(j);
		int[] indices = fv.getIndices();
		// Rebuild the vector against the data alphabet carried by alphabetsPipe.
		FeatureVector data = new AugmentableFeatureVector (alphabetsPipe.getDataAlphabet(),
				indices, fv.getValues(), indices.length); 
		Labeling target = ls.getLabelAtPosition(j);
		// The separator was mangled to "[email protected]_POS_" by web e-mail
		// obfuscation; "_@_POS_" is the intended literal.
		String name = instName.toString() + "_@_POS_" + (j + 1);
		Object source = inst.getSource();
		Instance toAdd = alphabetsPipe.pipe(new Instance(data, target, name, source));

		ret.add(toAdd);
	}

	return ret;
}
 
开发者ID:kostagiolasn,项目名称:NucleosomePatternClassifier,代码行数:35,代码来源:AddClassifierTokenPredictions.java

示例9: next

import cc.mallet.types.LabelSequence; //导入依赖的package包/类
/**
 * Returns the next per-token instance, pairing the next data element with the
 * next target element of the current super-instance.
 *
 * <p>When the current super-instance is exhausted, the next one is pulled from
 * the super-iterator and both sub-iterators are re-initialised from it. The
 * instance name is the super-instance source followed by
 * {@code " tokensequence:"} and a running counter; the source is null.
 *
 * @return a new instance for the next token
 */
public Instance next () {
	boolean currentExhausted = !dataSubiterator.hasNext();
	if (currentExhausted) {
		assert (superIterator.hasNext());
		superInstance = superIterator.next();
		FeatureVectorSequence dataSequence = (FeatureVectorSequence) superInstance.getData();
		LabelSequence targetSequence = (LabelSequence) superInstance.getTarget();
		dataSubiterator = dataSequence.iterator();
		targetSubiterator = targetSequence.iterator();
	}

	// We are assuming sequences don't have zero length
	assert (dataSubiterator.hasNext());
	assert (targetSubiterator.hasNext());

	Object tokenData = dataSubiterator.next();
	Object tokenTarget = targetSubiterator.next();
	String tokenName = superInstance.getSource()+" tokensequence:"+count++;
	return new Instance (tokenData, tokenTarget, tokenName, null);
}
 
开发者ID:kostagiolasn,项目名称:NucleosomePatternClassifier,代码行数:14,代码来源:FeatureVectorSequence2FeatureVectors.java

示例10: testNestedToXML

import cc.mallet.types.LabelSequence; //导入依赖的package包/类
/**
 * Builds labeled spans from a flat tagging, adds two extra nested spans
 * (MAMMAL over token 3, ADJ over token 7), and checks that the resulting
 * {@code DocumentExtraction} renders the expected nested XML.
 */
public void testNestedToXML ()
{
  final String text = "the quick brown fox leapt over the lazy dog";
  LabelAlphabet labels = new LabelAlphabet ();
  StringTokenization tokens = new StringTokenization (text, new CharSequenceLexer ());

  // Labels are looked up in this order so the alphabet indices stay the same.
  Label other = labels.lookupLabel ("O");
  Label animal = labels.lookupLabel ("ANIMAL");
  Label verb = labels.lookupLabel ("VERB");
  Label adjective = labels.lookupLabel ("ADJ");
  Label mammal = labels.lookupLabel ("MAMMAL");

  Label[] perToken = { other, animal, animal, animal, verb, other, animal, animal, animal };
  LabelSequence tagging = new LabelSequence (perToken);

  DefaultTokenizationFilter filter = new DefaultTokenizationFilter ();
  LabeledSpans labeled = filter.constructLabeledSpans (labels, text, other, tokens, tagging);

  // Nested spans layered on top of the spans derived from the tagging.
  Span fox = tokens.subspan (3, 4);
  labeled.add (new LabeledSpan (fox, mammal, false));
  Span lazy = tokens.subspan (7, 8);
  labeled.add (new LabeledSpan (lazy, adjective, false));

  DocumentExtraction extraction = new DocumentExtraction ("Test", labels, tokens, labeled, null, "O");
  String produced = extraction.toXmlString();

  String wanted = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n"
          + "<doc>the <ANIMAL>quick brown <MAMMAL>fox </MAMMAL></ANIMAL><VERB>leapt </VERB>over <ANIMAL>the <ADJ>lazy </ADJ>dog</ANIMAL></doc>\r\n";
  assertEquals (wanted, produced);
}
 
开发者ID:kostagiolasn,项目名称:NucleosomePatternClassifier,代码行数:29,代码来源:TestDocumentExtraction.java

示例11: testNestedXMLTokenizationFilter

import cc.mallet.types.LabelSequence; //导入依赖的package包/类
/**
 * Checks the XML produced with a {@code HierarchicalTokenizationFilter} for a
 * tagging whose labels use {@code |}-separated names, first with the default
 * filter and then with a filter constructed with the pattern {@code AD.*}.
 */
public void testNestedXMLTokenizationFilter ()
{
  final String text = "the quick brown fox leapt over the lazy dog";
  LabelAlphabet labels = new LabelAlphabet ();
  StringTokenization tokens = new StringTokenization (text, new CharSequenceLexer ());

  // Labels are looked up in this order so the alphabet indices stay the same.
  Label other = labels.lookupLabel ("O");
  Label animal = labels.lookupLabel ("ANIMAL");
  Label animalMammal = labels.lookupLabel ("ANIMAL|MAMMAL");
  Label verb = labels.lookupLabel ("VERB");
  Label animalAdj = labels.lookupLabel ("ANIMAL|ADJ");
  Label animalAdjMammal = labels.lookupLabel ("ANIMAL|ADJ|MAMMAL");

  Label[] perToken = {
    other, animal, animal, animalMammal, verb, other, animal, animalAdj, animalAdjMammal
  };
  LabelSequence tagging = new LabelSequence (perToken);

  // First pass: default hierarchical filter.
  DocumentExtraction extraction = new DocumentExtraction (
      "Test", labels, tokens, tagging, null, "O", new HierarchicalTokenizationFilter ());
  String produced = extraction.toXmlString();
  String wanted = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n"
          + "<doc>the <ANIMAL>quick brown <MAMMAL>fox </MAMMAL></ANIMAL><VERB>leapt </VERB>over <ANIMAL>the <ADJ>lazy <MAMMAL>dog</MAMMAL></ADJ></ANIMAL></doc>\r\n";
  assertEquals (wanted, produced);

  // Test the ignore function
  HierarchicalTokenizationFilter ignoring =
      new HierarchicalTokenizationFilter (Pattern.compile ("AD.*"));
  extraction = new DocumentExtraction ("Test", labels, tokens, tagging, null, "O", ignoring);
  produced = extraction.toXmlString();
  wanted = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n"
          + "<doc>the <ANIMAL>quick brown <MAMMAL>fox </MAMMAL></ANIMAL><VERB>leapt </VERB>over <ANIMAL>the lazy <MAMMAL>dog</MAMMAL></ANIMAL></doc>\r\n";
  assertEquals (wanted, produced);
}
 
开发者ID:kostagiolasn,项目名称:NucleosomePatternClassifier,代码行数:34,代码来源:TestDocumentExtraction.java

示例12: pipe

import cc.mallet.types.LabelSequence; //导入依赖的package包/类
/**
 * Converts a space-separated line into a token sequence (data) and a
 * B-GENE/I-GENE/O label sequence (target).
 *
 * <p>The first space-separated field is skipped. Every other field must match
 * {@code tokenPattern}, whose group 1 is used as the word and group 2 as the
 * raw tag. A raw tag contained in {@code targets} becomes {@code "I-GENE"}
 * when it equals the previous field's raw tag and {@code "B-GENE"} otherwise;
 * any other raw tag becomes {@code "O"}.
 *
 * @param carrier instance whose data is the input line as a {@code String}
 * @return the same instance with its data and target replaced
 * @throws IllegalStateException if a field does not match {@code tokenPattern}
 */
@Override
public Instance pipe(Instance carrier) {
	String line = (String) carrier.getData();
	LabelAlphabet labels = (LabelAlphabet) getTargetAlphabet();
	String[] tokens = line.split(" ");
	// Checked before tokens.length - 1 is used as the sequence capacity.
	assert (tokens.length > 1);
	TokenSequence ts = new TokenSequence();
	LabelSequence target = new LabelSequence(labels, tokens.length - 1);
	String prevTag = null;
	for (int i = 1; i < tokens.length; i++) {
		Matcher matcher = tokenPattern.matcher(tokens[i]);
		// Previously the result of matches() was ignored and group() failed with
		// a bare "No match found"; same exception type, but name the bad field.
		if (!matcher.matches()) {
			throw new IllegalStateException("Field " + i + " does not match the expected token pattern: '"
					+ tokens[i] + "'");
		}
		String word = matcher.group(1);
		String rawTag = matcher.group(2);
		Token tok = new Token(word);
		tok.setFeatureValue(word, 1.0);
		ts.add(tok);

		String tag;
		if (targets.contains(rawTag)) {
			// Continuation only when the previous field carried the same raw tag.
			tag = rawTag.equals(prevTag) ? "I-GENE" : "B-GENE";
		} else {
			tag = "O";
		}
		target.add(tag);
		prevTag = rawTag;
	}
	carrier.setData(ts);
	carrier.setTarget(target);
	return carrier;
}
 
开发者ID:siqil,项目名称:udaner,代码行数:35,代码来源:GenetagLine2TokenSequence.java

示例13: getStatistics

import cc.mallet.types.LabelSequence; //导入依赖的package包/类
/**
 * Computes two statistics for {@code inst} under {@code crf} and stores them
 * in the supplied maps, optionally replacing the instance target with the
 * best output sequence.
 *
 * <p>The value stored in {@code entropyMap} is the negated result of
 * {@code EntropyLattice.getEntropy()}. The value stored in {@code probMap} is
 * {@code exp(labeled - unlabeled)}, where {@code labeled} is the total weight
 * of the sum lattice constrained to the best output sequence and
 * {@code unlabeled} is the total weight of the unconstrained sum lattice.
 *
 * @param crf        model used to build the lattices
 * @param inst       instance whose data is a {@code FeatureVectorSequence}
 * @param entropyMap receives the entropy value for {@code inst}
 * @param probMap    receives the probability value for {@code inst}
 * @param setTarget  when true, the instance is unlocked, its target is set to
 *                   the best output sequence, and it is locked again
 */
private void getStatistics(CRF crf, Instance inst,
		Map<Instance, Double> entropyMap, Map<Instance, Double> probMap,
		boolean setTarget) {
	FeatureVectorSequence features = (FeatureVectorSequence) inst.getData();

	// Best output sequence and the total weight of the lattice constrained to it.
	Sequence best = new MaxLatticeDefault(crf, features).bestOutputSequence();
	SumLatticeDefault constrained = new SumLatticeDefault(crf, features, best);
	double labeled = constrained.getTotalWeight();

	// Unconstrained lattice, also the source of the gammas/xis for the entropy.
	SumLattice free = new SumLatticeDefault(crf, features, true);
	double unlabeled = free.getTotalWeight();
	EntropyLattice entropyLattice = new EntropyLattice(features,
			free.getGammas(), free.getXis(), crf, null, 1);

	entropyMap.put(inst, -entropyLattice.getEntropy());
	probMap.put(inst, Math.exp(labeled - unlabeled));

	if (!setTarget) {
		return;
	}

	inst.unLock();
	int length = best.size();
	LabelAlphabet outputs = (LabelAlphabet) crf.getOutputAlphabet();
	LabelSequence predicted = new LabelSequence(outputs, length);
	for (int pos = 0; pos < length; pos++) {
		predicted.add(best.get(pos));
	}
	inst.setTarget(predicted);
	inst.lock();
}
 
开发者ID:siqil,项目名称:udaner,代码行数:29,代码来源:CRFTrainerByBootstrapping.java

示例14: TextInstance

import cc.mallet.types.LabelSequence; //导入依赖的package包/类
/**
 * Creates an instance from a sentence: the data is a {@code TokenSequence}
 * holding one {@code Token} per text token, and the target is a
 * {@code LabelSequence} over {@code targetAlphabet} holding each token's tag.
 * The instance name is the sentence id and the source is the sentence itself.
 *
 * @param textSentence   sentence supplying the tokens, tags and id
 * @param targetAlphabet alphabet backing the label sequence
 */
public TextInstance(TextSentence textSentence, Alphabet targetAlphabet) {
	super(new TokenSequence(), new LabelSequence(targetAlphabet), textSentence.getId(), textSentence);

	// Both sequences start empty (created in the super call) and are filled below.
	final TokenSequence tokens = (TokenSequence) getData();
	final LabelSequence labels = (LabelSequence) getTarget();

	for (TextToken current : textSentence) {
		String word = current.getText();
		String tag = current.getTag();

		tokens.add(new Token(word));
		labels.add(tag);
	}
}
 
开发者ID:jdmp,项目名称:java-data-mining-package,代码行数:14,代码来源:TextInstance.java


注:本文中的cc.mallet.types.LabelSequence类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。