当前位置: 首页>>代码示例>>Java>>正文


Java SimpleTaggerSentence2TokenSequence类代码示例

本文整理汇总了Java中cc.mallet.pipe.SimpleTaggerSentence2TokenSequence的典型用法代码示例。如果您正苦于以下问题:Java SimpleTaggerSentence2TokenSequence类的具体用法?Java SimpleTaggerSentence2TokenSequence怎么用?Java SimpleTaggerSentence2TokenSequence使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。


SimpleTaggerSentence2TokenSequence类属于cc.mallet.pipe包,在下文中一共展示了SimpleTaggerSentence2TokenSequence类的12个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: testMultiTagSerialization

import cc.mallet.pipe.SimpleTaggerSentence2TokenSequence; //导入依赖的package包/类
/**
 * Builds a serial pipeline ending in an {@code OffsetFeatureConjunction} named
 * "time", clones it through {@code TestSerializable.cloneViaSerialization},
 * pushes {@code doc1} through the clone, and asserts that the resulting token
 * sequence has 6 tokens with the "time" feature set to 1.0 at indices 3 and 4.
 *
 * @throws IOException            propagated from the serialization round trip
 * @throws ClassNotFoundException propagated from the serialization round trip
 */
public static void testMultiTagSerialization () throws IOException, ClassNotFoundException
{
  Pipe[] stages = {
          new SimpleTaggerSentence2TokenSequence (),
          new TokenText (),
          new RegexMatches ("digits", Pattern.compile ("[0-9]+")),
          new RegexMatches ("ampm", Pattern.compile ("[aApP][mM]")),
          new OffsetFeatureConjunction ("time",
                  new String[] { "digits", "ampm" },
                  new int[] { 0, 1 },
                  true),
          new PrintInputAndTarget (),
  };
  Pipe origPipe = new SerialPipes (stages);

  // Only the deserialized copy is exercised below; the original is just the template.
  Pipe clonedPipe = (Pipe) TestSerializable.cloneViaSerialization (origPipe);

  InstanceList instances = new InstanceList (clonedPipe);
  instances.addThruPipe (new ArrayIterator (doc1));

  TokenSequence tokens = (TokenSequence) instances.get (0).getData ();
  assertEquals (6, tokens.size ());
  for (int idx = 3; idx <= 4; idx++) {
    assertEquals (1.0, tokens.get (idx).getFeatureValue ("time"), 1e-15);
  }
}
 
开发者ID:kostagiolasn,项目名称:NucleosomePatternClassifier,代码行数:24,代码来源:TestOffsetFeatureConjunctions.java

示例2: testConcatenateBadPipes

import cc.mallet.pipe.SimpleTaggerSentence2TokenSequence; //导入依赖的package包/类
/**
 * Asserts that {@code PipeUtils.concatenatePipes} throws
 * {@link IllegalArgumentException} when given two pipes whose data alphabets
 * are distinct objects.
 */
public void testConcatenateBadPipes ()
{
  Pipe first = new SimpleTaggerSentence2TokenSequence ();
  // Requesting the data alphabet forces the pipe to resolve it.
  Alphabet firstDict = first.getDataAlphabet ();

  Pipe second = new SimpleTaggerSentence2TokenSequence ();
  // Same for the second pipe.
  Alphabet secondDict = second.getDataAlphabet ();

  assertTrue (firstDict != secondDict);

  boolean rejected = false;
  try {
    PipeUtils.concatenatePipes (first, second);
  } catch (IllegalArgumentException expected) {
    // This is the outcome the test wants.
    rejected = true;
  }
  assertTrue ("Test failed: concatenatePipes() allowed putting together incompatible alphabets.", rejected);
}
 
开发者ID:kostagiolasn,项目名称:NucleosomePatternClassifier,代码行数:21,代码来源:TestPipeUtils.java

示例3: build

import cc.mallet.pipe.SimpleTaggerSentence2TokenSequence; //导入依赖的package包/类
/**
 * Assembles the pipe list held in {@code pipes} and wraps it in a
 * {@code SerialPipes}.
 *
 * <p>The list always starts with a {@code SimpleTaggerSentence2TokenSequence},
 * followed by whatever {@code addFeatures()} appends, followed by a
 * {@code TokenSequence2FeatureVectorSequence}. When {@code useSCL} is set, the
 * intermediate pipeline is first used to load the source and target iterators,
 * an SCL model is trained on them via {@code trainSCL}, and the final stage is
 * replaced by a differently-configured vectorizer plus an {@code SCLAugment}.
 *
 * @return a new {@code SerialPipes} over the assembled stages
 */
public Pipe build() {
	pipes = new LinkedList<Pipe>();
	pipes.add(new SimpleTaggerSentence2TokenSequence(false));
	addFeatures();
	pipes.add(new TokenSequence2FeatureVectorSequence(true, false));

	if (!useSCL) {
		return new SerialPipes(pipes);
	}

	// Run both corpora through the provisional pipeline to train SCL.
	Pipe provisional = new SerialPipes(pipes);
	InstanceList sourceInstances = new InstanceList(provisional);
	sourceInstances.addThruPipe(sclSourceIt);
	InstanceList targetInstances = new InstanceList(provisional);
	targetInstances.addThruPipe(sclTargetIt);
	SCL scl = trainSCL(sourceInstances, targetInstances);

	// Swap the trailing vectorizer for the SCL-aware tail.
	pipes.removeLast();
	pipes.add(new TokenSequence2FeatureVectorSequence(false, true));
	pipes.add(new SCLAugment(scl));
	return new SerialPipes(pipes);
}
 
开发者ID:siqil,项目名称:udaner,代码行数:19,代码来源:PipeBuilder.java

示例4: ignoretestConcatenateBadPipes

import cc.mallet.pipe.SimpleTaggerSentence2TokenSequence; //导入依赖的package包/类
/**
 * Checks that {@code PipeUtils.concatenatePipes} refuses, with an
 * {@link IllegalArgumentException}, to join two pipes whose data alphabets are
 * different objects.
 */
public void ignoretestConcatenateBadPipes ()
{
  Pipe left = new SimpleTaggerSentence2TokenSequence ();
  // Calling getDataAlphabet() makes the pipe resolve its alphabet.
  Alphabet leftAlphabet = left.getDataAlphabet ();

  Pipe right = new SimpleTaggerSentence2TokenSequence ();
  // Resolve the second pipe's alphabet too.
  Alphabet rightAlphabet = right.getDataAlphabet ();

  assertTrue (leftAlphabet != rightAlphabet);

  boolean sawExpectedException = false;
  try {
    PipeUtils.concatenatePipes (left, right);
  } catch (IllegalArgumentException expected) {
    // Expected: the alphabets are incompatible.
    sawExpectedException = true;
  }
  assertTrue ("Test failed: concatenatePipes() allowed putting together incompatible alphabets.", sawExpectedException);
}
 
开发者ID:cmoen,项目名称:mallet,代码行数:21,代码来源:TestPipeUtils.java

示例5: testPipesAreStupid

import cc.mallet.pipe.SimpleTaggerSentence2TokenSequence; //导入依赖的package包/类
/**
 * Asserts that asking a {@code SerialPipes} made of a {@code StupidPipe} and an
 * already-used {@code SimpleTaggerSentence2TokenSequence} for its data alphabet
 * throws {@link IllegalStateException}.
 */
public void testPipesAreStupid ()
{
  Pipe stupid = new StupidPipe ();
  Pipe tagger = new SimpleTaggerSentence2TokenSequence ();
  // Push one instance through so the tagger pipe's dictionary gets initialized.
  tagger.instanceFrom(new Instance (data, null, null, null));

  Pipe serial = new SerialPipes (new Pipe[] { stupid, tagger });

  boolean threw = false;
  try {
    serial.getDataAlphabet ();
  } catch (IllegalStateException expected) {
    threw = true;
  }
  assertTrue ("Test failed: Should have generated exception.", threw);
}
 
开发者ID:kostagiolasn,项目名称:NucleosomePatternClassifier,代码行数:14,代码来源:TestPipeUtils.java

示例6: run

import cc.mallet.pipe.SimpleTaggerSentence2TokenSequence; //导入依赖的package包/类
/**
 * Trains an HMM on one file, evaluates it on another, and returns the
 * transduced output for every testing instance flattened into one list.
 *
 * <p>Both files are read through a {@code LineGroupIterator} that is given the
 * pattern {@code ^\s*$} as its line-group boundary. Each file is opened in a
 * try-with-resources block so the underlying streams are closed as soon as the
 * instances have been loaded, including when loading fails.
 *
 * @param trainingFilename path of the training data file
 * @param testingFilename  path of the testing data file
 * @return the elements of every transduced output sequence, cast to
 *         {@code String}, in testing-instance order
 * @throws FileNotFoundException if either file cannot be opened
 * @throws IOException           if closing one of the input streams fails
 */
public static List<String> run(String trainingFilename, String testingFilename)
        throws FileNotFoundException, IOException {
  ArrayList<Pipe> pipes = new ArrayList<Pipe>();

  pipes.add(new SimpleTaggerSentence2TokenSequence());
  pipes.add(new TokenSequence2FeatureSequence());

  Pipe pipe = new SerialPipes(pipes);

  InstanceList trainingInstances = new InstanceList(pipe);
  InstanceList testingInstances = new InstanceList(pipe);

  // Compiled once and shared: Pattern instances are immutable.
  Pattern groupBoundary = Pattern.compile("^\\s*$");

  try (FileInputStream trainingIn = new FileInputStream(trainingFilename);
       BufferedReader trainingReader = new BufferedReader(new InputStreamReader(trainingIn))) {
    trainingInstances.addThruPipe(new LineGroupIterator(trainingReader, groupBoundary, true));
  }
  try (FileInputStream testingIn = new FileInputStream(testingFilename);
       BufferedReader testingReader = new BufferedReader(new InputStreamReader(testingIn))) {
    testingInstances.addThruPipe(new LineGroupIterator(testingReader, groupBoundary, true));
  }

  HMM hmm = new HMM(pipe, null);
  hmm.addStatesForLabelsConnectedAsIn(trainingInstances);

  HMMTrainerByLikelihood trainer = new HMMTrainerByLikelihood(hmm);
  TransducerEvaluator testingEvaluator = new SegmentationEvaluator(testingInstances, "testing");
  trainer.train(trainingInstances, 100);
  testingEvaluator.evaluate(trainer);

  return testingInstances.stream().map(Instance::getData).map(Sequence.class::cast)
          .map(hmm::transduce)
          .flatMap(output -> IntStream.range(0, output.size()).mapToObj(output::get))
          .map(String.class::cast).collect(toList());
}
 
开发者ID:ziy,项目名称:pkb,代码行数:35,代码来源:HmmTagger.java

示例7: VocabProcessor

import cc.mallet.pipe.SimpleTaggerSentence2TokenSequence; //导入依赖的package包/类
public VocabProcessor(String... filenames) throws FileNotFoundException {
	Pipe pipe = new SimpleTaggerSentence2TokenSequence();
	ilists = new InstanceList[filenames.length];
	vocabs = new Map[filenames.length];
	for (int i = 0; i < filenames.length; i++) {
		ilists[i] = new InstanceList(pipe);
		ilists[i].addThruPipe(new LineGroupIterator(new FileReader(
				filenames[i]), Pattern.compile("^\\s*$"), true));
		vocabs[i] = getVocab(ilists[i]);
		System.out.println("Vocab " + i + " : " + vocabs[i].size());
	}

}
 
开发者ID:siqil,项目名称:udaner,代码行数:14,代码来源:VocabProcessor.java

示例8: ignoretestPipesAreStupid

import cc.mallet.pipe.SimpleTaggerSentence2TokenSequence; //导入依赖的package包/类
/**
 * Expects {@link IllegalStateException} when the data alphabet is requested
 * from a {@code SerialPipes} combining a {@code StupidPipe} with a
 * {@code SimpleTaggerSentence2TokenSequence} that has already processed an
 * instance.
 */
public void ignoretestPipesAreStupid ()
{
  Pipe head = new StupidPipe ();
  Pipe tail = new SimpleTaggerSentence2TokenSequence ();
  // Process one instance so that the second pipe's dict is initialized.
  tail.instanceFrom(new Instance (data, null, null, null));

  Pipe combined = new SerialPipes (new Pipe[] { head, tail });

  boolean exceptionRaised = false;
  try {
    combined.getDataAlphabet ();
  } catch (IllegalStateException expected) {
    exceptionRaised = true;
  }
  assertTrue ("Test failed: Should have generated exception.", exceptionRaised);
}
 
开发者ID:cmoen,项目名称:mallet,代码行数:14,代码来源:TestPipeUtils.java

示例9: testMultiTag

import cc.mallet.pipe.SimpleTaggerSentence2TokenSequence; //导入依赖的package包/类
/**
 * Runs {@code doc1} through two otherwise identical pipelines whose
 * {@code OffsetFeatureConjunction} stage differs only in its final boolean
 * argument ({@code true} vs {@code false}), then compares the "time" feature.
 *
 * <p>Both results must have 6 tokens and "time" equal to 1.0 at index 3. At
 * index 4 the {@code true} variant must yield 1.0 and the {@code false}
 * variant 0.0.
 */
public static void testMultiTag ()
{
  Pipe[] mtStages = {
          new SimpleTaggerSentence2TokenSequence (),
          new TokenText (),
          new RegexMatches ("digits", Pattern.compile ("[0-9]+")),
          new RegexMatches ("ampm", Pattern.compile ("[aApP][mM]")),
          new OffsetFeatureConjunction ("time",
                  new String[] { "digits", "ampm" },
                  new int[] { 0, 1 },
                  true),
          new PrintInputAndTarget (),
  };
  Pipe mtPipe = new SerialPipes (mtStages);

  Pipe[] noMtStages = {
          new SimpleTaggerSentence2TokenSequence (),
          new TokenText (),
          new RegexMatches ("digits", Pattern.compile ("[0-9]+")),
          new RegexMatches ("ampm", Pattern.compile ("[aApP][mM]")),
          new OffsetFeatureConjunction ("time",
                  new String[] { "digits", "ampm" },
                  new int[] { 0, 1 },
                  false),
          new PrintInputAndTarget (),
  };
  Pipe noMtPipe = new SerialPipes (noMtStages);

  InstanceList mtInstances = new InstanceList (mtPipe);
  InstanceList noMtInstances = new InstanceList (noMtPipe);

  mtInstances.addThruPipe (new ArrayIterator (doc1));
  noMtInstances.addThruPipe (new ArrayIterator (doc1));

  TokenSequence mtTokens = (TokenSequence) mtInstances.get (0).getData ();
  TokenSequence noMtTokens = (TokenSequence) noMtInstances.get (0).getData ();

  assertEquals (6, mtTokens.size ());
  assertEquals (6, noMtTokens.size ());

  // Index 3 agrees in both variants; index 4 is where they diverge.
  assertEquals (1.0, mtTokens.get (3).getFeatureValue ("time"), 1e-15);
  assertEquals (1.0, noMtTokens.get (3).getFeatureValue ("time"), 1e-15);
  assertEquals (1.0, mtTokens.get (4).getFeatureValue ("time"), 1e-15);
  assertEquals (0.0, noMtTokens.get (4).getFeatureValue ("time"), 1e-15);
}
 
开发者ID:kostagiolasn,项目名称:NucleosomePatternClassifier,代码行数:46,代码来源:TestOffsetFeatureConjunctions.java

示例10: TrainCRF

import cc.mallet.pipe.SimpleTaggerSentence2TokenSequence; //导入依赖的package包/类
public TrainCRF(String trainingFilename, String testingFilename) throws IOException {

        ArrayList<Pipe> pipes = new ArrayList<Pipe>();

        int[][] conjunctions = new int[2][];
        conjunctions[0] = new int[] { -1 };
        conjunctions[1] = new int[] { 1 };

        pipes.add(new SimpleTaggerSentence2TokenSequence());
        pipes.add(new OffsetConjunctions(conjunctions));
        //pipes.add(new FeaturesInWindow("PREV-", -1, 1));
        pipes.add(new TokenTextCharSuffix("C1=", 1));
        pipes.add(new TokenTextCharSuffix("C2=", 2));
        pipes.add(new TokenTextCharSuffix("C3=", 3));
        pipes.add(new RegexMatches("CAPITALIZED", Pattern.compile("^\\p{Lu}.*")));
        pipes.add(new RegexMatches("STARTSNUMBER", Pattern.compile("^[0-9].*")));
        pipes.add(new RegexMatches("HYPHENATED", Pattern.compile(".*\\-.*")));
        pipes.add(new RegexMatches("DOLLARSIGN", Pattern.compile(".*\\$.*")));
        pipes.add(new TokenFirstPosition("FIRSTTOKEN"));
        pipes.add(new TokenSequence2FeatureVectorSequence());

        Pipe pipe = new SerialPipes(pipes);

        InstanceList trainingInstances = new InstanceList(pipe);
        InstanceList testingInstances = new InstanceList(pipe);

        trainingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(trainingFilename)))), Pattern.compile("^\\s*$"), true));
        testingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(testingFilename)))), Pattern.compile("^\\s*$"), true));

        CRF crf = new CRF(pipe, null);
        //crf.addStatesForLabelsConnectedAsIn(trainingInstances);
        crf.addStatesForThreeQuarterLabelsConnectedAsIn(trainingInstances);
        crf.addStartState();

        CRFTrainerByLabelLikelihood trainer =
                new CRFTrainerByLabelLikelihood(crf);
        trainer.setGaussianPriorVariance(10.0);

        //CRFTrainerByStochasticGradient trainer =
        //new CRFTrainerByStochasticGradient(crf, 1.0);

        //CRFTrainerByL1LabelLikelihood trainer =
        //	new CRFTrainerByL1LabelLikelihood(crf, 0.75);

        //trainer.addEvaluator(new PerClassAccuracyEvaluator(trainingInstances, "training"));
        trainer.addEvaluator(new PerClassAccuracyEvaluator(testingInstances, "testing"));
        trainer.addEvaluator(new TokenAccuracyEvaluator(testingInstances, "testing"));
        trainer.train(trainingInstances);

    }
 
开发者ID:karahindiba,项目名称:WikiInfoboxExtractor,代码行数:51,代码来源:TrainCRF.java

示例11: TrainWikiCRF

import cc.mallet.pipe.SimpleTaggerSentence2TokenSequence; //导入依赖的package包/类
public TrainWikiCRF(String trainingFilename, String testingFilename) throws IOException {
	
	ArrayList<Pipe> pipes = new ArrayList<Pipe>();

	int[][] conjunctions = new int[2][];
	conjunctions[0] = new int[] { -1 };
	conjunctions[1] = new int[] { 1 };

	pipes.add(new SimpleTaggerSentence2TokenSequence());
	pipes.add(new OffsetConjunctions(conjunctions));
	//pipes.add(new FeaturesInWindow("PREV-", -1, 1));
	pipes.add(new TokenTextCharSuffix("C1=", 1));
	pipes.add(new TokenTextCharSuffix("C2=", 2));
	pipes.add(new TokenTextCharSuffix("C3=", 3));
	pipes.add(new RegexMatches("CAPITALIZED", Pattern.compile("^\\p{Lu}.*")));
	pipes.add(new RegexMatches("STARTSNUMBER", Pattern.compile("^[0-9].*")));
	pipes.add(new RegexMatches("HYPHENATED", Pattern.compile(".*\\-.*")));
	pipes.add(new RegexMatches("DOLLARSIGN", Pattern.compile(".*\\$.*")));
	pipes.add(new TokenFirstPosition("FIRSTTOKEN"));
	pipes.add(new TokenSequence2FeatureVectorSequence());

	Pipe pipe = new SerialPipes(pipes);

	InstanceList trainingInstances = new InstanceList(pipe);
	InstanceList testingInstances = new InstanceList(pipe);

	trainingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(trainingFilename)))), Pattern.compile("^\\s*$"), true));
	testingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(testingFilename)))), Pattern.compile("^\\s*$"), true));
	
	CRF crf = new CRF(pipe, null);
	//crf.addStatesForLabelsConnectedAsIn(trainingInstances);
	crf.addStatesForThreeQuarterLabelsConnectedAsIn(trainingInstances);
	crf.addStartState();

	CRFTrainerByLabelLikelihood trainer = 
		new CRFTrainerByLabelLikelihood(crf);
	trainer.setGaussianPriorVariance(10.0);

	//CRFTrainerByStochasticGradient trainer = 
	//new CRFTrainerByStochasticGradient(crf, 1.0);

	//CRFTrainerByL1LabelLikelihood trainer = 
	//	new CRFTrainerByL1LabelLikelihood(crf, 0.75);

	//trainer.addEvaluator(new PerClassAccuracyEvaluator(trainingInstances, "training"));
	trainer.addEvaluator(new PerClassAccuracyEvaluator(testingInstances, "testing"));
	trainer.addEvaluator(new TokenAccuracyEvaluator(testingInstances, "testing"));
	trainer.train(trainingInstances);
	
}
 
开发者ID:karahindiba,项目名称:WikiInfoboxExtractor,代码行数:51,代码来源:TrainWikiCRF.java

示例12: TestCRFPipe

import cc.mallet.pipe.SimpleTaggerSentence2TokenSequence; //导入依赖的package包/类
public TestCRFPipe(String trainingFilename) throws IOException {
	
	ArrayList<Pipe> pipes = new ArrayList<Pipe>();

	PrintWriter out = new PrintWriter("test.out");

	int[][] conjunctions = new int[3][];
	conjunctions[0] = new int[] { -1 };
	conjunctions[1] = new int[] { 1 };
	conjunctions[2] = new int[] { -2, -1 };

	pipes.add(new SimpleTaggerSentence2TokenSequence());
	//pipes.add(new FeaturesInWindow("PREV-", -1, 1));
	//pipes.add(new FeaturesInWindow("NEXT-", 1, 2));
	pipes.add(new OffsetConjunctions(conjunctions));
	pipes.add(new TokenTextCharSuffix("C1=", 1));
	pipes.add(new TokenTextCharSuffix("C2=", 2));
	pipes.add(new TokenTextCharSuffix("C3=", 3));
	pipes.add(new RegexMatches("CAPITALIZED", Pattern.compile("^\\p{Lu}.*")));
	pipes.add(new RegexMatches("STARTSNUMBER", Pattern.compile("^[0-9].*")));
	pipes.add(new RegexMatches("HYPHENATED", Pattern.compile(".*\\-.*")));
	pipes.add(new RegexMatches("DOLLARSIGN", Pattern.compile("\\$.*")));
	pipes.add(new TokenFirstPosition("FIRSTTOKEN"));
	pipes.add(new TokenSequence2FeatureVectorSequence());
	pipes.add(new SequencePrintingPipe(out));
          /*
          * 	new ConllNer2003Sentence2TokenSequence (),
		new RegexMatches ("INITCAP", Pattern.compile (CAPS+".*")),
		new RegexMatches ("CAPITALIZED", Pattern.compile (CAPS+LOW+"*")),
		new RegexMatches ("ALLCAPS", Pattern.compile (CAPS+"+")),
		new RegexMatches ("MIXEDCAPS", Pattern.compile ("[A-Z][a-z]+[A-Z][A-Za-z]*")),
		new RegexMatches ("CONTAINSDIGITS", Pattern.compile (".*[0-9].*")),
		new RegexMatches ("ALLDIGITS", Pattern.compile ("[0-9]+")),
		new RegexMatches ("NUMERICAL", Pattern.compile ("[-0-9]+[\\.,]+[0-9\\.,]+")),
		//new RegexMatches ("ALPHNUMERIC", Pattern.compile ("[A-Za-z0-9]+")),
		//new RegexMatches ("ROMAN", Pattern.compile ("[ivxdlcm]+|[IVXDLCM]+")),
		new RegexMatches ("MULTIDOTS", Pattern.compile ("\\.\\.+")),
		new RegexMatches ("ENDSINDOT", Pattern.compile ("[^\\.]+.*\\.")),
		new RegexMatches ("CONTAINSDASH", Pattern.compile (ALPHANUM+"+-"+ALPHANUM+"*")),
		new RegexMatches ("ACRO", Pattern.compile ("[A-Z][A-Z\\.]*\\.[A-Z\\.]*")),
		new RegexMatches ("LONELYINITIAL", Pattern.compile (CAPS+"\\.")),
		new RegexMatches ("SINGLECHAR", Pattern.compile (ALPHA)),
		new RegexMatches ("CAPLETTER", Pattern.compile ("[A-Z]")),
		new RegexMatches ("PUNC", Pattern.compile (PUNT)),
		new RegexMatches ("QUOTE", Pattern.compile (QUOTE)),
		//new RegexMatches ("LOWER", Pattern.compile (LOW+"+")),
		//new RegexMatches ("MIXEDCAPS", Pattern.compile ("[A-Z]+[a-z]+[A-Z]+[a-z]*")),
          * */
	Pipe pipe = new SerialPipes(pipes);

	InstanceList trainingInstances = new InstanceList(pipe);

	trainingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(trainingFilename)))), Pattern.compile("^\\s*$"), true));

	out.close();
	
}
 
开发者ID:karahindiba,项目名称:WikiInfoboxExtractor,代码行数:58,代码来源:TestCRFPipe.java


注:本文中的cc.mallet.pipe.SimpleTaggerSentence2TokenSequence类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。