This article collects typical usage examples of the Java class cc.mallet.pipe.SimpleTaggerSentence2TokenSequence. If you are unsure what SimpleTaggerSentence2TokenSequence is for, or how to use it, the curated class code examples below may help.
SimpleTaggerSentence2TokenSequence belongs to the cc.mallet.pipe package. A total of 12 code examples of the class are shown below, sorted by popularity by default.
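Before the examples, here is a minimal, self-contained sketch of the most common pattern: SimpleTaggerSentence2TokenSequence reads sentences in SimpleTagger format (one "token label" pair per line, sentences separated by blank lines) and produces a TokenSequence as the instance data and a LabelSequence as the target. The class name SimpleTaggerPipeDemo and the toy input string are invented for illustration; the MALLET pipe and iterator calls mirror those used in the examples below.

import java.io.StringReader;
import java.util.regex.Pattern;

import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.pipe.SimpleTaggerSentence2TokenSequence;
import cc.mallet.pipe.TokenSequence2FeatureVectorSequence;
import cc.mallet.pipe.iterator.LineGroupIterator;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;

public class SimpleTaggerPipeDemo {
    public static void main(String[] args) {
        // SimpleTagger format: one "token label" pair per line, blank line between sentences.
        String data = "Bill B-PER\nGates I-PER\nfounded O\nMicrosoft B-ORG\n";

        // Convert each sentence into a TokenSequence (data) plus a label sequence (target),
        // then turn the tokens into feature vectors.
        Pipe pipe = new SerialPipes(new Pipe[] {
            new SimpleTaggerSentence2TokenSequence(),
            new TokenSequence2FeatureVectorSequence()
        });

        InstanceList instances = new InstanceList(pipe);
        instances.addThruPipe(new LineGroupIterator(
            new StringReader(data), Pattern.compile("^\\s*$"), true));

        Instance inst = instances.get(0);
        System.out.println("data:   " + inst.getData());
        System.out.println("target: " + inst.getTarget());
    }
}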
Example 1: testMultiTagSerialization
import cc.mallet.pipe.SimpleTaggerSentence2TokenSequence; // import the required package/class
public static void testMultiTagSerialization () throws IOException, ClassNotFoundException
{
Pipe origPipe = new SerialPipes (new Pipe[] {
new SimpleTaggerSentence2TokenSequence (),
new TokenText (),
new RegexMatches ("digits", Pattern.compile ("[0-9]+")),
new RegexMatches ("ampm", Pattern.compile ("[aApP][mM]")),
new OffsetFeatureConjunction ("time",
new String[] { "digits", "ampm" },
new int[] { 0, 1 },
true),
new PrintInputAndTarget (),
});
Pipe mtPipe = (Pipe) TestSerializable.cloneViaSerialization (origPipe);
InstanceList mtLst = new InstanceList (mtPipe);
mtLst.addThruPipe (new ArrayIterator (doc1));
Instance mtInst = mtLst.get (0);
TokenSequence mtTs = (TokenSequence) mtInst.getData ();
assertEquals (6, mtTs.size ());
assertEquals (1.0, mtTs.get (3).getFeatureValue ("time"), 1e-15);
assertEquals (1.0, mtTs.get (4).getFeatureValue ("time"), 1e-15);
}
Example 2: testConcatenateBadPipes
import cc.mallet.pipe.SimpleTaggerSentence2TokenSequence; // import the required package/class
public void testConcatenateBadPipes ()
{
Pipe p1 = new SimpleTaggerSentence2TokenSequence ();
// force resolving data alphabet
Alphabet dict1 = p1.getDataAlphabet ();
Pipe p2 = new SimpleTaggerSentence2TokenSequence ();
// force resolving data alphabet
Alphabet dict2 = p2.getDataAlphabet ();
assertTrue (dict1 != dict2);
try {
PipeUtils.concatenatePipes (p1, p2);
assertTrue ("Test failed: concatenatePipes() allowed putting together incompatible alphabets.", false);
} catch (IllegalArgumentException e) {
// Exception expected
}
}
Example 3: build
import cc.mallet.pipe.SimpleTaggerSentence2TokenSequence; // import the required package/class
public Pipe build() {
pipes = new LinkedList<Pipe>();
pipes.add(new SimpleTaggerSentence2TokenSequence(false));
addFeatures();
pipes.add(new TokenSequence2FeatureVectorSequence(true, false));
if (useSCL) {
Pipe pipe = new SerialPipes(pipes);
InstanceList sourceTrainInstances = new InstanceList(pipe);
sourceTrainInstances.addThruPipe(sclSourceIt);
InstanceList targetTrainInstances = new InstanceList(pipe);
targetTrainInstances.addThruPipe(sclTargetIt);
SCL scl = trainSCL(sourceTrainInstances, targetTrainInstances);
pipes.removeLast();
pipes.add(new TokenSequence2FeatureVectorSequence(false, true));
pipes.add(new SCLAugment(scl));
}
return new SerialPipes(pipes);
}
Example 4: ignoretestConcatenateBadPipes
import cc.mallet.pipe.SimpleTaggerSentence2TokenSequence; // import the required package/class
public void ignoretestConcatenateBadPipes ()
{
Pipe p1 = new SimpleTaggerSentence2TokenSequence ();
// force resolving data alphabet
Alphabet dict1 = p1.getDataAlphabet ();
Pipe p2 = new SimpleTaggerSentence2TokenSequence ();
// force resolving data alphabet
Alphabet dict2 = p2.getDataAlphabet ();
assertTrue (dict1 != dict2);
try {
PipeUtils.concatenatePipes (p1, p2);
assertTrue ("Test failed: concatenatePipes() allowed putting together incompatible alphabets.", false);
} catch (IllegalArgumentException e) {
// Exception expected
}
}
Example 5: testPipesAreStupid
import cc.mallet.pipe.SimpleTaggerSentence2TokenSequence; // import the required package/class
public void testPipesAreStupid ()
{
Pipe p1 = new StupidPipe ();
Pipe p2 = new SimpleTaggerSentence2TokenSequence ();
// initialize p2's dict
p2.instanceFrom(new Instance (data, null, null, null));
Pipe serial = new SerialPipes (new Pipe[] { p1, p2 });
try {
serial.getDataAlphabet ();
assertTrue ("Test failed: Should have generated exception.", false);
} catch (IllegalStateException e) {}
}
Example 6: run
import cc.mallet.pipe.SimpleTaggerSentence2TokenSequence; // import the required package/class
public static List<String> run(String trainingFilename, String testingFilename)
throws FileNotFoundException, IOException {
ArrayList<Pipe> pipes = new ArrayList<Pipe>();
pipes.add(new SimpleTaggerSentence2TokenSequence());
pipes.add(new TokenSequence2FeatureSequence());
Pipe pipe = new SerialPipes(pipes);
InstanceList trainingInstances = new InstanceList(pipe);
InstanceList testingInstances = new InstanceList(pipe);
trainingInstances.addThruPipe(new LineGroupIterator(
new BufferedReader(new InputStreamReader(new FileInputStream(trainingFilename))),
Pattern.compile("^\\s*$"), true));
testingInstances.addThruPipe(new LineGroupIterator(
new BufferedReader(new InputStreamReader(new FileInputStream(testingFilename))),
Pattern.compile("^\\s*$"), true));
HMM hmm = new HMM(pipe, null);
hmm.addStatesForLabelsConnectedAsIn(trainingInstances);
HMMTrainerByLikelihood trainer = new HMMTrainerByLikelihood(hmm);
TransducerEvaluator testingEvaluator = new SegmentationEvaluator(testingInstances, "testing");
trainer.train(trainingInstances, 100);
testingEvaluator.evaluate(trainer);
return testingInstances.stream().map(Instance::getData).map(Sequence.class::cast)
.map(hmm::transduce)
.flatMap(output -> IntStream.range(0, output.size()).mapToObj(output::get))
.map(String.class::cast).collect(toList());
// hmm.print();
}
Example 7: VocabProcessor
import cc.mallet.pipe.SimpleTaggerSentence2TokenSequence; // import the required package/class
public VocabProcessor(String... filenames) throws FileNotFoundException {
Pipe pipe = new SimpleTaggerSentence2TokenSequence();
ilists = new InstanceList[filenames.length];
vocabs = new Map[filenames.length];
for (int i = 0; i < filenames.length; i++) {
ilists[i] = new InstanceList(pipe);
ilists[i].addThruPipe(new LineGroupIterator(new FileReader(
filenames[i]), Pattern.compile("^\\s*$"), true));
vocabs[i] = getVocab(ilists[i]);
System.out.println("Vocab " + i + " : " + vocabs[i].size());
}
}
Example 8: ignoretestPipesAreStupid
import cc.mallet.pipe.SimpleTaggerSentence2TokenSequence; // import the required package/class
public void ignoretestPipesAreStupid ()
{
Pipe p1 = new StupidPipe ();
Pipe p2 = new SimpleTaggerSentence2TokenSequence ();
// initialize p2's dict
p2.instanceFrom(new Instance (data, null, null, null));
Pipe serial = new SerialPipes (new Pipe[] { p1, p2 });
try {
serial.getDataAlphabet ();
assertTrue ("Test failed: Should have generated exception.", false);
} catch (IllegalStateException e) {}
}
Example 9: testMultiTag
import cc.mallet.pipe.SimpleTaggerSentence2TokenSequence; // import the required package/class
public static void testMultiTag ()
{
Pipe mtPipe = new SerialPipes (new Pipe[] {
new SimpleTaggerSentence2TokenSequence (),
new TokenText (),
new RegexMatches ("digits", Pattern.compile ("[0-9]+")),
new RegexMatches ("ampm", Pattern.compile ("[aApP][mM]")),
new OffsetFeatureConjunction ("time",
new String[] { "digits", "ampm" },
new int[] { 0, 1 },
true),
new PrintInputAndTarget (),
});
Pipe noMtPipe = new SerialPipes (new Pipe[] {
new SimpleTaggerSentence2TokenSequence (),
new TokenText (),
new RegexMatches ("digits", Pattern.compile ("[0-9]+")),
new RegexMatches ("ampm", Pattern.compile ("[aApP][mM]")),
new OffsetFeatureConjunction ("time",
new String[] { "digits", "ampm" },
new int[] { 0, 1 },
false),
new PrintInputAndTarget (),
});
InstanceList mtLst = new InstanceList (mtPipe);
InstanceList noMtLst = new InstanceList (noMtPipe);
mtLst.addThruPipe (new ArrayIterator (doc1));
noMtLst.addThruPipe (new ArrayIterator (doc1));
Instance mtInst = mtLst.get (0);
Instance noMtInst = noMtLst.get (0);
TokenSequence mtTs = (TokenSequence) mtInst.getData ();
TokenSequence noMtTs = (TokenSequence) noMtInst.getData ();
assertEquals (6, mtTs.size ());
assertEquals (6, noMtTs.size ());
assertEquals (1.0, mtTs.get (3).getFeatureValue ("time"), 1e-15);
assertEquals (1.0, noMtTs.get (3).getFeatureValue ("time"), 1e-15);
assertEquals (1.0, mtTs.get (4).getFeatureValue ("time"), 1e-15);
assertEquals (0.0, noMtTs.get (4).getFeatureValue ("time"), 1e-15);
}
Example 10: TrainCRF
import cc.mallet.pipe.SimpleTaggerSentence2TokenSequence; // import the required package/class
public TrainCRF(String trainingFilename, String testingFilename) throws IOException {
ArrayList<Pipe> pipes = new ArrayList<Pipe>();
int[][] conjunctions = new int[2][];
conjunctions[0] = new int[] { -1 };
conjunctions[1] = new int[] { 1 };
pipes.add(new SimpleTaggerSentence2TokenSequence());
pipes.add(new OffsetConjunctions(conjunctions));
//pipes.add(new FeaturesInWindow("PREV-", -1, 1));
pipes.add(new TokenTextCharSuffix("C1=", 1));
pipes.add(new TokenTextCharSuffix("C2=", 2));
pipes.add(new TokenTextCharSuffix("C3=", 3));
pipes.add(new RegexMatches("CAPITALIZED", Pattern.compile("^\\p{Lu}.*")));
pipes.add(new RegexMatches("STARTSNUMBER", Pattern.compile("^[0-9].*")));
pipes.add(new RegexMatches("HYPHENATED", Pattern.compile(".*\\-.*")));
pipes.add(new RegexMatches("DOLLARSIGN", Pattern.compile(".*\\$.*")));
pipes.add(new TokenFirstPosition("FIRSTTOKEN"));
pipes.add(new TokenSequence2FeatureVectorSequence());
Pipe pipe = new SerialPipes(pipes);
InstanceList trainingInstances = new InstanceList(pipe);
InstanceList testingInstances = new InstanceList(pipe);
trainingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(trainingFilename)))), Pattern.compile("^\\s*$"), true));
testingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(testingFilename)))), Pattern.compile("^\\s*$"), true));
CRF crf = new CRF(pipe, null);
//crf.addStatesForLabelsConnectedAsIn(trainingInstances);
crf.addStatesForThreeQuarterLabelsConnectedAsIn(trainingInstances);
crf.addStartState();
CRFTrainerByLabelLikelihood trainer =
new CRFTrainerByLabelLikelihood(crf);
trainer.setGaussianPriorVariance(10.0);
//CRFTrainerByStochasticGradient trainer =
//new CRFTrainerByStochasticGradient(crf, 1.0);
//CRFTrainerByL1LabelLikelihood trainer =
// new CRFTrainerByL1LabelLikelihood(crf, 0.75);
//trainer.addEvaluator(new PerClassAccuracyEvaluator(trainingInstances, "training"));
trainer.addEvaluator(new PerClassAccuracyEvaluator(testingInstances, "testing"));
trainer.addEvaluator(new TokenAccuracyEvaluator(testingInstances, "testing"));
trainer.train(trainingInstances);
}
Example 11: TrainWikiCRF
import cc.mallet.pipe.SimpleTaggerSentence2TokenSequence; // import the required package/class
public TrainWikiCRF(String trainingFilename, String testingFilename) throws IOException {
ArrayList<Pipe> pipes = new ArrayList<Pipe>();
int[][] conjunctions = new int[2][];
conjunctions[0] = new int[] { -1 };
conjunctions[1] = new int[] { 1 };
pipes.add(new SimpleTaggerSentence2TokenSequence());
pipes.add(new OffsetConjunctions(conjunctions));
//pipes.add(new FeaturesInWindow("PREV-", -1, 1));
pipes.add(new TokenTextCharSuffix("C1=", 1));
pipes.add(new TokenTextCharSuffix("C2=", 2));
pipes.add(new TokenTextCharSuffix("C3=", 3));
pipes.add(new RegexMatches("CAPITALIZED", Pattern.compile("^\\p{Lu}.*")));
pipes.add(new RegexMatches("STARTSNUMBER", Pattern.compile("^[0-9].*")));
pipes.add(new RegexMatches("HYPHENATED", Pattern.compile(".*\\-.*")));
pipes.add(new RegexMatches("DOLLARSIGN", Pattern.compile(".*\\$.*")));
pipes.add(new TokenFirstPosition("FIRSTTOKEN"));
pipes.add(new TokenSequence2FeatureVectorSequence());
Pipe pipe = new SerialPipes(pipes);
InstanceList trainingInstances = new InstanceList(pipe);
InstanceList testingInstances = new InstanceList(pipe);
trainingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(trainingFilename)))), Pattern.compile("^\\s*$"), true));
testingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(testingFilename)))), Pattern.compile("^\\s*$"), true));
CRF crf = new CRF(pipe, null);
//crf.addStatesForLabelsConnectedAsIn(trainingInstances);
crf.addStatesForThreeQuarterLabelsConnectedAsIn(trainingInstances);
crf.addStartState();
CRFTrainerByLabelLikelihood trainer =
new CRFTrainerByLabelLikelihood(crf);
trainer.setGaussianPriorVariance(10.0);
//CRFTrainerByStochasticGradient trainer =
//new CRFTrainerByStochasticGradient(crf, 1.0);
//CRFTrainerByL1LabelLikelihood trainer =
// new CRFTrainerByL1LabelLikelihood(crf, 0.75);
//trainer.addEvaluator(new PerClassAccuracyEvaluator(trainingInstances, "training"));
trainer.addEvaluator(new PerClassAccuracyEvaluator(testingInstances, "testing"));
trainer.addEvaluator(new TokenAccuracyEvaluator(testingInstances, "testing"));
trainer.train(trainingInstances);
}
Example 12: TestCRFPipe
import cc.mallet.pipe.SimpleTaggerSentence2TokenSequence; // import the required package/class
public TestCRFPipe(String trainingFilename) throws IOException {
ArrayList<Pipe> pipes = new ArrayList<Pipe>();
PrintWriter out = new PrintWriter("test.out");
int[][] conjunctions = new int[3][];
conjunctions[0] = new int[] { -1 };
conjunctions[1] = new int[] { 1 };
conjunctions[2] = new int[] { -2, -1 };
pipes.add(new SimpleTaggerSentence2TokenSequence());
//pipes.add(new FeaturesInWindow("PREV-", -1, 1));
//pipes.add(new FeaturesInWindow("NEXT-", 1, 2));
pipes.add(new OffsetConjunctions(conjunctions));
pipes.add(new TokenTextCharSuffix("C1=", 1));
pipes.add(new TokenTextCharSuffix("C2=", 2));
pipes.add(new TokenTextCharSuffix("C3=", 3));
pipes.add(new RegexMatches("CAPITALIZED", Pattern.compile("^\\p{Lu}.*")));
pipes.add(new RegexMatches("STARTSNUMBER", Pattern.compile("^[0-9].*")));
pipes.add(new RegexMatches("HYPHENATED", Pattern.compile(".*\\-.*")));
pipes.add(new RegexMatches("DOLLARSIGN", Pattern.compile("\\$.*")));
pipes.add(new TokenFirstPosition("FIRSTTOKEN"));
pipes.add(new TokenSequence2FeatureVectorSequence());
pipes.add(new SequencePrintingPipe(out));
/*
* new ConllNer2003Sentence2TokenSequence (),
new RegexMatches ("INITCAP", Pattern.compile (CAPS+".*")),
new RegexMatches ("CAPITALIZED", Pattern.compile (CAPS+LOW+"*")),
new RegexMatches ("ALLCAPS", Pattern.compile (CAPS+"+")),
new RegexMatches ("MIXEDCAPS", Pattern.compile ("[A-Z][a-z]+[A-Z][A-Za-z]*")),
new RegexMatches ("CONTAINSDIGITS", Pattern.compile (".*[0-9].*")),
new RegexMatches ("ALLDIGITS", Pattern.compile ("[0-9]+")),
new RegexMatches ("NUMERICAL", Pattern.compile ("[-0-9]+[\\.,]+[0-9\\.,]+")),
//new RegexMatches ("ALPHNUMERIC", Pattern.compile ("[A-Za-z0-9]+")),
//new RegexMatches ("ROMAN", Pattern.compile ("[ivxdlcm]+|[IVXDLCM]+")),
new RegexMatches ("MULTIDOTS", Pattern.compile ("\\.\\.+")),
new RegexMatches ("ENDSINDOT", Pattern.compile ("[^\\.]+.*\\.")),
new RegexMatches ("CONTAINSDASH", Pattern.compile (ALPHANUM+"+-"+ALPHANUM+"*")),
new RegexMatches ("ACRO", Pattern.compile ("[A-Z][A-Z\\.]*\\.[A-Z\\.]*")),
new RegexMatches ("LONELYINITIAL", Pattern.compile (CAPS+"\\.")),
new RegexMatches ("SINGLECHAR", Pattern.compile (ALPHA)),
new RegexMatches ("CAPLETTER", Pattern.compile ("[A-Z]")),
new RegexMatches ("PUNC", Pattern.compile (PUNT)),
new RegexMatches ("QUOTE", Pattern.compile (QUOTE)),
//new RegexMatches ("LOWER", Pattern.compile (LOW+"+")),
//new RegexMatches ("MIXEDCAPS", Pattern.compile ("[A-Z]+[a-z]+[A-Z]+[a-z]*")),
* */
Pipe pipe = new SerialPipes(pipes);
InstanceList trainingInstances = new InstanceList(pipe);
trainingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(trainingFilename)))), Pattern.compile("^\\s*$"), true));
out.close();
}