本文整理汇总了Java中cc.mallet.util.CharSequenceLexer类的典型用法代码示例。如果您正苦于以下问题:Java CharSequenceLexer类的具体用法?Java CharSequenceLexer怎么用?Java CharSequenceLexer使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
CharSequenceLexer类属于cc.mallet.util包,在下文中一共展示了CharSequenceLexer类的14个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: main
import cc.mallet.util.CharSequenceLexer; //导入依赖的package包/类
/**
 * Demo entry point: pipes each file named on the command line through
 * Input2CharSequence -> CharSequence2TokenSequence and prints the
 * resulting token sequence.
 */
public static void main (String[] args)
{
    try {
        for (String fileName : args) {
            Instance carrier = new Instance (new File (fileName), null, null, null);
            SerialPipes pipeline = new SerialPipes (new Pipe[] {
                new Input2CharSequence (),
                new CharSequence2TokenSequence (new CharSequenceLexer ())});
            carrier = pipeline.newIteratorFrom (new SingleInstanceIterator (carrier)).next ();
            TokenSequence tokens = (TokenSequence) carrier.getData ();
            System.out.println ("===");
            System.out.println (fileName);
            System.out.println (tokens.toString ());
        }
    } catch (Exception e) {
        // Demo code: report and dump the stack, matching the original behavior.
        System.out.println (e);
        e.printStackTrace ();
    }
}
示例2: pipe
import cc.mallet.util.CharSequenceLexer; //导入依赖的package包/类
/**
 * Filters the carrier's TokenSequence down to purely alphabetic tokens.
 * When {@code markDeletions} is set, each dropped token's text is recorded
 * as a deletion-mark property on the most recently kept token.
 *
 * @param carrier instance whose data is a TokenSequence; mutated in place
 * @return the same carrier, with its data replaced by the filtered sequence
 */
public Instance pipe (Instance carrier)
{
    TokenSequence input = (TokenSequence) carrier.getData ();
    // xxx This doesn't seem so efficient. Perhaps have TokenSequence
    // use a LinkedList, and remove Tokens from it? -?
    // But a LinkedList implementation of TokenSequence would be quite inefficient -AKM
    TokenSequence kept = new TokenSequence ();
    Token lastKept = null;
    for (int idx = 0; idx < input.size (); idx++) {
        Token token = input.get (idx);
        boolean isAlpha = CharSequenceLexer.LEX_ALPHA.matcher (token.getText ()).matches ();
        if (isAlpha) {
            kept.add (token);
            lastKept = token;
        } else if (markDeletions && lastKept != null) {
            // Leading non-alpha tokens (lastKept == null) are silently dropped.
            lastKept.setProperty (FeatureSequenceWithBigrams.deletionMark, token.getText ());
        }
    }
    carrier.setData (kept);
    return carrier;
}
示例3: testToXml
import cc.mallet.util.CharSequenceLexer; //导入依赖的package包/类
/** Checks DocumentExtraction XML rendering of a flat (non-BIO) tag sequence. */
public void testToXml () {
    LabelAlphabet labels = new LabelAlphabet ();
    String text = "the quick brown fox leapt over the lazy dog";
    StringTokenization tokens = new StringTokenization (text, new CharSequenceLexer ());
    Label bg = labels.lookupLabel ("O");
    Label animal = labels.lookupLabel ("ANIMAL");
    Label verb = labels.lookupLabel ("VERB");
    // One label per whitespace-delimited token of `text`.
    LabelSequence tags = new LabelSequence (
        new Label[] { bg, animal, animal, animal, verb, bg, bg, animal, animal });
    DocumentExtraction extraction = new DocumentExtraction ("Test", labels, tokens, tags, "O");
    String expected =
        "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n" +
        "<doc>the <ANIMAL>quick brown fox </ANIMAL><VERB>leapt </VERB>over the <ANIMAL>lazy dog</ANIMAL></doc>\r\n";
    assertEquals (expected, extraction.toXmlString ());
}
示例4: testToXmlBIO
import cc.mallet.util.CharSequenceLexer; //导入依赖的package包/类
/**
 * Checks that BIOTokenizationFilter splits adjacent spans at B- tags:
 * the two animals separated by B-ANIMAL render as distinct elements.
 */
public void testToXmlBIO () {
    LabelAlphabet labels = new LabelAlphabet ();
    String text = "the quick brown fox leapt over the lazy dog";
    StringTokenization tokens = new StringTokenization (text, new CharSequenceLexer ());
    Label bg = labels.lookupLabel ("O");
    Label beginAnimal = labels.lookupLabel ("B-ANIMAL");
    Label animal = labels.lookupLabel ("ANIMAL");
    Label beginVerb = labels.lookupLabel ("B-VERB");
    Label insideVerb = labels.lookupLabel ("I-VERB");
    LabelSequence tags = new LabelSequence (
        new Label[] { bg, beginAnimal, animal, beginAnimal, beginVerb, insideVerb, bg, animal, animal });
    DocumentExtraction extraction = new DocumentExtraction (
        "Test", labels, tokens, tags, null, "O", new BIOTokenizationFilter ());
    String expected =
        "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n" +
        "<doc>the <ANIMAL>quick brown </ANIMAL><ANIMAL>fox </ANIMAL><VERB>leapt over </VERB>the <ANIMAL>lazy dog</ANIMAL></doc>\r\n";
    assertEquals (expected, extraction.toXmlString ());
}
示例5: main
import cc.mallet.util.CharSequenceLexer; //导入依赖的package包/类
/**
 * Demo entry point for the Big* pipe variants: runs each command-line file
 * through the tokenizing pipeline. Produces no output on success; errors
 * dump a stack trace.
 */
public static void main (String[] args)
{
    try {
        for (String fileName : args) {
            Instance carrier = new Instance (new File (fileName), null, null, null);
            BigSerialPipes pipeline = new BigSerialPipes (new BigPipe[] {
                new BigInput2CharSequence (),
                new BigCharSequence2TokenSequence (new CharSequenceLexer ())});
            carrier = pipeline.newIteratorFrom (new BigSingleInstanceIterator (carrier)).next ();
            // Result is otherwise unused; the cast acts as a runtime type check
            // that the pipeline really produced a TokenSequence.
            TokenSequence tokens = (TokenSequence) carrier.getData ();
        }
    } catch (Exception e) {
        e.printStackTrace ();
    }
}
示例6: ignoretestToXml
import cc.mallet.util.CharSequenceLexer; //导入依赖的package包/类
/**
 * Disabled copy of testToXml (renamed with an "ignore" prefix so the test
 * runner skips it): XML rendering of a flat tag sequence.
 */
public void ignoretestToXml () {
    LabelAlphabet labels = new LabelAlphabet ();
    String text = "the quick brown fox leapt over the lazy dog";
    StringTokenization tokens = new StringTokenization (text, new CharSequenceLexer ());
    Label bg = labels.lookupLabel ("O");
    Label animal = labels.lookupLabel ("ANIMAL");
    Label verb = labels.lookupLabel ("VERB");
    LabelSequence tags = new LabelSequence (
        new Label[] { bg, animal, animal, animal, verb, bg, bg, animal, animal });
    DocumentExtraction extraction = new DocumentExtraction ("Test", labels, tokens, tags, "O");
    String expected =
        "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n" +
        "<doc>the <ANIMAL>quick brown fox </ANIMAL><VERB>leapt </VERB>over the <ANIMAL>lazy dog</ANIMAL></doc>\r\n";
    assertEquals (expected, extraction.toXmlString ());
}
示例7: ignoretestToXmlBIO
import cc.mallet.util.CharSequenceLexer; //导入依赖的package包/类
/**
 * Disabled copy of testToXmlBIO (renamed with an "ignore" prefix so the
 * test runner skips it): BIO filter splits spans at B- boundaries.
 */
public void ignoretestToXmlBIO () {
    LabelAlphabet labels = new LabelAlphabet ();
    String text = "the quick brown fox leapt over the lazy dog";
    StringTokenization tokens = new StringTokenization (text, new CharSequenceLexer ());
    Label bg = labels.lookupLabel ("O");
    Label beginAnimal = labels.lookupLabel ("B-ANIMAL");
    Label animal = labels.lookupLabel ("ANIMAL");
    Label beginVerb = labels.lookupLabel ("B-VERB");
    Label insideVerb = labels.lookupLabel ("I-VERB");
    LabelSequence tags = new LabelSequence (
        new Label[] { bg, beginAnimal, animal, beginAnimal, beginVerb, insideVerb, bg, animal, animal });
    DocumentExtraction extraction = new DocumentExtraction (
        "Test", labels, tokens, tags, null, "O", new BIOTokenizationFilter ());
    String expected =
        "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n" +
        "<doc>the <ANIMAL>quick brown </ANIMAL><ANIMAL>fox </ANIMAL><VERB>leapt over </VERB>the <ANIMAL>lazy dog</ANIMAL></doc>\r\n";
    assertEquals (expected, extraction.toXmlString ());
}
示例8: tags
import cc.mallet.util.CharSequenceLexer; //导入依赖的package包/类
/**
 * Builds a pipe that labels only a chosen subset of SGML tags.
 *
 * @param lexer tokenizer used to split the character input
 * @param backgroundTag tag assigned to tokens that fall outside every
 *        allowed tag
 * @param allowed set of tag names (Strings) that will be converted to
 *        labels; all other tags are treated as background
 */
public SelectiveSGML2TokenSequence (CharSequenceLexer lexer, String backgroundTag, Set allowed)
{
    this.allowedTags = allowed;
    this.backgroundTag = backgroundTag;
    this.lexer = lexer;
}
示例9: readObject
import cc.mallet.util.CharSequenceLexer; //导入依赖的package包/类
// Custom deserialization hook: restores this pipe's fields in the exact
// order they were written. Do not reorder these reads.
// NOTE(review): the serial-version int is read but never checked --
// presumably only one on-disk format exists for this class; confirm
// before adding fields.
private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
int version = in.readInt ();
sgmlPattern = (Pattern) in.readObject();
lexer = (CharSequenceLexer) in.readObject();
backgroundTag = (String) in.readObject();
allowedTags = (Set) in.readObject();
}
示例10: readObject
import cc.mallet.util.CharSequenceLexer; //导入依赖的package包/类
/**
 * Custom deserialization hook. Field reads must stay in write order.
 * Version 0 streams predate the {@code saveSource} flag, so it defaults
 * to {@code true}; later versions carry it explicitly.
 */
private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
    int version = in.readInt ();
    sgmlPattern = (Pattern) in.readObject ();
    lexer = (CharSequenceLexer) in.readObject ();
    backgroundTag = (String) in.readObject ();
    // The conditional keeps the stream position correct: version-0 data
    // has no trailing boolean to consume.
    saveSource = (version == 0) ? true : in.readBoolean ();
}
示例11: StringTokenization
import cc.mallet.util.CharSequenceLexer; //导入依赖的package包/类
/**
* Creates a tokenization of the given string. Tokens are
* added from all the matches of the given lexer.
*/
public StringTokenization (CharSequence string, CharSequenceLexer lexer)
{
super();
this.document = string;
lexer.setCharSequence (string);
while (lexer.hasNext()) {
lexer.next ();
this.add (new StringSpan (string, lexer.getStartOffset(), lexer.getEndOffset()));
}
}
示例12: createExtractionFrom
import cc.mallet.util.CharSequenceLexer; //导入依赖的package包/类
/**
 * Builds an Extraction from parallel arrays of SGML-annotated strings:
 * {@code predStrings[i]} supplies the predicted tagging and
 * {@code trueStrings[i]} the gold tagging for document i. Both arrays are
 * run through the same pipe so they share one label alphabet.
 *
 * @param predStrings predicted annotations, one document per element
 * @param trueStrings gold annotations, aligned index-for-index with predStrings
 * @return an Extraction holding one DocumentExtraction per document
 */
private Extraction createExtractionFrom (String[] predStrings, String[] trueStrings)
{
    Pipe pipeline = new SerialPipes (new Pipe[] {
        new SGML2TokenSequence (new CharSequenceLexer (CharSequenceLexer.LEX_NONWHITESPACE_CLASSES ), "O"),
        new Target2LabelSequence (),
        new PrintInputAndTarget (),
    });
    InstanceList predicted = new InstanceList (pipeline);
    predicted.addThruPipe (new ArrayIterator (predStrings));
    InstanceList gold = new InstanceList (pipeline);
    gold.addThruPipe (new ArrayIterator (trueStrings));
    LabelAlphabet labels = (LabelAlphabet) pipeline.getTargetAlphabet ();
    Extraction extraction = new Extraction (null, labels);
    // Zip the two lists by index: each prediction is paired with its gold target.
    for (int docIdx = 0; docIdx < predicted.size (); docIdx++) {
        Instance predInstance = predicted.get (docIdx);
        Instance goldInstance = gold.get (docIdx);
        Tokenization tokenization = (Tokenization) predInstance.getData ();
        Sequence predictedTags = (Sequence) predInstance.getTarget ();
        Sequence goldTags = (Sequence) goldInstance.getTarget ();
        DocumentExtraction docExtraction = new DocumentExtraction (
            "TEST" + docIdx, labels, tokenization, predictedTags, goldTags, "O");
        extraction.addDocumentExtraction (docExtraction);
    }
    return extraction;
}
示例13: testNestedToXML
import cc.mallet.util.CharSequenceLexer; //导入依赖的package包/类
/**
 * Checks XML rendering when extra LabeledSpans are nested inside the
 * spans derived from the tag sequence (MAMMAL inside ANIMAL, ADJ inside
 * the second ANIMAL).
 */
public void testNestedToXML ()
{
    LabelAlphabet labels = new LabelAlphabet ();
    String text = "the quick brown fox leapt over the lazy dog";
    StringTokenization tokens = new StringTokenization (text, new CharSequenceLexer ());
    Label bg = labels.lookupLabel ("O");
    Label animal = labels.lookupLabel ("ANIMAL");
    Label verb = labels.lookupLabel ("VERB");
    Label adjective = labels.lookupLabel ("ADJ");
    Label mammal = labels.lookupLabel ("MAMMAL");
    LabelSequence tags = new LabelSequence (
        new Label[] { bg, animal, animal, animal, verb, bg, animal, animal, animal });
    LabeledSpans spans = new DefaultTokenizationFilter ().constructLabeledSpans (labels, text, bg, tokens, tags);
    // Nest "fox" (token 3) under MAMMAL and "lazy" (token 7) under ADJ.
    Span foxSpan = tokens.subspan (3, 4);
    spans.add (new LabeledSpan (foxSpan, mammal, false));
    Span lazySpan = tokens.subspan (7, 8);
    spans.add (new LabeledSpan (lazySpan, adjective, false));
    DocumentExtraction extraction = new DocumentExtraction ("Test", labels, tokens, spans, null, "O");
    String expected =
        "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n" +
        "<doc>the <ANIMAL>quick brown <MAMMAL>fox </MAMMAL></ANIMAL><VERB>leapt </VERB>over <ANIMAL>the <ADJ>lazy </ADJ>dog</ANIMAL></doc>\r\n";
    assertEquals (expected, extraction.toXmlString ());
}
示例14: testNestedXMLTokenizationFilter
import cc.mallet.util.CharSequenceLexer; //导入依赖的package包/类
/**
 * Checks HierarchicalTokenizationFilter: pipe-delimited labels such as
 * "ANIMAL|ADJ|MAMMAL" render as nested XML elements, and the ignore
 * pattern suppresses matching levels of the hierarchy.
 */
public void testNestedXMLTokenizationFilter ()
{
    LabelAlphabet labels = new LabelAlphabet ();
    String text = "the quick brown fox leapt over the lazy dog";
    StringTokenization tokens = new StringTokenization (text, new CharSequenceLexer ());
    Label bg = labels.lookupLabel ("O");
    Label animal = labels.lookupLabel ("ANIMAL");
    Label animalMammal = labels.lookupLabel ("ANIMAL|MAMMAL");
    Label verb = labels.lookupLabel ("VERB");
    Label animalAdj = labels.lookupLabel ("ANIMAL|ADJ");
    Label animalAdjMammal = labels.lookupLabel ("ANIMAL|ADJ|MAMMAL");
    LabelSequence tags = new LabelSequence (
        new Label[] { bg, animal, animal, animalMammal, verb, bg, animal, animalAdj, animalAdjMammal });
    DocumentExtraction extraction = new DocumentExtraction (
        "Test", labels, tokens, tags, null, "O", new HierarchicalTokenizationFilter ());
    String expected =
        "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n" +
        "<doc>the <ANIMAL>quick brown <MAMMAL>fox </MAMMAL></ANIMAL><VERB>leapt </VERB>over <ANIMAL>the <ADJ>lazy <MAMMAL>dog</MAMMAL></ADJ></ANIMAL></doc>\r\n";
    assertEquals (expected, extraction.toXmlString ());
    // Test the ignore function: labels matching "AD.*" (i.e. ADJ) are dropped
    // from the hierarchy while their children are kept.
    extraction = new DocumentExtraction (
        "Test", labels, tokens, tags, null, "O", new HierarchicalTokenizationFilter (Pattern.compile ("AD.*")));
    expected =
        "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n" +
        "<doc>the <ANIMAL>quick brown <MAMMAL>fox </MAMMAL></ANIMAL><VERB>leapt </VERB>over <ANIMAL>the lazy <MAMMAL>dog</MAMMAL></ANIMAL></doc>\r\n";
    assertEquals (expected, extraction.toXmlString ());
}