本文整理汇总了Java中cc.mallet.extract.StringTokenization类的典型用法代码示例。如果您正苦于以下问题:Java StringTokenization类的具体用法?Java StringTokenization怎么用?Java StringTokenization使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。
StringTokenization类属于cc.mallet.extract包,在下文中一共展示了StringTokenization类的10个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: testLabelsAtEnd
import cc.mallet.extract.StringTokenization; //导入依赖的package包/类
public void testLabelsAtEnd () throws IOException, ClassNotFoundException
{
    // Configure a two-level pipe that expects the labels to trail the
    // features on each input line.
    GenericAcrfData2TokenSequence pipe = new GenericAcrfData2TokenSequence (2);
    pipe.setLabelsAtEnd (true);

    InstanceList instances = new InstanceList (pipe);
    instances.addThruPipe (new LineGroupIterator (new StringReader (labelsAtEndData), Pattern.compile ("^$"), true));
    assertEquals (1, instances.size ());

    Instance instance = instances.get (0);
    StringTokenization tokens = (StringTokenization) instance.getData ();
    LabelsSequence labels = (LabelsSequence) instance.getTarget ();

    assertEquals (4, labels.size ());
    assertEquals (3, tokens.get (0).getFeatures ().size ());
    assertEquals ("LBLB LBLD", labels.getLabels (0).toString ());

    // Every label at level 0 must come from the pipe's shared alphabet.
    LabelAlphabet dict = pipe.getLabelAlphabet (0);
    assertEquals (2, pipe.numLevels ());
    assertEquals (dict, labels.getLabels (0).get (0).getLabelAlphabet ());
}
示例2: pipe
import cc.mallet.extract.StringTokenization; //导入依赖的package包/类
public Instance pipe(Instance carrier) {
    // Segment the raw text with HanLP, keeping only terms that are at
    // least two characters long and whose part-of-speech is not denied.
    CharSequence string = (CharSequence) carrier.getData();
    TokenSequence tokens = new StringTokenization(string);
    for (Term term : HanLP.segment(string.toString())) {
        boolean tooShort = term.word.length() < 2;
        boolean deniedPos = denydPosSet.contains(term.nature.name());
        if (!tooShort && !deniedPos) {
            tokens.add(new Token(term.word));
        }
    }
    carrier.setData(tokens);
    return carrier;
}
示例3: ignoretestLabelsAtEnd
import cc.mallet.extract.StringTokenization; //导入依赖的package包/类
public void ignoretestLabelsAtEnd () throws IOException, ClassNotFoundException
{
    // Disabled test ("ignore" prefix): exercises trailing-label parsing on a
    // two-level pipe, mirroring testLabelsAtEnd.
    GenericAcrfData2TokenSequence seqPipe = new GenericAcrfData2TokenSequence (2);
    seqPipe.setLabelsAtEnd (true);

    InstanceList list = new InstanceList (seqPipe);
    list.addThruPipe (new LineGroupIterator (new StringReader (labelsAtEndData), Pattern.compile ("^$"), true));
    assertEquals (1, list.size ());

    Instance first = list.get (0);
    StringTokenization tokenization = (StringTokenization) first.getData ();
    LabelsSequence targetLabels = (LabelsSequence) first.getTarget ();

    assertEquals (4, targetLabels.size ());
    assertEquals (3, tokenization.get (0).getFeatures ().size ());
    assertEquals ("LBLB LBLD", targetLabels.getLabels (0).toString ());

    // The per-token alphabet must be the pipe's global level-0 alphabet.
    LabelAlphabet level0Dict = seqPipe.getLabelAlphabet (0);
    assertEquals (2, seqPipe.numLevels ());
    assertEquals (level0Dict, targetLabels.getLabels (0).get (0).getLabelAlphabet ());
}
示例4: pipe
import cc.mallet.extract.StringTokenization; //导入依赖的package包/类
public Instance pipe(Instance carrier)
{
    // Re-tokenize an existing StringTokenization, dropping single-space
    // tokens. Each surviving token is labeled "start" if it immediately
    // follows whitespace (i.e. begins a word), otherwise "notstart".
    StringTokenization ts = (StringTokenization) carrier.getData();
    StringTokenization newTs = new StringTokenization((CharSequence) ts.getDocument ());
    final LabelAlphabet dict = (LabelAlphabet) getTargetAlphabet();
    LabelSequence labelSeq = new LabelSequence(dict);
    // Intern both labels up front so the alphabet always contains them,
    // even for inputs that never produce one of the two.
    Label start = dict.lookupLabel ("start");
    Label notstart = dict.lookupLabel ("notstart");

    boolean lastWasSpace = true;  // first token always starts a word
    // Local-only buffer: StringBuilder avoids StringBuffer's needless locking.
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < ts.size(); i++) {
        StringSpan t = (StringSpan) ts.getSpan(i);
        if (t.getText().equals(" "))
            lastWasSpace = true;
        else {
            sb.append(t.getText());
            newTs.add(t);
            labelSeq.add(lastWasSpace ? "start" : "notstart");
            lastWasSpace = false;
        }
    }
    if (isTargetProcessing())
        carrier.setTarget(labelSeq);
    carrier.setData(newTs);
    // Source becomes the concatenation of all non-space token texts.
    carrier.setSource(sb.toString());
    return carrier;
}
示例5: pipe
import cc.mallet.extract.StringTokenization; //导入依赖的package包/类
public Instance pipe (Instance carrier)
{
    // Drive the lexer over the raw character data; every match becomes a
    // StringSpan anchored by offsets into the original document.
    CharSequence text = (CharSequence) carrier.getData();
    lexer.setCharSequence (text);
    TokenSequence tokens = new StringTokenization (text);
    while (lexer.hasNext()) {
        lexer.next();
        int begin = lexer.getStartOffset ();
        int end = lexer.getEndOffset ();
        tokens.add (new StringSpan (text, begin, end));
    }
    carrier.setData(tokens);
    return carrier;
}
示例6: pipe
import cc.mallet.extract.StringTokenization; //导入依赖的package包/类
public Instance pipe (Instance carrier)
{
// Ensures the instance's data is a Tokenization. A plain TokenSequence is
// converted by concatenating token texts (space-separated) into a synthetic
// document and wrapping each token as a StringSpan over that document.
Object data = carrier.getData ();
if (data instanceof Tokenization) {
// we're done
} else if (data instanceof TokenSequence) {
StringBuffer buf = new StringBuffer ();
TokenSequence ts = (TokenSequence) data;
// The spans alias buf directly, so buf must remain the live document
// object; text appended later does not disturb earlier spans' offsets.
StringTokenization spans = new StringTokenization (buf); // I can use a StringBuffer as the doc! Awesome!
for (int i = 0; i < ts.size(); i++) {
Token token = ts.get(i);
// Record the token's position in the synthetic document before/after append.
int start = buf.length ();
buf.append (token.getText());
int end = buf.length();
StringSpan span = new StringSpan (buf, start, end);
// Carry the token's features and properties over to the new span.
span.setFeatures (token.getFeatures ());
span.setProperties (token.getProperties ());
spans.add (span);
buf.append (" ");
}
carrier.setData (spans);
} else {
throw new IllegalArgumentException ("Can't convert "+data+" to Tokenization.");
}
return carrier;
}
示例7: pipe
import cc.mallet.extract.StringTokenization; //导入依赖的package包/类
public Instance pipe (Instance carrier)
{
    // Tokenize the raw text with the lexer; each token is a StringSpan
    // holding offsets back into the original character sequence.
    // (Removed dead timing scaffolding: unused `millis` local and the
    // commented-out elapsed-time println it fed.)
    CharSequence string = (CharSequence) carrier.getData();
    lexer.setCharSequence (string);
    TokenSequence ts = new StringTokenization (string);
    while (lexer.hasNext()) {
        lexer.next();
        ts.add (new StringSpan (string, lexer.getStartOffset (), lexer.getEndOffset ()));
    }
    carrier.setData(ts);
    return carrier;
}
示例8: pipe
import cc.mallet.extract.StringTokenization; //导入依赖的package包/类
public Instance pipe (Instance carrier)
{
// Parses whitespace-separated lines of "labels [----] features" (or
// "features labels" when labelsAtEnd) into a StringTokenization (data)
// and a LabelsAssignment target. One line == one token/span.
String input;
if (carrier.getData () instanceof CharSequence) {
input = String.valueOf(carrier.getData ());
} else {
throw new ClassCastException("Needed a String; got "+carrier.getData());
}
String[] lines = input.split ("\n");
StringSpan[] spans = new StringSpan[lines.length];
Labels[] lbls = new Labels[lines.length];
// buf accumulates all token texts; spans index into it by offset.
StringBuffer buf = new StringBuffer ();
Alphabet dict = getDataAlphabet ();
for (int i = 0; i < lines.length; i++) {
String line = lines[i];
String[] toks = line.split ("\\s+");
int j = 0;
ArrayList thisLabels = new ArrayList ();
if (!labelsAtEnd) {
// Labels lead the line; consume until the separator token.
while (!isLabelSeparator (toks, j)) {
thisLabels.add (labelForTok (toks[j], j));
j++;
}
// Skip an explicit "----" separator if present.
if ((j < toks.length) && toks[j].equals ("----")) j++;
lbls[i] = new Labels ((Label[]) thisLabels.toArray (new Label[thisLabels.size ()]));
}
// With trailing labels, the last numLabels tokens are labels, not features.
int maxFeatureIdx = (labelsAtEnd) ? toks.length - numLabels : toks.length;
String text = "*???*";  // placeholder when no token text is available
if (featuresIncludeToken) {
if (j < maxFeatureIdx) {
text = toks [j++];
}
}
// Record the token text's position in the accumulated document.
int start = buf.length ();
buf.append (text);
int end = buf.length ();
buf.append (" ");
StringSpan span = new StringSpan (buf, start, end);
// Remaining tokens up to maxFeatureIdx become binary features on the span.
while (j < maxFeatureIdx) {
span.setFeatureValue (toks[j].intern (), 1.0);
j++;
}
if (includeTokenText) {
span.setFeatureValue ((textFeaturePrefix+text).intern(), 1.0);
}
if (labelsAtEnd) {
// Trailing labels: label indices restart at 0 from the first label token.
int firstLblIdx = j;
while (j < toks.length) {
thisLabels.add (labelForTok (toks[j], j - firstLblIdx));
j++;
}
lbls[i] = new Labels ((Label[]) thisLabels.toArray (new Label[thisLabels.size ()]));
}
spans[i] = span;
}
StringTokenization tokenization = new StringTokenization (buf);
tokenization.addAll (spans);
carrier.setData (tokenization);
carrier.setTarget (new LabelsAssignment (new LabelsSequence (lbls)));
return carrier;
}
示例9: pipe
import cc.mallet.extract.StringTokenization; //导入依赖的package包/类
public Instance pipe (Instance carrier)
{
// Strips SGML-style tags from the text and tokenizes the remainder.
// Each data token is a StringSpan into the ORIGINAL string; each target
// token is the tag enclosing that span (backgroundTag outside any tag).
CharSequence string = (CharSequence) carrier.getData();
StringTokenization dataTokens = new StringTokenization (string);
TokenSequence targetTokens = new TokenSequence ();
String tag = backgroundTag;      // tag in effect for the current text run
String nextTag = backgroundTag;  // tag to apply after the current match
Matcher m = sgmlPattern.matcher (string);
int textStart = 0;
int textEnd = 0;
int nextStart = 0;
boolean done = false;
logger.fine(sgmlPattern.pattern());
logger.finer(string.toString());
while (!done) {
done = !(m.find());
if (done)
textEnd = string.length(); // culotta: changed from string.length()-1
else {
String sgml = m.group();
logger.finer ("SGML = "+sgml);
int groupCount = m.groupCount();
logger.finer(Integer.toString (groupCount));
// A closing tag ("</...>") returns to the background; an opening tag
// switches to its name (the text between '<' and '>').
if (sgml.charAt(1) == '/')
nextTag = backgroundTag;
else{
//nextTag = m.group(0);
nextTag = sgml.substring(1, sgml.length()-1);
}
logger.finer("nextTag: " + nextTag);
nextStart = m.end(); // m.end returns one beyond index of last match char
textEnd = m.start(); // String.subtring does not include index end
logger.finer ("Text start/end "+textStart+" "+textEnd);
}
if (textEnd - textStart > 0) {
// Tokenize the plain-text run between tags; spans keep absolute offsets.
logger.finer ("Tag = "+tag);
logger.finer ("Target = "+string.subSequence (textStart, textEnd));
lexer.setCharSequence (string.subSequence (textStart, textEnd));
while (lexer.hasNext()) {
lexer.next ();
int tokStart = textStart + lexer.getStartOffset ();
int tokEnd = textStart + lexer.getEndOffset ();
dataTokens.add (new StringSpan (string, tokStart, tokEnd));
targetTokens.add (new Token (tag));
}
}
textStart = nextStart;
tag = nextTag;
}
carrier.setData(dataTokens);
carrier.setTarget(targetTokens);
if (saveSource) {
carrier.setSource(dataTokens);
}
return carrier;
}
示例10: pipe
import cc.mallet.extract.StringTokenization; //导入依赖的package包/类
/**
* Takes an instance with data of type String or String[][] and creates
* an Instance of type StringTokenization. Each Token in the sequence
* gets the text of the line preceding it and one feature of value 1
* for each "Feature" in the line. For example, if the String[][] is
* {{a,b},{c,d,e}} (and target processing is off) then the text would be
* "a b" for the first token and "c d e" for the second. Also, the
* features "a" and "b" would be set for the first token and "c", "d" and
* "e" for the second. When target processing is on, the last element in
* the String[] for the current token is instead taken as the target
* (label), so in the previous example "b" would have been the label of
* the first sequence.
*/
public Instance pipe(Instance carrier) {
Object inputData = carrier.getData();
LabelAlphabet labels;
LabelSequence target = null;
String[][] tokens;
// source accumulates all token text; each span indexes into it.
StringBuffer source = new StringBuffer();
StringTokenization ts = new StringTokenization(source);
if (inputData instanceof String)
tokens = parseSentence((String) inputData);
else if (inputData instanceof String[][])
tokens = (String[][]) inputData;
else
throw new IllegalArgumentException("Not a String; got " + inputData);
if (isTargetProcessing()) {
labels = (LabelAlphabet) getTargetAlphabet();
target = new LabelSequence(labels, tokens.length);
}
for (int l = 0; l < tokens.length; l++) {
int nFeatures;
if (isTargetProcessing()) {
// Last element of the line is the label; the rest are features.
if (tokens[l].length < 1)
throw new IllegalStateException("Missing label at line "
+ l + " instance " + carrier.getName());
nFeatures = tokens[l].length - 1;
target.add(tokens[l][nFeatures]);
} else
nFeatures = tokens[l].length;
int start = source.length();
String word = makeText(tokens[l]);
source.append(word + " ");
// Span excludes the trailing separator space just appended.
Token tok = new StringSpan(source, start, source.length() - 1);
if (setTokensAsFeatures) {
// Include element 0 (the token text itself) as a feature.
for (int f = 0; f < nFeatures; f++)
tok.setFeatureValue(tokens[l][f], 1.0);
} else {
// Skip element 0; only the remaining elements become features.
for (int f = 1; f < nFeatures; f++)
tok.setFeatureValue(tokens[l][f], 1.0);
}
ts.add(tok);
}
carrier.setData(ts);
if (isTargetProcessing())
carrier.setTarget(target);
return carrier;
}
开发者ID:kostagiolasn,项目名称:NucleosomePatternClassifier,代码行数:58,代码来源:SimpleTaggerSentence2StringTokenization.java