This article collects typical usage examples of the Java class cc.mallet.extract.StringSpan. If you are unsure what the StringSpan class does, how to use it, or are simply looking for concrete examples, the curated code samples below should help.
The StringSpan class belongs to the cc.mallet.extract package. Eight code examples are shown below, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Java code samples.
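Before the pipe examples, here is a minimal sketch of what a StringSpan is: a lightweight token that records character offsets into a backing document rather than copying the text. It uses only the (CharSequence, int, int) constructor and getText(), both of which appear in the examples below; the sample string is of course illustrative.

import cc.mallet.extract.StringSpan;

public class StringSpanBasics {
    public static void main(String[] args) {
        CharSequence document = "MALLET extracts spans";
        // A StringSpan covers characters [start, end) of the backing document.
        StringSpan span = new StringSpan(document, 7, 15);
        System.out.println(span.getText()); // "extracts"
    }
}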
Example 1: pipe
import cc.mallet.extract.StringSpan; // import the required package/class
public Instance pipe(Instance carrier)
{
  StringTokenization ts = (StringTokenization) carrier.getData();
  StringTokenization newTs = new StringTokenization((CharSequence) ts.getDocument ());
  final LabelAlphabet dict = (LabelAlphabet) getTargetAlphabet();
  LabelSequence labelSeq = new LabelSequence(dict);
  Label start = dict.lookupLabel ("start");
  Label notstart = dict.lookupLabel ("notstart");
  boolean lastWasSpace = true;
  StringBuffer sb = new StringBuffer();
  // drop space tokens; each surviving token is labeled "start" if it followed a space, else "notstart"
  for (int i = 0; i < ts.size(); i++) {
    StringSpan t = (StringSpan) ts.getSpan(i);
    if (t.getText().equals(" "))
      lastWasSpace = true;
    else {
      sb.append(t.getText());
      newTs.add(t);
      labelSeq.add(lastWasSpace ? "start" : "notstart");
      lastWasSpace = false;
    }
  }
  if (isTargetProcessing())
    carrier.setTarget(labelSeq);
  carrier.setData(newTs);
  carrier.setSource(sb.toString());
  return carrier;
}
Example 2: pipe
import cc.mallet.extract.StringSpan; // import the required package/class
public Instance pipe (Instance carrier)
{
  CharSequence string = (CharSequence) carrier.getData();
  lexer.setCharSequence (string);
  TokenSequence ts = new StringTokenization (string);
  while (lexer.hasNext()) {
    lexer.next();
    // each lexer match becomes a StringSpan backed by the original character sequence
    ts.add (new StringSpan (string, lexer.getStartOffset (), lexer.getEndOffset ()));
  }
  carrier.setData(ts);
  return carrier;
}
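For context, the lexer-based pipe above appears to correspond to MALLET's stock CharSequence2TokenSequence pipe. The sketch below assumes that class and its no-argument constructor (an assumption) to show how such a pipe is normally driven through an InstanceList rather than called directly; the four-argument Instance constructor and InstanceList.addThruPipe are standard MALLET API.

import cc.mallet.extract.StringSpan;
import cc.mallet.pipe.CharSequence2TokenSequence;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.types.TokenSequence;

public class TokenizePipeUsage {
    public static void main(String[] args) {
        // Assumption: CharSequence2TokenSequence is the stock pipe this example corresponds to,
        // and its no-argument constructor supplies a default lexer.
        Pipe tokenize = new CharSequence2TokenSequence();

        InstanceList instances = new InstanceList(tokenize);
        instances.addThruPipe(new Instance("Hello MALLET world", null, "doc-1", null));

        TokenSequence ts = (TokenSequence) instances.get(0).getData();
        for (int i = 0; i < ts.size(); i++) {
            StringSpan span = (StringSpan) ts.get(i); // each token is a StringSpan into the input
            System.out.println(span.getText());
        }
    }
}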
Example 3: pipe
import cc.mallet.extract.StringSpan; // import the required package/class
public Instance pipe (Instance carrier)
{
  Object data = carrier.getData ();
  if (data instanceof Tokenization) {
    // we're done
  } else if (data instanceof TokenSequence) {
    StringBuffer buf = new StringBuffer ();
    TokenSequence ts = (TokenSequence) data;
    StringTokenization spans = new StringTokenization (buf); // I can use a StringBuffer as the doc! Awesome!
    for (int i = 0; i < ts.size(); i++) {
      Token token = ts.get(i);
      int start = buf.length ();
      buf.append (token.getText());
      int end = buf.length();
      StringSpan span = new StringSpan (buf, start, end);
      span.setFeatures (token.getFeatures ());
      span.setProperties (token.getProperties ());
      spans.add (span);
      buf.append (" ");
    }
    carrier.setData (spans);
  } else {
    throw new IllegalArgumentException ("Can't convert "+data+" to Tokenization.");
  }
  return carrier;
}
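A brief sketch of the StringBuffer-as-document idea used above: spans created over a growing StringBuffer keep pointing at the right characters as long as the offsets are recorded before and after each append. Everything here mirrors calls that appear in the examples, except getStartIdx()/getEndIdx(), which are assumed accessors from the cc.mallet.extract.Span interface.

import cc.mallet.extract.StringSpan;
import cc.mallet.extract.StringTokenization;

public class BufferBackedSpans {
    public static void main(String[] args) {
        StringBuffer doc = new StringBuffer();
        StringTokenization spans = new StringTokenization(doc); // the buffer is the "document"

        for (String word : new String[] { "gene", "expression", "data" }) {
            int start = doc.length();
            doc.append(word);
            spans.add(new StringSpan(doc, start, doc.length())); // offsets into the shared buffer
            doc.append(' ');
        }

        for (int i = 0; i < spans.size(); i++) {
            StringSpan s = (StringSpan) spans.getSpan(i);
            // getStartIdx()/getEndIdx() are assumed from the Span interface, not shown in the examples
            System.out.println(s.getText() + " [" + s.getStartIdx() + "," + s.getEndIdx() + ")");
        }
    }
}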
Example 4: pipe
import cc.mallet.extract.StringSpan; // import the required package/class
public Instance pipe (Instance carrier)
{
  long millis = System.currentTimeMillis();
  CharSequence string = (CharSequence) carrier.getData();
  lexer.setCharSequence (string);
  TokenSequence ts = new StringTokenization (string);
  while (lexer.hasNext()) {
    lexer.next();
    ts.add (new StringSpan (string, lexer.getStartOffset (), lexer.getEndOffset ()));
  }
  carrier.setData(ts);
  //System.out.println(this.getClass().getCanonicalName() + "----" + (System.currentTimeMillis() - millis));
  return carrier;
}
Example 5: stem
import cc.mallet.extract.StringSpan; // import the required package/class
private TokenSequence stem(TokenSequence tokenSequence) {
  PorterStemmer stemmer = new PorterStemmer();
  StringIterator text = new StringIterator("");
  Iterator<Token> tokens = tokenSequence.iterator();
  while (tokens.hasNext()) {
    Token token = tokens.next();
    if (token == null || !(token instanceof StringSpan)) {
      continue;
    }
    // feed the leading letters of the span's text into the stemmer, then replace the text with the stem
    text.reset(token.getText());
    while (!text.isEndOfText()) {
      char c = text.peek();
      if (!Character.isLetter(c)) {
        break;
      }
      stemmer.add(c);
      text.moveAhead();
    }
    stemmer.stem();
    token.setText(stemmer.toString());
  }
  return tokenSequence;
}
Example 6: pipe
import cc.mallet.extract.StringSpan; // import the required package/class
public Instance pipe (Instance carrier)
{
  String input;
  if (carrier.getData () instanceof CharSequence) {
    input = String.valueOf(carrier.getData ());
  } else {
    throw new ClassCastException("Needed a String; got "+carrier.getData());
  }
  String[] lines = input.split ("\n");
  StringSpan[] spans = new StringSpan[lines.length];
  Labels[] lbls = new Labels[lines.length];
  StringBuffer buf = new StringBuffer ();
  Alphabet dict = getDataAlphabet ();
  // each input line becomes one StringSpan; its whitespace-separated tokens supply labels, features, and token text
  for (int i = 0; i < lines.length; i++) {
    String line = lines[i];
    String[] toks = line.split ("\\s+");
    int j = 0;
    ArrayList thisLabels = new ArrayList ();
    if (!labelsAtEnd) {
      while (!isLabelSeparator (toks, j)) {
        thisLabels.add (labelForTok (toks[j], j));
        j++;
      }
      if ((j < toks.length) && toks[j].equals ("----")) j++;
      lbls[i] = new Labels ((Label[]) thisLabels.toArray (new Label[thisLabels.size ()]));
    }
    int maxFeatureIdx = (labelsAtEnd) ? toks.length - numLabels : toks.length;
    String text = "*???*";
    if (featuresIncludeToken) {
      if (j < maxFeatureIdx) {
        text = toks [j++];
      }
    }
    int start = buf.length ();
    buf.append (text);
    int end = buf.length ();
    buf.append (" ");
    StringSpan span = new StringSpan (buf, start, end);
    while (j < maxFeatureIdx) {
      span.setFeatureValue (toks[j].intern (), 1.0);
      j++;
    }
    if (includeTokenText) {
      span.setFeatureValue ((textFeaturePrefix+text).intern(), 1.0);
    }
    if (labelsAtEnd) {
      int firstLblIdx = j;
      while (j < toks.length) {
        thisLabels.add (labelForTok (toks[j], j - firstLblIdx));
        j++;
      }
      lbls[i] = new Labels ((Label[]) thisLabels.toArray (new Label[thisLabels.size ()]));
    }
    spans[i] = span;
  }
  StringTokenization tokenization = new StringTokenization (buf);
  tokenization.addAll (spans);
  carrier.setData (tokenization);
  carrier.setTarget (new LabelsAssignment (new LabelsSequence (lbls)));
  return carrier;
}
Example 7: pipe
import cc.mallet.extract.StringSpan; // import the required package/class
public Instance pipe (Instance carrier)
{
  CharSequence string = (CharSequence) carrier.getData();
  StringTokenization dataTokens = new StringTokenization (string);
  TokenSequence targetTokens = new TokenSequence ();
  String tag = backgroundTag;
  String nextTag = backgroundTag;
  Matcher m = sgmlPattern.matcher (string);
  int textStart = 0;
  int textEnd = 0;
  int nextStart = 0;
  boolean done = false;
  logger.fine(sgmlPattern.pattern());
  logger.finer(string.toString());
  while (!done) {
    done = !(m.find());
    if (done)
      textEnd = string.length(); // culotta: changed from string.length()-1
    else {
      String sgml = m.group();
      logger.finer ("SGML = "+sgml);
      int groupCount = m.groupCount();
      logger.finer(Integer.toString (groupCount));
      if (sgml.charAt(1) == '/')
        nextTag = backgroundTag;
      else {
        //nextTag = m.group(0);
        nextTag = sgml.substring(1, sgml.length()-1);
      }
      logger.finer("nextTag: " + nextTag);
      nextStart = m.end();   // m.end() returns one beyond the index of the last matched char
      textEnd = m.start();   // String.substring does not include the end index
      logger.finer ("Text start/end "+textStart+" "+textEnd);
    }
    if (textEnd - textStart > 0) {
      logger.finer ("Tag = "+tag);
      logger.finer ("Target = "+string.subSequence (textStart, textEnd));
      lexer.setCharSequence (string.subSequence (textStart, textEnd));
      while (lexer.hasNext()) {
        lexer.next ();
        int tokStart = textStart + lexer.getStartOffset ();
        int tokEnd = textStart + lexer.getEndOffset ();
        dataTokens.add (new StringSpan (string, tokStart, tokEnd));
        targetTokens.add (new Token (tag));
      }
    }
    textStart = nextStart;
    tag = nextTag;
  }
  carrier.setData(dataTokens);
  carrier.setTarget(targetTokens);
  if (saveSource) {
    carrier.setSource(dataTokens);
  }
  return carrier;
}
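To make the offset bookkeeping above concrete, here is a small self-contained sketch (plain java.util.regex, no MALLET types) that walks through an SGML-style string the same way: the text between the previous tag's m.end() and the next tag's m.start() belongs to the currently open tag. The regex and sample string are illustrative assumptions, not the pipe's actual configuration.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class SgmlOffsetsSketch {
    public static void main(String[] args) {
        String text = "<PER>Alice</PER> met <LOC>Paris</LOC> officials";
        Pattern sgml = Pattern.compile("</?([A-Z]+)>");   // illustrative tag pattern
        Matcher m = sgml.matcher(text);

        String tag = "O";      // background tag for untagged text
        int textStart = 0;
        while (true) {
            boolean found = m.find();
            int textEnd = found ? m.start() : text.length();
            if (textEnd > textStart) {
                // in Example 7 this region would become StringSpans labeled with `tag`
                System.out.println("[" + tag + "] \"" + text.substring(textStart, textEnd) + "\"");
            }
            if (!found) break;
            tag = m.group().charAt(1) == '/' ? "O" : m.group(1);  // closing tag -> back to background
            textStart = m.end();
        }
    }
}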
Example 8: pipe
import cc.mallet.extract.StringSpan; // import the required package/class
/**
 * Takes an instance with data of type String or String[][] and creates
 * an Instance of type StringTokenization. Each Token in the sequence
 * gets the text of the line it was built from and one feature of value 1
 * for each "feature" in the line. For example, if the String[][] is
 * {{a,b},{c,d,e}} (and target processing is off) then the text would be
 * "a b" for the first token and "c d e" for the second. Also, the
 * features "a" and "b" would be set for the first token and "c", "d" and
 * "e" for the second. When target processing is on, the last element in
 * the String[] for the current token is taken as the target (label), so
 * in the previous example "b" would have been the label of the first
 * token. (A usage sketch follows the example below.)
 */
public Instance pipe(Instance carrier) {
  Object inputData = carrier.getData();
  LabelAlphabet labels;
  LabelSequence target = null;
  String[][] tokens;
  StringBuffer source = new StringBuffer();
  StringTokenization ts = new StringTokenization(source);
  if (inputData instanceof String)
    tokens = parseSentence((String) inputData);
  else if (inputData instanceof String[][])
    tokens = (String[][]) inputData;
  else
    throw new IllegalArgumentException("Not a String or String[][]; got " + inputData);
  if (isTargetProcessing()) {
    labels = (LabelAlphabet) getTargetAlphabet();
    target = new LabelSequence(labels, tokens.length);
  }
  // each row of tokens[][] produces one StringSpan over the growing source buffer
  for (int l = 0; l < tokens.length; l++) {
    int nFeatures;
    if (isTargetProcessing()) {
      if (tokens[l].length < 1)
        throw new IllegalStateException("Missing label at line "
            + l + " instance " + carrier.getName());
      nFeatures = tokens[l].length - 1;
      target.add(tokens[l][nFeatures]);
    } else
      nFeatures = tokens[l].length;
    int start = source.length();
    String word = makeText(tokens[l]);
    source.append(word + " ");
    Token tok = new StringSpan(source, start, source.length() - 1);
    if (setTokensAsFeatures) {
      for (int f = 0; f < nFeatures; f++)
        tok.setFeatureValue(tokens[l][f], 1.0);
    } else {
      for (int f = 1; f < nFeatures; f++)
        tok.setFeatureValue(tokens[l][f], 1.0);
    }
    ts.add(tok);
  }
  carrier.setData(ts);
  if (isTargetProcessing())
    carrier.setTarget(target);
  return carrier;
}
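The source note below attributes this method to SimpleTaggerSentence2StringTokenization. Assuming that class is on the classpath with a no-argument constructor (both assumptions; neither is shown above), the Javadoc's {{a,b},{c,d,e}} example could be exercised roughly like this:

import cc.mallet.extract.StringTokenization;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.SimpleTaggerSentence2StringTokenization; // package assumed; adjust to where the class lives
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;

public class SimpleTaggerSketch {
    public static void main(String[] args) {
        // Assumption: the pipe class named in the source note exposes a no-argument constructor.
        Pipe pipe = new SimpleTaggerSentence2StringTokenization();

        // The Javadoc's example: two "lines"; with target processing on, the last element of each is its label.
        String[][] sentence = { { "a", "b" }, { "c", "d", "e" } };

        InstanceList instances = new InstanceList(pipe);
        instances.addThruPipe(new Instance(sentence, null, "example", null));

        Instance piped = instances.get(0);
        StringTokenization spans = (StringTokenization) piped.getData();
        System.out.println("tokens: " + spans.size());      // one StringSpan per line
        System.out.println("target: " + piped.getTarget()); // label sequence built from the last elements
    }
}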
Author: kostagiolasn, Project: NucleosomePatternClassifier, Lines of code: 58, Source file: SimpleTaggerSentence2StringTokenization.java