This article collects typical usage examples of the Java method com.aliasi.tokenizer.Tokenizer.tokenize. If you have been wondering what exactly Java Tokenizer.tokenize does, how to use it, or what it looks like in practice, the curated code examples below may help. You can also explore further usage examples of the method's enclosing class, com.aliasi.tokenizer.Tokenizer.
The following presents 7 code examples of the Tokenizer.tokenize method, sorted by popularity by default.
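Before turning to the examples, here is a minimal, self-contained sketch of the two tokenize overloads used throughout them. This is an illustration rather than one of the collected examples; the choice of IndoEuropeanTokenizerFactory as the concrete factory is an assumption.

import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class TokenizeBasics {
    public static void main(String[] args) {
        TokenizerFactory factory = IndoEuropeanTokenizerFactory.INSTANCE;
        String text = "LingPipe tokenizes text.";
        char[] cs = text.toCharArray();

        // Overload 1: fill a token list and a whitespace list in parallel.
        // The whitespace list ends up one entry longer than the token list:
        // whites.get(0) precedes the first token, whites.get(j + 1) follows tokens.get(j).
        List<String> tokens = new ArrayList<String>();
        List<String> whites = new ArrayList<String>();
        Tokenizer tokenizer = factory.tokenizer(cs, 0, cs.length);
        tokenizer.tokenize(tokens, whites);
        System.out.println(tokens);

        // Overload 2: return only the tokens as an array, discarding whitespace.
        String[] tokenArray = factory.tokenizer(cs, 0, cs.length).tokenize();
        System.out.println(Arrays.toString(tokenArray));
    }
}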
Example 1: getSentences
import com.aliasi.tokenizer.Tokenizer; // import the package/class the method depends on
public String[] getSentences(String text) {
    ArrayList<String> tokenList = new ArrayList<>();
    ArrayList<String> whiteList = new ArrayList<>();
    Tokenizer tokenizer = tokenizerFactory.tokenizer(text.toCharArray(), 0, text.length());
    tokenizer.tokenize(tokenList, whiteList);
    String[] tokens = new String[tokenList.size()];
    String[] whites = new String[whiteList.size()];
    tokenList.toArray(tokens);
    whiteList.toArray(whites);
    int[] sentenceBoundaries = sentenceModel.boundaryIndices(tokens, whites);
    if (sentenceBoundaries.length < 1) {
        return new String[0];
    }
    String[] result = new String[sentenceBoundaries.length];
    int sentStartTok = 0;
    int sentEndTok;
    for (int i = 0; i < sentenceBoundaries.length; ++i) {
        sentEndTok = sentenceBoundaries[i];
        StringBuilder sb = new StringBuilder();
        for (int j = sentStartTok; j <= sentEndTok; j++) {
            sb.append(tokens[j]).append(whites[j + 1]);
        }
        result[i] = sb.toString();
        sentStartTok = sentEndTok + 1;
    }
    return result;
}
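The fields this method relies on are not shown in the example. Here is a minimal sketch of an assumed setup; the concrete factory and sentence-model classes are guesses consistent with LingPipe's API, not taken from the original source:

import com.aliasi.sentences.IndoEuropeanSentenceModel;
import com.aliasi.sentences.SentenceModel;
import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.tokenizer.TokenizerFactory;

public class SentenceExtractor {
    // Assumed field setup; getSentences(String) above would live in this class.
    private final TokenizerFactory tokenizerFactory = IndoEuropeanTokenizerFactory.INSTANCE;
    private final SentenceModel sentenceModel = new IndoEuropeanSentenceModel();

    // ... getSentences(String text) as shown above ...
}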
Example 2: splitSentences
import com.aliasi.tokenizer.Tokenizer; // import the package/class the method depends on
public static ArrayList<String> splitSentences(String paragraph) {
    ArrayList<String> tokenList = new ArrayList<String>();
    ArrayList<String> whiteList = new ArrayList<String>();
    paragraph = paragraph.trim();
    paragraph = paragraph.replace("\r\n", " ");
    paragraph = paragraph.replace("\n", " ");
    paragraph = paragraph.replace("\r", " ");
    Tokenizer tokenizer = TOKENIZER_FACTORY.tokenizer(paragraph.toCharArray(), 0, paragraph.length());
    tokenizer.tokenize(tokenList, whiteList);
    String[] tokens = new String[tokenList.size()];
    String[] whites = new String[whiteList.size()];
    tokenList.toArray(tokens);
    whiteList.toArray(whites);
    int[] sentenceBoundaries = SENTENCE_MODEL.boundaryIndices(tokens, whites);
    ArrayList<String> sentences = new ArrayList<String>();
    if (sentenceBoundaries.length < 1) {
        System.out.println("No sentence boundaries found.");
        sentences.add(paragraph);
    }
    int sentStartTok = 0;
    int sentEndTok = 0;
    for (int i = 0; i < sentenceBoundaries.length; ++i) {
        sentEndTok = sentenceBoundaries[i];
        String sentence = "";
        for (int j = sentStartTok; j <= sentEndTok; j++) {
            sentence += tokens[j] + whites[j + 1];
        }
        sentences.add(sentence.trim());
        sentStartTok = sentEndTok + 1;
    }
    return sentences;
}
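Note the whites[j + 1] indexing in both examples so far: tokenize(tokenList, whiteList) fills the whitespace list with one more entry than the token list, so whites[0] is the whitespace before the first token and whites[j + 1] is the whitespace following tokens[j]. Concatenating tokens[j] + whites[j + 1] therefore reproduces the original text of each sentence up to its boundary.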
Example 3: tokenize
import com.aliasi.tokenizer.Tokenizer; // import the package/class the method depends on
/**
 * Tokenizes a text.
 *
 * @param text text to tokenize
 * @return array of tokens, or <code>null</code> if the tokenizer is not
 *         initialized
 */
public static String[] tokenize(String text) {
    if (tokenizerFactory == null) return null;
    ArrayList<String> tokenList = new ArrayList<String>();
    ArrayList<String> whiteList = new ArrayList<String>();
    Tokenizer tokenizer =
        tokenizerFactory.tokenizer(text.toCharArray(), 0, text.length());
    tokenizer.tokenize(tokenList, whiteList);
    return tokenList.toArray(new String[tokenList.size()]);
}
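A hypothetical call site; the enclosing class and the initialization of its static tokenizerFactory field are not shown in the example, so this is only a sketch:

// Returns null until tokenizerFactory has been initialized.
String[] toks = tokenize("LingPipe splits this sentence into tokens.");
if (toks != null) {
    for (String tok : toks) {
        System.out.println(tok);
    }
}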
Example 4: sentDetect
import com.aliasi.tokenizer.Tokenizer; // import the package/class the method depends on
/**
 * Splits a text into sentences.
 *
 * @param text sequence of sentences
 * @return array of sentences in the text, or <code>null</code> if the
 *         sentence detector is not initialized
 */
public static String[] sentDetect(String text) {
    if (sentenceModel == null) return null;
    // tokenize text
    ArrayList<String> tokenList = new ArrayList<String>();
    ArrayList<String> whiteList = new ArrayList<String>();
    Tokenizer tokenizer =
        tokenizerFactory.tokenizer(text.toCharArray(), 0, text.length());
    tokenizer.tokenize(tokenList, whiteList);
    String[] tokens = tokenList.toArray(new String[tokenList.size()]);
    String[] whites = whiteList.toArray(new String[whiteList.size()]);
    // detect sentences
    int[] sentenceBoundaries =
        sentenceModel.boundaryIndices(tokens, whites);
    int sentStartTok = 0;
    int sentEndTok = 0;
    String[] sentences = new String[sentenceBoundaries.length];
    for (int i = 0; i < sentenceBoundaries.length; i++) {
        sentEndTok = sentenceBoundaries[i];
        StringBuilder sb = new StringBuilder();
        for (int j = sentStartTok; j <= sentEndTok; j++) {
            sb.append(tokens[j]);
            if (whites[j + 1].length() > 0 && j < sentEndTok)
                sb.append(" ");
        }
        sentences[i] = sb.toString();
        sentStartTok = sentEndTok + 1;
    }
    return sentences;
}
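Unlike Examples 1 and 2, this variant does not reinsert the original whitespace: it joins tokens with a single space wherever the source had any whitespace and omits the separator after a sentence's last token, so the returned sentences are whitespace-normalized rather than exact substrings of the input.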
Example 5: wordSpliter
import com.aliasi.tokenizer.Tokenizer; // import the package/class the method depends on
public static List<String>[] wordSpliter(String txt) {
    List<String>[] ls = new ArrayList[2];
    ls[0] = new ArrayList<String>();
    ls[1] = new ArrayList<String>();
    char[] cc = txt.toCharArray();
    Tokenizer tk = TOKENIZER.tokenizer(cc, 0, cc.length);
    tk.tokenize(ls[0], ls[1]);
    return ls;
}
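A hypothetical usage sketch (TOKENIZER is the example's static TokenizerFactory; its initialization is not shown):

List<String>[] parts = wordSpliter("Split me, please.");
List<String> tokens = parts[0];      // the tokens, in order
List<String> whitespaces = parts[1]; // surrounding whitespace, one entry longer than tokens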
Example 6: tokenize
import com.aliasi.tokenizer.Tokenizer; // import the package/class the method depends on
public static String[] tokenize(String txt) {
    char[] cc = txt.toCharArray();
    Tokenizer tk = TOKENIZER.tokenizer(cc, 0, cc.length);
    return tk.tokenize();
}
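This is the array-returning overload: calling tokenize() with no arguments yields just the tokens and discards the whitespace, which makes it the shortest path when spacing and character offsets do not matter.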
Example 7: execute
import com.aliasi.tokenizer.Tokenizer; // import the package/class the method depends on
/**
 * execute method. Makes LingPipe API calls to tokenize the document.
 * It passes the document's text to LingPipe for tokenization and
 * generates space tokens as well.
 */
public void execute() throws ExecutionException {
    if (document == null) {
        throw new ExecutionException("There is no loaded document");
    }
    super.fireProgressChanged(0);
    long startOffset = 0, endOffset = 0;
    AnnotationSet as = null;
    if (outputASName == null || outputASName.trim().length() == 0)
        as = document.getAnnotations();
    else
        as = document.getAnnotations(outputASName);
    String docContent = document.getContent().toString();
    List<String> tokenList = new ArrayList<String>();
    List<String> whiteList = new ArrayList<String>();
    Tokenizer tokenizer =
        tf.tokenizer(docContent.toCharArray(), 0, docContent.length());
    tokenizer.tokenize(tokenList, whiteList);
    for (int i = 0; i < whiteList.size(); i++) {
        try {
            startOffset = endOffset;
            endOffset = startOffset + whiteList.get(i).length();
            if ((endOffset - startOffset) != 0) {
                FeatureMap fmSpaces = Factory.newFeatureMap();
                fmSpaces.put("length", "" + (endOffset - startOffset));
                as.add(new Long(startOffset), new Long(endOffset), "SpaceToken",
                    fmSpaces);
            }
            if (i < tokenList.size()) {
                startOffset = endOffset;
                endOffset = startOffset + tokenList.get(i).length();
                FeatureMap fmTokens = Factory.newFeatureMap();
                fmTokens.put("length", "" + (endOffset - startOffset));
                as.add(new Long(startOffset), new Long(endOffset), "Token", fmTokens);
            }
        }
        catch (InvalidOffsetException e) {
            throw new ExecutionException(e);
        }
    }
}
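This last example appears to be a GATE processing resource (AnnotationSet, FeatureMap, Factory, and InvalidOffsetException are GATE classes). It exploits the same one-extra-whitespace invariant noted earlier: the loop walks whiteList, emitting a SpaceToken annotation for each non-empty whitespace span and then a Token annotation for the token that follows it, so the alternating offsets tile the document content exactly.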