This article collects typical usage examples of the Java class com.aliasi.tokenizer.Tokenizer. If you have been wondering what the Tokenizer class is for or how to use it, the curated examples below may help.
The Tokenizer class belongs to the com.aliasi.tokenizer package. Ten code examples are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Java code examples.
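Before the project-specific examples, here is a minimal, self-contained sketch of the basic pattern they all share. It assumes LingPipe 4.x and the stock IndoEuropeanTokenizerFactory; the projects below swap in their own factories and models.

import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;

public class TokenizerDemo {
    public static void main(String[] args) {
        TokenizerFactory factory = IndoEuropeanTokenizerFactory.INSTANCE;
        char[] cs = "LingPipe splits text into tokens.".toCharArray();
        // A Tokenizer is a single-pass stream over one character slice.
        Tokenizer tokenizer = factory.tokenizer(cs, 0, cs.length);
        for (String token : tokenizer) { // Tokenizer implements Iterable<String>
            System.out.println(token);
        }
    }
}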
Example 1: tokenizeSentencesOPENNLP
import com.aliasi.tokenizer.Tokenizer; // imported package/class
public static void tokenizeSentencesOPENNLP(List<String> sentences)
{
    ArrayList<String> tokenizedSentences = new ArrayList<String>();
    int size = sentences.size();
    try {
        opennlp.tools.lang.english.Tokenizer tokenizer =
                new opennlp.tools.lang.english.Tokenizer("data/EnglishTok.bin.gz");
        for (int i = 0; i < size; i++) {
            String[] tokens = tokenizer.tokenize(sentences.get(i).trim());
            // Rejoin the tokens into a single space-separated sentence.
            StringBuilder sb = new StringBuilder();
            for (int j = 0; j < tokens.length; j++) {
                sb.append(tokens[j]).append(' ');
            }
            String tokenized = sb.toString().trim();
            tokenizedSentences.add(tokenized);
            System.out.println(tokenized);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    writeSentencesToTempFile("data/allSentencesTokenizedOPENNLP.txt", tokenizedSentences);
}
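A note on this example: opennlp.tools.lang.english.Tokenizer comes from the pre-1.5 OpenNLP API, and both "data/EnglishTok.bin.gz" and writeSentencesToTempFile are project-specific. On current OpenNLP releases the equivalent is roughly the following sketch, where "en-token.bin" is an assumed path to a stock OpenNLP tokenizer model:

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

public class ModernOpenNlpTokenize {
    public static String[] tokenize(String sentence) throws IOException {
        // "en-token.bin" is an assumed model path, not from the original project.
        try (InputStream in = new FileInputStream("en-token.bin")) {
            return new TokenizerME(new TokenizerModel(in)).tokenize(sentence);
        }
    }
}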
Example 2: tokenize
import com.aliasi.tokenizer.Tokenizer; // imported package/class
@Override
public List<Token> tokenize(JCas jcas) {
    char[] cs = jcas.getDocumentText().toCharArray();
    Tokenizer tokenizer = tokenizerFactory.tokenizer(cs, 0, cs.length);
    return StreamSupport
            .stream(tokenizer.spliterator(), false)
            .map(token -> TypeFactory.createToken(jcas,
                    tokenizer.lastTokenStartPosition(), tokenizer.lastTokenEndPosition()))
            .collect(toList());
}
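This snippet leans on two details of LingPipe's Tokenizer: it implements Iterable<String>, and after each token is returned, lastTokenStartPosition() and lastTokenEndPosition() report that token's offsets in the original character array. Here is a standalone sketch of the same idiom that prints offsets instead of building UIMA annotations (TypeFactory.createToken above is the project's own helper; the stock factory is assumed):

import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.tokenizer.Tokenizer;
import java.util.stream.StreamSupport;

public class TokenOffsetsDemo {
    public static void main(String[] args) {
        char[] cs = "Offsets index into the original text.".toCharArray();
        Tokenizer tokenizer = IndoEuropeanTokenizerFactory.INSTANCE.tokenizer(cs, 0, cs.length);
        // Sequential stream: each map call runs right after its token is
        // produced, so the last*Position() calls refer to that token.
        StreamSupport.stream(tokenizer.spliterator(), false)
                .map(token -> token + " [" + tokenizer.lastTokenStartPosition()
                        + ", " + tokenizer.lastTokenEndPosition() + ")")
                .forEach(System.out::println);
    }
}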
Example 3: getSentences
import com.aliasi.tokenizer.Tokenizer; // imported package/class
public String[] getSentences(String text) {
    ArrayList<String> tokenList = new ArrayList<>();
    ArrayList<String> whiteList = new ArrayList<>();
    Tokenizer tokenizer = tokenizerFactory.tokenizer(text.toCharArray(), 0, text.length());
    tokenizer.tokenize(tokenList, whiteList);
    String[] tokens = tokenList.toArray(new String[tokenList.size()]);
    String[] whites = whiteList.toArray(new String[whiteList.size()]);
    int[] sentenceBoundaries = sentenceModel.boundaryIndices(tokens, whites);
    if (sentenceBoundaries.length < 1) {
        return new String[0];
    }
    String[] result = new String[sentenceBoundaries.length];
    int sentStartTok = 0;
    int sentEndTok;
    for (int i = 0; i < sentenceBoundaries.length; ++i) {
        sentEndTok = sentenceBoundaries[i];
        // Each sentence is its tokens plus the whitespace following each token.
        StringBuilder sb = new StringBuilder();
        for (int j = sentStartTok; j <= sentEndTok; j++) {
            sb.append(tokens[j]).append(whites[j + 1]);
        }
        result[i] = sb.toString();
        sentStartTok = sentEndTok + 1;
    }
    return result;
}
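The whites[j + 1] indexing works because Tokenizer.tokenize(tokenList, whiteList) always emits one more whitespace entry than tokens: whites[0] precedes the first token and whites[j + 1] follows tokens[j], so whites[0] + tokens[0] + whites[1] + ... reproduces the input slice. A small demonstration, stock factory assumed:

import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import java.util.ArrayList;
import java.util.List;

public class InterleavingDemo {
    public static void main(String[] args) {
        List<String> tokens = new ArrayList<>();
        List<String> whites = new ArrayList<>();
        char[] cs = "a b".toCharArray();
        IndoEuropeanTokenizerFactory.INSTANCE.tokenizer(cs, 0, cs.length)
                .tokenize(tokens, whites);
        System.out.println(tokens); // [a, b]
        System.out.println(whites); // three entries: "", " ", "" -- one more than tokens
    }
}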
Example 4: splitSentences
import com.aliasi.tokenizer.Tokenizer; // imported package/class
public static ArrayList<String> splitSentences(String paragraph)
{
    ArrayList<String> tokenList = new ArrayList<String>();
    ArrayList<String> whiteList = new ArrayList<String>();
    // Normalize line breaks so the sentence model sees one flat paragraph.
    paragraph = paragraph.trim();
    paragraph = paragraph.replace("\r\n", " ");
    paragraph = paragraph.replace("\n", " ");
    paragraph = paragraph.replace("\r", " ");
    Tokenizer tokenizer = TOKENIZER_FACTORY.tokenizer(paragraph.toCharArray(), 0, paragraph.length());
    tokenizer.tokenize(tokenList, whiteList);
    String[] tokens = tokenList.toArray(new String[tokenList.size()]);
    String[] whites = whiteList.toArray(new String[whiteList.size()]);
    int[] sentenceBoundaries = SENTENCE_MODEL.boundaryIndices(tokens, whites);
    ArrayList<String> sentences = new ArrayList<String>();
    if (sentenceBoundaries.length < 1) {
        // No boundaries found: treat the whole paragraph as one sentence.
        System.out.println("No sentence boundaries found.");
        sentences.add(paragraph);
        return sentences;
    }
    int sentStartTok = 0;
    int sentEndTok;
    for (int i = 0; i < sentenceBoundaries.length; ++i)
    {
        sentEndTok = sentenceBoundaries[i];
        StringBuilder sentence = new StringBuilder();
        for (int j = sentStartTok; j <= sentEndTok; j++)
        {
            sentence.append(tokens[j]).append(whites[j + 1]);
        }
        sentences.add(sentence.toString().trim());
        sentStartTok = sentEndTok + 1;
    }
    return sentences;
}
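TOKENIZER_FACTORY and SENTENCE_MODEL are static fields of the surrounding class, which the example does not show. A plausible wiring, assuming the stock LingPipe implementations:

import com.aliasi.sentences.IndoEuropeanSentenceModel;
import com.aliasi.sentences.SentenceModel;
import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.tokenizer.TokenizerFactory;

// Hypothetical field declarations; the original project wires these up elsewhere.
static final TokenizerFactory TOKENIZER_FACTORY = IndoEuropeanTokenizerFactory.INSTANCE;
static final SentenceModel SENTENCE_MODEL = new IndoEuropeanSentenceModel();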
Example 5: tokenize
import com.aliasi.tokenizer.Tokenizer; // imported package/class
/**
 * Tokenizes a text.
 *
 * @param text text to tokenize
 * @return array of tokens, or <code>null</code> if the tokenizer is not
 *         initialized
 */
public static String[] tokenize(String text) {
    if (tokenizerFactory == null) return null;
    ArrayList<String> tokenList = new ArrayList<String>();
    ArrayList<String> whiteList = new ArrayList<String>();
    Tokenizer tokenizer =
            tokenizerFactory.tokenizer(text.toCharArray(), 0, text.length());
    tokenizer.tokenize(tokenList, whiteList);
    return tokenList.toArray(new String[tokenList.size()]);
}
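A usage note: the factory wired into tokenizerFactory determines how aggressively punctuation is split off. With the stock IndoEuropeanTokenizerFactory, for instance, one would expect roughly:

String[] toks = tokenize("Hello, world!");
// -> {"Hello", ",", "world", "!"} with IndoEuropeanTokenizerFactory (assumed here);
// returns null if tokenizerFactory has not been initialized.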
Example 6: sentDetect
import com.aliasi.tokenizer.Tokenizer; // imported package/class
/**
 * Splits a text into sentences.
 *
 * @param text sequence of sentences
 * @return array of sentences in the text, or <code>null</code> if the
 *         sentence detector is not initialized
 */
public static String[] sentDetect(String text) {
    if (sentenceModel == null) return null;
    // tokenize text
    ArrayList<String> tokenList = new ArrayList<String>();
    ArrayList<String> whiteList = new ArrayList<String>();
    Tokenizer tokenizer =
            tokenizerFactory.tokenizer(text.toCharArray(), 0, text.length());
    tokenizer.tokenize(tokenList, whiteList);
    String[] tokens = tokenList.toArray(new String[tokenList.size()]);
    String[] whites = whiteList.toArray(new String[whiteList.size()]);
    // detect sentences
    int[] sentenceBoundaries = sentenceModel.boundaryIndices(tokens, whites);
    int sentStartTok = 0;
    int sentEndTok;
    String[] sentences = new String[sentenceBoundaries.length];
    for (int i = 0; i < sentenceBoundaries.length; i++) {
        sentEndTok = sentenceBoundaries[i];
        StringBuilder sb = new StringBuilder();
        for (int j = sentStartTok; j <= sentEndTok; j++) {
            sb.append(tokens[j]);
            // Collapse any inter-token whitespace to a single space.
            if (whites[j + 1].length() > 0 && j < sentEndTok)
                sb.append(" ");
        }
        sentences[i] = sb.toString();
        sentStartTok = sentEndTok + 1;
    }
    return sentences;
}
Example 7: wordSpliter
import com.aliasi.tokenizer.Tokenizer; // imported package/class
@SuppressWarnings("unchecked")
public static List<String>[] wordSpliter(String txt) {
    // ls[0] receives the tokens, ls[1] the interleaved whitespace.
    List<String>[] ls = new ArrayList[2];
    ls[0] = new ArrayList<String>();
    ls[1] = new ArrayList<String>();
    char[] cc = txt.toCharArray();
    Tokenizer tk = TOKENIZER.tokenizer(cc, 0, cc.length);
    tk.tokenize(ls[0], ls[1]);
    return ls;
}
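Usage sketch, assuming TOKENIZER is a TokenizerFactory field such as IndoEuropeanTokenizerFactory.INSTANCE (not shown in the example): index 0 carries the tokens and index 1 the interleaved whitespace, with one more whitespace entry than tokens.

List<String>[] parts = wordSpliter("LingPipe rocks");
List<String> tokens = parts[0]; // [LingPipe, rocks]
List<String> whites = parts[1]; // "", " ", ""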
Example 8: tokenize
import com.aliasi.tokenizer.Tokenizer; // imported package/class
public static String[] tokenize(String txt) {
    char[] cc = txt.toCharArray();
    Tokenizer tk = TOKENIZER.tokenizer(cc, 0, cc.length);
    return tk.tokenize();
}
Example 9: execute
import com.aliasi.tokenizer.Tokenizer; // imported package/class
/**
 * execute method. Makes LingPipe API calls to tokenize the document.
 * It passes the document's string to LingPipe for tokenization and
 * generates space tokens as well.
 */
public void execute() throws ExecutionException {
    if (document == null) {
        throw new ExecutionException("There is no loaded document");
    }
    super.fireProgressChanged(0);
    long startOffset = 0, endOffset = 0;
    AnnotationSet as = null;
    if (outputASName == null || outputASName.trim().length() == 0)
        as = document.getAnnotations();
    else as = document.getAnnotations(outputASName);
    String docContent = document.getContent().toString();
    List<String> tokenList = new ArrayList<String>();
    List<String> whiteList = new ArrayList<String>();
    Tokenizer tokenizer = tf.tokenizer(docContent.toCharArray(), 0, docContent.length());
    tokenizer.tokenize(tokenList, whiteList);
    // whiteList has one more entry than tokenList, so each iteration emits
    // the whitespace run preceding token i, then token i itself.
    for (int i = 0; i < whiteList.size(); i++) {
        try {
            startOffset = endOffset;
            endOffset = startOffset + whiteList.get(i).length();
            if ((endOffset - startOffset) != 0) {
                FeatureMap fmSpaces = Factory.newFeatureMap();
                fmSpaces.put("length", "" + (endOffset - startOffset));
                as.add(Long.valueOf(startOffset), Long.valueOf(endOffset), "SpaceToken",
                        fmSpaces);
            }
            if (i < tokenList.size()) {
                startOffset = endOffset;
                endOffset = startOffset + tokenList.get(i).length();
                FeatureMap fmTokens = Factory.newFeatureMap();
                fmTokens.put("length", "" + (endOffset - startOffset));
                as.add(Long.valueOf(startOffset), Long.valueOf(endOffset), "Token", fmTokens);
            }
        }
        catch (InvalidOffsetException e) {
            throw new ExecutionException(e);
        }
    }
}
Example 10: tokenizer
import com.aliasi.tokenizer.Tokenizer; // imported package/class
public Tokenizer tokenizer(char[] content, int start, int length) {
    String str = new String(content, start, length);
    return new TCCLingPipeTokenizer(str);
}
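This method belongs to a custom TokenizerFactory implementation; TCCLingPipeTokenizer is the project's own Tokenizer subclass and is not shown. For comparison, here is a complete toy factory with the same shape that delegates to LingPipe's RegExTokenizerFactory instead of a hand-written Tokenizer (the regex choice is an assumption, not from the original project):

import com.aliasi.tokenizer.RegExTokenizerFactory;
import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;

public class WhitespaceDelimitedTokenizerFactory implements TokenizerFactory {
    // Runs of non-whitespace characters become tokens.
    private final TokenizerFactory delegate = new RegExTokenizerFactory("\\S+");

    public Tokenizer tokenizer(char[] content, int start, int length) {
        return delegate.tokenizer(content, start, length);
    }
}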