This article collects typical usage examples of the Java method opennlp.tools.tokenize.Tokenizer.tokenize. If you are struggling with questions such as: What exactly does Tokenizer.tokenize do? How do I use it? Where can I find examples? Then the hand-picked code samples below may help. You can also explore further usage examples of the containing class, opennlp.tools.tokenize.Tokenizer.
The following presents 9 code examples of Tokenizer.tokenize, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Java code samples.
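Before the model-based examples below, note that Tokenizer is an interface: the quickest way to call tokenize is through one of OpenNLP's rule-based singletons, which need no model file. A minimal sketch (the sample sentence is made up):

import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.WhitespaceTokenizer;

public class TokenizeQuickstart {
    public static void main(String[] args) {
        // Rule-based tokenizers ship with OpenNLP and need no model file.
        Tokenizer simple = SimpleTokenizer.INSTANCE;          // splits at character-class boundaries
        Tokenizer whitespace = WhitespaceTokenizer.INSTANCE;  // splits on whitespace only
        String sentence = "Mr. Smith hasn't arrived, yet.";
        // SimpleTokenizer separates punctuation, e.g. [Mr, ., Smith, hasn, ', t, arrived, ,, yet, .]
        System.out.println(String.join("|", simple.tokenize(sentence)));
        // WhitespaceTokenizer keeps punctuation attached, e.g. [Mr., Smith, hasn't, arrived,, yet.]
        System.out.println(String.join("|", whitespace.tokenize(sentence)));
    }
}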
Example 1: doRun
import opennlp.tools.tokenize.Tokenizer; // import the class the method depends on
@Override
public List<Word> doRun(Language language, String sentence) {
    Tokenizer tokenizer = new TokenizerME(getTokenizerModel(language));
    POSTaggerME tagger = new POSTaggerME(getPOSModel(language));
    String[] tokens = tokenizer.tokenize(sentence);
    String[] tags = tagger.tag(tokens);
    PartOfSpeechSet posSet = PartOfSpeechSet.getPOSSet(language);
    List<Word> words = new ArrayList<>();
    for (int i = 0; i < tokens.length; i++) {
        words.add(new Word(posSet.valueOf(tags[i]), tokens[i]));
    }
    return words;
}
Example 2: getConceptVector
import opennlp.tools.tokenize.Tokenizer; // import the class the method depends on
public INDArray getConceptVector(Concept c) {
    Tokenizer tok = SimpleTokenizer.INSTANCE;
    List<INDArray> vectors = new ArrayList<INDArray>();
    int countUnk = 0;
    for (String word : tok.tokenize(c.name.toLowerCase().trim())) {
        if (wordVectors.hasWord(word)) {
            vectors.add(wordVectors.getWordVectorMatrix(word));
        } else {
            vectors.add(unkVector);
            countUnk++;
        }
    }
    if (vectors.size() == countUnk) {
        return null; // all tokens unknown
    }
    INDArray allVectors = Nd4j.vstack(vectors);
    // sum or mean is irrelevant for cosine similarity
    INDArray conceptVector = allVectors.mean(0);
    return conceptVector;
}
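When character offsets back into the original string matter (for instance, to align vectors or annotations with spans of the source text), the same Tokenizer interface also offers tokenizePos, which returns Span objects instead of strings. A minimal sketch, assuming only the standard OpenNLP API:

import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.util.Span;

public class SpanExample {
    public static void main(String[] args) {
        String text = "machine learning";
        // tokenizePos returns character offsets instead of token strings
        Span[] spans = SimpleTokenizer.INSTANCE.tokenizePos(text);
        for (Span span : spans) {
            // getCoveredText recovers the token from the original string
            System.out.println(span.getStart() + "-" + span.getEnd()
                    + ": " + span.getCoveredText(text));
        }
    }
}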
Example 3: tokenDetect
import opennlp.tools.tokenize.Tokenizer; // import the class the method depends on
public String[] tokenDetect(String sentence) {
    File modelIn = null;
    String[] tokens = null;
    try {
        File userDir = new File(System.getProperty("user.dir"));
        if (this.turNLPInstance.getLanguage().equals("en_US")) {
            modelIn = new File(userDir.getAbsolutePath().concat("/models/opennlp/en/en-token.bin"));
        } else if (this.turNLPInstance.getLanguage().equals("pt_BR")) {
            modelIn = new File(userDir.getAbsolutePath().concat("/models/opennlp/pt/pt-token.bin"));
        }
        if (modelIn == null) {
            // unsupported language: avoid an NPE from new TokenizerModel(null)
            return null;
        }
        TokenizerModel model = new TokenizerModel(modelIn);
        Tokenizer tokenizer = new TokenizerME(model);
        tokens = tokenizer.tokenize(sentence);
    } catch (IOException e) {
        e.printStackTrace();
    }
    return tokens;
}
Example 4: stem
import opennlp.tools.tokenize.Tokenizer; // import the class the method depends on
protected Set<String> stem(Concept c) {
    Set<String> stems = new HashSet<String>();
    Tokenizer tok = SimpleTokenizer.INSTANCE;
    for (String word : tok.tokenize(c.name.toLowerCase().trim())) {
        if (!this.functionWords.contains(word)) {
            stems.add((String) this.stemmer.stem(word));
        }
    }
    return stems;
}
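The stemmer and functionWords fields above come from the surrounding class, which the snippet does not show. To reproduce the idea with OpenNLP alone, opennlp.tools.stemmer.PorterStemmer can stand in for the stemmer; the stop-word list here is an illustrative stub:

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import opennlp.tools.stemmer.PorterStemmer;
import opennlp.tools.tokenize.SimpleTokenizer;

public class StemExample {
    // illustrative stub; a real function-word list would be far longer
    private static final Set<String> FUNCTION_WORDS =
            new HashSet<>(Arrays.asList("the", "of", "a", "an"));

    public static Set<String> stem(String name) {
        Set<String> stems = new HashSet<>();
        PorterStemmer stemmer = new PorterStemmer();
        for (String word : SimpleTokenizer.INSTANCE.tokenize(name.toLowerCase().trim())) {
            if (!FUNCTION_WORDS.contains(word)) {
                stems.add(stemmer.stem(word)); // e.g. "running" -> "run"
            }
        }
        return stems;
    }
}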
Example 5: tokenizeText
import opennlp.tools.tokenize.Tokenizer; // import the class the method depends on
/**
 * Utility offered to other elements of the pipeline for text tokenizing.
 *
 * @param text     the text to tokenize
 * @param language the language of the input text
 * @return an array containing the tokenized text.
 */
public static String[] tokenizeText(String text, String language) {
    setup();
    // Split the text into sentences
    SentenceModel sentModel = getSentenceModel(language + "-sent");
    SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentModel);
    String[] sentences = sentenceDetector.sentDetect(text);
    // Get the right tokenizer model
    TokenizerModel tokenModel = getTokenizerModel(language + "-token");
    // Iterate through the sentences and collect their tokens
    List<String> tokenizedText = new ArrayList<>();
    for (String sentenceString : sentences) {
        // Tokenize the sentence
        Tokenizer tokenizer = new TokenizerME(tokenModel);
        String[] tokens = tokenizer.tokenize(sentenceString);
        for (String token : tokens) {
            tokenizedText.add(token);
        }
    }
    return tokenizedText.toArray(new String[tokenizedText.size()]);
}
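For illustration, a hypothetical call site (the host class is not shown in the snippet, so OpenNlpUtils here is an assumption):

// hypothetical usage; OpenNlpUtils stands in for whatever class hosts tokenizeText
String[] tokens = OpenNlpUtils.tokenizeText("OpenNLP is a toolkit. It tokenizes text.", "en");
// tokens holds the tokens of both sentences, flattened into a single array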
Example 6: testTokenizer
import opennlp.tools.tokenize.Tokenizer; // import the class the method depends on
public String[] testTokenizer() {
    String[] tokens = {};
    try (InputStream modelIn = BasicActions.class.getClassLoader()
            .getResourceAsStream(Consts.EN_TOKEN_MODEL)) {
        TokenizerModel tokenModel = new TokenizerModel(modelIn);
        Tokenizer tokenizer = new TokenizerME(tokenModel);
        tokens = tokenizer.tokenize(TEST_PHRASE);
        System.out.println(Arrays.toString(tokens));
    } catch (IOException e) {
        e.printStackTrace();
    }
    return tokens;
}
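Because the tokenizer here is a TokenizerME, one more feature is reachable beyond the Tokenizer interface: per-token confidence scores from the most recent tokenize call. A brief sketch of what could follow the tokenize line above:

// TokenizerME (not the Tokenizer interface) exposes the probabilities
// of the tokenization decisions made by the last call to tokenize()
double[] probs = ((TokenizerME) tokenizer).getTokenProbabilities();
for (int i = 0; i < tokens.length; i++) {
    System.out.printf("%s\t%.3f%n", tokens[i], probs[i]);
}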
Example 7: annotate
import opennlp.tools.tokenize.Tokenizer; // import the class the method depends on
/**
 * Annotates the document using the Apache OpenNLP tools.
 *
 * @param blackboard the blackboard that holds the shared annotations.
 * @param component  the component to annotate.
 */
@Override
public void annotate(Blackboard blackboard, DocumentComponent component) {
    // set up the annotator
    setup();
    // Language tag used to retrieve the datasets
    String langTag = component.getLanguage().getLanguage();
    // Split the text into sentences
    SentenceModel sentModel = getSentenceModel(langTag + "-sent");
    SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentModel);
    String[] sentences = sentenceDetector.sentDetect(component.getText());
    // Get the right models
    TokenizerModel tokenModel = getTokenizerModel(langTag + "-token");
    POSModel posModel = getPOSTaggerModel(langTag + "-pos-maxent");
    // Iterate through sentences and produce the distilled objects,
    // i.e. a sentence object with pos-tagged tokens.
    for (String sentenceString : sentences) {
        // the distilled sentence object
        Sentence sentence = new Sentence(sentenceString,
                "" + sentenceCounter++);
        sentence.setLanguage(component.getLanguage());
        // Tokenize the sentence
        Tokenizer tokenizer = new TokenizerME(tokenModel);
        String[] tokens = tokenizer.tokenize(sentenceString);
        // POS tag the tokens
        POSTaggerME tagger = new POSTaggerME(posModel);
        String[] tags = tagger.tag(tokens);
        // put the features detected by OpenNLP in the distiller's sentence
        for (int i = 0; i < tokens.length; i++) {
            Token t = new Token(tokens[i]);
            t.setPoS(tags[i]);
            sentence.addToken(t);
        }
        ((DocumentComposite) component).addComponent(sentence);
    } // for (String sentenceString : sentences)
}
Example 8: convertToTokens
import opennlp.tools.tokenize.Tokenizer; // import the class the method depends on
private String[] convertToTokens(String sentence) {
    Tokenizer tokenizer = new TokenizerME(tokenModel);
    return tokenizer.tokenize(sentence);
}
Example 9: tokensFromSentence
import opennlp.tools.tokenize.Tokenizer; // import the class the method depends on
public static String[] tokensFromSentence(String sentence) {
    Tokenizer tokenizer = new TokenizerME(tokenModel);
    return tokenizer.tokenize(sentence);
}