This article collects typical usage examples of the Java class opennlp.tools.tokenize.Tokenizer. If you have been wondering what exactly the Tokenizer class does, how to use it, or where to find examples of it in use, the curated code samples below should help.
The Tokenizer class belongs to the opennlp.tools.tokenize package. Fifteen code examples of the class are presented below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java samples.
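Before diving in, it helps to know the shape of the interface the examples share. Tokenizer declares two methods: tokenize(String), which returns token strings, and tokenizePos(String), which returns character-offset Spans. The rule-based WhitespaceTokenizer and SimpleTokenizer singletons need no model file, while TokenizerME (used in most examples below) is the statistical, model-backed implementation. A minimal sketch:
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.Span;

public class TokenizerQuickStart {
	public static void main(String[] args) {
		// Rule-based tokenizers ship with OpenNLP and need no model file.
		Tokenizer simple = SimpleTokenizer.INSTANCE;
		Tokenizer whitespace = WhitespaceTokenizer.INSTANCE;

		String text = "Hello, world! OpenNLP tokenizers are easy.";
		// tokenize() returns the token strings themselves...
		String[] tokens = simple.tokenize(text);
		// ...while tokenizePos() returns character offsets into the original text.
		Span[] spans = whitespace.tokenizePos(text);

		System.out.println(tokens.length + " tokens, " + spans.length + " spans");
	}
}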
Example 1: doRun
import opennlp.tools.tokenize.Tokenizer; // import the required package/class
@Override
public List<Word> doRun(Language language, String sentence) {
Tokenizer tokenizer = new TokenizerME(getTokenizerModel(language));
POSTaggerME tagger = new POSTaggerME(getPOSModel(language));
String[] tokens = tokenizer.tokenize(sentence);
String[] tags = tagger.tag(tokens);
PartOfSpeechSet posSet = PartOfSpeechSet.getPOSSet(language);
List<Word> words = new ArrayList<>();
for (int i = 0; i < tokens.length; i++) {
words.add(new Word(posSet.valueOf(tags[i]), tokens[i]));
}
return words;
}
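For context, here is a self-contained version of the same tokenize-then-tag pipeline built only from stock OpenNLP classes; Language, Word, and PartOfSpeechSet above are project-specific types, and the model file names below (en-token.bin, en-pos-maxent.bin) are assumptions about models present on disk:
import java.io.FileInputStream;
import java.io.InputStream;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

public class TokenizeAndTag {
	public static void main(String[] args) throws Exception {
		try (InputStream tokenIn = new FileInputStream("en-token.bin");
		     InputStream posIn = new FileInputStream("en-pos-maxent.bin")) {
			Tokenizer tokenizer = new TokenizerME(new TokenizerModel(tokenIn));
			POSTaggerME tagger = new POSTaggerME(new POSModel(posIn));
			String[] tokens = tokenizer.tokenize("The quick brown fox jumps over the lazy dog.");
			String[] tags = tagger.tag(tokens);
			for (int i = 0; i < tokens.length; i++) {
				System.out.println(tokens[i] + "/" + tags[i]);
			}
		}
	}
}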
Example 2: getConceptVector
import opennlp.tools.tokenize.Tokenizer; // import the required package/class
public INDArray getConceptVector(Concept c) {
Tokenizer tok = SimpleTokenizer.INSTANCE;
List<INDArray> vectors = new ArrayList<INDArray>();
int countUnk = 0;
for (String word : tok.tokenize(c.name.toLowerCase().trim())) {
if (wordVectors.hasWord(word)) {
vectors.add(wordVectors.getWordVectorMatrix(word));
} else {
vectors.add(unkVector);
countUnk++;
}
}
if (vectors.size() == countUnk)
return null; // all tokens unknown
INDArray allVectors = Nd4j.vstack(vectors);
// sum or mean is irrelevant for cosine similarity
INDArray conceptVector = allVectors.mean(0);
return conceptVector;
}
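A hypothetical follow-up shows how two such concept vectors might be compared; Transforms.cosineSim is ND4J's built-in cosine similarity, and conceptSimilarity is a name introduced here purely for illustration:
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.ops.transforms.Transforms;

// Sketch: compare two concepts by the cosine of their mean word vectors.
// getConceptVector is the method above; the null check mirrors its contract.
public double conceptSimilarity(Concept a, Concept b) {
	INDArray v1 = getConceptVector(a);
	INDArray v2 = getConceptVector(b);
	if (v1 == null || v2 == null) {
		return 0.0; // at least one concept consisted only of unknown tokens
	}
	return Transforms.cosineSim(v1, v2);
}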
Example 3: tokenDetect
import opennlp.tools.tokenize.Tokenizer; // import the required package/class
public String[] tokenDetect(String sentence) {
String[] tokens = null;
try {
File userDir = new File(System.getProperty("user.dir"));
File modelIn = null;
if (this.turNLPInstance.getLanguage().equals("en_US")) {
modelIn = new File(userDir.getAbsolutePath().concat("/models/opennlp/en/en-token.bin"));
} else if (this.turNLPInstance.getLanguage().equals("pt_BR")) {
modelIn = new File(userDir.getAbsolutePath().concat("/models/opennlp/pt/pt-token.bin"));
}
if (modelIn == null) {
return null; // unsupported language; avoids a NullPointerException below
}
TokenizerModel model = new TokenizerModel(modelIn);
Tokenizer tokenizer = new TokenizerME(model);
tokens = tokenizer.tokenize(sentence);
} catch (IOException e) {
e.printStackTrace();
}
return tokens;
}
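A reusable variant is sketched below (the method name is introduced here for illustration): streaming the model with try-with-resources guarantees the file handle is released even if loading fails:
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

static String[] tokenizeWithModel(File modelFile, String sentence) throws IOException {
	// try-with-resources closes the model stream even if loading fails
	try (InputStream in = new FileInputStream(modelFile)) {
		TokenizerModel model = new TokenizerModel(in);
		return new TokenizerME(model).tokenize(sentence);
	}
}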
Example 4: create
import opennlp.tools.tokenize.Tokenizer; // import the required package/class
public static AgeClassifyFactory create(String subclassName, Tokenizer tokenizer,
FeatureGenerator[] featureGenerators) throws InvalidFormatException {
if (subclassName == null) {
// will create the default factory
return new AgeClassifyFactory(tokenizer, featureGenerators);
}
try {
AgeClassifyFactory factory = AgeClassifyFactory.INSTANCE;
factory.init(tokenizer, featureGenerators);
return factory;
} catch (Exception e) {
String msg = "Could not instantiate the " + subclassName
+ ". The initialization threw an exception.";
System.err.println(msg);
e.printStackTrace();
throw new InvalidFormatException(msg, e);
}
}
Example 5: scoreStructure
import opennlp.tools.tokenize.Tokenizer; // import the required package/class
public double scoreStructure(String ca, String q, String passage, boolean verbose) throws InvalidFormatException, IOException{
POSTaggerME parserModel = new POSTaggerME(new POSModel(new FileInputStream(new File("en-pos-model.bin"))));
Tokenizer tokenizer = new TokenizerME(new TokenizerModel(new FileInputStream(new File("en-token.bin"))));
Parser parser = ParserFactory.create(new ParserModel(new FileInputStream(new File("en-parser.bin"))));
double score = 0;
Parse[] questionParse = ParserTool.parseLine(q, parser, 1);
Parse[] passageParse = ParserTool.parseLine(passage, parser, 1); // parse the passage, not the question again
if (passage.contains(ca)) {
for (int i = 0; i < questionParse.length; i++) {
score += matchChildren(questionParse[i], passageParse[i]);
}
}
return score;
}
Example 6: stem
import opennlp.tools.tokenize.Tokenizer; // import the required package/class
protected Set<String> stem(Concept c) {
Set<String> stems = new HashSet<String>();
Tokenizer tok = SimpleTokenizer.INSTANCE;
for (String word : tok.tokenize(c.name.toLowerCase().trim())) {
if (!this.functionWords.contains(word))
stems.add((String) this.stemmer.stem(word));
}
return stems;
}
Example 7: getTokenizer
import opennlp.tools.tokenize.Tokenizer; // import the required package/class
public Tokenizer getTokenizer() {
if (this.tokenizer == null) {
if (artifactProvider != null) {
String className = artifactProvider.getManifestProperty(TOKENIZER_NAME);
if (className != null) {
this.tokenizer = ExtensionLoader.instantiateExtension(
Tokenizer.class, className);
}
}
if (this.tokenizer == null) { // could not load using artifact provider
this.tokenizer = WhitespaceTokenizer.INSTANCE;
}
}
return tokenizer;
}
Example 8: tokenizeText
import opennlp.tools.tokenize.Tokenizer; // import the required package/class
/**
* Utility offered to other elements of the pipeline for text tokenizing.
*
* @param text the text to tokenize
* @param language the language of the input text
* @return an array containing the tokenized text.
*/
public static String[] tokenizeText(String text, String language) {
setup();
// Split the text into sentences
SentenceModel sentModel = getSentenceModel(language + "-sent");
SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentModel);
String[] sentences = sentenceDetector.sentDetect(text);
// Get the right tokenizer model and build the tokenizer once;
// it can be reused across all sentences.
TokenizerModel tokenModel = getTokenizerModel(language + "-token");
Tokenizer tokenizer = new TokenizerME(tokenModel);
// Iterate through the sentences and collect the tokens of each one.
List<String> tokenizedText = new ArrayList<>();
for (String sentenceString : sentences) {
// Tokenize the sentence
String[] tokens = tokenizer.tokenize(sentenceString);
for (String token : tokens) {
tokenizedText.add(token);
}
}
return tokenizedText.toArray(new String[tokenizedText.size()]);
}
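A hypothetical call site (the language argument must match the "-sent"/"-token" model lookup keys used above; "en" is an assumed key):
String[] tokens = tokenizeText("OpenNLP detects sentences. Then it tokenizes them.", "en");
System.out.println(String.join(" | ", tokens)); // tokens of both sentences, in order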
Example 9: testTokenizer
import opennlp.tools.tokenize.Tokenizer; // import the required package/class
public String[] testTokenizer(){
String[] tokens = {};
try (InputStream modelIn = BasicActions.class.getClassLoader()
.getResourceAsStream(Consts.EN_TOKEN_MODEL)) {
TokenizerModel tokenModel = new TokenizerModel(modelIn);
Tokenizer tokenizer = new TokenizerME(tokenModel);
tokens = tokenizer.tokenize(TEST_PHRASE);
System.out.println(Arrays.toString(tokens));
} catch (IOException e) {
e.printStackTrace();
}
return tokens;
}
Example 10: parsePassageText
import opennlp.tools.tokenize.Tokenizer; // import the required package/class
public Parse[] parsePassageText(String p) throws InvalidFormatException {
if (!modelsAreInitialized) init();
//initialize
SentenceDetectorME sentenceDetector = new SentenceDetectorME(this.sentenceModel);
NameFinderME nameFinder = new NameFinderME(this.nerModel);
Parser parser = ParserFactory.create(
this.parserModel,
20, // beam size
0.95); // advance percentage
//find sentences, tokenize each, parse each, return top parse for each
String[] sentences = sentenceDetector.sentDetect(p);
Parse[] results = new Parse[sentences.length];
//There are several tokenizers available. SimpleTokenizer works best
Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
for (int i = 0; i < sentences.length; i++) {
//tokenize with character offsets so names can be mapped back to the raw text
Span[] tokenSpans = tokenizer.tokenizePos(sentences[i]);
String[] tokens = Span.spansToStrings(tokenSpans, sentences[i]);
//report any named entities found in the sentence
Span[] names = nameFinder.find(tokens);
for (int ni = 0; ni < names.length; ni++) {
int nameStart = tokenSpans[names[ni].getStart()].getStart();
int nameEnd = tokenSpans[names[ni].getEnd() - 1].getEnd();
System.out.println(sentences[i].substring(nameStart, nameEnd));
}
//join the tokens (not the tokenizer object) back into a sentence string
String sent = StringUtils.join(tokens, " ");
System.out.println("Found sentence " + sent);
Parse[] sentResults = ParserTool.parseLine(sent, parser, 1);
results[i] = sentResults[0];
}
return results;
}
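The tokenizePos/spansToStrings pairing used above is worth isolating: the Span objects keep character offsets into the original sentence, which is what allows the name finder's token-level spans to be mapped back to the raw text. A minimal sketch:
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.Span;

public class SpanDemo {
	public static void main(String[] args) {
		String sentence = "John lives in New York.";
		Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
		Span[] spans = tokenizer.tokenizePos(sentence);
		String[] tokens = Span.spansToStrings(spans, sentence);
		for (int i = 0; i < spans.length; i++) {
			// each token knows exactly where it came from in the original string
			System.out.println(tokens[i] + " [" + spans[i].getStart() + ", " + spans[i].getEnd() + ")");
		}
	}
}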
Example 11: getTokenizer
import opennlp.tools.tokenize.Tokenizer; // import the required package/class
@CheckedProvides(TokenizerProvider.class)
Tokenizer getTokenizer() throws IOException {
try (InputStream is = getClass().getResourceAsStream("/opennlp/en-token.bin")) {
TokenizerModel model = new TokenizerModel(is);
return new TokenizerME(model);
}
}
Example 12: getTokenizer
import opennlp.tools.tokenize.Tokenizer; // import the required package/class
public Tokenizer getTokenizer() {
if(tokenizer != null) {
return ExtensionLoader.instantiateExtension(Tokenizer.class, this.tokenizer);
}
return WhitespaceTokenizer.INSTANCE;
}
Example 13: createTokenizer
import opennlp.tools.tokenize.Tokenizer; // import the required package/class
private static Tokenizer createTokenizer(String tokenizer) {
if(tokenizer != null) {
return ExtensionLoader.instantiateExtension(Tokenizer.class, tokenizer);
}
return WhitespaceTokenizer.INSTANCE;
}
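For illustration, ExtensionLoader.instantiateExtension resolves a fully qualified class name to an instance (via a public static INSTANCE field or a no-arg constructor), so a caller might use the factory method like this; the class name shown is one of OpenNLP's own tokenizers:
// Resolve a tokenizer by class name; null falls back to whitespace splitting.
Tokenizer simple = createTokenizer("opennlp.tools.tokenize.SimpleTokenizer");
Tokenizer fallback = createTokenizer(null); // WhitespaceTokenizer.INSTANCE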
Example 14: getTokenizer
import opennlp.tools.tokenize.Tokenizer; // import the required package/class
private Tokenizer getTokenizer(String tokenizer) {
if(tokenizer != null) {
return ExtensionLoader.instantiateExtension(Tokenizer.class, tokenizer);
}
return WhitespaceTokenizer.INSTANCE;
}
Example 15: getProbabilities
import opennlp.tools.tokenize.Tokenizer; // import the required package/class
public double[] getProbabilities(String documentText) {
Tokenizer tokenizer = this.factory.getTokenizer();
return getProbabilities(tokenizer.tokenize(documentText));
}