

Java Tokenizer Class Code Examples

This article collects typical usage examples of the Java class opennlp.tools.tokenize.Tokenizer. If you are unsure how the Tokenizer class is used in practice, or are looking for concrete Tokenizer examples, the selected code samples below may help.


The Tokenizer class belongs to the opennlp.tools.tokenize package. Fifteen code examples of the class are shown below, sorted by popularity by default.
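Before the project-specific examples, here is a minimal, self-contained sketch of the two most common ways to obtain a Tokenizer: the rule-based SimpleTokenizer singleton and the statistical TokenizerME backed by a trained model. The model path "en-token.bin" is an assumption for illustration; substitute your own local model file.

import java.io.FileInputStream;
import java.io.InputStream;

import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

public class TokenizerQuickStart {
    public static void main(String[] args) throws Exception {
        // Rule-based tokenizer; no model file needed
        Tokenizer simple = SimpleTokenizer.INSTANCE;
        System.out.println(String.join(" | ", simple.tokenize("Hello, OpenNLP world!")));

        // Statistical tokenizer; "en-token.bin" is an assumed local path
        try (InputStream in = new FileInputStream("en-token.bin")) {
            TokenizerModel model = new TokenizerModel(in);
            Tokenizer statistical = new TokenizerME(model);
            System.out.println(String.join(" | ", statistical.tokenize("Hello, OpenNLP world!")));
        }
    }
}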

Example 1: doRun

import opennlp.tools.tokenize.Tokenizer; // import the required package/class
@Override
public List<Word> doRun(Language language, String sentence) {
    Tokenizer tokenizer = new TokenizerME(getTokenizerModel(language));
    POSTaggerME tagger = new POSTaggerME(getPOSModel(language));
    String[] tokens = tokenizer.tokenize(sentence);
    String[] tags = tagger.tag(tokens);

    PartOfSpeechSet posSet = PartOfSpeechSet.getPOSSet(language);

    List<Word> words = new ArrayList<>();
    for (int i = 0; i < tokens.length; i++) {
        words.add(new Word(posSet.valueOf(tags[i]), tokens[i]));
    }

    return words;
}
 
Developer: Lambda-3, Project: Stargraph, Lines of code: 17, Source: OpenNLPAnnotator.java

Example 2: getConceptVector

import opennlp.tools.tokenize.Tokenizer; // import the required package/class
public INDArray getConceptVector(Concept c) {

    Tokenizer tok = SimpleTokenizer.INSTANCE;

    List<INDArray> vectors = new ArrayList<INDArray>();
    int countUnk = 0;
    for (String word : tok.tokenize(c.name.toLowerCase().trim())) {
        if (wordVectors.hasWord(word))
            vectors.add(wordVectors.getWordVectorMatrix(word));
        else {
            vectors.add(unkVector);
            countUnk++;
        }
    }
    if (vectors.size() == countUnk)
        return null; // all tokens unknown
    INDArray allVectors = Nd4j.vstack(vectors);

    // sum or mean is irrelevant for cosine similarity
    INDArray conceptVector = allVectors.mean(0);

    return conceptVector;
}
 
Developer: UKPLab, Project: ijcnlp2017-cmaps, Lines of code: 24, Source: WordEmbeddingDistance.java

Example 3: tokenDetect

import opennlp.tools.tokenize.Tokenizer; // import the required package/class
public String[] tokenDetect(String sentence) {
	File modelIn = null;
	String tokens[] = null;
	try {
		File userDir = new File(System.getProperty("user.dir"));
		if (this.turNLPInstance.getLanguage().equals("en_US")) {
			modelIn = new File(userDir.getAbsolutePath().concat("/models/opennlp/en/en-token.bin"));
		} else if (this.turNLPInstance.getLanguage().equals("pt_BR")) {
			modelIn = new File(userDir.getAbsolutePath().concat("/models/opennlp/pt/pt-token.bin"));
		}
		TokenizerModel model = new TokenizerModel(modelIn);
		Tokenizer tokenizer = new TokenizerME(model);
		tokens = tokenizer.tokenize(sentence);
	} catch (IOException e) {
		e.printStackTrace();
	}
	return tokens;
}
 
Developer: openviglet, Project: turing, Lines of code: 19, Source: TurOpenNLPConnector.java

Example 4: create

import opennlp.tools.tokenize.Tokenizer; // import the required package/class
public static AgeClassifyFactory create(String subclassName, Tokenizer tokenizer,
        FeatureGenerator[] featureGenerators) throws InvalidFormatException {
    if (subclassName == null) {
        // will create the default factory
        return new AgeClassifyFactory(tokenizer, featureGenerators);
    }
    try {
        AgeClassifyFactory factory = AgeClassifyFactory.INSTANCE;
        factory.init(tokenizer, featureGenerators);
        return factory;
    } catch (Exception e) {
        String msg = "Could not instantiate the " + subclassName
                + ". The initialization threw an exception.";
        System.err.println(msg);
        e.printStackTrace();
        throw new InvalidFormatException(msg, e);
    }
}
 
Developer: USCDataScience, Project: AgePredictor, Lines of code: 20, Source: AgeClassifyFactory.java

Example 5: scoreStructure

import opennlp.tools.tokenize.Tokenizer; // import the required package/class
public double scoreStructure(String ca, String q, String passage, boolean verbose) throws InvalidFormatException, IOException{
	POSTaggerME parserModel = new POSTaggerME(new POSModel(new FileInputStream(new File("en-pos-model.bin"))));
	Tokenizer tokenizer = new TokenizerME(new TokenizerModel(new FileInputStream(new File("en-token.bin"))));
	Parser parser = ParserFactory.create(new ParserModel(new FileInputStream(new File("en-parser.bin"))));
	double score = 0;
	
	Parse[] questionParse = ParserTool.parseLine(q, parser, 1);
	Parse[] passageParse = ParserTool.parseLine(passage, parser, 1); // parse the passage, not the question
	
	if (passage.contains(ca)) {
		for (int i =0; i < questionParse.length; i++) {
			score += matchChildren(questionParse[i],passageParse[i]);
		}
	}
	
	return score;
}
 
Developer: SeanTater, Project: uncc2014watsonsim, Lines of code: 18, Source: JM_Scorer.java

Example 6: stem

import opennlp.tools.tokenize.Tokenizer; // import the required package/class
protected Set<String> stem(Concept c) {
	Set<String> stems = new HashSet<String>();
	Tokenizer tok = SimpleTokenizer.INSTANCE;
	for (String word : tok.tokenize(c.name.toLowerCase().trim())) {
		if (!this.functionWords.contains(word))
			stems.add((String) this.stemmer.stem(word));
	}
	return stems;
}
 
Developer: UKPLab, Project: ijcnlp2017-cmaps, Lines of code: 10, Source: JaccardDistance.java

Example 7: getTokenizer

import opennlp.tools.tokenize.Tokenizer; // import the required package/class
public Tokenizer getTokenizer() {
    if (this.tokenizer == null) {
        if (artifactProvider != null) {
            String className = artifactProvider.getManifestProperty(TOKENIZER_NAME);
            if (className != null) {
                this.tokenizer = ExtensionLoader.instantiateExtension(
                        Tokenizer.class, className);
            }
        }
        if (this.tokenizer == null) { // could not load using artifact provider
            this.tokenizer = WhitespaceTokenizer.INSTANCE;
        }
    }
    return tokenizer;
}
 
Developer: USCDataScience, Project: AgePredictor, Lines of code: 16, Source: AgeClassifyFactory.java

Example 8: tokenizeText

import opennlp.tools.tokenize.Tokenizer; // import the required package/class
/**
 * Utility offered to other elements of the pipeline for text tokenizing.
 *
 * @param text the text to tokenize
 * @param language the language of the input text
 * @return an array containing the tokenized text.
 */
public static String[] tokenizeText(String text, String language) {

    setup();

    // Split the text into sentences
    SentenceModel sentModel = getSentenceModel(language + "-sent");

    SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentModel);
    String sentences[] = sentenceDetector.sentDetect(text);

    // Get the right models
    TokenizerModel tokenModel = getTokenizerModel(language + "-token");

    // Iterate through sentences and produce the distilled objects, 
    // i.e. a sentence object with pos-tagged and stemmed tokens.
    List<String> tokenizedText = new ArrayList<>();

    for (String sentenceString : sentences) {

        // Tokenize the sentence
        Tokenizer tokenizer = new TokenizerME(tokenModel);
        String tokens[] = tokenizer.tokenize(sentenceString);
        for (String token : tokens) {
            tokenizedText.add(token);
        }
    }
    return tokenizedText.toArray(new String[tokenizedText.size()]);
}
 
Developer: ailab-uniud, Project: distiller-CORE, Lines of code: 36, Source: OpenNlpBootstrapperAnnotator.java
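A hedged usage sketch of the utility above: the model lookup performed by setup() and getTokenizerModel() is assumed to succeed, and the language key "en" is an assumption that depends on how the project registers its models.

// Hypothetical call site; the language key and available models are assumptions.
String[] tokens = OpenNlpBootstrapperAnnotator.tokenizeText(
        "OpenNLP first detects sentences, then tokenizes each one.", "en");
for (String token : tokens) {
    System.out.println(token);
}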

Example 9: testTokenizer

import opennlp.tools.tokenize.Tokenizer; // import the required package/class
public String[] testTokenizer(){
	String[] tokens = {};
	try (InputStream modelIn = BasicActions.class.getClassLoader()
			.getResourceAsStream(Consts.EN_TOKEN_MODEL);) {
		
		TokenizerModel tokenModel = new TokenizerModel(modelIn);
		Tokenizer tokenizer = new TokenizerME(tokenModel);
		tokens = tokenizer.tokenize(TEST_PHRASE);
		System.out.println(Arrays.toString(tokens));
	} catch (IOException e) {
		e.printStackTrace();
	}
	return tokens;
}
 
Developer: 5agado, Project: knowledge-extraction, Lines of code: 15, Source: BasicActions.java

Example 10: parsePassageText

import opennlp.tools.tokenize.Tokenizer; // import the required package/class
public Parse[] parsePassageText(String p) throws InvalidFormatException{
	if (!modelsAreInitialized)init();
	//initialize 	 
	SentenceDetectorME sentenceDetector = new SentenceDetectorME(this.sentenceModel);
	NameFinderME nameFinder = new NameFinderME(this.nerModel);
	Parser parser = ParserFactory.create(
			this.parserModel,
			20, // beam size
			0.95); // advance percentage
	//find sentences, tokenize each, parse each, return top parse for each 	 	 
	String[] sentences = sentenceDetector.sentDetect(p);
	Parse[] results = new Parse[sentences.length];
	for (int i=0;i<sentences.length;i++){
		//String[] tks = SimpleTokenizer.INSTANCE.tokenize(sentences[i]);
		
		//StringTokenizer st = new StringTokenizer(tks[i]); 
		//There are several tokenizers available. SimpleTokenizer works best
		Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
		for (int si = 0; si < sentences.length; si++) {
	        Span[] tokenSpans = tokenizer.tokenizePos(sentences[si]);
	        String[] tokens = Span.spansToStrings(tokenSpans, sentences[si]);
	        Span[] names = nameFinder.find(tokens);
	        for (int ni = 0; ni < names.length; ni++) {
	            Span startSpan = tokenSpans[names[ni].getStart()];
	            int nameStart = startSpan.getStart();
	            Span endSpan = tokenSpans[names[ni].getEnd() - 1];
	            int nameEnd = endSpan.getEnd();
	            String name = sentences[si].substring(nameStart, nameEnd);
	            System.out.println(name);
	        }
	    }
		// join the tokens of the current sentence so the parser sees normalized spacing
		String[] sentTokens = tokenizer.tokenize(sentences[i]);
		String sent = StringUtils.join(sentTokens, " ");
		System.out.println("Found sentence " + sent);
		Parse[] sentResults = ParserTool.parseLine(sent,parser, 1);
		results[i]=sentResults[0];
	}
	return results;
}
 
Developer: SeanTater, Project: uncc2014watsonsim, Lines of code: 39, Source: NERScorer.java

Example 11: getTokenizer

import opennlp.tools.tokenize.Tokenizer; // import the required package/class
@CheckedProvides(TokenizerProvider.class)
Tokenizer getTokenizer() throws IOException {
  try (InputStream is = getClass().getResourceAsStream("/opennlp/en-token.bin")) {
    TokenizerModel model = new TokenizerModel(is);
    return new TokenizerME(model);
  }
}
 
Developer: SciGraph, Project: SciGraph, Lines of code: 8, Source: OpenNlpModule.java

Example 12: getTokenizer

import opennlp.tools.tokenize.Tokenizer; // import the required package/class
public Tokenizer getTokenizer() {
    if (tokenizer != null) {
        return ExtensionLoader.instantiateExtension(Tokenizer.class, this.tokenizer);
    }
    return WhitespaceTokenizer.INSTANCE;
}
 
Developer: USCDataScience, Project: AgePredictor, Lines of code: 7, Source: AgeClassifyContextGeneratorWrapper.java

Example 13: createTokenizer

import opennlp.tools.tokenize.Tokenizer; // import the required package/class
private static Tokenizer createTokenizer(String tokenizer) {
    if (tokenizer != null) {
        return ExtensionLoader.instantiateExtension(Tokenizer.class, tokenizer);
    }
    return WhitespaceTokenizer.INSTANCE;
}
 
Developer: USCDataScience, Project: AgePredictor, Lines of code: 7, Source: AgePredictTrainerTool.java

Example 14: getTokenizer

import opennlp.tools.tokenize.Tokenizer; // import the required package/class
private Tokenizer getTokenizer(String tokenizer) {
    if(tokenizer != null) {
        return ExtensionLoader.instantiateExtension(Tokenizer.class, tokenizer);
    }
    return WhitespaceTokenizer.INSTANCE;
}
 
Developer: USCDataScience, Project: AgePredictor, Lines of code: 7, Source: AuthorAgeSampleStreamFactory.java

Example 15: getProbabilities

import opennlp.tools.tokenize.Tokenizer; // import the required package/class
public double[] getProbabilities(String documentText) {
    Tokenizer tokenizer = this.factory.getTokenizer();
    return getProbabilities(tokenizer.tokenize(documentText));
}
 
Developer: USCDataScience, Project: AgePredictor, Lines of code: 5, Source: AgeClassifyME.java


Note: The opennlp.tools.tokenize.Tokenizer examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors. Please consult each project's license before distributing or using the code; do not reproduce this article without permission.