当前位置: 首页>>代码示例>>Java>>正文


Java Tokenizer.tokenize方法代码示例

本文整理汇总了Java中opennlp.tools.tokenize.Tokenizer.tokenize方法的典型用法代码示例。如果您正苦于以下问题:Java Tokenizer.tokenize方法的具体用法?Java Tokenizer.tokenize怎么用?Java Tokenizer.tokenize使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在opennlp.tools.tokenize.Tokenizer的用法示例。


在下文中一共展示了Tokenizer.tokenize方法的9个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: doRun

import opennlp.tools.tokenize.Tokenizer; //导入方法依赖的package包/类
@Override
public List<Word> doRun(Language language, String sentence) {
    // Build per-call OpenNLP components from the language-specific models.
    Tokenizer tokenizer = new TokenizerME(getTokenizerModel(language));
    POSTaggerME tagger = new POSTaggerME(getPOSModel(language));

    // Tokenize first, then tag the whole token array in one pass.
    String[] tokens = tokenizer.tokenize(sentence);
    String[] tags = tagger.tag(tokens);
    PartOfSpeechSet posSet = PartOfSpeechSet.getPOSSet(language);

    // Pair each token with its resolved part-of-speech tag.
    List<Word> result = new ArrayList<>(tokens.length);
    int idx = 0;
    for (String token : tokens) {
        result.add(new Word(posSet.valueOf(tags[idx]), token));
        idx++;
    }
    return result;
}
 
开发者ID:Lambda-3,项目名称:Stargraph,代码行数:17,代码来源:OpenNLPAnnotator.java

示例2: getConceptVector

import opennlp.tools.tokenize.Tokenizer; //导入方法依赖的package包/类
public INDArray getConceptVector(Concept c) {
    // Tokenize the lower-cased, trimmed concept name and look up an embedding
    // for every token; tokens missing from the vocabulary fall back to the
    // shared unknown-word vector.
    Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
    String[] tokens = tokenizer.tokenize(c.name.toLowerCase().trim());

    List<INDArray> embeddings = new ArrayList<INDArray>();
    int unknownCount = 0;
    for (String token : tokens) {
        if (wordVectors.hasWord(token)) {
            embeddings.add(wordVectors.getWordVectorMatrix(token));
        } else {
            embeddings.add(unkVector);
            unknownCount++;
        }
    }

    // Every token was unknown: there is nothing meaningful to average.
    if (unknownCount == embeddings.size()) {
        return null;
    }

    // Average the token vectors; sum vs. mean is irrelevant for cosine similarity.
    INDArray stacked = Nd4j.vstack(embeddings);
    return stacked.mean(0);
}
 
开发者ID:UKPLab,项目名称:ijcnlp2017-cmaps,代码行数:24,代码来源:WordEmbeddingDistance.java

示例3: tokenDetect

import opennlp.tools.tokenize.Tokenizer; //导入方法依赖的package包/类
/**
 * Tokenizes a sentence with the OpenNLP maxent tokenizer model matching the
 * instance's configured language (en_US or pt_BR).
 *
 * Fixes over the previous version: an unsupported language no longer causes a
 * NullPointerException (the model file used to stay null), and the method
 * returns an empty array instead of null when tokenization cannot be
 * performed, so callers never receive a null array.
 *
 * @param sentence the sentence to tokenize
 * @return the tokens, or an empty array if the language is unsupported or the
 *         model cannot be loaded
 */
public String[] tokenDetect(String sentence) {
	String[] tokens = new String[0];
	try {
		File userDir = new File(System.getProperty("user.dir"));
		File modelIn = null;
		String language = this.turNLPInstance.getLanguage();
		if ("en_US".equals(language)) {
			modelIn = new File(userDir.getAbsolutePath().concat("/models/opennlp/en/en-token.bin"));
		} else if ("pt_BR".equals(language)) {
			modelIn = new File(userDir.getAbsolutePath().concat("/models/opennlp/pt/pt-token.bin"));
		}
		if (modelIn == null) {
			// Unsupported language: previously this fell through and crashed
			// with a NullPointerException inside new TokenizerModel(null).
			return tokens;
		}
		TokenizerModel model = new TokenizerModel(modelIn);
		Tokenizer tokenizer = new TokenizerME(model);
		tokens = tokenizer.tokenize(sentence);
	} catch (IOException e) {
		// Best-effort: keep the empty-array result if the model fails to load.
		e.printStackTrace();
	}
	return tokens;
}
 
开发者ID:openviglet,项目名称:turing,代码行数:19,代码来源:TurOpenNLPConnector.java

示例4: stem

import opennlp.tools.tokenize.Tokenizer; //导入方法依赖的package包/类
protected Set<String> stem(Concept c) {
	// Stem every content word of the concept name, skipping function words.
	Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
	Set<String> stemmedWords = new HashSet<String>();
	String[] tokens = tokenizer.tokenize(c.name.toLowerCase().trim());
	for (String token : tokens) {
		if (this.functionWords.contains(token)) {
			continue;
		}
		stemmedWords.add((String) this.stemmer.stem(token));
	}
	return stemmedWords;
}
 
开发者ID:UKPLab,项目名称:ijcnlp2017-cmaps,代码行数:10,代码来源:JaccardDistance.java

示例5: tokenizeText

import opennlp.tools.tokenize.Tokenizer; //导入方法依赖的package包/类
/**
 * Utility offered to other elements of the pipeline for text tokenizing.
 *
 * The text is first split into sentences with the language's sentence model,
 * then each sentence is tokenized; the tokens of all sentences are returned
 * in document order as a single flat array.
 *
 * @param text the text to tokenize
 * @param language the language of the input text
 * @return an array containing the tokenized text.
 */
public static String[] tokenizeText(String text, String language) {

    setup();

    // Split the text into sentences
    SentenceModel sentModel = getSentenceModel(language + "-sent");
    SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentModel);
    String[] sentences = sentenceDetector.sentDetect(text);

    // Build the tokenizer once: the model is loop-invariant, and the previous
    // version needlessly re-created a TokenizerME for every sentence.
    TokenizerModel tokenModel = getTokenizerModel(language + "-token");
    Tokenizer tokenizer = new TokenizerME(tokenModel);

    // Collect the tokens of every sentence into one flat list.
    List<String> tokenizedText = new ArrayList<>();
    for (String sentenceString : sentences) {
        for (String token : tokenizer.tokenize(sentenceString)) {
            tokenizedText.add(token);
        }
    }
    return tokenizedText.toArray(new String[tokenizedText.size()]);
}
 
开发者ID:ailab-uniud,项目名称:distiller-CORE,代码行数:36,代码来源:OpenNlpBootstrapperAnnotator.java

示例6: testTokenizer

import opennlp.tools.tokenize.Tokenizer; //导入方法依赖的package包/类
public String[] testTokenizer(){
	// Tokenize the test phrase with the bundled English tokenizer model,
	// loaded from the classpath; the model stream is closed automatically.
	String[] result = {};
	try (InputStream modelIn = BasicActions.class.getClassLoader()
			.getResourceAsStream(Consts.EN_TOKEN_MODEL);) {
		TokenizerModel tokenModel = new TokenizerModel(modelIn);
		Tokenizer tokenizer = new TokenizerME(tokenModel);
		result = tokenizer.tokenize(TEST_PHRASE);
		System.out.println(Arrays.toString(result));
	} catch (IOException e) {
		// Best-effort demo method: log and fall through to the empty array.
		e.printStackTrace();
	}
	return result;
}
 
开发者ID:5agado,项目名称:knowledge-extraction,代码行数:15,代码来源:BasicActions.java

示例7: annotate

import opennlp.tools.tokenize.Tokenizer; //导入方法依赖的package包/类
/**
 * Annotates the document using the Apache OpenNLP tools.
 *
 * Splits the component's text into sentences, tokenizes and POS-tags each
 * sentence, and appends the resulting Sentence objects (with their tagged
 * tokens) to the component.
 *
 * @param blackboard the blackboard being annotated.
 * @param component the component to annotate.
 */
@Override
public void annotate(Blackboard blackboard, DocumentComponent component) {

    // set up the annotator
    setup();

    // Language tag used to retrieve the datasets
    String langTag = component.getLanguage().getLanguage();

    // Split the text into sentences
    SentenceModel sentModel = getSentenceModel(langTag + "-sent");
    SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentModel);
    String[] sentences = sentenceDetector.sentDetect(component.getText());

    // Build the tokenizer and tagger once: both are loop-invariant, and the
    // previous version re-created them for every sentence. Also renamed the
    // local from "POSModel" (which shadowed its own type name) to posModel.
    TokenizerModel tokenModel = getTokenizerModel(langTag + "-token");
    Tokenizer tokenizer = new TokenizerME(tokenModel);
    POSModel posModel = getPOSTaggerModel(langTag + "-pos-maxent");
    POSTaggerME tagger = new POSTaggerME(posModel);

    // Iterate through sentences and produce the distilled objects,
    // i.e. a sentence object with pos-tagged tokens.
    for (String sentenceString : sentences) {

        // the distilled sentence object
        Sentence sentence = new Sentence(sentenceString,
                "" + sentenceCounter++);
        sentence.setLanguage(component.getLanguage());

        // Tokenize and POS-tag the sentence
        String[] tokens = tokenizer.tokenize(sentenceString);
        String[] tags = tagger.tag(tokens);

        // put the features detected by OpenNLP in the distiller's sentence
        for (int i = 0; i < tokens.length; i++) {
            Token t = new Token(tokens[i]);
            t.setPoS(tags[i]);
            sentence.addToken(t);
        }
        ((DocumentComposite) component).addComponent(sentence);

    } // for (String sentenceString : sentences)
}
 
开发者ID:ailab-uniud,项目名称:distiller-CORE,代码行数:54,代码来源:OpenNlpBootstrapperAnnotator.java

示例8: convertToTokens

import opennlp.tools.tokenize.Tokenizer; //导入方法依赖的package包/类
private String[] convertToTokens(String sentence) {
	// Build a maxent tokenizer over the shared model and split the sentence.
	TokenizerME maxentTokenizer = new TokenizerME(tokenModel);
	String[] tokens = maxentTokenizer.tokenize(sentence);
	return tokens;
}
 
开发者ID:singram,项目名称:ner_service_example,代码行数:5,代码来源:OpenNlpNerService.java

示例9: tokensFromSentence

import opennlp.tools.tokenize.Tokenizer; //导入方法依赖的package包/类
public static String[] tokensFromSentence(String sentence) {
  // Split the sentence into tokens using the statically loaded model.
  TokenizerME maxentTokenizer = new TokenizerME(tokenModel);
  return maxentTokenizer.tokenize(sentence);
}
 
开发者ID:MachinePublishers,项目名称:ScreenSlicer,代码行数:5,代码来源:NlpUtil.java


注:本文中的opennlp.tools.tokenize.Tokenizer.tokenize方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。