This page collects typical usage examples of the Java method org.tartarus.snowball.SnowballStemmer.stem. If you are wondering what SnowballStemmer.stem does, how to call it, or want working example code, the curated examples below should help. You can also explore further usage of its declaring class, org.tartarus.snowball.SnowballStemmer.
Below are 9 code examples of SnowballStemmer.stem, ordered by popularity by default.
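Before the examples, here is the minimal calling pattern they all share (a quick sketch, assuming the standard Snowball Java bindings): load the input with setCurrent(), call stem(), then read the result back with getCurrent().

import org.tartarus.snowball.SnowballStemmer;
import org.tartarus.snowball.ext.englishStemmer;

SnowballStemmer stemmer = new englishStemmer();
stemmer.setCurrent("running");        // load the input word
stemmer.stem();                       // returns true if stemming succeeded
String result = stemmer.getCurrent(); // "run"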
Example 1: englishSanityCheck
import org.tartarus.snowball.SnowballStemmer; // import the package/class this method depends on
@Test
public void englishSanityCheck() {
    SnowballStemmer snowballStemmer = new englishStemmer();
    snowballStemmer.setCurrent("Jumps");
    snowballStemmer.stem();
    String result = snowballStemmer.getCurrent();
    Assert.assertEquals("Jump", result);
}
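Note that Snowball stemmers do not lowercase their input: "Jumps" keeps its capital letter and stems to "Jump", which is exactly what this test asserts.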
Example 2: annotate
import org.tartarus.snowball.SnowballStemmer; // import the package/class this method depends on
@Override
public void annotate(Blackboard blackboard, DocumentComponent component) {
    initIndex(component.getLanguage());
    List<Sentence> sentences = DocumentUtils.getSentences(component);
    for (Sentence s : sentences) {
        for (Gram g : s.getGrams()) {
            if (!g.hasAnnotation(TFIDF)) {
                String stemmedSurface = g.getSurface();
                String[] tokenizedSurface = OpenNlpBootstrapperAnnotator.tokenizeText(
                        stemmedSurface, component.getLanguage().getLanguage());
                SnowballStemmer stemmer = SnowballStemmerSelector.getStemmerForLanguage(
                        component.getLanguage());
                for (int i = 0; i < tokenizedSurface.length; i++) {
                    stemmer.setCurrent(tokenizedSurface[i]);
                    if (stemmer.stem()) {
                        tokenizedSurface[i] = stemmer.getCurrent();
                    }
                }
                stemmedSurface = String.join(" ", markTokens(tokenizedSurface)).trim();
                ((Keyphrase) g).putFeature(TFIDF,
                        tfIdf(IOBlackboard.getCurrentDocument(), stemmedSurface));
            }
        }
    }
}
Example 3: stem
import org.tartarus.snowball.SnowballStemmer; // import the package/class this method depends on
/**
 * Performs stemming on each word in the given list.
 *
 * @param input the words we would like to stem
 * @return the stemmed words, in the same order
 */
public List<String> stem(List<String> input) {
    List<String> output = new ArrayList<>();
    SnowballStemmer snowballStemmer = new englishStemmer();
    for (String word : input) {
        snowballStemmer.setCurrent(word);
        snowballStemmer.stem();
        output.add(snowballStemmer.getCurrent());
    }
    return output;
}
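A quick usage sketch for the helper above (the enclosing class is not shown in the snippet, so StemmerHelper is a hypothetical name):

List<String> stems = new StemmerHelper().stem(Arrays.asList("jumping", "dogs", "cities"));
// -> ["jump", "dog", "citi"]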
Example 4: stemInput
import org.tartarus.snowball.SnowballStemmer; // import the package/class this method depends on
public ArrayList<String> stemInput(String s) {
    SnowballStemmer stemmer = new englishStemmer();
    s = s.replaceAll("[\\.\\,\\:\\?]", replacePunc);
    s = s.replaceAll("[^a-zA-Z0-9'\\s]", "").replaceAll("\\s+", " ");
    String[] tmp = s.split(" ");
    ArrayList<String> res = new ArrayList<String>();
    boolean negate = false;
    for (String word : tmp) {
        if (IsStopWord(word)) {
            continue;
        }
        if (replacePunc.contains(word)) {
            negate = false;
            continue;
        }
        if (IsNegation(word)) {
            negate = !negate;
            continue;
        }
        stemmer.setCurrent(word);
        stemmer.stem();
        if (negate) {
            res.add("not-" + stemmer.getCurrent());
        } else {
            res.add(stemmer.getCurrent());
        }
    }
    return res;
}
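The notable design choice here is negation scoping: IsNegation flips the negate flag, and every following content word is emitted with a "not-" prefix until a punctuation placeholder resets the flag (this assumes replacePunc is a space-delimited placeholder token, which the snippet does not show). A hypothetical run, assuming "a" is in the stop-word list:

ArrayList<String> tokens = stemInput("not a good movie.");
// -> ["not-good", "not-movi"]; both content words fall inside the negation
// scope, and the placeholder substituted for the full stop then resets it.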
Example 5: buildBag
import org.tartarus.snowball.SnowballStemmer; // import the package/class this method depends on
/**
 * Builds a bag of words from the given text, optionally stemming each term.
 *
 * @param text the text to tokenize
 * @return a map from (possibly stemmed) term to its frequency in the text
 * @throws IOException if the token stream cannot be read
 */
public Map<String, Float> buildBag(String text) throws IOException {
    Map<String, Float> bag = new HashMap<>();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    SnowballStemmer stemmer = null;
    if (stemming) {
        stemmer = getStemmer(language);
        if (stemmer == null) {
            Logger.getLogger(RevisedLesk.class.getName()).log(Level.WARNING, "No stemmer for language {0}", language);
        }
    }
    TokenStream tokenStream = analyzer.tokenStream("gloss", new StringReader(text));
    while (tokenStream.incrementToken()) {
        TermAttribute token = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
        String term = token.term();
        if (stemmer != null) {
            stemmer.setCurrent(term);
            if (stemmer.stem()) {
                term = stemmer.getCurrent();
            }
        }
        Float c = bag.get(term);
        if (c == null) {
            bag.put(term, 1f);
        } else {
            bag.put(term, c + 1f);
        }
    }
    return bag;
}
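Example 5 targets the old Lucene analysis API: TermAttribute was removed in Lucene 4, and newer versions also require reset()/end() around the consumption loop. A minimal sketch of the same counting loop on a recent Lucene (5.x or later), with stemming left out for brevity:

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public Map<String, Float> buildBagModern(String text) throws IOException {
    Map<String, Float> bag = new HashMap<>();
    Analyzer analyzer = new StandardAnalyzer();
    try (TokenStream ts = analyzer.tokenStream("gloss", new StringReader(text))) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            bag.merge(termAtt.toString(), 1f, Float::sum); // count each term
        }
        ts.end();
    }
    return bag;
}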
Example 6: stemTerms
import org.tartarus.snowball.SnowballStemmer; // import the package/class this method depends on
/**
 * Stems each entry in the dictionary using the stemming language defined in the constructor,
 * adjusting token offsets to match the stemmed text.
 */
private void stemTerms() {
    try {
        int offset, overallOffset = 0;
        String word, name;
        Concept concept;
        StringBuilder sb;
        Class<?> stemClass = Class.forName("org.tartarus.snowball.ext." + stemmingLanguage);
        SnowballStemmer stemmer = (SnowballStemmer) stemClass.newInstance();
        for (ProcessedText pt : processedTerms) {
            sb = new StringBuilder();
            sb.append(" ");
            for (Token token : pt.tokens) {
                stemmer.setCurrent(token.text);
                stemmer.stem();
                word = stemmer.getCurrent();
                offset = token.text.length() - word.length(); // characters removed by stemming
                token.begin -= overallOffset;  // shift start by the cuts made before this token
                overallOffset += offset;
                token.end -= overallOffset;    // shift end by the cuts including this token
                sb.append(word);
                sb.append(" ");
                token.stem = word;
            }
            name = sb.toString();
            concept = originalDictionary.getConcept(pt.originalText);
            pt.setStemmedText(name);
            // strip the leading and trailing space added around the stemmed text
            processedDictionary.addElement(name.substring(1, name.length() - 1), concept);
        }
    } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
        throw new RuntimeException(e);
    }
}
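The offset bookkeeping above is subtle: each stemmed token shrinks the text by offset characters, so token.begin is shifted by the cuts made before the token and token.end by the cuts including it. A self-contained illustration (the Token class here is a hypothetical stand-in for the real one):

import org.tartarus.snowball.SnowballStemmer;
import org.tartarus.snowball.ext.englishStemmer;

public class OffsetDemo {
    static class Token {
        String text;
        int begin, end;
        Token(String text, int begin, int end) { this.text = text; this.begin = begin; this.end = end; }
    }

    public static void main(String[] args) {
        Token[] tokens = { new Token("jumping", 0, 7), new Token("dogs", 8, 12) };
        SnowballStemmer stemmer = new englishStemmer();
        int overallOffset = 0;
        for (Token token : tokens) {
            stemmer.setCurrent(token.text);
            stemmer.stem();
            String word = stemmer.getCurrent();
            int offset = token.text.length() - word.length(); // characters removed
            token.begin -= overallOffset;  // cuts made before this token
            overallOffset += offset;
            token.end -= overallOffset;    // cuts including this token
            System.out.printf("%s -> %s [%d,%d]%n", token.text, word, token.begin, token.end);
        }
        // prints: jumping -> jump [0,4] and dogs -> dog [5,8]
    }
}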
Example 7: stemText
import org.tartarus.snowball.SnowballStemmer; // import the package/class this method depends on
/**
 * Stems the main text using the stemming language defined in the constructor,
 * adjusting token offsets to match the stemmed text.
 */
private void stemText(ProcessedText processedText) {
    try {
        int offset, overallOffset = 0;
        String word;
        StringBuilder sb;
        Class<?> stemClass = Class.forName("org.tartarus.snowball.ext." + stemmingLanguage);
        SnowballStemmer stemmer = (SnowballStemmer) stemClass.newInstance();
        sb = new StringBuilder();
        sb.append(" ");
        for (Token token : processedText.tokens) {
            stemmer.setCurrent(token.text);
            stemmer.stem();
            word = stemmer.getCurrent();
            offset = token.text.length() - word.length();
            token.begin -= overallOffset;
            overallOffset += offset;
            token.end -= overallOffset;
            sb.append(word);
            sb.append(" ");
            token.stem = word;
        }
        processedText.setStemmedText(sb.toString());
    } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
        throw new RuntimeException(e);
    }
}
Example 8: stem
import org.tartarus.snowball.SnowballStemmer; // import the package/class this method depends on
public static String stem(@Nullable final String lang, final String string) {
    final SnowballStemmer stemmer = getStemmer(lang);
    stemmer.setCurrent(string);
    stemmer.stem();
    return stemmer.getCurrent();
}
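getStemmer is not shown in this snippet; a plausible sketch, mirroring the reflection pattern from Examples 6 and 7 (hypothetical, including the English fallback for a null or unknown language):

import org.tartarus.snowball.SnowballStemmer;
import org.tartarus.snowball.ext.englishStemmer;

static SnowballStemmer getStemmer(@Nullable final String lang) {
    if (lang == null) {
        return new englishStemmer(); // assumed default
    }
    try {
        Class<?> cls = Class.forName("org.tartarus.snowball.ext." + lang.toLowerCase() + "Stemmer");
        return (SnowballStemmer) cls.newInstance();
    } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
        return new englishStemmer(); // fall back to English
    }
}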
Example 9: analyze
import org.tartarus.snowball.SnowballStemmer; // import the package/class this method depends on
public TextAnalyzer analyze() {
    // Stanford CoreNLP lemmatization is very slow, so we avoid it and use
    // Porter2 stemming instead. (Porter -> Snowball (Porter2) -> Lancaster,
    // in increasing order of stemming aggressiveness.)
    //
    // Other ideas:
    // - remove the top 10k most common English words
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, stopword");
    props.setProperty("customAnnotatorClass.stopword", "com.asimihsan.handytrowel.nlp.StopwordAnnotator");
    List<String> stopWords = null;
    try {
        stopWords = WordReader.wordReaderWithResourcePath("/nlp/top1000words.txt").getWords();
    } catch (IOException e) {
        e.printStackTrace();
        return this;
    }
    String customStopWordList = Joiner.on(",").join(stopWords);
    props.setProperty(StopwordAnnotator.STOPWORDS_LIST, customStopWordList);
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation document = new Annotation(body);
    pipeline.annotate(document);
    List<CoreLabel> inputTokens = document.get(CoreAnnotations.TokensAnnotation.class);
    SnowballStemmer stemmer = new englishStemmer();
    for (CoreLabel token : inputTokens) {
        Pair<Boolean, Boolean> stopword = token.get(StopwordAnnotator.class);
        if (stopword.first())
            continue;
        String word = token.word().toLowerCase();
        // TODO: this should be its own optional annotator. Note that full stops
        // match here too, so sentence boundary information is lost.
        if (punctuation.matcher(word).matches())
            continue;
        // TODO: this too would be better as its own optional annotator.
        word = number.matcher(word).replaceAll("NUMBER");
        stemmer.setCurrent(word);
        stemmer.stem();
        word = stemmer.getCurrent();
        tokens.add(word);
    }
    return this;
}
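A hypothetical chained usage of the analyzer above, assuming the enclosing TextAnalyzer takes the body text in its constructor and exposes the collected tokens through a getter (neither is shown in the snippet):

List<String> stemmed = new TextAnalyzer(body).analyze().getTokens();
// stop words and punctuation dropped, numbers collapsed to "NUMBER", every token stemmed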