本文整理汇总了Java中org.tartarus.snowball.SnowballStemmer类的典型用法代码示例。如果您正苦于以下问题:Java SnowballStemmer类的具体用法?Java SnowballStemmer怎么用?Java SnowballStemmer使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。
SnowballStemmer类属于org.tartarus.snowball包,在下文中一共展示了SnowballStemmer类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: englishSanityCheck
import org.tartarus.snowball.SnowballStemmer; //导入依赖的package包/类
@Test
public void englishSanityCheck() {
    // Smoke test: the Snowball English stemmer should reduce a simple
    // plural to its singular stem ("Jumps" -> "Jump").
    SnowballStemmer stemmer = new englishStemmer();
    stemmer.setCurrent("Jumps");
    stemmer.stem();
    Assert.assertEquals("Jump", stemmer.getCurrent());
}
示例2: createNewStemmer
import org.tartarus.snowball.SnowballStemmer; //导入依赖的package包/类
/**
 * Instantiates the Snowball stemmer for the configured {@code language}
 * by naming convention, e.g. "english" resolves to
 * {@code org.tartarus.snowball.ext.englishStemmer}.
 *
 * @return a fresh stemmer instance for {@code language}
 * @throws RuntimeException if no stemmer class exists for the language or
 *         it cannot be instantiated
 */
private SnowballStemmer createNewStemmer() {
    try {
        Class<?> stemClass = Class.forName("org.tartarus.snowball.ext." + language + "Stemmer");
        // getDeclaredConstructor().newInstance() replaces the deprecated
        // Class.newInstance(), which propagated checked constructor
        // exceptions undeclared.
        return (SnowballStemmer) stemClass.getDeclaredConstructor().newInstance();
    } catch (ReflectiveOperationException e) {
        // Keep the full cause chain; the old code passed only
        // e.getMessage(), which is often null for ClassNotFoundException.
        throw new RuntimeException("Cannot create stemmer for language: " + language, e);
    }
}
示例3: annotate
import org.tartarus.snowball.SnowballStemmer; //导入依赖的package包/类
/**
 * Annotates every gram of the component's sentences with a TF-IDF feature
 * computed over the stemmed, tokenized surface of the gram.
 *
 * @param blackboard the blackboard being annotated (not read directly here)
 * @param component  the document component whose grams are scored
 */
@Override
public void annotate(Blackboard blackboard, DocumentComponent component) {
    initIndex(component.getLanguage());
    // The component's language is fixed for the whole call, so resolve the
    // stemmer and the tokenizer language code once instead of per gram
    // (the original looked them up inside the inner loop).
    SnowballStemmer stemmer = SnowballStemmerSelector.
            getStemmerForLanguage(component.getLanguage());
    String languageCode = component.getLanguage().getLanguage();
    List<Sentence> sentences = DocumentUtils.getSentences(component);
    for (Sentence s : sentences) {
        for (Gram g : s.getGrams()) {
            // Skip grams that already carry a TF-IDF annotation.
            if (!g.hasAnnotation(TFIDF)) {
                String surface = g.getSurface();
                String[] tokens = OpenNlpBootstrapperAnnotator.
                        tokenizeText(surface, languageCode);
                // Stem each token in place; stem() returning false means the
                // stemmer could not process the token, so the original token
                // is kept.
                for (int i = 0; i < tokens.length; i++) {
                    stemmer.setCurrent(tokens[i]);
                    if (stemmer.stem()) {
                        tokens[i] = stemmer.getCurrent();
                    }
                }
                String stemmedSurface = String.join(" ", markTokens(tokens)).trim();
                ((Keyphrase) g).putFeature(TFIDF,
                        tfIdf(IOBlackboard.getCurrentDocument(), stemmedSurface));
            }
        }
    }
}
示例4: getStemmerForLanguage
import org.tartarus.snowball.SnowballStemmer; //导入依赖的package包/类
/**
 * Creates a Snowball stemmer for the provided locale, or returns null if
 * there isn't any.
 *
 * @param loc a locale
 * @return a freshly constructed stemmer for the locale's language, or null
 *         when the language is not supported
 */
public static SnowballStemmer getStemmerForLanguage(Locale loc) {
    final String code = loc.getLanguage();
    SnowballStemmer stemmer;
    // Map the ISO-639 language code to a new stemmer instance.
    switch (code) {
        case "da": stemmer = new danishStemmer(); break;
        case "nl": stemmer = new dutchStemmer(); break;
        case "en": stemmer = new englishStemmer(); break;
        case "fi": stemmer = new finnishStemmer(); break;
        case "fr": stemmer = new frenchStemmer(); break;
        case "de": stemmer = new germanStemmer(); break;
        case "hu": stemmer = new hungarianStemmer(); break;
        case "it": stemmer = new italianStemmer(); break;
        case "no": stemmer = new norwegianStemmer(); break;
        case "pt": stemmer = new portugueseStemmer(); break;
        case "ro": stemmer = new romanianStemmer(); break;
        case "ru": stemmer = new russianStemmer(); break;
        case "es": stemmer = new spanishStemmer(); break;
        case "sv": stemmer = new swedishStemmer(); break;
        case "tr": stemmer = new turkishStemmer(); break;
        default:   stemmer = null; break;
    }
    return stemmer;
}
示例5: getStemmer
import org.tartarus.snowball.SnowballStemmer; //导入依赖的package包/类
/**
 * Returns a new Snowball stemmer for the given locale, or null when the
 * locale is null, the language is unsupported, or instantiation fails.
 *
 * @param locale the locale whose language selects the stemmer; may be null
 * @return a fresh stemmer instance, or null
 */
public static SnowballStemmer getStemmer(Locale locale)
{
    if (locale == null)
        return null;
    // Locale.getLanguage() may legally return "" (e.g. Locale.ROOT); the
    // old unconditional substring(0, 2) threw
    // StringIndexOutOfBoundsException outside the try block below.
    String language = locale.getLanguage();
    String rootLang = language.length() > 2 ? language.substring(0, 2) : language;
    try
    {
        Class<?> clazz = SUPPORTED_LANGUAGES.get(rootLang);
        if (clazz == null)
            return null;
        // Constructors are cached per class to avoid repeated reflection.
        Constructor<?> ctor = STEMMER_CONSTRUCTOR_CACHE.get(clazz);
        return (SnowballStemmer) ctor.newInstance();
    }
    catch (Exception e)
    {
        // Best-effort: log at debug and report "no stemmer" to the caller.
        logger.debug("Failed to create new SnowballStemmer instance " +
                "for language [{}]", locale.getLanguage(), e);
    }
    return null;
}
示例6: stem
import org.tartarus.snowball.SnowballStemmer; //导入依赖的package包/类
/**
 * Stems every word in the given list with the Snowball English stemmer.
 *
 * @param input the words to stem
 * @return a new list with the stemmed words, in input order
 */
public List<String> stem(List<String> input) {
    SnowballStemmer stemmer = new englishStemmer();
    List<String> stemmed = new ArrayList<>(input.size());
    for (String word : input) {
        stemmer.setCurrent(word);
        stemmer.stem();
        stemmed.add(stemmer.getCurrent());
    }
    return stemmed;
}
示例7: stemInput
import org.tartarus.snowball.SnowballStemmer; //导入依赖的package包/类
/**
 * Normalizes, filters and stems the words of a raw input string.
 * Sentence punctuation is replaced by the {@code replacePunc} sentinel so
 * that negation scope resets at clause boundaries; stop words and negation
 * words are dropped, and words under an active negation are prefixed with
 * "not-".
 *
 * @param s the raw input text
 * @return the processed, stemmed tokens in order of appearance
 */
public ArrayList<String> stemInput(String s) {
    // englishStemmer already is a SnowballStemmer; the old cast was redundant.
    SnowballStemmer stemmer = new englishStemmer();
    s = s.replaceAll("[\\.\\,\\:\\?]", replacePunc);
    s = s.replaceAll("[^a-zA-Z0-9'\\s]", "").replaceAll("\\s+", " ");
    String[] words = s.split(" ");
    ArrayList<String> res = new ArrayList<>(words.length);
    // Primitive boolean instead of the boxed Boolean the original used.
    boolean negate = false;
    for (String word : words) {
        if (IsStopWord(word)) {
            continue;
        }
        if (replacePunc.contains(word)) {
            // Clause boundary: negation no longer applies.
            negate = false;
            continue;
        }
        if (IsNegation(word)) {
            // Double negation cancels out.
            negate = !negate;
            continue;
        }
        stemmer.setCurrent(word);
        stemmer.stem();
        res.add(negate ? "not-" + stemmer.getCurrent() : stemmer.getCurrent());
    }
    return res;
}
示例8: getStemmer
import org.tartarus.snowball.SnowballStemmer; //导入依赖的package包/类
/**
 * Returns a new Snowball stemmer for the given language, or null when the
 * language is not supported.
 *
 * NOTE(review): English maps to porterStemmer (original Porter algorithm)
 * while other snippets in this codebase use englishStemmer (Porter2); the
 * two can produce different stems -- confirm this is intentional.
 *
 * @param language the language of the text to stem
 * @return a fresh stemmer instance, or null if unsupported
 */
private SnowballStemmer getStemmer(Language language) {
    if (language.equals(Language.EN)) {
        return new porterStemmer();
    } else if (language.equals(Language.ES)) {
        return new spanishStemmer();
    } else if (language.equals(Language.FR)) {
        return new frenchStemmer();
    } else if (language.equals(Language.DE)) {
        return new germanStemmer();
    } else if (language.equals(Language.IT)) {
        return new italianStemmer();
    } else {
        // Unsupported language: callers must handle the null.
        return null;
    }
}
示例9: buildBag
import org.tartarus.snowball.SnowballStemmer; //导入依赖的package包/类
/**
 * Builds a bag-of-words model of the given text: each token produced by the
 * Lucene StandardAnalyzer (stemmed, when stemming is enabled and a stemmer
 * exists for the configured language) is mapped to its occurrence count.
 *
 * NOTE(review): this uses the legacy Lucene TermAttribute API and calls
 * incrementToken() without reset()/end()/close(); newer Lucene versions
 * require those calls and replaced TermAttribute with CharTermAttribute --
 * confirm against the pinned Lucene version.
 *
 * @param text the text to turn into a bag of words
 * @return map from (stemmed) term to its frequency in the text
 * @throws IOException if the token stream cannot be read
 */
public Map<String, Float> buildBag(String text) throws IOException {
    Map<String, Float> bag = new HashMap<>();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    SnowballStemmer stemmer = null;
    if (stemming) {
        stemmer = getStemmer(language);
        if (stemmer == null) {
            // Stemming requested but no stemmer for this language: warn and
            // fall back to unstemmed terms.
            Logger.getLogger(RevisedLesk.class.getName()).log(Level.WARNING, "No stemmer for language {0}", language);
        }
    }
    TokenStream tokenStream = analyzer.tokenStream("gloss", new StringReader(text));
    while (tokenStream.incrementToken()) {
        TermAttribute token = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
        String term = token.term();
        if (stemmer != null) {
            stemmer.setCurrent(term);
            // stem() returns false when the term could not be stemmed; the
            // original term is kept in that case.
            if (stemmer.stem()) {
                term = stemmer.getCurrent();
            }
        }
        // Increment the term's count (1f on first sight).
        Float c = bag.get(term);
        if (c == null) {
            bag.put(term, 1f);
        } else {
            bag.put(term, c + 1f);
        }
    }
    return bag;
}
示例10: stemTerms
import org.tartarus.snowball.SnowballStemmer; //导入依赖的package包/类
/**
 * The function is responsible for the stemming of each entity in the
 * dictionary based on the stemming language defined in the constructor.
 * {@code stemmingLanguage} must be the simple class name of a stemmer in
 * {@code org.tartarus.snowball.ext} (e.g. "englishStemmer"). For every
 * processed term this rebuilds its text from the stemmed tokens, shifts the
 * token begin/end offsets accordingly, and registers the stemmed form in
 * {@code processedDictionary} under the original term's concept.
 */
private void stemTerms() {
    try {
        int offset, overallOffset = 0;
        String word, name, uri;
        Concept concept;
        StringBuilder sb;
        // Reflectively instantiate the stemmer class named by stemmingLanguage.
        Class stemClass = Class.forName("org.tartarus.snowball.ext." + stemmingLanguage);
        SnowballStemmer stemmer = (SnowballStemmer) stemClass.newInstance();
        for (ProcessedText pt : processedTerms) {
            // Rebuild the term text as " stem1 stem2 ... " (the surrounding
            // spaces are stripped again before insertion into the dictionary).
            sb = new StringBuilder();
            sb.append(" ");
            for (Token token : pt.tokens) {
                stemmer.setCurrent(token.text);
                stemmer.stem();
                word = stemmer.getCurrent();
                // How much this token shrank (or grew, if negative) when stemmed.
                offset = token.text.length() - word.length();
                // NOTE(review): begin is shifted by the offset accumulated
                // *before* this token and end by the total *including* it --
                // presumably mapping the span into the stemmed text; verify
                // against how begin/end are consumed downstream.
                token.begin -= overallOffset;
                overallOffset += offset;
                token.end -= overallOffset;
                sb.append(word);
                sb.append(" ");
                token.stem = word;
            }
            name = sb.toString();
            concept = originalDictionary.getConcept(pt.originalText);
            // NOTE(review): the stored stemmed text keeps the leading and
            // trailing space, while the dictionary key below strips them.
            pt.setStemmedText(name);
            processedDictionary.addElement(name.substring(1, name.length() - 1), concept);
        }
    } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
        throw new RuntimeException(e);
    }
}
示例11: stemText
import org.tartarus.snowball.SnowballStemmer; //导入依赖的package包/类
/**
 * The function is responsible for the stemming of the main text based on the
 * stemming language defined in the constructor. {@code stemmingLanguage}
 * must be the simple class name of a stemmer in
 * {@code org.tartarus.snowball.ext} (e.g. "englishStemmer"). Token
 * begin/end offsets are shifted to refer to positions in the stemmed text,
 * and each token's {@code stem} field is set.
 */
private void stemText(ProcessedText processedText) {
    try {
        int offset, overallOffset = 0;
        String word;
        StringBuilder sb;
        // Reflectively instantiate the stemmer class named by stemmingLanguage.
        Class stemClass = Class.forName("org.tartarus.snowball.ext." + stemmingLanguage);
        SnowballStemmer stemmer = (SnowballStemmer) stemClass.newInstance();
        sb = new StringBuilder();
        sb.append(" ");
        for (Token token : processedText.tokens) {
            stemmer.setCurrent(token.text);
            stemmer.stem();
            word = stemmer.getCurrent();
            // How much this token shrank (or grew, if negative) when stemmed.
            offset = token.text.length() - word.length();
            // NOTE(review): begin is shifted by the offset accumulated before
            // this token, end by the total including it -- presumably mapping
            // the span into the stemmed text; verify with downstream users.
            token.begin -= overallOffset;
            overallOffset += offset;
            token.end -= overallOffset;
            sb.append(word);
            sb.append(" ");
            token.stem = word;
        }
        // NOTE(review): the stored stemmed text keeps the leading and
        // trailing space added around the joined stems.
        processedText.setStemmedText(sb.toString());
    } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
        throw new RuntimeException(e);
    }
}
示例12: SnowballStemmerWrapper
import org.tartarus.snowball.SnowballStemmer; //导入依赖的package包/类
/**
 * Wraps the given Snowball stemmer.
 *
 * @param stemmer the stemmer to delegate to; stored as-is (not copied,
 *                not null-checked)
 */
public SnowballStemmerWrapper(SnowballStemmer stemmer) {
    this.stemmer = stemmer;
}
示例13: setStemming
import org.tartarus.snowball.SnowballStemmer; //导入依赖的package包/类
/**
 * Replaces the Snowball stemmer used by this instance.
 *
 * @param newStemmer the stemmer to use from now on; stored as-is
 */
public void setStemming(SnowballStemmer newStemmer) {
    this.stemmer = newStemmer;
}
示例14: stem
import org.tartarus.snowball.SnowballStemmer; //导入依赖的package包/类
/**
 * Stems the given string with the Snowball stemmer for the given language.
 *
 * @param lang   the language selecting the stemmer; may be null
 * @param string the text to stem
 * @return the stemmed text, or the input unchanged when no stemmer is
 *         available for the language
 */
public static String stem(@Nullable final String lang, final String string) {
    final SnowballStemmer stemmer = getStemmer(lang);
    if (stemmer == null) {
        // getStemmer variants in this codebase return null for unsupported
        // languages; the old code dereferenced it and threw NPE. Fall back
        // to returning the input untouched.
        return string;
    }
    stemmer.setCurrent(string);
    stemmer.stem();
    return stemmer.getCurrent();
}
示例15: analyze
import org.tartarus.snowball.SnowballStemmer; //导入依赖的package包/类
/**
 * Tokenizes the body text with Stanford CoreNLP, drops stop words and
 * punctuation tokens, collapses numbers to a marker, stems each remaining
 * word with the Snowball English stemmer, and appends the results to
 * {@code tokens}.
 *
 * @return this analyzer, to allow call chaining
 */
public TextAnalyzer analyze() {
    // Stanford CoreNLP, avoid lemmatization as it's very slow to use Porter2 stemming
    // instead. (Porter -> Snowball (Porter2) -> Lancaster is order of stemming
    // aggressiveness.
    //
    // other ideas
    // - remove top 10k most common english words
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, stopword");
    props.setProperty("customAnnotatorClass.stopword", "com.asimihsan.handytrowel.nlp.StopwordAnnotator");
    // Stop-word list: top-1000 English words bundled as a classpath resource.
    List<String> stopWords = null;
    try {
        stopWords = WordReader.wordReaderWithResourcePath("/nlp/top1000words.txt").getWords();
    } catch (IOException e) {
        // NOTE(review): the IOException is swallowed with printStackTrace()
        // and the analyzer returns with no tokens; consider proper logging
        // or propagating the failure.
        e.printStackTrace();
        return this;
    }
    String customStopWordList = Joiner.on(",").join(stopWords);
    props.setProperty(StopwordAnnotator.STOPWORDS_LIST, customStopWordList);
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation document = new Annotation(body);
    pipeline.annotate(document);
    List<CoreLabel> inputTokens = document.get(CoreAnnotations.TokensAnnotation.class);
    SnowballStemmer stemmer = new englishStemmer();
    for (CoreLabel token : inputTokens) {
        // First element of the pair flags a stop word -- skip those.
        Pair<Boolean, Boolean> stopword = token.get(StopwordAnnotator.class);
        if (stopword.first())
            continue;
        String word = token.word().toLowerCase();
        //!!AI TODO this sucks, should make another annotator and make it optional etc.
        //also we're matching full stops! so we lose sentence information.
        if (punctuation.matcher(word).matches())
            continue;
        //!AI TODO again this would be its own annotator and optional
        // Collapse every numeric token to the literal marker "NUMBER".
        word = number.matcher(word).replaceAll("NUMBER");
        stemmer.setCurrent(word);
        stemmer.stem();
        word = stemmer.getCurrent();
        tokens.add(word);
    }
    return this;
}