This article collects typical usage examples of the Java class info.ephyra.nlp.SnowballStemmer. If you are wondering what the SnowballStemmer class does and how to use it in practice, the curated code examples below may help.
The SnowballStemmer class belongs to the info.ephyra.nlp package. A total of 15 code examples of the class are shown below.
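Before the examples, a minimal sketch of the typical call pattern: the stemmer is initialized once via create() and then used through static methods, stem() for a single token and stemAllTokens() for a whitespace-delimited string, exactly as the examples below do. The sample words are made up for illustration.

import info.ephyra.nlp.SnowballStemmer;

public class SnowballStemmerSketch {
    public static void main(String[] args) {
        SnowballStemmer.create();  // load the stemmer once before first use
        System.out.println(SnowballStemmer.stem("stemming"));             // stem a single token
        System.out.println(SnowballStemmer.stemAllTokens("dogs barked")); // stem every token of a string
    }
}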
Example 1: TRECNugget
import info.ephyra.nlp.SnowballStemmer; // import the package/class this example depends on
/**
 * @param targetID the targetID of the TREC target the OTHER question belongs to
 * @param questionID the ID of the OTHER question
 * @param nuggetID the ID of the nugget
 * @param nuggetType the type of the nugget (okay or vital)
 * @param nugget the nugget's text
 */
public TRECNugget(String targetID, String questionID, String nuggetID, String nuggetType, String nugget) {
    this.targetID = targetID;
    this.questionID = questionID;
    this.nuggetID = nuggetID;
    this.nuggetType = nuggetType;
    this.nugget = nugget;
    String[] nTokens = NETagger.tokenize(nugget);
    HashSet<String> nSet = new HashSet<String>();
    for (String n : nTokens)
        if (!FunctionWords.lookup(n) && (n.length() > 1))
            nSet.add(SnowballStemmer.stem(n).toLowerCase());
    this.size = nSet.size();
}
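A hypothetical construction call (the TREC identifiers and nugget text are made up) illustrating what the constructor computes: size ends up holding the number of distinct stemmed, non-function-word tokens of the nugget text.

// hypothetical values, for illustration only
TRECNugget nug = new TRECNugget("216", "216.7", "1", "vital",
        "The quick brown fox jumps over the lazy dog");
// nug.size now holds the count of distinct stemmed content tokens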
Example 2: covers
import info.ephyra.nlp.SnowballStemmer; // import the package/class this example depends on
/**
 * Checks if a result covers a nugget.
 *
 * @param result the result String
 * @param nugget the nugget String
 * @return the tokens of the specified nugget String not contained in the specified result String
 */
private String[] covers(String result, String nugget) {
    String[] rTokens = NETagger.tokenize(result);
    HashSet<String> rSet = new HashSet<String>();
    for (String r : rTokens)
        if (!FunctionWords.lookup(r) && (r.length() > 1))
            rSet.add(SnowballStemmer.stem(r).toLowerCase());
    String[] nTokens = NETagger.tokenize(nugget);
    HashSet<String> nSet = new HashSet<String>();
    for (String n : nTokens)
        if (!FunctionWords.lookup(n) && (n.length() > 1))
            nSet.add(SnowballStemmer.stem(n).toLowerCase());
    nSet.removeAll(rSet);
    ArrayList<String> remaining = new ArrayList<String>(nSet);
    return remaining.toArray(new String[remaining.size()]);
}
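At its core this method is a set difference over normalized tokens. A self-contained sketch of the same idea using only java.util, without the Ephyra-specific tokenization, stemming and function-word filtering (class and method names here are made up):

import java.util.Arrays;
import java.util.HashSet;

public class CoverageSketch {
    // returns the whitespace-delimited tokens of nugget that do not occur in result
    public static String[] uncovered(String result, String nugget) {
        HashSet<String> resultTokens =
                new HashSet<String>(Arrays.asList(result.toLowerCase().split("\\s+")));
        HashSet<String> nuggetTokens =
                new HashSet<String>(Arrays.asList(nugget.toLowerCase().split("\\s+")));
        nuggetTokens.removeAll(resultTokens);  // keep only the tokens the result misses
        return nuggetTokens.toArray(new String[nuggetTokens.size()]);
    }
}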
Example 3: add
import info.ephyra.nlp.SnowballStemmer; // import the package/class this example depends on
/**
 * Adds a word to the dictionary.
 *
 * @param word the word to add
 */
public void add(String word) {
    if (word != null) {
        word = NETagger.tokenizeWithSpaces(word.trim().toLowerCase());
        word = SnowballStemmer.stemAllTokens(word);
        // add whole word
        if (word.length() > 0) words.add(word);
        // add tokens of word
        String[] tokens = word.split(" ");
        if (tokens.length > maxTokens) maxTokens = tokens.length;
        for (int p = 0; p < tokens.length; p++)
            if (tokens[p].length() > 0) this.tokens.add(tokens[p]);
    }
}
Example 4: equalsCommonNorm
import info.ephyra.nlp.SnowballStemmer; // import the package/class this example depends on
/**
 * Compares the normalizations of the two strings, using the same criterion
 * as the <code>equalsCommon()</code> method.
 *
 * @param s1 string 1
 * @param s2 string 2
 * @return <code>true</code> iff the two normalized strings have a token in common
 */
public static boolean equalsCommonNorm(String s1, String s2) {
    // convert to lower-case
    s1 = s1.toLowerCase();
    s2 = s2.toLowerCase();
    // tokenize
    String tokens1[] = NETagger.tokenize(s1);
    String tokens2[] = NETagger.tokenize(s2);
    // eliminate function words and tokens of length < 2, stem all tokens
    ArrayList<String> tks1 = new ArrayList<String>();
    for (String token1 : tokens1)
        if (token1.length() > 1 && !FunctionWords.lookup(token1))
            tks1.add(SnowballStemmer.stem(token1));
    HashSet<String> tks2 = new HashSet<String>();
    for (String token2 : tokens2)
        if (token2.length() > 1 && !FunctionWords.lookup(token2))
            tks2.add(SnowballStemmer.stem(token2));
    // check for common token
    for (String token : tks1) if (tks2.contains(token)) return true;
    return false;
}
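A hypothetical usage example; the enclosing class is not shown in the snippet, so the placeholder name StringUtils is assumed here:

// hypothetical call; "dogs"/"dog" and "barked"/"barking" stem to common tokens, so this returns true
boolean related = StringUtils.equalsCommonNorm("the dogs barked loudly", "a barking dog");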
Example 5: equalsCommonProp
import info.ephyra.nlp.SnowballStemmer; // import the package/class this example depends on
/**
 * Compares two strings, using the same criterion as the
 * <code>equalsCommonNorm()</code> method, but considers only words starting
 * with a capital letter (proper nouns).
 *
 * @param s1 string 1
 * @param s2 string 2
 * @return <code>true</code> iff the two strings have a proper noun in common
 */
public static boolean equalsCommonProp(String s1, String s2) {
    // tokenize (case is preserved so that proper nouns can be identified)
    String tokens1[] = NETagger.tokenize(s1);
    String tokens2[] = NETagger.tokenize(s2);
    // keep tokens of length > 1 that are not function words and start with a
    // capital letter, stem the remaining tokens
    ArrayList<String> tks1 = new ArrayList<String>();
    for (String token1 : tokens1)
        if (token1.length() > 1 && !FunctionWords.lookup(token1.toLowerCase()) && token1.substring(0, 1).matches("[A-Z]"))
            tks1.add(SnowballStemmer.stem(token1.toLowerCase()));
    HashSet<String> tks2 = new HashSet<String>();
    for (String token2 : tokens2)
        if (token2.length() > 1 && !FunctionWords.lookup(token2.toLowerCase()) && token2.substring(0, 1).matches("[A-Z]"))
            tks2.add(SnowballStemmer.stem(token2.toLowerCase()));
    // check for common token
    for (String token : tks1) if (tks2.contains(token)) return true;
    return false;
}
Example 6: match
import info.ephyra.nlp.SnowballStemmer; // import the package/class this example depends on
/**
 * Checks if every (stemmed) token of the first phrase also occurs in the
 * second phrase.
 *
 * @param npq parsed string
 * @param npt parsed string
 * @return <code>true</code> iff the second phrase covers all tokens of the first
 */
private static boolean match(String npq, String npt) {
    String q = unparse(npq).replace("'s", "").replace("'", "");
    String t = unparse(npt).replace("'s", "").replace("'", "");
    boolean exists;
    for (String token1 : q.split(" ")) {
        token1 = SnowballStemmer.stem(token1);
        exists = false;
        for (String token2 : t.split(" ")) {
            token2 = SnowballStemmer.stem(token2);
            // System.out.println(token1 + ":" + token2);
            if (token1.equalsIgnoreCase(token2)) {
                exists = true;
                break;
            }
        }
        if (!exists) {
            return false;
        }
    }
    return true;
}
Example 7: HashDictionary
import info.ephyra.nlp.SnowballStemmer; // import the package/class this example depends on
/**
 * Creates a <code>HashDictionary</code> from a list of words in a file.
 *
 * @param fileName file containing a list of words
 * @throws IOException if the list could not be read from the file
 */
public HashDictionary(String fileName) throws IOException {
    this();
    if (fileName != null) {
        File file = new File(fileName);
        BufferedReader in = new BufferedReader(new FileReader(file));
        while (in.ready()) {
            // read and normalize word
            String word = in.readLine().trim();
            if (word.startsWith("//")) continue;  // skip comments
            word = NETagger.tokenizeWithSpaces(word.toLowerCase());
            word = SnowballStemmer.stemAllTokens(word);
            // add whole word
            if (word.length() > 0) words.add(word);
            // add tokens of word
            String[] tokens = word.split(" ");
            if (tokens.length > maxTokens) maxTokens = tokens.length;
            for (int p = 0; p < tokens.length; p++)
                if (tokens[p].length() > 0) this.tokens.add(tokens[p]);
        }
        in.close();
    }
}
Example 8: contains
import info.ephyra.nlp.SnowballStemmer; // import the package/class this example depends on
/**
 * Looks up a word.
 *
 * @param word the word to look up
 * @return <code>true</code> iff the word was found
 */
public boolean contains(String word) {
    word = NETagger.tokenizeWithSpaces(word.trim().toLowerCase());
    word = SnowballStemmer.stemAllTokens(word);
    return words.contains(word);
}
Example 9: fuzzyContains
import info.ephyra.nlp.SnowballStemmer; // import the package/class this example depends on
/**
 * Does a fuzzy lookup for a word. The specified word w is considered
 * contained in the dictionary if there is a word W in the dictionary such
 * that <code>LevenshteinDistance(w, W) <= maxDistance</code>.
 *
 * @param word the word to look up
 * @param maxDistance the maximum Levenshtein edit distance for fuzzy
 *                    comparison
 * @return <code>true</code> iff the word was found
 */
public boolean fuzzyContains(String word, int maxDistance) {
    word = NETagger.tokenizeWithSpaces(word.trim().toLowerCase());
    word = SnowballStemmer.stemAllTokens(word);
    if (maxDistance == 0) return this.words.contains(word);
    else if (this.words.contains(word)) return true;
    Iterator<String> wordIter = this.words.iterator();
    while (wordIter.hasNext())
        if (getLevenshteinDistance(word, wordIter.next(), maxDistance, true, 1, 1) <= maxDistance) return true;
    return false;
}
Example 10: fuzzyContainsToken
import info.ephyra.nlp.SnowballStemmer; // import the package/class this example depends on
/**
 * Does a fuzzy lookup for a token. The specified token t is considered
 * contained in the dictionary if there is a token T in the dictionary such
 * that <code>LevenshteinDistance(t, T) <= maxDistance</code>.
 *
 * @param token the token to look up
 * @param maxDistance the maximum Levenshtein edit distance for fuzzy
 *                    comparison
 * @return <code>true</code> iff a word in the dictionary contains the token
 */
public boolean fuzzyContainsToken(String token, int maxDistance) {
    token = SnowballStemmer.stem(token.trim().toLowerCase());
    if (maxDistance == 0) return this.tokens.contains(token);
    else if (this.tokens.contains(token)) return true;
    Iterator<String> tokenIter = this.tokens.iterator();
    while (tokenIter.hasNext())
        if (getLevenshteinDistance(token, tokenIter.next(), maxDistance, true, 1, 1) <= maxDistance) return true;
    return false;
}
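A hypothetical usage of the HashDictionary methods from Examples 7 to 10; the dictionary file path below is made up:

// hypothetical file path and lookups, for illustration only
HashDictionary dict = new HashDictionary("res/dict/instruments.txt");
boolean exact = dict.contains("guitars");           // normalized and stemmed before the exact lookup
boolean fuzzy = dict.fuzzyContains("guittar", 1);   // also accepts entries within one edit (Levenshtein distance)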
Example 11: normalize
import info.ephyra.nlp.SnowballStemmer; // import the package/class this example depends on
/**
 * Normalizes a string. Similar strings are mapped to equal normalizations.
 *
 * @param s the string
 * @return normalized string
 */
// TODO use noun and verb stemming (also for equals...Norm() methods)
public static String normalize(String s) {
    // convert to lower-case
    s = s.toLowerCase();
    // tokenize
    String tokens[] = NETagger.tokenize(s);
    // stem all tokens
    for (int i = 0; i < tokens.length; i++)
        tokens[i] = SnowballStemmer.stem(tokens[i]);
    return concatWithSpaces(tokens);
}
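A hypothetical usage example, again assuming the enclosing utility class is called StringUtils; the exact output depends on the tokenizer and stemmer models in use:

// hypothetical calls; after lower-casing and stemming, both inputs map to very similar token sequences
String n1 = StringUtils.normalize("The cats were sleeping");
String n2 = StringUtils.normalize("the cat sleeps");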
Example 12: main
import info.ephyra.nlp.SnowballStemmer; // import the package/class this example depends on
public static void main(String[] args) {
    TEST_TERM_DOWMLOD = true;
    MsgPrinter.enableStatusMsgs(true);
    MsgPrinter.enableErrorMsgs(true);

    // create tokenizer
    MsgPrinter.printStatusMsg("Creating tokenizer...");
    if (!OpenNLP.createTokenizer("res/nlp/tokenizer/opennlp/EnglishTok.bin.gz"))
        MsgPrinter.printErrorMsg("Could not create tokenizer.");
    // LingPipe.createTokenizer();

    // // create sentence detector
    // MsgPrinter.printStatusMsg("Creating sentence detector...");
    // if (!OpenNLP.createSentenceDetector("res/nlp/sentencedetector/opennlp/EnglishSD.bin.gz"))
    //     MsgPrinter.printErrorMsg("Could not create sentence detector.");
    // LingPipe.createSentenceDetector();

    // create stemmer
    MsgPrinter.printStatusMsg("Creating stemmer...");
    SnowballStemmer.create();

    // // create part of speech tagger
    // MsgPrinter.printStatusMsg("Creating POS tagger...");
    // if (!OpenNLP.createPosTagger("res/nlp/postagger/opennlp/tag.bin.gz",
    //         "res/nlp/postagger/opennlp/tagdict"))
    //     MsgPrinter.printErrorMsg("Could not create OpenNLP POS tagger.");
    // if (!StanfordPosTagger.init("res/nlp/postagger/stanford/" +
    //         "train-wsj-0-18.holder"))
    //     MsgPrinter.printErrorMsg("Could not create Stanford POS tagger.");

    // // create chunker
    // MsgPrinter.printStatusMsg("Creating chunker...");
    // if (!OpenNLP.createChunker("res/nlp/phrasechunker/opennlp/" +
    //         "EnglishChunk.bin.gz"))
    //     MsgPrinter.printErrorMsg("Could not create chunker.");

    // create named entity taggers
    MsgPrinter.printStatusMsg("Creating NE taggers...");
    NETagger.loadListTaggers("res/nlp/netagger/lists/");
    NETagger.loadRegExTaggers("res/nlp/netagger/patterns.lst");
    MsgPrinter.printStatusMsg(" ...loading models");
    // if (!NETagger.loadNameFinders("res/nlp/netagger/opennlp/"))
    //     MsgPrinter.printErrorMsg("Could not create OpenNLP NE tagger.");
    // if (!StanfordNeTagger.isInitialized() && !StanfordNeTagger.init())
    //     MsgPrinter.printErrorMsg("Could not create Stanford NE tagger.");
    MsgPrinter.printStatusMsg(" ...done");

    WikipediaTermImportanceFilter wtif = new WikipediaTermImportanceFilter(NO_NORMALIZATION, NO_NORMALIZATION, false);
    TRECTarget[] targets = TREC13To16Parser.loadTargets(args[0]);

    for (TRECTarget target : targets) {
        String question = target.getTargetDesc();

        // query generation
        MsgPrinter.printGeneratingQueries();
        String qn = QuestionNormalizer.normalize(question);
        MsgPrinter.printNormalization(qn);  // print normalized question string
        Logger.logNormalization(qn);        // log normalized question string
        String[] kws = KeywordExtractor.getKeywords(qn);
        AnalyzedQuestion aq = new AnalyzedQuestion(question);
        aq.setKeywords(kws);
        aq.setFactoid(false);

        Query[] queries = new BagOfWordsG().generateQueries(aq);
        for (int q = 0; q < queries.length; q++)
            queries[q].setOriginalQueryString(question);

        Result[] results = new Result[1];
        results[0] = new Result("This would be the answer", queries[0]);

        wtif.apply(results);
    }
}
Example 13: getTermCounters
import info.ephyra.nlp.SnowballStemmer; // import the package/class this example depends on
/**
 * @see info.ephyra.answerselection.filters.WebTermImportanceFilter#getTermCounters(java.lang.String[])
 */
@Override
public HashMap<String, TermCounter> getTermCounters(String[] targets) {
    HashMap<String, TermCounter> termCounters = new HashMap<String, TermCounter>();

    for (String target : targets) {
        // get snippets from yahoo
        SearchClient client = new SearchClient(YAHOO_ID);

        // create request
        WebSearchRequest request = new WebSearchRequest(target);
        request.setLanguage("en");  // search for English pages only
        request.setStart(BigInteger.valueOf(0));
        request.setResults(MAX_RESULTS_PERQUERY);

        // perform search
        WebSearchResult[] searchResults = null;
        int retries = 0;
        while (searchResults == null)
            try {
                searchResults = client.webSearch(request).listResults();
            } catch (Exception e) {
                MsgPrinter.printSearchError(e);  // print search error message
                if (retries == RETRIES) {
                    MsgPrinter.printErrorMsg("\nSearch failed.");
                    System.exit(1);
                }
                retries++;
                try {
                    YahooKM.sleep(1000);
                } catch (InterruptedException ie) {}
            }

        // parse yahoo snippets
        int lengthSum = 0;
        for (int i = 0; i < searchResults.length; i++) {
            String summary = searchResults[i].getSummary();
            if (summary != null) {
                // tokenize and tag sentence
                String[] sentence = NETagger.tokenize(summary);
                lengthSum += sentence.length;

                // count stemmed terms in the sentence
                for (int s = 0; s < sentence.length; s++) {
                    String term = SnowballStemmer.stem(sentence[s].toLowerCase());
                    if (term.length() > 1) {
                        if (!termCounters.containsKey(term))
                            termCounters.put(term, new TermCounter());
                        termCounters.get(term).increment();
                    }
                }
            }
        }
    }
    return termCounters;
}
Example 14: main
import info.ephyra.nlp.SnowballStemmer; // import the package/class this example depends on
public static void main(String[] args) {
    TEST_TARGET_GENERATION = true;
    MsgPrinter.enableStatusMsgs(true);
    MsgPrinter.enableErrorMsgs(true);

    // create tokenizer
    MsgPrinter.printStatusMsg("Creating tokenizer...");
    if (!OpenNLP.createTokenizer("res/nlp/tokenizer/opennlp/EnglishTok.bin.gz"))
        MsgPrinter.printErrorMsg("Could not create tokenizer.");
    // LingPipe.createTokenizer();

    // create sentence detector
    // MsgPrinter.printStatusMsg("Creating sentence detector...");
    // if (!OpenNLP.createSentenceDetector("res/nlp/sentencedetector/opennlp/EnglishSD.bin.gz"))
    //     MsgPrinter.printErrorMsg("Could not create sentence detector.");
    // LingPipe.createSentenceDetector();

    // create stemmer
    MsgPrinter.printStatusMsg("Creating stemmer...");
    SnowballStemmer.create();

    // create part of speech tagger
    MsgPrinter.printStatusMsg("Creating POS tagger...");
    if (!OpenNLP.createPosTagger("res/nlp/postagger/opennlp/tag.bin.gz",
            "res/nlp/postagger/opennlp/tagdict"))
        MsgPrinter.printErrorMsg("Could not create OpenNLP POS tagger.");
    // if (!StanfordPosTagger.init("res/nlp/postagger/stanford/" +
    //         "train-wsj-0-18.holder"))
    //     MsgPrinter.printErrorMsg("Could not create Stanford POS tagger.");

    // create chunker
    MsgPrinter.printStatusMsg("Creating chunker...");
    if (!OpenNLP.createChunker("res/nlp/phrasechunker/opennlp/" +
            "EnglishChunk.bin.gz"))
        MsgPrinter.printErrorMsg("Could not create chunker.");

    // create named entity taggers
    MsgPrinter.printStatusMsg("Creating NE taggers...");
    NETagger.loadListTaggers("res/nlp/netagger/lists/");
    NETagger.loadRegExTaggers("res/nlp/netagger/patterns.lst");
    MsgPrinter.printStatusMsg(" ...loading models");
    // if (!NETagger.loadNameFinders("res/nlp/netagger/opennlp/"))
    //     MsgPrinter.printErrorMsg("Could not create OpenNLP NE tagger.");
    if (!StanfordNeTagger.isInitialized() && !StanfordNeTagger.init())
        MsgPrinter.printErrorMsg("Could not create Stanford NE tagger.");
    MsgPrinter.printStatusMsg(" ...done");

    WebTermImportanceFilter wtif = new TargetGeneratorTest(NO_NORMALIZATION);
    TRECTarget[] targets = TREC13To16Parser.loadTargets(args[0]);

    for (TRECTarget target : targets) {
        String question = target.getTargetDesc();

        // query generation
        MsgPrinter.printGeneratingQueries();
        String qn = QuestionNormalizer.normalize(question);
        MsgPrinter.printNormalization(qn);  // print normalized question string
        Logger.logNormalization(qn);        // log normalized question string
        String[] kws = KeywordExtractor.getKeywords(qn);
        AnalyzedQuestion aq = new AnalyzedQuestion(question);
        aq.setKeywords(kws);
        aq.setFactoid(false);

        Query[] queries = new BagOfWordsG().generateQueries(aq);
        for (int q = 0; q < queries.length; q++)
            queries[q].setOriginalQueryString(question);

        Result[] results = new Result[1];
        results[0] = new Result("This would be the answer", queries[0]);

        wtif.apply(results);
    }
}
Example 15: getGoogleTermCounters
import info.ephyra.nlp.SnowballStemmer; // import the package/class this example depends on
private HashMap<String, TermCounter> getGoogleTermCounters(String target) {
    HashMap<String, TermCounter> targetTermCounters = new HashMap<String, TermCounter>();

    // subsequently get the top MAX_RESULTS_TOTAL snippets, MAX_RESULTS_PERQUERY at a time
    for (int startResult = 0; startResult < MAX_RESULTS_TOTAL; startResult += MAX_RESULTS_PERQUERY) {
        // get snippets from google
        GoogleSearch search = new GoogleSearch();
        if (TEST_TARGET_GENERATION) System.out.println("Got search ...");

        // set license key
        search.setKey(GOOGLE_KEY);
        if (TEST_TARGET_GENERATION) System.out.println(" - key is " + GOOGLE_KEY);

        // set search string
        search.setQueryString(target);
        if (TEST_TARGET_GENERATION) System.out.println(" - target is " + target);

        // set language to English only
        search.setLanguageRestricts("English");
        if (TEST_TARGET_GENERATION) System.out.println(" - language set");

        // set hit position of first search result
        search.setStartResult(startResult);
        if (TEST_TARGET_GENERATION) System.out.println(" - start result set to " + startResult);

        // set maximum number of search results
        search.setMaxResults(MAX_RESULTS_PERQUERY);
        if (TEST_TARGET_GENERATION) System.out.println(" - max results set");

        // perform search
        GoogleSearchResult googleResult = null;
        int retries = 0;
        while (googleResult == null)
            try {
                googleResult = search.doSearch();
            } catch (GoogleSearchFault e) {
                MsgPrinter.printSearchError(e);  // print search error message
                if (retries == RETRIES) {
                    MsgPrinter.printErrorMsg("\nSearch failed.");
                    //System.exit(1);
                    return targetTermCounters;
                }
                retries++;
                try {
                    GoogleKM.sleep(1000);
                } catch (InterruptedException ie) {}
            }

        // get snippets
        GoogleSearchResultElement[] elements = googleResult.getResultElements();
        if (TEST_TARGET_GENERATION) System.out.println(" - got results: " + elements.length);

        // parse google snippets
        int lengthSum = 0;
        for (int i = 0; i < elements.length; i++) {
            String plain = elements[i].getSnippet().replaceAll("\\<[^\\>]++\\>", " ");
            plain = plain.replaceAll("\\&\\#39\\;", "'");
            if (TEST_TARGET_GENERATION) System.out.println(" - plain: " + plain);

            // tokenize and tag sentence
            String[] sentence = NETagger.tokenize(plain);
            lengthSum += sentence.length;

            // count stemmed terms in the sentence
            for (int s = 0; s < sentence.length; s++) {
                String term = SnowballStemmer.stem(sentence[s].toLowerCase());
                if (term.length() > 1) {
                    if (!targetTermCounters.containsKey(term))
                        targetTermCounters.put(term, new TermCounter());
                    targetTermCounters.get(term).increment();
                }
            }
        }
    }
    return targetTermCounters;
}