本文整理汇总了Java中org.apache.lucene.analysis.standard.StandardTokenizer类的典型用法代码示例。如果您正苦于以下问题：Java StandardTokenizer类的具体用法？Java StandardTokenizer怎么用？Java StandardTokenizer使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。
StandardTokenizer类属于org.apache.lucene.analysis.standard包，在下文中一共展示了StandardTokenizer类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: reusableTokenStream
import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Returns a reusable analysis chain reading from {@code reader}.
 *
 * <p>The chain is looked up through {@code getPreviousTokenStream()}. When none has been
 * stored yet, a new one is built (StandardTokenizer, StandardFilter, LowerCaseFilter,
 * StopFilter, ASCIIFoldingFilter) and stored via {@code setPreviousTokenStream}; otherwise
 * the stored tokenizer is reset onto the new reader. In both cases the tokenizer's maximum
 * token length is set to {@code DEFAULT_MAX_TOKEN_LENGTH} before returning.
 *
 * @param fieldName name of the field being analyzed (not used by this implementation)
 * @param reader    source of the text to tokenize
 * @return the end of the filter chain
 * @throws IOException if resetting the stored tokenizer fails
 */
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
  // Holder for both ends of the chain: the tokenizer (to re-point it) and the final filter.
  class SavedStreams {
    StandardTokenizer tokenStream;
    TokenStream filteredTokenStream;
  }

  SavedStreams cached = (SavedStreams) getPreviousTokenStream();
  if (cached != null) {
    // Chain already exists: only the input needs to change.
    cached.tokenStream.reset(reader);
  } else {
    cached = new SavedStreams();
    setPreviousTokenStream(cached);
    cached.tokenStream = new StandardTokenizer(LUCENE_VERSION, reader);
    TokenStream chain = new StandardFilter(cached.tokenStream);
    chain = new LowerCaseFilter(chain);
    chain = new StopFilter(true, chain, STOP_WORDS_SET);
    cached.filteredTokenStream = new ASCIIFoldingFilter(chain);
  }
  cached.tokenStream.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);
  return cached.filteredTokenStream;
}
示例2: testHanOnly
import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Checks CJKBigramFilter when only the HAN script flag is enabled: runs of Han characters
 * are expected as bigrams, a lone Han character as a single term, and hiragana is expected
 * to pass through one character at a time.
 *
 * <p>The arrays passed to {@code assertAnalyzesTo} are, in order: expected terms, start
 * offsets, end offsets, token types, position increments and position lengths. All of them
 * describe the same ten tokens, so each array has exactly ten entries.
 */
public void testHanOnly() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer t = new StandardTokenizer(reader);
      return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN));
    }
  };
  assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
      new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" },
      new int[] { 0, 1, 2, 3, 5, 6, 8, 9, 10, 11 },
      new int[] { 1, 2, 3, 5, 6, 8, 9, 10, 11, 12 },
      // One type per expected term; the former eleventh "<SINGLE>" entry matched no token.
      new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<DOUBLE>", "<HIRAGANA>", "<DOUBLE>",
                     "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>" },
      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
}
示例3: testUnigramsAndBigramsHanOnly
import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Checks CJKBigramFilter with only the HAN script flag enabled and unigram output turned on
 * (third constructor argument {@code true}): each Han bigram is expected together with its
 * two unigrams, while hiragana is expected one character at a time.
 *
 * <p>The arrays passed to {@code assertAnalyzesTo} are, in order: expected terms, start
 * offsets, end offsets, token types, position increments and position lengths. All of them
 * describe the same fourteen tokens, so each array has exactly fourteen entries.
 */
public void testUnigramsAndBigramsHanOnly() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer t = new StandardTokenizer(reader);
      return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN, true));
    }
  };
  assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
      new String[] { "多", "く", "の", "学", "学生", "生", "が", "試", "試験", "験", "に", "落", "ち", "た" },
      new int[] { 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11 },
      new int[] { 1, 2, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10, 11, 12 },
      // One type per expected term; the former fifteenth "<SINGLE>" entry matched no token.
      new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>",
                     "<SINGLE>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", "<SINGLE>",
                     "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>" },
      // A bigram shares the position of its first unigram (increment 0) and spans two positions.
      new int[] { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1 },
      new int[] { 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1 });
}
示例4: testStopPositons
import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Exercises TypeTokenFilter position increments twice over the same input: once through the
 * constructor without the boolean flag, and once with the flag passed as {@code false}.
 *
 * <p>The input is the numbers 10 to 19 separated by spaces, where multiples of three are
 * written out as English words; tokens of type {@code <NUM>} form the stop set.
 */
public void testStopPositons() throws IOException {
  StringBuilder text = new StringBuilder();
  for (int n = 10; n < 20; n++) {
    // Multiples of three become words, everything else stays a numeral.
    String token = (n % 3 == 0) ? English.intToEnglish(n).trim() : String.valueOf(n);
    text.append(token).append(" ");
  }
  final String input = text.toString();
  log(input);

  Set<String> stopSet = asSet(new String[] {"<NUM>"});

  // with increments
  StringReader in = new StringReader(input);
  TypeTokenFilter filter =
      new TypeTokenFilter(Version.LATEST, new StandardTokenizer(in), stopSet);
  testPositons(filter);

  // without increments
  in = new StringReader(input);
  filter = new TypeTokenFilter(
      Version.LUCENE_4_3, false, new StandardTokenizer(TEST_VERSION_CURRENT, in), stopSet);
  testPositons(filter);
}
示例5: stemWords
import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Stems a list of words with a configured stemmer.
 *
 * <p>Each word is tokenized with a StandardTokenizer, passed through the filter returned by
 * {@code Stemming.wrapStemmingFilter}, and every resulting term is collected into a set, so
 * duplicates collapse and the order of the returned array is unspecified.
 *
 * <p>An {@link IOException} while processing a word is logged and the remaining words are
 * still processed.
 *
 * @param words
 *            The list of words to stem.
 * @param stemming
 *            The stemmer to be used.
 * @return The distinct stemmed terms.
 */
public static String[] stemWords(String[] words, Stemming stemming) {
  Set<String> stemmedStopWords = Sets.newHashSet();
  for (String word : words) {
    TokenStream tokenStream = new StandardTokenizer(LUCENE_VERSION, new StringReader(word));
    tokenStream = Stemming.wrapStemmingFilter(tokenStream, stemming);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    try {
      try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
          stemmedStopWords.add(charTermAttribute.toString());
        }
        // Complete the consumer workflow: end() after the last token, then close().
        tokenStream.end();
      } finally {
        // Always release the stream, even when tokenization failed part-way.
        tokenStream.close();
      }
    } catch (IOException e) {
      logger.error("Failed to stem a list of words", e);
    }
  }
  return stemmedStopWords.toArray(new String[0]);
}
示例6: testHanOnly
import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Checks CJKBigramFilter when only the HAN script flag is enabled: runs of Han characters
 * are expected as bigrams, a lone Han character as a single term, and hiragana is expected
 * to pass through one character at a time.
 *
 * <p>The arrays passed to {@code assertAnalyzesTo} are, in order: expected terms, start
 * offsets, end offsets, token types, position increments and position lengths. All of them
 * describe the same ten tokens, so each array has exactly ten entries.
 */
public void testHanOnly() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
      return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN));
    }
  };
  assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
      new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" },
      new int[] { 0, 1, 2, 3, 5, 6, 8, 9, 10, 11 },
      new int[] { 1, 2, 3, 5, 6, 8, 9, 10, 11, 12 },
      // One type per expected term; the former eleventh "<SINGLE>" entry matched no token.
      new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<DOUBLE>", "<HIRAGANA>", "<DOUBLE>",
                     "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>" },
      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
}
示例7: testUnigramsAndBigramsHanOnly
import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Checks CJKBigramFilter with only the HAN script flag enabled and unigram output turned on
 * (third constructor argument {@code true}): each Han bigram is expected together with its
 * two unigrams, while hiragana is expected one character at a time.
 *
 * <p>The arrays passed to {@code assertAnalyzesTo} are, in order: expected terms, start
 * offsets, end offsets, token types, position increments and position lengths. All of them
 * describe the same fourteen tokens, so each array has exactly fourteen entries.
 */
public void testUnigramsAndBigramsHanOnly() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
      return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN, true));
    }
  };
  assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
      new String[] { "多", "く", "の", "学", "学生", "生", "が", "試", "試験", "験", "に", "落", "ち", "た" },
      new int[] { 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11 },
      new int[] { 1, 2, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10, 11, 12 },
      // One type per expected term; the former fifteenth "<SINGLE>" entry matched no token.
      new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>",
                     "<SINGLE>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", "<SINGLE>",
                     "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>" },
      // A bigram shares the position of its first unigram (increment 0) and spans two positions.
      new int[] { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1 },
      new int[] { 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1 });
}
示例8: testStopPositons
import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Exercises TypeTokenFilter position increments with the leading boolean constructor
 * argument set to {@code true} and then to {@code false}.
 *
 * <p>The input is the numbers 10 to 19 separated by spaces, where multiples of three are
 * written out as English words; tokens of type {@code <NUM>} form the stop set.
 */
public void testStopPositons() throws IOException {
  StringBuilder buf = new StringBuilder();
  int i = 10;
  while (i < 20) {
    if (i % 3 == 0) {
      // Spell out multiples of three so they are not typed as numbers.
      buf.append(English.intToEnglish(i).trim());
    } else {
      buf.append(i);
    }
    buf.append(" ");
    i++;
  }
  final String input = buf.toString();
  log(input);

  String[] stopTypes = {"<NUM>"};
  Set<String> stopSet = asSet(stopTypes);

  // with increments
  StringReader source = new StringReader(input);
  StandardTokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, source);
  testPositons(new TypeTokenFilter(true, tokenizer, stopSet));

  // without increments
  source = new StringReader(input);
  tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, source);
  testPositons(new TypeTokenFilter(false, tokenizer, stopSet));
}
示例9: testTwo
import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Runs the "baseform" token filter over a German sentence tokenized by StandardTokenizer.
 *
 * <p>The expected output lists the tokens in pairs: each word of the sentence followed by
 * the second form expected for it (for example "Autos" then "Auto", "die" then "der").
 */
@Test
public void testTwo() throws IOException {
  final String source = "Das sind Autos, die Nudeln transportieren.";
  final String[] expected = new String[] {
      "Das", "Das",
      "sind", "sind",
      "Autos", "Auto",
      "die", "der",
      "Nudeln", "Nudel",
      "transportieren", "transportieren"
  };

  AnalysisService service = createAnalysisService();
  TokenFilterFactory baseform = service.tokenFilter("baseform");
  Tokenizer input = new StandardTokenizer(Version.LATEST, new StringReader(source));
  assertSimpleTSOutput(baseform.create(input), expected);
}
示例10: testStopPositons
import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Exercises TypeTokenFilter position increments twice over the same input: once through the
 * constructor without the boolean flag, and once with the flag passed as {@code false}.
 *
 * <p>The input is the numbers 10 to 19 separated by spaces, where multiples of three are
 * written out as English words; tokens of type {@code <NUM>} form the stop set.
 */
public void testStopPositons() throws IOException {
  StringBuilder words = new StringBuilder();
  for (int value = 10; value < 20; value++) {
    boolean spelledOut = value % 3 == 0;
    if (spelledOut) {
      words.append(English.intToEnglish(value).trim()).append(" ");
    } else {
      words.append(value).append(" ");
    }
  }
  log(words.toString());

  Set<String> stopSet = asSet(new String[] {"<NUM>"});

  // with increments
  StringReader input = new StringReader(words.toString());
  TypeTokenFilter filter = new TypeTokenFilter(
      TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, input), stopSet);
  testPositons(filter);

  // without increments
  input = new StringReader(words.toString());
  filter = new TypeTokenFilter(
      Version.LUCENE_43, false, new StandardTokenizer(TEST_VERSION_CURRENT, input), stopSet);
  testPositons(filter);
}
示例11: testKeepTypes
import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Configures a "keep_types" token filter named "keep_numbers" that keeps the types
 * {@code <NUM>} and {@code <SOMETHINGELSE>}, then checks that only "123" survives from
 * "Hello 123 world".
 */
public void testKeepTypes() throws IOException {
  final String source = "Hello 123 world";
  final String[] expected = {"123"};
  final String[] keptTypes = {"<NUM>", "<SOMETHINGELSE>"};

  Settings settings = Settings.builder()
      .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
      .put("index.analysis.filter.keep_numbers.type", "keep_types")
      .putArray("index.analysis.filter.keep_numbers.types", keptTypes)
      .build();
  ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);

  TokenFilterFactory keepNumbers = analysis.tokenFilter.get("keep_numbers");
  assertThat(keepNumbers, instanceOf(KeepTypesFilterFactory.class));

  Tokenizer standard = new StandardTokenizer();
  standard.setReader(new StringReader(source));
  // NOTE(review): the int[] is presumably the expected position increments - "123" is the
  // second word of the input; confirm against the assertTokenStreamContents overload.
  assertTokenStreamContents(keepNumbers.create(standard), expected, new int[] {2});
}
示例12: testDefault
import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Applies the "cjk_bigram" token filter from the test analysis resource to a Japanese
 * sentence and expects overlapping two-character terms across the whole sentence.
 */
public void testDefault() throws IOException {
  final String source = "多くの学生が試験に落ちた。";
  final String[] expected = {"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" };

  ESTestCase.TestAnalysis analysis =
      AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
  TokenFilterFactory cjkBigram = analysis.tokenFilter.get("cjk_bigram");

  Tokenizer standard = new StandardTokenizer();
  standard.setReader(new StringReader(source));
  assertTokenStreamContents(cjkBigram.create(standard), expected);
}
示例13: testNoFlags
import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Applies the "cjk_no_flags" token filter from the test analysis resource to a Japanese
 * sentence and expects overlapping two-character terms across the whole sentence.
 */
public void testNoFlags() throws IOException {
  ESTestCase.TestAnalysis testAnalysis =
      AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
  TokenFilterFactory factory = testAnalysis.tokenFilter.get("cjk_no_flags");

  String sentence = "多くの学生が試験に落ちた。";
  Tokenizer input = new StandardTokenizer();
  input.setReader(new StringReader(sentence));

  String[] bigrams = {"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" };
  assertTokenStreamContents(factory.create(input), bigrams);
}
示例14: testHanOnly
import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Applies the "cjk_han_only" token filter from the test analysis resource to a Japanese
 * sentence and expects bigrams for the two-character Han runs, with every other character
 * emitted on its own.
 */
public void testHanOnly() throws IOException {
  final String source = "多くの学生が試験に落ちた。";
  final String[] expected = {"多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" };

  ESTestCase.TestAnalysis analysis =
      AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
  TokenFilterFactory hanOnly = analysis.tokenFilter.get("cjk_han_only");

  Tokenizer standard = new StandardTokenizer();
  standard.setReader(new StringReader(source));
  assertTokenStreamContents(hanOnly.create(standard), expected);
}
示例15: testHanUnigramOnly
import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Applies the "cjk_han_unigram_only" token filter from the test analysis resource to a
 * Japanese sentence and expects each two-character Han run as a bigram plus its two
 * unigrams, with every other character emitted on its own.
 */
public void testHanUnigramOnly() throws IOException {
  ESTestCase.TestAnalysis testAnalysis =
      AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
  TokenFilterFactory factory = testAnalysis.tokenFilter.get("cjk_han_unigram_only");

  String sentence = "多くの学生が試験に落ちた。";
  Tokenizer input = new StandardTokenizer();
  input.setReader(new StringReader(sentence));

  String[] terms = {"多", "く", "の", "学", "学生", "生", "が", "試", "試験", "験", "に", "落", "ち", "た" };
  assertTokenStreamContents(factory.create(input), terms);
}