This article collects typical usage examples of the Java method org.apache.lucene.analysis.Tokenizer.setReader. If you are wondering what Tokenizer.setReader does exactly, or how to use it in practice, the curated method examples below may help. You can also explore further usage examples of the enclosing class org.apache.lucene.analysis.Tokenizer.
Below are 15 code examples of Tokenizer.setReader, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
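Before the examples, here is a minimal self-contained sketch of the setReader lifecycle (the input strings and the class name SetReaderSketch are hypothetical, not taken from any example below): setReader must be called before reset(), the stream is then consumed with incrementToken(), and after end()/close() the same Tokenizer instance can be reused by calling setReader again.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SetReaderSketch {
    public static void main(String[] args) throws IOException {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);

        // First pass: setReader supplies the input, reset() prepares the stream.
        tokenizer.setReader(new StringReader("first input text"));
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString());
        }
        tokenizer.end();
        tokenizer.close();

        // Reuse: after close(), the tokenizer accepts a new reader via setReader().
        tokenizer.setReader(new StringReader("second input text"));
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString());
        }
        tokenizer.end();
        tokenizer.close();
    }
}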
Example 1: testBasicUsage
import org.apache.lucene.analysis.Tokenizer; // import the package/class that the method depends on
@Test
public void testBasicUsage() throws IOException {
    String query = "그리고 이것은 예시, 또는 예로써, 한국어를 처리하기 입니다";
    String[] expectedCharTerms = new String[]{"예시", "예", "한국어", "처리", "하다", "이다"};
    String[] expectedTypes = new String[]{"Noun", "Modifier", "Noun", "Noun", "Verb", "Adjective"};
    int[] expectedStartOffsets = new int[]{8, 15, 20, 25, 27, 30};
    int[] expectedEndOffsets = new int[]{10, 16, 23, 27, 29, 33};

    // setReader supplies the input before the filter chain is built and consumed.
    Tokenizer tokenizer = new OpenKoreanTextTokenizer();
    tokenizer.setReader(new StringReader(query));

    OpenKoreanTextTokenFilter tokenFilter = new OpenKoreanTextStemmer(tokenizer);
    tokenFilter = new OpenKoreanTextRedundantFilter(tokenFilter);

    TokenStreamAssertions.assertTokenStream(tokenFilter, expectedCharTerms, expectedTypes, expectedStartOffsets, expectedEndOffsets);
}
Author: open-korean-text, Project: elasticsearch-analysis-openkoreantext, Lines of code: 17, Source file: OpenKoreanTextRedundantFilterTest.java
Example 2: testMultiTerms
import org.apache.lucene.analysis.Tokenizer; // import the package/class that the method depends on
public void testMultiTerms() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
            .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"PowerShot", "PowerShot", "Power", "Shot", "50042", "500-42", "500", "42",
            "wifi", "wi-fi", "wi", "fi", "wifi4000", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j2se", "j", "2", "se",
            "ONeil", "O'Neil's", "O", "Neil" };
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    int[] expectedIncr = new int[]{1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1};
    int[] expectedPosLen = new int[]{2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3, 1, 1, 1, 2, 2, 1, 1};
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
            expectedIncr, expectedPosLen, null);
}
Example 3: testPreTokenization
import org.apache.lucene.analysis.Tokenizer; // import the package/class that the method depends on
public void testPreTokenization() throws IOException {
    // Make sure that pretokenization works well and that it can be used even with token chars which are supplementary characters
    final Index index = new Index("test", "_na_");
    final String name = "ngr";
    final Settings indexSettings = newAnalysisSettingsBuilder().build();
    Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit").build();
    Tokenizer tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader("Åbc déf g\uD801\uDC00f "));
    assertTokenStreamContents(tokenizer,
            new String[] {"Åb", "Åbc", "bc", "dé", "déf", "éf", "g\uD801\uDC00", "g\uD801\uDC00f", "\uD801\uDC00f"});
    settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit,punctuation,whitespace,symbol").build();
    tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader(" a!$ 9"));
    assertTokenStreamContents(tokenizer,
            new String[] {" a", " a!", "a!", "a!$", "!$", "!$ ", "$ ", "$ 9", " 9"});
}
Example 4: testPreTokenizationEdge
import org.apache.lucene.analysis.Tokenizer; // import the package/class that the method depends on
public void testPreTokenizationEdge() throws IOException {
    // Make sure that pretokenization works well and that it can be used even with token chars which are supplementary characters
    final Index index = new Index("test", "_na_");
    final String name = "ngr";
    final Settings indexSettings = newAnalysisSettingsBuilder().build();
    Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit").build();
    Tokenizer tokenizer = new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader("Åbc déf g\uD801\uDC00f "));
    assertTokenStreamContents(tokenizer,
            new String[] {"Åb", "Åbc", "dé", "déf", "g\uD801\uDC00", "g\uD801\uDC00f"});
    settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit,punctuation,whitespace,symbol").build();
    tokenizer = new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader(" a!$ 9"));
    assertTokenStreamContents(tokenizer,
            new String[] {" a", " a!"});
}
Example 5: testInverseMapping
import org.apache.lucene.analysis.Tokenizer; // import the package/class that the method depends on
public void testInverseMapping() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("shingle_inverse");
    assertThat(tokenFilter, instanceOf(ShingleTokenFilterFactory.class));
    String source = "the quick brown fox";
    String[] expected = new String[]{"the_quick_brown", "quick_brown_fox"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Example 6: testMetaphoneWords
import org.apache.lucene.analysis.Tokenizer; // import the package/class that the method depends on
@Test
public void testMetaphoneWords() throws Exception {
    Index index = new Index("test", "_na_");
    Settings settings = Settings.builder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("index.analysis.filter.myStemmer.type", "br_metaphone")
            .build();
    AnalysisService analysisService = createAnalysisService(index, settings, new AnalysisMetaphonePlugin());
    TokenFilterFactory filterFactory = analysisService.tokenFilter("br_metaphone");
    Tokenizer tokenizer = new KeywordTokenizer();
    Map<String,String> words = buildWordList();
    Set<String> inputWords = words.keySet();
    for (String word : inputWords) {
        // The same tokenizer instance is reused: setReader() is called for each word,
        // and the resulting stream is reset and closed inside the loop.
        tokenizer.setReader(new StringReader(word));
        TokenStream ts = filterFactory.create(tokenizer);
        CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        assertThat(ts.incrementToken(), equalTo(true));
        assertThat(term1.toString(), equalTo(words.get(word)));
        ts.close();
    }
}
Author: anaelcarvalho, Project: elasticsearch-analysis-metaphone_ptBR, Lines of code: 29, Source file: MetaphoneTokenFilterTests.java
Example 7: createStackedTokenStream
import org.apache.lucene.analysis.Tokenizer; // import the package/class that the method depends on
private static TokenStream createStackedTokenStream(String source, CharFilterFactory[] charFilterFactories, TokenizerFactory tokenizerFactory, TokenFilterFactory[] tokenFilterFactories, int current) {
    // Apply the char filters to the raw reader first.
    Reader reader = new FastStringReader(source);
    for (CharFilterFactory charFilterFactory : charFilterFactories) {
        reader = charFilterFactory.create(reader);
    }
    // The tokenizer consumes the (possibly wrapped) reader via setReader().
    Tokenizer tokenizer = tokenizerFactory.create();
    tokenizer.setReader(reader);
    // Stack the first 'current' token filters on top of the tokenizer.
    TokenStream tokenStream = tokenizer;
    for (int i = 0; i < current; i++) {
        tokenStream = tokenFilterFactories[i].create(tokenStream);
    }
    return tokenStream;
}
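As a caller-side sketch (not part of the original source; the method name printTokens and the way the factory arrays are built are assumptions), the stream returned by a helper like createStackedTokenStream still has to be reset, consumed, and closed by whoever uses it:

// Hypothetical caller of createStackedTokenStream; factory arrays are assumed to be
// constructed elsewhere, in the same way the surrounding analysis code builds them.
private static void printTokens(String source, CharFilterFactory[] charFilterFactories,
        TokenizerFactory tokenizerFactory, TokenFilterFactory[] tokenFilterFactories) throws IOException {
    TokenStream stream = createStackedTokenStream(source, charFilterFactories,
            tokenizerFactory, tokenFilterFactories, tokenFilterFactories.length);
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();                     // mandatory before the first incrementToken()
    while (stream.incrementToken()) {
        System.out.println(term.toString());
    }
    stream.end();
    stream.close();
}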
Example 8: testStemEnglishPossessive
import org.apache.lucene.analysis.Tokenizer; // import the package/class that the method depends on
public void testStemEnglishPossessive() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2",
            "se", "O", "Neil", "s"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Example 9: testBaseFormFilterFactory
import org.apache.lucene.analysis.Tokenizer; // import the package/class that the method depends on
public void testBaseFormFilterFactory() throws IOException {
    TestAnalysis analysis = createTestAnalysis();
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("kuromoji_pos");
    assertThat(tokenFilter, instanceOf(KuromojiPartOfSpeechFilterFactory.class));
    String source = "私は制限スピードを超える。";
    String[] expected = new String[]{"私", "は", "制限", "スピード", "を"};
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
Example 10: testHanUnigramOnly
import org.apache.lucene.analysis.Tokenizer; // import the package/class that the method depends on
public void testHanUnigramOnly() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_han_unigram_only");
    String source = "多くの学生が試験に落ちた。";
    String[] expected = new String[]{"多", "く", "の", "学", "学生", "生", "が", "試", "試験", "験", "に", "落", "ち", "た" };
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Example 11: testDefault
import org.apache.lucene.analysis.Tokenizer; // import the package/class that the method depends on
public void testDefault() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ascii_folding");
    String source = "Ansprüche";
    String[] expected = new String[]{"Anspruche"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Example 12: testCatenateNumbers
import org.apache.lucene.analysis.Tokenizer; // import the package/class that the method depends on
public void testCatenateNumbers() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
            .put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2",
            "se", "O", "Neil"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Example 13: testNoFlags
import org.apache.lucene.analysis.Tokenizer; // import the package/class that the method depends on
public void testNoFlags() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_no_flags");
    String source = "多くの学生が試験に落ちた。";
    String[] expected = new String[]{"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" };
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Example 14: testCaseInsensitiveMapping
import org.apache.lucene.analysis.Tokenizer; // import the package/class that the method depends on
public void testCaseInsensitiveMapping() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_keep_filter");
    assertThat(tokenFilter, instanceOf(KeepWordFilterFactory.class));
    String source = "hello small world";
    String[] expected = new String[]{"hello", "world"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{1, 2});
}
Example 15: testCaseSensitiveMapping
import org.apache.lucene.analysis.Tokenizer; // import the package/class that the method depends on
public void testCaseSensitiveMapping() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_case_sensitive_keep_filter");
    assertThat(tokenFilter, instanceOf(KeepWordFilterFactory.class));
    String source = "Hello small world";
    String[] expected = new String[]{"Hello"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{1});
}