

Java Tokenizer.setReader Method Code Examples

This article collects typical usage examples of the Java method org.apache.lucene.analysis.Tokenizer.setReader. If you are looking for concrete examples of how to call Tokenizer.setReader and what it is used for, the curated code samples below may help. You can also browse further usage examples of the enclosing class, org.apache.lucene.analysis.Tokenizer.


The following presents 15 code examples of the Tokenizer.setReader method, sorted by popularity by default.
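Before the examples, here is a minimal, self-contained sketch of the usual setReader lifecycle: setReader supplies the input, after which the stream is consumed under the standard TokenStream contract of reset → incrementToken → end → close. This sketch is not taken from any of the projects below; it assumes a Lucene version (5.x or later) in which WhitespaceTokenizer has a no-argument constructor, as in the examples that follow.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SetReaderSketch {
    public static void main(String[] args) throws IOException {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        // setReader only supplies the input; nothing is tokenized until reset() is called
        tokenizer.setReader(new StringReader("hello lucene world"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString()); // prints each whitespace-separated token
        }
        tokenizer.end();
        tokenizer.close();
    }
}

To reuse the same Tokenizer for another input, close it first and then call setReader again with a new Reader, as Example 6 below does inside its loop.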

Example 1: testBasicUsage

import org.apache.lucene.analysis.Tokenizer; // import the package/class this method depends on
@Test
public void testBasicUsage() throws IOException {
    String query = "그리고 이것은 예시, 또는 예로써, 한국어를 처리하기 입니다";
    String[] expectedCharTerms = new String[]{"예시", "예", "한국어", "처리", "하다", "이다"};
    String[] expectedTypes = new String[]{"Noun", "Modifier", "Noun", "Noun", "Verb", "Adjective"};
    int[] expectedStartOffsets = new int[]{8, 15, 20, 25, 27, 30};
    int[] expectedEndOffsets = new int[]{10, 16, 23, 27, 29, 33};

    Tokenizer tokenizer = new OpenKoreanTextTokenizer();
    tokenizer.setReader(new StringReader(query));

    OpenKoreanTextTokenFilter tokenFilter = new OpenKoreanTextStemmer(tokenizer);
    tokenFilter = new OpenKoreanTextRedundantFilter(tokenFilter);

    TokenStreamAssertions.assertTokenStream(tokenFilter, expectedCharTerms, expectedTypes, expectedStartOffsets, expectedEndOffsets);
}
 
Developer ID: open-korean-text, Project: elasticsearch-analysis-openkoreantext, Lines of code: 17, Source: OpenKoreanTextRedundantFilterTest.java

Example 2: testMultiTerms

import org.apache.lucene.analysis.Tokenizer; // import the package/class this method depends on
public void testMultiTerms() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .put("index.analysis.filter.my_word_delimiter.type", type)
        .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
        .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
        .build());

    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"PowerShot", "PowerShot", "Power", "Shot", "50042", "500-42", "500", "42",
        "wifi", "wi-fi", "wi", "fi", "wifi4000", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j2se", "j", "2", "se",
        "ONeil", "O'Neil's", "O", "Neil" };
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    int[] expectedIncr = new int[]{1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1};
    int[] expectedPosLen = new int[]{2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3, 1, 1, 1, 2, 2, 1, 1};
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
            expectedIncr, expectedPosLen, null);
}
 
Developer ID: justor, Project: elasticsearch_my, Lines of code: 21, Source: WordDelimiterGraphTokenFilterFactoryTests.java

Example 3: testPreTokenization

import org.apache.lucene.analysis.Tokenizer; // import the package/class this method depends on
public void testPreTokenization() throws IOException {
    // Make sure that pretokenization works well and that it can be used even with token chars which are supplementary characters
    final Index index = new Index("test", "_na_");
    final String name = "ngr";
    final Settings indexSettings = newAnalysisSettingsBuilder().build();
    Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit").build();
    Tokenizer tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader("Åbc déf g\uD801\uDC00f "));
    assertTokenStreamContents(tokenizer,
            new String[] {"Åb", "Åbc", "bc", "dé", "déf", "éf", "g\uD801\uDC00", "g\uD801\uDC00f", "\uD801\uDC00f"});
    settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit,punctuation,whitespace,symbol").build();
    tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader(" a!$ 9"));
    assertTokenStreamContents(tokenizer,
        new String[] {" a", " a!", "a!", "a!$", "!$", "!$ ", "$ ", "$ 9", " 9"});
}
 
Developer ID: justor, Project: elasticsearch_my, Lines of code: 17, Source: NGramTokenizerFactoryTests.java

Example 4: testPreTokenizationEdge

import org.apache.lucene.analysis.Tokenizer; // import the package/class this method depends on
public void testPreTokenizationEdge() throws IOException {
    // Make sure that pretokenization works well and that it can be used even with token chars which are supplementary characters
    final Index index = new Index("test", "_na_");
    final String name = "ngr";
    final Settings indexSettings = newAnalysisSettingsBuilder().build();
    Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit").build();
    Tokenizer tokenizer = new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader("Åbc déf g\uD801\uDC00f "));
    assertTokenStreamContents(tokenizer,
            new String[] {"Åb", "Åbc", "dé", "déf", "g\uD801\uDC00", "g\uD801\uDC00f"});
    settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit,punctuation,whitespace,symbol").build();
    tokenizer = new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader(" a!$ 9"));
    assertTokenStreamContents(tokenizer,
            new String[] {" a", " a!"});
}
 
Developer ID: justor, Project: elasticsearch_my, Lines of code: 17, Source: NGramTokenizerFactoryTests.java

Example 5: testInverseMapping

import org.apache.lucene.analysis.Tokenizer; // import the package/class this method depends on
public void testInverseMapping() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("shingle_inverse");
    assertThat(tokenFilter, instanceOf(ShingleTokenFilterFactory.class));
    String source = "the quick brown fox";
    String[] expected = new String[]{"the_quick_brown", "quick_brown_fox"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Developer ID: justor, Project: elasticsearch_my, Lines of code: 11, Source: ShingleTokenFilterFactoryTests.java

Example 6: testMetaphoneWords

import org.apache.lucene.analysis.Tokenizer; // import the package/class this method depends on
@Test
public void testMetaphoneWords() throws Exception {
    Index index = new Index("test", "_na_");
    Settings settings = Settings.builder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("index.analysis.filter.myStemmer.type", "br_metaphone")
            .build();

    AnalysisService analysisService = createAnalysisService(index, settings, new AnalysisMetaphonePlugin());

    TokenFilterFactory filterFactory = analysisService.tokenFilter("br_metaphone");

    Tokenizer tokenizer = new KeywordTokenizer();
    
    Map<String,String> words = buildWordList();
    
    Set<String> inputWords = words.keySet();
    for(String word : inputWords) {
        tokenizer.setReader(new StringReader(word));
        TokenStream ts = filterFactory.create(tokenizer);

        CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        assertThat(ts.incrementToken(), equalTo(true));
        assertThat(term1.toString(), equalTo(words.get(word)));
        ts.close();
    }
}
 
Developer ID: anaelcarvalho, Project: elasticsearch-analysis-metaphone_ptBR, Lines of code: 29, Source: MetaphoneTokenFilterTests.java

Example 7: createStackedTokenStream

import org.apache.lucene.analysis.Tokenizer; // import the package/class this method depends on
private static TokenStream createStackedTokenStream(String source, CharFilterFactory[] charFilterFactories, TokenizerFactory tokenizerFactory, TokenFilterFactory[] tokenFilterFactories, int current) {
    Reader reader = new FastStringReader(source);
    for (CharFilterFactory charFilterFactory : charFilterFactories) {
        reader = charFilterFactory.create(reader);
    }
    Tokenizer tokenizer = tokenizerFactory.create();
    tokenizer.setReader(reader);
    TokenStream tokenStream = tokenizer;
    for (int i = 0; i < current; i++) {
        tokenStream = tokenFilterFactories[i].create(tokenStream);
    }
    return tokenStream;
}
 
Developer ID: justor, Project: elasticsearch_my, Lines of code: 14, Source: TransportAnalyzeAction.java

Example 8: testStemEnglishPossessive

import org.apache.lucene.analysis.Tokenizer; // import the package/class this method depends on
public void testStemEnglishPossessive() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2",
        "se", "O", "Neil", "s"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Developer ID: justor, Project: elasticsearch_my, Lines of code: 15, Source: BaseWordDelimiterTokenFilterFactoryTestCase.java

Example 9: testBaseFormFilterFactory

import org.apache.lucene.analysis.Tokenizer; // import the package/class this method depends on
public void testBaseFormFilterFactory() throws IOException {
    TestAnalysis analysis = createTestAnalysis();
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("kuromoji_pos");
    assertThat(tokenFilter, instanceOf(KuromojiPartOfSpeechFilterFactory.class));
    String source = "私は制限スピードを超える。";
    String[] expected = new String[]{"私", "は", "制限", "スピード", "を"};
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
 
Developer ID: justor, Project: elasticsearch_my, Lines of code: 11, Source: KuromojiAnalysisTests.java

Example 10: testHanUnigramOnly

import org.apache.lucene.analysis.Tokenizer; // import the package/class this method depends on
public void testHanUnigramOnly() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_han_unigram_only");
    String source = "多くの学生が試験に落ちた。";
    String[] expected = new String[]{"多", "く", "の",  "学", "学生", "生", "が",  "試", "試験", "験", "に",  "落", "ち", "た"  };
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Developer ID: justor, Project: elasticsearch_my, Lines of code: 10, Source: CJKFilterFactoryTests.java

Example 11: testDefault

import org.apache.lucene.analysis.Tokenizer; // import the package/class this method depends on
public void testDefault() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ascii_folding");
    String source = "Ansprüche";
    String[] expected = new String[]{"Anspruche"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Developer ID: justor, Project: elasticsearch_my, Lines of code: 13, Source: ASCIIFoldingTokenFilterFactoryTests.java

Example 12: testCatenateNumbers

import org.apache.lucene.analysis.Tokenizer; // import the package/class this method depends on
public void testCatenateNumbers() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
            .put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2",
        "se", "O", "Neil"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Developer ID: justor, Project: elasticsearch_my, Lines of code: 16, Source: BaseWordDelimiterTokenFilterFactoryTestCase.java

Example 13: testNoFlags

import org.apache.lucene.analysis.Tokenizer; // import the package/class this method depends on
public void testNoFlags() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_no_flags");
    String source = "多くの学生が試験に落ちた。";
    String[] expected = new String[]{"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" };
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Developer ID: justor, Project: elasticsearch_my, Lines of code: 10, Source: CJKFilterFactoryTests.java

Example 14: testCaseInsensitiveMapping

import org.apache.lucene.analysis.Tokenizer; // import the package/class this method depends on
public void testCaseInsensitiveMapping() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_keep_filter");
    assertThat(tokenFilter, instanceOf(KeepWordFilterFactory.class));
    String source = "hello small world";
    String[] expected = new String[]{"hello", "world"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{1, 2});
}
 
Developer ID: justor, Project: elasticsearch_my, Lines of code: 11, Source: KeepFilterFactoryTests.java

Example 15: testCaseSensitiveMapping

import org.apache.lucene.analysis.Tokenizer; // import the package/class this method depends on
public void testCaseSensitiveMapping() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_case_sensitive_keep_filter");
    assertThat(tokenFilter, instanceOf(KeepWordFilterFactory.class));
    String source = "Hello small world";
    String[] expected = new String[]{"Hello"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{1});
}
 
Developer ID: justor, Project: elasticsearch_my, Lines of code: 11, Source: KeepFilterFactoryTests.java


Note: The org.apache.lucene.analysis.Tokenizer.setReader examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are taken from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors. Please consult each project's License before distributing or reusing the code; do not repost without permission.