

Java Tokenizer.setReader Method Code Examples

This article collects typical usage examples of the Java method org.apache.lucene.analysis.Tokenizer.setReader. If you are wondering what Tokenizer.setReader does, how to call it, or what working examples look like, the curated code samples below may help. You can also explore further usage examples of org.apache.lucene.analysis.Tokenizer, the class this method belongs to.


The sections below show 15 code examples of the Tokenizer.setReader method, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Java code examples.
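Before the examples, here is a minimal sketch of the typical Tokenizer.setReader workflow; the WhitespaceTokenizer and the sample text are chosen purely for illustration and are not taken from any of the projects below. The pattern is: attach a Reader with setReader, then reset the stream, consume its tokens, and end and close it.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SetReaderSketch {
    public static void main(String[] args) throws IOException {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        // setReader attaches the input source; it must be called before reset().
        tokenizer.setReader(new StringReader("hello token stream"));

        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();                   // prepare the stream for consumption
        while (tokenizer.incrementToken()) { // advance token by token
            System.out.println(term.toString());
        }
        tokenizer.end();   // record the final offset state
        tokenizer.close(); // release the Reader; setReader can then be called again with new input
    }
}

This setReader/reset/incrementToken sequence is roughly what assertion helpers such as assertTokenStreamContents in the examples below perform internally.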

Example 1: testBasicUsage

import org.apache.lucene.analysis.Tokenizer; // import the package/class the method depends on
@Test
public void testBasicUsage() throws IOException {
    String query = "그리고 이것은 예시, 또는 예로써, 한국어를 처리하기 입니다";
    String[] expectedCharTerms = new String[]{"예시", "예", "한국어", "처리", "하다", "이다"};
    String[] expectedTypes = new String[]{"Noun", "Modifier", "Noun", "Noun", "Verb", "Adjective"};
    int[] expectedStartOffsets = new int[]{8, 15, 20, 25, 27, 30};
    int[] expectedEndOffsets = new int[]{10, 16, 23, 27, 29, 33};

    Tokenizer tokenizer = new OpenKoreanTextTokenizer();
    tokenizer.setReader(new StringReader(query));

    OpenKoreanTextTokenFilter tokenFilter = new OpenKoreanTextStemmer(tokenizer);
    tokenFilter = new OpenKoreanTextRedundantFilter(tokenFilter);

    TokenStreamAssertions.assertTokenStream(tokenFilter, expectedCharTerms, expectedTypes, expectedStartOffsets, expectedEndOffsets);
}
 
Developer ID: open-korean-text, Project: elasticsearch-analysis-openkoreantext, Lines: 17, Source: OpenKoreanTextRedundantFilterTest.java

Example 2: testMultiTerms

import org.apache.lucene.analysis.Tokenizer; // import the package/class the method depends on
public void testMultiTerms() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .put("index.analysis.filter.my_word_delimiter.type", type)
        .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
        .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
        .build());

    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"PowerShot", "PowerShot", "Power", "Shot", "50042", "500-42", "500", "42",
        "wifi", "wi-fi", "wi", "fi", "wifi4000", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j2se", "j", "2", "se",
        "ONeil", "O'Neil's", "O", "Neil" };
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    int[] expectedIncr = new int[]{1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1};
    int[] expectedPosLen = new int[]{2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3, 1, 1, 1, 2, 2, 1, 1};
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
            expectedIncr, expectedPosLen, null);
}
 
Developer ID: justor, Project: elasticsearch_my, Lines: 21, Source: WordDelimiterGraphTokenFilterFactoryTests.java

Example 3: testPreTokenization

import org.apache.lucene.analysis.Tokenizer; // import the package/class the method depends on
public void testPreTokenization() throws IOException {
    // Make sure that pretokenization works well and that it can be used even with token chars which are supplementary characters
    final Index index = new Index("test", "_na_");
    final String name = "ngr";
    final Settings indexSettings = newAnalysisSettingsBuilder().build();
    Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit").build();
    Tokenizer tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader("Åbc déf g\uD801\uDC00f "));
    assertTokenStreamContents(tokenizer,
            new String[] {"Åb", "Åbc", "bc", "dé", "déf", "éf", "g\uD801\uDC00", "g\uD801\uDC00f", "\uD801\uDC00f"});
    settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit,punctuation,whitespace,symbol").build();
    tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader(" a!$ 9"));
    assertTokenStreamContents(tokenizer,
        new String[] {" a", " a!", "a!", "a!$", "!$", "!$ ", "$ ", "$ 9", " 9"});
}
 
Developer ID: justor, Project: elasticsearch_my, Lines: 17, Source: NGramTokenizerFactoryTests.java

Example 4: testPreTokenizationEdge

import org.apache.lucene.analysis.Tokenizer; // import the package/class the method depends on
public void testPreTokenizationEdge() throws IOException {
    // Make sure that pretokenization works well and that it can be used even with token chars which are supplementary characters
    final Index index = new Index("test", "_na_");
    final String name = "ngr";
    final Settings indexSettings = newAnalysisSettingsBuilder().build();
    Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit").build();
    Tokenizer tokenizer = new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader("Åbc déf g\uD801\uDC00f "));
    assertTokenStreamContents(tokenizer,
            new String[] {"Åb", "Åbc", "dé", "déf", "g\uD801\uDC00", "g\uD801\uDC00f"});
    settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit,punctuation,whitespace,symbol").build();
    tokenizer = new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader(" a!$ 9"));
    assertTokenStreamContents(tokenizer,
            new String[] {" a", " a!"});
}
 
Developer ID: justor, Project: elasticsearch_my, Lines: 17, Source: NGramTokenizerFactoryTests.java

Example 5: testInverseMapping

import org.apache.lucene.analysis.Tokenizer; // import the package/class the method depends on
public void testInverseMapping() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("shingle_inverse");
    assertThat(tokenFilter, instanceOf(ShingleTokenFilterFactory.class));
    String source = "the quick brown fox";
    String[] expected = new String[]{"the_quick_brown", "quick_brown_fox"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Developer ID: justor, Project: elasticsearch_my, Lines: 11, Source: ShingleTokenFilterFactoryTests.java

Example 6: testMetaphoneWords

import org.apache.lucene.analysis.Tokenizer; // import the package/class the method depends on
@Test
public void testMetaphoneWords() throws Exception {
    Index index = new Index("test", "_na_");
    Settings settings = Settings.builder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("index.analysis.filter.myStemmer.type", "br_metaphone")
            .build();

    AnalysisService analysisService = createAnalysisService(index, settings, new AnalysisMetaphonePlugin());

    TokenFilterFactory filterFactory = analysisService.tokenFilter("br_metaphone");

    Tokenizer tokenizer = new KeywordTokenizer();
    
    Map<String,String> words = buildWordList();
    
    Set<String> inputWords = words.keySet();
    for(String word : inputWords) {
        tokenizer.setReader(new StringReader(word));
        TokenStream ts = filterFactory.create(tokenizer);

        CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        assertThat(ts.incrementToken(), equalTo(true));
        assertThat(term1.toString(), equalTo(words.get(word)));
        ts.close();
    }
}
 
Developer ID: anaelcarvalho, Project: elasticsearch-analysis-metaphone_ptBR, Lines: 29, Source: MetaphoneTokenFilterTests.java

Example 7: createStackedTokenStream

import org.apache.lucene.analysis.Tokenizer; // import the package/class the method depends on
private static TokenStream createStackedTokenStream(String source, CharFilterFactory[] charFilterFactories, TokenizerFactory tokenizerFactory, TokenFilterFactory[] tokenFilterFactories, int current) {
    Reader reader = new FastStringReader(source);
    for (CharFilterFactory charFilterFactory : charFilterFactories) {
        reader = charFilterFactory.create(reader);
    }
    Tokenizer tokenizer = tokenizerFactory.create();
    tokenizer.setReader(reader);
    TokenStream tokenStream = tokenizer;
    for (int i = 0; i < current; i++) {
        tokenStream = tokenFilterFactories[i].create(tokenStream);
    }
    return tokenStream;
}
 
Developer ID: justor, Project: elasticsearch_my, Lines: 14, Source: TransportAnalyzeAction.java

Example 8: testStemEnglishPossessive

import org.apache.lucene.analysis.Tokenizer; // import the package/class the method depends on
public void testStemEnglishPossessive() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2",
        "se", "O", "Neil", "s"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Developer ID: justor, Project: elasticsearch_my, Lines: 15, Source: BaseWordDelimiterTokenFilterFactoryTestCase.java

Example 9: testBaseFormFilterFactory

import org.apache.lucene.analysis.Tokenizer; // import the package/class the method depends on
public void testBaseFormFilterFactory() throws IOException {
    TestAnalysis analysis = createTestAnalysis();
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("kuromoji_pos");
    assertThat(tokenFilter, instanceOf(KuromojiPartOfSpeechFilterFactory.class));
    String source = "私は制限スピードを超える。";
    String[] expected = new String[]{"私", "は", "制限", "スピード", "を"};
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
 
Developer ID: justor, Project: elasticsearch_my, Lines: 11, Source: KuromojiAnalysisTests.java

Example 10: testHanUnigramOnly

import org.apache.lucene.analysis.Tokenizer; // import the package/class the method depends on
public void testHanUnigramOnly() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_han_unigram_only");
    String source = "多くの学生が試験に落ちた。";
    String[] expected = new String[]{"多", "く", "の", "学", "学生", "生", "が", "試", "試験", "験", "に", "落", "ち", "た"};
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Developer ID: justor, Project: elasticsearch_my, Lines: 10, Source: CJKFilterFactoryTests.java

Example 11: testDefault

import org.apache.lucene.analysis.Tokenizer; // import the package/class the method depends on
public void testDefault() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ascii_folding");
    String source = "Ansprüche";
    String[] expected = new String[]{"Anspruche"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Developer ID: justor, Project: elasticsearch_my, Lines: 13, Source: ASCIIFoldingTokenFilterFactoryTests.java

Example 12: testCatenateNumbers

import org.apache.lucene.analysis.Tokenizer; // import the package/class the method depends on
public void testCatenateNumbers() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
            .put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2",
        "se", "O", "Neil"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Developer ID: justor, Project: elasticsearch_my, Lines: 16, Source: BaseWordDelimiterTokenFilterFactoryTestCase.java

Example 13: testNoFlags

import org.apache.lucene.analysis.Tokenizer; // import the package/class the method depends on
public void testNoFlags() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_no_flags");
    String source = "多くの学生が試験に落ちた。";
    String[] expected = new String[]{"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた"};
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Developer ID: justor, Project: elasticsearch_my, Lines: 10, Source: CJKFilterFactoryTests.java

Example 14: testCaseInsensitiveMapping

import org.apache.lucene.analysis.Tokenizer; // import the package/class the method depends on
public void testCaseInsensitiveMapping() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_keep_filter");
    assertThat(tokenFilter, instanceOf(KeepWordFilterFactory.class));
    String source = "hello small world";
    String[] expected = new String[]{"hello", "world"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{1, 2});
}
 
Developer ID: justor, Project: elasticsearch_my, Lines: 11, Source: KeepFilterFactoryTests.java

Example 15: testCaseSensitiveMapping

import org.apache.lucene.analysis.Tokenizer; // import the package/class the method depends on
public void testCaseSensitiveMapping() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_case_sensitive_keep_filter");
    assertThat(tokenFilter, instanceOf(KeepWordFilterFactory.class));
    String source = "Hello small world";
    String[] expected = new String[]{"Hello"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{1});
}
 
Developer ID: justor, Project: elasticsearch_my, Lines: 11, Source: KeepFilterFactoryTests.java


Note: The org.apache.lucene.analysis.Tokenizer.setReader method examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers, and copyright of the source code remains with the original authors. Please consult the corresponding project's license before distributing or using the code; do not republish without permission.