

Java WhitespaceTokenizer Class Code Examples

This article collects and summarizes typical usage examples of the Java class org.apache.lucene.analysis.core.WhitespaceTokenizer. If you are wondering what WhitespaceTokenizer is for, how to use it, or where to find usage examples, the hand-picked class examples below may help.


The WhitespaceTokenizer class belongs to the org.apache.lucene.analysis.core package. A total of 15 code examples of the class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code samples.
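
Before the collected examples, here is a minimal orientation sketch (not taken from any of the projects below) showing the basic usage pattern of WhitespaceTokenizer on Lucene 5+, where the tokenizer is constructed without arguments and the input is supplied via setReader:

import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class WhitespaceTokenizerBasics {
    public static void main(String[] args) throws Exception {
        Tokenizer tokenizer = new WhitespaceTokenizer();            // no-arg constructor (Lucene 5+)
        tokenizer.setReader(new StringReader("hello   Lucene world"));
        CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);

        tokenizer.reset();                                           // required before consuming the stream
        while (tokenizer.incrementToken()) {
            System.out.println(termAtt.toString());                  // prints hello, Lucene, world in turn
        }
        tokenizer.end();
        tokenizer.close();
    }
}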

Example 1: main

import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the package/class this example depends on
@SuppressWarnings("resource")
public static void main(String[] args) throws Exception {
    final Tokenizer tok = new WhitespaceTokenizer();
    tok.setReader(new StringReader("dark sea green sea green"));

    final SynonymMap.Builder builder = new SynonymMap.Builder(true);
    addSynonym("dark sea green", "color", builder);
    addSynonym("green", "color", builder);
    addSynonym("dark sea", "color", builder);
    addSynonym("sea green", "color", builder);
    final SynonymMap synMap = builder.build();
    final TokenStream ts = new SynonymFilter(tok, synMap, true);

    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = ts.addAttribute(PositionLengthAttribute.class);

    ts.reset();
    int pos = -1;
    while (ts.incrementToken()) {
        pos += posIncrAtt.getPositionIncrement();
        System.out.println("term=" + termAtt + ", pos=" + pos + ", posLen=" + posLengthAtt.getPositionLength());
    }
    ts.end();
    ts.close();
}
 
Author: shaie, Project: lucenelab, Lines of code: 27, Source: SynonymFilterExample.java
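
Example 1 calls an addSynonym helper whose body is not part of the snippet (it lives elsewhere in SynonymFilterExample.java). As a hedged sketch only, such a helper could register a possibly multi-word input phrase against an output term roughly as follows; the original implementation may differ:

import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.CharsRefBuilder;

// Hypothetical reconstruction of the helper used in example 1; not the original code.
private static void addSynonym(String input, String output, SynonymMap.Builder builder) {
    CharsRefBuilder inputRef = new CharsRefBuilder();
    SynonymMap.Builder.join(input.split(" "), inputRef);    // join a multi-word input into one map entry
    CharsRefBuilder outputRef = new CharsRefBuilder();
    SynonymMap.Builder.join(output.split(" "), outputRef);
    builder.add(inputRef.get(), outputRef.get(), true);     // true = keep the original tokens as well
}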

Example 2: testEnglishFilterFactory

import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the package/class this example depends on
public void testEnglishFilterFactory() throws IOException {
    int iters = scaledRandomIntBetween(20, 100);
    for (int i = 0; i < iters; i++) {
        Version v = VersionUtils.randomVersion(random());
        Settings settings = Settings.builder()
                .put("index.analysis.filter.my_english.type", "stemmer")
                .put("index.analysis.filter.my_english.language", "english")
                .put("index.analysis.analyzer.my_english.tokenizer","whitespace")
                .put("index.analysis.analyzer.my_english.filter","my_english")
                .put(SETTING_VERSION_CREATED,v)
                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
                .build();

        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_english");
        assertThat(tokenFilter, instanceOf(StemmerTokenFilterFactory.class));
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("foo bar"));
        TokenStream create = tokenFilter.create(tokenizer);
        IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
        NamedAnalyzer analyzer = indexAnalyzers.get("my_english");
        assertThat(create, instanceOf(PorterStemFilter.class));
        assertAnalyzesTo(analyzer, "consolingly", new String[]{"consolingli"});
    }

}
 
Author: justor, Project: elasticsearch_my, Lines of code: 27, Source: StemmerTokenFilterFactoryTests.java

Example 3: testDefault

import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the package/class this example depends on
public void testDefault() throws IOException {
    int default_hash_count = 1;
    int default_bucket_size = 512;
    int default_hash_set_size = 1;
    Settings settings = Settings.builder()
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .build();
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("min_hash");
    String source = "the quick brown fox";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));

    // with_rotation is true by default, and hash_set_size is 1, so even though the source doesn't
    // have enough tokens to fill all the buckets, we still expect 512 tokens.
    assertStreamHasNumberOfTokens(tokenFilter.create(tokenizer),
        default_hash_count * default_bucket_size * default_hash_set_size);
}
 
Author: justor, Project: elasticsearch_my, Lines of code: 19, Source: MinHashFilterFactoryTests.java

Example 4: testSettings

import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the package/class this example depends on
public void testSettings() throws IOException {
    Settings settings = Settings.builder()
        .put("index.analysis.filter.test_min_hash.type", "min_hash")
        .put("index.analysis.filter.test_min_hash.hash_count", "1")
        .put("index.analysis.filter.test_min_hash.bucket_count", "2")
        .put("index.analysis.filter.test_min_hash.hash_set_size", "1")
        .put("index.analysis.filter.test_min_hash.with_rotation", false)
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .build();
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("test_min_hash");
    String source = "sushi";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));

    // despite the fact that bucket_count is 2 and hash_set_size is 1,
    // because with_rotation is false, we only expect 1 token here.
    assertStreamHasNumberOfTokens(tokenFilter.create(tokenizer), 1);
}
 
Author: justor, Project: elasticsearch_my, Lines of code: 20, Source: MinHashFilterFactoryTests.java

Example 5: testCorrectPositionIncrementSetting

import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the package/class this example depends on
public void testCorrectPositionIncrementSetting() throws IOException {
    Builder builder = Settings.builder().put("index.analysis.filter.my_stop.type", "stop");
    if (random().nextBoolean()) {
        builder.put("index.analysis.filter.my_stop.version", Version.LATEST);
    } else {
        // don't specify
    }
    builder.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString());
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(builder.build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_stop");
    assertThat(tokenFilter, instanceOf(StopTokenFilterFactory.class));
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("foo bar"));
    TokenStream create = tokenFilter.create(tokenizer);
    assertThat(create, instanceOf(StopFilter.class));
}
 
Author: justor, Project: elasticsearch_my, Lines of code: 17, Source: StopTokenFilterTests.java

Example 6: testMultiTerms

import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the package/class this example depends on
public void testMultiTerms() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .put("index.analysis.filter.my_word_delimiter.type", type)
        .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
        .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
        .build());

    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"PowerShot", "PowerShot", "Power", "Shot", "50042", "500-42", "500", "42",
        "wifi", "wi-fi", "wi", "fi", "wifi4000", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j2se", "j", "2", "se",
        "ONeil", "O'Neil's", "O", "Neil" };
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    int[] expectedIncr = new int[]{1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1};
    int[] expectedPosLen = new int[]{2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3, 1, 1, 1, 2, 2, 1, 1};
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
            expectedIncr, expectedPosLen, null);
}
 
Author: justor, Project: elasticsearch_my, Lines of code: 21, Source: WordDelimiterGraphTokenFilterFactoryTests.java

Example 7: testPartsAndCatenate

import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the package/class this example depends on
/** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */
public void testPartsAndCatenate() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .put("index.analysis.filter.my_word_delimiter.type", type)
        .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
        .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
        .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot";
    int[] expectedIncr = new int[]{1, 0, 1};
    int[] expectedPosLen = new int[]{2, 1, 1};
    String[] expected = new String[]{"PowerShot", "Power", "Shot" };
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
        expectedIncr, expectedPosLen, null);
}
 
Author: justor, Project: elasticsearch_my, Lines of code: 19, Source: WordDelimiterGraphTokenFilterFactoryTests.java

Example 8: countTokensInText

import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the package/class this example depends on
private int countTokensInText(String text) {
	WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(new StringReader(text));
	// tokenizer.setReader(new StringReader(text));
	int tokens = 0;
	try {
		tokenizer.reset();
		while (tokenizer.incrementToken()) {
			++tokens;
		}
	} catch (Exception e) {
		LOGGER.error("Error while tokenizing text. Returning.", e);
	} finally {
		IOUtils.closeQuietly(tokenizer);
	}
	return tokens;
}
 
Author: dice-group, Project: BENGAL, Lines of code: 17, Source: DatasetAnalyzer.java
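
Example 8 uses the older Lucene 4.x constructor that takes a Reader directly (note the commented-out setReader call). For reference, a self-contained equivalent for Lucene 5+, where the Reader must be supplied via setReader, might look like this sketch:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.core.WhitespaceTokenizer;

final class TokenCounter {
    // Counts whitespace-separated tokens in the given text (Lucene 5+ API).
    static int countTokensInText(String text) throws IOException {
        int tokens = 0;
        try (WhitespaceTokenizer tokenizer = new WhitespaceTokenizer()) {
            tokenizer.setReader(new StringReader(text));
            tokenizer.reset();
            while (tokenizer.incrementToken()) {
                ++tokens;
            }
            tokenizer.end();
        }
        return tokens;
    }
}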

Example 9: testBasic

import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the package/class this example depends on
@Test
public void testBasic() throws Exception {
    final Path dictPath = Files.createTempFile("rsf_", ".txt");
    final long reloadInterval = 500;
    writeFile(dictPath, "aaa");

    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer tokenizer = new WhitespaceTokenizer();
            return new TokenStreamComponents(tokenizer, new ReloadableStopFilter(tokenizer, dictPath, true, reloadInterval));
        }
    };

    String input = "aaa bbb";
    assertAnalyzesTo(analyzer, input, new String[] { "bbb" });

    Thread.sleep(1000L);
    writeFile(dictPath, "bbb");
    Thread.sleep(1000L);

    assertAnalyzesTo(analyzer, input, new String[] { "aaa" });

}
 
Author: codelibs, Project: analyzers-ja, Lines of code: 25, Source: ReloadableStopFilterTest.java

Example 10: testBasic

import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the package/class this example depends on
@Test
public void testBasic() throws Exception {
    final Path dictPath = Files.createTempFile("rkmf_", ".txt");
    final long reloadInterval = 500;
    writeFile(dictPath, "aaa");

    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer tokenizer = new WhitespaceTokenizer();
            return new TokenStreamComponents(tokenizer, new ReloadableKeywordMarkerFilter(tokenizer, dictPath, reloadInterval));
        }
    };

    String input = "aaa bbb";
    assertTokenStreamContents(analyzer.tokenStream("dummy", input), new String[] { "aaa", "bbb" }, new int[] { 0, 4 },
            new int[] { 3, 7 }, null, null, null, input.length(), new boolean[] { true, false }, true);

    Thread.sleep(1000L);
    writeFile(dictPath, "bbb");
    Thread.sleep(1000L);

    assertTokenStreamContents(analyzer.tokenStream("dummy", input), new String[] { "aaa", "bbb" }, new int[] { 0, 4 },
            new int[] { 3, 7 }, null, null, null, input.length(), new boolean[] { false, true }, true);

}
 
Author: codelibs, Project: analyzers-ja, Lines of code: 27, Source: ReloadableKeywordMarkerFilterTest.java

Example 11: testBasic

import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the package/class this example depends on
@Test
public void testBasic() throws IOException {
    final Pattern pattern1 = Pattern.compile("平成|昭和");
    final Pattern pattern2 = Pattern.compile("[0-9]+年");
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer tokenizer = new WhitespaceTokenizer();
            return new TokenStreamComponents(tokenizer, new PatternConcatenationFilter(tokenizer, pattern1, pattern2));
        }
    };

    assertAnalyzesTo(analyzer, "平成 10年", //
            new String[] { "平成10年" }, //
            new int[] { 0 },//
            new int[] { 6 },//
            new int[] { 1 });
    assertAnalyzesTo(analyzer, "aaa 昭和 56年 bbb", //
            new String[] { "aaa", "昭和56年", "bbb" }, //
            new int[] { 1, 1, 1 });
    assertAnalyzesTo(analyzer, "大正 5年", //
            new String[] { "大正", "5年" }, //
            new int[] { 1, 1 });

}
 
Author: codelibs, Project: analyzers-ja, Lines of code: 26, Source: PatternConcatenationFilterTest.java

Example 12: testBasic

import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the package/class this example depends on
@Test
public void testBasic() throws IOException {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer tokenizer = new WhitespaceTokenizer();
            return new TokenStreamComponents(tokenizer, new StopTokenPrefixFilter(tokenizer, new String[] { "b", "dd" }, false));
        }
    };

    assertAnalyzesTo(analyzer, "aaa bbb ccc ddd eee", //
            new String[] { "aaa", "ccc", "eee" }, //
            new int[] { 0, 8, 16 }, //
            new int[] { 3, 11, 19 }, //
            new int[] { 1, 2, 2 });
    assertAnalyzesTo(analyzer, "aaa", new String[] { "aaa" });
    assertAnalyzesTo(analyzer, "ddd", new String[0]);
    assertAnalyzesTo(analyzer, "add", new String[] { "add" });
    assertAnalyzesTo(analyzer, "aad", new String[] { "aad" });
    assertAnalyzesTo(analyzer, "dda", new String[0]);
    assertAnalyzesTo(analyzer, "daa", new String[] { "daa" });

}
 
Author: codelibs, Project: analyzers-ja, Lines of code: 24, Source: StopTokenPrefixFilterTest.java

Example 13: testNone

import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the package/class this example depends on
@Test
public void testNone() throws IOException {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer tokenizer = new WhitespaceTokenizer();
            return new TokenStreamComponents(tokenizer, new CharTypeFilter(tokenizer, false, false, false));
        }
    };

    assertAnalyzesTo(analyzer, "aaa 111 あああ aa1 aaあ 11あ", //
            new String[0]);

    String symbolStr = "!\"#$%&'()*+-.,/:;<=>?@[\\]^_`{|}~";
    for (int i = 0; i < symbolStr.length(); i++) {
        String target = symbolStr.substring(i, i + 1);
        assertAnalyzesTo(analyzer, target, new String[0]);
    }
}
 
Author: codelibs, Project: analyzers-ja, Lines of code: 20, Source: CharTypeFilterTest.java

Example 14: testBasic

import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the package/class this example depends on
@Test
public void testBasic() throws IOException {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer tokenizer = new WhitespaceTokenizer();
            return new TokenStreamComponents(tokenizer, new StopTokenSuffixFilter(tokenizer, new String[] { "b", "dd" }, false));
        }
    };

    assertAnalyzesTo(analyzer, "aaa bbb ccc ddd eee", //
            new String[] { "aaa", "ccc", "eee" }, //
            new int[] { 0, 8, 16 }, //
            new int[] { 3, 11, 19 }, //
            new int[] { 1, 2, 2 });
    assertAnalyzesTo(analyzer, "aaa", new String[] { "aaa" });
    assertAnalyzesTo(analyzer, "ddd", new String[0]);
    assertAnalyzesTo(analyzer, "add", new String[0]);
    assertAnalyzesTo(analyzer, "aad", new String[] { "aad" });
    assertAnalyzesTo(analyzer, "dda", new String[] { "dda" });
    assertAnalyzesTo(analyzer, "daa", new String[] { "daa" });

}
 
Author: codelibs, Project: analyzers-ja, Lines of code: 24, Source: StopTokenSuffixFilterTest.java

Example 15: testTokenEndingWithWordComponentOfMinimumLength

import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the package/class this example depends on
public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception {
  CharArraySet dict = makeDictionary("ab", "cd", "ef");

  DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
    new WhitespaceTokenizer(new StringReader("abcdef")),
    dict,
    CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
    CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
    CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);

  assertTokenStreamContents(tf,
    new String[] { "abcdef", "ab", "cd", "ef" },
    new int[] { 0, 0, 0, 0},
    new int[] { 6, 6, 6, 6},
    new int[] { 1, 0, 0, 0}
    );
}
 
Author: europeana, Project: search, Lines of code: 22, Source: TestCompoundWordTokenFilter.java


Note: The org.apache.lucene.analysis.core.WhitespaceTokenizer examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets are selected from open-source projects contributed by many developers, and the copyright of the source code remains with the original authors. Please consult the license of the corresponding project before distributing or using the code; do not reproduce this article without permission.