This article collects typical usage examples of the Java class org.apache.lucene.analysis.core.WhitespaceTokenizer. If you are unsure what WhitespaceTokenizer is for or how to use it, the curated class code examples below may help.
The WhitespaceTokenizer class belongs to the org.apache.lucene.analysis.core package. A total of 15 code examples of the WhitespaceTokenizer class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Java code examples.
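Before the examples, here is a minimal sketch of the typical WhitespaceTokenizer workflow (construct the tokenizer, set a reader, then reset/incrementToken/end/close), distilled from the examples below. It assumes Lucene 5.x or later, where the tokenizer is created with the no-arg constructor; the class name WhitespaceTokenizerDemo is only a placeholder.

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class WhitespaceTokenizerDemo {
    public static void main(String[] args) throws Exception {
        // Splits the input on whitespace only; no lower-casing or stop-word removal is applied.
        try (Tokenizer tokenizer = new WhitespaceTokenizer()) {
            tokenizer.setReader(new StringReader("quick brown fox"));
            CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
            tokenizer.reset();
            while (tokenizer.incrementToken()) {
                System.out.println(termAtt.toString()); // prints: quick, brown, fox
            }
            tokenizer.end();
        }
    }
}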
Example 1: main
import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the required package/class
@SuppressWarnings("resource")
public static void main(String[] args) throws Exception {
    final Tokenizer tok = new WhitespaceTokenizer();
    tok.setReader(new StringReader("dark sea green sea green"));
    final SynonymMap.Builder builder = new SynonymMap.Builder(true);
    addSynonym("dark sea green", "color", builder);
    addSynonym("green", "color", builder);
    addSynonym("dark sea", "color", builder);
    addSynonym("sea green", "color", builder);
    final SynonymMap synMap = builder.build();
    final TokenStream ts = new SynonymFilter(tok, synMap, true);
    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = ts.addAttribute(PositionLengthAttribute.class);
    ts.reset();
    int pos = -1;
    while (ts.incrementToken()) {
        pos += posIncrAtt.getPositionIncrement();
        System.out.println("term=" + termAtt + ", pos=" + pos + ", posLen=" + posLengthAtt.getPositionLength());
    }
    ts.end();
    ts.close();
}
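The addSynonym(...) helper referenced above is not part of the extracted snippet. A possible implementation is sketched below, assuming it simply registers the (possibly multi-word) input as a synonym of the output term via SynonymMap.Builder; the original helper may differ.

// Hypothetical reconstruction of the addSynonym(...) helper used in Example 1.
// Multi-word inputs are joined with SynonymMap.WORD_SEPARATOR via SynonymMap.Builder.join,
// and includeOrig=true keeps the original tokens alongside the synonym.
// Requires org.apache.lucene.util.CharsRef and org.apache.lucene.util.CharsRefBuilder.
private static void addSynonym(String input, String output, SynonymMap.Builder builder) {
    CharsRefBuilder inputRef = new CharsRefBuilder();
    SynonymMap.Builder.join(input.split(" "), inputRef);
    builder.add(inputRef.get(), new CharsRef(output), true);
}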
Example 2: testEnglishFilterFactory
import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the required package/class
public void testEnglishFilterFactory() throws IOException {
    int iters = scaledRandomIntBetween(20, 100);
    for (int i = 0; i < iters; i++) {
        Version v = VersionUtils.randomVersion(random());
        Settings settings = Settings.builder()
                .put("index.analysis.filter.my_english.type", "stemmer")
                .put("index.analysis.filter.my_english.language", "english")
                .put("index.analysis.analyzer.my_english.tokenizer", "whitespace")
                .put("index.analysis.analyzer.my_english.filter", "my_english")
                .put(SETTING_VERSION_CREATED, v)
                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
                .build();
        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_english");
        assertThat(tokenFilter, instanceOf(StemmerTokenFilterFactory.class));
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("foo bar"));
        TokenStream create = tokenFilter.create(tokenizer);
        IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
        NamedAnalyzer analyzer = indexAnalyzers.get("my_english");
        assertThat(create, instanceOf(PorterStemFilter.class));
        assertAnalyzesTo(analyzer, "consolingly", new String[]{"consolingli"});
    }
}
Example 3: testDefault
import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the required package/class
public void testDefault() throws IOException {
    int default_hash_count = 1;
    int default_bucket_size = 512;
    int default_hash_set_size = 1;
    Settings settings = Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("min_hash");
    String source = "the quick brown fox";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    // with_rotation is true by default, and hash_set_size is 1, so even though the source doesn't
    // have enough tokens to fill all the buckets, we still expect 512 tokens.
    assertStreamHasNumberOfTokens(tokenFilter.create(tokenizer),
            default_hash_count * default_bucket_size * default_hash_set_size);
}
Example 4: testSettings
import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the required package/class
public void testSettings() throws IOException {
    Settings settings = Settings.builder()
            .put("index.analysis.filter.test_min_hash.type", "min_hash")
            .put("index.analysis.filter.test_min_hash.hash_count", "1")
            .put("index.analysis.filter.test_min_hash.bucket_count", "2")
            .put("index.analysis.filter.test_min_hash.hash_set_size", "1")
            .put("index.analysis.filter.test_min_hash.with_rotation", false)
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("test_min_hash");
    String source = "sushi";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    // despite the fact that bucket_count is 2 and hash_set_size is 1,
    // because with_rotation is false, we only expect 1 token here.
    assertStreamHasNumberOfTokens(tokenFilter.create(tokenizer), 1);
}
Example 5: testCorrectPositionIncrementSetting
import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the required package/class
public void testCorrectPositionIncrementSetting() throws IOException {
    Builder builder = Settings.builder().put("index.analysis.filter.my_stop.type", "stop");
    if (random().nextBoolean()) {
        builder.put("index.analysis.filter.my_stop.version", Version.LATEST);
    } else {
        // don't specify
    }
    builder.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString());
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(builder.build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_stop");
    assertThat(tokenFilter, instanceOf(StopTokenFilterFactory.class));
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("foo bar"));
    TokenStream create = tokenFilter.create(tokenizer);
    assertThat(create, instanceOf(StopFilter.class));
}
Example 6: testMultiTerms
import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the required package/class
public void testMultiTerms() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
            .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"PowerShot", "PowerShot", "Power", "Shot", "50042", "500-42", "500", "42",
            "wifi", "wi-fi", "wi", "fi", "wifi4000", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j2se", "j", "2", "se",
            "ONeil", "O'Neil's", "O", "Neil" };
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    int[] expectedIncr = new int[]{1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1};
    int[] expectedPosLen = new int[]{2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3, 1, 1, 1, 2, 2, 1, 1};
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
            expectedIncr, expectedPosLen, null);
}
Example 7: testPartsAndCatenate
import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the required package/class
/** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */
public void testPartsAndCatenate() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
            .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot";
    int[] expectedIncr = new int[]{1, 0, 1};
    int[] expectedPosLen = new int[]{2, 1, 1};
    String[] expected = new String[]{"PowerShot", "Power", "Shot" };
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
            expectedIncr, expectedPosLen, null);
}
Example 8: countTokensInText
import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the required package/class
private int countTokensInText(String text) {
    // Note: WhitespaceTokenizer(Reader) is the pre-Lucene-5 constructor; with Lucene 5.x+
    // the tokenizer is created with the no-arg constructor and the reader is set via setReader().
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(new StringReader(text));
    // tokenizer.setReader(new StringReader(text));
    int tokens = 0;
    try {
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            ++tokens;
        }
    } catch (Exception e) {
        LOGGER.error("Error while tokenizing text. Returning.", e);
    } finally {
        IOUtils.closeQuietly(tokenizer);
    }
    return tokens;
}
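For reference, a sketch of the same token-counting helper written against the newer (Lucene 5.x+) API is shown below, using the no-arg constructor plus setReader() as in the other examples on this page; the exception handling here is illustrative, not the original author's choice.

// Requires java.io.IOException, java.io.StringReader, and java.io.UncheckedIOException.
private int countTokensInText(String text) {
    int tokens = 0;
    try (WhitespaceTokenizer tokenizer = new WhitespaceTokenizer()) {
        tokenizer.setReader(new StringReader(text));
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            ++tokens;
        }
        tokenizer.end();
    } catch (IOException e) {
        // Handle or log the failure as appropriate for the surrounding application.
        throw new UncheckedIOException(e);
    }
    return tokens;
}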
Example 9: testBasic
import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the required package/class
@Test
public void testBasic() throws Exception {
    final Path dictPath = Files.createTempFile("rsf_", ".txt");
    final long reloadInterval = 500;
    writeFile(dictPath, "aaa");
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer tokenizer = new WhitespaceTokenizer();
            return new TokenStreamComponents(tokenizer, new ReloadableStopFilter(tokenizer, dictPath, true, reloadInterval));
        }
    };
    String input = "aaa bbb";
    assertAnalyzesTo(analyzer, input, new String[] { "bbb" });
    Thread.sleep(1000L);
    writeFile(dictPath, "bbb");
    Thread.sleep(1000L);
    assertAnalyzesTo(analyzer, input, new String[] { "aaa" });
}
Example 10: testBasic
import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the required package/class
@Test
public void testBasic() throws Exception {
    final Path dictPath = Files.createTempFile("rkmf_", ".txt");
    final long reloadInterval = 500;
    writeFile(dictPath, "aaa");
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer tokenizer = new WhitespaceTokenizer();
            return new TokenStreamComponents(tokenizer, new ReloadableKeywordMarkerFilter(tokenizer, dictPath, reloadInterval));
        }
    };
    String input = "aaa bbb";
    assertTokenStreamContents(analyzer.tokenStream("dummy", input), new String[] { "aaa", "bbb" }, new int[] { 0, 4 },
            new int[] { 3, 7 }, null, null, null, input.length(), new boolean[] { true, false }, true);
    Thread.sleep(1000L);
    writeFile(dictPath, "bbb");
    Thread.sleep(1000L);
    assertTokenStreamContents(analyzer.tokenStream("dummy", input), new String[] { "aaa", "bbb" }, new int[] { 0, 4 },
            new int[] { 3, 7 }, null, null, null, input.length(), new boolean[] { false, true }, true);
}
Example 11: testBasic
import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the required package/class
@Test
public void testBasic() throws IOException {
    final Pattern pattern1 = Pattern.compile("平成|昭和");
    final Pattern pattern2 = Pattern.compile("[0-9]+年");
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer tokenizer = new WhitespaceTokenizer();
            return new TokenStreamComponents(tokenizer, new PatternConcatenationFilter(tokenizer, pattern1, pattern2));
        }
    };
    assertAnalyzesTo(analyzer, "平成 10年", //
            new String[] { "平成10年" }, //
            new int[] { 0 }, //
            new int[] { 6 }, //
            new int[] { 1 });
    assertAnalyzesTo(analyzer, "aaa 昭和 56年 bbb", //
            new String[] { "aaa", "昭和56年", "bbb" }, //
            new int[] { 1, 1, 1 });
    assertAnalyzesTo(analyzer, "大正 5年", //
            new String[] { "大正", "5年" }, //
            new int[] { 1, 1 });
}
Example 12: testBasic
import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the required package/class
@Test
public void testBasic() throws IOException {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer tokenizer = new WhitespaceTokenizer();
            return new TokenStreamComponents(tokenizer, new StopTokenPrefixFilter(tokenizer, new String[] { "b", "dd" }, false));
        }
    };
    assertAnalyzesTo(analyzer, "aaa bbb ccc ddd eee", //
            new String[] { "aaa", "ccc", "eee" }, //
            new int[] { 0, 8, 16 }, //
            new int[] { 3, 11, 19 }, //
            new int[] { 1, 2, 2 });
    assertAnalyzesTo(analyzer, "aaa", new String[] { "aaa" });
    assertAnalyzesTo(analyzer, "ddd", new String[0]);
    assertAnalyzesTo(analyzer, "add", new String[] { "add" });
    assertAnalyzesTo(analyzer, "aad", new String[] { "aad" });
    assertAnalyzesTo(analyzer, "dda", new String[0]);
    assertAnalyzesTo(analyzer, "daa", new String[] { "daa" });
}
Example 13: testNone
import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the required package/class
@Test
public void testNone() throws IOException {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer tokenizer = new WhitespaceTokenizer();
            return new TokenStreamComponents(tokenizer, new CharTypeFilter(tokenizer, false, false, false));
        }
    };
    assertAnalyzesTo(analyzer, "aaa 111 あああ aa1 aaあ 11あ", //
            new String[0]);
    String symbolStr = "!\"#$%&'()*+-.,/:;<=>?@[\\]^_`{|}~";
    for (int i = 0; i < symbolStr.length(); i++) {
        String target = symbolStr.substring(i, i + 1);
        assertAnalyzesTo(analyzer, target, new String[0]);
    }
}
Example 14: testBasic
import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the required package/class
@Test
public void testBasic() throws IOException {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer tokenizer = new WhitespaceTokenizer();
            return new TokenStreamComponents(tokenizer, new StopTokenSuffixFilter(tokenizer, new String[] { "b", "dd" }, false));
        }
    };
    assertAnalyzesTo(analyzer, "aaa bbb ccc ddd eee", //
            new String[] { "aaa", "ccc", "eee" }, //
            new int[] { 0, 8, 16 }, //
            new int[] { 3, 11, 19 }, //
            new int[] { 1, 2, 2 });
    assertAnalyzesTo(analyzer, "aaa", new String[] { "aaa" });
    assertAnalyzesTo(analyzer, "ddd", new String[0]);
    assertAnalyzesTo(analyzer, "add", new String[0]);
    assertAnalyzesTo(analyzer, "aad", new String[] { "aad" });
    assertAnalyzesTo(analyzer, "dda", new String[] { "dda" });
    assertAnalyzesTo(analyzer, "daa", new String[] { "daa" });
}
Example 15: testTokenEndingWithWordComponentOfMinimumLength
import org.apache.lucene.analysis.core.WhitespaceTokenizer; // import the required package/class
public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception {
    CharArraySet dict = makeDictionary("ab", "cd", "ef");
    // Note: this MockTokenizer is created but not used below; the filter is built on a separate
    // WhitespaceTokenizer instead.
    Tokenizer tokenizer = new MockTokenizer(new StringReader("abcdef"), MockTokenizer.WHITESPACE, false);
    // WhitespaceTokenizer(Reader) is the pre-Lucene-5 constructor; newer versions use the
    // no-arg constructor plus setReader().
    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
            new WhitespaceTokenizer(new StringReader("abcdef")),
            dict,
            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
    assertTokenStreamContents(tf,
            new String[] { "abcdef", "ab", "cd", "ef" },
            new int[] { 0, 0, 0, 0 },
            new int[] { 6, 6, 6, 6 },
            new int[] { 1, 0, 0, 0 }
    );
}