This article collects typical usage examples of the Java class org.apache.lucene.analysis.Tokenizer. If you have been wondering what the Tokenizer class does, how to use it, or what working code looks like, the curated examples below should help.
The Tokenizer class belongs to the org.apache.lucene.analysis package. Fifteen code examples are shown below, sorted by popularity by default.
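Before the individual examples, here is a minimal, self-contained sketch of the consumption pattern that most of the snippets below follow: give a Tokenizer a Reader, then call reset(), loop over incrementToken(), and finish with end() and close(). It uses WhitespaceTokenizer purely for illustration and assumes a Lucene 5+ style API (no-arg tokenizer constructors); it is not taken from any of the examples.
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenizerBasics {
    public static void main(String[] args) throws IOException {
        Tokenizer tokenizer = new WhitespaceTokenizer();            // any concrete Tokenizer works here
        tokenizer.setReader(new StringReader("hello tokenizer world"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();                                          // required before the first incrementToken()
        while (tokenizer.incrementToken()) {                        // advances to the next token
            System.out.println(term.toString());
        }
        tokenizer.end();                                            // records the end-of-stream offset state
        tokenizer.close();                                          // releases the underlying Reader
    }
}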
Example 1: main
import org.apache.lucene.analysis.Tokenizer; // import the required package/class
public static void main(String[] args) throws IOException {
    List<Term> parse = ToAnalysis.parse("中华人民 共和国 成立了 ");
    System.out.println(parse);
    List<Term> parse1 = IndexAnalysis.parse("你吃过饭了没有!!!!!吃过无妨论文");
    //System.out.println(parse1);
    String text11 = "ZW321282050000000325";
    Tokenizer tokenizer = new AnsjTokenizer(new StringReader(text11), 0, true);
    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAtt = tokenizer.addAttribute(PositionIncrementAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        System.out.print(termAtt.toString() + " ");
        // System.out.print(offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "-");
        // System.out.print(positionIncrementAtt.getPositionIncrement() + "/");
    }
    tokenizer.end();   // signal end of stream before closing, per the TokenStream contract
    tokenizer.close();
}
Example 2: testSimple
import org.apache.lucene.analysis.Tokenizer; // import the required package/class
public void testSimple() throws IOException {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(t, new UniqueTokenFilter(t));
        }
    };
    TokenStream test = analyzer.tokenStream("test", "this test with test");
    test.reset();
    CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class);
    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("this"));
    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("test"));
    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("with"));
    assertThat(test.incrementToken(), equalTo(false));
}
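For comparison, the same check can be written in one call with Lucene's test helper, assuming the test class extends BaseTokenStreamTestCase (as the Elasticsearch token-stream test bases do); this condensed variant is an illustration, not part of the original test.
// Condensed, hypothetical variant of the assertions above:
assertAnalyzesTo(analyzer, "this test with test", new String[]{"this", "test", "with"});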
Example 3: testEnglishFilterFactory
import org.apache.lucene.analysis.Tokenizer; // import the required package/class
public void testEnglishFilterFactory() throws IOException {
    int iters = scaledRandomIntBetween(20, 100);
    for (int i = 0; i < iters; i++) {
        Version v = VersionUtils.randomVersion(random());
        Settings settings = Settings.builder()
                .put("index.analysis.filter.my_english.type", "stemmer")
                .put("index.analysis.filter.my_english.language", "english")
                .put("index.analysis.analyzer.my_english.tokenizer", "whitespace")
                .put("index.analysis.analyzer.my_english.filter", "my_english")
                .put(SETTING_VERSION_CREATED, v)
                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
                .build();
        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_english");
        assertThat(tokenFilter, instanceOf(StemmerTokenFilterFactory.class));
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("foo bar"));
        TokenStream create = tokenFilter.create(tokenizer);
        IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
        NamedAnalyzer analyzer = indexAnalyzers.get("my_english");
        assertThat(create, instanceOf(PorterStemFilter.class));
        assertAnalyzesTo(analyzer, "consolingly", new String[]{"consolingli"});
    }
}
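The same stemming behaviour can be reproduced without the Elasticsearch factory layer. The sketch below applies Lucene's PorterStemFilter directly and assumes the usual Lucene test imports; it is an illustration, not part of the original test.
// Minimal sketch: PorterStemFilter over a WhitespaceTokenizer, bypassing the factory.
Tokenizer t = new WhitespaceTokenizer();
t.setReader(new StringReader("consolingly"));
TokenStream stemmed = new PorterStemFilter(t);
assertTokenStreamContents(stemmed, new String[]{"consolingli"});   // same result as the "my_english" analyzer above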
Example 4: testPreTokenization
import org.apache.lucene.analysis.Tokenizer; // import the required package/class
public void testPreTokenization() throws IOException {
    // Make sure that pre-tokenization works well and that it can be used even with token chars that are supplementary characters
    final Index index = new Index("test", "_na_");
    final String name = "ngr";
    final Settings indexSettings = newAnalysisSettingsBuilder().build();
    Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit").build();
    Tokenizer tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader("Åbc déf g\uD801\uDC00f "));
    assertTokenStreamContents(tokenizer,
            new String[] {"Åb", "Åbc", "bc", "dé", "déf", "éf", "g\uD801\uDC00", "g\uD801\uDC00f", "\uD801\uDC00f"});
    settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit,punctuation,whitespace,symbol").build();
    tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader(" a!$ 9"));
    assertTokenStreamContents(tokenizer,
            new String[] {" a", " a!", "a!", "a!$", "!$", "!$ ", "$ ", "$ 9", " 9"});
}
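In plain Lucene, the token_chars setting corresponds roughly to overriding NGramTokenizer.isTokenChar, which is how pre-tokenization is switched on. The fragment below is an approximation of what the factory configures, not the factory's actual code.
// Rough sketch of token_chars = "letter,digit" on a 2-3 gram tokenizer.
Tokenizer ngrams = new NGramTokenizer(2, 3) {
    @Override
    protected boolean isTokenChar(int chr) {
        return Character.isLetter(chr) || Character.isDigit(chr);   // everything else acts as a token boundary
    }
};
ngrams.setReader(new StringReader("Åbc déf g\uD801\uDC00f "));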
Example 5: testPreTokenizationEdge
import org.apache.lucene.analysis.Tokenizer; // import the required package/class
public void testPreTokenizationEdge() throws IOException {
    // Make sure that pre-tokenization works well and that it can be used even with token chars that are supplementary characters
    final Index index = new Index("test", "_na_");
    final String name = "ngr";
    final Settings indexSettings = newAnalysisSettingsBuilder().build();
    Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit").build();
    Tokenizer tokenizer = new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader("Åbc déf g\uD801\uDC00f "));
    assertTokenStreamContents(tokenizer,
            new String[] {"Åb", "Åbc", "dé", "déf", "g\uD801\uDC00", "g\uD801\uDC00f"});
    settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit,punctuation,whitespace,symbol").build();
    tokenizer = new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader(" a!$ 9"));
    assertTokenStreamContents(tokenizer,
            new String[] {" a", " a!"});
}
Example 6: testBackwardsCompatibilityEdgeNgramTokenFilter
import org.apache.lucene.analysis.Tokenizer; // import the required package/class
public void testBackwardsCompatibilityEdgeNgramTokenFilter() throws Exception {
    int iters = scaledRandomIntBetween(20, 100);
    for (int i = 0; i < iters; i++) {
        final Index index = new Index("test", "_na_");
        final String name = "ngr";
        Version v = randomVersion(random());
        Builder builder = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3);
        boolean reverse = random().nextBoolean();
        if (reverse) {
            builder.put("side", "back");
        }
        Settings settings = builder.build();
        Settings indexSettings = newAnalysisSettingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, v.id).build();
        Tokenizer tokenizer = new MockTokenizer();
        tokenizer.setReader(new StringReader("foo bar"));
        TokenStream edgeNGramTokenFilter = new EdgeNGramTokenFilterFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create(tokenizer);
        if (reverse) {
            assertThat(edgeNGramTokenFilter, instanceOf(ReverseStringFilter.class));
        } else {
            assertThat(edgeNGramTokenFilter, instanceOf(EdgeNGramTokenFilter.class));
        }
    }
}
Example 7: testDefault
import org.apache.lucene.analysis.Tokenizer; // import the required package/class
public void testDefault() throws IOException {
    int default_hash_count = 1;
    int default_bucket_size = 512;
    int default_hash_set_size = 1;
    Settings settings = Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("min_hash");
    String source = "the quick brown fox";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    // with_rotation is true by default, and hash_set_size is 1, so even though the source doesn't
    // have enough tokens to fill all the buckets, we still expect 512 tokens.
    assertStreamHasNumberOfTokens(tokenFilter.create(tokenizer),
            default_hash_count * default_bucket_size * default_hash_set_size);
}
Example 8: testSettings
import org.apache.lucene.analysis.Tokenizer; // import the required package/class
public void testSettings() throws IOException {
    Settings settings = Settings.builder()
            .put("index.analysis.filter.test_min_hash.type", "min_hash")
            .put("index.analysis.filter.test_min_hash.hash_count", "1")
            .put("index.analysis.filter.test_min_hash.bucket_count", "2")
            .put("index.analysis.filter.test_min_hash.hash_set_size", "1")
            .put("index.analysis.filter.test_min_hash.with_rotation", false)
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("test_min_hash");
    String source = "sushi";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    // despite the fact that bucket_count is 2 and hash_set_size is 1,
    // because with_rotation is false, we only expect 1 token here.
    assertStreamHasNumberOfTokens(tokenFilter.create(tokenizer), 1);
}
Example 9: testCorrectPositionIncrementSetting
import org.apache.lucene.analysis.Tokenizer; // import the required package/class
public void testCorrectPositionIncrementSetting() throws IOException {
    Builder builder = Settings.builder().put("index.analysis.filter.my_stop.type", "stop");
    if (random().nextBoolean()) {
        builder.put("index.analysis.filter.my_stop.version", Version.LATEST);
    } else {
        // deliberately don't specify a version
    }
    builder.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString());
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(builder.build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_stop");
    assertThat(tokenFilter, instanceOf(StopTokenFilterFactory.class));
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("foo bar"));
    TokenStream create = tokenFilter.create(tokenizer);
    assertThat(create, instanceOf(StopFilter.class));
}
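The test only asserts the filter's type. To see why the position increment matters, the following plain-Lucene fragment (an illustration assuming a Lucene 6.x style API, not part of the test) makes the gap left by a removed stop word visible:
// A removed stop word leaves a gap: "foo" after "the" carries a position increment of 2.
Tokenizer ws = new WhitespaceTokenizer();
ws.setReader(new StringReader("the foo"));
TokenStream stopped = new StopFilter(ws, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
PositionIncrementAttribute posIncr = stopped.addAttribute(PositionIncrementAttribute.class);
stopped.reset();
stopped.incrementToken();                    // first surviving token is "foo"
int gap = posIncr.getPositionIncrement();    // expected to be 2 because "the" was dropped
stopped.end();
stopped.close();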
Example 10: testMultiTerms
import org.apache.lucene.analysis.Tokenizer; // import the required package/class
public void testMultiTerms() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
            .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"PowerShot", "PowerShot", "Power", "Shot", "50042", "500-42", "500", "42",
            "wifi", "wi-fi", "wi", "fi", "wifi4000", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j2se", "j", "2", "se",
            "ONeil", "O'Neil's", "O", "Neil"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    int[] expectedIncr = new int[]{1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1};
    int[] expectedPosLen = new int[]{2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3, 1, 1, 1, 2, 2, 1, 1};
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
            expectedIncr, expectedPosLen, null);
}
Example 11: testPartsAndCatenate
import org.apache.lucene.analysis.Tokenizer; // import the required package/class
/** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */
public void testPartsAndCatenate() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
            .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot";
    int[] expectedIncr = new int[]{1, 0, 1};
    int[] expectedPosLen = new int[]{2, 1, 1};
    String[] expected = new String[]{"PowerShot", "Power", "Shot"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
            expectedIncr, expectedPosLen, null);
}
Example 12: createComponents
import org.apache.lucene.analysis.Tokenizer; // import the required package/class
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new Tokenizer() {
        boolean incremented = false;
        CharTermAttribute term = addAttribute(CharTermAttribute.class);

        @Override
        public boolean incrementToken() throws IOException {
            if (incremented) {
                return false;
            }
            term.setLength(0).append(output);
            incremented = true;
            return true;
        }

        @Override
        public void reset() throws IOException {
            super.reset();
            incremented = false; // allow the tokenizer to be reused for the next input
        }
    };
    return new TokenStreamComponents(tokenizer);
}
Example 13: testToken
import org.apache.lucene.analysis.Tokenizer; // import the required package/class
private void testToken(String source, String expected) throws IOException {
    Index index = new Index("test", "_na_");
    Settings settings = Settings.builder()
            .put("index.analysis.filter.myStemmer.type", "polish_stem")
            .build();
    TestAnalysis analysis = createTestAnalysis(index, settings, new AnalysisStempelPlugin());
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("myStemmer");
    Tokenizer tokenizer = new KeywordTokenizer();
    tokenizer.setReader(new StringReader(source));
    TokenStream ts = filterFactory.create(tokenizer);
    CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    assertThat(ts.incrementToken(), equalTo(true));
    assertThat(term1.toString(), equalTo(expected));
}
Example 14: inform
import org.apache.lucene.analysis.Tokenizer; // import the required package/class
@Override
public void inform(ResourceLoader loader) throws IOException {
    final TokenizerFactory factory = tokenizerFactory == null ? null : loadTokenizerFactory(loader, tokenizerFactory);
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader) : factory.create(reader);
            TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_CURRENT, tokenizer) : tokenizer;
            return new TokenStreamComponents(tokenizer, stream);
        }
    };
    try {
        String formatClass = format;
        if (format == null || format.equals("solr")) {
            formatClass = SolrSynonymParser.class.getName();
        } else if (format.equals("wordnet")) {
            formatClass = WordnetSynonymParser.class.getName();
        }
        // TODO: expose dedup as a parameter?
        map = loadSynonyms(loader, formatClass, true, analyzer);
    } catch (ParseException e) {
        throw new IOException("Error parsing synonyms file:", e);
    }
}
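Once inform has populated map, the typical follow-up is to wrap it in a synonym filter inside the factory's create method. The fragment below shows that assumed shape using Lucene's SynonymFilter (Lucene 4.x era, matching the Version.LUCENE_CURRENT usage above); it is a sketch, not code from the class shown.
// Sketch of the factory's create() method applying the loaded SynonymMap.
public TokenStream create(TokenStream input) {
    // an empty map has no FST, in which case the input stream is passed through unchanged
    return map.fst == null ? input : new SynonymFilter(input, map, ignoreCase);
}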
Example 15: createComponents
import org.apache.lucene.analysis.Tokenizer; // import the required package/class
/**
 * Creates a token stream that tokenizes the given string into token terms
 * (aka words).
 *
 * @param fieldName
 *            the name of the field to tokenize (currently ignored).
 * @param reader
 *            reader (e.g. charfilter) of the original text. can be null.
 * @param text
 *            the string to tokenize
 * @return a new token stream
 */
public TokenStreamComponents createComponents(String fieldName, Reader reader, String text) {
    // Ideally the Analyzer superclass should have a method with the same signature,
    // with a default impl that simply delegates to the StringReader flavour.
    if (reader == null)
        reader = new FastStringReader(text);
    if (pattern == NON_WORD_PATTERN) { // fast path
        return new TokenStreamComponents(new FastStringTokenizer(reader, true, toLowerCase, stopWords));
    } else if (pattern == WHITESPACE_PATTERN) { // fast path
        return new TokenStreamComponents(new FastStringTokenizer(reader, false, toLowerCase, stopWords));
    }
    Tokenizer tokenizer = new PatternTokenizer(reader, pattern, toLowerCase);
    TokenStream result = (stopWords != null) ? new StopFilter(matchVersion, tokenizer, stopWords) : tokenizer;
    return new TokenStreamComponents(tokenizer, result);
}
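A hypothetical caller (not shown in the original source) would pull the token stream out of the returned components and consume it in the usual way; "analyzer" below stands for an instance of whatever class declares createComponents above.
// Hedged usage sketch of the method above.
TokenStreamComponents components = analyzer.createComponents("body", null, "The quick brown fox");
TokenStream stream = components.getTokenStream();
CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
stream.reset();
while (stream.incrementToken()) {
    System.out.println(term.toString());
}
stream.end();
stream.close();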
}