This article collects typical usage examples of the Java class org.apache.lucene.analysis.core.KeywordTokenizer. If you are wondering what KeywordTokenizer is for, or how to use it, the curated examples below should help.
The KeywordTokenizer class belongs to the org.apache.lucene.analysis.core package. Fifteen code examples are shown below, sorted by popularity by default.
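Before the collected examples, here is a minimal, self-contained sketch (written for this article, not taken from any of the projects below) of the one thing KeywordTokenizer does: it emits the entire input as a single token. It also illustrates the two constructor styles that appear in the examples, since they target different Lucene versions: Lucene 4.x passes the Reader to the constructor (new KeywordTokenizer(reader)), while Lucene 5+ uses the no-arg constructor followed by setReader().

// Minimal demo (Lucene 5+ API, assumed; not from the examples below):
// KeywordTokenizer emits the whole input as exactly one token.
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class KeywordTokenizerDemo {
    public static void main(String[] args) throws IOException {
        KeywordTokenizer tokenizer = new KeywordTokenizer();
        tokenizer.setReader(new StringReader("foo bar baz"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString()); // prints "foo bar baz" exactly once
        }
        tokenizer.end();
        tokenizer.close();
    }
}

Because the whole field value becomes one token, KeywordTokenizer is the natural choice when a token filter (a stemmer, collation key filter, n-gram filter, and so on) should see the input unsplit, which is exactly how it is used in every example below.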

Example 1: testToken

import org.apache.lucene.analysis.core.KeywordTokenizer; // import the required package/class

private void testToken(String source, String expected) throws IOException {
    Index index = new Index("test", "_na_");
    Settings settings = Settings.builder()
            .put("index.analysis.filter.myStemmer.type", "polish_stem")
            .build();
    TestAnalysis analysis = createTestAnalysis(index, settings, new AnalysisStempelPlugin());
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("myStemmer");
    Tokenizer tokenizer = new KeywordTokenizer();
    tokenizer.setReader(new StringReader(source));
    TokenStream ts = filterFactory.create(tokenizer);
    CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    assertThat(ts.incrementToken(), equalTo(true));
    assertThat(term1.toString(), equalTo(expected));
}

Example 2: testIgnoreWhitespace

import org.apache.lucene.analysis.core.KeywordTokenizer; // import the required package/class

public void testIgnoreWhitespace() throws Exception {
    String withSpace = "foo bar";
    String withoutSpace = "foobar";
    String withPunctuation = "foo-bar";
    TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
            "locale", "en",
            "strength", "primary",
            "alternate", "shifted",
            "variableTop", " ");
    TokenStream tsWithSpace = factory.create(
            new KeywordTokenizer(new StringReader(withSpace)));
    TokenStream tsWithoutSpace = factory.create(
            new KeywordTokenizer(new StringReader(withoutSpace)));
    assertCollatesToSame(tsWithSpace, tsWithoutSpace);
    // now assert that punctuation still matters: foo-bar < foo bar
    tsWithSpace = factory.create(
            new KeywordTokenizer(new StringReader(withSpace)));
    TokenStream tsWithPunctuation = factory.create(
            new KeywordTokenizer(new StringReader(withPunctuation)));
    assertCollation(tsWithPunctuation, tsWithSpace, -1);
}

Example 3: testEmptyTerm

import org.apache.lucene.analysis.core.KeywordTokenizer; // import the required package/class

public void testEmptyTerm() throws IOException {
    Random random = random();
    final int numIters = atLeast(10);
    for (int i = 0; i < numIters; i++) {
        b = new SynonymMap.Builder(random.nextBoolean());
        final int numEntries = atLeast(10);
        for (int j = 0; j < numEntries; j++) {
            add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
        }
        final SynonymMap map = b.build();
        final boolean ignoreCase = random.nextBoolean();
        final Analyzer analyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                Tokenizer tokenizer = new KeywordTokenizer(reader);
                return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, ignoreCase));
            }
        };
        checkAnalysisConsistency(random, analyzer, random.nextBoolean(), "");
    }
}

Example 4: testSupplementaryCharacters

import org.apache.lucene.analysis.core.KeywordTokenizer; // import the required package/class

public void testSupplementaryCharacters() throws IOException {
    final String s = TestUtil.randomUnicodeString(random(), 10);
    final int codePointCount = s.codePointCount(0, s.length());
    final int minGram = TestUtil.nextInt(random(), 1, 3);
    final int maxGram = TestUtil.nextInt(random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer(new StringReader(s));
    tk = new EdgeNGramTokenFilter(tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) {
        assertTrue(tk.incrementToken());
        assertEquals(0, offsetAtt.startOffset());
        assertEquals(s.length(), offsetAtt.endOffset());
        final int end = Character.offsetByCodePoints(s, 0, i);
        assertEquals(s.substring(0, end), termAtt.toString());
    }
    assertFalse(tk.incrementToken());
}

Example 5: testSupplementaryCharacters

import org.apache.lucene.analysis.core.KeywordTokenizer; // import the required package/class

public void testSupplementaryCharacters() throws IOException {
    final String s = TestUtil.randomUnicodeString(random(), 10);
    final int codePointCount = s.codePointCount(0, s.length());
    final int minGram = TestUtil.nextInt(random(), 1, 3);
    final int maxGram = TestUtil.nextInt(random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer(new StringReader(s));
    tk = new NGramTokenFilter(tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int start = 0; start < codePointCount; ++start) {
        for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
            assertTrue(tk.incrementToken());
            assertEquals(0, offsetAtt.startOffset());
            assertEquals(s.length(), offsetAtt.endOffset());
            final int startIndex = Character.offsetByCodePoints(s, 0, start);
            final int endIndex = Character.offsetByCodePoints(s, 0, end);
            assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
        }
    }
    assertFalse(tk.incrementToken());
}

Example 6: testEmptyTerm

import org.apache.lucene.analysis.core.KeywordTokenizer; // import the required package/class

public void testEmptyTerm() throws IOException {
    Random random = random();
    for (int i = 0; i < 512; i++) {
        final int flags = i;
        final CharArraySet protectedWords;
        if (random.nextBoolean()) {
            protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false);
        } else {
            protectedWords = null;
        }
        Analyzer a = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                Tokenizer tokenizer = new KeywordTokenizer(reader);
                return new TokenStreamComponents(tokenizer, new Lucene47WordDelimiterFilter(tokenizer, flags, protectedWords));
            }
        };
        // depending upon options, this thing may or may not preserve the empty term
        checkAnalysisConsistency(random, a, random.nextBoolean(), "");
    }
}

Example 7: testEmptyTerm

import org.apache.lucene.analysis.core.KeywordTokenizer; // import the required package/class

public void testEmptyTerm() throws IOException {
    Random random = random();
    for (int i = 0; i < 512; i++) {
        final int flags = i;
        final CharArraySet protectedWords;
        if (random.nextBoolean()) {
            protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false);
        } else {
            protectedWords = null;
        }
        Analyzer a = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                Tokenizer tokenizer = new KeywordTokenizer(reader);
                return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
            }
        };
        // depending upon options, this thing may or may not preserve the empty term
        checkAnalysisConsistency(random, a, random.nextBoolean(), "");
    }
}

Example 8: testRandomStrings

import org.apache.lucene.analysis.core.KeywordTokenizer; // import the required package/class

public void testRandomStrings() throws IOException {
    for (int i = 0; i < 10000; i++) {
        String text = TestUtil.randomUnicodeString(random(), 100);
        int min = TestUtil.nextInt(random(), 0, 100);
        int max = TestUtil.nextInt(random(), 0, 100);
        int count = text.codePointCount(0, text.length());
        if (min > max) {
            int temp = min;
            min = max;
            max = temp;
        }
        boolean expected = count >= min && count <= max;
        TokenStream stream = new KeywordTokenizer(new StringReader(text));
        stream = new CodepointCountFilter(stream, min, max);
        stream.reset();
        assertEquals(expected, stream.incrementToken());
        stream.end();
        stream.close();
    }
}

Example 9: assertCorrectOutput

import org.apache.lucene.analysis.core.KeywordTokenizer; // import the required package/class

/**
 * For the supplied language, run the stemmer against all strings in voc.txt.
 * The output should be the same as the strings in output.txt.
 */
private void assertCorrectOutput(final String snowballLanguage, String dataDirectory)
        throws IOException {
    if (VERBOSE) System.out.println("checking snowball language: " + snowballLanguage);
    Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName,
                Reader reader) {
            Tokenizer t = new KeywordTokenizer(reader);
            return new TokenStreamComponents(t, new SnowballFilter(t, snowballLanguage));
        }
    };
    assertVocabulary(a, getDataFile("TestSnowballVocabData.zip"),
            dataDirectory + "/voc.txt", dataDirectory + "/output.txt");
}

Example 10: testNormalization

import org.apache.lucene.analysis.core.KeywordTokenizer; // import the required package/class

public void testNormalization() throws IOException {
    String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING";
    String turkishLowerCase = "ı will use turkish casıng";
    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("locale", "tr");
    args.put("strength", "primary");
    args.put("decomposition", "canonical");
    factory.init(args);
    factory.inform(new StringMockResourceLoader(""));
    TokenStream tsUpper = factory.create(
            new KeywordTokenizer(new StringReader(turkishUpperCase)));
    TokenStream tsLower = factory.create(
            new KeywordTokenizer(new StringReader(turkishLowerCase)));
    assertCollatesToSame(tsUpper, tsLower);
}

Example 11: testSecondaryStrength

import org.apache.lucene.analysis.core.KeywordTokenizer; // import the required package/class

public void testSecondaryStrength() throws IOException {
    String upperCase = "TESTING";
    String lowerCase = "testing";
    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("locale", "en");
    args.put("strength", "secondary");
    args.put("decomposition", "no");
    factory.init(args);
    factory.inform(new StringMockResourceLoader(""));
    TokenStream tsUpper = factory.create(
            new KeywordTokenizer(new StringReader(upperCase)));
    TokenStream tsLower = factory.create(
            new KeywordTokenizer(new StringReader(lowerCase)));
    assertCollatesToSame(tsUpper, tsLower);
}

Example 12: testIgnorePunctuation

import org.apache.lucene.analysis.core.KeywordTokenizer; // import the required package/class

public void testIgnorePunctuation() throws IOException {
    String withPunctuation = "foo-bar";
    String withoutPunctuation = "foo bar";
    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("locale", "en");
    args.put("strength", "primary");
    args.put("alternate", "shifted");
    factory.init(args);
    factory.inform(new StringMockResourceLoader(""));
    TokenStream tsPunctuation = factory.create(
            new KeywordTokenizer(new StringReader(withPunctuation)));
    TokenStream tsWithoutPunctuation = factory.create(
            new KeywordTokenizer(new StringReader(withoutPunctuation)));
    assertCollatesToSame(tsPunctuation, tsWithoutPunctuation);
}

Example 13: testIgnoreWhitespace

import org.apache.lucene.analysis.core.KeywordTokenizer; // import the required package/class

public void testIgnoreWhitespace() throws IOException {
    String withSpace = "foo bar";
    String withoutSpace = "foobar";
    String withPunctuation = "foo-bar";
    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("locale", "en");
    args.put("strength", "primary");
    args.put("alternate", "shifted");
    args.put("variableTop", " ");
    factory.init(args);
    factory.inform(new StringMockResourceLoader(""));
    TokenStream tsWithSpace = factory.create(
            new KeywordTokenizer(new StringReader(withSpace)));
    TokenStream tsWithoutSpace = factory.create(
            new KeywordTokenizer(new StringReader(withoutSpace)));
    assertCollatesToSame(tsWithSpace, tsWithoutSpace);
    // now assert that punctuation still matters: foo-bar < foo bar
    tsWithSpace = factory.create(
            new KeywordTokenizer(new StringReader(withSpace)));
    TokenStream tsWithPunctuation = factory.create(
            new KeywordTokenizer(new StringReader(withPunctuation)));
    assertCollation(tsWithPunctuation, tsWithSpace, -1);
}

Example 14: testUpperCaseFirst

import org.apache.lucene.analysis.core.KeywordTokenizer; // import the required package/class

public void testUpperCaseFirst() throws IOException {
    String lower = "resume";
    String upper = "Resume";
    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("locale", "en");
    args.put("strength", "tertiary");
    args.put("caseFirst", "upper");
    factory.init(args);
    factory.inform(new StringMockResourceLoader(""));
    TokenStream tsLower = factory.create(
            new KeywordTokenizer(new StringReader(lower)));
    TokenStream tsUpper = factory.create(
            new KeywordTokenizer(new StringReader(upper)));
    assertCollation(tsUpper, tsLower, -1);
}

Example 15: testEmptyTerm

import org.apache.lucene.analysis.core.KeywordTokenizer; // import the required package/class

public void testEmptyTerm() throws IOException {
    Random random = random();
    for (int i = 0; i < 512; i++) {
        final int flags = i;
        final CharArraySet protectedWords;
        if (random.nextBoolean()) {
            protectedWords = new CharArraySet(TEST_VERSION_CURRENT, new HashSet<String>(Arrays.asList("a", "b", "cd")), false);
        } else {
            protectedWords = null;
        }
        Analyzer a = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                Tokenizer tokenizer = new KeywordTokenizer(reader);
                return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
            }
        };
        // depending upon options, this thing may or may not preserve the empty term
        checkAnalysisConsistency(random, a, random.nextBoolean(), "");
    }
}