当前位置: 首页>>代码示例>>Java>>正文


Java KeywordTokenizer类代码示例

本文整理汇总了Java中org.apache.lucene.analysis.core.KeywordTokenizer的典型用法代码示例。如果您正苦于以下问题:Java KeywordTokenizer类的具体用法?Java KeywordTokenizer怎么用?Java KeywordTokenizer使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。


KeywordTokenizer类属于org.apache.lucene.analysis.core包,在下文中一共展示了KeywordTokenizer类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: testToken

import org.apache.lucene.analysis.core.KeywordTokenizer; //导入依赖的package包/类
/**
 * Feeds {@code source} as a single token through a {@link KeywordTokenizer}
 * followed by the token filter registered as "myStemmer" (configured with type
 * {@code polish_stem}) and asserts that the first emitted term equals
 * {@code expected}.
 *
 * @param source   text handed to the tokenizer
 * @param expected term expected for the first token produced by the filter
 * @throws IOException declared by the analysis setup and token stream calls
 */
private void testToken(String source, String expected) throws IOException {
    Index index = new Index("test", "_na_");
    Settings settings = Settings.builder()
            .put("index.analysis.filter.myStemmer.type", "polish_stem")
            .build();
    TestAnalysis analysis = createTestAnalysis(index, settings, new AnalysisStempelPlugin());

    TokenFilterFactory filterFactory = analysis.tokenFilter.get("myStemmer");

    Tokenizer tokenizer = new KeywordTokenizer();
    tokenizer.setReader(new StringReader(source));

    // try-with-resources closes the filter chain even when an assertion fails,
    // so the stream is no longer left open at the end of the test.
    try (TokenStream ts = filterFactory.create(tokenizer)) {
        CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        assertThat(ts.incrementToken(), equalTo(true));

        assertThat(term1.toString(), equalTo(expected));
    }
}
 
开发者ID:justor,项目名称:elasticsearch_my,代码行数:20,代码来源:SimplePolishTokenFilterTests.java

示例2: testIgnoreWhitespace

import org.apache.lucene.analysis.core.KeywordTokenizer; //导入依赖的package包/类
/**
 * Exercises the "ICUCollationKey" filter factory configured with locale "en",
 * primary strength, alternate "shifted" and a single space as variableTop:
 * "foo bar" and "foobar" are expected to collate the same, while "foo-bar" is
 * still expected to compare as smaller than "foo bar".
 */
public void testIgnoreWhitespace() throws Exception {
  final String spaced = "foo bar";
  final String joined = "foobar";
  final String hyphenated = "foo-bar";

  final TokenFilterFactory collation = tokenFilterFactory("ICUCollationKey",
      "locale", "en",
      "strength", "primary",
      "alternate", "shifted",
      "variableTop", " ");

  // First comparison: the spaced and the joined input should collate identically.
  final TokenStream spacedKey =
      collation.create(new KeywordTokenizer(new StringReader(spaced)));
  final TokenStream joinedKey =
      collation.create(new KeywordTokenizer(new StringReader(joined)));
  assertCollatesToSame(spacedKey, joinedKey);

  // now assert that punctuation still matters: foo-bar < foo bar
  // (a new stream over "foo bar" is created for this second comparison)
  final TokenStream spacedKeyAgain =
      collation.create(new KeywordTokenizer(new StringReader(spaced)));
  final TokenStream hyphenatedKey =
      collation.create(new KeywordTokenizer(new StringReader(hyphenated)));
  assertCollation(hyphenatedKey, spacedKeyAgain, -1);
}
 
开发者ID:europeana,项目名称:search,代码行数:22,代码来源:TestICUCollationKeyFilterFactory.java

示例3: testEmptyTerm

import org.apache.lucene.analysis.core.KeywordTokenizer; //导入依赖的package包/类
/**
 * Pushes the empty string through a {@link SynonymFilter} stacked on a
 * {@link KeywordTokenizer}. Each iteration rebuilds the synonym map from random
 * non-empty strings and hands the analyzer to {@code checkAnalysisConsistency}.
 */
public void testEmptyTerm() throws IOException {
  final Random rnd = random();
  final int rounds = atLeast(10);
  for (int round = 0; round < rounds; round++) {
    // Fresh builder per round; its boolean option is chosen at random.
    final boolean builderFlag = rnd.nextBoolean();
    b = new SynonymMap.Builder(builderFlag);

    final int entryCount = atLeast(10);
    int added = 0;
    while (added < entryCount) {
      add(randomNonEmptyString(), randomNonEmptyString(), rnd.nextBoolean());
      added++;
    }

    final SynonymMap synonyms = b.build();
    final boolean caseInsensitive = rnd.nextBoolean();

    final Analyzer synonymAnalyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        final Tokenizer source = new KeywordTokenizer(reader);
        return new TokenStreamComponents(source, new SynonymFilter(source, synonyms, caseInsensitive));
      }
    };

    final boolean randomFlag = rnd.nextBoolean();
    checkAnalysisConsistency(rnd, synonymAnalyzer, randomFlag, "");
  }
}
 
开发者ID:europeana,项目名称:search,代码行数:24,代码来源:TestSynonymMapFilter.java

示例4: testSupplementaryCharacters

import org.apache.lucene.analysis.core.KeywordTokenizer; //导入依赖的package包/类
/**
 * Checks that {@link EdgeNGramTokenFilter} sizes its grams in code points rather
 * than in {@code char}s. A random Unicode string is tokenized as a single
 * keyword; for every gram size from {@code minGram} up to
 * {@code min(codePointCount, maxGram)} the emitted term must be the prefix of
 * the input holding that many code points, and the offsets must span the whole
 * input. No further token may follow.
 *
 * @throws IOException declared by the token stream calls
 */
public void testSupplementaryCharacters() throws IOException {
  final String s = TestUtil.randomUnicodeString(random(), 10);
  final int codePointCount = s.codePointCount(0, s.length());
  final int minGram = TestUtil.nextInt(random(), 1, 3);
  final int maxGram = TestUtil.nextInt(random(), minGram, 10);
  // try-with-resources closes the stream even if an assertion fails part-way.
  try (TokenStream tk = new EdgeNGramTokenFilter(
      new KeywordTokenizer(new StringReader(s)), minGram, maxGram)) {
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) {
      assertTrue(tk.incrementToken());
      assertEquals(0, offsetAtt.startOffset());
      assertEquals(s.length(), offsetAtt.endOffset());
      // Translate the code point count into a char index before slicing.
      final int end = Character.offsetByCodePoints(s, 0, i);
      assertEquals(s.substring(0, end), termAtt.toString());
    }
    assertFalse(tk.incrementToken());
    // Finish the consumer workflow (end, then close) once the stream is exhausted.
    tk.end();
  }
}
 
开发者ID:europeana,项目名称:search,代码行数:20,代码来源:EdgeNGramTokenFilterTest.java

示例5: testSupplementaryCharacters

import org.apache.lucene.analysis.core.KeywordTokenizer; //导入依赖的package包/类
/**
 * Checks that {@link NGramTokenFilter} sizes and positions its grams in code
 * points rather than in {@code char}s. A random Unicode string is tokenized as
 * a single keyword; for every start code point and every gram length between
 * {@code minGram} and {@code maxGram} that fits in the input, the emitted term
 * must equal the matching code point slice of the input, and the offsets must
 * span the whole input. No further token may follow.
 *
 * @throws IOException declared by the token stream calls
 */
public void testSupplementaryCharacters() throws IOException {
  final String s = TestUtil.randomUnicodeString(random(), 10);
  final int codePointCount = s.codePointCount(0, s.length());
  final int minGram = TestUtil.nextInt(random(), 1, 3);
  final int maxGram = TestUtil.nextInt(random(), minGram, 10);
  // try-with-resources closes the stream even if an assertion fails part-way.
  try (TokenStream tk = new NGramTokenFilter(
      new KeywordTokenizer(new StringReader(s)), minGram, maxGram)) {
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int start = 0; start < codePointCount; ++start) {
      for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
        assertTrue(tk.incrementToken());
        assertEquals(0, offsetAtt.startOffset());
        assertEquals(s.length(), offsetAtt.endOffset());
        // Translate code point positions into char indices before slicing.
        final int startIndex = Character.offsetByCodePoints(s, 0, start);
        final int endIndex = Character.offsetByCodePoints(s, 0, end);
        assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
      }
    }
    assertFalse(tk.incrementToken());
    // Finish the consumer workflow (end, then close) once the stream is exhausted.
    tk.end();
  }
}
 
开发者ID:europeana,项目名称:search,代码行数:23,代码来源:NGramTokenFilterTest.java

示例6: testEmptyTerm

import org.apache.lucene.analysis.core.KeywordTokenizer; //导入依赖的package包/类
/**
 * Runs the empty string through {@link Lucene47WordDelimiterFilter} for every
 * flag value in {@code [0, 512)}, randomly with or without a protected-word
 * set, and hands each analyzer to {@code checkAnalysisConsistency}.
 */
public void testEmptyTerm() throws IOException {
  final Random rnd = random();
  for (int flagBits = 0; flagBits < 512; flagBits++) {
    final int flags = flagBits;
    // Randomly use a small protected-word set or none at all.
    final CharArraySet protectedWords = rnd.nextBoolean()
        ? new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false)
        : null;

    final Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        final Tokenizer source = new KeywordTokenizer(reader);
        return new TokenStreamComponents(source, new Lucene47WordDelimiterFilter(source, flags, protectedWords));
      }
    };

    // depending upon options, this thing may or may not preserve the empty term
    checkAnalysisConsistency(rnd, analyzer, rnd.nextBoolean(), "");
  }
}
 
开发者ID:europeana,项目名称:search,代码行数:23,代码来源:TestLucene47WordDelimiterFilter.java

示例7: testEmptyTerm

import org.apache.lucene.analysis.core.KeywordTokenizer; //导入依赖的package包/类
/**
 * Runs the empty string through {@link WordDelimiterFilter} for every flag
 * value in {@code [0, 512)}, randomly with or without a protected-word set,
 * and hands each analyzer to {@code checkAnalysisConsistency}.
 */
public void testEmptyTerm() throws IOException {
  final Random rnd = random();
  for (int flagBits = 0; flagBits < 512; flagBits++) {
    final int flags = flagBits;
    // Randomly use a small protected-word set or none at all.
    final CharArraySet protectedWords = rnd.nextBoolean()
        ? new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false)
        : null;

    final Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        final Tokenizer source = new KeywordTokenizer(reader);
        return new TokenStreamComponents(source, new WordDelimiterFilter(source, flags, protectedWords));
      }
    };

    // depending upon options, this thing may or may not preserve the empty term
    checkAnalysisConsistency(rnd, analyzer, rnd.nextBoolean(), "");
  }
}
 
开发者ID:europeana,项目名称:search,代码行数:23,代码来源:TestWordDelimiterFilter.java

示例8: testRandomStrings

import org.apache.lucene.analysis.core.KeywordTokenizer; //导入依赖的package包/类
/**
 * Randomized check of {@link CodepointCountFilter}: for 10000 random Unicode
 * strings and random bounds, the single keyword token must pass the filter
 * exactly when its code point count lies within {@code [min, max]}.
 */
public void testRandomStrings() throws IOException {
  for (int round = 0; round < 10000; round++) {
    final String text = TestUtil.randomUnicodeString(random(), 100);
    final int firstBound = TestUtil.nextInt(random(), 0, 100);
    final int secondBound = TestUtil.nextInt(random(), 0, 100);
    // The two draws are unordered; the smaller one becomes the lower bound.
    final int min = Math.min(firstBound, secondBound);
    final int max = Math.max(firstBound, secondBound);
    final int count = text.codePointCount(0, text.length());
    final boolean expected = min <= count && count <= max;

    final TokenStream stream =
        new CodepointCountFilter(new KeywordTokenizer(new StringReader(text)), min, max);
    stream.reset();
    assertEquals(expected, stream.incrementToken());
    stream.end();
    stream.close();
  }
}
 
开发者ID:europeana,项目名称:search,代码行数:21,代码来源:TestCodepointCountFilter.java

示例9: assertCorrectOutput

import org.apache.lucene.analysis.core.KeywordTokenizer; //导入依赖的package包/类
/**
 * Runs the Snowball stemmer for the supplied language against all strings in
 * voc.txt; the output should be the same as the strings in output.txt. Both
 * files are addressed as {@code dataDirectory + "/voc.txt"} and
 * {@code dataDirectory + "/output.txt"} within TestSnowballVocabData.zip.
 *
 * @param snowballLanguage language name passed to {@link SnowballFilter}
 * @param dataDirectory    directory prefix of the vocabulary and output files
 */
private void assertCorrectOutput(final String snowballLanguage, String dataDirectory)
    throws IOException {
  if (VERBOSE) {
    System.out.println("checking snowball language: " + snowballLanguage);
  }

  final Analyzer stemmingAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // The whole input is one token, which is then stemmed.
      final Tokenizer source = new KeywordTokenizer(reader);
      return new TokenStreamComponents(source, new SnowballFilter(source, snowballLanguage));
    }
  };

  final String vocabularyEntry = dataDirectory + "/voc.txt";
  final String expectedEntry = dataDirectory + "/output.txt";
  assertVocabulary(stemmingAnalyzer, getDataFile("TestSnowballVocabData.zip"),
      vocabularyEntry, expectedEntry);
}
 
开发者ID:europeana,项目名称:search,代码行数:21,代码来源:TestSnowballVocab.java

示例10: testNormalization

import org.apache.lucene.analysis.core.KeywordTokenizer; //导入依赖的package包/类
/**
 * Builds an {@link ICUCollationKeyFilterFactory} for locale "tr" with primary
 * strength and canonical decomposition, then expects an upper-case and a
 * lower-case Turkish phrase to collate the same.
 */
public void testNormalization() throws IOException {
  final String upperInput = "I W\u0049\u0307LL USE TURKİSH CASING";
  final String lowerInput = "ı will use turkish casıng";

  final Map<String,String> params = new HashMap<String,String>();
  params.put("locale", "tr");
  params.put("strength", "primary");
  params.put("decomposition", "canonical");

  final ICUCollationKeyFilterFactory collation = new ICUCollationKeyFilterFactory();
  collation.init(params);
  collation.inform(new StringMockResourceLoader(""));

  // Each input is kept as one token and turned into a collation key stream.
  final TokenStream upperKey =
      collation.create(new KeywordTokenizer(new StringReader(upperInput)));
  final TokenStream lowerKey =
      collation.create(new KeywordTokenizer(new StringReader(lowerInput)));
  assertCollatesToSame(upperKey, lowerKey);
}
 
开发者ID:pkarmstr,项目名称:NYBC,代码行数:17,代码来源:TestICUCollationKeyFilterFactory.java

示例11: testSecondaryStrength

import org.apache.lucene.analysis.core.KeywordTokenizer; //导入依赖的package包/类
/**
 * Builds an {@link ICUCollationKeyFilterFactory} for locale "en" with secondary
 * strength and decomposition "no", then expects "TESTING" and "testing" to
 * collate the same.
 */
public void testSecondaryStrength() throws IOException {
  final String upperInput = "TESTING";
  final String lowerInput = "testing";

  final Map<String,String> params = new HashMap<String,String>();
  params.put("locale", "en");
  params.put("strength", "secondary");
  params.put("decomposition", "no");

  final ICUCollationKeyFilterFactory collation = new ICUCollationKeyFilterFactory();
  collation.init(params);
  collation.inform(new StringMockResourceLoader(""));

  // Each input is kept as one token and turned into a collation key stream.
  final TokenStream upperKey =
      collation.create(new KeywordTokenizer(new StringReader(upperInput)));
  final TokenStream lowerKey =
      collation.create(new KeywordTokenizer(new StringReader(lowerInput)));
  assertCollatesToSame(upperKey, lowerKey);
}
 
开发者ID:pkarmstr,项目名称:NYBC,代码行数:17,代码来源:TestICUCollationKeyFilterFactory.java

示例12: testIgnorePunctuation

import org.apache.lucene.analysis.core.KeywordTokenizer; //导入依赖的package包/类
/**
 * Builds an {@link ICUCollationKeyFilterFactory} for locale "en" with primary
 * strength and alternate "shifted", then expects "foo-bar" and "foo bar" to
 * collate the same.
 */
public void testIgnorePunctuation() throws IOException {
  final String hyphenated = "foo-bar";
  final String spaced = "foo bar";

  final Map<String,String> params = new HashMap<String,String>();
  params.put("locale", "en");
  params.put("strength", "primary");
  params.put("alternate", "shifted");

  final ICUCollationKeyFilterFactory collation = new ICUCollationKeyFilterFactory();
  collation.init(params);
  collation.inform(new StringMockResourceLoader(""));

  // Each input is kept as one token and turned into a collation key stream.
  final TokenStream hyphenatedKey =
      collation.create(new KeywordTokenizer(new StringReader(hyphenated)));
  final TokenStream spacedKey =
      collation.create(new KeywordTokenizer(new StringReader(spaced)));
  assertCollatesToSame(hyphenatedKey, spacedKey);
}
 
开发者ID:pkarmstr,项目名称:NYBC,代码行数:17,代码来源:TestICUCollationKeyFilterFactory.java

示例13: testIgnoreWhitespace

import org.apache.lucene.analysis.core.KeywordTokenizer; //导入依赖的package包/类
/**
 * Builds an {@link ICUCollationKeyFilterFactory} for locale "en" with primary
 * strength, alternate "shifted" and a single space as variableTop. "foo bar"
 * and "foobar" are expected to collate the same, while "foo-bar" is still
 * expected to compare as smaller than "foo bar".
 */
public void testIgnoreWhitespace() throws IOException {
  final String spaced = "foo bar";
  final String joined = "foobar";
  final String hyphenated = "foo-bar";

  final Map<String,String> params = new HashMap<String,String>();
  params.put("locale", "en");
  params.put("strength", "primary");
  params.put("alternate", "shifted");
  params.put("variableTop", " ");

  final ICUCollationKeyFilterFactory collation = new ICUCollationKeyFilterFactory();
  collation.init(params);
  collation.inform(new StringMockResourceLoader(""));

  // First comparison: the spaced and the joined input should collate identically.
  final TokenStream spacedKey =
      collation.create(new KeywordTokenizer(new StringReader(spaced)));
  final TokenStream joinedKey =
      collation.create(new KeywordTokenizer(new StringReader(joined)));
  assertCollatesToSame(spacedKey, joinedKey);

  // now assert that punctuation still matters: foo-bar < foo bar
  // (a new stream over "foo bar" is created for this second comparison)
  final TokenStream spacedKeyAgain =
      collation.create(new KeywordTokenizer(new StringReader(spaced)));
  final TokenStream hyphenatedKey =
      collation.create(new KeywordTokenizer(new StringReader(hyphenated)));
  assertCollation(hyphenatedKey, spacedKeyAgain, -1);
}
 
开发者ID:pkarmstr,项目名称:NYBC,代码行数:25,代码来源:TestICUCollationKeyFilterFactory.java

示例14: testUpperCaseFirst

import org.apache.lucene.analysis.core.KeywordTokenizer; //导入依赖的package包/类
/**
 * Builds an {@link ICUCollationKeyFilterFactory} for locale "en" with tertiary
 * strength and caseFirst "upper", then checks via {@code assertCollation} that
 * comparing "Resume" against "resume" yields -1.
 */
public void testUpperCaseFirst() throws IOException {
  final String lowerInput = "resume";
  final String upperInput = "Resume";

  final Map<String,String> params = new HashMap<String,String>();
  params.put("locale", "en");
  params.put("strength", "tertiary");
  params.put("caseFirst", "upper");

  final ICUCollationKeyFilterFactory collation = new ICUCollationKeyFilterFactory();
  collation.init(params);
  collation.inform(new StringMockResourceLoader(""));

  // Each input is kept as one token and turned into a collation key stream.
  final TokenStream lowerKey =
      collation.create(new KeywordTokenizer(new StringReader(lowerInput)));
  final TokenStream upperKey =
      collation.create(new KeywordTokenizer(new StringReader(upperInput)));
  assertCollation(upperKey, lowerKey, -1);
}
 
开发者ID:pkarmstr,项目名称:NYBC,代码行数:17,代码来源:TestICUCollationKeyFilterFactory.java

示例15: testEmptyTerm

import org.apache.lucene.analysis.core.KeywordTokenizer; //导入依赖的package包/类
/**
 * Runs the empty string through {@link WordDelimiterFilter} for every flag
 * value in {@code [0, 512)}, randomly with or without a protected-word set,
 * and hands each analyzer to {@code checkAnalysisConsistency}.
 */
public void testEmptyTerm() throws IOException {
  final Random rnd = random();
  for (int flagBits = 0; flagBits < 512; flagBits++) {
    final int flags = flagBits;
    // Randomly use a small protected-word set or none at all.
    final CharArraySet protectedWords = rnd.nextBoolean()
        ? new CharArraySet(TEST_VERSION_CURRENT, new HashSet<String>(Arrays.asList("a", "b", "cd")), false)
        : null;

    final Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        final Tokenizer source = new KeywordTokenizer(reader);
        return new TokenStreamComponents(source, new WordDelimiterFilter(source, flags, protectedWords));
      }
    };

    // depending upon options, this thing may or may not preserve the empty term
    checkAnalysisConsistency(rnd, analyzer, rnd.nextBoolean(), "");
  }
}
 
开发者ID:pkarmstr,项目名称:NYBC,代码行数:23,代码来源:TestWordDelimiterFilter.java


注:本文中的org.apache.lucene.analysis.core.KeywordTokenizer类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。