当前位置: 首页>>代码示例>>Java>>正文


Java StandardTokenizer类代码示例

本文整理汇总了Java中org.apache.lucene.analysis.standard.StandardTokenizer的典型用法代码示例。如果您正苦于以下问题：Java StandardTokenizer类的具体用法？Java StandardTokenizer怎么用？Java StandardTokenizer使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。


StandardTokenizer类属于org.apache.lucene.analysis.standard包,在下文中一共展示了StandardTokenizer类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: reusableTokenStream

import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Returns the analysis chain remembered from an earlier call, or builds and remembers a new
 * one when none has been stored yet.
 *
 * <p>A new chain is StandardTokenizer, then StandardFilter, LowerCaseFilter, StopFilter (using
 * {@code STOP_WORDS_SET}) and ASCIIFoldingFilter. On reuse only the tokenizer is reset onto the
 * new reader. In both cases the tokenizer's maximum token length is set to
 * {@code DEFAULT_MAX_TOKEN_LENGTH} before returning.
 *
 * @param fieldName name of the field being analyzed (not used by this implementation)
 * @param reader    source of the text to tokenize
 * @return the outermost filter of the chain
 * @throws IOException declared by the overridden method
 */
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    // Holder pairing the source tokenizer with the outermost filter wrapped around it.
    class SavedStreams {
        StandardTokenizer tokenStream;
        TokenStream filteredTokenStream;
    }

    SavedStreams cached = (SavedStreams) getPreviousTokenStream();
    if (cached != null) {
        // A chain already exists: only the tokenizer needs to be pointed at the new input.
        cached.tokenStream.reset(reader);
    } else {
        cached = new SavedStreams();
        setPreviousTokenStream(cached);
        cached.tokenStream = new StandardTokenizer(LUCENE_VERSION, reader);

        TokenStream chain = new StandardFilter(cached.tokenStream);
        chain = new LowerCaseFilter(chain);
        chain = new StopFilter(true, chain, STOP_WORDS_SET);
        chain = new ASCIIFoldingFilter(chain);
        cached.filteredTokenStream = chain;
    }
    cached.tokenStream.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);

    return cached.filteredTokenStream;
}
 
开发者ID:airsonic,项目名称:airsonic,代码行数:24,代码来源:SearchService.java

示例2: testHanOnly

import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Checks CJKBigramFilter configured with only the {@code CJKBigramFilter.HAN} flag on top of a
 * StandardTokenizer: the expectations below pair up adjacent Han characters as
 * {@code <DOUBLE>} tokens while every Hiragana character stays a token of its own.
 */
public void testHanOnly() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer t = new StandardTokenizer(reader);
      return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN));
    }
  };
  assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
      // expected terms (10 tokens)
      new String[] { "多", "く", "の",  "学生", "が",  "試験", "に",  "落", "ち", "た" },
      // expected start offsets
      new int[] { 0, 1, 2, 3, 5, 6, 8, 9, 10, 11 },
      // expected end offsets
      new int[] { 1, 2, 3, 5, 6, 8, 9, 10, 11, 12 },
      // expected token types: exactly one per expected term (a stray 11th entry was removed)
      new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<DOUBLE>", "<HIRAGANA>", "<DOUBLE>",
                     "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>" },
      // expected position increments
      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
      // expected position lengths
      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
}
 
开发者ID:europeana,项目名称:search,代码行数:18,代码来源:TestCJKBigramFilter.java

示例3: testUnigramsAndBigramsHanOnly

import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Checks CJKBigramFilter configured with the {@code CJKBigramFilter.HAN} flag and the extra
 * boolean argument set to {@code true}: the expectations below contain each Han bigram
 * together with its two single-character tokens, while Hiragana characters stay single tokens.
 */
public void testUnigramsAndBigramsHanOnly() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer t = new StandardTokenizer(reader);
      return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN, true));
    }
  };
  assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
      // expected terms (14 tokens)
      new String[] { "多", "く", "の",  "学", "学生", "生", "が",  "試", "試験", "験", "に",  "落", "ち", "た" },
      // expected start offsets
      new int[] { 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11 },
      // expected end offsets
      new int[] { 1, 2, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10, 11, 12 },
      // expected token types: exactly one per expected term (a stray 15th entry was removed)
      new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>",
                     "<SINGLE>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", "<SINGLE>",
                     "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>" },
      // expected position increments (0 for each bigram, which shares its first unigram's position)
      new int[] { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1 },
      // expected position lengths (2 for each bigram)
      new int[] { 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1 });
}
 
开发者ID:europeana,项目名称:search,代码行数:19,代码来源:TestCJKBigramFilter.java

示例4: testStopPositons

import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Exercises the position increments produced by TypeTokenFilter when {@code <NUM>} tokens are
 * filtered out, once through the constructor without a boolean flag and once with the flag
 * passed as {@code false}.
 */
public void testStopPositons() throws IOException {
  StringBuilder text = new StringBuilder();
  for (int i = 10; i < 20; i++) {
    // Multiples of three are written out in English; every other number stays as digits.
    String word = (i % 3 == 0) ? English.intToEnglish(i).trim() : String.valueOf(i);
    text.append(word).append(" ");
  }
  final String input = text.toString();
  log(input);
  Set<String> stopSet = asSet(new String[]{"<NUM>"});

  // with increments
  TypeTokenFilter typeTokenFilter =
      new TypeTokenFilter(Version.LATEST, new StandardTokenizer(new StringReader(input)), stopSet);
  testPositons(typeTokenFilter);

  // without increments
  typeTokenFilter = new TypeTokenFilter(Version.LUCENE_4_3, false,
      new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(input)), stopSet);
  testPositons(typeTokenFilter);
}
 
开发者ID:europeana,项目名称:search,代码行数:29,代码来源:TestTypeTokenFilter.java

示例5: stemWords

import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Stem a list of words with a configured stemmer.
 *
 * <p>Each word is tokenized with a StandardTokenizer, passed through
 * {@code Stemming.wrapStemmingFilter}, and every term the stream emits is collected. The terms
 * are gathered in a set, so duplicates collapse; the set comes from
 * {@code Sets.newHashSet()}, so the order of the returned array should not be relied upon.
 *
 * <p>An {@link IOException} raised while consuming or closing a stream is logged and
 * processing continues with the next word.
 *
 * @param words
 *            The list of words to stem.
 * @param stemming
 *            The stemmer to be used.
 * @return The stemmed list of words.
 */
// The tokenizer itself is only closed indirectly through the stream returned by
// wrapStemmingFilter (presumably a filter wrapping it), which static resource analysis
// cannot see; hence the suppression stays.
@SuppressWarnings("resource")
public static String[] stemWords(String[] words, Stemming stemming) {
    Set<String> stemmedWords = Sets.newHashSet();

    for (String word : words) {
        TokenStream tokenStream = new StandardTokenizer(LUCENE_VERSION, new StringReader(word));
        tokenStream = Stemming.wrapStemmingFilter(tokenStream, stemming);

        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        try {
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                stemmedWords.add(charTermAttribute.toString());
            }
            // Complete the TokenStream consumer workflow: end() after the last token ...
            tokenStream.end();
        } catch (IOException e) {
            logger.error("Failed to stem a list of words", e);
        } finally {
            // ... and always release the stream, even when consumption failed.
            try {
                tokenStream.close();
            } catch (IOException e) {
                logger.error("Failed to close the token stream", e);
            }
        }
    }
    return stemmedWords.toArray(new String[] {});
}
 
开发者ID:kopl,项目名称:SPLevo,代码行数:31,代码来源:LuceneCodeAnalyzer.java

示例6: testHanOnly

import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Checks CJKBigramFilter configured with only the {@code CJKBigramFilter.HAN} flag on top of a
 * StandardTokenizer: the expectations below pair up adjacent Han characters as
 * {@code <DOUBLE>} tokens while every Hiragana character stays a token of its own.
 */
public void testHanOnly() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
      return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN));
    }
  };
  assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
      // expected terms (10 tokens)
      new String[] { "多", "く", "の",  "学生", "が",  "試験", "に",  "落", "ち", "た" },
      // expected start offsets
      new int[] { 0, 1, 2, 3, 5, 6, 8, 9, 10, 11 },
      // expected end offsets
      new int[] { 1, 2, 3, 5, 6, 8, 9, 10, 11, 12 },
      // expected token types: exactly one per expected term (a stray 11th entry was removed)
      new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<DOUBLE>", "<HIRAGANA>", "<DOUBLE>",
                     "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>" },
      // expected position increments
      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
      // expected position lengths
      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
}
 
开发者ID:pkarmstr,项目名称:NYBC,代码行数:18,代码来源:TestCJKBigramFilter.java

示例7: testUnigramsAndBigramsHanOnly

import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Checks CJKBigramFilter configured with the {@code CJKBigramFilter.HAN} flag and the extra
 * boolean argument set to {@code true}: the expectations below contain each Han bigram
 * together with its two single-character tokens, while Hiragana characters stay single tokens.
 */
public void testUnigramsAndBigramsHanOnly() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
      return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN, true));
    }
  };
  assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
      // expected terms (14 tokens)
      new String[] { "多", "く", "の",  "学", "学生", "生", "が",  "試", "試験", "験", "に",  "落", "ち", "た" },
      // expected start offsets
      new int[] { 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11 },
      // expected end offsets
      new int[] { 1, 2, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10, 11, 12 },
      // expected token types: exactly one per expected term (a stray 15th entry was removed)
      new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>",
                     "<SINGLE>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", "<SINGLE>",
                     "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>" },
      // expected position increments (0 for each bigram, which shares its first unigram's position)
      new int[] { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1 },
      // expected position lengths (2 for each bigram)
      new int[] { 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1 });
}
 
开发者ID:pkarmstr,项目名称:NYBC,代码行数:19,代码来源:TestCJKBigramFilter.java

示例8: testStopPositons

import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Exercises the position increments produced by TypeTokenFilter when {@code <NUM>} tokens are
 * filtered out, first with the boolean constructor flag set to {@code true} and then with it
 * set to {@code false}.
 */
public void testStopPositons() throws IOException {
  StringBuilder text = new StringBuilder();
  for (int i = 10; i < 20; i++) {
    // Multiples of three are written out in English; every other number stays as digits.
    String word = (i % 3 == 0) ? English.intToEnglish(i).trim() : String.valueOf(i);
    text.append(word).append(" ");
  }
  final String input = text.toString();
  log(input);
  Set<String> stopSet = asSet(new String[]{"<NUM>"});

  // First pass: with increments (true); second pass: without increments (false).
  for (boolean withIncrements : new boolean[] {true, false}) {
    TypeTokenFilter typeTokenFilter = new TypeTokenFilter(withIncrements,
        new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(input)), stopSet);
    testPositons(typeTokenFilter);
  }
}
 
开发者ID:pkarmstr,项目名称:NYBC,代码行数:29,代码来源:TestTypeTokenFilter.java

示例9: testTwo

import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Runs the token filter registered as "baseform" over a German sentence tokenized by
 * StandardTokenizer and compares the output with the expected term sequence.
 */
@Test
public void testTwo() throws IOException {
    TokenFilterFactory baseformFactory = createAnalysisService().tokenFilter("baseform");

    final String source = "Das sind Autos, die Nudeln transportieren.";

    // Expected output lists two terms per source word: the word as written, then a second
    // form that sometimes differs (e.g. "Autos" -> "Auto", "die" -> "der").
    final String[] expected = {
            "Das", "Das",
            "sind", "sind",
            "Autos", "Auto",
            "die", "der",
            "Nudeln", "Nudel",
            "transportieren", "transportieren"
    };

    Tokenizer input = new StandardTokenizer(Version.LATEST, new StringReader(source));
    assertSimpleTSOutput(baseformFactory.create(input), expected);
}
 
开发者ID:jprante,项目名称:elasticsearch-analysis-german,代码行数:26,代码来源:BaseformTokenFilterTests.java

示例10: testStopPositons

import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Exercises the position increments produced by TypeTokenFilter when {@code <NUM>} tokens are
 * filtered out, once through the constructor without a boolean flag and once with the flag
 * passed as {@code false}.
 */
public void testStopPositons() throws IOException {
  StringBuilder text = new StringBuilder();
  for (int number = 10; number < 20; number++) {
    if (number % 3 == 0) {
      // Multiples of three are written out in English.
      text.append(English.intToEnglish(number).trim());
    } else {
      text.append(number);
    }
    text.append(" ");
  }
  final String input = text.toString();
  log(input);
  Set<String> stopSet = asSet(new String[]{"<NUM>"});

  // with increments
  TypeTokenFilter typeTokenFilter = new TypeTokenFilter(TEST_VERSION_CURRENT,
      new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(input)), stopSet);
  testPositons(typeTokenFilter);

  // without increments
  typeTokenFilter = new TypeTokenFilter(Version.LUCENE_43, false,
      new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(input)), stopSet);
  testPositons(typeTokenFilter);
}
 
开发者ID:jimaguere,项目名称:Maskana-Gestor-de-Conocimiento,代码行数:29,代码来源:TestTypeTokenFilter.java

示例11: testKeepTypes

import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Configures a "keep_types" token filter named "keep_numbers" that keeps the types
 * {@code <NUM>} and {@code <SOMETHINGELSE>}, and checks that only "123" survives from
 * "Hello 123 world".
 */
public void testKeepTypes() throws IOException {
    Settings indexSettings = Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.keep_numbers.type", "keep_types")
            .putArray("index.analysis.filter.keep_numbers.types", new String[] {"<NUM>", "<SOMETHINGELSE>"})
            .build();
    TokenFilterFactory keepNumbers = AnalysisTestsHelper
            .createTestAnalysisFromSettings(indexSettings)
            .tokenFilter.get("keep_numbers");
    assertThat(keepNumbers, instanceOf(KeepTypesFilterFactory.class));

    Tokenizer input = new StandardTokenizer();
    input.setReader(new StringReader("Hello 123 world"));

    final String[] expectedTerms = {"123"};
    // "123" is the second word of the input; presumably the int array holds the expected
    // position increments, so the dropped "Hello" shows up as an increment of 2.
    final int[] expectedIncrements = {2};
    assertTokenStreamContents(keepNumbers.create(input), expectedTerms, expectedIncrements);
}
 
开发者ID:justor,项目名称:elasticsearch_my,代码行数:16,代码来源:KeepTypesFilterFactoryTests.java

示例12: testDefault

import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Checks the "cjk_bigram" token filter from the test resource: the expected output is every
 * overlapping pair of adjacent characters of the Japanese sample sentence.
 */
public void testDefault() throws IOException {
    final String source = "多くの学生が試験に落ちた。";
    final String[] expected = {"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" };

    TokenFilterFactory cjkBigram = AnalysisTestsHelper
            .createTestAnalysisFromClassPath(createTempDir(), RESOURCE)
            .tokenFilter.get("cjk_bigram");

    Tokenizer input = new StandardTokenizer();
    input.setReader(new StringReader(source));
    assertTokenStreamContents(cjkBigram.create(input), expected);
}
 
开发者ID:justor,项目名称:elasticsearch_my,代码行数:10,代码来源:CJKFilterFactoryTests.java

示例13: testNoFlags

import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Checks the "cjk_no_flags" token filter from the test resource: the expected output is every
 * overlapping pair of adjacent characters of the Japanese sample sentence.
 */
public void testNoFlags() throws IOException {
    final String source = "多くの学生が試験に落ちた。";
    final String[] expected = {"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" };

    TokenFilterFactory cjkNoFlags = AnalysisTestsHelper
            .createTestAnalysisFromClassPath(createTempDir(), RESOURCE)
            .tokenFilter.get("cjk_no_flags");

    Tokenizer input = new StandardTokenizer();
    input.setReader(new StringReader(source));
    assertTokenStreamContents(cjkNoFlags.create(input), expected);
}
 
开发者ID:justor,项目名称:elasticsearch_my,代码行数:10,代码来源:CJKFilterFactoryTests.java

示例14: testHanOnly

import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Checks the "cjk_han_only" token filter from the test resource: in the expected output only
 * adjacent Han characters are paired, and every other character stays a single token.
 */
public void testHanOnly() throws IOException {
    final String source = "多くの学生が試験に落ちた。";
    final String[] expected = {"多", "く", "の",  "学生", "が",  "試験", "に",  "落", "ち", "た"  };

    TokenFilterFactory cjkHanOnly = AnalysisTestsHelper
            .createTestAnalysisFromClassPath(createTempDir(), RESOURCE)
            .tokenFilter.get("cjk_han_only");

    Tokenizer input = new StandardTokenizer();
    input.setReader(new StringReader(source));
    assertTokenStreamContents(cjkHanOnly.create(input), expected);
}
 
开发者ID:justor,项目名称:elasticsearch_my,代码行数:10,代码来源:CJKFilterFactoryTests.java

示例15: testHanUnigramOnly

import org.apache.lucene.analysis.standard.StandardTokenizer; //导入依赖的package包/类
/**
 * Checks the "cjk_han_unigram_only" token filter from the test resource: in the expected
 * output each pair of adjacent Han characters appears both as a pair and as its two single
 * characters, and every other character stays a single token.
 */
public void testHanUnigramOnly() throws IOException {
    final String source = "多くの学生が試験に落ちた。";
    final String[] expected = {"多", "く", "の",  "学", "学生", "生", "が",  "試", "試験", "験", "に",  "落", "ち", "た"  };

    TokenFilterFactory cjkHanUnigram = AnalysisTestsHelper
            .createTestAnalysisFromClassPath(createTempDir(), RESOURCE)
            .tokenFilter.get("cjk_han_unigram_only");

    Tokenizer input = new StandardTokenizer();
    input.setReader(new StringReader(source));
    assertTokenStreamContents(cjkHanUnigram.create(input), expected);
}
 
开发者ID:justor,项目名称:elasticsearch_my,代码行数:10,代码来源:CJKFilterFactoryTests.java


注:本文中的org.apache.lucene.analysis.standard.StandardTokenizer类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。