当前位置: 首页>>代码示例>>Java>>正文

Java TokenizerFactory类代码示例

本文整理汇总了Java中org.apache.lucene.analysis.util.TokenizerFactory的典型用法代码示例。如果您正苦于以下问题:Java TokenizerFactory类的具体用法?Java TokenizerFactory怎么用?Java TokenizerFactory使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。


示例1: testBigramTokenizer

import org.apache.lucene.analysis.util.TokenizerFactory; //导入依赖的package包/类
public void testBigramTokenizer() throws Exception {
  SlowSynonymMap synMap;

  // prepare bi-gram tokenizer factory
  Map<String, String> args = new HashMap<>();
  args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, "4.4");
  TokenizerFactory tf = new NGramTokenizerFactory(args);
  // (ab)->(bc)->(cd)->[ef][fg][gh]
  List<String> rules = new ArrayList<>();
  rules.add( "abcd=>efgh" );
  synMap = new SlowSynonymMap( true );
  SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf);
  assertEquals( 1, synMap.submap.size() );
  assertEquals( 1, getSubSynonymMap( synMap, "ab" ).submap.size() );
  assertEquals( 1, getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ).submap.size() );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "ef" );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "fg" );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "gh" );

示例2: doTestTokenizer

import org.apache.lucene.analysis.util.TokenizerFactory; //导入依赖的package包/类
private void doTestTokenizer(String tokenizer) throws IOException {
  Class<? extends TokenizerFactory> factoryClazz = TokenizerFactory.lookupClass(tokenizer);
  TokenizerFactory factory = (TokenizerFactory) initialize(factoryClazz);
  if (factory != null) {
    // we managed to fully create an instance. check a few more things:
    // if it implements MultiTermAware, sanity check its impl
    if (factory instanceof MultiTermAwareComponent) {
      AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
      // its not ok to return e.g. a charfilter here: but a tokenizer could wrap a filter around it
      assertFalse(mtc instanceof CharFilterFactory);
    // beast it just a little, it shouldnt throw exceptions:
    // (it should have thrown them in initialize)
    checkRandomData(random(), new FactoryAnalyzer(factory, null, null), 100, 20, false, false);

示例3: testBigramTokenizer

import org.apache.lucene.analysis.util.TokenizerFactory; //导入依赖的package包/类
public void testBigramTokenizer() throws Exception {
  SlowSynonymMap synMap;

  // prepare bi-gram tokenizer factory
  TokenizerFactory tf = new NGramTokenizerFactory();
  Map<String, String> args = new HashMap<String, String>();
  tf.init( args );

  // (ab)->(bc)->(cd)->[ef][fg][gh]
  List<String> rules = new ArrayList<String>();
  rules.add( "abcd=>efgh" );
  synMap = new SlowSynonymMap( true );
  SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf);
  assertEquals( 1, synMap.submap.size() );
  assertEquals( 1, getSubSynonymMap( synMap, "ab" ).submap.size() );
  assertEquals( 1, getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ).submap.size() );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "ef" );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "fg" );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "gh" );

示例4: doTestTokenizer

import org.apache.lucene.analysis.util.TokenizerFactory; //导入依赖的package包/类
private void doTestTokenizer(String tokenizer) throws IOException {
  TokenizerFactory factory = TokenizerFactory.forName(tokenizer);
  if (initialize(factory)) {
    // we managed to fully create an instance. check a few more things:
    // if it implements MultiTermAware, sanity check its impl
    if (factory instanceof MultiTermAwareComponent) {
      AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
      // its not ok to return e.g. a charfilter here: but a tokenizer could wrap a filter around it
      assertFalse(mtc instanceof CharFilterFactory);
    // beast it just a little, it shouldnt throw exceptions:
    // (it should have thrown them in initialize)
    checkRandomData(random(), new FactoryAnalyzer(factory, null, null), 100, 20, false, false);

示例5: testBigramTokenizer

import org.apache.lucene.analysis.util.TokenizerFactory; //导入依赖的package包/类
public void testBigramTokenizer() throws Exception {
  SlowSynonymMap synMap;

  // prepare bi-gram tokenizer factory
  Map<String, String> args = new HashMap<String, String>();
  args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, "4.4");
  TokenizerFactory tf = new NGramTokenizerFactory(args);
  // (ab)->(bc)->(cd)->[ef][fg][gh]
  List<String> rules = new ArrayList<String>();
  rules.add( "abcd=>efgh" );
  synMap = new SlowSynonymMap( true );
  SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf);
  assertEquals( 1, synMap.submap.size() );
  assertEquals( 1, getSubSynonymMap( synMap, "ab" ).submap.size() );
  assertEquals( 1, getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ).submap.size() );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "ef" );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "fg" );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "gh" );

示例6: reloadLuceneSPI

import org.apache.lucene.analysis.util.TokenizerFactory; //导入依赖的package包/类
/**
 * Reloads all Lucene SPI implementations using the new classloader.
 * This method must be called after the new classloader has been created to
 * register the services for use.
 */
static void reloadLuceneSPI(ClassLoader loader) {
    // do NOT change the order of these method calls!

    // Codecs:
    // Analysis:

示例7: testCreate

import org.apache.lucene.analysis.util.TokenizerFactory; //导入依赖的package包/类
/**
 * Creates a tokenizer from a {@code HanLPTokenizerFactory} configured with
 * {@code enableTraditionalChineseMode=true}, feeds it a Traditional Chinese sample text
 * and prints offsets, position increment, term and type for every token.
 *
 * NOTE(review): this excerpt is truncated. The method and loop braces are absent and the
 * string concatenation passed to {@code setReader} breaks off after its sixth fragment
 * (no final fragment and no closing parentheses). Restore from the original source
 * before compiling.
 */
public void testCreate() throws Exception
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    TokenizerFactory factory = new HanLPTokenizerFactory(args);
    Tokenizer tokenizer = factory.create(null);

    // sample input; the literal is incomplete in this excerpt (see note above)
    tokenizer.setReader(new StringReader("大衛貝克漢不僅僅是名著名球員,球場以外,其妻為前" +
                                                 "辣妹合唱團成員維多利亞·碧咸,亦由於他擁有" +
                                                 "突出外表、百變髮型及正面的形象,以至自己" +
                                                 "品牌的男士香水等商品,及長期擔任運動品牌" +
                                                 "Adidas的代言人,因此對大眾傳播媒介和時尚界" +
                                                 "等方面都具很大的影響力,在足球圈外所獲得的" +
    // one iteration per token emitted by the tokenizer
    while (tokenizer.incrementToken())
        // term text
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        // output format: [start:end positionIncrement] term/type
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());

示例8: lookupAnalysisClass

import org.apache.lucene.analysis.util.TokenizerFactory; //导入依赖的package包/类
/**
 * This method looks up a class with its fully qualified name (FQN), or a short-name
 * class-simplename, or with a package suffix, assuming "org.apache.lucene.analysis."
 * as the package prefix (e.g. "standard.ClassicTokenizerFactory" ->
 * "org.apache.lucene.analysis.standard.ClassicTokenizerFactory").
 * If className contains a period, the class is first looked up as-is, assuming that it
 * is an FQN.  If this fails, lookup is retried after prepending the Lucene analysis
 * package prefix to the class name.
 * If className does not contain a period, the analysis SPI *Factory.lookupClass()
 * methods are used to find the class.
 * @param className The name or the short name of the class.
 * @param expectedType The superclass className is expected to extend
 * @return the loaded class.
 * @throws ClassNotFoundException if lookup fails
 */
public <T> Class<? extends T> lookupAnalysisClass(String className, Class<T> expectedType)
    throws ClassNotFoundException {
  if (className.contains(".")) {
    try {
      // First, try className == FQN
      return Class.forName(className).asSubclass(expectedType);
    } catch (ClassNotFoundException e) {
      try {
        // Second, retry lookup after prepending the Lucene analysis package prefix
        return Class.forName(LUCENE_ANALYSIS_PACKAGE_PREFIX + className).asSubclass(expectedType);
      } catch (ClassNotFoundException e1) {
        throw new ClassNotFoundException("Can't find class '" + className
                                         + "' or '" + LUCENE_ANALYSIS_PACKAGE_PREFIX + className + "'");
  // No dot - use analysis SPI lookup
  final String analysisComponentName = ANALYSIS_COMPONENT_SUFFIX_PATTERN.matcher(className).replaceFirst("");
  if (CharFilterFactory.class.isAssignableFrom(expectedType)) {
    return CharFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
  } else if (TokenizerFactory.class.isAssignableFrom(expectedType)) {
    return TokenizerFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
  } else if (TokenFilterFactory.class.isAssignableFrom(expectedType)) {
    return TokenFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType);

  throw new ClassNotFoundException("Can't find class '" + className + "'");

示例9: AnalyzerFactory

import org.apache.lucene.analysis.util.TokenizerFactory; //导入依赖的package包/类
/**
 * Stores the component factories of an analysis chain.
 *
 * @param charFilterFactories  char filter factories, stored as given
 * @param tokenizerFactory     tokenizer factory; must not be {@code null} (checked with
 *                             {@code assert}, so only enforced when assertions are enabled)
 * @param tokenFilterFactories token filter factories, stored as given
 */
public AnalyzerFactory(List<CharFilterFactory> charFilterFactories,
                       TokenizerFactory tokenizerFactory,
                       List<TokenFilterFactory> tokenFilterFactories) {
  this.charFilterFactories = charFilterFactories;
  assert null != tokenizerFactory;
  this.tokenizerFactory = tokenizerFactory;
  this.tokenFilterFactories = tokenFilterFactories;
}

示例10: testSimple

import org.apache.lucene.analysis.util.TokenizerFactory; //导入依赖的package包/类
/**
 * Test showing the behavior: builds a tokenizer from an {@code HMMChineseTokenizerFactory}
 * created with an empty argument map, over a short Simplified Chinese sentence.
 *
 * NOTE(review): this excerpt is truncated. The statement that owns the trailing
 * {@code new String[] {...}} argument (presumably an assertion on the token stream;
 * confirm against the original) and the closing brace of the method are missing.
 */
public void testSimple() throws Exception {
  Reader reader = new StringReader("我购买了道具和服装。");
  TokenizerFactory factory = new HMMChineseTokenizerFactory(new HashMap<String,String>());
  Tokenizer tokenizer = factory.create(newAttributeFactory(), reader);
  // TODO: fix smart chinese to not emit punctuation tokens
  // at the moment: you have to clean up with WDF, or use the stoplist, etc
  // token list: six words followed by a "," entry for the punctuation token mentioned in the TODO
     new String[] { "我", "购买", "了", "道具", "和", "服装", "," });

示例11: test

import org.apache.lucene.analysis.util.TokenizerFactory; //导入依赖的package包/类
/**
 * Iterates over every tokenizer, token filter and char filter name reported by
 * {@code TokenizerFactory.availableTokenizers()},
 * {@code TokenFilterFactory.availableTokenFilters()} and
 * {@code CharFilterFactory.availableCharFilters()}.
 *
 * NOTE(review): the loop bodies and every closing brace are missing from this excerpt.
 * As shown, the three loops would nest rather than run one after another; the original
 * presumably runs them sequentially with a per-name check in each. Confirm before use.
 */
public void test() throws IOException {
  for (String tokenizer : TokenizerFactory.availableTokenizers()) {
  for (String tokenFilter : TokenFilterFactory.availableTokenFilters()) {
  for (String charFilter : CharFilterFactory.availableCharFilters()) {

示例12: reloadLuceneSPI

import org.apache.lucene.analysis.util.TokenizerFactory; //导入依赖的package包/类
/**
 * Reloads all Lucene SPI implementations using the new classloader.
 * This method must be called after {@link #addToClassLoader(String, FileFilter, boolean)}
 * and before using this ResourceLoader.
 */
void reloadLuceneSPI() {
  // Codecs:
  // Analysis:

示例13: index

import org.apache.lucene.analysis.util.TokenizerFactory; //导入依赖的package包/类
/**
 * Submits a task to {@code executor} that indexes the given songs into a fresh
 * {@code RAMDirectory} and hands that directory to {@code putIndex} under
 * {@code indexType}.
 *
 * NOTE(review): this excerpt is truncated. The analyzer builder chain has no terminating
 * call or semicolon, the loop never uses {@code document} (a call adding it to the writer
 * is presumably missing), and most closing braces are absent. Restore from the original
 * source before compiling.
 *
 * @param indexType key under which the finished directory is registered via {@code putIndex}
 * @param songs     songs to index
 */
public void index(final IndexType indexType, final Collection<Song> songs) {
	executor.execute(new Runnable() {
		public void run() {
			// time the whole run; the elapsed time is logged in the finally block
			Stopwatch stopwatch = Stopwatch.createStarted();
			Directory directory = new RAMDirectory();
			try {
				LOG.debug("available tokenizers: {}", TokenizerFactory.availableTokenizers());
				LOG.debug("available token filters: {}", TokenFilterFactory.availableTokenFilters());
				// analyzer with an "ngram" token filter, gram sizes 1..25
				Analyzer analyzer = CustomAnalyzer.builder()
					.addTokenFilter("ngram", "minGramSize", "1", "maxGramSize", "25")
				IndexWriterConfig config = new IndexWriterConfig(analyzer);
				// try-with-resources closes the writer when the block exits
				try (IndexWriter writer = new IndexWriter(directory, config)) {
					for (Song song : songs) {
						Document document = createDocument(song);
						// remember the song under its UUID
						songByUuid.put(song.getUUID(), song);
				} catch (IOException e) {
					LOG.warn("couldn't index songs", e);
			} catch (IOException e1) {
				LOG.warn("couldn't create analyzer", e1);
			} finally {
				// runs even after a failure, so the directory is always registered
				putIndex(indexType, directory);
				LOG.info("indexing songs in background thread took {}", stopwatch.toString());

示例14: add

import org.apache.lucene.analysis.util.TokenizerFactory; //导入依赖的package包/类
/**
 * Takes the multi-term component of {@code current} and files it by kind: token filter
 * factories are appended to {@code filters}, a tokenizer factory is assigned to
 * {@code tokenizer}, and char filter factories are appended to {@code charFilters}.
 * Both lists are created on first use.
 *
 * NOTE(review): this excerpt is truncated. All braces are missing; the first {@code if}
 * has lost its body (presumably an early return for objects that are not
 * MultiTermAwareComponent), and the final {@code throw} has lost its guard (presumably
 * a trailing {@code else} for unknown component types). Confirm against the original.
 *
 * @param current candidate analysis component
 */
public void add(Object current)
    if (!(current instanceof MultiTermAwareComponent))
    AbstractAnalysisFactory newComponent = ((MultiTermAwareComponent) current).getMultiTermComponent();
    if (newComponent instanceof TokenFilterFactory)
        // create the filter list on first use
        if (filters == null)
            filters = new ArrayList<TokenFilterFactory>(2);
        filters.add((TokenFilterFactory) newComponent);
    else if (newComponent instanceof TokenizerFactory)
        // plain assignment: a later tokenizer factory replaces an earlier one
        tokenizer = (TokenizerFactory) newComponent;
    else if (newComponent instanceof CharFilterFactory)
        // create the char filter list on first use
        if (charFilters == null)
            charFilters = new ArrayList<CharFilterFactory>(1);
        charFilters.add((CharFilterFactory) newComponent);

        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown analysis component from MultiTermAwareComponent: " + newComponent);

示例15: getDictionaryByFieldType

import org.apache.lucene.analysis.util.TokenizerFactory; //导入依赖的package包/类
private Dictionary getDictionaryByFieldType(String fieldTypeName) {
	FieldType ft = h.getCore().getLatestSchema().getFieldTypeByName(fieldTypeName);
	Analyzer a = ft.getIndexAnalyzer();
	Assert.assertEquals(a.getClass(), TokenizerChain.class);
	TokenizerChain tc = (TokenizerChain) a;
	TokenizerFactory tf = tc.getTokenizerFactory();
	Assert.assertEquals(tf.getClass(), MMSegTokenizerFactory.class);
	MMSegTokenizerFactory mtf = (MMSegTokenizerFactory) tf;
	return mtf.dic;
