

Java TokenizerFactory Class Code Examples

This article collects typical usage examples of the Java class org.apache.lucene.analysis.util.TokenizerFactory. If you are asking what the TokenizerFactory class is for, how to use it, or what real-world code using it looks like, the curated examples below should help.


The TokenizerFactory class belongs to the org.apache.lucene.analysis.util package. Fifteen code examples of the class are shown below, ordered by popularity.
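
Before the individual examples, here is a minimal end-to-end sketch of the pattern most of them share: resolve a TokenizerFactory by its SPI name, create a Tokenizer, and walk the token stream. This is an illustrative sketch, assuming the Lucene 4.x API used by the majority of the examples below (the forName(name, args) overload, the luceneMatchVersion parameter, and create(Reader) are all version-dependent):

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.TokenizerFactory;

public class TokenizerFactoryDemo {
  public static void main(String[] args) throws Exception {
    // resolve the factory by SPI name ("standard" -> StandardTokenizerFactory)
    Map<String, String> params = new HashMap<>();
    params.put("luceneMatchVersion", "4.4");
    TokenizerFactory factory = TokenizerFactory.forName("standard", params);

    // tokenize a sample string and print each term
    Tokenizer tokenizer = factory.create(new StringReader("Lucene in Action"));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      System.out.println(term.toString());
    }
    tokenizer.end();
    tokenizer.close();
  }
}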

Example 1: testBigramTokenizer

import org.apache.lucene.analysis.util.TokenizerFactory; // import the required package/class
public void testBigramTokenizer() throws Exception {
  SlowSynonymMap synMap;

  // prepare bi-gram tokenizer factory
  Map<String, String> args = new HashMap<>();
  args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, "4.4");
  args.put("minGramSize","2");
  args.put("maxGramSize","2");
  TokenizerFactory tf = new NGramTokenizerFactory(args);
  
  // (ab)->(bc)->(cd)->[ef][fg][gh]
  List<String> rules = new ArrayList<>();
  rules.add( "abcd=>efgh" );
  synMap = new SlowSynonymMap( true );
  SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf);
  assertEquals( 1, synMap.submap.size() );
  assertEquals( 1, getSubSynonymMap( synMap, "ab" ).submap.size() );
  assertEquals( 1, getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ).submap.size() );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "ef" );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "fg" );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "gh" );
}
 
Developer: europeana, Project: search, Lines: 23, Source: TestSynonymMap.java
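
To see why the rule "abcd=>efgh" produces the (ab)->(bc)->(cd) chain checked above, it helps to run the same bigram factory directly on the left-hand side of the rule. A minimal sketch reusing the test's configuration and imports (token order as emitted by Lucene 4.4's NGramTokenizer):

Map<String, String> args = new HashMap<>();
args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, "4.4");
args.put("minGramSize", "2");
args.put("maxGramSize", "2");
Tokenizer t = new NGramTokenizerFactory(args).create(new StringReader("abcd"));
CharTermAttribute term = t.addAttribute(CharTermAttribute.class);
t.reset();
while (t.incrementToken()) {
  System.out.print(term + " "); // prints: ab bc cd
}
t.end();
t.close();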

Example 2: doTestTokenizer

import org.apache.lucene.analysis.util.TokenizerFactory; // import the required package/class
private void doTestTokenizer(String tokenizer) throws IOException {
  Class<? extends TokenizerFactory> factoryClazz = TokenizerFactory.lookupClass(tokenizer);
  TokenizerFactory factory = (TokenizerFactory) initialize(factoryClazz);
  if (factory != null) {
    // we managed to fully create an instance. check a few more things:
    
    // if it implements MultiTermAware, sanity check its impl
    if (factory instanceof MultiTermAwareComponent) {
      AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
      assertNotNull(mtc);
      // it's not OK to return e.g. a CharFilter here, but a tokenizer could wrap a filter around it
      assertFalse(mtc instanceof CharFilterFactory);
    }
    
    // beast it just a little; it shouldn't throw exceptions:
    // (it should have thrown them in initialize)
    checkRandomData(random(), new FactoryAnalyzer(factory, null, null), 100, 20, false, false);
  }
}
 
Developer: europeana, Project: search, Lines: 20, Source: TestFactories.java

Example 3: testBigramTokenizer

import org.apache.lucene.analysis.util.TokenizerFactory; // import the required package/class
public void testBigramTokenizer() throws Exception {
  SlowSynonymMap synMap;

  // prepare bi-gram tokenizer factory
  // older (pre-Lucene 4.3) factory style: construct first, then configure via init(args)
  TokenizerFactory tf = new NGramTokenizerFactory();
  Map<String, String> args = new HashMap<String, String>();
  args.put("minGramSize","2");
  args.put("maxGramSize","2");
  tf.init( args );

  // (ab)->(bc)->(cd)->[ef][fg][gh]
  List<String> rules = new ArrayList<String>();
  rules.add( "abcd=>efgh" );
  synMap = new SlowSynonymMap( true );
  SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf);
  assertEquals( 1, synMap.submap.size() );
  assertEquals( 1, getSubSynonymMap( synMap, "ab" ).submap.size() );
  assertEquals( 1, getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ).submap.size() );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "ef" );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "fg" );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "gh" );
}
 
Developer: pkarmstr, Project: NYBC, Lines: 23, Source: TestSynonymMap.java

Example 4: doTestTokenizer

import org.apache.lucene.analysis.util.TokenizerFactory; // import the required package/class
private void doTestTokenizer(String tokenizer) throws IOException {
  TokenizerFactory factory = TokenizerFactory.forName(tokenizer);
  if (initialize(factory)) {
    // we managed to fully create an instance. check a few more things:
    
    // if it implements MultiTermAware, sanity check its impl
    if (factory instanceof MultiTermAwareComponent) {
      AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
      assertNotNull(mtc);
      // it's not OK to return e.g. a CharFilter here, but a tokenizer could wrap a filter around it
      assertFalse(mtc instanceof CharFilterFactory);
    }
    
    // beast it just a little; it shouldn't throw exceptions:
    // (it should have thrown them in initialize)
    checkRandomData(random(), new FactoryAnalyzer(factory, null, null), 100, 20, false, false);
  }
}
 
Developer: pkarmstr, Project: NYBC, Lines: 19, Source: TestFactories.java

Example 5: testBigramTokenizer

import org.apache.lucene.analysis.util.TokenizerFactory; // import the required package/class
public void testBigramTokenizer() throws Exception {
  SlowSynonymMap synMap;

  // prepare bi-gram tokenizer factory
  Map<String, String> args = new HashMap<String, String>();
  args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, "4.4");
  args.put("minGramSize","2");
  args.put("maxGramSize","2");
  TokenizerFactory tf = new NGramTokenizerFactory(args);
  
  // (ab)->(bc)->(cd)->[ef][fg][gh]
  List<String> rules = new ArrayList<String>();
  rules.add( "abcd=>efgh" );
  synMap = new SlowSynonymMap( true );
  SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf);
  assertEquals( 1, synMap.submap.size() );
  assertEquals( 1, getSubSynonymMap( synMap, "ab" ).submap.size() );
  assertEquals( 1, getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ).submap.size() );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "ef" );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "fg" );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "gh" );
}
 
Developer: jimaguere, Project: Maskana-Gestor-de-Conocimiento, Lines: 23, Source: TestSynonymMap.java

Example 6: reloadLuceneSPI

import org.apache.lucene.analysis.util.TokenizerFactory; // import the required package/class
/**
 * Reloads all Lucene SPI implementations using the new classloader.
 * This method must be called after the new classloader has been created to
 * register the services for use.
 */
static void reloadLuceneSPI(ClassLoader loader) {
    // do NOT change the order of these method calls!

    // Codecs:
    PostingsFormat.reloadPostingsFormats(loader);
    DocValuesFormat.reloadDocValuesFormats(loader);
    Codec.reloadCodecs(loader);
    // Analysis:
    CharFilterFactory.reloadCharFilters(loader);
    TokenFilterFactory.reloadTokenFilters(loader);
    TokenizerFactory.reloadTokenizers(loader);
}
 
Developer: justor, Project: elasticsearch_my, Lines: 18, Source: PluginsService.java
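
A hypothetical call site for the method above, assuming a plugin JAR at an illustrative path (the URLClassLoader setup is a sketch, not Elasticsearch's actual plugin loading):

import java.net.URL;
import java.net.URLClassLoader;
import java.nio.file.Paths;

URL[] urls = new URL[] { Paths.get("plugins/my-analysis-plugin.jar").toUri().toURL() }; // hypothetical path
ClassLoader pluginLoader = URLClassLoader.newInstance(urls, PluginsService.class.getClassLoader());
// re-scan the SPI registries so analysis components bundled in the JAR
// become resolvable via TokenizerFactory.forName(...) and friends
reloadLuceneSPI(pluginLoader);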

Example 7: testCreate

import org.apache.lucene.analysis.util.TokenizerFactory; // import the required package/class
public void testCreate() throws Exception
{
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    TokenizerFactory factory = new HanLPTokenizerFactory(args);
    Tokenizer tokenizer = factory.create(null);

    tokenizer.setReader(new StringReader("大衛貝克漢不僅僅是名著名球員,球場以外,其妻為前" +
                                                 "辣妹合唱團成員維多利亞·碧咸,亦由於他擁有" +
                                                 "突出外表、百變髮型及正面的形象,以至自己" +
                                                 "品牌的男士香水等商品,及長期擔任運動品牌" +
                                                 "Adidas的代言人,因此對大眾傳播媒介和時尚界" +
                                                 "等方面都具很大的影響力,在足球圈外所獲得的" +
                                                 "認受程度可謂前所未見。"));
    tokenizer.reset();
    while (tokenizer.incrementToken())
    {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
 
Developer: hankcs, Project: hanlp-lucene-plugin, Lines: 28, Source: HanLPTokenizerFactoryTest.java

Example 8: lookupAnalysisClass

import org.apache.lucene.analysis.util.TokenizerFactory; // import the required package/class
/**
 * This method looks up a class with its fully qualified name (FQN), or a short-name
 * class-simplename, or with a package suffix, assuming "org.apache.lucene.analysis."
 * as the package prefix (e.g. "standard.ClassicTokenizerFactory" ->
 * "org.apache.lucene.analysis.standard.ClassicTokenizerFactory").
 *
 * If className contains a period, the class is first looked up as-is, assuming that it
 * is an FQN.  If this fails, lookup is retried after prepending the Lucene analysis
 * package prefix to the class name.
 *
 * If className does not contain a period, the analysis SPI *Factory.lookupClass()
 * methods are used to find the class.
 *
 * @param className The name or the short name of the class.
 * @param expectedType The superclass className is expected to extend
 * @return the loaded class.
 * @throws ClassNotFoundException if lookup fails
 */
public <T> Class<? extends T> lookupAnalysisClass(String className, Class<T> expectedType)
    throws ClassNotFoundException {
  if (className.contains(".")) {
    try {
      // First, try className == FQN
      return Class.forName(className).asSubclass(expectedType);
    } catch (ClassNotFoundException e) {
      try {
        // Second, retry lookup after prepending the Lucene analysis package prefix
        return Class.forName(LUCENE_ANALYSIS_PACKAGE_PREFIX + className).asSubclass(expectedType);
      } catch (ClassNotFoundException e1) {
        throw new ClassNotFoundException("Can't find class '" + className
                                         + "' or '" + LUCENE_ANALYSIS_PACKAGE_PREFIX + className + "'");
      }
    }
  }
  // No dot - use analysis SPI lookup
  final String analysisComponentName = ANALYSIS_COMPONENT_SUFFIX_PATTERN.matcher(className).replaceFirst("");
  if (CharFilterFactory.class.isAssignableFrom(expectedType)) {
    return CharFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
  } else if (TokenizerFactory.class.isAssignableFrom(expectedType)) {
    return TokenizerFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
  } else if (TokenFilterFactory.class.isAssignableFrom(expectedType)) {
    return TokenFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
  }

  throw new ClassNotFoundException("Can't find class '" + className + "'");
}
 
Developer: europeana, Project: search, Lines: 47, Source: AnalyzerFactoryTask.java
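
A hypothetical caller exercising the three resolution paths described in the javadoc, assuming task is an instance of the enclosing AnalyzerFactoryTask (imports as in the example above):

// 1) fully qualified name
Class<? extends TokenizerFactory> a = task.lookupAnalysisClass(
    "org.apache.lucene.analysis.standard.ClassicTokenizerFactory", TokenizerFactory.class);
// 2) package suffix, resolved against "org.apache.lucene.analysis."
Class<? extends TokenizerFactory> b = task.lookupAnalysisClass(
    "standard.ClassicTokenizerFactory", TokenizerFactory.class);
// 3) no dot: the *TokenizerFactory suffix is stripped and the analysis SPI is consulted
Class<? extends TokenizerFactory> c = task.lookupAnalysisClass(
    "ClassicTokenizerFactory", TokenizerFactory.class);
// all three resolve to the same class
assert a == b && b == c;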

Example 9: AnalyzerFactory

import org.apache.lucene.analysis.util.TokenizerFactory; // import the required package/class
public AnalyzerFactory(List<CharFilterFactory> charFilterFactories,
                       TokenizerFactory tokenizerFactory,
                       List<TokenFilterFactory> tokenFilterFactories) {
  this.charFilterFactories = charFilterFactories;
  assert null != tokenizerFactory;
  this.tokenizerFactory = tokenizerFactory;
  this.tokenFilterFactories = tokenFilterFactories;
}
 
Developer: europeana, Project: search, Lines: 9, Source: AnalyzerFactory.java
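
A hypothetical composition using this constructor, assuming the standard SPI names "whitespace" and "lowercase" are on the classpath and the Lucene 4.x forName(name, args) overload; each factory consumes its own args map, so a fresh map is passed per call (imports elided):

Map<String, String> tokArgs = new HashMap<>();
tokArgs.put("luceneMatchVersion", "4.4");
TokenizerFactory tok = TokenizerFactory.forName("whitespace", tokArgs);

Map<String, String> filterArgs = new HashMap<>();
filterArgs.put("luceneMatchVersion", "4.4");
List<TokenFilterFactory> filters =
    Collections.singletonList(TokenFilterFactory.forName("lowercase", filterArgs));

AnalyzerFactory analyzerFactory =
    new AnalyzerFactory(Collections.<CharFilterFactory>emptyList(), tok, filters);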

Example 10: testSimple

import org.apache.lucene.analysis.util.TokenizerFactory; // import the required package/class
/** Test showing the behavior */
public void testSimple() throws Exception {
  Reader reader = new StringReader("我购买了道具和服装。");
  TokenizerFactory factory = new HMMChineseTokenizerFactory(new HashMap<String,String>());
  Tokenizer tokenizer = factory.create(newAttributeFactory(), reader);
  // TODO: fix smart chinese to not emit punctuation tokens
  // at the moment: you have to clean up with WDF, or use the stoplist, etc
  assertTokenStreamContents(tokenizer, 
     new String[] { "我", "购买", "了", "道具", "和", "服装", "," });
}
 
Developer: europeana, Project: search, Lines: 11, Source: TestHMMChineseTokenizerFactory.java

Example 11: test

import org.apache.lucene.analysis.util.TokenizerFactory; // import the required package/class
public void test() throws IOException {
  for (String tokenizer : TokenizerFactory.availableTokenizers()) {
    doTestTokenizer(tokenizer);
  }
  
  for (String tokenFilter : TokenFilterFactory.availableTokenFilters()) {
    doTestTokenFilter(tokenFilter);
  }
  
  for (String charFilter : CharFilterFactory.availableCharFilters()) {
    doTestCharFilter(charFilter);
  }
}
 
Developer: europeana, Project: search, Lines: 14, Source: TestFactories.java
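
For quick diagnostics outside a test, the same SPI registry can be dumped by hand; a minimal sketch printing each registered tokenizer name alongside its implementing class:

for (String name : TokenizerFactory.availableTokenizers()) {
  System.out.println(name + " -> " + TokenizerFactory.lookupClass(name).getName());
}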

Example 12: reloadLuceneSPI

import org.apache.lucene.analysis.util.TokenizerFactory; // import the required package/class
/**
 * Reloads all Lucene SPI implementations using the new classloader.
 * This method must be called after {@link #addToClassLoader(String, FileFilter, boolean)}
 * before using this ResourceLoader.
 */
void reloadLuceneSPI() {
  // Codecs:
  PostingsFormat.reloadPostingsFormats(this.classLoader);
  DocValuesFormat.reloadDocValuesFormats(this.classLoader);
  Codec.reloadCodecs(this.classLoader);
  // Analysis:
  CharFilterFactory.reloadCharFilters(this.classLoader);
  TokenFilterFactory.reloadTokenFilters(this.classLoader);
  TokenizerFactory.reloadTokenizers(this.classLoader);
}
 
Developer: europeana, Project: search, Lines: 17, Source: SolrResourceLoader.java

Example 13: index

import org.apache.lucene.analysis.util.TokenizerFactory; // import the required package/class
@Override
public void index(final IndexType indexType, final Collection<Song> songs) {
	executor.execute(new Runnable() {
		@Override
		public void run() {
			Stopwatch stopwatch = Stopwatch.createStarted();
			
			Directory directory = new RAMDirectory();
			try {
				LOG.debug("available tokenizers: {}", TokenizerFactory.availableTokenizers());
				LOG.debug("available token filters: {}", TokenFilterFactory.availableTokenFilters());
				Analyzer analyzer = CustomAnalyzer.builder()
					.withTokenizer("standard")
					.addTokenFilter("lowercase")
					.addTokenFilter("ngram", "minGramSize", "1", "maxGramSize", "25")
					.build();
				IndexWriterConfig config = new IndexWriterConfig(analyzer);
				try (IndexWriter writer = new IndexWriter(directory, config)) {
					for (Song song : songs) {
						Document document = createDocument(song);
						writer.addDocument(document);
						songByUuid.put(song.getUUID(), song);
					}
				} catch (IOException e) {
					LOG.warn("couldn't index songs", e);
				}
			} catch (IOException e1) {
				LOG.warn("couldn't create analyzer", e1);
			} finally {
				putIndex(indexType, directory);
				stopwatch.stop();
				LOG.info("indexing songs in background thread took {}", stopwatch.toString());
			}
		}
	});
}
 
Developer: mathisdt, Project: sdb2, Lines: 37, Source: IndexerServiceImpl.java
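
To inspect exactly what the ngram analyzer above feeds into the index, the same builder configuration can be run standalone; a minimal sketch (the field name "title" and the sample text are arbitrary):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

Analyzer analyzer = CustomAnalyzer.builder()
    .withTokenizer("standard")
    .addTokenFilter("lowercase")
    .addTokenFilter("ngram", "minGramSize", "1", "maxGramSize", "25")
    .build();
try (TokenStream ts = analyzer.tokenStream("title", "Amazing Grace")) {
  CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    System.out.println(term.toString()); // prints every 1..25-gram of "amazing" and "grace"
  }
  ts.end();
}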

Example 14: add

import org.apache.lucene.analysis.util.TokenizerFactory; // import the required package/class
public void add(Object current)
{
    if (!(current instanceof MultiTermAwareComponent))
        return;
    AbstractAnalysisFactory newComponent = ((MultiTermAwareComponent) current).getMultiTermComponent();
    if (newComponent instanceof TokenFilterFactory)
    {
        if (filters == null)
        {
            filters = new ArrayList<TokenFilterFactory>(2);
        }
        filters.add((TokenFilterFactory) newComponent);
    }
    else if (newComponent instanceof TokenizerFactory)
    {
        tokenizer = (TokenizerFactory) newComponent;
    }
    else if (newComponent instanceof CharFilterFactory)
    {
        if (charFilters == null)
        {
            charFilters = new ArrayList<CharFilterFactory>(1);
        }
        charFilters.add((CharFilterFactory) newComponent);

    }
    else
    {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown analysis component from MultiTermAwareComponent: " + newComponent);
    }
}
 
Developer: Alfresco, Project: community-edition-old, Lines: 32, Source: AlfrescoFieldType.java

Example 15: getDictionaryByFieldType

import org.apache.lucene.analysis.util.TokenizerFactory; // import the required package/class
private Dictionary getDictionaryByFieldType(String fieldTypeName) {
	FieldType ft = h.getCore().getLatestSchema().getFieldTypeByName(fieldTypeName);
	Analyzer a = ft.getIndexAnalyzer();
	Assert.assertEquals(a.getClass(), TokenizerChain.class);
	
	TokenizerChain tc = (TokenizerChain) a;
	TokenizerFactory tf = tc.getTokenizerFactory();
	Assert.assertEquals(tf.getClass(), MMSegTokenizerFactory.class);
	
	MMSegTokenizerFactory mtf = (MMSegTokenizerFactory) tf;
	
	Assert.assertNotNull(mtf.dic);
	return mtf.dic;
}
 
Developer: chenlb, Project: mmseg4j-solr, Lines: 15, Source: MMSegTokenizerFactoryTest.java


Note: The org.apache.lucene.analysis.util.TokenizerFactory examples above were collected from open-source projects on GitHub and other public code and documentation platforms. Copyright in each snippet remains with its original authors; consult the corresponding project's license before reusing or redistributing the code.