This article collects typical usage examples of the Java class org.apache.lucene.analysis.util.TokenizerFactory. If you are unsure what TokenizerFactory is for or how to use it, the curated examples below should help.
TokenizerFactory belongs to the org.apache.lucene.analysis.util package. Fifteen code examples are shown, sorted by popularity.
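Before the individual examples, here is a minimal self-contained sketch of the typical lookup-and-tokenize flow. It is not taken from any of the projects below; it assumes a Lucene 5.x-style API in which TokenizerFactory.forName(name, args) resolves an SPI name such as "whitespace" and create() takes no reader:

import java.io.StringReader;
import java.util.HashMap;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.TokenizerFactory;

public class TokenizerFactoryDemo {
  public static void main(String[] args) throws Exception {
    // resolve the factory by its SPI name; the map carries the factory's configuration
    TokenizerFactory factory = TokenizerFactory.forName("whitespace", new HashMap<String, String>());
    Tokenizer tokenizer = factory.create();
    tokenizer.setReader(new StringReader("Hello Lucene TokenizerFactory"));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      System.out.println(term.toString());
    }
    tokenizer.end();
    tokenizer.close();
  }
}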
Example 1: testBigramTokenizer
import org.apache.lucene.analysis.util.TokenizerFactory; // import the package/class this example depends on
public void testBigramTokenizer() throws Exception {
  SlowSynonymMap synMap;
  // prepare bi-gram tokenizer factory
  Map<String, String> args = new HashMap<>();
  args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, "4.4");
  args.put("minGramSize","2");
  args.put("maxGramSize","2");
  TokenizerFactory tf = new NGramTokenizerFactory(args);
  // (ab)->(bc)->(cd)->[ef][fg][gh]
  List<String> rules = new ArrayList<>();
  rules.add( "abcd=>efgh" );
  synMap = new SlowSynonymMap( true );
  SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf);
  assertEquals( 1, synMap.submap.size() );
  assertEquals( 1, getSubSynonymMap( synMap, "ab" ).submap.size() );
  assertEquals( 1, getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ).submap.size() );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "ef" );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "fg" );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "gh" );
}
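A short sketch (not part of the original test) of what the bi-gram factory configured above emits for the rule's left-hand side "abcd": the tokens ab, bc, cd, which is why the assertions walk three levels of nested sub-maps. It assumes the Lucene 4.4-era TokenizerFactory.create(Reader) signature used at that time:

Map<String, String> args = new HashMap<>();
args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, "4.4");
args.put("minGramSize", "2");
args.put("maxGramSize", "2");
Tokenizer ts = new NGramTokenizerFactory(args).create(new StringReader("abcd"));
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
  System.out.println(term); // prints: ab, bc, cd
}
ts.end();
ts.close();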
Example 2: doTestTokenizer
import org.apache.lucene.analysis.util.TokenizerFactory; // import the package/class this example depends on
private void doTestTokenizer(String tokenizer) throws IOException {
  Class<? extends TokenizerFactory> factoryClazz = TokenizerFactory.lookupClass(tokenizer);
  TokenizerFactory factory = (TokenizerFactory) initialize(factoryClazz);
  if (factory != null) {
    // we managed to fully create an instance. check a few more things:
    // if it implements MultiTermAware, sanity check its impl
    if (factory instanceof MultiTermAwareComponent) {
      AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
      assertNotNull(mtc);
      // it's not OK to return e.g. a charfilter here: but a tokenizer could wrap a filter around it
      assertFalse(mtc instanceof CharFilterFactory);
    }
    // beast it just a little; it shouldn't throw exceptions:
    // (it should have thrown them in initialize)
    checkRandomData(random(), new FactoryAnalyzer(factory, null, null), 100, 20, false, false);
  }
}
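The lookupClass(...) plus initialize(...) pattern above comes from a test harness. Outside of it, a hypothetical helper doing the same reflective construction might look like this, relying on the fact that Lucene 4.4+ analysis factories expose a public constructor taking a Map<String, String> of arguments (as Example 1 shows for NGramTokenizerFactory):

// hypothetical helper: resolve a TokenizerFactory by SPI name and build it
// reflectively with an args map, mirroring what initialize(...) does above
static TokenizerFactory instantiate(String name, Map<String, String> args) throws Exception {
  Class<? extends TokenizerFactory> clazz = TokenizerFactory.lookupClass(name);
  return clazz.getConstructor(Map.class).newInstance(args);
}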
Example 3: testBigramTokenizer
import org.apache.lucene.analysis.util.TokenizerFactory; // import the package/class this example depends on
public void testBigramTokenizer() throws Exception {
  SlowSynonymMap synMap;
  // prepare bi-gram tokenizer factory
  TokenizerFactory tf = new NGramTokenizerFactory();
  Map<String, String> args = new HashMap<String, String>();
  args.put("minGramSize","2");
  args.put("maxGramSize","2");
  tf.init( args );
  // (ab)->(bc)->(cd)->[ef][fg][gh]
  List<String> rules = new ArrayList<String>();
  rules.add( "abcd=>efgh" );
  synMap = new SlowSynonymMap( true );
  SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf);
  assertEquals( 1, synMap.submap.size() );
  assertEquals( 1, getSubSynonymMap( synMap, "ab" ).submap.size() );
  assertEquals( 1, getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ).submap.size() );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "ef" );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "fg" );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "gh" );
}
Example 4: doTestTokenizer
import org.apache.lucene.analysis.util.TokenizerFactory; // import the package/class this example depends on
private void doTestTokenizer(String tokenizer) throws IOException {
  TokenizerFactory factory = TokenizerFactory.forName(tokenizer);
  if (initialize(factory)) {
    // we managed to fully create an instance. check a few more things:
    // if it implements MultiTermAware, sanity check its impl
    if (factory instanceof MultiTermAwareComponent) {
      AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
      assertNotNull(mtc);
      // it's not OK to return e.g. a charfilter here: but a tokenizer could wrap a filter around it
      assertFalse(mtc instanceof CharFilterFactory);
    }
    // beast it just a little; it shouldn't throw exceptions:
    // (it should have thrown them in initialize)
    checkRandomData(random(), new FactoryAnalyzer(factory, null, null), 100, 20, false, false);
  }
}
Example 5: testBigramTokenizer
import org.apache.lucene.analysis.util.TokenizerFactory; // import the package/class this example depends on
public void testBigramTokenizer() throws Exception {
  SlowSynonymMap synMap;
  // prepare bi-gram tokenizer factory
  Map<String, String> args = new HashMap<String, String>();
  args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, "4.4");
  args.put("minGramSize","2");
  args.put("maxGramSize","2");
  TokenizerFactory tf = new NGramTokenizerFactory(args);
  // (ab)->(bc)->(cd)->[ef][fg][gh]
  List<String> rules = new ArrayList<String>();
  rules.add( "abcd=>efgh" );
  synMap = new SlowSynonymMap( true );
  SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf);
  assertEquals( 1, synMap.submap.size() );
  assertEquals( 1, getSubSynonymMap( synMap, "ab" ).submap.size() );
  assertEquals( 1, getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ).submap.size() );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "ef" );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "fg" );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "gh" );
}
Example 6: reloadLuceneSPI
import org.apache.lucene.analysis.util.TokenizerFactory; // import the package/class this example depends on
/**
 * Reloads all Lucene SPI implementations using the new classloader.
 * This method must be called after the new classloader has been created to
 * register the services for use.
 */
static void reloadLuceneSPI(ClassLoader loader) {
  // do NOT change the order of these method calls!
  // Codecs:
  PostingsFormat.reloadPostingsFormats(loader);
  DocValuesFormat.reloadDocValuesFormats(loader);
  Codec.reloadCodecs(loader);
  // Analysis:
  CharFilterFactory.reloadCharFilters(loader);
  TokenFilterFactory.reloadTokenFilters(loader);
  TokenizerFactory.reloadTokenizers(loader);
}
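A hedged sketch of how a caller might drive reloadLuceneSPI: build a child classloader over a directory of extra analysis jars, then re-register the SPI services. Only JDK classloader APIs and the method above are assumed; the plugin directory is illustrative:

static void reloadFromPluginDir(Path pluginDir) throws Exception {
  // collect the extra analysis jars (pluginDir is hypothetical)
  List<URL> urls = new ArrayList<>();
  try (DirectoryStream<Path> jars = Files.newDirectoryStream(pluginDir, "*.jar")) {
    for (Path jar : jars) {
      urls.add(jar.toUri().toURL());
    }
  }
  // parent is a classloader that already sees lucene-core
  ClassLoader loader = URLClassLoader.newInstance(urls.toArray(new URL[0]),
      TokenizerFactory.class.getClassLoader());
  reloadLuceneSPI(loader); // the method shown above
}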
Example 7: testCreate
import org.apache.lucene.analysis.util.TokenizerFactory; // import the package/class this example depends on
public void testCreate() throws Exception
{
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    TokenizerFactory factory = new HanLPTokenizerFactory(args);
    Tokenizer tokenizer = factory.create(null);
    tokenizer.setReader(new StringReader("大衛貝克漢不僅僅是名著名球員,球場以外,其妻為前" +
            "辣妹合唱團成員維多利亞·碧咸,亦由於他擁有" +
            "突出外表、百變髮型及正面的形象,以至自己" +
            "品牌的男士香水等商品,及長期擔任運動品牌" +
            "Adidas的代言人,因此對大眾傳播媒介和時尚界" +
            "等方面都具很大的影響力,在足球圈外所獲得的" +
            "認受程度可謂前所未見。"));
    tokenizer.reset();
    while (tokenizer.incrementToken())
    {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // token type (part of speech)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
Example 8: lookupAnalysisClass
import org.apache.lucene.analysis.util.TokenizerFactory; // import the package/class this example depends on
/**
 * This method looks up a class by its fully qualified name (FQN), by its short
 * name (class simple name), or with a package suffix, assuming
 * "org.apache.lucene.analysis." as the package prefix (e.g.
 * "standard.ClassicTokenizerFactory" ->
 * "org.apache.lucene.analysis.standard.ClassicTokenizerFactory").
 *
 * If className contains a period, the class is first looked up as-is, assuming that it
 * is an FQN. If this fails, lookup is retried after prepending the Lucene analysis
 * package prefix to the class name.
 *
 * If className does not contain a period, the analysis SPI *Factory.lookupClass()
 * methods are used to find the class.
 *
 * @param className The name or the short name of the class.
 * @param expectedType The superclass className is expected to extend.
 * @return the loaded class.
 * @throws ClassNotFoundException if lookup fails
 */
public <T> Class<? extends T> lookupAnalysisClass(String className, Class<T> expectedType)
    throws ClassNotFoundException {
  if (className.contains(".")) {
    try {
      // First, try className == FQN
      return Class.forName(className).asSubclass(expectedType);
    } catch (ClassNotFoundException e) {
      try {
        // Second, retry lookup after prepending the Lucene analysis package prefix
        return Class.forName(LUCENE_ANALYSIS_PACKAGE_PREFIX + className).asSubclass(expectedType);
      } catch (ClassNotFoundException e1) {
        throw new ClassNotFoundException("Can't find class '" + className
            + "' or '" + LUCENE_ANALYSIS_PACKAGE_PREFIX + className + "'");
      }
    }
  }
  // No dot - use analysis SPI lookup
  final String analysisComponentName = ANALYSIS_COMPONENT_SUFFIX_PATTERN.matcher(className).replaceFirst("");
  if (CharFilterFactory.class.isAssignableFrom(expectedType)) {
    return CharFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
  } else if (TokenizerFactory.class.isAssignableFrom(expectedType)) {
    return TokenizerFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
  } else if (TokenFilterFactory.class.isAssignableFrom(expectedType)) {
    return TokenFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
  }
  throw new ClassNotFoundException("Can't find class '" + className + "'");
}
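A hedged usage sketch of the three resolution paths described in the javadoc; resolver stands in for an instance of the enclosing class and is hypothetical, while the class names are real Lucene classes:

// FQN: contains a dot, so it is tried directly with Class.forName
Class<? extends TokenizerFactory> byFqn = resolver.lookupAnalysisClass(
    "org.apache.lucene.analysis.standard.ClassicTokenizerFactory", TokenizerFactory.class);
// package suffix: the retry prepends "org.apache.lucene.analysis."
Class<? extends TokenizerFactory> bySuffix = resolver.lookupAnalysisClass(
    "standard.ClassicTokenizerFactory", TokenizerFactory.class);
// no dot: the *Factory suffix is stripped, then the SPI lookup is used
Class<? extends TokenizerFactory> bySpi = resolver.lookupAnalysisClass(
    "ClassicTokenizerFactory", TokenizerFactory.class);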
Example 9: AnalyzerFactory
import org.apache.lucene.analysis.util.TokenizerFactory; // import the package/class this example depends on
public AnalyzerFactory(List<CharFilterFactory> charFilterFactories,
                       TokenizerFactory tokenizerFactory,
                       List<TokenFilterFactory> tokenFilterFactories) {
  this.charFilterFactories = charFilterFactories;
  assert null != tokenizerFactory;
  this.tokenizerFactory = tokenizerFactory;
  this.tokenFilterFactories = tokenFilterFactories;
}
Example 10: testSimple
import org.apache.lucene.analysis.util.TokenizerFactory; // import the package/class this example depends on
/** Test showing the behavior */
public void testSimple() throws Exception {
  Reader reader = new StringReader("我购买了道具和服装。");
  TokenizerFactory factory = new HMMChineseTokenizerFactory(new HashMap<String,String>());
  Tokenizer tokenizer = factory.create(newAttributeFactory(), reader);
  // TODO: fix smart chinese to not emit punctuation tokens
  // at the moment: you have to clean up with WDF, or use the stoplist, etc
  assertTokenStreamContents(tokenizer,
      new String[] { "我", "购买", "了", "道具", "和", "服装", "," });
}
Example 11: test
import org.apache.lucene.analysis.util.TokenizerFactory; // import the package/class this example depends on
public void test() throws IOException {
  for (String tokenizer : TokenizerFactory.availableTokenizers()) {
    doTestTokenizer(tokenizer);
  }
  for (String tokenFilter : TokenFilterFactory.availableTokenFilters()) {
    doTestTokenFilter(tokenFilter);
  }
  for (String charFilter : CharFilterFactory.availableCharFilters()) {
    doTestCharFilter(charFilter);
  }
}
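The three available*() methods enumerate every SPI name discovered on the classpath, so a quick way to see what the loops above will visit is to print them; the output depends on which analysis modules are present:

System.out.println("tokenizers:   " + TokenizerFactory.availableTokenizers());
System.out.println("tokenfilters: " + TokenFilterFactory.availableTokenFilters());
System.out.println("charfilters:  " + CharFilterFactory.availableCharFilters());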
Example 12: reloadLuceneSPI
import org.apache.lucene.analysis.util.TokenizerFactory; // import the package/class this example depends on
/**
 * Reloads all Lucene SPI implementations using the new classloader.
 * This method must be called after {@link #addToClassLoader(String, FileFilter, boolean)}
 * and before using this ResourceLoader.
 */
void reloadLuceneSPI() {
  // Codecs:
  PostingsFormat.reloadPostingsFormats(this.classLoader);
  DocValuesFormat.reloadDocValuesFormats(this.classLoader);
  Codec.reloadCodecs(this.classLoader);
  // Analysis:
  CharFilterFactory.reloadCharFilters(this.classLoader);
  TokenFilterFactory.reloadTokenFilters(this.classLoader);
  TokenizerFactory.reloadTokenizers(this.classLoader);
}
Example 13: index
import org.apache.lucene.analysis.util.TokenizerFactory; // import the package/class this example depends on
@Override
public void index(final IndexType indexType, final Collection<Song> songs) {
  executor.execute(new Runnable() {
    @Override
    public void run() {
      Stopwatch stopwatch = Stopwatch.createStarted();
      Directory directory = new RAMDirectory();
      try {
        LOG.debug("available tokenizers: {}", TokenizerFactory.availableTokenizers());
        LOG.debug("available token filters: {}", TokenFilterFactory.availableTokenFilters());
        Analyzer analyzer = CustomAnalyzer.builder()
            .withTokenizer("standard")
            .addTokenFilter("lowercase")
            .addTokenFilter("ngram", "minGramSize", "1", "maxGramSize", "25")
            .build();
        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        try (IndexWriter writer = new IndexWriter(directory, config)) {
          for (Song song : songs) {
            Document document = createDocument(song);
            writer.addDocument(document);
            songByUuid.put(song.getUUID(), song);
          }
        } catch (IOException e) {
          LOG.warn("couldn't index songs", e);
        }
      } catch (IOException e1) {
        LOG.warn("couldn't create analyzer", e1);
      } finally {
        putIndex(indexType, directory);
        stopwatch.stop();
        LOG.info("indexing songs in background thread took {}", stopwatch.toString());
      }
    }
  });
}
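A hedged sketch of exercising the CustomAnalyzer built above outside the indexing path, to see the 1-to-25-character n-grams it produces; the field name "title" and the sample text are illustrative, and analyzer is assumed to be the instance from the example:

try (TokenStream ts = analyzer.tokenStream("title", "Amazing Grace")) {
  CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    System.out.println(term); // n-grams of the lowercased standard tokens
  }
  ts.end();
}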
Example 14: add
import org.apache.lucene.analysis.util.TokenizerFactory; // import the package/class this example depends on
public void add(Object current)
{
    if (!(current instanceof MultiTermAwareComponent))
        return;
    AbstractAnalysisFactory newComponent = ((MultiTermAwareComponent) current).getMultiTermComponent();
    if (newComponent instanceof TokenFilterFactory)
    {
        if (filters == null)
        {
            filters = new ArrayList<TokenFilterFactory>(2);
        }
        filters.add((TokenFilterFactory) newComponent);
    }
    else if (newComponent instanceof TokenizerFactory)
    {
        tokenizer = (TokenizerFactory) newComponent;
    }
    else if (newComponent instanceof CharFilterFactory)
    {
        if (charFilters == null)
        {
            charFilters = new ArrayList<CharFilterFactory>(1);
        }
        charFilters.add((CharFilterFactory) newComponent);
    }
    else
    {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown analysis component from MultiTermAwareComponent: " + newComponent);
    }
}
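A hedged driver for the collector above: feeding it every factory of a Solr analysis chain keeps only the multi-term-aware components. Here chain is a hypothetical TokenizerChain instance (the same Solr class used in Example 15), and the getter names follow that class:

for (CharFilterFactory cf : chain.getCharFilterFactories()) {
    add(cf);
}
add(chain.getTokenizerFactory());
for (TokenFilterFactory f : chain.getTokenFilterFactories()) {
    add(f);
}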
Example 15: getDictionaryByFieldType
import org.apache.lucene.analysis.util.TokenizerFactory; // import the package/class this example depends on
private Dictionary getDictionaryByFieldType(String fieldTypeName) {
  FieldType ft = h.getCore().getLatestSchema().getFieldTypeByName(fieldTypeName);
  Analyzer a = ft.getIndexAnalyzer();
  Assert.assertEquals(a.getClass(), TokenizerChain.class);
  TokenizerChain tc = (TokenizerChain) a;
  TokenizerFactory tf = tc.getTokenizerFactory();
  Assert.assertEquals(tf.getClass(), MMSegTokenizerFactory.class);
  MMSegTokenizerFactory mtf = (MMSegTokenizerFactory) tf;
  Assert.assertNotNull(mtf.dic);
  return mtf.dic;
}