本文整理汇总了Python中org.apache.lucene.index.IndexWriterConfig.setSimilarity方法的典型用法代码示例。如果您正苦于以下问题:Python IndexWriterConfig.setSimilarity方法的具体用法?Python IndexWriterConfig.setSimilarity怎么用?Python IndexWriterConfig.setSimilarity使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.lucene.index.IndexWriterConfig
的用法示例。
在下文中一共展示了IndexWriterConfig.setSimilarity方法的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: from org.apache.lucene.index import IndexWriterConfig [as 别名]
# 或者: from org.apache.lucene.index.IndexWriterConfig import setSimilarity [as 别名]
def __init__(self, path, settings):
    """Open the index and taxonomy directories below *path* and wire up
    the writers (unless read-only), the combined reader, and facet helpers."""
    self._settings = settings
    self._multithreaded = settings.multithreaded
    self._checker = DirectSpellChecker()

    # Memory-mapped store; unmapping on close is explicitly disabled.
    indexDir = MMapDirectory(File(join(path, 'index')))
    indexDir.setUseUnmap(False)
    taxoDir = MMapDirectory(File(join(path, 'taxo')))
    taxoDir.setUseUnmap(False)

    writerConf = IndexWriterConfig(Version.LUCENE_4_10_0, settings.analyzer)
    writerConf.setSimilarity(settings.similarity)
    policy = TieredMergePolicy()
    policy.setMaxMergeAtOnce(settings.maxMergeAtOnce)
    policy.setSegmentsPerTier(settings.segmentsPerTier)
    writerConf.setMergePolicy(policy)

    # Writers are only opened for writable indexes; the initial commits
    # make sure both directories contain a readable commit point.
    if not settings.readonly:
        self._indexWriter = IndexWriter(indexDir, writerConf)
        self._indexWriter.commit()
        self._taxoWriter = DirectoryTaxonomyWriter(
            taxoDir,
            IndexWriterConfig.OpenMode.CREATE_OR_APPEND,
            LruTaxonomyWriterCache(settings.lruTaxonomyWriterCacheSize))
        self._taxoWriter.commit()

    self._indexAndTaxonomy = IndexAndTaxonomy(settings, indexDir, taxoDir)
    self._readerSettingsWrapper = self._indexAndTaxonomy._readerSettingsWrapper
    self._facetsConfig = settings.fieldRegistry.facetsConfig
    self._ordinalsReader = CachedOrdinalsReader(DocValuesOrdinalsReader())
示例2: _get_writer
# 需要导入模块: from org.apache.lucene.index import IndexWriterConfig [as 别名]
# 或者: from org.apache.lucene.index.IndexWriterConfig import setSimilarity [as 别名]
def _get_writer(self, analyzer=None, create=False):
    """Return an IndexWriter over ``self._store``.

    analyzer: analyzer to index with; falls back to ``self._analyzer``
              when None.  (Fix: the argument was previously accepted
              but silently ignored — the config always used
              ``self._analyzer``.)
    create:   when True, open in CREATE mode, replacing any existing
              index instead of appending to it.
    """
    config = IndexWriterConfig(
        Version.LUCENE_CURRENT,
        analyzer if analyzer is not None else self._analyzer)
    if create:
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    if self._similarity is not None:
        config.setSimilarity(self._similarity)
    return IndexWriter(self._store, config)
示例3: createIndexWriter
# 需要导入模块: from org.apache.lucene.index import IndexWriterConfig [as 别名]
# 或者: from org.apache.lucene.index.IndexWriterConfig import setSimilarity [as 别名]
def createIndexWriter(self, actual_dir, max_field_length=20000000):
    """Return a fresh IndexWriter over *actual_dir*.

    The directory is created if missing and any existing documents are
    deleted, so the caller always starts from an empty index.
    ``max_field_length`` is retained for interface compatibility only.
    """
    ensureDirExists(actual_dir)
    store = SimpleFSDirectory(File(actual_dir))
    config = IndexWriterConfig(
        LuceneVersion.LUCENE_CURRENT,
        StandardAnalyzer(LuceneVersion.LUCENE_CURRENT))
    config.setSimilarity(FieldAgnosticSimilarity())
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    writer.deleteAll()  # guarantee an empty index even in CREATE mode
    return writer
示例4: __init__
# 需要导入模块: from org.apache.lucene.index import IndexWriterConfig [as 别名]
# 或者: from org.apache.lucene.index.IndexWriterConfig import setSimilarity [as 别名]
def __init__(self, fileRoot, storeDir, analyzer):
if not os.path.exists(storeDir):
os.mkdir(storeDir)
store = SimpleFSDirectory(File(storeDir))
analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
config.setSimilarity(similarities.BM25Similarity())
#Available similarity: BM25Similarity, MultiSimilarity, PerFieldSimilarityWrapper, SimilarityBase, TFIDFSimilarity
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
writer = IndexWriter(store, config)
self.indexDocs(fileRoot, writer)
print 'commit index',
writer.commit()
writer.close()
print 'done'
示例5: __init__
# 需要导入模块: from org.apache.lucene.index import IndexWriterConfig [as 别名]
# 或者: from org.apache.lucene.index.IndexWriterConfig import setSimilarity [as 别名]
def __init__(self, storeDir, aWrapper):
if not os.path.exists(storeDir):
os.mkdir(storeDir)
store = SimpleFSDirectory(File(storeDir))
aWrapper = LimitTokenCountAnalyzer(aWrapper, 1048576)
bm25Sim = BM25Similarity(2.0,0.75) #BM25 with these default values: k1 = 1.2, b = 0.75.
config = IndexWriterConfig(Version.LUCENE_CURRENT, aWrapper)
config.setSimilarity(bm25Sim)
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
writer = IndexWriter(store, config)
self.indexTable(writer)
ticker = Ticker()
print 'commit index'
threading.Thread(target=ticker.run).start()
writer.commit()
writer.close()
ticker.tick = False
print 'done'
示例6: IndexingEngine
# 需要导入模块: from org.apache.lucene.index import IndexWriterConfig [as 别名]
# 或者: from org.apache.lucene.index.IndexWriterConfig import setSimilarity [as 别名]
class IndexingEngine():
def __init__(self):
    """Set up the admin indexing engine: per-field analyzers, the
    IndexWriter, and the FieldType used for each indexed field."""
    self.mDocumentDirectory = settings.ADMINS_ENGINE.mDocumentDirectory
    self.mIndexDirectory = settings.ADMINS_ENGINE.mIndexDirectory
    self.mAnalyzers = settings.ADMINS_ENGINE.getIndexingAnalyzers()

    ############################# Writer Configuration #####################################
    # Per-field analyzers; 'default' is the fallback for unmapped fields.
    # (Renamed from 'map' to stop shadowing the builtin.)
    analyzerMap = HashMap()
    analyzerMap.put('name', self.mAnalyzers['name'])
    analyzerMap.put('parent', self.mAnalyzers['parent'])
    analyzerMap.put('content', self.mAnalyzers['default'])
    analyzerMap.put('id', self.mAnalyzers['id'])
    analyzerWrapper = PerFieldAnalyzerWrapper(self.mAnalyzers['default'], analyzerMap)

    self.mWriterConfig = IndexWriterConfig(Version.LUCENE_CURRENT, analyzerWrapper)
    self.mWriterConfig.setOpenMode(settings.ADMINS_ENGINE.mOpenMode)
    if settings.ADMINS_ENGINE.mSimilarity is not None:  # was '!= None'
        self.mWriterConfig.setSimilarity(settings.ADMINS_ENGINE.mSimilarity)
    ########################################################################################
    directory = SimpleFSDirectory(File(self.mIndexDirectory))
    self.mIndexWriter = IndexWriter(directory, self.mWriterConfig)

    ############################# FieldType Preparation #####################
    def _field_type(tokenized, index_options):
        # Build a stored, indexed FieldType; only tokenization and the
        # index options differ between the four fields.
        ft = FieldType()
        ft.setIndexed(True)
        ft.setStored(True)
        ft.setTokenized(tokenized)
        ft.setIndexOptions(index_options)
        return ft

    self.mFieldTypes = {
        'name': _field_type(True, FieldInfo.IndexOptions.DOCS_ONLY),
        'parent': _field_type(True, FieldInfo.IndexOptions.DOCS_ONLY),
        # Content keeps positions/offsets for phrase queries and highlighting.
        'content': _field_type(True, FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS),
        # IDs are exact-match keys, so they are not tokenized.
        'id': _field_type(False, FieldInfo.IndexOptions.DOCS_ONLY),
    }
    #######################################################################
    self.mLog = ""
def indexing(self, root = settings.ADMINS_ENGINE.mDocumentDirectory, parent = [], docID = 1, parentID = 0, id = 0):
realPath = os.path.abspath(root)
for i in os.listdir(realPath):
path = os.path.join(realPath, i)
if os.path.isfile(path):
#index this file
doc = Document()
doc.add(Field('name', ("%s %s" % (' '.join(parent), i)).strip(), self.mFieldTypes['name']))
doc.add(Field('parent', ' '.join(parent), self.mFieldTypes['parent']))
doc.add(Field('id', str(docID), self.mFieldTypes['id']))
doc.add(Field('parentID', str(parentID), self.mFieldTypes['id']))
fd = open(path, 'r')
content = fd.read()
fd.close()
if len(content) > 0:
doc.add(Field('content', content, self.mFieldTypes['content']))
self.mIndexWriter.addDocument(doc)
##################### Logging ##############################
if IS_DEBUG:
nameDebug = AnalyzerDebug.debug(self.mAnalyzers['name'], ("%s %s" % (' '.join(parent), i)).strip())
parentDebug = AnalyzerDebug.debug(self.mAnalyzers['parent'], ' '.join(parent))
contentDebug = AnalyzerDebug.debug(self.mAnalyzers['default'], content)
self.mLog = self.mLog + ( "File %s\n {name - %s}: %s\n {parent - %s}: %s\n {content}: %s\n\n" % (path, docID, nameDebug, parentID, parentDebug, contentDebug) )
#.........这里部分代码省略.........