本文整理汇总了Python中whoosh.writing.AsyncWriter.add_document方法的典型用法代码示例。如果您正苦于以下问题：Python AsyncWriter.add_document方法的具体用法？Python AsyncWriter.add_document怎么用？Python AsyncWriter.add_document使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类whoosh.writing.AsyncWriter的用法示例。
在下文中一共展示了AsyncWriter.add_document方法的12个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: store_page
# 需要导入模块: from whoosh.writing import AsyncWriter [as 别名]
# 或者: from whoosh.writing.AsyncWriter import add_document [as 别名]
def store_page(user, url):
    """Download ``url`` and index its parsed content for ``user``.

    One document holding the current timestamp, the user, the URL and the
    parsed page content is added to the ``idx`` index (defined outside this
    function) and committed through an ``AsyncWriter``.
    """
    index_writer = AsyncWriter(idx)
    response = requests.get(url)
    page_content = parse(response.content)
    fields = dict(
        ts=datetime.now(),
        user=unicode(user),
        url=unicode(url),
        content=page_content,
    )
    index_writer.add_document(**fields)
    index_writer.commit()
示例2: whoosh_index
# 需要导入模块: from whoosh.writing import AsyncWriter [as 别名]
# 或者: from whoosh.writing.AsyncWriter import add_document [as 别名]
def whoosh_index(self):
    """Index every page of the notes tree in one Whoosh transaction.

    Iterates over all items of ``self.notesTree``, reads each page file as
    UTF-8 and writes one document per page through a single ``AsyncWriter``.
    Pages that start with a metadata header (and when the 'meta' extension
    is enabled) are written with ``update_document`` with the header
    stripped from the content and their tags taken from the parsed
    metadata; all other pages are added with empty tags.

    The writer is committed once at the end. If anything fails while
    indexing, the writer is cancelled and the exception is re-raised.
    """
    it = QTreeWidgetItemIterator(
        self.notesTree, QTreeWidgetItemIterator.All)
    print("Starting complete indexing.")
    writer = AsyncWriter(self.ix)
    try:
        while it.value():
            treeItem = it.value()
            name = self.notesTree.itemToPage(treeItem)
            path = os.path.join(self.notesTree.pageToFile(name)).replace(os.sep, '/')
            print(path)
            # The context manager closes the file even when reading fails.
            with open(path, 'r', encoding='utf-8') as fileobj:
                content = fileobj.read()
            if METADATA_CHECKER.match(content) and 'meta' in self.settings.extensions:
                no_metadata_content = METADATA_CHECKER.sub("", content, count=1).lstrip()
                # Converting the page populates self.settings.md.Meta,
                # which is read for the tags below.
                self.settings.md.reset().convert(content)
                writer.update_document(
                    path=name, title=parseTitle(content, name), content=no_metadata_content,
                    tags=','.join(self.settings.md.Meta.get('tags', [])).strip())
            else:
                writer.add_document(path=name, title=parseTitle(content, name), content=content, tags='')
            it += 1
    except Exception:
        # Do not leave a half-written transaction pending on the index.
        writer.cancel()
        raise
    writer.commit()
    print("Finished completely reindexing.")
示例3: addLink
# 需要导入模块: from whoosh.writing import AsyncWriter [as 别名]
# 或者: from whoosh.writing.AsyncWriter import add_document [as 别名]
def addLink(self, url, title, summary, txt):
    """Add one link to the 'MAIN' index under ``self.indexDir``.

    The indexed content is the title repeated ten times, the summary
    repeated twice, a space, then ``txt``; the URL is used as the document
    id. (The repetition presumably weights title and summary in search
    ranking - confirm with the query side.)
    """
    repeated_title = (title + " ") * 10
    repeated_summary = (summary + " ") * 2
    body = repeated_title + repeated_summary + " " + txt
    main_index = open_dir(self.indexDir, indexname='MAIN', readonly=False)
    link_writer = AsyncWriter(main_index)
    link_writer.add_document(id=url, content=unicode(body))
    link_writer.commit()
    main_index.close()
示例4: whoosh_task
# 需要导入模块: from whoosh.writing import AsyncWriter [as 别名]
# 或者: from whoosh.writing.AsyncWriter import add_document [as 别名]
def whoosh_task(ids, pool_number, ix, model_class):
    """Index the title and summary of the ``model_class`` rows listed in ``ids``.

    Each id is looked up through the session stored in ``sqla['session']``.
    Rows whose title or summary is ``None`` are skipped. All documents go
    through one ``AsyncWriter`` on ``ix`` and are committed together at the
    end. ``pool_number`` is accepted but not used in this function.
    """
    db_session = sqla['session']
    index_writer = AsyncWriter(ix)
    for row_id in ids:
        row = db_session.query(model_class).filter_by(id=row_id).one()
        if row.title is not None and row.summary is not None:
            index_writer.add_document(title=row.title, summary=row.summary)
    index_writer.commit()
示例5: createIndex
# 需要导入模块: from whoosh.writing import AsyncWriter [as 别名]
# 或者: from whoosh.writing.AsyncWriter import add_document [as 别名]
def createIndex(self):
    """Build the 'MAIN' Whoosh index from the ``newsStorage`` SQLite table.

    Returns immediately when ``createIndexDirIfNotExist`` reports that the
    index directory was not newly created. Otherwise every row with a
    non-empty ARTICLE column is loaded, rows whose column 3 is ``None`` are
    skipped, and one document per remaining row is indexed: column 0 is the
    document id, and the content is column 3 repeated ten times, column 4
    repeated twice, a space, then column 5. (Judging by the local names,
    columns 3/4/5 are title/summary/article text - confirm against the
    table definition.)

    The database connection and the index are closed even when loading or
    indexing fails, and the writer is cancelled on error.
    """
    # Parenthesised single-argument form prints the same text on Python 2 and 3.
    print(" Whoosh Loading from SQL ")
    created = self.createIndexDirIfNotExist()
    if not created:
        # already exists
        return
    conn = sqlite3.connect(self.dbName)
    try:
        c = conn.cursor()
        c.execute('''SELECT * FROM newsStorage where ARTICLE <> "" ''')
        feeds = c.fetchall()
    finally:
        conn.close()
    schema = Schema(id=TEXT(stored=True), content=TEXT)
    ix = create_in(self.indexDir, schema, indexname='MAIN')
    try:
        writer = AsyncWriter(ix)
        try:
            for feed in feeds:
                # Discard links without a title
                if feed[3] is None:
                    continue
                index = feed[0]
                titolo10 = (feed[3] + " ") * 10
                sumario2 = (feed[4] + " ") * 2
                text = titolo10 + sumario2 + " " + feed[5]
                writer.add_document(id=index, content=unicode(text))
        except Exception:
            # Drop the pending transaction instead of leaving it open.
            writer.cancel()
            raise
        writer.commit()
    finally:
        ix.close()
    print(" Done Loading from SQL")
示例6: newPageCore
# 需要导入模块: from whoosh.writing import AsyncWriter [as 别名]
# 或者: from whoosh.writing.AsyncWriter import add_document [as 别名]
def newPageCore(self, item, newPageName):
    """Create a new page under ``item`` and add it to the Whoosh index.

    When ``newPageName`` is empty, the name is requested through a
    ``LineEditDialog``; if it is still empty afterwards nothing is done.
    Otherwise the page file is written with a title line and a creation
    date, a child tree item is created and selected, the attachment
    directory is created when missing, and the new file's content is
    indexed under ``pagePath + newPageName``.
    """
    pagePath = os.path.join(self.notePath, self.itemToPage(item)).replace(os.sep, '/')
    if not newPageName:
        dialog = LineEditDialog(pagePath, self)
        if dialog.exec_():
            newPageName = dialog.editor.text()
    if not newPageName:
        # Dialog cancelled or empty name: nothing to create.
        return

    if hasattr(item, 'text'):
        pagePath = os.path.join(self.notePath,
                                pagePath + '/').replace(os.sep, '/')
    if not QDir(pagePath).exists():
        QDir(self.notePath).mkdir(pagePath)
    fileName = pagePath + newPageName + self.settings.fileExt
    fh = QFile(fileName)
    fh.open(QIODevice.WriteOnly)
    savestream = QTextStream(fh)
    savestream << '# ' + newPageName + '\n'
    savestream << 'Created ' + str(datetime.date.today()) + '\n\n'
    fh.close()
    QTreeWidgetItem(item, [newPageName])
    newItem = self.pageToItem(pagePath + newPageName)
    self.sortItems(0, Qt.AscendingOrder)
    self.setCurrentItem(newItem)
    if hasattr(item, 'text'):
        self.expandItem(item)

    # create attachment folder if not exist
    attDir = self.itemToAttachmentDir(newItem)
    if not QDir(attDir).exists():
        QDir().mkpath(attDir)

    # TODO improvement needed, can be reused somehow
    # Read the file back; the context manager closes it even on a read error.
    with open(fileName, 'r') as fileobj:
        content = fileobj.read()
    self.ix = open_dir(self.settings.indexdir)
    writer = AsyncWriter(self.ix)
    writer.add_document(path=pagePath + newPageName, content=content)
    writer.commit()
示例7: Index
# 需要导入模块: from whoosh.writing import AsyncWriter [as 别名]
# 或者: from whoosh.writing.AsyncWriter import add_document [as 别名]
class Index(object):
    """Wrapper around a Whoosh index of per-server file documents."""

    def __init__(self, directory, persist):
        """Open the index stored in ``directory``, creating it when missing.

        ``persist`` is stored on the instance; it is not read by the code
        visible in this class.
        """
        self.log = logging.getLogger("ftpvista.index")
        self._persist = persist
        if not os.path.exists(directory):
            self.log.info("Creating the index in %s", directory)
            os.mkdir(directory)
            self._idx = index.create_in(directory, schema=self.get_schema())
        else:
            self.log.info("Opening the index in %s", directory)
            self._idx = index.open_dir(directory)
        self._searcher = self._idx.searcher()
        self._writer = None
        self.open_writer()
        self._last_optimization = None

    def open_writer(self):
        """Replace ``self._writer`` with a fresh ``AsyncWriter`` on the index."""
        self._writer = AsyncWriter(self._idx)

    def get_schema(self):
        """Return the Whoosh schema used when the index is created.

        Text fields share a stemming analyzer followed by an accent-folding
        charset filter.
        """
        analyzer = StemmingAnalyzer("([a-zA-Z0-9])+")
        my_analyzer = analyzer | CharsetFilter(accent_map)
        return Schema(
            server_id=ID(stored=True),
            has_id=ID(),
            path=TEXT(analyzer=my_analyzer, stored=True),
            name=TEXT(analyzer=my_analyzer, stored=True),
            ext=TEXT(analyzer=my_analyzer, stored=True),
            size=ID(stored=True),
            mtime=ID(stored=True, sortable=True),
            audio_album=TEXT(analyzer=my_analyzer, stored=True),
            audio_artist=TEXT(analyzer=my_analyzer, stored=True),
            audio_title=TEXT(analyzer=my_analyzer, stored=True),
            audio_track=ID(stored=True),
            audio_year=ID(stored=True),
        )

    def delete_all_docs(self, server):
        """Delete and commit every document whose ``server_id`` matches ``server``."""
        # A new writer is opened first; this one is committed right below.
        self.open_writer()
        self._writer.delete_by_term("server_id", str(server.get_server_id()))
        self._writer.commit()
        self.log.info("All documents of server %s deleted", server.get_ip_addr())

    def incremental_server_update(self, server_id, current_files):
        """Prepare an incremental update of the documents of one server.

        server_id -- Id of the server to update.
        current_files -- an iterable of (path, size, mtime) tuples, one for
                         each file currently on the server.

        Deletes all the outdated documents from the index (through
        ``self._writer``, without committing) and returns a list of
        (path, size, mtime) tuples for the files needing to be reindexed.
        """
        def delete_doc(writer, serverid, path):
            writer.delete_by_query(Term("server_id", str(serverid)) & Term("path", path))

        # Build a {path => (size, mtime)} mapping for quick lookups
        to_index = {path: (size, mtime) for path, size, mtime in current_files}

        for fields in self._searcher.documents(server_id=str(server_id)):
            indexed_path = fields["path"]
            if indexed_path not in to_index:
                # This file was deleted from the server since it was indexed
                delete_doc(self._writer, server_id, indexed_path)
                self.log.debug("%s has been removed", indexed_path)
                continue

            size, mtime = to_index[indexed_path]
            try:
                modified = mtime > datetime.strptime(fields["mtime"], "%Y-%m-%d %H:%M:%S")
            except ValueError:
                # The stored mtime cannot be parsed: treat the document as
                # outdated so that it gets reindexed.
                modified = True

            if modified:
                # Drop the stale document; the path stays in to_index
                delete_doc(self._writer, server_id, indexed_path)
            else:
                # up to date, no need to reindex
                del to_index[indexed_path]

        # return the remaining files
        return [(path, xsize, xmtime) for (path, (xsize, xmtime)) in to_index.items()]

    def add_document(
        self, server_id, name, path, size, mtime, audio_album=None, audio_artist=None, audio_title=None, audio_year=None
    ):
        """Add a document with the specified fields in the index.

        Changes need to be committed.
        """
        # passing the optional arguments is quite a mess
        # let's build a dict for that purpose
        # ......... part of the code is omitted here (elided in the source listing) .........
示例8: index_update
# 需要导入模块: from whoosh.writing import AsyncWriter [as 别名]
# 或者: from whoosh.writing.AsyncWriter import add_document [as 别名]
def index_update(index, items):
    """Apply a batch of index operations in a single Whoosh transaction.

    :param index: index name, looked up in ``service.app_state.indexes``.
    :param items: list of (operation, full class name, primary key, data)
        tuples.

    For every item with a primary key, the existing document is deleted.
    For "new" and "changed" operations on a class that has an adapter, the
    object is retrieved and re-added, at most once per object key.

    On any exception the writer is cancelled and the exception is
    re-raised. The session is closed on both the success and error paths.
    """
    index_name = index
    index = service.app_state.indexes[index_name]
    adapted = service.adapted
    session = safe_session()
    updated = set()
    writer = AsyncWriter(index)
    try:
        for op, cls_name, pk, data in items:
            if pk is None:
                continue

            # always delete. Whoosh manual says that 'update' is actually delete + add
            # operation
            object_key = f"{cls_name}:{pk}"
            writer.delete_by_term("object_key", object_key)

            adapter = adapted.get(cls_name)
            if not adapter:
                # FIXME: log to sentry?
                continue

            if object_key in updated:
                # don't add twice the same document in same transaction. The writer will
                # not delete previous records, ending in duplicate records for same
                # document.
                continue

            if op in ("new", "changed"):
                with session.begin(nested=True):
                    obj = adapter.retrieve(pk, _session=session, **data)

                if obj is None:
                    # deleted after task queued, but before task run
                    continue

                document = service.get_document(obj, adapter)
                try:
                    writer.add_document(**document)
                except ValueError:
                    # logger is here to give us more infos in order to catch a weird bug
                    # that happens regularly on CI but is not reliably
                    # reproducible.
                    logger.error("writer.add_document(%r)", document, exc_info=True)
                    raise
                updated.add(object_key)
    except Exception:
        writer.cancel()
        raise
    finally:
        # Runs before the commit on success (same order as before) and also
        # on failure, so the session is never left open.
        session.close()

    writer.commit()
    try:
        # async thread: wait for its termination
        writer.join()
    except RuntimeError:
        # happens when actual writer was already available: asyncwriter didn't need
        # to start a thread
        pass
示例9: newPageCore
# 需要导入模块: from whoosh.writing import AsyncWriter [as 别名]
# 或者: from whoosh.writing.AsyncWriter import add_document [as 别名]
# Create a new note page under `item`: pick its name (from the argument, a
# template dialog or a plain line-edit dialog), create any missing parent
# pages/directories, write the initial file body, add and select the tree
# item, create the attachment directory and index the new file with Whoosh.
# NOTE(review): `templateTitle` and `templateBody` are not referenced
# anywhere in this block.
# NOTE(review): leading indentation was lost in this listing, so the nesting
# described in the comments below is inferred from the statements - confirm
# against the original file.
def newPageCore(self, item, newPageName, useTemplate=False, templateTitle=None, templateBody=None):
pagePath = os.path.join(self.notePath, self.itemToPage(item)).replace(os.sep, "/")
# No name supplied by the caller: ask the user for one.
if not newPageName:
if useTemplate:
dialog = mikitemplate.PickTemplateDialog(pagePath, self.settings, parent=self)
if dialog.exec_():
curTitleIdx = dialog.titleTemplates.currentIndex()
curBodyIdx = dialog.bodyTemplates.currentIndex()
dtnow = datetime.datetime.now()
# An index of -1 means no title/body template is selected.
if curTitleIdx > -1:
titleItem = dialog.titleTemplates.model().item(curTitleIdx)
titleItemContent = titleItem.data(TTPL_COL_DATA)
titleItemType = titleItem.data(TTPL_COL_EXTRA_DATA)
titleParameter = dialog.titleTemplateParameter.text()
newPageName = mikitemplate.makeTemplateTitle(
titleItemType, titleItemContent, dtnow=dtnow, userinput=titleParameter
)
if curBodyIdx > -1:
bodyItemIdx = dialog.bodyTemplates.rootModelIndex().child(curBodyIdx, 0)
bodyFPath = dialog.bodyTemplates.model().filePath(bodyItemIdx)
else:
bodyFPath = None
else:
dialog = LineEditDialog(pagePath, self)
if dialog.exec_():
newPageName = dialog.editor.text()
# Last parent item found or created for a nested page name; stays None when
# no parent handling was needed.
prevparitem = None
if newPageName:
if hasattr(item, "text"):
pagePath = os.path.join(self.notePath, pagePath + "/").replace(os.sep, "/")
if not QtCore.QDir(pagePath).exists():
QtCore.QDir(self.notePath).mkdir(pagePath)
# The page name may contain directory components; collect every ancestor
# directory of the name, innermost first.
if not QtCore.QDir(os.path.dirname(newPageName)).exists():
curdirname = os.path.dirname(newPageName)
needed_parents = []
while curdirname != "":
needed_parents.append(curdirname)
curdirname = os.path.dirname(curdirname)
# create the needed hierarchy in reverse order
for i, needed_parent in enumerate(needed_parents[::-1]):
paritem = self.pageToItem(needed_parent)
if paritem is None:
# No tree item for this ancestor yet: create it as a page (recursive call)
# under `item` for the first ancestor, otherwise under the previous one.
if i == 0:
self.newPageCore(item, os.path.basename(needed_parent))
else:
self.newPageCore(prevparitem, os.path.basename(needed_parent))
QtCore.QDir(pagePath).mkdir(needed_parent)
elif not QtCore.QDir(os.path.join(self.notePath, needed_parent).replace(os.sep, "/")).exists():
QtCore.QDir(pagePath).mkdir(needed_parent)
if paritem is not None:
prevparitem = paritem
else:
prevparitem = self.pageToItem(needed_parent)
fileName = pagePath + newPageName + self.settings.fileExt
fh = QtCore.QFile(fileName)
fh.open(QtCore.QIODevice.WriteOnly)
savestream = QtCore.QTextStream(fh)
# NOTE(review): `bodyFPath` and `dtnow` are only assigned in the template
# dialog branch above; with useTemplate=True and a caller-supplied
# newPageName they look unbound here - confirm.
if useTemplate and bodyFPath is not None:
with open(bodyFPath, "r", encoding="utf-8") as templatef:
savestream << mikitemplate.makeTemplateBody(
os.path.basename(newPageName),
dtnow=dtnow,
dt_in_body_txt=self.tr("Created {}"),
body=templatef.read(),
)
else:
savestream << mikitemplate.makeDefaultBody(os.path.basename(newPageName), self.tr("Created {}"))
fh.close()
# Attach the new tree item to the deepest parent handled above, or to `item`.
if prevparitem is not None:
QtWidgets.QTreeWidgetItem(prevparitem, [os.path.basename(newPageName)])
else:
QtWidgets.QTreeWidgetItem(item, [os.path.basename(newPageName)])
newItem = self.pageToItem(pagePath + newPageName)
self.sortItems(0, Qt.AscendingOrder)
self.setCurrentItem(newItem)
if hasattr(item, "text"):
self.expandItem(item)
# create attachment folder if not exist
attDir = self.itemToAttachmentDir(newItem)
if not QtCore.QDir(attDir).exists():
QtCore.QDir().mkpath(attDir)
# TODO improvement needed, can be reused somehow
# Read the freshly written file back and index its content.
with open(fileName, "r") as fileobj:
content = fileobj.read()
self.ix = open_dir(self.settings.indexdir)
# writer = self.ix.writer()
writer = AsyncWriter(self.ix)
writer.add_document(path=pagePath + newPageName, content=content)
writer.commit()
示例10: create_in
# 需要导入模块: from whoosh.writing import AsyncWriter [as 别名]
# 或者: from whoosh.writing.AsyncWriter import add_document [as 别名]
# Create the index and the AsyncWriter that the loading loop below writes to
index = create_in("tweetindex", my_schema)
writer = AsyncWriter(index)

if __name__ == '__main__':
    # Load the raw data and add one document per CSV row
    with open("WC2015_headers.csv", 'rb') as to_load:
        for row in csv.DictReader(to_load):
            # Keep the first 19 characters and the last 4 of 'created_at' so
            # the value matches the strptime format used below
            created_at = row['created_at']
            date = created_at[:19] + ' ' + created_at[-4:]
            # Strip backslashes, then tokenize and drop the stop words
            text = row['text'].replace('\\', '')
            keywords = [word for word in word_tokenize(text) if word not in stops]
            # Add the completed document to the index; a tweet is flagged as
            # containing a retweet when 'RT ' occurs in its text
            writer.add_document(
                id=unicode(row['id']),
                screen_name=unicode(row['screen_name']),
                text=unicode(text),
                contains_retweet='RT ' in text,
                keyword=unicode(" ".join(keywords)),
                created=datetime.datetime.strptime(date, "%a %b %d %H:%M:%S %Y"),
            )
    writer.commit()
示例11: AsyncWriter
# 需要导入模块: from whoosh.writing import AsyncWriter [as 别名]
# 或者: from whoosh.writing.AsyncWriter import add_document [as 别名]
# Fragment of a larger function: its header and tail are not visible here.
# `ix`, `entries`, `records`, `netloc`, `settings`, `log`, `Item`,
# `get_link_references` and `expand_links` come from the surrounding code.
# NOTE(review): indentation was lost in this listing, so loop and branch
# extents are not certain - confirm against the original file.
writer = AsyncWriter(ix)
for entry in entries:
# Look the item up by its guid; create it (and count it) when it is missing.
try:
item = Item.get(guid = entry['guid'])
except Item.DoesNotExist:
item = Item.create(**entry)
records += 1
# Only entries with non-empty HTML are added to the full-text index.
if len(entry['html']):
soup = BeautifulSoup(entry['html'], settings.fetcher.parser)
# Concatenate every text node of the parsed HTML into one plain string.
plaintext = ''.join(soup.find_all(text=True))
writer.add_document(
id = item.id,
guid = unicode(item.guid),
title = entry['title'],
text = plaintext,
when = datetime.datetime.utcfromtimestamp(item.when)
)
hrefs = get_link_references(soup)
else:
hrefs = []
# The entry's own URL is always part of the collected links.
hrefs.append(entry['url'])
# Stop here when link expansion is disabled in the settings.
if not settings.fetcher.post_processing.expand_links:
return
# Time the link expansion for the debug log line below.
lnow = time.time()
links = expand_links(set(hrefs))
log.debug("%s - %d links in %fs" % (netloc, len(hrefs),time.time()-lnow))
示例12: add_to_fts
# 需要导入模块: from whoosh.writing import AsyncWriter [as 别名]
# 或者: from whoosh.writing.AsyncWriter import add_document [as 别名]
def add_to_fts(cls, content, title=None, id=None, source_hash=None, tags=None):
    """Add one document to the full-text index stored at ``LOCAL_FTS_INDEX``.

    All five fields are passed to the writer as given (including ``None``
    defaults) and the change is committed immediately.
    """
    fields = {
        "content": content,
        "title": title,
        "id": id,
        "source_hash": source_hash,
        "tags": tags,
    }
    fts_writer = AsyncWriter(open_dir(LOCAL_FTS_INDEX))
    fts_writer.add_document(**fields)
    fts_writer.commit()