本文整理汇总了Python中elasticsearch.helpers.scan方法的典型用法代码示例。如果您正苦于以下问题：Python helpers.scan方法的具体用法？Python helpers.scan怎么用？Python helpers.scan使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块elasticsearch.helpers的用法示例。
在下文中一共展示了helpers.scan方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: get
# 需要导入模块: from elasticsearch import helpers [as 别名]
# 或者: from elasticsearch.helpers import scan [as 别名]
def get(self, spec, fields=None, **kargs):
    """Yield the documents of the active index that match ``spec``.

    Args:
        spec: query object; the result of its ``to_dict()`` is sent as the
            ``query`` part of the search body.
        fields: optional value for the ``_source`` part of the search body;
            left out of the body when ``None``.
        **kargs: accepted but not used by this method.

    Yields:
        dict: the hit's ``_source`` with an extra ``_id`` key. When
        ``infos`` holds ``coordinates`` their order is reversed, and every
        present field named in ``self.datetime_fields`` is converted with
        ``utils.all2datetime``.
    """
    body = {"query": spec.to_dict()}
    if fields is not None:
        body['_source'] = fields
    hits = helpers.scan(
        self.db_client,
        query=body,
        index=self.indexes[0],
        ignore_unavailable=True,
    )
    for hit in hits:
        record = dict(hit['_source'], _id=hit['_id'])
        infos = record.get('infos', {})
        if 'coordinates' in infos:
            # Hand the coordinate pair out in the opposite order to the
            # one it is stored in.
            infos['coordinates'] = infos['coordinates'][::-1]
        for name in self.datetime_fields:
            if name not in record:
                continue
            record[name] = utils.all2datetime(record[name])
        yield record
示例2: scan
# 需要导入模块: from elasticsearch import helpers [as 别名]
# 或者: from elasticsearch.helpers import scan [as 别名]
def scan(self):
    """
    Run this search through the ``scan`` helper and lazily yield every
    document matching the query.

    Use the ``params`` method to specify any additional arguments you wish
    to pass to the underlying ``scan`` helper from ``elasticsearch-py`` -
    http://elasticsearch-py.readthedocs.org/en/master/helpers.html#elasticsearch.helpers.scan

    Each raw hit is wrapped with the callable registered for its ``_type``
    in ``self._doc_type_map``; ``Result`` is used when none is registered.
    """
    client = connections.get_connection(self._using)
    raw_hits = scan(
        client,
        query=self.to_dict(),
        index=self._index,
        doc_type=self._doc_type,
        **self._params
    )
    for raw in raw_hits:
        wrapper = self._doc_type_map.get(raw['_type'], Result)
        yield wrapper(raw)
示例3: scan
# 需要导入模块: from elasticsearch import helpers [as 别名]
# 或者: from elasticsearch.helpers import scan [as 别名]
def scan(self):
    """
    Execute the search as a scan and return a generator over all the
    documents matching the query.

    Use the ``params`` method to specify any additional arguments you wish
    to pass to the underlying ``scan`` helper from ``elasticsearch-py`` -
    https://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.scan

    Every raw hit is passed through ``self._get_result`` before it is
    yielded.
    """
    connection = connections.get_connection(self._using)
    raw_hits = scan(
        connection,
        query=self.to_dict(),
        index=self._index,
        **self._params
    )
    for raw in raw_hits:
        yield self._get_result(raw)
示例4: reindex
# 需要导入模块: from elasticsearch import helpers [as 别名]
# 或者: from elasticsearch.helpers import scan [as 别名]
def reindex(src_index, dst_index, type_list, chunk_size=None, time=None):
    """Copy the documents of the given types from one index into another.

    All documents in ``src_index`` whose ``_type`` is listed in
    ``type_list`` are selected and handed to ``helper_reindex``, which
    writes them to ``dst_index``.

    Args:
        src_index: name of the index to read from.
        dst_index: name of the index to write to.
        type_list: document types to copy; used as the values of a
            ``terms`` filter on ``_type``.
        chunk_size: accepted but not used by this function.
        time: accepted but not used by this function.
    """
    engine = searchlight.elasticsearch.get_api()

    # Several document types may be requested, so a "terms" filter on
    # "_type" is used to match any of them.
    type_filter = {"terms": {"_type": type_list}}

    # "version" is switched on so the search returns each document's
    # version field, which the reindexer uses.
    search_body = {
        "version": "true",
        "query": {"bool": {"filter": type_filter}},
    }

    # Debug: Show all documents that ES will re-index.
    # LOG.debug(engine.search(index=src_index, body=search_body, size=500))
    helper_reindex(
        client=engine,
        source_index=src_index,
        target_index=dst_index,
        query=search_body,
    )
示例5: scan_index
# 需要导入模块: from elasticsearch import helpers [as 别名]
# 或者: from elasticsearch.helpers import scan [as 别名]
def scan_index(index: str, model: Model) -> Generator:
    """
    Yield all documents of model type in an index.

    The index is scanned with ``elasticsearch.helpers.scan`` using a
    ``type`` query whose value is the model's ``_meta.model_name``.

    Args:
        index: string, the name of the index to scan, must be a configured
            index as returned from settings.get_index_names.
        model: a Django model type, used to filter the documents that
            are scanned.

    Yields each document of type model in index, one at a time.
    """
    # noqa: E501, see https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-type-query.html
    type_query = {"type": {"value": model._meta.model_name}}
    es_client = get_client()
    yield from helpers.scan(es_client, index=index, query={"query": type_query})
示例6: dump_documents
# 需要导入模块: from elasticsearch import helpers [as 别名]
# 或者: from elasticsearch.helpers import scan [as 别名]
def dump_documents(client, index, out_path, total_docs, progress_message_suffix=""):
    """Dump at most ``total_docs`` documents of ``index`` to disk.

    The ``_source`` of every scanned document is serialized as one compact
    JSON line (UTF-8, newline terminated). Each line is written to
    ``out_path`` and, run through a ``DOCS_COMPRESSOR`` instance, to
    ``out_path + COMP_EXT``.

    Args:
        client: Elasticsearch client handed to ``helpers.scan``.
        index: name of the index to dump.
        out_path: path of the uncompressed output file.
        total_docs: maximum number of documents to write; also used as the
            total when rendering progress.
        progress_message_suffix: text forwarded to ``render_progress``.
    """
    from elasticsearch import helpers

    logger = logging.getLogger(__name__)
    freq = max(1, total_docs // 1000)

    progress = console.progress()
    compressor = DOCS_COMPRESSOR()
    comp_outpath = out_path + COMP_EXT
    with open(out_path, "wb") as outfile, open(comp_outpath, "wb") as comp_outfile:
        logger.info("Dumping corpus for index [%s] to [%s].", index, out_path)
        query = {"query": {"match_all": {}}}
        for n, doc in enumerate(helpers.scan(client, query=query, index=index)):
            # n is zero-based: once n == total_docs, total_docs documents
            # have already been written. The previous ``n > total_docs``
            # check let one extra document through.
            if n >= total_docs:
                break
            data = (json.dumps(doc["_source"], separators=(",", ":")) + "\n").encode("utf-8")

            outfile.write(data)
            comp_outfile.write(compressor.compress(data))

            render_progress(progress, progress_message_suffix, index, n + 1, total_docs, freq)

        # Emit whatever the compressor is still buffering before the
        # compressed file is closed.
        comp_outfile.write(compressor.flush())
    progress.finish()
示例7: read_item
# 需要导入模块: from elasticsearch import helpers [as 别名]
# 或者: from elasticsearch.helpers import scan [as 别名]
def read_item(self, from_date=None):
    """Read items one by one.

    The search query is built by ``self._build_search_query(from_date)``
    and run through ``helpers.scan`` with a ``300m`` scroll and
    ``preserve_order`` enabled.

    :param from_date: start date for incremental reading
    :return: _source field of each ES hit.
    :raises ValueError: `metadata__timestamp` field not found in index
    :raises NotFoundError: index not found in ElasticSearch
    """
    query = self._build_search_query(from_date)
    es_hits = helpers.scan(
        self._es_conn,
        query,
        scroll='300m',
        index=self._es_index,
        preserve_order=True,
    )
    for es_hit in es_hits:
        yield es_hit["_source"]
示例8: scan
# 需要导入模块: from elasticsearch import helpers [as 别名]
# 或者: from elasticsearch.helpers import scan [as 别名]
def scan(self):
    """
    Turn the search into a scan search and return a generator that
    iterates over all the documents matching the query.

    Use the ``params`` method to specify any additional arguments you wish
    to pass to the underlying ``scan`` helper from ``elasticsearch-py`` -
    https://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.scan

    Each raw hit goes through ``self._get_result`` before being yielded.
    """
    es_client = get_connection(self._using)
    scanned = scan(
        es_client,
        query=self.to_dict(),
        index=self._index,
        **self._params
    )
    for raw_hit in scanned:
        result = self._get_result(raw_hit)
        yield result
示例9: get_link_list
# 需要导入模块: from elasticsearch import helpers [as 别名]
# 或者: from elasticsearch.helpers import scan [as 别名]
def get_link_list(self, website_id, base_url):
    """Yield one URL per indexed document of ``website_id``.

    Only the ``path``, ``name`` and ``ext`` source fields are requested.
    Each URL is the root of ``base_url`` followed by the path (plus ``/``
    when the path is not empty), the name, and the extension (preceded by
    ``.`` when the extension is not empty).

    Args:
        website_id: value matched against the ``website_id`` field; also
            passed as the ``routing`` value of the scan.
        base_url: URL whose root (``urljoin(base_url, "/")``) prefixes every
            yielded link.
    """
    search_body = {
        "_source": {
            "includes": ["path", "name", "ext"]
        },
        "query": {
            "constant_score": {
                "filter": {
                    "term": {"website_id": website_id}
                }
            }
        },
    }
    hits = helpers.scan(
        client=self.es,
        query=search_body,
        index=self.index_name,
        request_timeout=20,
        routing=website_id,
    )

    for hit in hits:
        doc = hit["_source"]
        root = urljoin(base_url, "/")
        path, name, ext = doc["path"], doc["name"], doc["ext"]
        # Separators are only emitted next to a non-empty component.
        path_sep = "/" if path != "" else ""
        ext_sep = "." if ext != "" else ""
        yield root + path + path_sep + name + ext_sep + ext
示例10: clear
# 需要导入模块: from elasticsearch import helpers [as 别名]
# 或者: from elasticsearch.helpers import scan [as 别名]
def clear(cls, index=None, using=None):
    """
    Deletes every indexed document of this document type.

    Nothing happens unless the type exists in the index. When it does, all
    documents of the type are scanned, a bulk ``delete`` action is issued
    for each of them, and the index is refreshed.

    ``using`` falls back to ``cls._doc_type.using`` and then ``'default'``;
    ``index`` falls back to ``cls._doc_type.index`` and then the
    ``SEEKER_INDEX`` setting (``'seeker'`` when unset).
    """
    using = using or cls._doc_type.using or 'default'
    index = index or cls._doc_type.index or getattr(settings, 'SEEKER_INDEX', 'seeker')
    es = connections.get_connection(using)
    type_name = cls._doc_type.name
    if es.indices.exists_type(index=index, doc_type=type_name):
        hits = scan(es, index=index, doc_type=type_name, query={'query': {'match_all': {}}})
        # Lazily turn each hit into the matching bulk delete action.
        delete_actions = (
            {
                '_op_type': 'delete',
                '_index': index,
                '_type': type_name,
                '_id': hit['_id'],
            }
            for hit in hits
        )
        bulk(es, delete_actions)
        es.indices.refresh(index=index)
示例11: run_query
# 需要导入模块: from elasticsearch import helpers [as 别名]
# 或者: from elasticsearch.helpers import scan [as 别名]
def run_query(self):
    """Fetch the flow records from Elasticsearch as a pandas DataFrame.

    The query built by ``self.hour_query`` is run through ``helpers.scan``
    and the ``_source`` of every hit becomes one row. The frame then gets:

    * ``dest_port``: the destination-port column with missing values
      replaced by 0, as int;
    * the bytes-to-server column taken out of the ``flow`` column, when a
      ``flow`` column is present;
    * ``triad_id``: ``hash`` of source IP + destination IP + destination
      port (as string);
    * ``triad_freq``: number of rows sharing the same ``triad_id``.

    ``self.high_freq`` is set to the ``triad_id`` values that occur more
    than ``self.MIN_OCCURRENCES`` times.

    Returns:
        The populated DataFrame.

    Raises:
        Exception: when the scan returns no records.
    """
    self.vprint("{info} Gathering flow data... this may take a while...".format(info=self.info))

    bytes_field = self.beacon_flow_bytes_toserver
    if self.suricata_defaults:
        bytes_field = 'flow.' + bytes_field

    query = self.hour_query(
        self.period,
        self.beacon_src_ip,
        self.beacon_dest_ip,
        self.beacon_destination_port,
        self.beacon_timestamp,
        bytes_field,
        self.beacon_flow_id,
    )
    self.dprint(query)

    hits = helpers.scan(query=query, client=self.es, scroll="90m", index=self.es_index, timeout="10m")
    df = pd.DataFrame([hit['_source'] for hit in hits])
    if not len(df):
        raise Exception("Elasticsearch did not retrieve any data. Please ensure your settings are correct inside the config file.")
    self.dprint(df)

    df['dest_port'] = df[self.beacon_destination_port].fillna(0).astype(int)

    if 'flow' in df.columns:
        df[self.beacon_flow_bytes_toserver] = df['flow'].apply(
            lambda flow: flow.get(self.beacon_flow_bytes_toserver)
        )

    triad_key = df[self.beacon_src_ip] + df[self.beacon_dest_ip] + df[self.beacon_destination_port].astype(str)
    df['triad_id'] = triad_key.apply(hash)
    df['triad_freq'] = df.groupby('triad_id')['triad_id'].transform('count').fillna(0).astype(int)

    frequent = df[df.triad_freq > self.MIN_OCCURRENCES]
    self.high_freq = list(frequent.groupby('triad_id').groups.keys())
    return df
示例12: delete
# 需要导入模块: from elasticsearch import helpers [as 别名]
# 或者: from elasticsearch.helpers import scan [as 别名]
def delete(self, **kwargs):
    """
    Deletes a document from the index
    Pass an index and id to delete a specific document
    Pass a body with a query dsl to delete by query

    With a ``body``, every hit returned by ``helpers.scan`` for that query
    is turned into a bulk ``delete`` action and the result of
    ``helpers.bulk`` is returned; ``refresh`` (default ``False``) is
    forwarded to the bulk call. An error carrying ``status_code == 404``
    is ignored and ``None`` is returned. Any other error is logged as a
    warning and re-raised.

    Without a ``body``, the result of ``self.es.delete`` (called with
    ``ignore=[404]``) is returned; errors are logged as a warning and
    re-raised.
    """
    kwargs = self._add_prefix(**kwargs)
    body = kwargs.pop("body", None)
    if body is not None:
        try:
            refresh = kwargs.pop("refresh", False)
            data = []
            for hit in helpers.scan(self.es, query=body, **kwargs):
                hit["_op_type"] = "delete"
                data.append(hit)
            return helpers.bulk(self.es, data, refresh=refresh, **kwargs)
        except Exception as detail:
            # ignore 404 errors (index_not_found_exception)
            if getattr(detail, "status_code", None) == 404:
                return None
            # Previously an error with any other status code fell through
            # silently; only errors without a status_code were reported.
            self.logger.warning(
                "%s: WARNING: failed to delete document by query: %s \nException detail: %s\n" % (datetime.now(), body, detail)
            )
            raise
    else:
        try:
            return self.es.delete(ignore=[404], **kwargs)
        except Exception as detail:
            self.logger.warning("%s: WARNING: failed to delete document: %s \nException detail: %s\n" % (datetime.now(), body, detail))
            raise
示例13: get_all_doc
# 需要导入模块: from elasticsearch import helpers [as 别名]
# 或者: from elasticsearch.helpers import scan [as 别名]
def get_all_doc(self, index, doc_type, search_body):
    """Return the ``_source`` of every document matching ``search_body``.

    The search runs through ``helpers.scan`` with a ``60s`` scroll and a
    ``60s`` timeout, and all hits are collected before returning.

    Args:
        index: index passed to the scan.
        doc_type: document type passed to the scan.
        search_body: query body passed to the scan.

    Returns:
        list: the ``_source`` value of each hit, in scan order.
    """
    hits = helpers.scan(
        client=self.es,
        query=search_body,
        scroll="60s",
        index=index,
        doc_type=doc_type,
        timeout="60s",
    )
    return [hit["_source"] for hit in hits]
示例14: _scan
# 需要导入模块: from elasticsearch import helpers [as 别名]
# 或者: from elasticsearch.helpers import scan [as 别名]
def _scan(self, index, search_range, bool_clause=None, sort_clause=None, query_fields=None, search_query=None,
          model_settings=None):
    """
    Scan and get documents in Elasticsearch

    When ``model_settings`` is given and its
    ``process_documents_chronologically`` entry is truthy, ``sort_clause``
    is replaced by a descending sort on the configured ``timestamp_field``
    and the scan is asked to preserve that order.

    :param index: on which index the request must be done
    :param search_range: the range of the search
    :param bool_clause: boolean condition
    :param sort_clause: request to sort results
    :param query_fields: the query field
    :param search_query: the search query
    :param model_settings: part of the configuration linked to the model
    :return: generator to fetch documents
    """
    highlight_settings = self._get_highlight_settings(model_settings)

    chronological = model_settings is not None and model_settings["process_documents_chronologically"]
    if chronological:
        sort_clause = {"sort": [{model_settings["timestamp_field"]: "desc"}]}
    preserve_order = bool(chronological)

    # Timeout, page size and scroll duration all come from the "general"
    # section of the configuration.
    config = self.settings.config
    timeout = config.getint("general", "es_timeout")
    query = build_search_query(
        bool_clause=bool_clause,
        sort_clause=sort_clause,
        search_range=search_range,
        query_fields=query_fields,
        search_query=search_query,
        highlight_settings=highlight_settings,
    )
    scan_size = config.getint("general", "es_scan_size")
    scroll_time = config.get("general", "es_scroll_time")

    return eshelpers.scan(
        self.conn,
        request_timeout=timeout,
        index=index,
        query=query,
        size=scan_size,
        scroll=scroll_time,
        preserve_order=preserve_order,
        raise_on_error=False,
    )
示例15: scan
# 需要导入模块: from elasticsearch import helpers [as 别名]
# 或者: from elasticsearch.helpers import scan [as 别名]
def scan(self, body, **kwargs):
    """Run the ``scan`` helper for ``body`` on ``self.client``.

    Args:
        body: search body, passed to the helper as ``query``.
        **kwargs: forwarded unchanged to the helper.

    Returns:
        Whatever the ``scan`` helper returns.
    """
    hits = scan(self.client, query=body, **kwargs)
    return hits