This article collects typical code examples of the Python method pyelasticsearch.ElasticSearch.bulk. If you have been wondering what ElasticSearch.bulk does, how to call it, or what real-world usage looks like, the hand-picked examples below may help. You can also explore further usage examples of the containing class, pyelasticsearch.ElasticSearch.
The following shows 13 code examples of the ElasticSearch.bulk method, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
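Most of the examples below follow the same basic pattern: build operations with index_op/update_op/delete_op, optionally group them with bulk_chunks, and send each group with ElasticSearch.bulk. As a quick orientation, here is a minimal sketch of that pattern (the cluster URL, index name, doc type, and documents are placeholders, not taken from any example below):

from pyelasticsearch import ElasticSearch, bulk_chunks

es = ElasticSearch('http://localhost:9200/')  # placeholder cluster URL

# Any iterable of documents works; a generator keeps memory use flat.
docs = ({'id': n, 'body': 'document %d' % n} for n in range(10000))
ops = (es.index_op(doc, id=doc['id']) for doc in docs)

# bulk_chunks() splits the stream of operations into reasonably sized bulk requests.
for chunk in bulk_chunks(ops, docs_per_chunk=500):
    es.bulk(chunk, index='example-index', doc_type='example-doc')

es.refresh('example-index')  # make the new documents searchable right away

Passing index and doc_type to bulk() applies them to every operation in the chunk; several examples below instead set them per operation in index_op().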
Example 1: add_document
# Required import: from pyelasticsearch import ElasticSearch [as alias]
# Or: from pyelasticsearch.ElasticSearch import bulk [as alias]
def add_document(entries):
    es_server = 'http://localhost:9200/'
    if os.environ.get('ELASTICSEARCH_SERVER'):
        es_server = os.environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)
    es.bulk([es.index_op(doc) for doc in entries],
            index=os.environ['ELASTICSEARCH_INDEX'] if os.environ.get('ELASTICSEARCH_INDEX') else 'memex',
            doc_type=os.environ['ELASTICSEARCH_DOC_TYPE'] if os.environ.get('ELASTICSEARCH_DOC_TYPE') else 'page')
Example 2: Indexer
# Required import: from pyelasticsearch import ElasticSearch [as alias]
# Or: from pyelasticsearch.ElasticSearch import bulk [as alias]
class Indexer(object):
    def __init__(self, input):
        self.input = input
        self.es = ElasticSearch()
        self.index_name = "psim"
        self.doc_type = 'book'

    def delete_index(self):
        # Delete the index if it already exists
        try:
            self.es.delete_index(index=self.index_name)
        except Exception:
            pass

    def create_index(self):
        self.es.create_index(index=self.index_name, settings=self.get_index_settings())

    def get_index_settings(self):
        settings = {
            "mappings": {
                "book": {
                    "_all": {"enabled": "false"},
                    "properties": {
                        "codes": {"type": "string",
                                  "term_vector": "yes",
                                  "store": "true"},
                        "pid": {"type": "string"},
                        "embedding": {"type": "float",
                                      "store": "true"},
                        "magnitude": {"type": "float", "store": "true"}
                    }
                }
            }
        }
        return settings

    def documents(self):
        with open(self.input) as input_file:
            for line in input_file:
                json_doc = json.loads(line)
                yield self.es.index_op(json_doc, doc_type=self.doc_type)

    def index(self):
        self.delete_index()
        self.create_index()
        for chunk in bulk_chunks(self.documents(), docs_per_chunk=1000):
            self.es.bulk(chunk, index=self.index_name, doc_type=self.doc_type)
        self.es.refresh(self.index_name)
Example 3: update_document
# Required import: from pyelasticsearch import ElasticSearch [as alias]
# Or: from pyelasticsearch.ElasticSearch import bulk [as alias]
def update_document(entries):
    es_server = 'http://localhost:9200/'
    if os.environ.get('ELASTICSEARCH_SERVER'):
        es_server = os.environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)
    # es.update(index=os.environ['ELASTICSEARCH_INDEX'] if os.environ.get('ELASTICSEARCH_INDEX') else 'memex',
    #           doc_type=os.environ['ELASTICSEARCH_DOC_TYPE'] if os.environ.get('ELASTICSEARCH_DOC_TYPE') else 'page',
    #           id=url,
    #           script=doc,
    #           upsert=True
    #           )
    es.bulk([es.update_op(doc, id=doc['url'], upsert=True) for doc in entries],
            index=os.environ['ELASTICSEARCH_INDEX'] if os.environ.get('ELASTICSEARCH_INDEX') else 'memex',
            doc_type=os.environ['ELASTICSEARCH_DOC_TYPE'] if os.environ.get('ELASTICSEARCH_DOC_TYPE') else 'page')
Example 4: commit
# Required import: from pyelasticsearch import ElasticSearch [as alias]
# Or: from pyelasticsearch.ElasticSearch import bulk [as alias]
def commit(self):
    if len(self.buffer) > 0:
        logging.debug("Inserting {} to elasticsearch".format(len(self.buffer)))
        es = ElasticSearch(ELASTICSEARCH_URI)
        docs = []
        for doc in self.buffer:
            t = time.gmtime(int(doc['@timestamp'] / 1000))
            index = ELASTICSEARCH_INDEX + "-" + str(t.tm_year).zfill(2) + "." + str(t.tm_mon).zfill(2) + "." + str(t.tm_mday).zfill(2)
            docs.append(es.index_op(doc, index=index, doc_type=ELASTICSEARCH_DOC))
        if len(docs) > 0:
            try:
                es.bulk(docs)
                logging.debug("inserted %d records" % (len(docs)))
                self.buffer = []
            except Exception as e:
                logging.error("Insert Exception " + str(e))
Example 5: update_index
# Required import: from pyelasticsearch import ElasticSearch [as alias]
# Or: from pyelasticsearch.ElasticSearch import bulk [as alias]
def update_index(sender, created, **kwargs):
    """
    A signal handler that indexes new coffeehouses
    upon creation.
    """
    es = ElasticSearch()
    if created:
        m = sender.objects.last()
        es.bulk([
            es.index_op({
                "pk": m.pk,
                "name": m.name,
                "rating": m.rating,
                "location": {
                    "lon": m.position.longitude,
                    "lat": m.position.latitude
                }
            }),
        ],
            doc_type="place",
            index="toerh_coffee")
Example 6: ElasticSearch
# Required import: from pyelasticsearch import ElasticSearch [as alias]
# Or: from pyelasticsearch.ElasticSearch import bulk [as alias]
    return data

es = ElasticSearch('http://localhost:9200/')
es.delete_index('pet')
spider = Spider()
breeds = spider.getPetBreeds()
p = Pinyin()
for breed in breeds:
    flg = 1
    page = 1
    pet_list = []
    while flg:
        pets = spider.getPets(breed, (page - 1) * spider.limit)
        if not pets:
            flg = 0
        else:
            page = page + 1
            for pet in pets:
                pet_obj = {}
                pet_obj['name'] = pet['name']
                pet_obj['img'] = pet['img']
                pet_obj['type'] = breed['ename']
                pet_list.append(pet_obj)
                # print pet['name'] + '\t' + p.get_pinyin(pet['name'], '')
    print breed['ename'] + '\n'
    if not pet_list:
        continue
    doc_type = p.get_pinyin(breed['ename'].replace('宠物', ''), '')
    es.bulk((es.index_op(pet_obj) for pet_obj in pet_list), doc_type=doc_type, index='pet')
es.refresh('pet')
Example 7: getFeeds
# Required import: from pyelasticsearch import ElasticSearch [as alias]
# Or: from pyelasticsearch.ElasticSearch import bulk [as alias]
def getFeeds():
    print "getting feeds"
    es = ElasticSearch('http://fisensee.ddns.net:9200/')
    query = {"query": {"range": {"date": {"lte": "now-1w/w"}}}}
    oldFeeds = es.search(query, size=300, index='feeds')
    if len(oldFeeds['hits']['hits']) != 0:
        es.bulk(es.delete_op(id=feed['_id'], index='feeds',
                             doc_type='feed') for feed in oldFeeds['hits']['hits'])

    feedSources = FeedSource.objects.all()
    feeds = []
    defaultText = 'undefined'
    defaultDate = datetime.datetime.now().isoformat()
    utc = pytz.utc
    berlin = pytz.timezone('Europe/Berlin')
    now = datetime.datetime.today()
    dateThreshold = now - datetime.timedelta(weeks=2)
    allUrls = []
    for feedSource in feedSources:
        allUrls.append(feedSource.sourceUrl)
    urls = set(allUrls)
    for url in urls:
        source = feedparser.parse(url)
        for entry in source['items']:
            feed = {
                'title': defaultText,
                'description': defaultText,
                'link': defaultText,
                'date': defaultDate,
                'url': defaultText
            }
            if 'title' in entry:
                feed['title'] = entry['title']
            if 'description' in entry:
                feed['description'] = entry['description']
            if 'link' in entry:
                feed['link'] = entry['link']
            if 'published_parsed' in entry:
                date = datetime.datetime.fromtimestamp(time.mktime(entry['published_parsed']))
                if date < dateThreshold:
                    break
                utcDate = utc.localize(date)
                feed['date'] = utcDate.astimezone(berlin).isoformat()
            # id creation should be enough for now, but it's made to fail
            if 'title' in entry or 'published_parsed' in entry:
                feed['id'] = base64.urlsafe_b64encode(hashlib.sha256((feed['title'] + feed['date']).encode('utf8')).hexdigest())
            else:
                feed['id'] = base64.urlsafe_b64encode(hashlib.sha256((feed['title']).encode('utf8')).hexdigest())
            feed['url'] = url
            feeds.append(feed)
    es.bulk((es.index_op(feed, **{'id': feed.pop('id')}) for feed in feeds),
            index='feeds',
            doc_type='feed')
    print es.refresh('feeds')
Example 8: prepareDataFromDB
# Required import: from pyelasticsearch import ElasticSearch [as alias]
# Or: from pyelasticsearch.ElasticSearch import bulk [as alias]
    }
}

es.health(wait_for_status='yellow')
es.delete_index('write-ads')
es.create_index('write-ads', settings={'mappings': ad_mapping})

dateYMD = args["date"]
prepareDataFromDB(dateYMD)

dir = DATA_FILES_JSON + '/' + dateYMD
for filename in os.listdir(dir):
    if filename.endswith('.json'):
        with open(dir + '/' + filename) as open_file:
            json_docs = json.load(open_file)
            es.bulk((es.index_op(doc) for doc in json_docs),
                    index='write-ads',
                    doc_type='ad')
es.refresh("write-ads")

res = es.search('website:com', index='write-ads')
print("Got %d Hits for .com websites" % res['hits']['total'])
for hit in res['hits']['hits']:
    print(hit["_source"])

res = es.search('website:in', index='write-ads')
print("Got %d Hits for .in websites" % res['hits']['total'])

res = es.search('category:entertainment', index='write-ads')
print("Got %d Hits for category:Entertainment" % res['hits']['total'])
Example 9: bulk_chunks
# Required import: from pyelasticsearch import ElasticSearch [as alias]
# Or: from pyelasticsearch.ElasticSearch import bulk [as alias]
                "coordinates": coords,  # 4, 5
                "feature_class": row[6],
                "feature_code": row[7],
                "country_code2": row[8],
                "country_code3": country_code3,
                "cc2": row[9],
                "admin1_code": row[10],
                "admin2_code": row[11],
                "admin3_code": row[12],
                "admin4_code": row[13],
                "population": row[14],
                "elevation": row[15],
                "dem": row[16],
                "timezone": row[17],
                "modification_date": "2014-01-01"
            }
            yield es.index_op(doc, index='geonames', doc_type='geoname')
        except:
            count += 1
            print 'Exception count:', count

chunk_count = 0
for chunk in bulk_chunks(documents(reader, es), docs_per_chunk=500):
    es.bulk(chunk)
    chunk_count += 1
    print 'Chunk count:', chunk_count

es.refresh('geonames')
Example 10: enumerate
# Required import: from pyelasticsearch import ElasticSearch [as alias]
# Or: from pyelasticsearch.ElasticSearch import bulk [as alias]
    }
}
es.create_index(ELASTICSEARCH_INDEX, settings=index_settings)

for filename in FILES:
    print "Processing %s" % filename
    sf = shapefile.Reader(filename)
    shapes = sf.shapes()
    for i, shape in enumerate(shapes, start=1):
        points = [(p[0], p[1]) for p in shape.points]
        data = {
            'filename': filename,
            'location': {
                'type': 'polygon',
                'coordinates': [points]
            }
        }
        # Close the polygon if the shapefile left it open
        if points[-1] != points[0]:
            points.append(points[0])
        try:
            es.bulk([es.index_op(data)],
                    doc_type=ELASTICSEARCH_DOC,
                    index=ELASTICSEARCH_INDEX)
        except:
            print "Exception"
Example 11: SearchIndex
# Required import: from pyelasticsearch import ElasticSearch [as alias]
# Or: from pyelasticsearch.ElasticSearch import bulk [as alias]
class SearchIndex(object):
    def __init__(self, model):
        self.es = ElasticSearch()
        self.model = model

    def put_mapping(self, index, doc_type):
        mapping = {
            doc_type: {
                "properties": {
                    "location": {
                        "type": "geo_point"
                    },
                }
            }
        }
        self.es.put_mapping(index=index, doc_type=doc_type, mapping=mapping)

    def bulk_items(self, index, doc_type):
        for m in self.model.objects.all():
            self.es.bulk([
                self.es.index_op({
                    "pk": m.pk,
                    "name": m.name,
                    "rating": m.rating,
                    "address": m.address,
                    "description": m.description,
                    "location": {
                        "lon": m.longitude,
                        "lat": m.latitude
                    }
                }),
            ],
                doc_type=doc_type,
                index=index)

    def search(self, index, question, longitude, latitude, size=10):
        # self.es.delete_index(index)
        try:
            self.es.create_index(index)
            self.put_mapping(index, "place")
            self.bulk_items(index, "place")
        except IndexAlreadyExistsError:
            pass
        query = {
            "query": {
                "function_score": {
                    "query": {
                        "bool": {
                            "should": [
                                {"match": {"name": question}},
                                {"match": {"_all": {
                                    "query": question,
                                    "operator": "or",
                                    "fuzziness": "auto",
                                    "zero_terms_query": "all"
                                }}}
                            ]
                        }
                    },
                    "functions": [
                        {"exp": {"rating": {"origin": 5, "scale": 1, "offset": 0.1}}},
                    ]
                }
            }
        }
        if longitude and longitude is not None:
            query['query']['function_score']['functions'] = [
                {'gauss': {
                    "location": {"origin": {"lat": latitude, "lon": longitude}, "offset": "550m", "scale": "1km"}
                }},
                {'gauss': {
                    "location": {"origin": {"lat": latitude, "lon": longitude}, "offset": "500m", "scale": "2km"}
                }},
            ]
        results = self.es.search(query, index=index, size=size)
        self.es.refresh()
        return results
Example 12: MySync
# Required import: from pyelasticsearch import ElasticSearch [as alias]
# Or: from pyelasticsearch.ElasticSearch import bulk [as alias]
class MySync(object):
    ts = 0  # last chunk time
    log_file = None
    log_pos = None

    def __init__(self):
        self.config = yaml.load(open('./etc/config.yaml'))
        self.mark_path = self.config['binlog']['mark']
        self.bulk_size = self.config['es']['bulk_size']
        self.excludes_fields = self.config['slave']['excludes_fields']
        self.es = ElasticSearch('http://{host}:{port}/'.format(
            host=self.config['es']['host'],
            port=self.config['es']['port']
        ))

        # set up logging
        logging.basicConfig(
            level=logging.DEBUG,
            format='%(asctime)s %(levelname)s %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S',
            filename=self.config['log']['run']
        )
        logging.getLogger('elasticsearch').setLevel(logging.INFO)
        logging.getLogger('elasticsearch.trace').setLevel(logging.INFO)
        logging.getLogger('elasticsearch.trace').addHandler(logging.StreamHandler())

        # resume the binlog stream from the last saved position, if any
        if os.path.isfile(self.mark_path):
            with open(self.mark_path, 'r') as y:
                mark = yaml.load(y)
                self.log_file = mark.get('log_file')
                self.log_pos = mark.get('log_pos')
                logging.info('resume stream >> file:%s, pos:%s' % (self.log_file, self.log_pos))

    """
    mark the current binlog position
    """
    def mark_binlog(self):
        if self.log_file and self.log_pos:
            with open(self.mark_path, 'w') as y:
                logging.info('mark binlog >> file:%s, pos:%s' % (self.log_file, self.log_pos))
                yaml.safe_dump({'log_file': self.log_file, 'log_pos': self.log_pos}, y, default_flow_style=False)

    """
    format row fields
    """
    def _format(self, dat):
        for k, v in list(dat.items()):  # copy the items so keys can be deleted while iterating
            if isinstance(v, datetime):
                dat[k] = v.strftime('%Y-%m-%d %H:%M:%S')
            elif isinstance(v, date):
                dat[k] = v.strftime('%Y-%m-%d')
            if k in self.excludes_fields:
                del dat[k]
        return dat

    """
    handle MySQL binlog events
    """
    def proc_binlog(self):
        stream = BinLogStreamReader(
            connection_settings=self.config['mysql'],
            server_id=self.config['slave']['server_id'],
            log_file=self.log_file,
            log_pos=self.log_pos,
            only_schemas=self.config['slave']['schemas'],
            blocking=True,
            resume_stream=bool(self.log_file and self.log_pos),
            only_events=[WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent]
        )
        for binlogevent in stream:
            # binlogevent.dump()
            self.log_file = stream.log_file
            self.log_pos = stream.log_pos
            for row in binlogevent.rows:
                pk = binlogevent.primary_key
                table = binlogevent.table
                schema = binlogevent.schema
                if isinstance(binlogevent, WriteRowsEvent):
                    yield self.es.index_op(self._format(row['values']), doc_type=table, index=schema, id=row['values'][pk])
                elif isinstance(binlogevent, UpdateRowsEvent):
                    yield self.es.update_op(self._format(row['after_values']), doc_type=table, index=schema, id=row['after_values'][pk])
                elif isinstance(binlogevent, DeleteRowsEvent):
                    yield self.es.delete_op(doc_type=table, index=schema, id=row['values'][pk])
                else:
                    continue
        stream.close()

    """
    notify about an exception by email
    """
    def send_email(self, msg):
        import smtplib
        from email.mime.text import MIMEText
        msg = MIMEText(msg, 'plain', 'utf-8')
        msg['From'] = self.config['email']['from']['user']
#......... (remainder of this code omitted) .........
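The omitted portion is not reproduced here. Purely for illustration (this is a hypothetical sketch, not the author's missing code), the operations yielded by proc_binlog() could be consumed by a run loop that feeds them to ElasticSearch.bulk in groups of self.bulk_size and saves the binlog position after each flush:

# Hypothetical run loop for MySync; not part of the original example.
from pyelasticsearch import bulk_chunks

def run(self):
    ops = self.proc_binlog()
    for chunk in bulk_chunks(ops, docs_per_chunk=self.bulk_size):
        self.es.bulk(chunk)   # each op already carries its index, doc_type and id
        self.mark_binlog()    # persist the binlog position after every flush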
Example 13: __init__
# Required import: from pyelasticsearch import ElasticSearch [as alias]
# Or: from pyelasticsearch.ElasticSearch import bulk [as alias]
def __init__(self, start, **kwargs):
    """
    Invoke a Downloader object to get data from
    the Record. It will check to see if the necessary
    files are already downloaded and use those instead of
    querying FDSys. Downloaders are the endpoint for raw data.

    Required arguments:

    start : In form 'YYYY-MM-DD.' This is the day/start day you want.

    Optional arguments:

    parse : Defaults to True. This tells the downloader whether you just want
        the raw files, or if you also want it to extract data from the HTML.
        (Default means yes, give me the data.)
    end : Same form as start. This is the end date.
    outpath : Output path RELATIVE TO the present working directory. Defaults
        to 'output' and works fine when you run it from the repo's root
        directory.
    do_mode : Specify what kind of data you want from the parser.
        If do_mode is not set, the downloader will do absolutely zilch.
        do_mode can take the following values:
            json : write json files in a /json directory for that
                day of the Record.
            es : Specify the URL and index of an ElasticSearch cluster with
                arguments es_url and index, and it will pass each file to
                that cluster for indexing. WARNING: This doesn't handle any
                mappings, and it doesn't check to see if records are already
                there, so it will overwrite old files in the same index
                WITHOUT versioning.
                also specify:
                es_url : ElasticSearch cluster url
                index : ElasticSearch cluster index
            yield : For each day of the Record the user specifies,
                the downloader acts like a generator, yielding that day's
                "crfile" dictionary.
    """
    self.status = 'idle'
    logging.debug('Downloader object ready with params:')
    logging.debug(','.join(['='.join([key, value]) for key, value in kwargs.items()]))
    if 'outpath' in kwargs.keys():
        outpath = kwargs['outpath']
    else:
        outpath = 'output'
    if kwargs['do_mode'] == 'es':
        es = ElasticSearch(kwargs['es_url'])
        for chunk in bulk_chunks((es.index_op(crfile.crdoc, id=crfile.crdoc.pop('id')) for crfile
                                  in self.bulkdownload(start, **kwargs)),
                                 docs_per_chunk=100):
            es.bulk(chunk, index=kwargs['index'], doc_type='crdoc')
    elif kwargs['do_mode'] == 'json':
        # outpath called so often to make it easy to follow
        # the idea that we're traversing a directory tree
        for crfile in self.bulkdownload(start, **kwargs):
            filename = os.path.split(crfile.filepath)[-1].split('.')[0] + '.json'
            outpath = os.path.split(crfile.filepath)[0]
            outpath = os.path.split(outpath)[0]
            if 'json' not in os.listdir(outpath):
                os.mkdir(os.path.join(outpath, 'json'))
            outpath = os.path.join(outpath, 'json', filename)
            with open(outpath, 'w') as out_json:
                json.dump(crfile.crdoc, out_json)
    elif kwargs['do_mode'] == 'yield':
        self.yielded = self.bulkdownload(start, parse=True, **kwargs)
    elif kwargs['do_mode'] == 'noparse':
        self.bulkdownload(start, parse=False, **kwargs)
    else:
        return None
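Going by the docstring above, constructing a Downloader in 'es' mode might look like the lines below; the date, cluster URL, and index name are placeholders rather than values taken from the example:

# Hypothetical invocation of the 'es' do_mode described in the docstring above.
dl = Downloader('2016-01-04', do_mode='es',
                es_url='http://localhost:9200/', index='congressional-record')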