

Python ElasticSearch.bulk Method Code Examples

This article collects typical usage examples of the Python method pyelasticsearch.ElasticSearch.bulk. If you are wondering what ElasticSearch.bulk does, how to call it, or what real code that uses it looks like, the curated examples below should help. You can also explore the other methods of pyelasticsearch.ElasticSearch.


The following 13 code examples of ElasticSearch.bulk are shown below, sorted by popularity.
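
Before the examples, here is a minimal sketch of the pattern they all share: build operation dicts with index_op (or update_op/delete_op) and hand batches of them to ElasticSearch.bulk, optionally chunked with pyelasticsearch.bulk_chunks. The cluster URL, index name, and documents below are placeholders, not from any of the examples.

from pyelasticsearch import ElasticSearch, bulk_chunks

es = ElasticSearch('http://localhost:9200/')  # placeholder cluster URL
docs = [{'title': 'a'}, {'title': 'b'}]       # placeholder documents

# Each index_op yields one bulk-API action; bulk() sends a whole batch in one request.
ops = (es.index_op(doc) for doc in docs)
for chunk in bulk_chunks(ops, docs_per_chunk=500):
    es.bulk(chunk, index='my-index', doc_type='doc')
es.refresh('my-index')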

Example 1: add_document

# Required import: from pyelasticsearch import ElasticSearch [as alias]
# Or: from pyelasticsearch.ElasticSearch import bulk [as alias]
import os

from pyelasticsearch import ElasticSearch


def add_document(entries):
    es_server = os.environ.get('ELASTICSEARCH_SERVER', 'http://localhost:9200/')
    es = ElasticSearch(es_server)

    # One index_op per document; bulk() indexes them all in a single request.
    es.bulk([es.index_op(doc) for doc in entries],
            index=os.environ.get('ELASTICSEARCH_INDEX', 'memex'),
            doc_type=os.environ.get('ELASTICSEARCH_DOC_TYPE', 'page'))
Developer: ViDA-NYU, Project: memex, Lines: 11, Source: add_documents.py
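
A hypothetical call, assuming the environment variables are unset so the defaults (index 'memex', doc type 'page') apply; the documents are placeholders:

# Hypothetical usage: index two crawled pages with the default settings.
add_document([
    {'url': 'http://example.com/a', 'title': 'Page A'},
    {'url': 'http://example.com/b', 'title': 'Page B'},
])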

Example 2: Indexer

# Required import: from pyelasticsearch import ElasticSearch [as alias]
# Or: from pyelasticsearch.ElasticSearch import bulk [as alias]
import json

from pyelasticsearch import ElasticSearch, bulk_chunks


class Indexer(object):

  def __init__(self, input):
    self.input = input
    self.es = ElasticSearch()
    self.index_name = "psim"
    self.doc_type = 'book'
    
  def delete_index(self):
    # Delete the index if one already exists
    try:
      self.es.delete_index(index=self.index_name)
    except Exception:
      pass

  def create_index(self):
    self.es.create_index(index=self.index_name, settings=self.get_index_settings())
    
  def get_index_settings(self):
    settings = {
      "mappings": {
        "book": {
          "_all": {"enabled": "false"},
          "properties": {
            "codes": {"type": "string",
                      "term_vector": "yes",
                      "store": "true"},
            "pid": {"type": "string"},
            "embedding": {"type": "float",
                          "store": "true"},
            "magnitude": {"type": "float", "store": "true"}
          }
        }
      }
    }
    return settings
  
  def documents(self):
    with open(self.input) as input_file:
      for line in input_file:
        json_doc = json.loads(line)
        yield self.es.index_op(json_doc, doc_type=self.doc_type)
    
  def index(self):
    self.delete_index()
    self.create_index()
    # Stream documents in chunks of 1000 ops so no single bulk request grows too large.
    for chunk in bulk_chunks(self.documents(), docs_per_chunk=1000):
      self.es.bulk(chunk, index=self.index_name, doc_type=self.doc_type)
    self.es.refresh(self.index_name)
Developer: dapurv5, Project: es-scripts, Lines: 51, Source: bulk_index_docs.py
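
A hypothetical invocation, assuming a newline-delimited JSON file of book documents (the filename is a placeholder):

# Hypothetical usage: each line of books.jsonl is one JSON book document.
indexer = Indexer('books.jsonl')
indexer.index()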

Example 3: update_document

# Required import: from pyelasticsearch import ElasticSearch [as alias]
# Or: from pyelasticsearch.ElasticSearch import bulk [as alias]
import os

from pyelasticsearch import ElasticSearch


def update_document(entries):
    es_server = os.environ.get('ELASTICSEARCH_SERVER', 'http://localhost:9200/')
    es = ElasticSearch(es_server)

    # Single-document alternative kept from the original:
    # es.update(index=..., doc_type=..., id=url, script=doc, upsert=True)
    #
    # Bulk version: one update_op per entry, keyed on the document URL;
    # upsert=True inserts the document if it does not exist yet.
    es.bulk([es.update_op(doc, id=doc['url'], upsert=True) for doc in entries],
            index=os.environ.get('ELASTICSEARCH_INDEX', 'memex'),
            doc_type=os.environ.get('ELASTICSEARCH_DOC_TYPE', 'page'))
Developer: ViDA-NYU, Project: memex, Lines: 17, Source: add_documents.py

Example 4: commit

# Required import: from pyelasticsearch import ElasticSearch [as alias]
# Or: from pyelasticsearch.ElasticSearch import bulk [as alias]
    def commit(self):
        # Method of a buffering session class: self.buffer holds pending documents;
        # ELASTICSEARCH_URI, ELASTICSEARCH_INDEX and ELASTICSEARCH_DOC are module constants.
        if len(self.buffer) > 0:
            logging.debug("Inserting {} to elasticsearch".format(len(self.buffer)))

            es = ElasticSearch(ELASTICSEARCH_URI)

            docs = []
            for doc in self.buffer:
                # Route each document to a daily index: <ELASTICSEARCH_INDEX>-YYYY.MM.DD
                t = time.gmtime(int(doc['@timestamp'] / 1000))
                index = ELASTICSEARCH_INDEX + "-" + time.strftime("%Y.%m.%d", t)
                docs.append(es.index_op(doc, index=index, doc_type=ELASTICSEARCH_DOC))
            if len(docs) > 0:
                try:
                    es.bulk(docs)
                    logging.debug("inserted %d records" % (len(docs)))
                    self.buffer = []
                except Exception as e:
                    logging.error("Insert Exception " + str(e))
Developer: visgence, Project: teleceptor, Lines: 20, Source: ESSession.py

Example 5: update_index

# Required import: from pyelasticsearch import ElasticSearch [as alias]
# Or: from pyelasticsearch.ElasticSearch import bulk [as alias]
def update_index(sender, created, **kwargs):
    """
    A post-save signal handler that indexes
    newly created coffeehouses
    """
    es = ElasticSearch()
    if created:
        m = sender.objects.last()
        es.bulk([
            es.index_op({
                "pk": m.pk,
                "name": m.name,
                "rating": m.rating,
                "location": {
                    "lon": m.position.longitude,
                    "lat": m.position.latitude
                }
            }),
            ],
            doc_type="place",
            index="toerh_coffee")
Developer: drager, Project: toerh, Lines: 23, Source: signals.py
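
For context, a handler like this is usually connected to Django's post_save signal. A hypothetical registration, assuming a CoffeeHouse model (the model name is an assumption, not from the source):

# Hypothetical wiring, assuming a Django model named CoffeeHouse.
from django.db.models.signals import post_save

post_save.connect(update_index, sender=CoffeeHouse)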

Example 6: ElasticSearch

# Required import: from pyelasticsearch import ElasticSearch [as alias]
# Or: from pyelasticsearch.ElasticSearch import bulk [as alias]
                return data  # tail of the Spider class; the rest is truncated in the original listing

es = ElasticSearch('http://localhost:9200/')
es.delete_index('pet')
spider = Spider()
breeds = spider.getPetBreeds()
p = Pinyin()
for breed in breeds:
    flg = 1
    page = 1
    pet_list = []
    while flg:
        pets = spider.getPets(breed, (page - 1) * spider.limit)
        if not pets:
            flg = 0
        else:
            page = page + 1
            for pet in pets:
                pet_obj = {}
                pet_obj['name'] = pet['name']
                pet_obj['img'] = pet['img']
                pet_obj['type'] = breed['ename']
                pet_list.append(pet_obj)
                #print pet['name'] + '\t' + p.get_pinyin(pet['name'], '')
    print breed['ename'] + '\n'
    if not pet_list:
        continue
    # Use the pinyin form of the breed name (minus the '宠物' prefix) as the doc type.
    doc_type = p.get_pinyin(breed['ename'].replace('宠物', ''), '')
    es.bulk((es.index_op(pet_obj) for pet_obj in pet_list), doc_type=doc_type, index='pet')
es.refresh('pet')
Developer: wangqiuyi, Project: 0517, Lines: 32, Source: spider.py

Example 7: getFeeds

# Required import: from pyelasticsearch import ElasticSearch [as alias]
# Or: from pyelasticsearch.ElasticSearch import bulk [as alias]
import base64
import datetime
import hashlib
import time

import feedparser
import pytz
from pyelasticsearch import ElasticSearch


def getFeeds():
    print "getting feeds"
    es = ElasticSearch('http://fisensee.ddns.net:9200/')

    # Delete feeds older than one week.
    query = {"query": {"range": {"date": {"lte": "now-1w/w"}}}}
    oldFeeds = es.search(query, size=300, index='feeds')

    if len(oldFeeds['hits']['hits']) != 0:  # the original 'is not 0' tested identity, not equality
        es.bulk(es.delete_op(id=feed['_id'], index='feeds',
                             doc_type='feed') for feed in oldFeeds['hits']['hits'])


    feedSources = FeedSource.objects.all()  # FeedSource: a project-specific Django model
    feeds = []
    defaultText = 'undefined'
    defaultDate = datetime.datetime.now().isoformat()
    utc = pytz.utc
    berlin = pytz.timezone('Europe/Berlin')
    now = datetime.datetime.today()
    dateThreshold = now - datetime.timedelta(weeks=2)

    allUrls = []
    for feedSource in feedSources:
        allUrls.append(feedSource.sourceUrl)

    urls = set(allUrls)
    for url in urls:
        source = feedparser.parse(url)
        for entry in source['items']:
            feed = {
                'title':defaultText,
                'description':defaultText,
                'link':defaultText,
                'date':defaultDate,
                'url': defaultText
            }
            if 'title' in entry:
                feed['title'] = entry['title']
            if 'description' in entry:
                feed['description'] = entry['description']
            if 'link' in entry:
                feed['link'] = entry['link']
            if 'published_parsed' in entry:
                date = datetime.datetime.fromtimestamp(time.mktime(entry['published_parsed']))
                if date < dateThreshold:
                    break
                utcDate = utc.localize(date)
                feed['date'] = utcDate.astimezone(berlin).isoformat()
            # id creation should be enough for now, but it's made to fail
            if 'title' in entry or 'published_parsed' in entry:  # the original tested the always-truthy string 'title'
                feed['id'] = base64.urlsafe_b64encode(hashlib.sha256((feed['title'] + feed['date']).encode('utf8')).hexdigest())
            else:
                feed['id'] = base64.urlsafe_b64encode(hashlib.sha256((feed['title']).encode('utf8')).hexdigest())
            feed['url'] = url
            feeds.append(feed)



    es.bulk((es.index_op(feed, **{'id': feed.pop('id')}) for feed in feeds),
            index='feeds',
            doc_type='feed')
    print es.refresh('feeds')
Developer: fiesensee, Project: CompREST, Lines: 64, Source: timer.py

Example 8: prepareDataFromDB

# Required import: from pyelasticsearch import ElasticSearch [as alias]
# Or: from pyelasticsearch.ElasticSearch import bulk [as alias]
    }
}  # tail of the ad_mapping definition; the beginning is truncated in the original listing


es.health(wait_for_status='yellow')
es.delete_index('write-ads')
es.create_index('write-ads', settings={'mappings': ad_mapping})

dateYMD = args["date"]
prepareDataFromDB(dateYMD)

data_dir = DATA_FILES_JSON + '/' + dateYMD  # renamed from 'dir' to avoid shadowing the builtin
for filename in os.listdir(data_dir):
    if filename.endswith('.json'):
        with open(data_dir + '/' + filename) as open_file:
            json_docs = json.load(open_file)
            es.bulk((es.index_op(doc) for doc in json_docs),
                    index='write-ads',
                    doc_type='ad')

es.refresh("write-ads")

res = es.search('website:com', index='write-ads')
print("Got %d Hits for .com websites" % res['hits']['total'])
for hit in res['hits']['hits']:
    print (hit["_source"])
res = es.search('website:in', index='write-ads')
print("Got %d Hits for .in websites" % res['hits']['total'])
res = es.search('category:entertainment', index='write-ads')
print("Got %d Hits for category:Entertainment" % res['hits']['total'])
Developer: cervere, Project: scrapemall, Lines: 32, Source: indexer.py

Example 9: bulk_chunks

# Required import: from pyelasticsearch import ElasticSearch [as alias]
# Or: from pyelasticsearch.ElasticSearch import bulk [as alias]
                    "coordinates" : coords,  # 4, 5
                    "feature_class" : row[6],
                    "feature_code" : row[7],
                    "country_code2" : row[8],
                    "country_code3" : country_code3,
                    "cc2" : row[9],
                    "admin1_code" : row[10],
                    "admin2_code" : row[11],
                    "admin3_code" : row[12],
                    "admin4_code" : row[13],
                    "population" : row[14],
                    "elevation" : row[15],
                    "dem" : row[16],
                    "timzeone" :  row[17],
                    "modification_date" : "2014-01-01"
                   }
            yield es.index_op(doc, index='geonames', doc_type='geoname')
        except Exception:
            count += 1

    print 'Exception count:', count


chunk_count = 0
for chunk in bulk_chunks(documents(reader, es), docs_per_chunk=500):
    es.bulk(chunk)
    chunk_count += 1
    print 'Chunk count:', chunk_count

es.refresh('geonames')
Developer: ahalterman, Project: mordecai, Lines: 32, Source: geonames_elasticsearch.py

Example 10: enumerate

# Required import: from pyelasticsearch import ElasticSearch [as alias]
# Or: from pyelasticsearch.ElasticSearch import bulk [as alias]
    }
}  # tail of index_settings; the beginning is truncated in the original listing
es.create_index(ELASTICSEARCH_INDEX, settings=index_settings)

for filename in FILES:
    print "Processing %s" % filename

    sf = shapefile.Reader(filename)

    shapes = sf.shapes()
    for i, shape in enumerate(shapes, start=1):
        points = [(p[0], p[1]) for p in shape.points]

        # GeoJSON polygons must be closed rings, so repeat the first point at the
        # end. (The original appended after building the dict, which still worked
        # because the dict holds a reference to the same list.)
        if points[-1] != points[0]:
            points.append(points[0])

        data = {
            'filename': filename,
            'location': {
                'type': 'polygon',
                'coordinates': [points]
            }
        }

        try:
            es.bulk([es.index_op(data)],
                    doc_type=ELASTICSEARCH_DOC,
                    index=ELASTICSEARCH_INDEX)
        except Exception as e:
            print "Exception: %s" % e
Developer: rainum, Project: dronevision, Lines: 32, Source: fill.py

Example 11: SearchIndex

# Required import: from pyelasticsearch import ElasticSearch [as alias]
# Or: from pyelasticsearch.ElasticSearch import bulk [as alias]
from pyelasticsearch import ElasticSearch
from pyelasticsearch.exceptions import IndexAlreadyExistsError


class SearchIndex(object):
    def __init__(self, model):
        self.es = ElasticSearch()
        self.model = model

    def put_mapping(self, index, doc_type):
        mapping = {
            doc_type: {
                "properties": {
                    "location": {
                        "type": "geo_point"
                    },
                }
            }
        }
        self.es.put_mapping(index=index, doc_type=doc_type, mapping=mapping)

    def bulk_items(self, index, doc_type):
        # Note: this issues one bulk request per instance; collecting all the
        # index_ops into a single bulk() call would be more efficient.
        for m in self.model.objects.all():
            self.es.bulk([
                self.es.index_op({
                    "pk": m.pk,
                    "name": m.name,
                    "rating": m.rating,
                    "address": m.address,
                    "description": m.description,
                    "location": {
                        "lon": m.longitude,
                        "lat": m.latitude
                    }
                }),
                ],
                doc_type=doc_type,
                index=index)

    def search(self, index, question, longitude, latitude, size=10):
        #self.es.delete_index(index)
        try:
            self.es.create_index(index)
            self.put_mapping(index, "place")
            self.bulk_items(index, "place")
        except IndexAlreadyExistsError:
            pass

        query = {
            "query": {
                "function_score": {
                    "query": {
                        "bool": {
                            "should": [
                                {"match": {"name": question}},
                                {"match": {"_all": {
                                    "query": question,
                                    "operator": "or",
                                    "fuzziness": "auto",
                                    "zero_terms_query": "all"
                                    }}}
                                ]
                            }
                        },
                    "functions": [
                        {"exp": {"rating": {"origin": 5, "scale": 1, "offset": 0.1}}},
                    ]
                    }
                }
            }

        if longitude is not None and latitude is not None:  # the original only double-checked longitude
            query['query']['function_score']['functions'] = [
                {'gauss': {
                    "location": {"origin": {"lat": latitude, "lon": longitude}, "offset": "550m", "scale": "1km"}
                    }},
                {'gauss': {
                    "location": {"origin": {"lat": latitude, "lon": longitude}, "offset": "500m", "scale": "2km"}
                    }},
            ]

        results = self.es.search(query, index=index, size=size)

        self.es.refresh()

        return results
Developer: drager, Project: toerh, Lines: 84, Source: indexes.py
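
A hypothetical query against this index, assuming a Django Place model with the fields used above (the model name, query text, and coordinates are assumptions):

# Hypothetical usage, assuming a Django model named Place.
index = SearchIndex(Place)
results = index.search('toerh_coffee', 'espresso bar', 18.07, 59.33)
for hit in results['hits']['hits']:
    print hit['_source']['name']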

Example 12: MySync

# Required import: from pyelasticsearch import ElasticSearch [as alias]
# Or: from pyelasticsearch.ElasticSearch import bulk [as alias]
class MySync(object):
	ts = 0	#last chunk time
	log_file = None
	log_pos  = None
	
	def __init__(self):
		with open('./etc/config.yaml') as f:
			self.config = yaml.safe_load(f)  # safe_load avoids executing arbitrary YAML tags
		self.mark_path = self.config['binlog']['mark']
		self.bulk_size = self.config['es']['bulk_size']
		self.excludes_fields = self.config['slave']['excludes_fields']
		self.es = ElasticSearch('http://{host}:{port}/'.format(
			host=self.config['es']['host'], 
			port=self.config['es']['port']
		))

		#set logger
		logging.basicConfig(
			level=logging.DEBUG,
			format='%(asctime)s %(levelname)s %(message)s',
			datefmt='%Y-%m-%d %H:%M:%S',
			filename=self.config['log']['run']
		)
		logging.getLogger('elasticsearch').setLevel(logging.INFO)
		logging.getLogger('elasticsearch.trace').setLevel(logging.INFO)
		logging.getLogger('elasticsearch.trace').addHandler(logging.StreamHandler())

		#resume stream
		if os.path.isfile(self.mark_path):		
			with open(self.mark_path, 'r') as y:
				mark = yaml.safe_load(y)
				self.log_file = mark.get('log_file')
				self.log_pos  = mark.get('log_pos')
				logging.info('resume stream >> file:%s, pos:%s' % (self.log_file, self.log_pos))
	
	"""
	mark binlog position
	"""
	def mark_binlog(self):
		if self.log_file and self.log_pos:
			with open(self.mark_path, 'w') as y:
				logging.info('mark binlog >> file:%s, pos:%s' % (self.log_file, self.log_pos))
				yaml.safe_dump({'log_file':self.log_file, 'log_pos':self.log_pos}, y, default_flow_style=False)
	

	"""
	format fields
	"""
	def _format(self, dat):
		for k,v in dat.items():
			if isinstance(v, datetime):
				dat[k] = v.strftime('%Y-%m-%d %H:%M:%S')
			elif isinstance(v, date):
				dat[k] = v.strftime('%Y-%m-%d')
			if k in self.excludes_fields:
				del dat[k]
		return dat
	

	"""
	mysql binlog event handle
	"""
	def proc_binlog(self):
		stream = BinLogStreamReader(
			connection_settings = self.config['mysql'],
			server_id = self.config['slave']['server_id'],
			log_file = self.log_file,
			log_pos = self.log_pos,
			only_schemas = self.config['slave']['schemas'],
			blocking = True,
			resume_stream = bool(self.log_file and self.log_pos),
			only_events=[WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent]
		)
		for binlogevent in stream:
			#binlogevent.dump()
			self.log_file = stream.log_file
			self.log_pos  = stream.log_pos
			for row in binlogevent.rows:
				pk = binlogevent.primary_key
				table = binlogevent.table
				schema = binlogevent.schema
				if isinstance(binlogevent, WriteRowsEvent):
					yield self.es.index_op(self._format(row['values']), doc_type=table, index=schema, id=row['values'][pk])
				elif isinstance(binlogevent, UpdateRowsEvent):
					yield self.es.update_op(self._format(row['after_values']), doc_type=table, index=schema, id=row['after_values'][pk])
				elif isinstance(binlogevent, DeleteRowsEvent):
					yield self.es.delete_op(doc_type=table, index=schema, id=row['values'][pk])
				else:
					continue

		stream.close()
	

	"""
	notify exception
	"""
	def send_email(self, msg):
		import smtplib
		from email.mime.text import MIMEText
		msg = MIMEText(msg, 'plain', 'utf-8')
		msg['From'] = self.config['email']['from']['user']
#......... remainder of the code omitted .........
Developer: xhook7, Project: py-mysql-es, Lines: 103, Source: sync.py
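
The listing cuts off before the driver loop; a plausible sketch of the missing piece, assuming proc_binlog() yields ops as above (the run method name is an assumption, not from the source):

	# Hypothetical driver loop: batch the yielded ops and flush them to Elasticsearch.
	# (Requires: from pyelasticsearch import bulk_chunks)
	def run(self):
		for chunk in bulk_chunks(self.proc_binlog(), docs_per_chunk=self.bulk_size):
			self.es.bulk(chunk)
			self.mark_binlog()  # persist the binlog position only after a successful flush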

Example 13: __init__

# Required import: from pyelasticsearch import ElasticSearch [as alias]
# Or: from pyelasticsearch.ElasticSearch import bulk [as alias]
    def __init__(self,start,**kwargs):
        """
        Invoke a Downloader object to get data from
        the Record. It will check to see if the necessary
        files are already downloaded and use those instead of
        querying FDSys. Downloaders are the endpoint for raw data.

        Required arguments:

        start : In the form 'YYYY-MM-DD'. This is the day, or the start day of a range, that you want.

        Optional arguments:

        parse : Defaults to True. This tells the downloader whether you just want
                the raw files, or if you also want it to extract data from the HTML.
                (Default means yes, give me the data.)


        end : Same form as start. This is the end date.

        outpath : Output path RELATIVE TO the present working directory. Defaults
                  to 'output' and works fine when you run it from the repo's root
                  directory.

        do_mode : Specify what kind of data you want from the parser.
                  If do_mode is not set, the downloader will do absolutely zilch.
                  do_mode can take the following values:

                  json : write json files in a /json directory for that
                         day of the Record.

                  es : Specify the URL and index of an ElasticSearch cluster with
                       arguments es_url and index, and it will pass each file to
                       that cluster for indexing. WARNING: This doesn't handle any
                       mappings, and it doesn't check to see if records are already
                       there, so it will overwrite old files in the same index
                       WITHOUT versioning.

                       also specify:
                       es_url : ElasticSearch cluster url
                       index  : ElasticSearch cluster index

                  yield : For each day of the Record the user specifies,
                          the downloader acts like a generator, yielding that day's
                          "crfile" dictionary.
        """

        self.status = 'idle'
        logging.debug('Downloader object ready with params:')
        logging.debug(','.join('%s=%s' % (key, value) for key, value in kwargs.items()))  # %s handles non-string values
        outpath = kwargs.get('outpath', 'output')
        if kwargs.get('do_mode') == 'es':  # .get(): a missing do_mode falls through to the final else
            es = ElasticSearch(kwargs['es_url'])
            for chunk in bulk_chunks((es.index_op(crfile.crdoc,id=crfile.crdoc.pop('id')) for crfile
                                        in self.bulkdownload(start,**kwargs)),
                                        docs_per_chunk=100):
                es.bulk(chunk,index=kwargs['index'],doc_type='crdoc')
        elif kwargs.get('do_mode') == 'json':
            # outpath called so often to make it easy to follow
            # the idea that we're traversing a directory tree
            for crfile in self.bulkdownload(start,**kwargs):
                filename = os.path.split(crfile.filepath)[-1].split('.')[0] + '.json'
                outpath = os.path.split(crfile.filepath)[0]
                outpath = os.path.split(outpath)[0]
                if 'json' not in os.listdir(outpath):
                    os.mkdir(os.path.join(outpath,'json'))
                outpath = os.path.join(outpath,'json',filename)
                with open(outpath,'w') as out_json:
                    json.dump(crfile.crdoc,out_json)
        elif kwargs.get('do_mode') == 'yield':
            self.yielded = self.bulkdownload(start,parse=True,**kwargs)
        elif kwargs.get('do_mode') == 'noparse':
            self.bulkdownload(start,parse=False,**kwargs)

        else:
            return None
Developer: nclarkjudd, Project: congressionalrecord2, Lines: 81, Source: downloader.py
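
A hypothetical invocation of the 'es' mode described in the docstring (the dates, cluster URL, and index name are placeholders):

# Hypothetical usage: download a range of days and index them into a local cluster.
dl = Downloader('2016-01-04',
                end='2016-01-08',
                do_mode='es',
                es_url='http://localhost:9200/',
                index='congressionalrecord')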


Note: The pyelasticsearch.ElasticSearch.bulk method examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by various developers; copyright in the source code remains with the original authors. For redistribution and use, refer to each project's License; do not republish without permission.