This article collects typical usage examples of the Python method kafka.consumer.SimpleConsumer.seek. If you are wondering what SimpleConsumer.seek does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples of the class it belongs to, kafka.consumer.SimpleConsumer.
The following presents 15 code examples of the SimpleConsumer.seek method, sorted by popularity by default.
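Before the examples, a minimal sketch of the call itself may help; it assumes the pre-1.0 kafka-python API that all of the samples below use, and the broker address, group, and topic names are placeholders. seek(offset, whence) moves the consumer position: whence=0 is relative to the earliest available offset, whence=1 to the current offset, and whence=2 to the latest known offset (the tail of the topic).

from kafka import KafkaClient
from kafka.consumer import SimpleConsumer

# Placeholder broker, group, and topic names (adjust for your cluster).
client = KafkaClient("localhost:9092")
consumer = SimpleConsumer(client, "demo-group", "demo-topic")

consumer.seek(0, 0)    # rewind to the earliest available offset (replay everything)
consumer.seek(0, 2)    # jump to the tail of the topic (only consume new messages)
consumer.seek(-10, 2)  # position the consumer 10 messages before the tail

for message in consumer:
    print message.message.value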
Example 1: main
# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
def main():
    client = KafkaClient("localhost:9092")
    consumer = SimpleConsumer(client, "test-group", "twitter_raw")
    consumer.seek(0, 2)
    num = 0
    for message in consumer:
        print "redis publish:", num
        num += 1
        try:
            data_depickled = pickle.loads(message.message.value.decode('utf-8'))
        except Exception, e:
            continue
        # print data_depickled
        # {
        #     'text': '@_LulaMoore me hamas perra',
        #     'created_at': datetime.datetime(2015, 10, 9, 23, 36, 49),
        #     'source': u'Twitter Web Client',
        #     'lang': u'es',
        #     'place': {
        #         'country_code': u'AR',
        #         'coordinates': [
        #             [-68.176283, -38.984724],
        #             [-68.176283, -38.921051],
        #             [-68.015162, -38.921051],
        #             [-68.015162, -38.984724]
        #         ]
        #     },
        #     'user': {
        #         'statuses_count': 15067,
        #         'name': u'Dama negra *\uffe6*',
        #         'friends_count': 390,
        #         'created_at': datetime.datetime(2014, 3, 15, 2, 37, 10),
        #         'profile_image_url': u'http://pbs.twimg.com/profile_images/652333268256313344/x9K9Nlys_normal.jpg',
        #         'followers_count': 384,
        #         'id': 2390242428
        #     },
        #     'id': 652628813935980544
        # }
        ### process data here ###
        # text = data_depickled['text']
        filtered_data = data_filter(data_depickled)
        data_pickled = pickle.dumps(filtered_data)
        redis.publish('tweets_processed', data_pickled)
Example 2: run
# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
def run(self):
    client = KafkaClient("10.206.216.13:19092,10.206.212.14:19092,10.206.209.25:19092")
    consumer = SimpleConsumer(client, "test-group", "jiketest", auto_commit=False, partitions=self.part)
    consumer.seek(0, 0)
    while True:
        message = consumer.get_message(True, 60)
        self.__offset = message.offset
        print message.message.value
Example 3: main
# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
def main():
    client = KafkaClient("localhost:9092")
    consumer = SimpleConsumer(client, "test-group", "twitter_raw")
    consumer.seek(0, 2)
    for message in consumer:
        # data_deserialized = str.decode(message.message.value)
        data_depickled = pickle.loads(message.message.value.decode('utf-8'))
        # print str(data_depickled).decode('string_escape')
        print data_depickled
Example 4: blocking_consumer
# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
def blocking_consumer(self, message_consume_function, parse_json, topic_group, topic_name):
    print "starting blocking consumer with topic group %s and topic name %s" % (topic_group, topic_name)
    consumer = SimpleConsumer(self.client, topic_group, topic_name)
    consumer.seek(0, 2)
    for message in consumer:
        message = parse_json(message)
        print "=============" + str(message) + "============"
        message_consume_function(message)
        print "called message consume function"
Example 5: Consumer
# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
class Consumer(object):

    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000, auto_offset_reset='smallest')
        self.temp_file_path = None
        self.temp_file = None
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self):
        timestamp = time.strftime('%Y%m%d%H%M%S')
        # open file for writing
        self.temp_file_path = "/home/ubuntu/datamill/kafka_%s_%s_%s.dat" % (self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
        header = 'experiment_id,job_id,results_file,package_id,package_name,worker_id,config_id,replicate_no,setup_time,run_time,collect_time,hw_cpu_arch,hw_cpu_mhz,hw_gpu_mhz,hw_num_cpus,hw_page_sz,hw_ram_mhz,hw_ram_sz,sw_address_randomization,sw_autogroup,sw_compiler,sw_drop_caches,sw_env_padding,sw_filesystem,sw_freq_scaling,sw_link_order,sw_opt_flag,sw_swap,sw_sys_time'
        self.temp_file.write(header)
        while True:
            try:
                messages = self.consumer.get_messages(count=100, block=False)
                for message in messages:
                    self.temp_file.write(message.message.value + "\n")
                if self.temp_file.tell() > 20000:
                    self.save_to_hdfs()
                self.consumer.commit()
            except:
                self.consumer.seek(0, 2)
                self.consumer.commit()

    def save_to_hdfs(self):
        self.temp_file.close()
        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_path = "/datamill/%s_%s_%s.csv" % (self.group, self.topic, timestamp)
        print "Block " + str(self.block_cnt) + ": Saving file to HDFS " + hadoop_path
        self.block_cnt += 1
        # place blocked messages into history and cached folders on hdfs
        os.system("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_path))
        os.remove(self.temp_file_path)
        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = "/home/ubuntu/datamill/kafka_%s_%s_%s.dat" % (self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
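A hypothetical driver for the class above might look like the sketch below; the broker address, consumer group, and topic are placeholder values, and the hard-coded /home/ubuntu/datamill and HDFS paths inside the class must already exist.

# Hypothetical usage of the Consumer class above (all names are placeholders).
if __name__ == '__main__':
    consumer = Consumer("localhost:9092", "datamill-group", "datamill-results")
    consumer.consume_topic()  # loops forever, shipping ~20KB blocks to HDFS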
Example 6: Consumer
# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
class Consumer(object):

    def __init__(self, addr, group, topic):
        """Initialize Consumer with kafka broker IP, group, and topic."""
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic,
                                       max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.hadoop_path = "/insight/artsy/geo"
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self, output_dir):
        """Consumes a stream of messages from the "post_geo_activity" topic.

        Code template from https://github.com/ajmssc/bitcoin-inspector.git
        """
        timestamp = time.strftime('%Y%m%d%H%M%S')
        # open file for writing
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
        while True:
            try:
                # get 1000 messages at a time, non blocking
                messages = self.consumer.get_messages(count=1000, block=False)
                for message in messages:
                    self.temp_file.write(message.message.value + "\n")
                # file size > 20MB
                if self.temp_file.tell() > 20000000:
                    self.flush_to_hdfs(output_dir)
                self.consumer.commit()
            except:
                # move to tail of kafka topic if consumer is referencing
                # unknown offset
                self.consumer.seek(0, 2)

    def flush_to_hdfs(self, output_dir):
        """Flushes the 20MB file into HDFS."""
        self.temp_file.close()
        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group, self.topic, timestamp)
        print "Block {}: Flushing data file to HDFS => {}".format(str(self.block_cnt), hadoop_fullpath)
        self.block_cnt += 1
        os.system("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath))  # save from local to hdfs
        os.remove(self.temp_file_path)  # remove temp local file
        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
Example 7: KafkaDatawakeLookaheadSpout
# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
class KafkaDatawakeLookaheadSpout(Spout):
    group = 'datawake-crawler-out-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['crawler-out-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('KafkaDatawakeLookaheadSpout initialized with topic =' + self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.consumer = SimpleConsumer(self.kafka, self.group, self.topic, max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeLookaheadSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """
        input message:
            dict(
                id = input['id'],
                appid = input['appid'],
                url = url,
                status_code = response.getcode(),
                status_msg = 'Success',
                timestamp = response.info()['date'],
                links_found = links,
                raw_html = html,
                attrs = input['attrs']
            )
        :return: (url, status, headers, flags, body, timestamp, source, context)
        """
        offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
        message = offsetAndMessage.message.value
        crawled = json.loads(message)
        safeurl = crawled['url'].encode('utf-8', 'ignore')
        self.log("Lookahead spout received id: " + crawled['id'] + " url: " + safeurl)
        context = {
            'source': 'datawake-lookahead',
            'userId': crawled['attrs']['userId'],
            'org': crawled['attrs']['org'],
            'domain': crawled['attrs']['domain'],
            'url': crawled['url']
        }
        self.emit([crawled['url'], crawled['status_code'], '', '', crawled['raw_html'], crawled['timestamp'], context['source'], context])
Example 8: __init__
# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
class KafkaConsumer:
    group = "python-lookahead-consumer"

    def __init__(self, conn_pool, topic, group):
        self.conn_pool = conn_pool
        self.topic = topic
        self.group = group
        self.kafka = KafkaClient(self.conn_pool)
        self.consumer = SimpleConsumer(self.kafka, self.group, self.topic, max_buffer_size=None)
        self.consumer.seek(0, 2)  # move to the tail of the queue

    def next(self):
        offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
        message = offsetAndMessage.message.value
        return message
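A short, hypothetical usage of this wrapper (the connection string and topic name are placeholders):

# Hypothetical usage of the KafkaConsumer wrapper above.
consumer = KafkaConsumer("kafka01:9092,kafka02:9092", "crawled_firehose", "python-lookahead-consumer")
while True:
    print consumer.next()  # blocks until a new message arrives at the tail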
Example 9: KafkaDatawakeVisitedSpout
# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
class KafkaDatawakeVisitedSpout(Spout):
    group = 'datawake-visited-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['visited-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('KafkaDatawakeVisitedSpout initialized with topic =' + self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka, self.group, self.topic, max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeVisitedSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """
        input: (timestamp,org,domain,user_id,url,html)
        :return: (url, status, headers, flags, body, timestamp, source, context)
        """
        try:
            for message in self.consumer:
                self.log("msg")
                self.log(message)
                # offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
                # split the raw message payload on the NUL delimiter
                message = message.message.value.split('\0')
                (timestamp, org, domain, userId, url, html) = message
                context = {
                    'source': 'datawake-visited',
                    'domain': domain
                }
                self.emit([url, '', '', '', html, timestamp, context['source'], context])
        except:
            self.log(traceback.format_exc(), level='error')

    def fail(self, tup_id):
        pass
Example 10: CrawlerSpout
# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
class CrawlerSpout(Spout):
    group = 'datawake-crawler-in-consumer'.encode()

    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['crawler-in-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('CrawlerSpout initialized with topic =' + self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka, self.group, self.topic, max_buffer_size=None, fetch_size_bytes=2000000)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("CrawlerSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """
        input message:
            json.dumps(dict(
                id = 'abcdefg',  # TODO generate UUID,
                appid = self.appid,
                url = url,
                priority = 50,
                depth = 0,
                attrs = dict(
                    userId = context['userId'],
                    org = context['org'],
                    domain = context['domain']
                )
            ))
        :return:
        """
        try:
            for message in self.consumer:
                # parse the JSON payload of each fetched message
                to_crawl = json.loads(message.message.value)
                self.emit([to_crawl])
        except:
            self.log(traceback.format_exc(), level='error')
Example 11: __init__
# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
    #......... part of the code omitted here .........
            for property, subschema in properties.iteritems():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class, {"properties": set_defaults},
        )

    def handle_crawl_request(self, dict):
        '''
        Processes a valid crawl request
        @param dict: a valid dictionary object
        '''
        # format key
        key = "{sid}:queue".format(sid=dict['spiderid'])
        val = pickle.dumps(dict, protocol=-1)
        # shortcut to shove stuff into the priority queue
        self.redis_conn.zadd(key, val, -dict['priority'])
        # if timeout crawl, add value to redis
        if 'expires' in dict:
            key = "timeout:{sid}:{appid}:{crawlid}".format(
                sid=dict['spiderid'],
                appid=dict['appid'],
                crawlid=dict['crawlid'])
            self.redis_conn.set(key, dict['expires'])

    def handle_action_request(self, dict):
        '''
        Processes a valid action request
        @param dict: The valid dictionary object
        '''
        # format key
        key = "{action}:{spiderid}:{appid}".format(
            action=dict['action'],
            spiderid=dict['spiderid'],
            appid=dict['appid'])
        if "crawlid" in dict:
            key = key + ":" + dict['crawlid']
        self.redis_conn.set(key, dict['uuid'])

    def _main_loop(self):
        '''
        Continuous loop that reads from a kafka topic and tries to validate
        incoming messages
        '''
        while True:
            start = time.time()
            try:
                for message in self.consumer.get_messages():
                    if message is None:
                        break
                    try:
                        the_dict = json.loads(message.message.value)
                        try:
                            self.validator(self.schema).validate(the_dict)
                            self.result_method(the_dict)
                        except ValidationError as ex:
                            print "invalid json received"
                    except ValueError:
                        print "bad json received"
            except OffsetOutOfRangeError:
                # consumer has no idea where they are
                self.consumer.seek(0, 2)
            end = time.time()
            time.sleep(.01)

    def run(self):
        '''
        Sets up the schema to be validated against
        '''
        self.setup()
        with open(self.settings.SCHEMA) as the_file:
            # No try/catch so we can see if there is a json parse error
            # on the schemas
            self.schema = json.load(the_file)
        self._main_loop()

    def feed(self, json_item):
        '''
        Feeds a json item into the Kafka topic
        @param json_item: The loaded json object
        '''
        topic = self.settings.KAFKA_INCOMING_TOPIC
        producer = SimpleProducer(self.kafka_conn)
        print "=> feeding JSON request into {0}...".format(topic)
        print json.dumps(json_item, indent=4)
        self.kafka_conn.ensure_topic_exists(topic)
        producer.send_messages(topic, json.dumps(json_item))
        print "=> done feeding request."
Example 12: ZKConsumer
# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
    #......... part of the code omitted here .........
                return
        self.consumer = SimpleConsumer(self.client, self.group, self.topic,
                                       partitions=my_partitions,
                                       **self.consumer_kwargs)
        self.consumer.provide_partition_info()
        self.logger.info("Consumer connected to Kafka: %s", self.consumer.offsets)

    def stop(self):
        if self.consumer is not None:
            self.logger.info('Stopping Kafka consumer')
            self.consumer.stop()
            self.consumer = None
        if self.client is not None:
            self.logger.info('Stopping Kafka client')
            self.client.close()
            self.client = None
        if self.zk is not None:
            self.logger.info('Stopping ZooKeeper client')
            if self.zkp is not None and not self.zkp.failed:
                self.zkp.finish()
            self.zk.stop()
            self.zkp = None
            self.zk = None

    def commit(self, partitions=None):
        """
        Commit offsets for this consumer

        partitions: list of partitions to commit, default is to commit
                    all of them
        """
        if self.consumer is None:
            return
        self.logger.debug('Begin committing offsets for partitions: %s',
                          partitions if partitions else 'All')
        self.consumer.commit(partitions)
        self.logger.debug('End committing offsets for partitions: %s',
                          partitions if partitions else 'All')

    def pending(self, partitions=None):
        """
        Gets the pending message count

        partitions: list of partitions to check for, default is to check all
        """
        return self.consumer.pending(partitions)

    def provide_partition_info(self):
        """
        Indicates that partition info must be returned by the consumer
        """
        self.consumer.provide_partition_info()

    def seek(self, offset, whence):
        """
        Alter the current offset in the consumer, similar to fseek

        offset: how much to modify the offset
        whence: where to modify it from
                0 is relative to the earliest available offset (head)
                1 is relative to the current offset
                2 is relative to the latest known offset (tail)
        """
        self.consumer.seek(offset, whence)

    def get_messages(self, count=1, block=True, timeout=0.1):
        """
        Fetch the specified number of messages

        count: Indicates the maximum number of messages to be fetched
        block: If True, the API will block till some messages are fetched.
        timeout: If block is True, the function will block for the specified
                 time (in seconds) until count messages is fetched. If None,
                 it will block forever.
        """
        if self.consumer is None:
            return []
        else:
            try:
                messages = self.consumer.get_messages(count, block, timeout)
                if not messages and self.zkp.failed:
                    raise FailedPayloadsError
                return messages
            except FailedPayloadsError as err:
                msg = 'Failed to retrieve payload, restarting consumer'
                self.logger.exception(msg)
                raise err

    def get_message(self, block=True, timeout=0.1, get_partition_info=None):
        return self.consumer.get_message(block, timeout, get_partition_info)

    def _get_message(self, block=True, timeout=0.1, get_partition_info=None,
                     update_offset=True):
        return self.consumer._get_message(block, timeout, get_partition_info,
                                          update_offset)

    def __iter__(self):
        for msg in self.consumer:
            yield msg
Example 13: Consumer
# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
class Consumer(object):
    """Kafka consumer class with functions to consume messages to HDFS.

    Messages are blocked into 20MB files and transferred to HDFS

    Attributes:
        client: string representing IP:port of the kafka broker
        consumer: Consumer object specifying the client group, and topic
        temp_file_path: location of the 20MB file to be appended to before
            transfer to HDFS
        temp_file: File object opened from temp_file_path
        topic: String representing the topic on Kafka
        group: String representing the Kafka consumer group to be associated
            with
        block_cnt: integer representing the block count for print statements
    """

    def __init__(self, addr, group, topic):
        """Initialize Consumer with kafka broker IP, group, and topic."""
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.hadoop_path = "/user/parking_data/history"
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self, output_dir):
        """Consumes a stream of messages from the "messages" topic.

        Code template from https://github.com/ajmssc/bitcoin-inspector.git

        Args:
            output_dir: string representing the directory to store the 20MB
                before transferring to HDFS

        Returns:
            None
        """
        timestamp = time.strftime("%Y%m%d%H%M%S")
        # open file for writing
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
        # while True:
        for ii in range(0, 2):
            try:
                # get 1000 messages at a time, non blocking
                messages = self.consumer.get_messages(count=1000, block=False)
                # OffsetAndMessage(offset=43, message=Message(magic=0,
                # attributes=0, key=None, value='some message'))
                for message in messages:
                    self.temp_file.write(message.message.value + "\n")
                # file size > 20MB
                if self.temp_file.tell() > 20000000:
                    self.flush_to_hdfs(output_dir)
                self.consumer.commit()
            except:
                # move to tail of kafka topic if consumer is referencing
                # unknown offset
                self.consumer.seek(0, 2)

    def flush_to_hdfs(self, output_dir):
        """Flushes the 20MB file into HDFS.

        Code template from https://github.com/ajmssc/bitcoin-inspector.git
        Flushes the file into HDFS folders

        Args:
            output_dir: string representing the directory to store the 20MB
                before transferring to HDFS

        Returns:
            None
        """
        self.temp_file.close()
        timestamp = time.strftime("%Y%m%d%H%M%S")
        hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group, self.topic, timestamp)
        print "Block {}: Flushing 20MB file to HDFS => {}".format(str(self.block_cnt), hadoop_fullpath)
        self.block_cnt += 1
        # place blocked messages into history and cached folders on hdfs
        print ("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath))
        os.system("sudo hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath))
        # os.system("sudo -u hdfs hdfs dfs -put %s %s" % (self.temp_file_path,
        #                                                 cached_fullpath))
        os.remove(self.temp_file_path)
        timestamp = time.strftime("%Y%m%d%H%M%S")
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
Example 14: Consumer
# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
class Consumer(object):

    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.hadoop_path = "/user/AdReport/%s/history" % (topic)
        self.cached_path = "/user/AdReport/%s/cached" % (topic)
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self, output_dir):
        timestamp = time.strftime('%Y%m%d%H%M%S')
        # open file for writing
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,
                                                         self.topic,
                                                         self.group,
                                                         timestamp)
        self.temp_file = open(self.temp_file_path, "w")
        print (self.temp_file)
        # one_entry = False
        while True:
            try:
                messages = self.consumer.get_messages(count=10, block=False)
                # OffsetAndMessage(offset=43, message=Message(magic=0,
                # attributes=0, key=None, value='some message'))
                for message in messages:
                    print (message)
                    # one_entry = True
                    # print (self.temp_file.tell())
                    self.temp_file.write(message.message.value + "\n")
                if self.temp_file.tell() > 2000000:
                    self.save_to_hdfs(output_dir)
                self.consumer.commit()
            except:
                self.consumer.seek(0, 2)
            # if one_entry:
            #     print ("sending to hdfs")
            #     self.save_to_hdfs(output_dir, self.topic)
            #     self.consumer.commit()

    def save_to_hdfs(self, output_dir):
        print ("Saving file to hdfs")
        self.temp_file.close()
        print ("Closed open file")
        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group,
                                               self.topic, timestamp)
        cached_fullpath = "%s/%s_%s_%s.dat" % (self.cached_path, self.group,
                                               self.topic, timestamp)
        # print ("Block " + str(self.block_cnt) + ": Saving file to HDFS " + hadoop_fullpath)
        self.block_cnt += 1
        # place blocked messages into history and cached folders on hdfs
        os.system("sudo -u ubuntu /usr/local/hadoop/bin/hdfs dfs -put %s %s" % (self.temp_file_path,
                                                                                hadoop_fullpath))
        os.system("sudo -u ubuntu /usr/local/hadoop/bin/hdfs dfs -put %s %s" % (self.temp_file_path,
                                                                                cached_fullpath))
        os.remove(self.temp_file_path)
        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,
                                                         self.topic,
                                                         self.group,
                                                         timestamp)
        self.temp_file = open(self.temp_file_path, "w")
Example 15: main
# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
def main():
    """kafkadump: Kafka topic dump utility for debugging.

    Usage:
        kafkadump list --host=<host>
        kafkadump dump <topic> --host=<host> [--consumer=<consumer>]

    Examples:

        List all the topics on your local Kafka instance:

            python kafkadump.py list --host=<kafkahost>:9092

        Dump the contents of a single topic starting from offset 0:

            python kafkadump.py dump test.crawled_firehose --host=<kafkahost>:9092

        Use CTRL+C (SIGINT, KeyboardInterrupt) to stop it from polling Kafka.
        It will end by printing the total records serviced and the raw output
        of the most recent record.

    Options:
        -h --host <host>            Kafka host name where Kafka cluster will be resolved
        -c --consumer <consumer>    Consumer group ID to use for reading messages
    """
    args = docopt(main.__doc__)
    host = args["--host"]

    logging.basicConfig()

    print "=> Connecting to {0}...".format(host)
    kafka = KafkaClient(host)
    print "=> Connected."

    if args["list"]:
        for topic in kafka.topic_partitions.keys():
            print topic
        return 0
    elif args["dump"]:
        topic = args["<topic>"]
        consumer_id = args["--consumer"] or "default"
        consumer = SimpleConsumer(kafka, consumer_id, topic,
                                  buffer_size=1024*100,      # 100kb
                                  fetch_size_bytes=1024*100, # 100kb
                                  max_buffer_size=None       # eliminate big message errors
                                  )
        consumer.seek(0, 0)
        num_records = 0
        total_bytes = 0
        item = None
        while True:
            try:
                message = consumer.get_message()
                if message is None:
                    time.sleep(1)
                    continue
                val = message.message.value
                item = json.loads(val)
                body_bytes = len(item)
                print item
                num_records = num_records + 1
                total_bytes = total_bytes + body_bytes
            except:
                traceback.print_exc()
                break
        total_mbs = float(total_bytes) / (1024*1024)
        print
        if item is not None:
            print json.dumps(item, indent=4)
        if num_records == 0:
            num_records = 1
        print num_records, "records", total_mbs, "megabytes", (float(total_bytes) / num_records / 1024), "kb per msg"
        kafka.close()
        return 0