This article collects typical usage examples of the Python class kafka.consumer.SimpleConsumer. If you have been wondering what SimpleConsumer is for and how it is used in practice, the annotated examples below should help.
The sections that follow show 15 code examples of the SimpleConsumer class, taken from open-source projects and ordered roughly by popularity.
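Before the project-specific examples, here is a minimal sketch of the pattern they all share: connect a KafkaClient to a broker, wrap it in a SimpleConsumer bound to a consumer group and a topic, and iterate over the messages. This assumes the legacy kafka-python API (pre-1.0 releases, which still ship SimpleConsumer); the broker address, group, and topic names below are placeholders.

from kafka import KafkaClient, SimpleConsumer

client = KafkaClient("localhost:9092")                               # placeholder broker address
consumer = SimpleConsumer(client, "example-group", "example-topic")  # placeholder group/topic
for message in consumer:
    # iteration yields OffsetAndMessage tuples; the payload bytes are in message.message.value
    print("%d: %s" % (message.offset, message.message.value))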
Example 1: run
def run(self):
    client = KafkaClient(self.bootstrap_server, client_id='commandline')
    consumer = SimpleConsumer(client, self.group, self.topic,
                              auto_commit_every_n=1, buffer_size=160,
                              auto_commit=True)
    for message in consumer:
        now = datetime.now()
        print("%s: %s" % (now, message))
        consumer.commit()
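A note on the offset handling in Example 1: with auto_commit=True and auto_commit_every_n=1, the consumer already commits after every fetched message, so the explicit consumer.commit() inside the loop is mostly redundant. For fully manual offset management the usual pattern is the opposite, sketched below; the group and topic names are placeholders and process() is a hypothetical handler.

consumer = SimpleConsumer(client, "example-group", "example-topic", auto_commit=False)
for message in consumer:
    process(message.message.value)   # hypothetical handler
    consumer.commit()                # commit only after the message was handled successfully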
Example 2: main
def main():
    client = KafkaClient("localhost:9092")
    consumer = SimpleConsumer(client, "test-group", "twitter_raw")
    consumer.seek(0, 2)
    num = 0
    for message in consumer:
        print "redis publish:", num
        num += 1
        try:
            data_depickled = pickle.loads(message.message.value.decode('utf-8'))
        except Exception:
            # skip messages that fail to unpickle
            continue
        # print data_depickled
        # {
        #     'text': '@_LulaMoore me hamas perra',
        #     'created_at': datetime.datetime(2015, 10, 9, 23, 36, 49),
        #     'source': u'Twitter Web Client',
        #     'lang:': u'es',
        #     'place': {
        #         'country_code': u'AR',
        #         'coordinates': [
        #             [-68.176283, -38.984724],
        #             [-68.176283, -38.921051],
        #             [-68.015162, -38.921051],
        #             [-68.015162, -38.984724]
        #         ]
        #     },
        #     'user': {
        #         'statuses_count': 15067,
        #         'name': u'Dama negra *\uffe6*',
        #         'friends_count': 390,
        #         'created_at': datetime.datetime(2014, 3, 15, 2, 37, 10),
        #         'profile_image_url': u'http://pbs.twimg.com/profile_images/652333268256313344/x9K9Nlys_normal.jpg',
        #         'followers_count': 384,
        #         'id': 2390242428
        #     },
        #     'id': 652628813935980544
        # }
        ### process data here ###
        # text = data_depickled['text']
        filtered_data = data_filter(data_depickled)
        data_pickled = pickle.dumps(filtered_data)
        redis.publish('tweets_processed', data_pickled)
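Example 2 (and several examples below) calls consumer.seek(0, 2) before reading. In the legacy SimpleConsumer API the second argument is a whence flag: 0 is relative to the earliest available offset, 1 to the current position, and 2 to the latest, so seek(0, 2) jumps to the tail of the topic and the loop only sees messages produced after startup. A short hedged sketch, assuming a consumer built as in the examples:

consumer.seek(0, 0)   # rewind to the earliest available offset
consumer.seek(0, 2)   # or skip the backlog and start from the tail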
Example 3: blocking_consumer
def blocking_consumer(self, message_consume_function, parse_json, topic_group, topic_name):
    print "starting blocking consumer with topic group %s and topic name %s" % (topic_group, topic_name)
    consumer = SimpleConsumer(self.client, topic_group, topic_name)
    consumer.seek(0, 2)
    for message in consumer:
        message = parse_json(message)
        print "=============" + str(message) + "============"
        message_consume_function(message)
        print "called message consume function"
Example 4: main
def main():
    client = KafkaClient("localhost:9092")
    consumer = SimpleConsumer(client, "test-group", "twitter_raw")
    consumer.seek(0, 2)
    for message in consumer:
        # data_deserialized = str.decode(message.message.value)
        data_depickled = pickle.loads(message.message.value.decode('utf-8'))
        # print str(data_depickled).decode('string_escape')
        print data_depickled
Example 5: run
def run(self):
    client = KafkaClient("10.206.216.13:19092,10.206.212.14:19092,10.206.209.25:19092")
    consumer = SimpleConsumer(client, "test-group", "jiketest",
                              auto_commit=False, partitions=self.part)
    consumer.seek(0, 0)
    while True:
        message = consumer.get_message(True, 60)  # block for up to 60 seconds
        self.__offset = message.offset
        print message.message.value
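One caveat about Example 5: in the legacy API, get_message(block, timeout) returns None when the timeout expires without a message, so reading message.offset unconditionally can raise AttributeError. A hedged sketch of a safer loop, assuming the consumer constructed above:

while True:
    message = consumer.get_message(True, 60)
    if message is None:          # timed out with nothing to read; poll again
        continue
    offset = message.offset
    print(message.message.value)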
Example 6: Consumer
class Consumer(object):
    def __init__(self, addr, group, topic):
        """Initialize Consumer with kafka broker IP, group, and topic."""
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic,
                                       max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.hadoop_path = "/insight/artsy/geo"
        self.topic = topic
        self.group = group
        self.block_cnt = 0
    def consume_topic(self, output_dir):
        """Consumes a stream of messages from the "post_geo_activity" topic.
        Code template from https://github.com/ajmssc/bitcoin-inspector.git
        """
        timestamp = time.strftime('%Y%m%d%H%M%S')
        # open file for writing
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
        while True:
            try:
                # get 1000 messages at a time, non blocking
                messages = self.consumer.get_messages(count=1000, block=False)
                for message in messages:
                    self.temp_file.write(message.message.value + "\n")
                # file size > 20MB
                if self.temp_file.tell() > 20000000:
                    self.flush_to_hdfs(output_dir)
                self.consumer.commit()
            except:
                # move to tail of kafka topic if consumer is referencing
                # unknown offset
                self.consumer.seek(0, 2)
    def flush_to_hdfs(self, output_dir):
        """Flushes the 20MB file into HDFS."""
        self.temp_file.close()
        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group, self.topic, timestamp)
        print "Block {}: Flushing data file to HDFS => {}".format(str(self.block_cnt), hadoop_fullpath)
        self.block_cnt += 1
        os.system("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath))  # save from local to hdfs
        os.remove(self.temp_file_path)  # remove temp local file
        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
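The bare except: in consume_topic above swallows every error, not only the unknown-offset case its comment describes. With the legacy kafka-python package the specific exception can be caught instead; a hedged sketch, assuming the exception class still lives in kafka.common in the installed version:

from kafka.common import OffsetOutOfRangeError

try:
    messages = consumer.get_messages(count=1000, block=False)
except OffsetOutOfRangeError:
    # the committed offset no longer exists on the broker; jump to the tail
    consumer.seek(0, 2)
    messages = []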
Example 7: KafkaDatawakeLookaheadSpout
class KafkaDatawakeLookaheadSpout(Spout):
    group = 'datawake-crawler-out-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['crawler-out-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('KafkaDatawakeLookaheadSpout initialized with topic =' + self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.consumer = SimpleConsumer(self.kafka, self.group, self.topic, max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeLookaheadSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise
    def next_tuple(self):
        """
        input message:
            dict(
                id = input['id'],
                appid = input['appid'],
                url = url,
                status_code = response.getcode(),
                status_msg = 'Success',
                timestamp = response.info()['date'],
                links_found = links,
                raw_html = html,
                attrs = input['attrs']
            )
        :return: (url, status, headers, flags, body, timestamp, source, context)
        """
        offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
        message = offsetAndMessage.message.value
        crawled = json.loads(message)
        safeurl = crawled['url'].encode('utf-8', 'ignore')
        self.log("Lookahead spout received id: " + crawled['id'] + " url: " + safeurl)
        context = {
            'source': 'datawake-lookahead',
            'userId': crawled['attrs']['userId'],
            'org': crawled['attrs']['org'],
            'domain': crawled['attrs']['domain'],
            'url': crawled['url']
        }
        self.emit([crawled['url'], crawled['status_code'], '', '', crawled['raw_html'], crawled['timestamp'], context['source'], context])
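Example 7 pulls one message at a time with get_messages(timeout=None), which blocks until at least one message arrives and returns a list of OffsetAndMessage tuples. A hedged sketch of the same decode step outside the Storm spout, with dictionary access guarded against missing fields (field names follow the docstring above):

offset_and_message = consumer.get_messages(timeout=None)[0]   # blocks until a message arrives
crawled = json.loads(offset_and_message.message.value)
attrs = crawled.get('attrs', {})
context = {'source': 'datawake-lookahead',
           'userId': attrs.get('userId'),
           'org': attrs.get('org'),
           'domain': attrs.get('domain'),
           'url': crawled.get('url')}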
Example 8: spiderIdle
def spiderIdle(self, spider):
    consumer = SimpleConsumer(self.kafka_conn, "test", "commands")
    for msg in consumer.get_messages():
        print msg.message.value
        if msg.message.value == spider.name + "_stop":
            print "stop"
            spider.spider_pause()
            # spider.close(spider,'ok')
            # self.scrapy.engine.close_spider(spider, 'closespider_itemcount')
        if msg.message.value == spider.name + "_start":
            # self.scrapy.engine.scraper.open_spider(spider)
            spider.spider_resume()
Example 9: __init__
def __init__(self, conn_pool, topic, group):
    self.conn_pool = conn_pool
    self.topic = topic
    self.group = group
    self.kafka = KafkaClient(self.conn_pool)
    self.consumer = SimpleConsumer(self.kafka, self.group, self.topic, max_buffer_size=None)
    self.consumer.seek(0, 2)  # move to the tail of the queue
Example 10: Consumer
class Consumer(object):
    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.topic = "steps_data_part4"
        self.consumer_group = 's3_consumer'
        self.consumer = SimpleConsumer(self.client, self.consumer_group, self.topic)

    def consume_message(self):
        while True:
            timestamp = time.strftime('%Y%m%d%H%M%S')
            temp_file_name = "%s_%s_%s.dat" % (self.topic, self.consumer_group, timestamp)
            temp_file = open("/home/ubuntu/rankMyStep/kafka/" + temp_file_name, "w")
            messages = self.consumer.get_messages(count=1000, block=False)
            for msg in messages:
                print msg.message.value + "\n"
                temp_file.write(msg.message.value + "\n")
            temp_file.close()  # flush the batch to disk before handing the file to S3
            self.save_to_s3(temp_file_name)

    def save_to_s3(self, file_name):
        mybucket = "anurag-raw-data-store"
        aws_access_key = os.getenv('AWS_ACCESS_KEY_ID', 'default')
        aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY', 'default')
        s3_client = boto3.client('s3')
        s3_client.upload_file("/home/ubuntu/rankMyStep/kafka/" + file_name,
                              mybucket, "rankmysteps/" + file_name)
        os.remove("/home/ubuntu/rankMyStep/kafka/" + file_name)
Example 11: __init__
def __init__(self, addr, group, topic):
    self.client = KafkaClient(addr)
    self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000)
    self.temp_file_path = None
    self.temp_file = None
    self.topic = topic
    self.group = group
    self.block_cnt = 0
Example 12: KafkaDatawakeVisitedSpout
class KafkaDatawakeVisitedSpout(Spout):
    group = 'datawake-visited-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['visited-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('KafkaDatawakeVisitedSpout initialized with topic =' + self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka, self.group, self.topic, max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeVisitedSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise
    def next_tuple(self):
        """
        input: (timestamp,org,domain,user_id,url,html)
        :return: (url, status, headers, flags, body, timestamp, source, context)
        """
        try:
            for message in self.consumer:
                self.log("msg")
                self.log(message)
                # offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
                # the iterator yields OffsetAndMessage tuples; the '\0'-delimited record is in message.message.value
                message = message.message.value.split('\0')
                (timestamp, org, domain, userId, url, html) = message
                context = {
                    'source': 'datawake-visited',
                    'domain': domain
                }
                self.emit([url, '', '', '', html, timestamp, context['source'], context])
        except:
            self.log(traceback.format_exc(), level='error')

    def fail(self, tup_id):
        pass
Example 13: CrawlerSpout
class CrawlerSpout(Spout):
    group = 'datawake-crawler-in-consumer'.encode()

    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['crawler-in-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('CrawlerSpout initialized with topic =' + self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka, self.group, self.topic,
                                           max_buffer_size=None, fetch_size_bytes=2000000)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("CrawlerSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise
    def next_tuple(self):
        """
        input message:
            json.dumps(dict(
                id = 'abcdefg', #TODO generate UUID,
                appid = self.appid,
                url = url,
                priority = 50,
                depth = 0,
                attrs = dict(
                    userId = context['userId'],
                    org = context['org'],
                    domain = context['domain']
                )
            ))
        :return:
        """
        try:
            for message in self.consumer:
                # the iterator yields OffsetAndMessage tuples; the JSON payload is in message.message.value
                to_crawl = json.loads(message.message.value)
                self.emit([to_crawl])
        except:
            self.log(traceback.format_exc(), level='error')
Example 14: __init__
def __init__(self, addr, group, topic):
    """Initialize Consumer with kafka broker IP, group, and topic."""
    self.client = KafkaClient(addr)
    self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000)
    self.temp_file_path = None
    self.temp_file = None
    self.hadoop_path = "/user/parking_data/history"
    self.topic = topic
    self.group = group
    self.block_cnt = 0
Example 15: __init__
def __init__(self, addr, group, topic):
    self.client = KafkaClient(addr)
    self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000)
    self.temp_file_path = None
    self.temp_file = None
    self.hadoop_path = "/user/AdReport/%s/history" % (topic)
    self.cached_path = "/user/AdReport/%s/cached" % (topic)
    self.topic = topic
    self.group = group
    self.block_cnt = 0
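None of the examples above shows shutdown, which matters when auto_commit is enabled because the consumer runs a background commit timer. A minimal hedged sketch of a clean teardown with the legacy API, assuming consumer and client were built as in the earlier sketches and handle() is a hypothetical callback:

try:
    for message in consumer:
        handle(message.message.value)   # hypothetical handler
except KeyboardInterrupt:
    pass
finally:
    consumer.stop()    # commits pending offsets and stops the auto-commit timer
    client.close()     # closes the underlying Kafka connections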