

Python SimpleConsumer.seek Method Code Examples

This page collects typical usage examples of the Python method kafka.consumer.SimpleConsumer.seek, gathered from open-source projects. If you are wondering what SimpleConsumer.seek does, how to call it, or how it is used in practice, the curated examples below should help. You can also browse further usage examples of the enclosing class, kafka.consumer.SimpleConsumer.


The following shows 15 code examples of the SimpleConsumer.seek method, sorted by popularity by default.
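Before the examples, here is a minimal sketch of the typical call pattern. It is a sketch, assuming the pre-1.0 kafka-python package used throughout the examples below; the broker address, group, and topic names are placeholders. seek(offset, whence) repositions the consumer: whence=0 is relative to the earliest available offset, whence=1 to the current offset, and whence=2 to the latest known offset (see the docstring quoted in Example 12).

from kafka.client import KafkaClient
from kafka.consumer import SimpleConsumer

client = KafkaClient("localhost:9092")                  # placeholder broker address
consumer = SimpleConsumer(client, "demo-group", "demo-topic")

consumer.seek(0, 0)   # rewind to the head: re-read everything still on the broker
consumer.seek(0, 2)   # jump to the tail: only messages produced from now on

for msg in consumer:  # iteration yields OffsetAndMessage objects
    print(msg.message.value)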

Example 1: main

# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
def main():
    client = KafkaClient("localhost:9092")
    consumer = SimpleConsumer(client, "test-group", "twitter_raw")
    consumer.seek(0,2)

    num = 0
    for message in consumer:
        print "redis publish:", num
        num+=1
        try:
            data_depickled = pickle.loads(message.message.value.decode('utf-8'))
        except Exception, e:
            continue
        # print data_depickled
        # {  
        #    'text':'@_LulaMoore me hamas perra',
        #    'created_at':datetime.datetime(2015, 10, 9, 23, 36, 49),
        #    'source':u'Twitter Web Client',
        #    'lang:':u'es',
        #    'place':{  
        #       'country_code':u'AR',
        #       'coordinates':[  
        #          [  
        #             -68.176283,
        #             -38.984724
        #          ],
        #          [  
        #             -68.176283,
        #             -38.921051
        #          ],
        #          [  
        #             -68.015162,
        #             -38.921051
        #          ],
        #          [  
        #             -68.015162,
        #             -38.984724
        #          ]
        #       ]
        #    },
        #    'user':{  
        #       'statuses_count':15067,
        #       'name':u'Dama negra *\uffe6*',
        #       'friends_count':390,
        #       'created_at':datetime.datetime(2014, 3, 15,2,37, 10),
        #       'profile_image_url': u'http://pbs.twimg.com/profile_images/652333268256313344/x9K9Nlys_normal.jpg',
        #       'followers_count':384,
        #       'id':2390242428
        #    },
        #    'id':652628813935980544
        # }

        ### process data here ###
        # text = data_depickled['text']
        filtered_data = data_filter(data_depickled)
        data_pickled = pickle.dumps(filtered_data)
        redis.publish('tweets_processed', data_pickled)
Author: krist-jin | Project: tweets-map | Lines: 59 | Source file: simpleSub.py

Example 2: run

# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
    def run(self):
        client = KafkaClient("10.206.216.13:19092,10.206.212.14:19092,10.206.209.25:19092")
        consumer = SimpleConsumer(client, "test-group", "jiketest",auto_commit=False,partitions=self.part)

        consumer.seek(0,0)

        while True:
            message = consumer.get_message(True,60)
            self.__offset = message.offset
            print message.message.value
Author: 2lusy | Project: learndemo | Lines: 12 | Source file: partition_consumer.py

Example 3: main

# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
def main():
    client = KafkaClient("localhost:9092")
    consumer = SimpleConsumer(client, "test-group", "twitter_raw")
    consumer.seek(0,2)

    for message in consumer:
        # data_deserialized = str.decode(message.message.value)
        data_depickled = pickle.loads(message.message.value.decode('utf-8'))
        # print str(data_depickled).decode('string_escape')
        print data_depickled
Author: krist-jin | Project: tweets-map | Lines: 12 | Source file: testSub.py

Example 4: blocking_consumer

# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
    def blocking_consumer(self, message_consume_function, parse_json, topic_group, topic_name):
        print "starting blocking consumer with topic group %s and topic name %s" % (topic_group, topic_name)
        consumer = SimpleConsumer(self.client, topic_group, topic_name)
        consumer.seek(0,2)

        for message in consumer:
            message = parse_json(message)
            print "=============" + str(message) + "============"
            message_consume_function(message)
            print "called message consume function"
Author: pombredanne | Project: splash-kafka | Lines: 12 | Source file: kafkaclient.py

Example 5: Consumer

# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
class Consumer(object):

    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000, auto_offset_reset='smallest')
        self.temp_file_path = None
        self.temp_file = None
        self.topic = topic
        self.group = group
        self.block_cnt = 0


    def consume_topic(self):

        timestamp = time.strftime('%Y%m%d%H%M%S')

        #open file for writing
        self.temp_file_path = "/home/ubuntu/datamill/kafka_%s_%s_%s.dat" % (self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path,"w")
        header = 'experiment_id,job_id,results_file,package_id,package_name,worker_id,config_id,replicate_no,setup_time,run_time,collect_time,hw_cpu_arch,hw_cpu_mhz,hw_gpu_mhz,hw_num_cpus,hw_page_sz,hw_ram_mhz,hw_ram_sz,sw_address_randomization,sw_autogroup,sw_compiler,sw_drop_caches,sw_env_padding,sw_filesystem,sw_freq_scaling,sw_link_order,sw_opt_flag,sw_swap,sw_sys_time'
        self.temp_file.write(header + "\n")  # newline so the first record starts on its own line

        while True:
            try:
                messages = self.consumer.get_messages(count=100, block=False)

                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                if self.temp_file.tell() > 20000:
                    self.save_to_hdfs()

                self.consumer.commit()
            except:
                self.consumer.seek(0, 2)

        self.consumer.commit()

    def save_to_hdfs(self):
        self.temp_file.close()

        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_path = "/datamill/%s_%s_%s.csv" % (self.group, self.topic, timestamp)
        print "Block " + str(self.block_cnt) + ": Saving file to HDFS " + hadoop_path
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        os.system("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_path))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')

        self.temp_file_path = "/home/ubuntu/datamill/kafka_%s_%s_%s.dat" % (self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
Author: yuguang | Project: performance-lab | Lines: 56 | Source file: hdfs_consumer.py

Example 6: Consumer

# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
class Consumer(object):
    def __init__(self, addr, group, topic):
        """Initialize Consumer with kafka broker IP, group, and topic."""
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic,
                                       max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.hadoop_path = "/insight/artsy/geo"
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self, output_dir):
        """Consumes a stream of messages from the "post_geo_activity" topic.
        Code template from https://github.com/ajmssc/bitcoin-inspector.git
        """
        timestamp = time.strftime('%Y%m%d%H%M%S')
        
        # open file for writing
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,self.topic,self.group,timestamp)
        self.temp_file = open(self.temp_file_path,"w")

        while True:
            try:
                # get 1000 messages at a time, non blocking
                messages = self.consumer.get_messages(count=1000, block=False)
                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                # file size > 20MB
                if self.temp_file.tell() > 20000000:
                    self.flush_to_hdfs(output_dir)

                self.consumer.commit()
            except:
                # move to tail of kafka topic if consumer is referencing
                # unknown offset
                self.consumer.seek(0, 2)


    def flush_to_hdfs(self, output_dir):
        """Flushes the 20MB file into HDFS."""
        self.temp_file.close()
        timestamp = time.strftime('%Y%m%d%H%M%S')
        hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group,self.topic, timestamp)

        print "Block {}: Flushing data file to HDFS => {}".format(str(self.block_cnt),hadoop_fullpath)
        self.block_cnt += 1
        os.system("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath)) # save from local to hdfs
        os.remove(self.temp_file_path) # remove temp local file
        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,self.topic,self.group,timestamp)
        self.temp_file = open(self.temp_file_path, "w")
Author: keiraqz | Project: artmosphere | Lines: 56 | Source file: hdfs_consumer.py
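Examples 5, 6, 13, and 14 all wrap get_messages() in a bare except: and react by seeking to the tail, which silently swallows every other error as well. A narrower variant, mirroring Example 11 (which catches OffsetOutOfRangeError explicitly), might look like the sketch below. It assumes the pre-1.0 kafka-python layout where OffsetOutOfRangeError lives in kafka.common; drain() and the already-open out_file are hypothetical names used only for illustration.

from kafka.common import OffsetOutOfRangeError

def drain(consumer, out_file):
    """Pull one non-blocking batch from the consumer and append it to out_file."""
    try:
        for message in consumer.get_messages(count=1000, block=False):
            out_file.write(message.message.value + "\n")
        consumer.commit()
    except OffsetOutOfRangeError:
        # The committed offset no longer exists on the broker (e.g. log retention
        # removed it), so resume from the tail of the topic instead.
        consumer.seek(0, 2)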

Example 7: KafkaDatawakeLookaheadSpout

# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
class KafkaDatawakeLookaheadSpout(Spout):
    group = 'datawake-crawler-out-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['crawler-out-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('KafkaDatawakeLookaheadSpout initialized with topic =' + self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.consumer = SimpleConsumer(self.kafka, self.group, self.topic, max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeLookaheadSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise


    def next_tuple(self):
        """
        input message:
            dict(
                 id = input['id'],
                 appid = input['appid'],
                 url = url,
                 status_code = response.getcode(),
                 status_msg = 'Success',
                 timestamp = response.info()['date'],
                 links_found = links,
                 raw_html =  html,
                 attrs = input['attrs']
            )
        :return:  (url, status, headers, flags, body, timestamp, source,context)
        """

        offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
        message = offsetAndMessage.message.value

        crawled = json.loads(message)
        safeurl = crawled['url'].encode('utf-8', 'ignore')
        self.log("Lookahead spout received id: " + crawled['id'] + " url: " + safeurl)
        context = {
            'source': 'datawake-lookahead',
            'userId': crawled['attrs']['userId'],
            'org': crawled['attrs']['org'],
            'domain': crawled['attrs']['domain'],
            'url': crawled['url']
        }
        self.emit([crawled['url'], crawled['status_code'], '', '', crawled['raw_html'], crawled['timestamp'], context['source'], context])
Author: diffeo | Project: Datawake | Lines: 55 | Source file: kafka_spouts.py

Example 8: __init__

# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
class KafkaConsumer:

    group = "python-lookahead-consumer"

    def __init__(self,conn_pool,topic,group):
        self.conn_pool = conn_pool
        self.topic = topic
        self.group = group
        self.kafka = KafkaClient(self.conn_pool)
        self.consumer = SimpleConsumer(self.kafka,self.group,self.topic,max_buffer_size=None)
        self.consumer.seek(0,2) # move to the tail of the queue

    def next(self):
        offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
        message = offsetAndMessage.message.value
        return message
Author: Sotera | Project: Datawake-Legacy | Lines: 18 | Source file: kafka_consumer.py
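A typical way to drive Example 8's wrapper is sketched below; the broker list, topic name, and downstream handle() function are hypothetical placeholders and do not come from the original project.

consumer = KafkaConsumer("kafka01:9092,kafka02:9092", "crawler-out", "python-lookahead-consumer")
while True:
    raw = consumer.next()   # blocks until one message arrives (timeout=None)
    handle(raw)             # hypothetical processing step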

Example 9: KafkaDatawakeVisitedSpout

# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
class KafkaDatawakeVisitedSpout(Spout):
    group = 'datawake-visited-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['visited-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('KafkaDatawakeVisitedSpout initialized with topic =' + self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka, self.group, self.topic, max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeVisitedSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """
        input:  (timestamp,org,domain,user_id,url,html)
        :return:  (url, status, headers, flags, body, timestamp, source,context)
        """
        try:
            for message in self.consumer:
                self.log("msg")
                self.log(message)
                #offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
                message = message.message.value.split('\0')  # unwrap the OffsetAndMessage to get the raw payload
                (timestamp, org, domain, userId, url, html) = message
                context = {
                    'source': 'datawake-visited',
                    'domain': domain
                }
                self.emit([url, '', '', '', html, timestamp, context['source'], context])
        except:
            self.log(traceback.format_exc(), level='error')

    def fail(self, tup_id):
        pass
Author: Sotera | Project: datawake-prefetch | Lines: 46 | Source file: kafka_spouts.py

Example 10: CrawlerSpout

# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
class CrawlerSpout(Spout):

    group = 'datawake-crawler-in-consumer'.encode()


    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['crawler-in-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('CrawlerSpout initialized with topic ='+self.topic+' conn_pool='+self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka,self.group,self.topic,max_buffer_size=None, fetch_size_bytes=2000000)
            self.consumer.seek(0,2) # move to the tail of the queue
        except:
            self.log("CrawlerSpout initialize error",level='error')
            self.log(traceback.format_exc(),level='error')
            raise

    def next_tuple(self):
        """
        input message:
             json.dumps(dict(
                    id = 'abcdefg', #TODO generate UUID,
                    appid = self.appid,
                    url = url,
                    priority = 50,
                    depth = 0,
                    attrs  = dict(
                        userId = context['userId'],
                        org =  context['org'],
                        domain = context['domain']
                    )
                ))
        :return:
        """
        try:
            for message in self.consumer:
                to_crawl = json.loads(message.message.value)  # the iterator yields OffsetAndMessage objects
                self.emit([to_crawl])
        except:
            self.log(traceback.format_exc(),level='error')
Author: Sotera | Project: datawake-prefetch | Lines: 45 | Source file: crawler_spout.py

Example 11: __init__

# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]

#......... part of the code omitted here .........
            for property, subschema in properties.iteritems():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

        return validators.extend(
            validator_class, {"properties": set_defaults},
        )

    def handle_crawl_request(self, dict):
        '''
        Processes a vaild crawl request

        @param dict: a valid dictionary object
        '''
        # format key
        key = "{sid}:queue".format(sid=dict['spiderid'])
        val = pickle.dumps(dict, protocol=-1)

        # shortcut to shove stuff into the priority queue
        self.redis_conn.zadd(key, val, -dict['priority'])

        # if timeout crawl, add value to redis
        if 'expires' in dict:
            key = "timeout:{sid}:{appid}:{crawlid}".format(
                sid=dict['spiderid'],
                appid=dict['appid'],
                crawlid=dict['crawlid'])
            self.redis_conn.set(key, dict['expires'])

    def handle_action_request(self, dict):
        '''
        Processes a vaild action request

        @param dict: The valid dictionary object
        '''
        # format key
        key = "{action}:{spiderid}:{appid}".format(
            action=dict['action'],
            spiderid=dict['spiderid'],
            appid=dict['appid'])

        if "crawlid" in dict:
            key = key + ":" + dict['crawlid']

        self.redis_conn.set(key, dict['uuid'])

    def _main_loop(self):
        '''
        Continuous loop that reads from a kafka topic and tries to validate
        incoming messages
        '''
        while True:
            start = time.time()

            try:
                for message in self.consumer.get_messages():
                    if message is None:
                        break
                    try:
                        the_dict = json.loads(message.message.value)

                        try:
                            self.validator(self.schema).validate(the_dict)
                            self.result_method(the_dict)
                        except ValidationError as ex:
                            print "invalid json received"

                    except ValueError:
                        print "bad json recieved"
            except OffsetOutOfRangeError:
                # consumer has no idea where they are
                self.consumer.seek(0, 2)

            end = time.time()
            time.sleep(.01)

    def run(self):
        '''
        Sets up the schema to be validated against
        '''
        self.setup()
        with open(self.settings.SCHEMA) as the_file:
            # No try/catch so we can see if there is a json parse error
            # on the schemas
            self.schema = json.load(the_file)
            self._main_loop()

    def feed(self, json_item):
        '''
        Feeds a json item into the Kafka topic

        @param json_item: The loaded json object
        '''
        topic = self.settings.KAFKA_INCOMING_TOPIC
        producer = SimpleProducer(self.kafka_conn)
        print "=> feeding JSON request into {0}...".format(topic)
        print json.dumps(json_item, indent=4)
        self.kafka_conn.ensure_topic_exists(topic)
        producer.send_messages(topic, json.dumps(json_item))
        print "=> done feeding request."
Author: openslack | Project: openslack-crawler | Lines: 104 | Source file: kafka-monitor.py

Example 12: ZKConsumer

# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]

#......... part of the code omitted here .........
                return

        self.consumer = SimpleConsumer(self.client, self.group, self.topic,
                                       partitions=my_partitions,
                                       **self.consumer_kwargs)
        self.consumer.provide_partition_info()
        self.logger.info("Consumer connected to Kafka: %s", self.consumer.offsets)

    def stop(self):
        if self.consumer is not None:
            self.logger.info('Stopping Kafka consumer')
            self.consumer.stop()
            self.consumer = None
        if self.client is not None:
            self.logger.info('Stopping Kafka client')
            self.client.close()
            self.client = None
        if self.zk is not None:
            self.logger.info('Stopping ZooKeeper client')
            if self.zkp is not None and not self.zkp.failed:
                self.zkp.finish()
                self.zk.stop()
            self.zkp = None
            self.zk = None

    def commit(self, partitions=None):
        """
        Commit offsets for this consumer

        partitions: list of partitions to commit, default is to commit
                    all of them
        """
        if self.consumer is None:
            return
        self.logger.debug('Begin committing offsets for partitions: %s',
                          partitions if partitions else 'All')
        self.consumer.commit(partitions)
        self.logger.debug('End committing offsets for partitions: %s',
                          partitions if partitions else 'All')

    def pending(self, partitions=None):
        """
        Gets the pending message count

        partitions: list of partitions to check for, default is to check all
        """
        return self.consumer.pending(partitions)

    def provide_partition_info(self):
        """
        Indicates that partition info must be returned by the consumer
        """
        self.consumer.provide_partition_info()

    def seek(self, offset, whence):
        """
        Alter the current offset in the consumer, similar to fseek

        offset: how much to modify the offset
        whence: where to modify it from
                0 is relative to the earliest available offset (head)
                1 is relative to the current offset
                2 is relative to the latest known offset (tail)
        """
        self.consumer.seek(offset, whence)

    def get_messages(self, count=1, block=True, timeout=0.1):
        """
        Fetch the specified number of messages

        count: Indicates the maximum number of messages to be fetched
        block: If True, the API will block till some messages are fetched.
        timeout: If block is True, the function will block for the specified
                 time (in seconds) until count messages is fetched. If None,
                 it will block forever.
        """
        if self.consumer is None:
            return []
        else:
            try:
                messages = self.consumer.get_messages(count, block, timeout)
                if not messages and self.zkp.failed:
                    raise FailedPayloadsError
                return messages
            except FailedPayloadsError as err:
                msg = 'Failed to retrieve payload, restarting consumer'
                self.logger.exception(msg)
                raise err

    def get_message(self, block=True, timeout=0.1, get_partition_info=None):
        return self.consumer.get_message(block, timeout, get_partition_info)

    def _get_message(self, block=True, timeout=0.1, get_partition_info=None,
                     update_offset=True):
        return self.consumer._get_message(block, timeout, get_partition_info,
                                          update_offset)

    def __iter__(self):
        for msg in self.consumer:
            yield msg
Author: CrowdStrike | Project: cs.eyrie | Lines: 104 | Source file: zk_consumer.py
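The seek docstring above spells out the three whence modes. Applied directly to a SimpleConsumer (or to this wrapper), the calls would look like the sketch below; the offset values are illustrative only.

consumer.seek(0, 0)     # whence=0: start from the earliest available offset (head)
consumer.seek(-100, 1)  # whence=1: move back 100 relative to the current offset
consumer.seek(0, 2)     # whence=2: skip to the latest known offset (tail)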

Example 13: Consumer

# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
class Consumer(object):
    """Kafka consumer class with functions to consume messages to HDFS.
    Messages are blocked into 20MB files and transferred to HDFS
    Attributes:
        client: string representing IP:port of the kafka broker
        consumer: Consumer object specifying the client group, and topic
        temp_file_path: location of the 20MB file to be appended to before
            transfer to HDFS
        temp_file: File object opened from temp_file_path
        topic: String representing the topic on Kafka
        group: String representing the Kafka consumer group to be associated
            with
        block_cnt: integer representing the block count for print statements
    """

    def __init__(self, addr, group, topic):
        """Initialize Consumer with kafka broker IP, group, and topic."""
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.hadoop_path = "/user/parking_data/history"
        self.topic = topic
        self.group = group
        self.block_cnt = 0

    def consume_topic(self, output_dir):
        """Consumes a stream of messages from the "messages" topic.
        Code template from https://github.com/ajmssc/bitcoin-inspector.git
        Args:
            output_dir: string representing the directory to store the 20MB
                before transferring to HDFS
        Returns:
            None
        """
        timestamp = time.strftime("%Y%m%d%H%M%S")

        # open file for writing
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")

        # while True:
        for ii in range(0, 2):
            try:
                # get 1000 messages at a time, non blocking
                messages = self.consumer.get_messages(count=1000, block=False)

                # OffsetAndMessage(offset=43, message=Message(magic=0,
                # attributes=0, key=None, value='some message'))
                for message in messages:
                    self.temp_file.write(message.message.value + "\n")

                # file size > 20MB
                if self.temp_file.tell() > 20000000:
                    self.flush_to_hdfs(output_dir)

                self.consumer.commit()
            except:
                # move to tail of kafka topic if consumer is referencing
                # unknown offset
                self.consumer.seek(0, 2)

    def flush_to_hdfs(self, output_dir):
        """Flushes the 20MB file into HDFS.
        Code template from https://github.com/ajmssc/bitcoin-inspector.git
        Flushes the file into HDFS folders
        Args:
            output_dir: string representing the directory to store the 20MB
                before transferring to HDFS
        Returns:
            None
        """
        self.temp_file.close()

        timestamp = time.strftime("%Y%m%d%H%M%S")

        hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group, self.topic, timestamp)
        print "Block {}: Flushing 20MB file to HDFS => {}".format(str(self.block_cnt), hadoop_fullpath)
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        print ("hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath))
        os.system("sudo hdfs dfs -put %s %s" % (self.temp_file_path, hadoop_fullpath))
        # os.system("sudo -u hdfs hdfs dfs -put %s %s" % (self.temp_file_path,
        # cached_fullpath))
        os.remove(self.temp_file_path)

        timestamp = time.strftime("%Y%m%d%H%M%S")

        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir, self.topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")
Author: suhashm | Project: ParkMate | Lines: 93 | Source file: kafka_to_hdfs.py

Example 14: Consumer

# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
class Consumer(object):

    def __init__(self, addr, group, topic):
        self.client = KafkaClient(addr)
        self.consumer = SimpleConsumer(self.client, group, topic, max_buffer_size=1310720000)
        self.temp_file_path = None
        self.temp_file = None
        self.hadoop_path = "/user/AdReport/%s/history" %(topic)
        self.cached_path = "/user/AdReport/%s/cached" % (topic)
        self.topic = topic
        self.group = group
        self.block_cnt = 0


    def consume_topic(self, output_dir):

        timestamp = time.strftime('%Y%m%d%H%M%S')
        
        #open file for writing
        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,
                                                         self.topic,
                                                         self.group,
                                                         timestamp)
        self.temp_file = open(self.temp_file_path,"w")
        print (self.temp_file)
        #one_entry = False

        while True:
            try:
                messages = self.consumer.get_messages(count=10, block=False)

                #OffsetAndMessage(offset=43, message=Message(magic=0,
                # attributes=0, key=None, value='some message'))
                for message in messages:
                    print (message)
                    #one_entry = True
                    #print (self.temp_file.tell())
                    self.temp_file.write(message.message.value + "\n")

                if self.temp_file.tell() > 2000000:
                    self.save_to_hdfs(output_dir)

                self.consumer.commit()
            except:
                self.consumer.seek(0, 2)

        #if one_entry:
            #print ("sending to hdfs")
            #self.save_to_hdfs(output_dir, self.topic)
        #self.consumer.commit()

    def save_to_hdfs(self, output_dir):
	print ("Saving file to hdfs")
        self.temp_file.close()
	print ("Closed open file")
        timestamp = time.strftime('%Y%m%d%H%M%S')

        hadoop_fullpath = "%s/%s_%s_%s.dat" % (self.hadoop_path, self.group,
                                               self.topic, timestamp)
        cached_fullpath = "%s/%s_%s_%s.dat" % (self.cached_path, self.group,
                                               self.topic, timestamp)
        #print ("Block " + str(self.block_cnt) + ": Saving file to HDFS " + hadoop_fullpath)
        self.block_cnt += 1

        # place blocked messages into history and cached folders on hdfs
        os.system("sudo -u ubuntu /usr/local/hadoop/bin/hdfs dfs -put %s %s" % (self.temp_file_path,
                                                        hadoop_fullpath))
        os.system("sudo -u ubuntu /usr/local/hadoop/bin/hdfs dfs -put %s %s" % (self.temp_file_path,
                                                        cached_fullpath))
        os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')

        self.temp_file_path = "%s/kafka_%s_%s_%s.dat" % (output_dir,
                                                         self.topic,
                                                         self.group,
                                                         timestamp)
        self.temp_file = open(self.temp_file_path, "w")
Author: prarthanabhattarai | Project: AdReportProject | Lines: 80 | Source file: kafka_consumer_bids.py

Example 15: main

# Required import: from kafka.consumer import SimpleConsumer [as alias]
# Or: from kafka.consumer.SimpleConsumer import seek [as alias]
def main():
    """kafkadump: Kafka topic dump utility for debugging.

    Usage:
        kafkadump list --host=<host>
        kafkadump dump <topic> --host=<host> [--consumer=<consumer>]

    Examples:

        List all the topics on your local Kafka instance:

            python kafkadump.py list --host=<kafkahost>:9092

        Dump the contents of a single topic starting from offset 0:

            python kafkadump.py dump test.crawled_firehose --host=<kafkahost>:9092

        Use CTRL+C (SIGINT, KeyboardInterrupt) to stop it from polling Kafka.
        It will end by printing the total records serviced and the raw output
        of the most recent record.

    Options:
        -h --host <host>            Kafka host name where Kafka cluster will be resolved
        -c --consumer <consumer>    Consumer group ID to use for reading messages
    """
    args = docopt(main.__doc__)
    host = args["--host"]

    logging.basicConfig()

    print "=> Connecting to {0}...".format(host)
    kafka = KafkaClient(host)
    print "=> Connected."

    if args["list"]:
        for topic in kafka.topic_partitions.keys():
            print topic
        return 0
    elif args["dump"]:
        topic = args["<topic>"]
        consumer_id = args["--consumer"] or "default"
        consumer = SimpleConsumer(kafka, consumer_id, topic,
                            buffer_size=1024*100,      # 100kb
                            fetch_size_bytes=1024*100, # 100kb
                            max_buffer_size=None       # eliminate big message errors
                            )
        consumer.seek(0, 0)
        num_records = 0
        total_bytes = 0
        item = None
        while True:
            try:
                message = consumer.get_message()
                if message is None:
                    time.sleep(1)
                    continue
                val = message.message.value
                item = json.loads(val)
                body_bytes = len(item)
                print item
                num_records = num_records + 1
                total_bytes = total_bytes + body_bytes
            except:
                traceback.print_exc()
                break
        total_mbs = float(total_bytes) / (1024*1024)
        print
        if item is not None:
            print json.dumps(item, indent=4)
        if num_records == 0:
            num_records = 1
        print num_records, "records", total_mbs, "megabytes", (float(total_bytes) / num_records / 1024), "kb per msg"
        kafka.close()
        return 0
Author: WalnutATiie | Project: scrapy-cluster | Lines: 76 | Source file: kafkadump.py


Note: The kafka.consumer.SimpleConsumer.seek examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub/MSDocs. The code snippets were selected from open-source projects contributed by their respective developers, and the source code copyright belongs to the original authors. Please refer to each project's License before redistributing or using the code; do not reproduce without permission.