本文整理汇总了Python中happybase.Connection.close方法的典型用法代码示例。如果您正苦于以下问题:Python Connection.close方法的具体用法?Python Connection.close怎么用?Python Connection.close使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类happybase.Connection
的用法示例。
在下文中一共展示了Connection.close方法的4个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: put_data_into_hbase
# 需要导入模块: from happybase import Connection [as 别名]
# 或者: from happybase.Connection import close [as 别名]
def put_data_into_hbase(rdd):
    """Store aggregated tweet counts from *rdd* into the HBase table
    ``base_tweets``, using the current timestamp as the row key.

    Each element of the RDD is assumed to be a ``(label, count)`` pair where
    label ``0`` marks negative tweets and any other label positive ones
    — TODO confirm against the producing job.
    """
    # Collect results to the driver; assumes the aggregated result is small.
    results = rdd.collect()
    # Second-resolution timestamp ("YYYY-MM-DD HH:MM:SS") as the row id.
    date = str(datetime.datetime.now())[:19]
    # Connect to the local HBase Thrift server.
    connection = Connection(host='localhost', port=9090, autoconnect=True)
    try:
        table = connection.table(name='base_tweets')
        for data in results:
            # Label 0 -> negative counter column, anything else -> positive.
            column = 'tweet_count:neg' if data[0] == 0 else 'tweet_count:pos'
            table.put(row=date, data={column: str(data[1])})
    finally:
        # Bug fix: the original leaked the Thrift connection whenever a put
        # raised; always close it.
        connection.close()
示例2: HBaseBackend
# 需要导入模块: from happybase import Connection [as 别名]
# 或者: from happybase.Connection import close [as 别名]
class HBaseBackend(Backend):
component_name = 'HBase Backend'
def __init__(self, manager):
    """Connect to HBase over Thrift and build the backend's collaborators:
    queue, state checker, metadata table and write batch.

    Drops and/or creates the metadata table according to the
    ``HBASE_DROP_ALL_TABLES`` setting.
    """
    self.manager = manager
    settings = manager.settings
    port = settings.get('HBASE_THRIFT_PORT', 9090)
    hosts = settings.get('HBASE_THRIFT_HOST', 'localhost')
    namespace = settings.get('HBASE_NAMESPACE', 'crawler')
    drop_all_tables = settings.get('HBASE_DROP_ALL_TABLES', False)
    self.queue_partitions = settings.get('HBASE_QUEUE_PARTITIONS', 4)
    self._table_name = settings.get('HBASE_METADATA_TABLE', 'metadata')
    # HBASE_THRIFT_HOST may be one host or a list/tuple; pick one at random.
    host = choice(hosts) if type(hosts) in [list, tuple] else hosts
    self.connection = Connection(host=host, port=int(port), table_prefix=namespace, table_prefix_separator=':')
    # protocol='compact', transport='framed'
    self.queue = HBaseQueue(self.connection, self.queue_partitions, self.manager.logger.backend,
                            drop=drop_all_tables)
    self.state_checker = HBaseState(self.connection, self._table_name)
    tables = set(self.connection.tables())
    if drop_all_tables and self._table_name in tables:
        self.connection.delete_table(self._table_name, disable=True)
        tables.remove(self._table_name)
    if self._table_name not in tables:
        # Column families: 'm', 's' (hot state, cached in memory), 'c'.
        self.connection.create_table(self._table_name, {'m': {'max_versions': 5},  # 'compression': 'SNAPPY'
                                                        's': {'max_versions': 1, 'block_cache_enabled': 1,
                                                              'bloom_filter_type': 'ROW', 'in_memory': True, },
                                                        'c': {'max_versions': 1}
                                                        })
    table = self.connection.table(self._table_name)
    # Mutations are buffered and auto-flushed every 9216 puts.
    self.batch = table.batch(batch_size=9216)
@classmethod
def from_manager(cls, manager):
    """Frontier factory hook: build the backend from a frontier manager."""
    return cls(manager)
def frontier_start(self):
    # Nothing to do: all setup already happened in __init__.
    pass
def frontier_stop(self):
    """Flush pending batch mutations, then close the Thrift connection.

    Bug fix: the original called ``close()`` before ``flush()``, so the
    final batch flush was issued over an already-closed connection and
    its buffered mutations were lost.
    """
    self.flush()
    self.connection.close()
def add_seeds(self, seeds):
    """Register seed requests in the metadata batch, keyed by the
    hex-decoded canonical URL fingerprint."""
    for seed in seeds:
        url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(seed)
        obj = prepare_hbase_object(url=url,
                                   depth=0,
                                   created_at=utcnow_timestamp(),
                                   domain_fingerprint=domain['fingerprint'])
        # Row key is the binary form of the hex fingerprint string.
        self.batch.put(unhexlify(fingerprint), obj)
def page_crawled(self, response, links):
    """Record a crawled response (status code + body) and stub entries for
    every extracted link."""
    url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(response)
    obj = prepare_hbase_object(status_code=response.status_code, content=response.body)
    # Deduplicate links by binary fingerprint before writing.
    links_dict = dict()
    for link in links:
        link_url, link_fingerprint, link_domain = self.manager.canonicalsolver.get_canonical_url(link)
        links_dict[unhexlify(link_fingerprint)] = (link, link_url, link_domain)
    self.batch.put(unhexlify(fingerprint), obj)
    # NOTE: dict.iteritems() means this code targets Python 2.
    for link_fingerprint, (link, link_url, link_domain) in links_dict.iteritems():
        obj = prepare_hbase_object(url=link_url,
                                   created_at=utcnow_timestamp(),
                                   domain_fingerprint=link_domain['fingerprint'])
        self.batch.put(link_fingerprint, obj)
def request_error(self, request, error):
    """Record a failed request together with its error description."""
    url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(request)
    obj = prepare_hbase_object(url=request.url,
                               created_at=utcnow_timestamp(),
                               error=error,
                               domain_fingerprint=domain['fingerprint'])
    # NOTE(review): the row key comes from request.meta, not the canonical
    # `fingerprint` computed above (which is unused) — confirm intended.
    rk = unhexlify(request.meta['fingerprint'])
    self.batch.put(rk, obj)
def get_next_requests(self, max_next_requests, **kwargs):
    """Pull up to *max_next_requests* queued URLs per requested partition
    and wrap each in a request-model object carrying its fingerprint and
    score in ``meta``."""
    next_pages = []
    log = self.manager.logger.backend
    log.debug("Querying queue table.")
    partitions = set(kwargs.pop('partitions', []))
    for partition_id in range(0, self.queue_partitions):
        # Serve only partitions explicitly requested by the caller.
        if partition_id not in partitions:
            continue
        results = self.queue.get(partition_id, max_next_requests,
                                 min_hosts=24, max_requests_per_host=128)
        log.debug("Got %d items for partition id %d" % (len(results), partition_id))
        for fingerprint, url, score in results:
            r = self.manager.request_model(url=url)
            r.meta['fingerprint'] = fingerprint
            r.meta['score'] = score
            next_pages.append(r)
    return next_pages
#.........这里部分代码省略.........
示例3: HBaseBackend
# 需要导入模块: from happybase import Connection [as 别名]
# 或者: from happybase.Connection import close [as 别名]
class HBaseBackend(DistributedBackend):
component_name = 'HBase Backend'
def __init__(self, manager):
    """Connect to HBase over Thrift; the metadata/queue/states components
    stay ``None`` until attached by ``strategy_worker`` or ``db_worker``."""
    self.manager = manager
    self.logger = logging.getLogger("hbase.backend")
    settings = manager.settings
    port = settings.get('HBASE_THRIFT_PORT')
    hosts = settings.get('HBASE_THRIFT_HOST')
    namespace = settings.get('HBASE_NAMESPACE')
    self._min_requests = settings.get('BC_MIN_REQUESTS')
    self._min_hosts = settings.get('BC_MIN_HOSTS')
    self._max_requests_per_host = settings.get('BC_MAX_REQUESTS_PER_HOST')
    self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS')
    # HBASE_THRIFT_HOST may be one host or a list/tuple; pick one at random.
    host = choice(hosts) if type(hosts) in [list, tuple] else hosts
    kwargs = {
        'host': host,
        'port': int(port),
        'table_prefix': namespace,
        'table_prefix_separator': ':'
    }
    if settings.get('HBASE_USE_FRAMED_COMPACT'):
        # Needed when the Thrift server uses framed transport / compact protocol.
        kwargs.update({
            'protocol': 'compact',
            'transport': 'framed'
        })
    self.connection = Connection(**kwargs)
    # Role-specific components, populated by the factory classmethods below.
    self._metadata = None
    self._queue = None
    self._states = None
@classmethod
def strategy_worker(cls, manager):
    """Factory: backend configured for the strategy worker (states only)."""
    o = cls(manager)
    settings = manager.settings
    o._states = HBaseState(o.connection, settings.get('HBASE_METADATA_TABLE'),
                           settings.get('HBASE_STATE_CACHE_SIZE_LIMIT'))
    return o
@classmethod
def db_worker(cls, manager):
    """Factory: backend configured for the DB worker (queue + metadata)."""
    o = cls(manager)
    settings = manager.settings
    drop_all_tables = settings.get('HBASE_DROP_ALL_TABLES')
    o._queue = HBaseQueue(o.connection, o.queue_partitions,
                          settings.get('HBASE_QUEUE_TABLE'), drop=drop_all_tables)
    o._metadata = HBaseMetadata(o.connection, settings.get('HBASE_METADATA_TABLE'), drop_all_tables,
                                settings.get('HBASE_USE_SNAPPY'), settings.get('HBASE_BATCH_SIZE'),
                                settings.get('STORE_CONTENT'))
    return o
@property
def metadata(self):
    # Metadata component; None unless built via db_worker().
    return self._metadata

@property
def queue(self):
    # Queue component; None unless built via db_worker().
    return self._queue

@property
def states(self):
    # States component; None unless built via strategy_worker().
    return self._states
def frontier_start(self):
    """Propagate frontier start to every configured sub-component."""
    components = (self.metadata, self.queue, self.states)
    for component in components:
        # Skip components that were never attached for this worker role.
        if component:
            component.frontier_start()
def frontier_stop(self):
    """Stop all configured sub-components first, then close the shared
    Thrift connection last (components may still write during stop)."""
    for component in [self.metadata, self.queue, self.states]:
        if component:
            component.frontier_stop()
    self.connection.close()
def add_seeds(self, seeds):
    # Delegates to the metadata component (present in the db_worker role).
    self.metadata.add_seeds(seeds)

def page_crawled(self, response):
    self.metadata.page_crawled(response)

def links_extracted(self, request, links):
    self.metadata.links_extracted(request, links)

def request_error(self, page, error):
    self.metadata.request_error(page, error)

def finished(self):
    # A distributed crawl never "finishes" on its own.
    raise NotImplementedError
def get_next_requests(self, max_next_requests, **kwargs):
next_pages = []
self.logger.debug("Querying queue table.")
partitions = set(kwargs.pop('partitions', []))
for partition_id in range(0, self.queue_partitions):
if partition_id not in partitions:
continue
results = self.queue.get_next_requests(max_next_requests, partition_id,
min_requests=self._min_requests,
min_hosts=self._min_hosts,
#.........这里部分代码省略.........
示例4: HBaseBackend
# 需要导入模块: from happybase import Connection [as 别名]
# 或者: from happybase.Connection import close [as 别名]
class HBaseBackend(Backend):
component_name = "HBase Backend"
def __init__(self, manager):
    """Connect to HBase over Thrift and build queue, state checker, write
    batch and the metadata table (dropping/creating it as configured)."""
    self.manager = manager
    settings = manager.settings
    port = settings.get("HBASE_THRIFT_PORT")
    hosts = settings.get("HBASE_THRIFT_HOST")
    namespace = settings.get("HBASE_NAMESPACE")
    drop_all_tables = settings.get("HBASE_DROP_ALL_TABLES")
    self.queue_partitions = settings.get("HBASE_QUEUE_PARTITIONS")
    self._table_name = settings.get("HBASE_METADATA_TABLE")
    # HBASE_THRIFT_HOST may be one host or a list/tuple; pick one at random.
    host = choice(hosts) if type(hosts) in [list, tuple] else hosts
    kwargs = {"host": host, "port": int(port), "table_prefix": namespace, "table_prefix_separator": ":"}
    if settings.get("HBASE_USE_COMPACT_PROTOCOL"):
        # Needed when the Thrift server uses framed transport / compact protocol.
        kwargs.update({"protocol": "compact", "transport": "framed"})
    self.connection = Connection(**kwargs)
    self.queue = HBaseQueue(
        self.connection,
        self.queue_partitions,
        self.manager.logger.backend,
        settings.get("HBASE_QUEUE_TABLE"),
        drop=drop_all_tables,
    )
    self.state_checker = HBaseState(
        self.connection, self._table_name, self.manager.logger.backend, settings.get("HBASE_STATE_CACHE_SIZE_LIMIT")
    )
    tables = set(self.connection.tables())
    if drop_all_tables and self._table_name in tables:
        self.connection.delete_table(self._table_name, disable=True)
        tables.remove(self._table_name)
    if self._table_name not in tables:
        # Column families: 'm', 's' (hot state, cached in memory), 'c'.
        schema = {
            "m": {"max_versions": 1},
            "s": {"max_versions": 1, "block_cache_enabled": 1, "bloom_filter_type": "ROW", "in_memory": True},
            "c": {"max_versions": 1},
        }
        if settings.get("HBASE_USE_SNAPPY"):
            schema["m"]["compression"] = "SNAPPY"
            schema["c"]["compression"] = "SNAPPY"
        self.connection.create_table(self._table_name, schema)
    table = self.connection.table(self._table_name)
    self.batch = table.batch(batch_size=settings.get("HBASE_BATCH_SIZE"))
    # Whether page bodies are written alongside status codes.
    self.store_content = settings.get("HBASE_STORE_CONTENT")
@classmethod
def from_manager(cls, manager):
    """Frontier factory hook: build the backend from a frontier manager."""
    return cls(manager)
def frontier_start(self):
    # Nothing to do: all setup already happened in __init__.
    pass
def frontier_stop(self):
    """Flush pending batch mutations, then close the Thrift connection.

    Bug fix: the original called ``close()`` before ``flush()``, so the
    final batch flush was issued over an already-closed connection and
    its buffered mutations were lost.
    """
    self.flush()
    self.connection.close()
def add_seeds(self, seeds):
    """Register seed requests in the metadata batch, keyed by the
    hex-decoded canonical URL fingerprint."""
    for seed in seeds:
        url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(seed)
        obj = prepare_hbase_object(
            url=url, depth=0, created_at=utcnow_timestamp(), domain_fingerprint=domain["fingerprint"]
        )
        # Row key is the binary form of the hex fingerprint string.
        self.batch.put(unhexlify(fingerprint), obj)
def page_crawled(self, response, links):
    """Record a crawled response (body only when store_content is enabled)
    and stub entries for every extracted link."""
    url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(response)
    obj = (
        prepare_hbase_object(status_code=response.status_code, content=response.body)
        if self.store_content
        else prepare_hbase_object(status_code=response.status_code)
    )
    # Deduplicate links by binary fingerprint before writing.
    links_dict = dict()
    for link in links:
        link_url, link_fingerprint, link_domain = self.manager.canonicalsolver.get_canonical_url(link)
        links_dict[unhexlify(link_fingerprint)] = (link, link_url, link_domain)
    self.batch.put(unhexlify(fingerprint), obj)
    # NOTE: dict.iteritems() means this code targets Python 2.
    for link_fingerprint, (link, link_url, link_domain) in links_dict.iteritems():
        obj = prepare_hbase_object(
            url=link_url, created_at=utcnow_timestamp(), domain_fingerprint=link_domain["fingerprint"]
        )
        self.batch.put(link_fingerprint, obj)
def request_error(self, request, error):
    """Record a failed request together with its error description."""
    url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(request)
    obj = prepare_hbase_object(
        url=request.url, created_at=utcnow_timestamp(), error=error, domain_fingerprint=domain["fingerprint"]
    )
    # NOTE(review): the row key comes from request.meta, not the canonical
    # `fingerprint` computed above (which is unused) — confirm intended.
    rk = unhexlify(request.meta["fingerprint"])
    self.batch.put(rk, obj)
def get_next_requests(self, max_next_requests, **kwargs):
next_pages = []
log = self.manager.logger.backend
log.debug("Querying queue table.")
partitions = set(kwargs.pop("partitions", []))
for partition_id in range(0, self.queue_partitions):
if partition_id not in partitions:
continue
#.........这里部分代码省略.........