This article collects typical usage examples of the Python class airflow.hooks.hive_hooks.HiveMetastoreHook. If you have been wondering what HiveMetastoreHook is for and how to use it, the curated examples below should help.
The following 15 code examples of the HiveMetastoreHook class are shown, sorted by popularity by default.
Example 1: execute
def execute(self, context):
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    logging.info("Extracting data from Hive")
    # Stage the query results into a temporary, uncompressed text table
    # whose name is derived from the task instance.
    hive_table = "druid." + context["task_instance_key_str"].replace(".", "_")
    sql = self.sql.strip().strip(";")
    hql = """\
    set mapred.output.compress=false;
    set hive.exec.compress.output=false;
    DROP TABLE IF EXISTS {hive_table};
    CREATE TABLE {hive_table}
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE
    TBLPROPERTIES ('serialization.null.format' = '')
    AS
    {sql}
    """.format(**locals())
    logging.info("Running command:\n {}".format(hql))
    hive.run_cli(hql)
    # Look up the staged table in the metastore to get its columns
    # and HDFS location.
    m = HiveMetastoreHook(self.metastore_conn_id)
    t = m.get_table(hive_table)
    columns = [col.name for col in t.sd.cols]
    hdfs_uri = t.sd.location
    pos = hdfs_uri.find("/user")
    static_path = hdfs_uri[pos:]
    schema, table = hive_table.split(".")
    druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)
    logging.info("Inserting rows into Druid")
    logging.info("HDFS path: " + static_path)
    try:
        druid.load_from_hdfs(
            datasource=self.druid_datasource,
            intervals=self.intervals,
            static_path=static_path,
            ts_dim=self.ts_dim,
            columns=columns,
            num_shards=self.num_shards,
            target_partition_size=self.target_partition_size,
            query_granularity=self.query_granularity,
            segment_granularity=self.segment_granularity,
            metric_spec=self.metric_spec,
            hadoop_dependency_coordinates=self.hadoop_dependency_coordinates,
        )
        logging.info("Load seems to have succeeded!")
    finally:
        # Always drop the temporary staging table, even if the load fails.
        logging.info("Cleaning up by dropping the temp Hive table {}".format(hive_table))
        hql = "DROP TABLE IF EXISTS {}".format(hive_table)
        hive.run_cli(hql)
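This execute method appears to come from an operator like Airflow 1.x's HiveToDruidTransfer. Assuming that operator, a minimal DAG usage sketch might look like the following; the task id, SQL, datasource, metric spec, connection ids, and the dag object are illustrative assumptions, not values from the example above.

# Hypothetical usage sketch (Airflow 1.x import path assumed).
from airflow.operators.hive_to_druid import HiveToDruidTransfer

load_druid = HiveToDruidTransfer(
    task_id='hive_to_druid',
    sql="SELECT ds, name, state, num FROM airflow.static_babynames",
    druid_datasource='babynames',
    ts_dim='ds',  # column used as the Druid timestamp dimension
    intervals=['2015-01-01/2015-01-02'],
    metric_spec=[{'name': 'count', 'type': 'count'}],
    hive_cli_conn_id='hive_cli_default',
    druid_ingest_conn_id='druid_ingest_default',
    metastore_conn_id='metastore_default',
    dag=dag,  # an existing DAG object is assumed
)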
Example 2: max_partition
def max_partition(
        table, schema="default", field=None, filter_map=None,
        metastore_conn_id='metastore_default'):
    """
    Gets the max partition for a table.

    :param schema: The hive schema the table lives in
    :type schema: str
    :param table: The hive table you are interested in; supports the dot
        notation as in "my_database.my_table". If a dot is found,
        the schema param is disregarded.
    :type table: str
    :param metastore_conn_id: The hive connection you are interested in.
        If your default is set you don't need to use this parameter.
    :type metastore_conn_id: str
    :param filter_map: partition_key:partition_value map used for partition
        filtering, e.g. {'key1': 'value1', 'key2': 'value2'}.
        Only partitions matching all partition_key:partition_value
        pairs will be considered as candidates of max partition.
    :type filter_map: map
    :param field: the field to get the max value from. If there's only
        one partition field, this will be inferred.
    :type field: str

    >>> max_partition('airflow.static_babynames_partitioned')
    '2015-01-01'
    """
    from airflow.hooks.hive_hooks import HiveMetastoreHook
    if '.' in table:
        schema, table = table.split('.')
    hh = HiveMetastoreHook(metastore_conn_id=metastore_conn_id)
    return hh.max_partition(
        schema=schema, table_name=table, field=field, filter_map=filter_map)
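In Airflow templates this helper is exposed as macros.hive.max_partition, so it is usually consumed from a templated field rather than called directly. A minimal sketch follows; the table name and the surrounding query are assumptions for illustration.

# Sketch: referencing the macro above from a templated HQL string.
templated_hql = """
    SELECT COUNT(*)
    FROM airflow.static_babynames_partitioned
    WHERE ds = '{{ macros.hive.max_partition("airflow.static_babynames_partitioned") }}'
"""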
Example 3: test_get_max_partition_from_valid_part_specs_and_invalid_filter_map
def test_get_max_partition_from_valid_part_specs_and_invalid_filter_map(self):
    with self.assertRaises(AirflowException):
        HiveMetastoreHook._get_max_partition_from_part_specs(
            [{'key1': 'value1', 'key2': 'value2'},
             {'key1': 'value3', 'key2': 'value4'}],
            'key1',
            {'key3': 'value5'})
Example 4: max_partition
def max_partition(
        table, schema="default", field=None, filter=None,
        metastore_conn_id='metastore_default'):
    '''
    Gets the max partition for a table.

    :param schema: The hive schema the table lives in
    :type schema: string
    :param table: The hive table you are interested in; supports the dot
        notation as in "my_database.my_table". If a dot is found,
        the schema param is disregarded.
    :type table: string
    :param metastore_conn_id: The hive connection you are interested in.
        If your default is set you don't need to use this parameter.
    :type metastore_conn_id: string
    :param filter: filter on a subset of partition as in
        `sub_part='specific_value'`
    :type filter: string
    :param field: the field to get the max value from. If there's only
        one partition field, this will be inferred.

    >>> max_partition('airflow.static_babynames_partitioned')
    '2015-01-01'
    '''
    from airflow.hooks.hive_hooks import HiveMetastoreHook
    if '.' in table:
        schema, table = table.split('.')
    hh = HiveMetastoreHook(metastore_conn_id=metastore_conn_id)
    return hh.max_partition(
        schema=schema, table_name=table, field=field, filter=filter)
Example 5: test_get_max_partition_from_valid_part_specs_and_none_partition_key
def test_get_max_partition_from_valid_part_specs_and_none_partition_key(self):
    with self.assertRaises(AirflowException):
        HiveMetastoreHook._get_max_partition_from_part_specs(
            [{'key1': 'value1', 'key2': 'value2'},
             {'key1': 'value3', 'key2': 'value4'}],
            None,
            self.VALID_FILTER_MAP)
Example 6: table
def table(self):
    table_name = request.args.get("table")
    m = HiveMetastoreHook(METASTORE_CONN_ID)
    table = m.get_table(table_name)
    return self.render(
        "metastore_browser/table.html",
        table=table, table_name=table_name, datetime=datetime, int=int)
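Outside a Flask view like the one above, the same hook can be used directly to inspect a table. A minimal sketch, assuming the default metastore connection and an illustrative table name:

# Minimal sketch of direct HiveMetastoreHook usage; the connection id
# and table name are assumptions.
from airflow.hooks.hive_hooks import HiveMetastoreHook

hook = HiveMetastoreHook(metastore_conn_id='metastore_default')
t = hook.get_table('airflow.static_babynames_partitioned')
print(t.sd.location)                      # HDFS location of the table
print([col.name for col in t.sd.cols])    # column names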
Example 7: execute
def execute(self, context):
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    self.log.info("Extracting data from Hive")
    hive_table = 'druid.' + context['task_instance_key_str'].replace('.', '_')
    sql = self.sql.strip().strip(';')
    # Render any extra TBLPROPERTIES configured on the operator.
    tblproperties = ''.join([", '{}' = '{}'".format(k, v)
                             for k, v in self.hive_tblproperties.items()])
    hql = """\
    SET mapred.output.compress=false;
    SET hive.exec.compress.output=false;
    DROP TABLE IF EXISTS {hive_table};
    CREATE TABLE {hive_table}
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE
    TBLPROPERTIES ('serialization.null.format' = ''{tblproperties})
    AS
    {sql}
    """.format(hive_table=hive_table, tblproperties=tblproperties, sql=sql)
    self.log.info("Running command:\n %s", hql)
    hive.run_cli(hql)
    m = HiveMetastoreHook(self.metastore_conn_id)
    # Get the Hive table and extract the columns
    t = m.get_table(hive_table)
    columns = [col.name for col in t.sd.cols]
    # Get the path on hdfs
    static_path = t.sd.location
    schema, table = hive_table.split('.')
    druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)
    try:
        index_spec = self.construct_ingest_query(
            static_path=static_path,
            columns=columns,
        )
        self.log.info("Inserting rows into Druid, hdfs path: %s", static_path)
        druid.submit_indexing_job(index_spec)
        self.log.info("Load seems to have succeeded!")
    finally:
        self.log.info(
            "Cleaning up by dropping the temp Hive table %s",
            hive_table
        )
        hql = "DROP TABLE IF EXISTS {}".format(hive_table)
        hive.run_cli(hql)
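Compared with Example 1, this newer variant builds the Druid index spec on the client with construct_ingest_query and posts it via submit_indexing_job instead of delegating to load_from_hdfs; it also uses the staged table's HDFS location as-is rather than truncating it at "/user", and supports extra TBLPROPERTIES on the staging table.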
Example 8: test_get_max_partition_from_mal_valid_part_names
def test_get_max_partition_from_mal_valid_part_names(self):
    max_partition = HiveMetastoreHook._get_max_partition_from_part_names(
        ['some_key=value1', 'some_key=value2', 'some_key=value3'],
        'some_key')
    self.assertEqual(max_partition, 'value3')
Example 9: test_get_max_partition_from_valid_part_specs
def test_get_max_partition_from_valid_part_specs(self):
    max_partition = HiveMetastoreHook._get_max_partition_from_part_specs(
        [{'key1': 'value1', 'key2': 'value2'},
         {'key1': 'value3', 'key2': 'value4'}],
        'key1',
        self.VALID_FILTER_MAP)
    self.assertEqual(max_partition, b'value1')
Example 10: test_get_max_partition_from_valid_part_specs_and_none_filter_map
def test_get_max_partition_from_valid_part_specs_and_none_filter_map(self):
    max_partition = HiveMetastoreHook._get_max_partition_from_part_specs(
        [{'key1': 'value1', 'key2': 'value2'},
         {'key1': 'value3', 'key2': 'value4'}],
        'key1',
        None)
    # With no filter map, no partition is filtered out.
    self.assertEqual(max_partition, b'value3')
Example 11: poke_partition
def poke_partition(self, partition):
    if not self.hook:
        # Create the metastore hook lazily, on first poke.
        from airflow.hooks.hive_hooks import HiveMetastoreHook
        self.hook = HiveMetastoreHook(
            metastore_conn_id=self.metastore_conn_id)
    schema, table, partition = self.parse_partition_name(partition)
    self.log.info('Poking for %s.%s/%s', schema, table, partition)
    return self.hook.check_for_named_partition(
        schema, table, partition)
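This poke_partition method appears to belong to a sensor like Airflow's NamedHivePartitionSensor, which takes fully qualified partition names. A hypothetical usage sketch, assuming the Airflow 1.10 import path; the task id, partition name, and dag object are assumptions:

# Hypothetical DAG usage of a named-partition sensor.
from airflow.sensors.named_hive_partition_sensor import NamedHivePartitionSensor

wait_for_partition = NamedHivePartitionSensor(
    task_id='wait_for_partition',
    partition_names=['airflow.static_babynames_partitioned/ds={{ ds }}'],
    metastore_conn_id='metastore_default',
    dag=dag,  # an existing DAG object is assumed
)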
Example 12: poke
def poke(self, context):
    if '.' in self.table:
        # Dot notation overrides the configured schema.
        self.schema, self.table = self.table.split('.')
    self.log.info(
        'Poking for table {self.schema}.{self.table}, '
        'partition {self.partition}'.format(**locals()))
    if not hasattr(self, 'hook'):
        from airflow.hooks.hive_hooks import HiveMetastoreHook
        self.hook = HiveMetastoreHook(
            metastore_conn_id=self.metastore_conn_id)
    return self.hook.check_for_partition(
        self.schema, self.table, self.partition)
Example 13: poke
def poke(self, context):
    if '.' in self.table:
        self.schema, self.table = self.table.split('.')
    self.log.info(
        'Poking for table %s.%s, partition %s', self.schema, self.table, self.partition
    )
    if not hasattr(self, 'hook'):
        from airflow.hooks.hive_hooks import HiveMetastoreHook
        self.hook = HiveMetastoreHook(
            metastore_conn_id=self.metastore_conn_id)
    return self.hook.check_for_partition(
        self.schema, self.table, self.partition)
Example 14: HivePartitionSensor
class HivePartitionSensor(BaseSensorOperator):
    """
    Waits for a partition to show up in Hive.

    Note: Because ``partition`` supports general logical operators, it
    can be inefficient. Consider using NamedHivePartitionSensor instead if
    you don't need the full flexibility of HivePartitionSensor.

    :param table: The name of the table to wait for; supports the dot
        notation (my_database.my_table)
    :type table: string
    :param partition: The partition clause to wait for. This is passed as-is
        to the metastore Thrift client ``get_partitions_by_filter`` method,
        and apparently supports SQL-like notation as in ``ds='2015-01-01'
        AND type='value'`` and comparison operators as in ``"ds>=2015-01-01"``
    :type partition: string
    :param metastore_conn_id: reference to the metastore thrift service
        connection id
    :type metastore_conn_id: str
    """
    template_fields = ('schema', 'table', 'partition',)
    ui_color = '#C5CAE9'

    @apply_defaults
    def __init__(
            self,
            table, partition="ds='{{ ds }}'",
            metastore_conn_id='metastore_default',
            schema='default',
            poke_interval=60 * 3,
            *args, **kwargs):
        super(HivePartitionSensor, self).__init__(
            poke_interval=poke_interval, *args, **kwargs)
        if not partition:
            partition = "ds='{{ ds }}'"
        self.metastore_conn_id = metastore_conn_id
        self.table = table
        self.partition = partition
        self.schema = schema

    def poke(self, context):
        if '.' in self.table:
            self.schema, self.table = self.table.split('.')
        self.log.info(
            'Poking for table {self.schema}.{self.table}, '
            'partition {self.partition}'.format(**locals()))
        if not hasattr(self, 'hook'):
            from airflow.hooks.hive_hooks import HiveMetastoreHook
            self.hook = HiveMetastoreHook(
                metastore_conn_id=self.metastore_conn_id)
        return self.hook.check_for_partition(
            self.schema, self.table, self.partition)
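Since the class itself is shown above, instantiating it is straightforward. A minimal sketch; the task id, table, partition clause, and dag object are assumptions for illustration:

# Hypothetical instantiation of the HivePartitionSensor defined above.
wait_for_ds = HivePartitionSensor(
    task_id='wait_for_ds',
    table='airflow.static_babynames_partitioned',
    partition="ds='{{ ds }}' AND state='CA'",  # general filter clauses allowed
    metastore_conn_id='metastore_default',
    dag=dag,  # an existing DAG object is assumed
)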
Example 15: poke_partition
def poke_partition(self, partition):
    if not self.hook:
        from airflow.hooks.hive_hooks import HiveMetastoreHook
        self.hook = HiveMetastoreHook(
            metastore_conn_id=self.metastore_conn_id)
    schema, table, partition = self.parse_partition_name(partition)
    self.log.info(
        'Poking for {schema}.{table}/{partition}'.format(**locals())
    )
    return self.hook.check_for_named_partition(
        schema, table, partition)