本文整理匯總了Python中airflow.hooks.hive_hooks.HiveCliHook類的典型用法代碼示例。如果您正苦於以下問題:Python HiveCliHook類的具體用法?Python HiveCliHook怎麽用?Python HiveCliHook使用的例子?那麽, 這裏精選的類代碼示例或許可以為您提供幫助。
在下文中一共展示了HiveCliHook類的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。
示例1: execute
def execute(self, context):
hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
logging.info("Extracting data from Hive")
hive_table = "druid." + context["task_instance_key_str"].replace(".", "_")
sql = self.sql.strip().strip(";")
hql = """\
set mapred.output.compress=false;
set hive.exec.compress.output=false;
DROP TABLE IF EXISTS {hive_table};
CREATE TABLE {hive_table}
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
TBLPROPERTIES ('serialization.null.format' = '')
AS
{sql}
""".format(
**locals()
)
logging.info("Running command:\n {}".format(hql))
hive.run_cli(hql)
m = HiveMetastoreHook(self.metastore_conn_id)
t = m.get_table(hive_table)
columns = [col.name for col in t.sd.cols]
hdfs_uri = m.get_table(hive_table).sd.location
pos = hdfs_uri.find("/user")
static_path = hdfs_uri[pos:]
schema, table = hive_table.split(".")
druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)
logging.info("Inserting rows into Druid")
logging.info("HDFS path: " + static_path)
try:
druid.load_from_hdfs(
datasource=self.druid_datasource,
intervals=self.intervals,
static_path=static_path,
ts_dim=self.ts_dim,
columns=columns,
num_shards=self.num_shards,
target_partition_size=self.target_partition_size,
query_granularity=self.query_granularity,
segment_granularity=self.segment_granularity,
metric_spec=self.metric_spec,
hadoop_dependency_coordinates=self.hadoop_dependency_coordinates,
)
logging.info("Load seems to have succeeded!")
finally:
logging.info("Cleaning up by dropping the temp " "Hive table {}".format(hive_table))
hql = "DROP TABLE IF EXISTS {}".format(hive_table)
hive.run_cli(hql)
示例2: execute
def execute(self, context):
hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
mssql = MsSqlHook(mssql_conn_id=self.mssql_conn_id)
logging.info("Dumping Microsoft SQL Server query results to local file")
conn = mssql.get_conn()
cursor = conn.cursor()
cursor.execute(self.sql)
with NamedTemporaryFile("w") as f:
csv_writer = csv.writer(f, delimiter=self.delimiter, encoding='utf-8')
field_dict = OrderedDict()
col_count = 0
for field in cursor.description:
col_count += 1
col_position = "Column{position}".format(position=col_count)
field_dict[col_position if field[0] == '' else field[0]] = self.type_map(field[1])
csv_writer.writerows(cursor)
f.flush()
cursor.close()
conn.close()
logging.info("Loading file into Hive")
hive.load_file(
f.name,
self.hive_table,
field_dict=field_dict,
create=self.create,
partition=self.partition,
delimiter=self.delimiter,
recreate=self.recreate)
示例3: test_run_cli_with_hive_conf
def test_run_cli_with_hive_conf(self):
hql = "set key;\n" \
"set airflow.ctx.dag_id;\nset airflow.ctx.dag_run_id;\n" \
"set airflow.ctx.task_id;\nset airflow.ctx.execution_date;\n"
dag_id_ctx_var_name = \
AIRFLOW_VAR_NAME_FORMAT_MAPPING['AIRFLOW_CONTEXT_DAG_ID']['env_var_format']
task_id_ctx_var_name = \
AIRFLOW_VAR_NAME_FORMAT_MAPPING['AIRFLOW_CONTEXT_TASK_ID']['env_var_format']
execution_date_ctx_var_name = \
AIRFLOW_VAR_NAME_FORMAT_MAPPING['AIRFLOW_CONTEXT_EXECUTION_DATE'][
'env_var_format']
dag_run_id_ctx_var_name = \
AIRFLOW_VAR_NAME_FORMAT_MAPPING['AIRFLOW_CONTEXT_DAG_RUN_ID'][
'env_var_format']
os.environ[dag_id_ctx_var_name] = 'test_dag_id'
os.environ[task_id_ctx_var_name] = 'test_task_id'
os.environ[execution_date_ctx_var_name] = 'test_execution_date'
os.environ[dag_run_id_ctx_var_name] = 'test_dag_run_id'
hook = HiveCliHook()
output = hook.run_cli(hql=hql, hive_conf={'key': 'value'})
self.assertIn('value', output)
self.assertIn('test_dag_id', output)
self.assertIn('test_task_id', output)
self.assertIn('test_execution_date', output)
self.assertIn('test_dag_run_id', output)
del os.environ[dag_id_ctx_var_name]
del os.environ[task_id_ctx_var_name]
del os.environ[execution_date_ctx_var_name]
del os.environ[dag_run_id_ctx_var_name]
示例4: test_load_df_with_data_types
def test_load_df_with_data_types(self, mock_run_cli):
d = OrderedDict()
d['b'] = [True]
d['i'] = [-1]
d['t'] = [1]
d['f'] = [0.0]
d['c'] = ['c']
d['M'] = [datetime.datetime(2018, 1, 1)]
d['O'] = [object()]
d['S'] = ['STRING'.encode('utf-8')]
d['U'] = ['STRING']
d['V'] = [None]
df = pd.DataFrame(d)
hook = HiveCliHook()
hook.load_df(df, 't')
query = """
CREATE TABLE IF NOT EXISTS t (
b BOOLEAN,
i BIGINT,
t BIGINT,
f DOUBLE,
c STRING,
M TIMESTAMP,
O STRING,
S STRING,
U STRING,
V STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS textfile
;
"""
assertEqualIgnoreMultipleSpaces(self, mock_run_cli.call_args_list[0][0][0], query)
示例5: execute
def execute(self, context):
hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)
self.log.info("Dumping MySQL query results to local file")
conn = mysql.get_conn()
cursor = conn.cursor()
cursor.execute(self.sql)
with NamedTemporaryFile("wb") as f:
csv_writer = csv.writer(f, delimiter=self.delimiter, encoding="utf-8")
field_dict = OrderedDict()
for field in cursor.description:
field_dict[field[0]] = self.type_map(field[1])
csv_writer.writerows(cursor)
f.flush()
cursor.close()
conn.close()
self.log.info("Loading file into Hive")
hive.load_file(
f.name,
self.hive_table,
field_dict=field_dict,
create=self.create,
partition=self.partition,
delimiter=self.delimiter,
recreate=self.recreate,
tblproperties=self.tblproperties)
示例6: execute
def execute(self, context):
hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
self.log.info("Extracting data from Hive")
hive_table = 'druid.' + context['task_instance_key_str'].replace('.', '_')
sql = self.sql.strip().strip(';')
tblproperties = ''.join([", '{}' = '{}'"
.format(k, v)
for k, v in self.hive_tblproperties.items()])
hql = """\
SET mapred.output.compress=false;
SET hive.exec.compress.output=false;
DROP TABLE IF EXISTS {hive_table};
CREATE TABLE {hive_table}
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
TBLPROPERTIES ('serialization.null.format' = ''{tblproperties})
AS
{sql}
""".format(hive_table=hive_table, tblproperties=tblproperties, sql=sql)
self.log.info("Running command:\n %s", hql)
hive.run_cli(hql)
m = HiveMetastoreHook(self.metastore_conn_id)
# Get the Hive table and extract the columns
t = m.get_table(hive_table)
columns = [col.name for col in t.sd.cols]
# Get the path on hdfs
static_path = m.get_table(hive_table).sd.location
schema, table = hive_table.split('.')
druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)
try:
index_spec = self.construct_ingest_query(
static_path=static_path,
columns=columns,
)
self.log.info("Inserting rows into Druid, hdfs path: %s", static_path)
druid.submit_indexing_job(index_spec)
self.log.info("Load seems to have succeeded!")
finally:
self.log.info(
"Cleaning up by dropping the temp Hive table %s",
hive_table
)
hql = "DROP TABLE IF EXISTS {}".format(hive_table)
hive.run_cli(hql)
示例7: test_load_file
def test_load_file(self, mock_run_cli):
filepath = "/path/to/input/file"
table = "output_table"
hook = HiveCliHook()
hook.load_file(filepath=filepath, table=table, create=False)
query = (
"LOAD DATA LOCAL INPATH '{filepath}' "
"OVERWRITE INTO TABLE {table} \n"
.format(filepath=filepath, table=table)
)
mock_run_cli.assert_called_with(query)
示例8: test_load_df_with_optional_parameters
def test_load_df_with_optional_parameters(self, mock_to_csv, mock_load_file):
hook = HiveCliHook()
b = (True, False)
for create, recreate in itertools.product(b, b):
mock_load_file.reset_mock()
hook.load_df(df=pd.DataFrame({"c": range(0, 10)}),
table="t",
create=create,
recreate=recreate)
mock_load_file.assert_called_once()
kwargs = mock_load_file.call_args[1]
self.assertEqual(kwargs["create"], create)
self.assertEqual(kwargs["recreate"], recreate)
示例9: execute
def execute(self, context):
self.hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
self.s3 = S3Hook(s3_conn_id=self.s3_conn_id)
logging.info("Downloading S3 file")
if self.wildcard_match:
if not self.s3.check_for_wildcard_key(self.s3_key):
raise AirflowException("No key matches {0}".format(self.s3_key))
s3_key_object = self.s3.get_wildcard_key(self.s3_key)
else:
if not self.s3.check_for_key(self.s3_key):
raise AirflowException(
"The key {0} does not exists".format(self.s3_key))
s3_key_object = self.s3.get_key(self.s3_key)
with NamedTemporaryFile("w") as f:
logging.info("Dumping S3 key {0} contents to local"
" file {1}".format(s3_key_object.key, f.name))
s3_key_object.get_contents_to_file(f)
f.flush()
self.s3.connection.close()
if not self.headers:
logging.info("Loading file into Hive")
self.hive.load_file(
f.name,
self.hive_table,
field_dict=self.field_dict,
create=self.create,
partition=self.partition,
delimiter=self.delimiter,
recreate=self.recreate)
else:
with open(f.name, 'r') as tmpf:
if self.check_headers:
header_l = tmpf.readline()
header_line = header_l.rstrip()
header_list = header_line.split(self.delimiter)
field_names = list(self.field_dict.keys())
test_field_match = [h1.lower() == h2.lower() for h1, h2
in zip(header_list, field_names)]
if not all(test_field_match):
logging.warning("Headers do not match field names"
"File headers:\n {header_list}\n"
"Field names: \n {field_names}\n"
"".format(**locals()))
raise AirflowException("Headers do not match the "
"field_dict keys")
with NamedTemporaryFile("w") as f_no_headers:
tmpf.seek(0)
next(tmpf)
for line in tmpf:
f_no_headers.write(line)
f_no_headers.flush()
logging.info("Loading file without headers into Hive")
self.hive.load_file(
f_no_headers.name,
self.hive_table,
field_dict=self.field_dict,
create=self.create,
partition=self.partition,
delimiter=self.delimiter,
recreate=self.recreate)
示例10: test_load_df
def test_load_df(self, mock_to_csv, mock_load_file):
df = pd.DataFrame({"c": ["foo", "bar", "baz"]})
table = "t"
delimiter = ","
encoding = "utf-8"
hook = HiveCliHook()
hook.load_df(df=df,
table=table,
delimiter=delimiter,
encoding=encoding)
mock_to_csv.assert_called_once()
kwargs = mock_to_csv.call_args[1]
self.assertEqual(kwargs["header"], False)
self.assertEqual(kwargs["index"], False)
self.assertEqual(kwargs["sep"], delimiter)
mock_load_file.assert_called_once()
kwargs = mock_load_file.call_args[1]
self.assertEqual(kwargs["delimiter"], delimiter)
self.assertEqual(kwargs["field_dict"], {"c": u"STRING"})
self.assertTrue(isinstance(kwargs["field_dict"], OrderedDict))
self.assertEqual(kwargs["table"], table)
示例11: S3ToHiveTransfer
class S3ToHiveTransfer(BaseOperator):
"""
Moves data from S3 to Hive. The operator downloads a file from S3,
stores the file locally before loading it into a Hive table.
If the ``create`` or ``recreate`` arguments are set to ``True``,
a ``CREATE TABLE`` and ``DROP TABLE`` statements are generated.
Hive data types are inferred from the cursor's metadata from.
Note that the table generated in Hive uses ``STORED AS textfile``
which isn't the most efficient serialization format. If a
large amount of data is loaded and/or if the tables gets
queried considerably, you may want to use this operator only to
stage the data into a temporary table before loading it into its
final destination using a ``HiveOperator``.
:param s3_key: The key to be retrieved from S3
:type s3_key: str
:param field_dict: A dictionary of the fields name in the file
as keys and their Hive types as values
:type field_dict: dict
:param hive_table: target Hive table, use dot notation to target a
specific database
:type hive_table: str
:param create: whether to create the table if it doesn't exist
:type create: bool
:param recreate: whether to drop and recreate the table at every
execution
:type recreate: bool
:param partition: target partition as a dict of partition columns
and values
:type partition: dict
:param headers: whether the file contains column names on the first
line
:type headers: bool
:param check_headers: whether the column names on the first line should be
checked against the keys of field_dict
:type check_headers: bool
:param wildcard_match: whether the s3_key should be interpreted as a Unix
wildcard pattern
:type wildcard_match: bool
:param delimiter: field delimiter in the file
:type delimiter: str
:param aws_conn_id: source s3 connection
:type aws_conn_id: str
:param hive_cli_conn_id: destination hive connection
:type hive_cli_conn_id: str
:param input_compressed: Boolean to determine if file decompression is
required to process headers
:type input_compressed: bool
:param tblproperties: TBLPROPERTIES of the hive table being created
:type tblproperties: dict
"""
template_fields = ('s3_key', 'partition', 'hive_table')
template_ext = ()
ui_color = '#a0e08c'
@apply_defaults
def __init__(
self,
s3_key,
field_dict,
hive_table,
delimiter=',',
create=True,
recreate=False,
partition=None,
headers=False,
check_headers=False,
wildcard_match=False,
aws_conn_id='aws_default',
hive_cli_conn_id='hive_cli_default',
input_compressed=False,
tblproperties=None,
*args, **kwargs):
super(S3ToHiveTransfer, self).__init__(*args, **kwargs)
self.s3_key = s3_key
self.field_dict = field_dict
self.hive_table = hive_table
self.delimiter = delimiter
self.create = create
self.recreate = recreate
self.partition = partition
self.headers = headers
self.check_headers = check_headers
self.wildcard_match = wildcard_match
self.hive_cli_conn_id = hive_cli_conn_id
self.aws_conn_id = aws_conn_id
self.input_compressed = input_compressed
self.tblproperties = tblproperties
if (self.check_headers and
not (self.field_dict is not None and self.headers)):
raise AirflowException("To check_headers provide " +
"field_dict and headers")
def execute(self, context):
# Downloading file from S3
self.s3 = S3Hook(aws_conn_id=self.aws_conn_id)
self.hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
#.........這裏部分代碼省略.........
示例12: execute
def execute(self, context):
# Downloading file from S3
self.s3 = S3Hook(aws_conn_id=self.aws_conn_id)
self.hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
self.log.info("Downloading S3 file")
if self.wildcard_match:
if not self.s3.check_for_wildcard_key(self.s3_key):
raise AirflowException("No key matches {0}"
.format(self.s3_key))
s3_key_object = self.s3.get_wildcard_key(self.s3_key)
else:
if not self.s3.check_for_key(self.s3_key):
raise AirflowException(
"The key {0} does not exists".format(self.s3_key))
s3_key_object = self.s3.get_key(self.s3_key)
root, file_ext = os.path.splitext(s3_key_object.key)
with TemporaryDirectory(prefix='tmps32hive_') as tmp_dir,\
NamedTemporaryFile(mode="wb",
dir=tmp_dir,
suffix=file_ext) as f:
self.log.info("Dumping S3 key {0} contents to local file {1}"
.format(s3_key_object.key, f.name))
s3_key_object.download_fileobj(f)
f.flush()
if not self.headers:
self.log.info("Loading file %s into Hive", f.name)
self.hive.load_file(
f.name,
self.hive_table,
field_dict=self.field_dict,
create=self.create,
partition=self.partition,
delimiter=self.delimiter,
recreate=self.recreate,
tblproperties=self.tblproperties)
else:
# Decompressing file
if self.input_compressed:
self.log.info("Uncompressing file %s", f.name)
fn_uncompressed = uncompress_file(f.name,
file_ext,
tmp_dir)
self.log.info("Uncompressed to %s", fn_uncompressed)
# uncompressed file available now so deleting
# compressed file to save disk space
f.close()
else:
fn_uncompressed = f.name
# Testing if header matches field_dict
if self.check_headers:
self.log.info("Matching file header against field_dict")
header_list = self._get_top_row_as_list(fn_uncompressed)
if not self._match_headers(header_list):
raise AirflowException("Header check failed")
# Deleting top header row
self.log.info("Removing header from file %s", fn_uncompressed)
headless_file = (
self._delete_top_row_and_compress(fn_uncompressed,
file_ext,
tmp_dir))
self.log.info("Headless file %s", headless_file)
self.log.info("Loading file %s into Hive", headless_file)
self.hive.load_file(headless_file,
self.hive_table,
field_dict=self.field_dict,
create=self.create,
partition=self.partition,
delimiter=self.delimiter,
recreate=self.recreate,
tblproperties=self.tblproperties)
示例13: ddl
def ddl(self):
table = request.args.get("table")
sql = "SHOW CREATE TABLE {table};".format(table=table)
h = HiveCliHook(HIVE_CLI_CONN_ID)
return h.run_cli(sql)
示例14: execute
def execute(self, context):
# Downloading file from S3
self.s3 = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
self.hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
self.log.info("Downloading S3 file")
if self.wildcard_match:
if not self.s3.check_for_wildcard_key(self.s3_key):
raise AirflowException("No key matches {0}"
.format(self.s3_key))
s3_key_object = self.s3.get_wildcard_key(self.s3_key)
else:
if not self.s3.check_for_key(self.s3_key):
raise AirflowException(
"The key {0} does not exists".format(self.s3_key))
s3_key_object = self.s3.get_key(self.s3_key)
root, file_ext = os.path.splitext(s3_key_object.key)
if (self.select_expression and self.input_compressed and
file_ext.lower() != '.gz'):
raise AirflowException("GZIP is the only compression " +
"format Amazon S3 Select supports")
with TemporaryDirectory(prefix='tmps32hive_') as tmp_dir,\
NamedTemporaryFile(mode="wb",
dir=tmp_dir,
suffix=file_ext) as f:
self.log.info(
"Dumping S3 key %s contents to local file %s", s3_key_object.key, f.name
)
if self.select_expression:
option = {}
if self.headers:
option['FileHeaderInfo'] = 'USE'
if self.delimiter:
option['FieldDelimiter'] = self.delimiter
input_serialization = {'CSV': option}
if self.input_compressed:
input_serialization['CompressionType'] = 'GZIP'
content = self.s3.select_key(
bucket_name=s3_key_object.bucket_name,
key=s3_key_object.key,
expression=self.select_expression,
input_serialization=input_serialization
)
f.write(content.encode("utf-8"))
else:
s3_key_object.download_fileobj(f)
f.flush()
if self.select_expression or not self.headers:
self.log.info("Loading file %s into Hive", f.name)
self.hive.load_file(
f.name,
self.hive_table,
field_dict=self.field_dict,
create=self.create,
partition=self.partition,
delimiter=self.delimiter,
recreate=self.recreate,
tblproperties=self.tblproperties)
else:
# Decompressing file
if self.input_compressed:
self.log.info("Uncompressing file %s", f.name)
fn_uncompressed = uncompress_file(f.name,
file_ext,
tmp_dir)
self.log.info("Uncompressed to %s", fn_uncompressed)
# uncompressed file available now so deleting
# compressed file to save disk space
f.close()
else:
fn_uncompressed = f.name
# Testing if header matches field_dict
if self.check_headers:
self.log.info("Matching file header against field_dict")
header_list = self._get_top_row_as_list(fn_uncompressed)
if not self._match_headers(header_list):
raise AirflowException("Header check failed")
# Deleting top header row
self.log.info("Removing header from file %s", fn_uncompressed)
headless_file = (
self._delete_top_row_and_compress(fn_uncompressed,
file_ext,
tmp_dir))
self.log.info("Headless file %s", headless_file)
self.log.info("Loading file %s into Hive", headless_file)
self.hive.load_file(headless_file,
self.hive_table,
field_dict=self.field_dict,
create=self.create,
partition=self.partition,
delimiter=self.delimiter,
recreate=self.recreate,
tblproperties=self.tblproperties)
示例15: test_run_cli
def test_run_cli(self):
hook = HiveCliHook()
hook.run_cli("SHOW DATABASES")