This article collects typical usage examples of the Python method airflow.hooks.hive_hooks.HiveCliHook.load_file. If you are wondering what HiveCliHook.load_file does, how to call it, or what working examples look like, the curated snippets below should help. You can also explore further usage examples of its containing class, airflow.hooks.hive_hooks.HiveCliHook.
The following shows 5 code examples of the HiveCliHook.load_file method, sorted by popularity by default.
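Before the examples, here is a minimal sketch of calling load_file directly. It is illustrative only: the connection id hive_cli_default matches the default used in the examples below, while the file path, table name, and column types are made-up placeholders.
from collections import OrderedDict

from airflow.hooks.hive_hooks import HiveCliHook

# Minimal sketch: load a local delimited file into a Hive table.
hook = HiveCliHook(hive_cli_conn_id="hive_cli_default")
hook.load_file(
    "/tmp/users.csv",           # local file to load (placeholder path)
    "staging.users",            # target Hive table (placeholder name)
    field_dict=OrderedDict([    # column name -> Hive type, in file order
        ("id", "BIGINT"),
        ("name", "STRING"),
    ]),
    delimiter=",",
    create=True,                # create the table first if it does not exist
    recreate=False,             # set True to drop and re-create the table
)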
Example 1: execute
# Required import: from airflow.hooks.hive_hooks import HiveCliHook [as an alias]
# Or: from airflow.hooks.hive_hooks.HiveCliHook import load_file [as an alias]
def execute(self, context):
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    mssql = MsSqlHook(mssql_conn_id=self.mssql_conn_id)

    logging.info("Dumping Microsoft SQL Server query results to local file")
    conn = mssql.get_conn()
    cursor = conn.cursor()
    cursor.execute(self.sql)
    with NamedTemporaryFile("w") as f:
        # ``csv`` here is the ``unicodecsv`` package, imported as ``csv`` in
        # the original operator module, which is why ``encoding`` is accepted.
        csv_writer = csv.writer(f, delimiter=self.delimiter, encoding='utf-8')
        field_dict = OrderedDict()
        col_count = 0
        for field in cursor.description:
            col_count += 1
            col_position = "Column{position}".format(position=col_count)
            field_dict[col_position if field[0] == '' else field[0]] = \
                self.type_map(field[1])
        csv_writer.writerows(cursor)
        f.flush()
        cursor.close()
        conn.close()
        logging.info("Loading file into Hive")
        hive.load_file(
            f.name,
            self.hive_table,
            field_dict=field_dict,
            create=self.create,
            partition=self.partition,
            delimiter=self.delimiter,
            recreate=self.recreate)
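Example 1 is the execute method of an MsSQL-to-Hive transfer operator. Below is a hedged sketch of how such an operator might be wired into a DAG; the import path and class name MsSqlToHiveTransfer are assumptions based on older Airflow 1.x releases, and the connection ids, SQL, and table name are placeholders.
from datetime import datetime

from airflow import DAG
from airflow.operators.mssql_to_hive import MsSqlToHiveTransfer  # assumed 1.x import path

with DAG(dag_id="mssql_to_hive_example",
         start_date=datetime(2023, 1, 1),
         schedule_interval=None) as dag:
    load_orders = MsSqlToHiveTransfer(
        task_id="load_orders",
        sql="SELECT id, amount FROM dbo.orders",   # placeholder query
        hive_table="staging.orders",               # placeholder target table
        mssql_conn_id="mssql_default",
        hive_cli_conn_id="hive_cli_default",
        create=True,
        recreate=False,
    )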
Example 2: execute
# Required import: from airflow.hooks.hive_hooks import HiveCliHook [as an alias]
# Or: from airflow.hooks.hive_hooks.HiveCliHook import load_file [as an alias]
def execute(self, context):
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)

    self.log.info("Dumping MySQL query results to local file")
    conn = mysql.get_conn()
    cursor = conn.cursor()
    cursor.execute(self.sql)
    with NamedTemporaryFile("wb") as f:
        # As in Example 1, ``csv`` is ``unicodecsv`` aliased as ``csv`` in the
        # original operator module, hence the ``encoding`` keyword.
        csv_writer = csv.writer(f, delimiter=self.delimiter, encoding="utf-8")
        field_dict = OrderedDict()
        for field in cursor.description:
            field_dict[field[0]] = self.type_map(field[1])
        csv_writer.writerows(cursor)
        f.flush()
        cursor.close()
        conn.close()
        self.log.info("Loading file into Hive")
        hive.load_file(
            f.name,
            self.hive_table,
            field_dict=field_dict,
            create=self.create,
            partition=self.partition,
            delimiter=self.delimiter,
            recreate=self.recreate,
            tblproperties=self.tblproperties)
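Example 2 also forwards partition and tblproperties to load_file. The following hedged sketch shows what such a call looks like when made directly; the file path, table name, partition spec, and property values are placeholders, and the comment only roughly describes the HiveQL the hook generates.
from collections import OrderedDict

from airflow.hooks.hive_hooks import HiveCliHook

hook = HiveCliHook(hive_cli_conn_id="hive_cli_default")
# Roughly: with create=True the hook first emits a CREATE TABLE IF NOT EXISTS
# statement (including PARTITIONED BY and TBLPROPERTIES clauses), then a
# LOAD DATA LOCAL INPATH ... OVERWRITE INTO TABLE ... PARTITION (...) statement.
hook.load_file(
    "/tmp/events.csv",
    "staging.events",
    field_dict=OrderedDict([("event_id", "BIGINT"), ("payload", "STRING")]),
    delimiter=",",
    create=True,
    partition={"ds": "2023-01-01"},                  # partition column -> value
    tblproperties={"comment": "loaded by airflow"},  # key -> value pairs
)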
Example 3: test_load_file
# Required import: from airflow.hooks.hive_hooks import HiveCliHook [as an alias]
# Or: from airflow.hooks.hive_hooks.HiveCliHook import load_file [as an alias]
def test_load_file(self, mock_run_cli):
    filepath = "/path/to/input/file"
    table = "output_table"
    hook = HiveCliHook()
    hook.load_file(filepath=filepath, table=table, create=False)
    query = (
        "LOAD DATA LOCAL INPATH '{filepath}' "
        "OVERWRITE INTO TABLE {table} \n"
        .format(filepath=filepath, table=table)
    )
    mock_run_cli.assert_called_with(query)
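The mock_run_cli argument in Example 3 is injected by a patch decorator that this excerpt omits. Below is a hedged sketch of that surrounding scaffolding; the test class name and the exact patch target are assumptions, chosen because load_file ultimately hands its generated HiveQL to HiveCliHook.run_cli.
import unittest
from unittest import mock

from airflow.hooks.hive_hooks import HiveCliHook


class TestHiveCliHook(unittest.TestCase):  # class name is an assumption

    # Patch run_cli so no real Hive CLI is needed; the mock is passed to the
    # test method as ``mock_run_cli`` and the generated HiveQL can be asserted.
    @mock.patch('airflow.hooks.hive_hooks.HiveCliHook.run_cli')
    def test_load_file(self, mock_run_cli):
        hook = HiveCliHook()
        hook.load_file(filepath="/path/to/input/file",
                       table="output_table",
                       create=False)
        mock_run_cli.assert_called_once()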
Example 4: S3ToHiveTransfer
# Required import: from airflow.hooks.hive_hooks import HiveCliHook [as an alias]
# Or: from airflow.hooks.hive_hooks.HiveCliHook import load_file [as an alias]
#......... part of the code is omitted here .........
        self.hive_cli_conn_id = hive_cli_conn_id
        self.aws_conn_id = aws_conn_id
        self.input_compressed = input_compressed
        self.tblproperties = tblproperties

        if (self.check_headers and
                not (self.field_dict is not None and self.headers)):
            raise AirflowException("To check_headers provide " +
                                   "field_dict and headers")

    def execute(self, context):
        # Downloading file from S3
        self.s3 = S3Hook(aws_conn_id=self.aws_conn_id)
        self.hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        self.log.info("Downloading S3 file")

        if self.wildcard_match:
            if not self.s3.check_for_wildcard_key(self.s3_key):
                raise AirflowException("No key matches {0}"
                                       .format(self.s3_key))
            s3_key_object = self.s3.get_wildcard_key(self.s3_key)
        else:
            if not self.s3.check_for_key(self.s3_key):
                raise AirflowException(
                    "The key {0} does not exist".format(self.s3_key))
            s3_key_object = self.s3.get_key(self.s3_key)

        root, file_ext = os.path.splitext(s3_key_object.key)
        with TemporaryDirectory(prefix='tmps32hive_') as tmp_dir,\
                NamedTemporaryFile(mode="wb",
                                   dir=tmp_dir,
                                   suffix=file_ext) as f:
            self.log.info("Dumping S3 key {0} contents to local file {1}"
                          .format(s3_key_object.key, f.name))
            s3_key_object.download_fileobj(f)
            f.flush()

            if not self.headers:
                self.log.info("Loading file %s into Hive", f.name)
                self.hive.load_file(
                    f.name,
                    self.hive_table,
                    field_dict=self.field_dict,
                    create=self.create,
                    partition=self.partition,
                    delimiter=self.delimiter,
                    recreate=self.recreate,
                    tblproperties=self.tblproperties)
            else:
                # Decompressing file
                if self.input_compressed:
                    self.log.info("Uncompressing file %s", f.name)
                    fn_uncompressed = uncompress_file(f.name,
                                                      file_ext,
                                                      tmp_dir)
                    self.log.info("Uncompressed to %s", fn_uncompressed)
                    # uncompressed file available now so deleting
                    # compressed file to save disk space
                    f.close()
                else:
                    fn_uncompressed = f.name

                # Testing if header matches field_dict
                if self.check_headers:
                    self.log.info("Matching file header against field_dict")
                    header_list = self._get_top_row_as_list(fn_uncompressed)
                    if not self._match_headers(header_list):
                        raise AirflowException("Header check failed")
Example 5: S3ToHiveTransfer
# Required import: from airflow.hooks.hive_hooks import HiveCliHook [as an alias]
# Or: from airflow.hooks.hive_hooks.HiveCliHook import load_file [as an alias]
#......... part of the code is omitted here .........
    :param s3_conn_id: source s3 connection
    :type s3_conn_id: str
    :param hive_conn_id: destination hive connection
    :type hive_conn_id: str
    """

    template_fields = ('s3_key', 'partition', 'hive_table')
    template_ext = ()
    ui_color = '#a0e08c'

    @apply_defaults
    def __init__(
            self,
            s3_key,
            field_dict,
            hive_table,
            delimiter=',',
            create=True,
            recreate=False,
            partition=None,
            headers=False,
            check_headers=False,
            wildcard_match=False,
            s3_conn_id='s3_default',
            hive_cli_conn_id='hive_cli_default',
            *args, **kwargs):
        super(S3ToHiveTransfer, self).__init__(*args, **kwargs)
        self.s3_key = s3_key
        self.field_dict = field_dict
        self.hive_table = hive_table
        self.delimiter = delimiter
        self.create = create
        self.recreate = recreate
        self.partition = partition
        self.headers = headers
        self.check_headers = check_headers
        self.wildcard_match = wildcard_match
        self.hive_cli_conn_id = hive_cli_conn_id
        self.s3_conn_id = s3_conn_id

    def execute(self, context):
        self.hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        self.s3 = S3Hook(s3_conn_id=self.s3_conn_id)
        logging.info("Downloading S3 file")
        if self.wildcard_match:
            if not self.s3.check_for_wildcard_key(self.s3_key):
                raise AirflowException("No key matches {0}".format(self.s3_key))
            s3_key_object = self.s3.get_wildcard_key(self.s3_key)
        else:
            if not self.s3.check_for_key(self.s3_key):
                raise AirflowException(
                    "The key {0} does not exist".format(self.s3_key))
            s3_key_object = self.s3.get_key(self.s3_key)
        with NamedTemporaryFile("w") as f:
            logging.info("Dumping S3 key {0} contents to local"
                         " file {1}".format(s3_key_object.key, f.name))
            s3_key_object.get_contents_to_file(f)
            f.flush()
            self.s3.connection.close()
            if not self.headers:
                logging.info("Loading file into Hive")
                self.hive.load_file(
                    f.name,
                    self.hive_table,
                    field_dict=self.field_dict,
                    create=self.create,
                    partition=self.partition,
                    delimiter=self.delimiter,
                    recreate=self.recreate)
            else:
                with open(f.name, 'r') as tmpf:
                    if self.check_headers:
                        header_l = tmpf.readline()
                        header_line = header_l.rstrip()
                        header_list = header_line.split(self.delimiter)
                        field_names = list(self.field_dict.keys())
                        test_field_match = [h1.lower() == h2.lower()
                                            for h1, h2
                                            in zip(header_list, field_names)]
                        if not all(test_field_match):
                            logging.warning("Headers do not match field names. "
                                            "File headers:\n {header_list}\n"
                                            "Field names: \n {field_names}\n"
                                            "".format(**locals()))
                            raise AirflowException("Headers do not match the "
                                                   "field_dict keys")
                    # Strip the header row before handing the file to Hive.
                    with NamedTemporaryFile("w") as f_no_headers:
                        tmpf.seek(0)
                        next(tmpf)
                        for line in tmpf:
                            f_no_headers.write(line)
                        f_no_headers.flush()
                        logging.info("Loading file without headers into Hive")
                        self.hive.load_file(
                            f_no_headers.name,
                            self.hive_table,
                            field_dict=self.field_dict,
                            create=self.create,
                            partition=self.partition,
                            delimiter=self.delimiter,
                            recreate=self.recreate)