This article collects typical usage examples of the DbUtil.copy_dict_cursor method in Python. If you are wondering what exactly DbUtil.copy_dict_cursor does, how to call it, or where to find working examples of it, the curated code samples below should help. You can also explore the other methods of the DbUtil module that copy_dict_cursor belongs to.
Ten code examples of DbUtil.copy_dict_cursor are shown below, sorted by popularity by default.
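The examples never show DbUtil itself, only how copy_dict_cursor is called: always right after cursor.execute(...), to capture the full result set as a list of dictionaries keyed by column name so the same cursor can be re-executed for nested queries without losing the outer rows. A minimal sketch of such a helper, assuming a plain DB-API cursor (the real DbUtil implementation may differ), might look like this:

def copy_dict_cursor(cursor):
    # Assumed behavior: snapshot whatever the cursor currently holds into an
    # independent list of dicts keyed by column name.
    columns = [col[0] for col in cursor.description]
    return [dict(zip(columns, row)) for row in cursor.fetchall()]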
Example 1: update_flow_jobs
# Required module import: import DbUtil [as alias]
# Or: from DbUtil import copy_dict_cursor [as alias]
def update_flow_jobs(self, last_unixtime=None):
if last_unixtime:
flow_sql = """
SELECT a.app_code, f.* FROM flow f JOIN cfg_application a on f.app_id = a.app_id
WHERE from_unixtime(modified_time) >= DATE_SUB(from_unixtime(%f), INTERVAL 1 HOUR)
""" % last_unixtime
else:
flow_sql = """
SELECT a.app_code, f.* FROM flow f JOIN cfg_application a on f.app_id = a.app_id
"""
job_sql = """
SELECT * FROM flow_job WHERE app_id = %d and flow_id = %d
"""
url = self.elasticsearch_index_url + ':' + str(self.elasticsearch_port) + '/wherehows/flow_jobs/_bulk'
params = []
self.wh_cursor.execute(flow_sql)
rows = DbUtil.copy_dict_cursor(self.wh_cursor)
row_count = 1
for row in rows:
self.wh_cursor.execute(job_sql %(long(row['app_id']), long(row['flow_id'])))
jobs = []
job_rows = DbUtil.copy_dict_cursor(self.wh_cursor)
if job_rows:
for job_row in job_rows:
jobs.append({"app_id": job_row['app_id'], "flow_id": job_row['flow_id'], "job_id": job_row['job_id'],
"job_name": job_row['job_name'] if job_row['job_name'] else '',
"job_path": job_row['job_path'] if job_row['job_path'] else '',
"job_type_id": job_row['job_type_id'],
"job_type": job_row['job_type'] if job_row['job_type'] else '',
"pre_jobs": job_row['pre_jobs'] if job_row['pre_jobs'] else '',
"post_jobs": job_row['post_jobs'] if job_row['post_jobs'] else '',
"is_current": job_row['is_current'] if job_row['is_current'] else '',
"is_first": job_row['is_first'] if job_row['is_first'] else '',
"is_last": job_row['is_last'] if job_row['is_last'] else ''})
params.append('{ "index": { "_id": ' + str(long(row['flow_id'])*10000 + long(row['app_id'])) + ' }}')
if len(jobs) > 0:
params.append(
"""{"app_id": %d, "flow_id": %d, "app_code": "%s", "flow_name": "%s", "flow_group": "%s", "flow_path": "%s", "flow_level": %d, "is_active": "%s", "is_scheduled": "%s", "pre_flows": "%s", "jobs": %s}"""
% (row['app_id'], row['flow_id'], row['app_code'] if row['app_code'] else '',
row['flow_name'] if row['flow_name'] else '', row['flow_group'] if row['flow_group'] else '',
row['flow_path'] if row['flow_path'] else '', row['flow_level'],
row['is_active'] if row['is_active'] else '', row['is_scheduled'] if row['is_scheduled'] else '',
row['pre_flows'] if row['pre_flows'] else '', json.dumps(jobs)))
else:
params.append(
"""{"app_id": %d, "flow_id": %d, "app_code": "%s", "flow_name": "%s", "flow_group": "%s", "flow_path": "%s", "flow_level": %d, "is_active": "%s", "is_scheduled": "%s", "pre_flows": "%s", "jobs": ""}"""
% (row['app_id'], row['flow_id'], row['app_code'] if row['app_code'] else '',
row['flow_name'] if row['flow_name'] else '', row['flow_group'] if row['flow_group'] else '',
row['flow_path'] if row['flow_path'] else '', row['flow_level'],
row['is_active'] if row['is_active'] else '', row['is_scheduled'] if row['is_scheduled'] else '',
row['pre_flows'] if row['pre_flows'] else ''))
if row_count % 1000 == 0:
self.bulk_insert(params, url)
self.logger.info('flow jobs' + str(row_count))
params = []
row_count += 1
if len(params) > 0:
self.logger.info('flow_jobs' + str(len(params)))
self.bulk_insert(params, url)
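Each iteration above appends a pair of lines to params: an action line such as { "index": { "_id": ... }} followed by the document itself, which is the newline-delimited format the Elasticsearch _bulk endpoint expects. The self.bulk_insert helper is not shown in these examples; a minimal sketch of what it might do, assuming the requests library is available (the real implementation may differ), is:

import requests  # assumption: any HTTP client would work here

def bulk_insert(params, url):
    # The _bulk API takes newline-delimited JSON: one action line, then one
    # document line, repeated, terminated by a trailing newline.
    body = '\n'.join(params) + '\n'
    response = requests.post(url, data=body,
                             headers={'Content-Type': 'application/x-ndjson'})
    response.raise_for_status()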
Example 2: update_comment
# Required module import: import DbUtil [as alias]
# Or: from DbUtil import copy_dict_cursor [as alias]
def update_comment(self, last_time):
if last_time:
sql = """
SELECT * FROM comments WHERE modified >= DATE_SUB(%s, INTERVAL 1 HOUR)
""" % last_time
else:
sql = """
SELECT * FROM comments
"""
url = self.elasticsearch_index_url + ':' + str(self.elasticsearch_port) + '/wherehows/comment/_bulk'
params = []
self.wh_cursor.execute(sql)
rows = DbUtil.copy_dict_cursor(self.wh_cursor)
row_count = 1
for row in rows:
params.append('{ "index": { "_id": ' + str(row['id']) + ', "parent": ' + str(row['dataset_id']) + ' }}')
params.append(
"""{ "text": %s, "user_id": %d, "dataset_id": %d, "comment_type": "%s"}"""
% (json.dumps(row['text']) if row['text'] else json.dumps(''), row['user_id'] if row['user_id'] else 0,
row['dataset_id'] if row['dataset_id'] else 0, row['comment_type'] if row['comment_type'] else ''))
if row_count % 1000 == 0:
self.bulk_insert(params, url)
params = []
row_count += 1
if len(params) > 0:
self.bulk_insert(params, url)
Example 3: update_dataset
# Required module import: import DbUtil [as alias]
# Or: from DbUtil import copy_dict_cursor [as alias]
def update_dataset(self, last_unixtime):
if last_unixtime:
sql = """
SELECT * FROM dict_dataset WHERE from_unixtime(modified_time) >= DATE_SUB(from_unixtime(%f), INTERVAL 1 HOUR)
""" % last_unixtime
else:
sql = """
SELECT * FROM dict_dataset
"""
url = self.elasticsearch_index_url + ':' + str(self.elasticsearch_port) + '/wherehows/dataset/_bulk'
params = []
self.wh_cursor.execute(sql)
rows = DbUtil.copy_dict_cursor(self.wh_cursor)
row_count = 1
for row in rows:
params.append('{ "index": { "_id": ' + str(row['id']) + ' }}')
params.append(
"""{ "name": "%s", "source": "%s", "urn": "%s", "location_prefix": "%s", "parent_name": "%s",
"schema_type": "%s", "properties": %s, "schema": %s , "fields": %s}"""
% (row['name'] if row['name'] else '', row['source'] if row['source'] else '',
row['urn'] if row['urn'] else '', row['location_prefix'] if row['location_prefix'] else '',
row['parent_name'] if row['parent_name'] else '', row['schema_type'] if row['schema_type'] else '',
json.dumps(row['properties']) if row['properties'] else json.dumps(''),
json.dumps(row['schema']) if row['schema'] else json.dumps(''), json.dumps(row['fields']) if row['fields'] else json.dumps('')))
if row_count % 1000 == 0:
self.bulk_insert(params, url)
params = []
row_count += 1
if len(params) > 0:
self.bulk_insert(params, url)
Example 4: update_dataset_field
# Required module import: import DbUtil [as alias]
# Or: from DbUtil import copy_dict_cursor [as alias]
def update_dataset_field(self, last_time):
if last_time:
sql = """
SELECT * FROM dict_field_detail WHERE modified >= DATE_SUB(%s, INTERVAL 1 HOUR)
""" % last_time
else:
sql = """
SELECT * FROM dict_field_detail
"""
comment_query = """
SELECT d.field_id, d.dataset_id, f.comment FROM dict_dataset_field_comment d
LEFT JOIN field_comments f ON d.comment_id = f.id WHERE d.field_id = %d
"""
url = self.elasticsearch_index_url + ':' + str(self.elasticsearch_port) + '/wherehows/field/_bulk'
params = []
self.wh_cursor.execute(sql)
rows = DbUtil.copy_dict_cursor(self.wh_cursor)
row_count = 1
for row in rows:
self.wh_cursor.execute(comment_query % long(row['field_id']))
comments = []
comment_rows = DbUtil.copy_dict_cursor(self.wh_cursor)
for comment_row in comment_rows:
comments.append(comment_row['comment'])
params.append('{ "index": { "_id": ' +
str(row['field_id']) + ', "parent": ' + str(row['dataset_id']) + ' }}')
if len(comments) > 0:
params.append(
"""{ "comments": %s, "dataset_id": %d, "sort_id": %d, "field_name": "%s", "parent_path": "%s"}"""
% (json.dumps(comments) if comments else '', row['dataset_id'] if row['dataset_id'] else 0,
row['sort_id'] if row['sort_id'] else 0,
row['field_name'] if row['field_name'] else '', row['parent_path'] if row['parent_path'] else ''))
else:
params.append(
"""{ "comments": "", "dataset_id": %d, "sort_id": %d, "field_name": "%s", "parent_path": "%s"}"""
% (row['dataset_id'] if row['dataset_id'] else 0, row['sort_id'] if row['sort_id'] else 0,
row['field_name'] if row['field_name'] else '', row['parent_path'] if row['parent_path'] else ''))
if row_count % 1000 == 0:
self.bulk_insert(params, url)
params = []
row_count += 1
if len(params) > 0:
self.bulk_insert(params, url)
Example 5: update_metric
# Required module import: import DbUtil [as alias]
# Or: from DbUtil import copy_dict_cursor [as alias]
def update_metric(self):
sql = """
SELECT * FROM dict_business_metric
"""
url = self.elasticsearch_index_url + ':' + str(self.elasticsearch_port) + '/wherehows/metric/_bulk'
params = []
self.wh_cursor.execute(sql)
rows = DbUtil.copy_dict_cursor(self.wh_cursor)
row_count = 1
for row in rows:
params.append('{ "index": { "_id": ' + str(row['metric_id']) + ' }}')
params.append(
"""{"metric_id": %d, "metric_name": %s, "metric_description": %s, "dashboard_name": %s,
"metric_group": %s, "metric_category": %s, "metric_sub_category": %s, "metric_level": %s,
"metric_source_type": %s, "metric_source": %s, "metric_source_dataset_id": %d,
"metric_ref_id_type": %s, "metric_ref_id": %s, "metric_type": %s, "metric_additive_type": %s,
"metric_grain": %s, "metric_display_factor": %f, "metric_display_factor_sym": %s,
"metric_good_direction": %s, "metric_formula": %s, "dimensions": %s, "owners": %s, "tags": %s,
"urn": %s, "metric_url": %s, "wiki_url": %s, "scm_url": %s}"""
% (row['metric_id'], json.dumps(row['metric_name']) if row['metric_name'] else json.dumps(''),
json.dumps(row['metric_description']) if row['metric_description'] else json.dumps(''),
json.dumps(row['dashboard_name']) if row['dashboard_name'] else json.dumps(''),
json.dumps(row['metric_group']) if row['metric_group'] else json.dumps(''),
json.dumps(row['metric_category']) if row['metric_category'] else json.dumps(''),
json.dumps(row['metric_sub_category']) if row['metric_sub_category'] else json.dumps(''),
json.dumps(row['metric_level']) if row['metric_level'] else json.dumps(''),
json.dumps(row['metric_source_type']) if row['metric_source_type'] else json.dumps(''),
json.dumps(row['metric_source']) if row['metric_source'] else json.dumps(''),
row['metric_source_dataset_id'] if row['metric_source_dataset_id'] else 0,
json.dumps(row['metric_ref_id_type']) if row['metric_ref_id_type'] else json.dumps(''),
json.dumps(row['metric_ref_id']) if row['metric_ref_id'] else json.dumps(''),
json.dumps(row['metric_type']) if row['metric_type'] else json.dumps(''),
json.dumps(row['metric_additive_type']) if row['metric_additive_type'] else json.dumps(''),
json.dumps(row['metric_grain']) if row['metric_grain'] else json.dumps(''),
row['metric_display_factor'] if row['metric_display_factor'] else 0.0,
json.dumps(row['metric_display_factor_sym']) if row['metric_display_factor_sym'] else json.dumps(''),
json.dumps(row['metric_good_direction']) if row['metric_good_direction'] else json.dumps(''),
json.dumps(row['metric_formula']) if row['metric_formula'] else json.dumps(''),
json.dumps(row['dimensions']) if row['dimensions'] else json.dumps(''),
json.dumps(row['owners']) if row['owners'] else json.dumps(''),
json.dumps(row['tags']) if row['tags'] else json.dumps(''),
json.dumps(row['urn']) if row['urn'] else json.dumps(''),
json.dumps(row['metric_url']) if row['metric_url'] else json.dumps(''),
json.dumps(row['wiki_url']) if row['wiki_url'] else json.dumps(''),
json.dumps(row['scm_url']) if row['scm_url'] else json.dumps('')))
if row_count % 1000 == 0:
self.bulk_insert(params, url)
params = []
row_count += 1
if len(params) > 0:
self.bulk_insert(params, url)
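Unlike the earlier examples, this method consistently falls back to json.dumps('') rather than a bare empty string, which keeps the generated document valid JSON for the unquoted %s placeholders. An equivalent and less error-prone way to build the same kind of document (a sketch, not part of the original source) is to assemble a dict and serialize it once:

import json

def metric_doc(row):
    # Sketch only: a single json.dumps handles quoting and escaping of every
    # field, so no per-field json.dumps(...) or manual "%s" quoting is needed.
    doc = {
        'metric_id': row['metric_id'],
        'metric_name': row['metric_name'] or '',
        'metric_description': row['metric_description'] or '',
        'dashboard_name': row['dashboard_name'] or '',
        'metric_source_dataset_id': row['metric_source_dataset_id'] or 0,
        'metric_display_factor': row['metric_display_factor'] or 0.0,
        # ...remaining columns follow the same pattern...
    }
    return json.dumps(doc)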
Example 6: get_log_file_name
# Required module import: import DbUtil [as alias]
# Or: from DbUtil import copy_dict_cursor [as alias]
def get_log_file_name(self, module_name, days_offset=1):
if self.last_execution_unix_time:
query = \
"""select je.*, fj.job_type, fl.flow_path,
CONCAT('o', je.job_exec_id, '.', LPAD(je.attempt_id, 2, 0)) as log_file_name,
CONCAT('%s', DATE_FORMAT(FROM_UNIXTIME(je.end_time), '%%Y%%m%%d'), '/',
CONCAT('o', je.job_exec_id, '.', LPAD(je.attempt_id, 2, 0)), '.gz') as gzipped_file_name
from job_execution je
JOIN flow_job fj on je.app_id = fj.app_id and je.flow_id = fj.flow_id and fj.is_current = 'Y' and
je.job_id = fj.job_id
JOIN flow fl on fj.app_id = fl.app_id and fj.flow_id = fl.flow_id
WHERE je.app_id = %d
and je.end_time >= %d - 3660
and fj.job_type = '%s'
ORDER BY je.flow_exec_id DESC, je.job_exec_id
"""
self.logger.info(query % (self.aw_archive_dir, self.app_id, long(self.last_execution_unix_time), module_name))
self.aw_cursor.execute(query %
(self.aw_archive_dir, self.app_id, long(self.last_execution_unix_time), module_name))
else:
query = \
"""select je.*, fj.job_type, fl.flow_path,
CONCAT('o', je.job_exec_id, '.', LPAD(je.attempt_id, 2, 0)) as log_file_name,
CONCAT('%s', DATE_FORMAT(FROM_UNIXTIME(je.end_time), '%%Y%%m%%d'), '/',
CONCAT('o', je.job_exec_id, '.', LPAD(je.attempt_id, 2, 0)), '.gz') as gzipped_file_name
from job_execution je
JOIN flow_job fj on je.app_id = fj.app_id and je.flow_id = fj.flow_id and fj.is_current = 'Y' and
je.job_id = fj.job_id
JOIN flow fl on fj.app_id = fl.app_id and fj.flow_id = fl.flow_id
WHERE je.app_id = %d and je.job_exec_status in ('FINISHED', 'SUCCEEDED', 'OK')
and je.end_time >= unix_timestamp(CURRENT_DATE - INTERVAL %d DAY) and fj.job_type = '%s'
ORDER BY je.flow_exec_id DESC, je.job_exec_id
"""
self.logger.info(query % (self.aw_archive_dir, self.app_id, int(self.look_back_days), module_name))
self.aw_cursor.execute(query % (self.aw_archive_dir, self.app_id, int(self.look_back_days), module_name))
job_rows = DbUtil.copy_dict_cursor(self.aw_cursor)
self.logger.info("%d job executions will be scanned for lineage" % len(job_rows))
return job_rows
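Both branches of the query derive the log file locations purely from job_execution columns: the base name is 'o<job_exec_id>.<attempt_id zero-padded to two digits>' and the gzipped path prepends the archive directory plus the job's end date. The same convention, reconstructed in Python purely for illustration (the function name and arguments are assumptions, not part of the original class):

import time

def gzipped_file_name(archive_dir, job_exec_id, attempt_id, end_time):
    # Mirrors the SQL CONCAT / LPAD / DATE_FORMAT(FROM_UNIXTIME(...)) expressions above.
    date_part = time.strftime('%Y%m%d', time.localtime(end_time))
    log_file = 'o{0}.{1:02d}'.format(job_exec_id, int(attempt_id))
    return '{0}{1}/{2}.gz'.format(archive_dir, date_part, log_file)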
Example 7: process_li_shell_gw
# Required module import: import DbUtil [as alias]
# Or: from DbUtil import copy_dict_cursor [as alias]
def process_li_shell_gw(self):
self.logger.info("process li_shell_gw")
parameter_query = \
"""
SELECT param_value
FROM job_parameter
WHERE app_id = %d
AND job_exec_id = %d
AND attempt_number = %d
ORDER BY param_no
"""
rows = self.get_log_file_name(module_name='LI_SHELL_GW')
self.logger.info(str(len(rows)))
for row in rows:
self.aw_cursor.execute(parameter_query %
(int(row['app_id']), int(row['job_exec_id']), int(row['attempt_id'])))
arg_values = DbUtil.copy_dict_cursor(self.aw_cursor)
if arg_values and len(arg_values) > 0 and 'param_value' in arg_values[0]:
args = arg_values[0]['param_value'].replace('"','')
m = re.match("(.*?/trim.sh)\s+\d+\s+\d+", args)
if m is not None:
shell_script_name = os.path.basename(m.group(1))
shell_script_path = os.path.dirname(m.group(1))
app_path = shell_script_path.replace(self.remote_hadoop_script_dir,'')
local_shell_script_path = self.local_script_path + self.remote_script_path + app_path
self.logger.info(local_shell_script_path + '/' + shell_script_name)
shell_script_content = open(local_shell_script_path + '/' + shell_script_name).read()
pig_script = re.search(r'pig\s+\-f\s+(.*?)\s+',shell_script_content)
if pig_script is not None:
pig_script_name = os.path.basename(pig_script.group(1))
pig_script_path = os.path.dirname(pig_script.group(1))
# Compare md5 string of the file with the one parsed last time
md5_str = hashlib.md5()
md5_str.update(shell_script_content)
self.logger.info('Parsing log file: %s' % row['gzipped_file_name'])
src_tgt_map = PigLogParser(log_file_name=row['gzipped_file_name']).simple_parser()
if any(src_tgt_map) == 0:
self.logger.warn('Pig log parser fails to retrieve relevant information from file: %s. Most probably Pig script did not complete successfully' % row['gzipped_file_name'])
continue
self.logger.info(str(src_tgt_map))
db_id = self.db_lookup(src_tgt_map['cluster'])
self.clean_up_staging_lineage_dbs(row['app_id'], row['job_id'], row['job_exec_id'], row['attempt_id'])
update_source_code_query = \
"""
INSERT INTO job_attempt_source_code(
application_id, job_id, attempt_number, script_name, script_path, script_type, created_date)
VALUES( %d, %d, %d, '%s', '%s', '%s', now())
""" % (int(row['app_id']), int(row['job_id']), int(row['attempt_id']),
pig_script_name, pig_script_path, 'Pig')
self.aw_cursor.execute(update_source_code_query )
update_staging_lineage_query = \
"""
INSERT IGNORE INTO stg_job_execution_data_lineage (
app_id, flow_exec_id, job_exec_id, flow_path, job_name, job_start_unixtime, job_finished_unixtime,
db_id, abstracted_object_name, full_object_name, partition_start, partition_end, partition_type,
storage_type, source_target_type, srl_no, source_srl_no, operation, record_count, insert_count,
created_date, wh_etl_exec_id)
SELECT %d, %d, %d, '%s', '%s', %d, %d, %d, '%s', '%s',
CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END,
CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END,
CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END,
'%s', '%s', %d,
CASE WHEN %d = 0 THEN NULL ELSE %d END,
'%s', %d, %d, UNIX_TIMESTAMP(now()), %d
FROM dual
"""
srl = 0
detail_srl = 1
for k,v in src_tgt_map.items():
if k in ['cluster'] or k in ['hive-cluster']: continue
srl = srl + 1
self.aw_cursor.execute(update_staging_lineage_query %
(int(row['app_id']), int(row['flow_exec_id']), int(row['job_exec_id']),
row['flow_path'], row['job_name'], int(row['start_time']), int(row['end_time']),
db_id, src_tgt_map[k]['abstracted_hdfs_path'], None,
src_tgt_map[k]['min_start_partition'], src_tgt_map[k]['min_start_partition'],
src_tgt_map[k]['max_end_partition'] if src_tgt_map[k]['table_type'] != 'snapshot' else None,
src_tgt_map[k]['max_end_partition'] if src_tgt_map[k]['table_type'] != 'snapshot' else None,
src_tgt_map[k]['partition_type'], src_tgt_map[k]['partition_type'], 'HDFS',
src_tgt_map[k]['table_type'], int(srl), 0, 0,
'Load' if src_tgt_map[k]['table_type'] == 'source' else 'Store',
0, 0, int(row['wh_etl_exec_id']) ))
srl = srl + 1
self.aw_con.commit()
self.logger.debug('Completed writing metadata for: %s' % row['gzipped_file_name'])
else:
self.logger.error("Fails to get Pig script file name used in GW shell script: %s" % m.group(1))
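The shell-wrapper parsing above hinges on two regular expressions: the first pulls the trim.sh invocation out of the job parameters, the second finds the `pig -f <script>` call inside the downloaded shell script. A small, self-contained illustration of the second pattern (the sample script content below is made up for this example):

import os
import re

# Hypothetical shell script content, just to exercise the pattern used above.
shell_script_content = 'set -e\npig -f /jobs/foo/daily_agg.pig -p DT=20160101 \n'
pig_script = re.search(r'pig\s+\-f\s+(.*?)\s+', shell_script_content)
assert pig_script is not None
assert os.path.basename(pig_script.group(1)) == 'daily_agg.pig'
assert os.path.dirname(pig_script.group(1)) == '/jobs/foo'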
Example 8: process_li_getreplacemergegeneral
# Required module import: import DbUtil [as alias]
# Or: from DbUtil import copy_dict_cursor [as alias]
def process_li_getreplacemergegeneral(self, module_name):
self.logger.info("process %s" % module_name)
if module_name not in ['LI_GETREPLACEMERGEGENERAL', 'LINKEDIN_SHELL','LI_WEBHDFS_GET']: return
parameter_query = \
"""
SELECT GROUP_CONCAT(param_value,'\x01') args
FROM job_parameter
WHERE app_id = %d
AND job_exec_id = %d
AND attempt_number = %d
ORDER BY param_no
"""
rows = self.get_log_file_name(module_name=module_name)
for row in rows:
try:
self.logger.info('Parsing log file: %s' % row['gzipped_file_name'])
if module_name == 'LI_GETREPLACEMERGEGENERAL':
self.aw_cursor.execute(parameter_query %
(int(row['app_id']), int(row['job_exec_id']), int(row['attempt_id'])))
arg_values = DbUtil.copy_dict_cursor(self.aw_cursor)
if arg_values and len(arg_values) > 0:
args = arg_values[0]['args']
results = AppworxLogParser(log_file_name = row['gzipped_file_name']).parse_log({}, command_type=module_name)
if any(results) == 0 or not 'cluster' in results:
self.logger.info('Skipped parsing %s' % row['gzipped_file_name'])
continue
self.logger.info(str(results))
matched_cluster = re.match(r'(.*)_.*', results['cluster'])
if matched_cluster is not None:
results['cluster'] = matched_cluster.group(1)
db_id = int(self.db_lookup(results['cluster']))
self.logger.info(str(db_id))
self.clean_up_staging_lineage_dbs(row['app_id'], row['job_id'], row['job_exec_id'], row['attempt_id'])
update_source_code_query = \
"""
INSERT INTO job_attempt_source_code(
application_id, job_id, attempt_number, script_name, script_path, script_type, created_date)
VALUES( %d, %d, %d, '%s', '%s', '%s', now())
""" % (int(row['app_id']), int(row['job_id']), int(row['attempt_id']),
results['script_name'], results['script_path'],
results['script_type'] if module_name == 'LI_WEBHDFS_GET' else 'Shell')
update_staging_lineage_query = \
"""
INSERT IGNORE INTO stg_job_execution_data_lineage (
app_id, flow_exec_id, job_exec_id, flow_path, job_name, job_start_unixtime, job_finished_unixtime,
db_id, abstracted_object_name, full_object_name, partition_start, partition_end, partition_type,
storage_type, source_target_type, srl_no, source_srl_no, operation, record_count, insert_count,
created_date, wh_etl_exec_id)
SELECT %d, %d, %d, '%s', '%s', %d, %d, %d, '%s', '%s',
CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END,
CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END,
CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END,
'%s', '%s', %d,
CASE WHEN %d = 0 THEN NULL ELSE %d END,
'%s', %d, %d, UNIX_TIMESTAMP(now()), %d
FROM dual
"""
self.aw_cursor.execute(update_source_code_query )
self.aw_con.commit()
for k, tab in enumerate(results['table']):
self.aw_cursor.execute(update_staging_lineage_query %
(int(row['app_id']), int(row['flow_exec_id']), int(row['job_exec_id']),
row['flow_path'], row['job_name'], int(row['start_time']), int(row['end_time']),
db_id if tab['table_type'] == 'source' else 0, tab['abstracted_path'], tab['full_path'],
tab['start_partition'], tab['start_partition'],
tab['end_partition'] if tab['frequency'] != 'snapshot' else None,
tab['end_partition'] if tab['frequency'] != 'snapshot' else None,
tab['frequency'], tab['frequency'], tab['storage_type'],
tab['table_type'], k + 1, 0, 0,
'Hadoop Get' if tab['table_type'] == 'source' else 'Move',
0, 0, int(row['wh_etl_exec_id']) ))
self.aw_con.commit()
self.logger.debug('Completed processing metadata for log file: %s' % row['gzipped_file_name'])
except:
self.logger.error(str(sys.exc_info()[0]))
Example 9: process_li_bteq
# Required module import: import DbUtil [as alias]
# Or: from DbUtil import copy_dict_cursor [as alias]
def process_li_bteq(self):
self.logger.info("process li bteq")
bteq_rows = self.get_log_file_name(module_name='LI_BTEQ')
kfk_rows = self.get_log_file_name(module_name='KFK_SONORA_STG_TO_FACT')
rows = bteq_rows + kfk_rows
parameter_query = \
"""
SELECT param_value
FROM job_parameter
WHERE app_id = %d
AND job_exec_id = %d
AND attempt_number = %d
ORDER BY param_no
"""
check_parsed_source_code_query = \
"""
SELECT application_id, job_id, attempt_number
FROM job_attempt_source_code
WHERE script_name = '%s'
AND script_path = '%s'
AND script_md5_sum = CASE WHEN '%s' = 'None' THEN NULL ELSE UNHEX('%s') END
AND application_id = %d
ORDER BY job_id DESC
LIMIT 1
"""
check_parsed_lineage_query = \
"""
SELECT *
FROM stg_job_execution_data_lineage
WHERE app_id = %d
AND job_exec_id = %d
"""
update_staging_lineage_query = \
"""
INSERT IGNORE INTO stg_job_execution_data_lineage (
app_id, flow_exec_id, job_exec_id, flow_path, job_name, job_start_unixtime, job_finished_unixtime,
db_id, abstracted_object_name, full_object_name, partition_start, partition_end, partition_type,
storage_type, source_target_type, srl_no, source_srl_no, operation, record_count, insert_count,
created_date, wh_etl_exec_id)
SELECT %d, %d, %d, '%s', '%s', %d, %d, %d, '%s', '%s',
CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END,
CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END,
CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END,
'%s', '%s', %d,
CASE WHEN %d = 0 THEN NULL ELSE %d END,
'%s', %d, %d, UNIX_TIMESTAMP(now()), %d
FROM dual
"""
for row in rows:
try:
self.logger.info('Parsing log file: %s' % row['gzipped_file_name'])
self.aw_cursor.execute(parameter_query %
(int(row['app_id']), int(row['job_exec_id']), int(row['attempt_id'])))
params = DbUtil.copy_dict_cursor(self.aw_cursor)
key_values = {}
for param in params:
ParseUtil.value_from_cmd_line(param['param_value'], key_values)
analyzed_script = False
results = AppworxLogParser(log_file_name = row['gzipped_file_name']).parse_log({}, command_type='LI_BTEQ')
if any(results) == 0:
self.logger.info('Skipped parsing %s' % row['gzipped_file_name'])
continue
self.logger.info(str(results))
self.logger.info('Completed parsing log file: %s' % row['gzipped_file_name'])
# Compare md5 string of the file with the one parsed last time
md5_str = hashlib.md5()
try:
md5_str.update(open(self.local_script_path +
results['script_path'] + '/' +
results['script_name']).read())
except IOError:
self.logger.warn("Fails to find script file: %s/%s. Skipping the file" % (results['script_path'], results['script_name']))
continue
self.aw_cursor.execute(check_parsed_source_code_query %
(results['script_name'],
results['script_path'],
md5_str,
md5_str.hexdigest(),
int(row['app_id'])))
parsed_scripts = DbUtil.copy_dict_cursor(self.aw_cursor)
if len(parsed_scripts) != 0:
self.aw_cursor.execute(check_parsed_lineage_query %
(int(row['app_id']),
int(row['job_exec_id'])))
parsed_lineage = DbUtil.copy_dict_cursor(self.aw_cursor)
if len(parsed_lineage) == 0 or self.skip_already_parsed == 'N':
analyzed_script = False
else:
self.logger.debug("%s/%s has already been analyzed. Skipping..." %
(results['script_path'], results['script_name']))
analyzed_script = True
self.clean_up_staging_lineage_dbs(row['app_id'], row['job_id'], row['job_exec_id'], row['attempt_id'])
update_source_code_query = \
"""
INSERT INTO job_attempt_source_code(
application_id, job_id, attempt_number, script_name, script_path, script_type, created_date, script_md5_sum)
#......... the remainder of this example's code is omitted here .........
Example 10: run
# Required module import: import DbUtil [as alias]
# Or: from DbUtil import copy_dict_cursor [as alias]
def run(self):
sql = """
SELECT a.application_id, a.job_id, a.attempt_number, a.script_path, a.script_type, a.script_name,
fl.flow_name as chain_name, fj.job_name,
CASE WHEN e.dir_short_path IS NOT NULL THEN CASE
WHEN LOCATE('/scripts',a.script_path) > 0 THEN CONCAT('https://gitli.corp.linkedin.com',
CONCAT(CONCAT(REPLACE(e.dir_short_path,'/code_base_','/source/code_base_'),
SUBSTR(a.script_path,LOCATE('/scripts',a.script_path)) ),
CONCAT('/',a.script_name) )) WHEN LOCATE('/sql',a.script_path) > 0 THEN
CONCAT('https://gitli.corp.linkedin.com',
CONCAT(CONCAT(REPLACE(e.dir_short_path,'/code_base_','/source/code_base_'),
SUBSTR(a.script_path,LOCATE('/sql',a.script_path))),
CONCAT('/',a.script_name))) END ELSE a.script_name END script_url
FROM job_attempt_source_code a
JOIN flow fl ON fl.app_id = a.application_id
JOIN flow_job fj ON a.application_id = fj.app_id and fl.flow_id = fj.flow_id and a.job_id = fj.job_id
LEFT OUTER JOIN cfg_file_system_dir_map d
ON (d.directory_path =
CASE WHEN LOCATE('/scripts',a.script_path) > 0 THEN
SUBSTR(a.script_path,1,LOCATE('/scripts',a.script_path) - 1)
WHEN LOCATE('/sql',a.script_path) > 0 THEN
SUBSTR(a.script_path,1,LOCATE('/sql',a.script_path) - 1) END)
LEFT OUTER JOIN cfg_scm_repository_dir e ON (d.map_to_object_id = e.scm_repo_dir_id)
"""
git_committer = """
SELECT git_path, commit_time, committer_name, committer_email, ui.user_id
FROM ( SELECT CONCAT(CONCAT(CONCAT(CONCAT(repository_base_name,'/'),
CONCAT(module_name,'/source/')),file_path), file_name) as git_path, committer_name, commit_time,
committer_email FROM source_code_repository_info ) a
LEFT JOIN dir_external_user_info ui on a.committer_name = ui.full_name
WHERE a.git_path = '%s' GROUP BY committer_name
"""
insert_sql = """
INSERT IGNORE INTO job_execution_script (
app_id, job_id, script_name, script_path, script_type, chain_name, job_name,
committer_name, committer_email, committer_ldap, commit_time, script_url)
SELECT %d, %d, '%s', '%s', '%s', '%s', '%s',
'%s', '%s', '%s', '%s', '%s'
FROM dual
"""
try:
self.logger.info('script collect')
if self.last_execution_unix_time:
sql += ' WHERE a.created_date >= from_unixtime(%d) - INTERVAL 1 DAY '% long(self.last_execution_unix_time)
else:
sql += ' WHERE a.created_date >= CURRENT_DATE - INTERVAL %d DAY '% long(self.look_back_days)
self.logger.info(sql)
self.wh_cursor.execute(sql)
rows = DbUtil.copy_dict_cursor(self.wh_cursor)
for row in rows:
git_url = row['script_url'].replace('https://', 'git://')
self.wh_cursor.execute(git_committer % git_url)
git_rows = DbUtil.copy_dict_cursor(self.wh_cursor)
if git_rows and len(git_rows) > 0:
for git_row in git_rows:
self.wh_cursor.execute(insert_sql %
(int(row['application_id']), int(row['job_id']), row['script_name'],
row['script_path'], row['script_type'], row['chain_name'], row['job_name'],
git_row['committer_name'], git_row['committer_email'],
git_row['user_id'] if git_row['user_id'] else git_row['committer_name'],
git_row['commit_time'], row['script_url'] ))
else:
self.logger.info("git rows size is 0")
self.logger.info(row['script_name'])
self.wh_cursor.execute(insert_sql %
(int(row['application_id']), int(row['job_id']), row['script_name'],
row['script_path'], row['script_type'], row['chain_name'], row['job_name'],
"", "", "",
"", row['script_url'] ))
self.wh_con.commit()
finally:
self.wh_cursor.close()
self.wh_con.close()