This article collects typical usage examples of the FileWriter.flush method from the Python package wherehows.common.writers. If you are unsure what FileWriter.flush does or how to call it, the curated code examples below should help; you can also explore further usage of the enclosing class, wherehows.common.writers.FileWriter.
The following shows 10 code examples of the FileWriter.flush method, sorted by popularity by default.
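All of the examples share the same write pattern: create a FileWriter for an output file, append() record objects to it, call flush() periodically so long-running extract jobs push buffered rows to disk, and close() the writer at the end. The sketch below only distills that pattern; the write_records helper, the records iterable, and the flush_interval parameter are hypothetical names introduced for illustration, while FileWriter(path), append(), flush(), and close() are the calls used in the examples themselves.

from wherehows.common.writers import FileWriter

def write_records(records, output_file, flush_interval=10000):
  # Illustrative helper: `records` is assumed to be an iterable of record
  # objects accepted by FileWriter.append (e.g. AzkabanFlowExecRecord).
  writer = FileWriter(output_file)
  row_count = 0
  for record in records:
    writer.append(record)
    row_count += 1
    if row_count % flush_interval == 0:
      writer.flush()  # periodically push buffered rows to the output file
  writer.close()      # every example below closes its writers when done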
Example 1: collect_flow_execs
# Required import: from wherehows.common.writers import FileWriter [as alias]
# Or: from wherehows.common.writers.FileWriter import flush [as alias]
def collect_flow_execs(self, flow_exec_file, job_exec_file, look_back_period):
  self.logger.info("collect flow&job executions")
  flow_exec_writer = FileWriter(flow_exec_file)
  job_exec_writer = FileWriter(job_exec_file)
  cmd = """select * from execution_flows where end_time > UNIX_TIMESTAMP(now() - INTERVAL %d MINUTE) * 1000 """ % (int(look_back_period))
  self.az_cursor.execute(cmd)
  rows = DbUtil.dict_cursor(self.az_cursor)
  row_count = 0

  for row in rows:
    json_column = 'flow_data'
    unzipped_content = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(row[json_column].tostring())).read()
    try:
      row[json_column] = json.loads(unzipped_content)
    except Exception as e:
      self.logger.error(e)
      pass

    flow_data = row[json_column]
    flow_path = flow_data['projectName'] + ":" + flow_data['flowId']

    flow_exec_record = AzkabanFlowExecRecord(self.app_id,
                                             flow_data['flowId'],
                                             flow_path,
                                             row['version'],
                                             row['exec_id'],
                                             flow_data['status'],
                                             flow_data['attempt'],
                                             row['submit_user'],
                                             long(row['start_time']) / 1000,
                                             long(row['end_time']) / 1000,
                                             self.wh_exec_id)
    flow_exec_writer.append(flow_exec_record)

    nodes = flow_data['nodes']
    job_exec_records = []
    for node in nodes:
      job_exec_record = AzkabanJobExecRecord(self.app_id,
                                             flow_path,
                                             row['version'],
                                             row['exec_id'],
                                             node['id'],
                                             flow_path + "/" + node['id'],
                                             None,
                                             node['status'],
                                             node['attempt'],
                                             long(node['startTime']) / 1000,
                                             long(node['endTime']) / 1000,
                                             self.wh_exec_id)
      job_exec_records.append(job_exec_record)

    AzkabanJobExecUtil.sortAndSet(job_exec_records)
    for r in job_exec_records:
      job_exec_writer.append(r)

    row_count += 1
    if row_count % 10000 == 0:
      flow_exec_writer.flush()
      job_exec_writer.flush()

  flow_exec_writer.close()
  job_exec_writer.close()
Example 2: collect_flow_jobs
# Required import: from wherehows.common.writers import FileWriter [as alias]
# Or: from wherehows.common.writers.FileWriter import flush [as alias]
def collect_flow_jobs(self, flow_file, job_file, dag_file):
  self.logger.info("collect flow&jobs")
  query = "SELECT distinct f.*, p.name as project_name FROM project_flows f inner join projects p on f.project_id = p.id and f.version = p.version where p.active = 1"
  self.az_cursor.execute(query)
  rows = DbUtil.dict_cursor(self.az_cursor)
  flow_writer = FileWriter(flow_file)
  job_writer = FileWriter(job_file)
  dag_writer = FileWriter(dag_file)
  row_count = 0

  for row in rows:
    row['version'] = 0 if (row["version"] is None) else row["version"]

    json_column = 'json'
    unzipped_content = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(row[json_column].tostring())).read()
    try:
      row[json_column] = json.loads(unzipped_content)
    except:
      pass

    flow_path = row['project_name'] + ":" + row['flow_id']

    flow_record = AzkabanFlowRecord(self.app_id,
                                    row['flow_id'],
                                    row['project_name'],
                                    flow_path,
                                    0,
                                    row['modified_time'] / 1000,
                                    row["version"],
                                    'Y',
                                    self.wh_exec_id)
    flow_writer.append(flow_record)

    # get flow jobs
    nodes = row[json_column]['nodes']
    for node in nodes:
      job_record = AzkabanJobRecord(self.app_id,
                                    flow_path,
                                    row["version"],
                                    node['id'],
                                    flow_path + '/' + node['id'],
                                    node['jobType'],
                                    'Y',
                                    self.wh_exec_id)
      if node['jobType'] == 'flow':
        job_record.setRefFlowPath(row['project_name'] + ":" + node['embeddedFlowId'])
      job_writer.append(job_record)

    # job dag
    edges = row[json_column]['edges']
    for edge in edges:
      dag_edge = AzkabanFlowDagRecord(self.app_id,
                                      flow_path,
                                      row['version'],
                                      flow_path + '/' + edge['source'],
                                      flow_path + '/' + edge['target'],
                                      self.wh_exec_id)
      dag_writer.append(dag_edge)

    row_count += 1
    if row_count % 1000 == 0:
      flow_writer.flush()
      job_writer.flush()
      dag_writer.flush()

  flow_writer.close()
  job_writer.close()
  dag_writer.close()
Example 3: transform
# Required import: from wherehows.common.writers import FileWriter [as alias]
# Or: from wherehows.common.writers.FileWriter import flush [as alias]
def transform(self, input, td_metadata, td_field_metadata):
  '''
  convert from json to csv
  :param input: input json file
  :param td_metadata: output data file for teradata metadata
  :param td_field_metadata: output data file for teradata field metadata
  :return:
  '''
  f_json = open(input)
  data = json.load(f_json)
  f_json.close()

  schema_file_writer = FileWriter(td_metadata)
  field_file_writer = FileWriter(td_field_metadata)

  for d in data:
    i = 0
    for k in d.keys():
      if k not in ['tables', 'views']:
        continue
      self.logger.info("%s %4d %s" % (datetime.datetime.now().strftime("%H:%M:%S"), len(d[k]), k))
      for t in d[k]:
        self.logger.info("%4d %s" % (i, t['name']))
        if t['name'] == 'HDFStoTD_2464_ERR_1':
          continue

        i += 1
        output = {}
        prop_json = {}
        output['name'] = t['name']
        output['original_name'] = t['original_name']

        prop_json["createTime"] = t["createTime"] if t.has_key("createTime") else None
        prop_json["lastAlterTime"] = t["lastAlterTime"] if t.has_key("lastAlterTime") else None
        prop_json["lastAccessTime"] = t["lastAccessTime"] if t.has_key("lastAccessTime") else None
        prop_json["accessCount"] = t["accessCount"] if t.has_key("accessCount") else None
        prop_json["sizeInMbytes"] = t["sizeInMbytes"] if t.has_key("sizeInMbytes") else None
        if "type" in t:
          prop_json["storage_type"] = t["type"]
        if "partition" in t:
          prop_json["partition"] = t["partition"]
        if "partitions" in t:
          prop_json["partitions"] = t["partitions"]
        if "hashKey" in t:
          prop_json["hashKey"] = t["hashKey"]
        if "indices" in t:
          prop_json["indices"] = t["indices"]
        if "referenceTables" in t:
          prop_json["referenceTables"] = t["referenceTables"]
        if "viewSqlText" in t:
          prop_json["viewSqlText"] = t["viewSqlText"]

        output['fields'] = []
        flds = {}
        field_detail_list = []
        sort_id = 0
        for c in t['columns']:
          # output['fields'].append(
          #   { 'name' : t['name'].encode('latin-1'),
          #     'type' : None if c['data_type'] is None else c['data_type'].encode('latin-1'),
          #     'attributes_json' : c}
          # output['fields'][c['name'].encode('latin-1')].append({ "doc" : "", "type" : [None if c['data_type'] is None else c['data_type'].encode('latin-1')]})
          sort_id += 1

          output['fields'].append({"name": c['name'], "doc": '', "type": c['dataType'] if c['dataType'] else None,
                                   "nullable": c['nullable'], "maxByteLength": c['maxByteLength'],
                                   "format": c['columnFormat'] if c.has_key('columnFormat') else None,
                                   "accessCount": c['accessCount'] if c.has_key('accessCount') else None,
                                   "lastAccessTime": c['lastAccessTime'] if c.has_key("lastAccessTime") else None})

          flds[c['name']] = {'type': c['dataType'], "maxByteLength": c['maxByteLength']}

          field_detail_list.append(
            ["teradata:///%s/%s" % (d['database'], output['name']), str(sort_id), '0', '', c['name'], '',
             c['dataType'] if 'dataType' in c and c['dataType'] is not None else '',
             str(c['maxByteLength']) if 'maxByteLength' in c else '0',
             str(c['precision']) if 'precision' in c and c['precision'] is not None else '',
             str(c['scale']) if 'scale' in c and c['scale'] is not None else '',
             c['nullable'] if 'nullable' in c and c['nullable'] is not None else 'Y', '', '', '', '', '', '', ''])

        dataset_scehma_record = DatasetSchemaRecord(output['name'], json.dumps(output), json.dumps(prop_json),
                                                    json.dumps(flds),
                                                    "teradata:///%s/%s" % (d['database'], output['name']), 'Teradata',
                                                    output['original_name'],
                                                    (self.convert_timestamp(t["createTime"]) if t.has_key("createTime") else None),
                                                    (self.convert_timestamp(t["lastAlterTime"]) if t.has_key("lastAlterTime") else None))
        schema_file_writer.append(dataset_scehma_record)

        for fields in field_detail_list:
          field_record = DatasetFieldRecord(fields)
          field_file_writer.append(field_record)

      schema_file_writer.flush()
      field_file_writer.flush()
      self.logger.info("%20s contains %6d %s" % (d['database'], i, k))

  schema_file_writer.close()
  field_file_writer.close()
Example 4: collect_flow_execs
# Required import: from wherehows.common.writers import FileWriter [as alias]
# Or: from wherehows.common.writers.FileWriter import flush [as alias]
#......... part of the code is omitted here .........
           from so_job_history where SO_JOBID = SO_CHAIN_ID or SO_PARENTS_JOBID <> SO_CHAIN_ID
           '''
  if self.last_execution_unix_time:
    job_cmd = \
      """SELECT D.SO_TASK_NAME, U.SO_USER_NAME, H.SO_STATUS_NAME, H.SO_JOBID, D.SO_DET_SEQ as JOB_ID,
         ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
         to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED,
         ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
         to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED
         FROM SO_JOB_HISTORY H
         JOIN SO_CHAIN_DETAIL D ON D.SO_CHAIN_SEQ = H.SO_CHAIN_SEQ AND D.SO_DET_SEQ = H.SO_DET_SEQ
         LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
         WHERE --H.SO_JOB_FINISHED >= DATE '1970-01-01' - interval '8' hour + (%d - 3600) / 86400) and
         H.SO_CHAIN_ID = %d"""
  else:
    job_cmd = \
      """SELECT D.SO_TASK_NAME, U.SO_USER_NAME, H.SO_STATUS_NAME, H.SO_JOBID, D.SO_DET_SEQ as JOB_ID,
         ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
         to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED,
         ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
         to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED
         FROM SO_JOB_HISTORY H
         JOIN SO_CHAIN_DETAIL D ON D.SO_CHAIN_SEQ = H.SO_CHAIN_SEQ AND D.SO_DET_SEQ = H.SO_DET_SEQ
         LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
         WHERE H.SO_JOB_FINISHED >= SYSDATE - %d and
         H.SO_CHAIN_ID = %d"""

  try:
    self.aw_cursor.execute(flow_cmd)
  except Exception as e:
    self.logger.error(str(e) + "\n" + flow_cmd)

  rows = DbUtil.dict_cursor(self.aw_cursor)
  row_count = 0

  for row in rows:
    flow_path = row['SO_APPLICATION'] + ":" + row['SO_MODULE']
    so_flow_id = row['SO_JOBID']
    flow_attempt = 0
    flow_exec_id = 0
    try:
      flow_attempt = int(float(str(so_flow_id - int(so_flow_id))[1:])*100)
      flow_exec_id = int(so_flow_id)
    except Exception as e:
      self.logger.error(e)
    self.logger.debug("processing flow_exec_id: %8d" % flow_exec_id)

    flow_exec_record = AppworxFlowExecRecord(self.app_id,
                                             long(row['SO_JOB_SEQ']),
                                             row['SO_MODULE'],
                                             flow_path,
                                             0,
                                             flow_exec_id,
                                             row['SO_STATUS_NAME'],
                                             flow_attempt,
                                             row['SO_USER_NAME'] if row['SO_USER_NAME'] else '',
                                             long(row['JOB_STARTED']),
                                             long(row['JOB_FINISHED'] if row['JOB_FINISHED'] else 0),
                                             self.wh_exec_id)
    flow_exec_writer.append(flow_exec_record)

    new_appworx_cursor = self.aw_con.cursor()
    if self.last_execution_unix_time:
      new_appworx_cursor.execute(job_cmd % (long(self.last_execution_unix_time), flow_exec_id))
    else:
      new_appworx_cursor.execute(job_cmd % (int(self.lookback_period), flow_exec_id))
    job_rows = DbUtil.dict_cursor(new_appworx_cursor)

    for job in job_rows:
      so_job_id = job['SO_JOBID']
      job_attempt = 0
      job_exec_id = 0
      try:
        job_attempt = int(float(str(so_job_id - int(so_job_id))[1:])*100)
        job_exec_id = int(so_job_id)
      except Exception as e:
        self.logger.error(e)

      job_exec_record = AppworxJobExecRecord(self.app_id,
                                             long(row['SO_JOB_SEQ']),
                                             flow_path,
                                             0,
                                             flow_exec_id,
                                             long(job['JOB_ID']),
                                             job['SO_TASK_NAME'],
                                             flow_path + "/" + job['SO_TASK_NAME'],
                                             job_exec_id,
                                             job['SO_STATUS_NAME'],
                                             job_attempt,
                                             long(job['JOB_STARTED']),
                                             long(job['JOB_FINISHED']),
                                             self.wh_exec_id)
      job_exec_writer.append(job_exec_record)
      row_count += 1

    if row_count % 10000 == 0:
      flow_exec_writer.flush()
      job_exec_writer.flush()

  flow_exec_writer.close()
  job_exec_writer.close()
Example 5: transform
# Required import: from wherehows.common.writers import FileWriter [as alias]
# Or: from wherehows.common.writers.FileWriter import flush [as alias]
def transform(self, input, hive_metadata, hive_field_metadata):
  """
  convert from json to csv
  :param input: input json file
  :param hive_metadata: output data file for hive table metadata
  :param hive_field_metadata: output data file for hive field metadata
  :return:
  """
  f_json = open(input)
  all_data = json.load(f_json)
  f_json.close()

  schema_file_writer = FileWriter(hive_metadata)
  field_file_writer = FileWriter(hive_field_metadata)

  lineageInfo = LineageInfo()
  depends_sql = """
    SELECT d.NAME DB_NAME, case when t.TBL_NAME regexp '_[0-9]+_[0-9]+_[0-9]+$'
        then concat(substring(t.TBL_NAME, 1, length(t.TBL_NAME) - length(substring_index(t.TBL_NAME, '_', -3)) - 1),'_{version}')
        else t.TBL_NAME
        end dataset_name,
      concat('/', d.NAME, '/', t.TBL_NAME) object_name,
      case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
        then 'dalids'
        else 'hive'
        end object_type,
      case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
        then 'View'
        else
          case when LOCATE('view', LOWER(t.TBL_TYPE)) > 0 then 'View'
               when LOCATE('index', LOWER(t.TBL_TYPE)) > 0 then 'Index'
               else 'Table'
          end
        end object_sub_type,
      case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and t.TBL_TYPE = 'VIRTUAL_VIEW'
        then 'dalids'
        else 'hive'
        end prefix
    FROM TBLS t JOIN DBS d on t.DB_ID = d.DB_ID
    WHERE d.NAME = '{db_name}' and t.TBL_NAME = '{table_name}'
    """

  # one db info : 'type', 'database', 'tables'
  # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
  #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
  for one_db_info in all_data:
    i = 0
    for table in one_db_info['tables']:
      i += 1
      schema_json = {}
      prop_json = {}  # set the prop json

      for prop_name in TableInfo.optional_prop:
        if prop_name in table and table[prop_name] is not None:
          prop_json[prop_name] = table[prop_name]

      if TableInfo.view_expended_text in prop_json:
        text = prop_json[TableInfo.view_expended_text].replace('`', '')
        array = HiveViewDependency.getViewDependency(text)
        l = []
        for a in array:
          l.append(a)
          names = str(a).split('.')
          if names and len(names) >= 2:
            db_name = names[0]
            table_name = names[1]
            if db_name and table_name:
              rows = []
              self.curs.execute(depends_sql.format(db_name=db_name, table_name=table_name, version='{version}'))
              rows = self.curs.fetchall()
              if rows and len(rows) > 0:
                for row_index, row_value in enumerate(rows):
                  dependent_record = HiveDependencyInstanceRecord(
                                       one_db_info['type'],
                                       table['type'],
                                       "/%s/%s" % (one_db_info['database'], table['name']),
                                       'dalids:///' + one_db_info['database'] + '/' + table['name']
                                       if one_db_info['type'].lower() == 'dalids'
                                       else 'hive:///' + one_db_info['database'] + '/' + table['name'],
                                       'depends on',
                                       'is used by',
                                       row_value[3],
                                       row_value[4],
                                       row_value[2],
                                       row_value[5] + ':///' + row_value[0] + '/' + row_value[1], '')
                  self.instance_writer.append(dependent_record)
        prop_json['view_depends_on'] = l
        self.instance_writer.flush()

      # process either schema
      flds = {}
      field_detail_list = []

      if TableInfo.schema_literal in table and table[TableInfo.schema_literal] is not None:
        sort_id = 0
        urn = "hive:///%s/%s" % (one_db_info['database'], table['name'])
        try:
          schema_data = json.loads(table[TableInfo.schema_literal])
          schema_json = schema_data
          acp = AvroColumnParser(schema_data, urn = urn)
#......... part of the code is omitted here .........
Example 6: collect_flow_jobs
# Required import: from wherehows.common.writers import FileWriter [as alias]
# Or: from wherehows.common.writers.FileWriter import flush [as alias]
def collect_flow_jobs(self, flow_file, job_file, dag_file):
  self.logger.info("collect flow&jobs [last_execution_unix_time=%s lookback_period=%s]"
                   % (self.last_execution_unix_time, self.lookback_period))
  timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
  self.aw_cursor.execute(timezone)
  schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
  self.aw_cursor.execute(schema)

  if self.last_execution_unix_time:
    time_filter = "(DATE '1970-01-01' - INTERVAL '8' HOUR) + (%d - 3600) / 86400" % long(self.last_execution_unix_time)
  else:
    time_filter = "SYSDATE - %d" % int(self.lookback_period)

  flow_query = \
    """SELECT J.SO_JOB_SEQ, J.SO_APPLICATION, J.SO_MODULE, R.LAST_CHAIN_ID
       FROM SO_JOB_TABLE J JOIN (
         SELECT SO_JOB_SEQ, MAX(SO_CHAIN_ID) as LAST_CHAIN_ID
         FROM
         ( SELECT SO_JOB_SEQ, SO_CHAIN_ID FROM SO_JOB_HISTORY
           WHERE SO_JOB_FINISHED >= %s
             AND SO_CHILD_COUNT > 0
           UNION ALL
           SELECT SO_JOB_SEQ, SO_CHAIN_ID FROM SO_JOB_QUEUE
           WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED')
             AND SO_CHILD_COUNT > 0
         )
         GROUP BY SO_JOB_SEQ
       ) R ON J.SO_JOB_SEQ = R.SO_JOB_SEQ
       WHERE SO_COMMAND_TYPE = 'CHAIN'
       ORDER BY 2,3
    """ % time_filter
  job_query = \
    """SELECT d.SO_TASK_NAME, d.SO_CHAIN_ORDER, d.SO_PREDECESSORS as PREDECESSORS, d.SO_DET_SEQ as JOB_ID,
       t.* FROM SO_CHAIN_DETAIL d
       JOIN SO_JOB_TABLE t ON d.SO_JOB_SEQ = t.SO_JOB_SEQ
       WHERE d.SO_CHAIN_SEQ = %d
       ORDER BY d.SO_CHAIN_ORDER
    """

  self.aw_cursor.execute(flow_query)
  rows = DbUtil.dict_cursor(self.aw_cursor)
  flow_writer = FileWriter(flow_file)
  job_writer = FileWriter(job_file)
  dag_writer = FileWriter(dag_file)
  row_count = 0

  for row in rows:
    flow_path = row['SO_APPLICATION'] + ":" + row['SO_MODULE']

    flow_record = AppworxFlowRecord(self.app_id,
                                    long(row['SO_JOB_SEQ']),
                                    row['SO_MODULE'],
                                    row['SO_APPLICATION'],
                                    flow_path,
                                    0,
                                    0,
                                    0,
                                    'Y',
                                    self.wh_exec_id)
    flow_writer.append(flow_record)

    new_appworx_cursor = self.aw_con.cursor()
    new_appworx_cursor.execute(job_query % row['SO_JOB_SEQ'])
    job_rows = DbUtil.dict_cursor(new_appworx_cursor)

    for job in job_rows:
      job_record = AppworxJobRecord(self.app_id,
                                    long(row['SO_JOB_SEQ']),
                                    flow_path,
                                    0,
                                    long(job['JOB_ID']),
                                    job['SO_TASK_NAME'],
                                    flow_path + '/' + job['SO_TASK_NAME'],
                                    job['SO_MODULE'],
                                    'Y',
                                    self.wh_exec_id)
      command_type = job['SO_COMMAND_TYPE']
      if command_type and command_type == 'CHAIN':
        job_record.setRefFlowPath(job['SO_APPLICATION'] + ":" + job['SO_MODULE'])
        job_record.setJobType('CHAIN')

      job_writer.append(job_record)

      predecessors_str = job['PREDECESSORS']
      if predecessors_str:
        predecessors = re.findall(r"\&\/(.+?)\s\=\sS", predecessors_str)
        if predecessors:
          for predecessor in predecessors:
            dag_edge = AppworxFlowDagRecord(self.app_id,
                                            long(row['SO_JOB_SEQ']),
                                            flow_path,
                                            0,
                                            flow_path + '/' + predecessor,
                                            flow_path + '/' + job['SO_TASK_NAME'],
                                            self.wh_exec_id)
            dag_writer.append(dag_edge)

    row_count += 1
    if row_count % 1000 == 0:
      flow_writer.flush()
      job_writer.flush()
      dag_writer.flush()

  flow_writer.close()
#......... part of the code is omitted here .........
Example 7: transform
# Required import: from wherehows.common.writers import FileWriter [as alias]
# Or: from wherehows.common.writers.FileWriter import flush [as alias]
def transform(self, input, hive_metadata, hive_field_metadata):
  """
  convert from json to csv
  :param input: input json file
  :param hive_metadata: output data file for hive table metadata
  :param hive_field_metadata: output data file for hive field metadata
  :return:
  """
  f_json = open(input)
  all_data = json.load(f_json)
  f_json.close()

  schema_file_writer = FileWriter(hive_metadata)
  field_file_writer = FileWriter(hive_field_metadata)

  lineageInfo = LineageInfo()

  # one db info : 'type', 'database', 'tables'
  # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
  #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
  for one_db_info in all_data:
    i = 0
    for table in one_db_info['tables']:
      i += 1
      schema_json = {}
      prop_json = {}  # set the prop json

      for prop_name in TableInfo.optional_prop:
        if prop_name in table and table[prop_name] is not None:
          prop_json[prop_name] = table[prop_name]

      if TableInfo.view_expended_text in prop_json:
        text = prop_json[TableInfo.view_expended_text].replace('`', '')
        array = HiveViewDependency.getViewDependency(text)
        l = []
        for a in array:
          l.append(a)
        prop_json['view_depends_on'] = l

      # process either schema
      flds = {}
      field_detail_list = []

      if TableInfo.schema_literal in table and table[TableInfo.schema_literal] is not None:
        sort_id = 0
        try:
          schema_data = json.loads(table[TableInfo.schema_literal])
        except ValueError:
          self.logger.error("Schema json error for table : \n" + str(table))
        schema_json = schema_data

        # process each field
        for field in schema_data['fields']:
          field_name = field['name']
          type = field['type']  # could be a list
          default_value = field['default'] if 'default' in field else None
          doc = field['doc'] if 'doc' in field else None
          attributes_json = json.loads(field['attributes_json']) if 'attributes_json' in field else None
          pk = delta = is_nullable = is_indexed = is_partitioned = inside_type = format = data_size = None
          if attributes_json:
            pk = attributes_json['pk'] if 'pk' in attributes_json else None
            delta = attributes_json['delta'] if 'delta' in attributes_json else None
            is_nullable = attributes_json['nullable'] if 'nullable' in attributes_json else None
            inside_type = attributes_json['type'] if 'type' in attributes_json else None
            format = attributes_json['format'] if 'format' in attributes_json else None

          flds[field_name] = {'type': type}
          # String urn, Integer sortId, Integer parentSortId, String parentPath, String fieldName,
          # String dataType, String isNullable, String defaultValue, Integer dataSize, String namespace, String description
          sort_id += 1
          field_detail_list.append(
            ["hive:///%s/%s" % (one_db_info['database'], table['name']), str(sort_id), '0', None, field_name, '',
             type, data_size, None, None, is_nullable, is_indexed, is_partitioned, default_value, None,
             json.dumps(attributes_json)])
      elif TableInfo.field_list in table:
        schema_json = {'type': 'record', 'name': table['name'],
                       'fields': table[TableInfo.field_list]}  # construct a schema for data came from COLUMN_V2
        for field in table[TableInfo.field_list]:
          field_name = field['ColumnName']
          type = field['TypeName']
          # ColumnName, IntegerIndex, TypeName, Comment
          flds[field_name] = {'type': type}
          pk = delta = is_nullable = is_indexed = is_partitioned = inside_type = format = data_size = default_value = None  # TODO ingest
          field_detail_list.append(
            ["hive:///%s/%s" % (one_db_info['database'], table['name']), field['IntegerIndex'], '0', None, field_name,
             '', field['TypeName'], None, None, None, is_nullable, is_indexed, is_partitioned, default_value, None,
             None])

      dataset_scehma_record = DatasetSchemaRecord(table['name'], json.dumps(schema_json), json.dumps(prop_json),
                                                  json.dumps(flds),
                                                  "hive:///%s/%s" % (one_db_info['database'], table['name']), 'Hive',
                                                  '', (table[TableInfo.create_time] if table.has_key(TableInfo.create_time) else None),
                                                  (table["lastAlterTime"]) if table.has_key("lastAlterTime") else None)
      schema_file_writer.append(dataset_scehma_record)

      for fields in field_detail_list:
        field_record = DatasetFieldRecord(fields)
        field_file_writer.append(field_record)

    schema_file_writer.flush()
#......... part of the code is omitted here .........
Example 8: transform
# Required import: from wherehows.common.writers import FileWriter [as alias]
# Or: from wherehows.common.writers.FileWriter import flush [as alias]
def transform(self, input, hive_metadata, hive_field_metadata):
  """
  convert from json to csv
  :param input: input json file
  :param hive_metadata: output data file for hive table metadata
  :param hive_field_metadata: output data file for hive field metadata
  :return:
  """
  f_json = open(input)
  all_data = json.load(f_json)
  f_json.close()

  schema_file_writer = FileWriter(hive_metadata)
  field_file_writer = FileWriter(hive_field_metadata)

  lineageInfo = LineageInfo()

  # one db info : 'type', 'database', 'tables'
  # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
  #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
  for one_db_info in all_data:
    i = 0
    for table in one_db_info['tables']:
      i += 1
      schema_json = {}
      prop_json = {}  # set the prop json

      for prop_name in TableInfo.optional_prop:
        if prop_name in table and table[prop_name] is not None:
          prop_json[prop_name] = table[prop_name]

      if TableInfo.view_expended_text in prop_json:
        text = prop_json[TableInfo.view_expended_text].replace('`', '')
        array = HiveViewDependency.getViewDependency(text)
        l = []
        for a in array:
          l.append(a)
        prop_json['view_depends_on'] = l

      # process either schema
      flds = {}
      field_detail_list = []

      if TableInfo.schema_literal in table and table[TableInfo.schema_literal] is not None:
        sort_id = 0
        urn = "hive:///%s/%s" % (one_db_info['database'], table['name'])
        try:
          schema_data = json.loads(table[TableInfo.schema_literal])
          schema_json = schema_data
          acp = AvroColumnParser(schema_data, urn = urn)
          result = acp.get_column_list_result()
          field_detail_list += result
        except ValueError:
          self.logger.error("Schema json error for table : \n" + str(table))
      elif TableInfo.field_list in table:
        # Convert to avro
        uri = "hive:///%s/%s" % (one_db_info['database'], table['name'])
        hcp = HiveColumnParser(table, urn = uri)
        schema_json = {'fields' : hcp.column_type_dict['fields'], 'type' : 'record', 'name' : table['name'], 'uri' : uri}
        field_detail_list += hcp.column_type_list

      dataset_scehma_record = DatasetSchemaRecord(table['name'], json.dumps(schema_json), json.dumps(prop_json),
                                                  json.dumps(flds),
                                                  "hive:///%s/%s" % (one_db_info['database'], table['name']), 'Hive',
                                                  '', (table[TableInfo.create_time] if table.has_key(TableInfo.create_time) else None),
                                                  (table["lastAlterTime"]) if table.has_key("lastAlterTime") else None)
      schema_file_writer.append(dataset_scehma_record)

      for fields in field_detail_list:
        field_record = DatasetFieldRecord(fields)
        field_file_writer.append(field_record)

    schema_file_writer.flush()
    field_file_writer.flush()
    self.logger.info("%20s contains %6d tables" % (one_db_info['database'], i))

  schema_file_writer.close()
  field_file_writer.close()
Example 9: transform
# Required import: from wherehows.common.writers import FileWriter [as alias]
# Or: from wherehows.common.writers.FileWriter import flush [as alias]
#......... part of the code is omitted here .........
        try:
          array = HiveViewDependency.getViewDependency(text)
        except:
          self.logger.error("HiveViewDependency.getViewDependency(%s) failed!" % (table['name']))

        l = []
        for a in array:
          l.append(a)
          names = str(a).split('.')
          if names and len(names) >= 2:
            db_name = names[0].lower()
            table_name = names[1].lower()
            if db_name and table_name:
              self.curs.execute(depends_sql.format(db_name=db_name, table_name=table_name, version='{version}'))
              rows = self.curs.fetchall()
              self.conn_hms.commit()
              if rows and len(rows) > 0:
                for row_index, row_value in enumerate(rows):
                  dependent_record = HiveDependencyInstanceRecord(
                                       one_db_info['type'],
                                       table['type'],
                                       "/%s/%s" % (one_db_info['database'], table['name']),
                                       'dalids:///' + one_db_info['database'] + '/' + table['dataset_name']
                                       if one_db_info['type'].lower() == 'dalids'
                                       else 'hive:///' + one_db_info['database'] + '/' + table['dataset_name'],
                                       'depends on',
                                       'Y',
                                       row_value[3],
                                       row_value[4],
                                       row_value[2],
                                       row_value[5] + ':///' + row_value[0] + '/' + row_value[1], '')
                  dependency_file_writer.append(dependent_record)
        prop_json['view_depends_on'] = l
        dependency_file_writer.flush()

      # process either schema
      flds = {}
      field_detail_list = []

      if TableInfo.schema_literal in table and \
         table[TableInfo.schema_literal] is not None and \
         table[TableInfo.schema_literal].startswith('{'):
        sort_id = 0
        urn = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])
        self.logger.info("Getting schema literal for: %s" % (urn))
        try:
          schema_data = json.loads(table[TableInfo.schema_literal])
          schema_json = schema_data
          acp = AvroColumnParser(schema_data, urn = urn)
          result = acp.get_column_list_result()
          field_detail_list += result
        except ValueError:
          self.logger.error("Schema Literal JSON error for table: " + str(table))
      elif TableInfo.field_list in table:
        # Convert to avro
        uri = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])
        if one_db_info['type'].lower() == 'dalids':
          uri = "dalids:///%s/%s" % (one_db_info['database'], table['dataset_name'])
        else:
          uri = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])
        self.logger.info("Getting column definition for: %s" % (uri))
        try:
          hcp = HiveColumnParser(table, urn = uri)
          schema_json = {'fields' : hcp.column_type_dict['fields'], 'type' : 'record', 'name' : table['name'], 'uri' : uri}
          field_detail_list += hcp.column_type_list
Example 10: __init__
# Required import: from wherehows.common.writers import FileWriter [as alias]
# Or: from wherehows.common.writers.FileWriter import flush [as alias]
#......... part of the code is omitted here .........
      if TableInfo.view_expended_text in prop_json:
        view_expanded_text = prop_json[TableInfo.view_expended_text]
        text = prop_json[TableInfo.view_expended_text].replace('`', '')
        array = HiveViewDependency.getViewDependency(text)
        l = []
        for a in array:
          l.append(a)
          names = str(a).split('.')
          if names and len(names) >= 2:
            db_name = names[0].lower()
            table_name = names[1].lower()
            if db_name and table_name:
              rows = []
              self.curs.execute(depends_sql.format(db_name=db_name, table_name=table_name, version='{version}'))
              rows = self.curs.fetchall()
              if rows and len(rows) > 0:
                for row_index, row_value in enumerate(rows):
                  dependent_record = HiveDependencyInstanceRecord(
                                       one_db_info['type'],
                                       table['type'],
                                       "/%s/%s" % (one_db_info['database'], table['name']),
                                       'dalids:///' + one_db_info['database'] + '/' + table['dataset_name']
                                       if one_db_info['type'].lower() == 'dalids'
                                       else 'hive:///' + one_db_info['database'] + '/' + table['dataset_name'],
                                       'depends on',
                                       'Y',
                                       row_value[3],
                                       row_value[4],
                                       row_value[2],
                                       row_value[5] + ':///' + row_value[0] + '/' + row_value[1], '')
                  self.instance_writer.append(dependent_record)
        prop_json['view_depends_on'] = l
        self.instance_writer.flush()

      # process either schema
      flds = {}
      field_detail_list = []

      if TableInfo.schema_literal in table and \
         table[TableInfo.schema_literal] is not None and \
         table[TableInfo.schema_literal].startswith('{'):
        sort_id = 0
        urn = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])
        self.logger.info("Getting schema literal for: %s" % (urn))
        try:
          schema_data = json.loads(table[TableInfo.schema_literal])
          schema_json = schema_data
          acp = AvroColumnParser(schema_data, urn = urn)
          result = acp.get_column_list_result()
          field_detail_list += result
        except ValueError:
          self.logger.error("Schema Literal JSON error for table: " + str(table))
      elif TableInfo.field_list in table:
        # Convert to avro
        uri = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])
        if one_db_info['type'].lower() == 'dalids':
          uri = "dalids:///%s/%s" % (one_db_info['database'], table['dataset_name'])
        else:
          uri = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])
        self.logger.info("Getting column definition for: %s" % (uri))
        hcp = HiveColumnParser(table, urn = uri)
        schema_json = {'fields' : hcp.column_type_dict['fields'], 'type' : 'record', 'name' : table['name'], 'uri' : uri}
        field_detail_list += hcp.column_type_list