本文整理汇总了Python中wherehows.common.writers.FileWriter.append方法的典型用法代码示例。如果您正苦于以下问题：Python FileWriter.append方法的具体用法？Python FileWriter.append怎么用？Python FileWriter.append使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类wherehows.common.writers.FileWriter的用法示例。
在下文中一共展示了FileWriter.append方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: collect_job_execs
# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import append [as 别名]
def collect_job_execs(self, job_exec_file, lookback_period):
    """
    Export recent Oozie action executions as job-execution records.

    Joins WF_ACTIONS to WF_JOBS, keeps the rows whose workflow end_time lies
    within the last ``lookback_period`` minutes, and appends one
    OozieJobExecRecord per row to a FileWriter opened on ``job_exec_file``.
    The workflow id (``flow_exec_id``) is handed to the record twice and the
    job path is built as ``app_path + "/" + job_name``.

    :param job_exec_file: destination passed straight to FileWriter
    :param lookback_period: look-back window in minutes; coerced with int()
    :return: None
    """
    self.logger.info("collect job execs")
    writer = FileWriter(job_exec_file)
    window_minutes = int(lookback_period)
    # The SQL text stays flush-left so the statement sent to the database is unchanged.
    sql = """
select a.id as job_exec_id, a.name as job_name, j.id as flow_exec_id, a.status, a.user_retry_count,
unix_timestamp(a.start_time) start_time, unix_timestamp(a.end_time) end_time,
j.app_name as jname, j.app_path, transition from WF_ACTIONS a JOIN WF_JOBS j on a.wf_id = j.id where j.end_time > now() - INTERVAL %d MINUTE
""" % (window_minutes)
    self.oz_cursor.execute(sql)
    for rec in DbUtil.dict_cursor(self.oz_cursor):
        app_path = rec['app_path']
        flow_exec_id = rec['flow_exec_id']
        job_name = rec['job_name']
        writer.append(OozieJobExecRecord(
            self.app_id,
            app_path,
            flow_exec_id,
            flow_exec_id,
            job_name,
            app_path + "/" + job_name,
            rec['job_exec_id'],
            rec['status'],
            rec['user_retry_count'],
            rec['start_time'],
            rec['end_time'],
            self.wh_exec_id))
    writer.close()
示例2: collect_flow_schedules
# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import append [as 别名]
def collect_flow_schedules(self, schedule_file):
    """
    Export the schedules of RUNNING Oozie coordinator jobs.

    Runs a DISTINCT join of COORD_JOBS, COORD_ACTIONS and WF_JOBS restricted
    to coordinators whose status is 'RUNNING', and appends one
    OozieFlowScheduleRecord per row to a FileWriter opened on
    ``schedule_file``. The frequency is stringified and the fifth record
    argument is always None.

    :param schedule_file: destination passed straight to FileWriter
    :return: None
    """
    self.logger.info("collect flow schedule")
    writer = FileWriter(schedule_file)
    # The SQL text stays flush-left so the statement sent to the database is unchanged.
    sql = """
SELECT DISTINCT cj.id as ref_id, cj.frequency, cj.time_unit,
unix_timestamp(cj.start_time) as start_time, unix_timestamp(cj.end_time) as end_time,
wj.app_path
FROM COORD_JOBS cj JOIN COORD_ACTIONS ca ON ca.job_id = cj.id JOIN WF_JOBS wj ON ca.external_id = wj.id
WHERE cj.status = 'RUNNING'
"""
    self.oz_cursor.execute(sql)
    for rec in DbUtil.dict_cursor(self.oz_cursor):
        frequency_text = str(rec['frequency'])
        record = OozieFlowScheduleRecord(
            self.app_id,
            rec['app_path'],
            rec['time_unit'],
            frequency_text,
            None,
            rec['start_time'],
            rec['end_time'],
            rec['ref_id'],
            self.wh_exec_id)
        writer.append(record)
    writer.close()
示例3: collect_flow_schedules
# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import append [as 别名]
def collect_flow_schedules(self, schedule_file):
    """
    Export the active Appworx chain schedules.

    First issues two ALTER SESSION statements on ``self.aw_cursor`` (time
    zone 'US/Pacific', current schema APPWORX), then selects the active
    ('Y') schedules of jobs whose command type is 'CHAIN' and appends one
    AppworxFlowScheduleRecord per row to a FileWriter opened on
    ``schedule_file``. The flow path is ``SO_APPLICATION + ":" + SO_MODULE``.

    :param schedule_file: destination passed straight to FileWriter
    :return: None
    """
    # load flow scheduling info from table triggers
    self.logger.info("collect flow schedule")
    session_setup = (
        "ALTER SESSION SET TIME_ZONE = 'US/Pacific'",
        "ALTER SESSION SET CURRENT_SCHEMA=APPWORX",
    )
    for statement in session_setup:
        self.aw_cursor.execute(statement)
    writer = FileWriter(schedule_file)
    # The SQL text stays flush-left so the statement sent to the database is unchanged.
    sql = """SELECT J.SO_APPLICATION, J.SO_MODULE, S.AW_SCH_NAME, S.AW_SCH_INTERVAL, S.AW_ACTIVE,
ROUND((cast((FROM_TZ(CAST(S.AW_SCH_START as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as EFFECT_STARTED,
ROUND((cast((FROM_TZ(CAST(S.AW_SCH_END as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as EFFECT_END
FROM SO_JOB_TABLE J
JOIN AW_MODULE_SCHED S ON J.SO_JOB_SEQ = S.AW_JOB_SEQ
WHERE J.SO_COMMAND_TYPE = 'CHAIN' AND S.AW_ACTIVE = 'Y' """
    self.aw_cursor.execute(sql)
    for rec in DbUtil.dict_cursor(self.aw_cursor):
        flow_path = rec['SO_APPLICATION'] + ":" + rec['SO_MODULE']
        record = AppworxFlowScheduleRecord(
            self.app_id,
            flow_path,
            rec['AW_SCH_NAME'],
            int(rec['AW_SCH_INTERVAL']),
            long(rec['EFFECT_STARTED']),
            long(rec['EFFECT_END']),
            '0',
            self.wh_exec_id)
        writer.append(record)
    writer.close()
示例4: collect_flow_execs
# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import append [as 别名]
def collect_flow_execs(self, flow_exec_file, job_exec_file, look_back_period):
    """
    Export recent Azkaban flow executions and their job executions.

    Reads the ``execution_flows`` rows whose end_time falls within the last
    ``look_back_period`` minutes, gunzips and JSON-decodes each row's
    ``flow_data`` column, then writes one AzkabanFlowExecRecord per row and
    one AzkabanJobExecRecord per entry of ``flow_data['nodes']``. Both
    writers are flushed every 10000 processed rows.

    A row whose ``flow_data`` cannot be decoded as JSON is logged and
    skipped. Both writers are closed even when an exception escapes.

    :param flow_exec_file: destination for flow execution records (passed to FileWriter)
    :param job_exec_file: destination for job execution records (passed to FileWriter)
    :param look_back_period: look-back window in minutes; coerced with int()
    :return: None
    """
    self.logger.info( "collect flow&job executions")
    flow_exec_writer = FileWriter(flow_exec_file)
    job_exec_writer = FileWriter(job_exec_file)
    try:
        cmd = """select * from execution_flows where end_time > UNIX_TIMESTAMP(now() - INTERVAL %d MINUTE) * 1000 """ % (int(look_back_period))
        self.az_cursor.execute(cmd)
        rows = DbUtil.dict_cursor(self.az_cursor)
        row_count = 0
        for row in rows:
            unzipped_content = gzip.GzipFile(
                mode='r', fileobj=StringIO.StringIO(row['flow_data'].tostring())).read()
            try:
                flow_data = json.loads(unzipped_content)
            except Exception as e:
                # Without parsed JSON the lookups below would fail on the raw
                # payload, so the row is skipped instead of carried forward.
                self.logger.error(e)
                continue

            flow_path = flow_data['projectName'] + ":" + flow_data['flowId']
            # start/end times are divided by 1000 (the query compares end_time
            # against UNIX_TIMESTAMP(...) * 1000). `long` is the Python 2 type.
            flow_exec_record = AzkabanFlowExecRecord(self.app_id,
                                                     flow_data['flowId'],
                                                     flow_path,
                                                     row['version'],
                                                     row['exec_id'],
                                                     flow_data['status'],
                                                     flow_data['attempt'],
                                                     row['submit_user'],
                                                     long(row['start_time']) / 1000,
                                                     long(row['end_time']) / 1000,
                                                     self.wh_exec_id)
            flow_exec_writer.append(flow_exec_record)

            job_exec_records = []
            for node in flow_data['nodes']:
                job_exec_records.append(AzkabanJobExecRecord(self.app_id,
                                                             flow_path,
                                                             row['version'],
                                                             row['exec_id'],
                                                             node['id'],
                                                             flow_path + "/" + node['id'],
                                                             None,
                                                             node['status'],
                                                             node['attempt'],
                                                             long(node['startTime']) / 1000,
                                                             long(node['endTime']) / 1000,
                                                             self.wh_exec_id))
            # The records of one flow are handed to sortAndSet as a batch
            # before any of them is written.
            AzkabanJobExecUtil.sortAndSet(job_exec_records)
            for job_exec_record in job_exec_records:
                job_exec_writer.append(job_exec_record)

            row_count += 1
            if row_count % 10000 == 0:
                flow_exec_writer.flush()
                job_exec_writer.flush()
    finally:
        flow_exec_writer.close()
        job_exec_writer.close()
示例5: collect_flow_owners
# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import append [as 别名]
def collect_flow_owners(self, owner_file):
    """
    Export the distinct (app_path, user_name) pairs of Oozie workflows.

    Selects DISTINCT app_name, app_path and user_name from WF_JOBS and
    appends one OozieFlowOwnerRecord per row to a FileWriter opened on
    ``owner_file``. ``app_name`` is selected but not written.

    :param owner_file: destination passed straight to FileWriter
    :return: None
    """
    self.logger.info("collect owners")
    writer = FileWriter(owner_file)
    self.oz_cursor.execute("SELECT DISTINCT app_name, app_path, user_name from WF_JOBS")
    for rec in DbUtil.dict_cursor(self.oz_cursor):
        writer.append(OozieFlowOwnerRecord(
            self.app_id,
            rec['app_path'],
            rec['user_name'],
            self.wh_exec_id))
    writer.close()
示例6: collect_flow_schedules
# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import append [as 别名]
def collect_flow_schedules(self, schedule_file):
    """
    Export recurring Azkaban triggers as flow-schedule records.

    Reads every row of ``triggers``, gunzips and JSON-decodes its ``data``
    column, and writes one AzkabanFlowScheduleRecord for each trigger whose
    first action carries a ``projectId`` and whose first checker has
    ``isRecurring == "true"``.

    Rows are skipped (not written) when ``data`` is NULL, when the payload
    is not valid JSON (the error is logged), when the first action has no
    ``projectId``, or when the trigger is not recurring. The writer is
    closed even when an exception escapes.

    :param schedule_file: destination passed straight to FileWriter
    :return: None
    """
    # load flow scheduling info from table triggers
    self.logger.info("collect flow schedule")
    schedule_writer = FileWriter(schedule_file)
    try:
        # Every record gets the same fixed end time (2099-12-31 through
        # time.mktime), so it is computed once instead of once per row.
        effective_end = int(time.mktime(datetime.date(2099, 12, 31).timetuple()))

        query = "select * from triggers"
        self.az_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.az_cursor)
        for row in rows:
            blob = row["data"]
            if blob is None:
                continue

            unzipped_content = gzip.GzipFile(
                mode="r", fileobj=StringIO.StringIO(blob.tostring())
            ).read()
            try:
                trigger = json.loads(unzipped_content)
            except Exception as e:
                # The lookups below need parsed JSON; skip the row rather
                # than index into the raw payload.
                self.logger.error(e)
                continue

            action_json = trigger["actions"][0]["actionJson"]
            if "projectId" not in action_json:
                continue

            checker_json = trigger["triggerCondition"]["checkers"][0]["checkerJson"]
            if checker_json["isRecurring"] != "true":
                continue

            unit, frequency, cron_expr = None, None, None
            period = checker_json["period"]
            # The last character of the period selects the unit from
            # self._period_unit_table; the rest is the integer frequency.
            if period is not None and period != "null" and period[-1:] in self._period_unit_table:
                unit = self._period_unit_table[period[-1:]]
                frequency = int(period[:-1])
            if "cronExpression" in checker_json:
                cron_expr = checker_json["cronExpression"]

            schedule_record = AzkabanFlowScheduleRecord(
                self.app_id,
                action_json["projectName"] + ":" + action_json["flowName"],
                unit,
                frequency,
                cron_expr,
                long(checker_json["firstCheckTime"]) / 1000,
                effective_end,
                "0",
                self.wh_exec_id,
            )
            schedule_writer.append(schedule_record)
    finally:
        schedule_writer.close()
示例7: collect_flow_owners
# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import append [as 别名]
def collect_flow_owners(self, owner_file):
    """
    Export Azkaban flow owners and their permissions.

    Joins project_flows, project_permissions and projects (active projects
    only) and appends one AzkabanFlowOwnerRecord per row to a FileWriter
    opened on ``owner_file``. The flow path is
    ``project_name + ':' + flow_id``; the owner type is 'GROUP' when
    ``isGroup == 1`` and 'LDAP' otherwise.

    :param owner_file: destination passed straight to FileWriter
    :return: None
    """
    # load user info from table project_permissions
    self.logger.info("collect owner&permissions")
    writer = FileWriter(owner_file)
    sql = (
        "select f.flow_id, p.name as project_name, p.version as project_verison, pp.name as owner, pp.permissions, pp.isGroup "
        "from project_flows f join project_permissions pp on f.project_id = pp.project_id join projects p on f.project_id = p.id where p.active = 1"
    )
    self.az_cursor.execute(sql)
    for rec in DbUtil.dict_cursor(self.az_cursor):
        flow_path = rec['project_name'] + ':' + rec["flow_id"]
        owner = rec["owner"]
        permissions = AzkabanPermission(rec["permissions"]).toFlatString()
        if rec['isGroup'] == 1:
            owner_type = 'GROUP'
        else:
            owner_type = 'LDAP'
        writer.append(AzkabanFlowOwnerRecord(
            self.app_id, flow_path, owner, permissions, owner_type, self.wh_exec_id))
    writer.close()
示例8: collect_flow_execs
# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import append [as 别名]
def collect_flow_execs(self, flow_exec_file, lookback_period):
    """
    Export recent Oozie workflow executions.

    Selects the WF_JOBS rows whose end_time lies within the last
    ``lookback_period`` minutes and appends one OozieFlowExecRecord per row
    to a FileWriter opened on ``flow_exec_file``. The workflow ``id`` is
    handed to the record twice.

    :param flow_exec_file: destination passed straight to FileWriter
    :param lookback_period: look-back window in minutes; coerced with int()
    :return: None
    """
    self.logger.info("collect flow execs")
    writer = FileWriter(flow_exec_file)
    window_minutes = int(lookback_period)
    sql = "select id, app_name, app_path, unix_timestamp(start_time) as start_time, unix_timestamp(end_time) as end_time, run, status, user_name from WF_JOBS where end_time > now() - INTERVAL %d MINUTE" % (window_minutes)
    self.oz_cursor.execute(sql)
    for rec in DbUtil.dict_cursor(self.oz_cursor):
        wf_id = rec['id']
        writer.append(OozieFlowExecRecord(
            self.app_id,
            rec['app_name'],
            rec['app_path'],
            wf_id,
            wf_id,
            rec['status'],
            rec['run'],
            rec['user_name'],
            rec['start_time'],
            rec['end_time'],
            self.wh_exec_id))
    writer.close()
示例9: collect_flow_jobs
# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import append [as 别名]
def collect_flow_jobs(self, flow_file, job_file, dag_file):
    """
    Export Oozie flows, their jobs and the edges between jobs.

    The outer query yields one row per (app_name, app_path) with
    ``max(w.id)`` as ``source_version`` plus created and last-modified
    timestamps; each row becomes an OozieFlowRecord. For every flow a
    second cursor from ``self.oz_con`` reads the WF_ACTIONS rows of that
    ``source_version``: each action becomes an OozieJobRecord, and each
    action whose ``transition`` is neither NULL nor "*" also becomes an
    OozieFlowDagRecord from the action to its transition target.

    The per-flow cursor and the three writers are released even when an
    exception escapes.

    :param flow_file: destination for flow records (passed to FileWriter)
    :param job_file: destination for job records (passed to FileWriter)
    :param dag_file: destination for DAG edge records (passed to FileWriter)
    :return: None
    """
    self.logger.info("collect flow&jobs")
    flow_writer = FileWriter(flow_file)
    job_writer = FileWriter(job_file)
    dag_writer = FileWriter(dag_file)
    try:
        # The SQL text stays flush-left so the statement sent to the database is unchanged.
        flow_query = """
SELECT a.*, b.created_time FROM
(SELECT w.app_name, w.app_path, max(w.id) as source_version, max(unix_timestamp(w.last_modified_time)) as last_modified_time
from WF_JOBS w LEFT JOIN WF_JOBS s
ON w.app_path = s.app_path AND w.created_time < s.created_time
WHERE s.created_time IS NULL GROUP BY w.app_name, w.app_path) a
JOIN
(SELECT app_path, min(unix_timestamp(created_time)) as created_time FROM WF_JOBS GROUP BY app_path) b
ON a.app_path = b.app_path
"""
        self.oz_cursor.execute(flow_query)
        rows = DbUtil.dict_cursor(self.oz_cursor)
        for row in rows:
            flow_record = OozieFlowRecord(self.app_id,
                                          row['app_name'],
                                          row['app_path'],
                                          0,
                                          row['source_version'],
                                          row['created_time'],
                                          row['last_modified_time'],
                                          self.wh_exec_id)
            flow_writer.append(flow_record)

            # NOTE(review): source_version is interpolated into the SQL text.
            # It comes from the query above, not from user input, but a bound
            # parameter would be safer if the driver's paramstyle is known.
            action_query = """
select name, type, transition from WF_ACTIONS
where wf_id = '{source_version}'
""".format(source_version=row['source_version'])
            # A separate cursor is used so the outer cursor is left untouched
            # while its rows are being iterated.
            new_oz_cursor = self.oz_con.cursor()
            try:
                new_oz_cursor.execute(action_query)
                nodes = DbUtil.dict_cursor(new_oz_cursor)
                for node in nodes:
                    job_path = row['app_path'] + "/" + node['name']
                    job_record = OozieJobRecord(self.app_id,
                                                row['app_path'],
                                                row['source_version'],
                                                node['name'],
                                                job_path,
                                                node['type'],
                                                self.wh_exec_id)
                    job_writer.append(job_record)

                    transition = node['transition']
                    if transition is not None and transition != "*":
                        dag_edge = OozieFlowDagRecord(self.app_id,
                                                      row['app_path'],
                                                      row['source_version'],
                                                      job_path,
                                                      row['app_path'] + "/" + transition,
                                                      self.wh_exec_id)
                        dag_writer.append(dag_edge)
            finally:
                new_oz_cursor.close()
    finally:
        dag_writer.close()
        job_writer.close()
        flow_writer.close()
示例10: collect_flow_owners
# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import append [as 别名]
def collect_flow_owners(self, owner_file):
self.logger.info("collect owner&permissions")
timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
self.aw_cursor.execute(timezone)
schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
self.aw_cursor.execute(schema)
user_writer = FileWriter(owner_file)
query = \
"""SELECT DISTINCT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, U.SO_USER_NAME FROM SO_JOB_TABLE J
JOIN SO_JOB_HISTORY H ON J.SO_JOB_SEQ = H.SO_JOB_SEQ
JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
WHERE J.SO_COMMAND_TYPE = 'CHAIN' """
self.aw_cursor.execute(query)
rows = DbUtil.dict_cursor(self.aw_cursor)
for row in rows:
record = AppworxFlowOwnerRecord(self.app_id,
row['SO_APPLICATION'] + ':' + row["SO_MODULE"],
row["SO_USER_NAME"],
'EXECUTE',
'GROUP',
self.wh_exec_id)
user_writer.append(record)
user_writer.close()
示例11: transform
# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import append [as 别名]
def transform(self, input, hive_metadata, hive_field_metadata):
"""
convert from json to csv
:param input: input json file
:param hive_metadata: output data file for hive table metadata
:param hive_field_metadata: output data file for hive field metadata
:return:
"""
f_json = open(input)
all_data = json.load(f_json)
f_json.close()
schema_file_writer = FileWriter(hive_metadata)
field_file_writer = FileWriter(hive_field_metadata)
lineageInfo = LineageInfo()
depends_sql = """
SELECT d.NAME DB_NAME, case when t.TBL_NAME regexp '_[0-9]+_[0-9]+_[0-9]+$'
then concat(substring(t.TBL_NAME, 1, length(t.TBL_NAME) - length(substring_index(t.TBL_NAME, '_', -3)) - 1),'_{version}')
else t.TBL_NAME
end dataset_name,
concat('/', d.NAME, '/', t.TBL_NAME) object_name,
case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
then 'dalids'
else 'hive'
end object_type,
case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
then 'View'
else
case when LOCATE('view', LOWER(t.TBL_TYPE)) > 0 then 'View'
when LOCATE('index', LOWER(t.TBL_TYPE)) > 0 then 'Index'
else 'Table'
end
end object_sub_type,
case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and t.TBL_TYPE = 'VIRTUAL_VIEW'
then 'dalids'
else 'hive'
end prefix
FROM TBLS t JOIN DBS d on t.DB_ID = d.DB_ID
WHERE d.NAME = '{db_name}' and t.TBL_NAME = '{table_name}'
"""
# one db info : 'type', 'database', 'tables'
# one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
# optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
for one_db_info in all_data:
i = 0
for table in one_db_info['tables']:
i += 1
schema_json = {}
prop_json = {} # set the prop json
for prop_name in TableInfo.optional_prop:
if prop_name in table and table[prop_name] is not None:
prop_json[prop_name] = table[prop_name]
if TableInfo.view_expended_text in prop_json:
text = prop_json[TableInfo.view_expended_text].replace('`', '')
array = HiveViewDependency.getViewDependency(text)
l = []
for a in array:
l.append(a)
names = str(a).split('.')
if names and len(names) >= 2:
db_name = names[0]
table_name = names[1]
if db_name and table_name:
rows = []
self.curs.execute(depends_sql.format(db_name=db_name, table_name=table_name, version='{version}'))
rows = self.curs.fetchall()
if rows and len(rows) > 0:
for row_index, row_value in enumerate(rows):
dependent_record = HiveDependencyInstanceRecord(
one_db_info['type'],
table['type'],
"/%s/%s" % (one_db_info['database'], table['name']),
'dalids:///' + one_db_info['database'] + '/' + table['name']
if one_db_info['type'].lower() == 'dalids'
else 'hive:///' + one_db_info['database'] + '/' + table['name'],
'depends on',
'is used by',
row_value[3],
row_value[4],
row_value[2],
row_value[5] + ':///' + row_value[0] + '/' + row_value[1], '')
self.instance_writer.append(dependent_record)
prop_json['view_depends_on'] = l
self.instance_writer.flush()
# process either schema
flds = {}
field_detail_list = []
if TableInfo.schema_literal in table and table[TableInfo.schema_literal] is not None:
sort_id = 0
urn = "hive:///%s/%s" % (one_db_info['database'], table['name'])
try:
schema_data = json.loads(table[TableInfo.schema_literal])
schema_json = schema_data
acp = AvroColumnParser(schema_data, urn = urn)
#.........这里部分代码省略.........
示例12: transform
# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import append [as 别名]
def transform(self, input, hive_instance, hive_metadata, hive_field_metadata, view_dependency):
"""
convert from json to csv
:param input: input json file
:param hive_instance: output data file for hive instance
:param hive_metadata: output data file for hive table metadata
:param hive_field_metadata: output data file for hive field metadata
:return:
"""
all_data = []
with open(input) as input_file:
for line in input_file:
all_data.append(json.loads(line))
dataset_idx = -1
instance_file_writer = FileWriter(hive_instance)
schema_file_writer = FileWriter(hive_metadata)
field_file_writer = FileWriter(hive_field_metadata)
dependency_file_writer = FileWriter(view_dependency)
depends_sql = """
SELECT d.NAME DB_NAME, case when t.TBL_NAME regexp '_[0-9]+_[0-9]+_[0-9]+$'
then concat(substring(t.TBL_NAME, 1, length(t.TBL_NAME) - length(substring_index(t.TBL_NAME, '_', -3)) - 1),'_{version}')
else t.TBL_NAME
end dataset_name,
concat('/', d.NAME, '/', t.TBL_NAME) object_name,
case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
then 'dalids'
else 'hive'
end object_type,
case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
then 'View'
else
case when LOCATE('view', LOWER(t.TBL_TYPE)) > 0 then 'View'
when LOCATE('index', LOWER(t.TBL_TYPE)) > 0 then 'Index'
else 'Table'
end
end object_sub_type,
case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and t.TBL_TYPE = 'VIRTUAL_VIEW'
then 'dalids'
else 'hive'
end prefix
FROM TBLS t JOIN DBS d on t.DB_ID = d.DB_ID
WHERE d.NAME = '{db_name}' and t.TBL_NAME = '{table_name}'
"""
# one db info : 'type', 'database', 'tables'
# one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
# optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
for one_db_info in all_data:
i = 0
for table in one_db_info['tables']:
i += 1
schema_json = {}
prop_json = {} # set the prop json
for prop_name in TableInfo.optional_prop:
if prop_name in table and table[prop_name] is not None:
prop_json[prop_name] = table[prop_name]
view_expanded_text = ''
if TableInfo.view_expended_text in prop_json:
view_expanded_text = prop_json[TableInfo.view_expended_text]
text = prop_json[TableInfo.view_expended_text].replace('`', '') # this will be fixed after switching to Hive AST
array = []
try:
array = HiveViewDependency.getViewDependency(text)
except:
self.logger.error("HiveViewDependency.getViewDependency(%s) failed!" % (table['name']))
l = []
for a in array:
l.append(a)
names = str(a).split('.')
if names and len(names) >= 2:
db_name = names[0].lower()
table_name = names[1].lower()
if db_name and table_name:
self.curs.execute(depends_sql.format(db_name=db_name, table_name=table_name, version='{version}'))
rows = self.curs.fetchall()
self.conn_hms.commit()
if rows and len(rows) > 0:
for row_index, row_value in enumerate(rows):
dependent_record = HiveDependencyInstanceRecord(
one_db_info['type'],
table['type'],
"/%s/%s" % (one_db_info['database'], table['name']),
'dalids:///' + one_db_info['database'] + '/' + table['dataset_name']
if one_db_info['type'].lower() == 'dalids'
else 'hive:///' + one_db_info['database'] + '/' + table['dataset_name'],
'depends on',
'Y',
row_value[3],
row_value[4],
row_value[2],
row_value[5] + ':///' + row_value[0] + '/' + row_value[1], '')
dependency_file_writer.append(dependent_record)
prop_json['view_depends_on'] = l
#.........这里部分代码省略.........
示例13: transform
# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import append [as 别名]
def transform(self, input, hive_metadata, hive_field_metadata):
    """
    convert from json to csv

    Loads the whole ``input`` JSON document, then for every table of every
    database entry writes one DatasetSchemaRecord to ``hive_metadata`` and
    one DatasetFieldRecord per parsed field to ``hive_field_metadata``.
    Fields come from the table's Avro schema literal when one is present
    (parsed with AvroColumnParser), otherwise from its field list (parsed
    with HiveColumnParser). Both writers are flushed after each database
    entry and closed even when an exception escapes.

    :param input: input json file
    :param hive_metadata: output data file for hive table metadata
    :param hive_field_metadata: output data file for hive field metadata
    :return: None
    """
    # `with` guarantees the input handle is released even if json.load raises.
    with open(input) as f_json:
        all_data = json.load(f_json)

    schema_file_writer = FileWriter(hive_metadata)
    field_file_writer = FileWriter(hive_field_metadata)
    try:
        # NOTE(review): this instance is never used below; it is kept only in
        # case constructing LineageInfo has side effects - confirm and remove.
        lineageInfo = LineageInfo()

        # one db info : 'type', 'database', 'tables'
        # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
        # optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
        for one_db_info in all_data:
            table_count = 0
            for table in one_db_info['tables']:
                table_count += 1
                schema_json = {}

                # set the prop json: copy every optional property that is present and not null
                prop_json = {}
                for prop_name in TableInfo.optional_prop:
                    if prop_name in table and table[prop_name] is not None:
                        prop_json[prop_name] = table[prop_name]

                if TableInfo.view_expended_text in prop_json:
                    text = prop_json[TableInfo.view_expended_text].replace('`', '')
                    prop_json['view_depends_on'] = list(HiveViewDependency.getViewDependency(text))

                urn = "hive:///%s/%s" % (one_db_info['database'], table['name'])

                # process either schema
                flds = {}  # never populated in this method; serialized as an empty object
                field_detail_list = []
                if TableInfo.schema_literal in table and table[TableInfo.schema_literal] is not None:
                    try:
                        schema_data = json.loads(table[TableInfo.schema_literal])
                        schema_json = schema_data
                        acp = AvroColumnParser(schema_data, urn = urn)
                        field_detail_list += acp.get_column_list_result()
                    except ValueError:
                        # A bad schema literal is logged; the table is still
                        # written, just without schema and fields.
                        self.logger.error("Schema json error for table : \n" + str(table))
                elif TableInfo.field_list in table:
                    # Convert to avro
                    hcp = HiveColumnParser(table, urn = urn)
                    schema_json = {'fields' : hcp.column_type_dict['fields'], 'type' : 'record', 'name' : table['name'], 'uri' : urn}
                    field_detail_list += hcp.column_type_list

                # dict.get replaces the Python-2-only has_key checks: a missing key yields None.
                dataset_schema_record = DatasetSchemaRecord(table['name'], json.dumps(schema_json), json.dumps(prop_json),
                                                            json.dumps(flds),
                                                            urn, 'Hive',
                                                            '', table.get(TableInfo.create_time),
                                                            table.get("lastAlterTime"))
                schema_file_writer.append(dataset_schema_record)

                for fields in field_detail_list:
                    field_file_writer.append(DatasetFieldRecord(fields))

            schema_file_writer.flush()
            field_file_writer.flush()
            self.logger.info("%20s contains %6d tables" % (one_db_info['database'], table_count))
    finally:
        schema_file_writer.close()
        field_file_writer.close()
示例14: collect_flow_jobs
# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import append [as 别名]
def collect_flow_jobs(self, flow_file, job_file, dag_file):
    """
    Export Azkaban flows, their jobs and the edges between jobs.

    Reads the project_flows rows of active projects (joined on project id
    and version), gunzips and JSON-decodes each row's ``json`` column, and
    writes one AzkabanFlowRecord per row, one AzkabanJobRecord per entry of
    ``nodes`` and one AzkabanFlowDagRecord per entry of ``edges``. Jobs of
    type 'flow' additionally get their referenced flow path set from
    ``embeddedFlowId``. All writers are flushed every 1000 processed rows.

    A row whose ``json`` column cannot be decoded is logged and skipped.
    The three writers are closed even when an exception escapes.

    :param flow_file: destination for flow records (passed to FileWriter)
    :param job_file: destination for job records (passed to FileWriter)
    :param dag_file: destination for DAG edge records (passed to FileWriter)
    :return: None
    """
    self.logger.info("collect flow&jobs")
    query = "SELECT distinct f.*, p.name as project_name FROM project_flows f inner join projects p on f.project_id = p.id and f.version = p.version where p.active = 1"
    self.az_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.az_cursor)
    flow_writer = FileWriter(flow_file)
    job_writer = FileWriter(job_file)
    dag_writer = FileWriter(dag_file)
    try:
        row_count = 0
        for row in rows:
            # A NULL version is treated as 0.
            version = 0 if row["version"] is None else row["version"]
            flow_path = row['project_name'] + ":" + row['flow_id']

            unzipped_content = gzip.GzipFile(
                mode='r', fileobj=StringIO.StringIO(row['json'].tostring())).read()
            try:
                flow_json = json.loads(unzipped_content)
            except ValueError as e:
                # The 'nodes'/'edges' lookups below need parsed JSON, so an
                # undecodable flow is reported and skipped.
                self.logger.error("Invalid flow json for %s: %s" % (flow_path, e))
                continue

            flow_record = AzkabanFlowRecord(self.app_id,
                                            row['flow_id'],
                                            row['project_name'],
                                            flow_path,
                                            0,
                                            row['modified_time'] / 1000,
                                            version,
                                            'Y',
                                            self.wh_exec_id)
            flow_writer.append(flow_record)

            # get flow jobs
            for node in flow_json['nodes']:
                job_record = AzkabanJobRecord(self.app_id,
                                              flow_path,
                                              version,
                                              node['id'],
                                              flow_path + '/' + node['id'],
                                              node['jobType'],
                                              'Y',
                                              self.wh_exec_id)
                if node['jobType'] == 'flow':
                    job_record.setRefFlowPath(row['project_name'] + ":" + node['embeddedFlowId'])
                job_writer.append(job_record)

            # job dag
            for edge in flow_json['edges']:
                dag_edge = AzkabanFlowDagRecord(self.app_id,
                                                flow_path,
                                                version,
                                                flow_path + '/' + edge['source'],
                                                flow_path + '/' + edge['target'],
                                                self.wh_exec_id)
                dag_writer.append(dag_edge)

            row_count += 1
            if row_count % 1000 == 0:
                flow_writer.flush()
                job_writer.flush()
                dag_writer.flush()
    finally:
        flow_writer.close()
        job_writer.close()
        dag_writer.close()
示例15: __init__
# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import append [as 别名]
class MultiproductLoad:
def __init__(self):
    """
    Set up the logger, ids, output writers and empty caches.

    Reads the application id, execution id and the three output targets
    from the module-level ``args`` mapping (keyed by ``Constant.*_KEY``),
    opens one FileWriter per target, and silences urllib3 warnings through
    ``requests.packages.urllib3.disable_warnings()``.
    """
    logger_name = 'jython script : ' + self.__class__.__name__
    self.logger = LoggerFactory.getLogger(logger_name)
    requests.packages.urllib3.disable_warnings()

    # Identifiers stamped on the records this loader produces.
    app_id_arg = args[Constant.APP_ID_KEY]
    self.app_id = int(app_id_arg)
    exec_id_arg = args[Constant.WH_EXEC_ID_KEY]
    self.wh_exec_id = long(exec_id_arg)

    # One writer per output target.
    self.project_writer = FileWriter(args[Constant.GIT_PROJECT_OUTPUT_KEY])
    self.repo_writer = FileWriter(args[Constant.PRODUCT_REPO_OUTPUT_KEY])
    self.repo_owner_writer = FileWriter(args[Constant.PRODUCT_REPO_OWNER_OUTPUT_KEY])

    # Caches, all starting out empty.
    self.multiproduct = dict()
    self.git_repo = dict()
    self.product_repo = list()
def get_multiproducts(self):
    '''
    fetch all products and owners of Multiproduct

    Issues a GET against ``args[Constant.MULTIPRODUCT_SERVICE_URL]`` and,
    when the response is JSON, fills ``self.multiproduct`` with one entry
    per product, keyed by ``"<scm>:<repo full name>"``. The repo full name
    is extracted from the product's trunk URI with an scm-specific regex.

    Products are skipped (with a debug log line) when their scm type is
    neither 'git' nor 'svn', or when the trunk URI is missing or does not
    match the expected pattern.

    :return: None
    :raises Exception: when the service does not answer with HTTP 200
    '''
    # NOTE(review): verify=False turns off TLS certificate verification for this request.
    resp = requests.get(args[Constant.MULTIPRODUCT_SERVICE_URL], verify=False)
    if resp.status_code != 200:
        # This means something went wrong.
        raise Exception('Request Error', 'GET /api/v1/mpl {}'.format(resp.status_code))

    # scm type -> pattern whose group(1) is the repo full name
    repo_name_patterns = {
        'git': re.compile(r":(.*)\.git$"),
        'svn': re.compile(r"/(.*)/trunk$"),
    }

    if resp.headers['content-type'].split(';')[0] == 'application/json':
        for product_name, product_info in resp.json()['products'].items():
            scm_type = product_info["scm"]["name"]
            pattern = repo_name_patterns.get(scm_type)
            if pattern is None:
                # Without this guard repo_key/repo_fullname would be unbound,
                # or silently reused from the previous product.
                self.logger.debug("Unsupported scm type {} for product {}".format(scm_type, product_name))
                continue

            try:
                repo_fullname = pattern.search(product_info["uris"]["trunk"]).group(1)
            except (AttributeError, KeyError, TypeError):
                # AttributeError: no regex match; KeyError/TypeError: missing or malformed uris.
                self.logger.debug("Error parsing repo full name {} - {}".format(product_name, product_info["uris"]))
                continue
            repo_key = scm_type + ':' + repo_fullname

            self.multiproduct[repo_key] = {
                "scm_repo_fullname": repo_fullname,
                "scm_type": scm_type,
                "multiproduct_name": product_name,
                "product_type": product_info["type"],
                "namespace": product_info["org"],
                "owner_name": ",".join(product_info["owners"]),
                "product_version": product_info["product-version"]
            }
    self.logger.info("Fetched {} Multiproducts".format(len(self.multiproduct)))
def get_project_repo(self):
'''
fetch detail and repos of all git projects
'''
re_git_project_name = re.compile(r"(.*)/(.*)$")
re_git_repo_name = re.compile(r"git://[\w\.-]+/(.*)\.git$")
project_nonexist = []
project_names = {}
for key, product in self.multiproduct.iteritems():
if product["scm_type"] == 'svn':
continue
project_name = re_git_project_name.search(product['scm_repo_fullname']).group(1)
if project_name in project_names:
continue
project_url = '{}/{}?format=xml'.format(args[Constant.GIT_URL_PREFIX], project_name)
try:
resp = requests.get(project_url, verify=False)
except Exception as ex:
self.logger.info("Error getting /{}.xml - {}".format(project_name, ex.message))
continue
if resp.status_code != 200:
# This means something went wrong.
self.logger.debug('Request Error: GET /{}.xml {}'.format(project_name, resp.status_code))
project_nonexist.append(project_name)
continue
# print resp.content
if resp.headers['content-type'].split(';')[0] == 'application/xml':
xml = ET.fromstring(resp.content)
current_project = MultiproductProjectRecord(
self.app_id,
xml.find('slug').text,
'git',
xml.find('owner').attrib['kind'],
xml.find('owner').text,
xml.find('created-at').text,
xml.find('license').text,
self.trim_newline(xml.find('description').text),
self.wh_exec_id
)
project_repo_names = []
for repo in xml.findall('repositories/mainlines/repository'):
#.........这里部分代码省略.........