当前位置: 首页>>代码示例>>Python>>正文


Python FileWriter.close方法代码示例

本文整理汇总了Python中wherehows.common.writers.FileWriter.close方法的典型用法代码示例。如果您正苦于以下问题:Python FileWriter.close方法的具体用法?Python FileWriter.close怎么用?Python FileWriter.close使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在wherehows.common.writers.FileWriter的用法示例。


在下文中一共展示了FileWriter.close方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: collect_flow_schedules

# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import close [as 别名]
  def collect_flow_schedules(self, schedule_file):
    """
    Export the schedules of running Oozie coordinator jobs.

    Runs a query joining COORD_JOBS, COORD_ACTIONS and WF_JOBS, restricted to
    coordinators whose status is 'RUNNING', and appends one
    OozieFlowScheduleRecord per returned row to a FileWriter created from
    schedule_file.

    :param schedule_file: argument handed to FileWriter for the schedule output
    :return: None
    """
    self.logger.info("collect flow schedule")
    query = """
            SELECT DISTINCT cj.id as ref_id, cj.frequency, cj.time_unit,
            unix_timestamp(cj.start_time) as start_time, unix_timestamp(cj.end_time) as end_time,
            wj.app_path
            FROM COORD_JOBS cj JOIN COORD_ACTIONS ca ON ca.job_id = cj.id JOIN WF_JOBS wj ON ca.external_id = wj.id
            WHERE cj.status = 'RUNNING'
            """
    schedule_writer = FileWriter(schedule_file)
    try:
      self.oz_cursor.execute(query)
      for row in DbUtil.dict_cursor(self.oz_cursor):
        schedule_record = OozieFlowScheduleRecord(
          self.app_id,
          row['app_path'],
          row['time_unit'],
          str(row['frequency']),
          None,
          row['start_time'],
          row['end_time'],
          row['ref_id'],
          self.wh_exec_id)
        schedule_writer.append(schedule_record)
    finally:
      # Close on every path so a failing query or record does not leak the writer.
      schedule_writer.close()
开发者ID:alyiwang,项目名称:WhereHows,代码行数:28,代码来源:OozieExtract.py

示例2: collect_job_execs

# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import close [as 别名]
  def collect_job_execs(self, job_exec_file, lookback_period):
    """
    Export Oozie action (job) executions of recently finished workflows.

    Selects WF_ACTIONS rows joined to WF_JOBS where the workflow's end_time is
    within the last lookback_period minutes, and appends one
    OozieJobExecRecord per row to a FileWriter created from job_exec_file.

    :param job_exec_file: argument handed to FileWriter for the job-exec output
    :param lookback_period: look-back window in minutes; converted with int()
                            before being formatted into the SQL text
    :return: None
    """
    self.logger.info("collect job execs")
    query = """
            select  a.id as job_exec_id, a.name as job_name, j.id as flow_exec_id, a.status, a.user_retry_count,
            unix_timestamp(a.start_time) start_time, unix_timestamp(a.end_time) end_time,
            j.app_name as jname, j.app_path, transition from WF_ACTIONS a JOIN WF_JOBS j on a.wf_id = j.id where j.end_time > now() - INTERVAL %d MINUTE
            """ % (int(lookback_period))
    job_exec_writer = FileWriter(job_exec_file)
    try:
      self.oz_cursor.execute(query)
      for row in DbUtil.dict_cursor(self.oz_cursor):
        job_exec_record = OozieJobExecRecord(
          self.app_id,
          row['app_path'],
          # the workflow id is passed in two consecutive positions
          row['flow_exec_id'],
          row['flow_exec_id'],
          row['job_name'],
          row['app_path'] + "/" + row['job_name'],
          row['job_exec_id'],
          row['status'],
          row['user_retry_count'],
          row['start_time'],
          row['end_time'],
          self.wh_exec_id)
        job_exec_writer.append(job_exec_record)
    finally:
      # Close on every path so a failing query or record does not leak the writer.
      job_exec_writer.close()
开发者ID:alyiwang,项目名称:WhereHows,代码行数:28,代码来源:OozieExtract.py

示例3: collect_flow_schedules

# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import close [as 别名]
 def collect_flow_schedules(self, schedule_file):
   """
   Export active Appworx chain schedules.

   Sets the session time zone to US/Pacific and the current schema to APPWORX,
   then reads SO_JOB_TABLE joined with AW_MODULE_SCHED for jobs whose command
   type is 'CHAIN' and whose schedule is active ('Y'). One
   AppworxFlowScheduleRecord per row is appended to a FileWriter created from
   schedule_file.

   :param schedule_file: argument handed to FileWriter for the schedule output
   :return: None
   """
   # load flow scheduling info from SO_JOB_TABLE joined with AW_MODULE_SCHED
   self.logger.info("collect flow schedule")
   timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
   self.aw_cursor.execute(timezone)
   schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
   self.aw_cursor.execute(schema)
   query = \
       """SELECT J.SO_APPLICATION, J.SO_MODULE, S.AW_SCH_NAME, S.AW_SCH_INTERVAL, S.AW_ACTIVE,
          ROUND((cast((FROM_TZ(CAST(S.AW_SCH_START as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
          to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as EFFECT_STARTED,
          ROUND((cast((FROM_TZ(CAST(S.AW_SCH_END as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
          to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as EFFECT_END
          FROM SO_JOB_TABLE J
          JOIN AW_MODULE_SCHED S ON J.SO_JOB_SEQ = S.AW_JOB_SEQ
          WHERE J.SO_COMMAND_TYPE = 'CHAIN' AND S.AW_ACTIVE = 'Y' """
   schedule_writer = FileWriter(schedule_file)
   try:
     self.aw_cursor.execute(query)
     for row in DbUtil.dict_cursor(self.aw_cursor):
       schedule_record = AppworxFlowScheduleRecord(
         self.app_id,
         row['SO_APPLICATION'] + ":" + row['SO_MODULE'],
         row['AW_SCH_NAME'],
         int(row['AW_SCH_INTERVAL']),
         # NOTE(review): long() raises if the column is NULL - confirm both
         # schedule bounds are always populated for active schedules.
         long(row['EFFECT_STARTED']),
         long(row['EFFECT_END']),
         '0',
         self.wh_exec_id)
       schedule_writer.append(schedule_record)
   finally:
     # Close on every path so a failing query or record does not leak the writer.
     schedule_writer.close()
开发者ID:alyiwang,项目名称:WhereHows,代码行数:33,代码来源:AppworxExtract.py

示例4: collect_flow_execs

# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import close [as 别名]
  def collect_flow_execs(self, flow_exec_file, job_exec_file, look_back_period):
    """
    Export recent Azkaban flow executions and the job executions inside them.

    Reads execution_flows rows whose end_time falls within the last
    look_back_period minutes, gunzips and JSON-decodes each row's flow_data
    column, and appends one AzkabanFlowExecRecord per flow plus one
    AzkabanJobExecRecord per entry of the flow's 'nodes' list. Rows whose
    payload is not valid JSON are logged and skipped.

    :param flow_exec_file: argument handed to FileWriter for flow executions
    :param job_exec_file: argument handed to FileWriter for job executions
    :param look_back_period: look-back window in minutes; converted with int()
                             before being formatted into the SQL text
    :return: None
    """
    self.logger.info( "collect flow&job executions")
    cmd = """select * from execution_flows where end_time > UNIX_TIMESTAMP(now() - INTERVAL %d MINUTE) * 1000 """ % (int(look_back_period))
    flow_exec_writer = FileWriter(flow_exec_file)
    job_exec_writer = FileWriter(job_exec_file)
    try:
      self.az_cursor.execute(cmd)
      row_count = 0
      for row in DbUtil.dict_cursor(self.az_cursor):
        unzipped_content = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(row['flow_data'].tostring())).read()
        try:
          flow_data = json.loads(unzipped_content)
        except ValueError as e:
          # The raw column value cannot be indexed like the decoded document,
          # so log the failure and move on to the next execution.
          self.logger.error(e)
          continue

        flow_path = flow_data['projectName'] + ":" + flow_data['flowId']
        # Times are divided by 1000; the query compares end_time against
        # UNIX_TIMESTAMP * 1000, so they appear to be stored in milliseconds.
        flow_exec_record = AzkabanFlowExecRecord(
          self.app_id,
          flow_data['flowId'],
          flow_path,
          row['version'],
          row['exec_id'],
          flow_data['status'],
          flow_data['attempt'],
          row['submit_user'],
          long(row['start_time']) / 1000,
          long(row['end_time']) / 1000,
          self.wh_exec_id)
        flow_exec_writer.append(flow_exec_record)

        job_exec_records = []
        for node in flow_data['nodes']:
          job_exec_records.append(AzkabanJobExecRecord(
            self.app_id,
            flow_path,
            row['version'],
            row['exec_id'],
            node['id'],
            flow_path + "/" + node['id'],
            None,
            node['status'],
            node['attempt'],
            long(node['startTime']) / 1000,
            long(node['endTime']) / 1000,
            self.wh_exec_id))

        # sortAndSet receives the whole flow's job records before any is written
        AzkabanJobExecUtil.sortAndSet(job_exec_records)
        for r in job_exec_records:
          job_exec_writer.append(r)

        row_count += 1
        if row_count % 10000 == 0:
          # periodic flush every 10000 processed flows
          flow_exec_writer.flush()
          job_exec_writer.flush()
    finally:
      # Close both writers on every path; the nested finally guarantees the
      # second close still runs if the first one raises.
      try:
        flow_exec_writer.close()
      finally:
        job_exec_writer.close()
开发者ID:alyiwang,项目名称:WhereHows,代码行数:60,代码来源:AzkabanExtract.py

示例5: collect_flow_owners

# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import close [as 别名]
  def collect_flow_owners(self, owner_file):
    """
    Export the owners of Oozie workflows.

    Selects the distinct (app_name, app_path, user_name) combinations from
    WF_JOBS and appends one OozieFlowOwnerRecord per row, built from app_path
    and user_name, to a FileWriter created from owner_file.

    :param owner_file: argument handed to FileWriter for the owner output
    :return: None
    """
    self.logger.info("collect owners")
    query = "SELECT DISTINCT app_name, app_path, user_name from WF_JOBS"
    owner_writer = FileWriter(owner_file)
    try:
      self.oz_cursor.execute(query)
      for row in DbUtil.dict_cursor(self.oz_cursor):
        owner_record = OozieFlowOwnerRecord(
          self.app_id,
          row['app_path'],
          row['user_name'],
          self.wh_exec_id)
        owner_writer.append(owner_record)
    finally:
      # Close on every path so a failing query or record does not leak the writer.
      owner_writer.close()
开发者ID:alyiwang,项目名称:WhereHows,代码行数:16,代码来源:OozieExtract.py

示例6: collect_flow_schedules

# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import close [as 别名]
    def collect_flow_schedules(self, schedule_file):
        """
        Export recurring Azkaban flow schedules read from the triggers table.

        Each trigger row's "data" column is gunzipped and JSON-decoded. A
        schedule record is written only when the first action's actionJson
        has a "projectId" and the first checker's checkerJson has
        "isRecurring" equal to the string "true". Rows with no data, or with
        data that is not valid JSON, are skipped (the latter is logged).

        :param schedule_file: argument handed to FileWriter for the schedule output
        :return: None
        """
        # load flow scheduling info from table triggers
        self.logger.info("collect flow schedule")
        query = "select * from triggers"
        # Every record gets the same fixed end time (2099-12-31 converted with
        # time.mktime), so compute it once instead of once per row.
        effective_end = int(time.mktime(datetime.date(2099, 12, 31).timetuple()))
        schedule_writer = FileWriter(schedule_file)
        try:
            self.az_cursor.execute(query)
            for row in DbUtil.dict_cursor(self.az_cursor):
                raw_data = row["data"]
                if raw_data is None:
                    continue

                unzipped_content = gzip.GzipFile(
                    mode="r", fileobj=StringIO.StringIO(raw_data.tostring())
                ).read()
                try:
                    trigger = json.loads(unzipped_content)
                except ValueError as e:
                    # The raw column value cannot be indexed like the decoded
                    # document, so log the failure and skip this trigger.
                    self.logger.error(e)
                    continue

                action_json = trigger["actions"][0]["actionJson"]
                if "projectId" not in action_json:
                    continue

                checker_json = trigger["triggerCondition"]["checkers"][0]["checkerJson"]
                if checker_json["isRecurring"] != "true":
                    continue

                unit, frequency, cron_expr = None, None, None
                period = checker_json["period"]
                # period is split into a numeric prefix and a one-character
                # unit suffix looked up in self._period_unit_table
                if period is not None and period != "null" and period[-1:] in self._period_unit_table:
                    unit = self._period_unit_table[period[-1:]]
                    frequency = int(period[:-1])
                if "cronExpression" in checker_json:
                    cron_expr = checker_json["cronExpression"]

                schedule_record = AzkabanFlowScheduleRecord(
                    self.app_id,
                    action_json["projectName"] + ":" + action_json["flowName"],
                    unit,
                    frequency,
                    cron_expr,
                    long(checker_json["firstCheckTime"]) / 1000,
                    effective_end,
                    "0",
                    self.wh_exec_id,
                )
                schedule_writer.append(schedule_record)
        finally:
            # Close on every path so a failing query or record does not leak the writer.
            schedule_writer.close()
开发者ID:linkedin,项目名称:WhereHows,代码行数:51,代码来源:AzkabanExtract.py

示例7: collect_flow_owners

# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import close [as 别名]
  def collect_flow_owners(self, owner_file):
    """
    Export Azkaban flow owners and their permissions.

    Joins project_flows, project_permissions and projects (active projects
    only) and appends one AzkabanFlowOwnerRecord per row to a FileWriter
    created from owner_file. The owner type is 'GROUP' when the isGroup column
    equals 1 and 'LDAP' otherwise.

    :param owner_file: argument handed to FileWriter for the owner output
    :return: None
    """
    # load user info from table project_permissions
    self.logger.info("collect owner&permissions")
    query = "select f.flow_id, p.name as project_name, p.version as project_verison, pp.name as owner, pp.permissions, pp.isGroup " \
            "from project_flows f join project_permissions pp on f.project_id = pp.project_id join projects p on f.project_id = p.id where p.active = 1"
    user_writer = FileWriter(owner_file)
    try:
      self.az_cursor.execute(query)
      for row in DbUtil.dict_cursor(self.az_cursor):
        owner_type = 'GROUP' if row['isGroup'] == 1 else 'LDAP'
        record = AzkabanFlowOwnerRecord(
          self.app_id,
          row['project_name'] + ':' + row["flow_id"],
          row["owner"],
          AzkabanPermission(row["permissions"]).toFlatString(),
          owner_type,
          self.wh_exec_id)
        user_writer.append(record)
    finally:
      # Close on every path so a failing query or record does not leak the writer.
      user_writer.close()
开发者ID:alyiwang,项目名称:WhereHows,代码行数:20,代码来源:AzkabanExtract.py

示例8: collect_flow_jobs

# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import close [as 别名]
  def collect_flow_jobs(self, flow_file, job_file, dag_file):
    """
    Export Oozie flows, their jobs (actions) and the DAG edges between jobs.

    The outer query returns one row per (app_name, app_path) with the maximum
    workflow id as source_version, plus the earliest created_time per
    app_path. For each such row a second cursor reads that workflow's
    WF_ACTIONS; every action becomes an OozieJobRecord, and every action whose
    transition is neither "*" nor NULL also becomes an OozieFlowDagRecord.

    :param flow_file: argument handed to FileWriter for flow records
    :param job_file: argument handed to FileWriter for job records
    :param dag_file: argument handed to FileWriter for DAG edge records
    :return: None
    """
    self.logger.info("collect flow&jobs")
    query = """
            SELECT a.*, b.created_time FROM
              (SELECT w.app_name, w.app_path, max(w.id) as source_version, max(unix_timestamp(w.last_modified_time)) as last_modified_time
              from WF_JOBS w LEFT JOIN WF_JOBS s
              ON w.app_path = s.app_path AND w.created_time < s.created_time
              WHERE s.created_time IS NULL GROUP BY w.app_name, w.app_path) a
              JOIN
              (SELECT app_path, min(unix_timestamp(created_time)) as created_time FROM WF_JOBS GROUP BY app_path) b
              ON a.app_path = b.app_path
            """
    # Template for the per-flow action lookup; formatted once per flow row.
    action_query = """
              select name, type, transition from WF_ACTIONS
              where wf_id = '{source_version}'
              """
    flow_writer = FileWriter(flow_file)
    job_writer = FileWriter(job_file)
    dag_writer = FileWriter(dag_file)
    try:
      self.oz_cursor.execute(query)
      for row in DbUtil.dict_cursor(self.oz_cursor):
        flow_record = OozieFlowRecord(
          self.app_id,
          row['app_name'],
          row['app_path'],
          0,
          row['source_version'],
          row['created_time'],
          row['last_modified_time'],
          self.wh_exec_id)
        flow_writer.append(flow_record)

        # A separate cursor is used so the outer result set keeps iterating.
        new_oz_cursor = self.oz_con.cursor()
        try:
          new_oz_cursor.execute(action_query.format(source_version=row['source_version']))
          for node in DbUtil.dict_cursor(new_oz_cursor):
            job_path = row['app_path'] + "/" + node['name']
            job_record = OozieJobRecord(
              self.app_id,
              row['app_path'],
              row['source_version'],
              node['name'],
              job_path,
              node['type'],
              self.wh_exec_id)
            job_writer.append(job_record)

            if node['transition'] != "*" and node['transition'] is not None:
              dag_edge = OozieFlowDagRecord(
                self.app_id,
                row['app_path'],
                row['source_version'],
                job_path,
                row['app_path'] + "/" + node['transition'],
                self.wh_exec_id)
              dag_writer.append(dag_edge)
        finally:
          # Release the per-flow cursor even when a record fails.
          new_oz_cursor.close()
    finally:
      # Close all three writers on every path, in the original order; the
      # nesting guarantees later closes still run if an earlier one raises.
      try:
        dag_writer.close()
      finally:
        try:
          job_writer.close()
        finally:
          flow_writer.close()
开发者ID:alyiwang,项目名称:WhereHows,代码行数:61,代码来源:OozieExtract.py

示例9: collect_flow_execs

# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import close [as 别名]
  def collect_flow_execs(self, flow_exec_file, lookback_period):
    """
    Export recently finished Oozie workflow executions.

    Selects WF_JOBS rows whose end_time is within the last lookback_period
    minutes and appends one OozieFlowExecRecord per row to a FileWriter
    created from flow_exec_file.

    :param flow_exec_file: argument handed to FileWriter for the flow-exec output
    :param lookback_period: look-back window in minutes; converted with int()
                            before being formatted into the SQL text
    :return: None
    """
    self.logger.info("collect flow execs")
    query = "select id, app_name, app_path, unix_timestamp(start_time) as start_time, unix_timestamp(end_time) as end_time, run, status, user_name from WF_JOBS where end_time > now() - INTERVAL %d MINUTE" % (int(lookback_period))
    flow_exec_writer = FileWriter(flow_exec_file)
    try:
      self.oz_cursor.execute(query)
      for row in DbUtil.dict_cursor(self.oz_cursor):
        flow_exec_record = OozieFlowExecRecord(
          self.app_id,
          row['app_name'],
          row['app_path'],
          # the workflow id is passed in two consecutive positions
          row['id'],
          row['id'],
          row['status'],
          row['run'],
          row['user_name'],
          row['start_time'],
          row['end_time'],
          self.wh_exec_id)
        flow_exec_writer.append(flow_exec_record)
    finally:
      # Close on every path so a failing query or record does not leak the writer.
      flow_exec_writer.close()
开发者ID:alyiwang,项目名称:WhereHows,代码行数:24,代码来源:OozieExtract.py

示例10: collect_flow_owners

# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import close [as 别名]
  def collect_flow_owners(self, owner_file):
    """
    Export the owners of Appworx chains.

    Sets the session time zone to US/Pacific and the current schema to
    APPWORX, then selects the distinct job/user combinations from SO_JOB_TABLE
    joined with SO_JOB_HISTORY and SO_USER_TABLE for jobs whose command type
    is 'CHAIN'. One AppworxFlowOwnerRecord per row is appended to a FileWriter
    created from owner_file; every record is written with the fixed values
    'EXECUTE' and 'GROUP'.

    :param owner_file: argument handed to FileWriter for the owner output
    :return: None
    """
    self.logger.info("collect owner&permissions")
    timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
    self.aw_cursor.execute(timezone)
    schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
    self.aw_cursor.execute(schema)
    query = \
        """SELECT DISTINCT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, U.SO_USER_NAME FROM SO_JOB_TABLE J
             JOIN SO_JOB_HISTORY H ON J.SO_JOB_SEQ = H.SO_JOB_SEQ
             JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
             WHERE J.SO_COMMAND_TYPE = 'CHAIN' """
    user_writer = FileWriter(owner_file)
    try:
      self.aw_cursor.execute(query)
      for row in DbUtil.dict_cursor(self.aw_cursor):
        record = AppworxFlowOwnerRecord(
          self.app_id,
          row['SO_APPLICATION'] + ':' + row["SO_MODULE"],
          row["SO_USER_NAME"],
          'EXECUTE',
          'GROUP',
          self.wh_exec_id)
        user_writer.append(record)
    finally:
      # Close on every path so a failing query or record does not leak the writer.
      user_writer.close()
开发者ID:alyiwang,项目名称:WhereHows,代码行数:26,代码来源:AppworxExtract.py

示例11: transform

# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import close [as 别名]

#.........这里部分代码省略.........
            # o_field_namespace = f['type'][effective_type_index_in_type]['namespace']
            current_field_path = o_field_name if parent_field_path == '' else parent_field_path + '.' + o_field_name
            fields_json_to_csv(output_list_, current_field_path, f['type'][effective_type_index_in_type]['fields'])

            # End of function

    for line in input_json_file:
      try:
        j = json.loads(line)
      except:
        self.logger.error("    Invalid JSON:\n%s" % line)
        continue

      i += 1
      o_field_list_ = []
      parent_field_path = ''
      self.sort_id = 0

      if not (j.has_key('attributes_json') or j.has_key('attributes')):
        o_properties = {"doc": null}
      else:
        o_properties = {}
        if j.has_key('attributes_json'):
          o_properties = json.loads(j['attributes_json'])
          del j['attributes_json']
        if j.has_key('attributes'):
          o_properties = dict(j['attributes'].items() + o_properties.items())
          del j['attributes']

      if j.has_key('uri'):
        o_urn = j['uri']
      elif o_properties.has_key('uri'):
        o_urn = o_properties['uri']
      else:
        self.logger.info('*** Warning: "uri" is not found in %s' % j['name'])
        o_urn = ''

      if o_urn.find('hdfs://') == 0:
        o_name = o_urn[o_urn.rfind('/') + 1:]
      elif o_properties.has_key('table_name'):
        o_name = o_properties['table_name']
      elif j.has_key('name') and j['name'][0:5] != 'TUPLE':
        o_name = j['name']
      else:
        o_name = o_urn[o_urn.rfind('/') + 1:]

      if j.has_key('id') or not j.has_key('fields'):  # esWritable schema
        o_fields = {}
        for k in j:
          if not (k == 'uri' or k == 'attributes' or k == 'doc'):
            if type(j[k]) == list:
              o_fields[k] = {"name": k, "type": 'list', "doc": str(j[k])}
            elif type(j[k]) == dict:
              o_fields[k] = {"name": k, "type": 'dict', "doc": str(j[k])}
            else:
              o_fields[k] = {"name": k, "type": j[k], "doc": None}

            self.sort_id += 1
            o_field_list_.append([o_urn, self.sort_id, 0, '', k, o_fields[k]['type'], '', '', '',
                                  o_fields[k]['doc'].replace("\n", ' ') if o_fields[k]['doc'] is not None else None])

      elif j.has_key('fields'):
        o_fields = {}
        for f in j['fields']:
          o_field_name = f['name']
          o_fields[o_field_name] = dict(f)  # for schema output
          if f.has_key('attributes_json'):
            f['attributes'] = json.loads(f['attributes_json'])
            del f['attributes_json']

        fields_json_to_csv(o_field_list_, '', j['fields'])

      else:
        o_fields = {"doc": None}

      if j.has_key('attributes') and not o_properties.has_key('source'):
        o_properties['source'] = j['attributes']['source']

      if o_urn.startswith('hdfs:///') and self.file_regex_source_map is not None:
        o_source = self.get_source(o_urn[7:])
      else:
        self.logger.warn("property : " + Constant.HDFS_FILE_SOURCE_MAP_KEY +
                         " is None, will use default source for all dataset")
        o_source = 'Hdfs'

      self.logger.info(
        "%4i (%6i): %4i fields, %4i total fields(including nested) found in [%s]@%s with source %s" % (i, len(j), len(o_fields), len(o_field_list_), o_name, o_urn, o_source))

      dataset_schema_record = DatasetSchemaRecord(o_name, json.dumps(j, sort_keys=True),
                                                  json.dumps(o_properties, sort_keys=True), json.dumps(o_fields), o_urn,
                                                  o_source, None, None, None)
      schema_file_writer.append(dataset_schema_record)

      for fields in o_field_list_:
        field_record = DatasetFieldRecord(fields)
        field_file_writer.append(field_record)

    schema_file_writer.close()
    field_file_writer.close()
    input_json_file.close()
开发者ID:7396553,项目名称:WhereHows,代码行数:104,代码来源:HdfsTransform.py

示例12: transform

# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import close [as 别名]
  def transform(self, input, hive_metadata, hive_field_metadata):
    """
    Convert extracted Hive metadata from JSON into schema and field records.

    The input JSON is a list of database entries, each with a 'database' name
    and a 'tables' list. For every table one DatasetSchemaRecord is appended
    to the schema writer and one DatasetFieldRecord per parsed column to the
    field writer. Columns come from AvroColumnParser when the table carries a
    non-null schema literal, otherwise from HiveColumnParser when it has a
    field list. Both writers are flushed after each database.

    :param input: input json file
    :param hive_metadata: output data file for hive table metadata
    :param hive_field_metadata: output data file for hive field metadata
    :return: None
    """
    # 'with' releases the input handle even when the JSON cannot be parsed.
    with open(input) as f_json:
      all_data = json.load(f_json)

    schema_file_writer = FileWriter(hive_metadata)
    field_file_writer = FileWriter(hive_field_metadata)
    try:
      # one db info : 'type', 'database', 'tables'
      # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
      #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
      for one_db_info in all_data:
        tables = one_db_info['tables']
        for table in tables:
          # The same URN identifies the table for both parsers and the record.
          urn = "hive:///%s/%s" % (one_db_info['database'], table['name'])
          schema_json = {}
          prop_json = {}  # set the prop json

          for prop_name in TableInfo.optional_prop:
            if table.get(prop_name) is not None:
              prop_json[prop_name] = table[prop_name]

          if TableInfo.view_expended_text in prop_json:
            # backticks are stripped before the view text is analysed
            text = prop_json[TableInfo.view_expended_text].replace('`', '')
            prop_json['view_depends_on'] = list(HiveViewDependency.getViewDependency(text))

          # process either schema
          flds = {}  # always written as an empty JSON object
          field_detail_list = []

          if table.get(TableInfo.schema_literal) is not None:
            try:
              schema_data = json.loads(table[TableInfo.schema_literal])
              schema_json = schema_data
              acp = AvroColumnParser(schema_data, urn = urn)
              field_detail_list += acp.get_column_list_result()
            except ValueError:
              # A bad schema literal is logged; the table is still written
              # with whatever schema_json holds at this point.
              self.logger.error("Schema json error for table : \n" + str(table))
          elif TableInfo.field_list in table:
            # Convert to avro
            hcp = HiveColumnParser(table, urn = urn)
            schema_json = {'fields' : hcp.column_type_dict['fields'], 'type' : 'record', 'name' : table['name'], 'uri' : urn}
            field_detail_list += hcp.column_type_list

          dataset_schema_record = DatasetSchemaRecord(
            table['name'],
            json.dumps(schema_json),
            json.dumps(prop_json),
            json.dumps(flds),
            urn,
            'Hive',
            '',
            table.get(TableInfo.create_time),
            table.get("lastAlterTime"))
          schema_file_writer.append(dataset_schema_record)

          for fields in field_detail_list:
            field_file_writer.append(DatasetFieldRecord(fields))

        schema_file_writer.flush()
        field_file_writer.flush()
        self.logger.info("%20s contains %6d tables" % (one_db_info['database'], len(tables)))
    finally:
      # Close both writers on every path; the nested finally guarantees the
      # second close still runs if the first one raises.
      try:
        schema_file_writer.close()
      finally:
        field_file_writer.close()
开发者ID:SunZhaonan,项目名称:WhereHows-1,代码行数:80,代码来源:HiveTransform.py

示例13: __init__

# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import close [as 别名]

#.........这里部分代码省略.........
        xml = ET.fromstring(resp.content)
        current_project = MultiproductProjectRecord(
          self.app_id,
          xml.find('slug').text,
          'git',
          xml.find('owner').attrib['kind'],
          xml.find('owner').text,
          xml.find('created-at').text,
          xml.find('license').text,
          self.trim_newline(xml.find('description').text),
          self.wh_exec_id
        )

        project_repo_names = []
        for repo in xml.findall('repositories/mainlines/repository'):
          repo_fullname = re_git_repo_name.search(repo.find('clone_url').text).group(1)
          project_repo_names.append(repo_fullname)
          repo_key = 'git:' + repo_fullname
          self.git_repo[repo_key] = {
            'scm_repo_fullname': repo_fullname,
            'scm_type': 'git',
            'repo_id': repo.find('id').text,
            'project': project_name,
            'owner_type': repo.find('owner').attrib['kind'],
            'owner_name': repo.find('owner').text
          }

        project_repo_num = len(project_repo_names)
        current_project.setRepos(project_repo_num, ','.join(project_repo_names))
        self.project_writer.append(current_project)
        project_names[project_name] = project_repo_num
        # self.logger.debug("Project: {} - Repos: {}".format(project_name, project_repo_num))

    self.project_writer.close()
    self.logger.info("Finish Fetching git projects and repos")
    self.logger.debug('Non-exist projects: {}'.format(project_nonexist))


  def merge_product_repo(self):
    '''
    merge multiproduct and repo into same product_repo store
    '''
    for key, repo in self.git_repo.iteritems():
      record = MultiproductRepoRecord(
        self.app_id,
        repo['scm_repo_fullname'],
        repo['scm_type'],
        int(repo['repo_id']),
        repo['project'],
        repo['owner_type'],
        repo['owner_name'],
        self.wh_exec_id
      )
      if key in self.multiproduct:
        mp = self.multiproduct[key]
        record.setMultiproductInfo(
          mp["multiproduct_name"],
          mp["product_type"],
          mp["product_version"],
          mp["namespace"]
        )
      self.repo_writer.append(record)
      self.product_repo.append(record)

    for key, product in self.multiproduct.iteritems():
      if key not in self.git_repo:
开发者ID:dmoore247,项目名称:WhereHows,代码行数:70,代码来源:MultiproductExtract.py

示例14: transform

# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import close [as 别名]
  def transform(self, input, hive_metadata, hive_field_metadata):
    """
    Convert the extracted hive metadata from json to dataset / field records.

    For every table of every database in the input, one DatasetSchemaRecord is appended to the
    table metadata writer and one DatasetFieldRecord per field is appended to the field writer.
    A table whose schema literal is not valid json is logged and emitted with an empty schema
    and no fields.

    :param input: input json file
    :param hive_metadata: output data file for hive table metadata
    :param hive_field_metadata: output data file for hive field metadata
    :return:
    """
    # 'with' releases the handle even when the file does not contain valid json
    with open(input) as f_json:
      all_data = json.load(f_json)

    schema_file_writer = FileWriter(hive_metadata)
    field_file_writer = FileWriter(hive_field_metadata)

    lineageInfo = LineageInfo()

    # one db info : 'type', 'database', 'tables'
    # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
    #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
    for one_db_info in all_data:
      i = 0  # number of tables processed for this database
      for table in one_db_info['tables']:
        i += 1
        urn = "hive:///%s/%s" % (one_db_info['database'], table['name'])
        schema_json = {}
        prop_json = {}  # set the prop json

        # keep only the optional properties that are present and not null
        for prop_name in TableInfo.optional_prop:
          if prop_name in table and table[prop_name] is not None:
            prop_json[prop_name] = table[prop_name]

        if TableInfo.view_expended_text in prop_json:
          # backticks are stripped before the view text is handed to the dependency parser
          text = prop_json[TableInfo.view_expended_text].replace('`', '')
          prop_json['view_depends_on'] = list(HiveViewDependency.getViewDependency(text))

        # process either schema
        flds = {}
        field_detail_list = []
        if TableInfo.schema_literal in table and table[TableInfo.schema_literal] is not None:
          sort_id = 0
          try:
            schema_data = json.loads(table[TableInfo.schema_literal])
          except ValueError:
            # Do not fall through with an unbound (or previous table's) schema_data:
            # the table is still recorded below, with an empty schema and no fields.
            self.logger.error("Schema json error for table : \n" + str(table))
          else:
            schema_json = schema_data

            # process each field
            for field in schema_data['fields']:
              field_name = field['name']
              field_type = field['type']  # could be a list
              default_value = field['default'] if 'default' in field else None

              attributes_json = json.loads(field['attributes_json']) if 'attributes_json' in field else None
              is_nullable = None
              if attributes_json:
                is_nullable = attributes_json['nullable'] if 'nullable' in attributes_json else None
              # not available from the schema literal
              data_size = is_indexed = is_partitioned = None

              flds[field_name] = {'type': field_type}
              # String urn, Integer sortId, Integer parentSortId, String parentPath, String fieldName,
              # String dataType, String isNullable, String defaultValue, Integer dataSize, String namespace, String description
              sort_id += 1
              field_detail_list.append(
                [urn, str(sort_id), '0', None, field_name, '',
                 field_type, data_size, None, None, is_nullable, is_indexed, is_partitioned, default_value, None,
                 json.dumps(attributes_json)])
        elif TableInfo.field_list in table:
          schema_json = {'type': 'record', 'name': table['name'],
                         'fields': table[TableInfo.field_list]}  # construct a schema for data came from COLUMN_V2
          for field in table[TableInfo.field_list]:
            # ColumnName, IntegerIndex, TypeName, Comment
            field_name = field['ColumnName']
            flds[field_name] = {'type': field['TypeName']}
            is_nullable = is_indexed = is_partitioned = default_value = None  # TODO ingest
            field_detail_list.append(
              [urn, field['IntegerIndex'], '0', None, field_name,
               '', field['TypeName'], None, None, None, is_nullable, is_indexed, is_partitioned, default_value, None,
               None])

        # dict.get replaces the deprecated has_key checks; a missing key still yields None
        dataset_scehma_record = DatasetSchemaRecord(table['name'], json.dumps(schema_json), json.dumps(prop_json),
                                                    json.dumps(flds),
                                                    urn, 'Hive',
                                                    '', table.get(TableInfo.create_time),
                                                    table.get("lastAlterTime"))
        schema_file_writer.append(dataset_scehma_record)

        for fields in field_detail_list:
          field_record = DatasetFieldRecord(fields)
          field_file_writer.append(field_record)

      schema_file_writer.flush()
#.........这里部分代码省略.........
开发者ID:0xqq,项目名称:WhereHows,代码行数:103,代码来源:HiveTransform.py

示例15: collect_flow_jobs

# 需要导入模块: from wherehows.common.writers import FileWriter [as 别名]
# 或者: from wherehows.common.writers.FileWriter import close [as 别名]
  def collect_flow_jobs(self, flow_file, job_file, dag_file):
    """
    Write the active Azkaban flows, their jobs and their DAG edges to files.

    Each row's gzip-compressed 'json' column is decompressed and parsed; its 'nodes' become
    job records and its 'edges' become DAG records. A flow whose json cannot be parsed is
    logged and skipped. The three writers are always closed, even when a row fails.

    :param flow_file: output file for flow records
    :param job_file: output file for job records
    :param dag_file: output file for flow DAG edge records
    """
    self.logger.info("collect flow&jobs")
    query = "SELECT distinct f.*, p.name as project_name FROM  project_flows f inner join projects p on f.project_id = p.id and f.version = p.version where p.active = 1"
    self.az_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.az_cursor)
    flow_writer = FileWriter(flow_file)
    job_writer = FileWriter(job_file)
    dag_writer = FileWriter(dag_file)
    row_count = 0

    try:
      for row in rows:
        row['version'] = 0 if (row["version"] is None) else row["version"]
        flow_path = row['project_name'] + ":" + row['flow_id']

        json_column = 'json'
        unzipped_content = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(row[json_column].tostring())).read()
        try:
          flow_json = json.loads(unzipped_content)
        except ValueError:
          # Without a parsed definition there are no nodes/edges to read, so skip this flow
          # instead of failing later on the raw blob.
          self.logger.error("Invalid flow json, skipping flow : " + flow_path)
          continue

        flow_record = AzkabanFlowRecord(self.app_id,
                                        row['flow_id'],
                                        row['project_name'],
                                        flow_path,
                                        0,
                                        row['modified_time'] / 1000,  # presumably epoch millis -> seconds; confirm
                                        row["version"],
                                        'Y',
                                        self.wh_exec_id)
        flow_writer.append(flow_record)

        # get flow jobs
        for node in flow_json['nodes']:
          job_record = AzkabanJobRecord(self.app_id,
                                        flow_path,
                                        row["version"],
                                        node['id'],
                                        flow_path + '/' + node['id'],
                                        node['jobType'],
                                        'Y',
                                        self.wh_exec_id)
          if node['jobType'] == 'flow':
            # an embedded flow is referenced by its own "<project>:<flow id>" path
            job_record.setRefFlowPath(row['project_name'] + ":" + node['embeddedFlowId'])
          job_writer.append(job_record)

        # job dag
        for edge in flow_json['edges']:
          dag_edge = AzkabanFlowDagRecord(self.app_id,
                                          flow_path,
                                          row['version'],
                                          flow_path + '/' + edge['source'],
                                          flow_path + '/' + edge['target'],
                                          self.wh_exec_id)
          dag_writer.append(dag_edge)

        row_count += 1

        # flush every 1000 flows
        if row_count % 1000 == 0:
          flow_writer.flush()
          job_writer.flush()
          dag_writer.flush()
    finally:
      flow_writer.close()
      job_writer.close()
      dag_writer.close()
开发者ID:alyiwang,项目名称:WhereHows,代码行数:71,代码来源:AzkabanExtract.py


注:本文中的wherehows.common.writers.FileWriter.close方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。