当前位置: 首页>>代码示例>>Python>>正文


Python messytables.headers_processor函数代码示例

本文整理汇总了Python中messytables.headers_processor函数的典型用法代码示例。如果您正苦于以下问题：Python headers_processor函数的具体用法？Python headers_processor怎么用？Python headers_processor使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了headers_processor函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: convert

    def convert(self):
        """Convert the CSV in ``self.stream`` into field metadata and row dicts.

        Header names are guessed with ``headers_guess``. An empty header name
        is replaced by ``column_<n>``; a name that occurs more than once in
        the header list gets a running ``_<n>`` suffix.

        Returns:
            A ``(fields, result)`` tuple. ``fields`` is a list of
            ``{'id': name}`` dicts in column order; ``result`` is a list with
            one ``{column: value}`` dict per data row.
        """
        table_set = CSVTableSet.from_fileobj(self.stream)
        row_set = table_set.tables.pop()
        offset, headers = headers_guess(row_set.sample)

        fields = []
        dup_columns = {}
        noname_count = 1
        for index, field in enumerate(headers):
            field_dict = {}
            if "" == field:
                field = '_'.join(['column', str(noname_count)])
                # Written back so the duplicate count below sees the new name.
                headers[index] = field
                noname_count += 1
            if headers.count(field) == 1:
                field_dict['id'] = field
            else:
                dup_columns[field] = dup_columns.get(field, 0) + 1
                field_dict['id'] = u'_'.join([field, str(dup_columns[field])])
            fields.append(field_dict)
        row_set.register_processor(headers_processor([x['id'] for x in fields]))
        row_set.register_processor(offset_processor(offset + 1))

        result = []
        for row in row_set:
            # A fresh dict per row: reusing a single dict would make every
            # entry of ``result`` alias the same object (i.e. the last row).
            result.append({cell.column: cell.value for cell in row})
        return fields, result
开发者ID:Big-Data,项目名称:data-converters,代码行数:31,代码来源:csv_json_converter.py

示例2: get_schema

    def get_schema(self, filename):
        """
        Guess the schema of a tabular file using messytables.

        Returns a list of ``[header, type, sample_value]`` entries, one per
        guessed header, or an empty list when ``read_file`` returns None.
        """
        tables = self.read_file(filename)
        if tables is None:
            # The file could not be read as a table set.
            return []

        # Only the first table is inspected.
        rows = tables.tables[0]

        header_offset, names = headers_guess(rows.sample)
        rows.register_processor(headers_processor(names))
        rows.register_processor(offset_processor(header_offset + 1))
        guessed = type_guess(rows.sample, strict=True)

        # One sampled row supplies an example value for each column.
        first = next(rows.sample)

        def as_text(value):
            if isinstance(value, str):
                return value
            return str(value)

        return [
            [name, str(guessed[pos]), as_text(first[pos].value)]
            for pos, name in enumerate(names)
        ]
开发者ID:purnima215,项目名称:dgit,代码行数:29,代码来源:tableformat.py

示例3: generate_table

    def generate_table(self, document, meta, sheet, row_set):
        """Insert the rows of ``row_set`` into ``document`` as records.

        Headers are guessed from the row set sample and one column is added
        to the tabular (from ``create_tabular``) per header. Returns the
        tabular, or None when no columns were found.
        """
        header_offset, header_names = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(header_names))
        row_set.register_processor(offset_processor(header_offset + 1))
        tabular = self.create_tabular(sheet, row_set.name)
        columns = [tabular.add_column(name) for name in header_names]
        if not columns:
            return

        def generate_rows():
            for row_no, row in enumerate(row_set):
                try:
                    record = {
                        col.name: string_value(cell.value)
                        for cell, col in zip(row, columns)
                    }
                    if not record:
                        # Rows that produced no cells are skipped.
                        continue
                    # Columns missing from a short row are filled with None.
                    for col in columns:
                        record.setdefault(col.name, None)
                    yield record
                except Exception as exception:
                    log.warning("Could not decode row %s in %s: %s",
                                row_no, meta, exception)

        document.insert_records(sheet, generate_rows())
        return tabular
开发者ID:CodeForAfrica,项目名称:aleph,代码行数:25,代码来源:tabular.py

示例4: generate_table

    def generate_table(self, meta, sheet, row_set):
        """Create an internal table for ``row_set`` and load its rows.

        Headers are guessed from the row set sample, one schema column is
        added per header, and the table is dropped and re-created before
        the rows are loaded.

        Returns:
            The ``TabularSchema`` describing the created table.
        """
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        schema = TabularSchema({
            'sheet_name': row_set.name,
            'content_hash': meta.content_hash,
            'sheet': sheet
        })
        columns = [schema.add_column(h) for h in headers]
        log.info("Creating internal table: %s columns, table: %r", len(columns),
                 schema.table_name)
        tabular = Tabular(schema)
        tabular.drop()
        tabular.create()

        def generate_rows():
            # Initialised up front so the summary log works for an empty row
            # set; counting from 1 reports the number of rows read rather
            # than the index of the last one.
            row_count = 0
            for row_count, row in enumerate(row_set, 1):
                record = {}
                for cell, column in zip(row, columns):
                    record[column.name] = string_value(cell.value)
                if record:
                    # Columns missing from a short row are filled with None.
                    for column in columns:
                        record.setdefault(column.name, None)
                    yield record
            log.info("Loaded %s rows.", row_count)

        tabular.load_iter(generate_rows())
        return schema
开发者ID:DavidLemayian,项目名称:aleph,代码行数:29,代码来源:tabular.py

示例5: main

def main(argv=None):
    """Guess the schema of a single-table file and upload it to Myria.

    Reads the input from ``args.file`` (or stdin when no file is given),
    guesses headers and column types with messytables, builds a Myria schema
    and uploads the data. With ``args.dry`` set, the prepared data is written
    to stdout instead of being uploaded.

    Raises:
        ValueError: if the input does not contain exactly one table.
    """
    args = parse_args(argv)

    if args.file is None:
        # slurp the whole input since there seems to be a bug in messytables
        # which should be able to handle streams but doesn't
        args.file = cStringIO.StringIO(sys.stdin.read())

    relation_key = args_to_relation_key(args)

    table_set = any_tableset(args.file)
    if len(table_set.tables) != 1:
        raise ValueError("Can only handle files with a single table, not %s" % len(table_set.tables))

    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(strip_processor())
    row_set.register_processor(headers_processor(headers))
    # Temporarily, mark the offset of the header
    row_set.register_processor(offset_processor(offset + 1))

    # guess types and register them
    types = type_guess(replace_empty_string(row_set.sample), strict=True, types=[StringType, DecimalType, IntegerType])
    row_set.register_processor(types_processor(types))

    # Messytables seems to not handle the case where there are no headers.
    # Work around this as follows:
    # 1) offset must be 0
    # 2) if the types of the data match the headers, assume there are
    #    actually no headers
    if offset == 0:
        try:
            for column_type, header in zip(types, headers):
                column_type.cast(header)
        except Exception:
            # A header that cannot be cast to its column's type is taken as a
            # real header. Only ordinary errors are treated this way, so
            # KeyboardInterrupt/SystemExit still propagate.
            pass
        else:
            # We don't need the headers_processor or the offset_processor
            row_set._processors = []
            row_set.register_processor(strip_processor())
            row_set.register_processor(types_processor(types))
            headers = None

    # Construct the Myria schema
    schema = messy_to_schema(types, headers)
    logging.info("Myria schema: {}".format(json.dumps(schema)))

    # Prepare data for writing to Myria
    data, kwargs = write_data(row_set, schema)

    if not args.dry:
        # Connect to Myria and send the data
        connection = myria.MyriaConnection(hostname=args.hostname, port=args.port, ssl=args.ssl)
        ret = connection.upload_file(relation_key, schema, data, args.overwrite, **kwargs)

        sys.stdout.write(pretty_json(ret))
    else:
        sys.stdout.write(data)
开发者ID:helgag,项目名称:myria-python,代码行数:59,代码来源:upload_file.py

示例6: test_guess_headers

    def test_guess_headers(self):
        # Case 1: headers and their offset are guessed from the sample.
        source = horror_fobj("weird_head_padding.csv")
        table_set = CSVTableSet(source)
        rows = table_set.tables[0]
        header_offset, names = headers_guess(rows.sample)
        rows.register_processor(headers_processor(names))
        rows.register_processor(offset_processor(header_offset + 1))
        records = list(rows)
        guessed_cell = records[9][0]
        assert "Frauenheilkunde" in guessed_cell.value, guessed_cell.value

        # Case 2: explicit headers, no offset processor registered.
        source = horror_fobj("weird_head_padding.csv")
        table_set = CSVTableSet(source)
        rows = table_set.tables[0]
        rows.register_processor(headers_processor(["foo", "bar"]))
        records = list(rows)
        named_cell = records[12][0]
        assert "foo" in named_cell.column, named_cell
        assert "Chirurgie" in named_cell.value, named_cell.value
开发者ID:bearrito,项目名称:messytables,代码行数:17,代码来源:test_read.py

示例7: lines

 def lines(self):
     """Yield each data row of the CSV at ``self.source.url`` as a dict.

     The first sampled row is used as the header row; every following row
     is yielded as ``{column: value}``. The URL handle is closed when
     iteration finishes, fails, or the generator is closed early.
     """
     fh = urlopen(self.source.url)
     try:
         row_set = CSVRowSet('data', fh, window=3)
         headers = [c.value for c in list(row_set.sample)[0]]
         row_set.register_processor(headers_processor(headers))
         # Skip the header row itself when iterating the data.
         row_set.register_processor(offset_processor(1))
         for row in row_set:
             yield dict((c.column, c.value) for c in row)
     finally:
         fh.close()
开发者ID:fucc1,项目名称:FPA_Core,代码行数:9,代码来源:__init__.py

示例8: test_read_encoded_characters_csv

 def test_read_encoded_characters_csv(self):
     # Reads characters.csv with guessed headers and checks the row count
     # plus the third cell of the first and last data rows.
     source = horror_fobj('characters.csv')
     table_set = CSVTableSet(source)
     rows = table_set.tables[0]
     header_offset, names = headers_guess(rows.sample)
     rows.register_processor(headers_processor(names))
     rows.register_processor(offset_processor(header_offset + 1))
     records = list(rows)
     assert_equal(382, len(records))
     first_row = records[0]
     assert_equal(first_row[2].value, u'雲嘉南濱海國家風景區管理處')
     last_row = records[-1]
     assert_equal(last_row[2].value, u'沈光文紀念廳')
开发者ID:ahlusar1989,项目名称:messytables,代码行数:11,代码来源:test_read.py

示例9: parse

def parse(stream, excel_type='xls', sheet=1, guess_types=True, **kwargs):
    '''Parse Excel (xls or xlsx) to structured objects.

    :param stream: file object passed to the table set's ``from_fileobj``
    :param excel_type: xls | xlsx
    :param sheet: index of sheet in spreadsheet to convert (starting from index = 1)
    :param guess_types: when true, a type is guessed per column and stored
        under ``'type'`` in each field dict
    :param kwargs: accepted but not used by this function
    :returns: ``(rows, metadata)`` where ``rows`` is an iterator of
        ``{column: value}`` dicts and ``metadata`` is ``{'fields': [...]}``
    :raises Exception: if the requested sheet does not exist
    '''
    sheet_number = int(sheet) - 1

    xlsclass = XLSXTableSet if excel_type == 'xlsx' else XLSTableSet
    table_set = xlsclass.from_fileobj(stream)
    try:
        if sheet_number < 0:
            # A sheet below 1 would become a negative list index and silently
            # select a sheet counted from the end; treat it as missing.
            raise IndexError(sheet_number)
        row_set = table_set.tables[sheet_number]
    except IndexError:
        raise Exception('This file does not have sheet number %d' %
                        (sheet_number + 1))
    offset, headers = headers_guess(row_set.sample)

    fields = []
    dup_columns = {}
    noname_count = 1
    if guess_types:
        # Kept in its own name so the ``guess_types`` flag is not overwritten.
        candidate_types = [StringType, IntegerType, FloatType, DecimalType,
                           DateUtilType]
        # NOTE(review): types are guessed before the header/offset processors
        # are registered, so the sample presumably still includes the header
        # rows - confirm this is intended.
        row_types = type_guess(row_set.sample, candidate_types)
    for index, field in enumerate(headers):
        field_dict = {}
        if "" == field:
            field = '_'.join(['column', str(noname_count)])
            # Written back so the duplicate count below sees the new name.
            headers[index] = field
            noname_count += 1
        if headers.count(field) == 1:
            field_dict['id'] = field
        else:
            dup_columns[field] = dup_columns.get(field, 0) + 1
            field_dict['id'] = u'_'.join([field, str(dup_columns[field])])
        if guess_types:
            if isinstance(row_types[index], DateUtilType):
                field_dict['type'] = 'DateTime'
            else:
                field_dict['type'] = str(row_types[index])
        fields.append(field_dict)
    row_set.register_processor(headers_processor([x['id'] for x in fields]))
    row_set.register_processor(offset_processor(offset + 1))

    def row_iterator():
        for row in row_set:
            yield {cell.column: cell.value for cell in row}

    return row_iterator(), {'fields': fields}
开发者ID:Web5design,项目名称:dataconverters,代码行数:54,代码来源:xls.py

示例10: proc

def proc(f, database_name, table_name):
    """Load the first table of ``f`` into a Hive table via the ``hive`` CLI.

    Column names and types are guessed with messytables. The rows are written
    to a temporary data file (fields separated by ``\\001``), a temporary
    script that drops, re-creates and loads the table is generated, and
    ``hive --database <database_name> -f <script>`` is run.

    :param f: file object passed to ``messytables.any_tableset``
    :param database_name: Hive database given to ``--database``
    :param table_name: name of the table to (re)create
    """
    table_set = messytables.any_tableset(f)
    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = messytables.headers_guess(row_set.sample)
    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=[
        messytables.types.StringType,
        messytables.types.DateType,
    ], strict=True)

    fields_ddl = ','.join([
        '  {0} {1}\n'.format(
            canonicalize_column_name(colName),
            hive_column_type(colType)
        )
        for colName, colType in zip(headers, types)
    ])

    # Both temporary files are managed by ``with`` so they are closed even if
    # writing or the hive call fails; they must stay open until hive has run
    # because the script refers to them by name.
    with tempfile.NamedTemporaryFile(mode='w') as hive_data_file, \
            tempfile.NamedTemporaryFile(mode='w') as hive_cmd_file:
        # NOTE(review): table_name is interpolated into the SQL unescaped;
        # confirm callers never pass untrusted values.
        hive_sql = '''
DROP TABLE IF EXISTS {0};

CREATE TABLE {0} (
{1}
)
STORED AS TEXTFILE
TBLPROPERTIES ("comment"="add_messytable on {3}");

LOAD DATA LOCAL INPATH '{2}' OVERWRITE INTO TABLE {0};
'''.format(table_name, fields_ddl, hive_data_file.name,
           datetime.datetime.now().isoformat())

        print(hive_sql, file=hive_cmd_file)
        hive_cmd_file.flush()

        row_set.register_processor(messytables.types_processor(types))

        for row in row_set:
            print('\001'.join(str(c.value) for c in row),
                  file=hive_data_file)
        # Flush so hive sees the complete data file.
        hive_data_file.flush()

        subprocess.call([
            'hive',
            '--database', database_name,
            '-f', hive_cmd_file.name,
        ])
开发者ID:Bridg,项目名称:bridg-messytable,代码行数:51,代码来源:add_messytable.py

示例11: test_read_head_padding_csv

 def test_read_head_padding_csv(self):
     # Headers of weird_head_padding.csv are guessed; every data row must
     # then have 11 cells and the second sampled row starts with a known
     # department name.
     source = horror_fobj("weird_head_padding.csv")
     table_set = CSVTableSet(source)
     rows = table_set.tables[0]
     header_offset, names = headers_guess(rows.sample)
     assert 11 == len(names), names
     assert_equal(u"1985", names[1].strip())
     rows.register_processor(headers_processor(names))
     rows.register_processor(offset_processor(header_offset + 1))
     sampled = list(rows.sample)
     for record in rows:
         assert_equal(11, len(record))
     first_cell_text = sampled[1][0].value.strip()
     assert first_cell_text == u"Gefäßchirurgie", first_cell_text
开发者ID:bearrito,项目名称:messytables,代码行数:14,代码来源:test_read.py

示例12: validate_file

def validate_file(file_tmp, file_name, tmp_filepath):
    """Validate the column headers of an uploaded datastore file.

    Only files whose extension is listed in the ``ckan.mimetype_guess``
    setting (default ``csv xls xlsx tsv``) are inspected; any other file is
    accepted without checks. An inspected file must contain a single data
    sheet, and its headers must be at most 64 characters long, free of the
    characters ``- ' "`` and the curly single quotes, and unique.

    On any violation the temporary upload is rolled back via
    ``rollback_tmp(file_tmp, tmp_filepath)`` before the error is raised.

    Raises:
        logic.ValidationError: with an ``'upload'`` message describing the
            first violation found.
    """
    log.info("upload: checking file * %s * ", file_name)
    MAX_HEADER_LENGTH = 64
    # not allowed characters ( - ' " and the curly single quotes). Inside a
    # character class "|" is a literal, so it must not be used as a separator
    # here or headers containing a pipe get rejected as well.
    # NOTE(review): \uXXXX escapes inside a pattern are only understood by
    # Python 3's ``re`` - confirm the interpreter version this runs on.
    inappropriate_chars = re.compile(r"[\-'\"\u2018\u2019]")
    datastore_ext = config.get('ckan.mimetype_guess', "csv xls xlsx tsv")
    tmp_file_ext = os.path.splitext(file_name)[1]

    # Compare against whole extensions. A substring test on the raw setting
    # would also accept "", "sv", "ls", ... as datastore extensions.
    allowed_exts = [ext for ext in re.split(r"[\s,]+", datastore_ext) if ext]
    if tmp_file_ext[1:].lower() not in allowed_exts:
        return

    def reject(user_message, log_message, *log_args):
        # Roll back the temporary upload, log why, and abort validation.
        rollback_tmp(file_tmp, tmp_filepath)
        log.error(log_message, file_name, *log_args)
        raise logic.ValidationError({'upload': [user_message]})

    table_set = any_tableset(file_tmp)
    # check if only one data sheet in the file
    if len(table_set.tables) > 1:
        reject('There is more then one data sheet in the file',
               "upload: the file * %s * was not uploaded - There is more then one data sheet in the file")

    row_set = table_set.tables[0]
    # guess header names and the offset of the header:
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    for header in headers:
        # too long header
        if len(header) > MAX_HEADER_LENGTH:
            reject('too long header (64 max)',
                   "upload: the file * %s * was not uploaded - too long header - * %s *",
                   header)
        # not allowed characters in header
        if inappropriate_chars.search(header):
            reject('there are inappropriate characters in headers (apostrophe/apostrophes/dash)',
                   "upload: the file * %s * was not uploaded - there are inappropriate characters in headers * %s *",
                   header)

    # Check for duplicate fields
    if len(set(headers)) != len(headers):
        reject('Duplicate column names are not supported',
               "upload: the file * %s * was not uploaded - Duplicate column names are not supported")

    log.info("passed validation succesfully - the file * %s * was uploaded to CKAN (filestore)", file_name)
开发者ID:CIOIL,项目名称:DataGovIL,代码行数:50,代码来源:file_validators.py

示例13: get_diff

    def get_diff(self, filename1, filename2):
        """Compute a daff-based diff between two tabular files.

        Returns None when the extension of ``filename1`` is not csv, tsv or
        xls; otherwise returns whatever ``self.parse_diff`` produces for the
        highlighted diff table.

        Raises:
            Exception: if ``read_file`` returns None for either file.
        """
        extension = filename1.rsplit(".", 1)[-1].lower()
        if extension not in ('csv', 'tsv', 'xls'):
            return None

        tables = {}
        for path in (filename1, filename2):
            table_set = self.read_file(path)
            if table_set is None:
                raise Exception("Invalid table set")
            rows = table_set.tables[0]
            header_offset, names = headers_guess(rows.sample)
            rows.register_processor(headers_processor(names))
            rows.register_processor(offset_processor(header_offset + 1))

            # Header row first, followed by the plain cell values per row.
            tables[path] = [names] + [
                [cell.value for cell in row] for row in rows
            ]

        # Wrap both loaded tables for daff and align them.
        left = daff.PythonTableView(tables[filename1])
        right = daff.PythonTableView(tables[filename2])
        alignment = daff.Coopy.compareTables(left, right).align()

        # daff writes the highlighted diff into this (initially empty) view.
        table_diff = daff.PythonTableView([])

        highlighter = daff.TableDiff(alignment, daff.CompareFlags())
        highlighter.hilite(table_diff)

        # Parse the differences
        return self.parse_diff(table_diff)
开发者ID:purnima215,项目名称:dgit,代码行数:48,代码来源:tableformat.py

示例14: csvimport_table

def csvimport_table(name):
    """Parse the CSV fixture ``name`` and return its fields and rows.

    Headers and column types are guessed with messytables before the row set
    is handed to ``parse_table``.

    Returns:
        A ``(fields, rows)`` tuple: ``fields`` is the fields value yielded
        with the last row (None when the table yields no rows) and ``rows``
        is the list of all parsed rows.
    """
    from messytables import CSVTableSet, type_guess
    from messytables import types_processor, headers_guess
    from messytables import headers_processor, offset_processor
    from spendb.etl.extract import parse_table

    row_set = CSVTableSet(data_fixture(name)).tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    # Initialised so an empty table returns (None, []) instead of failing
    # with an unbound ``fields``.
    fields = None
    rows = []
    for fields, row, _samples in parse_table(row_set):
        rows.append(row)

    return fields, rows
开发者ID:trickvi,项目名称:spendb,代码行数:18,代码来源:helpers.py

示例15: load_data

def load_data(config):
    """Yield data records described by ``config``.

    Without a ``'url'`` key a single record ``{config['field']:
    config['value']}`` is yielded (missing keys become None). Otherwise the
    CSV at ``config['url']`` is fetched, its headers are guessed, and every
    data row is yielded as a ``{column: value}`` dict.
    """
    if 'url' not in config:
        yield {
            config.get('field'): config.get('value')
            }
        return
    fh = urlopen(config.get('url'))
    try:
        table_set = CSVTableSet.from_fileobj(fh)
        row_set = table_set.tables[0]

        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))

        for row in row_set:
            yield dict((c.column, c.value) for c in row)
    finally:
        # Close the handle even if parsing fails or the consumer stops
        # iterating before the end.
        fh.close()
开发者ID:pombredanne,项目名称:journoid,代码行数:19,代码来源:process.py


注:本文中的messytables.headers_processor函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。