当前位置: 首页>>代码示例>>Python>>正文


Python messytables.type_guess函数代码示例

本文整理汇总了 Python 中 messytables.type_guess 函数的典型用法代码示例。如果您正苦于以下问题:Python type_guess 函数的具体用法?Python type_guess 怎么用?Python type_guess 使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了 type_guess 函数的 15 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Python 代码示例。

示例1: test_guessing_uses_first_in_case_of_tie

    def test_guessing_uses_first_in_case_of_tie(self):
        """Non-strict guessing should pick the first listed candidate type.

        The same one-column sample is guessed twice with the candidate
        order swapped; each time the result is expected to be an instance
        of whichever type was listed first.
        """
        csv_file = StringIO.StringIO('''
            2
            1.1
            1500''')
        table = CSVTableSet(csv_file).tables[0]

        orderings = (
            [DecimalType, IntegerType],
            [IntegerType, DecimalType],
        )
        for candidates in orderings:
            winner = candidates[0]
            guessed = type_guess(table.sample, types=candidates, strict=False)
            assert_equal(guessed, [winner()])
开发者ID:MPBAUnofficial,项目名称:messytables,代码行数:13,代码来源:test_guessing.py

示例2: generate_mapping

def generate_mapping(fileobj, sample=2000):
    """Build a column mapping for a CSV file from a sample of its rows.

    The first sampled row is used as the header row; the remaining rows
    feed ``frequent_values`` and ``type_guess``.

    :param fileobj: file object passed to ``CSVRowSet``.
    :param sample: value passed to ``CSVRowSet`` as ``window``.
    :returns: dict keyed by the lower-cased slug of each header, whose
        values describe the column (label, column, common_values,
        datatype, type).
    """
    row_set = CSVRowSet('data', fileobj, window=sample)
    sampled_rows = list(row_set.sample)
    header_cells = sampled_rows[0]
    data_rows = sampled_rows[1:]
    common = frequent_values(data_rows)
    guessed = type_guess(data_rows)

    mapping = {}
    for header, cell_type, value in zip(header_cells, guessed, common):
        type_name = repr(cell_type).lower()
        name = slugify(header.value).lower()

        if type_name in ('decimal', 'integer', 'float'):
            # All numeric columns are exposed as float measures.
            kind, datatype = 'measure', 'float'
        elif type_name == 'date':
            # NOTE(review): this is an exact match on the lower-cased repr;
            # confirm the date type's repr carries no format suffix.
            kind, datatype = 'date', 'date'
        else:
            kind, datatype = 'value', type_name

        mapping[name] = {
            'label': header.value,
            'column': header.value,
            'common_values': value,
            'datatype': datatype,
            'type': kind,
        }
    return mapping
开发者ID:jagarcias,项目名称:openspending.etl,代码行数:26,代码来源:mapgen.py

示例3: get_schema

    def get_schema(self, filename):
        """
        Guess the schema of ``filename`` using messytables.

        Returns a list with one ``[header, type, sample value]`` entry per
        guessed header, or an empty list when ``self.read_file`` returns
        ``None`` for the file.
        """
        table_set = self.read_file(filename)
        if table_set is None:
            # The file could not be read, so there is nothing to describe.
            return []

        # Only the first table of the set is inspected.
        row_set = table_set.tables[0]

        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        types = type_guess(row_set.sample, strict=True)

        # One row from the sample supplies an example value for each column.
        example_row = next(row_set.sample)

        def as_text(value):
            return value if isinstance(value, str) else str(value)

        return [
            [header, str(types[idx]), as_text(example_row[idx].value)]
            for idx, header in enumerate(headers)
        ]
开发者ID:purnima215,项目名称:dgit,代码行数:29,代码来源:tableformat.py

示例4: test_strict_type_guessing_with_large_file

 def test_strict_type_guessing_with_large_file(self):
     """Strictly guess the column types of the ``211.csv`` fixture.

     The rows after the guessed header offset are guessed against four
     candidate types; 96 columns with the types listed below are expected.
     """
     fixture = horror_fobj('211.csv')
     table = CSVTableSet(fixture).tables[0]
     offset, _headers = headers_guess(table.sample)
     table.register_processor(offset_processor(offset + 1))
     candidates = [StringType, IntegerType, DecimalType, DateUtilType]
     actual = type_guess(table.sample, candidates, True)
     assert_equal(len(actual), 96)
     expected = [
         IntegerType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         IntegerType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), IntegerType(), StringType(), DecimalType(),
         DecimalType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         IntegerType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         IntegerType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), DateUtilType(),
         DateUtilType(), DateUtilType(), DateUtilType(), StringType(),
         StringType(), StringType()]
     assert_equal(actual, expected)
开发者ID:MPBAUnofficial,项目名称:messytables,代码行数:34,代码来源:test_guessing.py

示例5: analyze_csv

def analyze_csv(url, sample=1000):
    """Fetch a CSV from ``url`` and propose a column mapping for it.

    The first sampled row is treated as the header row. Column types are
    guessed from the sampled data rows starting at index 500, restricted
    to ``LIMITED_TYPES``.

    :param url: location passed to ``urlopen``.
    :param sample: value passed to ``CSVRowSet`` as ``window``.
    :returns: ``{'columns': [...], 'mapping': {...}}`` on success, or
        ``{'error': <message>}`` if anything raises along the way.

    NOTE: the error path still uses the Python 2 ``unicode`` builtin.
    """
    try:
        fileobj = urlopen(url)
        try:
            row_set = CSVRowSet('data', fileobj, window=sample)
            rows = list(row_set.sample)
        finally:
            # The sample is fully materialised above, so the response can
            # be released here instead of being left open.
            fileobj.close()
        headers, rows = rows[0], rows[1:]
        # NOTE(review): the first 500 data rows are skipped, so a file with
        # fewer rows yields no guessed types and an empty mapping -- confirm
        # this is intended.
        types = type_guess(rows[500:], types=LIMITED_TYPES)
        mapping = {}
        for header, type_ in zip(headers, types):
            type_ = repr(type_).lower()
            name = slugify(header.value).lower()
            meta = {
                'label': header.value,
                'column': header.value,
                'datatype': type_
                }
            if type_ in ['decimal', 'integer', 'float']:
                # All numeric columns are exposed as float measures.
                meta['type'] = 'measure'
                meta['datatype'] = 'float'
            elif type_.startswith('date'):
                meta['type'] = 'date'
                meta['datatype'] = 'date'
            else:
                meta['type'] = 'attribute'
            mapping[name] = meta
        return {'columns': [h.value for h in headers],
                'mapping': mapping}
    except Exception as e:
        # Failures are reported to the caller as data rather than raised.
        return {'error': unicode(e)}
开发者ID:asuffield,项目名称:openspending,代码行数:30,代码来源:analysis.py

示例6: rowset_as_jts

def rowset_as_jts(rowset, headers=None, types=None):
    ''' Build a json table schema for a rowset.

    Headers and cell types are guessed from ``rowset.sample``; the types
    are converted with ``celltype_as_string`` before being handed to
    ``headers_and_typed_as_jts``.

    NOTE(review): the ``headers`` and ``types`` arguments are never read;
    both are always re-derived from the sample. Confirm whether callers
    expect supplied values to be honoured.
    '''
    _offset, guessed_headers = messytables.headers_guess(rowset.sample)
    guessed_types = messytables.type_guess(rowset.sample)
    type_names = map(celltype_as_string, guessed_types)

    return headers_and_typed_as_jts(guessed_headers, type_names)
开发者ID:MPBAUnofficial,项目名称:messytables,代码行数:7,代码来源:jts.py

示例7: test_null_process

    def test_null_process(self):
        """Check which cells of ``null.csv`` are ``None`` before and after typing.

        The null processor is registered for the literal ``'null'``; the
        None-mask of the first three rows is checked, types are guessed
        strictly and applied, and the mask is checked again.
        """
        def none_mask(rows):
            # True wherever a cell's value is None.
            return [[cell.value is None for cell in row] for row in rows]

        fh = horror_fobj('null.csv')
        row_set = CSVTableSet(fh).tables[0]
        row_set.register_processor(null_processor(['null']))

        before = none_mask(list(row_set))
        assert_equal(before[0], [False, True, False, False])
        assert_equal(before[1], [False, False, False, True])
        assert_equal(before[2], [False, True, False, False])

        guessed = type_guess(row_set.sample, strict=True)
        assert_equal(guessed, [IntegerType(), BoolType(), BoolType(),
                               BoolType()])

        row_set.register_processor(types_processor(guessed))

        # after applying the types, '' should become None for int columns
        after = none_mask(list(row_set))
        assert_equal(after[0], [False, True, False, False])
        assert_equal(after[1], [False, False, False, True])
        assert_equal(after[2], [False, True, True, True])
开发者ID:ahlusar1989,项目名称:messytables,代码行数:25,代码来源:test_read.py

示例8: main

def main(argv=None):
    """Load a single-table file, guess its schema and upload it to Myria.

    :param argv: argument list handed to ``parse_args``.
    :raises ValueError: if the input contains anything other than exactly
        one table.

    When ``args.dry`` is set the prepared data is written to stdout
    instead of being uploaded.
    """
    args = parse_args(argv)

    if args.file is None:
        # slurp the whole input since there seems to be a bug in messytables
        # which should be able to handle streams but doesn't
        args.file = cStringIO.StringIO(sys.stdin.read())

    relation_key = args_to_relation_key(args)

    table_set = any_tableset(args.file)
    if len(table_set.tables) != 1:
        raise ValueError("Can only handle files with a single table, not %s" % len(table_set.tables))

    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(strip_processor())
    row_set.register_processor(headers_processor(headers))
    # Temporarily, mark the offset of the header
    row_set.register_processor(offset_processor(offset + 1))

    # guess types and register them
    types = type_guess(
        replace_empty_string(row_set.sample),
        strict=True,
        types=[StringType, DecimalType, IntegerType])
    row_set.register_processor(types_processor(types))

    # Messytables seems to not handle the case where there are no headers.
    # Work around this as follows:
    # 1) offset must be 0
    # 2) if the types of the data match the headers, assume there are
    #    actually no headers
    if offset == 0:
        try:
            for cell_type, header in zip(types, headers):
                cell_type.cast(header)
        except Exception:
            # A header that cannot be cast to its column's type means the
            # first row really is a header row; keep the processors as
            # registered. Only ordinary errors are treated this way, so
            # KeyboardInterrupt/SystemExit still propagate.
            pass
        else:
            # We don't need the headers_processor or the offset_processor
            row_set._processors = []
            row_set.register_processor(strip_processor())
            row_set.register_processor(types_processor(types))
            headers = None

    # Construct the Myria schema
    schema = messy_to_schema(types, headers)
    logging.info("Myria schema: {}".format(json.dumps(schema)))

    # Prepare data for writing to Myria
    data, kwargs = write_data(row_set, schema)

    if not args.dry:
        # Connect to Myria and send the data
        connection = myria.MyriaConnection(
            hostname=args.hostname, port=args.port, ssl=args.ssl)
        ret = connection.upload_file(
            relation_key, schema, data, args.overwrite, **kwargs)

        sys.stdout.write(pretty_json(ret))
    else:
        sys.stdout.write(data)
开发者ID:helgag,项目名称:myria-python,代码行数:59,代码来源:upload_file.py

示例9: test_non_strict_guessing_handles_padding

 def test_non_strict_guessing_handles_padding(self):
     """Non-strict guessing of a three-column CSV whose cells are space-padded."""
     padded_csv = StringIO.StringIO('''
         1,   , 2.1
         2,   , 1.1
         foo, , 1500''')
     table = CSVTableSet(padded_csv).tables[0]
     actual = type_guess(table.sample, strict=False)
     assert_equal(len(actual), 3)
     expected = [IntegerType(), StringType(), DecimalType()]
     assert_equal(actual, expected)
开发者ID:rossjones,项目名称:messytables,代码行数:9,代码来源:test_guessing.py

示例10: test_strict_guessing_handles_padding

 def test_strict_guessing_handles_padding(self):
     """Strict guessing of a three-column CSV whose cells are space-padded."""
     padded_csv = io.BytesIO(b'''
         1,   , 2
         2,   , 1.1
         foo, , 1500''')
     table = CSVTableSet(padded_csv).tables[0]
     actual = type_guess(table.sample, strict=True)
     assert_equal(len(actual), 3)
     expected = [StringType(), StringType(), DecimalType()]
     assert_equal(actual, expected)
开发者ID:MikeData,项目名称:messytables,代码行数:10,代码来源:test_guessing.py

示例11: parse

def parse(stream, excel_type='xls', sheet=1, guess_types=True, **kwargs):
    '''Parse Excel (xls or xlsx) to structured objects.

    :param stream: file object handed to the table set's ``from_fileobj``
    :param excel_type: xls | xlsx
    :param sheet: index of sheet in spreadsheet to convert (starting from index = 1)
    :param guess_types: when true, a guessed type is added to each field
    :returns: tuple of (generator of row dicts keyed by column id,
        ``{'fields': [...]}`` metadata)
    :raises Exception: if the requested sheet does not exist
    '''
    sheet_number = int(sheet) - 1

    if excel_type == 'xlsx':
        table_set_cls = XLSXTableSet
    else:
        table_set_cls = XLSTableSet
    table_set = table_set_cls.from_fileobj(stream)
    try:
        row_set = table_set.tables[sheet_number]
    except IndexError:
        raise Exception('This file does not have sheet number %d' %
                        (sheet_number + 1))
    offset, headers = headers_guess(row_set.sample)

    if guess_types:
        candidate_types = [StringType, IntegerType, FloatType, DecimalType,
                           DateUtilType]
        row_types = type_guess(row_set.sample, candidate_types)

    fields = []
    duplicate_counts = {}
    next_unnamed = 1
    for position, name in enumerate(headers):
        if "" == name:
            # Blank headers get a generated name, written back into
            # ``headers`` so the duplicate check below sees it.
            name = '_'.join(['column', str(next_unnamed)])
            headers[position] = name
            next_unnamed += 1

        if headers.count(name) == 1:
            column_id = name
        else:
            # Repeated names are disambiguated with a running suffix.
            duplicate_counts[name] = duplicate_counts.get(name, 0) + 1
            column_id = u'_'.join([name, str(duplicate_counts[name])])

        entry = {'id': column_id}
        if guess_types:
            guessed = row_types[position]
            if isinstance(guessed, DateUtilType):
                entry['type'] = 'DateTime'
            else:
                entry['type'] = str(guessed)
        fields.append(entry)

    row_set.register_processor(
        headers_processor([entry['id'] for entry in fields]))
    row_set.register_processor(offset_processor(offset + 1))

    def row_iterator():
        for row in row_set:
            yield {cell.column: cell.value for cell in row}

    return row_iterator(), {'fields': fields}
开发者ID:Web5design,项目名称:dataconverters,代码行数:54,代码来源:xls.py

示例12: test_json_type

    def test_json_type(self):
        """Two columns of JSON text and one of plain strings should be guessed as such."""
        json_csv = StringIO.StringIO('''
        "{""a"":""b"", ""c"":""d""}",       "[1, 2, 3]",                12a
        "[""a"", [1, 2, {""a"":""b""}]]",   "{""a"": 1, ""b"":[1, 2]}", abc
        ,,                                                              "abc"
        ''')

        table = CSVTableSet(json_csv).tables[0]
        actual = type_guess(table.sample)
        expected = [JsonType(), JsonType(), StringType()]

        assert_equal(actual, expected)
开发者ID:MPBAUnofficial,项目名称:messytables,代码行数:11,代码来源:test_guessing.py

示例13: test_wkt_type

    def test_wkt_type(self):
        """Strict guessing should yield ``EWKB`` then ``EWKT`` for the two columns below."""
        geo_csv = StringIO.StringIO('''
        "0102000020e6100000020000000000000000002640000000000000474000000000000024400000000000804640",
        "0102000020787f0000020000000000000000002640000000000000474000000000000024400000000000804640", "SRID=4326;LINESTRING(11 46,10 45)"
        "0101000020e610000000000000000026400000000000004740", "SRID=4326;LINESTRING(11 46,10 45)"
        , "SRID=4326;POINT(11 46)"
        ''')

        table = CSVTableSet(geo_csv).tables[0]
        actual = type_guess(table.sample, strict=True)
        expected = [EWKB(), EWKT()]

        assert_equal(actual, expected)
开发者ID:MPBAUnofficial,项目名称:messytables,代码行数:12,代码来源:test_guessing.py

示例14: rowset_as_schema

def rowset_as_schema(rowset):
    """Build a ``JSONTableSchema`` from the guessed headers and types of a rowset.

    Each guessed header becomes a field whose id and label are both the
    header, and whose type is the guessed cell type converted with
    ``celltype_as_string``.
    """
    _offset, column_names = messytables.headers_guess(rowset.sample)
    guessed_types = messytables.type_guess(rowset.sample)
    type_names = map(celltype_as_string, guessed_types)

    schema = jsontableschema.JSONTableSchema()
    for column_name, type_name in zip(column_names, type_names):
        schema.add_field(
            field_id=column_name,
            label=column_name,
            field_type=type_name,
        )

    return schema
开发者ID:mk270,项目名称:messytables-jts,代码行数:12,代码来源:messytables_jts.py

示例15: proc

def proc(f, database_name, table_name):
    """Load the first table of ``f`` into a Hive table via the ``hive`` CLI.

    Column names and types are guessed with messytables (string or date
    only, strict). The rows are written to a temporary data file, the
    DDL and ``LOAD DATA`` statement to a temporary command file, and
    ``hive`` is then run against that command file.

    :param f: file object passed to ``messytables.any_tableset``.
    :param database_name: value passed to ``hive --database``.
    :param table_name: name of the table to drop, create and load.

    NOTE(review): ``table_name`` is interpolated into the SQL text as-is;
    confirm callers only pass trusted names.
    """
    table_set = messytables.any_tableset(f)
    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = messytables.headers_guess(row_set.sample)
    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=[
        messytables.types.StringType,
        messytables.types.DateType,
    ], strict=True)

    fields_ddl = ','.join(
        '  {0} {1}\n'.format(
            canonicalize_column_name(col_name),
            hive_column_type(col_type)
        )
        for col_name, col_type in zip(headers, types)
    )

    # Both temporary files must outlive the hive invocation, so the call
    # happens inside the ``with`` block; leaving the block closes the
    # files even if something above raised.
    with tempfile.NamedTemporaryFile(mode='w') as hive_data_file, \
            tempfile.NamedTemporaryFile(mode='w') as hive_cmd_file:
        hive_sql = '''
DROP TABLE IF EXISTS {0};

CREATE TABLE {0} (
{1}
)
STORED AS TEXTFILE
TBLPROPERTIES ("comment"="add_messytable on {3}");

LOAD DATA LOCAL INPATH '{2}' OVERWRITE INTO TABLE {0};
'''.format(table_name, fields_ddl, hive_data_file.name,
           datetime.datetime.now().isoformat())

        print(hive_sql, file=hive_cmd_file)
        hive_cmd_file.flush()

        row_set.register_processor(messytables.types_processor(types))

        # One line per row, cell values separated by \001.
        for row in row_set:
            print('\001'.join(str(c.value) for c in row),
                  file=hive_data_file)
        hive_data_file.flush()

        subprocess.call([
            'hive',
            '--database', database_name,
            '-f', hive_cmd_file.name,
        ])
开发者ID:Bridg,项目名称:bridg-messytable,代码行数:51,代码来源:add_messytable.py


注:本文中的messytables.type_guess函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。