本文整理汇总了Python中messytables.any_tableset函数的典型用法代码示例。如果您正苦于以下问题:Python any_tableset函数的具体用法?Python any_tableset怎么用?Python any_tableset使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了any_tableset函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _process_upload
def _process_upload(context, data):
    """
    When provided with a filename this function will process each row
    within the file and then return a tuple. The tuple will contain
        - a list of error messages (if any)
        - a list of dicts where each dict contains ...
            {
            'package': 'a_package_id',
            'action': 'Added' or 'Updated'
            }
    """
    log = inventory_upload.get_logger()
    errors = []
    results = []

    filename = data['file']
    publisher_name = data['publisher']

    import urlparse
    client = CkanClient(
        base_location=urlparse.urljoin(context['site_url'], 'api'),
        api_key=context['apikey'])

    tableset = None
    fh = None
    try:
        _, ext = os.path.splitext(filename)
        # Keep a reference to the open file so the CSV fallback below can
        # reuse it (the original referenced an undefined name `f` there).
        fh = open(filename, 'r')
        tableset = messytables.any_tableset(fh, extension=ext[1:])
    except Exception as e:
        # messytables cannot auto-detect plain-text files as CSV; force a
        # CSV parse in exactly that case, otherwise record the failure.
        if str(e) == "Unrecognized MIME type: text/plain":
            tableset = messytables.any_tableset(fh, mimetype="text/csv")
        else:
            errors.append("Unable to load file: {0}".format(e))
示例2: main
def main(argv=None):
    """Read a single-table file (or stdin), guess its schema with
    messytables, and upload the data to Myria.

    :param argv: optional argument list passed to ``parse_args``
        (``None`` means use ``sys.argv``).
    """
    args = parse_args(argv)
    if args.file is None:
        # slurp the whole input since there seems to be a bug in messytables
        # which should be able to handle streams but doesn't
        args.file = cStringIO.StringIO(sys.stdin.read())
    relation_key = args_to_relation_key(args)

    table_set = any_tableset(args.file)
    if len(table_set.tables) != 1:
        raise ValueError("Can only handle files with a single table, not %s" % len(table_set.tables))

    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(strip_processor())
    row_set.register_processor(headers_processor(headers))
    # Temporarily, mark the offset of the header
    row_set.register_processor(offset_processor(offset + 1))

    # guess types and register them
    types = type_guess(replace_empty_string(row_set.sample), strict=True, types=[StringType, DecimalType, IntegerType])
    row_set.register_processor(types_processor(types))

    # Messytables seems to not handle the case where there are no headers.
    # Work around this as follows:
    # 1) offset must be 0
    # 2) if the types of the data match the headers, assume there are
    #    actually no headers
    if offset == 0:
        try:
            # If every "header" value casts cleanly to its guessed column
            # type, the first row is probably data rather than a header.
            [t.cast(v) for (t, v) in zip(types, headers)]
        except:
            pass
        else:
            # We don't need the headers_processor or the offset_processor
            row_set._processors = []
            row_set.register_processor(strip_processor())
            row_set.register_processor(types_processor(types))
            headers = None

    # Construct the Myria schema
    schema = messy_to_schema(types, headers)
    logging.info("Myria schema: {}".format(json.dumps(schema)))

    # Prepare data for writing to Myria
    data, kwargs = write_data(row_set, schema)

    if not args.dry:
        # Connect to Myria and send the data
        connection = myria.MyriaConnection(hostname=args.hostname, port=args.port, ssl=args.ssl)
        ret = connection.upload_file(relation_key, schema, data, args.overwrite, **kwargs)
        sys.stdout.write(pretty_json(ret))
    else:
        sys.stdout.write(data)
示例3: __init__
def __init__(self, filename):
    """
    When provided with a filename (to a CSV, XLS, or XLSX) the constructor
    will attempt to load the file and ensure that messytables knows how to
    process it.
    """
    self.tableset = None
    fh = None
    try:
        _, ext = os.path.splitext(filename)
        # Keep a reference to the open file so the CSV fallback below can
        # reuse it (the original referenced an undefined name `f` there).
        fh = open(filename, "r")
        self.tableset = messytables.any_tableset(fh, extension=ext[1:])
    except Exception as e:
        if str(e) == "Unrecognized MIME type: text/plain":
            # Attempt to force the load as a CSV file to work around messytables
            # not recognising text/plain
            self.tableset = messytables.any_tableset(fh, mimetype="text/csv")
        else:
            log.exception(e)
            raise Exception(u"Failed to load the file at {0}".format(filename))
示例4: transform
def transform(self):
    """Fetch the remote resource and parse it into a messytables table set.

    :returns: implicitly leaves ``table_set`` for the caller's scope in the
        original flow; raises on any failure.
    :raises ResourceError: when the resource cannot be fetched or parsed.
    """
    handle = self.open_data(self.url)
    if not handle:
        raise ResourceError("Remote resource missing",
                            "Unable to load the remote resource")

    try:
        if self.is_csv():
            # For CSV the (possibly wrong) MIME type is omitted so the
            # extension alone drives detection.
            table_set = any_tableset(fileobj=handle,
                                     extension=self.type)
        else:
            table_set = any_tableset(fileobj=handle,
                                     extension=self.type,
                                     mimetype=self.mimetype)
    except Exception as e:
        # e.g. ValueError('Unrecognized MIME type: application/vnd.oasis.opendocument.spreadsheet')
        # NOTE: `except Exception, e` is Python-2-only syntax; `as e`
        # works on both Python 2.6+ and Python 3.
        log.warn('Messytables parse error %s %s: %s', self.resource_identifier, self.url, e)
        log.warn('Some data: ext: %s, mime: %s', self.type, self.mimetype)
        raise ResourceError("Resource loading error",
                            "Unable to load the resource")
示例5: ingest
def ingest(self, meta, local_path):
    """Parse the tabular file at *local_path*, attach its tables to
    *meta*, then build and emit the resulting document."""
    with open(local_path, 'rb') as source:
        sheets = any_tableset(source,
                              extension=meta.extension,
                              mimetype=meta.mime_type,
                              window=20000)
        # One generated table per sheet, in sheet order.
        meta.tables = [self.generate_table(meta, index, rows)
                       for index, rows in enumerate(sheets.tables)]
        document = self.create_document(meta)
        self.emit(document)
示例6: proc
def proc(f, database_name, table_name):
    """Load a tabular file into a Hive table.

    Guesses headers and column types with messytables, writes the rows to
    a temporary ^A-delimited data file, generates a DROP/CREATE/LOAD Hive
    script in a second temporary file, and runs it via the `hive` CLI.

    :param f: open file object containing the tabular data
    :param database_name: Hive database to load into
    :param table_name: Hive table to (re)create and load
    """
    table_set = messytables.any_tableset(f)
    row_set = table_set.tables[0]
    # guess header names and the offset of the header:
    offset, headers = messytables.headers_guess(row_set.sample)
    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    # Only strings and dates are considered; everything else stays a string.
    types = messytables.type_guess(row_set.sample, types=[
        messytables.types.StringType,
        messytables.types.DateType,
    ], strict=True)

    hive_data_file = tempfile.NamedTemporaryFile(mode='w')
    fields_ddl = ','.join([
        ' {0} {1}\n'.format(
            canonicalize_column_name(colName),
            hive_column_type(colType)
        )
        for colName, colType in zip(headers, types)
    ])
    hive_sql = '''
DROP TABLE IF EXISTS {0};
CREATE TABLE {0} (
{1}
)
STORED AS TEXTFILE
TBLPROPERTIES ("comment"="add_messytable on {3}");
LOAD DATA LOCAL INPATH '{2}' OVERWRITE INTO TABLE {0};
'''.format(table_name, fields_ddl, hive_data_file.name,
           datetime.datetime.now().isoformat())

    hive_cmd_file = tempfile.NamedTemporaryFile(mode='w')
    print(hive_sql, file=hive_cmd_file)
    hive_cmd_file.flush()

    row_set.register_processor(messytables.types_processor(types))

    # Write each row as ^A (\001) separated values — Hive's default
    # TEXTFILE field delimiter.
    for row in row_set:
        print('\001'.join(map(str, [ c.value for c in row])),
              file=hive_data_file)
    # flush before Hive reads the file via LOAD DATA LOCAL INPATH
    hive_data_file.flush()

    subprocess.call([
        'hive',
        '--database', database_name,
        '-f', hive_cmd_file.name,
    ])
示例7: transform
def transform(self):
    """Fetch the remote resource and parse it with messytables.

    :raises ResourceError: (with Croatian user-facing messages) when the
        resource cannot be fetched or parsed.
    """
    handle = self.open_data(self.url)
    if not handle:
        raise ResourceError("Informacije",
                            "Udaljeni resurs nedostupan")

    try:
        table_set = any_tableset(fileobj=handle,
                                 extension=self.type,
                                 mimetype=self.mimetype)
    except Exception:
        # The original used Python-2-only `except Exception, e` and never
        # used `e`; `except Exception:` is equivalent and cross-version.
        raise ResourceError("Informacija",
                            "Resurs nedostupan")
示例8: validate_file
def validate_file(file_tmp, file_name, tmp_filepath):
    """Validate an uploaded tabular file before it is stored in CKAN.

    For datastore-type files (csv/xls/xlsx/tsv) this checks that the file
    has exactly one data sheet and that every header is at most 64
    characters, free of forbidden characters, and unique. On any
    violation the temporary upload is rolled back and a ValidationError
    is raised. Non-datastore files are accepted without checks.

    :param file_tmp: open file object with the uploaded data
    :param file_name: original upload name (drives the extension check)
    :param tmp_filepath: temporary file path removed on rollback
    :raises logic.ValidationError: when a validation rule fails
    """
    log.info("upload: checking file * %s * ", file_name)
    MAX_HEADER_LENGTH = 64
    # not allowed characters ( - ' " ’ ‘) regex.
    # The original pattern r"[\-|\'|\"|\u2018|\u2019]" had two bugs: the
    # '|' separators are literal inside a character class (so headers
    # containing '|' were wrongly rejected), and in a Python 2 byte
    # string the \u201x escapes are never expanded to the curly quotes.
    inappropriate_chars = re.compile(u"[-'\"\u2018\u2019]")
    datastore_ext = config.get('ckan.mimetype_guess', "csv xls xlsx tsv")
    tmp_file_name, tmp_file_ext = os.path.splitext(file_name)
    # check if datastore file (csv xls xlsx tsv)
    if tmp_file_ext[1:].lower() in datastore_ext:
        table_set = any_tableset(file_tmp)
        # check if only one data sheet in the file
        if len(table_set.tables) > 1:
            rollback_tmp(file_tmp, tmp_filepath)
            log.error("upload: the file * %s * was not uploaded - There is more then one data sheet in the file", file_name)
            raise logic.ValidationError(
                {'upload': ['There is more then one data sheet in the file']}
            )
        else:
            row_set = table_set.tables[0]
            # guess header names and the offset of the header:
            offset, headers = headers_guess(row_set.sample)
            row_set.register_processor(headers_processor(headers))
            for header in headers:
                # too long header
                if len(header) > MAX_HEADER_LENGTH:
                    rollback_tmp(file_tmp, tmp_filepath)
                    log.error("upload: the file * %s * was not uploaded - too long header - * %s *",
                              file_name, header)
                    raise logic.ValidationError(
                        {'upload': ['too long header (64 max)']}
                    )
                # not allowed characters in header ( - ' " ’ ‘)
                if inappropriate_chars.search(header):
                    rollback_tmp(file_tmp, tmp_filepath)
                    log.error("upload: the file * %s * was not uploaded - there are inappropriate characters in headers * %s *",
                              file_name, header)
                    raise logic.ValidationError(
                        {'upload': ['there are inappropriate characters in headers (apostrophe/apostrophes/dash)']}
                    )
            # Check for duplicate fields
            unique_fields = set(headers)
            if not len(unique_fields) == len(headers):
                rollback_tmp(file_tmp, tmp_filepath)
                log.error("upload: the file * %s * was not uploaded - Duplicate column names are not supported", file_name)
                raise logic.ValidationError({'upload': ['Duplicate column names are not supported']})
        log.info("passed validation succesfully - the file * %s * was uploaded to CKAN (filestore)", file_name)
    else:
        pass
示例9: create_new_model
def create_new_model(self, modelname, app_label):
    """ Use messytables to guess field types and build a new model """
    nocols = False
    cols = self.csvfile[0]
    for col in cols:
        if not col:
            nocols = True
    if nocols:
        # Generate one placeholder name per column. The original used
        # range(1, len(cols)), which produced one name too few.
        cols = ['col_%s' % num for num in range(1, len(cols) + 1)]
        print ('No column names for %s columns' % len(cols))
    else:
        cols = [cleancol.sub('_', col).lower() for col in cols]
    try:
        from messytables import any_tableset, type_guess
    except ImportError:
        # Narrowed from a bare except: only a missing dependency should
        # produce the "install messytables" advice.
        self.errors.append(
            'If you want to inspect CSV files to generate model code, you must install https://messytables.readthedocs.org')
        self.modelname = ''
        return
    try:
        table_set = any_tableset(self.filehandle)
        row_set = table_set.tables[0]
        types = type_guess(row_set.sample)
        types = [str(typeobj) for typeobj in types]
    except Exception as err:
        self.errors.append('messytables could not run due to error')
        self.errors.append(str(err))
        self.modelname = ''
        return
    fieldset = []
    maximums = self.get_maxlengths(cols)
    for i, col in enumerate(cols):
        length = maximums[i]
        if types[i] == 'String' and length > 255:
            types[i] = 'Text'
        integer = length
        decimal = int(length / 2)
        if decimal > 10:
            decimal = 10
        blank = True
        default = True
        column = (col, types[i], length, length, integer, decimal, blank, default)
        fieldset.append(column)
    # Import here so that messytables is not a dependency for just using csvimport cmd
    from csvimport.make_model import MakeModel
    maker = MakeModel()
    return maker.model_from_table('%s_%s' % (app_label, modelname), fieldset)
示例10: read_file
def read_file(self, filename):
    """
    Guess the filetype and read the file into row sets.

    :param filename: path of the tabular file to open
    :returns: a messytables table set, or None when the file cannot be
        opened or its schema cannot be detected
    """
    fh = None
    try:
        fh = open(filename, 'rb')
        table_set = any_tableset(fh)  # guess the type...
    except Exception:
        # Narrowed from a bare except (which also caught KeyboardInterrupt
        # and SystemExit). Close the handle we opened, since a failed
        # parse previously leaked it.
        if fh is not None:
            fh.close()
        # Cannot find the schema.
        table_set = None
    return table_set
示例11: parse_table
def parse_table(source):
    """Generate (error, fields, data) triples, one per source row.

    On a successfully converted row yields ``(None, fields, data)``; on a
    per-row conversion failure yields ``(exception, fields, None)`` so the
    caller can decide how to handle it. Rows whose cells are all None are
    skipped. ``fields`` is the field spec derived from the first row.
    """
    # This is a work-around because messytables hangs on boto file
    # handles, so we're doing it via plain old HTTP.
    # We're also passing in an extended window size to give more
    # reliable type detection.
    # Because Python's CSV dialect sniffer isn't the best, this also
    # constrains the field quoting character to a double quote.
    table_set = mt.any_tableset(source.fh(),
                                extension=source.meta.get('extension'),
                                mimetype=source.meta.get('mime_type'),
                                quotechar='"', window=20000)
    tables = list(table_set.tables)
    if not len(tables):
        log.error("No tables were found in the source file.")
        return
    row_set = tables[0]
    # First sample row supplies the header names; data starts at offset 1.
    headers = [c.value for c in next(row_set.sample)]
    row_set.register_processor(mt.headers_processor(headers))
    row_set.register_processor(mt.offset_processor(1))
    types = mt.type_guess(row_set.sample, strict=True)
    row_set.register_processor(mt.types_processor(types, strict=True))

    fields, i = {}, 0
    row_iter = iter(row_set)

    while True:
        i += 1
        try:
            # Python 2 iterator protocol (row_iter.next()).
            row = row_iter.next()
            if not len(fields):
                fields = generate_field_spec(row)
            data = convert_row(row, fields, i)
            check_empty = set(data.values())
            # Skip rows that are entirely empty (every value is None).
            if None in check_empty and len(check_empty) == 1:
                continue
            yield None, fields, data
        except StopIteration:
            return
        except Exception, e:
            # log.exception(e)
            yield e, fields, None
示例12: resource_row_set
def resource_row_set(package, resource):
    """ Generate an iterator over all the rows in this resource's
    source data. """
    # Work-around: messytables hangs on boto file handles, so the data
    # is fetched via plain old HTTP instead.
    sheets = list(any_tableset(resource.fh(),
                               extension=resource.meta.get('extension'),
                               mimetype=resource.meta.get('mime_type')).tables)
    if not sheets:
        log.error("No tables were found in the source file.")
        return

    rows = sheets[0]
    # Detect the header row, then register processors so iteration yields
    # typed values starting just past the header.
    header_offset, column_names = headers_guess(rows.sample)
    rows.register_processor(headers_processor(column_names))
    rows.register_processor(offset_processor(header_offset + 1))
    guessed_types = type_guess(rows.sample, strict=True)
    rows.register_processor(types_processor(guessed_types))
    return rows
示例13: test_simple_xlsx
def test_simple_xlsx(self):
    """any_tableset should return an XLSXTableSet for an .xlsx file."""
    handle = horror_fobj('simple.xlsx')
    result = any_tableset(handle, extension='xlsx')
    assert isinstance(result, XLSXTableSet)
示例14: test_simple_csv
def test_simple_csv(self):
    """any_tableset should return a CSVTableSet for a .csv file."""
    handle = horror_fobj('simple.csv')
    result = any_tableset(handle, extension='csv')
    assert isinstance(result, CSVTableSet)
示例15: test_scraperwiki_xlsx
def test_scraperwiki_xlsx(self):
    """A scraperwiki-generated xlsx should parse and contain 16 rows."""
    handle = horror_fobj('sw_gen.xlsx')
    sheet = any_tableset(handle).tables[0]
    rows = [row for row in sheet]
    assert_equal(16, len(rows))