本文整理汇总了Python中messytables.any_tableset函数的典型用法代码示例。如果您正苦于以下问题:Python any_tableset函数的具体用法?Python any_tableset怎么用?Python any_tableset使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了any_tableset函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _process_upload
def _process_upload(context, data):
    """
    When provided with a filename this function will process each row
    within the file and then return a tuple. The tuple will contain
        - a list of error messages (if any)
        - a list of dicts where each dict contains ...
            {
            'package': 'a_package_id',
            'action': 'Added' or 'Updated'
            }
    """
    log = inventory_upload.get_logger()
    errors = []
    results = []

    filename = data['file']
    publisher_name = data['publisher']

    import urlparse
    client = CkanClient(
        base_location=urlparse.urljoin(context['site_url'], 'api'),
        api_key=context['apikey'])

    tableset = None
    fh = None
    try:
        _, ext = os.path.splitext(filename)
        # Keep a reference to the open file so the CSV fallback below can
        # reuse it (the original referenced an undefined name `f` there).
        fh = open(filename, 'r')
        tableset = messytables.any_tableset(fh, extension=ext[1:])
    except Exception as e:
        # messytables cannot auto-detect plain-text files as CSV; force a
        # CSV parse in exactly that case, otherwise record the failure.
        if str(e) == "Unrecognized MIME type: text/plain":
            tableset = messytables.any_tableset(fh, mimetype="text/csv")
        else:
            errors.append("Unable to load file: {0}".format(e))
示例2: main
def main(argv=None):
    """Read a single-table file (or stdin), guess its schema with
    messytables, and upload the data to Myria.

    :param argv: optional argument list passed to ``parse_args``
        (``None`` means use ``sys.argv``).
    """
    args = parse_args(argv)
    if args.file is None:
        # slurp the whole input since there seems to be a bug in messytables
        # which should be able to handle streams but doesn't
        args.file = cStringIO.StringIO(sys.stdin.read())
    relation_key = args_to_relation_key(args)

    table_set = any_tableset(args.file)
    if len(table_set.tables) != 1:
        raise ValueError("Can only handle files with a single table, not %s" % len(table_set.tables))

    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(strip_processor())
    row_set.register_processor(headers_processor(headers))
    # Temporarily, mark the offset of the header
    row_set.register_processor(offset_processor(offset + 1))

    # guess types and register them
    types = type_guess(replace_empty_string(row_set.sample), strict=True, types=[StringType, DecimalType, IntegerType])
    row_set.register_processor(types_processor(types))

    # Messytables seems to not handle the case where there are no headers.
    # Work around this as follows:
    # 1) offset must be 0
    # 2) if the types of the data match the headers, assume there are
    #    actually no headers
    if offset == 0:
        try:
            # If every "header" value casts cleanly to its guessed column
            # type, the first row is probably data rather than a header.
            [t.cast(v) for (t, v) in zip(types, headers)]
        except:
            pass
        else:
            # We don't need the headers_processor or the offset_processor
            row_set._processors = []
            row_set.register_processor(strip_processor())
            row_set.register_processor(types_processor(types))
            headers = None

    # Construct the Myria schema
    schema = messy_to_schema(types, headers)
    logging.info("Myria schema: {}".format(json.dumps(schema)))

    # Prepare data for writing to Myria
    data, kwargs = write_data(row_set, schema)

    if not args.dry:
        # Connect to Myria and send the data
        connection = myria.MyriaConnection(hostname=args.hostname, port=args.port, ssl=args.ssl)
        ret = connection.upload_file(relation_key, schema, data, args.overwrite, **kwargs)
        sys.stdout.write(pretty_json(ret))
    else:
        sys.stdout.write(data)
示例3: __init__
def __init__(self, filename):
    """
    When provided with a filename (to a CSV, XLS, or XLSX) the constructor
    will attempt to load the file and ensure that messytables knows how to
    process it.
    """
    self.tableset = None
    fh = None
    try:
        _, ext = os.path.splitext(filename)
        # Keep a reference to the open file so the CSV fallback below can
        # reuse it (the original referenced an undefined name `f` there).
        fh = open(filename, "r")
        self.tableset = messytables.any_tableset(fh, extension=ext[1:])
    except Exception as e:
        if str(e) == "Unrecognized MIME type: text/plain":
            # Attempt to force the load as a CSV file to work around messytables
            # not recognising text/plain
            self.tableset = messytables.any_tableset(fh, mimetype="text/csv")
        else:
            log.exception(e)
            raise Exception(u"Failed to load the file at {0}".format(filename))
示例4: transform
def transform(self):
    """Fetch the remote resource and parse it into a messytables table set.

    :returns: implicitly leaves ``table_set`` for the caller's scope in the
        original flow; raises on any failure.
    :raises ResourceError: when the resource cannot be fetched or parsed.
    """
    handle = self.open_data(self.url)
    if not handle:
        raise ResourceError("Remote resource missing",
                            "Unable to load the remote resource")

    try:
        if self.is_csv():
            # For CSV the (possibly wrong) MIME type is omitted so the
            # extension alone drives detection.
            table_set = any_tableset(fileobj=handle,
                                     extension=self.type)
        else:
            table_set = any_tableset(fileobj=handle,
                                     extension=self.type,
                                     mimetype=self.mimetype)
    except Exception as e:
        # e.g. ValueError('Unrecognized MIME type: application/vnd.oasis.opendocument.spreadsheet')
        # NOTE: `except Exception, e` is Python-2-only syntax; `as e`
        # works on both Python 2.6+ and Python 3.
        log.warn('Messytables parse error %s %s: %s', self.resource_identifier, self.url, e)
        log.warn('Some data: ext: %s, mime: %s', self.type, self.mimetype)
        raise ResourceError("Resource loading error",
                            "Unable to load the resource")
示例5: ingest
def ingest(self, meta, local_path):
    """Parse the tabular file at *local_path*, attach its tables to
    *meta*, then build and emit the resulting document."""
    with open(local_path, 'rb') as source:
        sheets = any_tableset(source,
                              extension=meta.extension,
                              mimetype=meta.mime_type,
                              window=20000)
        # One generated table per sheet, in sheet order.
        meta.tables = [self.generate_table(meta, index, rows)
                       for index, rows in enumerate(sheets.tables)]
        document = self.create_document(meta)
        self.emit(document)
示例6: proc
def proc(f, database_name, table_name):
    """Load a tabular file into a Hive table.

    Guesses headers and column types with messytables, writes the rows to
    a temporary ^A-delimited data file, generates a DROP/CREATE/LOAD Hive
    script in a second temporary file, and runs it via the `hive` CLI.

    :param f: open file object containing the tabular data
    :param database_name: Hive database to load into
    :param table_name: Hive table to (re)create and load
    """
    table_set = messytables.any_tableset(f)
    row_set = table_set.tables[0]
    # guess header names and the offset of the header:
    offset, headers = messytables.headers_guess(row_set.sample)
    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    # Only strings and dates are considered; everything else stays a string.
    types = messytables.type_guess(row_set.sample, types=[
        messytables.types.StringType,
        messytables.types.DateType,
    ], strict=True)

    hive_data_file = tempfile.NamedTemporaryFile(mode='w')
    fields_ddl = ','.join([
        ' {0} {1}\n'.format(
            canonicalize_column_name(colName),
            hive_column_type(colType)
        )
        for colName, colType in zip(headers, types)
    ])
    hive_sql = '''
DROP TABLE IF EXISTS {0};
CREATE TABLE {0} (
{1}
)
STORED AS TEXTFILE
TBLPROPERTIES ("comment"="add_messytable on {3}");
LOAD DATA LOCAL INPATH '{2}' OVERWRITE INTO TABLE {0};
'''.format(table_name, fields_ddl, hive_data_file.name,
           datetime.datetime.now().isoformat())

    hive_cmd_file = tempfile.NamedTemporaryFile(mode='w')
    print(hive_sql, file=hive_cmd_file)
    hive_cmd_file.flush()

    row_set.register_processor(messytables.types_processor(types))

    # Write each row as ^A (\001) separated values — Hive's default
    # TEXTFILE field delimiter.
    for row in row_set:
        print('\001'.join(map(str, [ c.value for c in row])),
              file=hive_data_file)
    # flush before Hive reads the file via LOAD DATA LOCAL INPATH
    hive_data_file.flush()

    subprocess.call([
        'hive',
        '--database', database_name,
        '-f', hive_cmd_file.name,
    ])
示例7: transform
def transform(self):
    """Fetch the remote resource and parse it with messytables.

    :raises ResourceError: (with Croatian user-facing messages) when the
        resource cannot be fetched or parsed.
    """
    handle = self.open_data(self.url)
    if not handle:
        raise ResourceError("Informacije",
                            "Udaljeni resurs nedostupan")

    try:
        table_set = any_tableset(fileobj=handle,
                                 extension=self.type,
                                 mimetype=self.mimetype)
    except Exception:
        # The original used Python-2-only `except Exception, e` and never
        # used `e`; `except Exception:` is equivalent and cross-version.
        raise ResourceError("Informacija",
                            "Resurs nedostupan")
示例8: validate_file
def validate_file(file_tmp, file_name, tmp_filepath):
    """Validate an uploaded tabular file before it is stored in CKAN.

    For datastore-type files (csv/xls/xlsx/tsv) this checks that the file
    has exactly one data sheet and that every header is at most 64
    characters, free of forbidden characters, and unique. On any
    violation the temporary upload is rolled back and a ValidationError
    is raised. Non-datastore files are accepted without checks.

    :param file_tmp: open file object with the uploaded data
    :param file_name: original upload name (drives the extension check)
    :param tmp_filepath: temporary file path removed on rollback
    :raises logic.ValidationError: when a validation rule fails
    """
    log.info("upload: checking file * %s * ", file_name)
    MAX_HEADER_LENGTH = 64
    # not allowed characters ( - ' " ’ ‘) regex.
    # The original pattern r"[\-|\'|\"|\u2018|\u2019]" had two bugs: the
    # '|' separators are literal inside a character class (so headers
    # containing '|' were wrongly rejected), and in a Python 2 byte
    # string the \u201x escapes are never expanded to the curly quotes.
    inappropriate_chars = re.compile(u"[-'\"\u2018\u2019]")
    datastore_ext = config.get('ckan.mimetype_guess', "csv xls xlsx tsv")
    tmp_file_name, tmp_file_ext = os.path.splitext(file_name)
    # check if datastore file (csv xls xlsx tsv)
    if tmp_file_ext[1:].lower() in datastore_ext:
        table_set = any_tableset(file_tmp)
        # check if only one data sheet in the file
        if len(table_set.tables) > 1:
            rollback_tmp(file_tmp, tmp_filepath)
            log.error("upload: the file * %s * was not uploaded - There is more then one data sheet in the file", file_name)
            raise logic.ValidationError(
                {'upload': ['There is more then one data sheet in the file']}
            )
        else:
            row_set = table_set.tables[0]
            # guess header names and the offset of the header:
            offset, headers = headers_guess(row_set.sample)
            row_set.register_processor(headers_processor(headers))
            for header in headers:
                # too long header
                if len(header) > MAX_HEADER_LENGTH:
                    rollback_tmp(file_tmp, tmp_filepath)
                    log.error("upload: the file * %s * was not uploaded - too long header - * %s *",
                              file_name, header)
                    raise logic.ValidationError(
                        {'upload': ['too long header (64 max)']}
                    )
                # not allowed characters in header ( - ' " ’ ‘)
                if inappropriate_chars.search(header):
                    rollback_tmp(file_tmp, tmp_filepath)
                    log.error("upload: the file * %s * was not uploaded - there are inappropriate characters in headers * %s *",
                              file_name, header)
                    raise logic.ValidationError(
                        {'upload': ['there are inappropriate characters in headers (apostrophe/apostrophes/dash)']}
                    )
            # Check for duplicate fields
            unique_fields = set(headers)
            if not len(unique_fields) == len(headers):
                rollback_tmp(file_tmp, tmp_filepath)
                log.error("upload: the file * %s * was not uploaded - Duplicate column names are not supported", file_name)
                raise logic.ValidationError({'upload': ['Duplicate column names are not supported']})
        log.info("passed validation succesfully - the file * %s * was uploaded to CKAN (filestore)", file_name)
    else:
        pass
示例9: create_new_model
def create_new_model(self, modelname, app_label):
    """ Use messytables to guess field types and build a new model """
    nocols = False
    cols = self.csvfile[0]
    for col in cols:
        if not col:
            nocols = True
    if nocols:
        # Generate one placeholder name per column. The original used
        # range(1, len(cols)), which produced one name too few.
        cols = ['col_%s' % num for num in range(1, len(cols) + 1)]
        print ('No column names for %s columns' % len(cols))
    else:
        cols = [cleancol.sub('_', col).lower() for col in cols]
    try:
        from messytables import any_tableset, type_guess
    except ImportError:
        # Narrowed from a bare except: only a missing dependency should
        # produce the "install messytables" advice.
        self.errors.append(
            'If you want to inspect CSV files to generate model code, you must install https://messytables.readthedocs.org')
        self.modelname = ''
        return
    try:
        table_set = any_tableset(self.filehandle)
        row_set = table_set.tables[0]
        types = type_guess(row_set.sample)
        types = [str(typeobj) for typeobj in types]
    except Exception as err:
        self.errors.append('messytables could not run due to error')
        self.errors.append(str(err))
        self.modelname = ''
        return
    fieldset = []
    maximums = self.get_maxlengths(cols)
    for i, col in enumerate(cols):
        length = maximums[i]
        if types[i] == 'String' and length > 255:
            types[i] = 'Text'
        integer = length
        decimal = int(length / 2)
        if decimal > 10:
            decimal = 10
        blank = True
        default = True
        column = (col, types[i], length, length, integer, decimal, blank, default)
        fieldset.append(column)
    # Import here so that messytables is not a dependency for just using csvimport cmd
    from csvimport.make_model import MakeModel
    maker = MakeModel()
    return maker.model_from_table('%s_%s' % (app_label, modelname), fieldset)
示例10: read_file
def read_file(self, filename):
    """
    Guess the filetype and read the file into row sets.

    :param filename: path of the tabular file to open
    :returns: a messytables table set, or None when the file cannot be
        opened or its schema cannot be detected
    """
    fh = None
    try:
        fh = open(filename, 'rb')
        table_set = any_tableset(fh)  # guess the type...
    except Exception:
        # Narrowed from a bare except (which also caught KeyboardInterrupt
        # and SystemExit). Close the handle we opened, since a failed
        # parse previously leaked it.
        if fh is not None:
            fh.close()
        # Cannot find the schema.
        table_set = None
    return table_set
示例11: parse_table
def parse_table(source):
    """Generate (error, fields, data) triples, one per source row.

    On a successfully converted row yields ``(None, fields, data)``; on a
    per-row conversion failure yields ``(exception, fields, None)`` so the
    caller can decide how to handle it. Rows whose cells are all None are
    skipped. ``fields`` is the field spec derived from the first row.
    """
    # This is a work-around because messytables hangs on boto file
    # handles, so we're doing it via plain old HTTP.
    # We're also passing in an extended window size to give more
    # reliable type detection.
    # Because Python's CSV dialect sniffer isn't the best, this also
    # constrains the field quoting character to a double quote.
    table_set = mt.any_tableset(source.fh(),
                                extension=source.meta.get('extension'),
                                mimetype=source.meta.get('mime_type'),
                                quotechar='"', window=20000)
    tables = list(table_set.tables)
    if not len(tables):
        log.error("No tables were found in the source file.")
        return
    row_set = tables[0]
    # First sample row supplies the header names; data starts at offset 1.
    headers = [c.value for c in next(row_set.sample)]
    row_set.register_processor(mt.headers_processor(headers))
    row_set.register_processor(mt.offset_processor(1))
    types = mt.type_guess(row_set.sample, strict=True)
    row_set.register_processor(mt.types_processor(types, strict=True))

    fields, i = {}, 0
    row_iter = iter(row_set)

    while True:
        i += 1
        try:
            # Python 2 iterator protocol (row_iter.next()).
            row = row_iter.next()
            if not len(fields):
                fields = generate_field_spec(row)
            data = convert_row(row, fields, i)
            check_empty = set(data.values())
            # Skip rows that are entirely empty (every value is None).
            if None in check_empty and len(check_empty) == 1:
                continue
            yield None, fields, data
        except StopIteration:
            return
        except Exception, e:
            # log.exception(e)
            yield e, fields, None
示例12: resource_row_set
def resource_row_set(package, resource):
    """ Generate an iterator over all the rows in this resource's
    source data. """
    # Work-around: messytables hangs on boto file handles, so the data
    # is fetched via plain old HTTP instead.
    sheets = list(any_tableset(resource.fh(),
                               extension=resource.meta.get('extension'),
                               mimetype=resource.meta.get('mime_type')).tables)
    if not sheets:
        log.error("No tables were found in the source file.")
        return

    rows = sheets[0]
    # Detect the header row, then register processors so iteration yields
    # typed values starting just past the header.
    header_offset, column_names = headers_guess(rows.sample)
    rows.register_processor(headers_processor(column_names))
    rows.register_processor(offset_processor(header_offset + 1))
    guessed_types = type_guess(rows.sample, strict=True)
    rows.register_processor(types_processor(guessed_types))
    return rows
示例13: test_simple_xlsx
def test_simple_xlsx(self):
    """any_tableset should return an XLSXTableSet for an .xlsx file."""
    handle = horror_fobj('simple.xlsx')
    result = any_tableset(handle, extension='xlsx')
    assert isinstance(result, XLSXTableSet)
示例14: test_simple_csv
def test_simple_csv(self):
    """any_tableset should return a CSVTableSet for a .csv file."""
    handle = horror_fobj('simple.csv')
    result = any_tableset(handle, extension='csv')
    assert isinstance(result, CSVTableSet)
示例15: test_scraperwiki_xlsx
def test_scraperwiki_xlsx(self):
    """A scraperwiki-generated xlsx should parse and contain 16 rows."""
    handle = horror_fobj('sw_gen.xlsx')
    sheet = any_tableset(handle).tables[0]
    rows = [row for row in sheet]
    assert_equal(16, len(rows))