This article collects typical usage examples of the pyarrow.struct method in Python. If you have been wondering what pyarrow.struct does, how to call it, or where to find working code, the curated examples below may help. You can also explore further usage examples from the pyarrow module.
The following presents 11 code examples of the pyarrow.struct method, sorted by popularity by default.
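Before the examples, here is a minimal, self-contained sketch of the method itself: pa.struct builds a StructType from a list of named child fields, which can then drive conversion of Python dicts (the field names and values below are illustrative).

import pyarrow as pa

# Build a struct type from a list of named child fields.
point_type = pa.struct([
    pa.field("x", pa.int64()),
    pa.field("y", pa.float64()),
])

# The type can then be used to convert Python dicts; a None entry
# becomes a null struct.
points = pa.array([{"x": 1, "y": 2.5}, None], type=point_type)
print(points.type)  # struct<x: int64, y: double>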
Example 1: test_simple
# Required import: import pyarrow [as alias]
# Or alternatively: from pyarrow import struct [as alias]
def test_simple(self, factory):
  # 3 int64 values
  # 5 int32 offsets
  # 1 null bitmap byte for outer ListArray
  # 1 null bitmap byte for inner Int64Array
  # 46 bytes in total.
  list_array = pa.array([[1, 2], [None], None, None],
                        type=pa.list_(pa.int64()))
  # 1 null bitmap byte for outer StructArray.
  # 1 null bitmap byte for inner Int64Array.
  # 4 int64 values.
  # 34 bytes in total.
  struct_array = pa.array([{"a": 1}, {"a": 2}, {"a": None}, None],
                          type=pa.struct([pa.field("a", pa.int64())]))
  entity = factory([list_array, struct_array], ["a1", "a2"])
  self.assertEqual(46 + 34, table_util.TotalByteSize(entity))
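The byte accounting in the comments can be checked against pyarrow's public API; a minimal sketch, assuming a pyarrow build where Array.nbytes reports the exact buffer sizes tallied above rather than padded allocations.

import pyarrow as pa

list_array = pa.array([[1, 2], [None], None, None],
                      type=pa.list_(pa.int64()))
struct_array = pa.array([{"a": 1}, {"a": 2}, {"a": None}, None],
                        type=pa.struct([pa.field("a", pa.int64())]))

# 20 offset bytes + 24 value bytes + 2 bitmap bytes = 46, and
# 32 value bytes + 2 bitmap bytes = 34, respectively.
print(list_array.nbytes, struct_array.nbytes)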
Example 2: test_sequence_feature_column_name_not_struct_in_schema
# Required import: import pyarrow [as alias]
# Or alternatively: from pyarrow import struct [as alias]
def test_sequence_feature_column_name_not_struct_in_schema(self):
  schema_text_proto = """
      feature {
        name: "##SEQUENCE##"
        type: INT
      }
  """
  serialized_schema = text_format.Parse(
      schema_text_proto, schema_pb2.Schema()).SerializeToString()
  error_msg_regex = (
      "Found a feature in the schema with the sequence_feature_column_name "
      r"\(i.e., ##SEQUENCE##\) that is not a struct.*")
  with self.assertRaisesRegex(RuntimeError, error_msg_regex):
    sequence_example_coder.SequenceExamplesToRecordBatchDecoder(
        _TEST_SEQUENCE_COLUMN_NAME, serialized_schema)
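For contrast with the failing schema above, a sequence feature column that would pass this check is declared with type STRUCT, nesting the per-step features inside its struct_domain. This is a hedged sketch based on the tensorflow_metadata schema proto; the nested feature name is illustrative.

from google.protobuf import text_format
from tensorflow_metadata.proto.v0 import schema_pb2

# The sequence column itself is a STRUCT; its struct_domain holds
# the actual sequence features.
schema_text_proto = """
    feature {
      name: "##SEQUENCE##"
      type: STRUCT
      struct_domain {
        feature {
          name: "int_feature"
          type: INT
        }
      }
    }
"""
serialized_schema = text_format.Parse(
    schema_text_proto, schema_pb2.Schema()).SerializeToString()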
Example 3: test_arrow_schema_arrow_1644_list_of_struct
# Required import: import pyarrow [as alias]
# Or alternatively: from pyarrow import struct [as alias]
def test_arrow_schema_arrow_1644_list_of_struct():
    arrow_schema = pa.schema([
        pa.field('id', pa.string()),
        pa.field('list_of_struct',
                 pa.list_(pa.struct([pa.field('a', pa.string()), pa.field('b', pa.int32())])))
    ])
    mock_dataset = _mock_parquet_dataset([], arrow_schema)
    unischema = Unischema.from_arrow_schema(mock_dataset)
    assert getattr(unischema, 'id').name == 'id'
    assert not hasattr(unischema, 'list_of_struct')
Example 4: test_arrow_schema_arrow_1644_list_of_list
# Required import: import pyarrow [as alias]
# Or alternatively: from pyarrow import struct [as alias]
def test_arrow_schema_arrow_1644_list_of_list():
    arrow_schema = pa.schema([
        pa.field('id', pa.string()),
        pa.field('list_of_list',
                 pa.list_(pa.list_(pa.struct([pa.field('a', pa.string()), pa.field('b', pa.int32())]))))
    ])
    mock_dataset = _mock_parquet_dataset([], arrow_schema)
    unischema = Unischema.from_arrow_schema(mock_dataset)
    assert getattr(unischema, 'id').name == 'id'
    assert not hasattr(unischema, 'list_of_list')
Example 5: test_arrow_schema_convertion_ignore
# Required import: import pyarrow [as alias]
# Or alternatively: from pyarrow import struct [as alias]
def test_arrow_schema_convertion_ignore():
    arrow_schema = pa.schema([
        pa.field('list_of_int', pa.float16()),
        pa.field('struct', pa.struct([pa.field('a', pa.string()), pa.field('b', pa.int32())])),
    ])
    mock_dataset = _mock_parquet_dataset([], arrow_schema)
    unischema = Unischema.from_arrow_schema(mock_dataset, omit_unsupported_fields=True)
    assert not hasattr(unischema, 'list_of_int')
Example 6: test_bq_to_arrow_data_type_w_struct
# Required import: import pyarrow [as alias]
# Or alternatively: from pyarrow import struct [as alias]
def test_bq_to_arrow_data_type_w_struct(module_under_test, bq_type):
    fields = (
        schema.SchemaField("field01", "STRING"),
        schema.SchemaField("field02", "BYTES"),
        schema.SchemaField("field03", "INTEGER"),
        schema.SchemaField("field04", "INT64"),
        schema.SchemaField("field05", "FLOAT"),
        schema.SchemaField("field06", "FLOAT64"),
        schema.SchemaField("field07", "NUMERIC"),
        schema.SchemaField("field08", "BOOLEAN"),
        schema.SchemaField("field09", "BOOL"),
        schema.SchemaField("field10", "TIMESTAMP"),
        schema.SchemaField("field11", "DATE"),
        schema.SchemaField("field12", "TIME"),
        schema.SchemaField("field13", "DATETIME"),
        schema.SchemaField("field14", "GEOGRAPHY"),
    )
    field = schema.SchemaField("ignored_name", bq_type, mode="NULLABLE", fields=fields)
    actual = module_under_test.bq_to_arrow_data_type(field)
    expected = pyarrow.struct(
        (
            pyarrow.field("field01", pyarrow.string()),
            pyarrow.field("field02", pyarrow.binary()),
            pyarrow.field("field03", pyarrow.int64()),
            pyarrow.field("field04", pyarrow.int64()),
            pyarrow.field("field05", pyarrow.float64()),
            pyarrow.field("field06", pyarrow.float64()),
            pyarrow.field("field07", module_under_test.pyarrow_numeric()),
            pyarrow.field("field08", pyarrow.bool_()),
            pyarrow.field("field09", pyarrow.bool_()),
            pyarrow.field("field10", module_under_test.pyarrow_timestamp()),
            pyarrow.field("field11", pyarrow.date32()),
            pyarrow.field("field12", module_under_test.pyarrow_time()),
            pyarrow.field("field13", module_under_test.pyarrow_datetime()),
            pyarrow.field("field14", pyarrow.string()),
        )
    )
    assert pyarrow.types.is_struct(actual)
    assert actual.num_children == len(fields)
    assert actual.equals(expected)
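The assertions above rely on struct-type introspection; a minimal standalone sketch of the same checks on a hand-built type (note that num_children was renamed num_fields in later pyarrow releases):

import pyarrow as pa

t = pa.struct([pa.field("a", pa.string()), pa.field("b", pa.int64())])
assert pa.types.is_struct(t)
assert t.num_fields == 2        # older pyarrow spelled this num_children
assert t.field(0).name == "a"   # child fields are addressable by index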
Example 7: test_bq_to_arrow_data_type_w_array_struct
# Required import: import pyarrow [as alias]
# Or alternatively: from pyarrow import struct [as alias]
def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type):
    fields = (
        schema.SchemaField("field01", "STRING"),
        schema.SchemaField("field02", "BYTES"),
        schema.SchemaField("field03", "INTEGER"),
        schema.SchemaField("field04", "INT64"),
        schema.SchemaField("field05", "FLOAT"),
        schema.SchemaField("field06", "FLOAT64"),
        schema.SchemaField("field07", "NUMERIC"),
        schema.SchemaField("field08", "BOOLEAN"),
        schema.SchemaField("field09", "BOOL"),
        schema.SchemaField("field10", "TIMESTAMP"),
        schema.SchemaField("field11", "DATE"),
        schema.SchemaField("field12", "TIME"),
        schema.SchemaField("field13", "DATETIME"),
        schema.SchemaField("field14", "GEOGRAPHY"),
    )
    field = schema.SchemaField("ignored_name", bq_type, mode="REPEATED", fields=fields)
    actual = module_under_test.bq_to_arrow_data_type(field)
    expected_value_type = pyarrow.struct(
        (
            pyarrow.field("field01", pyarrow.string()),
            pyarrow.field("field02", pyarrow.binary()),
            pyarrow.field("field03", pyarrow.int64()),
            pyarrow.field("field04", pyarrow.int64()),
            pyarrow.field("field05", pyarrow.float64()),
            pyarrow.field("field06", pyarrow.float64()),
            pyarrow.field("field07", module_under_test.pyarrow_numeric()),
            pyarrow.field("field08", pyarrow.bool_()),
            pyarrow.field("field09", pyarrow.bool_()),
            pyarrow.field("field10", module_under_test.pyarrow_timestamp()),
            pyarrow.field("field11", pyarrow.date32()),
            pyarrow.field("field12", module_under_test.pyarrow_time()),
            pyarrow.field("field13", module_under_test.pyarrow_datetime()),
            pyarrow.field("field14", pyarrow.string()),
        )
    )
    assert pyarrow.types.is_list(actual)
    assert pyarrow.types.is_struct(actual.value_type)
    assert actual.value_type.num_children == len(fields)
    assert actual.value_type.equals(expected_value_type)
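Similarly, a REPEATED column maps to a list type whose value_type is the struct; a quick standalone check:

import pyarrow as pa

lt = pa.list_(pa.struct([pa.field("a", pa.int64())]))
assert pa.types.is_list(lt)
assert pa.types.is_struct(lt.value_type)  # the struct is the list's element type
print(lt)  # list<item: struct<a: int64>>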
Example 8: bq_to_arrow_struct_data_type
# Required import: import pyarrow [as alias]
# Or alternatively: from pyarrow import struct [as alias]
def bq_to_arrow_struct_data_type(field):
    arrow_fields = []
    for subfield in field.fields:
        arrow_subfield = bq_to_arrow_field(subfield)
        if arrow_subfield:
            arrow_fields.append(arrow_subfield)
        else:
            # Could not determine a subfield type. Fall back to type
            # inference.
            return None
    return pyarrow.struct(arrow_fields)
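A hedged usage sketch for the helper above, assuming it lives alongside a bq_to_arrow_field implementation in the same module and using google-cloud-bigquery SchemaField objects; the column and field names are illustrative.

from google.cloud.bigquery import schema

# Hypothetical nested RECORD column.
record = schema.SchemaField(
    "address", "RECORD",
    fields=(
        schema.SchemaField("street", "STRING"),
        schema.SchemaField("zip", "INT64"),
    ),
)
# Returns a pyarrow.struct type on success, or None so the caller can
# fall back to pyarrow's own type inference.
arrow_type = bq_to_arrow_struct_data_type(record)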
Example 9: test_iterate_over_timestamp_ntz_chunk
# Required import: import pyarrow [as alias]
# Or alternatively: from pyarrow import struct [as alias]
def test_iterate_over_timestamp_ntz_chunk():
    random.seed(datetime.datetime.now())
    scale = random.randint(0, 9)
    column_meta = [
        {"logicalType": "TIMESTAMP_NTZ", "scale": str(scale)},
        {"logicalType": "TIMESTAMP_NTZ", "scale": str(scale)}
    ]
    data_type = pyarrow.struct([pyarrow.field('epoch', pyarrow.int64()),
                                pyarrow.field('fraction', pyarrow.int32())]) if scale > 7 else pyarrow.int64()

    def timestamp_ntz_generator(scale):
        epoch = random.randint(-621355968, 2534023007)
        frac = random.randint(0, 10**scale - 1) * (10**(9 - scale)) if scale > 7 else random.randint(0, 10**scale - 1)
        if scale > 7:
            return {'epoch': epoch, 'fraction': frac}
        else:
            epoch = str(epoch)
            frac = str(frac)
            ZEROFILL = '000000000'
            frac = ZEROFILL[:scale - len(frac)] + frac
            return int(epoch + frac) if scale else int(epoch)

    def expected_data_transform_ntz(_scale):
        def expected_data_transform_ntz_impl(data, scale=_scale):
            if scale > 7:
                frac = data['fraction']
                epoch = data['epoch']
                if epoch < 0:
                    epoch += 1
                    frac = 10**9 - frac
                frac = str(int(frac / 10**(9 - scale)))
                ZERO_FILL = '000000000'
                frac = ZERO_FILL[:scale - len(frac)] + frac
                data = int(str(epoch) + frac)
            microsec = str(data)
            if scale > 6:
                microsec = microsec[:-scale] + "." + microsec[-scale:-scale + 6]
            else:
                microsec = microsec[:-scale] + "." + microsec[-scale:] if scale else microsec
            if platform.system() == 'Windows':
                return datetime.datetime.utcfromtimestamp(0) + datetime.timedelta(seconds=(float(microsec)))
            else:
                return datetime.datetime.utcfromtimestamp(float(microsec))
        return expected_data_transform_ntz_impl

    iterate_over_test_chunk([data_type, data_type],
                            column_meta, lambda: timestamp_ntz_generator(scale), expected_data_transform_ntz(scale))
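The transform above is mostly string arithmetic so it can handle all scales and negative epochs; for the common case of a non-negative epoch at scale > 7, the {'epoch', 'fraction'} struct decodes as in this simplified sketch (not the connector's actual code path):

import datetime

def decode_ntz(value):
    # value mirrors one {'epoch': seconds, 'fraction': nanoseconds} entry;
    # the fraction is truncated to microsecond precision, as in the test.
    return (datetime.datetime(1970, 1, 1)
            + datetime.timedelta(seconds=value['epoch'],
                                 microseconds=value['fraction'] // 1000))

print(decode_ntz({'epoch': 1577836800, 'fraction': 123456789}))
# -> 2020-01-01 00:00:00.123456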
Example 10: test_iterate_over_timestamp_ltz_chunk
# Required import: import pyarrow [as alias]
# Or alternatively: from pyarrow import struct [as alias]
def test_iterate_over_timestamp_ltz_chunk():
    random.seed(datetime.datetime.now())
    scale = random.randint(0, 9)
    column_meta = [
        {"logicalType": "TIMESTAMP_LTZ", "scale": str(scale)},
        {"logicalType": "TIMESTAMP_LTZ", "scale": str(scale)}
    ]
    data_type = pyarrow.struct([pyarrow.field('epoch', pyarrow.int64()),
                                pyarrow.field('fraction', pyarrow.int32())]) if scale > 7 else pyarrow.int64()

    def timestamp_ltz_generator(scale):
        epoch = random.randint(-621355968, 2534023007)
        frac = random.randint(0, 10**scale - 1) * (10**(9 - scale)) if scale > 7 else random.randint(0, 10**scale - 1)
        if scale > 7:
            return {'epoch': epoch, 'fraction': frac}
        else:
            epoch = str(epoch)
            frac = str(frac)
            ZEROFILL = '000000000'
            frac = ZEROFILL[:scale - len(frac)] + frac
            return int(epoch + frac) if scale else int(epoch)

    def expected_data_transform_ltz(_scale):
        def expected_data_transform_ltz_impl(data, scale=_scale):
            tzinfo = get_timezone()  # can put a string parameter here in the future
            if scale > 7:
                frac = data['fraction']
                epoch = data['epoch']
                if epoch < 0:
                    epoch += 1
                    frac = 10**9 - frac
                frac = str(int(frac / 10**(9 - scale)))
                ZERO_FILL = '000000000'
                frac = ZERO_FILL[:scale - len(frac)] + frac
                data = int(str(epoch) + frac)
            microsec = str(data)
            if scale > 6:
                microsec = microsec[:-scale] + "." + microsec[-scale:-scale + 6]
            else:
                microsec = microsec[:-scale] + "." + microsec[-scale:] if scale else microsec
            if platform.system() == 'Windows':
                t0 = datetime.datetime.utcfromtimestamp(0) + datetime.timedelta(seconds=(float(microsec)))
                return pytz.utc.localize(t0, is_dst=False).astimezone(tzinfo)
            else:
                return datetime.datetime.fromtimestamp(float(microsec), tz=tzinfo)
        return expected_data_transform_ltz_impl

    iterate_over_test_chunk([data_type, data_type],
                            column_meta, lambda: timestamp_ltz_generator(scale), expected_data_transform_ltz(scale))
Example 11: test_iterate_over_timestamp_tz_chunk
# Required import: import pyarrow [as alias]
# Or alternatively: from pyarrow import struct [as alias]
def test_iterate_over_timestamp_tz_chunk():
    random.seed(datetime.datetime.now())
    scale = random.randint(0, 9)
    column_meta = [
        {"byteLength": "16" if scale > 3 else "8", "logicalType": "TIMESTAMP_TZ", "scale": str(scale)},
        {"byteLength": "16" if scale > 3 else "8", "logicalType": "TIMESTAMP_TZ", "scale": str(scale)}
    ]
    type1 = pyarrow.struct([pyarrow.field('epoch', pyarrow.int64()),
                            pyarrow.field('timezone', pyarrow.int32()),
                            pyarrow.field('fraction', pyarrow.int32())])
    type2 = pyarrow.struct([pyarrow.field('epoch', pyarrow.int64()),
                            pyarrow.field('timezone', pyarrow.int32())])
    data_type = type1 if scale > 3 else type2

    def timestamp_tz_generator(scale):
        epoch = random.randint(-621355968, 2534023007)
        frac = random.randint(0, 10**scale - 1) * (10**(9 - scale)) if scale > 3 else random.randint(0, 10**scale - 1)
        timezone = random.randint(1, 2879)
        if scale > 3:
            return {'epoch': epoch, 'timezone': timezone, 'fraction': frac}
        else:
            epoch = str(epoch)
            frac = str(frac)
            ZEROFILL = '000000000'
            frac = ZEROFILL[:scale - len(frac)] + frac
            return {'epoch': int(epoch + frac) if scale else int(epoch), 'timezone': timezone}

    def expected_data_transform_tz(_scale):
        def expected_data_transform_tz_impl(data, scale=_scale):
            timezone = data['timezone']
            tzinfo = _generate_tzinfo_from_tzoffset(timezone - 1440)
            epoch = data['epoch']
            if scale > 3:
                frac = data['fraction']
                if epoch < 0:
                    epoch += 1
                    frac = 10**9 - frac
                frac = str(int(frac / 10**(9 - scale)))
                ZERO_FILL = '000000000'
                frac = ZERO_FILL[:scale - len(frac)] + frac
                epoch = int(str(epoch) + frac)
            microsec = str(epoch)
            if scale > 6:
                microsec = microsec[:-scale] + "." + microsec[-scale:-scale + 6]
            else:
                microsec = microsec[:-scale] + "." + microsec[-scale:] if scale else microsec
            if platform.system() == 'Windows':
                t = datetime.datetime.utcfromtimestamp(0) + datetime.timedelta(seconds=(float(microsec)))
                if pytz.utc != tzinfo:
                    t += tzinfo.utcoffset(t)
                return t.replace(tzinfo=tzinfo)
            else:
                return datetime.datetime.fromtimestamp(float(microsec), tz=tzinfo)
        return expected_data_transform_tz_impl

    iterate_over_test_chunk([data_type, data_type],
                            column_meta, lambda: timestamp_tz_generator(scale), expected_data_transform_tz(scale))
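The timezone field in these structs stores the UTC offset in minutes biased by +1440, which is why the transform calls _generate_tzinfo_from_tzoffset(timezone - 1440). A standalone decode of just the offset, sketched with the standard library instead of the connector's helper:

import datetime

def tzinfo_from_sf_offset(tz_field):
    # A stored value of 1440 decodes to UTC, 1500 to UTC+01:00,
    # and 1380 to UTC-01:00.
    return datetime.timezone(datetime.timedelta(minutes=tz_field - 1440))

print(tzinfo_from_sf_offset(1500))  # UTC+01:00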