本文整理汇总了Python中pyarrow.binary方法的典型用法代码示例。如果您正苦于以下问题：Python pyarrow.binary方法的具体用法？Python pyarrow.binary怎么用？Python pyarrow.binary使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块pyarrow的用法示例。
在下文中一共展示了pyarrow.binary方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_iterate_over_binary_chunk
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import binary [as 别名]
def test_iterate_over_binary_chunk():
    """Run the shared chunk-iteration check over two BINARY columns.

    Both columns use ``pyarrow.binary()`` as their Arrow type and share the
    same column metadata; cell values are produced by a generator that
    returns random byte arrays.
    """
    # random.seed() only accepts None, int, float, str, bytes or bytearray.
    # Passing the datetime object itself was deprecated in Python 3.9 and
    # raises TypeError from Python 3.11 on, so seed with its float timestamp.
    random.seed(datetime.datetime.now().timestamp())

    column_meta = {
        "byteLength": "100",
        "logicalType": "BINARY",
        "precision": "0",
        "scale": "0",
        "charLength": "0",
    }

    def byte_array_generator():
        # NOTE(review): this yields 1000 random bytes although the metadata
        # above declares byteLength "100" -- confirm this is intentional.
        return bytearray(os.urandom(1000))

    iterate_over_test_chunk(
        [pyarrow.binary(), pyarrow.binary()],
        [column_meta, column_meta],
        byte_array_generator,
    )
示例2: _GetExpectedColumnValues
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import binary [as 别名]
def _GetExpectedColumnValues(tfxio):
    """Return the expected Arrow arrays keyed by column path.

    The list and bytes Arrow types are the "large" variants when
    ``tfxio._can_produce_large_types`` is truthy, and the regular variants
    otherwise.
    """
    use_large_types = tfxio._can_produce_large_types
    make_list = pa.large_list if use_large_types else pa.list_
    binary_type = pa.large_binary() if use_large_types else pa.binary()

    int_feature = pa.array([[1], [2], [3]], type=make_list(pa.int64()))
    float_feature = pa.array(
        [[1, 2, 3, 4], [2, 3, 4, 5], None],
        type=make_list(pa.float32()))
    # Sequence features are doubly nested: a list of steps, each a list of
    # values.
    sequence_int_feature = pa.array(
        [[[1, 2], [3]], None, [[4]]],
        make_list(make_list(pa.int64())))
    sequence_string_feature = pa.array(
        [None, [[b"foo", b"bar"], []], [[b"baz"]]],
        make_list(make_list(binary_type)))

    return {
        path.ColumnPath(["int_feature"]): int_feature,
        path.ColumnPath(["float_feature"]): float_feature,
        path.ColumnPath([_SEQUENCE_COLUMN_NAME, "int_feature"]):
            sequence_int_feature,
        path.ColumnPath([_SEQUENCE_COLUMN_NAME, "string_feature"]):
            sequence_string_feature,
    }
示例3: testRaggedTensorStructTypeInvalidSteps
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import binary [as 别名]
def testRaggedTensorStructTypeInvalidSteps(self):
    """Building a TensorAdapter must fail for an unknown struct child step.

    The ragged tensor feature path is "ragged_feature" -> "wrong_step",
    while the struct column "ragged_feature" only has the children
    "inner_feature" and "x2".
    """
    # The text-format proto is kept flush-left so the literal stays unchanged.
    representation_text = """
ragged_tensor {
feature_path {
step: "ragged_feature"
step: "wrong_step"
}
}
"""
    tensor_representation = text_format.Parse(
        representation_text, schema_pb2.TensorRepresentation())

    int_child = pa.array([[1, 2, 3]], pa.list_(pa.int64()))
    bytes_child = pa.array([["a", "b", "c"]], pa.list_(pa.binary()))
    struct_column = pa.StructArray.from_arrays(
        [int_child, bytes_child], ["inner_feature", "x2"])
    record_batch = pa.RecordBatch.from_arrays(
        [struct_column], ["ragged_feature"])

    expected_error = ".*Unable to handle tensor output.*"
    with self.assertRaisesRegex(ValueError, expected_error):
        adapter_config = tensor_adapter.TensorAdapterConfig(
            record_batch.schema, {"output": tensor_representation})
        tensor_adapter.TensorAdapter(adapter_config)
示例4: testRaggedTensorStructTypeNonLeaf
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import binary [as 别名]
def testRaggedTensorStructTypeNonLeaf(self):
    """Building a TensorAdapter must fail when the path stops at a struct.

    The ragged tensor feature path has the single step "ragged_feature",
    which names the struct column itself rather than one of its children.
    """
    # The text-format proto is kept flush-left so the literal stays unchanged.
    representation_text = """
ragged_tensor {
feature_path {
step: "ragged_feature"
}
}
"""
    tensor_representation = text_format.Parse(
        representation_text, schema_pb2.TensorRepresentation())

    int_child = pa.array([[1, 2, 3]], pa.list_(pa.int64()))
    bytes_child = pa.array([["a", "b", "c"]], pa.list_(pa.binary()))
    struct_column = pa.StructArray.from_arrays(
        [int_child, bytes_child], ["inner_feature", "x2"])
    record_batch = pa.RecordBatch.from_arrays(
        [struct_column], ["ragged_feature"])

    expected_error = ".*Unable to handle tensor output.*"
    with self.assertRaisesRegex(ValueError, expected_error):
        adapter_config = tensor_adapter.TensorAdapterConfig(
            record_batch.schema, {"output": tensor_representation})
        tensor_adapter.TensorAdapter(adapter_config)
示例5: testIsListLike
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import binary [as 别名]
def testIsListLike(self):
    """is_list_like accepts list and large_list types and rejects others."""
    list_like_types = (pa.list_(pa.int64()), pa.large_list(pa.int64()))
    for arrow_type in list_like_types:
        self.assertTrue(arrow_util.is_list_like(arrow_type))

    other_types = (pa.binary(), pa.int64(), pa.large_string())
    for arrow_type in other_types:
        self.assertFalse(arrow_util.is_list_like(arrow_type))
示例6: testIsBinaryLike
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import binary [as 别名]
def testIsBinaryLike(self):
    """is_binary_like accepts binary/string types and rejects lists of them."""
    binary_like_types = (
        pa.binary(),
        pa.large_binary(),
        pa.string(),
        pa.large_string(),
    )
    for arrow_type in binary_like_types:
        self.assertTrue(arrow_util.is_binary_like(arrow_type))

    list_types = (pa.list_(pa.binary()), pa.large_list(pa.string()))
    for arrow_type in list_types:
        self.assertFalse(arrow_util.is_binary_like(arrow_type))
示例7: testEnumerateArraysStringWeight
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import binary [as 别名]
def testEnumerateArraysStringWeight(self):
    """enumerate_arrays must reject a weight column holding strings."""
    # The arrow type of a string changes between py2 and py3 so we accept
    # either.
    expected_message = (
        r'Weight column "w" must be of numeric type. Found (string|binary).*')
    with self.assertRaisesRegex(ValueError, expected_message):
        values = pa.array([[1], [2, 3]])
        weights = pa.array([["a"], ["b"]])
        batch = pa.RecordBatch.from_arrays([values, weights], ["v", "w"])
        enumerated = arrow_util.enumerate_arrays(
            batch,
            weight_column="w",
            enumerate_leaves_only=True)
        # Iterate the result fully; the yielded items themselves are unused.
        for _unused in enumerated:
            pass
示例8: test_redshift_category
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import binary [as 别名]
def test_redshift_category(bucket, databases_parameters):
    """Round-trip a categorical DataFrame through Redshift and check dtypes.

    The frame (minus its "binary" column) is copied to the table
    ``public.test_redshift_category`` and unloaded twice: once as a single
    DataFrame and once with ``chunked=True``. Each result is passed to
    ``ensure_data_types_category``, and the S3 path is deleted at the end.
    """
    path = f"s3://{bucket}/test_redshift_category/"
    df = get_df_category().drop(["binary"], axis=1, inplace=False)
    engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift")
    iam_role = databases_parameters["redshift"]["role"]

    wr.db.copy_to_redshift(
        df=df,
        path=path,
        con=engine,
        schema="public",
        table="test_redshift_category",
        mode="overwrite",
        iam_role=iam_role,
    )

    # Arguments shared by the plain and the chunked unload.
    unload_args = {
        "sql": "SELECT * FROM public.test_redshift_category",
        "con": engine,
        "iam_role": iam_role,
        "path": path,
        "keep_files": False,
        "categories": df.columns,
    }

    unloaded = wr.db.unload_redshift(**unload_args)
    ensure_data_types_category(unloaded)

    chunks = wr.db.unload_redshift(chunked=True, **unload_args)
    for chunk in chunks:
        ensure_data_types_category(chunk)

    wr.s3.delete_objects(path=path)
示例9: test_arrow_schema_convertion
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import binary [as 别名]
def test_arrow_schema_convertion():
    """Check that Unischema.from_arrow_schema mirrors an Arrow schema.

    Every Arrow field must appear under the same name with no codec, only
    the 'bool' field (declared non-nullable) may be non-nullable, and the
    field order must be preserved.
    """
    nullable_types = [
        ('string', pa.string()),
        ('int8', pa.int8()),
        ('int16', pa.int16()),
        ('int32', pa.int32()),
        ('int64', pa.int64()),
        ('float', pa.float32()),
        ('double', pa.float64()),
    ]
    fields = [pa.field(name, arrow_type) for name, arrow_type in nullable_types]
    # The third positional argument of pa.field is the nullable flag.
    fields.append(pa.field('bool', pa.bool_(), False))
    fields.extend([
        pa.field('fixed_size_binary', pa.binary(10)),
        pa.field('variable_size_binary', pa.binary()),
        pa.field('decimal', pa.decimal128(3, 4)),
        pa.field('timestamp_s', pa.timestamp('s')),
        pa.field('timestamp_ns', pa.timestamp('ns')),
        pa.field('date_32', pa.date32()),
        pa.field('date_64', pa.date64()),
    ])

    arrow_schema = pa.schema(fields)
    mock_dataset = _mock_parquet_dataset([], arrow_schema)
    unischema = Unischema.from_arrow_schema(mock_dataset)

    for name in arrow_schema.names:
        converted = getattr(unischema, name)
        assert converted.name == name
        assert converted.codec is None
        if name != 'bool':
            assert converted.nullable
        else:
            assert not converted.nullable

    # Test schema preserve fields order
    expected_order = [field.name for field in fields]
    assert list(unischema.fields.keys()) == expected_order
示例10: to_arrow_type
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import binary [as 别名]
def to_arrow_type(dt):
    """ Convert Spark data type to pyarrow type

    :param dt: Spark data type instance to convert.
    :return: the corresponding pyarrow data type.
    :raises TypeError: if ``dt`` has no Arrow equivalent here, if it is an
        array of timestamps, or if it is a BinaryType and the installed
        pyarrow is older than 0.10.0.
    """
    # distutils was removed from the standard library in Python 3.12
    # (PEP 632), so the version check below parses the version with `re`
    # instead of using distutils.version.LooseVersion.
    import re

    import pyarrow as pa

    # Exact type comparison is kept from the original; isinstance() would
    # also match subclasses, which would change behavior.
    if type(dt) == BooleanType:
        arrow_type = pa.bool_()
    elif type(dt) == ByteType:
        arrow_type = pa.int8()
    elif type(dt) == ShortType:
        arrow_type = pa.int16()
    elif type(dt) == IntegerType:
        arrow_type = pa.int32()
    elif type(dt) == LongType:
        arrow_type = pa.int64()
    elif type(dt) == FloatType:
        arrow_type = pa.float32()
    elif type(dt) == DoubleType:
        arrow_type = pa.float64()
    elif type(dt) == DecimalType:
        arrow_type = pa.decimal128(dt.precision, dt.scale)
    elif type(dt) == StringType:
        arrow_type = pa.string()
    elif type(dt) == BinaryType:
        # TODO: remove version check once minimum pyarrow version is 0.10.0
        # Compare only the leading numeric components (major, minor, patch)
        # so that suffixes such as ".dev1" or "rc1" do not break parsing.
        installed = tuple(
            int(number) for number in re.findall(r"\d+", pa.__version__)[:3])
        if installed < (0, 10, 0):
            raise TypeError("Unsupported type in conversion to Arrow: " + str(dt) +
                            "\nPlease install pyarrow >= 0.10.0 for BinaryType support.")
        arrow_type = pa.binary()
    elif type(dt) == DateType:
        arrow_type = pa.date32()
    elif type(dt) == TimestampType:
        # Timestamps should be in UTC, JVM Arrow timestamps require a timezone to be read
        arrow_type = pa.timestamp('us', tz='UTC')
    elif type(dt) == ArrayType:
        if type(dt.elementType) == TimestampType:
            raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
        # Element types are converted recursively.
        arrow_type = pa.list_(to_arrow_type(dt.elementType))
    else:
        raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
    return arrow_type
示例11: test_bq_to_arrow_data_type_w_struct
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import binary [as 别名]
def test_bq_to_arrow_data_type_w_struct(module_under_test, bq_type):
    """Check the Arrow struct type produced for a NULLABLE record field.

    A schema field of type ``bq_type`` with fourteen sub-fields is converted
    with ``bq_to_arrow_data_type``; the result must be a struct with one
    child per sub-field and must equal the hand-built expected struct.
    """
    fields = (
        schema.SchemaField("field01", "STRING"),
        schema.SchemaField("field02", "BYTES"),
        schema.SchemaField("field03", "INTEGER"),
        schema.SchemaField("field04", "INT64"),
        schema.SchemaField("field05", "FLOAT"),
        schema.SchemaField("field06", "FLOAT64"),
        schema.SchemaField("field07", "NUMERIC"),
        schema.SchemaField("field08", "BOOLEAN"),
        schema.SchemaField("field09", "BOOL"),
        schema.SchemaField("field10", "TIMESTAMP"),
        schema.SchemaField("field11", "DATE"),
        schema.SchemaField("field12", "TIME"),
        schema.SchemaField("field13", "DATETIME"),
        schema.SchemaField("field14", "GEOGRAPHY"),
    )
    field = schema.SchemaField("ignored_name", bq_type, mode="NULLABLE", fields=fields)
    actual = module_under_test.bq_to_arrow_data_type(field)
    expected = pyarrow.struct(
        (
            pyarrow.field("field01", pyarrow.string()),
            pyarrow.field("field02", pyarrow.binary()),
            pyarrow.field("field03", pyarrow.int64()),
            pyarrow.field("field04", pyarrow.int64()),
            pyarrow.field("field05", pyarrow.float64()),
            pyarrow.field("field06", pyarrow.float64()),
            pyarrow.field("field07", module_under_test.pyarrow_numeric()),
            pyarrow.field("field08", pyarrow.bool_()),
            pyarrow.field("field09", pyarrow.bool_()),
            pyarrow.field("field10", module_under_test.pyarrow_timestamp()),
            pyarrow.field("field11", pyarrow.date32()),
            pyarrow.field("field12", module_under_test.pyarrow_time()),
            pyarrow.field("field13", module_under_test.pyarrow_datetime()),
            pyarrow.field("field14", pyarrow.string()),
        )
    )
    assert pyarrow.types.is_struct(actual)
    # DataType.num_children is deprecated in pyarrow; num_fields is the
    # documented replacement.
    assert actual.num_fields == len(fields)
    assert actual.equals(expected)
示例12: test_bq_to_arrow_data_type_w_array_struct
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import binary [as 别名]
def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type):
    """Check the Arrow list-of-struct type produced for a REPEATED record.

    A schema field of type ``bq_type`` with fourteen sub-fields and mode
    REPEATED is converted with ``bq_to_arrow_data_type``; the result must be
    a list whose value type is a struct with one child per sub-field and
    equal to the hand-built expected struct.
    """
    fields = (
        schema.SchemaField("field01", "STRING"),
        schema.SchemaField("field02", "BYTES"),
        schema.SchemaField("field03", "INTEGER"),
        schema.SchemaField("field04", "INT64"),
        schema.SchemaField("field05", "FLOAT"),
        schema.SchemaField("field06", "FLOAT64"),
        schema.SchemaField("field07", "NUMERIC"),
        schema.SchemaField("field08", "BOOLEAN"),
        schema.SchemaField("field09", "BOOL"),
        schema.SchemaField("field10", "TIMESTAMP"),
        schema.SchemaField("field11", "DATE"),
        schema.SchemaField("field12", "TIME"),
        schema.SchemaField("field13", "DATETIME"),
        schema.SchemaField("field14", "GEOGRAPHY"),
    )
    field = schema.SchemaField("ignored_name", bq_type, mode="REPEATED", fields=fields)
    actual = module_under_test.bq_to_arrow_data_type(field)
    expected_value_type = pyarrow.struct(
        (
            pyarrow.field("field01", pyarrow.string()),
            pyarrow.field("field02", pyarrow.binary()),
            pyarrow.field("field03", pyarrow.int64()),
            pyarrow.field("field04", pyarrow.int64()),
            pyarrow.field("field05", pyarrow.float64()),
            pyarrow.field("field06", pyarrow.float64()),
            pyarrow.field("field07", module_under_test.pyarrow_numeric()),
            pyarrow.field("field08", pyarrow.bool_()),
            pyarrow.field("field09", pyarrow.bool_()),
            pyarrow.field("field10", module_under_test.pyarrow_timestamp()),
            pyarrow.field("field11", pyarrow.date32()),
            pyarrow.field("field12", module_under_test.pyarrow_time()),
            pyarrow.field("field13", module_under_test.pyarrow_datetime()),
            pyarrow.field("field14", pyarrow.string()),
        )
    )
    assert pyarrow.types.is_list(actual)
    assert pyarrow.types.is_struct(actual.value_type)
    # DataType.num_children is deprecated in pyarrow; num_fields is the
    # documented replacement.
    assert actual.value_type.num_fields == len(fields)
    assert actual.value_type.equals(expected_value_type)
示例13: _get_binary_like_byte_size_test_cases
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import binary [as 别名]
def _get_binary_like_byte_size_test_cases():
    """Build one byte-size test case per binary-like Arrow type.

    Each case is a dict with the test case name, a nine-element array of the
    given type, the slice to take, and the expected byte sizes of the full
    and the sliced array.
    """
    values = [
        "a", "bb", "ccc", "dddd", "eeeee", "ffffff", "ggggggg",
        "hhhhhhhh", "iiiiiiiii"
    ]
    # Each Arrow type is paired with the size in bytes of one of its offsets.
    types_and_offset_sizes = [
        (pa.binary(), 4),
        (pa.string(), 4),
        (pa.large_binary(), 8),
        (pa.large_string(), 8),
    ]
    return [
        {
            "testcase_name": str(array_type),
            "array": pa.array(values, type=array_type),
            "slice_offset": 1,
            "slice_length": 3,
            # contents: 45
            # offsets: 10 * sizeof_offsets
            # null bitmap: 2
            "expected_size": (45 + sizeof_offsets * 10 +
                              _all_false_null_bitmap_size(2)),
            # contents: 9
            # offsets: 4 * sizeof_offsets
            # null bitmap: 1
            "expected_sliced_size": (9 + sizeof_offsets * 4 +
                                     _all_false_null_bitmap_size(1)),
        }
        for array_type, sizeof_offsets in types_and_offset_sizes
    ]
示例14: test_success
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import binary [as 别名]
def test_success(self, row_indices, expected_output):
    """RecordBatchTake must return ``expected_output`` for ``row_indices``.

    The same nine-row, two-column batch is used each time, and the take is
    repeated with the indices as both int32 and int64 arrays.
    """
    int_lists = pa.array(
        [[1, 2, 3], None, [4], [], [5, 6], [7], [8, 9], [10], []],
        type=pa.list_(pa.int32()))
    binary_lists = pa.array(
        [["a"], ["b", "c"], None, [], None, ["d", "e"], ["f"], None, ["g"]],
        type=pa.list_(pa.binary()))
    record_batch = pa.RecordBatch.from_arrays(
        [int_lists, binary_lists], ["f1", "f2"])

    for index_type in (pa.int32(), pa.int64()):
        indices = pa.array(row_indices, type=index_type)
        sliced = table_util.RecordBatchTake(record_batch, indices)
        failure_message = "Expected {}, got {}".format(expected_output, sliced)
        self.assertTrue(sliced.equals(expected_output), failure_message)
示例15: _ValidateRecordBatch
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import binary [as 别名]
def _ValidateRecordBatch(
    self, tfxio, record_batch, raw_record_column_name=None):
    """Assert that ``record_batch`` matches the expected column values.

    The batch must be a ``pa.RecordBatch`` with three rows. Every column
    other than the raw-record column is compared with the arrays from
    ``_GetExpectedColumnValues``; the sequence column must be a struct and
    is compared child by child. When ``raw_record_column_name`` is given,
    the last column must carry that name, have the matching list-of-binary
    type, and flatten to ``_SERIALIZED_EXAMPLES``.
    """
    self.assertIsInstance(record_batch, pa.RecordBatch)
    self.assertEqual(record_batch.num_rows, 3)
    expected_by_path = _GetExpectedColumnValues(tfxio)

    for index, field in enumerate(record_batch.schema):
        # The raw-record column is validated separately after the loop.
        if field.name == raw_record_column_name:
            continue

        if field.name != _SEQUENCE_COLUMN_NAME:
            actual = record_batch.column(index)
            expected = expected_by_path[path.ColumnPath([field.name])]
            self.assertTrue(
                actual.equals(expected),
                "Column {} did not match ({} vs {}).".format(
                    field.name, actual, expected))
            continue

        # The sequence column is a struct; compare each child array.
        self.assertTrue(pa.types.is_struct(field.type))
        child_arrays = record_batch.column(index).flatten()
        for seq_column, seq_field in zip(child_arrays, list(field.type)):
            expected_array = expected_by_path[path.ColumnPath(
                [_SEQUENCE_COLUMN_NAME, seq_field.name])]
            self.assertTrue(
                seq_column.equals(expected_array),
                "Sequence column {} did not match ({} vs {})".format(
                    seq_field.name, seq_column, expected_array))

    if raw_record_column_name is None:
        return

    if tfxio._can_produce_large_types:
        raw_record_column_type = pa.large_list(pa.large_binary())
    else:
        raw_record_column_type = pa.list_(pa.binary())
    self.assertEqual(record_batch.schema.names[-1], raw_record_column_name)
    raw_record_column = record_batch.columns[-1]
    self.assertTrue(raw_record_column.type.equals(raw_record_column_type))
    self.assertEqual(record_batch.columns[-1].flatten().to_pylist(),
                     _SERIALIZED_EXAMPLES)