This article collects typical usage examples of pyarrow.Array in Python. If you are wondering what pyarrow.Array is for, how to call it, or want to see it in real code, the curated examples below may help. You can also explore further usage examples from the pyarrow module itself.
The following shows 15 code examples of pyarrow.Array, sorted by popularity by default.
Example 1: update
# Required import: import pyarrow [as alias]
# Or: from pyarrow import Array [as alias]
def update(self, feature_array: pa.Array, presence_mask: np.ndarray,
           num_values: np.ndarray, num_values_not_none: np.ndarray,
           weights: Optional[np.ndarray]) -> None:
  """Updates the stats with a feature array."""
  self.num_non_missing += len(feature_array) - feature_array.null_count
  self.max_num_values = np.maximum.reduce(
      num_values_not_none, initial=self.max_num_values)
  self.min_num_values = np.minimum.reduce(num_values_not_none,
                                          initial=self.min_num_values)
  self.total_num_values += np.sum(num_values_not_none)
  if weights is not None:
    if weights.size != num_values.size:
      raise ValueError('Weight feature must not be missing.')
    self.weighted_total_num_values += np.sum(num_values * weights)
    self.weighted_num_non_missing += np.sum(weights[presence_mask])
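A minimal sketch of the inputs this method expects, built from a list-typed feature column with one missing row (the stats object itself and its attributes are assumed to be defined elsewhere):

import numpy as np
import pyarrow as pa

# Hypothetical inputs for update(): one null row, per-row value counts.
feature_array = pa.array([[1, 2], None, [3]])
presence_mask = np.asarray(feature_array.is_valid())                  # [True, False, True]
num_values = np.asarray(
    [0 if v is None else len(v) for v in feature_array.to_pylist()])  # [2, 0, 1]
num_values_not_none = num_values[presence_mask]                       # [2, 1]
weights = np.array([1.0, 2.0, 0.5])                                   # optional per-example weights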
Example 2: __init__
# Required import: import pyarrow [as alias]
# Or: from pyarrow import Array [as alias]
def __init__(self, array, dtype=None, copy: Optional[bool] = None):
    # Copy is not used at the moment. Its only effect will come into play
    # once we allow `array` to be a FletcherContinuousArray.
    if is_array_like(array) or isinstance(array, list):
        self.data = pa.array(array, type=dtype)
    elif isinstance(array, pa.Array):
        # TODO: Assert dtype
        self.data = array
    elif isinstance(array, pa.ChunkedArray):
        # TODO: Assert dtype
        if array.num_chunks == 1:
            self.data = array.chunk(0)
        else:
            self.data = pa.concat_arrays(array.iterchunks())
    else:
        raise ValueError(
            "Unsupported type passed for {}: {}".format(
                self.__class__.__name__, type(array)
            )
        )
    self._dtype = FletcherContinuousDtype(self.data.type)
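The ChunkedArray branch above can be checked in isolation with plain pyarrow; a minimal sketch:

import pyarrow as pa

# A multi-chunk column is collapsed into one contiguous pa.Array.
chunked = pa.chunked_array([[1, 2], [3, None]])
flat = pa.concat_arrays(chunked.iterchunks())
assert isinstance(flat, pa.Array) and len(flat) == 4 and flat.null_count == 1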
Example 3: np_ufunc_array_array
# Required import: import pyarrow [as alias]
# Or: from pyarrow import Array [as alias]
def np_ufunc_array_array(a: pa.Array, b: pa.Array, op: Callable):
    np_arr_a = _extract_data_buffer_as_np_array(a)
    np_arr_b = _extract_data_buffer_as_np_array(b)
    if a.null_count > 0 and b.null_count > 0:
        # TODO: Combine them before extracting
        mask_a = extract_isnull_bytemap(a)
        mask_b = extract_isnull_bytemap(b)
        mask = mask_a | mask_b
    elif a.null_count > 0:
        mask = extract_isnull_bytemap(a)
    elif b.null_count > 0:
        mask = extract_isnull_bytemap(b)
    else:
        mask = None

    new_arr = op(np_arr_a, np_arr_b)
    # Don't set the type, as we might have valid casts like int->float in truediv.
    return pa.array(new_arr, mask=mask)
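The helpers used above are fletcher internals. As a rough, public-API-only sketch of the same masking behaviour (names here are illustrative, not fletcher's):

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

# Apply a NumPy ufunc to the values and re-attach the combined null mask.
a = pa.array([1, None, 3], type=pa.int64())
b = pa.array([10, 20, None], type=pa.int64())
mask = np.asarray(a.is_null()) | np.asarray(b.is_null())
values = np.add(np.asarray(pc.fill_null(a, 0)), np.asarray(pc.fill_null(b, 0)))
result = pa.array(values, mask=mask)   # -> [11, null, null]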
Example 4: _text_cat
# Required import: import pyarrow [as alias]
# Or: from pyarrow import Array [as alias]
def _text_cat(a: pa.Array, b: pa.Array) -> pa.Array:
    if len(a) != len(b):
        raise ValueError("Lengths of arrays don't match")

    offsets_a, data_a = _extract_string_buffers(a)
    offsets_b, data_b = _extract_string_buffers(b)
    if len(a) > 0:
        valid = _merge_valid_bitmaps(a, b)
        result_offsets = np.empty(len(a) + 1, dtype=np.int32)
        result_offsets[0] = 0
        total_size = (offsets_a[-1] - offsets_a[0]) + (offsets_b[-1] - offsets_b[0])
        result_data = np.empty(total_size, dtype=np.uint8)
        _merge_string_data(
            len(a),
            valid,
            offsets_a,
            data_a,
            offsets_b,
            data_b,
            result_offsets,
            result_data,
        )
        buffers = [pa.py_buffer(x) for x in [valid, result_offsets, result_data]]
        return pa.Array.from_buffers(pa.string(), len(a), buffers)
    return a
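For comparison, what the buffer-level routine computes, expressed naively via to_pylist (slow but easy to read):

import pyarrow as pa

# Element-wise string concatenation; a null on either side yields null.
a = pa.array(["foo", None, "baz"])
b = pa.array(["1", "2", None])
expected = pa.array([
    x + y if x is not None and y is not None else None
    for x, y in zip(a.to_pylist(), b.to_pylist())
])
# expected -> ["foo1", null, null]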
Example 5: or_na
# Required import: import pyarrow [as alias]
# Or: from pyarrow import Array [as alias]
def or_na(arr: pa.Array) -> pa.Array:
    """Apply ``array | NA`` with a boolean pyarrow.Array."""
    output_length = len(arr) // 8
    if len(arr) % 8 != 0:
        output_length += 1

    if arr.null_count == 0:
        return pa.Array.from_buffers(
            pa.bool_(),
            len(arr),
            [arr.buffers()[1], arr.buffers()[1]],
            null_count=-1,
            offset=arr.offset,
        )
    else:
        output = np.zeros(output_length, dtype=np.uint8)
        null_count = _or_na(
            len(arr), arr.offset, arr.buffers()[0], arr.buffers()[1], output
        )
        buf = pa.py_buffer(output)
        return pa.Array.from_buffers(pa.bool_(), len(arr), [buf, buf], null_count)
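The semantics being implemented are Kleene logic for ``x | NA``: True | NA is True, while False | NA is NA. pyarrow.compute exposes the same rule, which makes a handy cross-check:

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([True, False, None])
na = pa.array([None, None, None], type=pa.bool_())
print(pc.or_kleene(arr, na))   # -> [true, null, null]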
Example 6: _2
# Required import: import pyarrow [as alias]
# Or: from pyarrow import Array [as alias]
def _2(a: pa.Array, b: Any, ops: Dict[str, Callable]):
    """Apply a NumPy ufunc where at least one of the arguments is an Arrow structure."""
    if isinstance(b, pa.ChunkedArray):
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        new_chunks = []
        offsets = _calculate_chunk_offsets(b)
        for chunk, offset in zip(b.iterchunks(), offsets):
            new_chunks.append(
                dispatch_chunked_binary_map(a[offset : offset + len(chunk)], chunk, ops)
            )
        return pa.chunked_array(new_chunks)
    elif isinstance(b, pa.Array):
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        return ops.get("array_array", _not_implemented_path)(a, b)
    else:
        if np.isscalar(b):
            return ops.get("array_scalar", _not_implemented_path)(a, b)
        else:
            if len(a) != len(b):
                raise ValueError("Inputs don't have the same length.")
            return ops.get("array_nparray", _not_implemented_path)(a, b)
Example 7: _ListArrayToTensor
# Required import: import pyarrow [as alias]
# Or: from pyarrow import Array [as alias]
def _ListArrayToTensor(
    self, list_array: pa.Array,
    produce_eager_tensors: bool) -> Union[np.ndarray, tf.Tensor]:
  """Converts a ListArray to a dense tensor."""
  values = list_array.flatten()
  batch_size = len(list_array)
  expected_num_elements = batch_size * self._unbatched_flat_len
  if len(values) != expected_num_elements:
    raise ValueError(
        "Unable to convert ListArray {} to {}: size mismatch. expected {} "
        "elements but got {}".format(
            list_array, self.type_spec, expected_num_elements, len(values)))
  actual_shape = list(self._shape)
  actual_shape[0] = batch_size
  if self._convert_to_binary_fn is not None:
    values = self._convert_to_binary_fn(values)
  values_np = np.asarray(values).reshape(actual_shape)
  if produce_eager_tensors:
    return tf.convert_to_tensor(values_np)
  return values_np
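A minimal illustration of the flatten-and-reshape step above for a fixed-size list column, assuming an unbatched flat length of 2:

import numpy as np
import pyarrow as pa

list_array = pa.array([[1, 2], [3, 4], [5, 6]])
values = list_array.flatten()                          # Int64Array of length 6
dense = np.asarray(values).reshape(len(list_array), 2)
# dense -> [[1 2] [3 4] [5 6]]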
Example 8: from_array
# Required import: import pyarrow [as alias]
# Or: from pyarrow import Array [as alias]
def from_array(cls, arr):
    assert isinstance(arr, pa.Array)
    return cls(pa.chunked_array([arr]))
Example 9: is_binary_like
# Required import: import pyarrow [as alias]
# Or: from pyarrow import Array [as alias]
def is_binary_like(data_type: pa.DataType) -> bool:
  """Returns true if an Arrow type is binary-like.

  Qualified types are {Large,}BinaryArray, {Large,}StringArray.

  Args:
    data_type: a pa.DataType.

  Returns:
    bool.
  """
  return (pa.types.is_binary(data_type) or
          pa.types.is_large_binary(data_type) or
          pa.types.is_unicode(data_type) or
          pa.types.is_large_unicode(data_type))
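A quick check of the predicate against a few concrete Arrow types (assuming is_binary_like from above is in scope):

import pyarrow as pa

assert is_binary_like(pa.string())
assert is_binary_like(pa.large_binary())
assert not is_binary_like(pa.int64())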
Example 10: flatten_nested
# Required import: import pyarrow [as alias]
# Or: from pyarrow import Array [as alias]
def flatten_nested(
    array: pa.Array, return_parent_indices: bool = False
) -> Tuple[pa.Array, Optional[np.ndarray]]:
  """Flattens all the list arrays nesting an array.

  If `array` is not list-like, it is returned unchanged.

  Args:
    array: pa.Array to flatten.
    return_parent_indices: If True, also returns the parent indices array.

  Returns:
    A tuple. The first term is the flattened array. The second term is None
    if `return_parent_indices` is False; otherwise it's a parent indices array
    parallel to the flattened array: if parent_indices[i] = j, then
    flattened_array[i] belongs to the j-th element of the input array.
  """
  parent_indices = None

  while is_list_like(array.type):
    if return_parent_indices:
      cur_parent_indices = array_util.GetFlattenedArrayParentIndices(
          array).to_numpy()
      if parent_indices is None:
        parent_indices = cur_parent_indices
      else:
        parent_indices = parent_indices[cur_parent_indices]
    array = array.flatten()

  # The array was not nested in the first place.
  if return_parent_indices and parent_indices is None:
    parent_indices = np.arange(len(array))
  return array, parent_indices
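What the helper produces for a doubly-nested array, as a sketch (the tfx_bsl utilities it relies on are assumed importable):

import pyarrow as pa

nested = pa.array([[[1], [2, 3]], [[4]]])
# flatten_nested(nested, return_parent_indices=True) would yield
#   flattened array -> [1, 2, 3, 4]
#   parent_indices  -> [0, 0, 0, 1]  (row of the input each value came from)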
Example 11: add_input
# Required import: import pyarrow [as alias]
# Or: from pyarrow import Array [as alias]
def add_input(self, accumulator: _PartialNLStats,
              feature_path: types.FeaturePath,
              feature_array: pa.Array) -> _PartialNLStats:
  """Returns the result of folding a batch of inputs into the accumulator.

  Args:
    accumulator: The current accumulator.
    feature_path: The path of the feature.
    feature_array: An arrow Array representing a batch of feature values
      which should be added to the accumulator.

  Returns:
    The accumulator after updating the statistics for the batch of inputs.
  """
  if accumulator.invalidate:
    return accumulator
  feature_type = stats_util.get_feature_type_from_arrow_type(
      feature_path, feature_array.type)
  # Ignore null array.
  if feature_type is None:
    return accumulator
  # If we see a different type, invalidate.
  if feature_type != statistics_pb2.FeatureNameStatistics.STRING:
    accumulator.invalidate = True
    return accumulator

  def _is_non_utf8(value):
    return (isinstance(value, bytes) and
            stats_util.maybe_get_utf8(value) is None)

  is_non_utf_vec = np.vectorize(_is_non_utf8, otypes=[bool])
  classify_vec = np.vectorize(self._classifier.classify, otypes=[bool])
  values = np.asarray(arrow_util.flatten_nested(feature_array)[0]
                      .slice(0, _CROP_AT_VALUES))
  if np.any(is_non_utf_vec(values)):
    accumulator.invalidate = True
    return accumulator
  accumulator.considered += values.size
  accumulator.matched += np.sum(classify_vec(values))
  return accumulator
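The UTF-8 validity check can be sketched standalone; the decode-based helper below is an assumption about what stats_util.maybe_get_utf8 does, not the real implementation:

import numpy as np
import pyarrow as pa

def _is_non_utf8(value):
    # True only for bytes values that fail to decode as UTF-8.
    if not isinstance(value, bytes):
        return False
    try:
        value.decode("utf-8")
        return False
    except UnicodeDecodeError:
        return True

feature_array = pa.array([[b"hello"], [b"\xf0\x28\x8c\x28"]])   # second value is invalid UTF-8
values = np.asarray(feature_array.flatten())
print(np.vectorize(_is_non_utf8, otypes=[bool])(values))        # [False  True]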
Example 12: add_input
# Required import: import pyarrow [as alias]
# Or: from pyarrow import Array [as alias]
def add_input(self, accumulator: _PartialTimeStats,
              feature_path: types.FeaturePath,
              feature_array: pa.Array) -> _PartialTimeStats:
  """Returns the result of folding a batch of inputs into the current accumulator.

  Args:
    accumulator: The current accumulator.
    feature_path: The path of the feature.
    feature_array: An arrow Array representing a batch of feature values
      which should be added to the accumulator.

  Returns:
    The accumulator after updating the statistics for the batch of inputs.
  """
  if accumulator.invalidated:
    return accumulator
  feature_type = stats_util.get_feature_type_from_arrow_type(
      feature_path, feature_array.type)
  # Ignore null array.
  if feature_type is None:
    return accumulator

  if feature_type == statistics_pb2.FeatureNameStatistics.STRING:
    def _maybe_get_utf8(val):
      return stats_util.maybe_get_utf8(val) if isinstance(val, bytes) else val

    values = np.asarray(arrow_util.flatten_nested(feature_array)[0])
    maybe_utf8 = np.vectorize(_maybe_get_utf8, otypes=[object])(values)
    if not maybe_utf8.all():
      accumulator.invalidated = True
      return accumulator
    accumulator.update(maybe_utf8, feature_type)
  elif feature_type == statistics_pb2.FeatureNameStatistics.INT:
    values = np.asarray(arrow_util.flatten_nested(feature_array)[0])
    accumulator.update(values, feature_type)
  else:
    accumulator.invalidated = True
  return accumulator
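A standalone sketch of the STRING branch: flatten the batch, then decode values element-wise while leaving non-bytes values untouched (the lambda stands in for stats_util.maybe_get_utf8):

import numpy as np
import pyarrow as pa

values = np.asarray(pa.array([[b"2021-01-01"], [b"2021-02-01"]]).flatten())
maybe_utf8 = np.vectorize(
    lambda v: v.decode("utf-8") if isinstance(v, bytes) else v,
    otypes=[object],
)(values)
print(maybe_utf8)   # ['2021-01-01' '2021-02-01']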
Example 13: assert_content_equals_array
# Required import: import pyarrow [as alias]
# Or: from pyarrow import Array [as alias]
def assert_content_equals_array(result, expected):
    """Assert that the result is an Arrow structure and the content matches an array."""
    assert isinstance(result, (pa.Array, pa.ChunkedArray))
    if isinstance(result, pa.ChunkedArray):
        result = pa.concat_arrays(result.iterchunks())
    assert result.equals(expected)
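Example use of the helper (assuming it is in scope): a ChunkedArray with the same content as a plain Array passes the assertion.

import pyarrow as pa

assert_content_equals_array(pa.chunked_array([[1, 2], [3]]), pa.array([1, 2, 3]))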
Example 14: _get_example
# Required import: import pyarrow [as alias]
# Or: from pyarrow import Array [as alias]
def _get_example(arrow_dtype: pa.DataType) -> pa.Array:
    if isinstance(arrow_dtype, pa.ListType):
        return pa.array(
            [None, _get_example(arrow_dtype.value_type).to_pylist()], type=arrow_dtype
        )
    return _examples[arrow_dtype]
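The _examples mapping it falls back to is defined at module level in the original code; a hypothetical stand-in is enough to exercise the recursive list branch, assuming _get_example above lives in the same scope:

import pyarrow as pa

_examples = {pa.int64(): pa.array([None, 1, 2])}   # hypothetical stand-in
print(_get_example(pa.list_(pa.int64())))
# -> [null, [null, 1, 2]]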
Example 15: __arrow_array__
# Required import: import pyarrow [as alias]
# Or: from pyarrow import Array [as alias]
def __arrow_array__(self, type=None):
    """Convert myself to a pyarrow Array or ChunkedArray."""
    return self.data
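The __arrow_array__ protocol is what lets pa.array() consume custom containers; a self-contained sketch with a throwaway wrapper class:

import pyarrow as pa

class Wrapper:
    """Minimal container exposing its Arrow data via __arrow_array__."""

    def __init__(self, data):
        self.data = data

    def __arrow_array__(self, type=None):
        # pa.array() calls this hook and uses the returned Arrow data directly.
        return self.data

wrapped = Wrapper(pa.array([1, 2, 3]))
print(pa.array(wrapped))   # same values as the wrapped array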