本文整理汇总了Python中pyarrow.ChunkedArray类的典型用法代码示例。如果您正苦于以下问题：pyarrow.ChunkedArray类的具体用法是什么？pyarrow.ChunkedArray怎么用？有哪些pyarrow.ChunkedArray的使用例子？那么恭喜您，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块pyarrow的用法示例。
在下文中一共展示了pyarrow.ChunkedArray类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import ChunkedArray [as 别名]
def __init__(self, array, dtype=None, copy=None):
    """
    Store the given data on ``self.data`` as a ``pa.ChunkedArray``.

    Parameters
    ----------
    array
        One of: an array-like object or a ``list`` (converted with
        ``pa.array`` and wrapped as a single chunk), a ``pa.Array``
        (wrapped as a single chunk) or a ``pa.ChunkedArray`` (stored as-is).
    dtype
        Arrow type forwarded to ``pa.array``; only consulted for array-like
        and ``list`` input.
    copy
        Not used at the moment. Its only effect will be once ``array`` is
        allowed to be a FletcherChunkedArray.

    Raises
    ------
    ValueError
        If ``array`` is none of the supported kinds.
    """
    if is_array_like(array) or isinstance(array, list):
        chunked = pa.chunked_array([pa.array(array, type=dtype)])
    elif isinstance(array, pa.Array):
        # ARROW-7008: pyarrow.chunked_array([array]) fails on array with
        # all-None buffers, so such an empty array is rebuilt first.
        is_empty = len(array) == 0
        if is_empty and all(buf is None for buf in array.buffers()):
            array = pa.array([], type=array.type)
        # TODO: Assert dtype
        chunked = pa.chunked_array([array])
    elif isinstance(array, pa.ChunkedArray):
        # TODO: Assert dtype
        chunked = array
    else:
        message = "Unsupported type passed for {}: {}".format(
            self.__class__.__name__, type(array)
        )
        raise ValueError(message)

    self.data = chunked
    self._dtype = FletcherChunkedDtype(self.data.type)
    self.offsets = self._calculate_chunk_offsets()
示例2: _call_x_with
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import ChunkedArray [as 别名]
def _call_x_with(self, impl, needle, na=None):
    """
    Run the kernel ``impl`` over ``self.data`` and wrap the outcome in a Series.

    Parameters
    ----------
    impl
        Callable invoked once per chunk as
        ``impl(str_arr, needle, 2, offset, result)``; it is expected to fill
        the shared ``uint8`` buffer starting at ``offset``.
    needle
        Search value; converted with ``NumbaString.make`` before use.
    na
        Accepted but not used by this function.

    Returns
    -------
    pd.Series
        Boolean values wrapped in the same array class as ``self.obj.values``.
        Buffer entries equal to 2 are masked out as missing.
    """
    pattern = NumbaString.make(needle)  # type: ignore
    out = np.zeros(len(self.data), dtype=np.uint8)

    # Treat a plain array as a single chunk that starts at position 0.
    if isinstance(self.data, pa.ChunkedArray):
        pieces = self.data.chunks
    else:
        pieces = [self.data]

    start = 0
    for piece in pieces:
        strings = NumbaStringArray.make(piece)  # type: ignore
        # NOTE(review): 2 is presumably the marker impl writes for missing
        # entries, since it is what gets masked below -- confirm in impl.
        impl(strings, pattern, 2, start, out)
        start += len(piece)

    wrapper = type(self.obj.values)
    arrow_result = pa.array(out.astype(bool), mask=(out == 2))
    return pd.Series(wrapper(arrow_result))
示例3: _2
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import ChunkedArray [as 别名]
def _2(a: pa.Array, b: Any, ops: Dict[str, Callable]):
    """
    Apply a binary operation to an Arrow array ``a`` and a second operand ``b``.

    The implementation is looked up in ``ops`` by the kind of ``b``:

    * ``pa.ChunkedArray`` -> ``a`` is sliced to match each chunk of ``b`` and
      every pair is passed to ``dispatch_chunked_binary_map``
    * ``pa.Array`` -> ``ops["array_array"]``
    * scalar (per ``np.isscalar``) -> ``ops["array_scalar"]``
    * anything else -> ``ops["array_nparray"]``

    A missing key falls back to ``_not_implemented_path``.

    Raises
    ------
    ValueError
        If ``b`` is not a scalar and its length differs from ``a``.
    """

    def _require_equal_lengths() -> None:
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")

    if isinstance(b, pa.ChunkedArray):
        _require_equal_lengths()
        starts = _calculate_chunk_offsets(b)
        return pa.chunked_array(
            [
                dispatch_chunked_binary_map(a[start : start + len(piece)], piece, ops)
                for piece, start in zip(b.iterchunks(), starts)
            ]
        )

    if isinstance(b, pa.Array):
        _require_equal_lengths()
        return ops.get("array_array", _not_implemented_path)(a, b)

    # Scalars have no length, so they are dispatched before the length check.
    if np.isscalar(b):
        return ops.get("array_scalar", _not_implemented_path)(a, b)

    _require_equal_lengths()
    return ops.get("array_nparray", _not_implemented_path)(a, b)
示例4: __init__
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import ChunkedArray [as 别名]
def __init__(self, values):
    """
    Wrap a boolean ``pa.ChunkedArray``.

    Parameters
    ----------
    values
        The chunked array to store; its Arrow type must be ``pa.bool_()``.

    Raises
    ------
    ValueError
        If ``values`` is not a ``pa.ChunkedArray``.
    AssertionError
        If the Arrow type of ``values`` is not boolean.
    """
    if not isinstance(values, pa.ChunkedArray):
        # Same exception type as before, now with a diagnostic message.
        raise ValueError(
            "values must be a pyarrow.ChunkedArray, got {}".format(
                type(values).__name__
            )
        )
    assert values.type == pa.bool_(), "expected Arrow type bool, got {}".format(
        values.type
    )
    self._data = values
    self._dtype = ArrowBoolDtype()
示例5: check_valid_in_offsets
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import ChunkedArray [as 别名]
def check_valid_in_offsets(
    arr: pa.ChunkedArray, in_offsets: List[Tuple[int, int, int]]
) -> None:
    """
    Assert that ``in_offsets`` is a plausible set of offsets for ``arr``.

    Parameters
    ----------
    arr
        The chunked array the offsets refer to.
    in_offsets
        List of 3-tuples. NOTE(review): presumably
        (chunk index, start inside the chunk, length) -- confirm against the
        function that produces them.

    Raises
    ------
    AssertionError
        If any of the checks fails.
    """
    # An array without chunks must come with no offsets at all.
    if arr.num_chunks == 0:
        assert in_offsets == []
        return

    # We always start at the beginning
    first = in_offsets[0]
    assert first[0] == 0
    assert first[1] == 0

    # Overall, the chunk offsets must have the same length as the array
    covered = sum(entry[2] for entry in in_offsets)
    assert covered == len(arr)
示例6: assert_content_equals_array
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import ChunkedArray [as 别名]
def assert_content_equals_array(result, expected):
    """
    Assert that ``result`` is an Arrow structure whose content equals ``expected``.

    A ``pa.ChunkedArray`` is first concatenated into a single array; the
    comparison itself is done with ``equals``.
    """
    is_chunked = isinstance(result, pa.ChunkedArray)
    assert is_chunked or isinstance(result, pa.Array)
    flat = pa.concat_arrays(result.iterchunks()) if is_chunked else result
    assert flat.equals(expected)
示例7: __arrow_array__
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import ChunkedArray [as 别名]
def __arrow_array__(self, type=None):
    """
    Convert myself to a pyarrow Array or ChunkedArray.

    The ``type`` argument is accepted but not used: ``self.data`` is
    returned unchanged, without any cast.
    """
    return self.data
示例8: base
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import ChunkedArray [as 别名]
def base(self) -> Union[pa.Array, pa.ChunkedArray]:
    """
    Return base object of the underlying data.

    This hands back ``self.data`` itself, not a copy.
    """
    return self.data
示例9: unique
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import ChunkedArray [as 别名]
def unique(self):
    """
    Compute the ExtensionArray of unique values.

    The values are first requested from ``self.data.unique()``. If that
    raises ``NotImplementedError``, the parent class implementation is
    used instead.

    Returns
    -------
    uniques : ExtensionArray
    """
    try:
        uniques = type(self)(self.data.unique())
    except NotImplementedError:
        uniques = super().unique()
    return uniques
示例10: pandas_from_arrow
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import ChunkedArray [as 别名]
def pandas_from_arrow(
    arrow_object: Union[pa.RecordBatch, pa.Table, pa.Array, pa.ChunkedArray],
    continuous: bool = False,
):
    """
    Turn an Arrow object into the matching Pandas object backed by Fletcher.

    The conversion rules are:

    * {RecordBatch, Table} -> DataFrame
    * {Array, ChunkedArray} -> Series

    Parameters
    ----------
    arrow_object : RecordBatch, Table, Array or ChunkedArray
        object to be converted
    continuous : bool
        Use FletcherContinuousArray instead of FletcherChunkedArray

    Raises
    ------
    NotImplementedError
        If ``arrow_object`` is of none of the supported types.
    """
    array_type = FletcherContinuousArray if continuous else FletcherChunkedArray

    if isinstance(arrow_object, pa.RecordBatch):
        names = arrow_object.schema.names
        # Columns are matched to their names by position in the batch.
        batch_columns = OrderedDict(
            (names[ix], array_type(arr)) for ix, arr in enumerate(arrow_object)
        )
        return pd.DataFrame(batch_columns)

    if isinstance(arrow_object, pa.Table):
        named_columns = zip(arrow_object.column_names, arrow_object.itercolumns())
        table_columns = OrderedDict(
            (name, array_type(col)) for name, col in named_columns
        )
        return pd.DataFrame(table_columns)

    if isinstance(arrow_object, (pa.ChunkedArray, pa.Array)):
        return pd.Series(array_type(arrow_object))

    raise NotImplementedError(
        "Objects of type {} are not supported".format(type(arrow_object))
    )
示例11: _series_like
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import ChunkedArray [as 别名]
def _series_like(self, array: Union[pa.Array, pa.ChunkedArray]) -> pd.Series:
    """
    Wrap an Arrow result in a Series shaped like the input series.

    The values are wrapped in the same array class as ``self.obj.values``,
    the dtype is built from the class of ``self.obj.dtype`` and the Arrow
    type of ``array``, and the index of ``self.obj`` is reused.
    """
    source = self.obj
    values = type(source.values)(array)
    dtype = type(source.dtype)(array.type)
    return pd.Series(values, dtype=dtype, index=source.index)
示例12: extract_isnull_bytemap
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import ChunkedArray [as 别名]
def extract_isnull_bytemap(array: Union[pa.ChunkedArray, pa.Array]) -> np.ndarray:
    """
    Extract the validity bitmaps of a (chunked) array into a numpy isnull bytemap.

    Parameters
    ----------
    array
        Array from which we extract the validity bits as bytes

    Returns
    -------
    isnull_bytemap
        Boolean array of length ``len(array)``. It is all ``True`` when every
        element is null and all ``False`` when there are no nulls.
    """
    length = len(array)

    # Everything is null: no need to look at any bitmap.
    if array.null_count == length:
        return np.ones(length, dtype=bool)

    if isinstance(array, pa.ChunkedArray):
        isnull = np.zeros(length, dtype=bool)
        if array.null_count == 0:
            return isnull

        position = 0
        for chunk in array.chunks:
            chunk_length = len(chunk)
            # Chunks without nulls keep the zeros they were initialised with.
            if chunk.null_count > 0:
                _extract_isnull_bytemap(
                    chunk.buffers()[0], chunk_length, chunk.offset, position, isnull
                )
            position += chunk_length
        return isnull

    bitmap = array.buffers()[0]
    if not bitmap:
        # No validity bitmap is present, so nothing is reported as null.
        return np.full(length, False)

    # TODO: Can we use np.empty here to improve performance?
    isnull = np.zeros(length, dtype=bool)
    # TODO(ARROW-2664): We only need the following line to support
    # executing the code in disabled-JIT mode.
    view = memoryview(bitmap)
    _extract_isnull_bytemap(view, length, array.offset, 0, isnull)
    return isnull
示例13: pd_nanop
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import ChunkedArray [as 别名]
def pd_nanop(nanop: Callable, arr: Union[pa.ChunkedArray, pa.Array], skipna: bool):
    """
    Compute a reduction by handing the Arrow data to a nanops-style function.

    A chunked input is first concatenated into a single array. ``nanop`` is
    then called with the data buffer as a NumPy array, the ``skipna`` flag
    and the isnull bytemap as ``mask``.
    """
    is_chunked = isinstance(arr, pa.ChunkedArray)
    flat = pa.concat_arrays(arr.iterchunks()) if is_chunked else arr
    values = _extract_data_buffer_as_np_array(flat)
    isnull = extract_isnull_bytemap(flat)
    return nanop(values, skipna=skipna, mask=isnull)
示例14: _text_cat_chunked
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import ChunkedArray [as 别名]
def _text_cat_chunked(a: Any, b: pa.ChunkedArray) -> pa.ChunkedArray:
    """
    Reject the operation: this variant never returns a result.

    Raises
    ------
    NotImplementedError
        Always.

    NOTE(review): presumably the generic fallback of a type-dispatched
    function whose specialised variants handle pa.Array and pa.ChunkedArray;
    the registration is not visible here -- confirm.
    """
    raise NotImplementedError(
        "_text_cat_chunked is only implemented for pa.Array and pa.ChunkedArray"
    )
示例15: _text_cat_chunked_1
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import ChunkedArray [as 别名]
def _text_cat_chunked_1(a: pa.ChunkedArray, b: pa.ChunkedArray) -> pa.ChunkedArray:
    """
    Apply ``_text_cat`` to two chunked arrays piece by piece.

    ``_combined_in_chunk_offsets`` supplies, for each input, a sequence of
    (chunk index, start inside the chunk, length) entries. The two sequences
    are walked in lockstep, the matching slices are passed to ``_text_cat``
    and the results become the chunks of the returned array.
    """

    def _window(chunked, spec):
        # spec[0]: chunk index, spec[1]: start in that chunk, spec[2]: length
        start = spec[1]
        return chunked.chunk(spec[0])[start : start + spec[2]]

    in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)
    return pa.chunked_array(
        [
            _text_cat(_window(a, a_spec), _window(b, b_spec))
            for a_spec, b_spec in zip(in_a_offsets, in_b_offsets)
        ]
    )