本文整理汇总了Python中pyarrow.string函数的典型用法代码示例。如果您正苦于以下问题:Python pyarrow.string函数的具体用法?Python pyarrow.string怎么用?Python pyarrow.string使用的例子?那么恭喜您,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该函数所在模块pyarrow的用法示例。
在下文中一共展示了pyarrow.string方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_redshift_spectrum_long_string
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def test_redshift_spectrum_long_string(path, glue_table, glue_database, redshift_external_schema):
    """Write two 300-letter strings to Parquet and read them back through Redshift.

    The frame is written with ``wr.s3.to_parquet`` (passing ``glue_database``
    and ``glue_table``), then ``{redshift_external_schema}.{glue_table}`` is
    queried and the row count and per-row column count are compared with the
    frame.
    """

    def _random_letters():
        # 300 random ASCII letters per value.
        return "".join(random.choice(string.ascii_letters) for _ in range(300))

    df = pd.DataFrame({"id": [1, 2], "col_str": [_random_letters(), _random_letters()]})
    write_result = wr.s3.to_parquet(
        df=df,
        path=path,
        database=glue_database,
        table=glue_table,
        mode="overwrite",
        index=False,
        dataset=True,
    )
    paths = write_result["paths"]
    wr.s3.wait_objects_exist(paths=paths, use_threads=False)
    engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift")
    with engine.connect() as con:
        rows = con.execute(f"SELECT * FROM {redshift_external_schema}.{glue_table}").fetchall()
        assert len(rows) == len(df.index)
        for row in rows:
            assert len(row) == len(df.columns)
示例2: __init__
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def __init__(self, name, dataType, nullable=True, metadata=None):
    """
    Store the field's name, data type, nullability and metadata.

    :param name: field name; must be an instance of ``basestring``.
    :param dataType: must be an instance of ``DataType``.
    :param nullable: stored unchanged on ``self.nullable`` (default True).
    :param metadata: stored on ``self.metadata``; a falsy value becomes ``{}``.
    :raises AssertionError: if ``dataType`` or ``name`` has the wrong type.

    >>> (StructField("f1", StringType(), True)
    ...      == StructField("f1", StringType(), True))
    True
    >>> (StructField("f1", StringType(), True)
    ...      == StructField("f2", StringType(), True))
    False
    """
    assert isinstance(dataType, DataType),\
        "dataType %s should be an instance of %s" % (dataType, DataType)
    # Use a one-element tuple: with a bare ``% (name)`` a tuple-valued name
    # is unpacked as the format arguments and raises TypeError instead of
    # the intended AssertionError.
    assert isinstance(name, basestring), "field name %s should be string" % (name,)
    if not isinstance(name, str):
        # NOTE(review): presumably narrows a Python 2 ``unicode`` name to a
        # byte ``str`` -- confirm against the module's ``basestring`` alias.
        name = name.encode('utf-8')
    self.name = name
    self.dataType = dataType
    self.nullable = nullable
    self.metadata = metadata or {}
示例3: test_dataframe_to_arrow_dict_sequence_schema
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def test_dataframe_to_arrow_dict_sequence_schema(module_under_test):
    """Check ``dataframe_to_arrow`` when the schema is given as a list of dicts.

    A REQUIRED field is expected to come back as a non-nullable arrow field
    and a NULLABLE field as a nullable one.
    """
    schema_as_dicts = [
        {"name": "field01", "type": "STRING", "mode": "REQUIRED"},
        {"name": "field02", "type": "BOOL", "mode": "NULLABLE"},
    ]
    frame = pandas.DataFrame(
        {"field01": [u"hello", u"world"], "field02": [True, False]}
    )

    actual_fields = list(module_under_test.dataframe_to_arrow(frame, schema_as_dicts).schema)

    assert actual_fields == [
        pyarrow.field("field01", "string", nullable=False),
        pyarrow.field("field02", "bool", nullable=True),
    ]
示例4: test_num_bytes_getter
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def test_num_bytes_getter(self):
    """Exercise ``num_bytes`` with no value, an int, a numeric string and a bad string."""
    table_ref = DatasetReference(self.PROJECT, self.DS_ID).table(self.TABLE_NAME)
    table = self._make_one(table_ref)

    # Nothing set yet: the property reads as None.
    self.assertIsNone(table.num_bytes)

    expected = 1337

    # Stored as an integer.
    table._properties = {"numBytes": expected}
    self.assertEqual(table.num_bytes, expected)

    # Stored as the string form of the same integer.
    table._properties = {"numBytes": str(expected)}
    self.assertEqual(table.num_bytes, expected)

    # A non-numeric string must raise when the property is read.
    table._properties = {"numBytes": "x"}
    with self.assertRaises(ValueError):
        getattr(table, "num_bytes")
示例5: test_text_zfill
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def test_text_zfill(data, fletcher_variant):
    """Compare ``fr_text.zfill`` on a fletcher-backed series with ``str.zfill`` on a pandas one.

    Both are padded to one more than the longest length reported by
    ``_optional_len`` (0 when that maximum is NA).
    """
    # pandas cannot handle \x00 characters in tests, and pytest.skip is not
    # working properly with hypothesis, so leave with a plain return.
    if any("\x00" in x for x in data if x):
        return

    ser_pd = pd.Series(data, dtype=str)
    max_str_len = ser_pd.map(_optional_len).max()
    if pd.isna(max_str_len):
        max_str_len = 0
    width = max_str_len + 1

    arrow_data = pa.array(data, type=pa.string())
    array_cls = (
        fr.FletcherChunkedArray
        if fletcher_variant == "chunked"
        else fr.FletcherContinuousArray
    )
    ser_fr = pd.Series(array_cls(arrow_data))

    result_pd = ser_pd.str.zfill(width)
    result_fr = ser_fr.fr_text.zfill(width).astype(object)
    # Pandas returns np.nan for NA values in cat, keep this in line
    result_fr[result_fr.isna()] = np.nan
    tm.assert_series_equal(result_fr, result_pd)
示例6: test_string_builder_simple
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def test_string_builder_simple(data):
    """Build a string array with ``NumbaStringArrayBuilder`` and compare its buffers with pyarrow's.

    Each non-None item is pushed one character at a time via ``put_byte``;
    None items are recorded with ``finish_null``.
    """
    builder = NumbaStringArrayBuilder(2, 6)
    for item in data:
        if item is not None:
            for char in item:
                builder.put_byte(ord(char))
            builder.finish_string()
        else:
            builder.finish_null()
    builder.finish()

    reference = pa.array(data, pa.string())
    # Only the offsets and data buffers are compared; the first buffer is unused.
    _missing, expected_offsets, expected_data = buffers_as_arrays(reference)
    np.testing.assert_array_equal(builder.offsets, expected_offsets)
    np.testing.assert_array_equal(builder.data, expected_data)
示例7: __eq__
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def __eq__(self, other) -> bool:
    """Return whether ``other`` is considered equal to this object.

    ``other`` is equal when

    * it is a string equal to ``self.name``, or
    * it is an instance of this object's type with an equal ``arrow_dtype``.

    Anything else compares unequal.

    Parameters
    ----------
    other : Any

    Returns
    -------
    bool
    """
    # Strings are matched against the name only.
    if isinstance(other, str):
        return other == self.name
    # Same (or derived) type: compare the wrapped arrow dtype.
    if isinstance(other, type(self)):
        return self.arrow_dtype == other.arrow_dtype
    return False
示例8: _text_cat
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def _text_cat(a: pa.Array, b: pa.Array) -> pa.Array:
    """Build a new string array from the buffers of ``a`` and ``b``.

    The output data buffer is sized to the combined byte length of both
    inputs and filled by ``_merge_string_data``; the validity buffer comes
    from ``_merge_valid_bitmaps``.

    Returns ``a`` itself when it is empty.

    Raises
    ------
    ValueError
        If ``a`` and ``b`` have different lengths.
    """
    length = len(a)
    if length != len(b):
        raise ValueError("Lengths of arrays don't match")

    offsets_a, data_a = _extract_string_buffers(a)
    offsets_b, data_b = _extract_string_buffers(b)
    if length == 0:
        return a

    valid = _merge_valid_bitmaps(a, b)
    result_offsets = np.empty(length + 1, dtype=np.int32)
    result_offsets[0] = 0
    # Bytes actually referenced by each input (last offset minus first).
    size_a = offsets_a[-1] - offsets_a[0]
    size_b = offsets_b[-1] - offsets_b[0]
    result_data = np.empty(size_a + size_b, dtype=np.uint8)
    _merge_string_data(
        length, valid, offsets_a, data_a, offsets_b, data_b, result_offsets, result_data
    )
    buffers = [
        pa.py_buffer(valid),
        pa.py_buffer(result_offsets),
        pa.py_buffer(result_data),
    ]
    return pa.Array.from_buffers(pa.string(), length, buffers)
示例9: get_pa_translated_schema
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def get_pa_translated_schema(self):
    """Translates the BigQuery schema in ``self.bq_schema`` to a parquet schema.

    Only the ``STRING`` and ``NUMERIC`` field types are mapped; any other
    ``field_type`` raises ``KeyError``.

    Returns: Translated parquet schema in pyarrow.Schema format.
    """
    type_conversions = {'STRING': pa.string(), 'NUMERIC': pa.int64()}
    # TODO(annarudy@google.com): add support for nested fields
    translated_fields = []
    for bq_field in self.bq_schema:
        arrow_type = type_conversions[bq_field.field_type]
        translated_fields.append(pa.field(bq_field.name, arrow_type))
    return pa.schema(translated_fields)
示例10: test_dataframe_all_null_category_column
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def test_dataframe_all_null_category_column(self):
    """Convert a one-row, all-None categorical column and compare with the expected table.

    The expected column is a dictionary array with a single null int8 index
    and an empty string dictionary.
    """
    dataframe = pd.DataFrame({"A": [None]}, dtype=str).astype("category")
    actual = dataframe_to_arrow_table(
        dataframe, [Column("A", ColumnType.TEXT())], self.path
    )

    null_indices = pyarrow.array([None], type=pyarrow.int8())
    empty_dictionary = pyarrow.array([], type=pyarrow.string())
    expected = arrow_table(
        {"A": pyarrow.DictionaryArray.from_arrays(null_indices, empty_dictionary)}
    )

    assert_arrow_table_equals(actual, expected)
示例11: load_from_buffer
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def load_from_buffer(
    buf, store: KeyValueStore, format: str = "json"
) -> "DatasetMetadata":
    """
    Load a dataset from a (string) buffer.

    Parameters
    ----------
    buf:
        Input to be parsed.
    store:
        Object that implements the .get method for file/object loading.
    format:
        Serialization of ``buf``: ``"json"`` (default) or ``"msgpack"``.

    Returns
    -------
    dataset_metadata:
        Parsed metadata.

    Raises
    ------
    ValueError
        If ``format`` is neither ``"json"`` nor ``"msgpack"``.
    """
    if format == "json":
        metadata = load_json(buf)
    elif format == "msgpack":
        metadata = msgpack.unpackb(buf)
    else:
        # Without this branch ``metadata`` stays unbound and the call below
        # fails with an UnboundLocalError that hides the actual mistake.
        raise ValueError(f"Unknown format: {format!r}")
    return DatasetMetadata.load_from_dict(metadata, store)
示例12: test_iterate_over_string_chunk
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def test_iterate_over_string_chunk():
    """Run ``iterate_over_test_chunk`` on two string columns.

    Cell values come from ``str_generator``, which renders a random integer
    in [-100, 100] as text. Both columns carry ``{"logicalType": "TEXT"}``
    metadata.
    """
    # random.seed() only accepts None, int, float, str, bytes and bytearray;
    # a datetime object raises TypeError on Python 3.11+, so seed with the
    # float POSIX timestamp of the current time instead.
    random.seed(datetime.datetime.now().timestamp())
    column_meta = [
        {"logicalType": "TEXT"},
        {"logicalType": "TEXT"}
    ]
    field_foo = pyarrow.field("column_foo", pyarrow.string(), True, column_meta[0])
    field_bar = pyarrow.field("column_bar", pyarrow.string(), True, column_meta[1])
    # The resulting schema object is discarded.
    # NOTE(review): presumably kept only to check the two fields form a valid
    # schema -- confirm before removing.
    pyarrow.schema([field_foo, field_bar])

    def str_generator():
        return str(random.randint(-100, 100))

    iterate_over_test_chunk([pyarrow.string(), pyarrow.string()],
                            column_meta, str_generator)
示例13: get_pyarrow_types
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def get_pyarrow_types():
    """Return a new dict mapping type-name strings to the module's ``PA_*`` constants."""
    type_map = dict(
        bool=PA_BOOL,
        float32=PA_FLOAT32,
        float64=PA_FLOAT64,
        int8=PA_INT8,
        int16=PA_INT16,
        int32=PA_INT32,
        int64=PA_INT64,
        string=PA_STRING,
        timestamp=PA_TIMESTAMP,
        base64=PA_BINARY,
    )
    return type_map
# pylint: disable=too-many-branches,too-many-statements
示例14: test_argparse_types
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def test_argparse_types():
    """Check the ``raw_types`` captured when two ``--type`` specs are passed.

    ``0=string`` is expected as ``('0', pa.string(), False)`` and ``0=int8?``
    as ``('0', pa.int8(), True)``.
    """
    captured = {}
    argv = ['foo.csv', '--type', '0=string', '0=int8?']
    csv2parquet.main_with_args(capture_args(captured), argv)

    actual = captured['raw_types']
    assert actual == [('0', pa.string(), False), ('0', pa.int8(), True)]
示例15: testIsBinaryLike
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def testIsBinaryLike(self):
    """``is_binary_like`` is true for flat binary/string types and false for lists of them."""
    binary_like_types = (
        pa.binary(),
        pa.large_binary(),
        pa.string(),
        pa.large_string(),
    )
    for arrow_type in binary_like_types:
        self.assertTrue(arrow_util.is_binary_like(arrow_type))

    list_types = (pa.list_(pa.binary()), pa.large_list(pa.string()))
    for arrow_type in list_types:
        self.assertFalse(arrow_util.is_binary_like(arrow_type))