This article collects typical usage examples of the pyarrow.field method in Python. If you are wondering exactly what pyarrow.field does, how to call it, or what real-world usage looks like, the curated code samples here may help. You can also explore further usage examples from the pyarrow module itself.
The following shows 15 code examples of the pyarrow.field method, sorted by popularity by default.
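Before the examples, a minimal, self-contained sketch of what pyarrow.field produces (the field names and types below are illustrative only):

import pyarrow as pa

# A field is a (name, type, nullability, metadata) descriptor used when building schemas.
f = pa.field("user_id", pa.int64(), nullable=False)
print(f)  # user_id: int64 not null

# Fields combine into a schema; pa.field also accepts string type aliases such as "string".
s = pa.schema([f, pa.field("email", "string")])
print(s)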
Example 1: __init__
# Required import: import pyarrow [as alias]
# Or: from pyarrow import field [as alias]
def __init__(self, name, dataType, nullable=True, metadata=None):
"""
>>> (StructField("f1", StringType(), True)
... == StructField("f1", StringType(), True))
True
>>> (StructField("f1", StringType(), True)
... == StructField("f2", StringType(), True))
False
"""
assert isinstance(dataType, DataType),\
"dataType %s should be an instance of %s" % (dataType, DataType)
assert isinstance(name, basestring), "field name %s should be string" % (name)
if not isinstance(name, str):
name = name.encode('utf-8')
self.name = name
self.dataType = dataType
self.nullable = nullable
self.metadata = metadata or {}
Example 2: __getitem__
# Required import: import pyarrow [as alias]
# Or: from pyarrow import field [as alias]
def __getitem__(self, key):
"""Access fields by name or slice."""
if isinstance(key, str):
for field in self:
if field.name == key:
return field
raise KeyError('No StructField named {0}'.format(key))
elif isinstance(key, int):
try:
return self.fields[key]
except IndexError:
raise IndexError('StructType index out of range')
elif isinstance(key, slice):
return StructType(self.fields[key])
else:
raise TypeError('StructType keys should be strings, integers or slices')
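A quick illustration of the accessor above (a minimal sketch assuming pyspark.sql.types is importable; the field names are made up):

from pyspark.sql.types import StructType, StructField, StringType, IntegerType

st = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
])
st["name"]  # the StructField named "name"
st[1]       # the second field, "age"
st[0:1]     # a new StructType containing only the "name" field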
Example 3: test_bq_to_arrow_data_type_w_struct_unknown_subfield
# Required import: import pyarrow [as alias]
# Or: from pyarrow import field [as alias]
def test_bq_to_arrow_data_type_w_struct_unknown_subfield(module_under_test):
fields = (
schema.SchemaField("field1", "STRING"),
schema.SchemaField("field2", "INTEGER"),
# Don't know what to convert UNKNOWN_TYPE to; let type inference handle it instead.
schema.SchemaField("field3", "UNKNOWN_TYPE"),
)
field = schema.SchemaField("ignored_name", "RECORD", mode="NULLABLE", fields=fields)
with warnings.catch_warnings(record=True) as warned:
actual = module_under_test.bq_to_arrow_data_type(field)
assert actual is None
assert len(warned) == 1
warning = warned[0]
assert "field3" in str(warning)
Example 4: test_dataframe_to_arrow_dict_sequence_schema
# Required import: import pyarrow [as alias]
# Or: from pyarrow import field [as alias]
def test_dataframe_to_arrow_dict_sequence_schema(module_under_test):
dict_schema = [
{"name": "field01", "type": "STRING", "mode": "REQUIRED"},
{"name": "field02", "type": "BOOL", "mode": "NULLABLE"},
]
dataframe = pandas.DataFrame(
{"field01": [u"hello", u"world"], "field02": [True, False]}
)
arrow_table = module_under_test.dataframe_to_arrow(dataframe, dict_schema)
arrow_schema = arrow_table.schema
expected_fields = [
pyarrow.field("field01", "string", nullable=False),
pyarrow.field("field02", "bool", nullable=True),
]
assert list(arrow_schema) == expected_fields
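Note that pyarrow.field accepts either a DataType object or a string type alias, so the expected fields above could equivalently be built as follows (illustrative sketch):

import pyarrow

expected_fields = [
    pyarrow.field("field01", pyarrow.string(), nullable=False),
    pyarrow.field("field02", pyarrow.bool_(), nullable=True),
]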
Example 5: test_range_partitioning
# Required import: import pyarrow [as alias]
# Or: from pyarrow import field [as alias]
def test_range_partitioning(self):
from google.cloud.bigquery.table import RangePartitioning
from google.cloud.bigquery.table import PartitionRange
table = self._make_one("proj.dset.tbl")
assert table.range_partitioning is None
table.range_partitioning = RangePartitioning(
field="col1", range_=PartitionRange(start=-512, end=1024, interval=128)
)
assert table.range_partitioning.field == "col1"
assert table.range_partitioning.range_.start == -512
assert table.range_partitioning.range_.end == 1024
assert table.range_partitioning.range_.interval == 128
table.range_partitioning = None
assert table.range_partitioning is None
Example 6: test_time_partitioning_getter
# Required import: import pyarrow [as alias]
# Or: from pyarrow import field [as alias]
def test_time_partitioning_getter(self):
from google.cloud.bigquery.table import TimePartitioning
from google.cloud.bigquery.table import TimePartitioningType
dataset = DatasetReference(self.PROJECT, self.DS_ID)
table_ref = dataset.table(self.TABLE_NAME)
table = self._make_one(table_ref)
table._properties["timePartitioning"] = {
"type": "DAY",
"field": "col1",
"expirationMs": "123456",
"requirePartitionFilter": False,
}
self.assertIsInstance(table.time_partitioning, TimePartitioning)
self.assertEqual(table.time_partitioning.type_, TimePartitioningType.DAY)
self.assertEqual(table.time_partitioning.field, "col1")
self.assertEqual(table.time_partitioning.expiration_ms, 123456)
with warnings.catch_warnings(record=True) as warned:
self.assertFalse(table.time_partitioning.require_partition_filter)
assert len(warned) == 1
self.assertIs(warned[0].category, PendingDeprecationWarning)
Example 7: test_time_partitioning_getter_w_empty
# Required import: import pyarrow [as alias]
# Or: from pyarrow import field [as alias]
def test_time_partitioning_getter_w_empty(self):
from google.cloud.bigquery.table import TimePartitioning
dataset = DatasetReference(self.PROJECT, self.DS_ID)
table_ref = dataset.table(self.TABLE_NAME)
table = self._make_one(table_ref)
# Even though there are required properties according to the API
# specification, sometimes time partitioning is populated as an empty
# object. See internal bug 131167013.
table._properties["timePartitioning"] = {}
self.assertIsInstance(table.time_partitioning, TimePartitioning)
self.assertIsNone(table.time_partitioning.type_)
self.assertIsNone(table.time_partitioning.field)
self.assertIsNone(table.time_partitioning.expiration_ms)
with warnings.catch_warnings(record=True) as warned:
self.assertIsNone(table.time_partitioning.require_partition_filter)
for warning in warned:
self.assertIs(warning.category, PendingDeprecationWarning)
Example 8: test_from_api_repr_explicit
# Required import: import pyarrow [as alias]
# Or: from pyarrow import field [as alias]
def test_from_api_repr_explicit(self):
from google.cloud.bigquery.table import TimePartitioningType
klass = self._get_target_class()
api_repr = {
"type": "DAY",
"field": "name",
"expirationMs": "10000",
"requirePartitionFilter": True,
}
time_partitioning = klass.from_api_repr(api_repr)
self.assertEqual(time_partitioning.type_, TimePartitioningType.DAY)
self.assertEqual(time_partitioning.field, "name")
self.assertEqual(time_partitioning.expiration_ms, 10000)
with warnings.catch_warnings(record=True) as warned:
self.assertTrue(time_partitioning.require_partition_filter)
self.assertIs(warned[0].category, PendingDeprecationWarning)
Example 9: test_to_api_repr_explicit
# Required import: import pyarrow [as alias]
# Or: from pyarrow import field [as alias]
def test_to_api_repr_explicit(self):
from google.cloud.bigquery.table import TimePartitioningType
time_partitioning = self._make_one(
type_=TimePartitioningType.DAY, field="name", expiration_ms=10000
)
with warnings.catch_warnings(record=True) as warned:
time_partitioning.require_partition_filter = True
self.assertIs(warned[0].category, PendingDeprecationWarning)
expected = {
"type": "DAY",
"field": "name",
"expirationMs": "10000",
"requirePartitionFilter": True,
}
self.assertEqual(time_partitioning.to_api_repr(), expected)
Example 10: bq_to_arrow_data_type
# Required import: import pyarrow [as alias]
# Or: from pyarrow import field [as alias]
def bq_to_arrow_data_type(field):
"""Return the Arrow data type, corresponding to a given BigQuery column.
Returns:
None: if default Arrow type inspection should be used.
"""
if field.mode is not None and field.mode.upper() == "REPEATED":
inner_type = bq_to_arrow_data_type(
schema.SchemaField(field.name, field.field_type, fields=field.fields)
)
if inner_type:
return pyarrow.list_(inner_type)
return None
field_type_upper = field.field_type.upper() if field.field_type else ""
if field_type_upper in schema._STRUCT_TYPES:
return bq_to_arrow_struct_data_type(field)
data_type_constructor = BQ_TO_ARROW_SCALARS.get(field_type_upper)
if data_type_constructor is None:
return None
return data_type_constructor()
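For context, a rough sketch of how this converter behaves; the exact BQ_TO_ARROW_SCALARS mapping is not shown above, so the expected results are assumptions based on the usual BigQuery-to-Arrow correspondence:

from google.cloud.bigquery import schema

bq_to_arrow_data_type(schema.SchemaField("name", "STRING"))
# -> pyarrow.string(), assuming STRING is registered in BQ_TO_ARROW_SCALARS
bq_to_arrow_data_type(schema.SchemaField("tags", "STRING", mode="REPEATED"))
# -> pyarrow.list_(pyarrow.string())
bq_to_arrow_data_type(schema.SchemaField("blob", "UNKNOWN_TYPE"))
# -> None, so default Arrow type inference is used downstream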
Example 11: download_arrow_tabledata_list
# Required import: import pyarrow [as alias]
# Or: from pyarrow import field [as alias]
def download_arrow_tabledata_list(pages, bq_schema):
"""Use tabledata.list to construct an iterable of RecordBatches.
Args:
pages (Iterator[:class:`google.api_core.page_iterator.Page`]):
An iterator over the result pages.
bq_schema (Sequence[Union[ \
:class:`~google.cloud.bigquery.schema.SchemaField`, \
Mapping[str, Any] \
]]):
A description of the fields in result pages.
Yields:
:class:`pyarrow.RecordBatch`
The next page of records as a ``pyarrow`` record batch.
"""
bq_schema = schema._to_schema_fields(bq_schema)
column_names = bq_to_arrow_schema(bq_schema) or [field.name for field in bq_schema]
arrow_types = [bq_to_arrow_data_type(field) for field in bq_schema]
for page in pages:
yield _tabledata_list_page_to_arrow(page, column_names, arrow_types)
Example 12: download_dataframe_tabledata_list
# Required import: import pyarrow [as alias]
# Or: from pyarrow import field [as alias]
def download_dataframe_tabledata_list(pages, bq_schema, dtypes):
"""Use (slower, but free) tabledata.list to construct a DataFrame.
Args:
pages (Iterator[:class:`google.api_core.page_iterator.Page`]):
An iterator over the result pages.
bq_schema (Sequence[Union[ \
:class:`~google.cloud.bigquery.schema.SchemaField`, \
Mapping[str, Any] \
]]):
A description of the fields in result pages.
dtypes(Mapping[str, numpy.dtype]):
The types of columns in result data to hint construction of the
resulting DataFrame. Not all column types have to be specified.
Yields:
:class:`pandas.DataFrame`
The next page of records as a ``pandas.DataFrame``.
"""
bq_schema = schema._to_schema_fields(bq_schema)
column_names = [field.name for field in bq_schema]
for page in pages:
yield _tabledata_list_page_to_dataframe(page, column_names, dtypes)
Example 13: get_pa_translated_schema
# Required import: import pyarrow [as alias]
# Or: from pyarrow import field [as alias]
def get_pa_translated_schema(self):
"""Translates a BigQuery schema to an parquet schema.
Returns: Translated parquet schema in pyarrow.Schema format.
"""
type_conversions = {
'STRING': pa.string(),
'NUMERIC': pa.int64(),
}
# TODO(annarudy@google.com): add support for nested fields
pa_schema_list = [
pa.field(
bq_field.name,
type_conversions[bq_field.field_type],
) for bq_field in self.bq_schema
]
return pa.schema(pa_schema_list)
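A minimal sketch of what this translation yields, using a hypothetical stand-in for the BigQuery schema fields (the surrounding class and its bq_schema attribute are not shown above):

import collections
import pyarrow as pa

# Hypothetical stand-in for google.cloud.bigquery.schema.SchemaField.
BQField = collections.namedtuple("BQField", ["name", "field_type"])

bq_schema = [BQField("title", "STRING"), BQField("views", "NUMERIC")]
type_conversions = {"STRING": pa.string(), "NUMERIC": pa.int64()}
pa_schema = pa.schema(
    [pa.field(f.name, type_conversions[f.field_type]) for f in bq_schema]
)
# pa_schema has two fields: title: string, views: int64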
Example 14: test_validate_schema_non_overlapping_nulls
# Required import: import pyarrow [as alias]
# Or: from pyarrow import field [as alias]
def test_validate_schema_non_overlapping_nulls(df_all_types_schema):
"""
Test that two schemas with non-overlapping null columns are valid
"""
first_ix = np.random.randint(len(df_all_types_schema))
second_ix = first_ix
while second_ix == first_ix:
second_ix = np.random.randint(len(df_all_types_schema))
first_null = pa.field(name=df_all_types_schema.names[first_ix], type=pa.null())
first_schema = df_all_types_schema.set(first_ix, first_null)
second_null = pa.field(name=df_all_types_schema.names[second_ix], type=pa.null())
second_schema = df_all_types_schema.set(second_ix, second_null)
for schemas in permutations([first_schema, second_schema]):
reference_schema = validate_compatible(schemas)
# The reference schema should be the original schema
# with the columns reconstructed
assert df_all_types_schema == reference_schema
Example 15: test_make_meta_column_normalization_pyarrow_schema
# Required import: import pyarrow [as alias]
# Or: from pyarrow import field [as alias]
def test_make_meta_column_normalization_pyarrow_schema():
# GH228
df = pd.DataFrame(
[{"part": 1, "id": 1, "col1": "abc"}, {"part": 2, "id": 2, "col1": np.nan}],
# Kartothek normalizes field order so that partition keys come first and the
# rest is sorted alphabetically. The column order here is the reverse of that.
columns=["col1", "id", "part"],
)
schema = make_meta(
pa.Schema.from_pandas(df), origin="gh228", partition_keys=["part"]
)
fields = [
pa.field("part", pa.int64()),
pa.field("col1", pa.string()),
pa.field("id", pa.int64()),
]
expected_schema = pa.schema(fields)
assert schema.internal().equals(expected_schema, check_metadata=False)