This article collects typical usage examples of the Python method pyarrow.schema. If you are wondering what pyarrow.schema does, how to call it, or what real-world code using it looks like, the curated examples below should help. You can also explore further usage examples from the pyarrow module itself.
The following presents 15 code examples of the pyarrow.schema method, sorted by popularity by default.
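As a quick orientation before the examples: pyarrow.schema builds a pyarrow.Schema from a list of (name, type) pairs or pyarrow.field objects. A minimal sketch (the field names here are illustrative, not from any example below):

import pyarrow as pa

# Build a schema from (name, type) pairs; pa.field(...) objects work too.
schema = pa.schema([
    ("id", pa.int64()),
    ("name", pa.string()),
    ("score", pa.float64()),
])
print(schema.names)  # ['id', 'name', 'score']
print(schema.types)  # [DataType(int64), DataType(string), DataType(double)]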
Example 1: _mock_parquet_dataset

# Required import: import pyarrow [as alias]
# Or: from pyarrow import schema [as alias]
from unittest import mock  # the standalone `mock` package also works

def _mock_parquet_dataset(partitions, arrow_schema):
    """Creates a pyarrow.ParquetDataset mock capable of returning:

        parquet_dataset.pieces[0].get_metadata(parquet_dataset.fs.open).schema.to_arrow_schema() == schema
        parquet_dataset.partitions = partitions

    :param partitions: expected to be a list of pa.parquet.PartitionSet
    :param arrow_schema: an instance of pyarrow.Schema to be assumed by the mock parquet dataset object.
    :return: a mock of pyarrow.parquet.ParquetDataset
    """
    piece_mock = mock.Mock()
    piece_mock.get_metadata().schema.to_arrow_schema.return_value = arrow_schema

    dataset_mock = mock.Mock()
    type(dataset_mock).pieces = mock.PropertyMock(return_value=[piece_mock])
    type(dataset_mock).partitions = partitions

    return dataset_mock
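A hedged usage sketch for the factory above; the schema and the empty partition list are placeholder values, not taken from the original test suite:

import pyarrow as pa

arrow_schema = pa.schema([("id", pa.int64()), ("value", pa.string())])
dataset_mock = _mock_parquet_dataset(partitions=[], arrow_schema=arrow_schema)

# The mock now answers the calls the docstring promises
# (a Mock accepts any arguments, so the fs.open argument may be omitted):
assert dataset_mock.pieces[0].get_metadata().schema.to_arrow_schema() == arrow_schema
assert dataset_mock.partitions == []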
Example 2: jsonValue

# Required import: import pyarrow [as alias]
# Or: from pyarrow import schema [as alias]
import base64

from pyspark.serializers import CloudPickleSerializer

# A method of pyspark.sql.types.UserDefinedType.
def jsonValue(self):
    if self.scalaUDT():
        assert self.module() != '__main__', 'UDT in __main__ cannot work with ScalaUDT'
        schema = {
            "type": "udt",
            "class": self.scalaUDT(),
            "pyClass": "%s.%s" % (self.module(), type(self).__name__),
            "sqlType": self.sqlType().jsonValue()
        }
    else:
        # Pickle the UDT class itself so it can be recreated when the schema is deserialized.
        ser = CloudPickleSerializer()
        b = ser.dumps(type(self))
        schema = {
            "type": "udt",
            "pyClass": "%s.%s" % (self.module(), type(self).__name__),
            "serializedClass": base64.b64encode(b).decode('utf8'),
            "sqlType": self.sqlType().jsonValue()
        }
    return schema
Example 3: _infer_schema

# Required import: import pyarrow [as alias]
# Or: from pyarrow import schema [as alias]
from pyspark.sql.types import StructField, StructType, _infer_type  # pyspark internals

def _infer_schema(row, names=None):
    """Infer the schema from dict/namedtuple/object"""
    if isinstance(row, dict):
        items = sorted(row.items())

    elif isinstance(row, (tuple, list)):
        if hasattr(row, "__fields__"):  # Row
            items = zip(row.__fields__, tuple(row))
        elif hasattr(row, "_fields"):  # namedtuple
            items = zip(row._fields, tuple(row))
        else:
            if names is None:
                names = ['_%d' % i for i in range(1, len(row) + 1)]
            elif len(names) < len(row):
                names.extend('_%d' % i for i in range(len(names) + 1, len(row) + 1))
            items = zip(names, row)

    elif hasattr(row, "__dict__"):  # object
        items = sorted(row.__dict__.items())

    else:
        raise TypeError("Can not infer schema for type: %s" % type(row))

    fields = [StructField(k, _infer_type(v), True) for k, v in items]
    return StructType(fields)
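A hedged usage sketch, assuming the pyspark internals imported above are in scope; the sample row is illustrative:

row = {"name": "Ada", "age": 36}
schema = _infer_schema(row)
print(schema)
# a StructType with an 'age' (LongType) and a 'name' (StringType) field;
# note that dict items are sorted by key, so 'age' comes before 'name'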
Example 4: test__row_from_mapping_w_invalid_schema

# Required import: import pyarrow [as alias]
# Or: from pyarrow import schema [as alias]
def test__row_from_mapping_w_invalid_schema(self):
    from google.cloud.bigquery.schema import SchemaField
    from google.cloud.bigquery.table import Table

    MAPPING = {
        "full_name": "Phred Phlyntstone",
        "age": 32,
        "colors": ["red", "green"],
        "bogus": "WHATEVER",
    }
    dataset = DatasetReference(self.PROJECT, self.DS_ID)
    table_ref = dataset.table(self.TABLE_NAME)
    full_name = SchemaField("full_name", "STRING", mode="REQUIRED")
    age = SchemaField("age", "INTEGER", mode="REQUIRED")
    colors = SchemaField("colors", "DATETIME", mode="REPEATED")
    bogus = SchemaField("joined", "STRING", mode="BOGUS")
    table = Table(table_ref, schema=[full_name, age, colors, bogus])

    with self.assertRaises(ValueError) as exc:
        self._call_fut(MAPPING, table.schema)

    self.assertIn("Unknown field mode: BOGUS", str(exc.exception))
Example 5: test__row_from_mapping_w_schema

# Required import: import pyarrow [as alias]
# Or: from pyarrow import schema [as alias]
def test__row_from_mapping_w_schema(self):
    from google.cloud.bigquery.schema import SchemaField
    from google.cloud.bigquery.table import Table

    MAPPING = {
        "full_name": "Phred Phlyntstone",
        "age": 32,
        "colors": ["red", "green"],
        "extra": "IGNORED",
    }
    dataset = DatasetReference(self.PROJECT, self.DS_ID)
    table_ref = dataset.table(self.TABLE_NAME)
    full_name = SchemaField("full_name", "STRING", mode="REQUIRED")
    age = SchemaField("age", "INTEGER", mode="REQUIRED")
    colors = SchemaField("colors", "DATETIME", mode="REPEATED")
    joined = SchemaField("joined", "STRING", mode="NULLABLE")
    table = Table(table_ref, schema=[full_name, age, colors, joined])

    self.assertEqual(
        self._call_fut(MAPPING, table.schema),
        ("Phred Phlyntstone", 32, ["red", "green"], None),
    )
Example 6: test_to_dataframe_iterable_error_if_pandas_is_none

# Required import: import pyarrow [as alias]
# Or: from pyarrow import schema [as alias]
def test_to_dataframe_iterable_error_if_pandas_is_none(self):
    from google.cloud.bigquery.schema import SchemaField

    schema = [
        SchemaField("name", "STRING", mode="REQUIRED"),
        SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]
    rows = [
        {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
        {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
    ]
    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})
    row_iterator = self._make_one(_mock_client(), api_request, path, schema)

    with pytest.raises(ValueError, match="pandas"):
        row_iterator.to_dataframe_iterable()
Example 7: test_to_dataframe

# Required import: import pyarrow [as alias]
# Or: from pyarrow import schema [as alias]
def test_to_dataframe(self):
    from google.cloud.bigquery.schema import SchemaField

    schema = [
        SchemaField("name", "STRING", mode="REQUIRED"),
        SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]
    rows = [
        {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
        {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
        {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
        {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
    ]
    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})
    row_iterator = self._make_one(_mock_client(), api_request, path, schema)

    df = row_iterator.to_dataframe(create_bqstorage_client=False)

    self.assertIsInstance(df, pandas.DataFrame)
    self.assertEqual(len(df), 4)  # verify the number of rows
    self.assertEqual(list(df), ["name", "age"])  # verify the column names
    self.assertEqual(df.name.dtype.name, "object")
    self.assertEqual(df.age.dtype.name, "int64")
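The rows payloads used throughout these tests follow the BigQuery tabledata.list REST format: each row is {"f": [{"v": value}, ...]}, one cell per schema field, with every value delivered as a string. A small hand-rolled decoding sketch (for illustration only, not part of the client library):

schema_names = ["name", "age"]
row = {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]}
decoded = dict(zip(schema_names, (cell["v"] for cell in row["f"])))
print(decoded)  # {'name': 'Phred Phlyntstone', 'age': '32'} -- note the stringified age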
Example 8: test_to_dataframe_no_tqdm_no_progress_bar

# Required import: import pyarrow [as alias]
# Or: from pyarrow import schema [as alias]
import warnings

def test_to_dataframe_no_tqdm_no_progress_bar(self):
    from google.cloud.bigquery.schema import SchemaField

    schema = [
        SchemaField("name", "STRING", mode="REQUIRED"),
        SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]
    rows = [
        {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
        {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
        {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
        {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
    ]
    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})
    row_iterator = self._make_one(_mock_client(), api_request, path, schema)

    with warnings.catch_warnings(record=True) as warned:
        df = row_iterator.to_dataframe(create_bqstorage_client=False)

    self.assertEqual(len(warned), 0)
    self.assertEqual(len(df), 4)
Example 9: test_to_dataframe_w_empty_results_wo_pyarrow

# Required import: import pyarrow [as alias]
# Or: from pyarrow import schema [as alias]
def test_to_dataframe_w_empty_results_wo_pyarrow(self):
    from google.cloud.bigquery.schema import SchemaField

    with mock.patch("google.cloud.bigquery.table.pyarrow", None):
        schema = [
            SchemaField("name", "STRING", mode="REQUIRED"),
            SchemaField("age", "INTEGER", mode="REQUIRED"),
        ]
        api_request = mock.Mock(return_value={"rows": []})
        row_iterator = self._make_one(_mock_client(), api_request, schema=schema)

        df = row_iterator.to_dataframe()

    self.assertIsInstance(df, pandas.DataFrame)
    self.assertEqual(len(df), 0)  # verify the number of rows
    self.assertEqual(list(df), ["name", "age"])  # verify the column names
Example 10: test_to_dataframe_w_no_results_wo_pyarrow

# Required import: import pyarrow [as alias]
# Or: from pyarrow import schema [as alias]
def test_to_dataframe_w_no_results_wo_pyarrow(self):
    from google.cloud.bigquery.schema import SchemaField

    with mock.patch("google.cloud.bigquery.table.pyarrow", None):
        schema = [
            SchemaField("name", "STRING", mode="REQUIRED"),
            SchemaField("age", "INTEGER", mode="REQUIRED"),
        ]
        api_request = mock.Mock(return_value={"rows": []})
        row_iterator = self._make_one(_mock_client(), api_request, schema=schema)

        def empty_iterable(dtypes=None):
            return []

        row_iterator.to_dataframe_iterable = empty_iterable

        df = row_iterator.to_dataframe()

    self.assertIsInstance(df, pandas.DataFrame)
    self.assertEqual(len(df), 0)  # verify the number of rows
    self.assertEqual(list(df), ["name", "age"])  # verify the column names
Example 11: test_to_dataframe_error_if_pandas_is_none

# Required import: import pyarrow [as alias]
# Or: from pyarrow import schema [as alias]
def test_to_dataframe_error_if_pandas_is_none(self):
    from google.cloud.bigquery.schema import SchemaField

    schema = [
        SchemaField("name", "STRING", mode="REQUIRED"),
        SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]
    rows = [
        {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
        {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
    ]
    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})
    row_iterator = self._make_one(_mock_client(), api_request, path, schema)

    with self.assertRaises(ValueError):
        row_iterator.to_dataframe()
Example 12: test_to_dataframe_w_bqstorage_no_streams

# Required import: import pyarrow [as alias]
# Or: from pyarrow import schema [as alias]
def test_to_dataframe_w_bqstorage_no_streams(self):
    from google.cloud.bigquery import schema
    from google.cloud.bigquery import table as mut

    bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient)
    session = bigquery_storage_v1.types.ReadSession()
    bqstorage_client.create_read_session.return_value = session

    row_iterator = mut.RowIterator(
        _mock_client(),
        api_request=None,
        path=None,
        schema=[
            schema.SchemaField("colA", "INTEGER"),
            schema.SchemaField("colC", "FLOAT"),
            schema.SchemaField("colB", "STRING"),
        ],
        table=mut.TableReference.from_string("proj.dset.tbl"),
    )

    got = row_iterator.to_dataframe(bqstorage_client)

    column_names = ["colA", "colC", "colB"]
    self.assertEqual(list(got), column_names)
    self.assertTrue(got.empty)
Example 13: test_to_dataframe_w_bqstorage_partition

# Required import: import pyarrow [as alias]
# Or: from pyarrow import schema [as alias]
def test_to_dataframe_w_bqstorage_partition(self):
    from google.cloud.bigquery import schema
    from google.cloud.bigquery import table as mut

    bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient)

    row_iterator = mut.RowIterator(
        _mock_client(),
        None,  # api_request: ignored
        None,  # path: ignored
        [schema.SchemaField("colA", "IGNORED")],
        table=mut.TableReference.from_string("proj.dset.tbl$20181225"),
    )

    with pytest.raises(ValueError):
        row_iterator.to_dataframe(bqstorage_client)
Example 14: test_to_dataframe_w_bqstorage_snapshot

# Required import: import pyarrow [as alias]
# Or: from pyarrow import schema [as alias]
def test_to_dataframe_w_bqstorage_snapshot(self):
    from google.cloud.bigquery import schema
    from google.cloud.bigquery import table as mut

    bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient)

    row_iterator = mut.RowIterator(
        _mock_client(),
        None,  # api_request: ignored
        None,  # path: ignored
        [schema.SchemaField("colA", "IGNORED")],
        table=mut.TableReference.from_string("proj.dset.tbl@1234567890000"),
    )

    with pytest.raises(ValueError):
        row_iterator.to_dataframe(bqstorage_client)
Example 15: bq_to_arrow_data_type

# Required import: import pyarrow [as alias]
# Or: from pyarrow import schema [as alias]
# Assumes the surrounding module also provides:
#   from google.cloud.bigquery import schema
#   BQ_TO_ARROW_SCALARS (dict of BigQuery type name -> pyarrow type constructor)
#   bq_to_arrow_struct_data_type (companion helper for STRUCT/RECORD columns)
import pyarrow

def bq_to_arrow_data_type(field):
    """Return the Arrow data type corresponding to a given BigQuery column.

    Returns:
        None: if default Arrow type inspection should be used.
    """
    if field.mode is not None and field.mode.upper() == "REPEATED":
        inner_type = bq_to_arrow_data_type(
            schema.SchemaField(field.name, field.field_type, fields=field.fields)
        )
        if inner_type:
            return pyarrow.list_(inner_type)
        return None

    field_type_upper = field.field_type.upper() if field.field_type else ""
    if field_type_upper in schema._STRUCT_TYPES:
        return bq_to_arrow_struct_data_type(field)

    data_type_constructor = BQ_TO_ARROW_SCALARS.get(field_type_upper)
    if data_type_constructor is None:
        return None
    return data_type_constructor()
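A hedged usage sketch, assuming the module context described above (in google-cloud-bigquery's helpers, BQ_TO_ARROW_SCALARS maps, e.g., "INTEGER" to pyarrow.int64); the field name is illustrative:

from google.cloud.bigquery.schema import SchemaField

# A REPEATED column should come back wrapped in a pyarrow list type.
field = SchemaField("scores", "INTEGER", mode="REPEATED")
arrow_type = bq_to_arrow_data_type(field)
print(arrow_type)  # expected: list<item: int64>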