This page collects typical usage examples of the Python function pyspark.sql.types._verify_type, taken from open-source code. If you are unsure what _verify_type does, how to call it, or what working usages look like, the curated examples below should help.
Seven code examples of _verify_type are shown, ordered by popularity.
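Before the examples, a minimal sketch of what _verify_type does may help: it checks a Python object against a Spark SQL DataType and raises an exception when they do not match. This sketch assumes an older PySpark (1.x) where _verify_type is still a module-level helper in pyspark.sql.types; the exact exception class raised can vary by version, so it is caught broadly.

    # Minimal sketch, assuming pyspark.sql.types._verify_type exists (PySpark 1.x).
    from pyspark.sql.types import _verify_type, StructType, StructField, IntegerType, StringType

    schema = StructType([StructField("name", StringType(), False),
                         StructField("age", IntegerType(), False)])

    _verify_type(("Alice", 1), schema)    # matches the schema: returns None

    try:
        _verify_type(("Alice", 1, "extra"), schema)   # wrong number of fields
    except Exception as e:                            # exception class may vary by version
        print(type(e).__name__, e)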
Example 1: _createFromRDD
def _createFromRDD(self, rdd, schema, samplingRatio):
    """
    Create an RDD for DataFrame from an existing RDD, returns the RDD and schema.
    """
    if schema is None or isinstance(schema, (list, tuple)):
        struct = self._inferSchema(rdd, samplingRatio)
        converter = _create_converter(struct)
        rdd = rdd.map(converter)
        if isinstance(schema, (list, tuple)):
            for i, name in enumerate(schema):
                struct.fields[i].name = name
                struct.names[i] = name
        schema = struct
    elif isinstance(schema, StructType):
        # take the first few rows to verify schema
        rows = rdd.take(10)
        for row in rows:
            _verify_type(row, schema)
    else:
        raise TypeError("schema should be StructType or list or None, but got: %s" % schema)

    # convert python objects to sql data
    rdd = rdd.map(schema.toInternal)
    return rdd, schema
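Note the design choice in the StructType branch: only the first ten rows (rdd.take(10)) are passed to _verify_type, so a mismatched row further down the RDD is not caught at this point. Below is a hedged sketch of how this private helper is normally reached from user code, via createDataFrame with an explicit StructType; it assumes a SparkContext sc and an SQLContext sqlContext are already available, as in the doctests later on this page, and the data is purely illustrative.

    # Illustrative only: createDataFrame with an explicit StructType ends up in
    # _createFromRDD, which verifies a sample of rows with _verify_type.
    from pyspark.sql.types import StructType, StructField, StringType, IntegerType

    schema = StructType([StructField("name", StringType(), False),
                         StructField("age", IntegerType(), False)])
    rdd = sc.parallelize([("Alice", 1), ("Bob", 2)])
    df = sqlContext.createDataFrame(rdd, schema)   # sampled rows are checked against schema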
Example 2: _createFromLocal
def _createFromLocal(self, data, schema):
    """
    Create an RDD for DataFrame from a list or pandas.DataFrame, returns
    the RDD and schema.
    """
    if has_pandas and isinstance(data, pandas.DataFrame):
        if schema is None:
            schema = [str(x) for x in data.columns]
        data = [r.tolist() for r in data.to_records(index=False)]

    # make sure data can be consumed multiple times
    if not isinstance(data, list):
        data = list(data)

    if schema is None or isinstance(schema, (list, tuple)):
        struct = self._inferSchemaFromList(data)
        if isinstance(schema, (list, tuple)):
            for i, name in enumerate(schema):
                struct.fields[i].name = name
                struct.names[i] = name
        schema = struct
    elif isinstance(schema, StructType):
        for row in data:
            _verify_type(row, schema)
    else:
        raise TypeError("schema should be StructType or list or None, but got: %s" % schema)

    # convert python objects to sql data
    data = [schema.toInternal(row) for row in data]
    return self._sc.parallelize(data), schema
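A brief illustration of the pandas branch above: when data is a pandas.DataFrame and no schema is given, the column names are reused and each record is converted to a plain list before inference (or, with a StructType, before _verify_type). This is a sketch only, assuming pandas is installed and sqlContext is an existing SQLContext.

    import pandas as pd

    pdf = pd.DataFrame({"name": ["Alice"], "age": [1]})
    # schema is None, so column names come from pdf.columns and rows are
    # flattened with to_records(index=False) before inference/verification.
    df = sqlContext.createDataFrame(pdf)
    df.collect()   # e.g. [Row(age=1, name=u'Alice')] (column order may differ)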
Example 3: applySchema
def applySchema(self, rdd, schema):
    """
    Applies the given schema to the given RDD of L{tuple} or L{list}.

    .. note:: Deprecated in 1.3, use :func:`createDataFrame` instead.

    These tuples or lists can contain complex nested structures like
    lists, maps or nested rows.

    The schema should be a StructType. It is important that the schema
    matches the types of the objects in each row, or exceptions could be
    thrown at runtime.

    >>> from pyspark.sql.types import *
    >>> rdd2 = sc.parallelize([(1, "row1"), (2, "row2"), (3, "row3")])
    >>> schema = StructType([StructField("field1", IntegerType(), False),
    ...                      StructField("field2", StringType(), False)])
    >>> df = sqlCtx.applySchema(rdd2, schema)
    >>> df.collect()
    [Row(field1=1, field2=u'row1'),..., Row(field1=3, field2=u'row3')]
    """
    if isinstance(rdd, DataFrame):
        raise TypeError("Cannot apply schema to DataFrame")

    if not isinstance(schema, StructType):
        raise TypeError("schema should be StructType, but got %s" % schema)

    # take the first few rows to verify schema
    rows = rdd.take(10)
    # Row() cannot be deserialized by Pyrolite
    if rows and isinstance(rows[0], tuple) and rows[0].__class__.__name__ == 'Row':
        rdd = rdd.map(tuple)
        rows = rdd.take(10)
    for row in rows:
        _verify_type(row, schema)

    # convert python objects to sql data
    converter = _python_to_sql_converter(schema)
    rdd = rdd.map(converter)
    jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
    df = self._ssql_ctx.applySchemaToPythonRDD(jrdd.rdd(), schema.json())
    return DataFrame(df, self)
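Since applySchema is deprecated as of 1.3, the equivalent call through the replacement API is simply createDataFrame with the same RDD and StructType. Using the objects from the docstring above:

    # Equivalent to the deprecated sqlCtx.applySchema(rdd2, schema):
    df = sqlCtx.createDataFrame(rdd2, schema)   # same verification path, same resulting rows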
Example 4: test_udt
def test_udt(self):
    from pyspark.sql.types import _parse_datatype_json_string, _infer_type, _verify_type
    from pyspark.sql.tests import ExamplePointUDT, ExamplePoint

    def check_datatype(datatype):
        pickled = pickle.loads(pickle.dumps(datatype))
        assert datatype == pickled
        scala_datatype = self.sqlCtx._ssql_ctx.parseDataType(datatype.json())
        python_datatype = _parse_datatype_json_string(scala_datatype.json())
        assert datatype == python_datatype

    check_datatype(ExamplePointUDT())
    structtype_with_udt = StructType([StructField("label", DoubleType(), False),
                                      StructField("point", ExamplePointUDT(), False)])
    check_datatype(structtype_with_udt)
    p = ExamplePoint(1.0, 2.0)
    self.assertEqual(_infer_type(p), ExamplePointUDT())
    _verify_type(ExamplePoint(1.0, 2.0), ExamplePointUDT())
    self.assertRaises(ValueError, lambda: _verify_type([1.0, 2.0], ExamplePointUDT()))

    check_datatype(PythonOnlyUDT())
    structtype_with_udt = StructType([StructField("label", DoubleType(), False),
                                      StructField("point", PythonOnlyUDT(), False)])
    check_datatype(structtype_with_udt)
    p = PythonOnlyPoint(1.0, 2.0)
    self.assertEqual(_infer_type(p), PythonOnlyUDT())
    _verify_type(PythonOnlyPoint(1.0, 2.0), PythonOnlyUDT())
    self.assertRaises(ValueError, lambda: _verify_type([1.0, 2.0], PythonOnlyUDT()))
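The ValueError assertions above pass because, for a user-defined type, _verify_type checks that the object advertises a matching UDT (for example through a __UDT__ attribute on classes such as ExamplePoint), while a plain list like [1.0, 2.0] does not. Below is a rough, hypothetical sketch of that pairing, modeled loosely on the test classes from pyspark.sql.tests; the exact UserDefinedType API is an assumption and may differ across PySpark versions.

    # Rough, hypothetical sketch: a user class is tied to its UDT via __UDT__,
    # which is what _verify_type inspects; a bare list has no __UDT__.
    from pyspark.sql.types import UserDefinedType, ArrayType, DoubleType

    class PointUDT(UserDefinedType):                 # hypothetical UDT for illustration
        @classmethod
        def sqlType(cls):
            return ArrayType(DoubleType(), False)    # internal SQL representation
        def serialize(self, obj):
            return [obj.x, obj.y]
        def deserialize(self, datum):
            return Point(datum[0], datum[1])

    class Point(object):
        __UDT__ = PointUDT()                         # ties the class to its UDT
        def __init__(self, x, y):
            self.x, self.y = x, y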
Example 5: createDataFrame
def createDataFrame(self, data, schema=None, samplingRatio=None):
    """
    Creates a :class:`DataFrame` from an :class:`RDD` of :class:`tuple`/:class:`list`,
    list or :class:`pandas.DataFrame`.

    When ``schema`` is a list of column names, the type of each column
    will be inferred from ``data``.

    When ``schema`` is ``None``, it will try to infer the schema (column names and types)
    from ``data``, which should be an RDD of :class:`Row`,
    or :class:`namedtuple`, or :class:`dict`.

    If schema inference is needed, ``samplingRatio`` is used to determine the ratio of
    rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``.

    :param data: an RDD of :class:`Row`/:class:`tuple`/:class:`list`/:class:`dict`,
        :class:`list`, or :class:`pandas.DataFrame`.
    :param schema: a :class:`StructType` or list of column names. default None.
    :param samplingRatio: the sample ratio of rows used for inferring

    >>> l = [('Alice', 1)]
    >>> sqlContext.createDataFrame(l).collect()
    [Row(_1=u'Alice', _2=1)]
    >>> sqlContext.createDataFrame(l, ['name', 'age']).collect()
    [Row(name=u'Alice', age=1)]

    >>> d = [{'name': 'Alice', 'age': 1}]
    >>> sqlContext.createDataFrame(d).collect()
    [Row(age=1, name=u'Alice')]

    >>> rdd = sc.parallelize(l)
    >>> sqlContext.createDataFrame(rdd).collect()
    [Row(_1=u'Alice', _2=1)]
    >>> df = sqlContext.createDataFrame(rdd, ['name', 'age'])
    >>> df.collect()
    [Row(name=u'Alice', age=1)]

    >>> from pyspark.sql import Row
    >>> Person = Row('name', 'age')
    >>> person = rdd.map(lambda r: Person(*r))
    >>> df2 = sqlContext.createDataFrame(person)
    >>> df2.collect()
    [Row(name=u'Alice', age=1)]

    >>> from pyspark.sql.types import *
    >>> schema = StructType([
    ...     StructField("name", StringType(), True),
    ...     StructField("age", IntegerType(), True)])
    >>> df3 = sqlContext.createDataFrame(rdd, schema)
    >>> df3.collect()
    [Row(name=u'Alice', age=1)]

    >>> sqlContext.createDataFrame(df.toPandas()).collect()  # doctest: +SKIP
    [Row(name=u'Alice', age=1)]
    >>> sqlContext.createDataFrame(pandas.DataFrame([[1, 2]])).collect()  # doctest: +SKIP
    [Row(0=1, 1=2)]
    """
    if isinstance(data, DataFrame):
        raise TypeError("data is already a DataFrame")

    if has_pandas and isinstance(data, pandas.DataFrame):
        if schema is None:
            schema = [str(x) for x in data.columns]
        data = [r.tolist() for r in data.to_records(index=False)]

    if not isinstance(data, RDD):
        try:
            # data could be list, tuple, generator ...
            rdd = self._sc.parallelize(data)
        except Exception:
            raise ValueError("cannot create an RDD from type: %s" % type(data))
    else:
        rdd = data

    if schema is None:
        schema = self._inferSchema(rdd, samplingRatio)
        converter = _create_converter(schema)
        rdd = rdd.map(converter)

    if isinstance(schema, (list, tuple)):
        first = rdd.first()
        if not isinstance(first, (list, tuple)):
            raise ValueError("each row in `rdd` should be list or tuple, "
                             "but got %r" % type(first))
        row_cls = Row(*schema)
        schema = self._inferSchema(rdd.map(lambda r: row_cls(*r)), samplingRatio)

    # take the first few rows to verify schema
    rows = rdd.take(10)
    # Row() cannot be deserialized by Pyrolite
    if rows and isinstance(rows[0], tuple) and rows[0].__class__.__name__ == 'Row':
        rdd = rdd.map(tuple)
        rows = rdd.take(10)
    for row in rows:
        _verify_type(row, schema)

    # convert python objects to sql data
    converter = _python_to_sql_converter(schema)
    rdd = rdd.map(converter)
    # ... (the rest of this method is omitted)
Example 6: prepare
def prepare(obj):
    _verify_type(obj, datatype)
    return (obj, )
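This tiny closure appears to come from the branch of createDataFrame (in newer PySpark) where the schema is given as a single DataType rather than a StructType: each object is verified against that type and then wrapped into a one-element tuple, so downstream code can keep treating every record as a row. A hedged sketch of the surrounding usage, with illustrative variable names:

    # Illustrative context for prepare(): schema given as a single DataType.
    from pyspark.sql.types import _verify_type, StringType

    datatype = StringType()

    def prepare(obj):
        _verify_type(obj, datatype)   # raises if obj is not a valid string value
        return (obj, )                # wrap the scalar into a one-column row

    rows = list(map(prepare, ["a", "b", "c"]))   # -> [("a",), ("b",), ("c",)]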
Example 7: applySchema
def applySchema(self, rdd, schema):
    """
    Applies the given schema to the given RDD of L{tuple} or L{list}.

    .. note:: Deprecated in 1.3, use :func:`createDataFrame` instead.

    These tuples or lists can contain complex nested structures like
    lists, maps or nested rows.

    The schema should be a StructType. It is important that the schema
    matches the types of the objects in each row, or exceptions could be
    thrown at runtime.

    >>> from pyspark.sql.types import *
    >>> rdd2 = sc.parallelize([(1, "row1"), (2, "row2"), (3, "row3")])
    >>> schema = StructType([StructField("field1", IntegerType(), False),
    ...                      StructField("field2", StringType(), False)])
    >>> df = sqlCtx.applySchema(rdd2, schema)
    >>> sqlCtx.registerRDDAsTable(df, "table1")
    >>> df2 = sqlCtx.sql("SELECT * from table1")
    >>> df2.collect()
    [Row(field1=1, field2=u'row1'),..., Row(field1=3, field2=u'row3')]

    >>> from datetime import date, datetime
    >>> rdd = sc.parallelize([(127, -128L, -32768, 32767, 2147483647L, 1.0,
    ...     date(2010, 1, 1),
    ...     datetime(2010, 1, 1, 1, 1, 1),
    ...     {"a": 1}, (2,), [1, 2, 3], None)])
    >>> schema = StructType([
    ...     StructField("byte1", ByteType(), False),
    ...     StructField("byte2", ByteType(), False),
    ...     StructField("short1", ShortType(), False),
    ...     StructField("short2", ShortType(), False),
    ...     StructField("int", IntegerType(), False),
    ...     StructField("float", FloatType(), False),
    ...     StructField("date", DateType(), False),
    ...     StructField("time", TimestampType(), False),
    ...     StructField("map",
    ...         MapType(StringType(), IntegerType(), False), False),
    ...     StructField("struct",
    ...         StructType([StructField("b", ShortType(), False)]), False),
    ...     StructField("list", ArrayType(ByteType(), False), False),
    ...     StructField("null", DoubleType(), True)])
    >>> df = sqlCtx.applySchema(rdd, schema)
    >>> results = df.map(
    ...     lambda x: (x.byte1, x.byte2, x.short1, x.short2, x.int, x.float, x.date,
    ...         x.time, x.map["a"], x.struct.b, x.list, x.null))
    >>> results.collect()[0] # doctest: +NORMALIZE_WHITESPACE
    (127, -128, -32768, 32767, 2147483647, 1.0, datetime.date(2010, 1, 1),
     datetime.datetime(2010, 1, 1, 1, 1, 1), 1, 2, [1, 2, 3], None)

    >>> df.registerTempTable("table2")
    >>> sqlCtx.sql(
    ...     "SELECT byte1 - 1 AS byte1, byte2 + 1 AS byte2, " +
    ...     "short1 + 1 AS short1, short2 - 1 AS short2, int - 1 AS int, " +
    ...     "float + 1.5 as float FROM table2").collect()
    [Row(byte1=126, byte2=-127, short1=-32767, short2=32766, int=2147483646, float=2.5)]

    >>> from pyspark.sql.types import _parse_schema_abstract, _infer_schema_type
    >>> rdd = sc.parallelize([(127, -32768, 1.0,
    ...     datetime(2010, 1, 1, 1, 1, 1),
    ...     {"a": 1}, (2,), [1, 2, 3])])
    >>> abstract = "byte short float time map{} struct(b) list[]"
    >>> schema = _parse_schema_abstract(abstract)
    >>> typedSchema = _infer_schema_type(rdd.first(), schema)
    >>> df = sqlCtx.applySchema(rdd, typedSchema)
    >>> df.collect()
    [Row(byte=127, short=-32768, float=1.0, time=..., list=[1, 2, 3])]
    """
    if isinstance(rdd, DataFrame):
        raise TypeError("Cannot apply schema to DataFrame")

    if not isinstance(schema, StructType):
        raise TypeError("schema should be StructType, but got %s" % schema)

    # take the first few rows to verify schema
    rows = rdd.take(10)
    # Row() cannot be deserialized by Pyrolite
    if rows and isinstance(rows[0], tuple) and rows[0].__class__.__name__ == 'Row':
        rdd = rdd.map(tuple)
        rows = rdd.take(10)
    for row in rows:
        _verify_type(row, schema)

    # convert python objects to sql data
    converter = _python_to_sql_converter(schema)
    rdd = rdd.map(converter)
    jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
    df = self._ssql_ctx.applySchemaToPythonRDD(jrdd.rdd(), schema.json())
    return DataFrame(df, self)
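Finally, a hedged sketch of what the schema verification in these methods buys you: a row that does not match the declared StructType is rejected up front, rather than failing later on the JVM side. The exact exception type can vary across versions, so it is caught broadly here; sc and sqlCtx are assumed to exist as in the doctests above, and the data is illustrative.

    from pyspark.sql.types import StructType, StructField, IntegerType, StringType

    schema = StructType([StructField("field1", IntegerType(), False),
                         StructField("field2", StringType(), False)])
    bad_rdd = sc.parallelize([("not-an-int", 42)])   # fields in the wrong order/type

    try:
        sqlCtx.applySchema(bad_rdd, schema)          # _verify_type rejects the sampled row
    except Exception as e:                           # TypeError/ValueError depending on version
        print("schema verification failed:", e)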