本文整理汇总了 Python 中 pyspark.sql.types._merge_type 函数的典型用法代码示例。如果您正苦于以下问题:Python _merge_type 函数的具体用法?Python _merge_type 怎么用?Python _merge_type 使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了_merge_type函数的4个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _inferSchema
def _inferSchema(self, rdd, samplingRatio=None):
    """
    Infer a schema from an RDD of Row or tuple.

    Without a sampling ratio, the schema comes from the first row and is
    refined with further rows (out of the first 100) only while it still
    contains a null type. With a sampling ratio, the schema of every row
    in the (possibly sampled) RDD is inferred and merged.

    :param rdd: an RDD of Row or tuple
    :param samplingRatio: sampling ratio, or no sampling (default)
    :return: StructType
    :raises ValueError: if the first row is empty, or if some types are
        still undetermined after looking at the first 100 rows
    """
    head = rdd.first()
    if not head:
        raise ValueError("The first row in RDD is empty, "
                         "can not infer schema")
    if type(head) is dict:
        warnings.warn("Using RDD of dict to inferSchema is deprecated. "
                      "Use pyspark.sql.Row instead")

    if samplingRatio is not None:
        # Ratios of 0.99 and above skip sampling and use the whole RDD.
        source = rdd
        if samplingRatio < 0.99:
            source = rdd.sample(False, float(samplingRatio))
        return source.map(_infer_schema).reduce(_merge_type)

    schema = _infer_schema(head)
    if not _has_nulltype(schema):
        return schema

    # The first row left some types undetermined: fold in the following
    # rows one by one and stop as soon as nothing is left unresolved.
    for record in rdd.take(100)[1:]:
        schema = _merge_type(schema, _infer_schema(record))
        if not _has_nulltype(schema):
            return schema

    raise ValueError("Some of types cannot be determined by the "
                     "first 100 rows, please try again with sampling")
示例2: _inferSchemaFromList
def _inferSchemaFromList(self, data):
    """
    Infer a schema from a list of Row or tuple.

    The schema of the first element is used as-is when it contains no
    null type; otherwise the elements are merged in one at a time until
    every type is determined.

    :param data: list of Row or tuple
    :return: StructType
    :raises ValueError: if ``data`` is empty, or if some types remain
        undetermined after every element has been merged
    """
    if not data:
        raise ValueError("can not infer schema from empty dataset")

    head = data[0]
    if type(head) is dict:
        warnings.warn("inferring schema from dict is deprecated,"
                      "please use pyspark.sql.Row instead")

    schema = _infer_schema(head)
    if not _has_nulltype(schema):
        return schema

    # Walk the whole list (first element included) and return at the
    # first point where no null type is left in the merged schema.
    for record in data:
        schema = _merge_type(schema, _infer_schema(record))
        if not _has_nulltype(schema):
            return schema

    raise ValueError("Some of types cannot be determined after inferring")
示例3: test_merge_type
def test_merge_type(self):
    """Exercise _merge_type on matching and conflicting type pairs.

    Matching pairs must merge to an equal type; conflicting pairs must
    raise TypeError whose message matches the expected location pattern
    (e.g. 'field f2 in field f1').
    """

    def struct(*fields):
        # Build a fresh StructType from (name, dataType) pairs.
        return StructType([StructField(name, dtype) for name, dtype in fields])

    def long_map():
        return MapType(StringType(), LongType())

    def check_same(build):
        # Two independently built, identical types merge to an equal type.
        self.assertEqual(_merge_type(build(), build()), build())

    def conflict(pattern):
        return self.assertRaisesRegexp(TypeError, pattern)

    # Primitive types, including merging with NullType on either side.
    self.assertEqual(_merge_type(LongType(), NullType()), LongType())
    self.assertEqual(_merge_type(NullType(), LongType()), LongType())
    check_same(LongType)

    # Arrays.
    check_same(lambda: ArrayType(LongType()))
    with conflict('element in array'):
        _merge_type(ArrayType(LongType()), ArrayType(DoubleType()))

    # Maps.
    check_same(long_map)
    with conflict('key of map'):
        _merge_type(long_map(), MapType(DoubleType(), LongType()))
    with conflict('value of map'):
        _merge_type(long_map(), MapType(StringType(), DoubleType()))

    # Flat structs.
    check_same(lambda: struct(("f1", LongType()), ("f2", StringType())))
    with conflict('field f1'):
        _merge_type(
            struct(("f1", LongType()), ("f2", StringType())),
            struct(("f1", DoubleType()), ("f2", StringType())))

    # Struct nested in a struct.
    check_same(lambda: struct(("f1", struct(("f2", LongType())))))
    with conflict('field f2 in field f1'):
        _merge_type(
            struct(("f1", struct(("f2", LongType())))),
            struct(("f1", struct(("f2", StringType())))))

    # Array nested in a struct.
    check_same(
        lambda: struct(("f1", ArrayType(LongType())), ("f2", StringType())))
    with conflict('element in array field f1'):
        _merge_type(
            struct(("f1", ArrayType(LongType())), ("f2", StringType())),
            struct(("f1", ArrayType(DoubleType())), ("f2", StringType())))

    # Map nested in a struct.
    check_same(lambda: struct(("f1", long_map()), ("f2", StringType())))
    with conflict('value of map field f1'):
        _merge_type(
            struct(("f1", long_map()), ("f2", StringType())),
            struct(("f1", MapType(StringType(), DoubleType())),
                   ("f2", StringType())))

    # Map nested in an array nested in a struct.
    check_same(lambda: struct(("f1", ArrayType(long_map()))))
    with conflict('key of map element in array field f1'):
        _merge_type(
            struct(("f1", ArrayType(long_map()))),
            struct(("f1", ArrayType(MapType(DoubleType(), LongType()))))
        )
示例4: inferSchema
def inferSchema(self, rdd, samplingRatio=None):
    """Infer and apply a schema to an RDD of L{Row}.

    When samplingRatio is specified, the schema is inferred by looking
    at the types of each row in the sampled dataset. Otherwise, the
    first 100 rows of the RDD are inspected. Nested collections are
    supported, which can include array, dict, list, Row, tuple,
    namedtuple, or object.

    Each row could be L{pyspark.sql.Row} object or namedtuple or objects.
    Using top level dicts is deprecated, as dict is used to represent Maps.

    If a single column has multiple distinct inferred types, it may cause
    runtime exceptions.

    :param rdd: the RDD whose rows are inspected; must not be a DataFrame
    :param samplingRatio: fraction of rows to sample for inference, or
        None (default) to look at the first 100 rows only. Ratios of
        0.99 and above use every row instead of sampling.
    :return: the result of ``self.applySchema`` on the converted RDD and
        the inferred schema
    :raises TypeError: if ``rdd`` is a DataFrame
    :raises ValueError: if the first row of ``rdd`` is empty

    >>> rdd = sc.parallelize(
    ...     [Row(field1=1, field2="row1"),
    ...      Row(field1=2, field2="row2"),
    ...      Row(field1=3, field2="row3")])
    >>> df = sqlCtx.inferSchema(rdd)
    >>> df.collect()[0]
    Row(field1=1, field2=u'row1')

    >>> NestedRow = Row("f1", "f2")
    >>> nestedRdd1 = sc.parallelize([
    ...     NestedRow(array('i', [1, 2]), {"row1": 1.0}),
    ...     NestedRow(array('i', [2, 3]), {"row2": 2.0})])
    >>> df = sqlCtx.inferSchema(nestedRdd1)
    >>> df.collect()
    [Row(f1=[1, 2], f2={u'row1': 1.0}), ..., f2={u'row2': 2.0})]

    >>> nestedRdd2 = sc.parallelize([
    ...     NestedRow([[1, 2], [2, 3]], [1, 2]),
    ...     NestedRow([[2, 3], [3, 4]], [2, 3])])
    >>> df = sqlCtx.inferSchema(nestedRdd2)
    >>> df.collect()
    [Row(f1=[[1, 2], [2, 3]], f2=[1, 2]), ..., f2=[2, 3])]

    >>> from collections import namedtuple
    >>> CustomRow = namedtuple('CustomRow', 'field1 field2')
    >>> rdd = sc.parallelize(
    ...     [CustomRow(field1=1, field2="row1"),
    ...      CustomRow(field1=2, field2="row2"),
    ...      CustomRow(field1=3, field2="row3")])
    >>> df = sqlCtx.inferSchema(rdd)
    >>> df.collect()[0]
    Row(field1=1, field2=u'row1')
    """
    if isinstance(rdd, DataFrame):
        raise TypeError("Cannot apply schema to DataFrame")

    first = rdd.first()
    if not first:
        raise ValueError("The first row in RDD is empty, "
                         "can not infer schema")
    if type(first) is dict:
        warnings.warn("Using RDD of dict to inferSchema is deprecated,"
                      "please use pyspark.sql.Row instead")

    if samplingRatio is None:
        schema = _infer_schema(first)
        if _has_nulltype(schema):
            # The first row left some types undetermined: merge in the
            # following rows (up to the 100th) until none are left.
            for row in rdd.take(100)[1:]:
                schema = _merge_type(schema, _infer_schema(row))
                if not _has_nulltype(schema):
                    break
            else:
                # Unlike a hard failure, only warn here and carry on
                # with the partially determined schema.
                warnings.warn("Some of types cannot be determined by the "
                              "first 100 rows, please try again with sampling")
    else:
        # Sample only when the ratio is meaningfully below 1; a ratio of
        # 0.99 or more means "use (almost) everything", so sampling would
        # add work for no benefit. The comparison was previously inverted
        # (`> 0.99`), which skipped sampling for every small ratio.
        if samplingRatio < 0.99:
            rdd = rdd.sample(False, float(samplingRatio))
        schema = rdd.map(_infer_schema).reduce(_merge_type)

    # Rows are passed through the converter built for the inferred schema
    # before the schema is applied.
    converter = _create_converter(schema)
    rdd = rdd.map(converter)
    return self.applySchema(rdd, schema)