This article collects typical usage examples of the Python class pyspark.sql.types.StructType. If you are wondering what the StructType class is for, how to use it, or are looking for working examples, the curated class code examples below may help.
The following shows 10 code examples of the StructType class, sorted by popularity by default.
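All of the snippets below rely on the same small set of imports from pyspark.sql.types. As a warm-up, here is a minimal, self-contained sketch (not taken from any of the projects below) that builds a StructType from a list of StructField objects and inspects it:

from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# A two-column schema: a nullable string column and a nullable integer column
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
])

print(schema.simpleString())   # struct<name:string,age:int>
print(schema.fieldNames())     # ['name', 'age']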
Example 1: _get_schema
def _get_schema(header, schema):
    if schema is None or len(schema) == 0:
        # Use header to generate schema
        if header is None or len(header) == 0:
            return None
        elif len(header) > 4:
            warnings.warn(WARNING_MOVIE_LENS_HEADER)
            header = header[:4]
        schema = StructType()
        try:
            schema.add(StructField(header[0], IntegerType())).add(
                StructField(header[1], IntegerType())
            ).add(StructField(header[2], FloatType())).add(
                StructField(header[3], LongType())
            )
        except IndexError:
            pass
    else:
        if header is not None:
            warnings.warn(WARNING_HAVE_SCHEMA_AND_HEADER)
        if len(schema) > 4:
            warnings.warn(WARNING_MOVIE_LENS_HEADER)
            schema = schema[:4]
    return schema
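The function above builds the standard four-column MovieLens schema (user, item, rating, timestamp) when only a header is supplied. A hedged usage sketch follows; the column names are illustrative and the module's warning constants are not shown in the excerpt:

# Hypothetical call with made-up column names
schema = _get_schema(header=("UserId", "MovieId", "Rating", "Timestamp"), schema=None)
print(schema.simpleString())
# struct<UserId:int,MovieId:int,Rating:float,Timestamp:bigint>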
Example 2: get_spark_schema
def get_spark_schema(header=DEFAULT_HEADER):
    ## create schema
    schema = StructType()
    ## do label + ints
    n_ints = 14
    for i in range(n_ints):
        schema.add(StructField(header[i], IntegerType()))
    ## do categoricals
    for i in range(26):
        schema.add(StructField(header[i + n_ints], StringType()))
    return schema
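This helper targets the Criteo layout of one label column, 13 integer feature columns, and 26 categorical columns, so the header must supply 40 names. The module's DEFAULT_HEADER is not shown in the excerpt; the sketch below fabricates a compatible header and passes the resulting schema to a CSV reader so Spark skips schema inference:

# Illustrative header, not the module's DEFAULT_HEADER
header = (
    ["label"]
    + ["int{0:02d}".format(i) for i in range(13)]
    + ["cat{0:02d}".format(i) for i in range(26)]
)
schema = get_spark_schema(header)
print(len(schema.fields))  # 40

# df = spark.read.csv("criteo_sample.tsv", sep="\t", schema=schema)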
Example 3: test_rmse
def test_rmse():
    # TODO: revise so that it takes user inputs instead of hardcoded values

    movies_schema = None
    ratings_schema = None

    # load the schemas
    with open("movielens_20m_movies_schema.json", "r") as json_schema_file:
        movies_schema = StructType.fromJson(json.load(json_schema_file))
    with open("movielens_20m_ratings_schema.json", "r") as json_schema_file:
        ratings_schema = StructType.fromJson(json.load(json_schema_file))

    # create an HDFS directory
    os.system("hdfs dfs -mkdir datasets")
    # load the json file into the hdfs directory
    os.system("hdfs dfs -put movielens_10m_ratings.json.gz datasets/movielens_10m_ratings.json.gz")

    # create a DataFrame based on the content of the json file
    ratingsDF = scsingleton.sqlCtx.read.json(
        "hdfs://localhost:9000/datasets/movielens_10m_ratings.json.gz",
        schema=ratings_schema,
    )
    # explicitly repartition the RDD after loading so that more tasks can run on it in parallel
    # by default, defaultMinPartitions == defaultParallelism
    # == estimated # of cores across all of the machines in your cluster
    ratingsDF = ratingsDF.repartition(scsingleton.sc.defaultParallelism * 3)

    # parse the ratings DataFrame into an RDD of [(userId, itemId, rating)]
    ratingsRDD = ratingsDF.map(lambda row: (row.user_id, row.movie_id, row.rating))
    ratingsRDD.cache()

    # split data into train (60%) and test (40%)
    # TODO: add validation in the future? train (60%), validation (20%), test (20%)?
    trainingRDD, testRDD = ratingsRDD.randomSplit([0.6, 0.4])
    trainingRDD.cache()
    testRDD.cache()

    # run the training algorithm to build the model (without validation)
    with Timer() as t:
        model = ALS.train(trainingRDD, rank=3)
    print "ALS.train(trainingRDD, rank=3): %s seconds" % t.secs

    # make a prediction
    with Timer() as t:
        testPredRDD = model.predictAll(testRDD.map(lambda x: (x[0], x[1]))).cache()
    print "testPredRDD: %s seconds" % t.secs

    # calculate RMSE
    with Timer() as t:
        testRmse = pm.calculate_rmse_using_rdd(testRDD, testPredRDD)
    print "testRmse: %s seconds" % t.secs
    print "testRmse", testRmse

    return
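The helper pm.calculate_rmse_using_rdd is not shown in the excerpt. A plausible minimal sketch, assuming testRDD holds (user, item, rating) tuples and testPredRDD holds pyspark.mllib.recommendation.Rating records as returned by ALS.predictAll:

from math import sqrt

def calculate_rmse_using_rdd(test_rdd, pred_rdd):
    # Key both RDDs by (user, item), join them, and average the squared error
    actual = test_rdd.map(lambda r: ((r[0], r[1]), r[2]))
    predicted = pred_rdd.map(lambda r: ((r.user, r.product), r.rating))
    squared_errors = actual.join(predicted).map(lambda kv: (kv[1][0] - kv[1][1]) ** 2)
    return sqrt(squared_errors.mean())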
Example 4: __init__
def __init__(self, scoreAndLabels):
    sc = scoreAndLabels.ctx
    sql_ctx = SQLContext.getOrCreate(sc)
    numCol = len(scoreAndLabels.first())
    schema = StructType([
        StructField("score", DoubleType(), nullable=False),
        StructField("label", DoubleType(), nullable=False)])
    if numCol == 3:
        schema.add("weight", DoubleType(), False)
    df = sql_ctx.createDataFrame(scoreAndLabels, schema=schema)
    java_class = sc._jvm.org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
    java_model = java_class(df._jdf)
    super(BinaryClassificationMetrics, self).__init__(java_model)
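The two-column schema covers the usual (score, label) input, and the optional third "weight" column is appended with StructType.add when weighted triples are passed in (supported in newer Spark releases). A brief usage sketch, assuming sc is an existing SparkContext:

from pyspark.mllib.evaluation import BinaryClassificationMetrics

score_and_labels = sc.parallelize([(0.1, 0.0), (0.4, 0.0), (0.8, 1.0), (0.9, 1.0)])
metrics = BinaryClassificationMetrics(score_and_labels)
print(metrics.areaUnderROC)  # 1.0 for this perfectly separable toy data
print(metrics.areaUnderPR)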
Example 5: __init__
def __init__(self, predAndLabelsWithOptWeight):
    sc = predAndLabelsWithOptWeight.ctx
    sql_ctx = SQLContext.getOrCreate(sc)
    numCol = len(predAndLabelsWithOptWeight.first())
    schema = StructType([
        StructField("prediction", DoubleType(), nullable=False),
        StructField("label", DoubleType(), nullable=False)])
    if numCol == 3:
        schema.add("weight", DoubleType(), False)
    df = sql_ctx.createDataFrame(predAndLabelsWithOptWeight, schema)
    java_class = sc._jvm.org.apache.spark.mllib.evaluation.MulticlassMetrics
    java_model = java_class(df._jdf)
    super(MulticlassMetrics, self).__init__(java_model)
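Both metric constructors lean on the fact that StructType.add accepts either a ready-made StructField or a (name, dataType, nullable) triple, and that it returns the schema itself so calls can be chained. A small sketch of the two equivalent forms:

from pyspark.sql.types import StructType, StructField, DoubleType

s1 = StructType().add(StructField("weight", DoubleType(), False))
s2 = StructType().add("weight", DoubleType(), False)
print(s1 == s2)                # True
print(s2["weight"].nullable)   # False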
Example 6: createDataFrame
def createDataFrame(self, data, schema=None, samplingRatio=None):
    """
    Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`.

    When ``schema`` is a list of column names, the type of each column
    will be inferred from ``data``.

    When ``schema`` is ``None``, it will try to infer the schema (column names and types)
    from ``data``, which should be an RDD of :class:`Row`,
    or :class:`namedtuple`, or :class:`dict`.

    When ``schema`` is :class:`DataType` or a datatype string, it must match the real data,
    or an exception will be thrown at runtime. If the given schema is not a StructType, it
    will be wrapped into a StructType as its only field, and the field name will be "value";
    each record will also be wrapped into a tuple, which can be converted to a row later.

    If schema inference is needed, ``samplingRatio`` is used to determine the ratio of
    rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``.

    :param data: an RDD of any kind of SQL data representation (e.g. row, tuple, int, boolean,
        etc.), or :class:`list`, or :class:`pandas.DataFrame`.
    :param schema: a :class:`DataType` or a datatype string or a list of column names, default
        is None. The data type string format equals `DataType.simpleString`, except that the
        top-level struct type can omit the `struct<>` and atomic types use `typeName()` as
        their format, e.g. use `byte` instead of `tinyint` for ByteType. We can also use `int`
        as a short name for IntegerType.
    :param samplingRatio: the sample ratio of rows used for inferring
    :return: :class:`DataFrame`

    .. versionchanged:: 2.0
        The schema parameter can be a DataType or a datatype string after 2.0. If it's not a
        StructType, it will be wrapped into a StructType and each record will also be wrapped
        into a tuple.

    >>> l = [('Alice', 1)]
    >>> spark.createDataFrame(l).collect()
    [Row(_1=u'Alice', _2=1)]
    >>> spark.createDataFrame(l, ['name', 'age']).collect()
    [Row(name=u'Alice', age=1)]

    >>> d = [{'name': 'Alice', 'age': 1}]
    >>> spark.createDataFrame(d).collect()
    [Row(age=1, name=u'Alice')]

    >>> rdd = sc.parallelize(l)
    >>> spark.createDataFrame(rdd).collect()
    [Row(_1=u'Alice', _2=1)]
    >>> df = spark.createDataFrame(rdd, ['name', 'age'])
    >>> df.collect()
    [Row(name=u'Alice', age=1)]

    >>> from pyspark.sql import Row
    >>> Person = Row('name', 'age')
    >>> person = rdd.map(lambda r: Person(*r))
    >>> df2 = spark.createDataFrame(person)
    >>> df2.collect()
    [Row(name=u'Alice', age=1)]

    >>> from pyspark.sql.types import *
    >>> schema = StructType([
    ...     StructField("name", StringType(), True),
    ...     StructField("age", IntegerType(), True)])
    >>> df3 = spark.createDataFrame(rdd, schema)
    >>> df3.collect()
    [Row(name=u'Alice', age=1)]

    >>> spark.createDataFrame(df.toPandas()).collect()  # doctest: +SKIP
    [Row(name=u'Alice', age=1)]
    >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect()  # doctest: +SKIP
    [Row(0=1, 1=2)]

    >>> spark.createDataFrame(rdd, "a: string, b: int").collect()
    [Row(a=u'Alice', b=1)]
    >>> rdd = rdd.map(lambda row: row[1])
    >>> spark.createDataFrame(rdd, "int").collect()
    [Row(value=1)]
    >>> spark.createDataFrame(rdd, "boolean").collect()  # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
        ...
    Py4JJavaError: ...
    """
    if isinstance(data, DataFrame):
        raise TypeError("data is already a DataFrame")

    if isinstance(schema, basestring):
        schema = _parse_datatype_string(schema)

    try:
        import pandas
        has_pandas = True
    except Exception:
        has_pandas = False
    if has_pandas and isinstance(data, pandas.DataFrame):
        if schema is None:
            schema = [str(x) for x in data.columns]
        data = [r.tolist() for r in data.to_records(index=False)]

    if isinstance(schema, StructType):
        def prepare(obj):
            _verify_type(obj, schema)
#......... part of the code is omitted here .........
Example 7: createDataFrame
def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True):
    """
    Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`.

    When ``schema`` is a list of column names, the type of each column
    will be inferred from ``data``.

    When ``schema`` is ``None``, it will try to infer the schema (column names and types)
    from ``data``, which should be an RDD of :class:`Row`,
    or :class:`namedtuple`, or :class:`dict`.

    When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match
    the real data, or an exception will be thrown at runtime. If the given schema is not
    :class:`pyspark.sql.types.StructType`, it will be wrapped into a
    :class:`pyspark.sql.types.StructType` as its only field, and the field name will be
    "value"; each record will also be wrapped into a tuple, which can be converted to a
    row later.

    If schema inference is needed, ``samplingRatio`` is used to determine the ratio of
    rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``.

    :param data: an RDD of any kind of SQL data representation (e.g. row, tuple, int, boolean,
        etc.), or :class:`list`, or :class:`pandas.DataFrame`.
    :param schema: a :class:`pyspark.sql.types.DataType` or a datatype string or a list of
        column names, default is ``None``. The data type string format equals
        :class:`pyspark.sql.types.DataType.simpleString`, except that the top-level struct
        type can omit the ``struct<>`` and atomic types use ``typeName()`` as their format,
        e.g. use ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`.
        We can also use ``int`` as a short name for ``IntegerType``.
    :param samplingRatio: the sample ratio of rows used for inferring
    :param verifySchema: verify data types of every row against the schema.
    :return: :class:`DataFrame`

    .. versionchanged:: 2.1
        Added verifySchema.

    .. note:: Usage with spark.sql.execution.arrow.enabled=True is experimental.

    >>> l = [('Alice', 1)]
    >>> spark.createDataFrame(l).collect()
    [Row(_1=u'Alice', _2=1)]
    >>> spark.createDataFrame(l, ['name', 'age']).collect()
    [Row(name=u'Alice', age=1)]

    >>> d = [{'name': 'Alice', 'age': 1}]
    >>> spark.createDataFrame(d).collect()
    [Row(age=1, name=u'Alice')]

    >>> rdd = sc.parallelize(l)
    >>> spark.createDataFrame(rdd).collect()
    [Row(_1=u'Alice', _2=1)]
    >>> df = spark.createDataFrame(rdd, ['name', 'age'])
    >>> df.collect()
    [Row(name=u'Alice', age=1)]

    >>> from pyspark.sql import Row
    >>> Person = Row('name', 'age')
    >>> person = rdd.map(lambda r: Person(*r))
    >>> df2 = spark.createDataFrame(person)
    >>> df2.collect()
    [Row(name=u'Alice', age=1)]

    >>> from pyspark.sql.types import *
    >>> schema = StructType([
    ...     StructField("name", StringType(), True),
    ...     StructField("age", IntegerType(), True)])
    >>> df3 = spark.createDataFrame(rdd, schema)
    >>> df3.collect()
    [Row(name=u'Alice', age=1)]

    >>> spark.createDataFrame(df.toPandas()).collect()  # doctest: +SKIP
    [Row(name=u'Alice', age=1)]
    >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect()  # doctest: +SKIP
    [Row(0=1, 1=2)]

    >>> spark.createDataFrame(rdd, "a: string, b: int").collect()
    [Row(a=u'Alice', b=1)]
    >>> rdd = rdd.map(lambda row: row[1])
    >>> spark.createDataFrame(rdd, "int").collect()
    [Row(value=1)]
    >>> spark.createDataFrame(rdd, "boolean").collect()  # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
        ...
    Py4JJavaError: ...
    """
    SparkSession._activeSession = self
    self._jvm.SparkSession.setActiveSession(self._jsparkSession)
    if isinstance(data, DataFrame):
        raise TypeError("data is already a DataFrame")

    if isinstance(schema, basestring):
        schema = _parse_datatype_string(schema)
    elif isinstance(schema, (list, tuple)):
        # Must re-encode any unicode strings to be consistent with StructField names
        schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in schema]

    try:
        import pandas
        has_pandas = True
    except Exception:
        has_pandas = False
#......... part of the code is omitted here .........
Example 8: _create_from_pandas_with_arrow
def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
    """
    Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
    to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
    data types will be used to coerce the data in Pandas to Arrow conversion.
    """
    from distutils.version import LooseVersion
    from pyspark.serializers import ArrowStreamPandasSerializer
    from pyspark.sql.types import from_arrow_type, to_arrow_type, TimestampType
    from pyspark.sql.utils import require_minimum_pandas_version, \
        require_minimum_pyarrow_version

    require_minimum_pandas_version()
    require_minimum_pyarrow_version()

    from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
    import pyarrow as pa

    # Create the Spark schema from list of names passed in with Arrow types
    if isinstance(schema, (list, tuple)):
        if LooseVersion(pa.__version__) < LooseVersion("0.12.0"):
            temp_batch = pa.RecordBatch.from_pandas(pdf[0:100], preserve_index=False)
            arrow_schema = temp_batch.schema
        else:
            arrow_schema = pa.Schema.from_pandas(pdf, preserve_index=False)
        struct = StructType()
        for name, field in zip(schema, arrow_schema):
            struct.add(name, from_arrow_type(field.type), nullable=field.nullable)
        schema = struct

    # Determine arrow types to coerce data when creating batches
    if isinstance(schema, StructType):
        arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
    elif isinstance(schema, DataType):
        raise ValueError("Single data type %s is not supported with Arrow" % str(schema))
    else:
        # Any timestamps must be coerced to be compatible with Spark
        arrow_types = [to_arrow_type(TimestampType())
                       if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                       for t in pdf.dtypes]

    # Slice the DataFrame to be batched
    step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
    pdf_slices = (pdf[start:start + step] for start in xrange(0, len(pdf), step))

    # Create list of Arrow (columns, type) for serializer dump_stream
    arrow_data = [[(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]
                  for pdf_slice in pdf_slices]

    jsqlContext = self._wrapped._jsqlContext

    safecheck = self._wrapped._conf.arrowSafeTypeConversion()
    col_by_name = True  # col by name only applies to StructType columns, can't happen here
    ser = ArrowStreamPandasSerializer(timezone, safecheck, col_by_name)

    def reader_func(temp_filename):
        return self._jvm.PythonSQLUtils.readArrowStreamFromFile(jsqlContext, temp_filename)

    def create_RDD_server():
        return self._jvm.ArrowRDDServer(jsqlContext)

    # Create Spark DataFrame from Arrow stream file, using one batch per partition
    jrdd = self._sc._serialize_to_jvm(arrow_data, ser, reader_func, create_RDD_server)
    jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(), jsqlContext)
    df = DataFrame(jdf, self._wrapped)
    df._schema = schema
    return df
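This Arrow path is normally reached indirectly through SparkSession.createDataFrame once Arrow conversion is enabled. A hedged sketch of how it is typically triggered in Spark 2.3/2.4 (Spark 3.x renames the setting to spark.sql.execution.arrow.pyspark.enabled), assuming spark is an active SparkSession:

import pandas as pd

spark.conf.set("spark.sql.execution.arrow.enabled", "true")

pdf = pd.DataFrame({"name": ["Alice", "Bob"], "age": [1, 2]})
df = spark.createDataFrame(pdf)  # may take the Arrow fast path shown above
df.printSchema()

pdf2 = df.toPandas()  # the reverse conversion also uses Arrow when enabled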
Example 9: get_twitter_schema
def get_twitter_schema(json_file_name):
    schema_dict = json.load(open(json_file_name))
    schema_struct = StructType.fromJson(schema_dict)
    return schema_struct
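StructType.fromJson expects the dict produced by StructType.jsonValue (equivalently, json.loads of schema.json()), so schemas can be persisted to disk and reloaded later; Example 3 loads its MovieLens schemas the same way. A short round-trip sketch using the helper above:

import json
from pyspark.sql.types import StructType, StructField, StringType, LongType

schema = StructType([
    StructField("user_id", LongType(), True),
    StructField("text", StringType(), True),
])

# Persist the schema as JSON ...
with open("tweet_schema.json", "w") as f:
    f.write(schema.json())

# ... and reload it later
reloaded = get_twitter_schema("tweet_schema.json")
print(reloaded == schema)  # True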
Example 10: generate_schema_dict
COUNTRIES_FILE_PATH = '/opt/SparkDatasets/geography/countries.csv'
CITIES_FILE_PATH    = '/opt/SparkDatasets/geography/cities.csv'

CONTINENT_STRUCTURE = \
    [ ( 'continent_id'  , 'integer' )
    , ( 'continent_name', 'string'  ) ]

COUNTRY_STRUCTURE = \
    [ ( 'country_id'  , 'integer' )
    , ( 'continent_id', 'integer' )
    , ( 'country_name', 'string'  ) ]

CITY_STRUCTURE = \
    [ ( 'city_id'   , 'integer' )
    , ( 'country_id', 'integer' )
    , ( 'city_name' , 'string'  ) ]

CONTINENT_SCHEMA = StructType.fromJson( generate_schema_dict(CONTINENT_STRUCTURE) )
COUNTRY_SCHEMA   = StructType.fromJson( generate_schema_dict(COUNTRY_STRUCTURE) )
CITY_SCHEMA      = StructType.fromJson( generate_schema_dict(CITY_STRUCTURE) )

spark = SparkSession.builder.getOrCreate()

continents_df = generate_dataframe( spark, CONTINENT_SCHEMA, CONTINENTS_FILE_PATH )
countries_df  = generate_dataframe( spark, COUNTRY_SCHEMA  , COUNTRIES_FILE_PATH )
cities_df     = generate_dataframe( spark, CITY_SCHEMA     , CITIES_FILE_PATH )

continents_df.registerTempTable('continents')
countries_df.registerTempTable('countries')
cities_df.registerTempTable('cities')

print continents_df.count()
print countries_df.count()
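generate_schema_dict and generate_dataframe (and the CONTINENTS_FILE_PATH constant) are not shown in the excerpt. Since StructType.fromJson expects a dict shaped like the output of StructType.jsonValue, a plausible sketch of the schema helper might look like this; the nullable and metadata defaults are assumptions:

def generate_schema_dict(structure):
    # structure is a list of (column_name, simple_type_name) pairs,
    # e.g. [('country_id', 'integer'), ('country_name', 'string')]
    return {
        'type': 'struct',
        'fields': [
            {'name': name, 'type': type_name, 'nullable': True, 'metadata': {}}
            for name, type_name in structure
        ],
    }

generate_dataframe presumably wraps something like spark.read.csv(path, schema=schema), but that is a guess based on the call sites above.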