This article collects typical usage examples of pyspark.sql in Python. If you are wondering what pyspark.sql is for, how to use it, or simply want concrete examples, the curated code samples below may help. You can also read further about the parent package, pyspark.
The following presents 15 code examples of pyspark.sql, ordered by popularity by default.
Example 1: summarizeOutput
# Required module: import pyspark
# Or: from pyspark import sql
# Also needed here: from pyspark.sql import SQLContext
def summarizeOutput(self):
    s = SQLContext(self.sc)
    res = s.read.parquet(self.cclinks.output)

    totalLinks = res.count()
    uniqueContentQuery = res.drop_duplicates(subset=['provider_domain', 'content_path', 'content_query_string']).count()
    uniqueContent = res.drop_duplicates(subset=['provider_domain', 'content_path']).count()

    # Register the DataFrame as a temporary table so it can be queried with SQL.
    res.registerTempTable('test_deeds')
    summary = s.sql('SELECT provider_domain, count(*) AS total, count(distinct content_path) AS unique_content_path, count(distinct content_query_string) AS unique_query_string FROM test_deeds GROUP BY provider_domain ORDER BY total DESC LIMIT 100')
    summary.write.mode('overwrite').format('csv').option('header', 'true').save(self.cclinks.output.replace('parquet', 'summary'))

    # Write the headline counts next to the CSV summary.
    fh = open('{}/total'.format(self.cclinks.output.replace('parquet', 'summary')), 'w')
    fh.write('Total records: {}\r\n'.format(totalLinks))
    fh.write('Total unique content path: {}\r\n'.format(uniqueContent))
    fh.write('Total unique query strings: {}\r\n'.format(uniqueContentQuery))
    fh.close()
Example 2: as_spark_schema
# Required module: import pyspark
# Or: from pyspark import sql
def as_spark_schema(self):
    """Returns an object derived from the unischema as a Spark schema.

    Example:

    >>> spark.createDataFrame(dataset_rows,
    >>>                       SomeSchema.as_spark_schema())
    """
    # Lazily load pyspark to avoid creating a pyspark dependency on the data reading code path
    # (currently works only with make_batch_reader).
    import pyspark.sql.types as sql_types

    schema_entries = []
    for field in self._fields.values():
        spark_type = _field_spark_dtype(field)
        schema_entries.append(sql_types.StructField(field.name, spark_type, field.nullable))
    return sql_types.StructType(schema_entries)
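For reference, this is what the resulting schema looks like when built by hand with pyspark.sql.types, independent of petastorm's Unischema; field names here are illustrative:

from pyspark.sql import SparkSession
import pyspark.sql.types as sql_types

schema = sql_types.StructType([
    sql_types.StructField("id", sql_types.LongType(), False),
    sql_types.StructField("name", sql_types.StringType(), True),
])

spark = SparkSession.builder.master("local[1]").getOrCreate()
spark.createDataFrame([(1, "a"), (2, None)], schema).printSchema()
spark.stop()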
Example 3: test_atexit
# Required module: import pyspark
# Or: from pyspark import sql
def test_atexit(spark_test_ctx):
    lines = """
    from petastorm.spark import SparkDatasetConverter, make_spark_converter
    from pyspark.sql import SparkSession
    import os
    spark = SparkSession.builder.getOrCreate()
    spark.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF, '{temp_url}')
    df = spark.createDataFrame([(1, 2), (4, 5)], ["col1", "col2"])
    converter = make_spark_converter(df)
    f = open(os.path.join('{tempdir}', 'test_atexit.out'), "w")
    f.write(converter.cache_dir_url)
    f.close()
    """.format(tempdir=spark_test_ctx.tempdir, temp_url=spark_test_ctx.temp_url)
    # Run the snippet in a fresh interpreter so its atexit hooks fire on exit.
    code_str = "; ".join(
        line.strip() for line in lines.strip().splitlines())
    ret_code = subprocess.call([sys.executable, "-c", code_str])
    assert 0 == ret_code
    with open(os.path.join(spark_test_ctx.tempdir, 'test_atexit.out')) as f:
        cache_dir_url = f.read()
    # The converter's cache directory should have been cleaned up at interpreter exit.
    fs = FilesystemResolver(cache_dir_url).filesystem()
    assert not fs.exists(urlparse(cache_dir_url).path)
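The behaviour being tested boils down to registering a cleanup callback that runs when the interpreter exits. A minimal sketch of that pattern using only the standard library (no petastorm involved):

import atexit
import shutil
import tempfile

cache_dir = tempfile.mkdtemp()
# shutil.rmtree is invoked automatically when this interpreter exits.
atexit.register(shutil.rmtree, cache_dir, ignore_errors=True)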
Example 4: test_distMetric
# Required module: import pyspark
# Or: from pyspark import sql
def test_distMetric(self):
    df = self.createDF("s1:String; s2:String",
                       ",ads;" +
                       "asdfg,asdfg;" +
                       "asdfghj,asdfhgj"
                       )
    # Round to two decimal places so the comparison is stable.
    trunc = lambda c: pyspark.sql.functions.round(c, 2)
    # nGram2, nGram3, diceSorensen, normlevenshtein and jaroWinkler are string-distance
    # helpers imported from the surrounding test module.
    res = df.select(
        df.s1, df.s2,
        trunc(nGram2(df.s1, df.s2)).alias("nGram2"),
        trunc(nGram3(df.s1, df.s2)).alias("nGram3"),
        trunc(diceSorensen(df.s1, df.s2)).alias("diceSorensen"),
        trunc(normlevenshtein(df.s1, df.s2)).alias("normlevenshtein"),
        trunc(jaroWinkler(df.s1, df.s2)).alias("jaroWinkler")
    )
    exp = self.createDF("s1: String;s2: String;nGram2: Float;nGram3: Float;diceSorensen: Float;normlevenshtein: Float;jaroWinkler: Float",
                        ",ads,,,,,;" +
                        "asdfg,asdfg,1.0,1.0,1.0,1.0,1.0;" +
                        "asdfghj,asdfhgj,0.5,0.4,0.5,0.71,0.97")
    self.should_be_same(res, exp)
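PySpark itself ships only a plain Levenshtein edit distance. A self-contained sketch of the same select-and-round pattern using the built-in pyspark.sql.functions.levenshtein, with no external helpers:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("asdfg", "asdfg"), ("asdfghj", "asdfhgj")], ["s1", "s2"])
df.select(df.s1, df.s2, F.levenshtein(df.s1, df.s2).alias("levenshtein")).show()
spark.stop()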
Example 5: _cumprod
# Required module: import pyspark
# Or: from pyspark import sql
def _cumprod(self, skipna, part_cols=()):
    from pyspark.sql.functions import pandas_udf
    # F is pyspark.sql.functions, imported at module level as:
    # from pyspark.sql import functions as F

    def cumprod(scol):
        @pandas_udf(returnType=self.spark.data_type)
        def negative_check(s):
            assert len(s) == 0 or ((s > 0) | (s.isnull())).all(), (
                "values should be bigger than 0: %s" % s
            )
            return s

        # Cumulative product computed as exp(sum(log(x))), which only works for positive values.
        return F.sum(F.log(negative_check(scol)))

    kser = self._cum(cumprod, skipna, part_cols)
    return kser._with_new_scol(F.exp(kser.spark.column)).rename(self.name)

# ----------------------------------------------------------------------
# Accessor Methods
# ----------------------------------------------------------------------
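The exp(sum(log(x))) trick above can be reproduced with plain PySpark window functions. A minimal sketch that, like the original, assumes strictly positive values:

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(1, 2.0), (2, 3.0), (3, 4.0)], ["id", "x"])

w = Window.orderBy("id").rowsBetween(Window.unboundedPreceding, Window.currentRow)
# exp(sum(log(x))) over a running window equals the cumulative product, for x > 0.
df.withColumn("cumprod", F.exp(F.sum(F.log("x")).over(w))).show()
spark.stop()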
Example 6: stats
# Required module: import pyspark
# Or: from pyspark import sql
def stats(self, columns):
    """Compute the stats for each column provided in columns.

    Parameters
    ----------
    columns : list of str, contains all columns to compute stats on.
    """
    # Note: basestring only exists on Python 2; this snippet targets Python 2.
    assert (not isinstance(columns, basestring)), "columns should be a " \
                                                  "list of strs, " \
                                                  "not a str!"
    assert isinstance(columns, list), "columns should be a list!"

    from pyspark.sql import functions as F
    functions = [F.min, F.max, F.avg, F.count]
    aggs = list(
        self._flatmap(lambda column: map(lambda f: f(column), functions),
                      columns))
    return PStats(self.from_schema_rdd(self._schema_rdd.agg(*aggs)))
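The same min/max/avg/count aggregation can be expressed directly on a Spark DataFrame. A minimal sketch with made-up data:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(1.0,), (2.0,), (4.0,)], ["value"])

aggs = [f("value") for f in (F.min, F.max, F.avg, F.count)]
df.agg(*aggs).show()
spark.stop()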
Example 7: sum
# Required module: import pyspark
# Or: from pyspark import sql
def sum(self):
    """Compute the sum for each group."""
    # If Spark SQL's native grouping is available, use it.
    if self._can_use_new_school():
        self._prep_spark_sql_groupby()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.sum)
    # Otherwise fall back to a pandas-based combineByKey implementation.
    self._prep_pandas_groupby()
    myargs = self._myargs
    mykwargs = self._mykwargs

    def create_combiner(x):
        return x.groupby(*myargs, **mykwargs).sum()

    def merge_value(x, y):
        return pd.concat([x, create_combiner(y)])

    def merge_combiner(x, y):
        return x + y

    rddOfSum = self._sortIfNeeded(self._distributedRDD.combineByKey(
        create_combiner,
        merge_value,
        merge_combiner)).values()
    return DataFrame.fromDataFrameRDD(rddOfSum, self.sql_ctx)
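The "new school" branch here boils down to an ordinary Spark SQL groupBy aggregation. A minimal standalone sketch, with illustrative column names, that covers the func.sum/min/max/last aggregations used in Examples 7 through 10:

from pyspark.sql import SparkSession
import pyspark.sql.functions as func

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("a", 1), ("a", 2), ("b", 5)], ["key", "value"])

df.groupBy("key").agg(func.sum("value").alias("sum"),
                      func.min("value").alias("min"),
                      func.max("value").alias("max"),
                      func.last("value").alias("last")).show()
spark.stop()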
Example 8: min
# Required module: import pyspark
# Or: from pyspark import sql
def min(self):
    """Compute the min for each group."""
    if self._can_use_new_school():
        self._prep_spark_sql_groupby()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.min)
    self._prep_pandas_groupby()
    myargs = self._myargs
    mykwargs = self._mykwargs

    def create_combiner(x):
        return x.groupby(*myargs, **mykwargs).min()

    def merge_value(x, y):
        return x.append(create_combiner(y)).min()

    def merge_combiner(x, y):
        return x.append(y).min(level=0)

    rddOfMin = self._sortIfNeeded(self._distributedRDD.combineByKey(
        create_combiner,
        merge_value,
        merge_combiner)).values()
    return DataFrame.fromDataFrameRDD(rddOfMin, self.sql_ctx)
Example 9: max
# Required module: import pyspark
# Or: from pyspark import sql
def max(self):
    """Compute the max for each group."""
    if self._can_use_new_school():
        self._prep_spark_sql_groupby()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.max)
    self._prep_pandas_groupby()
    myargs = self._myargs
    mykwargs = self._mykwargs

    def create_combiner(x):
        return x.groupby(*myargs, **mykwargs).max()

    def merge_value(x, y):
        return x.append(create_combiner(y)).max()

    def merge_combiner(x, y):
        return x.append(y).max(level=0)

    rddOfMax = self._sortIfNeeded(self._distributedRDD.combineByKey(
        create_combiner,
        merge_value,
        merge_combiner)).values()
    return DataFrame.fromDataFrameRDD(rddOfMax, self.sql_ctx)
Example 10: last
# Required module: import pyspark
# Or: from pyspark import sql
def last(self):
    """Pull out the last from each group."""
    myargs = self._myargs
    mykwargs = self._mykwargs
    # If it is possible to use Spark SQL grouping, do it.
    if self._can_use_new_school():
        self._prep_spark_sql_groupby()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.last)

    def create_combiner(x):
        return x.groupby(*myargs, **mykwargs).last()

    def merge_value(x, y):
        return create_combiner(y)

    def merge_combiner(x, y):
        return y

    rddOfLast = self._sortIfNeeded(self._distributedRDD.combineByKey(
        create_combiner,
        merge_value,
        merge_combiner)).values()
    return DataFrame.fromDataFrameRDD(rddOfLast, self.sql_ctx)
Example 11: _load_pyfunc
# Required module: import pyspark
# Or: from pyspark import sql
def _load_pyfunc(path):
    """
    Load PyFunc implementation. Called by ``pyfunc.load_pyfunc``.

    :param path: Local filesystem path to the MLflow Model with the ``spark`` flavor.
    """
    # NOTE: The getOrCreate() call below may change settings of the active session, which we do
    # not intend to do here. In particular, setting master to local[1] can break distributed
    # clusters. To avoid this problem, we explicitly check for an active session. This is not
    # ideal, but there is no good workaround at the moment.
    import pyspark

    spark = pyspark.sql.SparkSession._instantiatedSession
    if spark is None:
        spark = pyspark.sql.SparkSession.builder.config("spark.python.worker.reuse", True) \
            .master("local[1]").getOrCreate()
    return _PyFuncModelWrapper(spark, _load_model(model_uri=path))
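Note that _instantiatedSession is a private attribute. Since Spark 3.0 a similar check can be done with the public SparkSession.getActiveSession(). A minimal sketch of that approach (this is not mlflow's actual code):

from pyspark.sql import SparkSession

spark = SparkSession.getActiveSession()  # returns None if no session is active (Spark >= 3.0)
if spark is None:
    spark = SparkSession.builder.master("local[1]").appName("fallback").getOrCreate()
print(spark.version)
spark.stop()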
Example 12: _numpy_to_spark_mapping
# Required module: import pyspark
# Or: from pyspark import sql
def _numpy_to_spark_mapping():
    """Returns a mapping from numpy to pyspark.sql types. Caches the mapping dictionary in order
    to avoid instantiating multiple objects on each call."""
    # Refer to the attribute of the function we use to cache the map using a name in a variable
    # instead of a 'dot' notation to avoid copy/paste/typo mistakes.
    cache_attr_name = 'cached_numpy_to_pyspark_types_map'
    if not hasattr(_numpy_to_spark_mapping, cache_attr_name):
        import pyspark.sql.types as T

        setattr(_numpy_to_spark_mapping, cache_attr_name,
                {
                    np.int8: T.ByteType(),
                    np.uint8: T.ShortType(),
                    np.int16: T.ShortType(),
                    np.uint16: T.IntegerType(),
                    np.int32: T.IntegerType(),
                    np.int64: T.LongType(),
                    np.float32: T.FloatType(),
                    np.float64: T.DoubleType(),
                    np.string_: T.StringType(),
                    np.str_: T.StringType(),
                    np.unicode_: T.StringType(),
                    np.bool_: T.BooleanType(),
                })

    return getattr(_numpy_to_spark_mapping, cache_attr_name)

# TODO: Changing fields in this class or the UnischemaField will break reading due to the schema
# being pickled next to the dataset on disk.
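A quick sketch of how such a lookup is typically used to build a Spark schema from numpy dtypes; the dictionary below is a trimmed, hypothetical stand-in for the cached map above:

import numpy as np
import pyspark.sql.types as T

numpy_to_spark = {np.int32: T.IntegerType(), np.int64: T.LongType(), np.float64: T.DoubleType()}

columns = [("id", np.int64), ("score", np.float64)]
schema = T.StructType([T.StructField(name, numpy_to_spark[dtype], True) for name, dtype in columns])
print(schema)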
Example 13: spark_context
# Required module: import pyspark
# Or: from pyspark import sql
def spark_context(self, application_name):
    """Create a Spark context given the parameters configured in this class.

    The caller is responsible for calling ``.close`` on the resulting Spark context.

    Parameters
    ----------
    application_name : string

    Returns
    -------
    sc : SparkContext
    """
    # Initialize the Spark configuration.
    self._init_spark()
    import pyspark
    import pyspark.sql

    # Build the conf from the helper's key/value pairs.
    spark_conf = pyspark.SparkConf()
    for k, v in self._spark_conf_helper._conf_dict.items():
        spark_conf.set(k, v)

    log.info("Starting SparkContext")
    return pyspark.SparkContext(appName=application_name, conf=spark_conf)
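Stripped of the helper class, the essential pattern is just SparkConf plus SparkContext. A minimal sketch:

import pyspark

conf = pyspark.SparkConf().set("spark.ui.enabled", "false")
sc = pyspark.SparkContext(appName="example-app", conf=conf)
try:
    print(sc.parallelize(range(10)).sum())
finally:
    sc.stop()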
Example 14: spark_session
# Required module: import pyspark
# Or: from pyspark import sql
def spark_session(self, application_name):
    """Create a SparkSession on top of a freshly created SparkContext."""
    sc = self.spark_context(application_name)
    from pyspark.sql import SparkSession
    return SparkSession(sc)
Example 15: with_sql_context
# Required module: import pyspark
# Or: from pyspark import sql
@contextlib.contextmanager  # required so the generator below can be used in a with-statement; assumes 'import contextlib' at module level
def with_sql_context(application_name, conf=None):
    """Context manager for a Spark context.

    Returns
    -------
    sc : SparkContext
    sql_context : SQLContext

    Examples
    --------
    Used within a context manager
    >>> with with_sql_context("MyApplication") as (sc, sql_context):
    ...     import pyspark
    ...     # Do stuff
    ...     pass
    """
    if conf is None:
        conf = default_configuration
    assert isinstance(conf, SparkConfiguration)

    sc = conf.spark_context(application_name)
    import pyspark.sql
    try:
        yield sc, pyspark.sql.SQLContext(sc)
    finally:
        sc.stop()
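A self-contained sketch of the same pattern using nothing but pyspark and the standard library; the function name is made up:

from contextlib import contextmanager

import pyspark
import pyspark.sql

@contextmanager
def local_sql_context(application_name):
    # Start a local SparkContext, hand back (sc, sql_context), and always stop it afterwards.
    sc = pyspark.SparkContext(appName=application_name, master="local[1]")
    try:
        yield sc, pyspark.sql.SQLContext(sc)
    finally:
        sc.stop()

with local_sql_context("MyApplication") as (sc, sql_context):
    print(sql_context.createDataFrame([(1, "a")], ["id", "name"]).count())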