本文整理汇总了 Python 中 pyspark.sql.SparkSession.builder 方法的典型用法代码示例。如果您正苦于以下问题:Python SparkSession.builder 方法的具体用法?Python SparkSession.builder 怎么用?Python SparkSession.builder 使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 pyspark.sql.SparkSession 的用法示例。
在下文中一共展示了 SparkSession.builder 方法的 15 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: get_spark_session
# 需要导入模块: from pyspark.sql import SparkSession [as 别名]
# 或者: from pyspark.sql.SparkSession import builder [as 别名]
def get_spark_session(enable_hive=False, app_name='marvin-engine', configs=None):
    """Return a Spark Session object.

    Args:
        enable_hive: When true, ``enableHiveSupport()`` is applied to the
            builder before the session is created.
        app_name: Application name passed to the builder's ``appName``.
        configs: Optional iterable of builder settings. Each item may be

            * a ``(key, value)`` tuple or list, forwarded as
              ``builder.config(key, value)``;
            * an object exposing ``getAll()`` (such as a ``SparkConf``),
              forwarded as ``builder.config(conf=item)``;
            * anything else, forwarded unchanged as the first positional
              argument of ``builder.config``, exactly as before.

    Returns:
        The session returned by ``getOrCreate()`` on the configured builder.
    """
    # Prepare spark context to be used
    import findspark
    findspark.init()

    from pyspark.sql import SparkSession

    # Prepare the spark session to be returned
    builder = SparkSession.builder.appName(app_name)
    if enable_hive:
        builder = builder.enableHiveSupport()

    # ``None`` stands in for the former mutable ``[]`` default.
    if configs is None:
        configs = ()

    for config in configs:
        if isinstance(config, (tuple, list)):
            # A pair must be unpacked; passed whole it would become the key
            # of an option whose value is the string "None".
            builder = builder.config(*config)
        elif hasattr(config, 'getAll'):
            # Conf-like objects only take effect through the ``conf`` keyword.
            builder = builder.config(conf=config)
        else:
            builder = builder.config(config)
    return builder.getOrCreate()
示例2: spark
# 需要导入模块: from pyspark.sql import SparkSession [as 别名]
# 或者: from pyspark.sql.SparkSession import builder [as 别名]
def spark(request):
    """Create a Hive-enabled SparkSession on ``local[*]`` and fill test tables.

    The databases ``tst_app``, ``transaction_a`` and ``transaction_b`` are
    dropped (if present) and recreated, after which the ``populate_*``
    helpers are called with the session.

    :param request: accepted but not used by the body.
    :return: the populated session.
    """
    builder = SparkSession.builder.master('local[*]')
    session = builder.enableHiveSupport().getOrCreate()

    # Now populate some tables
    for db in ('tst_app', 'transaction_a', 'transaction_b'):
        drop_stmt = 'DROP DATABASE IF EXISTS {0} CASCADE'.format(db)
        create_stmt = 'CREATE DATABASE {0}'.format(db)
        session.sql(drop_stmt).collect()
        session.sql(create_stmt)

    populators = (
        populate_transaction_a,
        populate_transaction_b,
        populate_account_info,
        populate_countries,
    )
    for populate in populators:
        populate(session)
    return session
示例3: spark
# 需要导入模块: from pyspark.sql import SparkSession [as 别名]
# 或者: from pyspark.sql.SparkSession import builder [as 别名]
def spark():
    """Create a Hive-enabled SparkSession and populate per-environment tables.

    The session is configured with a fixed warehouse directory and a Derby
    metastore URL. Every database in the list below is dropped (if present)
    and recreated, the two transaction tables are populated once, and the
    account-info and country tables are populated for each environment.

    :return: the populated session (previously the function fell off the
        end and returned ``None``, leaving callers without a handle).
    """
    spark = SparkSession.builder \
        .config('spark.sql.warehouse.dir', '/usr/local/airflow/spark_warehouse') \
        .config('spark.hadoop.javax.jdo.option.ConnectionURL',
                'jdbc:derby:;databaseName=/usr/local/airflow/metastore_db;create=true') \
        .enableHiveSupport() \
        .getOrCreate()

    # Now populate some tables
    for database_name in ['dev_app', 'tst_app', 'acc_app', 'prd_app', 'transaction_a', 'transaction_b']:
        spark.sql('DROP DATABASE IF EXISTS {0} CASCADE'.format(database_name)).collect()
        spark.sql('CREATE DATABASE {0}'.format(database_name)).collect()

    populate_transaction_a(spark)
    populate_transaction_b(spark)

    for environment in ['dev', 'tst', 'acc', 'prd']:
        populate_account_info(spark, environment)
        populate_countries(spark, environment)

    # Hand the session back so callers can actually use what was built.
    return spark
示例4: _test
# 需要导入模块: from pyspark.sql import SparkSession [as 别名]
# 或者: from pyspark.sql.SparkSession import builder [as 别名]
def _test():
    """Run the doctests of ``pyspark.ml.image`` against a local SparkSession.

    A copy of the module namespace, extended with a ``spark`` entry, is used
    as the doctest globals. The session is stopped after the run and the
    process exits with status -1 if any doctest failed.
    """
    import doctest
    import pyspark.ml.image

    module = pyspark.ml.image
    namespace = dict(module.__dict__)
    session = (
        SparkSession.builder
        .master("local[2]")
        .appName("ml.image tests")
        .getOrCreate()
    )
    namespace['spark'] = session

    flags = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
    outcome = doctest.testmod(module, globs=namespace, optionflags=flags)
    session.stop()

    if outcome.failed:
        sys.exit(-1)
示例5: _test
# 需要导入模块: from pyspark.sql import SparkSession [as 别名]
# 或者: from pyspark.sql.SparkSession import builder [as 别名]
def _test():
    """Run doctests using the ``pyspark.mllib.evaluation`` namespace as globals.

    A local SparkSession is started and its SparkContext is exposed to the
    doctests as ``sc``. No module is passed to ``doctest.testmod``, so doctest
    uses its default target. The session is stopped after the run and the
    process exits with status -1 if any doctest failed.
    """
    import doctest
    import numpy
    from pyspark.sql import SparkSession
    import pyspark.mllib.evaluation

    try:
        # Numpy 1.14+ changed its string format.
        numpy.set_printoptions(legacy='1.13')
    except TypeError:
        # Presumably raised by numpy versions without the ``legacy`` option.
        pass

    namespace = dict(pyspark.mllib.evaluation.__dict__)
    session = (
        SparkSession.builder
        .master("local[4]")
        .appName("mllib.evaluation tests")
        .getOrCreate()
    )
    namespace['sc'] = session.sparkContext

    outcome = doctest.testmod(globs=namespace, optionflags=doctest.ELLIPSIS)
    session.stop()

    if outcome.failed:
        sys.exit(-1)
示例6: _test
# 需要导入模块: from pyspark.sql import SparkSession [as 别名]
# 或者: from pyspark.sql.SparkSession import builder [as 别名]
def _test():
    """Run doctests using the ``pyspark.mllib.fpm`` namespace as globals.

    A local SparkSession is started; its SparkContext is exposed to the
    doctests as ``sc`` and a fresh temporary directory as ``temp_path``.
    The session is stopped and the temporary directory removed even when
    the doctest run raises. The process exits with status -1 if any doctest
    failed.
    """
    import doctest
    import tempfile
    from shutil import rmtree
    from pyspark.sql import SparkSession
    import pyspark.mllib.fpm

    globs = pyspark.mllib.fpm.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("mllib.fpm tests")\
        .getOrCreate()
    globs['sc'] = spark.sparkContext

    temp_path = tempfile.mkdtemp()
    globs['temp_path'] = temp_path
    try:
        (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    finally:
        # Stop the session here, not in the try body, so that an exception
        # from testmod cannot leave it running.
        try:
            spark.stop()
        finally:
            # Best-effort cleanup: a failure to delete the scratch directory
            # must not mask the test outcome.
            try:
                rmtree(temp_path)
            except OSError:
                pass
    if failure_count:
        sys.exit(-1)
示例7: _test
# 需要导入模块: from pyspark.sql import SparkSession [as 别名]
# 或者: from pyspark.sql.SparkSession import builder [as 别名]
def _test():
    """Run doctests using the ``pyspark.mllib.linalg.distributed`` namespace.

    A local SparkSession is started; its SparkContext is exposed to the
    doctests as ``sc`` and the ``Matrices`` class as ``Matrices``. The session
    is stopped after the run and the process exits with status -1 if any
    doctest failed.
    """
    import doctest
    import numpy
    from pyspark.sql import SparkSession
    from pyspark.mllib.linalg import Matrices
    import pyspark.mllib.linalg.distributed

    try:
        # Numpy 1.14+ changed its string format.
        numpy.set_printoptions(legacy='1.13')
    except TypeError:
        # Presumably raised by numpy versions without the ``legacy`` option.
        pass

    namespace = dict(pyspark.mllib.linalg.distributed.__dict__)
    session = (
        SparkSession.builder
        .master("local[2]")
        .appName("mllib.linalg.distributed tests")
        .getOrCreate()
    )
    namespace['sc'] = session.sparkContext
    namespace['Matrices'] = Matrices

    outcome = doctest.testmod(globs=namespace, optionflags=doctest.ELLIPSIS)
    session.stop()

    if outcome.failed:
        sys.exit(-1)
示例8: _test
# 需要导入模块: from pyspark.sql import SparkSession [as 别名]
# 或者: from pyspark.sql.SparkSession import builder [as 别名]
def _test():
    """Run the doctests of ``pyspark.sql.functions`` against a local session.

    The doctest globals are a copy of the module namespace extended with
    ``sc`` (the SparkContext), ``spark`` (the session) and ``df`` (a two-row
    DataFrame of names and ages). The session is stopped after the run and
    the process exits with status -1 if any doctest failed.
    """
    import doctest
    from pyspark.sql import Row, SparkSession
    import pyspark.sql.functions

    module = pyspark.sql.functions
    namespace = dict(module.__dict__)
    session = (
        SparkSession.builder
        .master("local[4]")
        .appName("sql.functions tests")
        .getOrCreate()
    )
    namespace['sc'] = session.sparkContext
    namespace['spark'] = session

    sample_rows = [Row(name='Alice', age=2), Row(name='Bob', age=5)]
    namespace['df'] = session.createDataFrame(sample_rows)

    flags = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
    outcome = doctest.testmod(module, globs=namespace, optionflags=flags)
    session.stop()

    if outcome.failed:
        sys.exit(-1)
示例9: _test
# 需要导入模块: from pyspark.sql import SparkSession [as 别名]
# 或者: from pyspark.sql.SparkSession import builder [as 别名]
def _test():
    """Run the doctests of ``pyspark.sql.udf`` against a local SparkSession.

    A copy of the module namespace, extended with a ``spark`` entry, is used
    as the doctest globals. The session is stopped after the run and the
    process exits with status -1 if any doctest failed.
    """
    import doctest
    from pyspark.sql import SparkSession
    import pyspark.sql.udf

    module = pyspark.sql.udf
    namespace = dict(module.__dict__)
    session = (
        SparkSession.builder
        .master("local[4]")
        .appName("sql.udf tests")
        .getOrCreate()
    )
    namespace['spark'] = session

    flags = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
    outcome = doctest.testmod(module, globs=namespace, optionflags=flags)
    session.stop()

    if outcome.failed:
        sys.exit(-1)
示例10: cast
# 需要导入模块: from pyspark.sql import SparkSession [as 别名]
# 或者: from pyspark.sql.SparkSession import builder [as 别名]
def cast(self, dataType):
    """ Convert the column into type ``dataType``.

    A string ``dataType`` is handed to the underlying Java column as is.
    A ``DataType`` instance is serialised with ``json()`` and parsed by the
    JVM session first. Any other argument raises ``TypeError``.

    >>> df.select(df.age.cast("string").alias('ages')).collect()
    [Row(ages=u'2'), Row(ages=u'5')]
    >>> df.select(df.age.cast(StringType()).alias('ages')).collect()
    [Row(ages=u'2'), Row(ages=u'5')]
    """
    # Type names given as text need no translation.
    if isinstance(dataType, basestring):
        return Column(self._jc.cast(dataType))

    if not isinstance(dataType, DataType):
        raise TypeError("unexpected type: %s" % type(dataType))

    # Imported here rather than at module level, as in the original.
    from pyspark.sql import SparkSession
    session = SparkSession.builder.getOrCreate()
    java_type = session._jsparkSession.parseDataType(dataType.json())
    return Column(self._jc.cast(java_type))
示例11: _test
# 需要导入模块: from pyspark.sql import SparkSession [as 别名]
# 或者: from pyspark.sql.SparkSession import builder [as 别名]
def _test():
    """Run the doctests of ``pyspark.sql.column`` against a local SparkSession.

    The doctest globals are a copy of the module namespace extended with
    ``spark`` (the session) and ``df`` (a two-row DataFrame with an integer
    ``age`` and a string ``name`` column). The session is stopped after the
    run and the process exits with status -1 if any doctest failed.
    """
    import doctest
    from pyspark.sql import SparkSession
    import pyspark.sql.column

    module = pyspark.sql.column
    namespace = dict(module.__dict__)
    session = (
        SparkSession.builder
        .master("local[4]")
        .appName("sql.column tests")
        .getOrCreate()
    )
    context = session.sparkContext
    namespace['spark'] = session

    people = context.parallelize([(2, 'Alice'), (5, 'Bob')])
    schema = StructType([StructField('age', IntegerType()),
                         StructField('name', StringType())])
    namespace['df'] = people.toDF(schema)

    flags = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF
    outcome = doctest.testmod(module, globs=namespace, optionflags=flags)
    session.stop()

    if outcome.failed:
        sys.exit(-1)
示例12: _spark_session
# 需要导入模块: from pyspark.sql import SparkSession [as 别名]
# 或者: from pyspark.sql.SparkSession import builder [as 别名]
def _spark_session():
    """Internal fixture for SparkSession instance.

    Yields a SparkSession built from ``SparkConfigBuilder().get()`` when
    ``pyspark.sql.SparkSession`` can be imported, otherwise yields None.
    The session is stopped once the generator is resumed after the yield.

    Required to correctly initialize `spark_context` fixture after
    `spark_session` fixture.

    .. note::
        It is not possible to create SparkSession from the existing
        SparkContext.
    """
    try:
        from pyspark.sql import SparkSession
    except ImportError:
        # No SparkSession in this pyspark version: nothing to create or stop.
        yield
        return

    builder = SparkSession.builder
    session = builder.config(conf=SparkConfigBuilder().get()).getOrCreate()
    yield session
    session.stop()
示例13: _load_pyfunc
# 需要导入模块: from pyspark.sql import SparkSession [as 别名]
# 或者: from pyspark.sql.SparkSession import builder [as 别名]
def _load_pyfunc(path):
    """
    Load PyFunc implementation. Called by ``pyfunc.load_pyfunc``.

    :param path: Local filesystem path to the MLflow Model with the ``spark`` flavor.
    :return: A ``_PyFuncModelWrapper`` around the session and the loaded model.
    """
    # NOTE: Calling getOrCreate() may change settings of the active session, which is not
    # intended here. In particular, setting master to local[1] can break distributed clusters.
    # To avoid this, an already instantiated session is reused and a new one is only built
    # when none exists. This is not ideal but there is no good workaround at the moment.
    import pyspark

    session_cls = pyspark.sql.SparkSession
    session = session_cls._instantiatedSession
    if session is None:
        builder = session_cls.builder.config("spark.python.worker.reuse", True)
        session = builder.master("local[1]").getOrCreate()

    model = _load_model(model_uri=path)
    return _PyFuncModelWrapper(session, model)
示例14: create_testing_spark_session
# 需要导入模块: from pyspark.sql import SparkSession [as 别名]
# 或者: from pyspark.sql.SparkSession import builder [as 别名]
def create_testing_spark_session(cls):
    """Return a SparkSession named ``sparkflow`` on a ``local[2]`` master.

    ``cls`` is accepted but not used by the body.
    """
    builder = SparkSession.builder
    builder = builder.master('local[2]')
    builder = builder.appName('sparkflow')
    return builder.getOrCreate()
示例15: spark
# 需要导入模块: from pyspark.sql import SparkSession [as 别名]
# 或者: from pyspark.sql.SparkSession import builder [as 别名]
def spark(request):
    """
    Fixture to create the SparkSession.

    The session is named ``APP_NAME``, uses a fixed warehouse directory and
    Derby metastore URL, and has Hive support enabled. ``session.stop`` is
    registered through ``request.addfinalizer``.

    :param request: object providing ``addfinalizer``.
    :return: the created session.
    """
    builder = SparkSession.builder.appName(APP_NAME)
    builder = builder.config('spark.sql.warehouse.dir', '/usr/local/airflow/spark_warehouse')
    builder = builder.config(
        'spark.hadoop.javax.jdo.option.ConnectionURL',
        'jdbc:derby:;databaseName=/usr/local/airflow/metastore_db;create=true',
    )
    session = builder.enableHiveSupport().getOrCreate()

    request.addfinalizer(session.stop)
    return session