

Python SparkSession.builder Method Code Examples

This article collects typical usage examples of pyspark.sql.SparkSession.builder in Python (strictly speaking a class attribute that returns a Builder object, though it is commonly referred to as a method). If you are unsure what SparkSession.builder does or how to use it, the curated examples below may help. You can also explore further usage examples of pyspark.sql.SparkSession, the class it belongs to.


The following presents 15 code examples of the SparkSession.builder method, ordered by popularity by default.
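Before the collected examples, here is a minimal generic sketch of the builder pattern itself (illustrative only, not taken from any of the projects below):

from pyspark.sql import SparkSession

# SparkSession.builder is a Builder: each configuration call returns the builder,
# and getOrCreate() either returns the active session or constructs a new one.
spark = SparkSession.builder \
    .master('local[*]') \
    .appName('builder-demo') \
    .config('spark.sql.shuffle.partitions', '4') \
    .getOrCreate()

print(spark.version)
spark.stop()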

Example 1: get_spark_session

# Required import: from pyspark.sql import SparkSession [as alias]
# Alternatively: from pyspark.sql.SparkSession import builder [as alias]
def get_spark_session(enable_hive=False, app_name='marvin-engine', configs=[]):
    """Return a Spark Session object"""

    # Prepare spark context to be used
    import findspark
    findspark.init()
    from pyspark.sql import SparkSession

    # Prepare the SparkSession builder to be configured and returned
    spark = SparkSession.builder

    spark = spark.appName(app_name)
    spark = spark.enableHiveSupport() if enable_hive else spark

    # Apply any additional configs, if provided
    for config in configs:
        spark = spark.config(config)

    return spark.getOrCreate() 
Author: marvin-ai, Project: marvin-python-toolbox, Lines: 21, Source: data_source_provider.py
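A hypothetical call to the helper above (argument values are illustrative, not from the project):

# Requires a local Spark installation discoverable by findspark.
spark = get_spark_session(enable_hive=True, app_name='my-engine')
spark.sql('SHOW DATABASES').show()
spark.stop()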

Example 2: spark

# Required import: from pyspark.sql import SparkSession [as alias]
# Alternatively: from pyspark.sql.SparkSession import builder [as alias]
def spark(request):
    spark = SparkSession.builder \
        .master('local[*]') \
        .enableHiveSupport() \
        .getOrCreate()

    # Now populate some tables
    for database_name in ['tst_app', 'transaction_a', 'transaction_b']:
        spark.sql('DROP DATABASE IF EXISTS {0} CASCADE'.format(database_name)).collect()
        spark.sql('CREATE DATABASE {0}'.format(database_name))

    populate_transaction_a(spark)
    populate_transaction_b(spark)
    populate_account_info(spark)
    populate_countries(spark)

    return spark 
Author: danielvdende, Project: data-testing-with-airflow, Lines: 19, Source: conftest.py
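A hypothetical pytest test consuming this fixture (the test body is illustrative, not from the project):

def test_databases_exist(spark):
    # Indexing by position avoids the column rename between Spark versions
    # ('databaseName' on 2.x, 'namespace' on 3.x).
    names = [row[0] for row in spark.sql('SHOW DATABASES').collect()]
    assert 'tst_app' in names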

Example 3: spark

# Required import: from pyspark.sql import SparkSession [as alias]
# Alternatively: from pyspark.sql.SparkSession import builder [as alias]
def spark():
    spark = SparkSession.builder \
        .config('spark.sql.warehouse.dir', '/usr/local/airflow/spark_warehouse') \
        .config('spark.hadoop.javax.jdo.option.ConnectionURL',
                'jdbc:derby:;databaseName=/usr/local/airflow/metastore_db;create=true') \
        .enableHiveSupport() \
        .getOrCreate()

    # Now populate some tables
    for database_name in ['dev_app', 'tst_app', 'acc_app', 'prd_app', 'transaction_a', 'transaction_b']:
        spark.sql('DROP DATABASE IF EXISTS {0} CASCADE'.format(database_name)).collect()
        spark.sql('CREATE DATABASE {0}'.format(database_name)).collect()

    populate_transaction_a(spark)
    populate_transaction_b(spark)

    for environment in ['dev', 'tst', 'acc', 'prd']:
        populate_account_info(spark, environment)
        populate_countries(spark, environment) 
Author: danielvdende, Project: data-testing-with-airflow, Lines: 21, Source: populate_tables.py
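The two config calls above pin the Hive warehouse and the embedded Derby metastore to fixed paths inside the project's Airflow container, so repeated runs reuse the same local metastore. A rough sketch of the same idea using a throwaway temp directory (paths are assumptions, not from the project):

import tempfile
from pyspark.sql import SparkSession

base = tempfile.mkdtemp(prefix='spark-test-')
spark = SparkSession.builder \
    .config('spark.sql.warehouse.dir', '{0}/spark_warehouse'.format(base)) \
    .config('spark.hadoop.javax.jdo.option.ConnectionURL',
            'jdbc:derby:;databaseName={0}/metastore_db;create=true'.format(base)) \
    .enableHiveSupport() \
    .getOrCreate()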

Example 4: _test

# Required import: from pyspark.sql import SparkSession [as alias]
# Alternatively: from pyspark.sql.SparkSession import builder [as alias]
def _test():
    import sys
    import doctest
    from pyspark.sql import SparkSession
    import pyspark.ml.image
    globs = pyspark.ml.image.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[2]")\
        .appName("ml.image tests")\
        .getOrCreate()
    globs['spark'] = spark

    (failure_count, test_count) = doctest.testmod(
        pyspark.ml.image, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    spark.stop()
    if failure_count:
        sys.exit(-1) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 18, Source: image.py
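In the PySpark source, these _test harnesses are invoked from the module's main guard, roughly:

if __name__ == "__main__":
    _test()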

Example 5: _test

# Required import: from pyspark.sql import SparkSession [as alias]
# Alternatively: from pyspark.sql.SparkSession import builder [as alias]
def _test():
    import sys
    import doctest
    import numpy
    from pyspark.sql import SparkSession
    import pyspark.mllib.evaluation
    try:
        # NumPy 1.14+ changed its string format.
        numpy.set_printoptions(legacy='1.13')
    except TypeError:
        pass
    globs = pyspark.mllib.evaluation.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("mllib.evaluation tests")\
        .getOrCreate()
    globs['sc'] = spark.sparkContext
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    spark.stop()
    if failure_count:
        sys.exit(-1) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 22, Source: evaluation.py

Example 6: _test

# Required import: from pyspark.sql import SparkSession [as alias]
# Alternatively: from pyspark.sql.SparkSession import builder [as alias]
def _test():
    import sys
    import doctest
    from pyspark.sql import SparkSession
    import pyspark.mllib.fpm
    globs = pyspark.mllib.fpm.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("mllib.fpm tests")\
        .getOrCreate()
    globs['sc'] = spark.sparkContext
    import tempfile

    temp_path = tempfile.mkdtemp()
    globs['temp_path'] = temp_path
    try:
        (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
        spark.stop()
    finally:
        from shutil import rmtree
        try:
            rmtree(temp_path)
        except OSError:
            pass
    if failure_count:
        sys.exit(-1) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 27, Source: fpm.py

Example 7: _test

# Required import: from pyspark.sql import SparkSession [as alias]
# Alternatively: from pyspark.sql.SparkSession import builder [as alias]
def _test():
    import sys
    import doctest
    import numpy
    from pyspark.sql import SparkSession
    from pyspark.mllib.linalg import Matrices
    import pyspark.mllib.linalg.distributed
    try:
        # NumPy 1.14+ changed its string format.
        numpy.set_printoptions(legacy='1.13')
    except TypeError:
        pass
    globs = pyspark.mllib.linalg.distributed.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[2]")\
        .appName("mllib.linalg.distributed tests")\
        .getOrCreate()
    globs['sc'] = spark.sparkContext
    globs['Matrices'] = Matrices
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    spark.stop()
    if failure_count:
        sys.exit(-1) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 24, Source: distributed.py

Example 8: _test

# Required import: from pyspark.sql import SparkSession [as alias]
# Alternatively: from pyspark.sql.SparkSession import builder [as alias]
def _test():
    import sys
    import doctest
    from pyspark.sql import Row, SparkSession
    import pyspark.sql.functions
    globs = pyspark.sql.functions.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("sql.functions tests")\
        .getOrCreate()
    sc = spark.sparkContext
    globs['sc'] = sc
    globs['spark'] = spark
    globs['df'] = spark.createDataFrame([Row(name='Alice', age=2), Row(name='Bob', age=5)])
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.functions, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    spark.stop()
    if failure_count:
        sys.exit(-1) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 21, Source: functions.py

Example 9: _test

# Required import: from pyspark.sql import SparkSession [as alias]
# Alternatively: from pyspark.sql.SparkSession import builder [as alias]
def _test():
    import sys
    import doctest
    from pyspark.sql import SparkSession
    import pyspark.sql.udf
    globs = pyspark.sql.udf.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("sql.udf tests")\
        .getOrCreate()
    globs['spark'] = spark
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.udf, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    spark.stop()
    if failure_count:
        sys.exit(-1) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 18, Source: udf.py

Example 10: cast

# Required import: from pyspark.sql import SparkSession [as alias]
# Alternatively: from pyspark.sql.SparkSession import builder [as alias]
def cast(self, dataType):
        """ Convert the column into type ``dataType``.

        >>> df.select(df.age.cast("string").alias('ages')).collect()
        [Row(ages=u'2'), Row(ages=u'5')]
        >>> df.select(df.age.cast(StringType()).alias('ages')).collect()
        [Row(ages=u'2'), Row(ages=u'5')]
        """
        if isinstance(dataType, basestring):
            jc = self._jc.cast(dataType)
        elif isinstance(dataType, DataType):
            from pyspark.sql import SparkSession
            spark = SparkSession.builder.getOrCreate()
            jdt = spark._jsparkSession.parseDataType(dataType.json())
            jc = self._jc.cast(jdt)
        else:
            raise TypeError("unexpected type: %s" % type(dataType))
        return Column(jc) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 20, Source: column.py
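A minimal, self-contained sketch of the two cast forms the docstring demonstrates (session settings are illustrative):

from pyspark.sql import SparkSession
from pyspark.sql.types import StringType

spark = SparkSession.builder.master('local[1]').appName('cast-demo').getOrCreate()
df = spark.createDataFrame([(2, 'Alice'), (5, 'Bob')], ['age', 'name'])

# Cast via a type-name string and via a DataType instance; both return string ages.
print(df.select(df.age.cast('string').alias('ages')).collect())
print(df.select(df.age.cast(StringType()).alias('ages')).collect())
spark.stop()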

Example 11: _test

# Required import: from pyspark.sql import SparkSession [as alias]
# Alternatively: from pyspark.sql.SparkSession import builder [as alias]
def _test():
    import sys
    import doctest
    from pyspark.sql import SparkSession
    from pyspark.sql.types import StructType, StructField, IntegerType, StringType
    import pyspark.sql.column
    globs = pyspark.sql.column.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("sql.column tests")\
        .getOrCreate()
    sc = spark.sparkContext
    globs['spark'] = spark
    globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \
        .toDF(StructType([StructField('age', IntegerType()),
                          StructField('name', StringType())]))

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.column, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    spark.stop()
    if failure_count:
        sys.exit(-1) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 23, Source: column.py

Example 12: _spark_session

# Required import: from pyspark.sql import SparkSession [as alias]
# Alternatively: from pyspark.sql.SparkSession import builder [as alias]
def _spark_session():
    """Internal fixture for SparkSession instance.

    Yields SparkSession instance if it is supported by the pyspark
    version, otherwise yields None.

    Required to correctly initialize `spark_context` fixture after
    `spark_session` fixture.

    .. note::
        It is not possible to create SparkSession from the existing
        SparkContext.
    """

    try:
        from pyspark.sql import SparkSession
    except ImportError:
        yield
    else:
        session = SparkSession.builder \
            .config(conf=SparkConfigBuilder().get()) \
            .getOrCreate()

        yield session
        session.stop() 
Author: malexer, Project: pytest-spark, Lines: 27, Source: fixtures.py
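pytest-spark exposes this session to tests through its public spark_session fixture; a hypothetical test using it:

def test_row_count(spark_session):
    df = spark_session.createDataFrame([(1,), (2,)], ['x'])
    assert df.count() == 2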

Example 13: _load_pyfunc

# Required import: from pyspark.sql import SparkSession [as alias]
# Alternatively: from pyspark.sql.SparkSession import builder [as alias]
def _load_pyfunc(path):
    """
    Load PyFunc implementation. Called by ``pyfunc.load_pyfunc``.

    :param path: Local filesystem path to the MLflow Model with the ``spark`` flavor.
    """
    # NOTE: The getOrCreate() call below may change settings of the active session which we do not
    # intend to do here. In particular, setting master to local[1] can break distributed clusters.
    # To avoid this problem, we explicitly check for an active session. This is not ideal but there
    # is no good workaround at the moment.
    import pyspark

    spark = pyspark.sql.SparkSession._instantiatedSession
    if spark is None:
        spark = pyspark.sql.SparkSession.builder.config("spark.python.worker.reuse", True) \
            .master("local[1]").getOrCreate()
    return _PyFuncModelWrapper(spark, _load_model(model_uri=path)) 
Author: mlflow, Project: mlflow, Lines: 19, Source: spark.py
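`_instantiatedSession` is a private attribute. On PySpark 3.0+, the public `SparkSession.getActiveSession()` can express roughly the same guard; a sketch (not MLflow's actual code):

from pyspark.sql import SparkSession

spark = SparkSession.getActiveSession()  # public API in PySpark 3.0+
if spark is None:
    spark = SparkSession.builder \
        .config('spark.python.worker.reuse', True) \
        .master('local[1]') \
        .getOrCreate()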

Example 14: create_testing_spark_session

# Required import: from pyspark.sql import SparkSession [as alias]
# Alternatively: from pyspark.sql.SparkSession import builder [as alias]
def create_testing_spark_session(cls):
        return (SparkSession.builder
                .master('local[2]')
                .appName('sparkflow')
                .getOrCreate()) 
Author: lifeomic, Project: sparkflow, Lines: 7, Source: dl_runner.py
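The cls parameter suggests this helper is a classmethod on a test base class; a hypothetical arrangement (class name and lifecycle hooks are assumptions, not from the project):

import unittest
from pyspark.sql import SparkSession

class SparkTestCase(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.spark = cls.create_testing_spark_session()

    @classmethod
    def create_testing_spark_session(cls):
        return (SparkSession.builder
                .master('local[2]')
                .appName('sparkflow')
                .getOrCreate())

    @classmethod
    def tearDownClass(cls):
        cls.spark.stop()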

Example 15: spark

# Required import: from pyspark.sql import SparkSession [as alias]
# Alternatively: from pyspark.sql.SparkSession import builder [as alias]
def spark(request):
    """
    Fixture to create the SparkSession.
    """
    spark = SparkSession.builder \
        .appName(APP_NAME) \
        .config('spark.sql.warehouse.dir', '/usr/local/airflow/spark_warehouse') \
        .config('spark.hadoop.javax.jdo.option.ConnectionURL',
                'jdbc:derby:;databaseName=/usr/local/airflow/metastore_db;create=true') \
        .enableHiveSupport() \
        .getOrCreate()

    request.addfinalizer(spark.stop)

    return spark 
Author: danielvdende, Project: data-testing-with-airflow, Lines: 17, Source: conftest.py


Note: The pyspark.sql.SparkSession.builder examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by various developers; copyright remains with the original authors, and distribution and use are subject to each project's license. Do not reproduce without permission.