

Python SQLContext.getOrCreate method code examples

This article collects typical usage examples of the pyspark.sql.SQLContext.getOrCreate method in Python. If you are wondering what SQLContext.getOrCreate does, how to call it, or what real-world uses look like, the curated examples below should help. You can also explore further usage of pyspark.sql.SQLContext, the class this method belongs to.


The following presents 15 code examples of SQLContext.getOrCreate, drawn from open-source projects and ordered by popularity by default.
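
As a quick orientation before the collected examples, here is a minimal, self-contained sketch written for this article (not taken from any of the projects below): SQLContext.getOrCreate(sc) returns the singleton SQLContext bound to a given SparkContext, creating it on first use. The app name, sample data, and column names are purely illustrative.

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# assumes a local Spark installation; the app name is illustrative
conf = SparkConf().setAppName("getOrCreate-demo").setMaster("local[2]")
sc = SparkContext.getOrCreate(conf)

# return the existing SQLContext for this SparkContext, or create one
sql_context = SQLContext.getOrCreate(sc)

df = sql_context.createDataFrame([("a", 1), ("b", 2)], ["key", "value"])
df.show()

# repeated calls hand back the same cached instance
assert SQLContext.getOrCreate(sc) is sql_context

sc.stop()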

Example 1: do_rollup

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
    def do_rollup(setter_rollup_group_by_list,
                  aggregation_period,
                  setter_rollup_operation,
                  instance_usage_df):

        # get aggregation period
        group_by_period_list = \
            ComponentUtils._get_instance_group_by_period_list(
                aggregation_period)

        # group by columns list
        group_by_columns_list = group_by_period_list + \
            setter_rollup_group_by_list

        # perform rollup operation
        instance_usage_json_rdd = RollupQuantity._rollup_quantity(
            instance_usage_df,
            group_by_columns_list,
            str(setter_rollup_operation))

        sql_context = SQLContext.getOrCreate(instance_usage_df.rdd.context)
        instance_usage_trans_df = InstanceUsageUtils.create_df_from_json_rdd(
            sql_context,
            instance_usage_json_rdd)

        return instance_usage_trans_df
Author: bigluster | Project: monasca-transform | Lines: 28 | Source: rollup_quantity.py

Example 2: _java2py

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
def _java2py(sc, r, encoding="bytes"):
    if isinstance(r, JavaObject):
        clsName = r.getClass().getSimpleName()
        # convert RDD into JavaRDD
        if clsName != 'JavaRDD' and clsName.endswith("RDD"):
            r = r.toJavaRDD()
            clsName = 'JavaRDD'

        if clsName == 'JavaRDD':
            jrdd = sc._jvm.org.apache.spark.mllib.api.python.SerDe.javaToPython(r)
            return RDD(jrdd, sc)

        if clsName == 'Dataset':
            return DataFrame(r, SQLContext.getOrCreate(sc))

        if clsName in _picklable_classes:
            r = sc._jvm.org.apache.spark.mllib.api.python.SerDe.dumps(r)
        elif isinstance(r, (JavaArray, JavaList)):
            try:
                r = sc._jvm.org.apache.spark.mllib.api.python.SerDe.dumps(r)
            except Py4JJavaError:
                pass  # not picklable

    if isinstance(r, (bytearray, bytes)):
        r = PickleSerializer().loads(bytes(r), encoding=encoding)
    return r
Author: 11wzy001 | Project: spark | Lines: 28 | Source: common.py

Example 3: __init__

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
    def __init__(self, predictionAndLabels):
        sc = predictionAndLabels.ctx
        sql_ctx = SQLContext.getOrCreate(sc)
        df = sql_ctx.createDataFrame(predictionAndLabels,
                                     schema=sql_ctx._inferSchema(predictionAndLabels))
        java_model = callMLlibFunc("newRankingMetrics", df._jdf)
        super(RankingMetrics, self).__init__(java_model)
Author: advancedxy | Project: spark | Lines: 9 | Source: evaluation.py
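
For orientation, here is a hedged, self-contained sketch of using this constructor through the public RankingMetrics API (it mirrors the pyspark.mllib documentation example); the rankings and relevance lists below are invented for illustration.

from pyspark import SparkConf, SparkContext
from pyspark.mllib.evaluation import RankingMetrics

sc = SparkContext.getOrCreate(SparkConf().setAppName("ranking-demo").setMaster("local[2]"))

# each element pairs a predicted ranking with the set of relevant (ground-truth) items
predictionAndLabels = sc.parallelize([
    ([1, 6, 2, 7, 8, 3, 9, 10, 4, 5], [1, 2, 3, 4, 5]),
    ([4, 1, 5, 6, 2, 7, 3, 8, 9, 10], [1, 2, 3]),
    ([1, 2, 3, 4, 5], [])])

metrics = RankingMetrics(predictionAndLabels)
print(metrics.precisionAt(5))
print(metrics.meanAveragePrecision)
sc.stop()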

Example 4: process_metrics

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
    def process_metrics(transform_context, record_store_df):
        """start processing (aggregating) metrics
        """
        #
        # look in record_store_df for list of metrics to be processed
        #
        metric_ids_df = record_store_df.select("metric_id").distinct()
        metric_ids_to_process = [row.metric_id
                                 for row in metric_ids_df.collect()]

        data_driven_specs_repo = DataDrivenSpecsRepoFactory.\
            get_data_driven_specs_repo()
        sqlc = SQLContext.getOrCreate(record_store_df.rdd.context)
        transform_specs_df = data_driven_specs_repo.get_data_driven_specs(
            sql_context=sqlc,
            data_driven_spec_type=DataDrivenSpecsRepo.transform_specs_type)

        for metric_id in metric_ids_to_process:
            transform_spec_df = transform_specs_df.select(
                ["aggregation_params_map", "metric_id"]
            ).where(transform_specs_df.metric_id == metric_id)
            source_record_store_df = record_store_df.select("*").where(
                record_store_df.metric_id == metric_id)

            # set transform_spec_df in TransformContext
            transform_context = \
                TransformContextUtils.get_context(
                    transform_context_info=transform_context,
                    transform_spec_df_info=transform_spec_df)

            MonMetricsKafkaProcessor.process_metric(
                transform_context, source_record_store_df)
Author: bigluster | Project: monasca-transform | Lines: 34 | Source: mon_metrics_kafka.py

Example 5: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
def main():
    """Run the belief propagation algorithm for an example problem."""
    # setup context
    conf = SparkConf().setAppName("BeliefPropagation example")
    sc = SparkContext.getOrCreate(conf)
    sql = SQLContext.getOrCreate(sc)

    with SuppressSparkLogs(sc):

        # create graphical model g of size 3 x 3
        g = graphframes.examples.Graphs(sql).gridIsingModel(3)
        print("Original Ising model:")
        g.vertices.show()
        g.edges.show()

        # run BP for 5 iterations
        numIter = 5
        results = BeliefPropagation.runBPwithGraphFrames(g, numIter)

        # display beliefs
        beliefs = results.vertices.select('id', 'belief')
        print("Done with BP. Final beliefs after {} iterations:".format(numIter))
        beliefs.show()

    sc.stop()
Author: mengxr | Project: graphframes | Lines: 27 | Source: belief_propagation.py

Example 6: setter_by_operation

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
    def setter_by_operation(transform_context, instance_usage_df,
                            setter_rollup_operation):

        transform_spec_df = transform_context.transform_spec_df_info

        # get fields we want to group by for a rollup
        agg_params = transform_spec_df.select(
            "aggregation_params_map.setter_rollup_group_by_list"). \
            collect()[0].asDict()
        setter_rollup_group_by_list = agg_params["setter_rollup_group_by_list"]

        # get aggregation period
        agg_params = transform_spec_df.select(
            "aggregation_params_map.aggregation_period").collect()[0].asDict()
        aggregation_period = agg_params["aggregation_period"]
        group_by_period_list = \
            ComponentUtils._get_instance_group_by_period_list(
                aggregation_period)

        # group by columns list
        group_by_columns_list = \
            group_by_period_list + setter_rollup_group_by_list

        # perform rollup operation
        instance_usage_json_rdd = RollupQuantity._rollup_quantity(
            instance_usage_df,
            group_by_columns_list,
            str(setter_rollup_operation))

        sql_context = SQLContext.getOrCreate(instance_usage_df.rdd.context)
        instance_usage_trans_df = InstanceUsageUtils.create_df_from_json_rdd(
            sql_context,
            instance_usage_json_rdd)

        return instance_usage_trans_df
Author: bigluster | Project: monasca-transform | Lines: 37 | Source: rollup_quantity.py

Example 7: _test

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
def _test():
    import doctest
    import os
    import tempfile
    from pyspark.sql import Row, SparkSession, SQLContext
    from pyspark.sql.types import StructType, StructField, StringType
    import py4j
    import pyspark.sql.streaming

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.streaming.__dict__.copy()
    try:
        spark = SparkSession.builder.getOrCreate()
    except py4j.protocol.Py4JError:
        # fall back to wrapping an already-running SparkContext (expected to exist as `sc`)
        spark = SparkSession(sc)

    globs['tempfile'] = tempfile
    globs['os'] = os
    globs['spark'] = spark
    globs['sqlContext'] = SQLContext.getOrCreate(spark.sparkContext)
    globs['sdf'] = \
        spark.readStream.format('text').load('python/test_support/sql/streaming')
    globs['sdf_schema'] = StructType([StructField("data", StringType(), False)])
    globs['df'] = \
        globs['spark'].readStream.format('text').load('python/test_support/sql/streaming')

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.streaming, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    globs['spark'].stop()
    if failure_count:
        exit(-1)
Author: anuragkapur | Project: apache-spark-ak-skynet | Lines: 33 | Source: streaming.py

Example 8: _rdd_to_df

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
    def _rdd_to_df(rdd, schema):
        """convert rdd to dataframe using schema."""
        spark_context = rdd.context
        sql_context = SQLContext.getOrCreate(spark_context)
        if schema is None:
            df = sql_context.createDataFrame(rdd)
        else:
            df = sql_context.createDataFrame(rdd, schema)
        return df
Author: openstack | Project: monasca-transform | Lines: 11 | Source: transform_utils.py
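
The helper above wraps a very common pattern; the following stand-alone sketch shows the same idea with plain pyspark calls (the sample rows and schema are made up for illustration).

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

sc = SparkContext.getOrCreate(SparkConf().setAppName("rdd-to-df-demo").setMaster("local[2]"))
rdd = sc.parallelize([("host-1", 4), ("host-2", 8)])

schema = StructType([
    StructField("host", StringType(), nullable=False),
    StructField("cpu_count", IntegerType(), nullable=False)])

# getOrCreate reuses the SQLContext tied to the RDD's SparkContext
sql_context = SQLContext.getOrCreate(rdd.context)
df = sql_context.createDataFrame(rdd, schema)  # omit the schema to let Spark infer it
df.show()
sc.stop()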

Example 9: process_metrics

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
    def process_metrics(transform_context, record_store_df):
        """start processing (aggregating) metrics"""
        #
        # look in record_store_df for list of metrics to be processed
        #
        metric_ids_df = record_store_df.select("metric_id").distinct()
        metric_ids_to_process = [row.metric_id
                                 for row in metric_ids_df.collect()]

        data_driven_specs_repo = DataDrivenSpecsRepoFactory.\
            get_data_driven_specs_repo()
        sqlc = SQLContext.getOrCreate(record_store_df.rdd.context)
        transform_specs_df = data_driven_specs_repo.get_data_driven_specs(
            sql_context=sqlc,
            data_driven_spec_type=DataDrivenSpecsRepo.transform_specs_type)

        for metric_id in metric_ids_to_process:
            transform_spec_df = transform_specs_df.select(
                ["aggregation_params_map", "metric_id"]
            ).where(transform_specs_df.metric_id == metric_id)
            source_record_store_df = record_store_df.select("*").where(
                record_store_df.metric_id == metric_id)

            # set transform_spec_df in TransformContext
            transform_context = \
                TransformContextUtils.get_context(
                    transform_context_info=transform_context,
                    transform_spec_df_info=transform_spec_df)

            try:
                agg_inst_usage_df = (
                    MonMetricsKafkaProcessor.process_metric(
                        transform_context, source_record_store_df))

                # if running in debug mode, write out the aggregated metric
                # name just processed (along with the count of how many of
                # these were aggregated) to the application log.
                if log.isEnabledFor(logging.DEBUG):
                    agg_inst_usage_collection = agg_inst_usage_df.collect()
                    collection_len = len(agg_inst_usage_collection)
                    if collection_len > 0:
                        agg_inst_usage_dict = (
                            agg_inst_usage_collection[0].asDict())
                        log.debug("Submitted pre-hourly aggregated metric: "
                                  "%s (%s)",
                                  agg_inst_usage_dict[
                                      "aggregated_metric_name"],
                                  str(collection_len))
            except FetchQuantityException:
                raise
            except FetchQuantityUtilException:
                raise
            except Exception as e:
                MonMetricsKafkaProcessor.log_debug(
                    "Exception raised in metric processing for metric: " +
                    str(metric_id) + ".  Error: " + str(e))
Author: openstack | Project: monasca-transform | Lines: 58 | Source: mon_metrics_kafka.py

Example 10: do_rate_calculation

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
    def do_rate_calculation(instance_usage_df):
        instance_usage_json_rdd = PreHourlyCalculateRate._calculate_rate(
            instance_usage_df)

        sql_context = SQLContext.getOrCreate(instance_usage_df.rdd.context)
        instance_usage_trans_df = InstanceUsageUtils.create_df_from_json_rdd(
            sql_context,
            instance_usage_json_rdd)

        return instance_usage_trans_df
Author: openstack | Project: monasca-transform | Lines: 12 | Source: pre_hourly_calculate_rate.py

Example 11: __init__

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
    def __init__(self, predictionAndLabels):
        sc = predictionAndLabels.ctx
        sql_ctx = SQLContext.getOrCreate(sc)
        numCol = len(predictionAndLabels.first())
        schema = StructType([
            StructField("prediction", DoubleType(), nullable=False),
            StructField("label", DoubleType(), nullable=False)])
        if numCol == 3:
            schema.add("weight", DoubleType(), False)
        df = sql_ctx.createDataFrame(predictionAndLabels, schema)
        java_class = sc._jvm.org.apache.spark.mllib.evaluation.MulticlassMetrics
        java_model = java_class(df._jdf)
        super(MulticlassMetrics, self).__init__(java_model)
Author: apache | Project: spark | Lines: 15 | Source: evaluation.py
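
A hedged usage sketch of the public MulticlassMetrics API that this constructor backs; the (prediction, label) pairs are invented, and an optional third element per row (a weight) is the case handled by the numCol == 3 branch above.

from pyspark import SparkConf, SparkContext
from pyspark.mllib.evaluation import MulticlassMetrics

sc = SparkContext.getOrCreate(SparkConf().setAppName("multiclass-demo").setMaster("local[2]"))

# RDD of (prediction, label) pairs
predictionAndLabels = sc.parallelize(
    [(0.0, 0.0), (0.0, 1.0), (0.0, 0.0), (1.0, 0.0), (1.0, 1.0),
     (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)])

metrics = MulticlassMetrics(predictionAndLabels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
sc.stop()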

Example 12: pre_hourly_to_instance_usage_df

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
    def pre_hourly_to_instance_usage_df(pre_hourly_rdd):
        """convert raw pre hourly data into instance usage dataframe."""
        #
        # extract second column containing instance usage data
        #
        instance_usage_rdd = pre_hourly_rdd.map(
            lambda iud: iud[1])

        #
        # convert usage data rdd to instance usage df
        #
        sqlc = SQLContext.getOrCreate(pre_hourly_rdd.context)
        instance_usage_df = \
            InstanceUsageUtils.create_df_from_json_rdd(
                sqlc,
                instance_usage_rdd)

        return instance_usage_df
Author: michaelzhd | Project: monasca-transform | Lines: 20 | Source: pre_hourly_processor.py

Example 13: cast

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
    def cast(self, dataType):
        """ Convert the column into type ``dataType``.

        >>> df.select(df.age.cast("string").alias('ages')).collect()
        [Row(ages=u'2'), Row(ages=u'5')]
        >>> df.select(df.age.cast(StringType()).alias('ages')).collect()
        [Row(ages=u'2'), Row(ages=u'5')]
        """
        if isinstance(dataType, basestring):
            jc = self._jc.cast(dataType)
        elif isinstance(dataType, DataType):
            from pyspark.sql import SQLContext
            sc = SparkContext.getOrCreate()
            ctx = SQLContext.getOrCreate(sc)
            jdt = ctx._ssql_ctx.parseDataType(dataType.json())
            jc = self._jc.cast(jdt)
        else:
            raise TypeError("unexpected type: %s" % type(dataType))
        return Column(jc)
Author: 15652101501 | Project: spark | Lines: 21 | Source: column.py
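
As a complement to the excerpt above, a small hypothetical sketch exercising both branches of cast (the type-name string and the DataType object); the DataFrame mirrors the one assumed by the docstring.

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StringType

sc = SparkContext.getOrCreate(SparkConf().setAppName("cast-demo").setMaster("local[2]"))
sql_ctx = SQLContext.getOrCreate(sc)

df = sql_ctx.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
print(df.select(df.age.cast("string").alias("ages")).collect())      # cast via type name
print(df.select(df.age.cast(StringType()).alias("ages")).collect())  # cast via DataType object
sc.stop()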

Example 14: setter

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
    def setter(transform_context, instance_usage_df):
        """set the aggregated metric name field for elements in instance usage rdd"""

        transform_spec_df = transform_context.transform_spec_df_info

        agg_params = transform_spec_df.select(
            "aggregation_params_map.aggregation_period").collect()[0].asDict()

        instance_usage_df_agg_params = instance_usage_df.rdd.map(
            lambda x: InstanceUsageDataAggParams(x, agg_params))

        instance_usage_json_rdd = instance_usage_df_agg_params.map(
            SetAggregatedPeriod._set_aggregated_period)

        sql_context = SQLContext.getOrCreate(instance_usage_df.rdd.context)

        instance_usage_trans_df = InstanceUsageUtils.create_df_from_json_rdd(
            sql_context,
            instance_usage_json_rdd)
        return instance_usage_trans_df
Author: openstack | Project: monasca-transform | Lines: 22 | Source: set_aggregated_period.py

Example 15: test_transform_builder

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
    def test_transform_builder(self,
                               usage_manager,
                               setter_manager,
                               insert_manager):

        usage_manager.return_value = MockComponentManager.get_usage_cmpt_mgr()
        setter_manager.return_value = \
            MockComponentManager.get_setter_cmpt_mgr()
        insert_manager.return_value = \
            MockComponentManager.get_insert_cmpt_mgr()

        record_store_json_path = DataProvider.record_store_path

        metric_proc_json_path = DataProvider.transform_spec_path

        sql_context = SQLContext.getOrCreate(self.spark_context)
        record_store_df = \
            RecordStoreUtils.create_df_from_json(sql_context,
                                                 record_store_json_path)

        transform_spec_df = TransformSpecsUtils.create_df_from_json(
            sql_context, metric_proc_json_path)

        transform_context = TransformContextUtils.get_context(
            transform_spec_df_info=transform_spec_df,
            batch_time_info=self.get_dummy_batch_time())

        # invoke the generic transformation builder
        instance_usage_df = GenericTransformBuilder.do_transform(
            transform_context, record_store_df)

        result_list = [(row.usage_date, row.usage_hour,
                        row.tenant_id, row.host, row.quantity,
                        row.aggregated_metric_name)
                       for row in instance_usage_df.rdd.collect()]

        expected_result = [('2016-02-08', '18', 'all',
                            'all', 12946.0,
                            'mem.total_mb_agg')]

        self.assertItemsEqual(result_list, expected_result)
Author: openstack | Project: monasca-transform | Lines: 43 | Source: test_transform_builder.py


Note: The pyspark.sql.SQLContext.getOrCreate examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are taken from open-source projects contributed by their respective authors, and copyright of the source code remains with those authors; please consult each project's license before distributing or using it. Do not reproduce this article without permission.