This article collects typical usage examples of the Python method pyspark.sql.SQLContext.getOrCreate. If you are wondering what SQLContext.getOrCreate does and how to use it, the curated code samples below may help. You can also explore further usage examples of the containing class, pyspark.sql.SQLContext.
The following presents 15 code examples of SQLContext.getOrCreate, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
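Before the examples, a minimal sketch of the basic pattern (assuming the Spark 1.6/2.x API, where SQLContext.getOrCreate takes an active SparkContext and caches a single instance per Python process; in Spark 3.x SQLContext is deprecated in favor of SparkSession):

from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext.getOrCreate()           # reuse the running SparkContext, or start one
sql_context = SQLContext.getOrCreate(sc)  # reuse the SQLContext bound to it, or create one
df = sql_context.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
df.show()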
Example 1: do_rollup

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
def do_rollup(setter_rollup_group_by_list,
              aggregation_period,
              setter_rollup_operation,
              instance_usage_df):
    # get aggregation period
    group_by_period_list = \
        ComponentUtils._get_instance_group_by_period_list(
            aggregation_period)
    # group by columns list
    group_by_columns_list = group_by_period_list + \
        setter_rollup_group_by_list
    # perform rollup operation
    instance_usage_json_rdd = RollupQuantity._rollup_quantity(
        instance_usage_df,
        group_by_columns_list,
        str(setter_rollup_operation))
    sql_context = SQLContext.getOrCreate(instance_usage_df.rdd.context)
    instance_usage_trans_df = InstanceUsageUtils.create_df_from_json_rdd(
        sql_context,
        instance_usage_json_rdd)
    return instance_usage_trans_df
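Note the recurring idiom above: the SQLContext is recovered from the DataFrame being processed, via instance_usage_df.rdd.context, instead of being threaded through every call. A self-contained sketch of just that idiom (the names here are illustrative, and it relies on getOrCreate caching a single instance, as in Spark 1.6-2.x):

from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext.getOrCreate()
df = SQLContext.getOrCreate(sc).createDataFrame([(1,)], ["n"])
# RDD.context returns the SparkContext that owns the RDD, so the same
# cached SQLContext can be recovered wherever the DataFrame is in scope:
assert SQLContext.getOrCreate(df.rdd.context) is SQLContext.getOrCreate(sc)

Examples 4, 6, 9, 10, 12, and 14 below rely on the same trick.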
Example 2: _java2py

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
def _java2py(sc, r, encoding="bytes"):
    if isinstance(r, JavaObject):
        clsName = r.getClass().getSimpleName()
        # convert RDD into JavaRDD
        if clsName != 'JavaRDD' and clsName.endswith("RDD"):
            r = r.toJavaRDD()
            clsName = 'JavaRDD'
        if clsName == 'JavaRDD':
            jrdd = sc._jvm.org.apache.spark.mllib.api.python.SerDe.javaToPython(r)
            return RDD(jrdd, sc)
        if clsName == 'Dataset':
            return DataFrame(r, SQLContext.getOrCreate(sc))
        if clsName in _picklable_classes:
            r = sc._jvm.org.apache.spark.mllib.api.python.SerDe.dumps(r)
        elif isinstance(r, (JavaArray, JavaList)):
            try:
                r = sc._jvm.org.apache.spark.mllib.api.python.SerDe.dumps(r)
            except Py4JJavaError:
                pass  # not picklable
    if isinstance(r, (bytearray, bytes)):
        r = PickleSerializer().loads(bytes(r), encoding=encoding)
    return r
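This helper appears to match _java2py from pyspark.mllib.common: it dispatches on the runtime class name of the Py4J object, unwrapping RDDs and Datasets directly (SQLContext.getOrCreate(sc) supplies the context the Python DataFrame wrapper needs) and round-tripping anything picklable through the JVM-side SerDe.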
Example 3: __init__

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
def __init__(self, predictionAndLabels):
    sc = predictionAndLabels.ctx
    sql_ctx = SQLContext.getOrCreate(sc)
    df = sql_ctx.createDataFrame(predictionAndLabels,
                                 schema=sql_ctx._inferSchema(predictionAndLabels))
    java_model = callMLlibFunc("newRankingMetrics", df._jdf)
    super(RankingMetrics, self).__init__(java_model)
Example 4: process_metrics

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
def process_metrics(transform_context, record_store_df):
    """start processing (aggregating) metrics
    """
    #
    # look in record_store_df for list of metrics to be processed
    #
    metric_ids_df = record_store_df.select("metric_id").distinct()
    metric_ids_to_process = [row.metric_id
                             for row in metric_ids_df.collect()]
    data_driven_specs_repo = DataDrivenSpecsRepoFactory.\
        get_data_driven_specs_repo()
    sqlc = SQLContext.getOrCreate(record_store_df.rdd.context)
    transform_specs_df = data_driven_specs_repo.get_data_driven_specs(
        sql_context=sqlc,
        data_driven_spec_type=DataDrivenSpecsRepo.transform_specs_type)
    for metric_id in metric_ids_to_process:
        transform_spec_df = transform_specs_df.select(
            ["aggregation_params_map", "metric_id"]
        ).where(transform_specs_df.metric_id == metric_id)
        source_record_store_df = record_store_df.select("*").where(
            record_store_df.metric_id == metric_id)
        # set transform_spec_df in TransformContext
        transform_context = \
            TransformContextUtils.get_context(
                transform_context_info=transform_context,
                transform_spec_df_info=transform_spec_df)
        MonMetricsKafkaProcessor.process_metric(
            transform_context, source_record_store_df)
Example 5: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
def main():
    """Run the belief propagation algorithm for an example problem."""
    # setup context
    conf = SparkConf().setAppName("BeliefPropagation example")
    sc = SparkContext.getOrCreate(conf)
    sql = SQLContext.getOrCreate(sc)
    with SuppressSparkLogs(sc):
        # create graphical model g of size 3 x 3
        g = graphframes.examples.Graphs(sql).gridIsingModel(3)
        print("Original Ising model:")
        g.vertices.show()
        g.edges.show()
        # run BP for 5 iterations
        numIter = 5
        results = BeliefPropagation.runBPwithGraphFrames(g, numIter)
        # display beliefs
        beliefs = results.vertices.select('id', 'belief')
        print("Done with BP. Final beliefs after {} iterations:".format(numIter))
        beliefs.show()
    sc.stop()
Example 6: setter_by_operation

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
def setter_by_operation(transform_context, instance_usage_df,
                        setter_rollup_operation):
    transform_spec_df = transform_context.transform_spec_df_info
    # get fields we want to group by for a rollup
    agg_params = transform_spec_df.select(
        "aggregation_params_map.setter_rollup_group_by_list"). \
        collect()[0].asDict()
    setter_rollup_group_by_list = agg_params["setter_rollup_group_by_list"]
    # get aggregation period
    agg_params = transform_spec_df.select(
        "aggregation_params_map.aggregation_period").collect()[0].asDict()
    aggregation_period = agg_params["aggregation_period"]
    group_by_period_list = \
        ComponentUtils._get_instance_group_by_period_list(
            aggregation_period)
    # group by columns list
    group_by_columns_list = \
        group_by_period_list + setter_rollup_group_by_list
    # perform rollup operation
    instance_usage_json_rdd = RollupQuantity._rollup_quantity(
        instance_usage_df,
        group_by_columns_list,
        str(setter_rollup_operation))
    sql_context = SQLContext.getOrCreate(instance_usage_df.rdd.context)
    instance_usage_trans_df = InstanceUsageUtils.create_df_from_json_rdd(
        sql_context,
        instance_usage_json_rdd)
    return instance_usage_trans_df
Example 7: _test

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
def _test():
    import doctest
    import os
    import tempfile
    from pyspark.sql import Row, SparkSession, SQLContext
    from pyspark.sql.types import StructType, StructField, StringType
    import pyspark.sql.streaming
    os.chdir(os.environ["SPARK_HOME"])
    globs = pyspark.sql.streaming.__dict__.copy()
    try:
        spark = SparkSession.builder.getOrCreate()
    except py4j.protocol.Py4JError:
        # NOTE: `py4j` and `sc` are not defined inside this function; this
        # fallback branch only works if they exist at module scope.
        spark = SparkSession(sc)
    globs['tempfile'] = tempfile
    globs['os'] = os
    globs['spark'] = spark
    globs['sqlContext'] = SQLContext.getOrCreate(spark.sparkContext)
    globs['sdf'] = \
        spark.readStream.format('text').load('python/test_support/sql/streaming')
    globs['sdf_schema'] = StructType([StructField("data", StringType(), False)])
    globs['df'] = \
        globs['spark'].readStream.format('text').load('python/test_support/sql/streaming')
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.streaming, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    globs['spark'].stop()
    if failure_count:
        exit(-1)
Example 8: _rdd_to_df

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
def _rdd_to_df(rdd, schema):
    """convert rdd to dataframe using schema."""
    spark_context = rdd.context
    sql_context = SQLContext.getOrCreate(spark_context)
    if schema is None:
        df = sql_context.createDataFrame(rdd)
    else:
        df = sql_context.createDataFrame(rdd, schema)
    return df
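A hypothetical call site for the helper above (a sketch assuming an active SparkContext named sc; a plain list of column names stands in for a full StructType schema):

rdd = sc.parallelize([("alice", 1), ("bob", 2)])
df_inferred = _rdd_to_df(rdd, None)            # schema inferred by sampling the rows
df_named = _rdd_to_df(rdd, ["name", "count"])  # or pass explicit column names
df_named.show()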
Example 9: process_metrics

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
def process_metrics(transform_context, record_store_df):
    """start processing (aggregating) metrics"""
    #
    # look in record_store_df for list of metrics to be processed
    #
    metric_ids_df = record_store_df.select("metric_id").distinct()
    metric_ids_to_process = [row.metric_id
                             for row in metric_ids_df.collect()]
    data_driven_specs_repo = DataDrivenSpecsRepoFactory.\
        get_data_driven_specs_repo()
    sqlc = SQLContext.getOrCreate(record_store_df.rdd.context)
    transform_specs_df = data_driven_specs_repo.get_data_driven_specs(
        sql_context=sqlc,
        data_driven_spec_type=DataDrivenSpecsRepo.transform_specs_type)
    for metric_id in metric_ids_to_process:
        transform_spec_df = transform_specs_df.select(
            ["aggregation_params_map", "metric_id"]
        ).where(transform_specs_df.metric_id == metric_id)
        source_record_store_df = record_store_df.select("*").where(
            record_store_df.metric_id == metric_id)
        # set transform_spec_df in TransformContext
        transform_context = \
            TransformContextUtils.get_context(
                transform_context_info=transform_context,
                transform_spec_df_info=transform_spec_df)
        try:
            agg_inst_usage_df = (
                MonMetricsKafkaProcessor.process_metric(
                    transform_context, source_record_store_df))
            # if running in debug mode, write out the aggregated metric
            # name just processed (along with the count of how many of
            # these were aggregated) to the application log.
            if log.isEnabledFor(logging.DEBUG):
                agg_inst_usage_collection = agg_inst_usage_df.collect()
                collection_len = len(agg_inst_usage_collection)
                if collection_len > 0:
                    agg_inst_usage_dict = (
                        agg_inst_usage_collection[0].asDict())
                    log.debug("Submitted pre-hourly aggregated metric: "
                              "%s (%s)",
                              agg_inst_usage_dict[
                                  "aggregated_metric_name"],
                              str(collection_len))
        except FetchQuantityException:
            raise
        except FetchQuantityUtilException:
            raise
        except Exception as e:
            MonMetricsKafkaProcessor.log_debug(
                "Exception raised in metric processing for metric: " +
                str(metric_id) + ". Error: " + str(e))
Example 10: do_rate_calculation

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
def do_rate_calculation(instance_usage_df):
    instance_usage_json_rdd = PreHourlyCalculateRate._calculate_rate(
        instance_usage_df)
    sql_context = SQLContext.getOrCreate(instance_usage_df.rdd.context)
    instance_usage_trans_df = InstanceUsageUtils.create_df_from_json_rdd(
        sql_context,
        instance_usage_json_rdd)
    return instance_usage_trans_df
Example 11: __init__

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
def __init__(self, predictionAndLabels):
    sc = predictionAndLabels.ctx
    sql_ctx = SQLContext.getOrCreate(sc)
    numCol = len(predictionAndLabels.first())
    schema = StructType([
        StructField("prediction", DoubleType(), nullable=False),
        StructField("label", DoubleType(), nullable=False)])
    if numCol == 3:
        schema.add("weight", DoubleType(), False)
    df = sql_ctx.createDataFrame(predictionAndLabels, schema)
    java_class = sc._jvm.org.apache.spark.mllib.evaluation.MulticlassMetrics
    java_model = java_class(df._jdf)
    super(MulticlassMetrics, self).__init__(java_model)
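For comparison, a minimal use of this constructor through the public API (a sketch assuming an active SparkContext sc; the two-column form, without the optional weight column):

from pyspark.mllib.evaluation import MulticlassMetrics

# (prediction, label) pairs as plain floats
predictionAndLabels = sc.parallelize(
    [(0.0, 0.0), (1.0, 1.0), (1.0, 0.0), (0.0, 0.0)])
metrics = MulticlassMetrics(predictionAndLabels)
print(metrics.accuracy)           # fraction of correct predictions
print(metrics.confusionMatrix())  # matrix of label-vs-prediction counts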
Example 12: pre_hourly_to_instance_usage_df

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
def pre_hourly_to_instance_usage_df(pre_hourly_rdd):
    """convert raw pre hourly data into instance usage dataframe."""
    #
    # extract second column containing instance usage data
    #
    instance_usage_rdd = pre_hourly_rdd.map(
        lambda iud: iud[1])
    #
    # convert usage data rdd to instance usage df
    #
    sqlc = SQLContext.getOrCreate(pre_hourly_rdd.context)
    instance_usage_df = \
        InstanceUsageUtils.create_df_from_json_rdd(
            sqlc,
            instance_usage_rdd)
    return instance_usage_df
Example 13: cast

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
def cast(self, dataType):
    """ Convert the column into type ``dataType``.

    >>> df.select(df.age.cast("string").alias('ages')).collect()
    [Row(ages=u'2'), Row(ages=u'5')]
    >>> df.select(df.age.cast(StringType()).alias('ages')).collect()
    [Row(ages=u'2'), Row(ages=u'5')]
    """
    if isinstance(dataType, basestring):
        jc = self._jc.cast(dataType)
    elif isinstance(dataType, DataType):
        from pyspark.sql import SQLContext
        sc = SparkContext.getOrCreate()
        ctx = SQLContext.getOrCreate(sc)
        jdt = ctx._ssql_ctx.parseDataType(dataType.json())
        jc = self._jc.cast(jdt)
    else:
        raise TypeError("unexpected type: %s" % type(dataType))
    return Column(jc)
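The basestring check and the u'' literals in the doctest mark this as Python 2-era PySpark source. Through the public API, the same two conversions look like this (a sketch assuming a DataFrame df with a numeric age column, as in the docstring above):

from pyspark.sql.types import StringType

ages_by_name = df.select(df.age.cast("string").alias("ages"))       # type given by name
ages_by_type = df.select(df.age.cast(StringType()).alias("ages"))   # DataType instance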
Example 14: setter

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
def setter(transform_context, instance_usage_df):
    """set the aggregated metric name field for elements in instance usage rdd"""
    transform_spec_df = transform_context.transform_spec_df_info
    agg_params = transform_spec_df.select(
        "aggregation_params_map.aggregation_period").collect()[0].asDict()
    instance_usage_df_agg_params = instance_usage_df.rdd.map(
        lambda x: InstanceUsageDataAggParams(x, agg_params))
    instance_usage_json_rdd = instance_usage_df_agg_params.map(
        SetAggregatedPeriod._set_aggregated_period)
    sql_context = SQLContext.getOrCreate(instance_usage_df.rdd.context)
    instance_usage_trans_df = InstanceUsageUtils.create_df_from_json_rdd(
        sql_context,
        instance_usage_json_rdd)
    return instance_usage_trans_df
Example 15: test_transform_builder

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import getOrCreate [as alias]
def test_transform_builder(self,
                           usage_manager,
                           setter_manager,
                           insert_manager):
    usage_manager.return_value = MockComponentManager.get_usage_cmpt_mgr()
    setter_manager.return_value = \
        MockComponentManager.get_setter_cmpt_mgr()
    insert_manager.return_value = \
        MockComponentManager.get_insert_cmpt_mgr()
    record_store_json_path = DataProvider.record_store_path
    metric_proc_json_path = DataProvider.transform_spec_path
    sql_context = SQLContext.getOrCreate(self.spark_context)
    record_store_df = \
        RecordStoreUtils.create_df_from_json(sql_context,
                                             record_store_json_path)
    transform_spec_df = TransformSpecsUtils.create_df_from_json(
        sql_context, metric_proc_json_path)
    transform_context = TransformContextUtils.get_context(
        transform_spec_df_info=transform_spec_df,
        batch_time_info=self.get_dummy_batch_time())
    # invoke the generic transformation builder
    instance_usage_df = GenericTransformBuilder.do_transform(
        transform_context, record_store_df)
    result_list = [(row.usage_date, row.usage_hour,
                    row.tenant_id, row.host, row.quantity,
                    row.aggregated_metric_name)
                   for row in instance_usage_df.rdd.collect()]
    expected_result = [('2016-02-08', '18', 'all',
                        'all', 12946.0,
                        'mem.total_mb_agg')]
    self.assertItemsEqual(result_list, expected_result)