This article collects typical usage examples of the pyspark.sql.SQLContext class in Python. If you have been wondering how sql.SQLContext is used in practice, how to call it, or what working examples look like, the curated code samples below should help. You can also explore further usage examples from the containing module, pyspark.sql.
A total of 15 code examples of sql.SQLContext are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
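
Most of the snippets below follow the same basic pattern: wrap an existing SparkContext in a SQLContext, build a DataFrame, and query it. Here is a minimal self-contained sketch of that pattern (the app name and sample data are placeholders, not taken from the examples):

from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext('local[*]', 'sqlcontext-demo')   # placeholder app name
sqlc = SQLContext(sc)

df = sqlc.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'label'])
df.createOrReplaceTempView('demo')
sqlc.sql('SELECT count(*) AS n FROM demo').show()

sc.stop()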

Example 1: summarizeOutput

# Requires: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def summarizeOutput(self):
    s = SQLContext(self.sc)
    res = s.read.parquet(self.cclinks.output)

    # Overall counts: total rows and distinct content keys
    totalLinks = res.count()
    uniqueContentQuery = res.drop_duplicates(subset=['provider_domain', 'content_path', 'content_query_string']).count()
    uniqueContent = res.drop_duplicates(subset=['provider_domain', 'content_path']).count()

    # Per-domain summary via Spark SQL, written out as CSV
    res.registerTempTable('test_deeds')
    summary = s.sql('SELECT provider_domain, count(*) AS total, count(distinct content_path) AS unique_content_path, count(distinct content_query_string) AS unique_query_string FROM test_deeds GROUP BY provider_domain ORDER BY total DESC LIMIT 100')
    summary.write.mode('overwrite').format('csv').option('header', 'true').save(self.cclinks.output.replace('parquet', 'summary'))

    fh = open('{}/total'.format(self.cclinks.output.replace('parquet', 'summary')), 'w')
    fh.write('Total records: {}\r\n'.format(totalLinks))
    fh.write('Total unique content path: {}\r\n'.format(uniqueContent))
    fh.write('Total unique query strings: {}\r\n'.format(uniqueContentQuery))
    fh.close()
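
Note that registerTempTable is deprecated as of Spark 2.0; on newer versions the equivalent call on the same DataFrame would be:

res.createOrReplaceTempView('test_deeds')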

Example 2: main

# Requires: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    sc = SparkContext()
    sql = SQLContext(sc)
    data = sql.read.parquet(data_path)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = SparkLFApplier(lfs)
    L = applier.apply(data.rdd)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    y_prob_sql_array = F.array([F.lit(y) for y in y_prob])
    data_labeled = data.withColumn("y_prob", y_prob_sql_array)
    data_labeled.write.mode("overwrite").parquet(output_path)
    logging.info(f"Labels saved to {output_path}")

Example 3: test_lf_applier_spark_preprocessor_memoized

# Requires: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def test_lf_applier_spark_preprocessor_memoized(self) -> None:
    sc = SparkContext.getOrCreate()
    sql = SQLContext(sc)

    @preprocessor(memoize=True)
    def square_memoize(x: DataPoint) -> DataPoint:
        return Row(num=x.num, num_squared=x.num ** 2)

    @labeling_function(pre=[square_memoize])
    def fp_memoized(x: DataPoint) -> int:
        return 0 if x.num_squared > 42 else -1

    df = pd.DataFrame(dict(num=DATA))
    rdd = sql.createDataFrame(df).rdd

    applier = SparkLFApplier([f, fp_memoized])
    L = applier.apply(rdd)
    np.testing.assert_equal(L, L_PREPROCESS_EXPECTED)

Example 4: init_spark_session

# Requires: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def init_spark_session(app_name):
    """ Initializes a SparkSession with the given application name.

        Args:
            app_name (str): Name of the Spark application. This will also appear in the Spark UI.
    """
    global session, context, sql_context
    try:
        session = SparkSession \
            .builder \
            .appName(app_name) \
            .config("spark.hadoop.dfs.client.use.datanode.hostname", "true") \
            .config("spark.hadoop.dfs.datanode.use.datanode.hostname", "true") \
            .config("spark.driver.maxResultSize", "4g") \
            .getOrCreate()
        context = session.sparkContext
        context.setLogLevel("ERROR")
        sql_context = SQLContext(context)
    except Py4JJavaError as err:
        raise SparkSessionNotInitializedException(app_name, err.java_exception)
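
A minimal sketch of how this helper might be called from the rest of the module; it assumes the module-level sql_context global set above is accessible after the call, and the input path is a placeholder:

init_spark_session("my-spark-app")
df = sql_context.read.json("/data/events")   # placeholder path
df.printSchema()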

Example 5: run

# Requires: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def run(self):
    self.args = self.parse_arguments()

    conf = SparkConf()
    if self.args.spark_profiler:
        conf = conf.set("spark.python.profile", "true")

    sc = SparkContext(
        appName=self.name,
        conf=conf)
    sqlc = SQLContext(sparkContext=sc)

    self.init_accumulators(sc)
    self.run_job(sc, sqlc)

    if self.args.spark_profiler:
        sc.show_profiles()

    sc.stop()

Example 6: setup_env

# Requires: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def setup_env(cls):
    # Shared test environment: local SparkContext, SQLContext, and SparkSession
    cls.sc = SparkContext('local[*]', cls.__name__)
    cls.sql = SQLContext(cls.sc)
    cls.session = SparkSession.builder.getOrCreate()

Example 7: get_sqlcontext_instance

# Requires: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def get_sqlcontext_instance(spark_context):
    """
    :type spark_context: pyspark.SparkContext
    :param spark_context: The currently active SparkContext
    :return: Returns the SQLContext
    :rtype: sql.SQLContext
    """
    if 'sqlContextSingletonInstance' not in globals():
        globals()['sqlContextSingletonInstance'] = sql.SQLContext(spark_context)
    return globals()['sqlContextSingletonInstance']
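
This lazily created singleton is the pattern commonly used inside Spark Streaming foreachRDD callbacks, where the SQLContext should be reused across micro-batches rather than recreated per batch. A hypothetical usage sketch (the DStream named stream is an assumption):

def process(time, rdd):
    if rdd.isEmpty():
        return
    sqlc = get_sqlcontext_instance(rdd.context)   # reuse the singleton SQLContext
    words = sqlc.createDataFrame(rdd.map(lambda w: (w,)), ['word'])
    words.createOrReplaceTempView('words')
    sqlc.sql('SELECT word, count(*) AS n FROM words GROUP BY word').show()

# stream.foreachRDD(process)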

Example 8: sql_context

# Requires: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def sql_context(request, spark_context):
    return SQLContext(spark_context)

Example 9: setUp

# Requires: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def setUp(self):
    self.sc = SparkContext()
    self.sqlContext = SQLContext(self.sc)

Example 10: to_data_frame

# Requires: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def to_data_frame(sc, features, labels, categorical=False):
    """Convert numpy arrays of features and labels into a Spark DataFrame.
    """
    lp_rdd = to_labeled_point(sc, features, labels, categorical)
    sql_context = SQLContext(sc)
    df = sql_context.createDataFrame(lp_rdd)
    return df
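
A minimal usage sketch, assuming to_labeled_point (defined in the same module, not shown here) converts the arrays into an RDD of labeled points; the data below is synthetic:

import numpy as np
from pyspark import SparkContext

sc = SparkContext('local[*]', 'to-data-frame-demo')
features = np.random.rand(100, 4)            # 100 samples, 4 features
labels = np.random.randint(0, 2, size=100)   # binary labels
df = to_data_frame(sc, features, labels, categorical=False)
df.show(5)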

Example 11: sql_context

# Requires: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def sql_context(request):
    """ Fixture for creating a Spark SQLContext.

        Args:
            request: pytest.FixtureRequest object
    """
    conf = (SparkConf().setMaster("local[2]").setAppName(
        "pytest-pyspark-local-testing"))
    sc = SparkContext(conf=conf)
    sql_context = SQLContext(sc)
    request.addfinalizer(lambda: sc.stop())

    quiet_py4j()
    return sql_context
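
A hypothetical test consuming this fixture (the fixture itself would normally carry a @pytest.fixture(scope="session") decorator in conftest.py, which the snippet omits):

def test_word_counts(sql_context):
    df = sql_context.createDataFrame([('spark',), ('spark',), ('sql',)], ['word'])
    counts = {row['word']: row['count'] for row in df.groupBy('word').count().collect()}
    assert counts == {'spark': 2, 'sql': 1}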

Example 12: test_lf_applier_spark

# Requires: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def test_lf_applier_spark(self) -> None:
    sc = SparkContext.getOrCreate()
    sql = SQLContext(sc)
    df = pd.DataFrame(dict(num=DATA))
    rdd = sql.createDataFrame(df).rdd

    applier = SparkLFApplier([f, g])
    L = applier.apply(rdd)
    np.testing.assert_equal(L, L_EXPECTED)

Example 13: test_lf_applier_spark_fault

# Requires: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def test_lf_applier_spark_fault(self) -> None:
    sc = SparkContext.getOrCreate()
    sql = SQLContext(sc)
    df = pd.DataFrame(dict(num=DATA))
    rdd = sql.createDataFrame(df).rdd

    applier = SparkLFApplier([f, f_bad])
    with self.assertRaises(Exception):
        applier.apply(rdd)
    L = applier.apply(rdd, fault_tolerant=True)
    np.testing.assert_equal(L, L_EXPECTED_BAD)

Example 14: test_lf_applier_spark_preprocessor

# Requires: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def test_lf_applier_spark_preprocessor(self) -> None:
    sc = SparkContext.getOrCreate()
    sql = SQLContext(sc)
    df = pd.DataFrame(dict(num=DATA))
    rdd = sql.createDataFrame(df).rdd

    applier = SparkLFApplier([f, fp])
    L = applier.apply(rdd)
    np.testing.assert_equal(L, L_PREPROCESS_EXPECTED)

Example 15: main

# Requires: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def main():
    # Read training and test data as DataFrames
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # No parameter tuning
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(stages=[tokenizer, hashingTF_notuning, lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)
    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)

    # Cross validation over feature size and regularization strength
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)
    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000]) \
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]) \
        .build()
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=2)
    cvModel = cv.fit(trainDF)

    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)

    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output)
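
This example relies on module-level names (sc, training_input, testing_input, output) and ML imports defined outside the snippet. One plausible setup, shown only as an assumption with placeholder paths, would be:

from pyspark import SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

sc = SparkContext(appName='sentiment-pipeline')
training_input = 'hdfs:///data/train.parquet'   # placeholder paths
testing_input = 'hdfs:///data/test.parquet'
output = 'hdfs:///data/metrics'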