

Python sql.SQLContext Usage Examples

This article collects typical usage examples of pyspark.sql.SQLContext in Python. If you are wondering what sql.SQLContext does, how to use it, or what it looks like in real code, the curated examples below may help. You can also explore further usage examples from pyspark.sql, the module in which it is defined.


The following shows 15 code examples of sql.SQLContext, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
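Before diving into the examples, here is a minimal, self-contained sketch of the pattern they all share: wrap an existing SparkContext in a SQLContext, build a DataFrame, and query it with SQL. The sample data and table name below are illustrative assumptions, not taken from any of the projects cited in the examples.

from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext.getOrCreate()    # reuse the running SparkContext, or start a local one
sqlc = SQLContext(sc)              # SQLContext wraps the SparkContext

# Build a small DataFrame and run a SQL query against it.
df = sqlc.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])
df.createOrReplaceTempView("items")    # older code (e.g. Example 1 below) uses registerTempTable
sqlc.sql("SELECT count(*) AS n FROM items").show()

Note that on Spark 2.x and later, SparkSession is the preferred entry point and SQLContext is kept for backward compatibility, which is why several examples below obtain it from an existing SparkSession or SparkContext.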

Example 1: summarizeOutput

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def summarizeOutput(self):
        s   = SQLContext(self.sc)
        res = s.read.parquet(self.cclinks.output)

        totalLinks  = res.count()
        uniqueContentQuery = res.drop_duplicates(subset=['provider_domain', 'content_path', 'content_query_string']).count()
        uniqueContent = res.drop_duplicates(subset=['provider_domain', 'content_path']).count()


        res.registerTempTable('test_deeds')
        summary = s.sql('SELECT provider_domain, count(*) AS total, count(distinct content_path) AS unique_content_path, count(distinct content_query_string) AS unique_query_string FROM test_deeds GROUP BY provider_domain ORDER BY total DESC LIMIT 100')
        summary.write.mode('overwrite').format('csv').option('header', 'true').save(self.cclinks.output.replace('parquet', 'summary'))

        fh = open('{}/total'.format(self.cclinks.output.replace('parquet', 'summary')), 'w')
        fh.write('Total records: {}\r\n'.format(totalLinks))
        fh.write('Total unique content path: {}\r\n'.format(uniqueContent))
        fh.write('Total unique query strings: {}\r\n'.format(uniqueContentQuery))
        fh.close() 
Developer: creativecommons, Project: cccatalog, Lines of code: 20, Source file: test_ExtractCCLinks.py

Example 2: main

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    sc = SparkContext()
    sql = SQLContext(sc)
    data = sql.read.parquet(data_path)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = SparkLFApplier(lfs)
    L = applier.apply(data.rdd)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    y_prob_sql_array = F.array([F.lit(y) for y in y_prob])
    data_labeled = data.withColumn("y_prob", y_prob_sql_array)
    data_labeled.write.mode("overwrite").parquet(output_path)
    logging.info(f"Labels saved to {output_path}") 
Developer: snorkel-team, Project: snorkel-tutorials, Lines of code: 27, Source file: drybell_spark.py

Example 3: test_lf_applier_spark_preprocessor_memoized

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def test_lf_applier_spark_preprocessor_memoized(self) -> None:
        sc = SparkContext.getOrCreate()
        sql = SQLContext(sc)

        @preprocessor(memoize=True)
        def square_memoize(x: DataPoint) -> DataPoint:
            return Row(num=x.num, num_squared=x.num ** 2)

        @labeling_function(pre=[square_memoize])
        def fp_memoized(x: DataPoint) -> int:
            return 0 if x.num_squared > 42 else -1

        df = pd.DataFrame(dict(num=DATA))
        rdd = sql.createDataFrame(df).rdd
        applier = SparkLFApplier([f, fp_memoized])
        L = applier.apply(rdd)
        np.testing.assert_equal(L, L_PREPROCESS_EXPECTED) 
Developer: snorkel-team, Project: snorkel, Lines of code: 19, Source file: test_spark.py

Example 4: init_spark_session

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def init_spark_session(app_name):
    """ Initializes a Spark Session with the given application name.

        Args:
            app_name (str): Name of the Spark application. This will also occur in the Spark UI.
    """
    global session, context, sql_context
    try:
        session = SparkSession \
                .builder \
                .appName(app_name) \
                .config("spark.hadoop.dfs.client.use.datanode.hostname", "true") \
                .config("spark.hadoop.dfs.datanode.use.datanode.hostname", "true") \
                .config("spark.driver.maxResultSize", "4g") \
                .getOrCreate()
        context = session.sparkContext
        context.setLogLevel("ERROR")
        sql_context = SQLContext(context)
    except Py4JJavaError as err:
        raise SparkSessionNotInitializedException(app_name, err.java_exception) 
Developer: metabrainz, Project: listenbrainz-server, Lines of code: 22, Source file: __init__.py

Example 5: run

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def run(self):
        self.args = self.parse_arguments()

        conf = SparkConf()

        if self.args.spark_profiler:
            conf = conf.set("spark.python.profile", "true")

        sc = SparkContext(
            appName=self.name,
            conf=conf)
        sqlc = SQLContext(sparkContext=sc)

        self.init_accumulators(sc)

        self.run_job(sc, sqlc)

        if self.args.spark_profiler:
            sc.show_profiles()

        sc.stop() 
Developer: commoncrawl, Project: cc-pyspark, Lines of code: 23, Source file: sparkcc.py

Example 6: setup_env

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def setup_env(cls):
        cls.sc = SparkContext('local[*]', cls.__name__)
        cls.sql = SQLContext(cls.sc)
        cls.session = SparkSession.builder.getOrCreate() 
Developer: databricks, Project: spark-deep-learning, Lines of code: 6, Source file: tests.py

Example 7: get_sqlcontext_instance

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def get_sqlcontext_instance(spark_context):
    """
    :type spark_context: pyspark.SparkContext
    :param spark_context: The currently active Spark Context
    :return: Returns the SQLContext
    :rtype: sql.SQLContext
    """
    if 'sqlContextSingletonInstance' not in globals():
        globals()['sqlContextSingletonInstance'] = sql.SQLContext(
            spark_context)
    return globals()['sqlContextSingletonInstance'] 
Developer: openstack, Project: monasca-analytics, Lines of code: 13, Source file: streaming_context.py

Example 8: sql_context

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def sql_context(request, spark_context):
    return SQLContext(spark_context) 
Developer: creativecommons, Project: cccatalog, Lines of code: 4, Source file: test_deeds.py

Example 9: setUp

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def setUp(self):
		self.sc = SparkContext()
		self.sqlContext = SQLContext(self.sc) 
Developer: jpmml, Project: pyspark2pmml, Lines of code: 5, Source file: __init__.py

Example 10: to_data_frame

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def to_data_frame(sc, features, labels, categorical=False):
    """Convert numpy arrays of features and labels into Spark DataFrame
    """
    lp_rdd = to_labeled_point(sc, features, labels, categorical)
    sql_context = SQLContext(sc)
    df = sql_context.createDataFrame(lp_rdd)
    return df 
Developer: maxpumperla, Project: elephas, Lines of code: 9, Source file: adapter.py

Example 11: sql_context

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def sql_context(request):
    """ fixture for creating a Spark SQLContext
    Args:
        request: pytest.FixtureRequest object
    """
    conf = (SparkConf().setMaster("local[2]").setAppName(
        "pytest-pyspark-local-testing"))
    sc = SparkContext(conf=conf)
    sql_context = SQLContext(sc)
    request.addfinalizer(lambda: sc.stop())

    quiet_py4j()
    return sql_context 
Developer: maxpumperla, Project: elephas, Lines of code: 15, Source file: conftest.py

Example 12: test_lf_applier_spark

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def test_lf_applier_spark(self) -> None:
        sc = SparkContext.getOrCreate()
        sql = SQLContext(sc)
        df = pd.DataFrame(dict(num=DATA))
        rdd = sql.createDataFrame(df).rdd
        applier = SparkLFApplier([f, g])
        L = applier.apply(rdd)
        np.testing.assert_equal(L, L_EXPECTED) 
Developer: snorkel-team, Project: snorkel, Lines of code: 10, Source file: test_spark.py

Example 13: test_lf_applier_spark_fault

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def test_lf_applier_spark_fault(self) -> None:
        sc = SparkContext.getOrCreate()
        sql = SQLContext(sc)
        df = pd.DataFrame(dict(num=DATA))
        rdd = sql.createDataFrame(df).rdd
        applier = SparkLFApplier([f, f_bad])
        with self.assertRaises(Exception):
            applier.apply(rdd)
        L = applier.apply(rdd, fault_tolerant=True)
        np.testing.assert_equal(L, L_EXPECTED_BAD) 
Developer: snorkel-team, Project: snorkel, Lines of code: 12, Source file: test_spark.py

Example 14: test_lf_applier_spark_preprocessor

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def test_lf_applier_spark_preprocessor(self) -> None:
        sc = SparkContext.getOrCreate()
        sql = SQLContext(sc)
        df = pd.DataFrame(dict(num=DATA))
        rdd = sql.createDataFrame(df).rdd
        applier = SparkLFApplier([f, fp])
        L = applier.apply(rdd)
        np.testing.assert_equal(L, L_PREPROCESS_EXPECTED) 
Developer: snorkel-team, Project: snorkel, Lines of code: 10, Source file: test_spark.py

Example 15: main

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import SQLContext [as alias]
def main():
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # no parameter tuning
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(stages=[tokenizer, hashingTF_notuning, lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)

    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)

    # for cross validation
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)

    paramGrid = ParamGridBuilder()\
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])\
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\
        .build()

    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=2)
    cvModel = cv.fit(trainDF)

    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)

    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output) 
Developer: hanhanwu, Project: Hanhan-Spark-Python, Lines of code: 40, Source file: spark_ml_pipline.py


Note: The pyspark.sql.SQLContext examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers, and copyright of the source code remains with the original authors. For distribution and use, please refer to the License of the corresponding project. Do not reproduce without permission.