

Python pyspark.sql Code Examples

This article collects typical usage examples of pyspark.sql in Python. If you are wondering how pyspark.sql is used in practice, the curated examples below may help. You can also explore further usage examples from the pyspark package itself.


The following presents 15 code examples of pyspark.sql, ordered by popularity by default.
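Most of the snippets below assume a SparkSession or SparkContext is already available. A minimal setup sketch, assuming a local Spark installation (the application name is just a placeholder):

from pyspark.sql import SparkSession

# Create or reuse a local SparkSession; "pyspark-sql-examples" is an illustrative name
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("pyspark-sql-examples") \
    .getOrCreate()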

Example 1: summarizeOutput

# Required import: import pyspark [as alias]
# Or: from pyspark import sql [as alias]
def summarizeOutput(self):
        from pyspark.sql import SQLContext  # needed by this snippet; self.sc is the SparkContext

        s   = SQLContext(self.sc)
        res = s.read.parquet(self.cclinks.output)

        totalLinks  = res.count()
        uniqueContentQuery = res.drop_duplicates(subset=['provider_domain', 'content_path', 'content_query_string']).count()
        uniqueContent = res.drop_duplicates(subset=['provider_domain', 'content_path']).count()


        res.registerTempTable('test_deeds')
        summary = s.sql('SELECT provider_domain, count(*) AS total, count(distinct content_path) AS unique_content_path, count(distinct content_query_string) AS unique_query_string FROM test_deeds GROUP BY provider_domain ORDER BY total DESC LIMIT 100')
        summary.write.mode('overwrite').format('csv').option('header', 'true').save(self.cclinks.output.replace('parquet', 'summary'))

        fh = open('{}/total'.format(self.cclinks.output.replace('parquet', 'summary')), 'w')
        fh.write('Total records: {}\r\n'.format(totalLinks))
        fh.write('Total unique content path: {}\r\n'.format(uniqueContent))
        fh.write('Total unique query strings: {}\r\n'.format(uniqueContentQuery))
        fh.close() 
Author: creativecommons | Project: cccatalog | Lines: 20 | Source: test_ExtractCCLinks.py
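The snippet above uses the older SQLContext/registerTempTable API. A minimal sketch of the same read-register-aggregate pattern with the current SparkSession API (the paths are placeholders):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
res = spark.read.parquet('links.parquet')            # placeholder input path
res.createOrReplaceTempView('test_deeds')            # modern replacement for registerTempTable
summary = spark.sql(
    'SELECT provider_domain, count(*) AS total '
    'FROM test_deeds GROUP BY provider_domain ORDER BY total DESC'
)
summary.write.mode('overwrite').option('header', 'true').csv('links.summary')   # placeholder output path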

Example 2: as_spark_schema

# Required import: import pyspark [as alias]
# Or: from pyspark import sql [as alias]
def as_spark_schema(self):
        """Returns an object derived from the unischema as spark schema.

        Example:

        >>> spark.createDataFrame(dataset_rows,
        >>>                       SomeSchema.as_spark_schema())
        """
        # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
        # (currently works only with make_batch_reader)
        import pyspark.sql.types as sql_types

        schema_entries = []
        for field in self._fields.values():
            spark_type = _field_spark_dtype(field)
            schema_entries.append(sql_types.StructField(field.name, spark_type, field.nullable))

        return sql_types.StructType(schema_entries) 
Author: uber | Project: petastorm | Lines: 20 | Source: unischema.py
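as_spark_schema builds a pyspark.sql.types.StructType from the Unischema fields; for comparison, a minimal hand-written schema sketch with illustrative field names:

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

spark = SparkSession.builder.getOrCreate()
schema = StructType([
    StructField('id', IntegerType(), False),
    StructField('name', StringType(), True),
])
df = spark.createDataFrame([(1, 'a'), (2, 'b')], schema)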

Example 3: test_atexit

# Required import: import pyspark [as alias]
# Or: from pyspark import sql [as alias]
def test_atexit(spark_test_ctx):
    lines = """
    from petastorm.spark import SparkDatasetConverter, make_spark_converter
    from pyspark.sql import SparkSession
    import os
    spark = SparkSession.builder.getOrCreate()
    spark.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF, '{temp_url}')
    df = spark.createDataFrame([(1, 2),(4, 5)], ["col1", "col2"])
    converter = make_spark_converter(df)
    f = open(os.path.join('{tempdir}', 'test_atexit.out'), "w")
    f.write(converter.cache_dir_url)
    f.close()
    """.format(tempdir=spark_test_ctx.tempdir, temp_url=spark_test_ctx.temp_url)
    code_str = "; ".join(
        line.strip() for line in lines.strip().splitlines())
    ret_code = subprocess.call([sys.executable, "-c", code_str])
    assert 0 == ret_code
    with open(os.path.join(spark_test_ctx.tempdir, 'test_atexit.out')) as f:
        cache_dir_url = f.read()

    fs = FilesystemResolver(cache_dir_url).filesystem()
    assert not fs.exists(urlparse(cache_dir_url).path) 
Author: uber | Project: petastorm | Lines: 24 | Source: test_spark_dataset_converter.py

Example 4: test_distMetric

# Required import: import pyspark [as alias]
# Or: from pyspark import sql [as alias]
def test_distMetric(self):
        df = self.createDF("s1:String; s2:String",
            ",ads;" +\
            "asdfg,asdfg;" +\
            "asdfghj,asdfhgj"
        )

        trunc = lambda c: pyspark.sql.functions.round(c,2)
        res = df.select(
            df.s1, df.s2,
            trunc(nGram2(df.s1, df.s2)).alias("nGram2"),
            trunc(nGram3(df.s1, df.s2)).alias("nGram3"),
            trunc(diceSorensen(df.s1, df.s2)).alias("diceSorensen"),
            trunc(normlevenshtein(df.s1, df.s2)).alias("normlevenshtein"),
            trunc(jaroWinkler(df.s1, df.s2)).alias("jaroWinkler")
        )

        exp = self.createDF("s1: String;s2: String;nGram2: Float;nGram3: Float;diceSorensen: Float;normlevenshtein: Float;jaroWinkler: Float",
            ",ads,,,,,;" + \
            "asdfg,asdfg,1.0,1.0,1.0,1.0,1.0;" + \
            "asdfghj,asdfhgj,0.5,0.4,0.5,0.71,0.97")

        self.should_be_same(res, exp) 
Author: TresAmigosSD | Project: SMV | Lines: 25 | Source: testSmvfuncs.py
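nGram2, nGram3, diceSorensen, normlevenshtein and jaroWinkler are SMV-provided UDFs. Plain pyspark ships a Levenshtein distance that can be rounded the same way; a rough, purely illustrative sketch:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([('asdfghj', 'asdfhgj')], ['s1', 's2'])
res = df.select(df.s1, df.s2, F.round(F.levenshtein(df.s1, df.s2), 2).alias('levenshtein'))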

Example 5: _cumprod

# Required import: import pyspark [as alias]
# Or: from pyspark import sql [as alias]
def _cumprod(self, skipna, part_cols=()):
        from pyspark.sql.functions import pandas_udf
        from pyspark.sql import functions as F  # imported at module level in the original koalas source

        def cumprod(scol):
            @pandas_udf(returnType=self.spark.data_type)
            def negative_check(s):
                assert len(s) == 0 or ((s > 0) | (s.isnull())).all(), (
                    "values should be bigger than 0: %s" % s
                )
                return s

            return F.sum(F.log(negative_check(scol)))

        kser = self._cum(cumprod, skipna, part_cols)
        return kser._with_new_scol(F.exp(kser.spark.column)).rename(self.name)

    # ----------------------------------------------------------------------
    # Accessor Methods
    # ---------------------------------------------------------------------- 
Author: databricks | Project: koalas | Lines: 21 | Source: series.py
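The Koalas implementation above relies on the identity prod(x) = exp(sum(log(x))), which is why it can reuse the cumulative-sum machinery. A minimal sketch of the same trick with a plain window function (column names are illustrative):

from pyspark.sql import SparkSession, functions as F, Window

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 2.0), (2, 3.0), (3, 4.0)], ['idx', 'val'])
w = Window.orderBy('idx').rowsBetween(Window.unboundedPreceding, Window.currentRow)
# cumulative product computed as exp of the running sum of logs (positive values only)
df = df.withColumn('cumprod', F.exp(F.sum(F.log('val')).over(w)))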

Example 6: stats

# Required import: import pyspark [as alias]
# Or: from pyspark import sql [as alias]
def stats(self, columns):
        """Compute the stats for each column provided in columns.
        Parameters
        ----------
        columns : list of str, contains all columns to compute stats on.
        """
        assert (not isinstance(columns, basestring)), "columns should be a " \
                                                      "list of strs,  " \
                                                      "not a str!"
        assert isinstance(columns, list), "columns should be a list!"

        from pyspark.sql import functions as F
        functions = [F.min, F.max, F.avg, F.count]
        aggs = list(
            self._flatmap(lambda column: map(lambda f: f(column), functions),
                          columns))
        return PStats(self.from_schema_rdd(self._schema_rdd.agg(*aggs))) 
Author: sparklingpandas | Project: sparklingpandas | Lines: 19 | Source: dataframe.py
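The same per-column statistics can be expressed directly against a pyspark DataFrame; a minimal sketch with an illustrative column name:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0,), (2.0,), (3.0,)], ['val'])
stats = df.agg(F.min('val'), F.max('val'), F.avg('val'), F.count('val'))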

Example 7: sum

# Required import: import pyspark [as alias]
# Or: from pyspark import sql [as alias]
def sum(self):
        """Compute the sum for each group."""
        if self._can_use_new_school():
            self._prep_spark_sql_groupby()
            import pyspark.sql.functions as func
            return self._use_aggregation(func.sum)
        self._prep_pandas_groupby()
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).sum()

        def merge_value(x, y):
            return pd.concat([x, create_combiner(y)])

        def merge_combiner(x, y):
            return x + y

        rddOfSum = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return DataFrame.fromDataFrameRDD(rddOfSum, self.sql_ctx) 
Author: sparklingpandas | Project: sparklingpandas | Lines: 26 | Source: groupby.py
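When the "new school" path is taken, the work is delegated to Spark SQL grouping; in plain pyspark the equivalent is a groupBy followed by an aggregate, sketched below with illustrative column names. Examples 8-10 follow the same combineByKey fallback pattern with min, max and last.

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([('a', 1), ('a', 2), ('b', 3)], ['key', 'val'])
sums = df.groupBy('key').agg(F.sum('val').alias('val_sum'))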

Example 8: min

# Required import: import pyspark [as alias]
# Or: from pyspark import sql [as alias]
def min(self):
        """Compute the min for each group."""
        if self._can_use_new_school():
            self._prep_spark_sql_groupby()
            import pyspark.sql.functions as func
            return self._use_aggregation(func.min)
        self._prep_pandas_groupby()
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).min()

        def merge_value(x, y):
            return x.append(create_combiner(y)).min()

        def merge_combiner(x, y):
            return x.append(y).min(level=0)

        rddOfMin = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return DataFrame.fromDataFrameRDD(rddOfMin, self.sql_ctx) 
Author: sparklingpandas | Project: sparklingpandas | Lines: 26 | Source: groupby.py

Example 9: max

# Required import: import pyspark [as alias]
# Or: from pyspark import sql [as alias]
def max(self):
        """Compute the max for each group."""
        if self._can_use_new_school():
            self._prep_spark_sql_groupby()
            import pyspark.sql.functions as func
            return self._use_aggregation(func.max)
        self._prep_pandas_groupby()
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).max()

        def merge_value(x, y):
            return x.append(create_combiner(y)).max()

        def merge_combiner(x, y):
            return x.append(y).max(level=0)

        rddOfMax = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return DataFrame.fromDataFrameRDD(rddOfMax, self.sql_ctx) 
Author: sparklingpandas | Project: sparklingpandas | Lines: 26 | Source: groupby.py

Example 10: last

# Required import: import pyspark [as alias]
# Or: from pyspark import sql [as alias]
def last(self):
        """Pull out the last from each group."""
        myargs = self._myargs
        mykwargs = self._mykwargs
        # If its possible to use Spark SQL grouping do it
        if self._can_use_new_school():
            self._prep_spark_sql_groupby()
            import pyspark.sql.functions as func
            return self._use_aggregation(func.last)

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).last()

        def merge_value(x, y):
            return create_combiner(y)

        def merge_combiner(x, y):
            return y

        rddOfLast = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return DataFrame.fromDataFrameRDD(rddOfLast, self.sql_ctx) 
Author: sparklingpandas | Project: sparklingpandas | Lines: 26 | Source: groupby.py

Example 11: _load_pyfunc

# Required import: import pyspark [as alias]
# Or: from pyspark import sql [as alias]
def _load_pyfunc(path):
    """
    Load PyFunc implementation. Called by ``pyfunc.load_pyfunc``.

    :param path: Local filesystem path to the MLflow Model with the ``spark`` flavor.
    """
    # NOTE: The getOrCreate() call below may change settings of the active session which we do not
    # intend to do here. In particular, setting master to local[1] can break distributed clusters.
    # To avoid this problem, we explicitly check for an active session. This is not ideal but there
    # is no good workaround at the moment.
    import pyspark

    spark = pyspark.sql.SparkSession._instantiatedSession
    if spark is None:
        spark = pyspark.sql.SparkSession.builder.config("spark.python.worker.reuse", True) \
            .master("local[1]").getOrCreate()
    return _PyFuncModelWrapper(spark, _load_model(model_uri=path)) 
Author: mlflow | Project: mlflow | Lines: 19 | Source: spark.py
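On Spark 3.x the active-session check can go through the public API rather than the private _instantiatedSession attribute; a minimal sketch, assuming Spark >= 3.0:

from pyspark.sql import SparkSession

spark = SparkSession.getActiveSession()   # public API since Spark 3.0
if spark is None:
    spark = SparkSession.builder \
        .config("spark.python.worker.reuse", True) \
        .master("local[1]") \
        .getOrCreate()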

Example 12: _numpy_to_spark_mapping

# Required import: import pyspark [as alias]
# Or: from pyspark import sql [as alias]
def _numpy_to_spark_mapping():
    """Returns a mapping from numpy to pyspark.sql type. Caches the mapping dictionary inorder to avoid instantiation
    of multiple objects in each call."""

    # Refer to the attribute of the function we use to cache the map using a name in the variable instead of a 'dot'
    # notation to avoid copy/paste/typo mistakes
    cache_attr_name = 'cached_numpy_to_pyspark_types_map'
    if not hasattr(_numpy_to_spark_mapping, cache_attr_name):
        import pyspark.sql.types as T

        setattr(_numpy_to_spark_mapping, cache_attr_name,
                {
                    np.int8: T.ByteType(),
                    np.uint8: T.ShortType(),
                    np.int16: T.ShortType(),
                    np.uint16: T.IntegerType(),
                    np.int32: T.IntegerType(),
                    np.int64: T.LongType(),
                    np.float32: T.FloatType(),
                    np.float64: T.DoubleType(),
                    np.string_: T.StringType(),
                    np.str_: T.StringType(),
                    np.unicode_: T.StringType(),
                    np.bool_: T.BooleanType(),
                })

    return getattr(_numpy_to_spark_mapping, cache_attr_name)


# TODO: Changing fields in this class or the UnischemaField will break reading due to the schema being pickled next to
# the dataset on disk 
Author: uber | Project: petastorm | Lines: 33 | Source: unischema.py
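A hypothetical lookup against the cached mapping, using the helper defined above, might look like this:

import numpy as np
from pyspark.sql import types as T

# Resolve the Spark type for a numpy dtype and wrap it in a StructField
spark_type = _numpy_to_spark_mapping()[np.int64]   # -> LongType()
field = T.StructField('count', spark_type, False)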

Example 13: spark_context

# Required import: import pyspark [as alias]
# Or: from pyspark import sql [as alias]
def spark_context(self, application_name):
        """Create a spark context given the parameters configured in this class.

        The caller is responsible for calling ``.close`` on the resulting spark context

        Parameters
        ----------
        application_name : string

        Returns
        -------
        sc : SparkContext
        """

        # initialize the spark configuration
        self._init_spark()
        import pyspark
        import pyspark.sql

        # initialize conf
        spark_conf = pyspark.SparkConf()
        for k, v in self._spark_conf_helper._conf_dict.items():
            spark_conf.set(k, v)

        log.info("Starting SparkContext")
        return pyspark.SparkContext(appName=application_name, conf=spark_conf) 
Author: Valassis-Digital-Media | Project: spylon | Lines: 28 | Source: launcher.py

Example 14: spark_session

# Required import: import pyspark [as alias]
# Or: from pyspark import sql [as alias]
def spark_session(self, application_name):
        sc = self.spark_context(application_name)
        from pyspark.sql import SparkSession
        return SparkSession(sc) 
Author: Valassis-Digital-Media | Project: spylon | Lines: 6 | Source: launcher.py

Example 15: with_sql_context

# Required import: import pyspark [as alias]
# Or: from pyspark import sql [as alias]
from contextlib import contextmanager

@contextmanager  # presumed from the original spylon source: the function yields and is used via `with`
def with_sql_context(application_name, conf=None):
    """Context manager for a spark context

    Returns
    -------
    sc : SparkContext
    sql_context: SQLContext

    Examples
    --------
    Used within a context manager
    >>> with with_sql_context("MyApplication") as (sc, sql_context):
    ...     import pyspark
    ...     # Do stuff
    ...     pass

    """
    if conf is None:
        conf = default_configuration
    assert isinstance(conf, SparkConfiguration)

    sc = conf.spark_context(application_name)
    import pyspark.sql
    try:
        yield sc, pyspark.sql.SQLContext(sc)
    finally:
        sc.stop() 
Author: Valassis-Digital-Media | Project: spylon | Lines: 29 | Source: launcher.py
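SQLContext is the legacy entry point; on current Spark versions a roughly equivalent context manager can be built around SparkSession. A purely illustrative sketch (with_spark_session is a hypothetical helper, not part of spylon):

from contextlib import contextmanager
from pyspark.sql import SparkSession

@contextmanager
def with_spark_session(application_name):
    spark = SparkSession.builder.appName(application_name).getOrCreate()
    try:
        yield spark
    finally:
        spark.stop()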


Note: The pyspark.sql examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their respective developers, and the source code copyright remains with the original authors. Please refer to each project's license before distributing or using the code; do not republish without permission.