This article collects typical usage examples of the Python method pyspark.sql.functions.first. If you have been wondering what functions.first does and how to use it, the selected code examples below may help. You can also explore further usage of the module it belongs to, pyspark.sql.functions.
A total of 15 code examples of functions.first are shown below, sorted by popularity by default.
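Before the project-specific examples, here is a minimal, self-contained sketch (not taken from any of the examples below) of the basic call pattern of pyspark.sql.functions.first; the SparkSession setup and the key/value column names are illustrative assumptions.
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame([("a", None), ("a", 1), ("b", 2)], ["key", "value"])

# first() keeps the first value seen per group; with ignorenulls=True it
# skips nulls instead of returning them. Without an explicit sort the result
# depends on the row order within partitions.
df.groupBy("key").agg(
    F.first("value").alias("first_value"),
    F.first("value", ignorenulls=True).alias("first_non_null"),
).show()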
Example 1: _summary
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def _summary(self, name=None):
    """
    Return a summarized representation.

    Parameters
    ----------
    name : str
        name to use in the summary representation

    Returns
    -------
    String with a summarized representation of the index
    """
    head, tail, total_count = self._internal.spark_frame.select(
        F.first(self.spark.column), F.last(self.spark.column), F.count(F.expr("*"))
    ).first()

    if total_count > 0:
        index_summary = ", %s to %s" % (pprint_thing(head), pprint_thing(tail))
    else:
        index_summary = ""

    if name is None:
        name = type(self).__name__

    return "%s: %s entries%s" % (name, total_count, index_summary)
Example 2: has_duplicates
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def has_duplicates(self) -> bool:
    """
    If index has duplicates, return True, otherwise False.

    Examples
    --------
    >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=list('aac'))
    >>> kdf.index.has_duplicates
    True

    >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=[list('abc'), list('def')])
    >>> kdf.index.has_duplicates
    False

    >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=[list('aac'), list('eef')])
    >>> kdf.index.has_duplicates
    True
    """
    sdf = self._internal.spark_frame.select(self.spark.column)
    scol = scol_for(sdf, sdf.columns[0])

    return sdf.select(F.count(scol) != F.countDistinct(scol)).first()[0]
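The duplicate check boils down to comparing count with countDistinct on a single column. A standalone sketch of that comparison, assuming a local SparkSession and an illustrative idx column:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[*]").getOrCreate()
sdf = spark.createDataFrame([("a",), ("a",), ("c",)], ["idx"])

# True when at least one value occurs more than once.
has_dupes = sdf.select(F.count("idx") != F.countDistinct("idx")).first()[0]
print(has_dupes)  # True for this data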
Example 3: item
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def item(self):
    """
    Return the first element of the underlying data as a Python scalar.

    Returns
    -------
    scalar
        The first element of Series.

    Raises
    ------
    ValueError
        If the data is not length-1.

    Examples
    --------
    >>> kser = ks.Series([10])
    >>> kser.item()
    10
    """
    return self.head(2).to_pandas().item()
Example 4: __repr__
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def __repr__(self):
    max_display_count = get_option("display.max_rows")
    if max_display_count is None:
        return self._to_internal_pandas().to_string(name=self.name, dtype=self.dtype)

    pser = self._kdf._get_or_create_repr_pandas_cache(max_display_count)[self.name]
    pser_length = len(pser)
    pser = pser.iloc[:max_display_count]
    if pser_length > max_display_count:
        repr_string = pser.to_string(length=True)
        rest, prev_footer = repr_string.rsplit("\n", 1)
        match = REPR_PATTERN.search(prev_footer)
        if match is not None:
            length = match.group("length")
            name = str(self.dtype.name)
            footer = "\nName: {name}, dtype: {dtype}\nShowing only the first {length}".format(
                length=length, name=self.name, dtype=pprint_thing(name)
            )
            return rest + footer
    return pser.to_string(name=self.name, dtype=self.dtype)
Example 5: _join_results
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def _join_results(self, scaffolds_df):
    def _read_rows(row):
        idx, _, dec = row.split("\t")
        return ps.Row(id=idx, decoration_smi=dec)

    sampled_df = SPARK.createDataFrame(SC.textFile(self._tmp_path(
        "sampled_decorations"), self.num_partitions).map(_read_rows))

    if self.decorator_type == "single":
        processed_df = self._join_results_single(scaffolds_df, sampled_df)
    elif self.decorator_type == "multi":
        processed_df = self._join_results_multi(scaffolds_df, sampled_df)
    else:
        raise ValueError("decorator_type has an invalid value '{}'".format(self.decorator_type))

    return processed_df\
        .where("smiles IS NOT NULL")\
        .groupBy("smiles")\
        .agg(
            psf.first("scaffold").alias("scaffold"),
            psf.first("decorations").alias("decorations"),
            psf.count("smiles").alias("count"))
Example 6: compile_arbitrary
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def compile_arbitrary(t, expr, scope, context=None, **kwargs):
    how = expr.op().how
    if how == 'first':
        fn = functools.partial(F.first, ignorenulls=True)
    elif how == 'last':
        fn = functools.partial(F.last, ignorenulls=True)
    else:
        raise NotImplementedError("Does not support 'how': {}".format(how))

    return compile_aggregator(t, expr, scope, fn, context)
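The functools.partial trick above simply pre-binds ignorenulls=True so F.first and F.last skip nulls. A standalone sketch of that binding, assuming a local SparkSession and an illustrative value column:
import functools

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame([(None,), (1,), (2,), (None,)], "value: int")

# Pre-bound aggregators that ignore nulls.
first_non_null = functools.partial(F.first, ignorenulls=True)
last_non_null = functools.partial(F.last, ignorenulls=True)

df.select(
    first_non_null("value").alias("first"),
    last_non_null("value").alias("last"),
).show()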
Example 7: compile_first_value
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def compile_first_value(t, expr, scope, *, window, **kwargs):
    op = expr.op()
    src_column = t.translate(op.arg, scope)
    return F.first(src_column).over(window)
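Here F.first is applied as a window function rather than a grouped aggregate. A standalone sketch of that usage, with assumed partition and ordering columns (grp, ord, val are made up for illustration):
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame(
    [("a", 1, 10), ("a", 2, 20), ("b", 1, 30)], ["grp", "ord", "val"]
)

# first value of val within each grp, ordered by ord, attached to every row
window = Window.partitionBy("grp").orderBy("ord")
df.withColumn("first_val", F.first("val").over(window)).show()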
Example 8: argmax
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def argmax(self):
    """
    Return a maximum argument indexer.

    Parameters
    ----------
    skipna : bool, default True

    Returns
    -------
    maximum argument indexer

    Examples
    --------
    >>> kidx = ks.Index([10, 9, 8, 7, 100, 5, 4, 3, 100, 3])
    >>> kidx
    Int64Index([10, 9, 8, 7, 100, 5, 4, 3, 100, 3], dtype='int64')

    >>> kidx.argmax()
    4
    """
    sdf = self._internal.spark_frame.select(self.spark.column)
    sequence_col = verify_temp_column_name(sdf, "__distributed_sequence_column__")
    sdf = InternalFrame.attach_distributed_sequence_column(sdf, column_name=sequence_col)
    # spark_frame here looks like below
    # +-----------------+---------------+
    # |__index_level_0__|__index_value__|
    # +-----------------+---------------+
    # |                0|             10|
    # |                4|            100|
    # |                2|              8|
    # |                3|              7|
    # |                6|              4|
    # |                5|              5|
    # |                7|              3|
    # |                8|            100|
    # |                1|              9|
    # +-----------------+---------------+

    return sdf.orderBy(self.spark.column.desc(), F.col(sequence_col).asc()).first()[0]
Example 9: argmin
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def argmin(self):
    """
    Return a minimum argument indexer.

    Parameters
    ----------
    skipna : bool, default True

    Returns
    -------
    minimum argument indexer

    Examples
    --------
    >>> kidx = ks.Index([10, 9, 8, 7, 100, 5, 4, 3, 100, 3])
    >>> kidx
    Int64Index([10, 9, 8, 7, 100, 5, 4, 3, 100, 3], dtype='int64')

    >>> kidx.argmin()
    7
    """
    sdf = self._internal.spark_frame.select(self.spark.column)
    sequence_col = verify_temp_column_name(sdf, "__distributed_sequence_column__")
    sdf = InternalFrame.attach_distributed_sequence_column(sdf, column_name=sequence_col)

    return sdf.orderBy(self.spark.column.asc(), F.col(sequence_col).asc()).first()[0]
Example 10: __repr__
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def __repr__(self):
    max_display_count = get_option("display.max_rows")
    if max_display_count is None:
        return repr(self.to_pandas())

    pindex = self._kdf._get_or_create_repr_pandas_cache(max_display_count).index

    pindex_length = len(pindex)
    repr_string = repr(pindex[:max_display_count])

    if pindex_length > max_display_count:
        footer = "\nShowing only the first {}".format(max_display_count)
        return repr_string + footer

    return repr_string
Example 11: head
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def head(self, n: int = 5) -> "Series":
    """
    Return the first n rows.

    This function returns the first n rows for the object based on position.
    It is useful for quickly testing if your object has the right type of data in it.

    Parameters
    ----------
    n : Integer, default = 5

    Returns
    -------
    The first n rows of the caller object.

    Examples
    --------
    >>> df = ks.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion']})
    >>> df.animal.head(2)  # doctest: +NORMALIZE_WHITESPACE
    0    alligator
    1          bee
    Name: animal, dtype: object
    """
    return first_series(self.to_dataframe().head(n))

# TODO: Categorical type isn't supported (due to PySpark's limitation) and
# some doctests related with timestamps were not added.
Example 12: first_series
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def first_series(df):
    """
    Takes a DataFrame and returns the first column of the DataFrame as a Series
    """
    assert isinstance(df, (DataFrame, pd.DataFrame)), type(df)
    if isinstance(df, DataFrame):
        return df._kser_for(df._internal.column_labels[0])
    else:
        return df[df.columns[0]]
Example 13: first
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def first(self):
    """
    Pull out the first from each group. Note: this is different than
    Spark's first.
    """
    # If it's possible to use Spark SQL grouping, do it
    if self._can_use_new_school():
        self._prep_spark_sql_groupby()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.first)

    myargs = self._myargs
    mykwargs = self._mykwargs
    self._prep_pandas_groupby()

    def create_combiner(x):
        return x.groupby(*myargs, **mykwargs).first()

    def merge_value(x, y):
        return create_combiner(x)

    def merge_combiner(x, y):
        return x

    rddOfFirst = self._sortIfNeeded(self._distributedRDD.combineByKey(
        create_combiner,
        merge_value,
        merge_combiner)).values()

    return DataFrame.fromDataFrameRDD(rddOfFirst, self.sql_ctx)
Example 14: agg_first
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def agg_first(field_name):
    return F.first(field_name, ignorenulls=True).alias(field_name)
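A hypothetical usage of this helper, assuming agg_first from this example is in scope, F is pyspark.sql.functions, and the user_id/age/city DataFrame is made up for illustration:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame(
    [("u1", None, "NY"), ("u1", 30, None), ("u2", 25, "LA")],
    ["user_id", "age", "city"],
)

# Collapse each user to the first non-null value per column, keeping the
# original column names thanks to the alias inside agg_first.
df.groupBy("user_id").agg(agg_first("age"), agg_first("city")).show()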
Example 15: run
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def run(self):
    def _enumerate(row, max_cuts=self.max_cuts, enumerator=self.enumerator):
        fields = row.split("\t")
        smiles = fields[0]
        mol = uc.to_mol(smiles)
        out_rows = []
        if mol:
            for cuts in range(1, max_cuts + 1):
                for sliced_mol in enumerator.enumerate(mol, cuts=cuts):
                    # normalize scaffold and decorations
                    scaff_smi, dec_smis = sliced_mol.to_smiles()
                    dec_smis = [smi for num, smi in sorted(dec_smis.items())]
                    out_rows.append(ps.Row(
                        scaffold=scaff_smi,
                        decorations=dec_smis,
                        smiles=uc.to_smiles(mol),
                        cuts=cuts
                    ))
        return out_rows

    enumeration_df = SPARK.createDataFrame(
        SC.textFile(self.input_path)
        .repartition(self.partitions)
        .flatMap(_enumerate))\
        .groupBy("scaffold", "decorations")\
        .agg(psf.first("cuts").alias("cuts"), psf.first("smiles").alias("smiles"))\
        .persist()
    self._log("info", "Obtained %d sliced molecules", enumeration_df.count())

    if self.output_path:
        enumeration_df.write.parquet(self.output_path)

    return enumeration_df