This article collects typical usage examples of the Python method pyspark.sql.functions.first. If you have been wondering what functions.first does and how to use it, the selected code examples below may help. You can also explore further usage of the module it belongs to, pyspark.sql.functions.
A total of 15 code examples of functions.first are shown below, sorted by popularity by default.
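Before the project-specific examples, here is a minimal, self-contained sketch (not taken from any of the examples below) of the basic call pattern of pyspark.sql.functions.first; the SparkSession setup and the key/value column names are illustrative assumptions.
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame([("a", None), ("a", 1), ("b", 2)], ["key", "value"])

# first() keeps the first value seen per group; with ignorenulls=True it
# skips nulls instead of returning them. Without an explicit sort the result
# depends on the row order within partitions.
df.groupBy("key").agg(
    F.first("value").alias("first_value"),
    F.first("value", ignorenulls=True).alias("first_non_null"),
).show()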
Example 1: _summary
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def _summary(self, name=None):
    """
    Return a summarized representation.

    Parameters
    ----------
    name : str
        name to use in the summary representation

    Returns
    -------
    String with a summarized representation of the index
    """
    head, tail, total_count = self._internal.spark_frame.select(
        F.first(self.spark.column), F.last(self.spark.column), F.count(F.expr("*"))
    ).first()

    if total_count > 0:
        index_summary = ", %s to %s" % (pprint_thing(head), pprint_thing(tail))
    else:
        index_summary = ""

    if name is None:
        name = type(self).__name__

    return "%s: %s entries%s" % (name, total_count, index_summary)
Example 2: has_duplicates
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def has_duplicates(self) -> bool:
    """
    If index has duplicates, return True, otherwise False.

    Examples
    --------
    >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=list('aac'))
    >>> kdf.index.has_duplicates
    True

    >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=[list('abc'), list('def')])
    >>> kdf.index.has_duplicates
    False

    >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=[list('aac'), list('eef')])
    >>> kdf.index.has_duplicates
    True
    """
    sdf = self._internal.spark_frame.select(self.spark.column)
    scol = scol_for(sdf, sdf.columns[0])

    return sdf.select(F.count(scol) != F.countDistinct(scol)).first()[0]
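The duplicate check boils down to comparing count with countDistinct on a single column. A standalone sketch of that comparison, assuming a local SparkSession and an illustrative idx column:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[*]").getOrCreate()
sdf = spark.createDataFrame([("a",), ("a",), ("c",)], ["idx"])

# True when at least one value occurs more than once.
has_dupes = sdf.select(F.count("idx") != F.countDistinct("idx")).first()[0]
print(has_dupes)  # True for this data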
Example 3: item
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def item(self):
    """
    Return the first element of the underlying data as a Python scalar.

    Returns
    -------
    scalar
        The first element of Series.

    Raises
    ------
    ValueError
        If the data is not length-1.

    Examples
    --------
    >>> kser = ks.Series([10])
    >>> kser.item()
    10
    """
    return self.head(2).to_pandas().item()
Example 4: __repr__
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def __repr__(self):
    max_display_count = get_option("display.max_rows")
    if max_display_count is None:
        return self._to_internal_pandas().to_string(name=self.name, dtype=self.dtype)

    pser = self._kdf._get_or_create_repr_pandas_cache(max_display_count)[self.name]
    pser_length = len(pser)
    pser = pser.iloc[:max_display_count]
    if pser_length > max_display_count:
        repr_string = pser.to_string(length=True)
        rest, prev_footer = repr_string.rsplit("\n", 1)
        match = REPR_PATTERN.search(prev_footer)
        if match is not None:
            length = match.group("length")
            name = str(self.dtype.name)
            footer = "\nName: {name}, dtype: {dtype}\nShowing only the first {length}".format(
                length=length, name=self.name, dtype=pprint_thing(name)
            )
            return rest + footer
    return pser.to_string(name=self.name, dtype=self.dtype)
Example 5: _join_results
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def _join_results(self, scaffolds_df):
    def _read_rows(row):
        idx, _, dec = row.split("\t")
        return ps.Row(id=idx, decoration_smi=dec)

    sampled_df = SPARK.createDataFrame(SC.textFile(self._tmp_path(
        "sampled_decorations"), self.num_partitions).map(_read_rows))

    if self.decorator_type == "single":
        processed_df = self._join_results_single(scaffolds_df, sampled_df)
    elif self.decorator_type == "multi":
        processed_df = self._join_results_multi(scaffolds_df, sampled_df)
    else:
        raise ValueError("decorator_type has an invalid value '{}'".format(self.decorator_type))

    return processed_df\
        .where("smiles IS NOT NULL")\
        .groupBy("smiles")\
        .agg(
            psf.first("scaffold").alias("scaffold"),
            psf.first("decorations").alias("decorations"),
            psf.count("smiles").alias("count"))
Example 6: compile_arbitrary
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def compile_arbitrary(t, expr, scope, context=None, **kwargs):
    how = expr.op().how
    if how == 'first':
        fn = functools.partial(F.first, ignorenulls=True)
    elif how == 'last':
        fn = functools.partial(F.last, ignorenulls=True)
    else:
        raise NotImplementedError("Does not support 'how': {}".format(how))

    return compile_aggregator(t, expr, scope, fn, context)
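The functools.partial trick above simply pre-binds ignorenulls=True so F.first and F.last skip nulls. A standalone sketch of that binding, assuming a local SparkSession and an illustrative value column:
import functools

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame([(None,), (1,), (2,), (None,)], "value: int")

# Pre-bound aggregators that ignore nulls.
first_non_null = functools.partial(F.first, ignorenulls=True)
last_non_null = functools.partial(F.last, ignorenulls=True)

df.select(
    first_non_null("value").alias("first"),
    last_non_null("value").alias("last"),
).show()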
Example 7: compile_first_value
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def compile_first_value(t, expr, scope, *, window, **kwargs):
    op = expr.op()
    src_column = t.translate(op.arg, scope)
    return F.first(src_column).over(window)
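Here F.first is applied as a window function rather than a grouped aggregate. A standalone sketch of that usage, with assumed partition and ordering columns (grp, ord, val are made up for illustration):
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame(
    [("a", 1, 10), ("a", 2, 20), ("b", 1, 30)], ["grp", "ord", "val"]
)

# first value of val within each grp, ordered by ord, attached to every row
window = Window.partitionBy("grp").orderBy("ord")
df.withColumn("first_val", F.first("val").over(window)).show()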
Example 8: argmax
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def argmax(self):
    """
    Return a maximum argument indexer.

    Parameters
    ----------
    skipna : bool, default True

    Returns
    -------
    maximum argument indexer

    Examples
    --------
    >>> kidx = ks.Index([10, 9, 8, 7, 100, 5, 4, 3, 100, 3])
    >>> kidx
    Int64Index([10, 9, 8, 7, 100, 5, 4, 3, 100, 3], dtype='int64')

    >>> kidx.argmax()
    4
    """
    sdf = self._internal.spark_frame.select(self.spark.column)
    sequence_col = verify_temp_column_name(sdf, "__distributed_sequence_column__")
    sdf = InternalFrame.attach_distributed_sequence_column(sdf, column_name=sequence_col)
    # spark_frame here looks like below
    # +-----------------+---------------+
    # |__index_level_0__|__index_value__|
    # +-----------------+---------------+
    # |                0|             10|
    # |                4|            100|
    # |                2|              8|
    # |                3|              7|
    # |                6|              4|
    # |                5|              5|
    # |                7|              3|
    # |                8|            100|
    # |                1|              9|
    # +-----------------+---------------+

    return sdf.orderBy(self.spark.column.desc(), F.col(sequence_col).asc()).first()[0]
Example 9: argmin
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def argmin(self):
    """
    Return a minimum argument indexer.

    Parameters
    ----------
    skipna : bool, default True

    Returns
    -------
    minimum argument indexer

    Examples
    --------
    >>> kidx = ks.Index([10, 9, 8, 7, 100, 5, 4, 3, 100, 3])
    >>> kidx
    Int64Index([10, 9, 8, 7, 100, 5, 4, 3, 100, 3], dtype='int64')

    >>> kidx.argmin()
    7
    """
    sdf = self._internal.spark_frame.select(self.spark.column)
    sequence_col = verify_temp_column_name(sdf, "__distributed_sequence_column__")
    sdf = InternalFrame.attach_distributed_sequence_column(sdf, column_name=sequence_col)

    return sdf.orderBy(self.spark.column.asc(), F.col(sequence_col).asc()).first()[0]
Example 10: __repr__
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def __repr__(self):
    max_display_count = get_option("display.max_rows")
    if max_display_count is None:
        return repr(self.to_pandas())

    pindex = self._kdf._get_or_create_repr_pandas_cache(max_display_count).index

    pindex_length = len(pindex)
    repr_string = repr(pindex[:max_display_count])

    if pindex_length > max_display_count:
        footer = "\nShowing only the first {}".format(max_display_count)
        return repr_string + footer

    return repr_string
Example 11: head
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def head(self, n: int = 5) -> "Series":
    """
    Return the first n rows.

    This function returns the first n rows for the object based on position.
    It is useful for quickly testing if your object has the right type of data in it.

    Parameters
    ----------
    n : Integer, default = 5

    Returns
    -------
    The first n rows of the caller object.

    Examples
    --------
    >>> df = ks.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion']})
    >>> df.animal.head(2)  # doctest: +NORMALIZE_WHITESPACE
    0    alligator
    1          bee
    Name: animal, dtype: object
    """
    return first_series(self.to_dataframe().head(n))

# TODO: Categorical type isn't supported (due to PySpark's limitation) and
# some doctests related with timestamps were not added.
Example 12: first_series
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def first_series(df):
    """
    Takes a DataFrame and returns the first column of the DataFrame as a Series
    """
    assert isinstance(df, (DataFrame, pd.DataFrame)), type(df)
    if isinstance(df, DataFrame):
        return df._kser_for(df._internal.column_labels[0])
    else:
        return df[df.columns[0]]
Example 13: first
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def first(self):
    """
    Pull out the first from each group. Note: this is different than
    Spark's first.
    """
    # If it's possible to use Spark SQL grouping, do it
    if self._can_use_new_school():
        self._prep_spark_sql_groupby()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.first)

    myargs = self._myargs
    mykwargs = self._mykwargs
    self._prep_pandas_groupby()

    def create_combiner(x):
        return x.groupby(*myargs, **mykwargs).first()

    def merge_value(x, y):
        return create_combiner(x)

    def merge_combiner(x, y):
        return x

    rddOfFirst = self._sortIfNeeded(self._distributedRDD.combineByKey(
        create_combiner,
        merge_value,
        merge_combiner)).values()

    return DataFrame.fromDataFrameRDD(rddOfFirst, self.sql_ctx)
Example 14: agg_first
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def agg_first(field_name):
    return F.first(field_name, ignorenulls=True).alias(field_name)
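A hypothetical usage of this helper, assuming agg_first from this example is in scope, F is pyspark.sql.functions, and the user_id/age/city DataFrame is made up for illustration:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame(
    [("u1", None, "NY"), ("u1", 30, None), ("u2", 25, "LA")],
    ["user_id", "age", "city"],
)

# Collapse each user to the first non-null value per column, keeping the
# original column names thanks to the alias inside agg_first.
df.groupBy("user_id").agg(agg_first("age"), agg_first("city")).show()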
Example 15: run
# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import first [as alias]
def run(self):
    def _enumerate(row, max_cuts=self.max_cuts, enumerator=self.enumerator):
        fields = row.split("\t")
        smiles = fields[0]
        mol = uc.to_mol(smiles)
        out_rows = []
        if mol:
            for cuts in range(1, max_cuts + 1):
                for sliced_mol in enumerator.enumerate(mol, cuts=cuts):
                    # normalize scaffold and decorations
                    scaff_smi, dec_smis = sliced_mol.to_smiles()
                    dec_smis = [smi for num, smi in sorted(dec_smis.items())]
                    out_rows.append(ps.Row(
                        scaffold=scaff_smi,
                        decorations=dec_smis,
                        smiles=uc.to_smiles(mol),
                        cuts=cuts
                    ))
        return out_rows

    enumeration_df = SPARK.createDataFrame(
        SC.textFile(self.input_path)
        .repartition(self.partitions)
        .flatMap(_enumerate))\
        .groupBy("scaffold", "decorations")\
        .agg(psf.first("cuts").alias("cuts"), psf.first("smiles").alias("smiles"))\
        .persist()
    self._log("info", "Obtained %d sliced molecules", enumeration_df.count())

    if self.output_path:
        enumeration_df.write.parquet(self.output_path)

    return enumeration_df