This article collects typical usage examples of the pyspark.sql.functions.lit method in Python. If you have been wondering how functions.lit is used, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples for the containing module, pyspark.sql.functions.
The following 15 code examples all demonstrate functions.lit; by default they are ordered by popularity.
Example 1: write_partition
# Requires: from pyspark.sql import functions as F
# (or, equivalently: from pyspark.sql.functions import lit)
def write_partition(
    df: DataFrame, output_table: str, output_path: str,
    partition_spec: Mapping[str, str], mode: str = 'overwrite'
) -> None:
    """Write dataframe to disk as parquet and add to hive metastore"""
    for k, v in partition_spec.items():
        df = df.withColumn(k, F.lit(v))

    expect_schema = df.sql_ctx.read.table(output_table).schema
    errors = _verify_schema_equality(expect_schema, df.schema)
    if errors:
        raise Exception('Output table has incompatible schema: {}'.format(
            ', '.join(errors)))
    df.write.mode(mode).parquet(output_path)
    df.sql_ctx.sparkSession.sql(_add_partition_ql(
        output_table, output_path, partition_spec)).collect()


# Generic helpers for composing transformations
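A minimal usage sketch for write_partition (not part of the original snippet), assuming the helper functions it references (_verify_schema_equality, _add_partition_ql) are available and that the output table already exists; the table name, path, and partition value below are hypothetical:

from pyspark.sql import SparkSession

spark = SparkSession.builder.enableHiveSupport().getOrCreate()
df = spark.read.table('discovery.feature_vectors_staging')  # hypothetical source table
write_partition(
    df,
    output_table='discovery.feature_vectors',                              # hypothetical
    output_path='hdfs://analytics/discovery/feature_vectors/date=20200101',  # hypothetical
    partition_spec={'date': '20200101'},
)
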
Example 2: explode_features
# Requires: from pyspark.sql import functions as F
# (or, equivalently: from pyspark.sql.functions import lit)
def explode_features(df, features=None):
    """Convert feature vector into individual columns

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    features : list of str or None

    Returns
    -------
    pyspark.sql.DataFrame
    """
    if features is None:
        features = df.schema['features'].metadata['features']

    def extract_feature(features, idx):
        return float(features[idx])

    extract_feature_udf = F.udf(extract_feature, pyspark.sql.types.FloatType())
    cols = [extract_feature_udf('features', F.lit(idx)).alias(name)
            for idx, name in enumerate(features)]
    return df.select('*', *cols)
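A usage sketch (not from the original source), assuming a DataFrame df whose 'features' column is a vector; the feature names below are hypothetical, and in practice they normally come from the column metadata:

# Hypothetical feature names passed explicitly instead of reading metadata.
exploded = explode_features(df, features=['title_match', 'popularity'])
exploded.select('title_match', 'popularity').show(5)
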
Example 3: test_split
# Requires: from pyspark.sql import functions as F
# (or, equivalently: from pyspark.sql.functions import lit)
def test_split(spark):
    df = (
        spark
        .range(1, 100 * 100)
        # convert into 100 "queries" with 100 values each. We need a
        # sufficiently large number of queries, or the split won't have
        # enough data for partitions to even out.
        .select(F.lit('foowiki').alias('wikiid'),
                (F.col('id') / 100).cast('int').alias('norm_query_id')))

    with_folds = mjolnir.training.tuning.split(df, (0.8, 0.2)).collect()

    fold_0 = [row for row in with_folds if row.fold == 0]
    fold_1 = [row for row in with_folds if row.fold == 1]

    # Check the folds are pretty close to requested
    total_len = float(len(with_folds))
    assert 0.8 == pytest.approx(len(fold_0) / total_len, abs=0.015)
    assert 0.2 == pytest.approx(len(fold_1) / total_len, abs=0.015)

    # Check each norm query is only found on one side of the split
    queries_in_0 = set([row.norm_query_id for row in fold_0])
    queries_in_1 = set([row.norm_query_id for row in fold_1])
    assert len(queries_in_0.intersection(queries_in_1)) == 0
Example 4: compile_literal
# Requires: from pyspark.sql import functions as F
# (or, equivalently: from pyspark.sql.functions import lit)
def compile_literal(t, expr, scope, raw=False, **kwargs):
    """If raw is True, don't wrap the result with F.lit()"""
    value = expr.op().value

    if raw:
        return value

    if isinstance(value, collections.abc.Set):
        # Don't wrap set with F.lit
        if isinstance(value, frozenset):
            # Spark doesn't like frozenset
            return set(value)
        else:
            return value
    elif isinstance(value, list):
        return F.array(*[F.lit(v) for v in value])
    else:
        return F.lit(expr.op().value)
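The compiler above mirrors how plain PySpark handles literals: F.lit covers scalar constants, while a Python list has to be turned into an array column element by element. A standalone sketch of that distinction (data and column names are made up):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.range(3)

# Scalar literal: one constant value repeated on every row.
df.withColumn('tag', F.lit('en')).show()

# List literal: wrap each element with F.lit and combine with F.array.
df.withColumn('langs', F.array(*[F.lit(v) for v in ['en', 'de', 'fr']])).show()
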
Example 5: main
# Requires: from pyspark.sql import functions as F
# (or, equivalently: from pyspark.sql.functions import lit)
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    sc = SparkContext()
    sql = SQLContext(sc)
    data = sql.read.parquet(data_path)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = SparkLFApplier(lfs)
    L = applier.apply(data.rdd)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    y_prob_sql_array = F.array([F.lit(y) for y in y_prob])
    data_labeled = data.withColumn("y_prob", y_prob_sql_array)
    data_labeled.write.mode("overwrite").parquet(output_path)
    logging.info(f"Labels saved to {output_path}")
Example 6: _select_rows_by_iterable
# Requires: from pyspark.sql import functions as F
# (or, equivalently: from pyspark.sql.functions import lit)
def _select_rows_by_iterable(
    self, rows_sel: Iterable
) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
    rows_sel = list(rows_sel)
    if len(rows_sel) == 0:
        return F.lit(False), None, None
    elif len(self._internal.index_spark_column_names) == 1:
        index_column = self._kdf_or_kser.index.to_series()
        index_data_type = index_column.spark.data_type
        if len(rows_sel) == 1:
            return (
                index_column.spark.column == F.lit(rows_sel[0]).cast(index_data_type),
                None,
                None,
            )
        else:
            return (
                index_column.spark.column.isin(
                    [F.lit(r).cast(index_data_type) for r in rows_sel]
                ),
                None,
                None,
            )
    else:
        raise LocIndexer._NotImplemented("Cannot select with MultiIndex with Spark.")
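Outside of these Koalas internals, the same idea works directly on a DataFrame: compare against a literal cast to the column's type for a single value, or use isin over a list of literal columns for several. A standalone sketch (data and column names are made up):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 'a'), (2, 'b'), (3, 'c')], ['id', 'val'])

# Single value: equality against a literal cast to the column's type.
df.where(F.col('id') == F.lit(2).cast('bigint')).show()

# Multiple values: isin accepts a list of literal columns.
df.where(F.col('id').isin([F.lit(v).cast('bigint') for v in (1, 3)])).show()
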
Example 7: _is_monotonic_decreasing
# Requires: from pyspark.sql import functions as F
# (or, equivalently: from pyspark.sql.functions import lit)
def _is_monotonic_decreasing(self):
    scol = self.spark.column
    window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1)
    prev = F.lag(scol, 1).over(window)

    cond = F.lit(True)
    for field in self.spark.data_type[::-1]:
        left = scol.getField(field.name)
        right = prev.getField(field.name)
        compare = MultiIndex._comparator_for_monotonic_decreasing(field.dataType)
        cond = F.when(left.eqNullSafe(right), cond).otherwise(
            compare(left, right, spark.Column.__lt__)
        )

    cond = prev.isNull() | cond

    internal = InternalFrame(
        spark_frame=self._internal.spark_frame.select(
            self._internal.index_spark_columns + [cond]
        ),
        index_map=self._internal.index_map,
    )

    return first_series(DataFrame(internal))
Example 8: getTopEntities
# Requires: from pyspark.sql import functions as F
# (or, equivalently: from pyspark.sql.functions import lit)
def getTopEntities(self, e, targetType='', maxCount=20, minScore=0.0):
    df1 = self.df
    row1 = df1.where(df1.EntityId == e).first()
    self.raiseErrorIfNotFound(row1, e)

    if targetType == '':
        df2 = df1.where(df1.EntityId != e)
    else:
        df2 = df1.where((df1.EntityId != e) & (df1.EntityType == targetType))

    df3 = df2.select(df2.EntityId, df2.EntityType,
                     udfCosineSimilarity(F.lit(row1.Data), df2.Data).alias('Score'))
    return df3.where(df3.Score >= minScore).orderBy(df3.Score.desc()).limit(maxCount)


# COMMAND ----------

# MAGIC %md **PaperSimilarity** class to compute paper recommendations

# COMMAND ----------

# Parameters:
#   resource: resource stream path
#   container: container name in the Azure Storage (AS) account
#   account: Azure Storage (AS) account
#   sas: complete 'Blob service SAS URL' of the shared access signature (sas) for the container
#   key: access key for the container; if sas is specified, key is ignored
#
# Note:
#   resource does not have a header
#   you need to provide a value for either sas or key
#
Example 9: _getTopEntitiesByEmbedding
# Requires: from pyspark.sql import functions as F
# (or, equivalently: from pyspark.sql.functions import lit)
def _getTopEntitiesByEmbedding(self, e, maxCount, minScore):
    df1 = self.df
    paperdf = self.mag.getDataframe('Papers')
    row1 = df1.where(df1.EntityId == e).first()
    df2 = df1.where(df1.EntityId != e)
    df3 = df2.select(df2.EntityId,
                     udfCosineSimilarityN(F.lit(row1.Data), df2.Data).alias('Score'))
    return (df3.join(paperdf, df3.EntityId == paperdf.PaperId, 'inner')
               .select(paperdf.PaperId, paperdf.PaperTitle, df3.Score)
               .where((~F.isnan(df3.Score)) & (df3.Score >= minScore))
               .orderBy(df3.Score.desc())
               .limit(maxCount))
Example 10: read_partition
# Requires: from pyspark.sql import functions as F
# (or, equivalently: from pyspark.sql.functions import lit)
def read_partition(
    spark: SparkSession,
    table: str,
    partition_spec: Mapping[str, str],
    schema: Optional[T.StructType] = None,
    direct_parquet_read: bool = False
) -> DataFrame:
    """Read a single partition from a hive table.

    Verifies the partition specification describes a complete partition,
    that the partition exists, and optionally that the table is compatible
    with an expected schema. The partition could still be empty.
    """
    # We don't need to do anything with the result; our goal is to
    # trigger AnalysisException when the arguments are invalid.
    spark.sql(_describe_partition_ql(table, partition_spec)).collect()

    partition_cond = F.lit(True)
    for k, v in partition_spec.items():
        partition_cond &= F.col(k) == v
    df = spark.read.table(table).where(partition_cond)
    # The df we have now has types defined by the hive table, but this downgrades
    # non-standard types like VectorUDT() to its sql equivalent. Use the first
    # df to find the files, then read them directly.
    if direct_parquet_read:
        input_files = list(df._jdf.inputFiles())  # type: ignore
        input_dirs = set(os.path.dirname(path) for path in input_files)
        if len(input_dirs) != 1:
            raise Exception('Expected single directory containing partition data: [{}]'.format(
                '],['.join(input_files)))
        df = spark.read.parquet(list(input_dirs)[0])
    if schema is not None:
        # TODO: This only allows extra top level columns, anything
        # nested must be exactly the same. Fine for now.
        _verify_schema_compatability(schema, df.schema)
        df = df.select(*(field.name for field in schema))
    # Drop partitioning columns. These are not part of the mjolnir transformations, and
    # are only an implementation detail of putting them on disk and tracking history.
    return df.drop(*partition_spec.keys())
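A usage sketch for read_partition (not from the original source), assuming the partition already exists in the Hive metastore and that the helper functions it references are available; the table name and partition value are hypothetical:

from pyspark.sql import SparkSession

spark = SparkSession.builder.enableHiveSupport().getOrCreate()
df = read_partition(
    spark,
    table='discovery.feature_vectors',       # hypothetical table
    partition_spec={'date': '20200101'},     # hypothetical partition
    direct_parquet_read=True,                # re-read parquet to keep types like VectorUDT
)
df.printSchema()
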
Example 11: at_least_n_distinct
# Requires: from pyspark.sql import functions as F
# (or, equivalently: from pyspark.sql.functions import lit)
def at_least_n_distinct(col, limit):
    """Count distinct that works with windows

    The standard distinct count in spark sql can't be applied in
    a window. This implementation allows that to work.
    """
    sc = SparkContext._active_spark_context
    j_cols = _to_seq(sc, [_to_java_column(col), _to_java_column(F.lit(limit))])
    jc = sc._jvm.org.wikimedia.search.mjolnir.AtLeastNDistinct().apply(j_cols)
    return Column(jc)
Example 12: compile_sign
# Requires: from pyspark.sql import functions as F
# (or, equivalently: from pyspark.sql.functions import lit)
def compile_sign(t, expr, scope, **kwargs):
    op = expr.op()
    src_column = t.translate(op.arg, scope)

    return F.when(src_column == 0, F.lit(0.0)).otherwise(
        F.when(src_column > 0, F.lit(1.0)).otherwise(-1.0)
    )
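The same nested F.when/F.lit expression can be written directly against a DataFrame column, outside the compiler. A standalone sketch with made-up data:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(-5.0,), (0.0,), (3.0,)], ['x'])

# sign(x): 0.0 for zero, 1.0 for positive, -1.0 otherwise.
sign = F.when(F.col('x') == 0, F.lit(0.0)).otherwise(
    F.when(F.col('x') > 0, F.lit(1.0)).otherwise(-1.0)
)
df.withColumn('sign', sign).show()
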
Example 13: compile_string_find
# Requires: from pyspark.sql import functions as F
# (or, equivalently: from pyspark.sql.functions import lit)
def compile_string_find(t, expr, scope, **kwargs):
    op = expr.op()

    @F.udf('long')
    def str_find(s, substr, start, end):
        return s.find(substr, start, end)

    src_column = t.translate(op.arg, scope)
    substr_column = t.translate(op.substr, scope)
    start_column = t.translate(op.start, scope) if op.start else F.lit(None)
    end_column = t.translate(op.end, scope) if op.end else F.lit(None)
    return str_find(src_column, substr_column, start_column, end_column)
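Here F.lit(None) passes a missing optional argument into the UDF as NULL. A standalone sketch of the same pattern (data and column names are made up); str.find accepts None for start and end, so the NULL literals act as defaults:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([('hello world',)], ['s'])

@F.udf('long')
def str_find(s, substr, start, end):
    # NULL columns arrive as Python None, which str.find tolerates.
    return s.find(substr, start, end)

df.select(str_find(F.col('s'), F.lit('world'), F.lit(None), F.lit(None)).alias('pos')).show()
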
Example 14: compile_null_literal
# Requires: from pyspark.sql import functions as F
# (or, equivalently: from pyspark.sql.functions import lit)
def compile_null_literal(t, expr, scope):
    return F.lit(None)
Example 15: compile_null_if
# Requires: from pyspark.sql import functions as F
# (or, equivalently: from pyspark.sql.functions import lit)
def compile_null_if(t, expr, scope, **kwargs):
    op = expr.op()
    col = t.translate(op.arg, scope)
    nullif_col = t.translate(op.null_if_expr, scope)
    return F.when(col == nullif_col, F.lit(None)).otherwise(col)
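This NULLIF pattern translates one-to-one to plain PySpark. A minimal standalone sketch that replaces a sentinel value with NULL (data, column name, and sentinel are made up):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1,), (-999,), (3,)], ['x'])

# NULLIF(x, -999): turn the sentinel -999 into NULL, keep everything else.
df.withColumn(
    'x_clean',
    F.when(F.col('x') == -999, F.lit(None)).otherwise(F.col('x'))
).show()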