This page collects and summarizes typical usage examples of the org.apache.spark.sql.Dataset.withColumn method in Java. If you are wondering what exactly Dataset.withColumn does, how to call it, or want to see it used in real code, the hand-picked method examples below should help. You can also explore the enclosing class, org.apache.spark.sql.Dataset, for more context.
Four code examples of the Dataset.withColumn method are shown below, sorted by popularity by default.
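Before the full examples, here is a minimal, self-contained sketch of what Dataset.withColumn does (all class, column, and data names here are invented for illustration): it returns a new Dataset with one extra or replaced column, leaving the original Dataset untouched.

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.lit;

import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class WithColumnDemo {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .appName("withColumn demo")
            .master("local[*]")
            .getOrCreate();

        // A tiny Dataset<Row> built from a Java bean list
        Dataset<Row> df = spark.createDataFrame(
            Arrays.asList(new Person("Alice", 30), new Person("Bob", 25)),
            Person.class);

        // withColumn returns a NEW Dataset; df itself is unchanged
        Dataset<Row> enriched = df
            .withColumn("age_next_year", col("age").plus(1)) // derived column
            .withColumn("source", lit("demo"));              // constant column

        enriched.show();
        spark.stop();
    }

    // Public bean so Spark can infer the schema via reflection
    public static class Person implements java.io.Serializable {
        private String name;
        private int age;
        public Person() { }
        public Person(String name, int age) { this.name = name; this.age = age; }
        public String getName() { return name; }
        public void setName(String name) { this.name = name; }
        public int getAge() { return age; }
        public void setAge(int age) { this.age = age; }
    }
}

Because each withColumn call produces a new Dataset, the examples below keep reassigning the result to the same variable.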
Example 1: buildProperties
import org.apache.spark.sql.Dataset; // import the package/class the method depends on
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.functions;

public void buildProperties() {
    // return rows of format <predicate, is_complex>
    // is_complex can be 1 or 0
    // 1 for a multivalued predicate, 0 for a single-valued predicate

    // select the properties that are complex
    Dataset<Row> multivaluedProperties = spark.sql(String.format(
        "SELECT DISTINCT(%1$s) AS %1$s FROM "
            + "(SELECT %2$s, %1$s, COUNT(*) AS rc FROM %3$s GROUP BY %2$s, %1$s HAVING rc > 1) AS grouped",
        column_name_predicate, column_name_subject, name_tripletable));

    // select all the properties
    Dataset<Row> allProperties = spark.sql(String.format(
        "SELECT DISTINCT(%1$s) AS %1$s FROM %2$s",
        column_name_predicate, name_tripletable));

    // select the properties that are not complex
    Dataset<Row> singledValueProperties = allProperties.except(multivaluedProperties);

    // combine them
    Dataset<Row> combinedProperties = singledValueProperties
        .selectExpr(column_name_predicate, "0 AS is_complex")
        .union(multivaluedProperties.selectExpr(column_name_predicate, "1 AS is_complex"));

    // remove '<' and '>', then replace special characters with '_'
    Dataset<Row> cleanedProperties = combinedProperties.withColumn("p",
        functions.regexp_replace(functions.translate(combinedProperties.col("p"), "<>", ""),
            "[[^\\w]+]", "_"));

    // write the result
    cleanedProperties.write().mode(SaveMode.Overwrite).saveAsTable("properties");
    logger.info("Created properties table with name: " + tablename_properties);
}
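The withColumn call that earns this example its place on this page is the last one: it rewrites the predicate column in place into a SQL-friendly identifier. A self-contained sketch of that cleanup pattern on invented IRIs (the column name p is assumed, matching the snippet above):

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class CleanPredicateDemo {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .appName("clean predicates").master("local[*]").getOrCreate();

        StructType schema = new StructType().add("p", DataTypes.StringType);
        List<Row> rows = Arrays.asList(
            RowFactory.create("<http://example.org/has-name>"),
            RowFactory.create("<http://example.org/knows>"));
        Dataset<Row> df = spark.createDataFrame(rows, schema);

        // Same pattern as Example 1: strip '<' and '>', then map each
        // remaining non-word character to '_'
        Dataset<Row> cleaned = df.withColumn("p",
            functions.regexp_replace(
                functions.translate(df.col("p"), "<>", ""),
                "[[^\\w]+]", "_"));

        cleaned.show(false); // e.g. http___example_org_has_name
        spark.stop();
    }
}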
Example 2: start
import org.apache.spark.sql.Dataset; // import the package/class the method depends on
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.jdbc.JdbcDialect;
import org.apache.spark.sql.jdbc.JdbcDialects;

import static org.apache.spark.sql.functions.datediff;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// InformixJdbcDialect, Config, ConfigManager, and K are classes from the example's own project

private void start() {
    // @formatter:off
    SparkSession spark = SparkSession
        .builder()
        .appName("Time to Ship")
        .master("local")
        .getOrCreate();
    // @formatter:on

    // Register the Informix-specific JDBC dialect
    JdbcDialect dialect = new InformixJdbcDialect();
    JdbcDialects.registerDialect(dialect);

    // Configuration info for the database
    Config config = ConfigManager.getConfig(K.INFORMIX);

    // List of all tables we want to work with
    List<String> tables = new ArrayList<>();
    tables.add("orders");

    Map<String, Dataset<Row>> datalake = new HashMap<>();
    for (String table : tables) {
        System.out.print("Loading table [" + table + "] ... ");
        // @formatter:off
        Dataset<Row> df = spark.read()
            .format("jdbc")
            .option("url", config.getJdbcUrl())
            .option("dbtable", table)
            .option("user", config.getUser())
            .option("password", config.getPassword())
            .option("driver", config.getDriver())
            .load();
        // @formatter:on
        datalake.put(table, df);
        System.out.println("done.");
    }
    System.out.println("We have loaded " + datalake.size()
        + " table(s) in our data lake.");

    Dataset<Row> ordersDf = datalake.get("orders");

    // Derive the shipping delay in days from the two date columns
    // @formatter:off
    ordersDf = ordersDf.withColumn(
        "time_to_ship",
        datediff(ordersDf.col("ship_date"), ordersDf.col("order_date")));
    // @formatter:on

    ordersDf.printSchema();
    ordersDf.show(10);
    System.out.println("We have " + ordersDf.count() + " orders");

    Dataset<Row> ordersDf2 = ordersDf.filter("time_to_ship IS NOT NULL");
    ordersDf2.printSchema();
    ordersDf2.show(5);
    System.out.println("We have " + ordersDf2.count() + " delivered orders");
}
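Only one line in Example 2 actually exercises withColumn: the datediff between ship_date and order_date. A rough in-memory equivalent of just that step, with invented rows and no Informix dependency, might look like this:

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.datediff;
import static org.apache.spark.sql.functions.to_date;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class TimeToShipDemo {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .appName("Time to Ship (in-memory)").master("local[*]").getOrCreate();

        StructType schema = new StructType()
            .add("order_num", DataTypes.IntegerType)
            .add("order_date", DataTypes.StringType)
            .add("ship_date", DataTypes.StringType);

        List<Row> rows = Arrays.asList(
            RowFactory.create(1001, "2023-05-01", "2023-05-04"),
            RowFactory.create(1002, "2023-05-02", null)); // not shipped yet

        Dataset<Row> ordersDf = spark.createDataFrame(rows, schema)
            // parse the string columns into real dates first
            .withColumn("order_date", to_date(col("order_date")))
            .withColumn("ship_date", to_date(col("ship_date")));

        // Same pattern as Example 2: a derived column computed from two existing ones
        ordersDf = ordersDf.withColumn("time_to_ship",
            datediff(col("ship_date"), col("order_date")));

        ordersDf.show(); // time_to_ship is null for unshipped orders
        ordersDf.filter("time_to_ship IS NOT NULL").show();
        spark.stop();
    }
}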
Example 3: start
import org.apache.spark.sql.Dataset; // import the package/class the method depends on
import org.apache.spark.sql.Row;

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.lit;

import java.math.BigDecimal;

private void start() {
    Dataset<Row> householdDf = getHouseholdDataframe();
    Dataset<Row> populationDf = getPopulationDataframe();
    Dataset<Row> indexDf = joinHouseholdPopulation(householdDf, populationDf);
    Dataset<Row> salesDf = getSalesData();

    // Join sales with the household/population data on zipcode, keeping a single zipcode column
    Dataset<Row> salesIndexDf = salesDf
        .join(indexDf, salesDf.col("zipcode").equalTo(indexDf.col("zipcode")), "left")
        .drop(indexDf.col("zipcode"));

    // Revenue per inhabitant, then sort to find the best-performing zipcode
    salesIndexDf = salesIndexDf.withColumn("revenue_by_inh",
        salesIndexDf.col("revenue").divide(salesIndexDf.col("pop")));
    salesIndexDf = salesIndexDf.orderBy(col("revenue_by_inh").desc());
    Row bestRow = salesIndexDf.first();
    double bestRevenuePerInhabitant =
        ((BigDecimal) bestRow.getAs("revenue_by_inh")).doubleValue();
    int populationOfBestRevenuePerInhabitant = bestRow.getAs("pop");
    double incomeOfBestRevenuePerInhabitant = bestRow.getAs("income_per_inh");

    // pop / pop evaluates to 1 (for non-null, non-zero pop), so this effectively
    // attaches the best revenue-per-inhabitant as a constant column
    salesIndexDf = salesIndexDf.withColumn(
        "best_revenue_per_inh",
        salesIndexDf.col("pop").divide(salesIndexDf.col("pop"))
            .multiply(bestRevenuePerInhabitant));
    salesIndexDf = salesIndexDf.withColumn(
        "pop_of_best",
        lit(populationOfBestRevenuePerInhabitant));
    salesIndexDf = salesIndexDf.withColumn(
        "income_of_best",
        lit(incomeOfBestRevenuePerInhabitant));

    // Build the three partial indexes and combine them into one score
    salesIndexDf = salesIndexDf.withColumn(
        "idx_revenue",
        salesIndexDf.col("best_revenue_per_inh")
            .divide(salesIndexDf.col("revenue_by_inh")));
    salesIndexDf = salesIndexDf.withColumn(
        "idx_pop",
        salesIndexDf.col("pop").divide(salesIndexDf.col("pop_of_best")));
    salesIndexDf = salesIndexDf.withColumn(
        "idx_income",
        salesIndexDf.col("income_per_inh").divide(salesIndexDf.col("income_of_best")));
    salesIndexDf = salesIndexDf.withColumn(
        "index",
        salesIndexDf.col("idx_revenue").multiply(salesIndexDf.col("idx_pop")
            .multiply(salesIndexDf.col("idx_income"))));
    salesIndexDf = salesIndexDf.withColumn(
        "potential_revenue",
        salesIndexDf.col("revenue").multiply(salesIndexDf.col("index")));

    // Drop the intermediate columns and rank zipcodes by potential revenue
    salesIndexDf = salesIndexDf
        .drop("idx_income")
        .drop("idx_pop")
        .drop("idx_revenue")
        .drop("income_of_best")
        .drop("total_income")
        .drop("revenue_by_inh")
        .drop("pop_of_best")
        .drop("best_revenue_per_inh")
        .orderBy(salesIndexDf.col("potential_revenue").desc());

    salesIndexDf.show();
}
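One detail worth noting in Example 3: best_revenue_per_inh is produced by dividing pop by itself and multiplying by a plain Java double, which is a roundabout way of attaching a constant. The lit() helper used for pop_of_best a few lines later is the more direct idiom for the same thing. A small sketch contrasting the two approaches on invented data:

import static org.apache.spark.sql.functions.lit;

import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ConstantColumnDemo {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .appName("constant columns").master("local[*]").getOrCreate();

        Dataset<Row> df = spark.createDataFrame(
            Arrays.asList(new Zip("34002", 12000L), new Zip("34003", 8500L)),
            Zip.class);

        double bestRevenuePerInhabitant = 42.0;

        // Roundabout: pop / pop == 1, then scaled by a Java constant (as in Example 3)
        Dataset<Row> viaDivision = df.withColumn("best_revenue_per_inh",
            df.col("pop").divide(df.col("pop")).multiply(bestRevenuePerInhabitant));

        // Direct: lit() wraps the Java value as a Column
        Dataset<Row> viaLit = df.withColumn("best_revenue_per_inh",
            lit(bestRevenuePerInhabitant));

        viaDivision.show();
        viaLit.show();
        spark.stop();
    }

    public static class Zip implements java.io.Serializable {
        private String zipcode;
        private long pop;
        public Zip() { }
        public Zip(String zipcode, long pop) { this.zipcode = zipcode; this.pop = pop; }
        public String getZipcode() { return zipcode; }
        public void setZipcode(String zipcode) { this.zipcode = zipcode; }
        public long getPop() { return pop; }
        public void setPop(long pop) { this.pop = pop; }
    }
}

Under Spark's default (non-ANSI) settings, the division form also yields null whenever pop is null or zero, which is another reason lit() is usually preferred for constants.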
Example 4: transform
import org.apache.spark.sql.Dataset; // import the package/class the method depends on
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

// DatasetUtil, getStructCol(), and getStructSchema() are defined elsewhere in the example's project

@Override
public Dataset<Row> transform(Dataset<?> dataset) {
    StructType schema = dataset.schema();
    StructType structSchema = getStructSchema(schema);

    // Resolve the struct column whose fields should be promoted to top-level columns
    Column structColumn = dataset.apply(DatasetUtil.escapeColumnName(getStructCol()));

    Dataset<Row> result = dataset.toDF();

    // One withColumn call per struct field: each field becomes a column of its own
    StructField[] fields = structSchema.fields();
    for (StructField field : fields) {
        String name = field.name();
        Column fieldColumn = structColumn.getField(DatasetUtil.escapeColumnName(name));
        result = result.withColumn(DatasetUtil.escapeColumnName(name), fieldColumn);
    }
    return result;
}
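Example 4 is a transformer taken from a larger project (getStructCol(), getStructSchema(), and DatasetUtil are defined elsewhere), but the core idea, promoting every field of a struct column to a top-level column with withColumn, can be shown standalone. A rough sketch on invented data:

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.struct;

import java.util.Arrays;

import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class FlattenStructDemo {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .appName("flatten struct").master("local[*]").getOrCreate();

        // Build a Dataset with a struct column "address" out of two flat columns
        Dataset<Row> df = spark.createDataFrame(
                Arrays.asList(new Person("Alice", "Berlin", "10115")),
                Person.class)
            .withColumn("address", struct(col("city"), col("zip")))
            .drop("city", "zip");

        // Same loop as Example 4: one withColumn per struct field
        Column structColumn = df.col("address");
        StructType structSchema =
            (StructType) df.schema().apply("address").dataType();

        Dataset<Row> result = df;
        for (StructField field : structSchema.fields()) {
            String name = field.name();
            result = result.withColumn(name, structColumn.getField(name));
        }

        result.printSchema(); // name, address, city, zip
        result.show();
        spark.stop();
    }

    public static class Person implements java.io.Serializable {
        private String name;
        private String city;
        private String zip;
        public Person() { }
        public Person(String name, String city, String zip) {
            this.name = name; this.city = city; this.zip = zip;
        }
        public String getName() { return name; }
        public void setName(String name) { this.name = name; }
        public String getCity() { return city; }
        public void setCity(String city) { this.city = city; }
        public String getZip() { return zip; }
        public void setZip(String zip) { this.zip = zip; }
    }
}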