

Java Dataset.withColumn Method Code Examples

This article collects typical usage examples of the Java method org.apache.spark.sql.Dataset.withColumn. If you are wondering what Dataset.withColumn does, how to call it, or what real-world usage looks like, the curated code examples below should help. You can also explore the other methods of org.apache.spark.sql.Dataset.


Four code examples of Dataset.withColumn are shown below, ordered by popularity.
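Before diving into the examples, here is a minimal, self-contained sketch of what Dataset.withColumn does: it returns a new Dataset<Row> with one column added (or replaced, when the name already exists); the original Dataset is not modified. Everything below (the session setup, the "price" data, the column names) is an illustrative assumption, not taken from the examples that follow.

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.lit;

import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

SparkSession spark = SparkSession.builder()
    .appName("withColumn sketch")
    .master("local")
    .getOrCreate();

// A tiny illustrative dataset with a single "price" column.
Dataset<Row> df = spark
    .createDataset(Arrays.asList(10.0, 20.0, 30.0), Encoders.DOUBLE())
    .toDF("price");

// withColumn returns a new Dataset with the extra column; reusing an
// existing column name would replace that column instead.
Dataset<Row> result = df
    .withColumn("tax", col("price").multiply(0.2)) // derived column
    .withColumn("currency", lit("EUR"));           // constant column

result.show();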

Example 1: buildProperties

import org.apache.spark.sql.Dataset; // import the package/class the method depends on
public void buildProperties() {
	// Build rows of the form <predicate, is_complex>, where is_complex is
	// 1 for a multivalued predicate and 0 for a single-valued one.

	// select the properties that are complex
	Dataset<Row> multivaluedProperties = spark.sql(String.format(
			"SELECT DISTINCT(%1$s) AS %1$s FROM "
			+ "(SELECT %2$s, %1$s, COUNT(*) AS rc FROM %3$s GROUP BY %2$s, %1$s HAVING rc > 1) AS grouped",
			column_name_predicate, column_name_subject, name_tripletable));

	// select all the properties
	Dataset<Row> allProperties = spark.sql(String.format("SELECT DISTINCT(%1$s) AS %1$s FROM %2$s",
			column_name_predicate, name_tripletable));

	// select the properties that are not complex (single-valued)
	Dataset<Row> singleValuedProperties = allProperties.except(multivaluedProperties);

	// combine them
	Dataset<Row> combinedProperties = singleValuedProperties
			.selectExpr(column_name_predicate, "0 AS is_complex")
			.union(multivaluedProperties.selectExpr(column_name_predicate, "1 AS is_complex"));
	
	// strip '<' and '>', then replace each run of non-word characters with '_'
	Dataset<Row> cleanedProperties = combinedProperties.withColumn("p",
			functions.regexp_replace(functions.translate(combinedProperties.col("p"), "<>", ""),
			"[^\\w]+", "_"));
	
	// write the result
	cleanedProperties.write().mode(SaveMode.Overwrite).saveAsTable(tablename_properties);
	logger.info("Created properties table with name: " + tablename_properties);
}
 
Developer: tf-dbis-uni-freiburg | Project: PRoST | Lines of code: 33 | Source: PropertyTableLoader.java
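The cleanup step at the end of buildProperties chains two column functions: functions.translate deletes '<' and '>' character by character, and functions.regexp_replace rewrites every remaining run of non-word characters as a single underscore. Below is a standalone sketch of that sanitization, using a made-up predicate URI (the spark session and the imports from the sketch above are assumed):

import static org.apache.spark.sql.functions.regexp_replace;
import static org.apache.spark.sql.functions.translate;

Dataset<Row> raw = spark
    .createDataset(Arrays.asList("<http://example.org/name>"), Encoders.STRING())
    .toDF("p");

// translate(col, "<>", "") drops '<' and '>'; regexp_replace then
// collapses each run of non-word characters ("[^\\w]+") into one '_'.
Dataset<Row> clean = raw.withColumn("p",
    regexp_replace(translate(raw.col("p"), "<>", ""), "[^\\w]+", "_"));

clean.show(false); // http_example_org_name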

Example 2: start

import org.apache.spark.sql.Dataset; // import the package/class the method depends on
private void start() {
  // @formatter:off
  SparkSession spark = SparkSession
      .builder()
      .appName("Time to Ship")
      .master("local")
      .getOrCreate();
  // @formatter:on

  // Specific Informix dialect
  JdbcDialect dialect = new InformixJdbcDialect();
  JdbcDialects.registerDialect(dialect);

  // Configuration info for the database
  Config config = ConfigManager.getConfig(K.INFORMIX);

  // List of all tables we want to work with
  List<String> tables = new ArrayList<>();
  tables.add("orders");

  Map<String, Dataset<Row>> datalake = new HashMap<>();
  for (String table : tables) {
    System.out.print("Loading table [" + table + "] ... ");
    // @formatter:off
    Dataset<Row> df = spark.read()
        .format("jdbc")
        .option("url", config.getJdbcUrl())
        .option("dbtable", table)
        .option("user", config.getUser())
        .option("password", config.getPassword())
        .option("driver", config.getDriver())
        .load();
    // @formatter:on

    datalake.put(table, df);
    System.out.println("done.");
  }

  System.out.println("We have loaded " + datalake.size()
      + " table(s) in our data lake.");

  Dataset<Row> ordersDf = datalake.get("orders");
  // @formatter:off
  ordersDf = ordersDf.withColumn(
      "time_to_ship", 
      datediff(ordersDf.col("ship_date"), ordersDf.col("order_date")));
  // @formatter:on

  ordersDf.printSchema();
  ordersDf.show(10);
  System.out.println("We have " + ordersDf.count() + " orders");

  Dataset<Row> ordersDf2 = ordersDf.filter("time_to_ship IS NOT NULL");
  ordersDf2.printSchema();
  ordersDf2.show(5);
  System.out.println("We have " + ordersDf2.count() + " delivered orders");

}
 
Developer: jgperrin | Project: net.jgp.labs.informix2spark | Lines of code: 59 | Source: TimeToShipApp.java
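Running Example 2 as-is requires a live Informix database. If you only want to see the withColumn/datediff step in isolation, a sketch like the following reproduces it on an in-memory row; the dates are made up, and the spark session is assumed to exist:

import static org.apache.spark.sql.functions.datediff;
import static org.apache.spark.sql.functions.lit;
import static org.apache.spark.sql.functions.to_date;

Dataset<Row> orders = spark.range(1).select(
    to_date(lit("2024-01-05")).as("order_date"),
    to_date(lit("2024-01-12")).as("ship_date"));

// datediff(end, start) returns the number of days between two dates.
Dataset<Row> timed = orders.withColumn("time_to_ship",
    datediff(orders.col("ship_date"), orders.col("order_date")));

timed.show(); // time_to_ship = 7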

Example 3: start

import org.apache.spark.sql.Dataset; // import the package/class the method depends on
private void start() {
  Dataset<Row> householdDf = getHouseholdDataframe();
  Dataset<Row> populationDf = getPopulationDataframe();
  Dataset<Row> indexDf = joinHouseholdPopulation(householdDf, populationDf);
  Dataset<Row> salesDf = getSalesData();

  Dataset<Row> salesIndexDf = salesDf
      .join(indexDf, salesDf.col("zipcode").equalTo(indexDf.col("zipcode")), "left")
      .drop(indexDf.col("zipcode"));
  salesIndexDf = salesIndexDf.withColumn("revenue_by_inh", salesIndexDf.col("revenue")
      .divide(salesIndexDf.col("pop")));
  salesIndexDf = salesIndexDf.orderBy(col("revenue_by_inh").desc());
  Row bestRow = salesIndexDf.first();
  double bestRevenuePerInhabitant = ((BigDecimal) bestRow.getAs("revenue_by_inh"))
      .doubleValue();
  int populationOfBestRevenuePerInhabitant = bestRow.getAs("pop");
  double incomeOfBestRevenuePerInhabitant = bestRow.getAs("income_per_inh");
  salesIndexDf = salesIndexDf.withColumn(
      "best_revenue_per_inh",
      // pop / pop is 1 on every row, so this spreads the driver-side
      // constant bestRevenuePerInhabitant across the whole column
      salesIndexDf.col("pop").divide(salesIndexDf.col("pop"))
          .multiply(bestRevenuePerInhabitant));
  salesIndexDf = salesIndexDf.withColumn(
      "pop_of_best",
      lit(populationOfBestRevenuePerInhabitant));
  salesIndexDf = salesIndexDf.withColumn(
      "income_of_best",
      lit(incomeOfBestRevenuePerInhabitant));
  salesIndexDf = salesIndexDf.withColumn(
      "idx_revenue",
      salesIndexDf.col("best_revenue_per_inh")
          .divide(salesIndexDf.col("revenue_by_inh")));
  salesIndexDf = salesIndexDf.withColumn(
      "idx_pop",
      salesIndexDf.col("pop").divide(salesIndexDf.col("pop_of_best")));
  salesIndexDf = salesIndexDf.withColumn(
      "idx_income",
      salesIndexDf.col("income_per_inh").divide(salesIndexDf.col("income_of_best")));
  salesIndexDf = salesIndexDf.withColumn(
      "index",
      salesIndexDf.col("idx_revenue").multiply(salesIndexDf.col("idx_pop")
          .multiply(salesIndexDf.col("idx_income"))));
  salesIndexDf = salesIndexDf.withColumn(
      "potential_revenue",
      salesIndexDf.col("revenue").multiply(salesIndexDf.col("index")));
  salesIndexDf = salesIndexDf
      .drop("idx_income")
      .drop("idx_pop")
      .drop("idx_revenue")
      .drop("income_of_best")
      .drop("total_income")
      .drop("revenue_by_inh")
      .drop("pop_of_best")
      .drop("best_revenue_per_inh")
      .orderBy(salesIndexDf.col("potential_revenue").desc());
  salesIndexDf.show();
}
 
Developer: jgperrin | Project: net.jgp.labs.informix2spark | Lines of code: 57 | Source: SalesTargetApp.java
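One detail worth noting in Example 3: the best_revenue_per_inh column is built as pop / pop * bestRevenuePerInhabitant, relying on pop/pop evaluating to 1 on every row. The lit function, which the same method already uses for pop_of_best and income_of_best, expresses the constant column more directly and does not turn null when pop is null. A sketch of the equivalent call:

import static org.apache.spark.sql.functions.lit;

// lit() wraps the driver-side double as a constant Column, so every
// row gets the same value without depending on the "pop" column.
salesIndexDf = salesIndexDf.withColumn(
    "best_revenue_per_inh",
    lit(bestRevenuePerInhabitant));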

Example 4: transform

import org.apache.spark.sql.Dataset; // import the package/class the method depends on
@Override
public Dataset<Row> transform(Dataset<?> dataset){
	StructType schema = dataset.schema();

	StructType structSchema = getStructSchema(schema);

	Column structColumn = dataset.apply(DatasetUtil.escapeColumnName(getStructCol()));

	Dataset<Row> result = dataset.toDF();

	StructField[] fields = structSchema.fields();
	for(StructField field : fields){
		String name = field.name();

		// extract the struct member, then promote it to a top-level column
		Column fieldColumn = structColumn.getField(DatasetUtil.escapeColumnName(name));

		result = result.withColumn(DatasetUtil.escapeColumnName(name), fieldColumn);
	}

	return result;
}
 
Developer: jeremyore | Project: spark-pmml-import | Lines of code: 22 | Source: ColumnExploder.java
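The transform above flattens a struct column by iterating over its schema and promoting each field to a top-level column via getField and withColumn. The same flattening can be written inline; here is a minimal sketch with a hypothetical struct column named "address" (it skips the column-name escaping that the hardened version above performs):

import org.apache.spark.sql.Column;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

// A one-row frame whose "address" column is a struct with two fields.
Dataset<Row> df = spark.sql(
    "SELECT named_struct('street', 'Main St', 'city', 'Springfield') AS address");

StructType addressSchema = (StructType) df.schema().apply("address").dataType();

Dataset<Row> flat = df;
for (StructField field : addressSchema.fields()) {
  // getField extracts one struct member; withColumn promotes it to a
  // top-level column named after the field.
  Column fieldColumn = df.col("address").getField(field.name());
  flat = flat.withColumn(field.name(), fieldColumn);
}

flat.show();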


Note: The org.apache.spark.sql.Dataset.withColumn examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by many developers, and the copyright of the source code remains with the original authors. Consult each project's License before distributing or using the code; do not republish without permission.