This article collects typical usage examples of the Java method org.apache.spark.sql.Dataset.select. If you are unsure what Dataset.select does or how to use it in practice, the curated method examples below may help. You can also read further about the enclosing class, org.apache.spark.sql.Dataset.
The following 8 code examples of Dataset.select are sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Java code examples.
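Before the examples, here is a minimal self-contained sketch of Dataset.select. The file name and column names ("people.csv", "name", "age") are hypothetical and used purely for illustration:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import static org.apache.spark.sql.functions.col;

public class SelectSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().master("local").appName("SelectSketch").getOrCreate();
    // "people.csv" and its columns are assumed here only for illustration
    Dataset<Row> df = spark.read()
        .option("header", "true")
        .option("inferSchema", "true")
        .csv("people.csv");
    // select by Column object and with a derived, aliased expression
    Dataset<Row> projected = df.select(col("name"), col("age"), col("age").plus(1).alias("age_next_year"));
    projected.show();
    spark.stop();
  }
}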
Example 1: computeJoins
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
public Dataset<Row> computeJoins(SQLContext sqlContext) {
    // compute all the joins
    Dataset<Row> results = node.computeJoinWithChildren(sqlContext);
    // select only the requested result
    Column[] selectedColumns = new Column[node.projection.size()];
    for (int i = 0; i < selectedColumns.length; i++) {
        selectedColumns[i] = new Column(node.projection.get(i));
    }
    // if there is a filter set, apply it before the projection
    results = filter == null
            ? results.select(selectedColumns)
            : results.filter(filter).select(selectedColumns);
    // if the results should be distinct, drop duplicate rows
    if (selectDistinct) {
        results = results.distinct();
    }
    return results;
}
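If node.projection is a List&lt;String&gt; of column names (as the Column constructor above suggests), the projection array can also be built without the explicit loop. A drop-in sketch for the loop above, assuming the same node and filter fields and the org.apache.spark.sql.functions class:

// alternative to the loop above; assumes node.projection is a List<String>
Column[] selectedColumns = node.projection.stream()
        .map(functions::col)
        .toArray(Column[]::new);
results = filter == null
        ? results.select(selectedColumns)
        : results.filter(filter).select(selectedColumns);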
Example 2: getHouseholdDataframe
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
private Dataset<Row> getHouseholdDataframe() {
    String filename = "data/14zpallagi*.csv";

    Dataset<Row> df = spark.read().format("csv")
            .option("inferSchema", "true")
            .option("header", "true")
            .load(filename);

    // keep the key columns and compute the per-bracket income (N1 * A02650)
    df = df.select(
            df.col("zipcode"),
            df.col("agi_stub"),
            df.col("N1"),
            df.col("A02650"),
            df.col("N1").multiply(df.col("A02650")));

    // the computed column gets an auto-generated name, so rename it to "income"
    df = df.withColumnRenamed(df.columns()[df.columns().length - 1], "income");
    df = df.groupBy("zipcode").sum("income");
    // sum() also auto-names its output column; rename it to "total_income"
    df = df.withColumnRenamed(df.columns()[df.columns().length - 1], "total_income");
    return df;
}
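The two withColumnRenamed calls work around the auto-generated column names. An alternative sketch that names the columns up front with alias(), assuming the same df and the org.apache.spark.sql.functions class:

// alternative to the renaming above; uses functions.sum for the aggregation
Dataset<Row> income = df.select(
            df.col("zipcode"),
            df.col("N1").multiply(df.col("A02650")).alias("income"))
        .groupBy("zipcode")
        .agg(functions.sum("income").alias("total_income"));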
Example 3: load_ntriples
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
public void load_ntriples() {
    String ds = hdfs_input_directory;
    // read the raw N-Triples file as a single "value" column of text lines
    Dataset<Row> triple_table_file = spark.read().text(ds);
    String triple_regex = build_triple_regex();
    // extract subject, predicate and object from each line via regex capture groups
    Dataset<Row> triple_table = triple_table_file.select(
            functions.regexp_extract(functions.col("value"), triple_regex, 1).alias(this.column_name_subject),
            functions.regexp_extract(functions.col("value"), triple_regex, 2).alias(this.column_name_predicate),
            functions.regexp_extract(functions.col("value"), triple_regex, 3).alias(this.column_name_object));
    triple_table.createOrReplaceTempView(name_tripletable);
    logger.info("Created tripletable");
}
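build_triple_regex() is not shown above. Purely as an illustration of the three capture groups consumed by regexp_extract, a simplified pattern for well-formed N-Triples lines could look like the sketch below; the project's real method likely handles literals, blank nodes and comments more carefully:

// Hypothetical simplification, not the project's actual regex:
// group 1 = subject, group 2 = predicate, group 3 = object of a line such as "<s> <p> <o> ."
private String build_triple_regex() {
    return "^\\s*(\\S+)\\s+(\\S+)\\s+(.+?)\\s*\\.\\s*$";
}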
Example 4: writeAncestorsToTable
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
/**
 * Writes ancestor records to a table. This method ensures the columns and partitions are mapped
 * properly, and is a workaround for a problem similar to the one described <a
 * href="http://stackoverflow.com/questions/35313077/pyspark-order-of-column-on-write-to-mysql-with-jdbc">here</a>.
 *
 * @param ancestors a dataset of ancestor records
 * @param tableName the table to write them to
 */
private static void writeAncestorsToTable(Dataset<Ancestor> ancestors, String tableName) {

  Dataset<Row> orderedColumnDataset = ancestors.select("descendantSystem",
      "descendantValue",
      "ancestorSystem",
      "ancestorValue",
      "uri",
      "version");

  orderedColumnDataset.write()
      .mode(SaveMode.ErrorIfExists)
      .insertInto(tableName);
}
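This example and the next two follow the same pattern: insertInto() resolves columns by position rather than by name, so select() must list the columns in the target table's declared order, with the partition columns last and in lower case. A generic sketch with a hypothetical table layout and table name:

// Hypothetical table: CREATE TABLE concepts (code STRING, display STRING)
//                     PARTITIONED BY (systemuri STRING, systemversion STRING)
Dataset<Row> ordered = concepts.select("code",
    "display",
    "systemuri",       // partition column, second to last
    "systemversion");  // partition column, last

ordered.write()
    .mode(SaveMode.ErrorIfExists)
    .insertInto("concepts_table");  // matches columns by position, not by name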
Example 5: writeValuesToTable
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
/**
 * Writes value records to a table. This method ensures the columns and partitions are mapped
 * properly, and is a workaround for a problem similar to the one described <a
 * href="http://stackoverflow.com/questions/35313077/pyspark-order-of-column-on-write-to-mysql-with-jdbc">here</a>.
 *
 * @param values a dataset of value records
 * @param tableName the table to write them to
 */
private static void writeValuesToTable(Dataset<Value> values, String tableName) {

  // Note the last two columns here must be the partitioned-by columns, in order and in
  // lower case, for Spark to properly match them to the partitions.
  Dataset<Row> orderColumnDataset = values.select("system",
      "version",
      "value",
      "valueseturi",
      "valuesetversion");

  orderColumnDataset.write()
      .mode(SaveMode.ErrorIfExists)
      .insertInto(tableName);
}
Example 6: writeMappingsToTable
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
/**
 * Writes mapping records to a table. This method ensures the columns and partitions are mapped
 * properly, and is a workaround for a problem similar to the one described <a
 * href="http://stackoverflow.com/questions/35313077/pyspark-order-of-column-on-write-to-mysql-with-jdbc">here</a>.
 *
 * @param mappings a dataset of mapping records
 * @param tableName the table to write them to
 */
private static void writeMappingsToTable(Dataset<Mapping> mappings,
    String tableName) {

  // Note the last two columns here must be the partitioned-by columns,
  // in order and in lower case, for Spark to properly match
  // them to the partitions.
  Dataset<Row> orderedColumnDataset =
      mappings.select("sourceValueSet",
          "targetValueSet",
          "sourceSystem",
          "sourceValue",
          "targetSystem",
          "targetValue",
          "equivalence",
          "conceptmapuri",
          "conceptmapversion");

  orderedColumnDataset
      .write()
      .insertInto(tableName);
}
Example 7: main
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
public static void main(String[] args) {
  SparkSession spark = SparkSession
      .builder()
      .master("local")
      .config("spark.sql.warehouse.dir", "file:///C:/Users/sumit.kumar/Downloads/bin/warehouse")
      .appName("JavaEstimatorTransformerParamExample")
      .getOrCreate();
  Logger rootLogger = LogManager.getRootLogger();
  rootLogger.setLevel(Level.WARN);

  // $example on$
  // Prepare training data.
  List<Row> dataTraining = Arrays.asList(
      RowFactory.create(1.0, Vectors.dense(0.0, 1.1, 0.1)),
      RowFactory.create(0.0, Vectors.dense(2.0, 1.0, -1.0)),
      RowFactory.create(0.0, Vectors.dense(2.0, 1.3, 1.0)),
      RowFactory.create(1.0, Vectors.dense(0.0, 1.2, -0.5))
  );
  StructType schema = new StructType(new StructField[]{
      new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
      new StructField("features", new VectorUDT(), false, Metadata.empty())
  });
  Dataset<Row> training = spark.createDataFrame(dataTraining, schema);

  // Create a LogisticRegression instance. This instance is an Estimator.
  LogisticRegression lr = new LogisticRegression();
  // Print out the parameters, documentation, and any default values.
  System.out.println("LogisticRegression parameters:\n" + lr.explainParams() + "\n");

  // We may set parameters using setter methods.
  lr.setMaxIter(10).setRegParam(0.01);

  // Learn a LogisticRegression model. This uses the parameters stored in lr.
  LogisticRegressionModel model1 = lr.fit(training);
  // Since model1 is a Model (i.e., a Transformer produced by an Estimator),
  // we can view the parameters it used during fit().
  // This prints the parameter (name: value) pairs, where names are unique IDs for this
  // LogisticRegression instance.
  System.out.println("Model 1 was fit using parameters: " + model1.parent().extractParamMap());

  // We may alternatively specify parameters using a ParamMap.
  ParamMap paramMap = new ParamMap()
      .put(lr.maxIter().w(20))  // Specify 1 Param.
      .put(lr.maxIter(), 30)  // This overwrites the original maxIter.
      .put(lr.regParam().w(0.1), lr.threshold().w(0.55));  // Specify multiple Params.

  // One can also combine ParamMaps.
  ParamMap paramMap2 = new ParamMap()
      .put(lr.probabilityCol().w("myProbability"));  // Change output column name
  ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2);

  // Now learn a new model using the paramMapCombined parameters.
  // paramMapCombined overrides all parameters set earlier via lr.set* methods.
  LogisticRegressionModel model2 = lr.fit(training, paramMapCombined);
  System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap());

  // Prepare test documents.
  List<Row> dataTest = Arrays.asList(
      RowFactory.create(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
      RowFactory.create(0.0, Vectors.dense(3.0, 2.0, -0.1)),
      RowFactory.create(1.0, Vectors.dense(0.0, 2.2, -1.5))
  );
  Dataset<Row> test = spark.createDataFrame(dataTest, schema);

  // Make predictions on test documents using the Transformer.transform() method.
  // LogisticRegression.transform will only use the 'features' column.
  // Note that model2.transform() outputs a 'myProbability' column instead of the usual
  // 'probability' column since we renamed the lr.probabilityCol parameter previously.
  Dataset<Row> results = model2.transform(test);
  Dataset<Row> rows = results.select("features", "label", "myProbability", "prediction");
  for (Row r : rows.collectAsList()) {
    System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2)
        + ", prediction=" + r.get(3));
  }
  // $example off$

  spark.stop();
}
Developer: PacktPublishing; Project: Apache-Spark-2x-for-Java-Developers; Lines of code: 76; Source file: JavaEstimatorTransformerParamExample.java
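As a small follow-up sketch (variable names reused from the example above), the selected columns can also be read by name via Row.getAs(), which is less brittle than positional get() if the select() order changes:

// requires org.apache.spark.ml.linalg.Vector
for (Row r : rows.collectAsList()) {
  Vector features = r.getAs("features");
  Double label = r.getAs("label");
  Double prediction = r.getAs("prediction");
  System.out.println("(" + features + ", " + label + ") -> prob=" + r.getAs("myProbability")
      + ", prediction=" + prediction);
}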
Example 8: main
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
public static void main(String[] args) {
  // Windows-specific property, needed if Hadoop is not installed or HADOOP_HOME is not set
  System.setProperty("hadoop.home.dir", "E:\\hadoop");

  // Build a Spark Session
  SparkSession sparkSession = SparkSession
      .builder()
      .master("local")
      .config("spark.sql.warehouse.dir", "file:///E:/hadoop/warehouse")
      .appName("EdgeBuilder")
      .getOrCreate();
  Logger rootLogger = LogManager.getRootLogger();
  rootLogger.setLevel(Level.WARN);

  // Read the CSV data
  Dataset<Row> emp_ds = sparkSession.read()
      .format("com.databricks.spark.csv")
      .option("header", "true")
      .option("inferSchema", "true")
      .load("src/main/resources/employee.txt");

  UDF2 calcDays = new CalcDaysUDF();
  // Register the UDF in the Spark Session created above
  sparkSession.udf().register("calcDays", calcDays, DataTypes.LongType);

  emp_ds.createOrReplaceTempView("emp_ds");
  emp_ds.printSchema();
  emp_ds.show();
  sparkSession.sql("select calcDays(hiredate,'dd-MM-yyyy') from emp_ds").show();

  // Instantiate the UDAF
  AverageUDAF calcAvg = new AverageUDAF();
  // Register the UDAF with the SparkSession
  sparkSession.udf().register("calAvg", calcAvg);
  // Use the UDAF
  sparkSession.sql("select deptno,calAvg(salary) from emp_ds group by deptno ").show();

  // Type-safe (Dataset) aggregation via a typed UDAF
  TypeSafeUDAF typeSafeUDAF = new TypeSafeUDAF();
  Dataset<Employee> emf = emp_ds.as(Encoders.bean(Employee.class));
  emf.printSchema();
  emf.show();

  TypedColumn<Employee, Double> averageSalary = typeSafeUDAF.toColumn().name("averageTypeSafe");
  Dataset<Double> result = emf.select(averageSalary);
  result.show();
}
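TypeSafeUDAF and Employee are not part of the listing above. For context, a type-safe UDAF in Spark 2.x is usually written as an org.apache.spark.sql.expressions.Aggregator. The sketch below is a hedged guess at what such a class might look like, assuming Employee is a bean with a numeric getSalary() accessor:

import java.io.Serializable;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.expressions.Aggregator;

// Illustrative only; the project's actual TypeSafeUDAF is not shown in the example.
public class TypeSafeUDAF extends Aggregator<Employee, TypeSafeUDAF.AvgBuffer, Double> {

  // mutable aggregation buffer: running sum and count
  public static class AvgBuffer implements Serializable {
    private double sum;
    private long count;
    public double getSum() { return sum; }
    public void setSum(double sum) { this.sum = sum; }
    public long getCount() { return count; }
    public void setCount(long count) { this.count = count; }
  }

  @Override
  public AvgBuffer zero() {
    return new AvgBuffer();
  }

  @Override
  public AvgBuffer reduce(AvgBuffer buffer, Employee emp) {
    buffer.setSum(buffer.getSum() + emp.getSalary());  // assumes Employee.getSalary()
    buffer.setCount(buffer.getCount() + 1);
    return buffer;
  }

  @Override
  public AvgBuffer merge(AvgBuffer b1, AvgBuffer b2) {
    b1.setSum(b1.getSum() + b2.getSum());
    b1.setCount(b1.getCount() + b2.getCount());
    return b1;
  }

  @Override
  public Double finish(AvgBuffer buffer) {
    return buffer.getCount() == 0 ? 0.0 : buffer.getSum() / buffer.getCount();
  }

  @Override
  public Encoder<AvgBuffer> bufferEncoder() {
    return Encoders.bean(AvgBuffer.class);
  }

  @Override
  public Encoder<Double> outputEncoder() {
    return Encoders.DOUBLE();
  }
}

Aggregator provides the toColumn() method used in the example, which is why emf.select(averageSalary) yields a Dataset&lt;Double&gt; containing the single aggregated value.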