This article collects typical usage examples of the Java method org.apache.spark.sql.Dataset.select. If you are unsure what Dataset.select does or how to use it in practice, the curated method examples below may help. You can also read further about the enclosing class, org.apache.spark.sql.Dataset.
The following 8 code examples of Dataset.select are sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Java code examples.
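Before the examples, here is a minimal self-contained sketch of Dataset.select. The file name and column names ("people.csv", "name", "age") are hypothetical and used purely for illustration:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import static org.apache.spark.sql.functions.col;

public class SelectSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().master("local").appName("SelectSketch").getOrCreate();
    // "people.csv" and its columns are assumed here only for illustration
    Dataset<Row> df = spark.read()
        .option("header", "true")
        .option("inferSchema", "true")
        .csv("people.csv");
    // select by Column object and with a derived, aliased expression
    Dataset<Row> projected = df.select(col("name"), col("age"), col("age").plus(1).alias("age_next_year"));
    projected.show();
    spark.stop();
  }
}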
Example 1: computeJoins
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
public Dataset<Row> computeJoins(SQLContext sqlContext) {
    // compute all the joins
    Dataset<Row> results = node.computeJoinWithChildren(sqlContext);
    // select only the requested result
    Column[] selectedColumns = new Column[node.projection.size()];
    for (int i = 0; i < selectedColumns.length; i++) {
        selectedColumns[i] = new Column(node.projection.get(i));
    }
    // if there is a filter set, apply it before the projection
    results = filter == null
            ? results.select(selectedColumns)
            : results.filter(filter).select(selectedColumns);
    // if the results should be distinct, drop duplicate rows
    if (selectDistinct) {
        results = results.distinct();
    }
    return results;
}
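If node.projection is a List&lt;String&gt; of column names (as the Column constructor above suggests), the projection array can also be built without the explicit loop. A drop-in sketch for the loop above, assuming the same node and filter fields and the org.apache.spark.sql.functions class:

// alternative to the loop above; assumes node.projection is a List<String>
Column[] selectedColumns = node.projection.stream()
        .map(functions::col)
        .toArray(Column[]::new);
results = filter == null
        ? results.select(selectedColumns)
        : results.filter(filter).select(selectedColumns);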
Example 2: getHouseholdDataframe
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
private Dataset<Row> getHouseholdDataframe() {
    String filename = "data/14zpallagi*.csv";

    Dataset<Row> df = spark.read().format("csv")
            .option("inferSchema", "true")
            .option("header", "true")
            .load(filename);

    // keep the key columns and compute the per-bracket income (N1 * A02650)
    df = df.select(
            df.col("zipcode"),
            df.col("agi_stub"),
            df.col("N1"),
            df.col("A02650"),
            df.col("N1").multiply(df.col("A02650")));

    // the computed column gets an auto-generated name, so rename it to "income"
    df = df.withColumnRenamed(df.columns()[df.columns().length - 1], "income");
    df = df.groupBy("zipcode").sum("income");
    // sum() also auto-names its output column; rename it to "total_income"
    df = df.withColumnRenamed(df.columns()[df.columns().length - 1], "total_income");
    return df;
}
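The two withColumnRenamed calls work around the auto-generated column names. An alternative sketch that names the columns up front with alias(), assuming the same df and the org.apache.spark.sql.functions class:

// alternative to the renaming above; uses functions.sum for the aggregation
Dataset<Row> income = df.select(
            df.col("zipcode"),
            df.col("N1").multiply(df.col("A02650")).alias("income"))
        .groupBy("zipcode")
        .agg(functions.sum("income").alias("total_income"));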
Example 3: load_ntriples
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
public void load_ntriples() {
    String ds = hdfs_input_directory;
    // read the raw N-Triples file as a single "value" column of text lines
    Dataset<Row> triple_table_file = spark.read().text(ds);
    String triple_regex = build_triple_regex();
    // extract subject, predicate and object from each line via regex capture groups
    Dataset<Row> triple_table = triple_table_file.select(
            functions.regexp_extract(functions.col("value"), triple_regex, 1).alias(this.column_name_subject),
            functions.regexp_extract(functions.col("value"), triple_regex, 2).alias(this.column_name_predicate),
            functions.regexp_extract(functions.col("value"), triple_regex, 3).alias(this.column_name_object));
    triple_table.createOrReplaceTempView(name_tripletable);
    logger.info("Created tripletable");
}
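build_triple_regex() is not shown above. Purely as an illustration of the three capture groups consumed by regexp_extract, a simplified pattern for well-formed N-Triples lines could look like the sketch below; the project's real method likely handles literals, blank nodes and comments more carefully:

// Hypothetical simplification, not the project's actual regex:
// group 1 = subject, group 2 = predicate, group 3 = object of a line such as "<s> <p> <o> ."
private String build_triple_regex() {
    return "^\\s*(\\S+)\\s+(\\S+)\\s+(.+?)\\s*\\.\\s*$";
}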
Example 4: writeAncestorsToTable
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
/**
 * Writes ancestor records to a table. This method ensures the columns and partitions are mapped
 * properly, and is a workaround for a problem similar to the one described <a
 * href="http://stackoverflow.com/questions/35313077/pyspark-order-of-column-on-write-to-mysql-with-jdbc">here</a>.
 *
 * @param ancestors a dataset of ancestor records
 * @param tableName the table to write them to
 */
private static void writeAncestorsToTable(Dataset<Ancestor> ancestors, String tableName) {

  Dataset<Row> orderedColumnDataset = ancestors.select("descendantSystem",
      "descendantValue",
      "ancestorSystem",
      "ancestorValue",
      "uri",
      "version");

  orderedColumnDataset.write()
      .mode(SaveMode.ErrorIfExists)
      .insertInto(tableName);
}
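This example and the next two follow the same pattern: insertInto() resolves columns by position rather than by name, so select() must list the columns in the target table's declared order, with the partition columns last and in lower case. A generic sketch with a hypothetical table layout and table name:

// Hypothetical table: CREATE TABLE concepts (code STRING, display STRING)
//                     PARTITIONED BY (systemuri STRING, systemversion STRING)
Dataset<Row> ordered = concepts.select("code",
    "display",
    "systemuri",       // partition column, second to last
    "systemversion");  // partition column, last

ordered.write()
    .mode(SaveMode.ErrorIfExists)
    .insertInto("concepts_table");  // matches columns by position, not by name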
Example 5: writeValuesToTable
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
/**
 * Writes value records to a table. This method ensures the columns and partitions are mapped
 * properly, and is a workaround for a problem similar to the one described <a
 * href="http://stackoverflow.com/questions/35313077/pyspark-order-of-column-on-write-to-mysql-with-jdbc">here</a>.
 *
 * @param values a dataset of value records
 * @param tableName the table to write them to
 */
private static void writeValuesToTable(Dataset<Value> values, String tableName) {

  // Note the last two columns here must be the partitioned-by columns, in order and in
  // lower case, for Spark to properly match them to the partitions.
  Dataset<Row> orderColumnDataset = values.select("system",
      "version",
      "value",
      "valueseturi",
      "valuesetversion");

  orderColumnDataset.write()
      .mode(SaveMode.ErrorIfExists)
      .insertInto(tableName);
}
Example 6: writeMappingsToTable
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
/**
 * Writes mapping records to a table. This method ensures the columns and partitions are mapped
 * properly, and is a workaround for a problem similar to the one described <a
 * href="http://stackoverflow.com/questions/35313077/pyspark-order-of-column-on-write-to-mysql-with-jdbc">here</a>.
 *
 * @param mappings a dataset of mapping records
 * @param tableName the table to write them to
 */
private static void writeMappingsToTable(Dataset<Mapping> mappings,
    String tableName) {

  // Note the last two columns here must be the partitioned-by columns,
  // in order and in lower case, for Spark to properly match
  // them to the partitions.
  Dataset<Row> orderedColumnDataset =
      mappings.select("sourceValueSet",
          "targetValueSet",
          "sourceSystem",
          "sourceValue",
          "targetSystem",
          "targetValue",
          "equivalence",
          "conceptmapuri",
          "conceptmapversion");

  orderedColumnDataset
      .write()
      .insertInto(tableName);
}
Example 7: main
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
public static void main(String[] args) {
  SparkSession spark = SparkSession
      .builder()
      .master("local")
      .config("spark.sql.warehouse.dir", "file:///C:/Users/sumit.kumar/Downloads/bin/warehouse")
      .appName("JavaEstimatorTransformerParamExample")
      .getOrCreate();
  Logger rootLogger = LogManager.getRootLogger();
  rootLogger.setLevel(Level.WARN);

  // $example on$
  // Prepare training data.
  List<Row> dataTraining = Arrays.asList(
      RowFactory.create(1.0, Vectors.dense(0.0, 1.1, 0.1)),
      RowFactory.create(0.0, Vectors.dense(2.0, 1.0, -1.0)),
      RowFactory.create(0.0, Vectors.dense(2.0, 1.3, 1.0)),
      RowFactory.create(1.0, Vectors.dense(0.0, 1.2, -0.5))
  );
  StructType schema = new StructType(new StructField[]{
      new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
      new StructField("features", new VectorUDT(), false, Metadata.empty())
  });
  Dataset<Row> training = spark.createDataFrame(dataTraining, schema);

  // Create a LogisticRegression instance. This instance is an Estimator.
  LogisticRegression lr = new LogisticRegression();
  // Print out the parameters, documentation, and any default values.
  System.out.println("LogisticRegression parameters:\n" + lr.explainParams() + "\n");

  // We may set parameters using setter methods.
  lr.setMaxIter(10).setRegParam(0.01);

  // Learn a LogisticRegression model. This uses the parameters stored in lr.
  LogisticRegressionModel model1 = lr.fit(training);
  // Since model1 is a Model (i.e., a Transformer produced by an Estimator),
  // we can view the parameters it used during fit().
  // This prints the parameter (name: value) pairs, where names are unique IDs for this
  // LogisticRegression instance.
  System.out.println("Model 1 was fit using parameters: " + model1.parent().extractParamMap());

  // We may alternatively specify parameters using a ParamMap.
  ParamMap paramMap = new ParamMap()
      .put(lr.maxIter().w(20))  // Specify 1 Param.
      .put(lr.maxIter(), 30)  // This overwrites the original maxIter.
      .put(lr.regParam().w(0.1), lr.threshold().w(0.55));  // Specify multiple Params.

  // One can also combine ParamMaps.
  ParamMap paramMap2 = new ParamMap()
      .put(lr.probabilityCol().w("myProbability"));  // Change output column name
  ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2);

  // Now learn a new model using the paramMapCombined parameters.
  // paramMapCombined overrides all parameters set earlier via lr.set* methods.
  LogisticRegressionModel model2 = lr.fit(training, paramMapCombined);
  System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap());

  // Prepare test documents.
  List<Row> dataTest = Arrays.asList(
      RowFactory.create(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
      RowFactory.create(0.0, Vectors.dense(3.0, 2.0, -0.1)),
      RowFactory.create(1.0, Vectors.dense(0.0, 2.2, -1.5))
  );
  Dataset<Row> test = spark.createDataFrame(dataTest, schema);

  // Make predictions on test documents using the Transformer.transform() method.
  // LogisticRegression.transform will only use the 'features' column.
  // Note that model2.transform() outputs a 'myProbability' column instead of the usual
  // 'probability' column since we renamed the lr.probabilityCol parameter previously.
  Dataset<Row> results = model2.transform(test);
  Dataset<Row> rows = results.select("features", "label", "myProbability", "prediction");
  for (Row r : rows.collectAsList()) {
    System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2)
        + ", prediction=" + r.get(3));
  }
  // $example off$

  spark.stop();
}
Developer: PacktPublishing; Project: Apache-Spark-2x-for-Java-Developers; Lines of code: 76; Source file: JavaEstimatorTransformerParamExample.java
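As a small follow-up sketch (variable names reused from the example above), the selected columns can also be read by name via Row.getAs(), which is less brittle than positional get() if the select() order changes:

// requires org.apache.spark.ml.linalg.Vector
for (Row r : rows.collectAsList()) {
  Vector features = r.getAs("features");
  Double label = r.getAs("label");
  Double prediction = r.getAs("prediction");
  System.out.println("(" + features + ", " + label + ") -> prob=" + r.getAs("myProbability")
      + ", prediction=" + prediction);
}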
Example 8: main
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
public static void main(String[] args) {
  // Windows-specific property, needed if Hadoop is not installed or HADOOP_HOME is not set
  System.setProperty("hadoop.home.dir", "E:\\hadoop");

  // Build a Spark Session
  SparkSession sparkSession = SparkSession
      .builder()
      .master("local")
      .config("spark.sql.warehouse.dir", "file:///E:/hadoop/warehouse")
      .appName("EdgeBuilder")
      .getOrCreate();
  Logger rootLogger = LogManager.getRootLogger();
  rootLogger.setLevel(Level.WARN);

  // Read the CSV data
  Dataset<Row> emp_ds = sparkSession.read()
      .format("com.databricks.spark.csv")
      .option("header", "true")
      .option("inferSchema", "true")
      .load("src/main/resources/employee.txt");

  UDF2 calcDays = new CalcDaysUDF();
  // Register the UDF in the Spark Session created above
  sparkSession.udf().register("calcDays", calcDays, DataTypes.LongType);

  emp_ds.createOrReplaceTempView("emp_ds");
  emp_ds.printSchema();
  emp_ds.show();
  sparkSession.sql("select calcDays(hiredate,'dd-MM-yyyy') from emp_ds").show();

  // Instantiate the UDAF
  AverageUDAF calcAvg = new AverageUDAF();
  // Register the UDAF with the SparkSession
  sparkSession.udf().register("calAvg", calcAvg);
  // Use the UDAF
  sparkSession.sql("select deptno,calAvg(salary) from emp_ds group by deptno ").show();

  // Type-safe (Dataset) aggregation via a typed UDAF
  TypeSafeUDAF typeSafeUDAF = new TypeSafeUDAF();
  Dataset<Employee> emf = emp_ds.as(Encoders.bean(Employee.class));
  emf.printSchema();
  emf.show();

  TypedColumn<Employee, Double> averageSalary = typeSafeUDAF.toColumn().name("averageTypeSafe");
  Dataset<Double> result = emf.select(averageSalary);
  result.show();
}
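TypeSafeUDAF and Employee are not part of the listing above. For context, a type-safe UDAF in Spark 2.x is usually written as an org.apache.spark.sql.expressions.Aggregator. The sketch below is a hedged guess at what such a class might look like, assuming Employee is a bean with a numeric getSalary() accessor:

import java.io.Serializable;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.expressions.Aggregator;

// Illustrative only; the project's actual TypeSafeUDAF is not shown in the example.
public class TypeSafeUDAF extends Aggregator<Employee, TypeSafeUDAF.AvgBuffer, Double> {

  // mutable aggregation buffer: running sum and count
  public static class AvgBuffer implements Serializable {
    private double sum;
    private long count;
    public double getSum() { return sum; }
    public void setSum(double sum) { this.sum = sum; }
    public long getCount() { return count; }
    public void setCount(long count) { this.count = count; }
  }

  @Override
  public AvgBuffer zero() {
    return new AvgBuffer();
  }

  @Override
  public AvgBuffer reduce(AvgBuffer buffer, Employee emp) {
    buffer.setSum(buffer.getSum() + emp.getSalary());  // assumes Employee.getSalary()
    buffer.setCount(buffer.getCount() + 1);
    return buffer;
  }

  @Override
  public AvgBuffer merge(AvgBuffer b1, AvgBuffer b2) {
    b1.setSum(b1.getSum() + b2.getSum());
    b1.setCount(b1.getCount() + b2.getCount());
    return b1;
  }

  @Override
  public Double finish(AvgBuffer buffer) {
    return buffer.getCount() == 0 ? 0.0 : buffer.getSum() / buffer.getCount();
  }

  @Override
  public Encoder<AvgBuffer> bufferEncoder() {
    return Encoders.bean(AvgBuffer.class);
  }

  @Override
  public Encoder<Double> outputEncoder() {
    return Encoders.DOUBLE();
  }
}

Aggregator provides the toColumn() method used in the example, which is why emf.select(averageSalary) yields a Dataset&lt;Double&gt; containing the single aggregated value.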