

Java Dataset.show Method Code Examples

This article collects typical code examples of the Java org.apache.spark.sql.Dataset.show method. If you are wondering how to use Dataset.show in Java, what it is for, or what it looks like in practice, the curated examples below should help. You can also browse further usage examples of org.apache.spark.sql.Dataset, the class this method belongs to.


The text below presents 15 code examples of the Dataset.show method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
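Before diving into the examples, a quick orientation: Dataset.show prints rows of a DataFrame to standard output in tabular form. The following is a minimal, self-contained sketch (not taken from any of the projects below) of the most commonly used overloads: show(), show(numRows), show(truncate), and show(numRows, truncate). The DataFrame built with spark.range and the column names "id" and "text" are illustrative assumptions only.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.concat_ws;
import static org.apache.spark.sql.functions.lit;

public class DatasetShowSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("DatasetShowSketch")
        .master("local[*]")
        .getOrCreate();

    // A small DataFrame with one long string column, so the effect of truncation is visible.
    Dataset<Row> df = spark.range(1, 51).toDF("id")
        .withColumn("text",
            concat_ws("-", lit("a-fairly-long-cell-value"), col("id").cast("string")));

    df.show();          // default: first 20 rows, cells truncated to 20 characters
    df.show(5);         // first 5 rows
    df.show(false);     // first 20 rows, cells printed in full
    df.show(5, false);  // first 5 rows, cells printed in full

    spark.stop();
  }
}

Passing false for the truncate argument, as several of the examples below do with show(false) or show(100, false), prints long cell values in full instead of cutting them off at 20 characters.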

Example 1: main

import org.apache.spark.sql.Dataset; // import the package/class this method depends on
public static void main(String[] args) {
	SparkSession spark = SparkSession.builder()
			.master("local[8]")
			.appName("PCAExpt")
			.getOrCreate();

	// Load and parse data
	String filePath = "/home/kchoppella/book/Chapter09/data/covtypeNorm.csv";

	// Loads data.
	Dataset<Row> inDataset = spark.read()
			.format("com.databricks.spark.csv")
			.option("header", "true")
			.option("inferSchema", true)
			.load(filePath);
	ArrayList<String> inputColsList = new ArrayList<String>(Arrays.asList(inDataset.columns()));
	
	//Make single features column for feature vectors 
	inputColsList.remove("class");
	String[] inputCols = inputColsList.parallelStream().toArray(String[]::new);
	
	//Prepare dataset for training with all features in "features" column
	VectorAssembler assembler = new VectorAssembler().setInputCols(inputCols).setOutputCol("features");
	Dataset<Row> dataset = assembler.transform(inDataset);

	PCAModel pca = new PCA()
			.setK(16)
			.setInputCol("features")
			.setOutputCol("pcaFeatures")
			.fit(dataset);

	Dataset<Row> result = pca.transform(dataset).select("pcaFeatures");
	System.out.println("Explained variance:");
	System.out.println(pca.explainedVariance());
	result.show(false);
	// $example off$
	spark.stop();
}
 
Developer: PacktPublishing, Project: Machine-Learning-End-to-Endguide-for-Java-developers, Lines of code: 39, Source: PCAExpt.java

Example 2: start

import org.apache.spark.sql.Dataset; // import the package/class this method depends on
private void start(int zip) {
  SparkSession spark = SparkSession
      .builder()
      .appName("Number of households in your ZIP Code™")
      .master("local")
      .getOrCreate();

  String filename = "data/14zpallagi-part*.csv";
  Dataset<Row> df = spark
      .read()
      .format("csv")
      .option("inferSchema", "true")
      .option("header", "true")
      .load(filename);
  df.printSchema();
  df.sample(true, 0.01, 4589).show(2);
  System.out.println("Dataframe has " + df.count() + " rows and " + df.columns().length
      + " columns.");

  Dataset<Row> df2 = df.filter(df.col("zipcode").equalTo(zip));
  String[] colsToDrop = { "STATEFIPS", "mars1", "MARS2", "MARS4", "PREP", "N2",
      "NUMDEP", "TOTAL_VITA", "VITA", "TCE", "A00100", "N02650", "N00200", "A00200",
      "N00300", "A00300", "N00600", "A00600", "N00650", "A00650", "N00700", "A00700",
      "N00900", "A00900", "N01000", "A01000", "N01400", "A01400", "N01700", "A01700",
      "SCHF", "N02300", "A02300", "N02500", "A02500", "N26270", "A26270", "N02900",
      "A02900", "N03220", "A03220", "N03300", "A03300", "N03270", "A03270", "N03150",
      "A03150", "N03210", "A03210", "N03230", "A03230", "N03240", "A03240", "N04470",
      "A04470", "A00101", "N18425", "A18425", "N18450", "A18450", "N18500", "A18500",
      "N18300", "A18300", "N19300", "A19300", "N19700", "A19700", "N04800", "A04800",
      "N05800", "A05800", "N09600", "A09600", "N05780", "A05780", "N07100", "A07100",
      "N07300", "A07300", "N07180", "A07180", "N07230", "A07230", "N07240", "A07240",
      "N07220", "A07220", "N07260", "A07260", "N09400", "A09400", "N85770", "A85770",
      "N85775", "A85775", "N09750", "A09750", "N10600", "A10600", "N59660", "A59660",
      "N59720", "A59720", "N11070", "A11070", "N10960", "A10960", "N11560", "A11560",
      "N06500", "A06500", "N10300", "A10300", "N85530", "A85530", "N85300", "A85300",
      "N11901", "A11901", "N11902", "A11902" };
  for (String colName : colsToDrop) {
    df2 = df2.drop(colName);
  }
  df2.printSchema();
  df2.show();
  System.out.println("Dataframe has " + df2.count() + " rows and " + df2
      .columns().length + " columns.");

  Dataset<Row> df3 = df2.filter(df2.col("agi_stub").$greater(3));
  df3 = df3.groupBy("zipcode").sum("N1").withColumnRenamed("sum(N1)", "households");
  df3.show();
}
 
Developer: jgperrin, Project: net.jgp.labs.informix2spark, Lines of code: 49, Source: HouseholdsAboveMedianRevenuePerZipApp.java

Example 3: main

import org.apache.spark.sql.Dataset; // import the package/class this method depends on
public static void main (String[] args) throws IOException {
    SparkSession spark = SparkSession.builder().master("local").appName("DataProcess").getOrCreate();
    String filename = "prices-split-adjusted.csv";
    String symbol = "GOOG";
    // load data from csv file
    Dataset<Row> data = spark.read().format("csv").option("header", true)
            .load(new ClassPathResource(filename).getFile().getAbsolutePath())
            //.filter(functions.col("symbol").equalTo(symbol))
            //.drop("date").drop("symbol")
            .withColumn("openPrice", functions.col("open").cast("double")).drop("open")
            .withColumn("closePrice", functions.col("close").cast("double")).drop("close")
            .withColumn("lowPrice", functions.col("low").cast("double")).drop("low")
            .withColumn("highPrice", functions.col("high").cast("double")).drop("high")
            .withColumn("volumeTmp", functions.col("volume").cast("double")).drop("volume")
            .toDF("date", "symbol", "open", "close", "low", "high", "volume");

    data.show();

    Dataset<Row> symbols = data.select("date", "symbol").groupBy("symbol").agg(functions.count("date").as("count"));
    System.out.println("Number of Symbols: " + symbols.count());
    symbols.show();

    VectorAssembler assembler = new VectorAssembler()
            .setInputCols(new String[] {"open", "low", "high", "volume", "close"})
            .setOutputCol("features");

    data = assembler.transform(data).drop("open", "low", "high", "volume", "close");

    data = new MinMaxScaler().setMin(0).setMax(1)
            .setInputCol("features").setOutputCol("normalizedFeatures")
            .fit(data).transform(data)
            // rename the scaled column back to "features" (toDF("features") would need one name per column)
            .drop("features").withColumnRenamed("normalizedFeatures", "features");
}
 
Developer: IsaacChanghau, Project: StockPrediction, Lines of code: 34, Source: DataPreview.java

Example 4: start

import org.apache.spark.sql.Dataset; // import the package/class this method depends on
private void start() {

    // @formatter:off
    SparkSession spark = SparkSession
        .builder()
        .appName("Stores Customer")
        .master("local")
        .getOrCreate();
    // @formatter:on

    // @formatter:off
    Dataset<Row> df = spark
      .read()
      .format("jdbc")
      .option(
          "url", 
          "jdbc:informix-sqli://[::1]:33378/stores_demo:IFXHOST=lo_informix1210;DELIMIDENT=Y")
      .option("dbtable", "customer")
      .option("user", "informix")
      .option("password", "in4mix")
      .load();
    // @formatter:on

    df.printSchema();
    System.out.println("Number of rows in customer: " + df
        .count());
    df.show(5);
  }
 
Developer: jgperrin, Project: net.jgp.labs.informix2spark, Lines of code: 29, Source: BasicCustomerLoader.java

Example 5: main

import org.apache.spark.sql.Dataset; // import the package/class this method depends on
public static void main(String[] args) {
	 System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop");
		
      SparkSession sparkSession = SparkSession
      .builder()
      .master("local")
	  .config("spark.sql.warehouse.dir","file:///E:/sumitK/Hadoop/warehouse")
      .appName("JavaALSExample")
      .getOrCreate();
      Logger rootLogger = LogManager.getRootLogger();
		rootLogger.setLevel(Level.WARN); 

	
	HashMap<String, String> params = new HashMap<String, String>();
	params.put("rowTag", "food");
	params.put("failFast", "true");
	 Dataset<Row> docDF = sparkSession.read()
			                   .format("com.databricks.spark.xml")
			                   .options(params)
			                   .load("C:/Users/sumit.kumar/git/learning/src/main/resources/breakfast_menu.xml");
	 
	 docDF.printSchema();		 
	 docDF.show();
	 
	 docDF.write().format("com.databricks.spark.xml")
	    .option("rootTag", "food")
	    .option("rowTag", "food")
	    .save("C:/Users/sumit.kumar/git/learning/src/main/resources/newMenu.xml");

}
 
Developer: PacktPublishing, Project: Apache-Spark-2x-for-Java-Developers, Lines of code: 31, Source: XMLFileOperations.java

Example 6: start

import org.apache.spark.sql.Dataset; // import the package/class this method depends on
private boolean start() {
    SparkSession spark = SparkSession.builder()
            .appName("EXIF to Dataset")
            .master("local[*]").getOrCreate();
    
    String importDirectory = "/Users/jgp/Pictures";
    
    Dataset<Row> df = spark.read()
            .format("exif")
            .option("recursive", "true")
            .option("limit", "100000")
            .option("extensions", "jpg,jpeg")
            .load(importDirectory);
    
    // We can start analytics
    df = df
            .filter(df.col("GeoX").isNotNull())
            .filter(df.col("GeoZ").notEqual("NaN"))
            .orderBy(df.col("GeoZ").desc());
    df.collect();
    df.cache();
    System.out.println("I have imported " + df.count() + " photos.");
    df.printSchema();
    df.show(5);
    
    return true;
}
 
Developer: jgperrin, Project: net.jgp.labs.spark.datasources, Lines of code: 28, Source: PhotoMetadataIngestionApp.java

Example 7: liststatnumber

import org.apache.spark.sql.Dataset; // import the package/class this method depends on
public List<String> liststatnumber() {
    Dataset<Row> result = InitSpark.spark.sql("SELECT * " +
            "FROM tagjson tj " +
            "   limit 1");
    result.show();
    return result.toJSON().collectAsList();
}
 
Developer: alikemalocalan, Project: Spark-API, Lines of code: 8, Source: Recommendation.java

Example 8: main

import org.apache.spark.sql.Dataset; // import the package/class this method depends on
public static void main(String[] args) {
	SparkSession spark = SparkSession.builder()
			.master("local[8]")
			.appName("KMeansWithPCAExpt")
			.getOrCreate();

	// Load and parse data
	String filePath = "/home/kchoppella/book/Chapter09/data/covtypeNorm.csv";

	// Loads data.
	Dataset<Row> inDataset = spark.read()
			.format("com.databricks.spark.csv")
			.option("header", "true")
			.option("inferSchema", true)
			.load(filePath);
	ArrayList<String> inputColsList = new ArrayList<String>(Arrays.asList(inDataset.columns()));
	
	//Make single features column for feature vectors 
	inputColsList.remove("class");
	String[] inputCols = inputColsList.parallelStream().toArray(String[]::new);
	
	//Prepare dataset for training with all features in "features" column
	VectorAssembler assembler = new VectorAssembler().setInputCols(inputCols).setOutputCol("features");
	Dataset<Row> dataset = assembler.transform(inDataset);

	PCAModel pca = new PCA()
			.setK(16)
			.setInputCol("features")
			.setOutputCol("pcaFeatures")
			.fit(dataset);

	Dataset<Row> result = pca.transform(dataset).select("pcaFeatures");
	System.out.println("Explained variance:");
	System.out.println(pca.explainedVariance());
	result.show(false);
	
	KMeans kmeans = new KMeans().setK(27).setSeed(1L);
	KMeansModel model = kmeans.fit(dataset);

	// Evaluate clustering by computing Within Set Sum of Squared Errors.
	double WSSSE = model.computeCost(dataset);
	System.out.println("Within Set Sum of Squared Errors = " + WSSSE);

	// $example off$
	spark.stop();
}
 
Developer: PacktPublishing, Project: Machine-Learning-End-to-Endguide-for-Java-developers, Lines of code: 47, Source: KMeansWithPCAExpt.java

Example 9: start

import org.apache.spark.sql.Dataset; // import the package/class this method depends on
private void start() {
  // @formatter:off
   SparkSession spark = SparkSession
      .builder()
      .appName("Time to Ship")
      .master("local")
      .getOrCreate();
  // @formatter:on

  // Specific Informix dialect
  JdbcDialect dialect = new InformixJdbcDialect();
  JdbcDialects.registerDialect(dialect);

  // Configuration info for the database
  Config config = ConfigManager.getConfig(K.INFORMIX);

  // List of all tables we want to work with
  List<String> tables = new ArrayList<>();
  tables.add("orders");

  Map<String, Dataset<Row>> datalake = new HashMap<>();
  for (String table : tables) {
    System.out.print("Loading table [" + table + "] ... ");
    // @formatter:off
    Dataset<Row> df = spark.read()
        .format("jdbc")
        .option("url", config.getJdbcUrl())
        .option("dbtable", table)
        .option("user", config.getUser())
        .option("password", config.getPassword())
        .option("driver", config.getDriver())
        .load();
    // @formatter:on

    datalake.put(table, df);
    System.out.println("done.");
  }

  System.out.println("We have loaded " + datalake.size()
      + " table(s) in our data lake.");

  Dataset<Row> ordersDf = datalake.get("orders");
  // @formatter:off
  ordersDf = ordersDf.withColumn(
      "time_to_ship", 
      datediff(ordersDf.col("ship_date"), ordersDf.col("order_date")));
  // @formatter:on

  ordersDf.printSchema();
  ordersDf.show(10);
  System.out.println("We have " + ordersDf.count() + " orders");

  Dataset<Row> ordersDf2 = ordersDf.filter("time_to_ship IS NOT NULL");
  ordersDf2.printSchema();
  ordersDf2.show(5);
  System.out.println("We have " + ordersDf2.count() + " delivered orders");

}
 
Developer: jgperrin, Project: net.jgp.labs.informix2spark, Lines of code: 59, Source: TimeToShipApp.java

Example 10: start

import org.apache.spark.sql.Dataset; // import the package/class this method depends on
private void start() {
  Dataset<Row> householdDf = getHouseholdDataframe();
  Dataset<Row> populationDf = getPopulationDataframe();
  Dataset<Row> indexDf = joinHouseholdPopulation(householdDf, populationDf);
  Dataset<Row> salesDf = getSalesData();

  Dataset<Row> salesIndexDf = salesDf
      .join(indexDf, salesDf.col("zipcode").equalTo(indexDf.col("zipcode")), "left")
      .drop(indexDf.col("zipcode"));
  salesIndexDf = salesIndexDf.withColumn("revenue_by_inh", salesIndexDf.col("revenue")
      .divide(salesIndexDf.col("pop")));
  salesIndexDf = salesIndexDf.orderBy(col("revenue_by_inh").desc());
  Row bestRow = salesIndexDf.first();
  double bestRevenuePerInhabitant = ((BigDecimal) bestRow.getAs("revenue_by_inh"))
      .doubleValue();
  int populationOfBestRevenuePerInhabitant = bestRow.getAs("pop");
  double incomeOfBestRevenuePerInhabitant = bestRow.getAs("income_per_inh");
  salesIndexDf = salesIndexDf.withColumn(
      "best_revenue_per_inh",
      salesIndexDf.col("pop").divide(salesIndexDf.col("pop"))
          .multiply(bestRevenuePerInhabitant));
  salesIndexDf = salesIndexDf.withColumn(
      "pop_of_best",
      lit(populationOfBestRevenuePerInhabitant));
  salesIndexDf = salesIndexDf.withColumn(
      "income_of_best",
      lit(incomeOfBestRevenuePerInhabitant));
  salesIndexDf = salesIndexDf.withColumn(
      "idx_revenue",
      salesIndexDf.col("best_revenue_per_inh")
          .divide(salesIndexDf.col("revenue_by_inh")));
  salesIndexDf = salesIndexDf.withColumn(
      "idx_pop",
      salesIndexDf.col("pop").divide(salesIndexDf.col("pop_of_best")));
  salesIndexDf = salesIndexDf.withColumn(
      "idx_income",
      salesIndexDf.col("income_per_inh").divide(salesIndexDf.col("income_of_best")));
  salesIndexDf = salesIndexDf.withColumn(
      "index",
      salesIndexDf.col("idx_revenue").multiply(salesIndexDf.col("idx_pop")
          .multiply(salesIndexDf.col("idx_income"))));
  salesIndexDf = salesIndexDf.withColumn(
      "potential_revenue",
      salesIndexDf.col("revenue").multiply(salesIndexDf.col("index")));
  salesIndexDf = salesIndexDf
      .drop("idx_income")
      .drop("idx_pop")
      .drop("idx_revenue")
      .drop("income_of_best")
      .drop("total_income")
      .drop("revenue_by_inh")
      .drop("pop_of_best")
      .drop("best_revenue_per_inh")
      .orderBy(salesIndexDf.col("potential_revenue").desc());
  salesIndexDf.show();
}
 
Developer: jgperrin, Project: net.jgp.labs.informix2spark, Lines of code: 57, Source: SalesTargetApp.java

Example 11: getSalesData

import org.apache.spark.sql.Dataset; // import the package/class this method depends on
private Dataset<Row> getSalesData() {
  // Let's connect to the database
  Config config = ConfigManager.getConfig(K.INFORMIX);
  Connection connection = config.getConnection();
  if (connection == null) {
    return null;
  }

  // List of all tables we want to work with
  List<String> tables = new ArrayList<>();
  tables.add("customer");
  tables.add("orders");
  tables.add("items");
  tables.add("stock");

  // Specific Informix dialect
  JdbcDialect dialect = new InformixJdbcDialect();
  JdbcDialects.registerDialect(dialect);

  Map<String, Dataset<Row>> datalake = new HashMap<>();
  for (String table : tables) {
    System.out.print("Loading table [" + table + "] ... ");
    Dataset<Row> df = spark.read()
        .format("jdbc")
        .option("url", config.getJdbcUrl())
        .option("dbtable", table)
        .option("user", config.getUser())
        .option("password", config.getPassword())
        .option("driver", config.getDriver())
        .load();

    datalake.put(table, df);
    System.out.println("done");
  }

  System.out.println("We have loaded " + datalake.size()
      + " table in our data lake");

  Dataset<Row> ordersDf = datalake.get("orders");
  Dataset<Row> customerDf = datalake.get("customer");
  Dataset<Row> itemsDf = datalake.get("items");
  Dataset<Row> stockDf = datalake.get("stock");

  Seq<String> stockColumns =
      new scala.collection.immutable.Set.Set2<>(
          "stock_num", "manu_code").toSeq();

  Dataset<Row> allDf = customerDf
      .join(
          ordersDf,
          customerDf.col("customer_num").equalTo(ordersDf.col("customer_num")),
          "full_outer")
      .join(itemsDf, ordersDf.col("order_num").equalTo(itemsDf.col("order_num")),
          "full_outer")
      .join(stockDf, stockColumns, "full_outer")
      .drop(ordersDf.col("customer_num"))
      .drop(itemsDf.col("order_num"))
      .drop(stockDf.col("stock_num"))
      .drop(stockDf.col("manu_code"));

  // Sales analysis
  Dataset<Row> salesDf = allDf.filter(
      "zipcode IS NOT NULL").groupBy("zipcode").sum(
          "total_price");
  salesDf = salesDf.withColumn("revenue", salesDf.col(
      "sum(total_price)")).drop("sum(total_price)")
      .filter("revenue IS NOT NULL");
  salesDf.show(5);
  return salesDf;
}
 
Developer: jgperrin, Project: net.jgp.labs.informix2spark, Lines of code: 71, Source: SalesTargetApp.java

Example 12: main

import org.apache.spark.sql.Dataset; // import the package/class this method depends on
/**
 * Main method..
 *
 * @param args the arguments
 */
public static void main(final String[] args) {
  final String tableName = "SparkExampleDF";

  /** get the locator host/port from arguments, if specified.. **/
  final String locatorHost = args.length > 0 ? args[0] : "localhost";
  final int locatorPort = args.length > 1 ? Integer.valueOf(args[1]) : 10334;

  /** create SparkContext **/
  SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("SparkExampleDF");
  JavaSparkContext jsc = new JavaSparkContext(conf);
  SQLContext sqlContext = new SQLContext(jsc);

  /** create data-frame from existing data.. **/
  Dataset df = sqlContext.createDataFrame(jsc.parallelize(SAMPLE_DATA), Employee.class);

  /** print schema of the data-frame **/
  df.printSchema();

  df.show();

  Map<String, String> options = new HashMap<>(3);
  options.put("ampool.locator.host", locatorHost);
  options.put("ampool.locator.port", String.valueOf(locatorPort));

  /** overwrite existing table, if specified.. **/
  SaveMode saveMode = Boolean.getBoolean("overwrite") ? SaveMode.Overwrite : SaveMode.ErrorIfExists;

  /** save the dataFrame to Ampool as `tableName' **/
  df.write().format("io.ampool").options(options).mode(saveMode).save(tableName);

  System.out.println("########## DATA FROM AMPOOL ############");

  /** load the data-frame from Ampool `tableName' **/
  Dataset df1 = sqlContext.read().format("io.ampool").options(options).load(tableName);

  /** show the contents of loaded data-frame **/
  df1.show();

  /** show the total number of rows in data-frame **/
  System.out.println("# NumberOfRowsInDataFrame= " + df1.count());

  /** data-frame with filter **/
  df1.filter("id > 2").show();

  /** data-frame with selected columns **/
  df1.select("name", "id", "department").show();

  df1.registerTempTable("temp_table");

  sqlContext.sql("select * from temp_table order by id").show();
}
 
Developer: ampool, Project: monarch, Lines of code: 57, Source: SparkExampleDF.java

Example 13: main

import org.apache.spark.sql.Dataset; // import the package/class this method depends on
public static void main(String[] args) throws IOException {
  SparkConf conf = new SparkConf().setAppName("SQLQueryBAM");

  JavaSparkContext sc = new JavaSparkContext(conf);
  SQLContext sqlContext = new HiveContext(sc.sc());

  Options options = new Options();
  Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
  Option queryOpt = new Option( "query", true, "SQL query string." );
  Option baminOpt = new Option( "in", true, "" );

  options.addOption( opOpt );
  options.addOption( queryOpt );
  options.addOption( baminOpt );
  CommandLineParser parser = new BasicParser();
  CommandLine cmd = null;
  try {
    cmd = parser.parse( options, args );

  }
  catch( ParseException exp ) {
    System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
  }

  String bwaOutDir = (cmd.hasOption("out")==true)? cmd.getOptionValue("out"):null;
  String query = (cmd.hasOption("query")==true)? cmd.getOptionValue("query"):null;
  String bamin = (cmd.hasOption("in")==true)? cmd.getOptionValue("in"):null;

  sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true);

  //Read BAM/SAM from HDFS
  JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(bamin, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration());
  //Map to SAMRecord RDD
  JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get());
  JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag()));

  Dataset<Row> samDF = sqlContext.createDataFrame(rdd, MyAlignment.class);
  samDF.registerTempTable(tablename);
  if(query!=null) {

    //Save as parquet file
    Dataset df2 = sqlContext.sql(query);
    df2.show(100,false);

    if(bwaOutDir!=null)
      df2.write().parquet(bwaOutDir);

  }else{
    if(bwaOutDir!=null)
      samDF.write().parquet(bwaOutDir);
  }

  sc.stop();

}
 
Developer: NGSeq, Project: ViraPipe, Lines of code: 56, Source: SQLQueryBAM.java

Example 14: main

import org.apache.spark.sql.Dataset; // import the package/class this method depends on
public static void main(String[] args) {
	System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop");
	Logger rootLogger = LogManager.getRootLogger();
	rootLogger.setLevel(Level.WARN); 
	      SparkSession sparkSession = SparkSession
	      .builder()
	      .master("local")
		  .config("spark.sql.warehouse.dir","file:///E:/sumitK/Hadoop/warehouse")
	      .appName("JavaALSExample")
	      .getOrCreate();
	      
	   RDD<String> textFile = sparkSession.sparkContext().textFile("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_json.json",2); 
	   
	   JavaRDD<PersonDetails> mapParser = textFile.toJavaRDD().map(v1 -> new ObjectMapper().readValue(v1, PersonDetails.class));
	   
	   mapParser.foreach(t -> System.out.println(t)); 
	  
	   Dataset<Row> anotherPeople = sparkSession.read().json(textFile);
	   
	   anotherPeople.printSchema();
	   anotherPeople.show();
	      
	      
	      Dataset<Row> json_rec = sparkSession.read().json("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_json.json");
	      json_rec.printSchema();
	      
	      json_rec.show();
	      
	      StructType schema = new StructType( new StructField[] {
	    	            DataTypes.createStructField("cid", DataTypes.IntegerType, true),
	    	            DataTypes.createStructField("county", DataTypes.StringType, true),
	    	            DataTypes.createStructField("firstName", DataTypes.StringType, true),
	    	            DataTypes.createStructField("sex", DataTypes.StringType, true),
	    	            DataTypes.createStructField("year", DataTypes.StringType, true),
	    	            DataTypes.createStructField("dateOfBirth", DataTypes.TimestampType, true) });
	      
	    /*  StructType pep = new StructType(new StructField[] {
					new StructField("Count", DataTypes.StringType, true, Metadata.empty()),
					new StructField("County", DataTypes.StringType, true, Metadata.empty()),
					new StructField("First Name", DataTypes.StringType, true, Metadata.empty()),
					new StructField("Sex", DataTypes.StringType, true, Metadata.empty()),
					new StructField("Year", DataTypes.StringType, true, Metadata.empty()),
				    new StructField("timestamp", DataTypes.TimestampType, true, Metadata.empty()) });*/
	      
	     Dataset<Row> person_mod = sparkSession.read().schema(schema).json(textFile);
	     
	     person_mod.printSchema();
	     person_mod.show();
	     
	     person_mod.write().format("json").mode("overwrite").save("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_out.json");

}
 
Developer: PacktPublishing, Project: Apache-Spark-2x-for-Java-Developers, Lines of code: 53, Source: JsonFileOperations.java

Example 15: main

import org.apache.spark.sql.Dataset; // import the package/class this method depends on
public static void main(String[] args) {
	  System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop");
		
      SparkSession sparkSession = SparkSession
      .builder()
      .master("local")
	  .config("spark.sql.warehouse.dir","file:///E:/sumitK/Hadoop/warehouse")
      .appName("JavaALSExample")
      .getOrCreate();
      Logger rootLogger = LogManager.getRootLogger();
		rootLogger.setLevel(Level.WARN); 

    JavaRDD<Movie> moviesRDD = sparkSession
      .read().textFile("C:/Users/sumit.kumar/git/learning/src/main/resources/movies.csv")
      .javaRDD().filter( str-> !(null==str))
      .filter(str-> !(str.length()==0))
      .filter(str-> !str.contains("movieId"))	      
      .map(str -> Movie.parseRating(str));
    
    moviesRDD.foreach(m -> System.out.println(m));
    
       Dataset<Row> csv_read = sparkSession.read().format("com.databricks.spark.csv")
	        		      .option("header", "true")
	        		      .option("inferSchema", "true")
	        		      .load("C:/Users/sumit.kumar/git/learning/src/main/resources/movies.csv");
	       
	       csv_read.printSchema();
	       
	       csv_read.show();
	       
	       
	       StructType customSchema = new StructType(new StructField[] {
	    		    new StructField("movieId", DataTypes.LongType, true, Metadata.empty()),
	    		    new StructField("title", DataTypes.StringType, true, Metadata.empty()),
	    		    new StructField("genres", DataTypes.StringType, true, Metadata.empty())
	    		});
  
	       Dataset<Row> csv_custom_read = sparkSession.read().format("com.databricks.spark.csv")
        		      .option("header", "true")
        		      .schema(customSchema)
        		      .load("C:/Users/sumit.kumar/git/learning/src/main/resources/movies.csv");
       
	       csv_custom_read.printSchema();
       
	       csv_custom_read.show(); 
	       
	       
	       csv_custom_read.write()
	       .format("com.databricks.spark.csv")
	       .option("header", "true")
	       .option("codec", "org.apache.hadoop.io.compress.GzipCodec")
	       .save("C:/Users/sumit.kumar/git/learning/src/main/resources/newMovies.csv");
	       
}
 
Developer: PacktPublishing, Project: Apache-Spark-2x-for-Java-Developers, Lines of code: 55, Source: CSVFileOperations.java


Note: The org.apache.spark.sql.Dataset.show method examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by many developers; the source code copyright belongs to the original authors. Please consult the corresponding project's License before distributing or using the code. Do not reproduce without permission.