This page collects typical usage examples of the Java method org.apache.spark.sql.Dataset.show, gathered from open-source projects. If you are wondering what Dataset.show does, how to call it, or what real-world usage looks like, the curated code samples below should help. You can also explore further usage examples of the containing class, org.apache.spark.sql.Dataset.
Below are 15 code examples of the Dataset.show method, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Java code examples.
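For quick orientation before the full examples, here is a minimal sketch of the common show overloads. It assumes a local SparkSession and a hypothetical CSV file path (data/sample.csv); neither is taken from the examples below.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ShowSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").appName("ShowSketch").getOrCreate();
        // Hypothetical input file; replace with a real path
        Dataset<Row> df = spark.read().option("header", "true").csv("data/sample.csv");
        df.show();          // first 20 rows, long cells truncated
        df.show(5);         // only the first 5 rows
        df.show(5, false);  // first 5 rows, without truncating cell contents
        spark.stop();
    }
}

Note that show prints to standard output and returns nothing; to obtain rows programmatically, some of the examples below use toJSON or collectAsList instead.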
Example 1: main
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
            .master("local[8]")
            .appName("PCAExpt")
            .getOrCreate();
    // Load and parse data
    String filePath = "/home/kchoppella/book/Chapter09/data/covtypeNorm.csv";
    // Loads data.
    Dataset<Row> inDataset = spark.read()
            .format("com.databricks.spark.csv")
            .option("header", "true")
            .option("inferSchema", true)
            .load(filePath);
    ArrayList<String> inputColsList = new ArrayList<String>(Arrays.asList(inDataset.columns()));
    // Make single features column for feature vectors
    inputColsList.remove("class");
    String[] inputCols = inputColsList.parallelStream().toArray(String[]::new);
    // Prepare dataset for training with all features in "features" column
    VectorAssembler assembler = new VectorAssembler().setInputCols(inputCols).setOutputCol("features");
    Dataset<Row> dataset = assembler.transform(inDataset);
    PCAModel pca = new PCA()
            .setK(16)
            .setInputCol("features")
            .setOutputCol("pcaFeatures")
            .fit(dataset);
    Dataset<Row> result = pca.transform(dataset).select("pcaFeatures");
    System.out.println("Explained variance:");
    System.out.println(pca.explainedVariance());
    result.show(false);
    // $example off$
    spark.stop();
}
Developer: PacktPublishing, Project: Machine-Learning-End-to-Endguide-for-Java-developers, Lines: 39, Source: PCAExpt.java
Example 2: start
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
private void start(int zip) {
    SparkSession spark = SparkSession
            .builder()
            .appName("Number of households in your ZIP Code™")
            .master("local")
            .getOrCreate();
    String filename = "data/14zpallagi-part*.csv";
    Dataset<Row> df = spark
            .read()
            .format("csv")
            .option("inferSchema", "true")
            .option("header", "true")
            .load(filename);
    df.printSchema();
    df.sample(true, 0.01, 4589).show(2);
    System.out.println("Dataframe has " + df.count() + " rows and " + df.columns().length
            + " columns.");
    Dataset<Row> df2 = df.filter(df.col("zipcode").equalTo(zip));
    String[] colsToDrop = { "STATEFIPS", "mars1", "MARS2", "MARS4", "PREP", "N2",
            "NUMDEP", "TOTAL_VITA", "VITA", "TCE", "A00100", "N02650", "N00200", "A00200",
            "N00300", "A00300", "N00600", "A00600", "N00650", "A00650", "N00700", "A00700",
            "N00900", "A00900", "N01000", "A01000", "N01400", "A01400", "N01700", "A01700",
            "SCHF", "N02300", "A02300", "N02500", "A02500", "N26270", "A26270", "N02900",
            "A02900", "N03220", "A03220", "N03300", "A03300", "N03270", "A03270", "N03150",
            "A03150", "N03210", "A03210", "N03230", "A03230", "N03240", "A03240", "N04470",
            "A04470", "A00101", "N18425", "A18425", "N18450", "A18450", "N18500", "A18500",
            "N18300", "A18300", "N19300", "A19300", "N19700", "A19700", "N04800", "A04800",
            "N05800", "A05800", "N09600", "A09600", "N05780", "A05780", "N07100", "A07100",
            "N07300", "A07300", "N07180", "A07180", "N07230", "A07230", "N07240", "A07240",
            "N07220", "A07220", "N07260", "A07260", "N09400", "A09400", "N85770", "A85770",
            "N85775", "A85775", "N09750", "A09750", "N10600", "A10600", "N59660", "A59660",
            "N59720", "A59720", "N11070", "A11070", "N10960", "A10960", "N11560", "A11560",
            "N06500", "A06500", "N10300", "A10300", "N85530", "A85530", "N85300", "A85300",
            "N11901", "A11901", "N11902", "A11902" };
    for (String colName : colsToDrop) {
        df2 = df2.drop(colName);
    }
    df2.printSchema();
    df2.show();
    System.out.println("Dataframe has " + df2.count() + " rows and " + df2
            .columns().length + " columns.");
    Dataset<Row> df3 = df2.filter(df2.col("agi_stub").$greater(3));
    df3 = df3.groupBy("zipcode").sum("N1").withColumnRenamed("sum(N1)", "households");
    df3.show();
}
Developer: jgperrin, Project: net.jgp.labs.informix2spark, Lines: 49, Source: HouseholdsAboveMedianRevenuePerZipApp.java
Example 3: main
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
public static void main(String[] args) throws IOException {
    SparkSession spark = SparkSession.builder().master("local").appName("DataProcess").getOrCreate();
    String filename = "prices-split-adjusted.csv";
    String symbol = "GOOG";
    // load data from csv file
    Dataset<Row> data = spark.read().format("csv").option("header", true)
            .load(new ClassPathResource(filename).getFile().getAbsolutePath())
            //.filter(functions.col("symbol").equalTo(symbol))
            //.drop("date").drop("symbol")
            .withColumn("openPrice", functions.col("open").cast("double")).drop("open")
            .withColumn("closePrice", functions.col("close").cast("double")).drop("close")
            .withColumn("lowPrice", functions.col("low").cast("double")).drop("low")
            .withColumn("highPrice", functions.col("high").cast("double")).drop("high")
            .withColumn("volumeTmp", functions.col("volume").cast("double")).drop("volume")
            .toDF("date", "symbol", "open", "close", "low", "high", "volume");
    data.show();
    Dataset<Row> symbols = data.select("date", "symbol").groupBy("symbol").agg(functions.count("date").as("count"));
    System.out.println("Number of Symbols: " + symbols.count());
    symbols.show();
    VectorAssembler assembler = new VectorAssembler()
            .setInputCols(new String[] {"open", "low", "high", "volume", "close"})
            .setOutputCol("features");
    data = assembler.transform(data).drop("open", "low", "high", "volume", "close");
    data = new MinMaxScaler().setMin(0).setMax(1)
            .setInputCol("features").setOutputCol("normalizedFeatures")
            .fit(data).transform(data)
            .drop("features")
            // the remaining columns are date, symbol and the scaled vector; rename the vector column to "features"
            .toDF("date", "symbol", "features");
}
Example 4: start
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
private void start() {
    // @formatter:off
    SparkSession spark = SparkSession
            .builder()
            .appName("Stores Customer")
            .master("local")
            .getOrCreate();
    // @formatter:on
    // @formatter:off
    Dataset<Row> df = spark
            .read()
            .format("jdbc")
            .option(
                    "url",
                    "jdbc:informix-sqli://[::1]:33378/stores_demo:IFXHOST=lo_informix1210;DELIMIDENT=Y")
            .option("dbtable", "customer")
            .option("user", "informix")
            .option("password", "in4mix")
            .load();
    // @formatter:on
    df.printSchema();
    System.out.println("Number of rows in customer: " + df
            .count());
    df.show(5);
}
Example 5: main
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
public static void main(String[] args) {
    System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop");
    SparkSession sparkSession = SparkSession
            .builder()
            .master("local")
            .config("spark.sql.warehouse.dir", "file:///E:/sumitK/Hadoop/warehouse")
            .appName("JavaALSExample")
            .getOrCreate();
    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);
    HashMap<String, String> params = new HashMap<String, String>();
    params.put("rowTag", "food");
    params.put("failFast", "true");
    Dataset<Row> docDF = sparkSession.read()
            .format("com.databricks.spark.xml")
            .options(params)
            .load("C:/Users/sumit.kumar/git/learning/src/main/resources/breakfast_menu.xml");
    docDF.printSchema();
    docDF.show();
    docDF.write().format("com.databricks.spark.xml")
            .option("rootTag", "food")
            .option("rowTag", "food")
            .save("C:/Users/sumit.kumar/git/learning/src/main/resources/newMenu.xml");
}
Example 6: start
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
private boolean start() {
    SparkSession spark = SparkSession.builder()
            .appName("EXIF to Dataset")
            .master("local[*]").getOrCreate();
    String importDirectory = "/Users/jgp/Pictures";
    Dataset<Row> df = spark.read()
            .format("exif")
            .option("recursive", "true")
            .option("limit", "100000")
            .option("extensions", "jpg,jpeg")
            .load(importDirectory);
    // We can start analytics
    df = df
            .filter(df.col("GeoX").isNotNull())
            .filter(df.col("GeoZ").notEqual("NaN"))
            .orderBy(df.col("GeoZ").desc());
    df.collect();
    df.cache();
    System.out.println("I have imported " + df.count() + " photos.");
    df.printSchema();
    df.show(5);
    return true;
}
Example 7: liststatnumber
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
public List<String> liststatnumber() {
    Dataset<Row> result = InitSpark.spark.sql("SELECT * " +
            "FROM tagjson tj " +
            " limit 1");
    result.show();
    return result.toJSON().collectAsList();
}
Example 8: main
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
            .master("local[8]")
            .appName("KMeansWithPCAExpt")
            .getOrCreate();
    // Load and parse data
    String filePath = "/home/kchoppella/book/Chapter09/data/covtypeNorm.csv";
    // Loads data.
    Dataset<Row> inDataset = spark.read()
            .format("com.databricks.spark.csv")
            .option("header", "true")
            .option("inferSchema", true)
            .load(filePath);
    ArrayList<String> inputColsList = new ArrayList<String>(Arrays.asList(inDataset.columns()));
    // Make single features column for feature vectors
    inputColsList.remove("class");
    String[] inputCols = inputColsList.parallelStream().toArray(String[]::new);
    // Prepare dataset for training with all features in "features" column
    VectorAssembler assembler = new VectorAssembler().setInputCols(inputCols).setOutputCol("features");
    Dataset<Row> dataset = assembler.transform(inDataset);
    PCAModel pca = new PCA()
            .setK(16)
            .setInputCol("features")
            .setOutputCol("pcaFeatures")
            .fit(dataset);
    Dataset<Row> result = pca.transform(dataset).select("pcaFeatures");
    System.out.println("Explained variance:");
    System.out.println(pca.explainedVariance());
    result.show(false);
    KMeans kmeans = new KMeans().setK(27).setSeed(1L);
    KMeansModel model = kmeans.fit(dataset);
    // Evaluate clustering by computing Within Set Sum of Squared Errors.
    double WSSSE = model.computeCost(dataset);
    System.out.println("Within Set Sum of Squared Errors = " + WSSSE);
    // $example off$
    spark.stop();
}
Developer: PacktPublishing, Project: Machine-Learning-End-to-Endguide-for-Java-developers, Lines: 47, Source: KMeansWithPCAExpt.java
Example 9: start
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
private void start() {
    // @formatter:off
    SparkSession spark = SparkSession
            .builder()
            .appName("Time to Ship")
            .master("local")
            .getOrCreate();
    // @formatter:on
    // Specific Informix dialect
    JdbcDialect dialect = new InformixJdbcDialect();
    JdbcDialects.registerDialect(dialect);
    // Configuration info for the database
    Config config = ConfigManager.getConfig(K.INFORMIX);
    // List of all tables we want to work with
    List<String> tables = new ArrayList<>();
    tables.add("orders");
    Map<String, Dataset<Row>> datalake = new HashMap<>();
    for (String table : tables) {
        System.out.print("Loading table [" + table + "] ... ");
        // @formatter:off
        Dataset<Row> df = spark.read()
                .format("jdbc")
                .option("url", config.getJdbcUrl())
                .option("dbtable", table)
                .option("user", config.getUser())
                .option("password", config.getPassword())
                .option("driver", config.getDriver())
                .load();
        // @formatter:on
        datalake.put(table, df);
        System.out.println("done.");
    }
    System.out.println("We have loaded " + datalake.size()
            + " table(s) in our data lake.");
    Dataset<Row> ordersDf = datalake.get("orders");
    // @formatter:off
    ordersDf = ordersDf.withColumn(
            "time_to_ship",
            datediff(ordersDf.col("ship_date"), ordersDf.col("order_date")));
    // @formatter:on
    ordersDf.printSchema();
    ordersDf.show(10);
    System.out.println("We have " + ordersDf.count() + " orders");
    Dataset<Row> ordersDf2 = ordersDf.filter("time_to_ship IS NOT NULL");
    ordersDf2.printSchema();
    ordersDf2.show(5);
    System.out.println("We have " + ordersDf2.count() + " delivered orders");
}
Example 10: start
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
private void start() {
    Dataset<Row> householdDf = getHouseholdDataframe();
    Dataset<Row> populationDf = getPopulationDataframe();
    Dataset<Row> indexDf = joinHouseholdPopulation(householdDf, populationDf);
    Dataset<Row> salesDf = getSalesData();
    Dataset<Row> salesIndexDf = salesDf
            .join(indexDf, salesDf.col("zipcode").equalTo(indexDf.col("zipcode")), "left")
            .drop(indexDf.col("zipcode"));
    salesIndexDf = salesIndexDf.withColumn("revenue_by_inh", salesIndexDf.col("revenue")
            .divide(salesIndexDf.col("pop")));
    salesIndexDf = salesIndexDf.orderBy(col("revenue_by_inh").desc());
    Row bestRow = salesIndexDf.first();
    double bestRevenuePerInhabitant = ((BigDecimal) bestRow.getAs("revenue_by_inh"))
            .doubleValue();
    int populationOfBestRevenuePerInhabitant = bestRow.getAs("pop");
    double incomeOfBestRevenuePerInhabitant = bestRow.getAs("income_per_inh");
    salesIndexDf = salesIndexDf.withColumn(
            "best_revenue_per_inh",
            salesIndexDf.col("pop").divide(salesIndexDf.col("pop"))
                    .multiply(bestRevenuePerInhabitant));
    salesIndexDf = salesIndexDf.withColumn(
            "pop_of_best",
            lit(populationOfBestRevenuePerInhabitant));
    salesIndexDf = salesIndexDf.withColumn(
            "income_of_best",
            lit(incomeOfBestRevenuePerInhabitant));
    salesIndexDf = salesIndexDf.withColumn(
            "idx_revenue",
            salesIndexDf.col("best_revenue_per_inh")
                    .divide(salesIndexDf.col("revenue_by_inh")));
    salesIndexDf = salesIndexDf.withColumn(
            "idx_pop",
            salesIndexDf.col("pop").divide(salesIndexDf.col("pop_of_best")));
    salesIndexDf = salesIndexDf.withColumn(
            "idx_income",
            salesIndexDf.col("income_per_inh").divide(salesIndexDf.col("income_of_best")));
    salesIndexDf = salesIndexDf.withColumn(
            "index",
            salesIndexDf.col("idx_revenue").multiply(salesIndexDf.col("idx_pop")
                    .multiply(salesIndexDf.col("idx_income"))));
    salesIndexDf = salesIndexDf.withColumn(
            "potential_revenue",
            salesIndexDf.col("revenue").multiply(salesIndexDf.col("index")));
    salesIndexDf = salesIndexDf
            .drop("idx_income")
            .drop("idx_pop")
            .drop("idx_revenue")
            .drop("income_of_best")
            .drop("total_income")
            .drop("revenue_by_inh")
            .drop("pop_of_best")
            .drop("best_revenue_per_inh")
            .orderBy(salesIndexDf.col("potential_revenue").desc());
    salesIndexDf.show();
}
Example 11: getSalesData
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
private Dataset<Row> getSalesData() {
    // Let's connect to the database
    Config config = ConfigManager.getConfig(K.INFORMIX);
    Connection connection = config.getConnection();
    if (connection == null) {
        return null;
    }
    // List of all tables we want to work with
    List<String> tables = new ArrayList<>();
    tables.add("customer");
    tables.add("orders");
    tables.add("items");
    tables.add("stock");
    // Specific Informix dialect
    JdbcDialect dialect = new InformixJdbcDialect();
    JdbcDialects.registerDialect(dialect);
    Map<String, Dataset<Row>> datalake = new HashMap<>();
    for (String table : tables) {
        System.out.print("Loading table [" + table + "] ... ");
        Dataset<Row> df = spark.read()
                .format("jdbc")
                .option("url", config.getJdbcUrl())
                .option("dbtable", table)
                .option("user", config.getUser())
                .option("password", config.getPassword())
                .option("driver", config.getDriver())
                .load();
        datalake.put(table, df);
        System.out.println("done");
    }
    System.out.println("We have loaded " + datalake.size()
            + " table in our data lake");
    Dataset<Row> ordersDf = datalake.get("orders");
    Dataset<Row> customerDf = datalake.get("customer");
    Dataset<Row> itemsDf = datalake.get("items");
    Dataset<Row> stockDf = datalake.get("stock");
    Seq<String> stockColumns =
            new scala.collection.immutable.Set.Set2<>(
                    "stock_num", "manu_code").toSeq();
    Dataset<Row> allDf = customerDf
            .join(
                    ordersDf,
                    customerDf.col("customer_num").equalTo(ordersDf.col("customer_num")),
                    "full_outer")
            .join(itemsDf, ordersDf.col("order_num").equalTo(itemsDf.col("order_num")),
                    "full_outer")
            .join(stockDf, stockColumns, "full_outer")
            .drop(ordersDf.col("customer_num"))
            .drop(itemsDf.col("order_num"))
            .drop(stockDf.col("stock_num"))
            .drop(stockDf.col("manu_code"));
    // Sales analysis
    Dataset<Row> salesDf = allDf.filter("zipcode IS NOT NULL")
            .groupBy("zipcode")
            .sum("total_price");
    salesDf = salesDf.withColumn("revenue", salesDf.col("sum(total_price)"))
            .drop("sum(total_price)")
            .filter("revenue IS NOT NULL");
    salesDf.show(5);
    return salesDf;
}
Example 12: main
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
/**
 * Main method.
 *
 * @param args the arguments
 */
public static void main(final String[] args) {
    final String tableName = "SparkExampleDF";
    /** get the locator host/port from arguments, if specified.. **/
    final String locatorHost = args.length > 0 ? args[0] : "localhost";
    final int locatorPort = args.length > 1 ? Integer.valueOf(args[1]) : 10334;
    /** create SparkContext **/
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("SparkExampleDF");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(jsc);
    /** create data-frame from existing data.. **/
    Dataset df = sqlContext.createDataFrame(jsc.parallelize(SAMPLE_DATA), Employee.class);
    /** print schema of the data-frame **/
    df.printSchema();
    df.show();
    Map<String, String> options = new HashMap<>(3);
    options.put("ampool.locator.host", locatorHost);
    options.put("ampool.locator.port", String.valueOf(locatorPort));
    /** overwrite existing table, if specified.. **/
    SaveMode saveMode = Boolean.getBoolean("overwrite") ? SaveMode.Overwrite : SaveMode.ErrorIfExists;
    /** save the dataFrame to Ampool as `tableName' **/
    df.write().format("io.ampool").options(options).mode(saveMode).save(tableName);
    System.out.println("########## DATA FROM AMPOOL ############");
    /** load the data-frame from Ampool `tableName' **/
    Dataset df1 = sqlContext.read().format("io.ampool").options(options).load(tableName);
    /** show the contents of loaded data-frame **/
    df1.show();
    /** show the total number of rows in data-frame **/
    System.out.println("# NumberOfRowsInDataFrame= " + df1.count());
    /** data-frame with filter **/
    df1.filter("id > 2").show();
    /** data-frame with selected columns **/
    df1.select("name", "id", "department").show();
    df1.registerTempTable("temp_table");
    sqlContext.sql("select * from temp_table order by id").show();
}
Example 13: main
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
public static void main(String[] args) throws IOException {
    SparkConf conf = new SparkConf().setAppName("SQLQueryBAM");
    JavaSparkContext sc = new JavaSparkContext(conf);
    SQLContext sqlContext = new HiveContext(sc.sc());
    Options options = new Options();
    Option opOpt = new Option("out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS.");
    Option queryOpt = new Option("query", true, "SQL query string.");
    Option baminOpt = new Option("in", true, "");
    options.addOption(opOpt);
    options.addOption(queryOpt);
    options.addOption(baminOpt);
    CommandLineParser parser = new BasicParser();
    CommandLine cmd = null;
    try {
        cmd = parser.parse(options, args);
    }
    catch (ParseException exp) {
        System.err.println("Parsing failed. Reason: " + exp.getMessage());
    }
    String bwaOutDir = (cmd.hasOption("out") == true) ? cmd.getOptionValue("out") : null;
    String query = (cmd.hasOption("query") == true) ? cmd.getOptionValue("query") : null;
    String bamin = (cmd.hasOption("in") == true) ? cmd.getOptionValue("in") : null;
    sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true);
    // Read BAM/SAM from HDFS
    JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(bamin, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration());
    // Map to SAMRecord RDD
    JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get());
    JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag()));
    Dataset<Row> samDF = sqlContext.createDataFrame(rdd, MyAlignment.class);
    samDF.registerTempTable(tablename);
    if (query != null) {
        // Save as parquet file
        Dataset df2 = sqlContext.sql(query);
        df2.show(100, false);
        if (bwaOutDir != null)
            df2.write().parquet(bwaOutDir);
    } else {
        if (bwaOutDir != null)
            samDF.write().parquet(bwaOutDir);
    }
    sc.stop();
}
Example 14: main
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
public static void main(String[] args) {
    System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop");
    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);
    SparkSession sparkSession = SparkSession
            .builder()
            .master("local")
            .config("spark.sql.warehouse.dir", "file:///E:/sumitK/Hadoop/warehouse")
            .appName("JavaALSExample")
            .getOrCreate();
    RDD<String> textFile = sparkSession.sparkContext().textFile("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_json.json", 2);
    JavaRDD<PersonDetails> mapParser = textFile.toJavaRDD().map(v1 -> new ObjectMapper().readValue(v1, PersonDetails.class));
    mapParser.foreach(t -> System.out.println(t));
    Dataset<Row> anotherPeople = sparkSession.read().json(textFile);
    anotherPeople.printSchema();
    anotherPeople.show();
    Dataset<Row> json_rec = sparkSession.read().json("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_json.json");
    json_rec.printSchema();
    json_rec.show();
    StructType schema = new StructType(new StructField[] {
            DataTypes.createStructField("cid", DataTypes.IntegerType, true),
            DataTypes.createStructField("county", DataTypes.StringType, true),
            DataTypes.createStructField("firstName", DataTypes.StringType, true),
            DataTypes.createStructField("sex", DataTypes.StringType, true),
            DataTypes.createStructField("year", DataTypes.StringType, true),
            DataTypes.createStructField("dateOfBirth", DataTypes.TimestampType, true) });
    /* StructType pep = new StructType(new StructField[] {
            new StructField("Count", DataTypes.StringType, true, Metadata.empty()),
            new StructField("County", DataTypes.StringType, true, Metadata.empty()),
            new StructField("First Name", DataTypes.StringType, true, Metadata.empty()),
            new StructField("Sex", DataTypes.StringType, true, Metadata.empty()),
            new StructField("Year", DataTypes.StringType, true, Metadata.empty()),
            new StructField("timestamp", DataTypes.TimestampType, true, Metadata.empty()) }); */
    Dataset<Row> person_mod = sparkSession.read().schema(schema).json(textFile);
    person_mod.printSchema();
    person_mod.show();
    person_mod.write().format("json").mode("overwrite").save("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_out.json");
}
Example 15: main
import org.apache.spark.sql.Dataset; // import the package/class this method depends on
public static void main(String[] args) {
    System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop");
    SparkSession sparkSession = SparkSession
            .builder()
            .master("local")
            .config("spark.sql.warehouse.dir", "file:///E:/sumitK/Hadoop/warehouse")
            .appName("JavaALSExample")
            .getOrCreate();
    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);
    JavaRDD<Movie> moviesRDD = sparkSession
            .read().textFile("C:/Users/sumit.kumar/git/learning/src/main/resources/movies.csv")
            .javaRDD().filter(str -> !(null == str))
            .filter(str -> !(str.length() == 0))
            .filter(str -> !str.contains("movieId"))
            .map(str -> Movie.parseRating(str));
    moviesRDD.foreach(m -> System.out.println(m));
    Dataset<Row> csv_read = sparkSession.read().format("com.databricks.spark.csv")
            .option("header", "true")
            .option("inferSchema", "true")
            .load("C:/Users/sumit.kumar/git/learning/src/main/resources/movies.csv");
    csv_read.printSchema();
    csv_read.show();
    StructType customSchema = new StructType(new StructField[] {
            new StructField("movieId", DataTypes.LongType, true, Metadata.empty()),
            new StructField("title", DataTypes.StringType, true, Metadata.empty()),
            new StructField("genres", DataTypes.StringType, true, Metadata.empty())
    });
    Dataset<Row> csv_custom_read = sparkSession.read().format("com.databricks.spark.csv")
            .option("header", "true")
            .schema(customSchema)
            .load("C:/Users/sumit.kumar/git/learning/src/main/resources/movies.csv");
    csv_custom_read.printSchema();
    csv_custom_read.show();
    csv_custom_read.write()
            .format("com.databricks.spark.csv")
            .option("header", "true")
            .option("codec", "org.apache.hadoop.io.compress.GzipCodec")
            .save("C:/Users/sumit.kumar/git/learning/src/main/resources/newMovies.csv");
}