This page collects typical usage examples of the Java method org.apache.spark.sql.Dataset.printSchema. If you are wondering how to call Dataset.printSchema in Java, or are looking for concrete examples of it in use, the curated samples below should help. You can also explore further usage examples of the containing class, org.apache.spark.sql.Dataset.
The following 13 code examples all demonstrate Dataset.printSchema; by default they are ordered by popularity.
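Before the collected examples, here is a minimal, self-contained sketch of how Dataset.printSchema is typically called. This is a sketch only: it assumes a local SparkSession, and the class name PrintSchemaSketch and the id/name columns are illustrative, not taken from any example below.
import java.util.Arrays;
import java.util.List;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
public class PrintSchemaSketch {
  public static void main(String[] args) {
    // Local SparkSession, as in most of the examples below
    SparkSession spark = SparkSession.builder()
        .appName("printSchema sketch")
        .master("local")
        .getOrCreate();
    // A tiny in-memory dataset with an explicit schema (columns are illustrative)
    StructType schema = new StructType(new StructField[] {
        new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
        new StructField("name", DataTypes.StringType, true, Metadata.empty()) });
    List<Row> rows = Arrays.asList(
        RowFactory.create(1, "alpha"),
        RowFactory.create(2, "beta"));
    Dataset<Row> df = spark.createDataFrame(rows, schema);
    // printSchema() writes the schema as a tree to standard output, roughly:
    // root
    //  |-- id: integer (nullable = false)
    //  |-- name: string (nullable = true)
    df.printSchema();
    spark.stop();
  }
}
printSchema is most useful right after a read with an inferred schema or after a schema-changing transformation, which is exactly how the examples below use it.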
Example 1: start
This example loads the IRS ZIP-code CSV files, drops the columns it does not need, and sums households (column N1) per ZIP code for the upper AGI brackets (agi_stub > 3).
import org.apache.spark.sql.Dataset; //import the package/class this method depends on
private void start(int zip) {
SparkSession spark = SparkSession
.builder()
.appName("Number of households in your ZIP Code™")
.master("local")
.getOrCreate();
String filename = "data/14zpallagi-part*.csv";
Dataset<Row> df = spark
.read()
.format("csv")
.option("inferSchema", "true")
.option("header", "true")
.load(filename);
df.printSchema();
df.sample(true, 0.01, 4589).show(2);
System.out.println("Dataframe has " + df.count() + " rows and " + df.columns().length
+ " columns.");
Dataset<Row> df2 = df.filter(df.col("zipcode").equalTo(zip));
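// Columns from the IRS file that are not needed for the household count; they are dropped in the loop below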
String[] colsToDrop = { "STATEFIPS", "mars1", "MARS2", "MARS4", "PREP", "N2",
"NUMDEP", "TOTAL_VITA", "VITA", "TCE", "A00100", "N02650", "N00200", "A00200",
"N00300", "A00300", "N00600", "A00600", "N00650", "A00650", "N00700", "A00700",
"N00900", "A00900", "N01000", "A01000", "N01400", "A01400", "N01700", "A01700",
"SCHF", "N02300", "A02300", "N02500", "A02500", "N26270", "A26270", "N02900",
"A02900", "N03220", "A03220", "N03300", "A03300", "N03270", "A03270", "N03150",
"A03150", "N03210", "A03210", "N03230", "A03230", "N03240", "A03240", "N04470",
"A04470", "A00101", "N18425", "A18425", "N18450", "A18450", "N18500", "A18500",
"N18300", "A18300", "N19300", "A19300", "N19700", "A19700", "N04800", "A04800",
"N05800", "A05800", "N09600", "A09600", "N05780", "A05780", "N07100", "A07100",
"N07300", "A07300", "N07180", "A07180", "N07230", "A07230", "N07240", "A07240",
"N07220", "A07220", "N07260", "A07260", "N09400", "A09400", "N85770", "A85770",
"N85775", "A85775", "N09750", "A09750", "N10600", "A10600", "N59660", "A59660",
"N59720", "A59720", "N11070", "A11070", "N10960", "A10960", "N11560", "A11560",
"N06500", "A06500", "N10300", "A10300", "N85530", "A85530", "N85300", "A85300",
"N11901", "A11901", "N11902", "A11902" };
for (String colName : colsToDrop) {
df2 = df2.drop(colName);
}
df2.printSchema();
df2.show();
System.out.println("Dataframe has " + df2.count() + " rows and " + df2
.columns().length + " columns.");
Dataset<Row> df3 = df2.filter(df2.col("agi_stub").$greater(3));
df3 = df3.groupBy("zipcode").sum("N1").withColumnRenamed("sum(N1)", "households");
df3.show();
}
Author ID: jgperrin, Project: net.jgp.labs.informix2spark, Lines of code: 49, Source file: HouseholdsAboveMedianRevenuePerZipApp.java
Example 2: start
This example reads an Informix table over JDBC, taking the URL, table name, and credentials from an external ConfigManager, then caches the DataFrame and prints its schema and row count.
import org.apache.spark.sql.Dataset; //import the package/class this method depends on
private void start() {
// @formatter:off
SparkSession spark = SparkSession
.builder()
.appName("Stores Customer")
.master("local")
.getOrCreate();
// @formatter:on
Config config = ConfigManager.getConfig(K.INFORMIX);
// @formatter:off
Dataset<Row> df = spark
.read()
.format("jdbc")
.option("url", config.getJdbcUrl())
.option("dbtable", config.getTable())
.option("user", config.getUser())
.option("password", config.getPassword())
.option("driver", config.getDriver())
.load();
// @formatter:on
df.cache();
df.printSchema();
System.out.println("Number of rows in " + config
.getTable() + ": " + df.count());
df.show();
}
Example 3: start
The same JDBC read as Example 2, but with the Informix connection settings hard-coded rather than loaded from a configuration.
import org.apache.spark.sql.Dataset; //import the package/class this method depends on
private void start() {
// @formatter:off
SparkSession spark = SparkSession
.builder()
.appName("Stores Customer")
.master("local")
.getOrCreate();
// @formatter:on
// @formatter:off
Dataset<Row> df = spark
.read()
.format("jdbc")
.option(
"url",
"jdbc:informix-sqli://[::1]:33378/stores_demo:IFXHOST=lo_informix1210;DELIMIDENT=Y")
.option("dbtable", "customer")
.option("user", "informix")
.option("password", "in4mix")
.load();
// @formatter:on
df.printSchema();
System.out.println("Number of rows in customer: " + df
.count());
df.show(5);
}
Example 4: main
This example loads an XML menu file with the spark-xml data source (one row per <food> element), prints the schema, and writes the result back out as XML.
import org.apache.spark.sql.Dataset; //import the package/class this method depends on
public static void main(String[] args) {
System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop");
SparkSession sparkSession = SparkSession
.builder()
.master("local")
.config("spark.sql.warehouse.dir","file:///E:/sumitK/Hadoop/warehouse")
.appName("JavaALSExample")
.getOrCreate();
Logger rootLogger = LogManager.getRootLogger();
rootLogger.setLevel(Level.WARN);
HashMap<String, String> params = new HashMap<String, String>();
params.put("rowTag", "food");
params.put("failFast", "true");
Dataset<Row> docDF = sparkSession.read()
.format("com.databricks.spark.xml")
.options(params)
.load("C:/Users/sumit.kumar/git/learning/src/main/resources/breakfast_menu.xml");
docDF.printSchema();
docDF.show();
docDF.write().format("com.databricks.spark.xml")
.option("rootTag", "food")
.option("rowTag", "food")
.save("C:/Users/sumit.kumar/git/learning/src/main/resources/newMenu.xml");
}
Example 5: start
This example imports EXIF metadata from a directory of photos through a custom "exif" data source, filters and sorts on the GPS columns, and then prints the schema.
import org.apache.spark.sql.Dataset; //import the package/class this method depends on
private boolean start() {
SparkSession spark = SparkSession.builder()
.appName("EXIF to Dataset")
.master("local[*]").getOrCreate();
String importDirectory = "/Users/jgp/Pictures";
Dataset<Row> df = spark.read()
.format("exif")
.option("recursive", "true")
.option("limit", "100000")
.option("extensions", "jpg,jpeg")
.load(importDirectory);
// Now we can run the analytics
df = df
.filter(df.col("GeoX").isNotNull())
.filter(df.col("GeoZ").notEqual("NaN"))
.orderBy(df.col("GeoZ").desc());
df.collect();
df.cache();
System.out.println("I have imported " + df.count() + " photos.");
df.printSchema();
df.show(5);
return true;
}
Example 6: start
This example loads the Informix orders table through a registered JDBC dialect, derives a time_to_ship column with datediff, and prints the schema before and after filtering out rows where time_to_ship is null.
import org.apache.spark.sql.Dataset; //import the package/class this method depends on
private void start() {
// @formatter:off
SparkSession spark = SparkSession
.builder()
.appName("Time to Ship")
.master("local")
.getOrCreate();
// @formatter:on
// Specific Informix dialect
JdbcDialect dialect = new InformixJdbcDialect();
JdbcDialects.registerDialect(dialect);
// Configuration info for the database
Config config = ConfigManager.getConfig(K.INFORMIX);
// List of all tables we want to work with
List<String> tables = new ArrayList<>();
tables.add("orders");
Map<String, Dataset<Row>> datalake = new HashMap<>();
for (String table : tables) {
System.out.print("Loading table [" + table + "] ... ");
// @formatter:off
Dataset<Row> df = spark.read()
.format("jdbc")
.option("url", config.getJdbcUrl())
.option("dbtable", table)
.option("user", config.getUser())
.option("password", config.getPassword())
.option("driver", config.getDriver())
.load();
// @formatter:on
datalake.put(table, df);
System.out.println("done.");
}
System.out.println("We have loaded " + datalake.size()
+ " table(s) in our data lake.");
Dataset<Row> ordersDf = datalake.get("orders");
// @formatter:off
ordersDf = ordersDf.withColumn(
"time_to_ship",
datediff(ordersDf.col("ship_date"), ordersDf.col("order_date")));
// @formatter:on
ordersDf.printSchema();
ordersDf.show(10);
System.out.println("We have " + ordersDf.count() + " orders");
Dataset<Row> ordersDf2 = ordersDf.filter("time_to_ship IS NOT NULL");
ordersDf2.printSchema();
ordersDf2.show(5);
System.out.println("We have " + ordersDf2.count() + " delivered orders");
}
Example 7: start
This example joins the Informix orders and items tables, aggregates total_price per order week, and prints the schema of the aggregated DataFrame.
import org.apache.spark.sql.Dataset; //import the package/class this method depends on
private void start() {
SparkSession spark;
// @formatter:off
spark = SparkSession
.builder()
.appName("Sales per week")
.master("local")
.getOrCreate();
// @formatter:on
// List of all tables we want to work with
List<String> tables = new ArrayList<>();
tables.add("orders");
tables.add("items");
// Specific Informix dialect
JdbcDialect dialect = new InformixJdbcDialect();
JdbcDialects.registerDialect(dialect);
// Let's connect to the database
Config config = ConfigManager.getConfig(K.INFORMIX);
// Let's build our datalake
Map<String, Dataset<Row>> datalake = new HashMap<>();
for (String table : tables) {
System.out.print("Loading table [" + table + "] ... ");
// @formatter:off
Dataset<Row> df = spark.read()
.format("jdbc")
.option("url", config.getJdbcUrl())
.option("dbtable", table)
.option("user", config.getUser())
.option("password", config.getPassword())
.option("driver", config.getDriver())
.load();
// @formatter:on
datalake.put(table, df);
System.out.println("done");
}
System.out.println("We have loaded " + datalake.size()
+ " table(s) in our data lake");
// Let's look at the content
Dataset<Row> ordersDf = datalake.get("orders");
Dataset<Row> itemsDf = datalake.get("items");
Dataset<Row> allDf = ordersDf
.join(
itemsDf,
ordersDf.col("order_num").equalTo(itemsDf.col("order_num")),
"full_outer")
.drop(ordersDf.col("customer_num"))
.drop(itemsDf.col("order_num"))
.withColumn("order_week", lit(weekofyear(ordersDf.col("order_date"))));
allDf = allDf
.groupBy(allDf.col("order_week"))
.sum("total_price")
.orderBy(allDf.col("order_week"));
allDf.cache();
allDf.printSchema();
allDf.show(50);
}
Example 8: main
This example reads the same JSON file three ways (from an RDD of strings, directly from the path, and with an explicit schema), printing each schema, and writes the last result back out as JSON.
import org.apache.spark.sql.Dataset; //import the package/class this method depends on
public static void main(String[] args) {
System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop");
Logger rootLogger = LogManager.getRootLogger();
rootLogger.setLevel(Level.WARN);
SparkSession sparkSession = SparkSession
.builder()
.master("local")
.config("spark.sql.warehouse.dir","file:///E:/sumitK/Hadoop/warehouse")
.appName("JavaALSExample")
.getOrCreate();
RDD<String> textFile = sparkSession.sparkContext().textFile("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_json.json",2);
JavaRDD<PersonDetails> mapParser = textFile.toJavaRDD().map(v1 -> new ObjectMapper().readValue(v1, PersonDetails.class));
mapParser.foreach(t -> System.out.println(t));
Dataset<Row> anotherPeople = sparkSession.read().json(textFile);
anotherPeople.printSchema();
anotherPeople.show();
Dataset<Row> json_rec = sparkSession.read().json("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_json.json");
json_rec.printSchema();
json_rec.show();
StructType schema = new StructType( new StructField[] {
DataTypes.createStructField("cid", DataTypes.IntegerType, true),
DataTypes.createStructField("county", DataTypes.StringType, true),
DataTypes.createStructField("firstName", DataTypes.StringType, true),
DataTypes.createStructField("sex", DataTypes.StringType, true),
DataTypes.createStructField("year", DataTypes.StringType, true),
DataTypes.createStructField("dateOfBirth", DataTypes.TimestampType, true) });
/* StructType pep = new StructType(new StructField[] {
new StructField("Count", DataTypes.StringType, true, Metadata.empty()),
new StructField("County", DataTypes.StringType, true, Metadata.empty()),
new StructField("First Name", DataTypes.StringType, true, Metadata.empty()),
new StructField("Sex", DataTypes.StringType, true, Metadata.empty()),
new StructField("Year", DataTypes.StringType, true, Metadata.empty()),
new StructField("timestamp", DataTypes.TimestampType, true, Metadata.empty()) });*/
Dataset<Row> person_mod = sparkSession.read().schema(schema).json(textFile);
person_mod.printSchema();
person_mod.show();
person_mod.write().format("json").mode("overwrite").save("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_out.json");
}
Example 9: main
This example reads a movies CSV first with an inferred schema and then with a custom schema, prints both schemas, and writes the data back out gzip-compressed.
import org.apache.spark.sql.Dataset; //import the package/class this method depends on
public static void main(String[] args) {
System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop");
SparkSession sparkSession = SparkSession
.builder()
.master("local")
.config("spark.sql.warehouse.dir","file:///E:/sumitK/Hadoop/warehouse")
.appName("JavaALSExample")
.getOrCreate();
Logger rootLogger = LogManager.getRootLogger();
rootLogger.setLevel(Level.WARN);
JavaRDD<Movie> moviesRDD = sparkSession
.read().textFile("C:/Users/sumit.kumar/git/learning/src/main/resources/movies.csv")
.javaRDD().filter( str-> !(null==str))
.filter(str-> !(str.length()==0))
.filter(str-> !str.contains("movieId"))
.map(str -> Movie.parseRating(str));
moviesRDD.foreach(m -> System.out.println(m));
Dataset<Row> csv_read = sparkSession.read().format("com.databricks.spark.csv")
.option("header", "true")
.option("inferSchema", "true")
.load("C:/Users/sumit.kumar/git/learning/src/main/resources/movies.csv");
csv_read.printSchema();
csv_read.show();
StructType customSchema = new StructType(new StructField[] {
new StructField("movieId", DataTypes.LongType, true, Metadata.empty()),
new StructField("title", DataTypes.StringType, true, Metadata.empty()),
new StructField("genres", DataTypes.StringType, true, Metadata.empty())
});
Dataset<Row> csv_custom_read = sparkSession.read().format("com.databricks.spark.csv")
.option("header", "true")
.schema(customSchema)
.load("C:/Users/sumit.kumar/git/learning/src/main/resources/movies.csv");
csv_custom_read.printSchema();
csv_custom_read.show();
csv_custom_read.write()
.format("com.databricks.spark.csv")
.option("header", "true")
.option("codec", "org.apache.hadoop.io.compress.GzipCodec")
.save("C:/Users/sumit.kumar/git/learning/src/main/resources/newMovies.csv");
}
Example 10: main
This example reads an employee CSV, registers a UDF and UDAFs with the SparkSession, and prints the schema of both the untyped DataFrame and the typed Dataset<Employee>.
import org.apache.spark.sql.Dataset; //import the package/class this method depends on
public static void main(String[] args) {
//Windows-specific property if Hadoop is not installed or HADOOP_HOME is not set
System.setProperty("hadoop.home.dir", "E:\\hadoop");
//Build a Spark Session
SparkSession sparkSession = SparkSession
.builder()
.master("local")
.config("spark.sql.warehouse.dir","file:///E:/hadoop/warehouse")
.appName("EdgeBuilder")
.getOrCreate();
Logger rootLogger = LogManager.getRootLogger();
rootLogger.setLevel(Level.WARN);
// Read the CSV data
Dataset<Row> emp_ds = sparkSession.read()
.format("com.databricks.spark.csv")
.option("header", "true")
.option("inferSchema", "true")
.load("src/main/resources/employee.txt");
UDF2 calcDays=new CalcDaysUDF();
//Registering the UDFs in Spark Session created above
sparkSession.udf().register("calcDays", calcDays, DataTypes.LongType);
emp_ds.createOrReplaceTempView("emp_ds");
emp_ds.printSchema();
emp_ds.show();
sparkSession.sql("select calcDays(hiredate,'dd-MM-yyyy') from emp_ds").show();
//Instantiate UDAF
AverageUDAF calcAvg= new AverageUDAF();
//Register the UDAF with the SparkSession
sparkSession.udf().register("calAvg", calcAvg);
//Use UDAF
sparkSession.sql("select deptno,calAvg(salary) from emp_ds group by deptno ").show();
//Type-safe UDAF, applied below as a TypedColumn on the typed Dataset
TypeSafeUDAF typeSafeUDAF=new TypeSafeUDAF();
Dataset<Employee> emf = emp_ds.as(Encoders.bean(Employee.class));
emf.printSchema();
emf.show();
TypedColumn<Employee, Double> averageSalary = typeSafeUDAF.toColumn().name("averageTypeSafe");
Dataset<Double> result = emf.select(averageSalary);
result.show();
}
Example 11: main
This example loads a cars CSV with an explicit schema, prints it, and saves and reloads the DataFrame through the Ampool connector.
import org.apache.spark.sql.Dataset; //import the package/class this method depends on
/**
* Main method..
*
* @param args the arguments
*/
public static void main(final String[] args) {
final String tableName = "SparkExampleDFUsingCSV";
/** get the locator host/port from arguments, if specified.. **/
final String locatorHost = args.length > 0 ? args[0] : "localhost";
final int locatorPort = args.length > 1 ? Integer.valueOf(args[1]) : 10334;
/** create SparkContext **/
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("SparkExampleDFUsingCSV");
JavaSparkContext jsc = new JavaSparkContext(conf);
SQLContext sqlContext = new SQLContext(jsc);
StructType customSchema = new StructType(new StructField[] {
new StructField("year", DataTypes.IntegerType, true, Metadata.empty()),
new StructField("make", DataTypes.StringType, true, Metadata.empty()),
new StructField("model", DataTypes.StringType, true, Metadata.empty()),
new StructField("comment", DataTypes.StringType, true, Metadata.empty()),
new StructField("blank", DataTypes.StringType, true, Metadata.empty())
});
Dataset df = sqlContext.read()
.format("com.databricks.spark.csv")
.schema(customSchema)
.option("header", "true")
.load("cars.csv");
/** print schema of the data-frame **/
df.printSchema();
df.show();
Map<String, String> options = new HashMap<>(3);
options.put("ampool.locator.host", locatorHost);
options.put("ampool.locator.port", String.valueOf(locatorPort));
/** overwrite existing table, if specified.. **/
SaveMode saveMode = Boolean.getBoolean("overwrite") ? SaveMode.Overwrite : SaveMode.ErrorIfExists;
/** save the dataFrame to Ampool as `tableName' **/
df.write().format("io.ampool").options(options).mode(saveMode).save(tableName);
System.out.println("########## DATA FROM AMPOOL ############");
/** load the data-frame from Ampool `tableName' **/
Dataset df1 = sqlContext.read().format("io.ampool").options(options).load(tableName);
/** show the contents of loaded data-frame **/
df1.show();
/** show the total number of rows in data-frame **/
System.out.println("# NumberOfRowsInDataFrame= " + df1.count());
/** data-frame with filter **/
df1.filter("year > 1997").show();
/** data-frame with selected columns **/
df1.select("year", "make", "model", "comment").show();
df1.registerTempTable("temp_table");
sqlContext.sql("select * from temp_table order by year").show();
}
Example 12: main
This example builds a DataFrame from in-memory Employee beans, prints its schema, and saves and reloads it through the Ampool connector.
import org.apache.spark.sql.Dataset; //import the package/class this method depends on
/**
* Main method..
*
* @param args the arguments
*/
public static void main(final String[] args) {
final String tableName = "SparkExampleDF";
/** get the locator host/port from arguments, if specified.. **/
final String locatorHost = args.length > 0 ? args[0] : "localhost";
final int locatorPort = args.length > 1 ? Integer.valueOf(args[1]) : 10334;
/** create SparkContext **/
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("SparkExampleDF");
JavaSparkContext jsc = new JavaSparkContext(conf);
SQLContext sqlContext = new SQLContext(jsc);
/** create data-frame from existing data.. **/
Dataset df = sqlContext.createDataFrame(jsc.parallelize(SAMPLE_DATA), Employee.class);
/** print schema of the data-frame **/
df.printSchema();
df.show();
Map<String, String> options = new HashMap<>(3);
options.put("ampool.locator.host", locatorHost);
options.put("ampool.locator.port", String.valueOf(locatorPort));
/** overwrite existing table, if specified.. **/
SaveMode saveMode = Boolean.getBoolean("overwrite") ? SaveMode.Overwrite : SaveMode.ErrorIfExists;
/** save the dataFrame to Ampool as `tableName' **/
df.write().format("io.ampool").options(options).mode(saveMode).save(tableName);
System.out.println("########## DATA FROM AMPOOL ############");
/** load the data-frame from Ampool `tableName' **/
Dataset df1 = sqlContext.read().format("io.ampool").options(options).load(tableName);
/** show the contents of loaded data-frame **/
df1.show();
/** show the total number of rows in data-frame **/
System.out.println("# NumberOfRowsInDataFrame= " + df1.count());
/** data-frame with filter **/
df1.filter("id > 2").show();
/** data-frame with selected columns **/
df1.select("name", "id", "department").show();
df1.registerTempTable("temp_table");
sqlContext.sql("select * from temp_table order by id").show();
}
Example 13: main
This example reads a structured stream from Kafka, prints the schema of the raw stream and of the rows produced by a deserializing UDF, and runs a grouped count that writes to the console.
import org.apache.spark.sql.Dataset; //import the package/class this method depends on
public static void main(String[] args) throws StreamingQueryException {
//set log4j programmatically
LogManager.getLogger("org.apache.spark").setLevel(Level.WARN);
LogManager.getLogger("akka").setLevel(Level.ERROR);
//configure Spark
SparkConf conf = new SparkConf()
.setAppName("kafka-structured")
.setMaster("local[*]");
//initialize spark session
SparkSession sparkSession = SparkSession
.builder()
.config(conf)
.getOrCreate();
//reduce the number of shuffle partitions
sparkSession.sqlContext().setConf("spark.sql.shuffle.partitions", "3");
//data stream from kafka
Dataset<Row> ds1 = sparkSession
.readStream()
.format("kafka")
.option("kafka.bootstrap.servers", "localhost:9092")
.option("subscribe", "mytopic")
.option("startingOffsets", "earliest")
.load();
//register a UDF that deserializes the Avro-encoded Kafka value into columns (recordInjection and type are fields defined elsewhere in the class)
sparkSession.udf().register("deserialize", (byte[] data) -> {
GenericRecord record = recordInjection.invert(data).get();
return RowFactory.create(record.get("str1").toString(), record.get("str2").toString(), record.get("int1"));
}, DataTypes.createStructType(type.fields()));
ds1.printSchema();
Dataset<Row> ds2 = ds1
.select("value").as(Encoders.BINARY())
.selectExpr("deserialize(value) as rows")
.select("rows.*");
ds2.printSchema();
StreamingQuery query1 = ds2
.groupBy("str1")
.count()
.writeStream()
.queryName("Test query")
.outputMode("complete")
.format("console")
.start();
query1.awaitTermination();
}