This article collects typical usage examples of the Java method org.apache.spark.sql.DataFrame.show. If you are unsure how to use DataFrame.show in Java, or are looking for concrete code, the curated examples below may help. You can also explore further usage examples of the containing class org.apache.spark.sql.DataFrame.
The sections below show 6 code examples of DataFrame.show, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
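Before the full examples, here is a minimal sketch of the show variants they rely on. In the Spark 1.x Java API, DataFrame.show() prints the first 20 rows with long cell values truncated, show(int numRows) limits the number of rows, and show(int numRows, boolean truncate) additionally controls truncation. The SQLContext variable and the people.json file below are hypothetical placeholders:

DataFrame people = sqlContext.read().json("people.json"); // hypothetical input file
people.show();          // first 20 rows, long cell values truncated
people.show(false);     // first 20 rows, without truncating cell values
people.show(5);         // only the first 5 rows
people.show(5, false);  // first 5 rows, without truncating cell values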
Example 1: createNGramDataFrame
import org.apache.spark.sql.DataFrame; // import the package/class the method depends on
/**
 * Creates an n-gram data frame from text lines.
 * @param lines the input text lines.
 * @return an n-gram data frame.
 */
DataFrame createNGramDataFrame(JavaRDD<String> lines) {
    JavaRDD<Row> rows = lines.map(new Function<String, Row>() {
        private static final long serialVersionUID = -4332903997027358601L;

        @Override
        public Row call(String line) throws Exception {
            return RowFactory.create(Arrays.asList(line.split("\\s+")));
        }
    });
    StructType schema = new StructType(new StructField[] {
        new StructField("words",
            DataTypes.createArrayType(DataTypes.StringType), false,
            Metadata.empty()) });
    DataFrame wordDF = new SQLContext(jsc).createDataFrame(rows, schema);
    // build a bigram language model
    NGram transformer = new NGram().setInputCol("words")
        .setOutputCol("ngrams").setN(2);
    DataFrame ngramDF = transformer.transform(wordDF);
    ngramDF.show(10, false);
    return ngramDF;
}
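For orientation, a hedged usage sketch of createNGramDataFrame, assuming the same JavaSparkContext field jsc that the method itself uses; the input sentences are made up:

JavaRDD<String> lines = jsc.parallelize(Arrays.asList(
        "we are learning spark",
        "spark sql data frames"));
DataFrame bigrams = createNGramDataFrame(lines);
// each row of the "ngrams" column holds the bigrams of one input line
bigrams.select("ngrams").show(false);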
Example 2: main
import org.apache.spark.sql.DataFrame; // import the package/class the method depends on
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf()
        .setAppName("ReadFromMapRDB-DF-Java")
        .setMaster("local[1]");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    SQLContext sqlContext = new SQLContext(jsc);

    Configuration config = null;
    try {
        config = HBaseConfiguration.create();
        config.set(TableInputFormat.INPUT_TABLE, "/apps/tests/users_profiles");
    } catch (Exception ce) {
        ce.printStackTrace();
    }

    // scan the table and get one (key, Result) pair per row
    JavaPairRDD<ImmutableBytesWritable, Result> hBaseRDD =
        jsc.newAPIHadoopRDD(config, TableInputFormat.class, ImmutableBytesWritable.class, Result.class);

    // convert each HBase Result into a (rowKey, User) pair
    JavaPairRDD<String, User> rowPairRDD = hBaseRDD.mapToPair(
        new PairFunction<Tuple2<ImmutableBytesWritable, Result>, String, User>() {
            @Override
            public Tuple2<String, User> call(
                    Tuple2<ImmutableBytesWritable, Result> entry) throws Exception {
                Result r = entry._2;
                String rowKey = Bytes.toString(r.getRow());
                User user = new User();
                user.setRowkey(rowKey);
                user.setFirstName(Bytes.toString(r.getValue(Bytes.toBytes("default"), Bytes.toBytes("first_name"))));
                user.setLastName(Bytes.toString(r.getValue(Bytes.toBytes("default"), Bytes.toBytes("last_name"))));
                return new Tuple2<String, User>(rowKey, user);
            }
        });

    System.out.println("************ RDD *************");
    System.out.println(rowPairRDD.count());
    System.out.println(rowPairRDD.keys().collect());
    System.out.println(rowPairRDD.values().collect());

    System.out.println("************ DF *************");
    DataFrame df = sqlContext.createDataFrame(rowPairRDD.values(), User.class);
    System.out.println(df.count());
    System.out.println(df.schema());
    df.show();

    System.out.println("************ DF with SQL *************");
    df.registerTempTable("USER_TABLE");
    DataFrame dfSql = sqlContext.sql("SELECT * FROM USER_TABLE WHERE firstName = 'Ally'");
    System.out.println(dfSql.count());
    System.out.println(dfSql.schema());
    dfSql.show();

    jsc.close();
}
Example 3: main
import org.apache.spark.sql.DataFrame; // import the package/class the method depends on
/**
* Main method..
*
* @param args the arguments
*/
public static void main(final String[] args) {
    final String tableName = "SparkExampleDFUsingCSV";
    /** get the locator host/port from arguments, if specified.. **/
    final String locatorHost = args.length > 0 ? args[0] : "localhost";
    final int locatorPort = args.length > 1 ? Integer.valueOf(args[1]) : 10334;
    /** create SparkContext **/
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("SparkExampleDFUsingCSV");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(jsc);
    StructType customSchema = new StructType(new StructField[] {
        new StructField("year", DataTypes.IntegerType, true, Metadata.empty()),
        new StructField("make", DataTypes.StringType, true, Metadata.empty()),
        new StructField("model", DataTypes.StringType, true, Metadata.empty()),
        new StructField("comment", DataTypes.StringType, true, Metadata.empty()),
        new StructField("blank", DataTypes.StringType, true, Metadata.empty())
    });
    DataFrame df = sqlContext.read()
        .format("com.databricks.spark.csv")
        .schema(customSchema)
        .option("header", "true")
        .load("cars.csv");
    /** print schema of the data-frame **/
    df.printSchema();
    df.show();
    Map<String, String> options = new HashMap<>(3);
    options.put("ampool.locator.host", locatorHost);
    options.put("ampool.locator.port", String.valueOf(locatorPort));
    /** overwrite existing table, if specified.. **/
    SaveMode saveMode = Boolean.getBoolean("overwrite") ? SaveMode.Overwrite : SaveMode.ErrorIfExists;
    /** save the dataFrame to Ampool as `tableName' **/
    df.write().format("io.ampool").options(options).mode(saveMode).save(tableName);
    System.out.println("########## DATA FROM AMPOOL ############");
    /** load the data-frame from Ampool `tableName' **/
    DataFrame df1 = sqlContext.read().format("io.ampool").options(options).load(tableName);
    /** show the contents of loaded data-frame **/
    df1.show();
    /** show the total number of rows in data-frame **/
    System.out.println("# NumberOfRowsInDataFrame= " + df1.count());
    /** data-frame with filter **/
    df1.filter("year > 1997").show();
    /** data-frame with selected columns **/
    df1.select("year", "make", "model", "comment").show();
    df1.registerTempTable("temp_table");
    sqlContext.sql("select * from temp_table order by year").show();
}
Example 4: main
import org.apache.spark.sql.DataFrame; // import the package/class the method depends on
public static void main(final String[] args) {
    final String tableName = "SparkExampleML";
    /** get the locator host/port from arguments, if specified.. **/
    final String locatorHost = args.length > 0 ? args[0] : "localhost";
    final int locatorPort = args.length > 1 ? Integer.valueOf(args[1]) : 10334;
    int numClusters = Integer.getInteger("numClusters", 2);
    int numIterations = Integer.getInteger("numIterations", 20);
    /** create SparkContext **/
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("SparkExampleDF");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(jsc);
    /** create data-frame from sample ML data **/
    DataFrame df = sqlContext.createDataFrame(jsc.parallelize(SAMPLE_ML_DATA), LabeledPoint.class);
    df.show();
    Map<String, String> options = new HashMap<>(2);
    options.put("ampool.locator.host", locatorHost);
    options.put("ampool.locator.port", String.valueOf(locatorPort));
    /** overwrite existing table, if specified.. **/
    SaveMode saveMode = Boolean.getBoolean("overwrite") ? SaveMode.Overwrite : SaveMode.ErrorIfExists;
    /** save the dataFrame to Ampool as `tableName' **/
    df.write().format("io.ampool").options(options).mode(saveMode).save(tableName);
    /** load the data-frame from Ampool `tableName' **/
    DataFrame df1 = sqlContext.read().format("io.ampool").options(options).load(tableName);
    System.out.println("########## DATA FROM AMPOOL ############");
    df1.show();
    /** execute KMeans fit on the data loaded from Ampool **/
    KMeans kMeans = new KMeans().setK(numClusters).setMaxIter(numIterations)
        .setFeaturesCol("features").setPredictionCol("prediction");
    KMeansModel model = kMeans.fit(df1);
    /** print the cluster centers computed by the model **/
    Vector[] centers = model.clusterCenters();
    System.out.println("# Cluster Centers = " + Arrays.toString(centers));
}
Example 5: main
import org.apache.spark.sql.DataFrame; // import the package/class the method depends on
/**
* Main method..
*
* @param args the arguments
*/
public static void main(final String[] args) {
    final String tableName = "SparkExampleDF";
    /** get the locator host/port from arguments, if specified.. **/
    final String locatorHost = args.length > 0 ? args[0] : "localhost";
    final int locatorPort = args.length > 1 ? Integer.valueOf(args[1]) : 10334;
    /** create SparkContext **/
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("SparkExampleDF");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(jsc);
    /** create data-frame from existing data.. **/
    DataFrame df = sqlContext.createDataFrame(jsc.parallelize(SAMPLE_DATA), Employee.class);
    /** print schema of the data-frame **/
    df.printSchema();
    df.show();
    Map<String, String> options = new HashMap<>(3);
    options.put("ampool.locator.host", locatorHost);
    options.put("ampool.locator.port", String.valueOf(locatorPort));
    /** overwrite existing table, if specified.. **/
    SaveMode saveMode = Boolean.getBoolean("overwrite") ? SaveMode.Overwrite : SaveMode.ErrorIfExists;
    /** save the dataFrame to Ampool as `tableName' **/
    df.write().format("io.ampool").options(options).mode(saveMode).save(tableName);
    System.out.println("########## DATA FROM AMPOOL ############");
    /** load the data-frame from Ampool `tableName' **/
    DataFrame df1 = sqlContext.read().format("io.ampool").options(options).load(tableName);
    /** show the contents of loaded data-frame **/
    df1.show();
    /** show the total number of rows in data-frame **/
    System.out.println("# NumberOfRowsInDataFrame= " + df1.count());
    /** data-frame with filter **/
    df1.filter("id > 2").show();
    /** data-frame with selected columns **/
    df1.select("name", "id", "department").show();
    df1.registerTempTable("temp_table");
    sqlContext.sql("select * from temp_table order by id").show();
}
Example 6: train
import org.apache.spark.sql.DataFrame; // import the package/class the method depends on
/**
 * Trains a whitespace classifier model and saves the resulting pipeline model
 * to an external file.
 * @param sentences a list of tokenized sentences.
 * @param pipelineModelFileName the file to save the fitted pipeline model to.
 * @param numFeatures the number of features used by the hashing term-frequency transformer.
 */
public void train(List<String> sentences, String pipelineModelFileName, int numFeatures) {
    // build one labeled WhitespaceContext per space (label 0) or underscore (label 1)
    List<WhitespaceContext> contexts = new ArrayList<WhitespaceContext>(sentences.size());
    int id = 0;
    for (String sentence : sentences) {
        sentence = sentence.trim();
        for (int j = 0; j < sentence.length(); j++) {
            char c = sentence.charAt(j);
            if (c == ' ' || c == '_') {
                WhitespaceContext context = new WhitespaceContext();
                context.setId(id++);
                context.setContext(extractContext(sentence, j));
                context.setLabel(c == ' ' ? 0d : 1d);
                contexts.add(context);
            }
        }
    }
    JavaRDD<WhitespaceContext> jrdd = jsc.parallelize(contexts);
    DataFrame df = sqlContext.createDataFrame(jrdd, WhitespaceContext.class);
    df.show(false);
    System.out.println("N = " + df.count());
    df.groupBy("label").count().show();

    // tokenizer -> hashing TF -> logistic regression pipeline
    org.apache.spark.ml.feature.Tokenizer tokenizer = new Tokenizer()
        .setInputCol("context").setOutputCol("words");
    HashingTF hashingTF = new HashingTF().setNumFeatures(numFeatures)
        .setInputCol(tokenizer.getOutputCol()).setOutputCol("features");
    LogisticRegression lr = new LogisticRegression().setMaxIter(100)
        .setRegParam(0.01);
    Pipeline pipeline = new Pipeline().setStages(new PipelineStage[] {
        tokenizer, hashingTF, lr });
    model = pipeline.fit(df);

    try {
        model.write().overwrite().save(pipelineModelFileName);
    } catch (IOException e) {
        e.printStackTrace();
    }

    // evaluate the fitted pipeline on the training data
    DataFrame predictions = model.transform(df);
    predictions.show();
    MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
        .setMetricName("precision");
    double accuracy = evaluator.evaluate(predictions);
    System.out.println("training accuracy = " + accuracy);

    // print the objective (loss) history of the logistic regression stage
    LogisticRegressionModel lrModel = (LogisticRegressionModel) model.stages()[2];
    LogisticRegressionTrainingSummary trainingSummary = lrModel.summary();
    double[] objectiveHistory = trainingSummary.objectiveHistory();
    System.out.println("#(iterations) = " + objectiveHistory.length);
    for (double lossPerIteration : objectiveHistory) {
        System.out.println(lossPerIteration);
    }
}
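A hedged sketch of how train might be called. The enclosing class name WhitespaceClassifier, the model path, and the sample sentences (where '_' joins the tokens of a compound word) are all illustrative assumptions, not taken from the example above:

// hypothetical enclosing object that provides the jsc and sqlContext fields used by train()
WhitespaceClassifier classifier = new WhitespaceClassifier();
List<String> sentences = Arrays.asList(
        "chúng_tôi học tiếng Việt",
        "học_sinh đến trường");
classifier.train(sentences, "/tmp/whitespace-pipeline-model", 1000);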