

Java DataFrame.show Method Code Examples

This article collects typical usage examples of the Java method org.apache.spark.sql.DataFrame.show: what the method does, how to call it, and what real-world uses look like. The curated code samples below should help answer these questions. You can also explore further usage examples of the enclosing class, org.apache.spark.sql.DataFrame.


The following presents six code examples of the DataFrame.show method, sorted by popularity by default.
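
Before diving into the examples, here is a minimal, self-contained sketch of the common show overloads (a sketch only: it assumes Spark 1.x, where DataFrame is the untyped API, and people.json is a hypothetical input file with one JSON record per line):

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

public class ShowOverloads {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("ShowOverloads").setMaster("local[*]");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(jsc);

        // people.json is a hypothetical input file
        DataFrame df = sqlContext.read().json("people.json");

        df.show();         // print the first 20 rows, truncating long cell values
        df.show(5);        // print only the first 5 rows
        df.show(5, false); // print 5 rows without truncating cell values

        jsc.close();
    }
}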

Example 1: createNGramDataFrame

import org.apache.spark.sql.DataFrame; // import the package/class the method depends on
/**
 * Creates an n-gram data frame from text lines.
 * @param lines a JavaRDD of text lines, one whitespace-tokenized sentence per line
 * @return an n-gram data frame
 */
DataFrame createNGramDataFrame(JavaRDD<String> lines) {
	JavaRDD<Row> rows = lines.map(new Function<String, Row>(){
		private static final long serialVersionUID = -4332903997027358601L;
		
		@Override
		public Row call(String line) throws Exception {
			return RowFactory.create(Arrays.asList(line.split("\\s+")));
		}
	});
	StructType schema = new StructType(new StructField[] {
			new StructField("words",
					DataTypes.createArrayType(DataTypes.StringType), false,
					Metadata.empty()) });
	DataFrame wordDF = new SQLContext(jsc).createDataFrame(rows, schema);
	// build bigrams (n = 2) over the word column
	NGram transformer = new NGram().setInputCol("words")
			.setOutputCol("ngrams").setN(2);
	DataFrame ngramDF = transformer.transform(wordDF);
	ngramDF.show(10, false);
	return ngramDF;
}
 
Developer: phuonglh, Project: vn.vitk, Lines: 27, Source: NGramBuilder.java
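
A possible call site for createNGramDataFrame, shown as a hedged sketch: it assumes code running inside the same NGramBuilder instance (so the jsc field used by the method is available), and input.txt is a hypothetical input file.

// Hypothetical usage from within the enclosing NGramBuilder class:
JavaRDD<String> lines = jsc.textFile("input.txt"); // one tokenized sentence per line
DataFrame ngrams = createNGramDataFrame(lines);
ngrams.select("ngrams").show(5, false);            // inspect the generated bigrams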

Example 2: main

import org.apache.spark.sql.DataFrame; // import the package/class the method depends on
public static void main(String[] args) {

    SparkConf sparkConf = new SparkConf()
            .setAppName("ReadFromMapRDB-DF-Java")
            .setMaster("local[1]");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    SQLContext sqlContext = new SQLContext(jsc);

    Configuration config = null;
    try {
      config = HBaseConfiguration.create();
      config.set(TableInputFormat.INPUT_TABLE, "/apps/tests/users_profiles");
    } catch (Exception ce) {
      ce.printStackTrace();
    }

    JavaPairRDD<ImmutableBytesWritable, Result> hBaseRDD =
            jsc.newAPIHadoopRDD(config, TableInputFormat.class, ImmutableBytesWritable.class, Result.class);

    // convert each HBase Result into a (rowKey, User) pair
    JavaPairRDD<String, User> rowPairRDD = hBaseRDD.mapToPair(

            new PairFunction<Tuple2<ImmutableBytesWritable, Result>, String, User>() {
              @Override
              public Tuple2<String, User> call(
                      Tuple2<ImmutableBytesWritable, Result> entry) throws Exception {

                Result r = entry._2;
                String rowKey = Bytes.toString(r.getRow());

                User user = new User();
                user.setRowkey(rowKey);
                user.setFirstName(Bytes.toString(r.getValue(Bytes.toBytes("default"), Bytes.toBytes("first_name"))));
                user.setLastName(Bytes.toString(r.getValue(Bytes.toBytes("default"), Bytes.toBytes("last_name"))));

                return new Tuple2<>(rowKey, user);
              }
            });

    System.out.println("************ RDD *************");
    System.out.println(rowPairRDD.count());
    System.out.println(rowPairRDD.keys().collect());
    System.out.println(rowPairRDD.values().collect());

    System.out.println("************ DF *************");
    DataFrame df = sqlContext.createDataFrame(rowPairRDD.values(), User.class);

    System.out.println(df.count());
    System.out.println(df.schema());
    df.show();

    System.out.println("************ DF with SQL *************");
    df.registerTempTable("USER_TABLE");
    DataFrame dfSql = sqlContext.sql("SELECT *  FROM USER_TABLE  WHERE firstName = 'Ally' ");
    System.out.println(dfSql.count());
    System.out.println(dfSql.schema());
    dfSql.show();


    jsc.close();

  }
 
Developer: tgrall, Project: hbase-maprdb-spark, Lines: 63, Source: ReadFromHbaseDF.java

Example 3: main

import org.apache.spark.sql.DataFrame; // import the package/class the method depends on
/**
 * Main method.
 *
 * @param args the arguments
 */
public static void main(final String[] args) {
  final String tableName = "SparkExampleDFUsingCSV";

  /** get the locator host/port from arguments, if specified.. **/
  final String locatorHost = args.length > 0 ? args[0] : "localhost";
  final int locatorPort = args.length > 1 ? Integer.valueOf(args[1]) : 10334;

  /** create SparkContext **/
  SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("SparkExampleDFUsingCSV");
  JavaSparkContext jsc = new JavaSparkContext(conf);
  SQLContext sqlContext = new SQLContext(jsc);

  StructType customSchema = new StructType(new StructField[] {
          new StructField("year", DataTypes.IntegerType, true, Metadata.empty()),
          new StructField("make", DataTypes.StringType, true, Metadata.empty()),
          new StructField("model", DataTypes.StringType, true, Metadata.empty()),
          new StructField("comment", DataTypes.StringType, true, Metadata.empty()),
          new StructField("blank", DataTypes.StringType, true, Metadata.empty())
  });

  DataFrame df = sqlContext.read()
          .format("com.databricks.spark.csv")
          .schema(customSchema)
          .option("header", "true")
          .load("cars.csv");


  /** print schema of the data-frame **/
  df.printSchema();

  df.show();

  Map<String, String> options = new HashMap<>(3);
  options.put("ampool.locator.host", locatorHost);
  options.put("ampool.locator.port", String.valueOf(locatorPort));

  /** overwrite existing table, if specified.. **/
  SaveMode saveMode = Boolean.getBoolean("overwrite") ? SaveMode.Overwrite : SaveMode.ErrorIfExists;

  /** save the dataFrame to Ampool as `tableName' **/
  df.write().format("io.ampool").options(options).mode(saveMode).save(tableName);

  System.out.println("########## DATA FROM AMPOOL ############");

  /** load the data-frame from Ampool `tableName' **/
  DataFrame df1 = sqlContext.read().format("io.ampool").options(options).load(tableName);

  /** show the contents of loaded data-frame **/
  df1.show();

  /** show the total number of rows in data-frame **/
  System.out.println("# NumberOfRowsInDataFrame= " + df1.count());

  /** data-frame with filter **/
  df1.filter("year > 1997").show();

  /** data-frame with selected columns **/
  df1.select("year", "make", "model", "comment").show();

  df1.registerTempTable("temp_table");

  sqlContext.sql("select * from temp_table order by year").show();
}
 
Developer: ampool, Project: monarch, Lines: 69, Source: SparkExampleDFUsingCSV.java

Example 4: main

import org.apache.spark.sql.DataFrame; // import the package/class the method depends on
public static void main(final String[] args) {
  final String tableName = "SparkExampleML";

  /** get the locator host/port from arguments, if specified.. **/
  final String locatorHost = args.length > 0 ? args[0] : "localhost";
  final int locatorPort = args.length > 1 ? Integer.valueOf(args[1]) : 10334;

  int numClusters = Integer.getInteger("numClusters", 2);
  int numIterations = Integer.getInteger("numIterations", 20);

  /** create SparkContext **/
  SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("SparkExampleDF");
  JavaSparkContext jsc = new JavaSparkContext(conf);
  SQLContext sqlContext = new SQLContext(jsc);

  /** create data-frame from sample ML data **/
  DataFrame df = sqlContext.createDataFrame(jsc.parallelize(SAMPLE_ML_DATA), LabeledPoint.class);
  df.show();

  Map<String, String> options = new HashMap<>(2);
  options.put("ampool.locator.host", locatorHost);
  options.put("ampool.locator.port", String.valueOf(locatorPort));

  /** overwrite existing table, if specified.. **/
  SaveMode saveMode = Boolean.getBoolean("overwrite") ? SaveMode.Overwrite : SaveMode.ErrorIfExists;

  /** save the dataFrame to Ampool as `tableName' **/
  df.write().format("io.ampool").options(options).mode(saveMode).save(tableName);

  /** load the data-frame from Ampool `tableName' **/
  DataFrame df1 = sqlContext.read().format("io.ampool").options(options).load(tableName);

  System.out.println("########## DATA FROM AMPOOL ############");
  df1.show();

  /** execute KMeans fit on the data loaded from Ampool **/
  KMeans kMeans = new KMeans().setK(numClusters).setMaxIter(numIterations)
    .setFeaturesCol("features").setPredictionCol("prediction");
  KMeansModel model = kMeans.fit(df1);

  Vector[] centers = model.clusterCenters();
  System.out.println("# Cluster Centers = " + Arrays.toString(centers));
}
 
Developer: ampool, Project: monarch, Lines: 44, Source: SparkExampleML.java

Example 5: main

import org.apache.spark.sql.DataFrame; // import the package/class the method depends on
/**
 * Main method.
 *
 * @param args the arguments
 */
public static void main(final String[] args) {
  final String tableName = "SparkExampleDF";

  /** get the locator host/port from arguments, if specified.. **/
  final String locatorHost = args.length > 0 ? args[0] : "localhost";
  final int locatorPort = args.length > 1 ? Integer.valueOf(args[1]) : 10334;

  /** create SparkContext **/
  SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("SparkExampleDF");
  JavaSparkContext jsc = new JavaSparkContext(conf);
  SQLContext sqlContext = new SQLContext(jsc);

  /** create data-frame from existing data.. **/
  DataFrame df = sqlContext.createDataFrame(jsc.parallelize(SAMPLE_DATA), Employee.class);

  /** print schema of the data-frame **/
  df.printSchema();

  df.show();

  Map<String, String> options = new HashMap<>(3);
  options.put("ampool.locator.host", locatorHost);
  options.put("ampool.locator.port", String.valueOf(locatorPort));

  /** overwrite existing table, if specified.. **/
  SaveMode saveMode = Boolean.getBoolean("overwrite") ? SaveMode.Overwrite : SaveMode.ErrorIfExists;

  /** save the dataFrame to Ampool as `tableName' **/
  df.write().format("io.ampool").options(options).mode(saveMode).save(tableName);

  System.out.println("########## DATA FROM AMPOOL ############");

  /** load the data-frame from Ampool `tableName' **/
  DataFrame df1 = sqlContext.read().format("io.ampool").options(options).load(tableName);

  /** show the contents of loaded data-frame **/
  df1.show();

  /** show the total number of rows in data-frame **/
  System.out.println("# NumberOfRowsInDataFrame= " + df1.count());

  /** data-frame with filter **/
  df1.filter("id > 2").show();

  /** data-frame with selected columns **/
  df1.select("name", "id", "department").show();

  df1.registerTempTable("temp_table");

  sqlContext.sql("select * from temp_table order by id").show();
}
 
Developer: ampool, Project: monarch, Lines: 57, Source: SparkExampleDF.java

Example 6: train

import org.apache.spark.sql.DataFrame; // import the package/class the method depends on
/**
 * Trains a whitespace classifier model and saves the resulting pipeline model
 * to an external file.
 * @param sentences a list of tokenized sentences
 * @param pipelineModelFileName path of the file to save the fitted pipeline model to
 * @param numFeatures the number of features for the hashing term-frequency transformer
 */
public void train(List<String> sentences, String pipelineModelFileName, int numFeatures) {
	List<WhitespaceContext> contexts = new ArrayList<WhitespaceContext>(sentences.size());
	int id = 0;
	for (String sentence : sentences) {
		sentence = sentence.trim();
		for (int j = 0; j < sentence.length(); j++) {
			char c = sentence.charAt(j);
			if (c == ' ' || c == '_') {
				WhitespaceContext context = new WhitespaceContext();
				context.setId(id++);
				context.setContext(extractContext(sentence, j));
				context.setLabel(c == ' ' ? 0d : 1d);
				contexts.add(context);
			}
		}
	}
	JavaRDD<WhitespaceContext> jrdd = jsc.parallelize(contexts);
	DataFrame df = sqlContext.createDataFrame(jrdd, WhitespaceContext.class);
	df.show(false);
	System.out.println("N = " + df.count());
	df.groupBy("label").count().show();
	
	org.apache.spark.ml.feature.Tokenizer tokenizer = new Tokenizer()
			.setInputCol("context").setOutputCol("words");
	HashingTF hashingTF = new HashingTF().setNumFeatures(numFeatures)
			.setInputCol(tokenizer.getOutputCol()).setOutputCol("features");
	LogisticRegression lr = new LogisticRegression().setMaxIter(100)
			.setRegParam(0.01);
	Pipeline pipeline = new Pipeline().setStages(new PipelineStage[] {
			tokenizer, hashingTF, lr });
	model = pipeline.fit(df);
	
	try {
		model.write().overwrite().save(pipelineModelFileName);
	} catch (IOException e) {
		e.printStackTrace();
	}
	
	DataFrame predictions = model.transform(df);
	predictions.show();
	MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator().setMetricName("precision");
	double accuracy = evaluator.evaluate(predictions);
	System.out.println("training accuracy = " + accuracy);
	
	LogisticRegressionModel lrModel = (LogisticRegressionModel) model.stages()[2];
	LogisticRegressionTrainingSummary trainingSummary = lrModel.summary();
	double[] objectiveHistory = trainingSummary.objectiveHistory();
	System.out.println("#(iterations) = " + objectiveHistory.length);
	for (double lossPerIteration : objectiveHistory) {
	  System.out.println(lossPerIteration);
	}
	
}
 
Developer: phuonglh, Project: vn.vitk, Lines: 61, Source: WhitespaceClassifier.java
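
A possible call site for train, again as a hedged sketch: WhitespaceClassifier is the enclosing class (per the source file above), but its constructor and field initialization are assumptions, and the Vietnamese sentences and model path are illustrative only.

// Hypothetical usage; '_' joins syllables of a compound word, which is the
// label the classifier learns to distinguish from a plain space.
WhitespaceClassifier classifier = new WhitespaceClassifier(); // assumed no-arg constructor
List<String> sentences = Arrays.asList(
        "sinh_viên học tiếng Việt",
        "cô ấy là giáo_viên");
classifier.train(sentences, "models/whitespace-pipeline", 1000);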


Note: The org.apache.spark.sql.DataFrame.show method examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The code snippets were selected from open-source projects contributed by the community; copyright of the source code remains with the original authors, and distribution and use are subject to the corresponding project's license. Do not reproduce without permission.