

Java DataFrame Class Code Examples

This article collects typical usage examples of the Java class org.apache.spark.sql.DataFrame. If you are wondering what the DataFrame class is for, or how to use it in practice, the curated examples below may help.


The DataFrame class belongs to the org.apache.spark.sql package. Fifteen code examples of the DataFrame class are presented below, sorted by popularity by default.

Example 1: createNGramDataFrame

import org.apache.spark.sql.DataFrame; // import the required package/class
/**
 * Creates an n-gram data frame from text lines.
 * @param lines a JavaRDD of text lines
 * @return an n-gram data frame.
 */
DataFrame createNGramDataFrame(JavaRDD<String> lines) {
	JavaRDD<Row> rows = lines.map(new Function<String, Row>(){
		private static final long serialVersionUID = -4332903997027358601L;
		
		@Override
		public Row call(String line) throws Exception {
			return RowFactory.create(Arrays.asList(line.split("\\s+")));
		}
	});
	StructType schema = new StructType(new StructField[] {
			new StructField("words",
					DataTypes.createArrayType(DataTypes.StringType), false,
					Metadata.empty()) });
	DataFrame wordDF = new SQLContext(jsc).createDataFrame(rows, schema);
	// build a bigram language model
	NGram transformer = new NGram().setInputCol("words")
			.setOutputCol("ngrams").setN(2);
	DataFrame ngramDF = transformer.transform(wordDF);
	ngramDF.show(10, false);
	return ngramDF;
}
 
Author: phuonglh · Project: vn.vitk · Lines: 27 · Source: NGramBuilder.java
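A minimal invocation sketch for the method above. It assumes the method lives on an NGramBuilder class that holds the JavaSparkContext jsc used inside; the constructor and the input path are hypothetical, not taken from the project:

SparkConf conf = new SparkConf().setAppName("NGramDemo").setMaster("local[2]");
JavaSparkContext jsc = new JavaSparkContext(conf);
JavaRDD<String> lines = jsc.textFile("data/sentences.txt"); // hypothetical input path
DataFrame ngrams = new NGramBuilder(jsc).createNGramDataFrame(lines); // constructor assumed
ngrams.printSchema(); // words: array<string>, ngrams: array<string>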

Example 2: createRegistry

import org.apache.spark.sql.DataFrame; // import the required package/class
@Override
protected JndiRegistry createRegistry() throws Exception {
    JndiRegistry registry = super.createRegistry();

    registry.bind("testFileRdd", sparkContext.textFile("src/test/resources/testrdd.txt"));

    if (shouldRunHive) {
        registry.bind("hiveContext", hiveContext);
        DataFrame jsonCars = hiveContext.read().json("src/test/resources/cars.json");
        jsonCars.registerTempTable("cars");
        registry.bind("jsonCars", jsonCars);
    }

    registry.bind("countLinesTransformation", new org.apache.camel.component.spark.RddCallback() {
        @Override
        public Object onRdd(JavaRDDLike rdd, Object... payloads) {
            return rdd.count();
        }
    });
    return registry;
}
 
Author: HydAu · Project: Camel · Lines: 22 · Source: SparkProducerTest.java
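For context, once jsonCars is registered as the temp table cars, it can be queried through the same HiveContext bound in the registry; a minimal sketch:

// Query the temp table registered above (Spark 1.x API).
DataFrame allCars = hiveContext.sql("SELECT * FROM cars");
long count = allCars.count();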

Example 3: constructListWithColumnNames

import org.apache.spark.sql.DataFrame; // import the required package/class
public static List<List<Double>> constructListWithColumnNames(DataFrame dataframe,
    String[] columnNames) {

  List<Double> l;
  Row[] rows;

  List<List<Double>> list = new ArrayList<>();
  for (String name : columnNames) {
    l = new ArrayList<>();
    // collect the selected column to the driver and convert each cell to Double
    rows = dataframe.select(name).collect();
    for (Row r : rows) {
      l.add(Double.valueOf(r.get(0).toString()));
    }
    list.add(l);
  }
  return list;
}
 
Author: zoho-labs · Project: Explainer · Lines: 19 · Source: ExplainerUtils.java

Example 4: dataframeToList

import org.apache.spark.sql.DataFrame; // import the required package/class
public static List<List<Double>> dataframeToList(DataFrame dataframe) {

  List<Double> column;
  Row[] rows;

  List<List<Double>> listOfColumns = new ArrayList<>();
  for (String s : dataframe.columns()) {
    column = new ArrayList<>();
    // collect each column to the driver and convert its values to Double
    rows = dataframe.select(s).collect();
    for (Row r : rows) {
      column.add(Double.valueOf(r.get(0).toString()));
    }
    listOfColumns.add(column);
  }
  return listOfColumns;
}
 
Author: zoho-labs · Project: Explainer · Lines: 18 · Source: ExplainerUtils.java
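A usage sketch for the two helpers above, assuming an existing sqlContext. Note that both helpers call Double.valueOf on every cell, so non-numeric columns would throw NumberFormatException; the data here is hypothetical:

// Build a small all-numeric DataFrame and convert it.
StructType schema = DataTypes.createStructType(Arrays.asList(
        DataTypes.createStructField("x", DataTypes.DoubleType, false),
        DataTypes.createStructField("y", DataTypes.DoubleType, false)));
List<Row> rows = Arrays.asList(RowFactory.create(1.0, 2.0), RowFactory.create(3.0, 4.0));
DataFrame df = sqlContext.createDataFrame(rows, schema);

List<List<Double>> all = ExplainerUtils.dataframeToList(df);                                  // [[1.0, 3.0], [2.0, 4.0]]
List<List<Double>> ys  = ExplainerUtils.constructListWithColumnNames(df, new String[]{"y"}); // [[2.0, 4.0]]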

Example 5: writeEntityMetadata

import org.apache.spark.sql.DataFrame; // import the required package/class
/**
 * Write metadata describing entity tables
 *
 * @param entitySchema the entity schema
 */
public void writeEntityMetadata(EntitySchema entitySchema) {

    // create the schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(ENTITIES_NAME, DataTypes.StringType, false));
    fields.add(DataTypes.createStructField(ENTITIES_URI, DataTypes.StringType, false));
    fields.add(DataTypes.createStructField(ENTITIES_LABEL, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(ENTITIES_NUM_ROWS, DataTypes.LongType, false));
    StructType schema = DataTypes.createStructType(fields);

    List<Tuple2<String, String>> indexes = new ArrayList<>();
    indexes.add(new Tuple2<>(ENTITIES_TABLE_NAME, ENTITIES_URI));

    List<Tuple2<String, String>> primaryKeys = new ArrayList<>();
    primaryKeys.add(new Tuple2<>(ENTITIES_TABLE_NAME, ENTITIES_NAME));

    final Map<String, String> uriLabels = rdfSchema.getUriLabels();
    // create table rows
    List<Row> rows = entitySchema.getTables().stream()
            .map(table -> {
                Object[] valueArray = new Object[]{
                        table.getName(),
                        table.getTypeURI(),
                        uriLabels.get(table.getTypeURI()),
                        table.getNumRows()
                };
                return RowFactory.create(valueArray);
            }).collect(Collectors.toList());

    // create and write the META_Entities dataframe
    DataFrame df = sql.createDataFrame(rows, schema);
    persistor.writeDataFrame(ENTITIES_TABLE_NAME, df);
    persistor.createPrimaryKeys(primaryKeys);
    persistor.createIndexes(indexes);
    df.unpersist();
}
 
Author: Merck · Project: rdf2x · Lines: 42 · Source: MetadataWriter.java
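The ENTITIES_* constants are defined elsewhere in MetadataWriter; a hedged sketch of what they plausibly look like — only META_Entities is suggested by the comment in the method, the column names are assumptions:

private static final String ENTITIES_TABLE_NAME = "META_Entities"; // suggested by the comment above
private static final String ENTITIES_NAME = "name";                // assumed
private static final String ENTITIES_URI = "uri";                  // assumed
private static final String ENTITIES_LABEL = "label";              // assumed
private static final String ENTITIES_NUM_ROWS = "num_rows";        // assumed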

Example 6: getDataFrame

import org.apache.spark.sql.DataFrame; // import the required package/class
private DataFrame getDataFrame() {

    StructType schema = createStructType(new StructField[]{
            createStructField("id", IntegerType, false),
            createStructField("a", StringType, false),
            createStructField("b", DoubleType, false),
            createStructField("c", DoubleType, false),
            createStructField("d", BooleanType, false),
    });
    // NaN and the type names come from static imports; cr(...) builds a Row (see the sketch below)
    List<Row> trainingData = Arrays.asList(
            cr(1, null, null, null, null),
            cr(2, "test", 1.2, null, null),
            cr(3, null, 1.1, null, false),
            cr(4, "faffa", NaN, 45.0, true)
    );

    DataFrame df = sqlContext.createDataFrame(trainingData, schema);
    return df;
}
 
Author: flipkart-incubator · Project: spark-transformers · Lines: 21 · Source: FillNAValuesTransformerBridgeTest.java
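The cr(...) helper is not shown in the snippet; a plausible reconstruction is a varargs wrapper around RowFactory.create (an assumption, not the project's actual code):

// Hypothetical reconstruction of the cr(...) helper used above.
private static Row cr(Object... values) {
    return RowFactory.create(values);
}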

Example 7: main

import org.apache.spark.sql.DataFrame; // import the required package/class
public static void main(String[] args) throws IOException {
	Flags.setFromCommandLineArgs(THE_OPTIONS, args);

	// Initialize the Spark conf.
	SparkConf conf = new SparkConf().setAppName("A SECTONG Application: Apache Log Analysis with Spark");
	JavaSparkContext sc = new JavaSparkContext(conf);
	JavaStreamingContext jssc = new JavaStreamingContext(sc, Flags.getInstance().getSlideInterval());
	SQLContext sqlContext = new SQLContext(sc);

	// Initialize parameters
	HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(Flags.getInstance().getKafka_topic().split(",")));
	HashMap<String, String> kafkaParams = new HashMap<String, String>();
	kafkaParams.put("metadata.broker.list", Flags.getInstance().getKafka_broker());

	// Fetch data from the Kafka stream
	JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(jssc, String.class, String.class,
			StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);

	JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
		private static final long serialVersionUID = 5266880065425088203L;

		public String call(Tuple2<String, String> tuple2) {
			return tuple2._2();
		}
	});

	JavaDStream<ApacheAccessLog> accessLogsDStream = lines.flatMap(line -> {
		List<ApacheAccessLog> list = new ArrayList<>();
		try {
			// Map each line
			list.add(ApacheAccessLog.parseFromLogLine(line));
			return list;
		} catch (RuntimeException e) {
			return list;
		}
	}).cache();

	accessLogsDStream.foreachRDD(rdd -> {

		// rdd to DataFrame
		DataFrame df = sqlContext.createDataFrame(rdd, ApacheAccessLog.class);
		// Write out Parquet files
		df.write().partitionBy("ipAddress", "method", "responseCode").mode(SaveMode.Append).parquet(Flags.getInstance().getParquetFile());

		return null;
	});

	// Start the streaming server
	jssc.start(); // start the computation
	jssc.awaitTermination(); // wait for termination
}
 
Author: sectong · Project: SparkToParquet · Lines: 52 · Source: AppMain.java
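createDataFrame(rdd, ApacheAccessLog.class) infers the schema from JavaBean getters, so ApacheAccessLog must expose at least the columns used in partitionBy. A minimal hedged sketch of that bean contract — field types and the parser signature are assumptions:

// Hypothetical minimal bean; the real class also provides parseFromLogLine(String).
public class ApacheAccessLog implements java.io.Serializable {
    private String ipAddress;
    private String method;
    private String responseCode;

    public String getIpAddress() { return ipAddress; }
    public void setIpAddress(String ipAddress) { this.ipAddress = ipAddress; }
    public String getMethod() { return method; }
    public void setMethod(String method) { this.method = method; }
    public String getResponseCode() { return responseCode; }
    public void setResponseCode(String responseCode) { this.responseCode = responseCode; }
}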

Example 8: tpch14

import org.apache.spark.sql.DataFrame; // import the required package/class
public void tpch14() {
    int year_14 = 1993;
    int monthOffset_14 = rand.nextInt(60);
    SimpleDate d14_1 = new SimpleDate(year_14 + monthOffset_14 / 12, monthOffset_14 % 12 + 1, 1);
    monthOffset_14 += 1;
    SimpleDate d14_2 = new SimpleDate(year_14 + monthOffset_14 / 12, monthOffset_14 % 12 + 1, 1);

    String lineitemPredicate =  "l_shipdate >= \"" + d14_1 + "\" and l_shipdate < \"" + d14_2 + "\"";

    long start = System.currentTimeMillis();

    System.out.println("SELECT * "
            + "FROM lineitem JOIN part ON  l_partkey = p_partkey "
            + "WHERE " + lineitemPredicate);

    DataFrame df = sqlContext.sql("SELECT * "
            + "FROM lineitem JOIN part ON  l_partkey = p_partkey "
            + "WHERE " + lineitemPredicate);

    long result = df.count();  // 76860
    System.out.println("RES: Time Taken: " + (System.currentTimeMillis() - start)  + "; Result: " + result);
}
 
Author: mitdbg · Project: AdaptDB · Lines: 23 · Source: TPCHSparkJoinWorkload.java
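The query assumes the lineitem and part tables are already registered on sqlContext; a hedged setup sketch — storage paths and format are assumptions, not the benchmark's actual loading code:

DataFrame lineitem = sqlContext.read().parquet("hdfs:///tpch/lineitem"); // path assumed
DataFrame part = sqlContext.read().parquet("hdfs:///tpch/part");         // path assumed
lineitem.registerTempTable("lineitem");
part.registerTempTable("part");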

Example 9: getModelInfo

import org.apache.spark.sql.DataFrame; // import the required package/class
@Override
public LogisticRegressionModelInfo getModelInfo(final LogisticRegressionModel sparkLRModel, DataFrame df) {
    final LogisticRegressionModelInfo logisticRegressionModelInfo = new LogisticRegressionModelInfo();
    logisticRegressionModelInfo.setWeights(sparkLRModel.weights().toArray());
    logisticRegressionModelInfo.setIntercept(sparkLRModel.intercept());
    logisticRegressionModelInfo.setNumClasses(sparkLRModel.numClasses());
    logisticRegressionModelInfo.setNumFeatures(sparkLRModel.numFeatures());
    logisticRegressionModelInfo.setThreshold((double) sparkLRModel.getThreshold().get());

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add("features");
    logisticRegressionModelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add("prediction");
    outputKeys.add("probability");
    logisticRegressionModelInfo.setOutputKeys(outputKeys);

    return logisticRegressionModelInfo;
}
 
Author: flipkart-incubator · Project: spark-transformers · Lines: 21 · Source: LogisticRegressionModelInfoAdapter.java
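The returned model info is a plain POJO, so it can be exported for serving outside Spark; a hedged sketch using Jackson — whether spark-transformers itself uses Jackson for serialization is not confirmed here:

import com.fasterxml.jackson.databind.ObjectMapper;

// Serialize the extracted model info to JSON (Jackson assumed on the classpath).
ObjectMapper mapper = new ObjectMapper();
String json = mapper.writeValueAsString(logisticRegressionModelInfo); // throws JsonProcessingException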

Example 10: writeDataFrame

import org.apache.spark.sql.DataFrame; // import the required package/class
/**
 * Write a {@link DataFrame} to the specified output
 *
 * @param name name of output table
 * @param df   dataframe containing the data
 */
@Override
public void writeDataFrame(String name, DataFrame df) {
    Map<String, String> props = config.getProperties(name);
    log.info("Writing to ElasticSearch: {}", props);
    JavaEsSparkSQL.saveToEs(df, props);
}
 
Author: Merck · Project: rdf2x · Lines: 13 · Source: ElasticSearchPersistor.java

Example 11: writeDataFrame

import org.apache.spark.sql.DataFrame; // import the required package/class
/**
 * Write a {@link DataFrame} to the specified output
 *
 * @param name name of output table
 * @param df   dataframe containing the data
 */
@Override
public void writeDataFrame(String name, DataFrame df) {
    String outputFolder = config.getOutputFolder();
    String outputPath = Paths.get(outputFolder, name).toString();
    log.info("Writing CSV files to folder {}", outputPath);
    df.write().mode(saveMode)
            .format("com.databricks.spark.csv")
            .option("header", "true")
            .save(outputPath);
}
 
Author: Merck · Project: rdf2x · Lines: 17 · Source: CSVPersistor.java
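To sanity-check the output, the same spark-csv data source can read the folder back; a minimal sketch assuming the outputPath from above:

// Read the CSV folder back with spark-csv (Spark 1.x API).
DataFrame readBack = sqlContext.read()
        .format("com.databricks.spark.csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(outputPath);
readBack.show(5);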

Example 12: testRandomSplit

import org.apache.spark.sql.DataFrame; // import the required package/class
void testRandomSplit(String inputFileName, int numFeatures, String modelFileName) {
	CMMParams params = new CMMParams()
		.setMaxIter(600)
		.setRegParam(1E-6)
		.setMarkovOrder(2)
		.setNumFeatures(numFeatures);
	
	JavaRDD<String> lines = jsc.textFile(inputFileName);
	DataFrame dataset = createDataFrame(lines.collect());
	DataFrame[] splits = dataset.randomSplit(new double[]{0.9, 0.1}); 
	DataFrame trainingData = splits[0];
	System.out.println("Number of training sequences = " + trainingData.count());
	DataFrame testData = splits[1];
	System.out.println("Number of test sequences = " + testData.count());
	// train and save a model on the training data
	cmmModel = train(trainingData, modelFileName, params);
	// test the model on the test data
	System.out.println("Test accuracy:");
	evaluate(testData); 
	// test the model on the training data
	System.out.println("Training accuracy:");
	evaluate(trainingData);
}
 
Author: phuonglh · Project: vn.vitk · Lines: 24 · Source: Tagger.java

Example 13: getModelInfo

import org.apache.spark.sql.DataFrame; // import the required package/class
@Override
public MinMaxScalerModelInfo getModelInfo(final MinMaxScalerModel from, final DataFrame df) {
    final MinMaxScalerModelInfo modelInfo = new MinMaxScalerModelInfo();
    modelInfo.setOriginalMax(from.originalMax().toArray());
    modelInfo.setOriginalMin(from.originalMin().toArray());
    modelInfo.setMax(from.getMax());
    modelInfo.setMin(from.getMin());

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(from.getInputCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);

    return modelInfo;
}
 
Author: flipkart-incubator · Project: spark-transformers · Lines: 19 · Source: MinMaxScalerModelInfoAdapter.java

Example 14: getModelInfo

import org.apache.spark.sql.DataFrame; // import the required package/class
@Override
public CountVectorizerModelInfo getModelInfo(final CountVectorizerModel from, final DataFrame df) {
    final CountVectorizerModelInfo modelInfo = new CountVectorizerModelInfo();
    modelInfo.setMinTF(from.getMinTF());
    modelInfo.setVocabulary(from.vocabulary());

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(from.getInputCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);

    return modelInfo;
}
 
Author: flipkart-incubator · Project: spark-transformers · Lines: 17 · Source: CountVectorizerModelInfoAdapter.java

Example 15: getModelInfo

import org.apache.spark.sql.DataFrame; // import the required package/class
@Override
public IfZeroVectorModelInfo getModelInfo(final IfZeroVector from, DataFrame df) {
    IfZeroVectorModelInfo modelInfo = new IfZeroVectorModelInfo();

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(from.getInputCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);

    modelInfo.setThenSetValue(from.getThenSetValue());
    modelInfo.setElseSetCol(from.getElseSetCol());

    return modelInfo;
}
 
Author: flipkart-incubator · Project: spark-transformers · Lines: 18 · Source: IfZeroVectorModelInfoAdapter.java


Note: the org.apache.spark.sql.DataFrame class examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by their respective authors; copyright remains with the original authors, and distribution and use must follow each project's license. Do not reproduce without permission.