This article collects typical usage examples of the Java class org.apache.spark.sql.DataFrame. If you are wondering what the DataFrame class is for, how to use it in Java, or want to see it in real code, the curated class code examples below should help.
The DataFrame class belongs to the org.apache.spark.sql package. A total of 15 DataFrame code examples are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
Example 1: createNGramDataFrame
import org.apache.spark.sql.DataFrame; // import the required package/class
/**
 * Creates an n-gram data frame from text lines.
 * @param lines input lines of text
 * @return an n-gram data frame.
 */
DataFrame createNGramDataFrame(JavaRDD<String> lines) {
    JavaRDD<Row> rows = lines.map(new Function<String, Row>() {
        private static final long serialVersionUID = -4332903997027358601L;

        @Override
        public Row call(String line) throws Exception {
            return RowFactory.create(Arrays.asList(line.split("\\s+")));
        }
    });
    StructType schema = new StructType(new StructField[] {
            new StructField("words",
                    DataTypes.createArrayType(DataTypes.StringType), false,
                    Metadata.empty()) });
    DataFrame wordDF = new SQLContext(jsc).createDataFrame(rows, schema);
    // build a bigram language model
    NGram transformer = new NGram().setInputCol("words")
            .setOutputCol("ngrams").setN(2);
    DataFrame ngramDF = transformer.transform(wordDF);
    ngramDF.show(10, false);
    return ngramDF;
}
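For context, a minimal call site for this helper could look like the following sketch. It assumes the surrounding class already holds the JavaSparkContext field jsc used inside the method; the input path is purely illustrative.
// Hypothetical usage sketch (illustrative path; reuses the jsc field from the class above)
JavaRDD<String> lines = jsc.textFile("data/sentences.txt");
DataFrame ngrams = createNGramDataFrame(lines);
ngrams.select("ngrams").show(5, false);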
Example 2: createRegistry
import org.apache.spark.sql.DataFrame; // import the required package/class
@Override
protected JndiRegistry createRegistry() throws Exception {
    JndiRegistry registry = super.createRegistry();
    registry.bind("testFileRdd", sparkContext.textFile("src/test/resources/testrdd.txt"));
    if (shouldRunHive) {
        registry.bind("hiveContext", hiveContext);
        DataFrame jsonCars = hiveContext.read().json("src/test/resources/cars.json");
        jsonCars.registerTempTable("cars");
        registry.bind("jsonCars", jsonCars);
    }
    registry.bind("countLinesTransformation", new org.apache.camel.component.spark.RddCallback() {
        @Override
        public Object onRdd(JavaRDDLike rdd, Object... payloads) {
            return rdd.count();
        }
    });
    return registry;
}
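As a rough, unverified sketch of how the beans bound above might be exercised from a Camel route: the camel-spark component exposes a spark:rdd endpoint that takes an RDD and an RddCallback from the registry. The endpoint URI and the template field (from CamelTestSupport) are assumptions, not part of the original test.
// Hypothetical sketch: run the bound callback against the bound RDD via the camel-spark component
long lineCount = template.requestBody(
        "spark:rdd?rdd=#testFileRdd&rddCallback=#countLinesTransformation",
        null, Long.class);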
Example 3: constructListWithColumnNames
import org.apache.spark.sql.DataFrame; // import the required package/class
public static List<List<Double>> constructListWithColumnNames(DataFrame dataframe,
        String[] columnNames) {
    List<Double> l;
    Row[] rows;
    List<List<Double>> list = new ArrayList<>();
    for (String name : columnNames) {
        l = new ArrayList<>();
        rows = dataframe.select(name).collect();
        for (Row r : rows) {
            l.add(Double.valueOf(r.get(0).toString()));
        }
        list.add(l);
    }
    return list;
}
Example 4: dataframeToList
import org.apache.spark.sql.DataFrame; // import the required package/class
public static List<List<Double>> dataframeToList(DataFrame dataframe) {
    List<Double> column;
    Row[] rows;
    List<List<Double>> listOfColumns = new ArrayList<>();
    for (String s : dataframe.columns()) {
        column = new ArrayList<>();
        rows = dataframe.select(s).collect();
        for (Row r : rows) {
            column.add(Double.valueOf(r.get(0).toString()));
        }
        listOfColumns.add(column);
    }
    return listOfColumns;
}
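A minimal sketch of calling these two converters, assuming a hypothetical DataFrame df with numeric columns named "x" and "y":
// Hypothetical usage sketch; df, "x" and "y" are placeholders, not from the original code
List<List<Double>> selectedColumns = constructListWithColumnNames(df, new String[]{"x", "y"});
List<List<Double>> allColumns = dataframeToList(df);
System.out.println("First value of column x: " + selectedColumns.get(0).get(0));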
Example 5: writeEntityMetadata
import org.apache.spark.sql.DataFrame; // import the required package/class
/**
 * Write metadata describing entity tables
 *
 * @param entitySchema the entity schema
 */
public void writeEntityMetadata(EntitySchema entitySchema) {
    // create the schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(ENTITIES_NAME, DataTypes.StringType, false));
    fields.add(DataTypes.createStructField(ENTITIES_URI, DataTypes.StringType, false));
    fields.add(DataTypes.createStructField(ENTITIES_LABEL, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(ENTITIES_NUM_ROWS, DataTypes.LongType, false));
    StructType schema = DataTypes.createStructType(fields);

    List<Tuple2<String, String>> indexes = new ArrayList<>();
    indexes.add(new Tuple2<>(ENTITIES_TABLE_NAME, ENTITIES_URI));
    List<Tuple2<String, String>> primaryKeys = new ArrayList<>();
    primaryKeys.add(new Tuple2<>(ENTITIES_TABLE_NAME, ENTITIES_NAME));

    final Map<String, String> uriLabels = rdfSchema.getUriLabels();
    // create table rows
    List<Row> rows = entitySchema.getTables().stream()
            .map(table -> {
                Object[] valueArray = new Object[]{
                        table.getName(),
                        table.getTypeURI(),
                        uriLabels.get(table.getTypeURI()),
                        table.getNumRows()
                };
                return RowFactory.create(valueArray);
            }).collect(Collectors.toList());

    // create and write the META_Entities dataframe
    DataFrame df = sql.createDataFrame(rows, schema);
    persistor.writeDataFrame(ENTITIES_TABLE_NAME, df);
    persistor.createPrimaryKeys(primaryKeys);
    persistor.createIndexes(indexes);
    df.unpersist();
}
Example 6: getDataFrame
import org.apache.spark.sql.DataFrame; // import the required package/class
private DataFrame getDataFrame() {
    StructType schema = createStructType(new StructField[]{
            createStructField("id", IntegerType, false),
            createStructField("a", StringType, false),
            createStructField("b", DoubleType, false),
            createStructField("c", DoubleType, false),
            createStructField("d", BooleanType, false),
    });
    List<Row> trainingData = Arrays.asList(
            cr(1, null, null, null, null),
            cr(2, "test", 1.2, null, null),
            cr(3, null, 1.1, null, false),
            cr(4, "faffa", NaN, 45.0, true)
    );
    DataFrame df = sqlContext.createDataFrame(trainingData, schema);
    return df;
}
Developer: flipkart-incubator, Project: spark-transformers, Lines of code: 21, Source: FillNAValuesTransformerBridgeTest.java
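The cr(...) helper is not shown in this snippet; a plausible definition, assumed here rather than taken from the original test, is simply a varargs wrapper around RowFactory.create:
// Assumed definition of the cr(...) row-building helper used above (not from the original snippet)
private static Row cr(Object... values) {
    return RowFactory.create(values);
}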
Example 7: main
import org.apache.spark.sql.DataFrame; // import the required package/class
public static void main(String[] args) throws IOException {
    Flags.setFromCommandLineArgs(THE_OPTIONS, args);

    // initialize the Spark configuration
    SparkConf conf = new SparkConf().setAppName("A SECTONG Application: Apache Log Analysis with Spark");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaStreamingContext jssc = new JavaStreamingContext(sc, Flags.getInstance().getSlideInterval());
    SQLContext sqlContext = new SQLContext(sc);

    // initialize the Kafka parameters
    HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(Flags.getInstance().getKafka_topic().split(",")));
    HashMap<String, String> kafkaParams = new HashMap<String, String>();
    kafkaParams.put("metadata.broker.list", Flags.getInstance().getKafka_broker());

    // consume data from the Kafka stream
    JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(jssc, String.class, String.class,
            StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);
    JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
        private static final long serialVersionUID = 5266880065425088203L;

        public String call(Tuple2<String, String> tuple2) {
            return tuple2._2();
        }
    });

    JavaDStream<ApacheAccessLog> accessLogsDStream = lines.flatMap(line -> {
        List<ApacheAccessLog> list = new ArrayList<>();
        try {
            // parse each line into an access-log record, skipping malformed lines
            list.add(ApacheAccessLog.parseFromLogLine(line));
            return list;
        } catch (RuntimeException e) {
            return list;
        }
    }).cache();

    accessLogsDStream.foreachRDD(rdd -> {
        // convert the RDD to a DataFrame
        DataFrame df = sqlContext.createDataFrame(rdd, ApacheAccessLog.class);
        // append the records to partitioned Parquet files
        df.write().partitionBy("ipAddress", "method", "responseCode").mode(SaveMode.Append).parquet(Flags.getInstance().getParquetFile());
        return null;
    });

    // start the streaming job
    jssc.start();              // start the computation
    jssc.awaitTermination();   // wait for it to terminate
}
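Once the streaming job has appended data, the partitioned Parquet output can be read back for batch queries. A short sketch, assuming the same sqlContext and Flags configuration are in scope:
// Hypothetical follow-up: read the partitioned Parquet output back and aggregate it
DataFrame logs = sqlContext.read().parquet(Flags.getInstance().getParquetFile());
logs.groupBy("responseCode").count().show();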
Example 8: tpch14
import org.apache.spark.sql.DataFrame; // import the required package/class
public void tpch14() {
    int year_14 = 1993;
    int monthOffset_14 = rand.nextInt(60);
    SimpleDate d14_1 = new SimpleDate(year_14 + monthOffset_14 / 12, monthOffset_14 % 12 + 1, 1);
    monthOffset_14 += 1;
    SimpleDate d14_2 = new SimpleDate(year_14 + monthOffset_14 / 12, monthOffset_14 % 12 + 1, 1);
    String lineitemPredicate = "l_shipdate >= \"" + d14_1 + "\" and l_shipdate < \"" + d14_2 + "\"";

    long start = System.currentTimeMillis();
    System.out.println("SELECT * "
            + "FROM lineitem JOIN part ON l_partkey = p_partkey "
            + "WHERE " + lineitemPredicate);
    DataFrame df = sqlContext.sql("SELECT * "
            + "FROM lineitem JOIN part ON l_partkey = p_partkey "
            + "WHERE " + lineitemPredicate);
    long result = df.count(); // 76860
    System.out.println("RES: Time Taken: " + (System.currentTimeMillis() - start) + "; Result: " + result);
}
Example 9: getModelInfo
import org.apache.spark.sql.DataFrame; // import the required package/class
@Override
public LogisticRegressionModelInfo getModelInfo(final LogisticRegressionModel sparkLRModel, DataFrame df) {
    final LogisticRegressionModelInfo logisticRegressionModelInfo = new LogisticRegressionModelInfo();
    logisticRegressionModelInfo.setWeights(sparkLRModel.weights().toArray());
    logisticRegressionModelInfo.setIntercept(sparkLRModel.intercept());
    logisticRegressionModelInfo.setNumClasses(sparkLRModel.numClasses());
    logisticRegressionModelInfo.setNumFeatures(sparkLRModel.numFeatures());
    logisticRegressionModelInfo.setThreshold((double) sparkLRModel.getThreshold().get());

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add("features");
    logisticRegressionModelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add("prediction");
    outputKeys.add("probability");
    logisticRegressionModelInfo.setOutputKeys(outputKeys);

    return logisticRegressionModelInfo;
}
Developer: flipkart-incubator, Project: spark-transformers, Lines of code: 21, Source: LogisticRegressionModelInfoAdapter.java
Example 10: writeDataFrame
import org.apache.spark.sql.DataFrame; // import the required package/class
/**
 * Write a {@link DataFrame} to the specified output
 *
 * @param name name of output table
 * @param df   dataframe containing the data
 */
@Override
public void writeDataFrame(String name, DataFrame df) {
    Map<String, String> props = config.getProperties(name);
    log.info("Writing to ElasticSearch: {}", props);
    JavaEsSparkSQL.saveToEs(df, props);
}
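The per-table properties map drives the elasticsearch-hadoop connector. An illustrative configuration might contain keys such as es.nodes and es.resource; the values below are assumptions, not taken from the original project.
// Illustrative ElasticSearch properties (assumed values)
Map<String, String> props = new HashMap<>();
props.put("es.nodes", "localhost:9200");   // assumed cluster address
props.put("es.resource", "logs/entries");  // assumed index/type to write into
JavaEsSparkSQL.saveToEs(df, props);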
Example 11: writeDataFrame
import org.apache.spark.sql.DataFrame; // import the required package/class
/**
 * Write a {@link DataFrame} to the specified output
 *
 * @param name name of output table
 * @param df   dataframe containing the data
 */
@Override
public void writeDataFrame(String name, DataFrame df) {
    String outputFolder = config.getOutputFolder();
    String outputPath = Paths.get(outputFolder, name).toString();
    log.info("Writing CSV files to folder {}", outputPath);
    df.write().mode(saveMode)
            .format("com.databricks.spark.csv")
            .option("header", "true")
            .save(outputPath);
}
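For completeness, CSV files written this way can be read back with the same spark-csv data source. A sketch, assuming an existing sqlContext and the outputPath used above:
// Hypothetical round trip: read the written CSV folder back into a DataFrame
DataFrame csvDF = sqlContext.read()
        .format("com.databricks.spark.csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(outputPath);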
Example 12: testRandomSplit
import org.apache.spark.sql.DataFrame; // import the required package/class
void testRandomSplit(String inputFileName, int numFeatures, String modelFileName) {
    CMMParams params = new CMMParams()
            .setMaxIter(600)
            .setRegParam(1E-6)
            .setMarkovOrder(2)
            .setNumFeatures(numFeatures);

    JavaRDD<String> lines = jsc.textFile(inputFileName);
    DataFrame dataset = createDataFrame(lines.collect());
    DataFrame[] splits = dataset.randomSplit(new double[]{0.9, 0.1});
    DataFrame trainingData = splits[0];
    System.out.println("Number of training sequences = " + trainingData.count());
    DataFrame testData = splits[1];
    System.out.println("Number of test sequences = " + testData.count());

    // train and save a model on the training data
    cmmModel = train(trainingData, modelFileName, params);

    // test the model on the test data
    System.out.println("Test accuracy:");
    evaluate(testData);

    // test the model on the training data
    System.out.println("Training accuracy:");
    evaluate(trainingData);
}
Example 13: getModelInfo
import org.apache.spark.sql.DataFrame; // import the required package/class
@Override
public MinMaxScalerModelInfo getModelInfo(final MinMaxScalerModel from, final DataFrame df) {
    final MinMaxScalerModelInfo modelInfo = new MinMaxScalerModelInfo();
    modelInfo.setOriginalMax(from.originalMax().toArray());
    modelInfo.setOriginalMin(from.originalMin().toArray());
    modelInfo.setMax(from.getMax());
    modelInfo.setMin(from.getMin());

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(from.getInputCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);

    return modelInfo;
}
Example 14: getModelInfo
import org.apache.spark.sql.DataFrame; // import the required package/class
@Override
public CountVectorizerModelInfo getModelInfo(final CountVectorizerModel from, final DataFrame df) {
    final CountVectorizerModelInfo modelInfo = new CountVectorizerModelInfo();
    modelInfo.setMinTF(from.getMinTF());
    modelInfo.setVocabulary(from.vocabulary());

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(from.getInputCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);

    return modelInfo;
}
Example 15: getModelInfo
import org.apache.spark.sql.DataFrame; // import the required package/class
@Override
public IfZeroVectorModelInfo getModelInfo(final IfZeroVector from, DataFrame df) {
    IfZeroVectorModelInfo modelInfo = new IfZeroVectorModelInfo();

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(from.getInputCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);

    modelInfo.setThenSetValue(from.getThenSetValue());
    modelInfo.setElseSetCol(from.getElseSetCol());

    return modelInfo;
}