This article collects typical usage examples of the Java class org.apache.spark.sql.SaveMode. If you are wondering what the SaveMode class is for, how to use it, or where to find real-world examples, the curated code samples below should help.
The SaveMode class belongs to the org.apache.spark.sql package. A total of 15 code examples of the SaveMode class are shown below, sorted by popularity by default.
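Before diving into the examples, here is a minimal sketch (not taken from any of the projects below; the output path is hypothetical) showing how the four SaveMode constants behave when writing a DataFrame:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class SaveModeSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("SaveModeSketch").getOrCreate();
        Dataset<Row> df = spark.range(10).toDF("id"); // toy data

        String out = "/tmp/savemode-demo"; // hypothetical path
        df.write().mode(SaveMode.ErrorIfExists).parquet(out); // default mode: fail if the path already exists
        df.write().mode(SaveMode.Append).parquet(out);        // add new files next to the existing ones
        df.write().mode(SaveMode.Overwrite).parquet(out);     // replace whatever is at the path
        df.write().mode(SaveMode.Ignore).parquet(out);        // silently skip the write if data already exists

        spark.stop();
    }
}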
Example 1: main
import org.apache.spark.sql.SaveMode; // import the required package/class

public static void main(String[] args) throws IOException {
    Flags.setFromCommandLineArgs(THE_OPTIONS, args);

    // Initialize the Spark conf
    SparkConf conf = new SparkConf().setAppName("A SECTONG Application: Apache Log Analysis with Spark");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaStreamingContext jssc = new JavaStreamingContext(sc, Flags.getInstance().getSlideInterval());
    SQLContext sqlContext = new SQLContext(sc);

    // Initialize the parameters
    HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(Flags.getInstance().getKafka_topic().split(",")));
    HashMap<String, String> kafkaParams = new HashMap<String, String>();
    kafkaParams.put("metadata.broker.list", Flags.getInstance().getKafka_broker());

    // Read data from the Kafka stream
    JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(jssc, String.class, String.class,
            StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);
    JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
        private static final long serialVersionUID = 5266880065425088203L;

        public String call(Tuple2<String, String> tuple2) {
            return tuple2._2();
        }
    });

    JavaDStream<ApacheAccessLog> accessLogsDStream = lines.flatMap(line -> {
        List<ApacheAccessLog> list = new ArrayList<>();
        try {
            // Parse each line
            list.add(ApacheAccessLog.parseFromLogLine(line));
            return list;
        } catch (RuntimeException e) {
            return list;
        }
    }).cache();

    accessLogsDStream.foreachRDD(rdd -> {
        // RDD to DataFrame
        DataFrame df = sqlContext.createDataFrame(rdd, ApacheAccessLog.class);
        // Write to Parquet files
        df.write().partitionBy("ipAddress", "method", "responseCode").mode(SaveMode.Append)
                .parquet(Flags.getInstance().getParquetFile());
        return null;
    });

    // Start the streaming context
    jssc.start(); // start the computation
    jssc.awaitTermination(); // wait for termination
}
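Because the write above uses partitionBy together with SaveMode.Append, each micro-batch adds Parquet files under Hive-style directories such as ipAddress=.../method=.../responseCode=.... A minimal sketch of reading that output back (shown here with the Spark 2.x SparkSession API; the path is hypothetical) and letting Spark prune partitions from the filter:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ReadAccessLogsSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("ReadAccessLogsSketch").getOrCreate();

        // The partition columns are recovered from the directory names, so a filter on
        // them only scans the matching subdirectories.
        Dataset<Row> logs = spark.read().parquet("/data/apache-access-logs.parquet"); // hypothetical path
        logs.filter("responseCode = 500 AND method = 'POST'")
            .groupBy("ipAddress")
            .count()
            .show();

        spark.stop();
    }
}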
Example 2: buildProperties
import org.apache.spark.sql.SaveMode; // import the required package/class

public void buildProperties() {
    // return rows of format <predicate, is_complex>
    // is_complex can be 1 or 0
    // 1 for multivalued predicate, 0 for single predicate

    // select the properties that are complex
    Dataset<Row> multivaluedProperties = spark.sql(String.format(
            "SELECT DISTINCT(%1$s) AS %1$s FROM "
            + "(SELECT %2$s, %1$s, COUNT(*) AS rc FROM %3$s GROUP BY %2$s, %1$s HAVING rc > 1) AS grouped",
            column_name_predicate, column_name_subject, name_tripletable));

    // select all the properties
    Dataset<Row> allProperties = spark.sql(String.format("SELECT DISTINCT(%1$s) AS %1$s FROM %2$s",
            column_name_predicate, name_tripletable));

    // select the properties that are not complex
    Dataset<Row> singledValueProperties = allProperties.except(multivaluedProperties);

    // combine them
    Dataset<Row> combinedProperties = singledValueProperties
            .selectExpr(column_name_predicate, "0 AS is_complex")
            .union(multivaluedProperties.selectExpr(column_name_predicate, "1 AS is_complex"));

    // remove '<' and '>', convert the characters
    Dataset<Row> cleanedProperties = combinedProperties.withColumn("p",
            functions.regexp_replace(functions.translate(combinedProperties.col("p"), "<>", ""),
                    "[[^\\w]+]", "_"));

    // write the result
    cleanedProperties.write().mode(SaveMode.Overwrite).saveAsTable("properties");
    logger.info("Created properties table with name: " + tablename_properties);
}
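A quick way to sanity-check the table written by buildProperties, assuming the predicate column is named p (as the withColumn("p", ...) call above suggests) and that the session was created with Hive support:

import org.apache.spark.sql.SparkSession;

public class InspectPropertiesSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("InspectPropertiesSketch")
                .enableHiveSupport() // the table was written with saveAsTable
                .getOrCreate();

        // One row per predicate: is_complex = 1 for multi-valued predicates, 0 otherwise.
        spark.sql("SELECT p, is_complex FROM properties ORDER BY is_complex DESC").show(20, false);

        spark.stop();
    }
}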
Example 3: execute
import org.apache.spark.sql.SaveMode; // import the required package/class

@Override
public Object execute(SparkSession sparkSession, ActionStatement actionStatement, CredentialProvider credentialManager) {
    String filePath = actionStatement.getParamValues().get(0).getValue().toString();
    String saveModeStr = actionStatement.getParamValues().get(1).getValue().toString();
    String dfTableName = actionStatement.getParamValues().get(2).getValue().toString();

    SaveMode saveMode = SaveMode.valueOf(saveModeStr);

    String sql = String.format("select * from %s", dfTableName);
    logger.info(String.format("Running sql [%s] to get data and then save it", sql));
    Dataset<Row> df = sparkSession.sql(sql);

    logger.info(String.format("Saving to csv %s, saveMode: %s", filePath, saveMode));
    df.coalesce(1).write().mode(saveMode).option("header", "false").csv(filePath);
    logger.info(String.format("Saved to csv %s, saveMode: %s", filePath, saveMode));
    return null;
}
Example 4: execute
import org.apache.spark.sql.SaveMode; // import the required package/class

@Override
public Object execute(SparkSession sparkSession, ActionStatement actionStatement, CredentialProvider credentialManager) {
    String filePath = actionStatement.getParamValues().get(0).getValue().toString();
    String saveModeStr = actionStatement.getParamValues().get(1).getValue().toString();
    String dfTableName = actionStatement.getParamValues().get(2).getValue().toString();

    SaveMode saveMode = SaveMode.valueOf(saveModeStr);

    String sql = String.format("select * from %s", dfTableName);
    logger.info(String.format("Running sql [%s] to get data and then save it", sql));
    Dataset<Row> df = sparkSession.sql(sql);

    logger.info(String.format("Saving to json %s, saveMode: %s", filePath, saveMode));
    df.coalesce(1).write().mode(saveMode).json(filePath);
    logger.info(String.format("Saved to json %s, saveMode: %s", filePath, saveMode));
    return null;
}
Example 5: execute
import org.apache.spark.sql.SaveMode; // import the required package/class

@Override
public Object execute(SparkSession sparkSession, ActionStatement actionStatement, CredentialProvider credentialManager) {
    String filePath = actionStatement.getParamValues().get(0).getValue().toString();
    String saveModeStr = actionStatement.getParamValues().get(1).getValue().toString();
    String dfTableName = actionStatement.getParamValues().get(2).getValue().toString();

    SaveMode saveMode = SaveMode.valueOf(saveModeStr);

    String sql = String.format("select * from %s", dfTableName);
    logger.info(String.format("Running sql [%s] to get data and then save it", sql));
    Dataset<Row> df = sparkSession.sql(sql);

    logger.info(String.format("Saving to parquet %s, saveMode: %s", filePath, saveMode));
    df.coalesce(1).write().mode(saveMode).parquet(filePath);
    logger.info(String.format("Saved to parquet %s, saveMode: %s", filePath, saveMode));
    return null;
}
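Examples 3–5 differ only in the output format; all of them turn the second action parameter into a SaveMode via SaveMode.valueOf. A small sketch of that mapping and of its failure mode (the parameter strings here are made up):

import org.apache.spark.sql.SaveMode;

public class SaveModeParamSketch {
    public static void main(String[] args) {
        // valueOf expects the exact enum constant name: "Append", "Overwrite",
        // "ErrorIfExists" or "Ignore".
        SaveMode mode = SaveMode.valueOf("Overwrite");
        System.out.println("Parsed save mode: " + mode);

        try {
            SaveMode.valueOf("overwrite"); // wrong case
        } catch (IllegalArgumentException e) {
            // A misspelled parameter surfaces here, before any data is written.
            System.out.println("Unknown save mode string: " + e.getMessage());
        }
    }
}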
Example 6: writeParquetFile
import org.apache.spark.sql.SaveMode; // import the required package/class

@SuppressWarnings({ "unchecked", "rawtypes" })
public DataSet<ExecRow> writeParquetFile(int[] baseColumnMap, int[] partitionBy, String location, String compression,
                                         OperationContext context) {
    try {
        Dataset<Row> insertDF = SpliceSpark.getSession().createDataFrame(
                rdd.map(new SparkSpliceFunctionWrapper<>(new CountWriteFunction(context))).map(new LocatedRowToRowFunction()),
                context.getOperation().getExecRowDefinition().schema());

        List<Column> cols = new ArrayList();
        for (int i = 0; i < baseColumnMap.length; i++) {
            cols.add(new Column(ValueRow.getNamedColumn(baseColumnMap[i])));
        }
        List<String> partitionByCols = new ArrayList();
        for (int i = 0; i < partitionBy.length; i++) {
            partitionByCols.add(ValueRow.getNamedColumn(partitionBy[i]));
        }

        insertDF.write().option(SPARK_COMPRESSION_OPTION, compression)
                .partitionBy(partitionByCols.toArray(new String[partitionByCols.size()]))
                .mode(SaveMode.Append).parquet(location);

        ValueRow valueRow = new ValueRow(1);
        valueRow.setColumn(1, new SQLLongint(context.getRecordsWritten()));
        return new SparkDataSet<>(SpliceSpark.getContext().parallelize(Collections.singletonList(valueRow), 1));
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Example 7: writeORCFile
import org.apache.spark.sql.SaveMode; // import the required package/class

@SuppressWarnings({ "unchecked", "rawtypes" })
public DataSet<ExecRow> writeORCFile(int[] baseColumnMap, int[] partitionBy, String location, String compression,
                                     OperationContext context) {
    try {
        Dataset<Row> insertDF = SpliceSpark.getSession().createDataFrame(
                rdd.map(new SparkSpliceFunctionWrapper<>(new CountWriteFunction(context))).map(new LocatedRowToRowFunction()),
                context.getOperation().getExecRowDefinition().schema());

        List<Column> cols = new ArrayList();
        for (int i = 0; i < baseColumnMap.length; i++) {
            cols.add(new Column(ValueRow.getNamedColumn(baseColumnMap[i])));
        }
        String[] partitionByCols = new String[partitionBy.length];
        for (int i = 0; i < partitionBy.length; i++) {
            partitionByCols[i] = ValueRow.getNamedColumn(partitionBy[i]);
        }

        insertDF.write().option(SPARK_COMPRESSION_OPTION, compression)
                .partitionBy(partitionByCols)
                .mode(SaveMode.Append).orc(location);

        ValueRow valueRow = new ValueRow(1);
        valueRow.setColumn(1, new SQLLongint(context.getRecordsWritten()));
        return new SparkDataSet<>(SpliceSpark.getContext().parallelize(Collections.singletonList(valueRow), 1));
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Example 8: writeTextFile
import org.apache.spark.sql.SaveMode; // import the required package/class

@SuppressWarnings({ "unchecked", "rawtypes" })
public DataSet<ExecRow> writeTextFile(SpliceOperation op, String location, String characterDelimiter, String columnDelimiter,
                                      int[] baseColumnMap,
                                      OperationContext context) {
    try {
        Dataset<Row> insertDF = SpliceSpark.getSession().createDataFrame(
                rdd.map(new SparkSpliceFunctionWrapper<>(new CountWriteFunction(context))).map(new LocatedRowToRowFunction()),
                context.getOperation().getExecRowDefinition().schema());

        List<Column> cols = new ArrayList();
        for (int i = 0; i < baseColumnMap.length; i++) {
            cols.add(new Column(ValueRow.getNamedColumn(baseColumnMap[i])));
        }

        // spark-2.2.0: commons-lang3-3.3.2 does not support the 'XXX' timezone pattern, specify 'ZZ' instead
        insertDF.write().option("timestampFormat", "yyyy-MM-dd'T'HH:mm:ss.SSSZZ")
                .mode(SaveMode.Append).csv(location);

        ValueRow valueRow = new ValueRow(1);
        valueRow.setColumn(1, new SQLLongint(context.getRecordsWritten()));
        return new SparkDataSet<>(SpliceSpark.getContext().parallelize(Collections.singletonList(valueRow), 1));
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
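The comment in Example 8 points out that with Spark 2.2 the CSV writer's timestampFormat must use the 'ZZ' timezone pattern rather than 'XXX', because the bundled commons-lang3 3.3.2 formatter does not understand 'XXX'. A hedged sketch of reading such an export back with the matching pattern (path and options are assumptions, not part of the original code):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ReadExportedCsvSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("ReadExportedCsvSketch").getOrCreate();

        // Use the same timestamp pattern on the read side so timestamp columns round-trip.
        Dataset<Row> rows = spark.read()
                .option("timestampFormat", "yyyy-MM-dd'T'HH:mm:ss.SSSZZ")
                .option("inferSchema", "true")
                .csv("/tmp/splice-export"); // hypothetical path
        rows.printSchema();

        spark.stop();
    }
}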
Example 9: repartitionByNumberOfFile
import org.apache.spark.sql.SaveMode; // import the required package/class

public void repartitionByNumberOfFile(String inputPath, String outputPath, int numberOfParquet, boolean overwrite)
        throws IOException {
    // Read the input path
    Path input = new Path(inputPath);
    // Read the output path
    Path output = new Path(outputPath);

    // Check that the input folder actually contains parquet files
    if (!HDFSUtils.containsFiles(fileSystem, input, ".parquet")) {
        throw new FileNotFoundException("The folder " + inputPath + " does not contain any .parquet file");
    }
    // If the output folder exists, is not empty and overwrite was not requested
    if (overwrite == false && fileSystem.exists(output)
            && HDFSUtils.containsFiles(fileSystem, output, ".parquet")) {
        throw new IOException("The output folder " + outputPath + " already contains parquet files");
    }

    DataFrame dataInput = sqlContext.read().parquet(input.toString());
    dataInput.repartition(numberOfParquet).write().mode(overwrite ? SaveMode.Overwrite : SaveMode.ErrorIfExists)
            .parquet(output.toString());
}
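Example 9 combines a manual HDFS pre-check with a SaveMode chosen from the overwrite flag. The same idea, sketched with the plain Hadoop FileSystem API standing in for the project-specific HDFSUtils helper (all paths and the partition count are hypothetical):

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class RepartitionSketch {
    public static void main(String[] args) throws IOException {
        SparkSession spark = SparkSession.builder().appName("RepartitionSketch").getOrCreate();

        String inputPath = "/data/in";   // hypothetical
        String outputPath = "/data/out"; // hypothetical
        boolean overwrite = false;

        // Friendly pre-check: fail early with a clear message instead of a generic Spark error.
        FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
        if (!overwrite && fs.exists(new Path(outputPath))) {
            throw new IOException("The output folder " + outputPath + " already exists");
        }

        // With overwrite=false, SaveMode.ErrorIfExists makes Spark itself refuse to clobber
        // anything that appears at the path between the check and the write.
        spark.read().parquet(inputPath)
                .repartition(8)
                .write()
                .mode(overwrite ? SaveMode.Overwrite : SaveMode.ErrorIfExists)
                .parquet(outputPath);

        spark.stop();
    }
}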
Example 10: load
import org.apache.spark.sql.SaveMode; // import the required package/class

@Override
public void load() {
    logger.info("Beginning the creation of VP tables.");
    if (this.properties_names == null) {
        logger.error("Properties not calculated yet. Extracting them");
        this.properties_names = extractProperties();
    }

    Vector<TableStats> tables_stats = new Vector<TableStats>();
    ThreadPoolExecutor loaders_pool = (ThreadPoolExecutor) Executors.newFixedThreadPool(max_parallelism);

    for (int i = 0; i < this.properties_names.length; i++) {
        String property = this.properties_names[i];
        Dataset<Row> table_VP = spark.sql("SELECT s AS s, o AS o FROM tripletable WHERE p='" + property + "'");
        String table_name_VP = "vp_" + this.getValidHiveName(property);
        loaders_pool.submit(new Thread(() -> {
            // save the table
            table_VP.write().mode(SaveMode.Overwrite).saveAsTable(table_name_VP);
            // calculate stats
            if (computeStatistics)
                tables_stats.add(calculate_stats_table(table_VP, this.getValidHiveName(property)));
            logger.info("Created VP table for the property: " + property);
        }));
    }

    // save the stats in a file with the same name as the output database
    if (computeStatistics)
        save_stats(this.database_name, tables_stats);
    logger.info("Vertical Partitioning completed. Loaded " + String.valueOf(this.properties_names.length) + " tables.");
}
Example 11: buildComplexPropertyTable
import org.apache.spark.sql.SaveMode; // import the required package/class

/**
 * Create the final property table. allProperties contains the list of all
 * possible properties; isComplexProperty contains (in the same order used by
 * allProperties) the boolean value that indicates whether that property is
 * complex (also called multi-valued) or simple.
 */
public void buildComplexPropertyTable(String[] allProperties, Boolean[] isComplexProperty) {
    // create a new aggregation environment
    PropertiesAggregateFunction aggregator = new PropertiesAggregateFunction(allProperties, columns_separator);

    String predicateObjectColumn = "po";
    String groupColumn = "group";

    // get the compressed table
    Dataset<Row> compressedTriples = spark.sql(String.format("SELECT %s, CONCAT(%s, '%s', %s) AS po FROM %s",
            column_name_subject, column_name_predicate, columns_separator, column_name_object, name_tripletable));

    // group by the subject and get all the data
    Dataset<Row> grouped = compressedTriples.groupBy(column_name_subject)
            .agg(aggregator.apply(compressedTriples.col(predicateObjectColumn)).alias(groupColumn));

    // build the query to extract the property from the array
    String[] selectProperties = new String[allProperties.length + 1];
    selectProperties[0] = column_name_subject;
    for (int i = 0; i < allProperties.length; i++) {
        // if the property is a full URI, remove the < at the beginning and the > at the end
        String rawProperty = allProperties[i].startsWith("<") && allProperties[i].endsWith(">")
                ? allProperties[i].substring(1, allProperties[i].length() - 1)
                : allProperties[i];
        // if it is not a complex type, extract the single value
        String newProperty = isComplexProperty[i]
                ? " " + groupColumn + "[" + String.valueOf(i) + "] AS " + getValidHiveName(rawProperty)
                : " " + groupColumn + "[" + String.valueOf(i) + "][0] AS " + getValidHiveName(rawProperty);
        selectProperties[i + 1] = newProperty;
    }

    Dataset<Row> propertyTable = grouped.selectExpr(selectProperties);

    // write the final one, partitioned by subject
    propertyTable.write().mode(SaveMode.Overwrite).format(table_format).saveAsTable(output_tablename);
    logger.info("Created property table with name: " + output_tablename);
}
Example 12: writeAncestorsToTable
import org.apache.spark.sql.SaveMode; // import the required package/class

/**
 * Writes ancestor records to a table. This method ensures the columns and partitions are mapped
 * properly, and is a workaround similar to the problem described <a
 * href="http://stackoverflow.com/questions/35313077/pyspark-order-of-column-on-write-to-mysql-with-jdbc">here</a>.
 *
 * @param ancestors a dataset of ancestor records
 * @param tableName the table to write them to
 */
private static void writeAncestorsToTable(Dataset<Ancestor> ancestors, String tableName) {
    Dataset<Row> orderedColumnDataset = ancestors.select("descendantSystem",
            "descendantValue",
            "ancestorSystem",
            "ancestorValue",
            "uri",
            "version");

    orderedColumnDataset.write()
            .mode(SaveMode.ErrorIfExists)
            .insertInto(tableName);
}
Example 13: writeValuesToTable
import org.apache.spark.sql.SaveMode; // import the required package/class

/**
 * Writes value records to a table. This method ensures the columns and partitions are mapped
 * properly, and is a workaround similar to the problem described <a
 * href="http://stackoverflow.com/questions/35313077/pyspark-order-of-column-on-write-to-mysql-with-jdbc">here</a>.
 *
 * @param values a dataset of value records
 * @param tableName the table to write them to
 */
private static void writeValuesToTable(Dataset<Value> values, String tableName) {
    // Note the last two columns here must be the partitioned-by columns, in order and in lower case,
    // for Spark to properly match them to the partitions
    Dataset<Row> orderColumnDataset = values.select("system",
            "version",
            "value",
            "valueseturi",
            "valuesetversion");

    orderColumnDataset.write()
            .mode(SaveMode.ErrorIfExists)
            .insertInto(tableName);
}
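Examples 12 and 13 deliberately select the columns before calling insertInto because, unlike saveAsTable, insertInto resolves columns by position rather than by name (which is also why the partition columns are listed last). A self-contained sketch of that behaviour against a throwaway, non-partitioned table; all table and column names are made up:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class InsertIntoOrderSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("InsertIntoOrderSketch")
                .enableHiveSupport()
                .getOrCreate();

        spark.sql("CREATE TABLE IF NOT EXISTS demo_values (system STRING, version STRING, value STRING)");

        // Columns appear in a different order than in the table definition.
        Dataset<Row> values = spark.sql(
                "SELECT '8867-4' AS value, 'http://loinc.org' AS system, '2.56' AS version");

        // insertInto matches columns by position, not by name: without the explicit select,
        // the 'value' column would land in the table's 'system' column.
        values.select("system", "version", "value")
              .write()
              .insertInto("demo_values");

        spark.stop();
    }
}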
Example 14: savePropertiesIntoTable
import org.apache.spark.sql.SaveMode; // import the required package/class

/**
 * Create the "properties" table. See:
 * {@link ComplexPropertyTableLoader#tablename_properties}.
 */
public void savePropertiesIntoTable() {
    // return rows of format <predicate, is_complex>
    // is_complex can be 1 or 0
    // 1 for multivalued predicate, 0 for single predicate

    // select the properties that are complex
    DataFrame multivaluedProperties = this.hiveContext.sql(String.format(
            "SELECT DISTINCT(%1$s) AS %1$s FROM (SELECT %2$s, %1$s, COUNT(*) AS rc FROM %3$s GROUP BY %2$s, %1$s HAVING rc > 1) AS grouped",
            column_name_predicate, column_name_subject, tablename_triple_table));

    // select all the properties
    DataFrame allProperties = this.hiveContext.sql(String.format("SELECT DISTINCT(%1$s) AS %1$s FROM %2$s",
            column_name_predicate, tablename_triple_table));

    // select the properties that are not complex
    DataFrame singledValueProperties = allProperties.except(multivaluedProperties);

    // combine them
    DataFrame combinedProperties = singledValueProperties
            .selectExpr(column_name_predicate, "0 AS " + column_name_is_complex)
            .unionAll(multivaluedProperties.selectExpr(column_name_predicate, "1 AS " + column_name_is_complex));

    // remove '<' and '>', convert the characters
    DataFrame cleanedProperties = combinedProperties.withColumn("p",
            functions.regexp_replace(functions.translate(combinedProperties.col("p"), "<>", ""),
                    "[[^\\w]+]", "_"));

    // write the result
    cleanedProperties.write().mode(SaveMode.Overwrite).saveAsTable(tablename_properties);
}
Example 15: buildComplexPropertyTable
import org.apache.spark.sql.SaveMode; // import the required package/class

/**
 * Create the final property table. allProperties contains the list of all
 * possible properties; isComplexProperty contains (in the same order used by
 * allProperties) the boolean value that indicates whether that property is
 * complex (also called multi-valued) or simple.
 */
public void buildComplexPropertyTable(String[] allProperties, Boolean[] isComplexProperty) {
    // create a new aggregation environment
    PropertiesAggregateFunction aggregator = new PropertiesAggregateFunction(allProperties, columns_separator);

    String predicateObjectColumn = "po";
    String groupColumn = "group";

    // get the compressed table
    DataFrame compressedTriples = this.hiveContext.sql(String.format("SELECT %s, CONCAT(%s, '%s', %s) AS po FROM %s",
            column_name_subject, column_name_predicate, columns_separator, column_name_object, tablename_triple_table));

    // group by the subject and get all the data
    DataFrame grouped = compressedTriples.groupBy(column_name_subject)
            .agg(aggregator.apply(compressedTriples.col(predicateObjectColumn)).alias(groupColumn));

    // build the query to extract the property from the array
    String[] selectProperties = new String[allProperties.length + 1];
    selectProperties[0] = column_name_subject;
    for (int i = 0; i < allProperties.length; i++) {
        // if the property is a full URI, remove the < at the beginning and the > at the end
        String rawProperty = allProperties[i].startsWith("<") && allProperties[i].endsWith(">")
                ? allProperties[i].substring(1, allProperties[i].length() - 1)
                : allProperties[i];
        // if it is not a complex type, extract the single value
        String newProperty = isComplexProperty[i]
                ? " " + groupColumn + "[" + String.valueOf(i) + "] AS " + getValidColumnName(rawProperty)
                : " " + groupColumn + "[" + String.valueOf(i) + "][0] AS " + getValidColumnName(rawProperty);
        selectProperties[i + 1] = newProperty;
    }

    DataFrame propertyTable = grouped.selectExpr(selectProperties);

    // write the final one
    propertyTable.write().mode(SaveMode.Overwrite).format(table_format_parquet)
            .saveAsTable(tablename_complex_property_table);
}