This page collects typical usage examples of the Java class org.apache.spark.sql.Column. If you have been wondering what the Column class is for and how to use it, the curated examples here may help.
The Column class belongs to the org.apache.spark.sql package. Fifteen code examples are shown below, sorted by popularity by default.
Example 1: computeJoins
import org.apache.spark.sql.Column; // import the required package/class
public Dataset<Row> computeJoins(SQLContext sqlContext) {
    // compute all the joins
    Dataset<Row> results = node.computeJoinWithChildren(sqlContext);
    // select only the requested result columns
    Column[] selectedColumns = new Column[node.projection.size()];
    for (int i = 0; i < selectedColumns.length; i++) {
        selectedColumns[i] = new Column(node.projection.get(i));
    }
    // if a filter is set, apply it before projecting
    results = filter == null ? results.select(selectedColumns) : results.filter(filter).select(selectedColumns);
    // de-duplicate if DISTINCT was requested
    if (selectDistinct) results = results.distinct();
    return results;
}
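The same select-then-filter pattern can be sketched standalone against the plain Spark API. The sketch below assumes an already-joined Dataset and hypothetical column names "s", "p", "o"; the optional filter is passed in as a nullable Column, mirroring the code above:

import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

// project the requested columns, applying an optional filter first
public static Dataset<Row> project(Dataset<Row> joined, Column filter, boolean distinct) {
    Column[] selected = { new Column("s"), new Column("o") };
    Dataset<Row> results = filter == null ? joined.select(selected) : joined.filter(filter).select(selected);
    return distinct ? results.distinct() : results;
}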
Example 2: normalize
import org.apache.spark.sql.Column; // import the required package/class
/**
 * Scale each column to the range [min, max] based on its observed minimum and maximum.
 *
 * @param dataFrame the dataframe to scale
 * @param min the minimum value of the target range
 * @param max the maximum value of the target range
 * @param skipColumns the columns to leave unscaled
 * @return the dataframe normalized per column
 */
public static DataRowsFacade normalize(DataRowsFacade dataFrame, double min, double max, List<String> skipColumns) {
    List<String> columnsList = DataFrames.toList(dataFrame.get().columns());
    columnsList.removeAll(skipColumns);
    String[] columnNames = DataFrames.toArray(columnsList);
    // the first row holds the per-column minimums, the second row the per-column maximums
    List<Row> minMax = minMaxColumns(dataFrame, columnNames);
    for (int i = 0; i < columnNames.length; i++) {
        String columnName = columnNames[i];
        double dMin = ((Number) minMax.get(0).get(i)).doubleValue();
        double dMax = ((Number) minMax.get(1).get(i)).doubleValue();
        double maxSubMin = (dMax - dMin);
        if (maxSubMin == 0)
            maxSubMin = 1;
        Column newCol = dataFrame.get().col(columnName).minus(dMin).divide(maxSubMin).multiply(max - min).plus(min);
        dataFrame = dataRows(dataFrame.get().withColumn(columnName, newCol));
    }
    return dataFrame;
}
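For a single column, the same min-max arithmetic can be written directly against a Spark Dataset. A minimal sketch, assuming a Dataset<Row> df and a hypothetical numeric column name:

import static org.apache.spark.sql.functions.*;

import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

// rescale one numeric column of df into [newMin, newMax]
public static Dataset<Row> minMaxScale(Dataset<Row> df, String name, double newMin, double newMax) {
    Row stats = df.agg(min(col(name)), max(col(name))).head();
    double dMin = ((Number) stats.get(0)).doubleValue();
    double dMax = ((Number) stats.get(1)).doubleValue();
    double range = dMax - dMin == 0 ? 1 : dMax - dMin;  // guard against constant columns
    Column scaled = col(name).minus(dMin).divide(range).multiply(newMax - newMin).plus(newMin);
    return df.withColumn(name, scaled);
}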
Example 3: writeParquetFile
import org.apache.spark.sql.Column; // import the required package/class
@SuppressWarnings({ "unchecked", "rawtypes" })
public DataSet<ExecRow> writeParquetFile(int[] baseColumnMap, int[] partitionBy, String location, String compression,
                                         OperationContext context) {
    try {
        Dataset<Row> insertDF = SpliceSpark.getSession().createDataFrame(
                rdd.map(new SparkSpliceFunctionWrapper<>(new CountWriteFunction(context))).map(new LocatedRowToRowFunction()),
                context.getOperation().getExecRowDefinition().schema());
        List<Column> cols = new ArrayList<>();
        for (int i = 0; i < baseColumnMap.length; i++) {
            cols.add(new Column(ValueRow.getNamedColumn(baseColumnMap[i])));
        }
        List<String> partitionByCols = new ArrayList<>();
        for (int i = 0; i < partitionBy.length; i++) {
            partitionByCols.add(ValueRow.getNamedColumn(partitionBy[i]));
        }
        insertDF.write().option(SPARK_COMPRESSION_OPTION, compression)
                .partitionBy(partitionByCols.toArray(new String[partitionByCols.size()]))
                .mode(SaveMode.Append).parquet(location);
        ValueRow valueRow = new ValueRow(1);
        valueRow.setColumn(1, new SQLLongint(context.getRecordsWritten()));
        return new SparkDataSet<>(SpliceSpark.getContext().parallelize(Collections.singletonList(valueRow), 1));
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
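This example and the ORC, text, and Avro variants below (Examples 4, 5, and 10) all follow the same pattern: build the DataFrame, set writer options, and finish with a format-specific call. A stripped-down sketch against the plain Spark API, with a hypothetical path and partition column, and assuming SPARK_COMPRESSION_OPTION resolves to the standard "compression" writer option:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;

// write df as snappy-compressed Parquet, partitioned by the "dt" column
public static void writePartitioned(Dataset<Row> df, String location) {
    df.write()
      .option("compression", "snappy")
      .partitionBy("dt")
      .mode(SaveMode.Append)
      .parquet(location);  // swap for .orc(...), .csv(...), or .format("com.databricks.spark.avro").save(...)
}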
Example 4: writeORCFile
import org.apache.spark.sql.Column; // import the required package/class
@SuppressWarnings({ "unchecked", "rawtypes" })
public DataSet<ExecRow> writeORCFile(int[] baseColumnMap, int[] partitionBy, String location, String compression,
                                     OperationContext context) {
    try {
        Dataset<Row> insertDF = SpliceSpark.getSession().createDataFrame(
                rdd.map(new SparkSpliceFunctionWrapper<>(new CountWriteFunction(context))).map(new LocatedRowToRowFunction()),
                context.getOperation().getExecRowDefinition().schema());
        List<Column> cols = new ArrayList<>();
        for (int i = 0; i < baseColumnMap.length; i++) {
            cols.add(new Column(ValueRow.getNamedColumn(baseColumnMap[i])));
        }
        String[] partitionByCols = new String[partitionBy.length];
        for (int i = 0; i < partitionBy.length; i++) {
            partitionByCols[i] = ValueRow.getNamedColumn(partitionBy[i]);
        }
        insertDF.write().option(SPARK_COMPRESSION_OPTION, compression)
                .partitionBy(partitionByCols)
                .mode(SaveMode.Append).orc(location);
        ValueRow valueRow = new ValueRow(1);
        valueRow.setColumn(1, new SQLLongint(context.getRecordsWritten()));
        return new SparkDataSet<>(SpliceSpark.getContext().parallelize(Collections.singletonList(valueRow), 1));
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Example 5: writeTextFile
import org.apache.spark.sql.Column; // import the required package/class
@SuppressWarnings({ "unchecked", "rawtypes" })
public DataSet<ExecRow> writeTextFile(SpliceOperation op, String location, String characterDelimiter, String columnDelimiter,
                                      int[] baseColumnMap,
                                      OperationContext context) {
    try {
        Dataset<Row> insertDF = SpliceSpark.getSession().createDataFrame(
                rdd.map(new SparkSpliceFunctionWrapper<>(new CountWriteFunction(context))).map(new LocatedRowToRowFunction()),
                context.getOperation().getExecRowDefinition().schema());
        List<Column> cols = new ArrayList<>();
        for (int i = 0; i < baseColumnMap.length; i++) {
            cols.add(new Column(ValueRow.getNamedColumn(baseColumnMap[i])));
        }
        // spark-2.2.0: commons-lang3-3.3.2 does not support the 'XXX' timezone pattern, so specify 'ZZ' instead
        insertDF.write().option("timestampFormat", "yyyy-MM-dd'T'HH:mm:ss.SSSZZ")
                .mode(SaveMode.Append).csv(location);
        ValueRow valueRow = new ValueRow(1);
        valueRow.setColumn(1, new SQLLongint(context.getRecordsWritten()));
        return new SparkDataSet<>(SpliceSpark.getContext().parallelize(Collections.singletonList(valueRow), 1));
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Example 6: toColumn
import org.apache.spark.sql.Column; // import the required package/class
/**
 * Convert a list of string column names to Columns.
 * @param columns the column names to convert
 * @return the resulting Column list
 */
public static List<Column> toColumn(List<String> columns) {
    List<Column> ret = new ArrayList<>();
    for (String s : columns)
        ret.add(col(s));
    return ret;
}
Example 7: toColumns
import org.apache.spark.sql.Column; // import the required package/class
/**
 * Convert an array of column names to Columns.
 * @param columns the column names to convert
 * @return the converted Columns
 */
public static Column[] toColumns(String... columns) {
    Column[] ret = new Column[columns.length];
    for (int i = 0; i < columns.length; i++)
        ret[i] = col(columns[i]);
    return ret;
}
Example 8: toColumnArray
import org.apache.spark.sql.Column; // import the required package/class
public static Column[] toColumnArray(List<String> columnList) {
    Column[] columnArray = new Column[columnList.size()];
    for (int i = 0; i < columnList.size(); i++) {
        columnArray[i] = new Column(columnList.get(i));
    }
    return columnArray;
}
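Since Dataset.select takes Column varargs, the three helpers above plug in directly. A short usage sketch, assuming a Dataset<Row> df with hypothetical column names:

import java.util.Arrays;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

Column[] cols = toColumns("name", "age");
Dataset<Row> projected = df.select(cols);
// equivalent via the list-based helper
Dataset<Row> same = df.select(toColumnArray(Arrays.asList("name", "age")));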
Example 9: windows
import org.apache.spark.sql.Column; // import the required package/class
/**
 * Spark implementation of the window function.
 * We convert the Derby specification using the SparkWindow helper.
 * Most of the specification is identical to Spark's, apart from the 1-based
 * position index and some function-specific details; see SparkWindow for more.
 * @param windowContext
 * @param context
 * @param pushScope
 * @param scopeDetail
 * @return
 */
public DataSet<V> windows(WindowContext windowContext, OperationContext context, boolean pushScope, String scopeDetail) {
    pushScopeIfNeeded(context, pushScope, scopeDetail);
    try {
        Dataset<Row> dataset = toSparkRow(this, context);
        for (WindowAggregator aggregator : windowContext.getWindowFunctions()) {
            // convert resultColumnId from a 1-based index to a 0-based index
            DataType resultDataType = dataset.schema().fields()[aggregator.getResultColumnId() - 1].dataType();
            // We define the window specification and get back a Spark Column.
            // Simply provide all the information and SparkWindow will build it for you.
            Column col = SparkWindow.partitionBy(aggregator.getPartitions())
                    .function(aggregator.getType())
                    .inputs(aggregator.getInputColumnIds())
                    .orderBy(aggregator.getOrderings())
                    .frameBoundary(aggregator.getFrameDefinition())
                    .specificArgs(aggregator.getFunctionSpecificArgs())
                    .resultColumn(aggregator.getResultColumnId())
                    .resultDataType(resultDataType)
                    .toColumn();
            // Replace the result column (already defined by Derby) with the Spark expression.
            dataset = dataset.withColumn(ValueRow.getNamedColumn(aggregator.getResultColumnId() - 1), col);
        }
        // convert back to a Splice located row
        return toSpliceLocatedRow(dataset, context);
    } catch (Exception se) {
        throw new RuntimeException(se);
    } finally {
        if (pushScope) context.popScope();
    }
}
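The Column that toColumn() returns here is, presumably, an ordinary Spark window-expression Column built by the SparkWindow helper. The standard Spark API builds the same kind of expression like this (hypothetical partition and ordering column names):

import static org.apache.spark.sql.functions.*;

import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;

// number the rows within each partition, ordered by timestamp
WindowSpec spec = Window.partitionBy(col("video_id")).orderBy(col("event_start_timestamp"));
Column rowNum = row_number().over(spec);
Dataset<Row> ranked = df.withColumn("rn", rowNum);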
Example 10: writeAvroFile
import org.apache.spark.sql.Column; // import the required package/class
@SuppressWarnings({ "unchecked", "rawtypes" })
public DataSet<ExecRow> writeAvroFile(int[] baseColumnMap, int[] partitionBy, String location, String compression,
                                      OperationContext context) {
    try {
        StructType schema = AvroUtils.supportAvroDateType(context.getOperation().getExecRowDefinition().schema(), "a");
        Dataset<Row> insertDF = SpliceSpark.getSession().createDataFrame(
                rdd.map(new SparkSpliceFunctionWrapper<>(new CountWriteFunction(context))).map(new LocatedRowToRowAvroFunction()),
                schema);
        List<Column> cols = new ArrayList<>();
        for (int i = 0; i < baseColumnMap.length; i++) {
            cols.add(new Column(ValueRow.getNamedColumn(baseColumnMap[i])));
        }
        List<String> partitionByCols = new ArrayList<>();
        for (int i = 0; i < partitionBy.length; i++) {
            partitionByCols.add(ValueRow.getNamedColumn(partitionBy[i]));
        }
        insertDF.write().option(SPARK_COMPRESSION_OPTION, compression)
                .partitionBy(partitionByCols.toArray(new String[partitionByCols.size()]))
                .mode(SaveMode.Append).format("com.databricks.spark.avro").save(location);
        ValueRow valueRow = new ValueRow(1);
        valueRow.setColumn(1, new SQLLongint(context.getRecordsWritten()));
        return new SparkDataSet<>(SpliceSpark.getContext().parallelize(Collections.singletonList(valueRow), 1));
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Example 11: convertSortColumns
import org.apache.spark.sql.Column; // import the required package/class
/**
 * Convert Derby sort columns to Spark sort Columns, converting the 1-based
 * column ids to 0-based column names.
 * @param sortColumns
 * @return
 */
public static scala.collection.mutable.Buffer<Column> convertSortColumns(ColumnOrdering[] sortColumns) {
    return Arrays
            .stream(sortColumns)
            .map(column -> column.getIsAscending() ?
                    (column.getIsNullsOrderedLow() ? asc_nulls_first(ValueRow.getNamedColumn(column.getColumnId() - 1)) :
                            asc_nulls_last(ValueRow.getNamedColumn(column.getColumnId() - 1))) :
                    (column.getIsNullsOrderedLow() ? desc_nulls_last(ValueRow.getNamedColumn(column.getColumnId() - 1)) :
                            desc_nulls_first(ValueRow.getNamedColumn(column.getColumnId() - 1))))
            .collect(Collectors.collectingAndThen(Collectors.toList(), JavaConversions::asScalaBuffer));
}
Example 12: convertPartitions
import org.apache.spark.sql.Column; // import the required package/class
/**
 * Convert partition columns to Spark dataset Columns, ignoring the ordering information.
 * @param sortColumns
 * @return
 */
public static scala.collection.mutable.Buffer<Column> convertPartitions(ColumnOrdering[] sortColumns) {
    return Arrays
            .stream(sortColumns)
            .map(column -> col(ValueRow.getNamedColumn(column.getColumnId() - 1)))
            .collect(Collectors.collectingAndThen(Collectors.toList(), JavaConversions::asScalaBuffer));
}
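The null-ordering functions used in Example 11 come from org.apache.spark.sql.functions and can be applied to a Dataset directly. A small sketch with hypothetical column names:

import static org.apache.spark.sql.functions.*;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

// ascending with nulls first, then descending with nulls last
Dataset<Row> sorted = df.orderBy(asc_nulls_first("c0"), desc_nulls_last("c1"));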
Example 13: transform
import org.apache.spark.sql.Column; // import the required package/class
@Override
public Dataset<Row> transform(Dataset<?> dataset) {
    StructType schema = dataset.schema();
    StructType structSchema = getStructSchema(schema);
    Column structColumn = dataset.apply(DatasetUtil.escapeColumnName(getStructCol()));
    Dataset<Row> result = dataset.toDF();
    StructField[] fields = structSchema.fields();
    for (StructField field : fields) {
        String name = field.name();
        // pull each struct field out as a top-level column
        Column fieldColumn = structColumn.getField(DatasetUtil.escapeColumnName(name));
        result = result.withColumn(DatasetUtil.escapeColumnName(name), fieldColumn);
    }
    return result;
}
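The same struct flattening can be sketched without the DatasetUtil escaping helpers when the field names contain no special characters. A minimal version, assuming a Dataset<Row> df with a hypothetical struct column "address":

import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

Column struct = df.col("address");
StructType structType = (StructType) df.schema().apply("address").dataType();
Dataset<Row> flat = df.toDF();
for (StructField f : structType.fields()) {
    // promote each struct field to a top-level column
    flat = flat.withColumn(f.name(), struct.getField(f.name()));
}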
Example 14: getSongbyTrackID
import org.apache.spark.sql.Column; // import the required package/class
public List<String> getSongbyTrackID(String trackID) {
    Dataset<Row> result = InitSpark.spark.sql("SELECT * FROM song limit 100")
            .filter(new Column("trackID").equalTo(trackID));
    return result.toJSON().collectAsList();
}
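The same filter can be written with the functions.col factory instead of constructing a Column directly:

import static org.apache.spark.sql.functions.col;

Dataset<Row> result = InitSpark.spark.sql("SELECT * FROM song limit 100")
        .filter(col("trackID").equalTo(trackID));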
Example 15: execute
import org.apache.spark.sql.Column; // import the required package/class
@SuppressWarnings("deprecation")
private void execute() {
    SparkConf conf = new SparkConf();
    conf.setAppName("cassandra-spark-poc");
    conf.setMaster("local[*]");
    SparkContext sparkContext = new SparkContext(conf);
    System.out.println(sparkContext);

    SparkSession sparkSession = SparkSession.builder().appName("cassandra-spark-poc").master("local[*]")
            .getOrCreate();
    SQLContext sqlContext = new SQLContext(sparkSession);

    Map<String, String> options = new HashMap<String, String>();
    options.put("keyspace", "wootag");
    options.put("table", "video_view");

    Dataset<Row> dataset = sqlContext.read().format("org.apache.spark.sql.cassandra").options(options).load()
            .cache();
    dataset.registerTempTable("temptable");

    String query = "select video_id, view_duration_in_second, count(*) from temptable group by 1, 2";
    List<Row> collectAsList = sqlContext.sql(query).collectAsList();
    for (Row row : collectAsList) {
        System.out.println(row.get(0) + "," + row.get(1) + "," + row.get(2));
    }
    // sqlContext.sql(query).show(1000);

    long startTime = 1485907200000L;
    long endTime = 1487226374000L;
    for (long i = startTime; i <= endTime; i = i + TimeUnit.DAYS.toMillis(1)) {
        dataset.filter(new Column("event_start_timestamp").geq(i))
                .filter(new Column("event_start_timestamp").leq(i + TimeUnit.DAYS.toMillis(1)))
                .groupBy(new Column("view_duration_in_second"), new Column("video_id")).count()
                .orderBy("view_duration_in_second").show(1000);
        sleepDelay();
    }
}
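One note on this example: registerTempTable has been deprecated since Spark 2.0 (hence the @SuppressWarnings annotation). On current versions the equivalent call is:

dataset.createOrReplaceTempView("temptable");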