This article collects typical usage examples of the org.apache.spark.sql.SparkSession class in Java. If you are wondering what the SparkSession class is for, how to use it, or what working examples look like, the curated code samples below may help.
The SparkSession class belongs to the org.apache.spark.sql package. Fifteen code examples are shown below, ordered by popularity.
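Before walking through the examples, here is a minimal, self-contained sketch of how a SparkSession is usually obtained; the master URL and application name are placeholder values, not taken from any example below.

import org.apache.spark.sql.SparkSession;

public class SparkSessionQuickStart {
    public static void main(String[] args) {
        // Build (or reuse) the session; "local[*]" and the app name are illustrative only.
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("SparkSessionQuickStart")
                .getOrCreate();

        // ... use spark.read(), spark.sql(), spark.createDataFrame(), etc. ...

        spark.stop();
    }
}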
Example 1: generateData_week_timepoints_by_10_minutes
import org.apache.spark.sql.SparkSession; // import the required package/class
private static Dataset<Row> generateData_week_timepoints_by_10_minutes(SparkSession spark) {
    StructField[] structFields = new StructField[1];
    org.apache.spark.sql.types.DataType dataType = DataTypes.IntegerType;
    String column = "timepoint";
    StructField structField = new StructField(column, dataType, true, Metadata.empty());
    structFields[0] = structField;
    StructType structType = new StructType(structFields);
    List<Row> rows = new ArrayList<>();
    int weekTotalMinutes = 7 * 24 * 60;
    int timepointIntervalMinutes = 10;
    for (int i = 0; i < weekTotalMinutes / timepointIntervalMinutes; i++) {
        Object[] objects = new Object[structFields.length];
        objects[0] = i;
        Row row = RowFactory.create(objects);
        rows.add(row);
    }
    Dataset<Row> df = spark.createDataFrame(rows, structType);
    return df;
}
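A quick sanity check of the generator above might look like the following sketch; the surrounding class and the spark session are assumed to already exist.

// Hypothetical usage of the generator above.
Dataset<Row> timepoints = generateData_week_timepoints_by_10_minutes(spark);
timepoints.printSchema();                // a single nullable integer column named "timepoint"
System.out.println(timepoints.count());  // 7 * 24 * 60 / 10 = 1008 rows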
Example 2: readMultiaxialHierarchyFile
import org.apache.spark.sql.SparkSession; // import the required package/class
/**
 * Reads the LOINC multiaxial hierarchy file and converts it to a {@link HierarchicalElement}
 * dataset.
 *
 * @param spark the Spark session
 * @param loincHierarchyPath path to the multiaxial hierarchy CSV
 * @return a dataset of {@link HierarchicalElement} representing the hierarchical relationships.
 */
public static Dataset<HierarchicalElement> readMultiaxialHierarchyFile(SparkSession spark,
    String loincHierarchyPath) {
  return spark.read()
      .option("header", true)
      .csv(loincHierarchyPath)
      .select(col("IMMEDIATE_PARENT"), col("CODE"))
      .where(col("IMMEDIATE_PARENT").isNotNull()
          .and(col("IMMEDIATE_PARENT").notEqual(lit(""))))
      .where(col("CODE").isNotNull()
          .and(col("CODE").notEqual(lit(""))))
      .map((MapFunction<Row, HierarchicalElement>) row -> {
        HierarchicalElement element = new HierarchicalElement();
        element.setAncestorSystem(LOINC_CODE_SYSTEM_URI);
        element.setAncestorValue(row.getString(0));
        element.setDescendantSystem(LOINC_CODE_SYSTEM_URI);
        element.setDescendantValue(row.getString(1));
        return element;
      }, Hierarchies.getHierarchicalElementEncoder());
}
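A hedged usage sketch of the method above; the CSV path is an illustrative placeholder and the call is assumed to happen inside the same class.

// Hypothetical call; the hierarchy file path is illustrative only.
Dataset<HierarchicalElement> loincHierarchy =
        readMultiaxialHierarchyFile(spark, "/path/to/loinc_multiaxial_hierarchy.csv");
loincHierarchy.show(5);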
Example 3: getPayloadFromCsv
import org.apache.spark.sql.SparkSession; // import the required package/class
private static Dataset<Row> getPayloadFromCsv( final SparkSession sparkSession, JdbcIntegrationConfig config ) {
    // String csvPath = Resources.getResource( "dispatch_type.csv" ).getPath();
    java.sql.Date d = new java.sql.Date( DateTime.now().minusDays( 90 ).toDate().getTime() );
    String query = "(select * from dbo.Dispatch_Type where timercvd >= '" + d.toString() + "') Dispatch_Type";
    Dataset<Row> payload = sparkSession
            .read()
            .format( "jdbc" )
            .option( "url", config.getUrl() )
            .option( "dbtable", query )
            .option( "password", config.getDbPassword() )
            .option( "user", config.getDbUser() )
            .load();
    return payload;
}
Example 4: main
import org.apache.spark.sql.SparkSession; // import the required package/class
public static void main(String[] args) {
    // SparkConf conf = new SparkConf().setMaster("local").setAppName("BroadCasting");
    // JavaSparkContext jsc = new JavaSparkContext(conf);
    //
    // Broadcast<String> broadcastVar = jsc.broadcast("Hello Spark");
    //
    SparkSession sparkSession = SparkSession.builder().master("local").appName("My App")
            .config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate();
    Broadcast<String> broadcastVar = sparkSession.sparkContext().broadcast("Hello Spark",
            scala.reflect.ClassTag$.MODULE$.apply(String.class));
    System.out.println(broadcastVar.getValue());
    broadcastVar.unpersist();
    // broadcastVar.unpersist(true);
    broadcastVar.destroy();
}
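The commented-out lines hint at the more idiomatic Java route: wrapping the underlying SparkContext in a JavaSparkContext avoids constructing a Scala ClassTag by hand. A minimal sketch, assuming the same local session:

// Sketch: broadcast through JavaSparkContext instead of the raw Scala API.
JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sparkSession.sparkContext());
Broadcast<String> greeting = jsc.broadcast("Hello Spark");
System.out.println(greeting.getValue());
greeting.destroy();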
Example 5: generateData_numbers_1k
import org.apache.spark.sql.SparkSession; // import the required package/class
private static Dataset<Row> generateData_numbers_1k(SparkSession spark) {
    StructField[] structFields = new StructField[1];
    org.apache.spark.sql.types.DataType dataType = DataTypes.IntegerType;
    String column = "number";
    StructField structField = new StructField(column, dataType, true, Metadata.empty());
    structFields[0] = structField;
    StructType structType = new StructType(structFields);
    List<Row> rows = new ArrayList<>();
    for (int i = 0; i <= 1000; i++) {
        Object[] objects = new Object[structFields.length];
        objects[0] = i;
        Row row = RowFactory.create(objects);
        rows.add(row);
    }
    Dataset<Row> df = spark.createDataFrame(rows, structType);
    return df;
}
Example 6: execute
import org.apache.spark.sql.SparkSession; // import the required package/class
@Override
public Object execute(SparkSession sparkSession, ActionStatement actionStatement, CredentialProvider credentialManager) {
    String filePath = actionStatement.getParamValues().get(0).getValue().toString();
    String saveModeStr = actionStatement.getParamValues().get(1).getValue().toString();
    String dfTableName = actionStatement.getParamValues().get(2).getValue().toString();
    SaveMode saveMode = SaveMode.valueOf(saveModeStr);
    String sql = String.format("select * from %s", dfTableName);
    logger.info(String.format("Running sql [%s] to get data and then save it", sql));
    Dataset<Row> df = sparkSession.sql(sql);
    logger.info(String.format("Saving to parquet %s, saveMode: %s", filePath, saveMode));
    df.coalesce(1).write().mode(saveMode).parquet(filePath);
    logger.info(String.format("Saved to parquet %s, saveMode: %s", filePath, saveMode));
    return null;
}
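For completeness, the written file can be read back with the standard DataFrameReader; a minimal sketch, reusing the same filePath and logger from the method above:

// Sketch: verify the parquet output written above.
Dataset<Row> reloaded = sparkSession.read().parquet(filePath);
logger.info(String.format("Reloaded %d rows from %s", reloaded.count(), filePath));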
Example 7: getPayloadFromCsv
import org.apache.spark.sql.SparkSession; // import the required package/class
private static Dataset<Row> getPayloadFromCsv( final SparkSession sparkSession, JdbcIntegrationConfig config ) {
    // String csvPath = Resources.getResource( "dispatch.csv" ).getPath();
    java.sql.Date d = new java.sql.Date( DateTime.now().minusDays( 90 ).toDate().getTime() );
    String query = "(select * from dbo.Dispatch where CFS_DateTimeJanet >= '" + d.toString() + "') Dispatch";
    Dataset<Row> payload = sparkSession
            .read()
            .format( "jdbc" )
            .option( "url", config.getUrl() )
            .option( "dbtable", query )
            .option( "password", config.getDbPassword() )
            .option( "user", config.getDbUser() )
            .option( "driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver" )
            .load();
    // payload.createOrReplaceTempView( "Dispatch" );
    return payload;
}
Example 8: getPayloadFromCsv
import org.apache.spark.sql.SparkSession; // import the required package/class
private static Dataset<Row> getPayloadFromCsv( final SparkSession sparkSession, JdbcIntegrationConfig config ) {
    // String csvPath = Resources.getResource( "dispatch_persons.csv" ).getPath();
    String sql = "(select * from dbo.Dispatch_Persons where Dis_id IN "
            + "( select distinct (Dis_Id) from Dispatch where CFS_DateTimeJanet > DateADD(d, -90, GETDATE()) ) ) Dispatch_Persons";
    logger.info( "SQL Query for persons: {}", sql );
    Dataset<Row> payload = sparkSession
            .read()
            .format( "jdbc" )
            .option( "url", config.getUrl() )
            .option( "dbtable", sql )
            .option( "password", config.getDbPassword() )
            .option( "user", config.getDbUser() )
            .option( "driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver" )
            .load();
    payload.createOrReplaceTempView( "Dispatch_Persons" );
    // .filter( col( "Timercvd" ).geq( DateTime.now().minusDays( 2 ) ) )
    // .filter( col( "Type" ).notEqual( "2" ) );
    return payload;
}
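Because the payload is registered as the temporary view Dispatch_Persons, it can be queried with Spark SQL afterwards; a small, hedged sketch:

// Sketch: query the temp view registered above.
Dataset<Row> personCount = sparkSession.sql("select count(*) as person_rows from Dispatch_Persons");
personCount.show();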
Example 9: convertToDataFrame
import org.apache.spark.sql.SparkSession; // import the required package/class
/**
 * Converts a dataset file into a DataFrame. TODO: add support for JSON and other formats.
 *
 * @param dataFile the dataset file to convert
 * @param context the Java Spark context used to build the session
 * @return the converted DataFrame
 * @throws CantConverException if the dataset format is not supported
 */
public static Dataset<Row> convertToDataFrame(DataFile dataFile, JavaSparkContext context) throws CantConverException {
    SparkSession sparkSession = SparkSession.builder()
            .sparkContext(context.sc())
            .getOrCreate();
    SQLContext sqlContext = new SQLContext(sparkSession);
    switch (dataFile.getDataFileType()) {
        case CSV:
            return csvToDataFrame(dataFile, context, sqlContext);
        case LIBSVM:
            return libsvmToDataFrame(dataFile, sqlContext);
        default:
            throw new CantConverException("Unsupported dataset format");
    }
}
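The csvToDataFrame helper is not part of this listing; the following is a purely hypothetical sketch of what it might look like using Spark's built-in CSV reader (the method signature, the DataFile.getPath() accessor, and the header/inferSchema options are all assumptions):

// Hypothetical helper; DataFile#getPath() is assumed for illustration.
private static Dataset<Row> csvToDataFrame(DataFile dataFile, JavaSparkContext context, SQLContext sqlContext) {
    return sqlContext.read()
            .option("header", "true")       // assume the first line holds column names
            .option("inferSchema", "true")  // let Spark infer column types
            .csv(dataFile.getPath());
}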
Example 10: setUpSpark
import org.apache.spark.sql.SparkSession; // import the required package/class
/**
 * Sets up a new SparkSession.
 * @param appName the name that will show up in the Web UI (and the history server)
 * @param NUM_CORES_IN_CLUSTER the number of available cores in the cluster
 * @param NUM_WORKERS the number of worker nodes in the cluster
 * @param parallelismFactor the Spark tuning documentation suggests 2 or 3 (use a higher value if you hit OOM errors)
 * @param tmpPath the directory to use as spark.sql.warehouse.dir
 * @return the configured SparkSession
 */
public static SparkSession setUpSpark(String appName, int NUM_CORES_IN_CLUSTER, int NUM_WORKERS, int parallelismFactor, String tmpPath) {
    final int NUM_EXECUTORS = NUM_WORKERS * 3; // standard: NUM_WORKERS * 3
    final int NUM_EXECUTOR_CORES = NUM_CORES_IN_CLUSTER / NUM_EXECUTORS;
    final int PARALLELISM = NUM_EXECUTORS * NUM_EXECUTOR_CORES * parallelismFactor;
    return SparkSession.builder()
            .appName(appName)
            .config("spark.sql.warehouse.dir", tmpPath)
            .config("spark.eventLog.enabled", true)
            .config("spark.default.parallelism", PARALLELISM) // x tasks per core --> x "reduce" rounds (keep this fixed for speedup tests; otherwise set PARALLELISM)
            .config("spark.rdd.compress", true)
            .config("spark.network.timeout", "600s")
            .config("spark.executor.heartbeatInterval", "20s")
            .config("spark.executor.instances", NUM_EXECUTORS)
            .config("spark.executor.cores", NUM_EXECUTOR_CORES) // speedup tests: 12,9,6,3 OR 12,8,4,1
            .config("spark.executor.memory", "55G") // 55G is fine in the ISL cluster
            .config("spark.driver.maxResultSize", "2g")
            .getOrCreate();
}
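A hedged usage sketch of setUpSpark; the cluster figures and the temporary path below are purely illustrative:

// Hypothetical call: 4 workers, 48 total cores, parallelism factor 3
// --> 12 executors with 4 cores each and spark.default.parallelism = 144.
SparkSession spark = setUpSpark("MyBenchmark", 48, 4, 3, "/tmp/spark-warehouse");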
Example 11: setUp
import org.apache.spark.sql.SparkSession; // import the required package/class
@Before
public void setUp() {
    System.setProperty("hadoop.home.dir", "C:\\Users\\VASILIS\\Documents\\hadoop_home"); // only for local mode
    spark = SparkSession.builder()
            .appName("test")
            .config("spark.sql.warehouse.dir", "/file:/tmp")
            .config("spark.executor.instances", 1)
            .config("spark.executor.cores", 1)
            .config("spark.executor.memory", "1G")
            .config("spark.driver.maxResultSize", "1g")
            .config("spark.master", "local")
            .getOrCreate();
    jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
}
Example 12: setUp
import org.apache.spark.sql.SparkSession; // import the required package/class
@Before
public void setUp() {
    System.setProperty("hadoop.home.dir", "C:\\Users\\VASILIS\\Documents\\hadoop_home"); // only for local mode
    spark = SparkSession.builder()
            .appName("test")
            .config("spark.sql.warehouse.dir", "/file:/tmp")
            .config("spark.executor.instances", 1)
            .config("spark.executor.cores", 1)
            .config("spark.executor.memory", "1G")
            .config("spark.driver.maxResultSize", "1g")
            .config("spark.master", "local")
            .getOrCreate();
    jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
}
Example 13: getFromDatabase
import org.apache.spark.sql.SparkSession; // import the required package/class
/**
 * Returns the collection of ancestors from the table in the given database.
 *
 * @param spark the spark session
 * @param database name of the database containing the ancestors table
 * @return a Hierarchies instance.
 */
public static Hierarchies getFromDatabase(SparkSession spark, String database) {
    Dataset<Ancestor> ancestors = spark.sql("SELECT * FROM " + database + "." + ANCESTORS_TABLE)
            .as(ANCESTOR_ENCODER);
    Dataset<UrlAndVersion> members = ancestors.filter((FilterFunction<Ancestor>) ancestor ->
                    ancestor.getUri().startsWith(HIERARCHY_URI_PREFIX))
            .select(col("uri").alias("url"), col("version"))
            .distinct()
            .as(URI_AND_VERSION_ENCODER);
    return new Hierarchies(spark, members, ancestors);
}
Example 14: extractEntry
import org.apache.spark.sql.SparkSession; // import the required package/class
/**
 * Extracts the given resource type from the RDD of bundles and returns
 * it as a Dataset of that type.
 *
 * @param spark the spark session
 * @param bundles an RDD of FHIR Bundles
 * @param resourceName the FHIR name of the resource type to extract
 *     (e.g., Condition, Patient, etc.)
 * @param encoders the Encoders instance defining how the resources are encoded
 * @param <T> the type of the resource being extracted from the bundles
 * @return a dataset of the given resource
 */
public static <T extends IBaseResource> Dataset<T> extractEntry(SparkSession spark,
    JavaRDD<Bundle> bundles,
    String resourceName,
    FhirEncoders encoders) {
  RuntimeResourceDefinition def = context.getResourceDefinition(resourceName);
  JavaRDD<T> resourceRdd = bundles.flatMap(new ToResource<T>(def.getName()));
  Encoder<T> encoder = encoders.of((Class<T>) def.getImplementingClass());
  return spark.createDataset(resourceRdd.rdd(), encoder);
}
Example 15: setUp
import org.apache.spark.sql.SparkSession; // import the required package/class
/**
 * Sets up Spark.
 */
@BeforeClass
public static void setUp() throws IOException {
    // Create a local spark session using an in-memory metastore.
    // We must also use Hive and set the partition mode to non-strict to
    // support dynamic partitions.
    spark = SparkSession.builder()
            .master("local[2]")
            .appName("ConceptMapsTest")
            .enableHiveSupport()
            .config("javax.jdo.option.ConnectionURL",
                    "jdbc:derby:memory:metastore_db;create=true")
            .config("hive.exec.dynamic.partition.mode", "nonstrict")
            .config("spark.sql.warehouse.dir",
                    Files.createTempDirectory("spark_warehouse").toString())
            .getOrCreate();
    spark.sql("create database mappingtestdb");
}
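A matching teardown is not shown in this example; a minimal sketch of what it might look like (the database name mirrors the one created above):

// Hypothetical cleanup counterpart to setUp().
@AfterClass
public static void tearDown() {
    spark.sql("drop database if exists mappingtestdb cascade");
    spark.stop();
}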