

Java SparkSession Class Code Examples

This article collects typical usage examples of the Java class org.apache.spark.sql.SparkSession. If you are wondering what the SparkSession class does, how to use it, or what real-world usage looks like, the hand-picked code examples below may help.


The SparkSession class belongs to the org.apache.spark.sql package. Fifteen code examples of the SparkSession class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
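Before the examples themselves, here is a minimal quick-start sketch (the master URL, application name, and query are placeholders of our own, not drawn from the examples below) showing how a SparkSession is typically created and used:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SparkSessionQuickStart {
    public static void main(String[] args) {
        // Build (or reuse) a session; "local[*]" runs Spark in-process, which is handy for testing.
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("SparkSessionQuickStart")
                .getOrCreate();

        // Run a trivial SQL query through the session and print the result.
        Dataset<Row> df = spark.sql("SELECT 1 AS value");
        df.show();

        spark.stop();
    }
}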

Example 1: generateData_week_timepoints_by_10_minutes

import org.apache.spark.sql.SparkSession; // import the required package/class
private static Dataset<Row> generateData_week_timepoints_by_10_minutes(SparkSession spark) {
    StructField[] structFields = new StructField[1];
    org.apache.spark.sql.types.DataType dataType = DataTypes.IntegerType;
    String column = "timepoint";
    StructField structField = new StructField(column, dataType, true, Metadata.empty());
    structFields[0] = structField;

    StructType structType = new StructType(structFields);

    List<Row> rows = new ArrayList<>();

    int weekTotalMinutes = 7 * 24 * 60;
    int timepointIntervalMinutes = 10;
    for (int i = 0; i < weekTotalMinutes / timepointIntervalMinutes; i++) {
        Object[] objects = new Object[structFields.length];
        objects[0] = i;
        Row row = RowFactory.create(objects);
        rows.add(row);
    }

    Dataset<Row> df = spark.createDataFrame(rows, structType);
    return df;
}
 
Developer ID: uber, Project: uberscriptquery, Lines of code: 24, Source: QueryEngine.java
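A minimal usage sketch for the generator above (assuming the private method is made reachable, e.g., copied into your own class; the master URL and application name are placeholders):

SparkSession spark = SparkSession.builder()
        .master("local[*]")
        .appName("TimepointsDemo")
        .getOrCreate();

Dataset<Row> timepoints = generateData_week_timepoints_by_10_minutes(spark);
timepoints.show(5);                       // first few ten-minute timepoints of the week
System.out.println(timepoints.count());   // 7 * 24 * 60 / 10 = 1008 rows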

Example 2: readMultiaxialHierarchyFile

import org.apache.spark.sql.SparkSession; // import the required package/class
/**
 * Reads the LOINC multiaxial hierarchy file and converts it to a {@link HierarchicalElement}
 * dataset.
 *
 * @param spark the Spark session
 * @param loincHierarchyPath path to the multiaxial hierarchy CSV
 * @return a dataset of {@link HierarchicalElement} representing the hierarchical relationship.
 */
public static Dataset<HierarchicalElement> readMultiaxialHierarchyFile(SparkSession spark,
    String loincHierarchyPath) {

  return spark.read()
      .option("header", true)
      .csv(loincHierarchyPath)
      .select(col("IMMEDIATE_PARENT"), col("CODE"))
      .where(col("IMMEDIATE_PARENT").isNotNull()
          .and(col("IMMEDIATE_PARENT").notEqual(lit(""))))
      .where(col("CODE").isNotNull()
          .and(col("CODE").notEqual(lit(""))))
      .map((MapFunction<Row, HierarchicalElement>) row -> {

        HierarchicalElement element = new HierarchicalElement();

        element.setAncestorSystem(LOINC_CODE_SYSTEM_URI);
        element.setAncestorValue(row.getString(0));

        element.setDescendantSystem(LOINC_CODE_SYSTEM_URI);
        element.setDescendantValue(row.getString(1));

        return element;
      }, Hierarchies.getHierarchicalElementEncoder());
}
 
Developer ID: cerner, Project: bunsen, Lines of code: 33, Source: Loinc.java

Example 3: getPayloadFromCsv

import org.apache.spark.sql.SparkSession; // import the required package/class
private static Dataset<Row> getPayloadFromCsv( final SparkSession sparkSession, JdbcIntegrationConfig config ) {

    // String csvPath = Resources.getResource( "dispatch_type.csv" ).getPath();
    java.sql.Date d = new java.sql.Date( DateTime.now().minusDays( 90 ).toDate().getTime() );
    String query = "(select * from dbo.Dispatch_Type where timercvd >= '" + d.toString() + "') Dispatch_Type";
    Dataset<Row> payload = sparkSession
            .read()
            .format( "jdbc" )
            .option( "url", config.getUrl() )
            .option( "dbtable", query )
            .option( "password", config.getDbPassword() )
            .option( "user", config.getDbUser() )
            .load();

    return payload;
}
 
Developer ID: dataloom, Project: integrations, Lines of code: 17, Source: DispatchTypeFlight.java

Example 4: main

import org.apache.spark.sql.SparkSession; // import the required package/class
public static void main(String[] args) {

    // Older RDD-based way of creating a broadcast variable, kept for comparison:
    // SparkConf conf = new SparkConf().setMaster("local").setAppName("BroadCasting");
    // JavaSparkContext jsc = new JavaSparkContext(conf);
    // Broadcast<String> broadcastVar = jsc.broadcast("Hello Spark");

    SparkSession sparkSession = SparkSession.builder().master("local").appName("My App")
            .config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate();

    // Broadcasting through the Scala SparkContext requires an explicit ClassTag.
    Broadcast<String> broadcastVar = sparkSession.sparkContext()
            .broadcast("Hello Spark", scala.reflect.ClassTag$.MODULE$.apply(String.class));
    System.out.println(broadcastVar.getValue());

    broadcastVar.unpersist();
    // broadcastVar.unpersist(true);
    broadcastVar.destroy();
}
 
Developer ID: PacktPublishing, Project: Apache-Spark-2x-for-Java-Developers, Lines of code: 20, Source: BroadcastVariable.java
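For comparison, a hedged sketch of the more common Java-friendly route (not part of the example above): wrapping the underlying SparkContext in a JavaSparkContext lets you broadcast without supplying a Scala ClassTag yourself.

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.SparkSession;

SparkSession sparkSession = SparkSession.builder().master("local").appName("My App").getOrCreate();

// JavaSparkContext supplies the ClassTag for you.
JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sparkSession.sparkContext());
Broadcast<String> broadcastVar = jsc.broadcast("Hello Spark");
System.out.println(broadcastVar.getValue());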

Example 5: generateData_numbers_1k

import org.apache.spark.sql.SparkSession; // import the required package/class
private static Dataset<Row> generateData_numbers_1k(SparkSession spark) {
    StructField[] structFields = new StructField[1];
    org.apache.spark.sql.types.DataType dataType = DataTypes.IntegerType;
    String column = "number";
    StructField structField = new StructField(column, dataType, true, Metadata.empty());
    structFields[0] = structField;

    StructType structType = new StructType(structFields);

    List<Row> rows = new ArrayList<>();

    for (int i = 0; i <= 1000; i++) {
        Object[] objects = new Object[structFields.length];
        objects[0] = i;
        Row row = RowFactory.create(objects);
        rows.add(row);
    }

    Dataset<Row> df = spark.createDataFrame(rows, structType);
    return df;
}
 
Developer ID: uber, Project: uberscriptquery, Lines of code: 22, Source: QueryEngine.java

Example 6: execute

import org.apache.spark.sql.SparkSession; // import the required package/class
@Override
public Object execute(SparkSession sparkSession, ActionStatement actionStatement, CredentialProvider credentialManager) {

    String filePath = actionStatement.getParamValues().get(0).getValue().toString();
    String saveModeStr = actionStatement.getParamValues().get(1).getValue().toString();
    String dfTableName = actionStatement.getParamValues().get(2).getValue().toString();

    SaveMode saveMode = SaveMode.valueOf(saveModeStr);

    String sql = String.format("select * from %s", dfTableName);
    logger.info(String.format("Running sql [%s] to get data and then save it", sql));
    Dataset<Row> df = sparkSession.sql(sql);

    logger.info(String.format("Saving to parquet %s, saveMode: %s", filePath, saveMode));
    df.coalesce(1).write().mode(saveMode).parquet(filePath);
    logger.info(String.format("Saved to parquet %s, saveMode: %s", filePath, saveMode));
    return null;
}
 
Developer ID: uber, Project: uberscriptquery, Lines of code: 19, Source: WriteParquetFileActionStatementExecutor.java

Example 7: getPayloadFromCsv

import org.apache.spark.sql.SparkSession; // import the required package/class
private static Dataset<Row> getPayloadFromCsv( final SparkSession sparkSession, JdbcIntegrationConfig config ) {

    // String csvPath = Resources.getResource( "dispatch.csv" ).getPath();
    java.sql.Date d = new java.sql.Date( DateTime.now().minusDays( 90 ).toDate().getTime() );
    String query = "(select * from dbo.Dispatch where CFS_DateTimeJanet >= '" + d.toString() + "') Dispatch";

    Dataset<Row> payload = sparkSession
            .read()
            .format( "jdbc" )
            .option( "url", config.getUrl() )
            .option( "dbtable", query )
            .option( "password", config.getDbPassword() )
            .option( "user", config.getDbUser() )
            .option( "driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver" )
            .load();

    // payload.createOrReplaceTempView( "Dispatch" );

    return payload;
}
 
Developer ID: dataloom, Project: integrations, Lines of code: 21, Source: DispatchFlight.java

Example 8: getPayloadFromCsv

import org.apache.spark.sql.SparkSession; // import the required package/class
private static Dataset<Row> getPayloadFromCsv( final SparkSession sparkSession, JdbcIntegrationConfig config ) {

    // String csvPath = Resources.getResource( "dispatch_persons.csv" ).getPath();

    String sql = "(select * from dbo.Dispatch_Persons where Dis_id IN "
            + "( select distinct (Dis_Id) from Dispatch where CFS_DateTimeJanet > DateADD(d, -90, GETDATE()) ) ) Dispatch_Persons";
    logger.info( "SQL Query for persons: {}", sql );
    Dataset<Row> payload = sparkSession
            .read()
            .format( "jdbc" )
            .option( "url", config.getUrl() )
            .option( "dbtable", sql )
            .option( "password", config.getDbPassword() )
            .option( "user", config.getDbUser() )
            .option( "driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver" )
            .load();
    payload.createOrReplaceTempView( "Dispatch_Persons" );
    // .filter( col( "Timercvd" ).geq( DateTime.now().minusDays( 2 ) ) )
    // .filter( col( "Type" ).notEqual( "2" ) );

    return payload;
}
 
Developer ID: dataloom, Project: integrations, Lines of code: 23, Source: DispatchPersonsFlight.java

Example 9: convertToDataFrame

import org.apache.spark.sql.SparkSession; // import the required package/class
/**
 * Converts a dataset file into a DataFrame. TODO: add support for JSON and other formats.
 *
 * @param dataFile the dataset file to convert
 * @param context the JavaSparkContext to use
 * @return the resulting DataFrame
 * @throws CantConverException if the dataset file format is not supported
 */
public static Dataset<Row> convertToDataFrame(DataFile dataFile, JavaSparkContext context) throws CantConverException {
    SparkSession sparkSession = SparkSession.builder()
            .sparkContext(context.sc())
            .getOrCreate();

    SQLContext sqlContext = new SQLContext(sparkSession);

    switch (dataFile.getDataFileType()) {
        case CSV:
            return csvToDataFrame(dataFile, context, sqlContext);
        case LIBSVM:
            return libsvmToDataFrame(dataFile, sqlContext);
        default:
            throw new CantConverException("Unsupported dataset file format");
    }
}
 
Developer ID: hays2hong, Project: stonk, Lines of code: 24, Source: SparkDataFileConverter.java

Example 10: setUpSpark

import org.apache.spark.sql.SparkSession; // import the required package/class
/**
 * Sets up a new SparkSession.
 * @param appName the name that will show up in the Web UI (and the history server)
 * @param NUM_CORES_IN_CLUSTER the number of available cores in the cluster
 * @param NUM_WORKERS the number of worker nodes in the cluster
 * @param parallelismFactor the Spark tuning documentation suggests 2 or 3; use more if OOM errors occur
 * @param tmpPath the directory to use for spark.sql.warehouse.dir
 * @return the configured SparkSession
 */
public static SparkSession setUpSpark(String appName, int NUM_CORES_IN_CLUSTER, int NUM_WORKERS, int parallelismFactor, String tmpPath) {
    final int NUM_EXECUTORS = NUM_WORKERS * 3; //standard: NUM_WORKERS *3
    final int NUM_EXECUTOR_CORES = NUM_CORES_IN_CLUSTER/NUM_EXECUTORS;
    final int PARALLELISM = NUM_EXECUTORS * NUM_EXECUTOR_CORES * parallelismFactor;
                   
    return SparkSession.builder()
        .appName(appName) 
        .config("spark.sql.warehouse.dir", tmpPath)
        .config("spark.eventLog.enabled", true)
        .config("spark.default.parallelism", PARALLELISM) //x tasks for each core --> x "reduce" rounds (keep this fixed for speedup tests), oherwise (set: PARALLELISM)
        .config("spark.rdd.compress", true)
        .config("spark.network.timeout", "600s")
        .config("spark.executor.heartbeatInterval", "20s")    
            
        .config("spark.executor.instances", NUM_EXECUTORS)
        .config("spark.executor.cores", NUM_EXECUTOR_CORES) //speedup tests: 12,9,6,3 OR 12,8,4,1
        .config("spark.executor.memory", "55G") //55G is fine in ISL cluster
        
        .config("spark.driver.maxResultSize", "2g")
        
        .getOrCreate();        
}
 
Developer ID: vefthym, Project: MinoanER, Lines of code: 32, Source: Utils.java
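As a worked illustration of the tuning arithmetic above (the cluster sizes and paths are hypothetical, not taken from the project): with 48 cores and 4 workers, NUM_EXECUTORS = 4 * 3 = 12, NUM_EXECUTOR_CORES = 48 / 12 = 4, and a parallelism factor of 2 yields spark.default.parallelism = 12 * 4 * 2 = 96.

// Hypothetical call; the app name and tmpPath are placeholders.
SparkSession spark = Utils.setUpSpark("MinoanER-run", 48, 4, 2, "hdfs:///tmp/spark-warehouse");
// With these inputs: 12 executors with 4 cores each, and spark.default.parallelism = 96.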

Example 11: setUp

import org.apache.spark.sql.SparkSession; // import the required package/class
@Before
public void setUp() {
    System.setProperty("hadoop.home.dir", "C:\\Users\\VASILIS\\Documents\\hadoop_home"); //only for local mode
    
    spark = SparkSession.builder()
        .appName("test") 
        .config("spark.sql.warehouse.dir", "/file:/tmp")                
        .config("spark.executor.instances", 1)
        .config("spark.executor.cores", 1)
        .config("spark.executor.memory", "1G")            
        .config("spark.driver.maxResultSize", "1g")
        .config("spark.master", "local")
        .getOrCreate();        
    
    
    
    jsc = JavaSparkContext.fromSparkContext(spark.sparkContext()); 
}
 
Developer ID: vefthym, Project: MinoanER, Lines of code: 19, Source: EntityBasedCNPMapPhaseTest.java

Example 12: setUp

import org.apache.spark.sql.SparkSession; // import the required package/class
@Before
public void setUp() {        
    System.setProperty("hadoop.home.dir", "C:\\Users\\VASILIS\\Documents\\hadoop_home"); //only for local mode
    
    spark = SparkSession.builder()
        .appName("test") 
        .config("spark.sql.warehouse.dir", "/file:/tmp")                
        .config("spark.executor.instances", 1)
        .config("spark.executor.cores", 1)
        .config("spark.executor.memory", "1G")            
        .config("spark.driver.maxResultSize", "1g")
        .config("spark.master", "local")
        .getOrCreate();        
    
    
    
    jsc = JavaSparkContext.fromSparkContext(spark.sparkContext()); 
}
 
Developer ID: vefthym, Project: MinoanER, Lines of code: 19, Source: BlockFilteringAdvancedTest.java

Example 13: getFromDatabase

import org.apache.spark.sql.SparkSession; // import the required package/class
/**
 * Returns the collection of ancestors from the table in the given database.
 *
 * @param spark the spark session
 * @param database name of the database containing the ancestors table
 * @return a Hierarchies instance.
 */
public static Hierarchies getFromDatabase(SparkSession spark, String database) {

  Dataset<Ancestor> ancestors = spark.sql("SELECT * FROM " + database + "." + ANCESTORS_TABLE)
      .as(ANCESTOR_ENCODER);

  Dataset<UrlAndVersion> members = ancestors.filter((FilterFunction<Ancestor>) ancestor ->
          ancestor.getUri().startsWith(HIERARCHY_URI_PREFIX))
      .select(col("uri").alias("url"), col("version"))
      .distinct()
      .as(URI_AND_VERSION_ENCODER);

  return new Hierarchies(spark,
      members,
      ancestors);
}
 
Developer ID: cerner, Project: bunsen, Lines of code: 23, Source: Hierarchies.java

Example 14: extractEntry

import org.apache.spark.sql.SparkSession; // import the required package/class
/**
 * Extracts the given resource type from the RDD of bundles and returns
 * it as a Dataset of that type.
 *
 * @param spark the spark session
 * @param bundles an RDD of FHIR Bundles
 * @param resourceName the FHIR name of the resource type to extract
 *     (e.g., condition, patient, etc.).
 * @param encoders the Encoders instance defining how the resources are encoded.
 * @param <T> the type of the resource being extracted from the bundles.
 * @return a dataset of the given resource
 */
public static <T extends IBaseResource> Dataset<T> extractEntry(SparkSession spark,
    JavaRDD<Bundle> bundles,
    String resourceName,
    FhirEncoders encoders) {

  RuntimeResourceDefinition def = context.getResourceDefinition(resourceName);

  JavaRDD<T> resourceRdd = bundles.flatMap(new ToResource<T>(def.getName()));

  Encoder<T> encoder = encoders.of((Class<T>) def.getImplementingClass());

  return spark.createDataset(resourceRdd.rdd(), encoder);
}
 
Developer ID: cerner, Project: bunsen, Lines of code: 26, Source: Bundles.java

Example 15: setUp

import org.apache.spark.sql.SparkSession; // import the required package/class
/**
 * Sets up Spark.
 */
@BeforeClass
public static void setUp() throws IOException {

  // Create a local spark session using an in-memory metastore.
  // We must also use Hive and set the partition mode to non-strict to
  // support dynamic partitions.
  spark = SparkSession.builder()
      .master("local[2]")
      .appName("ConceptMapsTest")
      .enableHiveSupport()
      .config("javax.jdo.option.ConnectionURL",
          "jdbc:derby:memory:metastore_db;create=true")
      .config("hive.exec.dynamic.partition.mode",
          "nonstrict")
      .config("spark.sql.warehouse.dir",
          Files.createTempDirectory("spark_warehouse").toString())
      .getOrCreate();

  spark.sql("create database mappingtestdb");
}
 
Developer ID: cerner, Project: bunsen, Lines of code: 24, Source: ValueSetsTest.java


Note: The org.apache.spark.sql.SparkSession class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are taken from open-source projects contributed by their respective developers, and copyright remains with the original authors. Please consult the corresponding project's License before distributing or using the code, and do not repost without permission.