當前位置: 首頁>>代碼示例>>Java>>正文


Java Dataset類代碼示例

本文整理匯總了Java中org.apache.spark.sql.Dataset的典型用法代碼示例。如果您正苦於以下問題:Java Dataset類的具體用法?Java Dataset怎麼用?Java Dataset使用的例子?那麼,這裡精選的類代碼示例或許可以為您提供幫助。


Dataset類屬於org.apache.spark.sql包,在下文中一共展示了Dataset類的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Java代碼示例。

示例1: test_getDataSetResult

import org.apache.spark.sql.Dataset; //導入依賴的package包/類
/**
 * Builds a two-column (int, string), two-row data frame and checks that
 * {@code SparkUtils.getDataSetResult} reports two column names and both rows with their
 * original cell values.
 */
@Test
public void test_getDataSetResult() {

    StructField[] structFields = new StructField[]{
            new StructField("intColumn", DataTypes.IntegerType, true, Metadata.empty()),
            new StructField("stringColumn", DataTypes.StringType, true, Metadata.empty())
    };

    StructType structType = new StructType(structFields);

    List<Row> rows = new ArrayList<>();
    rows.add(RowFactory.create(1, "v1"));
    rows.add(RowFactory.create(2, "v2"));

    Dataset<Row> df = sparkSession.createDataFrame(rows, structType);

    DataSetResult dataSetResult = SparkUtils.getDataSetResult(df);
    Assert.assertEquals(2, dataSetResult.getColumnNames().size());
    Assert.assertEquals(2, dataSetResult.getRows().size());
    // Integer.valueOf replaces the deprecated boxing constructor; equals() semantics are the same.
    Assert.assertEquals(Integer.valueOf(1), dataSetResult.getRows().get(0).get(0));
    Assert.assertEquals("v1", dataSetResult.getRows().get(0).get(1));
    Assert.assertEquals(Integer.valueOf(2), dataSetResult.getRows().get(1).get(0));
    Assert.assertEquals("v2", dataSetResult.getRows().get(1).get(1));
}
 
開發者ID:uber,項目名稱:uberscriptquery,代碼行數:25,代碼來源:SparkUtilsTest.java

示例2: readMultiaxialHierarchyFile

import org.apache.spark.sql.Dataset; //導入依賴的package包/類
/**
 * Reads the LOINC multiaxial hierarchy file and converts it to a {@link HierarchicalElement}
 * dataset. Rows whose {@code IMMEDIATE_PARENT} or {@code CODE} column is null or the empty
 * string are dropped before conversion.
 *
 * @param spark the Spark session
 * @param loincHierarchyPath path to the multiaxial hierarchy CSV
 * @return a dataset of {@link HierarchicalElement} representing the hierarchical relationship.
 */
public static Dataset<HierarchicalElement> readMultiaxialHierarchyFile(SparkSession spark,
    String loincHierarchyPath) {

  // Column 0 is the parent code, column 1 the child code; both must be present.
  Dataset<Row> parentChildPairs = spark.read()
      .option("header", true)
      .csv(loincHierarchyPath)
      .select(col("IMMEDIATE_PARENT"), col("CODE"))
      .where(col("IMMEDIATE_PARENT").isNotNull()
          .and(col("IMMEDIATE_PARENT").notEqual(lit(""))))
      .where(col("CODE").isNotNull()
          .and(col("CODE").notEqual(lit(""))));

  // Both ends of the relationship use the LOINC code system.
  MapFunction<Row, HierarchicalElement> toElement = pair -> {
    HierarchicalElement link = new HierarchicalElement();
    link.setAncestorSystem(LOINC_CODE_SYSTEM_URI);
    link.setAncestorValue(pair.getString(0));
    link.setDescendantSystem(LOINC_CODE_SYSTEM_URI);
    link.setDescendantValue(pair.getString(1));
    return link;
  };

  return parentChildPairs.map(toElement, Hierarchies.getHierarchicalElementEncoder());
}
 
開發者ID:cerner,項目名稱:bunsen,代碼行數:33,代碼來源:Loinc.java

示例3: getDataFrameOfElementsWithEntityGroup

import org.apache.spark.sql.Dataset; //導入依賴的package包/類
/**
 * Runs a {@code GetDataFrameOfElements} operation whose view contains the "entity" group and
 * prints two filtered renderings of the resulting data frame: one restricted by vertex and one
 * restricted by the {@code count} property.
 */
public void getDataFrameOfElementsWithEntityGroup() {
    // NOTE(review): the dashed markers presumably delimit a snippet extracted elsewhere, so the
    // code between them is left exactly as written - confirm before reformatting it.
    // ---------------------------------------------------------
    final GetDataFrameOfElements operation = new GetDataFrameOfElements.Builder()
            .view(new View.Builder()
                    .entity("entity")
                    .build())
            .build();
    // ---------------------------------------------------------

    final Dataset<Row> df = runExample(operation, null);

    // Restrict to entities involving certain vertices
    final String seededTable = df.filter("vertex = 1 OR vertex = 2").showString(100, 20);
    printJava("df.filter(\"vertex = 1 OR vertex = 2\").show();");
    print("The results are:\n");
    print("```");
    // Drop the last two characters of the rendered table before printing.
    print(seededTable.substring(0, seededTable.length() - 2));
    print("```");

    // Filter by property
    final String filteredTable = df.filter("count > 1").showString(100, 20);
    printJava("df.filter(\"count > 1\").show();");
    print("The results are:\n");
    print("```");
    print(filteredTable.substring(0, filteredTable.length() - 2));
    print("```");
}
 
開發者ID:gchq,項目名稱:gaffer-doc,代碼行數:30,代碼來源:GetDataFrameOfElementsExample.java

示例4: toJson

import org.apache.spark.sql.Dataset; //導入依賴的package包/類
/**
 * Converts a dataset of FHIR resources into a dataset of JSON strings.
 *
 * <p>The input is first re-typed with the STU3 encoder for {@code resourceType}, then each
 * resource is mapped through {@code ToJson}.
 *
 * @param dataset a dataset containing FHIR resources
 * @param resourceType the FHIR resource type
 * @return a dataset of JSON strings for the FHIR resources
 */
public static Dataset<String> toJson(Dataset<?> dataset, String resourceType) {

  // The explicit target type pins the encoder's type parameter to IBaseResource.
  Dataset<IBaseResource> typedResources =
      dataset.as(FhirEncoders.forStu3().getOrCreate().of(resourceType));

  return typedResources.map(new ToJson(), Encoders.STRING());
}
 
開發者ID:cerner,項目名稱:bunsen,代碼行數:17,代碼來源:Functions.java

示例5: getPayloadFromCsv

import org.apache.spark.sql.Dataset; //導入依賴的package包/類
/**
 * Loads rows of {@code dbo.Dispatch_Type} over JDBC, keeping only those whose {@code timercvd}
 * is on or after the date 90 days before now. Despite the method name, no CSV file is read.
 *
 * @param sparkSession session used to build the JDBC reader
 * @param config supplies the JDBC URL, user and password
 * @return the lazily-loaded query result
 */
private static Dataset<Row> getPayloadFromCsv( final SparkSession sparkSession, JdbcIntegrationConfig config ) {

        final long cutoffMillis = DateTime.now().minusDays( 90 ).toDate().getTime();
        final java.sql.Date cutoff = new java.sql.Date( cutoffMillis );

        // The sub-select is aliased so that it can be passed as the "dbtable" option.
        final String dbtable = "(select * from dbo.Dispatch_Type where timercvd >= '" + cutoff.toString() +"') Dispatch_Type";

        return sparkSession
                .read()
                .format( "jdbc" )
                .option( "url", config.getUrl() )
                .option( "dbtable", dbtable )
                .option( "password", config.getDbPassword() )
                .option( "user", config.getDbUser() )
                .load();
    }
 
開發者ID:dataloom,項目名稱:integrations,代碼行數:17,代碼來源:DispatchTypeFlight.java

示例6: computeJoins

import org.apache.spark.sql.Dataset; //導入依賴的package包/類
/**
 * Computes the join of this tree's node with its children, applies the optional filter,
 * projects the node's requested columns and, when {@code selectDistinct} is set, removes
 * duplicate rows.
 *
 * @param sqlContext context handed to the node's join computation
 * @return the filtered, projected (and possibly de-duplicated) join result
 */
public Dataset<Row> computeJoins(SQLContext sqlContext){
	// compute all the joins
	Dataset<Row> joined = node.computeJoinWithChildren(sqlContext);

	// one Column per projected name, in projection order
	final int columnCount = node.projection.size();
	Column[] projected = new Column[columnCount];
	int idx = 0;
	while (idx < columnCount) {
		projected[idx] = new Column(node.projection.get(idx));
		idx++;
	}

	// the filter, if any, runs before the projection
	if (filter != null) {
		joined = joined.filter(filter);
	}
	Dataset<Row> results = joined.select(projected);

	if (selectDistinct) {
		results = results.distinct();
	}

	return results;
}
 
開發者ID:tf-dbis-uni-freiburg,項目名稱:PRoST,代碼行數:19,代碼來源:JoinTree.java

示例7: coding

import org.apache.spark.sql.Dataset; //導入依賴的package包/類
/**
 * Checks that the first severity coding of the original condition matches both the value
 * stored in the encoded dataset and the value on the decoded condition, field by field
 * (code, system, userSelected, display).
 */
@Test
public void coding() {

  Coding expected = condition.getSeverity().getCodingFirstRep();
  Coding decoded = decodedCondition.getSeverity().getCodingFirstRep();

  // Codings are a nested array: explode them to one row per coding ...
  Dataset<Row> explodedCodings = conditionsDataset
      .select(functions.explode(conditionsDataset.col("severity.coding"))
          .alias("coding"));

  // ... then lift every coding field to the top level so each can be selected by name.
  Dataset<Row> codingFields = explodedCodings
      .select("coding.*")
      .cache();

  Assert.assertEquals(expected.getCode(), codingFields.select("code").head().get(0));
  Assert.assertEquals(expected.getCode(), decoded.getCode());

  Assert.assertEquals(expected.getSystem(), codingFields.select("system").head().get(0));
  Assert.assertEquals(expected.getSystem(), decoded.getSystem());

  Assert.assertEquals(expected.getUserSelected(),
      codingFields.select("userSelected").head().get(0));
  Assert.assertEquals(expected.getUserSelected(), decoded.getUserSelected());

  Assert.assertEquals(expected.getDisplay(), codingFields.select("display").head().get(0));
  Assert.assertEquals(expected.getDisplay(), decoded.getDisplay());
}
 
開發者ID:cerner,項目名稱:bunsen,代碼行數:35,代碼來源:FhirEncodersTest.java

示例8: getPayloadFromCsv

import org.apache.spark.sql.Dataset; //導入依賴的package包/類
/**
 * Loads {@code dbo.Dispatch_Persons} rows over JDBC (SQL Server driver) for dispatches whose
 * {@code CFS_DateTimeJanet} falls within the last 90 days, and registers the result as the
 * temp view {@code Dispatch_Persons}. Despite the method name, no CSV file is read.
 *
 * @param sparkSession session used to build the JDBC reader
 * @param config supplies the JDBC URL, user and password
 * @return the loaded dataset (also available as temp view {@code Dispatch_Persons})
 */
private static Dataset<Row> getPayloadFromCsv( final SparkSession sparkSession, JdbcIntegrationConfig config ) {

        // The aliased sub-select is passed as the "dbtable" option.
        final String personsQuery = "(select * from dbo.Dispatch_Persons where Dis_id IN "
                + "( select distinct (Dis_Id) from Dispatch where CFS_DateTimeJanet > DateADD(d, -90, GETDATE()) ) ) Dispatch_Persons";
        logger.info( "SQL Query for persons: {}", personsQuery );

        final Dataset<Row> persons = sparkSession
                .read()
                .format( "jdbc" )
                .option( "url", config.getUrl() )
                .option( "dbtable", personsQuery )
                .option( "password", config.getDbPassword() )
                .option( "user", config.getDbUser() )
                .option( "driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver" )
                .load();

        persons.createOrReplaceTempView( "Dispatch_Persons" );
        return persons;
    }
 
開發者ID:dataloom,項目名稱:integrations,代碼行數:23,代碼來源:DispatchPersonsFlight.java

示例9: testCompareRdd

import org.apache.spark.sql.Dataset; //導入依賴的package包/類
/**
 * Compares two classpath resource files with {@code SparkCompare.compareFiles} and checks that
 * both sides of the comparison result can be displayed without throwing.
 */
@Test
public void testCompareRdd() {

    // Resolve both fixture files from the test classpath.
    String file1Path = this.getClass().getClassLoader().
            getResource("TC5NullsAndEmptyData1.txt").getPath();

    String file2Path = this.getClass().getClassLoader().
            getResource("TC5NullsAndEmptyData2.txt").getPath();

    Pair<Dataset<Row>, Dataset<Row>> comparisonResult = SparkCompare.compareFiles(file1Path, file2Path);

    try {
        comparisonResult.getLeft().show();
        comparisonResult.getRight().show();
    } catch (Exception e) {
        // Same failure type and message as before, but with the cause attached so the
        // test report shows what actually went wrong.
        throw new AssertionError("Straightforward output of test results somehow failed", e);
    }
}
 
開發者ID:FINRAOS,項目名稱:MegaSparkDiff,代碼行數:23,代碼來源:SparkCompareTest.java

示例10: execute

import org.apache.spark.sql.Dataset; //導入依賴的package包/類
/**
 * Dumps a table to CSV. Reads three positional parameters from the action statement -
 * [0] output path, [1] {@code SaveMode} name, [2] table name - selects every row of the table
 * and writes it, coalesced to one partition, as CSV with the "header" option set to "false".
 *
 * @return always {@code null}
 */
@Override
public Object execute(SparkSession sparkSession, ActionStatement actionStatement, CredentialProvider credentialManager) {

    final String outputPath = actionStatement.getParamValues().get(0).getValue().toString();
    final String modeName = actionStatement.getParamValues().get(1).getValue().toString();
    final String sourceTable = actionStatement.getParamValues().get(2).getValue().toString();

    final SaveMode mode = SaveMode.valueOf(modeName);

    final String query = String.format("select * from %s", sourceTable);
    logger.info(String.format("Running sql [%s] to get data and then save it", query));
    final Dataset<Row> tableRows = sparkSession.sql(query);

    logger.info(String.format("Saving to csv %s, saveMode: %s", outputPath, mode));
    tableRows
            .coalesce(1)
            .write()
            .mode(mode)
            .option("header", "false")
            .csv(outputPath);
    logger.info(String.format("Saved to csv %s, saveMode: %s", outputPath, mode));

    return null;
}
 
開發者ID:uber,項目名稱:uberscriptquery,代碼行數:19,代碼來源:WriteCsvFileActionStatementExecutor.java

示例11: main

import org.apache.spark.sql.Dataset; //導入依賴的package包/類
/**
 * Loads a CSV data set, assembles every column except {@code class} into a single
 * {@code features} vector, fits a 16-component PCA model on it and prints the explained
 * variance followed by the projected {@code pcaFeatures} column.
 *
 * @param args unused
 */
public static void main(String[] args) {
	SparkSession spark = SparkSession.builder()
			.master("local[8]")
			.appName("PCAExpt")
			.getOrCreate();

	// Stop the session even when loading or fitting throws, so it is never left running.
	try {
		// Load and parse data
		String filePath = "/home/kchoppella/book/Chapter09/data/covtypeNorm.csv";

		// Loads data.
		Dataset<Row> inDataset = spark.read()
				.format("com.databricks.spark.csv")
				.option("header", "true")
				.option("inferSchema", true)
				.load(filePath);

		// Copy into a mutable list: Arrays.asList alone is fixed-size and cannot remove.
		ArrayList<String> inputColsList = new ArrayList<String>(Arrays.asList(inDataset.columns()));

		// Every column except the label becomes an input feature.
		inputColsList.remove("class");
		String[] inputCols = inputColsList.toArray(new String[0]);

		// Prepare dataset for training with all features in "features" column
		VectorAssembler assembler = new VectorAssembler().setInputCols(inputCols).setOutputCol("features");
		Dataset<Row> dataset = assembler.transform(inDataset);

		PCAModel pca = new PCA()
				.setK(16)
				.setInputCol("features")
				.setOutputCol("pcaFeatures")
				.fit(dataset);

		Dataset<Row> result = pca.transform(dataset).select("pcaFeatures");
		System.out.println("Explained variance:");
		System.out.println(pca.explainedVariance());
		result.show(false);
	} finally {
		spark.stop();
	}
}
 
開發者ID:PacktPublishing,項目名稱:Machine-Learning-End-to-Endguide-for-Java-developers,代碼行數:39,代碼來源:PCAExpt.java

示例12: testGetLatest

import org.apache.spark.sql.Dataset; //導入依賴的package包/類
/**
 * Writes two versions of one value set and one version of another to a fresh database, then
 * checks that {@code getLatestValues} returns exactly one value per value set: version 2 of
 * the first and version 1 of the second.
 */
@Test
public void testGetLatest() {

  final String database = "test_get_latest";
  spark.sql("CREATE DATABASE " + database);

  ValueSets.getEmpty(spark)
      .withValueSets(
          valueSet("urn:cerner:valueset:newvalueset", "1"),
          valueSet("urn:cerner:valueset:newvalueset", "2"),
          valueSet("urn:cerner:valueset:othervalueset", "1"))
      .writeToDatabase(database);

  Dataset<Value> latest = ValueSets.getFromDatabase(spark, database)
      .getLatestValues(
          ImmutableSet.of(
              "urn:cerner:valueset:newvalueset",
              "urn:cerner:valueset:othervalueset"),
          true);

  latest.cache();

  final String newValueSetV1 =
      "valueSetUri == 'urn:cerner:valueset:newvalueset' AND valueSetVersion == '1'";
  final String newValueSetV2 =
      "valueSetUri == 'urn:cerner:valueset:newvalueset' AND valueSetVersion == '2'";
  final String otherValueSetV1 =
      "valueSetUri == 'urn:cerner:valueset:othervalueset' AND valueSetVersion == '1'";

  Assert.assertEquals(2, latest.count());

  // The superseded version must be absent; each latest version appears exactly once.
  Assert.assertEquals(0, latest.where(newValueSetV1).count());
  Assert.assertEquals(1, latest.where(newValueSetV2).count());
  Assert.assertEquals(1, latest.where(otherValueSetV1).count());
}
 
開發者ID:cerner,項目名稱:bunsen,代碼行數:35,代碼來源:ValueSetsTest.java

示例13: returnDiff

import org.apache.spark.sql.Dataset; //導入依賴的package包/類
/**
 * Loads two tables from the local HSQLDB test database as {@code AppleTable}s and returns the
 * result of comparing them.
 *
 * @param table1 name of the table used as the left side (view name "table1")
 * @param table2 name of the table used as the right side (view name "table2")
 * @return the pair produced by {@code SparkCompare.compareAppleTables}
 */
private Pair<Dataset<Row>, Dataset<Row>> returnDiff(String table1, String table2)
{
    // Both sides read from the same database with the same credentials.
    final String driver = "org.hsqldb.jdbc.JDBCDriver";
    final String url = "jdbc:hsqldb:hsql://127.0.0.1:9001/testDb";
    final String user = "SA";
    final String password = "";

    final String leftQuery = "(select * from " + table1 + ")";
    AppleTable left = SparkFactory.parallelizeJDBCSource(
            driver, url, user, password, leftQuery, "table1");

    final String rightQuery = "(select * from " + table2 + ")";
    AppleTable right = SparkFactory.parallelizeJDBCSource(
            driver, url, user, password, rightQuery, "table2");

    return SparkCompare.compareAppleTables(left, right);
}
 
開發者ID:FINRAOS,項目名稱:MegaSparkDiff,代碼行數:17,代碼來源:JdbcToJdbcTest.java

示例14: checkPatients

import org.apache.spark.sql.Dataset; //導入依賴的package包/類
/**
 * Asserts that the dataset holds exactly three patients and that the three expected patient
 * ids are all among them.
 *
 * @param patients the dataset to verify; it is collected to the driver
 */
private void checkPatients(Dataset<Patient> patients) {
  List<Patient> collected = patients.collectAsList();

  List<String> actualIds = collected.stream()
      .map(patient -> patient.getId())
      .collect(Collectors.toList());

  List<String> expectedIds = ImmutableList.of(
      "Patient/6666001",
      "Patient/1032702",
      "Patient/9995679");

  Assert.assertEquals(3, actualIds.size());
  Assert.assertTrue(actualIds.containsAll(expectedIds));
}
 
開發者ID:cerner,項目名稱:bunsen,代碼行數:17,代碼來源:BundlesTest.java

示例15: calculate_stats_table

import org.apache.spark.sql.Dataset; //導入依賴的package包/類
/**
 * Computes the statistics of one table: its row count, the number of distinct values in the
 * subject column, and whether the two differ (reported as "complex").
 *
 * @param table the table to analyse; it is counted twice (all rows, distinct subjects)
 * @param tableName name stored in the returned statistics
 * @return the populated {@code TableStats} message
 */
private TableStats calculate_stats_table(Dataset<Row> table, String tableName) {
	TableStats.Builder table_stats_builder = TableStats.newBuilder();

	// Dataset.count() returns long. Keep the counts as long for the comparison so that
	// narrowing to int cannot make two different counts look equal (or vice versa).
	long table_size = table.count();
	long distinct_subjects = table.select(this.column_name_subject).distinct().count();
	boolean is_complex = table_size != distinct_subjects;

	// put them in the protobuf object; the values are narrowed to int here, as before
	table_stats_builder.setSize((int) table_size)
		.setDistinctSubjects((int) distinct_subjects)
		.setIsComplex(is_complex)
		.setName(tableName);

	return table_stats_builder.build();
}
 
開發者ID:tf-dbis-uni-freiburg,項目名稱:PRoST,代碼行數:17,代碼來源:VerticalPartitioningLoader.java


注:本文中的org.apache.spark.sql.Dataset類示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。