This article collects typical usage examples of the Java class org.apache.spark.sql.Dataset. If you are wondering what the Dataset class is for, or how to use it, the curated class code examples below may help.
The Dataset class belongs to the org.apache.spark.sql package. Fifteen code examples of the Dataset class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
Example 1: test_getDataSetResult
import org.apache.spark.sql.Dataset; // import the required package/class
@Test
public void test_getDataSetResult() {
    StructField[] structFields = new StructField[]{
            new StructField("intColumn", DataTypes.IntegerType, true, Metadata.empty()),
            new StructField("stringColumn", DataTypes.StringType, true, Metadata.empty())
    };
    StructType structType = new StructType(structFields);
    List<Row> rows = new ArrayList<>();
    rows.add(RowFactory.create(1, "v1"));
    rows.add(RowFactory.create(2, "v2"));
    Dataset<Row> df = sparkSession.createDataFrame(rows, structType);
    DataSetResult dataSetResult = SparkUtils.getDataSetResult(df);
    Assert.assertEquals(2, dataSetResult.getColumnNames().size());
    Assert.assertEquals(2, dataSetResult.getRows().size());
    Assert.assertEquals(new Integer(1), dataSetResult.getRows().get(0).get(0));
    Assert.assertEquals("v1", dataSetResult.getRows().get(0).get(1));
    Assert.assertEquals(new Integer(2), dataSetResult.getRows().get(1).get(0));
    Assert.assertEquals("v2", dataSetResult.getRows().get(1).get(1));
}
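The test references a sparkSession field that is not shown. A minimal fixture sketch, assuming a local-mode JUnit setup (the class name and setup method are illustrative, not from the original project):

import org.apache.spark.sql.SparkSession;
import org.junit.BeforeClass;

public class SparkUtilsTest {
    protected static SparkSession sparkSession;

    // Hypothetical setup: build one local SparkSession for all tests in the class.
    @BeforeClass
    public static void setUpSession() {
        sparkSession = SparkSession.builder()
                .master("local[2]")
                .appName("SparkUtilsTest")
                .getOrCreate();
    }
}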
Example 2: readMultiaxialHierarchyFile
import org.apache.spark.sql.Dataset; // import the required package/class
/**
 * Reads the LOINC multiaxial hierarchy file and converts it to a {@link HierarchicalElement}
 * dataset.
 *
 * @param spark the Spark session
 * @param loincHierarchyPath path to the multiaxial hierarchy CSV
 * @return a dataset of {@link HierarchicalElement} instances representing the hierarchical relationships
 */
public static Dataset<HierarchicalElement> readMultiaxialHierarchyFile(SparkSession spark,
        String loincHierarchyPath) {
    return spark.read()
            .option("header", true)
            .csv(loincHierarchyPath)
            .select(col("IMMEDIATE_PARENT"), col("CODE"))
            .where(col("IMMEDIATE_PARENT").isNotNull()
                    .and(col("IMMEDIATE_PARENT").notEqual(lit(""))))
            .where(col("CODE").isNotNull()
                    .and(col("CODE").notEqual(lit(""))))
            .map((MapFunction<Row, HierarchicalElement>) row -> {
                HierarchicalElement element = new HierarchicalElement();
                element.setAncestorSystem(LOINC_CODE_SYSTEM_URI);
                element.setAncestorValue(row.getString(0));
                element.setDescendantSystem(LOINC_CODE_SYSTEM_URI);
                element.setDescendantValue(row.getString(1));
                return element;
            }, Hierarchies.getHierarchicalElementEncoder());
}
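A usage sketch for the method above, assuming a local SparkSession; the demo class name and CSV path are placeholders, not values from the original code:

import org.apache.spark.sql.SparkSession;

public class LoincHierarchyDemo {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("LoincHierarchyDemo")
                .getOrCreate();
        // Placeholder path; point this at the real LOINC multiaxial hierarchy CSV.
        readMultiaxialHierarchyFile(spark, "/path/to/loinc_hierarchy.csv").show();
        spark.stop();
    }
}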
Example 3: getDataFrameOfElementsWithEntityGroup
import org.apache.spark.sql.Dataset; // import the required package/class
public void getDataFrameOfElementsWithEntityGroup() {
    // ---------------------------------------------------------
    final GetDataFrameOfElements operation = new GetDataFrameOfElements.Builder()
            .view(new View.Builder()
                    .entity("entity")
                    .build())
            .build();
    // ---------------------------------------------------------
    final Dataset<Row> df = runExample(operation, null);

    // Restrict to entities involving certain vertices
    final Dataset<Row> seeded = df.filter("vertex = 1 OR vertex = 2");
    String result = seeded.showString(100, 20);
    printJava("df.filter(\"vertex = 1 OR vertex = 2\").show();");
    print("The results are:\n");
    print("```");
    print(result.substring(0, result.length() - 2));
    print("```");

    // Filter by property
    final Dataset<Row> filtered = df.filter("count > 1");
    result = filtered.showString(100, 20);
    printJava("df.filter(\"count > 1\").show();");
    print("The results are:\n");
    print("```");
    print(result.substring(0, result.length() - 2));
    print("```");
}
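For reference, the same string predicates can also be written with Spark's typed Column API; a sketch, assuming the df produced above:

import static org.apache.spark.sql.functions.col;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

// Equivalent to df.filter("vertex = 1 OR vertex = 2") and df.filter("count > 1").
Dataset<Row> seeded = df.filter(col("vertex").equalTo(1).or(col("vertex").equalTo(2)));
Dataset<Row> filtered = df.filter(col("count").gt(1));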
Example 4: toJson
import org.apache.spark.sql.Dataset; // import the required package/class
/**
 * Converts a set of FHIR resources to JSON.
 *
 * @param dataset a dataset containing FHIR resources
 * @param resourceType the FHIR resource type
 * @return a dataset of JSON strings for the FHIR resources
 */
public static Dataset<String> toJson(Dataset<?> dataset, String resourceType) {
    Dataset<IBaseResource> resourceDataset =
            dataset.as(FhirEncoders.forStu3()
                    .getOrCreate()
                    .of(resourceType));
    return resourceDataset.map(new ToJson(), Encoders.STRING());
}
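A hypothetical usage sketch, assuming patients is a Dataset of STU3 Patient resources; the output path is illustrative:

// Convert to one JSON string per resource and write one document per line.
Dataset<String> patientJson = toJson(patients, "Patient");
patientJson.write().text("/tmp/patients_json");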
Example 5: getPayloadFromCsv
import org.apache.spark.sql.Dataset; // import the required package/class
// Despite its name, this method now reads from a JDBC source rather than a CSV file.
private static Dataset<Row> getPayloadFromCsv( final SparkSession sparkSession, JdbcIntegrationConfig config ) {
    // String csvPath = Resources.getResource( "dispatch_type.csv" ).getPath();
    // Only pull rows received within the last 90 days.
    java.sql.Date d = new java.sql.Date( DateTime.now().minusDays( 90 ).toDate().getTime() );
    String query = "(select * from dbo.Dispatch_Type where timercvd >= '" + d.toString() + "') Dispatch_Type";
    Dataset<Row> payload = sparkSession
            .read()
            .format( "jdbc" )
            .option( "url", config.getUrl() )
            .option( "dbtable", query )
            .option( "password", config.getDbPassword() )
            .option( "user", config.getDbUser() )
            .load();
    return payload;
}
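Since a 90-day extract can be large, the same read could be split across executors with Spark's standard JDBC partitioning options. A sketch in which the partition column and bounds are assumptions, not values from the original project:

Dataset<Row> payload = sparkSession
        .read()
        .format( "jdbc" )
        .option( "url", config.getUrl() )
        .option( "dbtable", query )
        .option( "user", config.getDbUser() )
        .option( "password", config.getDbPassword() )
        .option( "partitionColumn", "Dispatch_Type_ID" ) // assumed numeric key column
        .option( "lowerBound", "0" )                     // assumed key range
        .option( "upperBound", "1000000" )
        .option( "numPartitions", "8" )
        .load();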
Example 6: computeJoins
import org.apache.spark.sql.Dataset; // import the required package/class
public Dataset<Row> computeJoins(SQLContext sqlContext) {
    // compute all the joins
    Dataset<Row> results = node.computeJoinWithChildren(sqlContext);
    // select only the requested columns
    Column[] selectedColumns = new Column[node.projection.size()];
    for (int i = 0; i < selectedColumns.length; i++) {
        selectedColumns[i] = new Column(node.projection.get(i));
    }
    // if there is a filter set, apply it before the projection
    results = filter == null ? results.select(selectedColumns) : results.filter(filter).select(selectedColumns);
    // drop duplicate rows if DISTINCT was requested
    if (selectDistinct) {
        results = results.distinct();
    }
    return results;
}
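A hypothetical call site, assuming an instance of this class named queryExecutor and an existing SparkSession (both names are illustrative):

// An SQLContext can be obtained directly from the session.
Dataset<Row> answer = queryExecutor.computeJoins(sparkSession.sqlContext());
answer.show();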
Example 7: coding
import org.apache.spark.sql.Dataset; // import the required package/class
@Test
public void coding() {
    Coding expectedCoding = condition.getSeverity().getCodingFirstRep();
    Coding actualCoding = decodedCondition.getSeverity().getCodingFirstRep();

    // Codings are a nested array, so we explode them into a table of the coding
    // fields so we can easily select and compare individual fields.
    Dataset<Row> severityCodings = conditionsDataset
            .select(functions.explode(conditionsDataset.col("severity.coding"))
                    .alias("coding"))
            .select("coding.*") // Pull all fields in the coding to the top level.
            .cache();

    Assert.assertEquals(expectedCoding.getCode(),
            severityCodings.select("code").head().get(0));
    Assert.assertEquals(expectedCoding.getCode(),
            actualCoding.getCode());
    Assert.assertEquals(expectedCoding.getSystem(),
            severityCodings.select("system").head().get(0));
    Assert.assertEquals(expectedCoding.getSystem(),
            actualCoding.getSystem());
    Assert.assertEquals(expectedCoding.getUserSelected(),
            severityCodings.select("userSelected").head().get(0));
    Assert.assertEquals(expectedCoding.getUserSelected(),
            actualCoding.getUserSelected());
    Assert.assertEquals(expectedCoding.getDisplay(),
            severityCodings.select("display").head().get(0));
    Assert.assertEquals(expectedCoding.getDisplay(),
            actualCoding.getDisplay());
}
Example 8: getPayloadFromCsv
import org.apache.spark.sql.Dataset; // import the required package/class
// Despite its name, this method reads from a JDBC source rather than a CSV file.
private static Dataset<Row> getPayloadFromCsv( final SparkSession sparkSession, JdbcIntegrationConfig config ) {
    // String csvPath = Resources.getResource( "dispatch_persons.csv" ).getPath();
    // Only pull persons attached to dispatches from the last 90 days.
    String sql = "(select * from dbo.Dispatch_Persons where Dis_id IN "
            + "( select distinct (Dis_Id) from Dispatch where CFS_DateTimeJanet > DateADD(d, -90, GETDATE()) ) ) Dispatch_Persons";
    logger.info( "SQL Query for persons: {}", sql );
    Dataset<Row> payload = sparkSession
            .read()
            .format( "jdbc" )
            .option( "url", config.getUrl() )
            .option( "dbtable", sql )
            .option( "password", config.getDbPassword() )
            .option( "user", config.getDbUser() )
            .option( "driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver" )
            .load();
    payload.createOrReplaceTempView( "Dispatch_Persons" );
    // .filter( col( "Timercvd" ).geq( DateTime.now().minusDays( 2 ) ) )
    // .filter( col( "Type" ).notEqual( "2" ) );
    return payload;
}
Example 9: testCompareRdd
import org.apache.spark.sql.Dataset; // import the required package/class
/**
 * Test of the compareRdd method of class SparkCompare.
 */
@Test
public void testCompareRdd() {
    // resolve the classpath locations of the two input files
    String file1Path = this.getClass().getClassLoader()
            .getResource("TC5NullsAndEmptyData1.txt").getPath();
    String file2Path = this.getClass().getClassLoader()
            .getResource("TC5NullsAndEmptyData2.txt").getPath();
    Pair<Dataset<Row>, Dataset<Row>> comparisonResult = SparkCompare.compareFiles(file1Path, file2Path);
    try {
        comparisonResult.getLeft().show();
        comparisonResult.getRight().show();
    } catch (Exception e) {
        Assert.fail("Straightforward output of test results somehow failed");
    }
}
Example 10: execute
import org.apache.spark.sql.Dataset; // import the required package/class
@Override
public Object execute(SparkSession sparkSession, ActionStatement actionStatement, CredentialProvider credentialManager) {
    String filePath = actionStatement.getParamValues().get(0).getValue().toString();
    String saveModeStr = actionStatement.getParamValues().get(1).getValue().toString();
    String dfTableName = actionStatement.getParamValues().get(2).getValue().toString();
    SaveMode saveMode = SaveMode.valueOf(saveModeStr);
    String sql = String.format("select * from %s", dfTableName);
    logger.info(String.format("Running sql [%s] to get data and then save it", sql));
    Dataset<Row> df = sparkSession.sql(sql);
    logger.info(String.format("Saving to csv %s, saveMode: %s", filePath, saveMode));
    // coalesce(1) forces all rows into a single output file
    df.coalesce(1).write().mode(saveMode).option("header", "false").csv(filePath);
    logger.info(String.format("Saved to csv %s, saveMode: %s", filePath, saveMode));
    return null;
}
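Note that coalesce(1) funnels every row through a single task to produce that one file. If multiple part-files are acceptable, a sketch of the parallel alternative (same API, no coalesce):

// Each partition writes its own part-file in parallel.
df.write().mode(saveMode).option("header", "false").csv(filePath);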
Example 11: main
import org.apache.spark.sql.Dataset; // import the required package/class
public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
            .master("local[8]")
            .appName("PCAExpt")
            .getOrCreate();

    // Load and parse the data.
    String filePath = "/home/kchoppella/book/Chapter09/data/covtypeNorm.csv";
    Dataset<Row> inDataset = spark.read()
            .format("com.databricks.spark.csv")
            .option("header", "true")
            .option("inferSchema", true)
            .load(filePath);
    ArrayList<String> inputColsList = new ArrayList<String>(Arrays.asList(inDataset.columns()));

    // Make a single "features" column of feature vectors, excluding the label column.
    inputColsList.remove("class");
    String[] inputCols = inputColsList.parallelStream().toArray(String[]::new);
    VectorAssembler assembler = new VectorAssembler().setInputCols(inputCols).setOutputCol("features");
    Dataset<Row> dataset = assembler.transform(inDataset);

    // Fit PCA with 16 principal components and project the data onto them.
    PCAModel pca = new PCA()
            .setK(16)
            .setInputCol("features")
            .setOutputCol("pcaFeatures")
            .fit(dataset);
    Dataset<Row> result = pca.transform(dataset).select("pcaFeatures");
    System.out.println("Explained variance:");
    System.out.println(pca.explainedVariance());
    result.show(false);
    spark.stop();
}
Developer: PacktPublishing; project: Machine-Learning-End-to-Endguide-for-Java-developers; lines: 39; source file: PCAExpt.java
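The printed explainedVariance() vector can also guide the choice of k. A sketch, assuming the pca model fitted above, that prints the cumulative explained-variance ratio per component:

// Accumulate explained-variance ratios to judge how many components to keep.
double[] ratios = pca.explainedVariance().toArray();
double cumulative = 0.0;
for (int i = 0; i < ratios.length; i++) {
    cumulative += ratios[i];
    System.out.printf("k=%d cumulative explained variance=%.4f%n", i + 1, cumulative);
}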
Example 12: testGetLatest
import org.apache.spark.sql.Dataset; // import the required package/class
@Test
public void testGetLatest() {
    String database = "test_get_latest";
    spark.sql("CREATE DATABASE " + database);
    ValueSets.getEmpty(spark)
            .withValueSets(
                    valueSet("urn:cerner:valueset:newvalueset", "1"),
                    valueSet("urn:cerner:valueset:newvalueset", "2"),
                    valueSet("urn:cerner:valueset:othervalueset", "1"))
            .writeToDatabase(database);
    Dataset<Value> latest = ValueSets.getFromDatabase(spark, database)
            .getLatestValues(ImmutableSet.of("urn:cerner:valueset:newvalueset",
                    "urn:cerner:valueset:othervalueset"),
                    true);
    latest.cache();
    Assert.assertEquals(2, latest.count());
    Assert.assertEquals(0, latest.where(
            "valueSetUri == 'urn:cerner:valueset:newvalueset' AND valueSetVersion == '1'")
            .count());
    Assert.assertEquals(1, latest.where(
            "valueSetUri == 'urn:cerner:valueset:newvalueset' AND valueSetVersion == '2'")
            .count());
    Assert.assertEquals(1, latest.where(
            "valueSetUri == 'urn:cerner:valueset:othervalueset' AND valueSetVersion == '1'")
            .count());
}
Example 13: returnDiff
import org.apache.spark.sql.Dataset; // import the required package/class
private Pair<Dataset<Row>, Dataset<Row>> returnDiff(String table1, String table2) {
    AppleTable leftAppleTable = SparkFactory.parallelizeJDBCSource("org.hsqldb.jdbc.JDBCDriver",
            "jdbc:hsqldb:hsql://127.0.0.1:9001/testDb",
            "SA",
            "",
            "(select * from " + table1 + ")", "table1");
    AppleTable rightAppleTable = SparkFactory.parallelizeJDBCSource("org.hsqldb.jdbc.JDBCDriver",
            "jdbc:hsqldb:hsql://127.0.0.1:9001/testDb",
            "SA",
            "",
            "(select * from " + table2 + ")", "table2");
    return SparkCompare.compareAppleTables(leftAppleTable, rightAppleTable);
}
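A hypothetical usage, following the pattern of Example 9 (the table names are placeholders):

Pair<Dataset<Row>, Dataset<Row>> diff = returnDiff("table1", "table2");
// Rows present only in the left and only in the right source, respectively.
diff.getLeft().show();
diff.getRight().show();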
Example 14: checkPatients
import org.apache.spark.sql.Dataset; // import the required package/class
private void checkPatients(Dataset<Patient> patients) {
    List<String> patientIds = patients
            .collectAsList()
            .stream()
            .map(Patient::getId)
            .collect(Collectors.toList());
    Assert.assertEquals(3, patientIds.size());
    List<String> expectedIds = ImmutableList.of(
            "Patient/6666001",
            "Patient/1032702",
            "Patient/9995679");
    Assert.assertTrue(patientIds.containsAll(expectedIds));
}
Example 15: calculate_stats_table
import org.apache.spark.sql.Dataset; // import the required package/class
private TableStats calculate_stats_table(Dataset<Row> table, String tableName) {
    TableStats.Builder table_stats_builder = TableStats.newBuilder();
    // calculate the statistics
    int table_size = (int) table.count();
    int distinct_subjects = (int) table.select(this.column_name_subject).distinct().count();
    // a table is "complex" when at least one subject appears in more than one row
    boolean is_complex = table_size != distinct_subjects;
    // store them in the protobuf builder
    table_stats_builder.setSize(table_size)
            .setDistinctSubjects(distinct_subjects)
            .setIsComplex(is_complex)
            .setName(tableName);
    return table_stats_builder.build();
}
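A hypothetical usage sketch, assuming a SparkSession named spark and a Parquet-backed table; the path and table name are illustrative:

Dataset<Row> table = spark.read().parquet("/path/to/tripletable");
TableStats stats = calculate_stats_table(table, "tripletable");
System.out.println(stats);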