This article collects typical usage examples of the Java method org.apache.spark.sql.SQLContext.createDataFrame. If you are unsure what SQLContext.createDataFrame does in Java, how to call it, or what real-world usage looks like, the curated code samples below may help. You can also read more about the enclosing class, org.apache.spark.sql.SQLContext.
The following section presents 15 code examples of the SQLContext.createDataFrame method, sorted by popularity by default.
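Before the examples, here is a minimal, self-contained sketch (not taken from any of the projects below, and targeting the Spark 1.x DataFrame API that most of these examples use) of the most common overload, which builds a DataFrame from a JavaRDD<Row> plus an explicit StructType schema. The bean-based overload, createDataFrame(rdd, MyBean.class), appears in Examples 2, 12, 14 and 15.
import java.util.Arrays;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.*;

public class CreateDataFrameSketch {
    public static void main(String[] args) {
        JavaSparkContext jsc = new JavaSparkContext("local[*]", "CreateDataFrameSketch");
        SQLContext sqlContext = new SQLContext(jsc);
        // Build a few rows by hand.
        JavaRDD<Row> rows = jsc.parallelize(Arrays.asList(
                RowFactory.create("alice", 30),
                RowFactory.create("bob", 25)));
        // Describe the columns explicitly.
        StructType schema = new StructType(new StructField[]{
                new StructField("name", DataTypes.StringType, false, Metadata.empty()),
                new StructField("age", DataTypes.IntegerType, false, Metadata.empty())});
        DataFrame df = sqlContext.createDataFrame(rows, schema);
        df.show();
        jsc.stop();
    }
}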
Example 1: parse
import org.apache.spark.sql.SQLContext; // import the package/class this method depends on
/**
* Parses a list of PoS-tagged sentences, each on a line and writes the result to an output
* file in a specified output format.
* @param jsc a Java Spark context
* @param sentences a list of PoS-tagged sentences, one per line
* @param outputFileName the output file name
* @param outputFormat the output format
*/
public void parse(JavaSparkContext jsc, List<String> sentences, String outputFileName, OutputFormat outputFormat) {
JavaRDD<String> input = jsc.parallelize(sentences);
JavaRDD<Sentence> sents = input.map(new TaggedLineToSentenceFunction());
JavaRDD<DependencyGraph> graphs = sents.map(new ParsingFunction());
JavaRDD<Row> rows = graphs.map(new Function<DependencyGraph, Row>() {
private static final long serialVersionUID = -812004521983071103L;
public Row call(DependencyGraph graph) {
return RowFactory.create(graph.getSentence().toString(), graph.dependencies());
}
});
StructType schema = new StructType(new StructField[]{
new StructField("sentence", DataTypes.StringType, false, Metadata.empty()),
new StructField("dependency", DataTypes.StringType, false, Metadata.empty())
});
SQLContext sqlContext = new SQLContext(jsc);
DataFrame df = sqlContext.createDataFrame(rows, schema);
if (outputFormat == OutputFormat.TEXT)
df.select("dependency").write().text(outputFileName);
else
df.repartition(1).write().json(outputFileName);
}
Example 2: createDataframe
import org.apache.spark.sql.SQLContext; // import the package/class this method depends on
public void createDataframe(JavaSparkContext sc, SQLContext sqlContext ) {
List<TodoItem> todos = Arrays.asList(
new TodoItem("George", "Buy a new computer", "Shopping"),
new TodoItem("John", "Go to the gym", "Sport"),
new TodoItem("Ron", "Finish the homework", "Education"),
new TodoItem("Sam", "buy a car", "Shopping"),
new TodoItem("Janet", "buy groceries", "Shopping"),
new TodoItem("Andy", "go to the beach", "Fun"),
new TodoItem("Paul", "Prepare lunch", "Cooking")
);
JavaRDD<TodoItem> rdd = sc.parallelize(todos);
DataFrame dataframe = sqlContext.createDataFrame(rdd, TodoItem.class);
sqlContext.registerDataFrameAsTable(dataframe, "todo");
System.out.println("Total number of TodoItems = [" + rdd.count() + "]\n");
}
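A brief usage note (not part of the original snippet): once the DataFrame is registered as the temporary table "todo", it can be queried with Spark SQL. Assuming TodoItem exposes name, task and category bean properties (an assumption; the class is not shown here), a follow-up query might look like:
DataFrame shopping = sqlContext.sql("SELECT name, task FROM todo WHERE category = 'Shopping'");
shopping.show();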
Example 3: loadObservations
import org.apache.spark.sql.SQLContext; // import the package/class this method depends on
private static DataFrame loadObservations(JavaSparkContext sparkContext, SQLContext sqlContext,
String path) {
JavaRDD<Row> rowRdd = sparkContext.textFile(path).map((String line) -> {
String[] tokens = line.split("\t");
ZonedDateTime dt = ZonedDateTime.of(Integer.parseInt(tokens[0]),
Integer.parseInt(tokens[1]), Integer.parseInt(tokens[2]), 0, 0, 0, 0, // month and day-of-month
ZoneId.systemDefault());
String symbol = tokens[3];
double price = Double.parseDouble(tokens[5]);
return RowFactory.create(Timestamp.from(dt.toInstant()), symbol, price);
});
List<StructField> fields = new ArrayList<>();
fields.add(DataTypes.createStructField("timestamp", DataTypes.TimestampType, true));
fields.add(DataTypes.createStructField("symbol", DataTypes.StringType, true));
fields.add(DataTypes.createStructField("price", DataTypes.DoubleType, true));
StructType schema = DataTypes.createStructType(fields);
return sqlContext.createDataFrame(rowRdd, schema);
}
Example 4: createWeather
import org.apache.spark.sql.SQLContext; // import the package/class this method depends on
private static void createWeather(JavaSparkContext sc, SQLContext sqlContext) {
JavaRDD<Weather> weather = sc.textFile("../173328.csv").map((String line) -> {
return WeatherParser.parseWeather(line);
});
StructType schema = getStructType(new Schema[] { Weather.SCHEMA$ });
JavaRDD<Row> rowRDD = weather.map((Weather weatherRow) -> {
return RowFactory.create(weatherRow.get(0), weatherRow.get(1), weatherRow.get(2), weatherRow.get(3),
weatherRow.get(4), weatherRow.get(5), weatherRow.get(6), weatherRow.get(7), weatherRow.get(8),
weatherRow.get(9), weatherRow.get(10), weatherRow.get(11), weatherRow.get(12), weatherRow.get(13),
weatherRow.get(14), weatherRow.get(15), weatherRow.get(16), weatherRow.get(17), weatherRow.get(18),
weatherRow.get(19), weatherRow.get(20), weatherRow.get(21), weatherRow.get(22), weatherRow.get(23),
weatherRow.get(24), weatherRow.get(25), weatherRow.get(26), weatherRow.get(27), weatherRow.get(28),
weatherRow.get(29), weatherRow.get(30), weatherRow.get(31), weatherRow.get(32), weatherRow.get(33),
weatherRow.get(34), weatherRow.get(35), weatherRow.get(36), weatherRow.get(37), weatherRow.get(38),
weatherRow.get(39), weatherRow.get(40), weatherRow.get(41), weatherRow.get(42), weatherRow.get(43),
weatherRow.get(44), weatherRow.get(45), weatherRow.get(46), weatherRow.get(47), weatherRow.get(48));
});
// Apply the schema to the RDD.
DataFrame weatherFrame = sqlContext.createDataFrame(rowRDD, schema);
weatherFrame.registerTempTable("weather");
}
Example 5: createStadiums
import org.apache.spark.sql.SQLContext; // import the package/class this method depends on
private static void createStadiums(JavaSparkContext sc, SQLContext sqlContext) {
JavaRDD<Stadium> stadiums = sc.textFile("../stadiums.csv").map((String line) -> {
return StadiumParser.parseStadium(line);
});
StructType schema = getStructType(new Schema[] { Stadium.SCHEMA$ });
JavaRDD<Row> rowRDD = stadiums.map((Stadium stadiumRow) -> {
return RowFactory.create(stadiumRow.get(0), stadiumRow.get(1), stadiumRow.get(2), stadiumRow.get(3),
stadiumRow.get(4), stadiumRow.get(5), stadiumRow.get(6), stadiumRow.get(7), stadiumRow.get(8),
stadiumRow.get(9), stadiumRow.get(10));
});
// Apply the schema to the RDD.
DataFrame stadiumFrame = sqlContext.createDataFrame(rowRDD, schema);
stadiumFrame.registerTempTable("stadium");
}
Example 6: createPlayByPlay
import org.apache.spark.sql.SQLContext; // import the package/class this method depends on
private static void createPlayByPlay(JavaRDD<PlayData> plays, SQLContext sqlContext) {
StructType schema = getStructType(new Schema[] { Play.SCHEMA$, Arrest.SCHEMA$ });
// Only plays and arrest exist so far
JavaRDD<Row> rowRDD = plays.map((PlayData playData) -> {
return RowFactory.create(playData.getPlay().get(0), playData.getPlay().get(1), playData.getPlay().get(2),
playData.getPlay().get(3), playData.getPlay().get(4), playData.getPlay().get(5),
playData.getPlay().get(6), playData.getPlay().get(7), playData.getPlay().get(8),
playData.getPlay().get(9), playData.getPlay().get(10), playData.getPlay().get(11),
playData.getPlay().get(12), playData.getPlay().get(13), playData.getPlay().get(14),
playData.getPlay().get(15), playData.getPlay().get(16), playData.getPlay().get(17),
playData.getPlay().get(18), playData.getPlay().get(19), playData.getPlay().get(20),
playData.getPlay().get(21), playData.getPlay().get(22), playData.getPlay().get(23),
playData.getPlay().get(24), playData.getPlay().get(25), playData.getPlay().get(26),
playData.getPlay().get(27), playData.getPlay().get(28), playData.getArrest().get(0),
playData.getArrest().get(1), playData.getArrest().get(2), playData.getArrest().get(3),
playData.getArrest().get(4));
});
// Apply the schema to the RDD.
DataFrame playsAndArrestsFrame = sqlContext.createDataFrame(rowRDD, schema);
playsAndArrestsFrame.registerTempTable("playbyplay");
}
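Examples 4-6 build each Row by enumerating every indexed Avro field by hand, which is verbose and easy to get wrong. A hypothetical helper (an assumption for illustration, not part of the original projects) could build the Row generically from any Avro SpecificRecordBase instead:
// Hypothetical helper: convert an Avro record to a Spark Row by iterating its schema
// fields rather than hand-listing get(0) ... get(N).
private static Row toRow(org.apache.avro.specific.SpecificRecordBase record) {
    int fieldCount = record.getSchema().getFields().size();
    Object[] values = new Object[fieldCount];
    for (int i = 0; i < fieldCount; i++) {
        values[i] = record.get(i); // positional access, same as the explicit calls above
    }
    return RowFactory.create(values);
}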
Example 7: csvToDataFrame
import org.apache.spark.sql.SQLContext; // import the package/class this method depends on
private static Dataset<Row> csvToDataFrame(DataFile dataFile, JavaSparkContext context, SQLContext sqlContext) throws CantConverException {
StructType schema = getStructType(dataFile);
JavaRDD<Row> rdd = context.textFile(dataFile.getPath())
.map(new LineParse(dataFile));
return sqlContext.createDataFrame(rdd, schema);
// return sqlContext.read()
// .format("csv")
// .option("header", header ? "true" : "false")
// .option("delimiter", delim)
// .option("inferSchema", "false")
// .schema(getStructType())
// .load(path);
}
Example 8: writeDataFrame
import org.apache.spark.sql.SQLContext; // import the package/class this method depends on
@Override
public void writeDataFrame(String name, DataFrame df) {
if (!config.isBulkLoad()) {
super.writeDataFrame(name, df);
return;
}
String fullTableName = getFullTableName(name);
Properties properties = config.getProperties();
// create table schema by persisting empty dataframe
log.info("Creating schema of table {}", fullTableName);
SQLContext sql = df.sqlContext();
DataFrame emptyDf = sql.createDataFrame(sql.sparkContext().emptyRDD(ClassTag$.MODULE$.apply(Row.class)), df.schema());
emptyDf.write().mode(saveMode).jdbc(config.getUrl(), fullTableName, properties);
final Function0<Connection> connectionFactory = JdbcUtils.createConnectionFactory(config.getUrl(), properties);
log.info("Writing to database table {} using PostgreSQL COPY", fullTableName);
int batchSize = config.getBatchSize();
df.toJavaRDD().foreachPartition(rows -> {
Connection connection = connectionFactory.apply();
copyRows(fullTableName, rows, connection, batchSize);
try {
connection.close();
} catch (SQLException e) {
log.debug("Unexpected exception when closing database connection: {}", e);
}
});
}
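The copyRows helper used above is not shown in this snippet. A minimal sketch of what such a helper might look like, assuming the PostgreSQL JDBC driver is on the classpath and that the rows can be rendered as CSV without special escaping (both assumptions, not from the original source; the batch size is ignored for brevity and the method name copyRowsSketch is hypothetical):
import java.io.IOException;
import java.io.StringReader;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.Iterator;
import org.apache.spark.sql.Row;
import org.postgresql.PGConnection;
import org.postgresql.copy.CopyManager;

private static void copyRowsSketch(String table, Iterator<Row> rows, Connection connection)
        throws SQLException, IOException {
    // Obtain the driver-specific COPY API from the JDBC connection.
    CopyManager copyManager = connection.unwrap(PGConnection.class).getCopyAPI();
    StringBuilder csv = new StringBuilder();
    while (rows.hasNext()) {
        Row row = rows.next();
        for (int i = 0; i < row.length(); i++) {
            if (i > 0) csv.append(',');
            Object value = row.get(i);
            if (value != null) csv.append(value); // naive: real code needs CSV escaping
        }
        csv.append('\n');
    }
    // Stream the buffered CSV through the COPY protocol in one round trip.
    copyManager.copyIn("COPY " + table + " FROM STDIN WITH (FORMAT csv)", new StringReader(csv.toString()));
}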
Example 9: getTestRDD
import org.apache.spark.sql.SQLContext; // import the package/class this method depends on
private DataFrame getTestRDD() {
SQLContext sql = new SQLContext(jsc());
List<Row> rdd = new ArrayList<>();
// cycle one -> two -> three -> one
rdd.add(RowFactory.create(0, uriIndex.getIndex("http://example.com/a"), 1L, uriIndex.getIndex("http://example.com/a"), 2L));
rdd.add(RowFactory.create(0, uriIndex.getIndex("http://example.com/a"), 2L, uriIndex.getIndex("http://example.com/a"), 3L));
rdd.add(RowFactory.create(0, uriIndex.getIndex("http://example.com/a"), 3L, uriIndex.getIndex("http://example.com/a"), 1L));
// one -> four, four -> one
rdd.add(RowFactory.create(0, uriIndex.getIndex("http://example.com/a"), 1L, uriIndex.getIndex("http://example.com/b"), 4L));
rdd.add(RowFactory.create(0, uriIndex.getIndex("http://example.com/b"), 4L, uriIndex.getIndex("http://example.com/a"), 1L));
// five -> one
rdd.add(RowFactory.create(0, uriIndex.getIndex("http://example.com/c"), 5L, uriIndex.getIndex("http://example.com/a"), 1L));
return sql.createDataFrame(rdd, new StructType()
.add("predicateIndex", DataTypes.IntegerType, false)
.add("fromTypeIndex", DataTypes.IntegerType, false)
.add("fromID", DataTypes.LongType, false)
.add("toTypeIndex", DataTypes.IntegerType, false)
.add("toID", DataTypes.LongType, false)
);
}
Example 10: testCollectRelations
import org.apache.spark.sql.SQLContext; // import the package/class this method depends on
/**
* Test if expected directed relations are collected from a RDD of Instances
*/
@Test
public void testCollectRelations() {
SQLContext sql = new SQLContext(jsc());
RelationExtractor collector = new RelationExtractor(
new RelationConfig(),
jsc(),
new ClassGraph()
);
List<Row> rdd = new ArrayList<>();
// cycle one -> two -> three -> one
rdd.add(RowFactory.create(0, 1, 1L, 1, 2L));
rdd.add(RowFactory.create(0, 1, 2L, 1, 3L));
rdd.add(RowFactory.create(0, 1, 3L, 1, 1L));
// one -> four, four -> one
rdd.add(RowFactory.create(0, 2, 4L, 1, 1L));
rdd.add(RowFactory.create(0, 1, 1L, 2, 4L));
// five -> one
rdd.add(RowFactory.create(0, 3, 5L, 1, 1L));
DataFrame expected = sql.createDataFrame(rdd, new StructType()
.add("predicateIndex", DataTypes.IntegerType, false)
.add("fromTypeIndex", DataTypes.IntegerType, false)
.add("fromID", DataTypes.LongType, false)
.add("toTypeIndex", DataTypes.IntegerType, false)
.add("toID", DataTypes.LongType, false)
);
// (predicateIndex, fromTypeIndex, instanceID, toTypeIndex, relatedID)
DataFrame result = collector.extractRelations(getTestRDD());
assertEquals("Expected relation row schema is collected", expected.schema(), result.schema());
assertRDDEquals("Expected relation rows are collected", expected.javaRDD(), result.javaRDD());
}
Example 11: tag
import org.apache.spark.sql.SQLContext; // import the package/class this method depends on
/**
* Tags a list of sentences and returns a list of tagged sequences.
* @param sentences a list of input sentences
* @return a list of tagged sequences.
*/
public List<String> tag(List<String> sentences) {
List<Row> rows = new LinkedList<Row>();
for (String sentence : sentences) {
rows.add(RowFactory.create(sentence));
}
StructType schema = new StructType(new StructField[]{
new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
});
SQLContext sqlContext = new SQLContext(jsc);
DataFrame input = sqlContext.createDataFrame(rows, schema);
if (cmmModel != null) {
DataFrame output = cmmModel.transform(input).repartition(1);
return output.javaRDD().map(new RowToStringFunction(1)).collect();
} else {
System.err.println("Tagging model is null. You need to create or load a model first.");
return null;
}
}
Example 12: main
import org.apache.spark.sql.SQLContext; // import the package/class this method depends on
public static void main(String[] args) {
SparkConf sparkConf = new SparkConf()
.setAppName("ReadFromMapRDB-DF-Java")
.setMaster("local[1]");
JavaSparkContext jsc = new JavaSparkContext(sparkConf);
SQLContext sqlContext = new SQLContext(jsc);
Configuration config = null;
try {
config = HBaseConfiguration.create();
config.set(TableInputFormat.INPUT_TABLE, "/apps/tests/users_profiles");
} catch (Exception ce) {
ce.printStackTrace();
}
JavaPairRDD<ImmutableBytesWritable, Result> hBaseRDD =
jsc.newAPIHadoopRDD(config, TableInputFormat.class, ImmutableBytesWritable.class, Result.class);
// convert each HBase Result into a key/User pair
JavaPairRDD<String, User> rowPairRDD = hBaseRDD.mapToPair(
new PairFunction<Tuple2<ImmutableBytesWritable, Result>, String, User>() {
@Override
public Tuple2<String, User> call(Tuple2<ImmutableBytesWritable, Result> entry) throws Exception {
Result r = entry._2;
String rowKey = Bytes.toString(r.getRow());
User user = new User();
user.setRowkey(rowKey);
user.setFirstName(Bytes.toString(r.getValue(Bytes.toBytes("default"), Bytes.toBytes("first_name"))));
user.setLastName(Bytes.toString(r.getValue(Bytes.toBytes("default"), Bytes.toBytes("last_name"))));
return new Tuple2<>(rowKey, user);
}
});
System.out.println("************ RDD *************");
System.out.println(rowPairRDD.count());
System.out.println(rowPairRDD.keys().collect());
System.out.println(rowPairRDD.values().collect());
System.out.println("************ DF *************");
DataFrame df = sqlContext.createDataFrame(rowPairRDD.values(), User.class);
System.out.println(df.count());
System.out.println(df.schema());
df.show();
System.out.println("************ DF with SQL *************");
df.registerTempTable("USER_TABLE");
DataFrame dfSql = sqlContext.sql("SELECT * FROM USER_TABLE WHERE firstName = 'Ally' ");
System.out.println(dfSql.count());
System.out.println(dfSql.schema());
dfSql.show();
jsc.close();
}
Example 13: execute
import org.apache.spark.sql.SQLContext; // import the package/class this method depends on
@Override
public int execute() throws Exception {
SQLContext sqlContext = new SQLContext(sparkContext);
DataFrame inputDataFrame = sqlContext.createDataFrame(
sparkContext.textFile(inputPath.toString()).map(new CsvToRow()), DataTypes.createStructType(Arrays.asList(
DataTypes.createStructField("myday", DataTypes.StringType, true),
DataTypes.createStructField("myint", DataTypes.IntegerType, true))
));
inputDataFrame.registerTempTable("mytable");
addResults(new ArrayList<>(sqlContext.sql(
"SELECT myday, sum(myint) as myint FROM mytable WHERE myday is not NULL and myint is not NULL GROUP BY myday ORDER BY myint"
).javaRDD().map(new RowToTsv()).collect()));
return SUCCESS;
}
Example 14: main
import org.apache.spark.sql.SQLContext; // import the package/class this method depends on
public static void main(String[] args) throws IOException {
SparkConf conf = new SparkConf().setAppName("SQLQueryBAM");
JavaSparkContext sc = new JavaSparkContext(conf);
SQLContext sqlContext = new HiveContext(sc.sc());
Options options = new Options();
Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
Option queryOpt = new Option( "query", true, "SQL query string." );
Option baminOpt = new Option( "in", true, "" );
options.addOption( opOpt );
options.addOption( queryOpt );
options.addOption( baminOpt );
CommandLineParser parser = new BasicParser();
CommandLine cmd = null;
try {
cmd = parser.parse( options, args );
}
catch( ParseException exp ) {
System.err.println( "Parsing failed. Reason: " + exp.getMessage() );
}
String bwaOutDir = cmd.hasOption("out") ? cmd.getOptionValue("out") : null;
String query = cmd.hasOption("query") ? cmd.getOptionValue("query") : null;
String bamin = cmd.hasOption("in") ? cmd.getOptionValue("in") : null;
sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true);
//Read BAM/SAM from HDFS
JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(bamin, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration());
//Map to SAMRecord RDD
JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get());
JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag()));
Dataset<Row> samDF = sqlContext.createDataFrame(rdd, MyAlignment.class);
samDF.registerTempTable(tablename);
if(query!=null) {
//Save as parquet file
Dataset<Row> df2 = sqlContext.sql(query);
df2.show(100,false);
if(bwaOutDir!=null)
df2.write().parquet(bwaOutDir);
}else{
if(bwaOutDir!=null)
samDF.write().parquet(bwaOutDir);
}
sc.stop();
}
Example 15: main
import org.apache.spark.sql.SQLContext; // import the package/class this method depends on
public static void main(final String[] args) {
final String tableName = "SparkExampleML";
/** get the locator host/port from arguments, if specified.. **/
final String locatorHost = args.length > 0 ? args[0] : "localhost";
final int locatorPort = args.length > 1 ? Integer.valueOf(args[1]) : 10334;
int numClusters = Integer.getInteger("numClusters", 2);
int numIterations = Integer.getInteger("numIterations", 20);
/** create SparkContext **/
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("SparkExampleDF");
JavaSparkContext jsc = new JavaSparkContext(conf);
SQLContext sqlContext = new SQLContext(jsc);
/** create data-frame from sample ML data **/
DataFrame df = sqlContext.createDataFrame(jsc.parallelize(SAMPLE_ML_DATA), LabeledPoint.class);
df.show();
Map<String, String> options = new HashMap<>(2);
options.put("ampool.locator.host", locatorHost);
options.put("ampool.locator.port", String.valueOf(locatorPort));
/** overwrite existing table, if specified.. **/
SaveMode saveMode = Boolean.getBoolean("overwrite") ? SaveMode.Overwrite : SaveMode.ErrorIfExists;
/** save the dataFrame to Ampool as `tableName' **/
df.write().format("io.ampool").options(options).mode(saveMode).save(tableName);
/** load the data-frame from Ampool `tableName' **/
DataFrame df1 = sqlContext.read().format("io.ampool").options(options).load(tableName);
System.out.println("########## DATA FROM AMPOOL ############");
df1.show();
/** execute KMeans fit on the data loaded from Ampool **/
KMeans kMeans = new KMeans().setK(numClusters).setMaxIter(numIterations)
.setFeaturesCol("features").setPredictionCol("prediction");
KMeansModel model = kMeans.fit(df1);
Vector[] centers = model.clusterCenters();
System.out.println("# Cluster Centers = " + Arrays.toString(centers));
}