This article collects typical usage examples of the Java method org.apache.spark.sql.SQLContext.sql. If you are wondering how SQLContext.sql is used in Java, what it does, or what real-world calls look like, the curated method examples below may help. You can also explore further usage examples of the enclosing class, org.apache.spark.sql.SQLContext.
The following shows 15 code examples of the SQLContext.sql method, sorted by popularity by default.
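Before the individual examples, here is a minimal sketch (not taken from any of the projects below) of the pattern most of them follow: load data into a DataFrame, register it as a temporary table, and query it through SQLContext.sql. It assumes the Spark 1.x DataFrame API used throughout this page; the file path, table name, and column names are placeholders.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

public class SqlContextSqlSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("SqlContextSqlSketch").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);
        // Load a JSON file into a DataFrame (the path is a placeholder).
        DataFrame people = sqlContext.read().json("/tmp/people.json");
        // Expose the DataFrame to SQL under a temporary table name.
        people.registerTempTable("people");
        // Run a query; the result is again a DataFrame.
        DataFrame adults = sqlContext.sql("SELECT name, age FROM people WHERE age >= 18");
        adults.show();
        sc.close();
    }
}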
Example 1: getBodyContent
import org.apache.spark.sql.SQLContext; // import the package/class the method depends on
private DataFrame getBodyContent(SQLContext sqlContxt, String jsonPath, String bodyColumn,
        String whereClause, String label) {
    DataFrame df = sqlContxt.read().json(jsonPath);
    df.registerTempTable("news");
    df.printSchema();

    String sql = "SELECT\n"
            + " generateId('') AS id,\n"
            + " " + bodyColumn + " AS content,\n"
            + " CAST(" + label + " AS Double) AS label\n"
            + "FROM news\n"
            + "WHERE (trim(nvl(" + bodyColumn + " , '')) != '')\n"
            + whereClause;
    DataFrame newsData = sqlContxt.sql(sql);

    return newsData;
}
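A possible call site for the method above, shown only as a sketch: the JSON path, column names, filter, and label are placeholders, and it assumes a generateId UDF is registered on the SQLContext (the SQL string above requires one, but the registration is not part of the original snippet).

import java.util.UUID;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.types.DataTypes;

// Register the generateId UDF the query relies on (assumed, not shown in the original).
sqlContext.udf().register("generateId",
        (UDF1<String, String>) prefix -> UUID.randomUUID().toString(),
        DataTypes.StringType);
// Hypothetical arguments: "body" is selected AS content, "is_fake" is cast to the label,
// and the extra filter is appended after the built-in non-empty-body check.
DataFrame labeled = getBodyContent(sqlContext, "/data/news/*.json", "body",
        "AND (lang = 'bg')", "is_fake");
labeled.show(5);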
Example 2: getLinesFromDASTable
import org.apache.spark.sql.SQLContext; // import the package/class the method depends on
public static JavaRDD<String> getLinesFromDASTable(String tableName, int tenantId, JavaSparkContext sparkContext)
        throws AnalyticsTableNotAvailableException, AnalyticsException {
    JavaRDD<String> lines;
    String tableSchema = extractTableSchema(tableName, tenantId);
    SQLContext sqlCtx = new SQLContext(sparkContext);
    sqlCtx.sql("CREATE TEMPORARY TABLE ML_REF USING org.wso2.carbon.analytics.spark.core.sources.AnalyticsRelationProvider "
            + "OPTIONS ("
            + "tenantId \""
            + tenantId
            + "\", "
            + "tableName \""
            + tableName
            + "\", "
            + "schema \""
            + tableSchema + "\"" + ")");
    DataFrame dataFrame = sqlCtx.sql("select * from ML_REF");
    // Additional auto-generated column "_timestamp" needs to be dropped because it is not in the schema.
    JavaRDD<Row> rows = dataFrame.drop("_timestamp").javaRDD();
    lines = rows.map(new RowsToLines.Builder().separator(CSVFormat.RFC4180.getDelimiter() + "").build());
    return lines;
}
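The same DDL-through-sql() pattern works with Spark's built-in relation providers, not only the WSO2 AnalyticsRelationProvider above. A minimal sketch against a Parquet file (the path and table name are placeholders):

SQLContext sqlCtx = new SQLContext(sparkContext);
// Register a Parquet file as a temporary table via SQL DDL...
sqlCtx.sql("CREATE TEMPORARY TABLE events "
        + "USING org.apache.spark.sql.parquet "
        + "OPTIONS (path \"/data/events.parquet\")");
// ...and query it like any other table.
DataFrame events = sqlCtx.sql("SELECT * FROM events");
events.show();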
Example 3: main
import org.apache.spark.sql.SQLContext; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        throw new Exception("Usage LoadHive sparkMaster tbl");
    }
    String master = args[0];
    String tbl = args[1];

    JavaSparkContext sc = new JavaSparkContext(
            master, "loadhive", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    SQLContext sqlCtx = new SQLContext(sc);
    DataFrame rdd = sqlCtx.sql("SELECT key, value FROM src");
    JavaRDD<Integer> squaredKeys = rdd.toJavaRDD().map(new SquareKey());
    List<Integer> result = squaredKeys.collect();
    for (Integer elem : result) {
        System.out.println(elem);
    }
}
Example 4: computeNodeData
import org.apache.spark.sql.SQLContext; // import the package/class the method depends on
public void computeNodeData(SQLContext sqlContext) {
    if (tableName == null) {
        System.err.println("The predicate does not have a VP table: " + triplePattern.predicate);
        return;
    }
    StringBuilder query = new StringBuilder("SELECT DISTINCT ");
    // SELECT
    if (triplePattern.subjectType == ElementType.VARIABLE &&
            triplePattern.objectType == ElementType.VARIABLE)
        query.append("s AS " + Utils.removeQuestionMark(triplePattern.subject) +
                ", o AS " + Utils.removeQuestionMark(triplePattern.object) + " ");
    else if (triplePattern.subjectType == ElementType.VARIABLE)
        query.append("s AS " + Utils.removeQuestionMark(triplePattern.subject));
    else if (triplePattern.objectType == ElementType.VARIABLE)
        query.append("o AS " + Utils.removeQuestionMark(triplePattern.object));
    // FROM
    query.append(" FROM ");
    query.append("vp_" + tableName);
    // WHERE
    if (triplePattern.objectType == ElementType.CONSTANT || triplePattern.subjectType == ElementType.CONSTANT)
        query.append(" WHERE ");
    if (triplePattern.objectType == ElementType.CONSTANT)
        query.append(" o='" + triplePattern.object + "' ");
    if (triplePattern.subjectType == ElementType.CONSTANT)
        query.append(" s='" + triplePattern.subject + "' ");

    this.sparkNodeData = sqlContext.sql(query.toString());
}
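To make the string-building concrete, consider a hypothetical pattern ?s <knows> ?o whose predicate maps to the vertical-partition table name knows. Modulo whitespace, the method would hand sqlContext.sql roughly the following strings:

// Hypothetical pattern ?s <knows> ?o, with tableName = "knows".
// Both subject and object are variables, so no WHERE clause is added:
String generated = "SELECT DISTINCT s AS s, o AS o FROM vp_knows";
// With a constant object instead (?s <knows> Alice):
String generatedConstantObject = "SELECT DISTINCT s AS s FROM vp_knows WHERE o='Alice'";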
Example 5: runScript
import org.apache.spark.sql.SQLContext; // import the package/class the method depends on
/**
 * Splits the bundled hql script into multiple expressions using the ScriptSplitter utility class.
 * Each expression is run on the provided HiveContext.
 *
 * @param sqlContext an SQLContext, as provided by Spark through the TestHiveServer TestRule, used to run hql expressions
 */
@Override
public void runScript(SQLContext sqlContext) {
    String[] expressions = ScriptSplitter.splitScriptIntoExpressions(script);
    for (String expression : expressions) {
        sqlContext.sql(expression);
    }
}
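The split is needed because SQLContext.sql executes exactly one statement per call. A naive sketch of the same idea against a HiveContext-backed SQLContext (the real ScriptSplitter presumably also handles comments and semicolons inside string literals, which this does not):

String script = "CREATE TABLE IF NOT EXISTS staging_events (id INT); DROP TABLE IF EXISTS old_events";
for (String statement : script.split(";")) {
    if (!statement.trim().isEmpty()) {
        sqlContext.sql(statement.trim());
    }
}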
Example 6: runScriptReturnResults
import org.apache.spark.sql.SQLContext; // import the package/class the method depends on
/**
 * Splits the bundled hql script into multiple expressions using the ScriptSplitter utility class.
 * Each expression is run on the provided HiveContext.
 *
 * @param sqlContext an SQLContext, as provided by Spark through the TestHiveServer TestRule, used to run hql expressions
 * @return the row results acquired from the last executed expression
 */
@Override
public List<Row> runScriptReturnResults(SQLContext sqlContext) {
    String[] expressions = ScriptSplitter.splitScriptIntoExpressions(script);
    for (int i = 0; i < expressions.length - 1; i++) {
        String expression = expressions[i];
        sqlContext.sql(expression);
    }
    List<Row> rows = sqlContext.sql(expressions[expressions.length - 1]).collectAsList();
    return rows;
}
Example 7: main
import org.apache.spark.sql.SQLContext; // import the package/class the method depends on
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf()
            .setAppName("ReadFromMapRDB-DF-Java")
            .setMaster("local[1]");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    SQLContext sqlContext = new SQLContext(jsc);

    Configuration config = null;
    try {
        config = HBaseConfiguration.create();
        config.set(TableInputFormat.INPUT_TABLE, "/apps/tests/users_profiles");
    } catch (Exception ce) {
        ce.printStackTrace();
    }

    JavaPairRDD<ImmutableBytesWritable, Result> hBaseRDD =
            jsc.newAPIHadoopRDD(config, TableInputFormat.class, ImmutableBytesWritable.class, Result.class);

    // convert each HBase Result into a pair of row key / User bean
    JavaPairRDD<String, User> rowPairRDD = hBaseRDD.mapToPair(
            new PairFunction<Tuple2<ImmutableBytesWritable, Result>, String, User>() {
                @Override
                public Tuple2<String, User> call(
                        Tuple2<ImmutableBytesWritable, Result> entry) throws Exception {
                    Result r = entry._2;
                    String rowKey = Bytes.toString(r.getRow());

                    User user = new User();
                    user.setRowkey(rowKey);
                    user.setFirstName(Bytes.toString(r.getValue(Bytes.toBytes("default"), Bytes.toBytes("first_name"))));
                    user.setLastName(Bytes.toString(r.getValue(Bytes.toBytes("default"), Bytes.toBytes("last_name"))));
                    return new Tuple2<String, User>(rowKey, user);
                }
            });

    System.out.println("************ RDD *************");
    System.out.println(rowPairRDD.count());
    System.out.println(rowPairRDD.keys().collect());
    System.out.println(rowPairRDD.values().collect());

    System.out.println("************ DF *************");
    DataFrame df = sqlContext.createDataFrame(rowPairRDD.values(), User.class);
    System.out.println(df.count());
    System.out.println(df.schema());
    df.show();

    System.out.println("************ DF with SQL *************");
    df.registerTempTable("USER_TABLE");
    DataFrame dfSql = sqlContext.sql("SELECT * FROM USER_TABLE WHERE firstName = 'Ally' ");
    System.out.println(dfSql.count());
    System.out.println(dfSql.schema());
    dfSql.show();

    jsc.close();
}
Example 8: getTrainingDataset
import org.apache.spark.sql.SQLContext; // import the package/class the method depends on
public static DataFrame getTrainingDataset(SQLContext sqlContxt) {
    DataFrame df = sqlContxt.read().json("/home/momchil/Documents/MasterThesis/dataset/w2v/long-abstracts_bg.json");
    df.registerTempTable("dbpedia");
    df.printSchema();

    String sqlText =
            "SELECT abstract as content\n"
            + "FROM dbpedia\n"
            + "WHERE abstract IS NOT NULL\n"
            + "LIMIT 101444"; // 171444
    df = sqlContxt.sql(sqlText);

    return df;
}
Example 9: main
import org.apache.spark.sql.SQLContext; // import the package/class the method depends on
public static void main(String[] args) {
    SparkConf conf = new SparkConf();
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    JavaSparkContext sc = new JavaSparkContext("local", "JavaAPISuite", conf);
    SQLContext sqlContext = new SQLContext(sc);

    // Creates a DataFrame from a specified file
    DataFrame plays = sqlContext.load("output", "com.databricks.spark.avro");
    // Apply the schema to the RDD.
    sqlContext.registerDataFrameAsTable(plays, "playbyplay");

    // Run the query
    DataFrame join = sqlContext
            .sql("select playtype, pertotalstable.totalperplay, totalstable.total, ((pertotalstable.totalperplay / totalstable.total) * 100) as percentage from " +
                    "(select playtype, count(*) as totalperplay from playbyplay where rooftype <> \"None\" and prcp <= 0 group by playtype) pertotalstable " +
                    "full outer join " +
                    "(select count(*) as total from playbyplay where rooftype <> \"None\" and prcp <= 0) totalstable " +
                    "order by playtype");

    // Output the query's rows
    join.javaRDD().collect().forEach((Row row) -> {
        System.out.println("Result:" + row.toString());
    });
}
Example 10: runSQL
import org.apache.spark.sql.SQLContext; // import the package/class the method depends on
private static void runSQL(SQLContext sqlContext, String query) {
    // Run the query
    DataFrame df = sqlContext.sql(query);
    // Output the query's rows
    df.javaRDD().collect().forEach((Row row) -> {
        System.out.println("Result:" + row.toString());
    });
}
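A possible way to use the helper above, shown only as a sketch; the file path, table name, and columns are placeholders.

DataFrame games = sqlContext.read().json("/data/games.json");
sqlContext.registerDataFrameAsTable(games, "games");
runSQL(sqlContext, "SELECT team, count(*) AS plays FROM games GROUP BY team");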
Example 11: main
import org.apache.spark.sql.SQLContext; // import the package/class the method depends on
public static void main(String[] args) throws IOException {
    SparkConf conf = new SparkConf().setAppName("SQLQueryBAM");
    JavaSparkContext sc = new JavaSparkContext(conf);
    SQLContext sqlContext = new HiveContext(sc.sc());

    Options options = new Options();
    Option opOpt = new Option("out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS.");
    Option queryOpt = new Option("query", true, "SQL query string.");
    Option baminOpt = new Option("in", true, "");

    options.addOption(opOpt);
    options.addOption(queryOpt);
    options.addOption(baminOpt);
    CommandLineParser parser = new BasicParser();
    CommandLine cmd = null;
    try {
        cmd = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Parsing failed. Reason: " + exp.getMessage());
    }

    String bwaOutDir = cmd.hasOption("out") ? cmd.getOptionValue("out") : null;
    String query = cmd.hasOption("query") ? cmd.getOptionValue("query") : null;
    String bamin = cmd.hasOption("in") ? cmd.getOptionValue("in") : null;

    sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true);

    // Read BAM/SAM from HDFS
    JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(bamin, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration());
    // Map to SAMRecord RDD
    JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get());
    JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag()));

    Dataset<Row> samDF = sqlContext.createDataFrame(rdd, MyAlignment.class);
    samDF.registerTempTable(tablename);
    if (query != null) {
        // Save as a Parquet file
        Dataset<Row> df2 = sqlContext.sql(query);
        df2.show(100, false);

        if (bwaOutDir != null)
            df2.write().parquet(bwaOutDir);
    } else {
        if (bwaOutDir != null)
            samDF.write().parquet(bwaOutDir);
    }

    sc.stop();
}
Example 12: computeNodeData
import org.apache.spark.sql.SQLContext; // import the package/class the method depends on
public void computeNodeData(SQLContext sqlContext) {
    StringBuilder query = new StringBuilder("SELECT ");
    ArrayList<String> whereConditions = new ArrayList<String>();
    ArrayList<String> explodedColumns = new ArrayList<String>();

    // subject
    if (tripleGroup.get(0).subjectType == ElementType.VARIABLE)
        query.append("s AS " + Utils.removeQuestionMark(tripleGroup.get(0).subject) + ",");

    // objects
    for (TriplePattern t : tripleGroup) {
        String columnName = stats.findTableName(t.predicate.toString());
        if (columnName == null) {
            System.err.println("This column does not exist: " + t.predicate);
            return;
        }
        if (t.subjectType == ElementType.CONSTANT) {
            whereConditions.add("s='" + t.subject + "'");
        }
        if (t.objectType == ElementType.CONSTANT) {
            if (t.isComplex)
                whereConditions
                        .add("array_contains(" + columnName + ", '" + t.object + "')");
            else
                whereConditions.add(columnName + "='" + t.object + "'");
        } else if (t.isComplex) {
            query.append(" P" + columnName + " AS " + Utils.removeQuestionMark(t.object) + ",");
            explodedColumns.add(columnName);
        } else {
            query.append(
                    " " + columnName + " AS " + Utils.removeQuestionMark(t.object) + ",");
            whereConditions.add(columnName + " IS NOT NULL");
        }
    }

    // delete last comma
    query.deleteCharAt(query.length() - 1);

    // TODO: parameterize the name of the table
    query.append(" FROM property_table ");

    for (String explodedColumn : explodedColumns) {
        query.append("\n lateral view explode(" + explodedColumn + ") exploded" + explodedColumn + " AS P"
                + explodedColumn);
    }

    if (!whereConditions.isEmpty()) {
        query.append(" WHERE ");
        query.append(String.join(" AND ", whereConditions));
    }

    this.sparkNodeData = sqlContext.sql(query.toString());
}
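To make the generated string concrete, take a hypothetical triple group { ?x :name ?n . ?x :hobby ?h } where findTableName maps :name to the single-valued column name and :hobby to the multi-valued (isComplex) column hobby. Modulo whitespace, the method would submit roughly this to sqlContext.sql:

// The multi-valued column is exploded via a lateral view and selected under the
// prefixed alias Phobby; the single-valued column only adds an IS NOT NULL filter.
String generated = "SELECT s AS x, name AS n, Phobby AS h FROM property_table"
        + " lateral view explode(hobby) explodedhobby AS Phobby"
        + " WHERE name IS NOT NULL";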
Example 13: getMovieRecommendations
import org.apache.spark.sql.SQLContext; // import the package/class the method depends on
public static void getMovieRecommendations(int userId, int num, ResultSet[] resultSets) {
    try {
        JavaSparkContext spliceSparkContext = SpliceSpark.getContext();
        SQLContext sqlContext = new SQLContext(spliceSparkContext);
        Connection conn = DriverManager.getConnection("jdbc:default:connection");

        // Read the latest model's path from the database
        String modelPath = "tmp/movielensRecommender";
        Statement stmt = conn.createStatement();
        ResultSet rs = stmt.executeQuery("SELECT model_path from MOVIELENS.MODEL ORDER BY create_date DESC {limit 1}");
        if (rs.next()) {
            modelPath = rs.getString(1);
        }
        rs.close();
        stmt.close();

        // Load the model
        MatrixFactorizationModel sameModel = MatrixFactorizationModel.load(spliceSparkContext.sc(), modelPath);
        // Get recommendations for the specified user and number of items
        Rating[] recom = sameModel.recommendProducts(userId, num);

        // Load the Movies table to get the details of the movies
        Map<String, String> options = new HashMap<String, String>();
        options.put("driver", "com.splicemachine.db.jdbc.ClientDriver");
        options.put("url", "jdbc:splice://localhost:1527/splicedb;user=splice;password=admin;useSpark=true");
        options.put("dbtable", "MOVIELENS.MOVIES");
        DataFrame moviesDF = sqlContext.read().format("jdbc").options(options).load();
        moviesDF.registerTempTable("TEMP_MOVIES");

        // Collect the movie ids from the recommendations
        StringBuffer sFilter = new StringBuffer();
        for (Rating rate : recom) {
            if (sFilter.length() > 0)
                sFilter.append(", ");
            sFilter.append(rate.product());
        }

        // Apply the filter to select only the recommended movies
        DataFrame filteredMoviesDF = sqlContext.sql("Select * from TEMP_MOVIES where MOVIE_ID in (" + sFilter.toString() + ")");
        List<Row> recMovieList = filteredMoviesDF.collectAsList();

        // Collect the details needed to build the result set to return
        int movId = 0;
        String movTitle = "";
        List<ExecRow> rows = new ArrayList<>();
        for (Row movie : recMovieList) {
            ExecRow row = new ValueRow(9);
            row.setColumn(1, new SQLInteger(movie.getInt(0)));
            row.setColumn(2, new SQLVarchar(movie.getString(1)));
            rows.add(row);
        }

        // Convert the list of ExecRows to a result set
        Activation lastActivation = ((EmbedConnection) conn).getLanguageConnection().getLastActivation();
        IteratorNoPutResultSet resultsToWrap = new IteratorNoPutResultSet(rows, MOVIE_RECOMMENDATIONS_COLUMNS, lastActivation);
        resultsToWrap.openCore();
        // Set the return result set
        resultSets[0] = new EmbedResultSet40((EmbedConnection) conn, resultsToWrap, false, null, true);
    } catch (StandardException e) {
        LOG.error("Exception in getMovieRecommendations", e);
        e.printStackTrace();
    } catch (SQLException sqle) {
        LOG.error("Exception in getMovieRecommendations", sqle);
        sqle.printStackTrace();
    }
}
Example 14: CreateQuery
import org.apache.spark.sql.SQLContext; // import the package/class the method depends on
/**
 * Heuristic naive SPARK implementation of transitive closure.
 *
 * @param oldTableName
 * @param newTableName
 * @param whereExpression
 * @param joinOnExpression
 * @param kleeneDepth1
 * @param kleeneDepth2
 * @param kleeneType
 * @param selectionPart
 */
public static void CreateQuery(String[] oldTableName, String newTableName, String whereExpression,
        ArrayList<String> joinOnExpression, int kleeneDepth1, int kleeneDepth2, String kleeneType,
        String[] selectionPart) {
    DataFrame resultFrame = null;
    SQLContext sqlContext = AppSpark.sqlContext;
    int numberOfLines = -1;

    KleeneFixed.CreateQuery(oldTableName, newTableName, whereExpression, joinOnExpression, 0, 3, kleeneType,
            selectionPart);
    String insertTmp = KleeneFixed.baseQuery;
    resultFrame = sqlContext.sql(insertTmp);
    resultFrame.registerTempTable("temp1");

    KleeneFixed.CreateQuery(oldTableName, newTableName, whereExpression, joinOnExpression, 4, -1, kleeneType,
            selectionPart);
    String minusOperation = "SELECT DISTINCT t.subject, t.predicate, t.object FROM (" + KleeneFixed.baseQuery
            + ") t"
            + " LEFT JOIN temp1 ON t.subject = temp1.subject AND t.predicate = temp1.predicate AND t.object = "
            + " temp1.object WHERE temp1.predicate IS NULL ";
    baseQuery = baseQuery + minusOperation + "\n";
    resultFrame = sqlContext.sql(minusOperation);
    resultFrame.registerTempTable("temp2");

    String resultsChecking = "SELECT COUNT(*) AS count FROM temp2";
    resultFrame = sqlContext.sql(resultsChecking);
    Row[] results = resultFrame.collect();
    numberOfLines = (int) results[0].getLong(0);
    baseQuery = baseQuery + resultsChecking + "\n";
    System.out.println("# of new lines " + numberOfLines);

    if (numberOfLines != 0) {
        KleeneSemiNaiveSPARK.CreateQuery(oldTableName, newTableName, joinOnExpression, kleeneType, selectionPart,
                kleeneDepth1, "temp2");
    } else {
        resultFrame = sqlContext.sql("SELECT subject, predicate, object FROM temp1");
        QueryStruct.fillStructure(oldTableName, newTableName, baseQuery, "none", "none");
        ResultStruct.fillStructureSpark(resultFrame);
    }
}
Example 15: partition
import org.apache.spark.sql.SQLContext; // import the package/class the method depends on
/**
 * SPARK vertical partitioner.
 *
 * @param inputPath
 * @param outputPath
 */
public static void partition(String inputPath, String outputPath) {
    long lStartTime = System.nanoTime();

    SparkConf sparkConf = new SparkConf().setAppName("JavaSparkSQL").setMaster("local");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    SQLContext sqlContext = new SQLContext(ctx);

    System.out.println("=== Data source: RDD ===");
    @SuppressWarnings("serial")
    JavaRDD<RDFgraph> RDF = ctx.textFile(inputPath + "/*").map(new Function<String, RDFgraph>() {
        @Override
        public RDFgraph call(String line) {
            String[] parts = line.split(" (?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)");
            RDFgraph entry = new RDFgraph();
            if (parts.length > 2) {
                entry.setSubject(parts[0]);
                entry.setPredicate(parts[1]);
                entry.setObject(parts[2]);
            }
            return entry;
        }
    });

    DataFrame rawGraph = sqlContext.createDataFrame(RDF, RDFgraph.class);
    rawGraph.registerTempTable("rawGraph");

    int numPredicates = sqlContext
            .sql("SELECT predicate FROM rawGraph WHERE subject != '@prefix' GROUP BY predicate").collect().length;
    DataFrame pureGraph = sqlContext
            .sql("SELECT subject, predicate, object FROM rawGraph WHERE subject != '@prefix'");

    DataFrame partitionedGraph = pureGraph.repartition(numPredicates, new Column("predicate"));
    partitionedGraph.write().parquet(outputPath);

    long lEndTime = System.nanoTime();
    long difference = lEndTime - lStartTime;
    System.out.println("Partitioning complete.\nElapsed milliseconds: " + difference / 1000000);
}
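To inspect the partitioner's output, one could read the Parquet files back and query them through SQLContext.sql again; a sketch with a placeholder path, assuming an existing JavaSparkContext ctx:

SQLContext sqlContext = new SQLContext(ctx);
// Read the partitioned graph back (the path is whatever was passed as outputPath).
DataFrame graph = sqlContext.read().parquet("/data/partitioned-graph");
graph.registerTempTable("graph");
// Count triples per predicate to inspect the partitioning.
sqlContext.sql("SELECT predicate, count(*) AS triples FROM graph GROUP BY predicate").show();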