This page collects typical usage examples of the Java class org.apache.spark.sql.SQLContext. If you are wondering what the SQLContext class is for, how to use it, or want to see concrete usage examples, the curated class code examples below may help.
The SQLContext class belongs to the org.apache.spark.sql package. Fifteen code examples of the class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
Example 1: createNGramDataFrame
import org.apache.spark.sql.SQLContext; // import the required package/class
/**
 * Creates an n-gram data frame from text lines.
 * @param lines a JavaRDD of text lines
 * @return an n-gram data frame.
 */
DataFrame createNGramDataFrame(JavaRDD<String> lines) {
JavaRDD<Row> rows = lines.map(new Function<String, Row>(){
private static final long serialVersionUID = -4332903997027358601L;
@Override
public Row call(String line) throws Exception {
return RowFactory.create(Arrays.asList(line.split("\\s+")));
}
});
StructType schema = new StructType(new StructField[] {
new StructField("words",
DataTypes.createArrayType(DataTypes.StringType), false,
Metadata.empty()) });
DataFrame wordDF = new SQLContext(jsc).createDataFrame(rows, schema);
// build a bigram language model
NGram transformer = new NGram().setInputCol("words")
.setOutputCol("ngrams").setN(2);
DataFrame ngramDF = transformer.transform(wordDF);
ngramDF.show(10, false);
return ngramDF;
}
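A minimal usage sketch, assuming it is called from inside the same class (which, as the code above suggests, holds a JavaSparkContext field named jsc); the input path is hypothetical:
JavaRDD<String> lines = jsc.textFile("input.txt"); // hypothetical input file of whitespace-separated tokens
DataFrame ngrams = createNGramDataFrame(lines);
// inspect the generated bigrams
ngrams.select("ngrams").show(5, false);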
Example 2: parse
import org.apache.spark.sql.SQLContext; // import the required package/class
/**
 * Parses a list of PoS-tagged sentences, one sentence per line, and writes the result to an
 * output file in the specified output format.
 * @param jsc a JavaSparkContext
 * @param sentences the PoS-tagged input sentences, one per line
 * @param outputFileName the name of the output file
 * @param outputFormat the output format (TEXT or JSON)
 */
public void parse(JavaSparkContext jsc, List<String> sentences, String outputFileName, OutputFormat outputFormat) {
JavaRDD<String> input = jsc.parallelize(sentences);
JavaRDD<Sentence> sents = input.map(new TaggedLineToSentenceFunction());
JavaRDD<DependencyGraph> graphs = sents.map(new ParsingFunction());
JavaRDD<Row> rows = graphs.map(new Function<DependencyGraph, Row>() {
private static final long serialVersionUID = -812004521983071103L;
public Row call(DependencyGraph graph) {
return RowFactory.create(graph.getSentence().toString(), graph.dependencies());
}
});
StructType schema = new StructType(new StructField[]{
new StructField("sentence", DataTypes.StringType, false, Metadata.empty()),
new StructField("dependency", DataTypes.StringType, false, Metadata.empty())
});
SQLContext sqlContext = new SQLContext(jsc);
DataFrame df = sqlContext.createDataFrame(rows, schema);
if (outputFormat == OutputFormat.TEXT)
df.select("dependency").write().text(outputFileName);
else
df.repartition(1).write().json(outputFileName);
}
Example 3: createDataframe
import org.apache.spark.sql.SQLContext; // import the required package/class
public void createDataframe(JavaSparkContext sc, SQLContext sqlContext) {
List<TodoItem> todos = Arrays.asList(
new TodoItem("George", "Buy a new computer", "Shopping"),
new TodoItem("John", "Go to the gym", "Sport"),
new TodoItem("Ron", "Finish the homework", "Education"),
new TodoItem("Sam", "buy a car", "Shopping"),
new TodoItem("Janet", "buy groceries", "Shopping"),
new TodoItem("Andy", "go to the beach", "Fun"),
new TodoItem("Paul", "Prepare lunch", "Cooking")
);
JavaRDD<TodoItem> rdd = sc.parallelize(todos);
DataFrame dataframe = sqlContext.createDataFrame(rdd, TodoItem.class);
sqlContext.registerDataFrameAsTable(dataframe, "todo");
System.out.println("Total number of TodoItems = [" + rdd.count() + "]\n");
}
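A short follow-up sketch, assuming it runs right after createDataframe with the same SQLContext; it simply queries the temporary table registered above:
DataFrame allTodos = sqlContext.sql("SELECT * FROM todo");
allTodos.show();
System.out.println("Rows in todo table = [" + allTodos.count() + "]");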
Example 4: computeJoins
import org.apache.spark.sql.SQLContext; // import the required package/class
public Dataset<Row> computeJoins(SQLContext sqlContext){
// compute all the joins
Dataset<Row> results = node.computeJoinWithChildren(sqlContext);
// select only the requested result
Column [] selectedColumns = new Column[node.projection.size()];
for (int i = 0; i < selectedColumns.length; i++) {
selectedColumns[i]= new Column(node.projection.get(i));
}
// if there is a filter set, apply it
results = filter == null ? results.select(selectedColumns) : results.filter(filter).select(selectedColumns);
// if results are distinct
if(selectDistinct) results = results.distinct();
return results;
}
Example 5: main
import org.apache.spark.sql.SQLContext; // import the required package/class
public static void main(String[] args) throws IOException {
Flags.setFromCommandLineArgs(THE_OPTIONS, args);
// Initialize the Spark configuration.
SparkConf conf = new SparkConf().setAppName("A SECTONG Application: Apache Log Analysis with Spark");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaStreamingContext jssc = new JavaStreamingContext(sc, Flags.getInstance().getSlideInterval());
SQLContext sqlContext = new SQLContext(sc);
// Initialize parameters.
HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(Flags.getInstance().getKafka_topic().split(",")));
HashMap<String, String> kafkaParams = new HashMap<String, String>();
kafkaParams.put("metadata.broker.list", Flags.getInstance().getKafka_broker());
// Get data from the Kafka stream.
JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(jssc, String.class, String.class,
StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);
JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
private static final long serialVersionUID = 5266880065425088203L;
public String call(Tuple2<String, String> tuple2) {
return tuple2._2();
}
});
JavaDStream<ApacheAccessLog> accessLogsDStream = lines.flatMap(line -> {
List<ApacheAccessLog> list = new ArrayList<>();
try {
// map each line to an ApacheAccessLog
list.add(ApacheAccessLog.parseFromLogLine(line));
return list;
} catch (RuntimeException e) {
return list;
}
}).cache();
accessLogsDStream.foreachRDD(rdd -> {
// rdd to DataFrame
DataFrame df = sqlContext.createDataFrame(rdd, ApacheAccessLog.class);
// write to Parquet files
df.write().partitionBy("ipAddress", "method", "responseCode").mode(SaveMode.Append).parquet(Flags.getInstance().getParquetFile());
return null;
});
// Start the streaming server
jssc.start(); // start the computation
jssc.awaitTermination(); // wait for termination
}
Example 6: computeJoinWithChildren
import org.apache.spark.sql.SQLContext; // import the required package/class
public Dataset<Row> computeJoinWithChildren(SQLContext sqlContext){
if (sparkNodeData == null)
this.computeNodeData(sqlContext);
Dataset<Row> currentResult = this.sparkNodeData;
for (Node child: children){
Dataset<Row> childResult = child.computeJoinWithChildren(sqlContext);
List<String> joinVariables = Utils.commonVariables(currentResult.columns(), childResult.columns());
currentResult = currentResult.join(childResult,
scala.collection.JavaConversions.asScalaBuffer(joinVariables).seq());
}
return currentResult;
}
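For reference, a standalone sketch of the same usingColumns-style join written directly against an SQLContext; the registered table names (triples_a, triples_b) and the join column ("s") are hypothetical:
Dataset<Row> left = sqlContext.table("triples_a");
Dataset<Row> right = sqlContext.table("triples_b");
List<String> joinColumns = Arrays.asList("s");
// equi-join on the shared column, converting the Java list to a Scala Seq
Dataset<Row> joined = left.join(right,
        scala.collection.JavaConversions.asScalaBuffer(joinColumns).seq());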
Example 7: SparkDriver
import org.apache.spark.sql.SQLContext; // import the required package/class
public SparkDriver(Properties props) {
SparkConf conf = new SparkConf().setAppName(props.getProperty(MudrodConstants.SPARK_APP_NAME, "MudrodSparkApp")).setIfMissing("spark.master", props.getProperty(MudrodConstants.SPARK_MASTER))
.set("spark.hadoop.validateOutputSpecs", "false").set("spark.files.overwrite", "true");
String esHost = props.getProperty(MudrodConstants.ES_UNICAST_HOSTS);
String esPort = props.getProperty(MudrodConstants.ES_HTTP_PORT);
if (!"".equals(esHost)) {
conf.set("es.nodes", esHost);
}
if (!"".equals(esPort)) {
conf.set("es.port", esPort);
}
conf.set("spark.serializer", KryoSerializer.class.getName());
conf.set("es.batch.size.entries", "1500");
sc = new JavaSparkContext(conf);
sqlContext = new SQLContext(sc);
}
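A hedged construction sketch; the concrete property values (local master, Elasticsearch host and port) are assumptions, only the MudrodConstants keys come from the code above:
Properties props = new Properties();
props.setProperty(MudrodConstants.SPARK_APP_NAME, "MudrodSparkApp");
props.setProperty(MudrodConstants.SPARK_MASTER, "local[*]");   // assumed local mode
props.setProperty(MudrodConstants.ES_UNICAST_HOSTS, "localhost");
props.setProperty(MudrodConstants.ES_HTTP_PORT, "9200");
SparkDriver driver = new SparkDriver(props);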
Example 8: convertToDataFrame
import org.apache.spark.sql.SQLContext; // import the required package/class
/**
 * Converts a dataset file into a DataFrame. TODO: add support for JSON and other formats.
 *
 * @param dataFile the dataset file to convert
 * @param context the JavaSparkContext to use
 * @return the resulting Dataset of rows
 * @throws CantConverException if the dataset file format is not supported
 */
public static Dataset<Row> convertToDataFrame(DataFile dataFile, JavaSparkContext context) throws CantConverException {
SparkSession sparkSession = SparkSession.builder()
.sparkContext(context.sc())
.getOrCreate();
SQLContext sqlContext = new SQLContext(sparkSession);
switch (dataFile.getDataFileType()) {
case CSV:
return csvToDataFrame(dataFile, context, sqlContext);
case LIBSVM:
return libsvmToDataFrame(dataFile, sqlContext);
default:
throw new CantConverException("Unsupported dataset file format");
}
}
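The two helpers above are not shown on this page; below are minimal hypothetical sketches using the Spark 2.x DataFrameReader (the getPath() accessor on DataFile is an assumption, as are the CSV reader options):
private static Dataset<Row> csvToDataFrame(DataFile dataFile, JavaSparkContext context, SQLContext sqlContext) {
    return sqlContext.read()
            .option("header", "true")       // assumption: CSV files carry a header row
            .option("inferSchema", "true")  // let Spark infer column types
            .csv(dataFile.getPath());       // getPath() is a hypothetical accessor
}

private static Dataset<Row> libsvmToDataFrame(DataFile dataFile, SQLContext sqlContext) {
    // the built-in "libsvm" data source parses label/feature-index:value lines
    return sqlContext.read().format("libsvm").load(dataFile.getPath());
}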
Example 9: main
import org.apache.spark.sql.SQLContext; // import the required package/class
/**
* @param args
* @throws SQLException
*/
public static void main(String[] args) throws SQLException {
if (args.length == 0) {
System.out.println("Usage: ImpalaSparkJDBC <url> <tableName>");
System.out.println(" (secure) jdbc:impala://impala-host:21050/;AuthMech=1;KrbRealm=realm;KrbHostFQDN=krbHost;KrbServiceName=impala");
System.out.println(" (insecure) jdbc:hive2://impala-host:21050/;auth=noSasl");
System.exit(1);
}
Properties prop = new Properties();
prop.setProperty("driver","com.cloudera.impala.jdbc41.Driver");
System.setProperty("java.security.auth.login.config", "jaas.conf");
System.setProperty("sun.security.jgss.debug","true");
System.setProperty("javax.security.auth.useSubjectCredsOnly","false");
SparkConf sparkConf = new SparkConf().setAppName("ImpalaJDBC");
SparkContext sc = new SparkContext(sparkConf);
SQLContext sqlContext = SQLContext.getOrCreate(sc);
sqlContext.read().jdbc(args[0], args[1], prop).show();
}
Example 10: createRelation
import org.apache.spark.sql.SQLContext; // import the required package/class
@Override
public SparkRDF4JSparqlRelation createRelation(SQLContext sqlContext,
scala.collection.immutable.Map<String, String> scalaParameters, StructType schema) {
Map<String, String> parameters = JavaConversions.asJavaMap(scalaParameters);
String service = Optional.ofNullable(parameters.get("service")).orElseThrow(() -> new RuntimeException(
"Spark RDF4J Sparql requires a SPARQL 'service' to be specified in the parameters"));
String query = Optional.ofNullable(parameters.get("query")).orElseThrow(() -> new RuntimeException(
"Spark RDF4J Sparql requires a 'query' to be specified in the parameters"));
try {
ParsedQuery parsedQuery = QueryParserUtil.parseQuery(QueryLanguage.SPARQL, query, null);
if(!(parsedQuery instanceof ParsedTupleQuery)) {
throw new RuntimeException("Spark RDF4J can only be used with Tuple (Select) queries right now.");
}
return new SparkRDF4JSparqlRelation(service, parsedQuery, schema, sqlContext);
} catch (MalformedQueryException e) {
throw new RuntimeException("Query was not valid SPARQL", e);
}
}
Example 11: SparkRDF4JSparqlRelation
import org.apache.spark.sql.SQLContext; // import the required package/class
/**
* Constructor for a new {@link SparkRDF4JSparqlRelation} based on the given
* service, query, schema, and context.
*
* @param service
* The URL to the SPARQL service to be used for this query.
* @param parsedQuery
* The preparsed SPARQL query.
* @param schema
* The schema to use for the results of the query.
* @param sqlContext
* The context for the query.
*/
SparkRDF4JSparqlRelation(String service, ParsedQuery parsedQuery, StructType schema, SQLContext sqlContext) {
this.serviceField = Objects.requireNonNull(service);
this.queryField = Objects.requireNonNull(parsedQuery);
this.schemaField = Optional.ofNullable(schema).orElseGet(() -> {
// These bindings are guaranteed to be present and are not nullable
Set<String> assuredBindingNames = this.queryField.getTupleExpr().getAssuredBindingNames();
// If bindings are only in the following they are nullable
Set<String> bindingNames = this.queryField.getTupleExpr().getBindingNames();
StructType result = new StructType();
for(String binding : bindingNames) {
result = result.add(binding, DataTypes.StringType, !(assuredBindingNames.contains(binding)));
}
return result;
});
this.sqlContextField = sqlContext;
}
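A hedged illustration of the default-schema rule in that constructor: with the query below (the endpoint URL and query text are hypothetical), ?s and ?label are assured bindings and become non-nullable columns, while ?comment only appears inside the OPTIONAL block and would be marked nullable. Passing null as the schema triggers the derivation shown above.
String q = "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> "
        + "SELECT ?s ?label ?comment WHERE { ?s rdfs:label ?label . "
        + "OPTIONAL { ?s rdfs:comment ?comment } }";
ParsedQuery parsedQuery = QueryParserUtil.parseQuery(QueryLanguage.SPARQL, q, null);
// hypothetical endpoint; null schema makes the constructor derive it from the query
SparkRDF4JSparqlRelation relation =
        new SparkRDF4JSparqlRelation("http://example.org/sparql", parsedQuery, null, sqlContext);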
Example 12: createRelation
import org.apache.spark.sql.SQLContext; // import the required package/class
@Override
public BaseRelation createRelation(SQLContext arg0, Map<String, String> arg1) {
log.debug("-> createRelation()");
java.util.Map<String, String> javaMap = scala.collection.JavaConverters
.mapAsJavaMapConverter(arg1).asJava();
SubStringCounterRelation br = new SubStringCounterRelation();
br.setSqlContext(arg0);
for (java.util.Map.Entry<String, String> entry : javaMap.entrySet()) {
String key = entry.getKey();
String value = entry.getValue();
log.debug("[{}] --> [{}]", key, value);
if (key.compareTo(K.PATH) == 0) {
br.setFilename(value);
} else if (key.startsWith(K.COUNT)) {
br.addCriteria(value);
}
}
return br;
}
Example 13: getBodyContent
import org.apache.spark.sql.SQLContext; // import the required package/class
private DataFrame getBodyContent(SQLContext sqlContxt, String jsonPath, String bodyColumn,
String whereClause, String label) {
DataFrame df = sqlContxt.read().json(jsonPath);
df.registerTempTable("news");
df.printSchema();
String sql = "SELECT\n"
+ " generateId('') AS id,\n"
+ " " + bodyColumn + " AS content,\n"
+ " CAST(" + label + " AS Double) AS label\n"
+ "FROM news\n"
+ "WHERE (trim(nvl(" + bodyColumn + " , '')) != '')\n"
+ whereClause;
DataFrame newsData = sqlContxt.sql(sql);
return newsData;
}
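Note that the SQL above calls a UDF named generateId, which must have been registered on the SQLContext beforehand. A minimal registration sketch; the UUID-based body is an assumption (requires org.apache.spark.sql.api.java.UDF1 and java.util.UUID):
sqlContxt.udf().register("generateId", new UDF1<String, String>() {
    @Override
    public String call(String prefix) {
        // hypothetical implementation: prefix plus a random UUID
        return prefix + UUID.randomUUID().toString();
    }
}, DataTypes.StringType);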
Example 14: main
import org.apache.spark.sql.SQLContext; // import the required package/class
public static void main(String[] args) throws ParseException {
final NewsConfiguration conf = new NewsConfiguration(args);
SparkConf sparkConf = new SparkConf().setMaster("local[*]").setAppName("News Classificator");
try (JavaSparkContext sc = new JavaSparkContext(sparkConf)) {
SQLContext sqlContxt = new SQLContext(sc);
DatasetLoader data = new DatasetLoader(sqlContxt, new double[] {0.7, 0.}, conf);
DataFrame allData = data.getBazikiLeaks().unionAll(data.getCredibleData()).unionAll(data.getUnreliableData()).unionAll(data.getValidationSet());
NewsCredibilityMain
.getCommonFeatures(sqlContxt, allData, "tokens")
.select("label", "commonfeatures")
.repartition(1)
.write()
.format("com.databricks.spark.csv")
.option("header", "true")
.save("/home/momchil/Documents/MasterThesis/features/commonfeatures.csv");
}
}
Example 15: loadObservations
import org.apache.spark.sql.SQLContext; // import the required package/class
private static DataFrame loadObservations(JavaSparkContext sparkContext, SQLContext sqlContext,
String path) {
JavaRDD<Row> rowRdd = sparkContext.textFile(path).map((String line) -> {
String[] tokens = line.split("\t");
ZonedDateTime dt = ZonedDateTime.of(Integer.parseInt(tokens[0]),
Integer.parseInt(tokens[1]), Integer.parseInt(tokens[2]), 0, 0, 0, 0,
ZoneId.systemDefault());
String symbol = tokens[3];
double price = Double.parseDouble(tokens[5]);
return RowFactory.create(Timestamp.from(dt.toInstant()), symbol, price);
});
List<StructField> fields = new ArrayList<>();
fields.add(DataTypes.createStructField("timestamp", DataTypes.TimestampType, true));
fields.add(DataTypes.createStructField("symbol", DataTypes.StringType, true));
fields.add(DataTypes.createStructField("price", DataTypes.DoubleType, true));
StructType schema = DataTypes.createStructType(fields);
return sqlContext.createDataFrame(rowRdd, schema);
}
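A hedged usage sketch; the input path is hypothetical and the tab-separated layout (year, month, day, symbol, ..., price) is inferred from the parsing code above:
JavaSparkContext sparkContext = new JavaSparkContext(new SparkConf().setAppName("Observations"));
SQLContext sqlContext = new SQLContext(sparkContext);
DataFrame observations = loadObservations(sparkContext, sqlContext, "data/ticker.tsv"); // hypothetical path
observations.registerTempTable("observations");
// average price per symbol over the loaded observations
sqlContext.sql("SELECT symbol, AVG(price) AS avg_price FROM observations GROUP BY symbol").show();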