This article collects typical usage examples of the Java method org.apache.spark.sql.DataFrame.count. If you are wondering what DataFrame.count does, how to call it, or want to see it in real code, the curated method examples below may help. You can also read further about the enclosing class, org.apache.spark.sql.DataFrame.
The following presents 14 code examples of the DataFrame.count method, ordered by popularity by default.
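Before diving into the examples, here is a minimal, self-contained sketch of the method itself. This is illustrative only: it assumes Spark 1.4+ with the 1.x API (where DataFrame is still a concrete class), a local master, and a placeholder people.json input file.
// A minimal sketch of DataFrame.count() (Spark 1.x API). The master URL and the
// people.json path are placeholders used purely for illustration.
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

public class CountExample {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("count-example").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);
        // count() is an action: it triggers the computation and returns the number of rows as a long
        DataFrame df = sqlContext.read().json("people.json");
        long rows = df.count();
        System.out.println("Row count: " + rows);
        sc.stop();
    }
}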
Example 1: tpch14
import org.apache.spark.sql.DataFrame; // import the package/class that this method depends on
public void tpch14() {
int year_14 = 1993;
int monthOffset_14 = rand.nextInt(60);
SimpleDate d14_1 = new SimpleDate(year_14 + monthOffset_14 / 12, monthOffset_14 % 12 + 1, 1);
monthOffset_14 += 1;
SimpleDate d14_2 = new SimpleDate(year_14 + monthOffset_14 / 12, monthOffset_14 % 12 + 1, 1);
String lineitemPredicate = "l_shipdate >= \"" + d14_1 + "\" and l_shipdate < \"" + d14_2 + "\"";
long start = System.currentTimeMillis();
System.out.println("SELECT * "
+ "FROM lineitem JOIN part ON l_partkey = p_partkey "
+ "WHERE " + lineitemPredicate);
DataFrame df = sqlContext.sql("SELECT * "
+ "FROM lineitem JOIN part ON l_partkey = p_partkey "
+ "WHERE " + lineitemPredicate);
long result = df.count(); // 76860
System.out.println("RES: Time Taken: " + (System.currentTimeMillis() - start) + "; Result: " + result);
}
Example 2: tpch6
import org.apache.spark.sql.DataFrame; // import the package/class that this method depends on
public void tpch6() {
int year_6 = 1993 + rand.nextInt(5);
SimpleDate d6_1 = new SimpleDate(year_6, 1, 1);
SimpleDate d6_2 = new SimpleDate(year_6 + 1, 1, 1);
double discount = rand.nextDouble() * 0.07 + 0.02;
double quantity = rand.nextInt(2) + 24.0;
String lineitemPredicate = "l_shipdate >= \"" + d6_1 + "\" and l_shipdate < \"" + d6_2 + "\" and "
+ " l_discount > " + (discount - 0.01) + " and l_discount <= " + (discount + 0.01)
+ " and l_quantity <= " + quantity;
long start = System.currentTimeMillis();
System.out.println("SELECT * "
+ "FROM lineitem "
+ "WHERE " + lineitemPredicate);
DataFrame df = sqlContext.sql("SELECT * "
+ "FROM lineitem "
+ "WHERE " + lineitemPredicate);
long result = df.count(); // 83063
System.out.println("RES: Time Taken: " + (System.currentTimeMillis() - start) + "; Result: " + result);
}
Example 3: tpch19
import org.apache.spark.sql.DataFrame; // import the package/class that this method depends on
public void tpch19() {
String brand_19 = "Brand#" + (rand.nextInt(5) + 1) + "" + (rand.nextInt(5) + 1);
String shipInstruct_19 = "DELIVER IN PERSON";
double quantity_19 = rand.nextInt(10) + 1;
String lineitemPredicate = "l_shipinstruct = \"" + shipInstruct_19 + "\" and l_quantity > " + quantity_19;
String partPredicate = "p_brand = \"" + brand_19 + "\" and p_container = \"SM CASE\"";
quantity_19 += 10;
lineitemPredicate = lineitemPredicate + " and l_quantity <= " + quantity_19 + " and l_shipmode <= \"AIR\"";
partPredicate = partPredicate + " and p_size >= 1 and p_size <= 5";
long start = System.currentTimeMillis();
System.out.println("SELECT * "
+ "FROM lineitem JOIN part ON l_partkey = p_partkey "
+ "WHERE " + lineitemPredicate + " and " + partPredicate);
DataFrame df = sqlContext.sql("SELECT * "
+ "FROM lineitem JOIN part ON l_partkey = p_partkey "
+ "WHERE " + lineitemPredicate + " and " + partPredicate);
long result = df.count(); // 10
System.out.println("RES: Time Taken: " + (System.currentTimeMillis() - start) + "; Result: " + result);
}
Example 4: cacheTable
import org.apache.spark.sql.DataFrame; // import the package/class that this method depends on
/**
* Cache complex_property_table.
*/
public void cacheTable() {
DataFrame result = hiveContext.sql(String.format("SELECT * FROM %s",
Tags.COMPLEX_PROPERTYTABLE_TABLENAME));
// repartition if enabled; repartition returns a new DataFrame, so the result must be reassigned
if (dfPartitions != 0) {
result = result.repartition(dfPartitions);
}
result.registerTempTable(Tags.CACHED_COMPLEX_PROPERTYTABLE_TABLENAME);
hiveContext.cacheTable(Tags.CACHED_COMPLEX_PROPERTYTABLE_TABLENAME);
// force caching
result.count();
}
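In this example count() is not used for its return value at all: Spark caches lazily, so calling an action on the registered temp table is what actually materializes the cache. A minimal sketch of the same pattern, assuming an existing SQLContext and a placeholder temp table name:
// Cache-then-count pattern (sketch; "my_table" is a placeholder temp table name)
DataFrame toCache = sqlContext.sql("SELECT * FROM my_table");
toCache.registerTempTable("my_table_cached");
sqlContext.cacheTable("my_table_cached"); // marks the table for caching (lazy)
toCache.count();                          // an action; forces the cache to be populated now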
Example 5: tag
import org.apache.spark.sql.DataFrame; // import the package/class that this method depends on
/**
* Tags a data frame containing a column named 'sentence'.
* @param input a data frame with a 'sentence' column
* @param outputFileName path of the output file or directory
* @param outputFormat the output format (JSON, PARQUET or TEXT)
*/
public void tag(DataFrame input, String outputFileName, OutputFormat outputFormat) {
long tic = System.currentTimeMillis();
long duration = 0;
if (cmmModel != null) {
DataFrame output = cmmModel.transform(input).repartition(1);
duration = System.currentTimeMillis() - tic;
switch (outputFormat) {
case JSON:
output.write().json(outputFileName);
break;
case PARQUET:
output.write().parquet(outputFileName);
break;
case TEXT:
toTaggedSentence(output).repartition(1).saveAsTextFile(outputFileName);
// output.select("prediction").write().text(outputFileName);
break;
}
} else {
System.err.println("Tagging model is null. You need to create or load a model first.");
}
if (verbose) {
long n = input.count();
System.out.println(" Number of sentences = " + n);
System.out.println(" Total tagging time = " + duration + " milliseconds.");
System.out.println("Average tagging time = " + ((float)duration) / n + " milliseconds.");
}
}
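A hypothetical call site for tag(...), shown only to illustrate the expected inputs; tagger stands for an instance of whatever class defines this method, and the paths are placeholders:
// Hypothetical usage of tag(...); 'tagger' and the paths are placeholders
DataFrame sentences = sqlContext.read().json("sentences.json"); // must expose a 'sentence' column
tagger.tag(sentences, "/tmp/tagged-output", OutputFormat.JSON);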
Example 6: main
import org.apache.spark.sql.DataFrame; // import the package/class that this method depends on
public static void main(String args[]) {
SparkConf sparkConf = new SparkConf();
sparkConf.setAppName("spark-phoenix-df");
sparkConf.setMaster("local[*]");
JavaSparkContext sc = new JavaSparkContext(sparkConf);
SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);
DataFrame df = sqlContext.read()
.format("org.apache.phoenix.spark")
.option("table", "ORDERS")
.option("zkUrl", "localhost:2181")
.load();
df.count(); // action that triggers the Phoenix read; the returned row count is not used here
sc.stop();
}
Example 7: tpch3
import org.apache.spark.sql.DataFrame; // import the package/class that this method depends on
public void tpch3() {
int rand_3 = rand.nextInt(mktSegmentVals.length);
String c_mktsegment = mktSegmentVals[rand_3];
Calendar c = new GregorianCalendar();
int dateOffset = (int) (rand.nextFloat() * (31 + 28 + 31));
c.set(1995, Calendar.MARCH, 01);
c.add(Calendar.DAY_OF_MONTH, dateOffset);
SimpleDate d3 = new SimpleDate(c.get(Calendar.YEAR),
c.get(Calendar.MONTH), c.get(Calendar.DAY_OF_MONTH));
String customerPredicate = "c_mktsegment <= \"" + c_mktsegment + "\"";
String ordersPredicate = "o_orderdate < \"" + d3 + "\"";
String lineitemPredicate = "l_shipdate > \"" + d3 + "\"";
if (rand_3 > 0) {
String c_mktsegment_prev = mktSegmentVals[rand_3 - 1];
customerPredicate = "c_mktsegment > \"" + c_mktsegment_prev + "\" and " + customerPredicate;
}
long start = System.currentTimeMillis();
System.out.println("SELECT * "
+ "FROM lineitem JOIN orders ON l_orderkey = o_orderkey JOIN customer ON o_custkey = c_custkey "
+ "WHERE " + lineitemPredicate + " and " + ordersPredicate + " and " + customerPredicate);
DataFrame df = sqlContext.sql("SELECT * "
+ "FROM lineitem JOIN orders ON l_orderkey = o_orderkey JOIN customer ON o_custkey = c_custkey "
+ "WHERE " + lineitemPredicate + " and " + ordersPredicate + " and " + customerPredicate);
long result = df.count(); // 29569
System.out.println("RES: Time Taken: " + (System.currentTimeMillis() - start) + "; Result: " + result);
}
Example 8: tpch5
import org.apache.spark.sql.DataFrame; // import the package/class that this method depends on
public void tpch5() {
int rand_5 = rand.nextInt(regionNameVals.length);
String r_name_5 = regionNameVals[rand_5];
int year_5 = 1993 + rand.nextInt(5);
SimpleDate d5_1 = new SimpleDate(year_5, 1, 1);
SimpleDate d5_2 = new SimpleDate(year_5 + 1, 1, 1);
String customerPredicate = "c_region <= \"" + r_name_5 + "\"";
String supplierPredicate = "s_region <= \"" + r_name_5 + "\"";
String ordersPredicate = "o_orderdate >= \"" + d5_1 + "\" and o_orderdate < \"" + d5_2 + "\"";
if (rand_5 > 0) {
String r_name_prev_5 = regionNameVals[rand_5 - 1];
customerPredicate = "c_region > \"" + r_name_prev_5 + "\" and " + customerPredicate;
supplierPredicate = "s_region > \"" + r_name_prev_5 + "\" and " + supplierPredicate;
}
long start = System.currentTimeMillis();
System.out.println("SELECT * "
+ "FROM customer JOIN orders ON c_custkey = o_custkey "
+ "JOIN lineitem ON l_orderkey = o_orderkey "
+ "JOIN supplier ON l_suppkey = s_suppkey "
+ "WHERE " + customerPredicate + " and " + ordersPredicate + " and " + supplierPredicate);
DataFrame df = sqlContext.sql("SELECT * "
+ "FROM customer JOIN orders ON c_custkey = o_custkey "
+ "JOIN lineitem ON l_orderkey = o_orderkey "
+ "JOIN supplier ON l_suppkey = s_suppkey "
+ "WHERE " + customerPredicate + " and " + ordersPredicate + " and " + supplierPredicate);
long result = df.count(); // 35307
System.out.println("RES: Time Taken: " + (System.currentTimeMillis() - start) + "; Result: " + result);
}
Example 9: tpch8
import org.apache.spark.sql.DataFrame; // import the package/class that this method depends on
public void tpch8() {
int rand_8_1 = rand.nextInt(regionNameVals.length);
String r_name_8 = regionNameVals[rand_8_1];
SimpleDate d8_1 = new SimpleDate(1995, 1, 1);
SimpleDate d8_2 = new SimpleDate(1996, 12, 31);
String p_type_8 = partTypeVals[rand.nextInt(partTypeVals.length)];
String customerPredicate = "c_region <= \"" + r_name_8 + "\"";
String ordersPredicate = "o_orderdate >= \"" + d8_1 + "\" and o_orderdate < \"" + d8_2 + "\"";
String partPredicate = "p_type = \"" + p_type_8 + "\"";
if (rand_8_1 > 0) {
String r_name_prev_8 = regionNameVals[rand_8_1 - 1];
customerPredicate = "c_region > \"" + r_name_prev_8 + "\" and " + customerPredicate;
}
long start = System.currentTimeMillis();
System.out.println("SELECT * "
+ "FROM customer JOIN orders ON c_custkey = o_custkey "
+ "JOIN lineitem ON l_orderkey = o_orderkey "
+ "JOIN part ON l_partkey = p_partkey "
+ "WHERE " + customerPredicate + " and " + ordersPredicate + " and " + partPredicate);
DataFrame df = sqlContext.sql("SELECT * "
+ "FROM customer JOIN orders ON c_custkey = o_custkey "
+ "JOIN lineitem ON l_orderkey = o_orderkey "
+ "JOIN part ON l_partkey = p_partkey "
+ "WHERE " + customerPredicate + " and " + ordersPredicate + " and " + partPredicate);
long result = df.count(); // 0
System.out.println("RES: Time Taken: " + (System.currentTimeMillis() - start) + "; Result: " + result);
}
Example 10: tpch10
import org.apache.spark.sql.DataFrame; // import the package/class that this method depends on
public void tpch10() {
String l_returnflag_10 = "R";
String l_returnflag_prev_10 = "N";
int year_10 = 1993;
int monthOffset = rand.nextInt(24);
SimpleDate d10_1 = new SimpleDate(year_10 + monthOffset / 12, monthOffset % 12 + 1, 1);
monthOffset = monthOffset + 3;
SimpleDate d10_2 = new SimpleDate(year_10 + monthOffset / 12, monthOffset % 12 + 1, 1);
String ordersPredicate = "o_orderdate >= \"" + d10_1 + "\" and o_orderdate < \"" + d10_2 + "\"";
String lineitemPredicate = "l_returnflag <= \"" + l_returnflag_10 + "\" and l_returnflag > \"" + l_returnflag_prev_10 + "\"";
long start = System.currentTimeMillis();
System.out.println("SELECT * "
+ "FROM lineitem JOIN orders ON l_orderkey = o_orderkey "
+ "JOIN customer ON c_custkey = o_custkey "
+ "WHERE " + ordersPredicate + " and " + lineitemPredicate);
DataFrame df = sqlContext.sql("SELECT * "
+ "FROM lineitem JOIN orders ON l_orderkey = o_orderkey "
+ "JOIN customer ON c_custkey = o_custkey "
+ "WHERE " + ordersPredicate + " and " + lineitemPredicate);
long result = df.count(); // 111918
System.out.println("RES: Time Taken: " + (System.currentTimeMillis() - start) + "; Result: " + result);
}
Example 11: tpch12
import org.apache.spark.sql.DataFrame; // import the package/class that this method depends on
public void tpch12() {
int rand_12 = rand.nextInt(shipModeVals.length);
String shipmode_12 = shipModeVals[rand_12];
int year_12 = 1993 + rand.nextInt(5);
SimpleDate d12_1 = new SimpleDate(year_12, 1, 1);
SimpleDate d12_2 = new SimpleDate(year_12 + 1, 1, 1);
String lineitemPredicate = "l_shipmode <= \"" + shipmode_12 + "\" and l_receiptdate >= \"" + d12_1 + "\" and "
+ "l_receiptdate < \"" + d12_2 + "\"";
if (rand_12 > 0) {
String shipmode_prev_12 = shipModeVals[rand_12 - 1];
lineitemPredicate = "l_shipmode > \"" + shipmode_prev_12 + "\" and " + lineitemPredicate;
}
long start = System.currentTimeMillis();
System.out.println("SELECT * "
+ "FROM lineitem JOIN orders ON l_orderkey = o_orderkey "
+ "WHERE " + lineitemPredicate);
DataFrame df = sqlContext.sql("SELECT * "
+ "FROM lineitem JOIN orders ON l_orderkey = o_orderkey "
+ "WHERE " + lineitemPredicate);
long result = df.count(); // 130474
System.out.println("RES: Time Taken: " + (System.currentTimeMillis() - start) + "; Result: " + result);
}
Example 12: shouldCountFrame
import org.apache.spark.sql.DataFrame; // import the package/class that this method depends on
@Test
public void shouldCountFrame() {
assumeTrue(shouldRunHive);
DataFrameCallback<Long> callback = new DataFrameCallback<Long>() {
@Override
public Long onDataFrame(DataFrame dataFrame, Object... payloads) {
return dataFrame.count();
}
};
long tablesCount = template.requestBodyAndHeader(sparkDataFrameUri, null, SPARK_DATAFRAME_CALLBACK_HEADER, callback, Long.class);
Truth.assertThat(tablesCount).isEqualTo(2);
}
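The same DataFrameCallback interface can carry any computation over the exchanged DataFrame, not just a plain count. A sketch of a callback that filters before counting (the column name and literal are made up for illustration):
// Sketch of a filtering callback; "name" and 'Alice' are placeholders
DataFrameCallback<Long> filteredCount = new DataFrameCallback<Long>() {
    @Override
    public Long onDataFrame(DataFrame dataFrame, Object... payloads) {
        // count only the rows matching a condition
        return dataFrame.filter("name = 'Alice'").count();
    }
};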
Example 13: runQueryWithSpark
import org.apache.spark.sql.DataFrame; // import the package/class that this method depends on
/**
* Run a query with Spark.
*
* @param sparkConnection connection to Spark
* @param sqlQuery the SQL query to run, as a string wrapped in parentheses
* @param resultsTableName name of the table where the result of the query
* is stored
* @param isBenchmark whether the query is run for benchmarking purposes
* (if so, the results table is dropped again afterwards)
* @param isOnlyCount if true, only the number of rows returned by the query
* is counted; otherwise, the result of the query is stored in a table
* @return a map with the execution time in milliseconds ("executionTime")
* and the number of result tuples ("nrTuples")
*/
public static HashMap<String, Long> runQueryWithSpark(Spark sparkConnection, String sqlQuery,
String resultsTableName, boolean isBenchmark, boolean isOnlyCount) {
long executionTime = 0;
long nrTuples = 0;
// remove the initial "("
sqlQuery = sqlQuery.substring(1);
// remove the final ")"
sqlQuery = sqlQuery.substring(0, sqlQuery.length() - 1);
// Execute the query
long startTime = System.currentTimeMillis();
// executes query
DataFrame result = sparkConnection.sql(sqlQuery);
if (isOnlyCount) {
nrTuples = result.count();
} else {
// store the result into a table
result.write().saveAsTable(String.format("%s.%s", Tags.SEMPALA_RESULTS_DB_NAME, resultsTableName));
}
executionTime = System.currentTimeMillis() - startTime;
System.out.print(String.format(" %s ms", executionTime));
// if the result was stored in a table, retrieve its count
if (!isOnlyCount) {
nrTuples = result.count();
}
System.out.println(String.format(" %s pc", nrTuples));
// Immediately delete the results if this is just a
// benchmark run
if (isBenchmark) {
sparkConnection
.sql(String.format("DROP TABLE IF EXISTS %s.%s", Tags.SEMPALA_RESULTS_DB_NAME, resultsTableName));
}
HashMap<String, Long> results = new HashMap<String, Long>();
results.put("executionTime", executionTime);
results.put("nrTuples", nrTuples);
return results;
}
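A hypothetical call to this utility, for illustration only; sparkConnection stands for the project-specific Spark wrapper the method expects, and the query and table names are placeholders:
// Hypothetical usage; note the surrounding parentheses, which the method strips
HashMap<String, Long> stats = runQueryWithSpark(
        sparkConnection,
        "(SELECT s, p, o FROM triples WHERE p = 'rdf:type')",
        "q1_results",
        true,   // benchmark run: drop the results table afterwards
        true);  // only count the rows, do not persist them
System.out.println("time (ms): " + stats.get("executionTime") + ", rows: " + stats.get("nrTuples"));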
Example 14: tokenize
import org.apache.spark.sql.DataFrame; // import the package/class that this method depends on
/**
* Tokenizes an RDD of text lines and returns an RDD of results.
* @param input an RDD of text lines
* @return an RDD of tokenized text lines.
*/
public JavaRDD<String> tokenize(JavaRDD<String> input) {
if (verbose) {
// print some basic statistics about the input, including
// max line length, min line length, average line length in syllables
JavaRDD<Integer> wordCount = input.map(new Function<String, Integer>() {
private static final long serialVersionUID = 7214093453452927565L;
@Override
public Integer call(String line) throws Exception {
return line.split("\\s+").length;
}
});
Comparator<Integer> comp = new IntegerComparator();
System.out.println("Max line length (in syllables) = " + wordCount.max(comp));
System.out.println("Min line length (in syllables) = " + wordCount.min(comp));
float totalCount = wordCount.reduce(new Function2<Integer, Integer, Integer>() {
private static final long serialVersionUID = 1L;
@Override
public Integer call(Integer v1, Integer v2) throws Exception {
return v1 + v2;
}
});
System.out.println("Avg line length (in syllables) = " + (totalCount) / input.count());
}
JavaRDD<String> output = null;
if (classifier == null) {
// use phrase graph approach (shortest paths and bigram model)
// to segment phrases
output = input.map(new SegmentationFunction());
} else {
// use logistic regression approach to segment phrases
JavaRDD<String> s = input.map(new SegmentationFunction());
// make sure that the preceding lazy computation has been evaluated
// so that whitespace contexts have been properly accumulated
System.out.println("Number of text lines = " + s.count());
System.out.println("Number of contexts = " + contexts.value().size());
// use whitespace classification approach (logistic regression model)
JavaRDD<WhitespaceContext> jrdd = jsc.parallelize(contexts.value());
DataFrame df0 = (new SQLContext(jsc)).createDataFrame(jrdd, WhitespaceContext.class);
DataFrame df1 = model.transform(df0);
prediction = jsc.broadcast(df1.select("prediction").collect());
if (df1.count() > 0) {
output = s.map(new WhitespaceClassificationFunction());
}
else {
System.err.println("Empty data frame!");
}
}
if (verbose) {
// print number of non-space characters of the input and output dataset
System.out.println("#(non-space characters of input) = " + numCharacters(input));
if (output != null) {
System.out.println("#(non-space characters of output) = " + numCharacters(output));
}
}
return output;
}
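A hypothetical call site for tokenize(...); tokenizer stands for an instance of the class that defines it, jsc for its JavaSparkContext, and the paths are placeholders:
// Hypothetical usage of tokenize(...); names and paths are placeholders
JavaRDD<String> lines = jsc.textFile("input.txt");
JavaRDD<String> tokenized = tokenizer.tokenize(lines);
if (tokenized != null) {
    tokenized.saveAsTextFile("output-tokenized");
}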