

Java DataFrame.count Method Code Examples

This article collects typical usage examples of the Java method org.apache.spark.sql.DataFrame.count. If you have been wondering what exactly DataFrame.count does, how to call it, or what real uses of it look like, the curated examples below should help. You can also explore further usage examples of its enclosing class, org.apache.spark.sql.DataFrame.


The following presents 14 code examples of the DataFrame.count method, sorted by popularity by default.
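Before diving into the examples, here is a minimal, self-contained sketch of the pattern nearly all of them share: build a DataFrame, run a SQL query over it, then call count() to execute the otherwise lazy plan and obtain the number of rows. It targets the Spark 1.x Java API, where DataFrame is org.apache.spark.sql.DataFrame; the application name, the input path people.json, and the people table are illustrative assumptions, not taken from any example below.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

public class DataFrameCountSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("count-sketch").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);

        // hypothetical input: one JSON object per line, e.g. {"name": "Ann", "age": 34}
        DataFrame people = sqlContext.read().json("people.json");
        people.registerTempTable("people");

        // transformations and sql() are lazy; count() is the action that
        // actually runs the query and returns the number of rows as a long
        DataFrame adults = sqlContext.sql("SELECT * FROM people WHERE age > 21");
        long n = adults.count();
        System.out.println("Row count: " + n);

        sc.stop();
    }
}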

Example 1: tpch14

import org.apache.spark.sql.DataFrame; // import the class this method depends on
public void tpch14() {
    int year_14 = 1993;
    int monthOffset_14 = rand.nextInt(60);
    SimpleDate d14_1 = new SimpleDate(year_14 + monthOffset_14 / 12, monthOffset_14 % 12 + 1, 1);
    monthOffset_14 += 1;
    SimpleDate d14_2 = new SimpleDate(year_14 + monthOffset_14 / 12, monthOffset_14 % 12 + 1, 1);

    String lineitemPredicate =  "l_shipdate >= \"" + d14_1 + "\" and l_shipdate < \"" + d14_2 + "\"";

    long start = System.currentTimeMillis();

    System.out.println("SELECT * "
            + "FROM lineitem JOIN part ON  l_partkey = p_partkey "
            + "WHERE " + lineitemPredicate);

    DataFrame df = sqlContext.sql("SELECT * "
            + "FROM lineitem JOIN part ON  l_partkey = p_partkey "
            + "WHERE " + lineitemPredicate);

    long result = df.count();  // 76860
    System.out.println("RES: Time Taken: " + (System.currentTimeMillis() - start)  + "; Result: " + result);
}
 
Developer: mitdbg, Project: AdaptDB, Lines: 23, Source: TPCHSparkJoinWorkload.java

Example 2: tpch6

import org.apache.spark.sql.DataFrame; // import the class this method depends on
public void tpch6() {
    int year_6 = 1993 + rand.nextInt(5);
    SimpleDate d6_1 = new SimpleDate(year_6, 1, 1);
    SimpleDate d6_2 = new SimpleDate(year_6 + 1, 1, 1);
    double discount = rand.nextDouble() * 0.07 + 0.02;
    double quantity = rand.nextInt(2) + 24.0;

    String lineitemPredicate = "l_shipdate >= \"" + d6_1 + "\" and l_shipdate < \"" + d6_2 + "\" and "
            + " l_discount > " + (discount - 0.01) + " and l_discount <= " + (discount + 0.01)
            + " and l_quantity <= " + quantity;

    long start = System.currentTimeMillis();

    System.out.println("SELECT * "
            + "FROM lineitem "
            + "WHERE " + lineitemPredicate);

    DataFrame df = sqlContext.sql("SELECT * "
            + "FROM lineitem "
            + "WHERE " + lineitemPredicate);

    long result = df.count();  // 83063
    System.out.println("RES: Time Taken: " + (System.currentTimeMillis() - start)  + "; Result: " + result);
}
 
Developer: mitdbg, Project: AdaptDB, Lines: 25, Source: TPCHSparkJoinWorkload.java

Example 3: tpch19

import org.apache.spark.sql.DataFrame; // import the class this method depends on
public void tpch19() {
    String brand_19 = "Brand#" + (rand.nextInt(5) + 1) + "" + (rand.nextInt(5) + 1);
    String shipInstruct_19 = "DELIVER IN PERSON";
    double quantity_19 = rand.nextInt(10) + 1;

    String lineitemPredicate = "l_shipinstruct = \"" + shipInstruct_19 + "\" and l_quantity > "  + quantity_19;
    String partPredicate = "p_brand = \"" + brand_19 + "\" and p_container = \"SM CASE\"";
    quantity_19 += 10;

    lineitemPredicate = lineitemPredicate + " and l_quantity <= " +  quantity_19 + " and l_shipmode <= \"AIR\"";
    partPredicate = partPredicate + " and p_size >= 1 and p_size <= 5";

    long start = System.currentTimeMillis();

    System.out.println("SELECT * "
            + "FROM lineitem JOIN part ON  l_partkey = p_partkey "
            + "WHERE " + lineitemPredicate + " and " + partPredicate);

    DataFrame df = sqlContext.sql("SELECT * "
            + "FROM lineitem JOIN part ON  l_partkey = p_partkey "
            + "WHERE " + lineitemPredicate + " and " + partPredicate);

    long result = df.count(); // 10
    System.out.println("RES: Time Taken: " + (System.currentTimeMillis() - start)  + "; Result: " + result);
}
 
Developer: mitdbg, Project: AdaptDB, Lines: 26, Source: TPCHSparkJoinWorkload.java

Example 4: cacheTable

import org.apache.spark.sql.DataFrame; // import the class this method depends on
/**
 * Cache complex_property_table.
 */
public void cacheTable() {
	DataFrame result = hiveContext.sql(String.format("SELECT * FROM %s",
			Tags.COMPLEX_PROPERTYTABLE_TABLENAME));
	// add partitioning if enabled
	if (dfPartitions != 0) {
		// repartition() returns a new DataFrame; the result must be
		// reassigned, or the repartitioning is silently discarded
		result = result.repartition(dfPartitions);
	}
	
	result.registerTempTable(Tags.CACHED_COMPLEX_PROPERTYTABLE_TABLENAME);
	hiveContext.cacheTable(Tags.CACHED_COMPLEX_PROPERTYTABLE_TABLENAME);
	
	// force caching
	result.count();
}
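Since the programmatic cacheTable in Spark SQL is lazy, the trailing count() is what actually scans the data and materializes the in-memory cache; without it, caching would only happen on the first subsequent query against the table.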
 
Developer: aschaetzle, Project: Sempala, Lines: 18, Source: Spark.java

Example 5: tag

import org.apache.spark.sql.DataFrame; // import the class this method depends on
/**
 * Tags a data frame containing a column named 'sentence'.
 * @param input the data frame to tag
 * @param outputFileName the path of the output file
 * @param outputFormat the output format (JSON, PARQUET or TEXT)
 */
	public void tag(DataFrame input, String outputFileName, OutputFormat outputFormat) {
		long tic = System.currentTimeMillis();
		long duration = 0;
		if (cmmModel != null) {
			DataFrame output = cmmModel.transform(input).repartition(1);
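			// note: transform() and repartition() are lazy, so this duration
			// mostly covers plan construction; the tagging work itself runs
			// when the output is written below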
			duration = System.currentTimeMillis() - tic;
			switch (outputFormat) {
			case JSON:
				output.write().json(outputFileName);
				break;
			case PARQUET:
				output.write().parquet(outputFileName);
				break;
			case TEXT:
				toTaggedSentence(output).repartition(1).saveAsTextFile(outputFileName);
//				output.select("prediction").write().text(outputFileName);
				break;
			}
		} else {
			System.err.println("Tagging model is null. You need to create or load a model first.");
		}
		if (verbose) {
			long n = input.count();
			System.out.println(" Number of sentences = " + n);
			System.out.println("  Total tagging time = " + duration + " milliseconds.");
			System.out.println("Average tagging time = " + ((float)duration) / n + " milliseconds.");
		}
	}
 
Developer: phuonglh, Project: vn.vitk, Lines: 35, Source: Tagger.java

Example 6: main

import org.apache.spark.sql.DataFrame; // import the class this method depends on
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf();
    sparkConf.setAppName("spark-phoenix-df");
    sparkConf.setMaster("local[*]");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);

    DataFrame df = sqlContext.read()
            .format("org.apache.phoenix.spark")
            .option("table", "ORDERS")
            .option("zkUrl", "localhost:2181")
            .load();
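    // count() is an action: it forces the actual read of the ORDERS
    // table from Phoenix through the given ZooKeeper quorum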
    df.count();

}
 
Developer: mravi, Project: pro-phoenix, Lines: 16, Source: PhoenixSparkDf.java

Example 7: tpch3

import org.apache.spark.sql.DataFrame; // import the class this method depends on
public void tpch3() {
    int rand_3 = rand.nextInt(mktSegmentVals.length);
    String c_mktsegment = mktSegmentVals[rand_3];
    Calendar c = new GregorianCalendar();
    int dateOffset = (int) (rand.nextFloat() * (31 + 28 + 31));
    c.set(1995, Calendar.MARCH, 1);
    c.add(Calendar.DAY_OF_MONTH, dateOffset);
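    // note: Calendar.MONTH is zero-based (Calendar.MARCH == 2), whereas the
    // other queries build SimpleDate with one-based months; add 1 to
    // c.get(Calendar.MONTH) below if a one-based month is intended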
    SimpleDate d3 = new SimpleDate(c.get(Calendar.YEAR),
            c.get(Calendar.MONTH), c.get(Calendar.DAY_OF_MONTH));

    String customerPredicate = "c_mktsegment <= \"" +  c_mktsegment + "\"";
    String ordersPredicate = "o_orderdate < \"" + d3 + "\"";
    String lineitemPredicate = "l_shipdate > \"" + d3 + "\"";


    if (rand_3 > 0) {
        String c_mktsegment_prev = mktSegmentVals[rand_3 - 1];
        customerPredicate = "c_mktsegment > \"" + c_mktsegment_prev + "\" and " + customerPredicate;
    }

    long start = System.currentTimeMillis();

    System.out.println("SELECT * "
            + "FROM lineitem JOIN orders ON l_orderkey = o_orderkey JOIN customer ON o_custkey = c_custkey "
            + "WHERE " + lineitemPredicate + " and " + ordersPredicate + " and " +  customerPredicate);

    DataFrame df = sqlContext.sql("SELECT * "
            + "FROM lineitem JOIN orders ON l_orderkey = o_orderkey JOIN customer ON o_custkey = c_custkey "
            + "WHERE " + lineitemPredicate + " and " + ordersPredicate + " and " +  customerPredicate);

    long result = df.count();  // 29569
    System.out.println("RES: Time Taken: " + (System.currentTimeMillis() - start)  + "; Result: " + result);
}
 
Developer: mitdbg, Project: AdaptDB, Lines: 34, Source: TPCHSparkJoinWorkload.java

Example 8: tpch5

import org.apache.spark.sql.DataFrame; // import the class this method depends on
public void tpch5() {
    int rand_5 = rand.nextInt(regionNameVals.length);
    String r_name_5 = regionNameVals[rand_5];
    int year_5 = 1993 + rand.nextInt(5);
    SimpleDate d5_1 = new SimpleDate(year_5, 1, 1);
    SimpleDate d5_2 = new SimpleDate(year_5 + 1, 1, 1);

    String customerPredicate = "c_region <= \"" +  r_name_5 + "\"";
    String supplierPredicate = "s_region <= \"" +  r_name_5 + "\"";
    String ordersPredicate = "o_orderdate >= \"" + d5_1 + "\" and o_orderdate < \"" + d5_2 + "\"";

    if (rand_5 > 0) {
        String r_name_prev_5 = regionNameVals[rand_5 - 1];
        customerPredicate = "c_region > \"" + r_name_prev_5 + "\" and " + customerPredicate;
        supplierPredicate = "s_region > \"" + r_name_prev_5 + "\" and " + supplierPredicate;
    }

    long start = System.currentTimeMillis();

    System.out.println("SELECT * "
            + "FROM customer JOIN orders ON c_custkey = o_custkey "
            + "JOIN lineitem ON l_orderkey = o_orderkey "
            + "JOIN supplier ON l_suppkey = s_suppkey "
            + "WHERE " + customerPredicate + " and " + ordersPredicate + " and " +  supplierPredicate);

    DataFrame df = sqlContext.sql("SELECT * "
            + "FROM customer JOIN orders ON c_custkey = o_custkey "
            + "JOIN lineitem ON l_orderkey = o_orderkey "
            + "JOIN supplier ON l_suppkey = s_suppkey "
            + "WHERE " + customerPredicate + " and " + ordersPredicate + " and " +  supplierPredicate);

    long result = df.count(); // 35307
    System.out.println("RES: Time Taken: " + (System.currentTimeMillis() - start)  + "; Result: " + result);
}
 
Developer: mitdbg, Project: AdaptDB, Lines: 35, Source: TPCHSparkJoinWorkload.java

Example 9: tpch8

import org.apache.spark.sql.DataFrame; // import the class this method depends on
public void tpch8() {
    int rand_8_1 = rand.nextInt(regionNameVals.length);
    String r_name_8 = regionNameVals[rand_8_1];
    SimpleDate d8_1 = new SimpleDate(1995, 1, 1);
    SimpleDate d8_2 = new SimpleDate(1996, 12, 31);
    String p_type_8 = partTypeVals[rand.nextInt(partTypeVals.length)];


    String customerPredicate = "c_region <= \"" +  r_name_8 + "\"";
    String ordersPredicate = "o_orderdate >= \"" + d8_1 + "\" and o_orderdate < \"" + d8_2 + "\"";
    String partPredicate = "p_type = \"" +  p_type_8 + "\"";


    if (rand_8_1 > 0) {
        String r_name_prev_8 = regionNameVals[rand_8_1 - 1];
        customerPredicate = "c_region > \"" + r_name_prev_8 + "\" and " + customerPredicate;
    }

    long start = System.currentTimeMillis();

    System.out.println("SELECT * "
            + "FROM customer JOIN orders ON c_custkey = o_custkey "
            + "JOIN lineitem ON l_orderkey = o_orderkey "
            + "JOIN part ON l_partkey = p_partkey "
            + "WHERE " + customerPredicate + " and " + ordersPredicate + " and " +  partPredicate);

    DataFrame df = sqlContext.sql("SELECT * "
            + "FROM customer JOIN orders ON c_custkey = o_custkey "
            + "JOIN lineitem ON l_orderkey = o_orderkey "
            + "JOIN part ON l_partkey = p_partkey "
            + "WHERE " + customerPredicate + " and " + ordersPredicate + " and " +  partPredicate);

    long result = df.count();  // 0
    System.out.println("RES: Time Taken: " + (System.currentTimeMillis() - start)  + "; Result: " + result);
}
 
Developer: mitdbg, Project: AdaptDB, Lines: 36, Source: TPCHSparkJoinWorkload.java

Example 10: tpch10

import org.apache.spark.sql.DataFrame; // import the class this method depends on
public void tpch10() {
    String l_returnflag_10 = "R";
    String l_returnflag_prev_10 = "N";
    int year_10 = 1993;
    int monthOffset = rand.nextInt(24);
    SimpleDate d10_1 = new SimpleDate(year_10 + monthOffset / 12, monthOffset % 12 + 1, 1);
    monthOffset = monthOffset + 3;
    SimpleDate d10_2 = new SimpleDate(year_10 + monthOffset / 12, monthOffset % 12 + 1, 1);

    String ordersPredicate = "o_orderdate >= \"" + d10_1 + "\" and o_orderdate < \"" + d10_2 + "\"";
    String lineitemPredicate =  "l_returnflag <= \"" + l_returnflag_10 + "\" and l_returnflag > \"" + l_returnflag_prev_10 + "\"";

    long start = System.currentTimeMillis();

    System.out.println("SELECT * "
            + "FROM lineitem JOIN orders ON  l_orderkey = o_orderkey "
            + "JOIN customer ON c_custkey = o_custkey "
            + "WHERE " + ordersPredicate + " and " + lineitemPredicate);

    DataFrame df = sqlContext.sql("SELECT * "
            + "FROM lineitem JOIN orders ON  l_orderkey = o_orderkey "
            + "JOIN customer ON c_custkey = o_custkey "
            + "WHERE " + ordersPredicate + " and " + lineitemPredicate);

    long result = df.count();  // 111918
    System.out.println("RES: Time Taken: " + (System.currentTimeMillis() - start)  + "; Result: " + result);
}
 
Developer: mitdbg, Project: AdaptDB, Lines: 28, Source: TPCHSparkJoinWorkload.java

Example 11: tpch12

import org.apache.spark.sql.DataFrame; // import the class this method depends on
public void tpch12() {
    int rand_12 = rand.nextInt(shipModeVals.length);
    String shipmode_12 = shipModeVals[rand_12];
    int year_12 = 1993 + rand.nextInt(5);
    SimpleDate d12_1 = new SimpleDate(year_12, 1, 1);
    SimpleDate d12_2 = new SimpleDate(year_12 + 1, 1, 1);

    String lineitemPredicate =  "l_shipmode <= \"" + shipmode_12 + "\" and l_receiptdate >= \"" + d12_1 + "\" and "
                            + "l_receiptdate < \"" + d12_2 + "\"";

    if (rand_12 > 0) {
        String shipmode_prev_12 = shipModeVals[rand_12 - 1];
        lineitemPredicate = "l_shipmode > \"" + shipmode_prev_12 + "\" and " +  lineitemPredicate;
    }

    long start = System.currentTimeMillis();

    System.out.println("SELECT * "
            + "FROM lineitem JOIN orders ON  l_orderkey = o_orderkey "
            + "WHERE " + lineitemPredicate);

    DataFrame df = sqlContext.sql("SELECT * "
            + "FROM lineitem JOIN orders ON  l_orderkey = o_orderkey "
            + "WHERE " + lineitemPredicate);

    long result = df.count(); // 130474
    System.out.println("RES: Time Taken: " + (System.currentTimeMillis() - start)  + "; Result: " + result);
}
 
Developer: mitdbg, Project: AdaptDB, Lines: 29, Source: TPCHSparkJoinWorkload.java

Example 12: shouldCountFrame

import org.apache.spark.sql.DataFrame; // import the class this method depends on
@Test
public void shouldCountFrame() {
    assumeTrue(shouldRunHive);
    DataFrameCallback<Long> callback = new DataFrameCallback<Long>() {
        @Override
        public Long onDataFrame(DataFrame dataFrame, Object... payloads) {
            return dataFrame.count();
        }
    };
    long tablesCount = template.requestBodyAndHeader(sparkDataFrameUri, null, SPARK_DATAFRAME_CALLBACK_HEADER, callback, Long.class);
    Truth.assertThat(tablesCount).isEqualTo(2);
}
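In this test, the DataFrameCallback is passed to the Spark endpoint as a message header; the component invokes onDataFrame with the DataFrame bound to that endpoint, and whatever the callback returns (here, the row count) becomes the reply body.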
 
Developer: HydAu, Project: Camel, Lines: 13, Source: SparkProducerTest.java

Example 13: runQueryWithSpark

import org.apache.spark.sql.DataFrame; // import the class this method depends on
/**
 * Runs a query with Spark.
 *
 * @param sparkConnection connection to Spark
 * @param sqlQuery the SQL query to run, as a string wrapped in parentheses
 * @param resultsTableName name of the table in which the query result is
 *            stored when it is not only counted
 * @param isBenchmark whether the query is run for benchmarking purposes;
 *            if true, the results table is dropped right after the run
 * @param isOnlyCount if true, only the number of rows the query returns is
 *            counted; otherwise the result of the query is stored in a
 *            table
 * @return a map containing the execution time in milliseconds
 *         ("executionTime") and the number of result tuples ("nrTuples")
 */
public static HashMap<String, Long> runQueryWithSpark(Spark sparkConnection, String sqlQuery,
		String resultsTableName, boolean isBenchmark, boolean isOnlyCount) {
	long executionTime = 0;
	long nrTuples = 0;

	// remove the initial "("
	sqlQuery = sqlQuery.substring(1);
	// remove the final ")"
	sqlQuery = sqlQuery.substring(0, sqlQuery.length() - 1);

	// Execute the query
	long startTime = System.currentTimeMillis();

	// executes query
	DataFrame result = sparkConnection.sql(sqlQuery);
	if (isOnlyCount) {
		nrTuples = result.count();
	} else {
		// store the result into a table
		result.write().saveAsTable(String.format("%s.%s", Tags.SEMPALA_RESULTS_DB_NAME, resultsTableName));
	}

	executionTime = System.currentTimeMillis() - startTime;
	System.out.print(String.format(" %s ms", executionTime));

	// if the result was stored in a table, retrieve its count
	if (!isOnlyCount) {
		nrTuples = result.count();
	}

	System.out.println(String.format(" %s pc", nrTuples));

	// Immediately delete the results if this is just a
	// benchmark run
	if (isBenchmark) {
		sparkConnection
				.sql(String.format("DROP TABLE IF EXISTS %s.%s", Tags.SEMPALA_RESULTS_DB_NAME, resultsTableName));
	}

	HashMap<String, Long> results = new HashMap<String, Long>();
	results.put("executionTime", executionTime);
	results.put("nrTuples", nrTuples);
	return results;
}
 
Developer: aschaetzle, Project: Sempala, Lines: 59, Source: Main.java

Example 14: tokenize

import org.apache.spark.sql.DataFrame; // import the class this method depends on
/**
 * Tokenizes an RDD of text lines and returns an RDD of results.
 * @param input an RDD of text lines to tokenize
 * @return an RDD of tokenized text lines.
 */
public JavaRDD<String> tokenize(JavaRDD<String> input) {
	if (verbose) {
		// print some basic statistics about the input, including
		// max line length, min line length, average line length in syllables
		JavaRDD<Integer> wordCount = input.map(new Function<String, Integer>() {
			private static final long serialVersionUID = 7214093453452927565L;
			@Override
			public Integer call(String line) throws Exception {
				return line.split("\\s+").length;
			}
			
		});
		Comparator<Integer> comp = new IntegerComparator();
		System.out.println("Max line length (in syllables) = " + wordCount.max(comp));
		System.out.println("Min line length (in syllables) = " + wordCount.min(comp));
		float totalCount = wordCount.reduce(new Function2<Integer, Integer, Integer>() {
			private static final long serialVersionUID = 1L;
			@Override
			public Integer call(Integer v1, Integer v2) throws Exception {
				return v1 + v2;
			}
		});
		System.out.println("Avg line length (in syllables) = " + (totalCount) / input.count());
	}
	
	JavaRDD<String> output = null;
	if (classifier == null) {
		// use phrase graph approach (shortest paths and bigram model)
		// to segment phrases
		output = input.map(new SegmentationFunction());
	} else {
		// use logistic regression approach to segment phrases
		JavaRDD<String> s = input.map(new SegmentationFunction());
		// make sure that the preceding lazy computation has been evaluated
		// so that whitespace contexts have been properly accumulated
		System.out.println("Number of text lines = " + s.count());
		System.out.println("Number of contexts = " + contexts.value().size());
		// use whitespace classification approach (logistic regression model)
		JavaRDD<WhitespaceContext> jrdd = jsc.parallelize(contexts.value());
		DataFrame df0 = (new SQLContext(jsc)).createDataFrame(jrdd, WhitespaceContext.class);
		DataFrame df1 = model.transform(df0);
		prediction = jsc.broadcast(df1.select("prediction").collect());
		if (df1.count() > 0) {
			output = s.map(new WhitespaceClassificationFunction());
		} else {
			System.err.println("Empty data frame!");
		}
	}
	if (verbose) {
		// print number of non-space characters of the input and output dataset
		System.out.println("#(non-space characters of input) = " + numCharacters(input));
		if (output != null) {
			System.out.println("#(non-space characters of output) = " + numCharacters(output));
		}
	}
	return output;
}
 
Developer: phuonglh, Project: vn.vitk, Lines: 64, Source: Tokenizer.java


Note: The org.apache.spark.sql.DataFrame.count examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their authors, and copyright of the source code remains with the original authors; consult each project's license before distributing or using the code. Do not reproduce this article without permission.