

Java JavaRDD.flatMap Method Code Examples

This article collects typical usage examples of the Java method org.apache.spark.api.java.JavaRDD.flatMap. If you are wondering what JavaRDD.flatMap does, how to call it, or want to see real examples of it in use, the curated code examples below may help. You can also explore further usage examples of its containing class, org.apache.spark.api.java.JavaRDD.


A total of 7 code examples of the JavaRDD.flatMap method are shown below, sorted by popularity by default.
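Before the individual examples, here is a minimal, self-contained sketch of how flatMap behaves (a sketch only, assuming Spark 2.x, where the function passed to flatMap must return an Iterator): each input element is mapped to zero or more output elements, and the results are flattened into a single RDD.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class FlatMapSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("flatMap sketch");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Each line is expanded into its words; flatMap flattens all of them into one RDD.
        JavaRDD<String> lines = sc.parallelize(Arrays.asList("a b c", "d e"));
        JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());

        System.out.println(words.collect()); // [a, b, c, d, e]
        sc.stop();
    }
}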

Example 1: main

import org.apache.spark.api.java.JavaRDD; // import the package/class this method depends on
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.println("Usage:");
        System.err.println("  SparkWordCount <sourceFile> <targetFile>");
        System.exit(1);
    }

    SparkConf conf = new SparkConf()
            .setAppName("Word Count");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> textFile = sc.textFile(args[0]);
    JavaRDD<String> words = textFile.flatMap(LineIterator::new);
    JavaPairRDD<String, Long> pairs =
            words.mapToPair(s -> new Tuple2<>(s, 1L));
    JavaPairRDD<String, Long> counts =
            pairs.reduceByKey((Function2<Long, Long, Long>) (a, b) -> a + b);

    System.out.println("Starting task..");
    long t = System.currentTimeMillis();
    counts.saveAsTextFile(args[1] + "_" + t);
    System.out.println("Time=" + (System.currentTimeMillis() - t));
}
 
Developer ID: hazelcast, Project: big-data-benchmark, Lines: 23, Source: SparkWordCount.java
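The LineIterator used in textFile.flatMap(LineIterator::new) above is a helper class from the benchmark project and is not shown on this page. A minimal stand-in (hypothetical, assuming it simply iterates over the whitespace-separated tokens of one line) could look like this:

import java.util.Arrays;
import java.util.Iterator;

// Hypothetical stand-in for the project's LineIterator: iterates over the words of a single line.
public class LineIterator implements Iterator<String> {
    private final Iterator<String> tokens;

    public LineIterator(String line) {
        this.tokens = Arrays.asList(line.trim().split("\\s+")).iterator();
    }

    @Override
    public boolean hasNext() {
        return tokens.hasNext();
    }

    @Override
    public String next() {
        return tokens.next();
    }
}

Because the constructor takes a String and the class itself is an Iterator&lt;String&gt;, the constructor reference LineIterator::new satisfies the FlatMapFunction expected by flatMap in Spark 2.x.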

Example 2: wordCountJava8

import org.apache.spark.api.java.JavaRDD; // import the package/class this method depends on
public static void wordCountJava8( String filename )
{
    // Define a configuration to use to interact with Spark
    SparkConf conf = new SparkConf().setMaster("local").setAppName("Word Count App");

    // Create a Java version of the Spark Context from the configuration
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Load the input data, which is a text file read from the command line
    JavaRDD<String> input = sc.textFile( filename );

    // Java 8 with lambdas: split the input string into words
    JavaRDD<String> words = input.flatMap( s -> Arrays.asList( s.split( " " ) ).iterator() );

    // Java 8 with lambdas: transform the collection of words into pairs (word and 1) and then count them
    JavaPairRDD<String, Integer> counts = words.mapToPair( t -> new Tuple2<>( t, 1 ) ).reduceByKey( (x, y) -> x + y );

    // Save the word count back out to a text file, causing evaluation.
    counts.saveAsTextFile( "output" );
}
 
Developer ID: PacktPublishing, Project: Apache-Spark-2x-for-Java-Developers, Lines: 22, Source: WordCount.java
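The lambda passed to flatMap above is shorthand for a FlatMapFunction. Since Spark 2.0 that function must return an Iterator (in Spark 1.x it returned an Iterable), which is why the example calls .iterator() on the word list. For reference, the same step written as an anonymous class (a sketch, reusing the input RDD from the example above) looks like this:

// Requires: import java.util.Arrays; import java.util.Iterator;
//           import org.apache.spark.api.java.function.FlatMapFunction;
JavaRDD<String> words = input.flatMap(new FlatMapFunction<String, String>() {
    @Override
    public Iterator<String> call(String s) {
        // Split the line on spaces and return an iterator over the words.
        return Arrays.asList(s.split(" ")).iterator();
    }
});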

Example 3: extractEntry

import org.apache.spark.api.java.JavaRDD; // import the package/class this method depends on
/**
 * Extracts the given resource type from the RDD of bundles and returns
 * it as a Dataset of that type.
 *
 * @param spark the spark session
 * @param bundles an RDD of FHIR Bundles
 * @param resourceName the FHIR name of the resource type to extract
 *     (e.g., condition, patient, etc.).
 * @param encoders the FhirEncoders instance defining how the resources are encoded.
 * @param <T> the type of the resource being extracted from the bundles.
 * @return a dataset of the given resource
 */
public static <T extends IBaseResource> Dataset<T> extractEntry(SparkSession spark,
    JavaRDD<Bundle> bundles,
    String resourceName,
    FhirEncoders encoders) {

  RuntimeResourceDefinition def = context.getResourceDefinition(resourceName);

  JavaRDD<T> resourceRdd = bundles.flatMap(new ToResource<T>(def.getName()));

  Encoder<T> encoder = encoders.of((Class<T>) def.getImplementingClass());

  return spark.createDataset(resourceRdd.rdd(), encoder);
}
 
Developer ID: cerner, Project: bunsen, Lines: 26, Source: Bundles.java
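A usage sketch for extractEntry (hypothetical call site, assuming a SparkSession named spark, a JavaRDD&lt;Bundle&gt; named bundles, and a FhirEncoders instance named encoders are already available; "Patient" is used purely as an illustrative resource name, with Patient as the corresponding HAPI FHIR model class):

// Extract all Patient resources contained in the bundles as a typed Dataset.
Dataset<Patient> patients = Bundles.extractEntry(spark, bundles, "Patient", encoders);
patients.show();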

Example 4: main

import org.apache.spark.api.java.JavaRDD; // import the package/class this method depends on
public static void main(String[] args) throws ParseException {

    final Validator validator = new Validator(args);
    ValidatorParameters params = validator.getParameters();
    validator.setDoPrintInProcessRecord(false);

    logger.info("Input file is " + params.getArgs());
    SparkConf conf = new SparkConf().setAppName("MarcCompletenessCount");
    JavaSparkContext context = new JavaSparkContext(conf);

    System.err.println(validator.getParameters().formatParameters());

    JavaRDD<String> inputFile = context.textFile(validator.getParameters().getArgs()[0]);

    JavaRDD<String> baseCountsRDD = inputFile
        .flatMap(content -> {
            MarcReader reader = ReadMarc.getMarcStringReader(content);
            Record marc4jRecord = reader.next();
            MarcRecord marcRecord = MarcFactory.createFromMarc4j(
                marc4jRecord, params.getDefaultRecordType(), params.getMarcVersion(), params.fixAlephseq());
            validator.processRecord(marcRecord, 1);
            return ValidationErrorFormatter
                .formatForSummary(marcRecord.getValidationErrors(), params.getFormat())
                .iterator();
        });
    baseCountsRDD.saveAsTextFile(validator.getParameters().getFileName());
}
 
Developer ID: pkiraly, Project: metadata-qa-marc, Lines: 29, Source: ParallelValidator.java

Example 5: modifyQuads

import org.apache.spark.api.java.JavaRDD; // import the package/class this method depends on
/**
 * Modify the RDD of quads in any way needed (filtering, flatMapping, ...).
 *
 * @param quads RDD of quads to modify
 * @return modified RDD of quads; returns the original RDD by default
 */
@Override
public JavaRDD<Quad> modifyQuads(JavaRDD<Quad> quads) {
    final String labelURI = RDFS.LABEL.toString();
    return quads.flatMap(quad -> {
        if (quad.getSubject().isURI()) {
            String subjectURI = quad.getSubject().getURI();
            // for each quad specifying property label, create label quads for each URI variant of this property
            // done because Wikidata only provides entity labels, for example http://www.wikidata.org/entity/P279 and not http://www.wikidata.org/prop/direct/P279
            if (subjectURI.contains(PROPERTY_ENTITY_PREFIX) && quad.getPredicate().getURI().equals(labelURI)) {
                return Sets.newHashSet(
                        quad,
                        new Quad(quad.getGraph(),
                                NodeFactory.createURI(subjectURI.replace(PROPERTY_ENTITY_PREFIX, PROPERTY_DIRECT_PREFIX)),
                                quad.getPredicate(),
                                quad.getObject()),
                        new Quad(quad.getGraph(),
                                NodeFactory.createURI(subjectURI.replace(PROPERTY_ENTITY_PREFIX, PROPERTY_STATEMENT_PREFIX)),
                                quad.getPredicate(),
                                quad.getObject()),
                        new Quad(quad.getGraph(),
                                NodeFactory.createURI(subjectURI.replace(PROPERTY_ENTITY_PREFIX, PROPERTY_QUALIFIER_PREFIX)),
                                quad.getPredicate(),
                                quad.getObject())
                );
            }
        }

        return Collections.singleton(quad);
    });
}
 
Developer ID: Merck, Project: rdf2x, Lines: 37, Source: WikidataFlavor.java

Example 6: getDistinctEntityProperties

import org.apache.spark.api.java.JavaRDD; // import the package/class this method depends on
/**
 * Reduce an RDD of {@link Instance}s into a map of [type index -&gt; list of its {@link Predicate}s and their properties (occurrences, is multiple)]
 *
 * @param instances  an RDD of {@link Instance}s
 * @param typeCounts map of type indexes to counts of their instances
 * @return map of [type index -&gt; list of its {@link Predicate}s and their properties (occurrences, is multiple)]
 */
private Map<Integer, List<EntityProperty>> getDistinctEntityProperties(JavaRDD<Instance> instances, Map<Integer, Long> typeCounts) {

    // all triples of (instance type, instance predicate, is multiple valued predicate)
    JavaRDD<Tuple3<Integer, Predicate, Boolean>> typePredicates = instances.flatMap(instance -> {
        Set<Predicate> predicates = instance.getLiteralPredicates();
        return instance.getTypes().stream()
                .flatMap(typeInt -> predicates.stream()
                        .map(predicate -> new Tuple3<>(
                                typeInt, // type index
                                predicate, // predicate
                                instance.getLiteralValue(predicate) instanceof Set // is multiple valued
                        ))
                ).collect(Collectors.toList());
    });

    return typePredicates
            .mapToPair(typePredicate -> new Tuple2<>(
                            new Tuple2<>(typePredicate._1(), typePredicate._2()), // predicate in type
                            new Tuple2<>(1L, typePredicate._3()) // count, is multiple valued
                    )
            )
            // get properties of each predicate in a specific type (will become a column)
            .reduceByKey((a, b) -> new Tuple2<>(
                    a._1() + b._1(), // sum counts
                    a._2() || b._2() // is multiple if it is multiple in any instance
            ))
            // collect to Java list
            .collect().stream()
            // group by type -> list of predicates and their properties
            .collect(Collectors.groupingBy(
                    typePredicate -> typePredicate._1()._1(),
                    Collectors.mapping(
                            typePredicate -> new EntityProperty(
                                    typePredicate._1()._2(), // predicate index
                                    typePredicate._2()._2(), // is multiple
                                    typePredicate._2()._1() / ((double) typeCounts.get(typePredicate._1()._1())) // non-null ratio
                            ),
                            Collectors.toList())
            ));

}
 
Developer ID: Merck, Project: rdf2x, Lines: 49, Source: EntitySchemaCollector.java

Example 7: writeEntityAttributeValueTable

import org.apache.spark.api.java.JavaRDD; // import the package/class this method depends on
/**
 * Persist the Entity Attribute Value table
 *
 * @param entitySchema entity schema
 * @param instances    RDD of {@link Instance}s
 */
public void writeEntityAttributeValueTable(EntitySchema entitySchema, JavaRDD<Instance> instances) {

    IndexMap<String> typeIndex = rdfSchema.getTypeIndex();
    // create the schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(ID_COLUMN_NAME, DataTypes.LongType, false));
    fields.add(DataTypes.createStructField(PREDICATE_COLUMN_NAME, DataTypes.IntegerType, false));
    fields.add(DataTypes.createStructField(EAV_DATATYPE_COLUMN_NAME, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(EAV_LANGUAGE_COLUMN_NAME, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(EAV_VALUE_COLUMN_NAME, DataTypes.StringType, false));
    StructType schema = DataTypes.createStructType(fields);

    List<Tuple2<String, String>> indexes = new ArrayList<>();
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, ID_COLUMN_NAME));
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, PREDICATE_COLUMN_NAME));
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, EAV_DATATYPE_COLUMN_NAME));
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, EAV_LANGUAGE_COLUMN_NAME));

    // get map of type index -> set of attributes
    Map<Integer, Set<Predicate>> typeEavPredicates = entitySchema.getTables().stream()
            .collect(Collectors.toMap(
                    table -> typeIndex.getIndex(table.getTypeURI()),
                    table -> table.getAttributes().stream()
                            .map(EntityProperty::getPredicate)
                            .collect(Collectors.toSet())
            ));

    // get all entity attribute values
    JavaRDD<Row> rowRDD = instances.flatMap(instance ->
            instance.getLiteralPredicates().stream()
                    // filter predicates that are in the EAV set of at least one of the instance types
                    .filter(predicate -> instance.getTypes().stream().anyMatch(type ->
                            typeEavPredicates.containsKey(type) && // type could have been removed (not enough rows, ...)
                                    typeEavPredicates.get(type).contains(predicate)
                    ))
                    // map to row of values
                    .flatMap(predicate -> {
                                Object value = instance.getLiteralValue(predicate);
                                if (value instanceof Set) {
                                    // return a row for each single value
                                    return ((Set<Object>) value).stream().map(val -> getAttributeRow(instance, predicate, val));
                                }
                                return Stream.of(getAttributeRow(instance, predicate, value));
                            }
                    )
                    .collect(Collectors.toList())
    );

    int predicateCount = typeEavPredicates.values().stream().collect(Collectors.summingInt(Set::size));

    // create and write the dataframe
    log.info("Writing EAV table of {} predicates", predicateCount);
    DataFrame df = sql.createDataFrame(rowRDD, schema);
    persistor.writeDataFrame(EAV_TABLE_NAME, df);
    log.info("Creating indexes for EAV table");
    persistor.createIndexes(indexes);
    df.unpersist();
}
 
Developer ID: Merck, Project: rdf2x, Lines: 65, Source: InstanceRelationWriter.java


Note: The org.apache.spark.api.java.JavaRDD.flatMap method examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors. For distribution and use, please refer to the License of the corresponding project. Do not republish without permission.