

Java JavaRDD.flatMap Method Code Examples

This article collects typical usage examples of the Java method org.apache.spark.api.java.JavaRDD.flatMap. If you are wondering how JavaRDD.flatMap is used in practice, what it is for, or what working examples look like, the curated code samples below should help. You can also explore further usage examples of the containing class, org.apache.spark.api.java.JavaRDD.


The following shows 7 code examples of JavaRDD.flatMap, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
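Before looking at the individual examples, here is a minimal, self-contained sketch of a typical JavaRDD.flatMap call. It is not taken from any of the projects below; the "input.txt" path and the local master are placeholders. Note that in Spark 2.x the FlatMapFunction passed to flatMap must return an Iterator, whereas in Spark 1.x it returned an Iterable, which is why some of the examples below call iterator() while others return a Set or List directly.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class FlatMapSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("flatMap sketch");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // One RDD element per line of the input file.
        JavaRDD<String> lines = sc.textFile("input.txt");

        // flatMap maps each line to zero or more words and flattens the result;
        // the lambda is a FlatMapFunction<String, String> returning an Iterator<String> (Spark 2.x).
        JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());

        System.out.println("Distinct words: " + words.distinct().count());
        sc.close();
    }
}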

Example 1: main

import org.apache.spark.api.java.JavaRDD; // import of the class the method depends on
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.println("Usage:");
        System.err.println("  SparkWordCount <sourceFile> <targetFile>");
        System.exit(1);
    }

    SparkConf conf = new SparkConf()
            .setAppName("Word Count");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> textFile = sc.textFile(args[0]);
    JavaRDD<String> words = textFile.flatMap(LineIterator::new);
    JavaPairRDD<String, Long> pairs =
            words.mapToPair(s -> new Tuple2<>(s, 1L));
    JavaPairRDD<String, Long> counts =
            pairs.reduceByKey((Function2<Long, Long, Long>) (a, b) -> a + b);

    System.out.println("Starting task..");
    long t = System.currentTimeMillis();
    counts.saveAsTextFile(args[1] + "_" + t);
    System.out.println("Time=" + (System.currentTimeMillis() - t));
}
 
Developer: hazelcast | Project: big-data-benchmark | Lines: 23 | Source: SparkWordCount.java
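The LineIterator class referenced above via LineIterator::new is part of the big-data-benchmark project and is not shown on this page. A plausible reconstruction, offered only as an assumption about what it does (iterating over the whitespace-separated words of one line so that it can serve as the Iterator returned by the FlatMapFunction), might look like this:

import java.util.Iterator;
import java.util.NoSuchElementException;

// Hypothetical reconstruction, not the actual project source.
public class LineIterator implements Iterator<String> {
    private final String[] words;
    private int position;

    public LineIterator(String line) {
        // Assumes words are separated by whitespace; empty lines are not handled specially here.
        this.words = line.trim().split("\\s+");
    }

    @Override
    public boolean hasNext() {
        return position < words.length;
    }

    @Override
    public String next() {
        if (!hasNext()) {
            throw new NoSuchElementException();
        }
        return words[position++];
    }
}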

Example 2: wordCountJava8

import org.apache.spark.api.java.JavaRDD; // import of the class the method depends on
public static void wordCountJava8( String filename )
{
    // Define a configuration to use to interact with Spark
    SparkConf conf = new SparkConf().setMaster("local").setAppName("Word Count App");

    // Create a Java version of the Spark Context from the configuration
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Load the input data, which is a text file read from the command line
    JavaRDD<String> input = sc.textFile( filename );

    // Java 8 with lambdas: split the input string into words
    JavaRDD<String> words = input.flatMap( s -> Arrays.asList( s.split( " " ) ).iterator() );

    // Java 8 with lambdas: transform the collection of words into pairs (word and 1) and then count them
    JavaPairRDD<String, Integer> counts = words.mapToPair( t -> new Tuple2<>( t, 1 ) ).reduceByKey( (x, y) -> x + y );

    // Save the word count back out to a text file, causing evaluation.
    counts.saveAsTextFile( "output" );
}
 
Developer: PacktPublishing | Project: Apache-Spark-2x-for-Java-Developers | Lines: 22 | Source: WordCount.java
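For readers less used to lambdas, the flatMap call in this example can also be written with an explicit FlatMapFunction. The following is just an equivalent rewrite of the line above, not code from the book's repository, and it relies on the Spark 2.x signature in which call() returns an Iterator:

import java.util.Arrays;
import java.util.Iterator;
import org.apache.spark.api.java.function.FlatMapFunction;

// Equivalent to: input.flatMap( s -> Arrays.asList( s.split( " " ) ).iterator() )
JavaRDD<String> words = input.flatMap(new FlatMapFunction<String, String>() {
    @Override
    public Iterator<String> call(String s) {
        return Arrays.asList(s.split(" ")).iterator();
    }
});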

Example 3: extractEntry

import org.apache.spark.api.java.JavaRDD; // import of the class the method depends on
/**
 * Extracts the given resource type from the RDD of bundles and returns
 * it as a Dataset of that type.
 *
 * @param spark the spark session
 * @param bundles an RDD of FHIR Bundles
 * @param resourceName the FHIR name of the resource type to extract
 *     (e.g., condition, patient, etc.).
 * @param encoders the Encoders instance defining how the resources are encoded.
 * @param <T> the type of the resource being extracted from the bundles.
 * @return a dataset of the given resource
 */
public static <T extends IBaseResource> Dataset<T> extractEntry(SparkSession spark,
    JavaRDD<Bundle> bundles,
    String resourceName,
    FhirEncoders encoders) {

  RuntimeResourceDefinition def = context.getResourceDefinition(resourceName);

  JavaRDD<T> resourceRdd = bundles.flatMap(new ToResource<T>(def.getName()));

  Encoder<T> encoder = encoders.of((Class<T>) def.getImplementingClass());

  return spark.createDataset(resourceRdd.rdd(), encoder);
}
 
Developer: cerner | Project: bunsen | Lines: 26 | Source: Bundles.java
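A hypothetical call site for extractEntry, based only on the signature shown above. The empty RDD, the STU3 model classes, the com.cerner.bunsen package locations, and the FhirEncoders.forStu3().getOrCreate() factory call are assumptions for illustration and may differ between Bunsen versions:

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SparkSession;
import org.hl7.fhir.dstu3.model.Bundle;
import org.hl7.fhir.dstu3.model.Condition;

import com.cerner.bunsen.Bundles;       // assumed package of the class shown above
import com.cerner.bunsen.FhirEncoders;  // assumed package of the encoders factory

SparkSession spark = SparkSession.builder().master("local").appName("bundle demo").getOrCreate();
JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

// In a real job the bundles would be loaded from storage; an empty RDD keeps the sketch self-contained.
JavaRDD<Bundle> bundles = jsc.emptyRDD();

// Assumed encoder factory; check the Bunsen version you use for the exact call.
FhirEncoders encoders = FhirEncoders.forStu3().getOrCreate();

// Extract all Condition resources contained in the bundles as a typed Dataset.
Dataset<Condition> conditions = Bundles.extractEntry(spark, bundles, "condition", encoders);
conditions.show();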

Example 4: main

import org.apache.spark.api.java.JavaRDD; // import of the class the method depends on
public static void main(String[] args) throws ParseException {

    final Validator validator = new Validator(args);
    ValidatorParameters params = validator.getParameters();
    validator.setDoPrintInProcessRecord(false);

    logger.info("Input file is " + params.getArgs()[0]);
    SparkConf conf = new SparkConf().setAppName("MarcCompletenessCount");
    JavaSparkContext context = new JavaSparkContext(conf);

    System.err.println(validator.getParameters().formatParameters());

    JavaRDD<String> inputFile = context.textFile(validator.getParameters().getArgs()[0]);

    JavaRDD<String> baseCountsRDD = inputFile
        .flatMap(content -> {
            MarcReader reader = ReadMarc.getMarcStringReader(content);
            Record marc4jRecord = reader.next();
            MarcRecord marcRecord = MarcFactory.createFromMarc4j(
                marc4jRecord, params.getDefaultRecordType(), params.getMarcVersion(), params.fixAlephseq());
            validator.processRecord(marcRecord, 1);
            return ValidationErrorFormatter
                .formatForSummary(marcRecord.getValidationErrors(), params.getFormat())
                .iterator();
        });

    baseCountsRDD.saveAsTextFile(validator.getParameters().getFileName());
}
 
Developer: pkiraly | Project: metadata-qa-marc | Lines: 29 | Source: ParallelValidator.java

Example 5: modifyQuads

import org.apache.spark.api.java.JavaRDD; // import of the class the method depends on
/**
 * Modify RDD of quads in any needed way (filtering, flatMapping, ...)
 *
 * @param quads RDD of quads to modify
 * @return modified RDD of quads, returns original RDD in default
 */
@Override
public JavaRDD<Quad> modifyQuads(JavaRDD<Quad> quads) {
    final String labelURI = RDFS.LABEL.toString();
    return quads.flatMap(quad -> {
        if (quad.getSubject().isURI()) {
            String subjectURI = quad.getSubject().getURI();
            // for each quad specifying property label, create label quads for each URI variant of this property
            // done because Wikidata only provides entity labels, for example http://www.wikidata.org/entity/P279 and not http://www.wikidata.org/prop/direct/P279
            if (subjectURI.contains(PROPERTY_ENTITY_PREFIX) && quad.getPredicate().getURI().equals(labelURI)) {
                return Sets.newHashSet(
                        quad,
                        new Quad(quad.getGraph(),
                                NodeFactory.createURI(subjectURI.replace(PROPERTY_ENTITY_PREFIX, PROPERTY_DIRECT_PREFIX)),
                                quad.getPredicate(),
                                quad.getObject()),
                        new Quad(quad.getGraph(),
                                NodeFactory.createURI(subjectURI.replace(PROPERTY_ENTITY_PREFIX, PROPERTY_STATEMENT_PREFIX)),
                                quad.getPredicate(),
                                quad.getObject()),
                        new Quad(quad.getGraph(),
                                NodeFactory.createURI(subjectURI.replace(PROPERTY_ENTITY_PREFIX, PROPERTY_QUALIFIER_PREFIX)),
                                quad.getPredicate(),
                                quad.getObject())
                );
            }
        }

        return Collections.singleton(quad);
    });
}
 
Developer: Merck | Project: rdf2x | Lines: 37 | Source: WikidataFlavor.java
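The PROPERTY_* prefix constants used above are defined elsewhere in the rdf2x project and are not shown in this snippet. Judging from the comment and the example URIs it mentions, they presumably correspond to the standard Wikidata property namespaces. The values below are an assumption, not the actual source; the trailing "P" is included so that only property entities (P…) match the contains() check:

// Assumed values, inferred from the comment in the method above.
private static final String PROPERTY_ENTITY_PREFIX    = "http://www.wikidata.org/entity/P";
private static final String PROPERTY_DIRECT_PREFIX    = "http://www.wikidata.org/prop/direct/P";
private static final String PROPERTY_STATEMENT_PREFIX = "http://www.wikidata.org/prop/statement/P";
private static final String PROPERTY_QUALIFIER_PREFIX = "http://www.wikidata.org/prop/qualifier/P";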

Example 6: getDistinctEntityProperties

import org.apache.spark.api.java.JavaRDD; // import of the class the method depends on
/**
 * Reduce an RDD of {@link Instance}s into a map of [type index -&gt; list of its {@link Predicate}s and their properties (occurrences, is multiple)]
 *
 * @param instances  an RDD of {@link Instance}s
 * @param typeCounts map of type indexes to counts of their instances
 * @return map of [type index -&gt; list of its {@link Predicate}s and their properties (occurrences, is multiple)]
 */
private Map<Integer, List<EntityProperty>> getDistinctEntityProperties(JavaRDD<Instance> instances, Map<Integer, Long> typeCounts) {

    // all triples of (instance type, instance predicate, is multiple valued predicate)
    JavaRDD<Tuple3<Integer, Predicate, Boolean>> typePredicates = instances.flatMap(instance -> {
        Set<Predicate> predicates = instance.getLiteralPredicates();
        return instance.getTypes().stream()
                .flatMap(typeInt -> predicates.stream()
                        .map(predicate -> new Tuple3<>(
                                typeInt, // type index
                                predicate, // predicate
                                instance.getLiteralValue(predicate) instanceof Set // is multiple valued
                        ))
                ).collect(Collectors.toList());
    });

    return typePredicates
            .mapToPair(typePredicate -> new Tuple2<>(
                            new Tuple2<>(typePredicate._1(), typePredicate._2()), // predicate in type
                            new Tuple2<>(1L, typePredicate._3()) // count, is multiple valued
                    )
            )
            // get properties of each predicate in a specific type (will become a column)
            .reduceByKey((a, b) -> new Tuple2<>(
                    a._1() + b._1(), // sum counts
                    a._2() || b._2() // is multiple if it is multiple in any instance
            ))
            // collect to Java list
            .collect().stream()
            // group by type -> list of predicates and their properties
            .collect(Collectors.groupingBy(
                    typePredicate -> typePredicate._1()._1(),
                    Collectors.mapping(
                            typePredicate -> new EntityProperty(
                                    typePredicate._1()._2(), // predicate index
                                    typePredicate._2()._2(), // is multiple
                                    typePredicate._2()._1() / ((double) typeCounts.get(typePredicate._1()._1())) // non-null ratio
                            ),
                            Collectors.toList())
            ));

}
 
Developer: Merck | Project: rdf2x | Lines: 49 | Source: EntitySchemaCollector.java

Example 7: writeEntityAttributeValueTable

import org.apache.spark.api.java.JavaRDD; // import of the class the method depends on
/**
 * Persist the Entity Attribute Value table
 *
 * @param entitySchema entity schema
 * @param instances    RDD of {@link Instance}s
 */
public void writeEntityAttributeValueTable(EntitySchema entitySchema, JavaRDD<Instance> instances) {

    IndexMap<String> typeIndex = rdfSchema.getTypeIndex();
    // create the schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(ID_COLUMN_NAME, DataTypes.LongType, false));
    fields.add(DataTypes.createStructField(PREDICATE_COLUMN_NAME, DataTypes.IntegerType, false));
    fields.add(DataTypes.createStructField(EAV_DATATYPE_COLUMN_NAME, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(EAV_LANGUAGE_COLUMN_NAME, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(EAV_VALUE_COLUMN_NAME, DataTypes.StringType, false));
    StructType schema = DataTypes.createStructType(fields);

    List<Tuple2<String, String>> indexes = new ArrayList<>();
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, ID_COLUMN_NAME));
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, PREDICATE_COLUMN_NAME));
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, EAV_DATATYPE_COLUMN_NAME));
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, EAV_LANGUAGE_COLUMN_NAME));

    // get map of type index -> set of attributes
    Map<Integer, Set<Predicate>> typeEavPredicates = entitySchema.getTables().stream()
            .collect(Collectors.toMap(
                    table -> typeIndex.getIndex(table.getTypeURI()),
                    table -> table.getAttributes().stream()
                            .map(EntityProperty::getPredicate)
                            .collect(Collectors.toSet())
            ));

    // get all entity attribute values
    JavaRDD<Row> rowRDD = instances.flatMap(instance ->
            instance.getLiteralPredicates().stream()
                    // filter predicates that are in the EAV set of at least one of the instance types
                    .filter(predicate -> instance.getTypes().stream().anyMatch(type ->
                            typeEavPredicates.containsKey(type) && // type could have been removed (not enough rows, ...)
                                    typeEavPredicates.get(type).contains(predicate)
                    ))
                    // map to row of values
                    .flatMap(predicate -> {
                                Object value = instance.getLiteralValue(predicate);
                                if (value instanceof Set) {
                                    // return a row for each single value
                                    return ((Set<Object>) value).stream().map(val -> getAttributeRow(instance, predicate, val));
                                }
                                return Stream.of(getAttributeRow(instance, predicate, value));
                            }
                    )
                    .collect(Collectors.toList())
    );

    int predicateCount = typeEavPredicates.values().stream().collect(Collectors.summingInt(Set::size));

    // create and write the dataframe
    log.info("Writing EAV table of {} predicates", predicateCount);
    DataFrame df = sql.createDataFrame(rowRDD, schema);
    persistor.writeDataFrame(EAV_TABLE_NAME, df);
    log.info("Creating indexes for EAV table");
    persistor.createIndexes(indexes);
    df.unpersist();
}
 
Developer: Merck | Project: rdf2x | Lines: 65 | Source: InstanceRelationWriter.java


Note: The org.apache.spark.api.java.JavaRDD.flatMap method examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets are selected from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors. Please consult the corresponding project's license before distributing or using the code, and do not reproduce this article without permission.