This article collects typical usage examples of the JavaRDD.flatMap method from the Java class org.apache.spark.api.java.JavaRDD. If you are wondering what JavaRDD.flatMap does, how to use it, or where to find examples, the curated code samples below may help. You can also read further about the enclosing class org.apache.spark.api.java.JavaRDD.
The following presents 7 code examples of JavaRDD.flatMap, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
Example 1: main
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.println("Usage:");
        System.err.println(" SparkWordCount <sourceFile> <targetFile>");
        System.exit(1);
    }
    SparkConf conf = new SparkConf()
            .setAppName("Word Count");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> textFile = sc.textFile(args[0]);
    JavaRDD<String> words = textFile.flatMap(LineIterator::new);
    JavaPairRDD<String, Long> pairs =
            words.mapToPair(s -> new Tuple2<>(s, 1L));
    JavaPairRDD<String, Long> counts =
            pairs.reduceByKey((Function2<Long, Long, Long>) (a, b) -> a + b);
    System.out.println("Starting task...");
    long t = System.currentTimeMillis();
    counts.saveAsTextFile(args[1] + "_" + t);
    System.out.println("Time=" + (System.currentTimeMillis() - t));
}
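The LineIterator class referenced above is not shown in this snippet. Below is a minimal sketch of what it could look like, assuming it simply walks over the whitespace-separated tokens of one line; since Spark 2.0, flatMap expects a function that returns an Iterator, which is why the constructor reference LineIterator::new can be passed directly.
import java.io.Serializable;
import java.util.Iterator;

// Hypothetical sketch of LineIterator: wraps a single text line and iterates over its words
public class LineIterator implements Iterator<String>, Serializable {
    private final String[] tokens;
    private int position = 0;

    public LineIterator(String line) {
        this.tokens = line.split(" ");
    }

    @Override
    public boolean hasNext() {
        return position < tokens.length;
    }

    @Override
    public String next() {
        return tokens[position++];
    }
}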
Example 2: wordCountJava8
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
public static void wordCountJava8(String filename)
{
    // Define a configuration to use to interact with Spark
    SparkConf conf = new SparkConf().setMaster("local").setAppName("Word Count App");
    // Create a Java version of the Spark Context from the configuration
    JavaSparkContext sc = new JavaSparkContext(conf);
    // Load the input data, which is a text file read from the command line
    JavaRDD<String> input = sc.textFile(filename);
    // Java 8 with lambdas: split the input string into words
    // (since Spark 2.0, flatMap takes a function returning an Iterator rather than an Iterable)
    JavaRDD<String> words = input.flatMap(s -> Arrays.asList(s.split(" ")).iterator());
    // Java 8 with lambdas: transform the collection of words into pairs (word and 1) and then count them
    JavaPairRDD<String, Integer> counts = words.mapToPair(t -> new Tuple2<>(t, 1)).reduceByKey((x, y) -> x + y);
    // Save the word count back out to a text file, causing evaluation.
    counts.saveAsTextFile("output");
}
Example 3: extractEntry
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
/**
 * Extracts the given resource type from the RDD of bundles and returns
 * it as a Dataset of that type.
 *
 * @param spark the Spark session
 * @param bundles an RDD of FHIR Bundles
 * @param resourceName the FHIR name of the resource type to extract
 *     (e.g., Condition, Patient, etc.)
 * @param encoders the Encoders instance defining how the resources are encoded.
 * @param <T> the type of the resource being extracted from the bundles.
 * @return a dataset of the given resource
 */
public static <T extends IBaseResource> Dataset<T> extractEntry(SparkSession spark,
        JavaRDD<Bundle> bundles,
        String resourceName,
        FhirEncoders encoders) {
    RuntimeResourceDefinition def = context.getResourceDefinition(resourceName);
    JavaRDD<T> resourceRdd = bundles.flatMap(new ToResource<T>(def.getName()));
    Encoder<T> encoder = encoders.of((Class<T>) def.getImplementingClass());
    return spark.createDataset(resourceRdd.rdd(), encoder);
}
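A hypothetical invocation of this method, assuming the SparkSession, the bundle RDD, and the FhirEncoders instance are already set up as in the signature above; the resource name and the Condition type from the HAPI FHIR model are used purely as an illustration.
// Hypothetical usage: extract all Condition resources contained in the bundles
Dataset<Condition> conditions = extractEntry(spark, bundles, "Condition", encoders);
conditions.show();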
Example 4: main
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
public static void main(String[] args) throws ParseException {
    final Validator validator = new Validator(args);
    ValidatorParameters params = validator.getParameters();
    validator.setDoPrintInProcessRecord(false);
    logger.info("Input file is " + params.getArgs());
    SparkConf conf = new SparkConf().setAppName("MarcCompletenessCount");
    JavaSparkContext context = new JavaSparkContext(conf);
    System.err.println(validator.getParameters().formatParameters());
    JavaRDD<String> inputFile = context.textFile(validator.getParameters().getArgs()[0]);
    JavaRDD<String> baseCountsRDD = inputFile
        .flatMap(content -> {
            MarcReader reader = ReadMarc.getMarcStringReader(content);
            Record marc4jRecord = reader.next();
            MarcRecord marcRecord = MarcFactory.createFromMarc4j(
                marc4jRecord, params.getDefaultRecordType(), params.getMarcVersion(), params.fixAlephseq());
            validator.processRecord(marcRecord, 1);
            return ValidationErrorFormatter
                .formatForSummary(marcRecord.getValidationErrors(), params.getFormat())
                .iterator();
        });
    baseCountsRDD.saveAsTextFile(validator.getParameters().getFileName());
}
Example 5: modifyQuads
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
/**
 * Modify the RDD of quads in any needed way (filtering, flatMapping, ...)
 *
 * @param quads RDD of quads to modify
 * @return the modified RDD of quads; returns the original RDD by default
 */
@Override
public JavaRDD<Quad> modifyQuads(JavaRDD<Quad> quads) {
    final String labelURI = RDFS.LABEL.toString();
    return quads.flatMap(quad -> {
        if (quad.getSubject().isURI()) {
            String subjectURI = quad.getSubject().getURI();
            // for each quad specifying a property label, create label quads for each URI variant of this property,
            // because Wikidata only provides entity labels, e.g. http://www.wikidata.org/entity/P279 and not http://www.wikidata.org/prop/direct/P279
            if (subjectURI.contains(PROPERTY_ENTITY_PREFIX) && quad.getPredicate().getURI().equals(labelURI)) {
                return Sets.newHashSet(
                        quad,
                        new Quad(quad.getGraph(),
                                NodeFactory.createURI(subjectURI.replace(PROPERTY_ENTITY_PREFIX, PROPERTY_DIRECT_PREFIX)),
                                quad.getPredicate(),
                                quad.getObject()),
                        new Quad(quad.getGraph(),
                                NodeFactory.createURI(subjectURI.replace(PROPERTY_ENTITY_PREFIX, PROPERTY_STATEMENT_PREFIX)),
                                quad.getPredicate(),
                                quad.getObject()),
                        new Quad(quad.getGraph(),
                                NodeFactory.createURI(subjectURI.replace(PROPERTY_ENTITY_PREFIX, PROPERTY_QUALIFIER_PREFIX)),
                                quad.getPredicate(),
                                quad.getObject())
                );
            }
        }
        return Collections.singleton(quad);
    });
}
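The prefix constants are defined outside this snippet. Judging from the comment in the code, they presumably correspond to the usual Wikidata namespaces; the exact values in the original source may differ.
// Assumed values of the URI prefixes used above (matching the Wikidata URIs named in the comment)
private static final String PROPERTY_ENTITY_PREFIX = "http://www.wikidata.org/entity/";
private static final String PROPERTY_DIRECT_PREFIX = "http://www.wikidata.org/prop/direct/";
private static final String PROPERTY_STATEMENT_PREFIX = "http://www.wikidata.org/prop/statement/";
private static final String PROPERTY_QUALIFIER_PREFIX = "http://www.wikidata.org/prop/qualifier/";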
Example 6: getDistinctEntityProperties
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
/**
 * Reduce an RDD of {@link Instance}s into a map of [type index -> list of its {@link Predicate}s and their properties (occurrences, is multiple)]
 *
 * @param instances an RDD of {@link Instance}s
 * @param typeCounts map of type indexes to counts of their instances
 * @return map of [type index -> list of its {@link Predicate}s and their properties (occurrences, is multiple)]
 */
private Map<Integer, List<EntityProperty>> getDistinctEntityProperties(JavaRDD<Instance> instances, Map<Integer, Long> typeCounts) {
    // all triples of (instance type, instance predicate, is multiple valued predicate)
    JavaRDD<Tuple3<Integer, Predicate, Boolean>> typePredicates = instances.flatMap(instance -> {
        Set<Predicate> predicates = instance.getLiteralPredicates();
        return instance.getTypes().stream()
                .flatMap(typeInt -> predicates.stream()
                        .map(predicate -> new Tuple3<>(
                                typeInt, // type index
                                predicate, // predicate
                                instance.getLiteralValue(predicate) instanceof Set // is multiple valued
                        ))
                ).collect(Collectors.toList());
    });
    return typePredicates
            .mapToPair(typePredicate -> new Tuple2<>(
                    new Tuple2<>(typePredicate._1(), typePredicate._2()), // predicate in type
                    new Tuple2<>(1L, typePredicate._3()) // count, is multiple valued
            ))
            // get properties of each predicate in a specific type (will become a column)
            .reduceByKey((a, b) -> new Tuple2<>(
                    a._1() + b._1(), // sum counts
                    a._2() || b._2() // is multiple if it is multiple in any instance
            ))
            // collect to Java list
            .collect().stream()
            // group by type -> list of predicates and their properties
            .collect(Collectors.groupingBy(
                    typePredicate -> typePredicate._1()._1(),
                    Collectors.mapping(
                            typePredicate -> new EntityProperty(
                                    typePredicate._1()._2(), // predicate index
                                    typePredicate._2()._2(), // is multiple
                                    typePredicate._2()._1() / ((double) typeCounts.get(typePredicate._1()._1())) // non-null ratio
                            ),
                            Collectors.toList())
            ));
}
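EntityProperty is not shown in this listing. Below is a hypothetical sketch of its shape, inferred only from how it is constructed above and read in Example 7; the field names and accessors are assumptions.
// Hypothetical value class holding the per-type statistics of one predicate
public class EntityProperty implements java.io.Serializable {
    private final Predicate predicate;     // the predicate this column describes
    private final boolean multipleValued;  // true if any instance stores a set of values for it
    private final double nonNullRatio;     // occurrences divided by the number of instances of the type

    public EntityProperty(Predicate predicate, boolean multipleValued, double nonNullRatio) {
        this.predicate = predicate;
        this.multipleValued = multipleValued;
        this.nonNullRatio = nonNullRatio;
    }

    public Predicate getPredicate() { return predicate; }
    public boolean isMultipleValued() { return multipleValued; }
    public double getNonNullRatio() { return nonNullRatio; }
}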
Example 7: writeEntityAttributeValueTable
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
/**
* Persist the Entity Attribute Value table
*
* @param entitySchema entity schema
* @param instances RDD of {@link Instance}s
*/
public void writeEntityAttributeValueTable(EntitySchema entitySchema, JavaRDD<Instance> instances) {
    IndexMap<String> typeIndex = rdfSchema.getTypeIndex();
    // create the schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(ID_COLUMN_NAME, DataTypes.LongType, false));
    fields.add(DataTypes.createStructField(PREDICATE_COLUMN_NAME, DataTypes.IntegerType, false));
    fields.add(DataTypes.createStructField(EAV_DATATYPE_COLUMN_NAME, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(EAV_LANGUAGE_COLUMN_NAME, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(EAV_VALUE_COLUMN_NAME, DataTypes.StringType, false));
    StructType schema = DataTypes.createStructType(fields);
    List<Tuple2<String, String>> indexes = new ArrayList<>();
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, ID_COLUMN_NAME));
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, PREDICATE_COLUMN_NAME));
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, EAV_DATATYPE_COLUMN_NAME));
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, EAV_LANGUAGE_COLUMN_NAME));
    // get map of type index -> set of attributes
    Map<Integer, Set<Predicate>> typeEavPredicates = entitySchema.getTables().stream()
            .collect(Collectors.toMap(
                    table -> typeIndex.getIndex(table.getTypeURI()),
                    table -> table.getAttributes().stream()
                            .map(EntityProperty::getPredicate)
                            .collect(Collectors.toSet())
            ));
    // get all entity attribute values
    JavaRDD<Row> rowRDD = instances.flatMap(instance ->
            instance.getLiteralPredicates().stream()
                    // filter predicates that are in the EAV set of at least one of the instance types
                    .filter(predicate -> instance.getTypes().stream().anyMatch(type ->
                            typeEavPredicates.containsKey(type) && // type could have been removed (not enough rows, ...)
                            typeEavPredicates.get(type).contains(predicate)
                    ))
                    // map to rows of values
                    .flatMap(predicate -> {
                        Object value = instance.getLiteralValue(predicate);
                        if (value instanceof Set) {
                            // return a row for each single value
                            return ((Set<Object>) value).stream().map(val -> getAttributeRow(instance, predicate, val));
                        }
                        return Stream.of(getAttributeRow(instance, predicate, value));
                    })
                    .collect(Collectors.toList())
    );
    int predicateCount = typeEavPredicates.values().stream().collect(Collectors.summingInt(Set::size));
    // create and write the dataframe
    log.info("Writing EAV table of {} predicates", predicateCount);
    DataFrame df = sql.createDataFrame(rowRDD, schema);
    persistor.writeDataFrame(EAV_TABLE_NAME, df);
    log.info("Creating indexes for EAV table");
    persistor.createIndexes(indexes);
    df.unpersist();
}
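The getAttributeRow helper is not part of the snippet. A rough sketch of what it might do is shown below, assuming accessors on Instance and Predicate that are not visible here; the datatype and language columns are left null in this simplified version, which the nullable schema above allows. RowFactory is org.apache.spark.sql.RowFactory.
// Hypothetical helper building one Row matching the five-column EAV schema defined above
private Row getAttributeRow(Instance instance, Predicate predicate, Object value) {
    return RowFactory.create(
            instance.getId(),                   // entity id (LongType) - assumed accessor
            predicateIndex.getIndex(predicate), // predicate index (IntegerType) - assumed index map
            null,                               // literal datatype (nullable)
            null,                               // language tag (nullable)
            String.valueOf(value));             // the literal value stored as a string
}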