本文整理汇总了Java中org.apache.spark.api.java.JavaRDD.filter方法的典型用法代码示例。如果您正苦于以下问题：Java JavaRDD.filter方法的具体用法？Java JavaRDD.filter怎么用？Java JavaRDD.filter使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.spark.api.java.JavaRDD的用法示例。
在下文中一共展示了JavaRDD.filter方法的9个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: splitNewDataToTrainTest
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Time-based split of newly arrived data into training and test sets. The boundary timestamp
 * is placed {@link #getTestFraction()} of the observed time range back from the latest
 * timestamp. Lines stamped strictly before the boundary become new training data; lines stamped
 * at or after it become test data. The resulting proportions are only approximate, because the
 * boundary assumes timestamps are spread fairly evenly over the range.
 *
 * @param newData input lines from which {@code MLFunctions.TO_TIMESTAMP_FN} extracts a timestamp
 * @return pair of (new training data, test data)
 */
@Override
protected Pair<JavaRDD<String>,JavaRDD<String>> splitNewDataToTrainTest(JavaRDD<String> newData) {
    StatCounter timestampStats =
        newData.mapToDouble(record -> MLFunctions.TO_TIMESTAMP_FN.call(record).doubleValue()).stats();
    long earliest = (long) timestampStats.min();
    long latest = (long) timestampStats.max();
    log.info("New data timestamp range: {} - {}", earliest, latest);

    // Step back from the latest timestamp by the test fraction of the whole range
    long timeSpan = latest - earliest;
    long boundary = (long) (latest - getTestFraction() * timeSpan);
    log.info("Splitting at timestamp {}", boundary);

    JavaRDD<String> beforeBoundary =
        newData.filter(record -> MLFunctions.TO_TIMESTAMP_FN.call(record) < boundary);
    JavaRDD<String> fromBoundaryOn =
        newData.filter(record -> MLFunctions.TO_TIMESTAMP_FN.call(record) >= boundary);
    return new Pair<>(beforeBoundary, fromBoundaryOn);
}
示例2: filter
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Restricts an RDD of {@link Instance}s to those having at least one of the configured types.
 * When no types are configured the input is returned untouched. When the config asks to ignore
 * other types, each surviving instance additionally has its type set reduced (in place) to the
 * accepted types only.
 *
 * @param instances RDD of instances to filter
 * @param typeIndex index mapping type URIs to integers
 * @return filtered RDD of instances
 */
public JavaRDD<Instance> filter(JavaRDD<Instance> instances, IndexMap<String> typeIndex) {
    if (config.getTypes().isEmpty()) {
        return instances;
    }

    // translate the accepted type URIs into their integer indexes
    Set<Integer> acceptedTypes = config.getTypes().stream()
        .map(typeIndex::getIndex)
        .collect(Collectors.toSet());

    JavaRDD<Instance> matching =
        instances.filter(instance -> !Collections.disjoint(instance.getTypes(), acceptedTypes));
    if (!config.isIgnoreOtherTypes()) {
        return matching;
    }

    // strip every type that is not accepted from each remaining instance
    return matching.map(instance -> {
        Set<Integer> ownTypes = instance.getTypes();
        Set<Integer> kept = Sets.intersection(ownTypes, acceptedTypes).immutableCopy();
        ownTypes.clear();
        ownTypes.addAll(kept);
        return instance;
    });
}
示例3: filterQuadsByForbiddenSubjects
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Drops quads whose subject is a URI contained in the given blacklist. Quads whose subject is
 * not a URI are always kept, and an empty blacklist returns the input RDD unchanged.
 *
 * @param quads RDD of quads to filter
 * @param subjectBlacklist set of subject URIs to be filtered out
 * @return filtered RDD with only those quads whose subject is NOT in subjectBlacklist
 */
public static JavaRDD<Quad> filterQuadsByForbiddenSubjects(JavaRDD<Quad> quads, Set<String> subjectBlacklist) {
    if (subjectBlacklist.isEmpty()) {
        return quads;
    }
    // a quad is rejected only when its subject is a URI AND that URI is blacklisted
    return quads.filter(quad ->
        !(quad.getSubject().isURI() && subjectBlacklist.contains(quad.getSubject().getURI())));
}
示例4: filterTypeQuads
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Keeps only quads that state the type of a resource: predicate and object must both be URIs,
 * and the predicate must be either {@code RDF.TYPE} or one of the additional type predicates.
 *
 * @param quads RDD of quads to filter
 * @param typePredicateURIs additional type predicate URIs accepted alongside {@code RDF.TYPE}
 * @return RDD of quads that specify type of a resource
 */
public static JavaRDD<Quad> filterTypeQuads(JavaRDD<Quad> quads, List<String> typePredicateURIs) {
    String rdfTypeUri = RDF.TYPE.toString();
    return quads.filter(quad -> {
        boolean predicateAndObjectAreUris = quad.getPredicate().isURI() && quad.getObject().isURI();
        if (predicateAndObjectAreUris) {
            String predicateUri = quad.getPredicate().getURI();
            return predicateUri.equals(rdfTypeUri) || typePredicateURIs.contains(predicateUri);
        }
        return false;
    });
}
示例5: main
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Demonstrates {@code JavaRDD.filter} on a small in-memory RDD: keeps the even numbers out of
 * 1..5, prints each of them, and then unpersists both RDDs.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);
    SparkConf conf = new SparkConf().setMaster("local").setAppName("ActionExamples").set("spark.hadoop.validateOutputSpecs", "false");
    // try-with-resources closes the context even when a job throws
    try (JavaSparkContext sparkContext = new JavaSparkContext(conf)) {
        JavaRDD<Integer> rdd = sparkContext.parallelize(Arrays.asList(1, 2, 3, 4, 5), 3).cache();
        JavaRDD<Integer> evenRDD = rdd.filter(new org.apache.spark.api.java.function.Function<Integer, Boolean>() {
            @Override
            public Boolean call(Integer v1) throws Exception {
                return v1 % 2 == 0;
            }
        });
        evenRDD.persist(StorageLevel.MEMORY_AND_DISK());
        evenRDD.foreach(new VoidFunction<Integer>() {
            @Override
            public void call(Integer t) throws Exception {
                System.out.println("The value of RDD are :"+t);
            }
        });
        // unpersist both RDDs once the action has run
        evenRDD.unpersist();
        rdd.unpersist();
    }
}
示例6: parseQuads
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Reads the RDF input at {@code path} into an RDD of quads.
 *
 * <p>Line-based input is read through {@code NQuadsInputFormat}, with the configured batch size
 * passed as {@code NLineInputFormat.LINES_PER_MAP}. Any other input is read through
 * {@code TriplesOrQuadsInputFormat}. Whether the input is line based comes from the config, or
 * is guessed from the path when the config leaves it unset. Bad tuples are ignored unless the
 * configured error handling is {@code ParseErrorHandling.Throw}. The result is repartitioned
 * when a partition count is configured. When accepted languages are configured, literals
 * tagged with any other language are dropped; non-literal objects and literals with a null or
 * empty language are always kept.
 *
 * @param path location of the RDF input
 * @return RDD of parsed quads
 */
@Override
public JavaRDD<Quad> parseQuads(String path) {
    Configuration conf = new Configuration();
    Integer batchSize = config.getBatchSize();
    conf.set(NLineInputFormat.LINES_PER_MAP, batchSize.toString());

    boolean ignoreBadTuples = config.getErrorHandling() != ParseErrorHandling.Throw;
    conf.set(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, String.valueOf(ignoreBadTuples));

    Boolean configuredLineBased = config.getLineBasedFormat();
    Boolean lineBased = (configuredLineBased != null) ? configuredLineBased : guessIsLineBasedFormat(path);

    Integer partitionCount = config.getRepartition();
    JavaRDD<Quad> parsed;
    if (lineBased) {
        log.info("Parsing RDF in parallel with batch size: {}", batchSize);
        parsed = sc.newAPIHadoopFile(path,
                NQuadsInputFormat.class,
                LongWritable.class, // position
                QuadWritable.class, // value
                conf)
            .values()
            .map(QuadWritable::get);
    } else {
        // let Jena guess the format, load whole files
        log.info("Input format is not line based, parsing RDF by Master node only.");
        parsed = sc.newAPIHadoopFile(path,
                TriplesOrQuadsInputFormat.class,
                LongWritable.class, // position
                QuadWritable.class, // value
                conf)
            .values()
            .map(QuadWritable::get);
        if (partitionCount == null) {
            log.warn("Reading non-line based formats by master node only, consider setting --parsing.repartition to redistribute work to other nodes.");
        }
    }

    if (partitionCount != null) {
        log.info("Distributing workload, repartitioning into {} partitions", partitionCount);
        parsed = parsed.repartition(partitionCount);
    }

    final List<String> acceptedLanguages = config.getAcceptedLanguages();
    if (acceptedLanguages.isEmpty()) {
        // every language is accepted, nothing to filter
        return parsed;
    }
    // drop literals tagged with a language that is not accepted
    return parsed.filter(quad -> {
        if (!quad.getObject().isLiteral()) {
            return true;
        }
        String language = quad.getObject().getLiteralLanguage();
        return language == null || language.isEmpty() || acceptedLanguages.contains(language);
    });
}
示例7: checkErrorInstance
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Asserts that exactly one instance in {@code result} carries the parsing-error type and
 * returns the RDD without that instance.
 *
 * @param result RDD of instances expected to contain a single error instance
 * @return the instances of {@code result} that do not carry the error type
 */
private JavaRDD<Instance> checkErrorInstance(JavaRDD<Instance> result) {
    int errorType = uriIndex.getIndex(InstanceAggregator.IRI_TYPE_ERROR);
    long errorInstanceCount = result
        .filter(instance -> instance.getTypes().contains(errorType))
        .count();
    assertTrue("Parsing error instance is present", errorInstanceCount == 1L);
    // hand back everything except the error instance
    JavaRDD<Instance> withoutError = result.filter(instance -> !instance.getTypes().contains(errorType));
    return withoutError;
}
示例8: filterQuadsByAllowedSubjects
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Keeps only quads whose subject is a URI contained in the given in-memory set. Quads with a
 * non-URI subject are always dropped.
 *
 * @param quads RDD of quads to filter
 * @param subjectURIs set of requested subject URIs to keep
 * @return filtered RDD with only those quads whose subject is in subjectURIs
 */
public static JavaRDD<Quad> filterQuadsByAllowedSubjects(JavaRDD<Quad> quads, Set<String> subjectURIs) {
    return quads.filter(quad -> {
        if (!quad.getSubject().isURI()) {
            return false;
        }
        return subjectURIs.contains(quad.getSubject().getURI());
    });
}
示例9: filterQuadsByObjects
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Keeps only quads whose object is a URI contained in the given in-memory set of object URIs.
 * Quads with a non-URI object are always dropped.
 *
 * @param quads RDD of quads to filter
 * @param objectURIs set of requested object URIs to filter on
 * @return filtered RDD with only those quads whose object is in objectURIs
 */
public static JavaRDD<Quad> filterQuadsByObjects(JavaRDD<Quad> quads, Set<String> objectURIs) {
    return quads.filter(quad -> {
        boolean requested = false;
        if (quad.getObject().isURI()) {
            requested = objectURIs.contains(quad.getObject().getURI());
        }
        return requested;
    });
}