当前位置: 首页>>代码示例>>Java>>正文


Java JavaRDD.filter方法代码示例

本文整理汇总了Java中org.apache.spark.api.java.JavaRDD.filter方法的典型用法代码示例。如果您正苦于以下问题:Java JavaRDD.filter方法的具体用法?Java JavaRDD.filter怎么用?Java JavaRDD.filter使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在org.apache.spark.api.java.JavaRDD的用法示例。


在下文中一共展示了JavaRDD.filter方法的9个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: splitNewDataToTrainTest

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Splits new data purely by timestamp. The cut point is chosen so that
 * approximately the most recent {@link #getTestFraction()} of the input
 * (by time) becomes test data, and the earlier remainder becomes new
 * training data.
 *
 * @param newData lines of new input, each carrying a parseable timestamp
 * @return pair of (training data, test data) RDDs
 */
@Override
protected Pair<JavaRDD<String>,JavaRDD<String>> splitNewDataToTrainTest(JavaRDD<String> newData) {
  // Rough approximation; assumes timestamps are fairly evenly distributed
  StatCounter timeStats =
      newData.mapToDouble(line -> MLFunctions.TO_TIMESTAMP_FN.call(line).doubleValue()).stats();

  long earliestTime = (long) timeStats.min();
  long latestTime = (long) timeStats.max();
  log.info("New data timestamp range: {} - {}", earliestTime, latestTime);

  // Boundary sits testFraction of the way back from the latest timestamp
  long boundaryTime = (long) (latestTime - getTestFraction() * (latestTime - earliestTime));
  log.info("Splitting at timestamp {}", boundaryTime);

  JavaRDD<String> trainingPortion =
      newData.filter(line -> MLFunctions.TO_TIMESTAMP_FN.call(line) < boundaryTime);
  JavaRDD<String> testPortion =
      newData.filter(line -> MLFunctions.TO_TIMESTAMP_FN.call(line) >= boundaryTime);

  return new Pair<>(trainingPortion, testPortion);
}
 
开发者ID:oncewang,项目名称:oryx2,代码行数:24,代码来源:ALSUpdate.java

示例2: filter

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * filter RDD of {@link Instance}s based on the specified config
 *
 * @param instances RDD of instances to filter
 * @param typeIndex index mapping type URIs to integers
 * @return filtered RDD of instances
 */
public JavaRDD<Instance> filter(JavaRDD<Instance> instances, IndexMap<String> typeIndex) {
    // no type restriction configured -> pass everything through untouched
    if (config.getTypes().isEmpty()) {
        return instances;
    }

    // translate the accepted type URIs into their integer indexes
    Set<Integer> acceptedTypeIndexes = config.getTypes().stream()
            .map(typeURI -> typeIndex.getIndex(typeURI))
            .collect(Collectors.toSet());

    // keep only instances that carry at least one accepted type
    JavaRDD<Instance> result = instances.filter(
            instance -> !Collections.disjoint(instance.getTypes(), acceptedTypeIndexes));

    if (config.isIgnoreOtherTypes()) {
        // strip all non-accepted types from each surviving instance
        result = result.map(instance -> {
            Set<Integer> keptTypes =
                    Sets.intersection(instance.getTypes(), acceptedTypeIndexes).immutableCopy();
            instance.getTypes().clear();
            instance.getTypes().addAll(keptTypes);
            return instance;
        });
    }

    return result;
}
 
开发者ID:Merck,项目名称:rdf2x,代码行数:31,代码来源:InstanceFilter.java

示例3: filterQuadsByForbiddenSubjects

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Get quads with specified subjects filtered out, computed by querying an in-memory set of subjects
 *
 * @param quads            RDD of quads to filter
 * @param subjectBlacklist set of requested subject URIs to be filtered out
 * @return filtered RDD with only those quads whose subject is NOT in subjectBlacklist
 */
public static JavaRDD<Quad> filterQuadsByForbiddenSubjects(JavaRDD<Quad> quads, Set<String> subjectBlacklist) {
    // empty blacklist -> nothing to remove
    if (subjectBlacklist.isEmpty()) {
        return quads;
    }
    // drop a quad only when its subject is a URI AND that URI is blacklisted
    return quads.filter(quad -> {
        boolean isBlacklisted = quad.getSubject().isURI()
                && subjectBlacklist.contains(quad.getSubject().getURI());
        return !isBlacklisted;
    });
}
 
开发者ID:Merck,项目名称:rdf2x,代码行数:16,代码来源:QuadUtils.java

示例4: filterTypeQuads

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Get quads that specify type of a resource
 *
 * @param quads             RDD of quads to filter
 * @param typePredicateURIs additional type predicates to use together with rdf:type
 * @return RDD of quads that specify type of a resource
 */
public static JavaRDD<Quad> filterTypeQuads(JavaRDD<Quad> quads, List<String> typePredicateURIs) {
    String rdfTypeURI = RDF.TYPE.toString();

    return quads.filter(quad -> {
        // a type statement must have a URI predicate and a URI object
        boolean hasURIPredicateAndObject = quad.getPredicate().isURI() && quad.getObject().isURI();
        if (hasURIPredicateAndObject) {
            String predicateURI = quad.getPredicate().getURI();
            // accept rdf:type plus any user-supplied type predicates
            return predicateURI.equals(rdfTypeURI) || typePredicateURIs.contains(predicateURI);
        }
        return false;
    });
}
 
开发者ID:Merck,项目名称:rdf2x,代码行数:19,代码来源:QuadUtils.java

示例5: main

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Demonstrates RDD persistence: builds a small integer RDD, filters it down
 * to even numbers, caches the filtered RDD at MEMORY_AND_DISK level, prints
 * each element, and finally unpersists both RDDs to release cached data.
 *
 * @param args command-line arguments (unused)
 */
	public static void main(String[] args) {
		// Quiet Spark's console logging so the example output stays readable
		Logger rootLogger = LogManager.getRootLogger();
		rootLogger.setLevel(Level.WARN);

		SparkConf conf = new SparkConf()
				.setMaster("local")
				.setAppName("ActionExamples")
				.set("spark.hadoop.validateOutputSpecs", "false");
		JavaSparkContext sparkContext = new JavaSparkContext(conf);

		// Source RDD of 1..5 across 3 partitions, cached because it is reused
		JavaRDD<Integer> rdd = sparkContext.parallelize(Arrays.asList(1, 2, 3, 4, 5), 3).cache();

		// Keep only even values; the comparison itself is already a Boolean,
		// so no redundant '? true : false' ternary is needed
		JavaRDD<Integer> evenRDD = rdd.filter(
				new org.apache.spark.api.java.function.Function<Integer, Boolean>() {
					@Override
					public Boolean call(Integer v1) throws Exception {
						return (v1 % 2) == 0;
					}
				});

		// Persist the filtered RDD in memory, spilling to disk if needed
		evenRDD.persist(StorageLevel.MEMORY_AND_DISK());
		evenRDD.foreach(new VoidFunction<Integer>() {
			@Override
			public void call(Integer t) throws Exception {
				System.out.println("The value of RDD are :" + t);
			}
		});

		// Release cached data once we are done with it
		evenRDD.unpersist();
		rdd.unpersist();
	}
 
开发者ID:PacktPublishing,项目名称:Apache-Spark-2x-for-Java-Developers,代码行数:70,代码来源:PersistExample.java

示例6: parseQuads

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
@Override
public JavaRDD<Quad> parseQuads(String path) {

    // Hadoop configuration handed to the input formats created below
    Configuration conf = new Configuration();

    // How many input lines each map task should process
    Integer batchSize = config.getBatchSize();
    conf.set(NLineInputFormat.LINES_PER_MAP, batchSize.toString());

    // Either fail fast on malformed tuples or silently skip them, per config
    if (config.getErrorHandling() == ParseErrorHandling.Throw) {
        conf.set(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, "false");
    } else {
        conf.set(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, "true");
    }

    // Line-based formats (e.g. N-Quads) can be parsed in parallel; if the
    // config does not say, guess from the path/extension
    Boolean isLineBased = config.getLineBasedFormat();
    if (isLineBased == null) {
        isLineBased = guessIsLineBasedFormat(path);
    }
    JavaRDD<Quad> quads;
    Integer partitions = config.getRepartition();
    if (isLineBased) {
        log.info("Parsing RDF in parallel with batch size: {}", batchSize);
        quads = sc.newAPIHadoopFile(path,
                NQuadsInputFormat.class,
                LongWritable.class, // position
                QuadWritable.class, // value
                conf).values().map(QuadWritable::get);
    } else {
        // let Jena guess the format, load whole files
        log.info("Input format is not line based, parsing RDF by Master node only.");
        quads = sc.newAPIHadoopFile(path,
                TriplesOrQuadsInputFormat.class,
                LongWritable.class, // position
                QuadWritable.class, // value
                conf).values().map(QuadWritable::get);

        if (partitions == null) {
            log.warn("Reading non-line based formats by master node only, consider setting --parsing.repartition to redistribute work to other nodes.");
        }
    }
    // Optionally spread the parsed quads over more partitions
    if (partitions != null) {
        log.info("Distributing workload, repartitioning into {} partitions", partitions);
        quads = quads.repartition(partitions);
    }


    final List<String> acceptedLanguages = config.getAcceptedLanguages();
    // if only some languages are accepted
    if (!acceptedLanguages.isEmpty()) {
        // filter out literals of unsupported languages
        // (non-literals and literals with no language tag always pass)
        quads = quads.filter(quad ->
                !quad.getObject().isLiteral() ||
                        quad.getObject().getLiteralLanguage() == null ||
                        quad.getObject().getLiteralLanguage().isEmpty() ||
                        acceptedLanguages.contains(quad.getObject().getLiteralLanguage())
        );
    }

    return quads;
}
 
开发者ID:Merck,项目名称:rdf2x,代码行数:61,代码来源:ElephasQuadParser.java

示例7: checkErrorInstance

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Asserts that exactly one parsing-error instance is present in the result,
 * then returns the result with that error instance removed.
 */
private JavaRDD<Instance> checkErrorInstance(JavaRDD<Instance> result) {
    final int errorTypeIndex = uriIndex.getIndex(InstanceAggregator.IRI_TYPE_ERROR);
    long errorCount = result.filter(instance -> instance.getTypes().contains(errorTypeIndex)).count();
    assertTrue("Parsing error instance is present", errorCount == 1L);
    // drop the error instance before handing the RDD back to the caller
    return result.filter(instance -> !instance.getTypes().contains(errorTypeIndex));
}
 
开发者ID:Merck,项目名称:rdf2x,代码行数:7,代码来源:InstanceAggregatorTest.java

示例8: filterQuadsByAllowedSubjects

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Get quads with specified subjects only, computed by querying an in-memory set of subjects
 *
 * @param quads       RDD of quads to filter
 * @param subjectURIs set of requested subject URIs to keep
 * @return filtered RDD with only those quads whose subject is in subjectURIs
 */
public static JavaRDD<Quad> filterQuadsByAllowedSubjects(JavaRDD<Quad> quads, Set<String> subjectURIs) {
    return quads.filter(quad -> {
        // non-URI subjects can never match the whitelist
        if (!quad.getSubject().isURI()) {
            return false;
        }
        return subjectURIs.contains(quad.getSubject().getURI());
    });
}
 
开发者ID:Merck,项目名称:rdf2x,代码行数:13,代码来源:QuadUtils.java

示例9: filterQuadsByObjects

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Get quads with specified object URIs, computed by querying an in-memory set of objects
 *
 * @param quads      RDD of quads to filter
 * @param objectURIs set of requested object URIs to filter on
 * @return filtered RDD with only those quads whose object is in objectURIs
 */
public static JavaRDD<Quad> filterQuadsByObjects(JavaRDD<Quad> quads, Set<String> objectURIs) {
    // keep a quad only when its object is a URI present in the allowed set
    return quads.filter(quad -> quad.getObject().isURI() &&
            objectURIs.contains(quad.getObject().getURI())
    );
}
 
开发者ID:Merck,项目名称:rdf2x,代码行数:13,代码来源:QuadUtils.java


注:本文中的org.apache.spark.api.java.JavaRDD.filter方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。