

Python SparkContext.parallelize Method Code Examples

This article collects typical usage examples of the pyspark.SparkContext.parallelize method in Python. If you are unsure what SparkContext.parallelize does, how to call it, or what it looks like in real code, the curated examples below should help. You can also browse further usage examples for the class it belongs to, pyspark.SparkContext.


The following presents 15 code examples of SparkContext.parallelize, ordered by popularity.
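Before diving into the examples, here is a minimal, self-contained sketch of how SparkContext.parallelize is typically used. The master string "local[2]", the application name, and the sample data are illustrative choices only, not taken from any example below:

from pyspark import SparkContext

# "local[2]" runs Spark locally with two worker threads; the app name is arbitrary.
sc = SparkContext("local[2]", "ParallelizeDemo")

# parallelize() distributes a local Python collection into an RDD,
# optionally split across a requested number of partitions.
rdd = sc.parallelize([1, 2, 3, 4, 5], numSlices=2)

# Transformations and actions then operate on the distributed data.
squares = rdd.map(lambda x: x * x).collect()
print(squares)  # [1, 4, 9, 16, 25]

sc.stop()

Each example below follows the same basic pattern: obtain a SparkContext, call parallelize on local data, and then apply RDD transformations, actions, accumulators, or MLlib training to the result.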

Example 1: SearchTiles_and_Factorize

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
def SearchTiles_and_Factorize(n):
    global globalmergedtiles
    global globalcoordinates
    global factors_accum
    global spcon

    spcon = SparkContext("local[4]", "Spark_TileSearch_Optimized")

    if persisted_tiles == True:
        tileintervalsf = open("/home/shrinivaasanka/Krishna_iResearch_OpenSource/GitHub/asfer-github-code/cpp-src/miscellaneous/DiscreteHyperbolicFactorizationUpperbound_TileSearch_Optimized.tileintervals", "r")
        tileintervalslist = tileintervalsf.read().split("\n")
        #print "tileintervalslist=",tileintervalslist
        tileintervalslist_accum = spcon.accumulator(tileintervalslist, VectorAccumulatorParam())
        paralleltileintervals = spcon.parallelize(tileintervalslist)
        paralleltileintervals.foreach(tilesearch)
    else:
        factorsfile = open("DiscreteHyperbolicFactorizationUpperbound_TileSearch_Optimized.factors", "w")
        hardy_ramanujan_ray_shooting_queries(n)
        hardy_ramanujan_prime_number_theorem_ray_shooting_queries(n)
        baker_harman_pintz_ray_shooting_queries(n)
        cramer_ray_shooting_queries(n)
        zhang_ray_shooting_queries(n)
        factors_accum = spcon.accumulator(factors_of_n, FactorsAccumulatorParam())
        #spcon.parallelize(xrange(1,n)).foreach(tilesearch_nonpersistent)
        spcon.parallelize(spcon.range(1, n).collect()).foreach(tilesearch_nonpersistent)
        print "factors_accum.value = ", factors_accum.value
        factors = []
        factordict = {}
        for f in factors_accum.value:
            factors += f
        factordict[n] = factors
        json.dump(factordict, factorsfile)
        return factors
Author: shrinivaasanka, Project: asfer-github-code, Lines: 36, Source: DiscreteHyperbolicFactorizationUpperbound_TileSearch_Optimized.py

Example 2: TestWordCounter

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
class TestWordCounter(unittest.TestCase):

    def setUp(self):
        conf = SparkConf().setAppName("appTest").setMaster("local[*]")
        self.sc = SparkContext(conf=conf)
        self.counter = WordCounter()

    def tearDown(self):
        self.sc.stop()

    def test_when_exist_one_movie_and_counter(self):
        movieList = ["1993::Toy Story Toy (1995)::Animation|Children's|Comedy",
                     "1993::ToyA StoryA ToyA (1995)::Animation|Children's|Comedy"]
        result = (('ToyA', ['::ToyA StoryA ToyA (1995)::']),
                  ('Toy', ['::Toy Story Toy (1995)::']))
        movies = self.sc.parallelize(movieList)
        self.assertEqual(self.counter.getMaxValues(movies), result)

    def test_when_exist_one_movie_and_counter_moreMovies(self):
        movieList = ["1993::Toy Story Toy (1995)::Animation|Children's|Comedy",
                     "1993::ToyA StoryB ToyA (1995)::Animation|Children's|Comedy",
                     "1993::ToyA StoryA ToyA (1995)::Animation|Children's|Comedy"]
        result = ('ToyA', ['::ToyA StoryB ToyA (1995)::', '::ToyA StoryA ToyA (1995)::'])
        movies = self.sc.parallelize(movieList)
        self.assertEqual(self.counter.getMaxValues(movies), result)
Author: cpedrero, Project: BigDataGroup, Lines: 30, Source: word_counter_tests.py

Example 3: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
def main():
    sc = SparkContext(appName="MyApp")
    sc.setLogLevel('ERROR')

    # Parse data
    train_labels, train_data = load_data('train.csv')
    dummy_labels, test_data = load_data('test.csv', use_labels=False)

    # Truncate the last 2 features of each data point
    train_data = [np.delete(dp, [np.size(dp) - 2, np.size(dp) - 1]) for dp in train_data]
    test_data = [np.delete(dp, [np.size(dp) - 2, np.size(dp) - 1]) for dp in test_data]

    # Map each data point's label to its features
    train_set = reformatData(train_data, train_labels)
    test_set = reformatData(test_data, dummy_labels)

    # Parallelize the data
    parallelized_train_set = sc.parallelize(train_set)
    parallelized_test_set = sc.parallelize(test_set)

    # Split the data
    trainSet, validationSet = parallelized_train_set.randomSplit([0.01, 0.99], seed=42)

    # Train the model
    randomForestModel = RandomForest.trainClassifier(trainSet, numClasses=4, impurity='gini', categoricalFeaturesInfo={},
                                                     numTrees=750, seed=42, maxDepth=30, maxBins=32)

    # Test the model
    testRandomForest(randomForestModel, parallelized_test_set)
Author: adepalatis, Project: 379K_Final_Project, Lines: 36, Source: RandomForest.py

Example 4: longest_common_substring

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
def longest_common_substring(strands):
    # create the Spark context
    conf = SparkConf().setAppName("longest_common_substring")
    sc = SparkContext(conf=conf)

    # create an accumulator for key-value pairs, where each key is a substring,
    # and each value is the set of strings where the substring can be found
    class ArrayAccumulatorParam(AccumulatorParam):
        def zero(self, initialValue):
            return initialValue

        def addInPlace(self, v1, v2):
            if type(v2) is list:
                v1.extend(v2)
            elif type(v2) is tuple:
                v1.append(v2)

            return v1

    acc = sc.accumulator([], ArrayAccumulatorParam())

    def generate_substrings(data_element):
        k, v = data_element
        i = 0
        while i < len(v):
            j = i + 1
            while j <= len(v):  # include substrings that end at the last character
                acc.add((v[i:j], k))
                j += 1
            i += 1

    sc.parallelize([(k, v) for k, v in strands.iteritems()]).foreach(generate_substrings)

    all_substrings = sc.parallelize(acc.value)
    return all_substrings.groupByKey().filter(lambda x: set(list(x[1])) == set(strands.keys())).takeOrdered(1, key=lambda x: -len(x[0]))[0][0]
Author: vardaofthevalier, Project: Miscellaneous, Lines: 37, Source: shared_motif.py

Example 5: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
def main():
    sc = SparkContext(appName="MyApp")
    sc.setLogLevel('ERROR')

    # Parse data
    train_labels, train_data = load_data('train.csv')
    dummy_labels, test_data = load_data('test.csv', use_labels=False)

    # Map each data point's label to its features
    train_set = reformatData(train_data, train_labels)
    test_set = reformatData(test_data, dummy_labels)

    # Parallelize the data
    parallelized_train_set = sc.parallelize(train_set)
    parallelized_test_set = sc.parallelize(test_set)

    # Split the data
    trainSet, validationSet = parallelized_train_set.randomSplit([1.0, 0.0], seed=42)

    # Train the models
    decisionTreeModel = DecisionTree.trainClassifier(trainSet, numClasses=5, categoricalFeaturesInfo={},
                                         impurity='gini', maxBins=55, maxDepth=30, minInstancesPerNode=2)

    # Test the model
    testDecisionTree(decisionTreeModel, parallelized_test_set)
Author: adepalatis, Project: 379K_Final_Project, Lines: 27, Source: DecisionTree.py

Example 6: LookAlikeTest

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
class LookAlikeTest(unittest.TestCase):
    def setUp(self):
        conf = SparkConf().setAppName("Tests").setMaster("local")
        self.sc = SparkContext(conf=conf)

    def tearDown(self):
        self.sc.stop()

    def test_ratings_calculation(self):
        data = [("u1", 123), ("u1", 123), ("u1", 132),
                ("u2", 123), ("u2", 111), ("u2", 111), ("u2", 111), ("u2", 111),
                ("u3", 123), ("u3", 123), ("u3", 125), ("u3", 125), ("u3", 111)]
        input_data = self.sc.parallelize(data)
        ratings = calculate_ratings(input_data).collectAsMap()
        self.assertEqual(ratings["u1"][123], 1.0)
        self.assertEqual(ratings["u1"][132], 0.5)
        self.assertEqual(ratings["u2"][111], 1.0)
        self.assertEqual(ratings["u2"][123], 0.25)
        self.assertEqual(ratings["u3"][123], 1.0)
        self.assertEqual(ratings["u3"][125], 1.0)
        self.assertEqual(ratings["u3"][111], 0.5)

    def test_correlations_calculation(self):
        ratings = [("u1", {1: 0.5, 2: 1.0, 3: 0.1}),
                   ("u2", {1: 0.25, 3: 1.0}),
                   ("u3", {2: 0.25, 3: 1.0})]
        ratings_data = self.sc.parallelize(ratings)
        correlations = calculate_correlations(ratings_data, 3).collectAsMap()
        self.assertEqual(round(correlations[1], 2), -1.0)
        self.assertEqual(round(correlations[2], 2), -1.0)
Author: AlexanderTolmachev, Project: look-alike, Lines: 32, Source: tests.py

Example 7: SparkBroadcastAccumulator

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
def SparkBroadcastAccumulator(n): 
	global broadcast_var
	global accumulator_var
	spcon = SparkContext("local[2]","SparkBroadcastAccumulator")
	broadcast_var=spcon.broadcast("broadcast_message")
	accumulator_var=spcon.accumulator(0)
	spcon.parallelize(xrange(1,n)).foreach(lambda x: broadcast_accumulator_receiver(accumulator_var.add(x)))
Author: shrinivaasanka, Project: Grafit, Lines: 9, Source: Spark_Broadcast_Accumulator.py

Example 8: parallelDisassembler

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
def parallelDisassembler(matrix,groups):


	def splitGroup(g):
		b,n1,n2 = isAFalse(g,matrix)
		if b:
			g1 = [n1]
			g2 = [n2]

			for nid in g:
				if nid != n1 and nid != n2:
					sim1 = 0.0
					sim2 = 0.0
					for tmp in g1:
						sim1 += jcSig(matrix[tmp],matrix[nid])
					for tmp in g2:
						sim2 += jcSig(matrix[tmp],matrix[nid])


					if sim1 / len(g1) > sim2 / len(g2):
						g1 += [nid]
					else:
						g2 += [nid]

			return g1,g2

		return ([],g)

	tmp = len(groups)
	sc = SparkContext(appName="Splitter")
	parrGroup = sc.parallelize(groups)
	groups = parrGroup.map(splitGroup).collect()
	tmpgrp = []
	for g1,g2 in groups:
		tmpgrp += [g1]
		tmpgrp += [g2]

	groups = tmpgrp

	while len(groups) != tmp :

		tmp = len(groups)
		#print(tmp)

		parrGroup = sc.parallelize(groups)
		groups = parrGroup.map(splitGroup).collect()
		tmpgrp = []
		for g1,g2 in groups:
			if g1 != []:
				tmpgrp += [g1]
			tmpgrp += [g2]

		groups = tmpgrp

	sc.stop()

	return groups
Author: luca-zamboni, Project: Big-Data, Lines: 59, Source: aggregator.py

Example 9: _kmeans_spark

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
def _kmeans_spark(X, n_clusters, max_iter=300, worker_nums=10, init='k-means++', random_state=None, tol=1e-4):
    from pyspark import SparkContext, SparkConf

    conf = SparkConf().setAppName('K-Means_Spark').setMaster('local[%d]'%worker_nums)
    sc = SparkContext(conf=conf)
    data = sc.parallelize(X)
    data.cache()

    random_state = check_random_state(random_state)

    best_labels, best_inertia, best_centers = None, None, None

    x_squared_norms = row_norms(X, squared=True)
    #  x_squared_norms = data.map(lambda x: (x*x).sum(axis=0)).collect()
    #  x_squared_norms = np.array(x_squared_norms, dtype='float64')

    centers = _init_centroids(X, n_clusters, init, random_state, x_squared_norms=x_squared_norms)

    bs = X.shape[0]/worker_nums
    data_temp = []
    for i in range(worker_nums-1):
        data_temp.append(X[i*bs:(i+1)*bs])
    data_temp.append(X[(worker_nums-1)*bs:])
    data_temp = np.array(data_temp, dtype='float64')
    data_temp = sc.parallelize(data_temp)
    data_temp.cache()


    for i in range(max_iter):
        centers_old = centers.copy()

        all_distances = data_temp.map(lambda x: euclidean_distances(centers, x, squared=True)).collect()
        temp_all_distances = all_distances[0]
        for w in range(1, worker_nums):
            temp_all_distances = np.hstack((temp_all_distances, all_distances[w]))
        all_distances = temp_all_distances

        #  all_distances = data.map(lambda x: euclidean_distances(centers, x, squared=True)).collect()
        #  # reshape, from (1, n_samples, k) to (k, n_samples)
        #  all_distances = np.asarray(all_distances, dtype="float64").T[0]

        # Assignment, also called E-step of EM
        labels, inertia = _labels_inertia(X, x_squared_norms, centers, all_distances=all_distances)
        # re-computation of the centroids, also called M-step of EM
        centers = _centers(X, labels, n_clusters)

        if best_inertia is None or inertia < best_inertia:
            best_labels  = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        shift = squared_norm(centers_old - centers)
        if shift <= tol:
            break

    return best_centers, best_labels, best_inertia
Author: cyh24, Project: PySparkML, Lines: 58, Source: k_means_.py

Example 10: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
def main(num_factors, num_workers, num_iterations, beta_value, lambda_value, Wm_value, \
        V_filename, output_W_filename, output_H_filename):
    # Conf
    conf = SparkConf().setAppName("Spark SGD MF")
    sc = SparkContext(conf=conf)
    
    user_movie_ratings = sc.textFile(V_filename).map(line_to_movie_user_ratings)
    user_movie_ratings.persist()

    #global user_nonzero, movie_nonzero
    #user_nonzero = user_movie_ratings.keyBy(first_element).countByKey()
    #movie_nonzero = user_movie_ratings.keyBy(second_element).countByKey()

    num_users = int(user_movie_ratings.map(first_element).reduce(max))
    num_movies = int(user_movie_ratings.map(second_element).reduce(max))

    global updates_total
    updates_total = 0
   
    # Begin iterations
    iter = 0
    global seed
    while iter < num_iterations:
        # Initialize W and H
        if iter == 0:
            W = sc.parallelize(range(num_users+1)).map(key_to_entry_rand).persist()#(user_id,rand(num_factors))
            H = sc.parallelize(range(num_movies+1)).map(key_to_entry_rand).persist()#(movie_id,rand(num_factors))

        # Set random seed
        seed = random.randrange(MAXSEED)

        # Partition parameters
        W_blocks = W.keyBy(lambda W_entry: item_to_block(W_entry[0]))#key:worker_id,value:(user_id,rand(num_factors))
        H_blocks = H.keyBy(lambda H_entry: item_to_block(H_entry[0]))#key:worker_id,value:(movie_id,rand(num_factors))

        # Filter diagonal blocks
        V_diagonal = user_movie_ratings.filter(filter_diagonal).persist()#(user_id,movie_id,rating) where worker_id(user_id) == worker_id(movie_id)
        V_blocks = V_diagonal.keyBy(lambda t : item_to_block(t[0]))#key:worker_id,value:(user_id,movie_id,rating) where user_id == movie_id
        updates_curr = V_diagonal.count()
        V_diagonal.unpersist()    
        V_group = V_blocks.groupWith(W_blocks, H_blocks).coalesce(num_workers)#key:worker_id,value:seq[V],seq[W],seq[H]

        # Perform SGD
        updatedWH = V_group.map(SGD_update).persist()
        W = updatedWH.flatMap(first_element).persist()
        H = updatedWH.flatMap(second_element).persist()
        updates_total += updates_curr
        iter += 1
   
    W_result = numpy.vstack(W.sortByKey().map(second_element).collect()[1:])
    H_result = numpy.vstack(H.sortByKey().map(second_element).collect()[1:])
    # Save W and H
    savetxt(output_W_filename, W_result, delimiter=',')
    savetxt(output_H_filename, H_result, delimiter=',')
    sc.stop()
Author: LiuShifeng, Project: Matrix_Factor_Python, Lines: 57, Source: dsgd_mf.py

Example 11: LinearRegressionModel

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
def LinearRegressionModel(dataPath, label, normalize, character, master, ispca):

    pca_n = 2
    sc = SparkContext(master)
    data = sc.textFile(dataPath)

# not RDD data 

    ndata = data.map(lambda line: line.split(character)).map(lambda part: (map(lambda x: float(x) ,part[0: len(part)])))

    if label == 0:
        ndata = ndata.map(lambda line: line[::-1])

    if normalize == 1:
        test_data = norm(ndata.collect())    
        norm_data = sc.parallelize(test_data)
        train_data = norm_data.map(lambda part: lbp(part[0], part[1]))   
     #raw_data = data.map(lambda line: line.split(character))


    else:
        test_data = ndata.map(lambda part: (part[len(part) - 1], part[0:len(part) - 1])).collect()
        train_data = ndata.map(lambda part: lbp(part[len(part) - 1], part[0: len(part) - 1]))
    
    
    if ispca == 1:
        pca = PCA(n_components = pca_n)
        pca_train = [test_data[i][1] for i in range(len(test_data))]
        pca_data = pca.fit(pca_train).transform(pca_train)

        test = []
        for i in range(len(pca_data)):
            test.append([test_data[i][0], pca_data[i]])

        train_data = sc.parallelize(test).map(lambda part: lbp(part[0], part[1]))
        test_data = test
            

    model_lr = lr.train(train_data)
    err_lr = 0.0
    size = len(train_data.collect())
   
    for i in range(size):
        err_lr = err_lr + abs(model_lr.predict(test_data[i][1]) - test_data[i][0])
           

    print "result:", err_lr/size

    String = "Linear Regression Result:\n"
    String = String + str(model_lr.weights) + '\n'
    String = String + "Error: " + str(err_lr / size) 
    
    sc.stop()

    return String
Author: Tomlong, Project: MLlib-UI, Lines: 57, Source: mlLinearRegression.py

Example 12: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
def main(image_files):
    sc = SparkContext( appName="Resize Images")
    sc.parallelize(image_files).map(resize_image_file).count()

    #read all the resized images into an array to save as a pickled object
    #out_dir = CUR_DIR + TEST_OR_TRAIN + '_' + str(IMAGE_SIZE)
    #save_images(out_dir)

    #read all the resized images into an array to save as a csv file
    out_dir = CUR_DIR + TEST_OR_TRAIN + '_' + str(IMAGE_SIZE)
    save_images_csv(out_dir)
Author: nathanieljblack, Project: W251_Project, Lines: 13, Source: resize_spark_mapper.py

Example 13: model

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
def model(classifier, ftrain, fvalid, fprediction):

    startTime = time.time()

    ctx = SparkContext(appName="model_on_Spark")
    sqlContext = SQLContext(ctx)
    logger = SparkLogger(ctx)
    logger.set_level('ERROR')

    # load and prepare training and validation data
    rawTrain, train = prepData(sqlContext, ctx, ftrain)
    rawValid, valid = prepData(sqlContext, ctx, fvalid)

    # is needed to join columns
    valid = indexData(valid)
    rawValid = indexData(rawValid)

    classifiers = {
        "RandomForestClassifier" : RFC
    }

    clf = classifiers[classifier]()

    labelIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")

    # train and predict
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, clf])
    model = pipeline.fit(train)

    predictions = model.transform(valid)

    # write to file:

    subsetPrediction = predictions.select("prediction", "index")
    subsetValidData = rawValid.select("dataset", "index")

    output = (subsetValidData
               .join(subsetPrediction, subsetPrediction.index == subsetValidData.index)
                    .drop("index")
                    .drop("index"))

    lines = output.map(toCSVLine)
    lines.saveAsTextFile('output')

    evaluator = MulticlassClassificationEvaluator(
       labelCol="label", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print "Test Error = %g" % (1.0 - accuracy)

    executionTime = time.time() - startTime
    row=classifier+','+str(executionTime)
    ctx.parallelize([row]).saveAsTextFile("timing")
Author: KiprasKancys, Project: DMWMAnalytics, Lines: 55, Source: pyspark_ml.py

Example 14: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
def main():
  HDFS_URI = "hdfs://hdfs.domain.cc/folder"
  sc = SparkContext()
  rdd = sc.parallelize([("a", 1), ("b", 2), ("c", 3)])
  rdd.saveAsNewAPIHadoopFile(HDFS_URI + "/01", "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat")
  rdd = sc.parallelize([("d", 4), ("e", 5), ("f", 6)])
  rdd.saveAsNewAPIHadoopFile(HDFS_URI + "/02", "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat")
  folder = TwoDHDFSMap(sc, HDFS_URI)
  print("hdfsURI test: ", folder.hdfsURI == HDFS_URI)
  print("folder[\"01\"][\"a\"] test: ", folder["01"]["a"] == 1)
  print("\"01\" in folder test: ", "01" in folder)
  print("\"02\" in folder test: ", "02" in folder)
  print("folder[\"02\"][\"d\"] test: ", folder["02"]["d"] == 4)
Author: ire7715, Project: TwoDHDFSMap, Lines: 15, Source: test.py

Example 15: TestCalculator

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
class TestCalculator(unittest.TestCase):

    def setUp(self):
        conf = SparkConf().setAppName("appTest").setMaster("local[*]")
        self.sc = SparkContext(conf=conf)
        self.setMovies = SetMovies()

    def tearDown(self):
        self.sc.stop()

    def test_when_calculate_set_word_most_repeater(self):
        entry = [('Toy', (1, ['::Toy Story Toy (1995)::'])),
                 ('ToyA', (3, ['::ToyA StoryA ToyA (1995)::'])),
                 ('Story', (1, ['::Toy Story Toy (1995)::'])),
                 ('StoryA', (3, ['::ToyA StoryA ToyA (1995)::']))]
        result = (('ToyA', ['::ToyA StoryA ToyA (1995)::']),
                  ('StoryA', ['::ToyA StoryA ToyA (1995)::']))
        funcReverseTuple = lambda value: (value[1][0], (value[0], value[1][1]))
        rdd = self.sc.parallelize(entry)
        self.assertEqual(self.setMovies.setWithMaxValues(rdd, funcReverseTuple), result)

    def test_when_calculate_set_word_most_repeater_one(self):
        entry = [('Toy', (1, ['::Toy Story Toy (1995)::'])),
                 ('ToyA', (3, ['::ToyA StoryA ToyA (1995)::'])),
                 ('Story', (1, ['::Toy Story Toy (1995)::'])),
                 ('StoryA', (1, ['::ToyA StoryA ToyA (1995)::']))]
        result = ('ToyA', ['::ToyA StoryA ToyA (1995)::'])
        funcReverseTuple = lambda value: (value[1][0], (value[0], value[1][1]))
        rdd = self.sc.parallelize(entry)
        self.assertEqual(self.setMovies.setWithMaxValues(rdd, funcReverseTuple), result)

    def test_when_calculate_maximum_year(self):
        entry = [('(1996)', 2),
                 ('(1998)', 2),
                 ('(1997)', 1)]
        result = ('(1996)', '(1998)')
        rdd = self.sc.parallelize(entry)
        funcReverseTuple = lambda value: (value[1], value[0])
        self.assertEqual(self.setMovies.setWithMaxValues(rdd, funcReverseTuple), result)

    def test_when_calculate_maximum_year_with_only_one(self):
        entry = [('(1996)', 2),
                 ('(1998)', 1),
                 ('(1997)', 1),
                 ('(1999)', 1)]
        result = ('(1996)')
        rdd = self.sc.parallelize(entry)
        funcReverseTuple = lambda value: (value[1], value[0])
        self.assertEqual(self.setMovies.setWithMaxValues(rdd, funcReverseTuple), result)
Author: cpedrero, Project: BigDataGroup, Lines: 51, Source: sets_calculator_tests.py


Note: The pyspark.SparkContext.parallelize examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by their respective authors, and copyright of the source code remains with the original authors. Please consult each project's License before distributing or using the code; do not republish without permission.