This page collects typical usage examples of the Python method pyspark.SparkContext.parallelize. If you are unsure what SparkContext.parallelize does or how to call it, the curated examples below should help. You can also refer to the usage of the class it belongs to, pyspark.SparkContext.
Below are 15 code examples of SparkContext.parallelize, ordered by popularity by default.
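Before the examples, here is a minimal sketch of what SparkContext.parallelize does: it turns a local Python collection into an RDD that Spark distributes across partitions. The app name, data, and partition count below are illustrative assumptions, not taken from any of the projects listed.

# Minimal sketch (assumed local master): distribute a Python list as an RDD
from pyspark import SparkContext

sc = SparkContext("local[2]", "ParallelizeSketch")   # hypothetical app name
rdd = sc.parallelize([1, 2, 3, 4, 5], numSlices=2)   # split the list into 2 partitions
print(rdd.map(lambda x: x * x).collect())            # [1, 4, 9, 16, 25]
sc.stop()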
Example 1: SearchTiles_and_Factorize
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
def SearchTiles_and_Factorize(n):
    global globalmergedtiles
    global globalcoordinates
    global factors_accum
    global spcon
    spcon = SparkContext("local[4]", "Spark_TileSearch_Optimized")
    if persisted_tiles == True:
        tileintervalsf = open("/home/shrinivaasanka/Krishna_iResearch_OpenSource/GitHub/asfer-github-code/cpp-src/miscellaneous/DiscreteHyperbolicFactorizationUpperbound_TileSearch_Optimized.tileintervals", "r")
        tileintervalslist = tileintervalsf.read().split("\n")
        #print "tileintervalslist=",tileintervalslist
        tileintervalslist_accum = spcon.accumulator(tileintervalslist, VectorAccumulatorParam())
        paralleltileintervals = spcon.parallelize(tileintervalslist)
        paralleltileintervals.foreach(tilesearch)
    else:
        factorsfile = open("DiscreteHyperbolicFactorizationUpperbound_TileSearch_Optimized.factors", "w")
        hardy_ramanujan_ray_shooting_queries(n)
        hardy_ramanujan_prime_number_theorem_ray_shooting_queries(n)
        baker_harman_pintz_ray_shooting_queries(n)
        cramer_ray_shooting_queries(n)
        zhang_ray_shooting_queries(n)
        factors_accum = spcon.accumulator(factors_of_n, FactorsAccumulatorParam())
        #spcon.parallelize(xrange(1,n)).foreach(tilesearch_nonpersistent)
        spcon.parallelize(spcon.range(1, n).collect()).foreach(tilesearch_nonpersistent)
        print "factors_accum.value = ", factors_accum.value
        factors = []
        factordict = {}
        for f in factors_accum.value:
            factors += f
        factordict[n] = factors
        json.dump(factordict, factorsfile)
        return factors
Developer ID: shrinivaasanka, Project: asfer-github-code, Lines of code: 36, Source file: DiscreteHyperbolicFactorizationUpperbound_TileSearch_Optimized.py
Example 2: TestWordCounter
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
class TestWordCounter(unittest.TestCase):

    def setUp(self):
        conf = SparkConf().setAppName("appTest").setMaster("local[*]")
        self.sc = SparkContext(conf=conf)
        self.counter = WordCounter()

    def tearDown(self):
        self.sc.stop()

    def test_when_exist_one_movie_and_counter(self):
        movieList = ["1993::Toy Story Toy (1995)::Animation|Children's|Comedy",
                     "1993::ToyA StoryA ToyA (1995)::Animation|Children's|Comedy"]
        result = (('ToyA', ['::ToyA StoryA ToyA (1995)::']),
                  ('Toy', ['::Toy Story Toy (1995)::']))
        movies = self.sc.parallelize(movieList)
        self.assertEqual(self.counter.getMaxValues(movies), result)

    def test_when_exist_one_movie_and_counter_moreMovies(self):
        movieList = ["1993::Toy Story Toy (1995)::Animation|Children's|Comedy",
                     "1993::ToyA StoryB ToyA (1995)::Animation|Children's|Comedy",
                     "1993::ToyA StoryA ToyA (1995)::Animation|Children's|Comedy"]
        result = ('ToyA', ['::ToyA StoryB ToyA (1995)::', '::ToyA StoryA ToyA (1995)::'])
        movies = self.sc.parallelize(movieList)
        self.assertEqual(self.counter.getMaxValues(movies), result)
Example 3: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
def main():
    sc = SparkContext(appName="MyApp")
    sc.setLogLevel('ERROR')

    # Parse data
    train_labels, train_data = load_data('train.csv')
    dummy_labels, test_data = load_data('test.csv', use_labels=False)

    # Truncate the last 2 features of each data point
    # (rebuild the lists so the truncation actually takes effect)
    train_data = [np.delete(dp, [np.size(dp) - 2, np.size(dp) - 1]) for dp in train_data]
    test_data = [np.delete(dp, [np.size(dp) - 2, np.size(dp) - 1]) for dp in test_data]

    # Map each data point's label to its features
    train_set = reformatData(train_data, train_labels)
    test_set = reformatData(test_data, dummy_labels)

    # Parallelize the data
    parallelized_train_set = sc.parallelize(train_set)
    parallelized_test_set = sc.parallelize(test_set)

    # Split the data
    trainSet, validationSet = parallelized_train_set.randomSplit([0.01, 0.99], seed=42)

    # Train the model
    randomForestModel = RandomForest.trainClassifier(trainSet, numClasses=4, impurity='gini',
                                                     categoricalFeaturesInfo={}, numTrees=750,
                                                     seed=42, maxDepth=30, maxBins=32)

    # Test the model
    testRandomForest(randomForestModel, parallelized_test_set)
Example 4: longest_common_substring
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
def longest_common_substring(strands):
    # create the Spark context
    conf = SparkConf().setAppName("longest_common_substring")
    sc = SparkContext(conf=conf)

    # create an accumulator for key-value pairs, where each key is a substring,
    # and each value is the set of strings where the substring can be found
    class ArrayAccumulatorParam(AccumulatorParam):
        def zero(self, initialValue):
            return initialValue

        def addInPlace(self, v1, v2):
            if type(v2) is list:
                v1.extend(v2)
            elif type(v2) is tuple:
                v1.append(v2)
            return v1

    acc = sc.accumulator([], ArrayAccumulatorParam())

    def generate_substrings(data_element):
        k, v = data_element
        i = 0
        while i < len(v):
            j = i + 1
            while j < len(v):
                acc.add((v[i:j], k))
                j += 1
            i += 1

    sc.parallelize([(k, v) for k, v in strands.iteritems()]).foreach(generate_substrings)
    all_substrings = sc.parallelize(acc.value)
    return all_substrings.groupByKey().filter(lambda x: set(list(x[1])) == set(strands.keys())).takeOrdered(1, key=lambda x: -len(x[0]))[0][0]
Example 5: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
def main():
    sc = SparkContext(appName="MyApp")
    sc.setLogLevel('ERROR')

    # Parse data
    train_labels, train_data = load_data('train.csv')
    dummy_labels, test_data = load_data('test.csv', use_labels=False)

    # Map each data point's label to its features
    train_set = reformatData(train_data, train_labels)
    test_set = reformatData(test_data, dummy_labels)

    # Parallelize the data
    parallelized_train_set = sc.parallelize(train_set)
    parallelized_test_set = sc.parallelize(test_set)

    # Split the data
    trainSet, validationSet = parallelized_train_set.randomSplit([1.0, 0.0], seed=42)

    # Train the model
    decisionTreeModel = DecisionTree.trainClassifier(trainSet, numClasses=5, categoricalFeaturesInfo={},
                                                     impurity='gini', maxBins=55, maxDepth=30,
                                                     minInstancesPerNode=2)

    # Test the model
    testDecisionTree(decisionTreeModel, parallelized_test_set)
Example 6: LookAlikeTest
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
class LookAlikeTest(unittest.TestCase):

    def setUp(self):
        conf = SparkConf().setAppName("Tests").setMaster("local")
        self.sc = SparkContext(conf=conf)

    def tearDown(self):
        self.sc.stop()

    def test_ratings_calculation(self):
        data = [("u1", 123), ("u1", 123), ("u1", 132),
                ("u2", 123), ("u2", 111), ("u2", 111), ("u2", 111), ("u2", 111),
                ("u3", 123), ("u3", 123), ("u3", 125), ("u3", 125), ("u3", 111)]
        input_data = self.sc.parallelize(data)
        ratings = calculate_ratings(input_data).collectAsMap()
        self.assertEqual(ratings["u1"][123], 1.0)
        self.assertEqual(ratings["u1"][132], 0.5)
        self.assertEqual(ratings["u2"][111], 1.0)
        self.assertEqual(ratings["u2"][123], 0.25)
        self.assertEqual(ratings["u3"][123], 1.0)
        self.assertEqual(ratings["u3"][125], 1.0)
        self.assertEqual(ratings["u3"][111], 0.5)

    def test_correlations_calculation(self):
        ratings = [("u1", {1: 0.5, 2: 1.0, 3: 0.1}),
                   ("u2", {1: 0.25, 3: 1.0}),
                   ("u3", {2: 0.25, 3: 1.0})]
        ratings_data = self.sc.parallelize(ratings)
        correlations = calculate_correlations(ratings_data, 3).collectAsMap()
        self.assertEqual(round(correlations[1], 2), -1.0)
        self.assertEqual(round(correlations[2], 2), -1.0)
Example 7: SparkBroadcastAccumulator
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
def SparkBroadcastAccumulator(n):
    global broadcast_var
    global accumulator_var
    spcon = SparkContext("local[2]", "SparkBroadcastAccumulator")
    broadcast_var = spcon.broadcast("broadcast_message")
    accumulator_var = spcon.accumulator(0)
    spcon.parallelize(xrange(1, n)).foreach(lambda x: broadcast_accumulator_receiver(accumulator_var.add(x)))
Example 8: parallelDisassembler
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
def parallelDisassembler(matrix, groups):
    def splitGroup(g):
        b, n1, n2 = isAFalse(g, matrix)
        if b:
            g1 = [n1]
            g2 = [n2]
            for nid in g:
                if nid != n1 and nid != n2:
                    sim1 = 0.0
                    sim2 = 0.0
                    for tmp in g1:
                        sim1 += jcSig(matrix[tmp], matrix[nid])
                    for tmp in g2:
                        sim2 += jcSig(matrix[tmp], matrix[nid])
                    if sim1 / len(g1) > sim2 / len(g2):
                        g1 += [nid]
                    else:
                        g2 += [nid]
            return g1, g2
        return ([], g)

    tmp = len(groups)
    sc = SparkContext(appName="Splitter")
    parrGroup = sc.parallelize(groups)
    groups = parrGroup.map(splitGroup).collect()
    tmpgrp = []
    for g1, g2 in groups:
        tmpgrp += [g1]
        tmpgrp += [g2]
    groups = tmpgrp
    while len(groups) != tmp:
        tmp = len(groups)
        #print(tmp)
        parrGroup = sc.parallelize(groups)
        groups = parrGroup.map(splitGroup).collect()
        tmpgrp = []
        for g1, g2 in groups:
            if g1 != []:
                tmpgrp += [g1]
            tmpgrp += [g2]
        groups = tmpgrp
    sc.stop()
    return groups
Example 9: _kmeans_spark
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
def _kmeans_spark(X, n_clusters, max_iter=300, worker_nums=10, init='k-means++', random_state=None, tol=1e-4):
    from pyspark import SparkContext, SparkConf
    conf = SparkConf().setAppName('K-Means_Spark').setMaster('local[%d]' % worker_nums)
    sc = SparkContext(conf=conf)
    data = sc.parallelize(X)
    data.cache()

    random_state = check_random_state(random_state)
    best_labels, best_inertia, best_centers = None, None, None

    x_squared_norms = row_norms(X, squared=True)
    # x_squared_norms = data.map(lambda x: (x*x).sum(axis=0)).collect()
    # x_squared_norms = np.array(x_squared_norms, dtype='float64')
    centers = _init_centroids(X, n_clusters, init, random_state, x_squared_norms=x_squared_norms)

    # split X into worker_nums blocks and distribute the blocks as an RDD
    bs = X.shape[0] / worker_nums
    data_temp = []
    for i in range(worker_nums - 1):
        data_temp.append(X[i * bs:(i + 1) * bs])
    data_temp.append(X[(worker_nums - 1) * bs:])
    data_temp = np.array(data_temp, dtype='float64')
    data_temp = sc.parallelize(data_temp)
    data_temp.cache()

    for i in range(max_iter):
        centers_old = centers.copy()

        all_distances = data_temp.map(lambda x: euclidean_distances(centers, x, squared=True)).collect()
        temp_all_distances = all_distances[0]
        for j in range(1, worker_nums):  # j, to avoid shadowing the outer loop variable
            temp_all_distances = np.hstack((temp_all_distances, all_distances[j]))
        all_distances = temp_all_distances
        # all_distances = data.map(lambda x: euclidean_distances(centers, x, squared=True)).collect()
        # # reshape, from (1, n_samples, k) to (k, n_samples)
        # all_distances = np.asarray(all_distances, dtype="float64").T[0]

        # Assignment, also called E-step of EM
        labels, inertia = _labels_inertia(X, x_squared_norms, centers, all_distances=all_distances)

        # re-computation of the centroids, also called M-step of EM
        centers = _centers(X, labels, n_clusters)

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        shift = squared_norm(centers_old - centers)
        if shift <= tol:
            break

    return best_centers, best_labels, best_inertia
Example 10: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
def main(num_factors, num_workers, num_iterations, beta_value, lambda_value, Wm_value, \
         V_filename, output_W_filename, output_H_filename):
    # Conf
    conf = SparkConf().setAppName("Spark SGD MF")
    sc = SparkContext(conf=conf)

    user_movie_ratings = sc.textFile(V_filename).map(line_to_movie_user_ratings)
    user_movie_ratings.persist()
    #global user_nonzero, movie_nonzero
    #user_nonzero = user_movie_ratings.keyBy(first_element).countByKey()
    #movie_nonzero = user_movie_ratings.keyBy(second_element).countByKey()
    num_users = int(user_movie_ratings.map(first_element).reduce(max))
    num_movies = int(user_movie_ratings.map(second_element).reduce(max))

    global updates_total
    updates_total = 0

    # Begin iterations
    iter = 0
    global seed
    while iter < num_iterations:
        # Initialize W and H
        if iter == 0:
            W = sc.parallelize(range(num_users + 1)).map(key_to_entry_rand).persist()    # (user_id, rand(num_factors))
            H = sc.parallelize(range(num_movies + 1)).map(key_to_entry_rand).persist()   # (movie_id, rand(num_factors))
        # Set random seed
        seed = random.randrange(MAXSEED)
        # Partition parameters
        W_blocks = W.keyBy(lambda W_entry: item_to_block(W_entry[0]))  # key: worker_id, value: (user_id, rand(num_factors))
        H_blocks = H.keyBy(lambda H_entry: item_to_block(H_entry[0]))  # key: worker_id, value: (movie_id, rand(num_factors))
        # Filter diagonal blocks
        V_diagonal = user_movie_ratings.filter(filter_diagonal).persist()  # (user_id, movie_id, rating) where worker_id(user_id) == worker_id(movie_id)
        V_blocks = V_diagonal.keyBy(lambda t: item_to_block(t[0]))  # key: worker_id, value: (user_id, movie_id, rating)
        updates_curr = V_diagonal.count()
        V_diagonal.unpersist()
        V_group = V_blocks.groupWith(W_blocks, H_blocks).coalesce(num_workers)  # key: worker_id, value: seq[V], seq[W], seq[H]
        # Perform SGD
        updatedWH = V_group.map(SGD_update).persist()
        W = updatedWH.flatMap(first_element).persist()
        H = updatedWH.flatMap(second_element).persist()
        updates_total += updates_curr
        iter += 1

    W_result = numpy.vstack(W.sortByKey().map(second_element).collect()[1:])
    H_result = numpy.vstack(H.sortByKey().map(second_element).collect()[1:])
    # Save W and H
    savetxt(output_W_filename, W_result, delimiter=',')
    savetxt(output_H_filename, H_result, delimiter=',')
    sc.stop()
Example 11: LinearRegressionModel
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
def LinearRegressionModel(dataPath, label, normalize, character, master, ispca):
    pca_n = 2
    sc = SparkContext(master)
    data = sc.textFile(dataPath)
    # not RDD data
    ndata = data.map(lambda line: line.split(character)).map(lambda part: (map(lambda x: float(x), part[0:len(part)])))
    if label == 0:
        ndata = ndata.map(lambda line: line[::-1])
    if normalize == 1:
        test_data = norm(ndata.collect())
        norm_data = sc.parallelize(test_data)
        train_data = norm_data.map(lambda part: lbp(part[0], part[1]))
        #raw_data = data.map(lambda line: line.split(character))
    else:
        test_data = ndata.map(lambda part: (part[len(part) - 1], part[0:len(part) - 1])).collect()
        train_data = ndata.map(lambda part: lbp(part[len(part) - 1], part[0:len(part) - 1]))
    if ispca == 1:
        pca = PCA(n_components=pca_n)
        pca_train = [test_data[i][1] for i in range(len(test_data))]
        pca_data = pca.fit(pca_train).transform(pca_train)
        test = []
        for i in range(len(pca_data)):
            test.append([test_data[i][0], pca_data[i]])
        train_data = sc.parallelize(test).map(lambda part: lbp(part[0], part[1]))
        test_data = test
    model_lr = lr.train(train_data)
    err_lr = 0.0
    size = len(train_data.collect())
    for i in range(size):
        err_lr = err_lr + abs(model_lr.predict(test_data[i][1]) - test_data[i][0])
    print "result:", err_lr / size
    String = "Linear Regression Result:\n"
    String = String + str(model_lr.weights) + '\n'
    String = String + "Error: " + str(err_lr / size)
    sc.stop()
    return String
Example 12: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
def main(image_files):
    sc = SparkContext(appName="Resize Images")
    sc.parallelize(image_files).map(resize_image_file).count()
    #read all the resized images into an array to save as a pickled object
    #out_dir = CUR_DIR + TEST_OR_TRAIN + '_' + str(IMAGE_SIZE)
    #save_images(out_dir)
    #read all the resized images into an array to save as a csv file
    out_dir = CUR_DIR + TEST_OR_TRAIN + '_' + str(IMAGE_SIZE)
    save_images_csv(out_dir)
Example 13: model
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
def model(classifier, ftrain, fvalid, fprediction):
    startTime = time.time()

    ctx = SparkContext(appName="model_on_Spark")
    sqlContext = SQLContext(ctx)
    logger = SparkLogger(ctx)
    logger.set_level('ERROR')

    # load and prepare training and validation data
    rawTrain, train = prepData(sqlContext, ctx, ftrain)
    rawValid, valid = prepData(sqlContext, ctx, fvalid)

    # is needed to join columns
    valid = indexData(valid)
    rawValid = indexData(rawValid)

    classifiers = {
        "RandomForestClassifier": RFC
    }

    clf = classifiers[classifier]()
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")

    # train and predict
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, clf])
    model = pipeline.fit(train)
    predictions = model.transform(valid)

    # write to file:
    subsetPrediction = predictions.select("prediction", "index")
    subsetValidData = rawValid.select("dataset", "index")
    output = (subsetValidData
              .join(subsetPrediction, subsetPrediction.index == subsetValidData.index)
              .drop("index")
              .drop("index"))
    lines = output.map(toCSVLine)
    lines.saveAsTextFile('output')

    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print "Test Error = %g" % (1.0 - accuracy)

    executionTime = time.time() - startTime
    row = classifier + ',' + str(executionTime)
    ctx.parallelize([row]).saveAsTextFile("timing")
Example 14: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
def main():
    HDFS_URI = "hdfs://hdfs.domain.cc/folder"
    sc = SparkContext()

    rdd = sc.parallelize([("a", 1), ("b", 2), ("c", 3)])
    rdd.saveAsNewAPIHadoopFile(HDFS_URI + "/01", "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat")
    rdd = sc.parallelize([("d", 4), ("e", 5), ("f", 6)])
    rdd.saveAsNewAPIHadoopFile(HDFS_URI + "/02", "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat")

    folder = TwoDHDFSMap(sc, HDFS_URI)
    print("hdfsURI test: ", folder.hdfsURI == HDFS_URI)
    print("folder[\"01\"][\"a\"] test: ", folder["01"]["a"] == 1)
    print("\"01\" in folder test: ", "01" in folder)
    print("\"02\" in folder test: ", "02" in folder)
    print("folder[\"02\"][\"d\"] test: ", folder["02"]["d"] == 4)
Example 15: TestCalculator
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import parallelize [as alias]
class TestCalculator(unittest.TestCase):

    def setUp(self):
        conf = SparkConf().setAppName("appTest").setMaster("local[*]")
        self.sc = SparkContext(conf=conf)
        self.setMovies = SetMovies()

    def tearDown(self):
        self.sc.stop()

    def test_when_calculate_set_word_most_repeater(self):
        entry = [('Toy', (1, ['::Toy Story Toy (1995)::'])),
                 ('ToyA', (3, ['::ToyA StoryA ToyA (1995)::'])),
                 ('Story', (1, ['::Toy Story Toy (1995)::'])),
                 ('StoryA', (3, ['::ToyA StoryA ToyA (1995)::']))]
        result = (('ToyA', ['::ToyA StoryA ToyA (1995)::']),
                  ("StoryA", ["::ToyA StoryA ToyA (1995)::"]))
        funcReverseTuple = lambda value: (value[1][0], (value[0], value[1][1]))
        rdd = self.sc.parallelize(entry)
        self.assertEqual(self.setMovies.setWithMaxValues(rdd, funcReverseTuple), result)

    def test_when_calculate_set_word_most_repeater_one(self):
        entry = [('Toy', (1, ['::Toy Story Toy (1995)::'])),
                 ('ToyA', (3, ['::ToyA StoryA ToyA (1995)::'])),
                 ('Story', (1, ['::Toy Story Toy (1995)::'])),
                 ('StoryA', (1, ['::ToyA StoryA ToyA (1995)::']))]
        result = ('ToyA', ['::ToyA StoryA ToyA (1995)::'])
        funcReverseTuple = lambda value: (value[1][0], (value[0], value[1][1]))
        rdd = self.sc.parallelize(entry)
        self.assertEqual(self.setMovies.setWithMaxValues(rdd, funcReverseTuple), result)

    def test_when_calculate_maximum_year(self):
        entry = [('(1996)', 2),
                 ('(1998)', 2),
                 ('(1997)', 1)]
        result = ('(1996)', '(1998)')
        rdd = self.sc.parallelize(entry)
        funcReverseTuple = lambda value: (value[1], value[0])
        self.assertEqual(self.setMovies.setWithMaxValues(rdd, funcReverseTuple), result)

    def test_when_calculate_maximum_year_with_only_one(self):
        entry = [('(1996)', 2),
                 ('(1998)', 1),
                 ('(1997)', 1),
                 ('(1999)', 1)]
        result = '(1996)'
        rdd = self.sc.parallelize(entry)
        funcReverseTuple = lambda value: (value[1], value[0])
        self.assertEqual(self.setMovies.setWithMaxValues(rdd, funcReverseTuple), result)