This article collects typical usage examples of the Python method pyspark.SparkContext.broadcast. If you are wondering what SparkContext.broadcast does or how to use it, the curated code examples below may help. You can also explore further usage examples of the containing class pyspark.SparkContext.
The following shows 15 code examples of SparkContext.broadcast, sorted by popularity by default.
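Before the examples, here is a minimal sketch of the basic pattern (the app name and the small lookup dict are made up for illustration): the driver wraps a read-only value with sc.broadcast(), tasks read it through .value inside their closures, and the handle can be released with unpersist() when it is no longer needed.

# A minimal, hypothetical sketch of SparkContext.broadcast usage
# (illustrative only; "lookup" and the app name are invented for this example).
from pyspark import SparkContext

sc = SparkContext(appName="broadcastSketch")
lookup = sc.broadcast({"a": 1, "b": 2})        # ship a read-only dict to all executors once
rdd = sc.parallelize(["a", "b", "a"])
counts = rdd.map(lambda k: lookup.value.get(k, 0)).collect()  # read it via .value inside the closure
print(counts)                                  # [1, 2, 1]
lookup.unpersist()                             # release executor copies when no longer needed
sc.stop()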
Example 1: main
# Required module: from pyspark import SparkContext [as alias]
# Alternatively: from pyspark.SparkContext import broadcast [as alias]
def main(training_file, n):
    epochs = int(n)
    x, y, tags = read_training_data(training_file)
    v = {}
    sc = SparkContext(appName="parameterMixing")
    tags = sc.broadcast(tags)
    time0 = time.time()
    training_data = []
    for i in range(len(x)):
        training_data.append((x[i], y[i]))
    train_data = sc.parallelize(training_data).cache()
    for round in range(0, epochs):
        fv = sc.broadcast(v)
        feat_vec_list = train_data.mapPartitions(lambda t: perc_train(t, tags.value, fv.value))
        feat_vec_list = feat_vec_list.combineByKey((lambda x: (x, 1)),
                                                   (lambda x, y: (x[0] + y, x[1] + 1)),
                                                   (lambda x, y: (x[0] + y[0], x[1] + y[1]))).collect()
        for (feat, (a, b)) in feat_vec_list:
            v[feat] = float(a) / float(b)
    sc.stop()
    # Compute the weight vector using the Perceptron algorithm
    # trainer.perceptron_algorithm(5)
    print "%d iterations in %f seconds" % (epochs, time.time() - time0)
    # Write out the final weight vector
    write_weight_vector(v)
Example 2: createContext
# Required module: from pyspark import SparkContext [as alias]
# Alternatively: from pyspark.SparkContext import broadcast [as alias]
def createContext():
    uBATCH_INTERVAL = 10
    sc = SparkContext(SPARK_MASTER, appName="StreamingKafka")
    # note: the broadcast handles are not retained here, so the values
    # cannot be referenced from worker closures later
    sc.broadcast(batchUserPostDict)
    sc.broadcast(batchPostUserDict)
    #sc = SparkContext("local[*]", appName="StreamingKafka")
    # streaming batch interval of 5 sec first, and reduce later to 1 sec or lower
    ssc = StreamingContext(sc, uBATCH_INTERVAL)
    ssc.checkpoint(CHECKPOINT_DIR)  # set checkpoint directory in HDFS
    #ssc.checkpoint(10 * uBATCH_INTERVAL)
    return ssc

ssc = StreamingContext.getOrCreate(CHECKPOINT_DIR, createContext)
Example 3: geneSpark
# Required module: from pyspark import SparkContext [as alias]
# Alternatively: from pyspark.SparkContext import broadcast [as alias]
def geneSpark(input_filename, output_filename,
              upstream_bp=2000, downstream_bp=500):
    '''
    Performs geneSpark extensions given an `input_filename`
    and stores the output in `output_filename`.

    Parameters
    ----------
    input_filename : string
        path to the GTF file
    output_filename : string
        path to the output extended GTF file
    upstream_bp : int (default=2000)
        extend upstream of the first exon of each gene
    downstream_bp : int (default=500)
        extend downstream of the last exon of each gene
    '''
    # create spark context
    sc = SparkContext(appName="geneSpark")
    # set up broadcast variables
    upstream_bp_var = sc.broadcast(upstream_bp)
    downstream_bp_var = sc.broadcast(downstream_bp)
    # create a temporary folder in which to store the output chunks
    tempFile = NamedTemporaryFile(delete=True)
    tempFile.close()
    # define the spark pipeline
    (sc.textFile(input_filename)
       .map(lambda x: x.split('\t'))
       .filter(lambda x: x[2] == 'exon')
       .map(parse_line)
       .reduceByKey(min_and_max)
       .sortByKey()
       .map(partial(geneSpark,
                    upstream_bp=upstream_bp_var,
                    downstream_bp=downstream_bp_var))
       .saveAsTextFile(tempFile.name))
    # merge output chunks into a single output_filename
    with open(output_filename, 'w') as fw:
        for line in input(sorted(glob(tempFile.name + "/part-000*"))):
            fw.write(line)
    sc.stop()
Example 4: main
# Required module: from pyspark import SparkContext [as alias]
# Alternatively: from pyspark.SparkContext import broadcast [as alias]
def main(name, divide):
    """
    old_g = pickle.load(open("/net/data/facebook/facebook-ucsb/Facebook_2008/"+name +"/original_pickles/"+name +".pickle", 'r'))
    new_g = networkx.Graph()
    for node, friends in old_g.adj.iteritems():
        if node not in new_g.nodes():
            new_g.add_node(node)
        for friend in friends.iterkeys():
            new_g.add_node(friend)
            new_g.add_edge(node, friend)
    """
    # serialize the networkx graph as text files of edgelist
    # into a text file for workers to read
    # networkx.write_edgelist(new_g, "edgelist/"+name, data=False)
    # subprocess.check_call("hdfs dfs -put edgelist/"+name+ " edgelist/", shell=True)
    new_g = networkx.read_adjlist(name + "_list.txt")  # Egypt_list is an edge list
    sc = SparkContext(appName="Sorted_removal")
    dataG = json_graph.node_link_data(new_g)
    stringG = json.dumps(dataG)
    originalG = sc.broadcast(stringG)
    edges = sc.textFile("hdfs://scrapper/user/xiaofeng/edgelist/" + name, 192 * 4 * int(divide))
    costs = edges.map(lambda line: line.split(" ")).map(lambda edge: edge_to_cost(edge, originalG.value))
    costs.saveAsTextFile("hdfs://scrapper/user/xiaofeng/costs_" + name)
    sc.stop()
    subprocess.check_call("hdfs dfs -get costs_" + name + " /home/xiaofeng/facebook/FacebookProject/costs/", shell=True)
    Reformat("/home/xiaofeng/facebook/FacebookProject/costs/costs_" + name + "/", name)
Example 5: main
# Required module: from pyspark import SparkContext [as alias]
# Alternatively: from pyspark.SparkContext import broadcast [as alias]
def main():
    # Ensure a search term was supplied at the command line
    if len(sys.argv) != 2:
        sys.stderr.write("Usage: {} <search_term>".format(sys.argv[0]))
        sys.exit()
    # Create the SparkContext
    sc = SparkContext(appName="SparkWordCount")
    # Broadcast the requested term
    requested_movie = sc.broadcast(sys.argv[1])
    # Load the input file
    source_file = sc.textFile("/user/hduser/input/movies")
    # Get the movie title from the second field
    titles = source_file.map(lambda line: line.split("|")[1])
    # Create a map of the normalized title to the raw title
    normalized_title = titles.map(lambda title: (re.sub(r"\s*\(\d{4}\)", "", title).lower(), title))
    # Find all movies matching the requested_movie
    matches = normalized_title.filter(lambda x: requested_movie.value in x[0])
    # Collect all the matching titles
    matching_titles = matches.map(lambda x: x[1]).distinct().collect()
    # Display the result
    print "{} Matching titles found:".format(len(matching_titles))
    for title in matching_titles:
        print title
    sc.stop()
Example 6: main
# Required module: from pyspark import SparkContext [as alias]
# Alternatively: from pyspark.SparkContext import broadcast [as alias]
def main():
    """Process the input file given as a command-line argument."""
    global stop_words, punctuations
    input_file, feature_dimensions, num_clusters, max_iterations, runs = _parse_cmd_line_args()
    sc = SparkContext(conf=_get_conf("CS-838-Assignment3-PartB"))
    # for the _tokenize function to remove stopwords and punctuation
    stop_words = sc.broadcast(set(stopwords.words("english")))
    punctuations = sc.broadcast(set(string.punctuation))
    input_text_rdd, tfidf_vectors_rdd = get_feature_vectors(sc, input_file, feature_dimensions)
    model = build_cluster_model(tfidf_vectors_rdd, num_clusters, max_iterations, runs)
    top_n_in_each_cluster(sc, input_text_rdd, tfidf_vectors_rdd, model, 5)
Example 7: SparkBroadcastAccumulator
# Required module: from pyspark import SparkContext [as alias]
# Alternatively: from pyspark.SparkContext import broadcast [as alias]
def SparkBroadcastAccumulator(n):
    global broadcast_var
    global accumulator_var
    spcon = SparkContext("local[2]", "SparkBroadcastAccumulator")
    broadcast_var = spcon.broadcast("broadcast_message")
    accumulator_var = spcon.accumulator(0)
    spcon.parallelize(xrange(1, n)).foreach(lambda x: broadcast_accumulator_receiver(accumulator_var.add(x)))
Example 8: run
# Required module: from pyspark import SparkContext [as alias]
# Alternatively: from pyspark.SparkContext import broadcast [as alias]
def run(date):
    """
    Load the business-provided rules from HDFS
    and wrap them into FunnelRule objects,
    e.g. [FunnelRule(funnelId=u'1496', ruleId=u'896', level=u'1', requestRule=u'contains')]
    """
    sc = SparkContext(appName="readHdfsFile", master=conf.sparkURL)
    rulesList = readFile(sc, conf.dim_model_url_new).flatMap(lambda line: line.split('\r\n')).map(buildBean).collect()
    rules_lookup = sc.broadcast(rulesList)
    """
    step 2: load the clickstream log, match it against the rule table, drop invalid records,
            and build the structure used for later analysis (1 input row -----> N+ output rows)
    step 4: generate the new keys
    step 5:
    """
    """
    >>> rdd2 = sc.parallelize([['1\t1', ['1','1','2','a']], ['1\t1', ['1','1','1','b']], ['2\t1', ['2','1','1','b']]])
    >>> rdd2.groupByKey().map(lambda line: list(line[1])).filter(lambda x: x[0][0] == '1').flatMap(lambda x: x).collect()
    [['1', '1', '2', 'a'], ['1', '1', '1', 'b']]
    """
    # conf.click_jr_log_url_dir + "/dt=" + date
    clickLogRDD = readFile(sc, "/funnelNew/input/click_log/000000_0").map(rowSplit)
    clickLogRDD1 = (clickLogRDD.flatMap(lambda line: funnelFilter.getList(line[0], rules_lookup))
                    .groupByKey()
                    .map(lambda line: line[1])
                    .filter(reduceFilter)
                    .flatMap(lambda x: x)
                    .map(countSessionKey)
                    .partitionBy(1)
                    .reduceByKey(add))
    clickLogRDD1.saveAsTextFile("/funnelNew/output/dt=" + date)
Example 9: _train_spark
# Required module: from pyspark import SparkContext [as alias]
# Alternatively: from pyspark.SparkContext import broadcast [as alias]
def _train_spark(data, n_components, n_pc, covar_types, verbose, n_jobs, n_iter_search):
    # Spark configuration.
    conf = (SparkConf()
            .setMaster("local[" + str(n_jobs) + "]")
            .setAppName("FDD")
            .set("spark.executor.memory", "512mb")
            .set("spark.cores.max", str(n_jobs)))
    sc = SparkContext(conf=conf)
    # Build hyperparameter vectors.
    parameters = cartesian((n_components,
                            n_pc,
                            covar_types))
    # Distribute the hyperparameter vectors.
    parameters_rdd = sc.parallelize(parameters, 96)
    # Broadcast the data to all workers.
    data_broadcast = sc.broadcast(data)
    # Train a model for each hyperparameter set.
    models = parameters_rdd.map(lambda param: train_with_parameters(param, data_broadcast))
    # Persist the models to avoid re-computation.
    models.persist(StorageLevel(True, True, False, True, 1))
    # Sort by BIC.
    sorted_models = models.sortBy(lambda model: model[0])
    # The first is the best model.
    best_model = sorted_models.collect()[0][1]
    sc.stop()
    return best_model
Example 10: count_triangles
# Required module: from pyspark import SparkContext [as alias]
# Alternatively: from pyspark.SparkContext import broadcast [as alias]
def count_triangles(data, master="local[2]"):
    """
    @brief: Count triangles using Spark
    @param data: The data location for the input files
    @param master: The master URL as defined at
           https://spark.apache.org/docs/1.1.0/submitting-applications.html#master-urls
    """
    ################# NO EDITS HERE ###################
    assert not os.path.exists("triangles.out"), \
        "File: triangles.out already exists"
    sc = SparkContext(master, "Triangle Count")
    start = time()
    ############### END NO EDITS HERE ################
    # TODO: Your code goes here!
    people = sc.textFile(data)
    AdjList = people.map(makepair)
    DriverAdj = dict(AdjList.collect())
    WorkerAdj = sc.broadcast(DriverAdj)
    Edges = AdjList.flatMapValues(lambda x: x)
    TriSet = Edges.map(lambda (k, v): ((k, v),
                                       AintersectB(k, v, WorkerAdj.value)))
    Triangle = TriSet.flatMapValues(lambda x: x).map(
        lambda (k, v): tuple(sorted([int(v), int(k[0]), int(k[1])], reverse=True)))
    output = set(Triangle.collect())
    ################# NO EDITS HERE ###################
    print "\n\n*****************************************"
    print "\nTotal algorithm time: %.4f sec \n" % (time() - start)
    print "*****************************************\n\n"
    ############### END NO EDITS HERE ################
    with open("triangles.out", "wb") as f:
        # write each triangle to the output file serially, one per line
        for friends in output:
            f.write(str(friends[0]) + " " + str(friends[1]) + " " + str(friends[2]) + "\n")
Example 11: SLAPmi_initialize_spark
# Required module: from pyspark import SparkContext [as alias]
# Alternatively: from pyspark.SparkContext import broadcast [as alias]
def SLAPmi_initialize_spark(fullpath):
    D = io.loadmat(fullpath, struct_as_record=False, squeeze_me=True)
    obs = D['obs']
    opts = D['opts']
    Y = obs.data_in
    P0 = opts.P.T  # transpose
    Sk = D['Sk']
    Su = D['Su']
    if len(Su.shape) < 2:
        Su = Su[:, None]
    masks = D['masks']
    #S = Sk
    #S = np.concatenate((Sk,Su), axis=1)

    def P(frame):
        return P0

    def solveOneFrame(frameDataIn):  # frameData has structure [framenumber, y[:,framenumber]]
        Pt = P(frameDataIn[0])
        #PSk = np.zeros((Pt.shape[0], Sk.shape[0]))
        #for Sk_ix in range(len(Sk)):
        #    PSk[:, Sk_ix] = Pt[:,masks[:,Sk_ix].toarray()[:,0]].dot(Sk[Sk_ix])
        #code.interact(local=locals())
        PSk = Pt.dot(Sk_bc.value).toarray()
        PSu = Pt.dot(Su_bc.value)
        PS = np.concatenate((PSk, PSu), axis=1)
        F = optimize.nnls(PS, frameDataIn[1])
        #code.interact(local=locals())
        return F[0]

    #code.interact(local=locals())
    conf = SparkConf().setAppName('SLAPmi_initialize')
    sc = SparkContext(conf=conf)
    Sk_bc = sc.broadcast(Sk)
    Su_bc = sc.broadcast(Su)
    frameData = [(i, Y[:, i]) for i in range(Y.shape[1])]
    F_solved = np.array(sc.parallelize(frameData, len(frameData)).map(solveOneFrame).collect())

    print 'F_solved', F_solved.shape
    print 'Sk', Sk.shape
    print 'Su', Su.shape
    Fk = F_solved[:, 0:Sk.shape[1]].T
    Fu = F_solved[:, Sk.shape[1]:(Sk.shape[1] + Su.shape[1])].T
    return Sk, Su, Fk, Fu, obs, opts, masks, D['ground_truth']
Example 12: BroadcastTest
# Required module: from pyspark import SparkContext [as alias]
# Alternatively: from pyspark.SparkContext import broadcast [as alias]
class BroadcastTest(unittest.TestCase):

    def tearDown(self):
        if getattr(self, "sc", None) is not None:
            self.sc.stop()
            self.sc = None

    def _test_encryption_helper(self, vs):
        """
        Creates a broadcast variable for each value in vs, and runs a simple job to make sure the
        value is the same when it's read in the executors. Also makes sure there are no task
        failures.
        """
        bs = [self.sc.broadcast(value=v) for v in vs]
        exec_values = self.sc.parallelize(range(2)).map(lambda x: [b.value for b in bs]).collect()
        for ev in exec_values:
            self.assertEqual(ev, vs)
        # make sure there are no task failures
        status = self.sc.statusTracker()
        for jid in status.getJobIdsForGroup():
            for sid in status.getJobInfo(jid).stageIds:
                stage_info = status.getStageInfo(sid)
                self.assertEqual(0, stage_info.numFailedTasks)

    def _test_multiple_broadcasts(self, *extra_confs):
        """
        Test that broadcast variables make it OK to the executors. Tests multiple broadcast
        variables, and also multiple jobs.
        """
        conf = SparkConf()
        for key, value in extra_confs:
            conf.set(key, value)
        conf.setMaster("local-cluster[2,1,1024]")
        self.sc = SparkContext(conf=conf)
        self._test_encryption_helper([5])
        self._test_encryption_helper([5, 10, 20])

    def test_broadcast_with_encryption(self):
        self._test_multiple_broadcasts(("spark.io.encryption.enabled", "true"))

    def test_broadcast_no_encryption(self):
        self._test_multiple_broadcasts()

    def _test_broadcast_on_driver(self, *extra_confs):
        conf = SparkConf()
        for key, value in extra_confs:
            conf.set(key, value)
        conf.setMaster("local-cluster[2,1,1024]")
        self.sc = SparkContext(conf=conf)
        bs = self.sc.broadcast(value=5)
        self.assertEqual(5, bs.value)

    def test_broadcast_value_driver_no_encryption(self):
        self._test_broadcast_on_driver()

    def test_broadcast_value_driver_encryption(self):
        self._test_broadcast_on_driver(("spark.io.encryption.enabled", "true"))
Example 13: process
# Required module: from pyspark import SparkContext [as alias]
# Alternatively: from pyspark.SparkContext import broadcast [as alias]
def process(master, input_container, output_container):
    sc = SparkContext(master, "CDNBilling")
    # load broadcast variables
    countryMapRDD = sc.textFile(input_container + "/country_map.tsv")
    countryMapList = countryMapRDD.collect()
    sc.broadcast(countryMapList)
    countryMapDict.update(createCountryDict(countryMapList))
    # load domain logs
    domainsRawRDD = sc.textFile(input_container + "/domains_map.tsv")
    domainsRDD = domainsRawRDD.map(formatDomainsLine)
    # load logs
    logsRDD = sc.textFile(input_container + "/raxcdn_*.gz")
    # drop the header
    actual_log_lines = logsRDD.filter(lambda x: x[0] != '#')
    # filter by date
    filteredRDD = actual_log_lines.filter(filterByDate)
    # format the data
    formattedRDD = filteredRDD.map(formatLogLine, countryMapDict)
    # zero-event domains
    domains_unused = domainsRDD.subtractByKey(formattedRDD)
    domains_unused_formatted = domains_unused.map(formatUnusedDomain)
    # for each domain, calculate bandwidth and request count
    aggregatedLogs = formattedRDD.combineByKey(createCombiner, mergeValue,
                                               mergeCombiners)
    # add type of domain, project-ID, service-ID
    joinedWithDomainDetails = aggregatedLogs.join(domainsRDD)
    # join the usage logs with the domains map, including zero events
    joinedLogs = joinedWithDomainDetails.union(domains_unused_formatted)
    # save the output
    joinedLogs.saveAsTextFile(output_container + "/output-files")
    sc.stop()
Example 14: main
# Required module: from pyspark import SparkContext [as alias]
# Alternatively: from pyspark.SparkContext import broadcast [as alias]
def main():
    conf = SparkConf().setAppName("Test2")
    sc = SparkContext(conf=conf)

    # new_dict converts a <tuple, value> pair into a <tuple_1, dict(tuple_2, value)> pair
    def new_dict(line):
        Dict = dict()
        Dict[line[0][1]] = line[1]
        return (line[0][0], Dict)

    # read the raw files, producing <file, content> pairs
    data_raw = sc.wholeTextFiles("/home/djt/data/proclassified")

    # Doc splits the content of a <file, content> pair by line; each line is the text of one verdict
    def Doc(line):
        s = line[1].split("\n")
        return s[0:len(s) - 1]

    # <file, content> pairs => <verdict path, verdict content> pairs
    data = data_raw.flatMap(Doc)

    # turn a verdict path into an ID
    def DocID(string):
        s = filter(lambda x: x.isdigit(), string)
        return s[1:len(s)]

    # <verdict path, verdict content> => <verdict ID, verdict content>
    data_wordsplit = data.map(lambda line: (DocID(line.split(",<")[0]), line.split(",<")[1].split(" ")))

    # remove the spaces left by word segmentation so the regular expressions can match later
    def Doc_Integration(line):
        doc = ""
        for k in line[1]:
            doc += k
        return (line[0], doc)

    # <verdict ID, verdict content (with spaces)> => <verdict ID, verdict content>
    data_doc = data_wordsplit.map(Doc_Integration)

    # extract the candidate dimensions from the keyword file and compile them as regular expressions
    keywords_raw = sc.textFile("/home/djt/data/keywords_crime.txt")
    keywords = keywords_raw.map(
        lambda line: re.compile(line)).collect()
    # broadcast the <dimension, set(keywords)> pairs
    keywords = sc.broadcast(keywords)

    # match all corruption offence types (charges) that appear in each verdict with the regular expressions
    def keywords_stats(line):
        doc = line[1]
        # doc is the verdict text; value[0] is the compiled regular expression
        temp = keywords.value[0].findall(doc)
        crime_set = set(temp)
        crime = ""
        for k in crime_set:
            crime += "\t" + k
        return (line[0], crime)

    # raw: <verdict ID, all offence types (charges) that appear>
    raw = data_doc.map(keywords_stats)
    after = raw.sortByKey()
    # write the output
    res = after.map(lambda (k, v): k + "\t" + v)
    res.saveAsTextFile("/home/djt/data/out")
Example 15: computeMinHashSig
# Required module: from pyspark import SparkContext [as alias]
# Alternatively: from pyspark.SparkContext import broadcast [as alias]
def computeMinHashSig(K, N, rdd):
    """
    :param K: number of random hash functions (i.e., the number of rows of the signature matrix)
    :param N: maximum number of elements in any of the considered sets
    :param rdd: RDD where each record contains one set, represented as a sorted list of 32-bit integers
                from the range [1, ..., N]
    :return: RDD containing the signature matrix, stored column-wise.
             That is, one record holds the K entries that correspond to the signature of one set.
    """
    sc = SparkContext(appName="PythonMinhash")
    # first choose a set of K random hash functions h1, ..., hK (described in lecture 5 on slide 33)
    hashParams = sc.broadcast(generateHashParams(K))
    data = sc.parallelize(rdd)
    sig = data.map(lambda x: computeSig(hashParams.value, N, x))
    return sig.collect()
Developer ID: melhindi, Project: Assignments_ParallelDataProcessingAndAnalysis, Lines of code: 19, Source file: IPDPA_Assignment4_3_MEH.py
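A possible invocation of computeMinHashSig, assuming the helper functions generateHashParams and computeSig from the same assignment are available; the sets and parameter values below are made up for illustration:

# Hypothetical call: three small sets over the universe [1, ..., 10], signed with K=4 hash functions.
sets = [[1, 3, 7], [2, 3, 9, 10], [1, 2, 3]]
signatures = computeMinHashSig(K=4, N=10, rdd=sets)
# Each entry of `signatures` is the K-row MinHash signature of one input set;
# sets with high Jaccard similarity tend to agree in many signature rows.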