This article collects typical usage examples of the Python pyspark.SparkContext class. If you are unsure what SparkContext is for or how to use it, the curated class examples below should help. A total of 15 code examples are shown, sorted by popularity by default.
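Before diving into the examples, here is a minimal, self-contained sketch of the typical SparkContext lifecycle (create, use, stop). The application name and input path are placeholders, not taken from any of the examples below.

from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("minimal_example").setMaster("local[2]")
sc = SparkContext(conf=conf)
# build an RDD, run a simple word count, and collect the result on the driver
counts = (sc.textFile("input.txt")  # placeholder path
            .flatMap(lambda line: line.split())
            .map(lambda word: (word, 1))
            .reduceByKey(lambda a, b: a + b)
            .collect())
sc.stop()  # always release the context when done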
Example 1: run
def run(self):
    sc = SparkContext("local", "gender")
    sqlContext = SQLContext(sc)
    # StringType = (str, unicode)
    _out = self.output().open('w')
    # lines = sc.textFile("myUser.csv")
    # fobj = self.input().open("r")
    # lines = sc.textFile(fobj.name)
    print(type(self.required_tasks['insert_source'].output()))
    print(self.required_tasks['insert_source'])
    # print(self.input()['insert_source'].input())
    lines = sc.textFile("myUser.csv")
    parts = lines.map(lambda l: l.split(","))
    users = parts.map(lambda p: (p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
                                 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], p[16], p[17], p[18], p[19]))
    schemaString = "userId lmsUserId lmsName orgName name gender registrationDate emailId mothertounge highestEduDegree goals city state active firstAccesDate lastAccessDate allowCert yearOfBirth pincode aadharId"
    print(schemaString)
    _out.write(schemaString)
    fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
    schema = StructType(fields)
    # schemaUser = sqlContext.createDataFrame(users, schema)
    schemaUser = sqlContext.applySchema(users, schema)
    schemaUser.registerTempTable("users")
    results = sqlContext.sql("SELECT gender FROM users")
    genders = results.map(lambda p: (p, 1))
    counts = genders.reduceByKey(lambda a, b: a + b)  # .map(lambda t : ("Gender " + t(0) + " No " + t(1))).collect()
    for name in counts.collect():
        _out.write(str(name))
    _out.close()
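Note that Example 1 uses the older SQLContext.applySchema API (the createDataFrame call is left commented out in the source). On more recent Spark releases applySchema has been removed, and the equivalent call, assuming the same users RDD and schema StructType as above, would be roughly:

# rough equivalent on newer Spark versions (assumes the same `users` and `schema` as in Example 1)
schemaUser = sqlContext.createDataFrame(users, schema)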
Example 2: main
def main():
    inputs = sys.argv[1]
    output = sys.argv[2]

    conf = SparkConf().setAppName('scalable multiplication')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'

    text = sc.textFile(inputs)

    # sbaronia - split the row to get individual numbers
    row = text.map(lambda line: line.split())

    # sbaronia - call element_wise_product on each line and then add
    # all the returned 10x10 matrices to get the final matrix
    sub = row.map(element_wise_product).reduce(add_tuples)

    # sbaronia - write formatted output to a file as a 10x10 matrix
    result = open(output, 'w')
    count = 0
    for i in range(len(sub)):
        result.write(str(sub[i]) + " ")
        count += 1
        if count == 10:
            result.write("\n")
            count = 0
    result.close()
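Example 2 calls two helpers, element_wise_product and add_tuples, whose definitions are not part of the excerpt. A purely hypothetical sketch consistent with the comments (each input row is assumed to hold 10 numbers, so the outer product is a flattened 10x10 matrix) might look like:

# hypothetical helpers, reconstructed from the comments above;
# the original definitions are not shown in this excerpt
def element_wise_product(row):
    # outer product of a 10-number row with itself, flattened to a 100-tuple
    nums = [float(x) for x in row]
    return tuple(a * b for a in nums for b in nums)

def add_tuples(a, b):
    # element-wise sum of two equal-length tuples
    return tuple(x + y for x, y in zip(a, b))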
Example 3: do_all
def do_all(f_path, out_name):
    sc = SparkContext()
    data = sc.textFile(f_path)
    data = data.map(parseKeepD).filter(lambda p: p[0] != None)

    # scale features
    features = data.map(lambda x: x[0].features)
    summary = Statistics.colStats(features)
    global means
    global varis
    means = summary.mean()
    varis = summary.variance()

    # scale the points
    data = data.map(lambda y: (conv_label_pt(y[0]), y[1]))
    # train model
    model = LinearRegressionWithSGD().train(data.map(lambda x: x[0]), intercept=True, regType='none')
    # calculate disparity
    disparity = data.map(lambda p: (p[0].label, model.predict(p[0].features), p[1]))
    # calculate SSR for later
    ssr = disparity.map(lambda x: (x[0] - x[1])**2).sum()
    # keep N
    N = disparity.count()

    MSE = ssr / float(N)
    se = std_errors(data, MSE, N)
    disparity.saveAsTextFile(out_loc + out_name)

    # shut down the SparkContext
    sc.stop()
    return model.intercept, model.weights, se, disparity, ssr, N
Example 4: KMeansModel
def KMeansModel(dataPath, label, k, character, master):
    sc = SparkContext(master)
    data = sc.textFile(dataPath).map(lambda line: line.replace(character, ','))

    if label == 0:
        label_sum = data.map(lambda line: line.split(',')).map(lambda data: (float(data[0]), 1)).reduceByKey(add).collect()
        label = data.map(lambda line: line.split(',')).map(lambda data: float(data[0])).collect()
        train_data = data.map(lambda line: line.split(',')).map(lambda x: map(lambda part: float(part), x[1:len(x)]))
    else:
        label_sum = data.map(lambda line: line.split(',')).map(lambda data: (float(data[-1]), 1)).reduceByKey(add).collect()
        label = data.map(lambda line: line.split(',')).map(lambda data: float(data[-1])).collect()
        train_data = data.map(lambda line: line.split(',')).map(lambda x: map(lambda part: float(part) if part is not None else '', x[:len(x) - 1]))

    model = km.train(train_data, k)
    predict_data = train_data.collect()
    train = len(predict_data)
    acc = 0
    for i in range(len(label_sum)):
        ksum = np.zeros(k, dtype=int)
        cur_label = label_sum[i][0]
        for j in range(train):
            if label[j] == cur_label:
                ksum[model.predict(predict_data[j])] += 1
        acc += max(ksum)

    string = "KMeans Result: \n"
    center = model.centers
    for i in range(k):
        cur = str(i) + ":" + str(center[i]) + '\n'
        string += cur
    string = string + "Acc: " + str((float(acc) / train) * 100) + "%"
    sc.stop()
    return string
Example 5: main
def main():
    cleanup()
    sc = SparkContext()
    spark = SparkSession(sc)
    path = os.path.join(mysql_export_dir, "name_string_indices.tsv")
    df = spark.read.csv(path, header=True, inferSchema=True, sep='\t', nullValue='NULL')
    names = df.select('name').rdd.map(lambda r: r['name'])
    names_json = parse_spark(sc, names) \
        .map(json.loads) \
        .zip(df.rdd)
    synonym_names = names_json.filter(lambda n: is_synonym(n))
    accepted_names = names_json.filter(lambda n: not is_synonym(n))
    synonym_names_with_accepted_columns = synonym_names \
        .map(to_key_value) \
        .leftOuterJoin(accepted_names.map(to_key_value)) \
        .map(add_accepted_data_to_synonym_name)
    accepted_names_with_accepted_columns = accepted_names \
        .map(add_accepted_data_accepted_name)
    sc.union([synonym_names_with_accepted_columns, accepted_names_with_accepted_columns]) \
        .map(join_fields) \
        .saveAsTextFile(output_dir_name_string_indices)
Example 6: stackexchange_json_spark_job
def stackexchange_json_spark_job():
    """
    Spark job to convert JSON data from HDFS into questions and answers.
    The result is written into Elasticsearch for text-based search by the user.
    """
    server = bluebook_conf.HDFS_FQDN
    conf = SparkConf().setAppName("stackexchange_json_spark_job")
    spark_context = SparkContext(conf=conf)

    json_ques_folder_address = "hdfs://" + server + "/" +\
        bluebook_conf.STACKEXCHANGE_JSON_QUES_FOLDER_NAME +\
        "/part-*"
    json_ans_folder_address = "hdfs://" + server + "/" +\
        bluebook_conf.STACKEXCHANGE_JSON_ANS_FOLDER_NAME +\
        "/part-*"

    # Question and answer files are read separately from HDFS
    ques_file = spark_context.textFile(json_ques_folder_address)
    ans_file = spark_context.textFile(json_ans_folder_address)
    ques_tups = ques_file.map(lambda line: stackexchange_json_mapper(line, 'ques'))
    ans_tups = ans_file.map(lambda line: stackexchange_json_mapper(line, 'ans'))

    # Join accepted answers with their respective questions
    ques_ans = ques_tups.join(ans_tups).map(lambda x: (x[0], {'ques': x[1][0], 'ans': x[1][1]}))
    ques_ans.saveAsNewAPIHadoopFile(
        path='-',
        outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
        keyClass="org.apache.hadoop.io.NullWritable",
        valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
        conf=stackoverflow_es_write_conf)
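Example 6 relies on a stackoverflow_es_write_conf dictionary defined elsewhere in the project. As a rough sketch only (the host, port, and index/type names below are placeholders, not taken from the source), an elasticsearch-hadoop write configuration for saveAsNewAPIHadoopFile usually looks like this:

# hypothetical configuration; adjust host, port, and index to your cluster
stackoverflow_es_write_conf = {
    "es.nodes": "localhost",           # Elasticsearch host
    "es.port": "9200",                 # Elasticsearch REST port
    "es.resource": "stackexchange/qa"  # target index/type
}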
Example 7: TestWordCounter
class TestWordCounter(unittest.TestCase):

    def setUp(self):
        conf = SparkConf().setAppName("appTest").setMaster("local[*]")
        self.sc = SparkContext(conf=conf)
        self.counter = WordCounter()

    def tearDown(self):
        self.sc.stop()

    def test_when_exist_one_movie_and_counter(self):
        movieList = ["1993::Toy Story Toy (1995)::Animation|Children's|Comedy",
                     "1993::ToyA StoryA ToyA (1995)::Animation|Children's|Comedy"]
        result = (('ToyA', ['::ToyA StoryA ToyA (1995)::']),
                  ('Toy', ['::Toy Story Toy (1995)::']))
        movies = self.sc.parallelize(movieList)
        self.assertEqual(self.counter.getMaxValues(movies), result)

    def test_when_exist_one_movie_and_counter_moreMovies(self):
        movieList = ["1993::Toy Story Toy (1995)::Animation|Children's|Comedy",
                     "1993::ToyA StoryB ToyA (1995)::Animation|Children's|Comedy",
                     "1993::ToyA StoryA ToyA (1995)::Animation|Children's|Comedy"]
        result = (('ToyA', ['::ToyA StoryB ToyA (1995)::', '::ToyA StoryA ToyA (1995)::']))
        movies = self.sc.parallelize(movieList)
        self.assertEqual(self.counter.getMaxValues(movies), result)
Example 8: main
def main(name, divide):
    """
    old_g = pickle.load(open("/net/data/facebook/facebook-ucsb/Facebook_2008/"+name +"/original_pickles/"+name +".pickle", 'r'))
    new_g = networkx.Graph()
    for node, friends in old_g.adj.iteritems():
        if node not in new_g.nodes():
            new_g.add_node(node)
        for friend in friends.iterkeys():
            new_g.add_node(friend)
            new_g.add_edge(node, friend)
    """
    # serialize the networkx graph as text files of edgelist
    # into a text file for workers to read
    # networkx.write_edgelist(new_g, "edgelist/"+name, data=False)
    # subprocess.check_call("hdfs dfs -put edgelist/"+name+ " edgelist/", shell=True)
    new_g = networkx.read_adjlist(name + "_list.txt")  # Egypt_list is an edge list
    sc = SparkContext(appName="Sorted_removal")

    dataG = json_graph.node_link_data(new_g)
    stringG = json.dumps(dataG)
    originalG = sc.broadcast(stringG)
    edges = sc.textFile("hdfs://scrapper/user/xiaofeng/edgelist/" + name, 192 * 4 * int(divide))
    costs = edges.map(lambda line: line.split(" ")).map(lambda edge: edge_to_cost(edge, originalG.value))
    costs.saveAsTextFile("hdfs://scrapper/user/xiaofeng/costs_" + name)
    sc.stop()
    subprocess.check_call("hdfs dfs -get costs_" + name + " /home/xiaofeng/facebook/FacebookProject/costs/", shell=True)
    Reformat("/home/xiaofeng/facebook/FacebookProject/costs/costs_" + name + "/", name)
Example 9: stackexchange_xml_spark_job
def stackexchange_xml_spark_job():
    server = bluebook_conf.HDFS_FQDN
    conf = SparkConf()
    xml_file_address = "hdfs://" + server + "/" +\
        bluebook_conf.STACKEXCHANGE_XML_FOLDER_NAME +\
        bluebook_conf.STACKEXCHANGE_XML_FILE_NAME
    json_ques_folder_address = "hdfs://" + server + "/" +\
        bluebook_conf.STACKEXCHANGE_JSON_QUES_FOLDER_NAME
    json_ans_folder_address = "hdfs://" + server + "/" +\
        bluebook_conf.STACKEXCHANGE_JSON_ANS_FOLDER_NAME
    conf.setAppName('stackexchange_xml_spark_job')
    spark_context = SparkContext(conf=conf)

    file = spark_context.textFile(xml_file_address)

    # Question and answer records are stored separately depending on their 'posttypeid'
    #   Ques -> posttypeid == 1
    #   Ans  -> posttypeid == 2
    ques = file.map(stackexchange_xml_mapper)\
        .filter(lambda dic: 'posttypeid' in dic.keys())\
        .filter(lambda dic: dic['posttypeid'] == '1')\
        .map(lambda d: jsoner(d))
    ans = file.map(stackexchange_xml_mapper)\
        .filter(lambda dic: 'posttypeid' in dic.keys())\
        .filter(lambda dic: dic['posttypeid'] == '2')\
        .map(lambda d: jsoner(d))
    ques.saveAsTextFile(json_ques_folder_address)
    ans.saveAsTextFile(json_ans_folder_address)
Example 10: count_triangles
def count_triangles(data, master="local[2]"):
    """
    @brief: Count triangles using Spark
    @param data: The data location for the input files
    @param master: The master URL as defined at
           https://spark.apache.org/docs/1.1.0/submitting-applications.html#master-urls
    """
    ################# NO EDITS HERE ###################
    assert not os.path.exists("triangles.out"), "File: triangles.out already exists"
    sc = SparkContext(master, "Triangle Count")
    start = time()
    ############### END NO EDITS HERE ################

    # TODO: Your code goes here!
    people = sc.textFile(data)
    triad = people.flatMap(GetTriad).reduceByKey(add).filter(lambda x: x[1] > 1)
    # triadCount = triad.map(lambda x: (x, 1))
    # triadSum = triadCount.reduceByKey(add)
    # triangles = triadSum.filter(lambda x: x[1] > 1)
    # output = triangles.collect()
    output = triad.collect()
    # triangles.saveAsTextFile("test1")

    ################# NO EDITS HERE ###################
    print "\n\n*****************************************"
    print "\nTotal algorithm time: %.4f sec \n" % (time() - start)
    print "*****************************************\n\n"
    ############### END NO EDITS HERE ################

    with open("triangles.out", "wb") as f:
        for friends in output:
            f.write(friends[0] + "\n")  # TODO: Loop with f to write your result to file serially
Example 11: bmRun
def bmRun(self):
    """
    Connect to the DB from Spark and run/profile the query.
    """
    # create output file for results
    print "Create benchmark output file for recording..."
    file_out = open("/Users/mira67/Downloads/benchmark_output.txt", "w")

    print "start query evaluation, load tables from DB and register tables in Spark..."
    # load data with Spark
    with Timer() as tm:
        sc = SparkContext("local", "penguin")
        # sc = SparkContext(master=local[2])
        sqlContext = SQLContext(sc)

        # tables to load into memory depend on the queries under test
        df1 = sqlContext.read.jdbc(url=self.url, table=self.tbName[0], lowerBound=0, upperBound=350, numPartitions=200)  # dbtable is variable
        df1.registerTempTable(self.tbName[0])
        df2 = sqlContext.read.jdbc(url=self.url, table=self.tbName[1], lowerBound=0, upperBound=350, numPartitions=200)  # dbtable is variable
        df2.registerTempTable(self.tbName[1])

        # register helper functions for SQL
        sqlContext.registerFunction("MONTH", lambda x: x[5:7], StringType())  # grab month
        sqlContext.registerFunction("YEAR", lambda x: x[0:4], StringType())
        sqlContext.registerFunction("DAY", lambda x: x[8:10], StringType())

        rdf1 = sqlContext.sql("SELECT * FROM " + self.tbName[0])
        rdf2 = sqlContext.sql("SELECT * FROM " + self.tbName[1])
        sqlContext.registerDataFrameAsTable(rdf1, self.mtb[0])
        sqlContext.registerDataFrameAsTable(rdf2, self.mtb[1])

    mem_use = self.memory_usage_psutil()
    print "memory_use_load %s" % mem_use
    print "=> elapsed load data: %s ms" % (tm.secs * 1000)

    # query with Spark
    with Timer() as tm:
        # run the query (the tables must already be registered, as above)
        rdf = sqlContext.sql(self.sqlStm)
        print "Data schema from query:"
        rdf.printSchema()
        # hist of BT values
        # TODO

    mem_use = self.memory_usage_psutil()
    print "memory_use_load %s" % mem_use
    print "=> elapsed: %s ms" % (tm.secs * 1000)
    file_out.write("Query Time %s Memory %s\n" % (str(tm.secs * 1000), str(mem_use)))

    # example enabled
    day1 = sqlContext.sql("SELECT * FROM ssmi t1, map t2 WHERE t1.DATE BETWEEN '1990-01-01' AND '1990-01-01' AND t1.LOCID = t2.ID ORDER BY t1.LOCID")
    # call plot
    demoplt = qplt.queryPlot()
    demoplt.qMapDemo(day1)

    # stop SparkContext
    sc.stop()
Example 12: main
def main(argv=None):
    '''this is called if run from command line'''
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', help="Seq input file on cluster.", required=True)
    parser.add_argument('-o', '--output', help="UTF-8 output file on cluster.", required=False)
    parser.add_argument('-p', '--printToLog', help="Print results to log.", required=False, action='store_true')
    args = parser.parse_args()

    sc = SparkContext()
    global goodJsonRecords, badJsonRecords
    goodJsonRecords = sc.accumulator(0)
    badJsonRecords = sc.accumulator(0)
    data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    tagCounts = data.values().flatMap(getTokens).countByValue()

    # So far, this code isn't useful. The output file is written by the
    # master node into an isolated folder, and I don't know of a way to
    # retrieve it.
    if args.output != None:
        with codecs.open(args.output, 'wb', 'utf-8') as f:
            for k in sorted(tagCounts):
                f.write(k + " " + str(tagCounts[k]) + "\n")

    print "========================================"
    print "goodJsonRecords = %d" % goodJsonRecords.value
    print "badJsonRecords = %d" % badJsonRecords.value
    if args.printToLog:
        for k in sorted(tagCounts):
            print json.dumps(k), tagCounts[k]
    print "========================================"
Example 13: main
def main(arglist):
    with open("log_file_v.txt", "a") as f:
        f.write("Start time of validation...... %s\n" % datetime.datetime.now())
    print("Start time of validation...... %s" % datetime.datetime.now())

    # mapreduce params
    output = arglist[0]
    minPartitions = int(arglist[1])

    # initialize
    sc = SparkContext(appName="PythonValidate")
    # rdd = sc.textFile(output_file_name, minPartitions=minPartitions)
    rdd = sc.wholeTextFiles(output, minPartitions=minPartitions)
    print('partitions', rdd.getNumPartitions())

    error_count = rdd.mapPartitions(separateBlocks).sum()
    sc.stop()

    print("End time of validation...... %s" % datetime.datetime.now())
    with open("log_file_v.txt", "a") as f:
        f.write("End time of validation...... %s\n" % datetime.datetime.now())
        f.write("Error count of sorted file...... %s" % error_count)
        f.close()
Example 14: __init__
def __init__(self, file_path, train_file, test_file, real_file=None):
    """
    file_path: the folder where data files reside
    train_file: (user, item, rating) quote records
    test_file: (user, item) records, preferences to be predicted
    real_file: (user, option, value) real purchase records, can be None if it doesn't exist

    For this specific project:
    item here is the combination of options with their values,
    e.g. item 10 denotes option A with choice 0; item 21 denotes option B with choice 1
    rating is the number of quotes for a certain item by a user
    """
    self.file_path = file_path
    config = SparkConf().setMaster("local").setAppName("Kaggle")\
        .set("spark.executor.memory", "2g")\
        .set("spark.storage.memoryFraction", "1")
    sc = SparkContext(conf=config)
    self.train_data = sc.textFile("file:" + self.file_path + train_file).cache()\
        .map(lambda line: array([float(x) for x in line.split(',')]))
    self.test_data = sc.textFile("file:" + self.file_path + test_file).cache()\
        .map(lambda line: [float(x) for x in line.split(',')])
    if real_file:
        self.real_data = sc.textFile("file:" + self.file_path + real_file).cache()\
            .map(lambda line: [float(x) for x in line.split(',')]).map(lambda r: ((r[0], r[1]), r[2]))
Example 15: run
def run():
    # if __name__ == "__main__":
    sc = SparkContext(master=spark_addr, appName=app_name)
    rdd = sc.textFile(hdfs_addr + file_path, 2).map(lambda line: format_list(line)).cache()
    # rdd = sc.parallelize(test_list, 4).cache()

    # ******** create rules ************
    supp = float(rdd.count()) * supp_rate
    item = create_item(rdd)  # create one-item sets
    item = freq(rdd, item, supp)
    one_item = item
    freq_items = item
    while item.count() > 0:
        more_item = item_plus(sc, item)
        item = freq(rdd, more_item, supp)
        freq_items = freq_items.union(item)

    # the resulting freq_items is key-value; the key's type is frozenset
    # rules = produce_rule(freq_items, one_item)
    # rule_result = rules.collect()
    freq_result = freq_items.collect()
    # one_result = one_item.keys().collect()
    one_result = one_item.keys().collect()
    dict_rule = produce_rule(freq_result, one_result)
    out, total = probability(rdd, dict_rule, 0.5)
    out1 = out.collect()
    print "$$$$$$$$$$$$$$$$$$$$$$$out=", out1, "all=", total
    # ****************************
    sc.stop()
    return freq_result, dict_rule