This article collects typical usage examples of the Python method pyspark.SparkContext.union. If you are unsure what SparkContext.union does, how to call it, or what real usage looks like, the curated code examples below may help. You can also explore further usage examples of the enclosing class, pyspark.SparkContext.
The following lists 15 code examples of SparkContext.union, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
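Before the examples, here is a minimal, self-contained sketch of what SparkContext.union does. This sketch was written for this article and is not one of the collected examples; the app name and sample data are arbitrary.
from pyspark import SparkContext

# sc.union takes a list of RDDs and concatenates them into a single RDD,
# keeping all elements and all partitions of the inputs.
sc = SparkContext("local[1]", "union_sketch")
rdd_a = sc.parallelize([1, 2, 3])
rdd_b = sc.parallelize([4, 5, 6])
combined = sc.union([rdd_a, rdd_b])   # same result as rdd_a.union(rdd_b)
print(combined.collect())             # [1, 2, 3, 4, 5, 6]
print(combined.getNumPartitions())    # sum of the input RDDs' partition counts
sc.stop()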
Example 1: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
def main():
    cleanup()
    sc = SparkContext()
    spark = SparkSession(sc)
    path = os.path.join(mysql_export_dir, "name_string_indices.tsv")
    df = spark.read.csv(path, header=True, inferSchema=True, sep='\t', nullValue='NULL')
    names = df.select('name').rdd.map(lambda r: r['name'])
    names_json = parse_spark(sc, names) \
        .map(json.loads) \
        .zip(df.rdd)
    synonym_names = names_json.filter(lambda n: is_synonym(n))
    accepted_names = names_json.filter(lambda n: not is_synonym(n))
    synonym_names_with_accepted_columns = synonym_names \
        .map(to_key_value) \
        .leftOuterJoin(accepted_names.map(to_key_value)) \
        .map(add_accepted_data_to_synonym_name)
    accepted_names_with_accepted_columns = accepted_names \
        .map(add_accepted_data_accepted_name)
    sc.union([synonym_names_with_accepted_columns, accepted_names_with_accepted_columns]) \
        .map(join_fields) \
        .saveAsTextFile(output_dir_name_string_indices)
Example 2: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
def main():
    master = 'local[1]'
    app_name = 'reduce_demo1'
    # print(range(0, 3))
    sc = SparkContext(master, app_name)
    # Test 1: normal case
    # rdd_list = [sc.parallelize(range(i * 3, (i+1) * 3)) for i in range(0, 3)]
    # rdd_union = sc.union(rdd_list)
    # print(rdd_union.getNumPartitions())
    # result = rdd_union.map(fun_map_print)
    # result.count()
    # Test 2: union twice
    rdd_list_outer = []
    for x in ['a', 'b', 'c']:
        rdd_list_inner = [sc.parallelize(map(lambda j: x + str(j), range(i * 3, (i+1) * 3))) for i in range(0, 3)]
        rdd_union_inner = sc.union(rdd_list_inner)
        rdd_list_outer.append(rdd_union_inner)
    rdd_union_outer = reduce(lambda rddx, rddy: rddx.union(rddy), rdd_list_outer)
    result = rdd_union_outer.map(fun_map_print)
    result.count()
    sc.stop()
Example 3: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
def main(argv):
    parser = argparse.ArgumentParser(description="Count words in files")
    parser.add_argument("files", metavar="F", nargs='+', help="A file to count words in")
    parser.add_argument("-s", "--stopFile", help="a file containing words to ignore, one word per line, ignores lines starting with #")
    parser.add_argument("-o", "--outputFile", help="the file to store the output in")
    args = parser.parse_args()
    shutil.rmtree(args.outputFile, ignore_errors=True)
    sc = SparkContext("local", "WordCounter")
    createStopwordList(args.stopFile)
    documentCounts = []
    output = []
    for file in args.files:
        fileLines = sc.textFile(file)
        documentCounts.append(fileLines.flatMap(tokenizer) \
            .map(lambda word: (word, 1)) \
            .reduceByKey(lambda a, b: a + b) \
            .filter(stopwordFilter) \
            .map(lambda (a, b): (b, a)) \
            .sortByKey(False, 1) \
            .map(lambda (a, b): (b, a)))
        output.append({'name': file.split(".")[0], 'wordCounts': []})
    # Combine the word counts for the documents
    combinedCounts = sc.union(documentCounts) \
        .reduceByKey(lambda a, b: a + b) \
        .map(lambda (a, b): (b, a)) \
        .sortByKey(False, 1) \
        .map(lambda (a, b): (b, a))
    # Write most common words to file in JSON format
    f = open(args.outputFile, 'w')
    countIndex = 0
    for counts in documentCounts:
        output[countIndex]['wordCounts'] = counts.take(25)
        countIndex = countIndex + 1
    output.append({'name': "Combined", 'wordCounts': combinedCounts.take(25)})
    json.dump(output, f)
    f.close()
Example 4: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
def main():
    args = parse_arguments()
    print(args)
    if os.path.exists(args.output_file):
        print('Output file already exists:', args.output_file)
        sys.exit(1)
    sc = SparkContext()
    input_partitions = args.input_partitions or sc.defaultMinPartitions
    output_partitions = args.output_partitions or input_partitions
    rdds_list = [sc.textFile(
        name=input_file,
        minPartitions=input_partitions,
        use_unicode=True,
    ).map(input_mapper(input_file)) for input_file in args.input_files]
    union_rdd = sc.union(rdds_list)
    if args.projects:
        projects = args.projects.split(u',')
        line_filter = input_line_filter_provider(projects)
        filtered_rdd = union_rdd.filter(line_filter)
    else:
        filtered_rdd = union_rdd
    sorted_rdd = filtered_rdd.sortBy(
        keyfunc=line_sorting_key,
        ascending=True,
        numPartitions=output_partitions,
    )
    sorted_rdd_text = sorted_rdd.map(line_tuple_to_text)
    sorted_rdd_text.saveAsTextFile(
        args.output_file,
        compressionCodecClass='org.apache.hadoop.io.compress.GzipCodec',
    )
Example 5: TFIDF
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
class TFIDF():
    def __init__(self, input_path, output_path):
        self.input = input_path
        self.output = output_path
        self.texts = glob(self.input + '/*.txt')
        self.conf = SparkConf().setAppName('tfidf')\
                               .setMaster('local')\
                               .set('spark.executor.memory', '1g')
        self.sc = SparkContext(conf=self.conf)

    def writeToCSVFile(self, rdd):
        with open(self.output + '/tfidf-scores.csv', 'wb') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['docID', 'word', 'score'])
            writer.writerows(rdd)

    def run(self):
        # Job 1: Word Frequency in Documents.
        tfilter = TextFilter().filter
        wcRDD = self.sc.emptyRDD()
        for dkey, textfile in enumerate(self.texts):
            tf = self.sc.textFile(textfile)\
                .filter(lambda line: len(line.strip()) > 0)\
                .flatMap(lambda line: tfilter(line))\
                .map(lambda word: ((word, dkey), 1))\
                .reduceByKey(operator.add)
            N = tf.map(lambda ((w, d), y): y).sum()
            tf = tf.map(lambda ((w, d), y): ((w, d), (y, N)))
            wcRDD = self.sc.union([wcRDD, tf])
        # Job 2: Word Frequency in Corpus & Calculate TF-IDF.
        D = self.sc.broadcast(len(self.texts))
        wcRDD = wcRDD.map(lambda ((w, d), (a, b)): (w, (d, a, b)))
        wfRDD = wcRDD.map(lambda (w, (d, a, b)): (w, 1)).reduceByKey(operator.add)
        tfidf = wcRDD.join(wfRDD).map(lambda (w, ((d, a, b), c)): ((d, -a/b * np.log(D.value/c), w), 1))\
            .sortByKey(True).map(lambda ((d, z, w), a): (d, w, -z))
        self.writeToCSVFile(tfidf.collect())
Example 6: matches
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
    regexp = u"^.* (?P<int>[0-9]+) +(?P<string>[\w ]+)$"
    ### sc.textFile makes an RDD[String] object and we then apply regexp to it to find the interesting rows
    ### The 1 in the argument list to textFile ensures that we only make one partition of the file
    timetable_regex = sc.textFile(dir + file, 1).map(lambda row: re.match(regexp, row, re.UNICODE))
    ### We filter out failed matches (None) and then map the result to a key-value RDD of the form RDD[(Time, Stop)]
    timetable_time_stop = timetable_regex.filter(lambda m: m is not None).map(lambda m: [m.group('int'), m.group('string')])
    ### Again a filter to get rid of some more faulty rows.
    timetable_time_stop_filtered = timetable_time_stop.filter(lambda x: x[0].isdigit() and re.match(r"^[A-Z].*", x[1][0], re.UNICODE) != None)
    ### A glom makes an array of the RDD's key-values for every partition (similar to groupByKey but on partition level)
    timetable_glom = timetable_time_stop_filtered.glom()
    ### Return an RDD of the form RDD[((Stop1, Stop2), timediff)] where timediff is the time taken between stops
    return timetable_glom.flatMap(lambda x: map(lambda i: [(x[i+1][1].lower(), x[i][1].lower()), (int(x[i + 1][0]) - int(x[i][0]), 1)], xrange(len(x) - 1))).filter(lambda x: x[1][0] > 0)
### Make a single RDD from all the file RDDs parsed by the timetableRDD function
timetable_data_union = sc.union(map(lambda x: timetableRDD(x), timetables))
### Here we repartition to make 100 partitions (instead of the ~ 5k we have)
timetable_data_union_repartitioned = timetable_data_union.repartition(100)
### Since many trams go directly between the same stops, take the minimum time between them to get the geographical distance
### We also cache the result so that we can reuse it without having to read everything from file again
timetable_reduction = timetable_data_union_repartitioned.reduceByKey(lambda x, y: (min(x[0], y[0]), x[1] + y[1])).cache()
### Get list of all unique stops, and give them a unique index
### The collect() command sends the result to the "driver" node and must hence fit in its memory
unique_stops = sc.union([timetable_reduction.map(lambda row: row[0][1]), timetable_reduction.map(lambda row: row[0][0])]).distinct().zipWithIndex().collect()
### We make maps to transform stop name to index and back and send the results to the workers
stops_to_index = sc.broadcast(dict((x, y) for x, y in unique_stops))
stops_from_index = sc.broadcast(dict((y, x) for x, y in unique_stops))
Example 7: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
def main():
    parser = argparse.ArgumentParser(description="Gathers stats from apache logs")
    parser.add_argument(
        "--top-requested-pages",
        "-r",
        dest="top_req_pages",
        action="store_true",
        help="Reports the top 10 requested pages and the number of requests made for each",
    )
    parser.add_argument(
        "--top-unsuccessful-pages",
        "-u",
        dest="top_uns_pages",
        action="store_true",
        help="Reports the top 10 requested pages and the number of requests made for each",
    )
    parser.add_argument(
        "--top-ips",
        "-i",
        dest="top_ips",
        action="store_true",
        help="Reports the top 10 requested pages and the number of requests made for each",
    )
    parser.add_argument(
        "--show-successful",
        "-s",
        dest="successful",
        action="store_true",
        help="Reports percentage of successful requests",
    )
    parser.add_argument(
        "--show-unsuccessful",
        "-n",
        dest="unsuccessful",
        action="store_true",
        help="Reports percentage of unsuccessful requests",
    )
    parser.add_argument(
        "--all-per-minute",
        "-m",
        dest="all_per_min",
        action="store_true",
        help="Reports the total number of requests made every minute in the entire time period covered by the file provided",
    )
    parser.add_argument(
        "--top-urls-per-ip",
        "-l",
        dest="top_url_per_top_ip",
        action="store_true",
        help="Reports for each of the top 10 IPs, the top 5 pages requested and the number of requests for each.",
    )
    args = parser.parse_args()
    noargs = not len(sys.argv) > 1
    spark = SparkContext(appName="apacheStats")
    numIterations = 1  # increase to test scalability
    lines = spark.union([spark.textFile(path)] * numIterations)
    dicts = lines.map(lambda line: get_dict_from_line(line))
    statsRdd = dicts.map(lambda dictx: get_data_fromDict(dictx, None if noargs else args))
    stats = statsRdd.reduce(add)
    try:
        f = open("output_spark", "w")
        if noargs or args.top_req_pages:
            f.write("Top 10 requested pages and the number of requests made for each:\n")
            for (page, requests) in stats.pagesToNumberOfAccesses.most_common(10):
                f.write(page)
                f.write(": ")
                f.write(str(requests))
                f.write("\n")
            f.write("\n")
        if noargs or args.successful:
            f.write("Percentage of successful requests: ")
            f.write(str((float(stats.successful) / (stats.unsuccessful + stats.successful)) * 100) + "%")
            f.write("\n\n")
        if noargs or args.unsuccessful:
            f.write("Percentage of unsuccessful requests: ")
            f.write(str((float(stats.unsuccessful) / (stats.unsuccessful + stats.successful)) * 100) + "%")
            f.write("\n\n")
        if noargs or args.top_uns_pages:
            f.write("Most unsuccessful: \n")
            for (page, requests) in stats.unsuccessfulPages.most_common(10):
                f.write(page)
                f.write(": ")
                f.write(str(requests))
                f.write("\n")
            f.write("\n")
        if noargs or args.top_ips:
            f.write(
                "The top 10 IPs making the most requests, displaying the IP address and number of requests made: \n"
            )
            for (page, requests) in stats.ipToNumberOfAccesses.most_common(10):
                f.write(page)
                f.write(": ")
                f.write(str(requests))
                f.write("\n")
            f.write("\n")
        if noargs or args.all_per_min:
            f.write(
                "The total number of requests made every minute in the entire time period covered by the file provided: \n"
            )
            for date, accesses in stats.accessesPerMinute.iteritems():
#......... part of the code omitted here .........
Example 8: parse_in_data
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
# read wiki page count stats and clean wiki page titles
for src_file_name in src_files:
    base = os.path.basename(src_file_name)
    filename_tokens = base.split('-')
    (date, time) = filename_tokens[1], filename_tokens[2].split('.')[0]
    if run_mode == "swift":
        src_file_name = "swift://" + source_dir + "." + swift_region + "/" + src_file_name
    lines = sc.textFile(src_file_name)
    parts = lines\
        .filter(lambda l: wiki_regex.match(l)) \
        .filter(lambda line: "facebook" in line.lower()) \
        .map(lambda l: parse_in_data(l, date + time)) \
        .filter(lambda l: l != None)
    # .filter(lambda line: "facebook" in line.lower()) \
    rdds.append(parts)
page_w_date = sc.union(rdds)
# calculate trends
pageview_counts = page_w_date \
    .reduceByKey(lambda a, b: a + b) \
    .map(lambda ((p, d), c): (p, ([d], [c]))) \
    .reduceByKey(lambda (d0, c0), (d1, c1): (d0 + d1, c0 + c1)) \
    .map(lambda (p, (d, c)): calc_trend(p, d, c))
# write output to target directory
pageview_counts.saveAsTextFile(target_dir)
Example 9: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
def main():
    inputs = sys.argv[1]
    rating_file = sys.argv[2]
    output = sys.argv[3]
    conf = SparkConf().setAppName('movie recommendation')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'
    sqlContext = SQLContext(sc)
    """ sbaronia - getting files from directory and
    reading from it and using parse_rating_movie and parse_my_input for parsing the
    content of the files to an rdd"""
    movies_path = join(inputs, "movies.dat")
    ratings_path = join(inputs, "ratings.dat")
    read_ratings = sc.textFile(ratings_path)
    read_movies = sc.textFile(movies_path)
    read_mymovies = sc.textFile(rating_file)
    parse_ratings = read_ratings.map(lambda line: parse_rating_movie(line, "ratings.dat")).cache()
    parse_movies = read_movies.map(lambda line: parse_rating_movie(line, "movies.dat")).cache()
    parse_mymovies = read_mymovies.map(lambda line: parse_my_input(line)).cache()
    """ sbaronia - converting movie and rating data to dataframes """
    schema_movie = StructType([StructField('movie_id', IntegerType(), True),
                               StructField('movie_name', StringType(), True)])
    movie_df = sqlContext.createDataFrame(parse_movies, schema=schema_movie).cache()
    schema_mymovie = StructType([StructField('ip_uid', IntegerType(), True),
                                 StructField('ip_mname', StringType(), True),
                                 StructField('ip_rating', IntegerType(), True),
                                 StructField('ldistance', IntegerType(), True)])
    mymovie_df = sqlContext.createDataFrame(parse_mymovies, schema=schema_mymovie).cache()
    """ sbaronia - combining user input movies with movies data
    then finding Levenshtein distance with every movie and then finding
    the one with minimum Levenshtein distance as our best match"""
    movie_plus_ip = movie_df.join(mymovie_df, None, 'inner').cache()
    movie_plus_ip_distance = movie_plus_ip.withColumn('ldistance', levenshtein('movie_name', 'ip_mname'))
    mymovie_distance = movie_plus_ip_distance \
        .groupBy('ip_uid', 'ip_mname') \
        .min('ldistance') \
        .withColumnRenamed('min(ldistance)', 'ldistance') \
        .cache()
    """ sbaronia - join the tables to get only those movies with minimum
    Levenshtein distance and then from that table select columns
    necessary. Then create a test data for all movies with new user 0"""
    refined_movies = movie_plus_ip_distance.join(mymovie_distance, ['ip_uid', 'ip_mname', 'ldistance'], 'inner').cache()
    input_rating = refined_movies.select('ip_uid', 'movie_id', 'ip_rating').cache()
    input_rating_rdd = input_rating.rdd.map(lambda row1: (row1.ip_uid, row1.movie_id, float(row1.ip_rating))).cache()
    input_with_train = sc.union([input_rating_rdd, parse_ratings]).cache()
    test_newuser = parse_movies.map(lambda line: (0, line[0])).cache()
    """ sbaronia - train on all data including new one and then
    test on all movies for new user and sort them in descending
    order of ratings"""
    model = ALS.train(input_with_train, 10, 10, 0.1)
    predictions = model.predictAll(test_newuser) \
        .map(lambda row1: (row1.rating, row1.product)) \
        .sortByKey(ascending=False) \
        .map(lambda row: (row[1], row[0])) \
        .cache()
    final_rating = sqlContext.createDataFrame(predictions, ['movie_id', 'movie_rating']).cache()
    final_movie_rating = movie_df.join(final_rating, ['movie_id'], 'inner').sort("movie_rating", ascending=False).cache()
    final_movie_rating_rdd = final_movie_rating.rdd.map(lambda row: (str(row.movie_id) + ' :: ' + str(row.movie_name)) + ' :: ' + str(row.movie_rating)).coalesce(1).cache()
    final_movie_rating_rdd.saveAsTextFile(output)
Example 10: extract_big5_elements
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
def extract_big5_elements(parent_json_data):
    user_dict = {}
    for idx in range(0, 5):
        if parent_json_data["tree"]["children"][0]["children"][0]["children"][idx]["id"] == "Openness":
            user_dict.update(update_entry(idx, 3, parent_json_data))
        if parent_json_data['tree']['children'][0]['children'][0]['children'][idx]['id'] == 'Conscientiousness':
            user_dict.update(update_entry(idx, 2, parent_json_data))
        if parent_json_data['tree']['children'][0]['children'][0]['children'][idx]['id'] == 'Extraversion':
            user_dict.update(update_entry(idx, 5, parent_json_data))
        if parent_json_data['tree']['children'][0]['children'][0]['children'][idx]['id'] == 'Agreeableness':
            user_dict.update(update_entry(idx, 1, parent_json_data))
        if parent_json_data['tree']['children'][0]['children'][0]['children'][idx]['id'] == 'Neuroticism':
            user_dict.update(update_entry(idx, 4, parent_json_data))
    return user_dict

def update_entry(idx, index, parent_json_data):
    trait_name = parent_json_data["tree"]["children"][0]["children"][0]["children"][idx]["children"][index]["id"]
    trait_percentage = parent_json_data["tree"]["children"][0]["children"][0]["children"][idx]["children"][index]["percentage"]
    return {trait_name: trait_percentage}

# convert_to_dict would properly format the statuses of the user (i.e. it would make them more descriptive)
def convert_to_dict(obj):
    return {
        'userid': str(obj['user']['id']),
        'id': str(obj['id']),
        'sourceid': 'python-twitter',
        'contenttype': 'text/plain',
        'language': obj['lang'],
        'content': obj['text'],
        'reply': ((obj['in_reply_to_status_id']) == None),
        'forward': False
    }

# calculate_average would be called by the reduceByKey() function to get the average personality trait for each location
def calculate_average(a, b):
    sum_dict = {}
    for key in a:
        sum_dict[key] = ((a[key] + b[key]) / 300)  # divide by the total number of users in a location
    return sum_dict

if __name__ == "__main__":
    tweet_list = get_all_tweets
    get_avg = calculate_average
    my_conf = (SparkConf()
               .setAppName("Spark-twitter-user")
               .set("spark.network.timeout", "1200s"))  # set timeout to 20 minutes as the spark job
    # would sit idle for 15 minutes in order to refresh the request window of the twitter API
    sc = SparkContext(conf=my_conf)
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", "")  # specify your AWSAccessKey here
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", "")  # specify your AWSSecretKey here
    input_rdd = sc.textFile('s3n://perloc/twitterDB.txt/').map(lambda entry: tuple(entry.split(",")))
    print('input_rdd loaded')
    # print the start time of the spark job
    print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    ''' We'll create subsets from the main rdd based on the key, i.e. location,
    and work upon each rdd for every window of 15 minutes
    (twitter restrictions apply here, see twitter-API-rate-limits)
    https://dev.twitter.com/rest/public/rate-limits
    '''
    rio_filter_rdd = input_rdd.filter(lambda (loc, user): loc == 'Rio')
    rio_rdd = rio_filter_rdd.mapValues(lambda user: tweet_list(user))
    print('Rio RDD created')
    time.sleep(61 * 15)  # required because of the API rate limit
    chicago_filter_rdd = input_rdd.filter(lambda (loc, user): loc == 'Chicago')
    chicago_rdd = chicago_filter_rdd.mapValues(lambda user: tweet_list(user))
    print('Chicago RDD created')
    time.sleep(61 * 15)
    newyork_filter_rdd = input_rdd.filter(lambda (loc, user): loc == 'NewYork')
    newyork_rdd = newyork_filter_rdd.mapValues(lambda user: tweet_list(user))
    print('NewYork RDD created')
    time.sleep(61 * 15)
    california_filter_rdd = input_rdd.filter(lambda (loc, user): loc == 'California')
    california_rdd = california_filter_rdd.mapValues(lambda user: tweet_list(user))
    print('California RDD created')
    # Once done with the personality analysis for each location, simply gather them all in one rdd
    finalRDD = sc.union([rio_rdd, chicago_rdd, newyork_rdd, california_rdd]) \
        .reduceByKey(lambda user1, user2: get_avg(user1, user2))  # get the average of each personality trait for each location
    finalRDD.repartition(1).saveAsTextFile("s3n://perloc/twitter-ibm/personality-profile.txt")
    print('All done, file created')
    print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    sys.exit()
Example 11: run
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
def run(schema_file, data_path, script=None, spec_file=None, verbose=None, rout=None, yarn=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and optional script with mapper/reducer functions.
    """
    if script:
        script = get_script(script)
    if verbose:
        print("### schema: %s" % schema_file)
        print("### path : %s" % data_path)
        print("### script: %s" % script)
        print("### spec : %s" % spec_file)
    time0 = time.time()
    # pyspark modules
    from pyspark import SparkContext
    # define spark context, it's main object which allow
    # to communicate with spark
    ctx = SparkContext(appName="AvroKeyInputFormat", pyFiles=[script])
    logger = SparkLogger(ctx)
    if not verbose:
        logger.set_level('ERROR')
    if yarn:
        logger.info("YARN client mode enabled")
    # load FWJR schema
    rdd = ctx.textFile(schema_file, 1).collect()
    # define input avro schema, the rdd is a list of lines (sc.textFile similar to readlines)
    avsc = reduce(lambda x, y: x + y, rdd)  # merge all entries from rdd list
    schema = ''.join(avsc.split())  # remove spaces in avsc map
    conf = {"avro.schema.input.key": schema}
    # define newAPIHadoopFile parameters, java classes
    aformat = "org.apache.avro.mapreduce.AvroKeyInputFormat"
    akey = "org.apache.avro.mapred.AvroKey"
    awrite = "org.apache.hadoop.io.NullWritable"
    aconv = "org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter"
    # load data from HDFS
    if isinstance(data_path, list):
        avro_rdd = ctx.union([ctx.newAPIHadoopFile(f, aformat, akey, awrite, aconv, conf=conf) for f in data_path])
    else:
        avro_rdd = ctx.newAPIHadoopFile(data_path, aformat, akey, awrite, aconv, conf=conf)
    # process data, here the map will read record from avro file
    # if we need a whole record we'll use lambda x: x[0], e.g.
    # output = avro_rdd.map(lambda x: x[0]).collect()
    #
    # if we need a particular key, e.g. jobid, we'll extract it
    # within lambda function, e.g. lambda x: x[0]['jobid'], e.g.
    # output = avro_rdd.map(lambda x: x[0]['jobid']).collect()
    #
    # in more general way we write mapper/reducer functions which will be
    # executed by Spark via collect call
    spec = None
    if spec_file:
        if os.path.isfile(spec_file):
            spec = json.load(open(spec_file))
        else:
            spec = json.loads(spec_file)
    if verbose:
        spec['verbose'] = 1
        print("### spec %s" % json.dumps(spec))
    if rout:
        spec['output'] = rout
    if script:
        obj = import_(script)
        logger.info("Use user-based script %s" % obj)
        if not hasattr(obj, 'MapReduce'):
            logger.error('Unable to find MapReduce class in %s, %s' \
                    % (script, obj))
            ctx.stop()
            return
        # we have a nested use case when one MR return WMArchive spec
        # we'll loop in that case until we get non-spec output
        count = 0
        while True:
            mro = obj.MapReduce(spec)
            mname = mro.__dict__.get('name', '').split('.')[0]
            print("### Load %s" % mname)
            if mname.lower().endswith('counter'):
                out = avro_rdd.filter(mro.mapper).count()
                if rout:
                    with open(rout, 'w') as ostream:
                        ostream.write(out)
                break
            # example of collecting records from mapper and
            # passing all of them to reducer function
            records = avro_rdd.filter(mro.mapper).collect()
            out = mro.reducer(records)
            if verbose:
                print("### Loop count %s" % count)
            if count > 3:
                print("### WARNING, loop counter exceed its limit")
                break
            if is_spec(out):
                spec = out
            else:
                break
#......... part of the code omitted here .........
Example 12: stripSPM
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
    return sc.textFile(f).map(lambda x: (f_date, x))

def stripSPM(f):
    field = str(f)
    while(len(field) > 0 and not is_number(field)):
        field = field[1:]
    return float(field) if len(field) > 0 else 0.0

udf = UserDefinedFunction(lambda x: stripSPM(x), FloatType())
s_type = sqlContext.read.format('jdbc').options(url=url, dbtable='public.st_type').load()
# Read in the Metadata files
path = 'meta/*/*/*'
# path = 'meta/2008/*/*'
allFiles = glob(path + ".txt")
meta_files = sc.union([date_row(f) for f in allFiles])
md_rows = meta_files.map(lambda l: [l[0]] + l[1].split('\t')) \
    .map(lambda p: Row(effective_start=p[0], pems_id=p[1], Fwy=ZeroInt(p[2]), Dir=p[3],
                       district_id=ZeroInt(p[4]), County=ZeroInt(p[5]), City=ZeroInt(p[6]),
                       state_pm=p[7], abs_pm=ZeroFloat(p[8]), latitude=ZeroFloat(p[9]),
                       longitude=ZeroFloat(p[10]), length=ZeroFloat(p[11]), Type=p[12],
                       num_lanes=ZeroFloat(p[13]), name=p[14]))
station_meta = sqlContext.createDataFrame(md_rows)
st_cols = station_meta.columns  # Get the initial list of columns
station_meta = station_meta.fillna({'City': -1})
station_meta = station_meta.select(*[udf(column).alias('state_pm') if column == 'state_pm' else column for column in station_meta.columns])
# Drop duplicates w/o including file data
station_meta = station_meta.dropDuplicates([c for c in st_cols if c != 'effective_start'])
Example 13: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
def main():
    conf = SparkConf().setAppName("Test")
    sc = SparkContext(conf=conf)
    # load tagged datasets as training data
    dataY0 = sc.wholeTextFiles('/home/xsran/IdeaProjects/hadoop1/data/Y-cut')
    dataN0 = sc.wholeTextFiles('/home/xsran/IdeaProjects/hadoop1/data/N-cut')
    # split text into words
    dataN = dataN0.map(lambda x: x[1].split(" "))
    dataY = dataY0.map(lambda x: x[1].split(" "))
    # merge the positive and negative cases into a single dataset
    dataA = dataY.union(dataN)
    # map the words list into (word, 1) tuples
    words = dataA.flatMap(lambda x: x).map(lambda x: (x, 1))
    # count the number of words
    wordCount = words.reduceByKey(lambda x, y: x + y).map(lambda x: (x[1], x[0])).sortByKey(ascending=False)
    wordCount.cache()
    # save the results
    # wordCount.map(lambda x: '%s,%s' % (x[1], x[0])).saveAsTextFile(dir + 'wordCount')
    # wordCount.map(lambda x: (x[1], x[0])).saveAsTextFile(dir + 'wordCount_rep')
    # filter this word list; only keep the words with a certain frequency as features
    # feature_count: (feature word, count)
    feature_count = wordCount.filter(lambda x: 150 < x[0] < 5000).map(lambda x: (x[1], x[0]))
    # count the word frequency in the positive and negative cases respectively
    dataN1 = dataN0.flatMap(lambda x: [(w, 1) for w in set(x[1].split(" "))]).reduceByKey(lambda x, y: x + y)
    dataY1 = dataY0.flatMap(lambda x: [(w, 1) for w in set(x[1].split(" "))]).reduceByKey(lambda x, y: x + y)
    # dataA1: (word, (N num, Y num))
    dataA1 = dataN1.fullOuterJoin(dataY1).mapValues(lambda x: (x[0] if x[0] else 0, x[1] if x[1] else 0))
    fs = feature_count.map(lambda x: (x[0], 0))
    totalNnum = dataN0.count()
    totalYnum = dataY0.count()
    # only keep those words in feature_count
    # dataA2: (word, (N num, Y num))
    dataA2 = dataA1.rightOuterJoin(fs).mapValues(lambda x: x[0]).filter(
        lambda x: x[1][0] != totalNnum and x[1][1] != totalYnum)
    # compute the chi square values
    dataA3 = dataA2.mapValues(lambda x: (x, (totalNnum - x[0], totalYnum - x[1]), totalNnum + totalYnum))
    dataX2 = dataA3.mapValues(lambda x: (float(x[0][0] * x[1][1] - x[0][1] * x[1][0]) ** 2 * x[2]) / (
        (x[0][0] + x[0][1]) * (x[1][0] + x[1][1]) * (x[0][0] + x[1][0]) * (x[0][1] + x[1][1])))
    # sorting
    dataX2 = dataX2.sortBy(lambda x: abs(x[1]), ascending=False)
    # only keep the 100 features with the highest chi square values
    # features: this variable only keeps the 100 words
    features = dataX2.map(lambda x: x[0]).collect()[:100]
    # features_x2: this variable records the chi square value of each feature
    features_x2 = dataX2.collect()[:100]
    # broadcast those data to spark's worker nodes
    features = sc.broadcast(features)
    features_x2 = sc.broadcast(features_x2)

    # this function is used to extract features from a case
    def make_feature(doc):
        doc = doc.split(" ")
        f = []
        for i in features.value:
            f.append(doc.count(i))
        return f

    def make_feature2(doc):
        doc = doc.split(" ")
        f = []
        for k, v in features_x2.value:
            a = doc.count(k)
            a = v if a else 0
            f.append(a)
        return f

    # convert cases into features
    fN = dataN0.mapValues(make_feature2)
    fY = dataY0.mapValues(make_feature2)
    # fN.repartition(1).map(lambda x: (x[0].split('/')[-1][:-4], x[1])).saveAsTextFile(dir + 'VecN')
    # fY.repartition(1).map(lambda x: (x[0].split('/')[-1][:-4], x[1])).saveAsTextFile(dir + 'VecY')
    fN = fN.map(lambda x: x[1])
    fY = fY.map(lambda x: x[1])
    # sc.stop()
    # convert features into LabeledPoint to train the model
    fNtl = fN.map(lambda x: LabeledPoint(0, x))
    fYtl = fY.map(lambda x: LabeledPoint(1, x))
    # union the positive and negative data and train the NaiveBayes model
    fTrain = fNtl.union(fYtl)
    bn = NaiveBayes.train(fTrain)
    # load all the untagged data
    inputs = [sc.wholeTextFiles('/home/xsran/tmp/BigData/data_c_' + str(i)) for i in range(10)]
#......... part of the code omitted here .........
Example 14: map_to_item_user_rating
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
batchsize = 1000
list_dicts = []
i = 12
for f in filenames:
    d = sc.textFile(f).zipWithIndex().map(lambda x: (x[1], x[0])).map(parseMovieFile).reduce(merge_dicts)
    list_dicts.append(d)
    i += 1
    if i == 1000:
        i = 0
        rdds.append(sc.parallelize(list_dicts))
        list_dicts = []
rdds.append(sc.parallelize(list_dicts))
rdd = sc.union(rdds)
rdd.cache()

def map_to_item_user_rating(d):
    movie = d['movie']
    del d['movie']
    return (movie, d)

def flat_map_user_ratings(line):
    ret_arr = []
    for k in line[1]:
        ret_arr.append((k, {line[0]: line[1][k]}))
    return ret_arr

item_user_ratings = rdd.map(map_to_item_user_rating)
user_ratings = item_user_ratings.flatMap(flat_map_user_ratings)
Example 15: run
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
def run(schema_file, data_path, script=None, spec_file=None, verbose=None, yarn=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and optional script with mapper/reducer functions.
    """
    time0 = time.time()
    # pyspark modules
    from pyspark import SparkContext
    # define spark context, it's main object which allow
    # to communicate with spark
    ctx = SparkContext(appName="AvroKeyInputFormat", pyFiles=[script])
    logger = SparkLogger(ctx)
    if not verbose:
        logger.set_level('ERROR')
    if yarn:
        logger.info("YARN client mode enabled")
    # load FWJR schema
    rdd = ctx.textFile(schema_file, 1).collect()
    # define input avro schema, the rdd is a list of lines (sc.textFile similar to readlines)
    avsc = reduce(lambda x, y: x + y, rdd)  # merge all entries from rdd list
    schema = ''.join(avsc.split())  # remove spaces in avsc map
    conf = {"avro.schema.input.key": schema}
    # define newAPIHadoopFile parameters, java classes
    aformat = "org.apache.avro.mapreduce.AvroKeyInputFormat"
    akey = "org.apache.avro.mapred.AvroKey"
    awrite = "org.apache.hadoop.io.NullWritable"
    aconv = "org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter"
    # load data from HDFS
    if isinstance(data_path, list):
        avro_rdd = ctx.union([ctx.newAPIHadoopFile(f, aformat, akey, awrite, aconv, conf=conf) for f in data_path])
    else:
        avro_rdd = ctx.newAPIHadoopFile(data_path, aformat, akey, awrite, aconv, conf=conf)
    # process data, here the map will read record from avro file
    # if we need a whole record we'll use lambda x: x[0], e.g.
    # output = avro_rdd.map(lambda x: x[0]).collect()
    #
    # if we need a particular key, e.g. jobid, we'll extract it
    # within lambda function, e.g. lambda x: x[0]['jobid'], e.g.
    # output = avro_rdd.map(lambda x: x[0]['jobid']).collect()
    #
    # in more general way we write mapper/reducer functions which will be
    # executed by Spark via collect call
    spec = None
    if spec_file:
        if os.path.isfile(spec_file):
            spec = json.load(open(spec_file))
        else:
            spec = json.loads(spec_file)
    if script:
        obj = import_(script)
        logger.info("Use user-based script %s" % obj)
        if not hasattr(obj, 'MapReduce'):
            logger.error('Unable to find MapReduce class in %s, %s' \
                    % (script, obj))
            ctx.stop()
            return
        mro = obj.MapReduce(spec)
        # example of collecting records from mapper and
        # passing all of them to reducer function
        records = avro_rdd.map(mro.mapper).collect()
        out = mro.reducer(records)
        # the map(f).reduce(f) example but it does not collect
        # intermediate records
        # out = avro_rdd.map(obj.mapper).reduce(obj.reducer).collect()
    else:
        records = avro_rdd.map(basic_mapper).collect()
        out = basic_reducer(records)
    ctx.stop()
    if verbose:
        logger.info("Elapsed time %s" % htime(time.time() - time0))
    return out