

Python SparkContext.union Method Code Examples

This article collects typical usage examples of the pyspark.SparkContext.union method in Python. If you are unsure what SparkContext.union does, how to call it, or what it looks like in practice, the curated code examples below should help. You can also explore further usage examples of the class it belongs to, pyspark.SparkContext.


The following presents 15 code examples of the SparkContext.union method, sorted by popularity by default.
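
As a quick orientation before the examples, here is a minimal sketch of the method itself (the master setting, app name, and RDD contents are illustrative assumptions, not taken from any of the projects below): SparkContext.union takes a list of RDDs and returns a single RDD containing the elements of all of them, without a shuffle.

from pyspark import SparkContext

sc = SparkContext('local[1]', 'union_sketch')

# Three small RDDs with the same element type
rdd_a = sc.parallelize([1, 2, 3])
rdd_b = sc.parallelize([4, 5])
rdd_c = sc.parallelize([6])

# sc.union merges them into one RDD; partition order is preserved
combined = sc.union([rdd_a, rdd_b, rdd_c])
print(combined.collect())  # [1, 2, 3, 4, 5, 6]

sc.stop()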

Example 1: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
def main():
    cleanup()

    sc = SparkContext()
    spark = SparkSession(sc)
    path = os.path.join(mysql_export_dir, "name_string_indices.tsv")

    df = spark.read.csv(path, header=True, inferSchema=True, sep='\t', nullValue='NULL')

    names = df.select('name').rdd.map(lambda r: r['name'])
    names_json = parse_spark(sc, names) \
        .map(json.loads) \
        .zip(df.rdd)

    synonym_names = names_json.filter(lambda n: is_synonym(n))
    accepted_names = names_json.filter(lambda n: not is_synonym(n))

    synonym_names_with_accepted_columns = synonym_names \
        .map(to_key_value) \
        .leftOuterJoin(accepted_names.map(to_key_value)) \
        .map(add_accepted_data_to_synonym_name)
    accepted_names_with_accepted_columns = accepted_names \
        .map(add_accepted_data_accepted_name)
    sc.union([synonym_names_with_accepted_columns, accepted_names_with_accepted_columns]) \
        .map(join_fields) \
        .saveAsTextFile(output_dir_name_string_indices)
Developer: GlobalNamesArchitecture, Project: gnharvester, Lines of code: 28, Source: name_string_indices.py

Example 2: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
def main():

    master = 'local[1]'
    app_name = 'reduce_demo1'

    # print(range(0, 3))

    sc = SparkContext(master, app_name)

    # Test 1: normal case (a single sc.union over a list of RDDs)
    # rdd_list = [sc.parallelize(range(i * 3, (i+1) * 3)) for i in range(0,3)]
    # rdd_union = sc.union(rdd_list)
    # print(rdd_union.getNumPartitions())
    # result = rdd_union.map(fun_map_print)
    # result.count()

    # Test 2: two levels of union (sc.union inside the loop, then RDD.union across the results)
    rdd_list_outer = []
    for x in ['a', 'b', 'c']:
        rdd_list_inner = [sc.parallelize(map(lambda j: x + str(j),range(i * 3, (i+1) * 3))) for i in range(0,3)]
        rdd_union_inner = sc.union(rdd_list_inner)
        rdd_list_outer.append(rdd_union_inner)

    rdd_union_outer = reduce(lambda rddx, rddy: rddx.union(rddy), rdd_list_outer)
    result = rdd_union_outer.map(fun_map_print)
    result.count()

    sc.stop()
Developer: tsingfu, Project: xuetangx-streaming-app, Lines of code: 30, Source: test_reduce_demo1.py
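
The example above exercises two ways of combining many RDDs: a single sc.union over a list, and a pairwise fold with RDD.union. A minimal sketch of the contrast, with assumed variable names that are not part of the original project: both produce the same elements, but sc.union builds the combined RDD in one step, while the reduce version chains nested two-way unions.

from functools import reduce
from pyspark import SparkContext

sc = SparkContext('local[1]', 'union_comparison')
rdd_list = [sc.parallelize(range(i * 3, (i + 1) * 3)) for i in range(3)]

# One-shot union of the whole list
flat_union = sc.union(rdd_list)

# Pairwise union: same contents, built as a chain of two-way unions
chained_union = reduce(lambda a, b: a.union(b), rdd_list)

print(sorted(flat_union.collect()) == sorted(chained_union.collect()))  # True

sc.stop()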

Example 3: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
def main(argv):
    parser = argparse.ArgumentParser(description="Count words in files")
    parser.add_argument("files", metavar="F", nargs='+', help="A file to count words in")
    parser.add_argument("-s", "--stopFile", help="a file containing words to ignore, one word per line, ignores lines starting with #")
    parser.add_argument("-o", "--outputFile", help="the file to store the output in")
    args = parser.parse_args()    
    
    shutil.rmtree(args.outputFile, ignore_errors=True)

    sc = SparkContext("local", "WordCounter")    
    createStopwordList(args.stopFile)
    documentCounts = []
    output = []
    for file in args.files:        
        fileLines = sc.textFile(file)
        documentCounts.append(fileLines.flatMap(tokenizer) \
                .map(lambda word: (word, 1)) \
                .reduceByKey(lambda a, b: a + b) \
                .filter(stopwordFilter)
                .map(lambda (a, b): (b, a) ) \
                .sortByKey(False, 1) \
                .map(lambda (a, b): (b, a)))
        output.append({'name': file.split(".")[0], 'wordCounts': []})                 

    # Combine the word counts for the documents      
    combinedCounts = sc.union(documentCounts) \
                .reduceByKey(lambda a, b: a + b) \
                .map(lambda (a, b): (b, a) ) \
                .sortByKey(False, 1) \
                .map(lambda (a, b): (b, a))
                
    # Write most common words to file in JSON format
    f = open(args.outputFile, 'w')
    countIndex = 0;
    for counts in documentCounts:
        output[countIndex]['wordCounts'] = counts.take(25)
        countIndex = countIndex + 1
        
    output.append({'name' : "Combined", 'wordCounts' : combinedCounts.take(25)})
    json.dump(output,f)
    f.close()
Developer: lgaud, Project: WordCountVisualization, Lines of code: 43, Source: WordCounter.py

Example 4: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
def main():
    args = parse_arguments()
    print(args)

    if os.path.exists(args.output_file):
        print('Output file already exists:', args.output_file)
        sys.exit(1)

    sc = SparkContext()

    input_partitions = args.input_partitions or sc.defaultMinPartitions
    output_partitions = args.output_partitions or input_partitions

    rdds_list = [sc.textFile(
        name=input_file,
        minPartitions=input_partitions,
        use_unicode=True,
        ).map(input_mapper(input_file)) for input_file in args.input_files]

    union_rdd = sc.union(rdds_list)

    if args.projects:
        projects = args.projects.split(u',')
        line_filter = input_line_filter_provider(projects)
        filtered_rdd = union_rdd.filter(line_filter)
    else:
        filtered_rdd = union_rdd

    sorted_rdd = filtered_rdd.sortBy(
        keyfunc=line_sorting_key,
        ascending=True,
        numPartitions=output_partitions,
        )

    sorted_rdd_text = sorted_rdd.map(line_tuple_to_text)

    sorted_rdd_text.saveAsTextFile(
        args.output_file,
        compressionCodecClass='org.apache.hadoop.io.compress.GzipCodec',
    )
Developer: youtux, Project: sorting-pagecounts, Lines of code: 42, Source: sort.py

Example 5: TFIDF

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
class TFIDF():

	def __init__(self,input_path,output_path):
		self.input = input_path
		self.output = output_path
		self.texts = glob(self.input + '/*.txt')
		self.conf = SparkConf().setAppName('tfidf')\
							   .setMaster('local')\
							   .set('spark.executor.memory','1g')
		self.sc = SparkContext(conf=self.conf)

	def writeToCSVFile(self,rdd):
		with open(self.output + '/tfidf-scores.csv','wb') as csvfile:
			writer = csv.writer(csvfile)
			writer.writerow(['docID','word','score'])
			writer.writerows(rdd)


	def run(self):
		# Job 1: Word Frequency in Documents.
		tfilter = TextFilter().filter
		wcRDD = self.sc.emptyRDD()
		for dkey,textfile in enumerate(self.texts):
			tf = self.sc.textFile(textfile)\
					 .filter(lambda line: len(line.strip()) > 0)\
				     .flatMap(lambda line: tfilter(line))\
				     .map(lambda word: ((word,dkey),1))\
				     .reduceByKey(operator.add)
			N = tf.map(lambda ((w,d),y): y).sum()
			tf = tf.map(lambda ((w,d),y): ((w,d),(y,N)))
			wcRDD = self.sc.union([wcRDD,tf])

		# Job 2: Word Frequency in Corpus & Calculate TF-IDF.
		D = self.sc.broadcast(len(self.texts))
		wcRDD = wcRDD.map(lambda ((w,d),(a,b)): (w,(d,a,b)))
		wfRDD = wcRDD.map(lambda (w,(d,a,b)): (w,1)).reduceByKey(operator.add)
		tfidf = wcRDD.join(wfRDD).map(lambda (w,((d,a,b),c)): ((d,-a/b * np.log(D.value/c),w),1))\
					 .sortByKey(True).map(lambda ((d,z,w),a): (d,w,-z))
		self.writeToCSVFile(tfidf.collect())
Developer: pragaashp, Project: Scalable-tf-idf, Lines of code: 41, Source: tfidf.py

Example 6: matches

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
    regexp = u"^.* (?P<int>[0-9]+) +(?P<string>[\w ]+)$"
    ### sc.textfile makes an RDD[String] object and we then apply regexp to it to find the interesting rows
    ### The 1 in the argument list to textFile ensures that we only make one partition of the file
    timetable_regex = sc.textFile(dir + file, 1).map(lambda row: re.match(regexp, row, re.UNICODE))
    ### We filter out failed matches (None) and then maps the result to a key-value RDD of the form RDD[(Time, Stop)]
    timetable_time_stop = timetable_regex.filter(lambda m: m is not None).map(lambda m: [m.group('int'), m.group('string')])
    ### Again a filter to get rid of some more faulty rows.
    timetable_time_stop_filtered = timetable_time_stop.filter(lambda x: x[0].isdigit() and re.match(r"^[A-Z].*", x[1][0], re.UNICODE) != None)
    ### A glom makes an array of the RDDs key-values for every partition (similary to groupByKey but on partition level)
    timetable_glom = timetable_time_stop_filtered.glom()
    ### Return an RDD of the form RDD[((Stop1, Stop2), timediff)] where timediff is time to take between stops
    return timetable_glom.flatMap(lambda x: map(lambda i : [(x[i+1][1].lower(), x[i][1].lower()), (int(x[i + 1][0]) - int(x[i][0]), 1)], xrange(len(x) -1))).filter(lambda x: x[1][0] > 0)


### Make a single RDD from all the file RDDs parsed by timetableRDD function
timetable_data_union = sc.union(map(lambda x: timetableRDD(x), timetables))

### Here we repartition to make 100 partitions (instead of the ~ 5k we have)
timetable_data_union_repartitioned = timetable_data_union.repartition(100)

### Since many stops and trams go between directly between same stops, take minimum time between them to get geographical distance
### We also cache the result so that we can reuse it without having to read everything from file again
timetable_reduction = timetable_data_union_repartitioned.reduceByKey(lambda x, y: (min(x[0], y[0]), x[1] + y[1])).cache()

### Get list of all unique stops, and give them a unique index
### collect() command sends the result to the "driver" node and must hence fit in its memory
unique_stops = sc.union([timetable_reduction.map(lambda row : row[0][1]), timetable_reduction.map(lambda row : row[0][0])]).distinct().zipWithIndex().collect()

### We make maps to transform stop name to index and back and send the results to the workers
stops_to_index = sc.broadcast(dict((x, y) for x, y in unique_stops))
stops_from_index = sc.broadcast(dict((y, x) for x, y in unique_stops))
Developer: Oscarlsson, Project: Spark-DS-example, Lines of code: 33, Source: readPdfs.py

Example 7: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
def main():
    parser = argparse.ArgumentParser(description="Gathers stats from apache logs")
    parser.add_argument(
        "--top-requested-pages",
        "-r",
        dest="top_req_pages",
        action="store_true",
        help="Reports the top 10 requested pages and the number of requests made for each",
    )
    parser.add_argument(
        "--top-unsuccessful-pages",
        "-u",
        dest="top_uns_pages",
        action="store_true",
        help="Reports the top 10 requested pages and the number of requests made for each",
    )
    parser.add_argument(
        "--top-ips",
        "-i",
        dest="top_ips",
        action="store_true",
        help="Reports the top 10 requested pages and the number of requests made for each",
    )
    parser.add_argument(
        "--show-successful",
        "-s",
        dest="successful",
        action="store_true",
        help="Reports percentage of successful requests",
    )
    parser.add_argument(
        "--show-unsuccessful",
        "-n",
        dest="unsuccessful",
        action="store_true",
        help="Reports percentage of unsuccessful requests",
    )
    parser.add_argument(
        "--all-per-minute",
        "-m",
        dest="all_per_min",
        action="store_true",
        help="Reports the total number of requests made every minute in the entire time period covered by the file provided",
    )
    parser.add_argument(
        "--top-urls-per-ip",
        "-l",
        dest="top_url_per_top_ip",
        action="store_true",
        help="Reports for each of the top 10 IPs, the top 5 pages requested and the number of requests for each.",
    )
    args = parser.parse_args()
    noargs = not len(sys.argv) > 1
    spark = SparkContext(appName="apacheStats")
    numIterations = 1  # increase to test scalability
    lines = spark.union([spark.textFile(path)] * numIterations)
    dicts = lines.map(lambda line: get_dict_from_line(line))
    statsRdd = dicts.map(lambda dictx: get_data_fromDict(dictx, None if noargs else args))
    stats = statsRdd.reduce(add)
    try:
        f = open("output_spark", "w")
        if noargs or args.top_req_pages:
            f.write("Top 10 requested pages and the number of requests made for each:\n")
            for (page, requests) in stats.pagesToNumberOfAccesses.most_common(10):
                f.write(page)
                f.write(": ")
                f.write(str(requests))
                f.write("\n")
            f.write("\n")
        if noargs or args.successful:
            f.write("Percentage of successful requests: ")
            f.write(str((float(stats.successful) / (stats.unsuccessful + stats.successful)) * 100) + "%")
            f.write("\n\n")
        if noargs or args.unsuccessful:
            f.write("Percentage of unsuccessful requests: ")
            f.write(str((float(stats.unsuccessful) / (stats.unsuccessful + stats.successful)) * 100) + "%")
            f.write("\n\n")
        if noargs or args.top_uns_pages:
            f.write("Most unsuccessful: \n")
            for (page, requests) in stats.unsuccessfulPages.most_common(10):
                f.write(page)
                f.write(": ")
                f.write(str(requests))
                f.write("\n")
            f.write("\n")
        if noargs or args.top_ips:
            f.write(
                "The top 10 IPs making the most requests, displaying the IP address and number of requests made: \n"
            )
            for (page, requests) in stats.ipToNumberOfAccesses.most_common(10):
                f.write(page)
                f.write(": ")
                f.write(str(requests))
                f.write("\n")
            f.write("\n")
        if noargs or args.all_per_min:
            f.write(
                "The total number of requests made every minute in the entire time period covered by the file provided: \n"
            )
            for date, accesses in stats.accessesPerMinute.iteritems():
#......... part of the code is omitted here .........
Developer: cerebro84, Project: ApacheStats, Lines of code: 103, Source: apacheStatsSpark.py

Example 8: parse_in_data

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
# read wiki page count stats and clean wiki page titles 
for src_file_name in src_files:
  base = os.path.basename(src_file_name)
  filename_tokens = base.split('-')
  (date, time) = filename_tokens[1], filename_tokens[2].split('.')[0] 
    
  if run_mode == "swift":
    src_file_name = "swift://" + source_dir + "." + swift_region + "/" + src_file_name
    
  lines = sc.textFile(src_file_name)
  parts = lines\
    .filter(lambda l: wiki_regex.match(l)) \
    .filter(lambda line: "facebook" in line.lower() ) \
    .map(lambda l: parse_in_data(l, date + time)) \
    .filter(lambda l: l != None)
#   .filter(lambda line: "facebook" in line.lower() ) \

  rdds.append(parts)

page_w_date = sc.union(rdds)

# calculate trends
pageview_counts = page_w_date \
    .reduceByKey(lambda a, b: a + b) \
    .map(lambda ( (p, d), c): (p, ([ d ], [ c ])) ) \
    .reduceByKey(lambda (d0, c0), (d1, c1): (d0 + d1, c0 + c1) ) \
    .map(lambda ( p, (d, c)): calc_trend(p, d, c) )

# write output to target directory
pageview_counts.saveAsTextFile(target_dir)
Developer: RajeshThallam, Project: MIDS-W251-FINAL-PROJECT, Lines of code: 32, Source: wiki_latest_page_trends.py

Example 9: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
def main():
	inputs = sys.argv[1]
	rating_file = sys.argv[2]
	output = sys.argv[3]

	conf = SparkConf().setAppName('movie recommendation')
	sc = SparkContext(conf=conf)
	assert sc.version >= '1.5.1'

	sqlContext = SQLContext(sc)
	
	""" sbaronia - getting files from directory and 
	reading from it and using parse_rating_movie and parse_my_input for parsing the
	content of the files to an rdd"""

	movies_path = join(inputs, "movies.dat")
	ratings_path = join(inputs, "ratings.dat")
	
	read_ratings = sc.textFile(ratings_path)
	read_movies  = sc.textFile(movies_path)
	read_mymovies = sc.textFile(rating_file)

	parse_ratings = read_ratings.map(lambda line : parse_rating_movie(line, "ratings.dat")).cache()
	parse_movies = read_movies.map(lambda line : parse_rating_movie(line, "movies.dat")).cache()
	parse_mymovies = read_mymovies.map(lambda line: parse_my_input(line)).cache()
	
	""" sbaronia - converting movie and rating data to dataframes """

	schema_movie = StructType([StructField('movie_id', IntegerType(), True),
								StructField('movie_name', StringType(), True)])

	movie_df = sqlContext.createDataFrame(parse_movies, schema=schema_movie).cache()


	schema_mymovie = StructType([StructField('ip_uid', IntegerType(), True),
								StructField('ip_mname', StringType(), True),
								StructField('ip_rating', IntegerType(), True),
								StructField('ldistance', IntegerType(), True)])

	mymovie_df = sqlContext.createDataFrame(parse_mymovies, schema=schema_mymovie).cache()

	""" sbaronia - combining user input movies with movies data
	then finding Levenshtein distance with every movie and then finding
	the one with minimum Levenshtein distance as our best match"""

	movie_plus_ip = movie_df.join(mymovie_df, None, 'inner').cache()
		
	movie_plus_ip_distance = movie_plus_ip.withColumn('ldistance', levenshtein('movie_name','ip_mname'))

	mymovie_distance = movie_plus_ip_distance \
							  .groupBy('ip_uid', 'ip_mname') \
							  .min('ldistance') \
							  .withColumnRenamed('min(ldistance)','ldistance') \
							  .cache()

	""" sbaronia - join the tables to get only those movies with minimum 
	Levenshtein distance and then from that table select columns 
	necessary. Then create a test data for all movies with new user 0"""
	refined_movies = movie_plus_ip_distance.join(mymovie_distance, ['ip_uid', 'ip_mname', 'ldistance'], 'inner').cache()
	
	input_rating = refined_movies.select('ip_uid', 'movie_id', 'ip_rating').cache()

	input_rating_rdd = input_rating.rdd.map(lambda row1: (row1.ip_uid, row1.movie_id, float(row1.ip_rating))).cache()
	
	input_with_train = sc.union([input_rating_rdd, parse_ratings]).cache()
	
	test_newuser = parse_movies.map(lambda line: (0, line[0])).cache()
	
	""" sbaronia - train on all data including new one and then 
	test on all movies for new user and sort them in descending 
	order of ratings"""
	model = ALS.train(input_with_train, 10, 10, 0.1)	
	predictions = model.predictAll(test_newuser) \
					   .map(lambda row1: (row1.rating, row1.product)) \
					   .sortByKey(ascending=False) \
					   .map(lambda row: (row[1], row[0])) \
					   .cache()

	final_rating = sqlContext.createDataFrame(predictions, ['movie_id', 'movie_rating']).cache()

	final_movie_rating = movie_df.join(final_rating, ['movie_id'], 'inner').sort("movie_rating", ascending=False).cache()

	final_movie_rating_rdd = final_movie_rating.rdd.map(lambda row: (str(row.movie_id) + ' :: ' + str(row.movie_name)) + ' :: ' + str(row.movie_rating)).coalesce(1).cache()
	final_movie_rating_rdd.saveAsTextFile(output)
Developer: gitofsid, Project: MyBigDataCode, Lines of code: 86, Source: movie_recommendations.py

Example 10: extract_big5_elements

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
def extract_big5_elements(parent_json_data):
    user_dict = {}
    for idx in range(0,5):
        if parent_json_data["tree"]["children"][0]["children"][0]["children"][idx]["id"] == "Openness" :
            user_dict.update(update_entry(idx,3,parent_json_data))
        if parent_json_data['tree']['children'][0]['children'][0]['children'][idx]['id'] == 'Conscientiousness' :
            user_dict.update(update_entry(idx,2,parent_json_data))
        if parent_json_data['tree']['children'][0]['children'][0]['children'][idx]['id'] == 'Extraversion' :
            user_dict.update(update_entry(idx,5,parent_json_data))
        if parent_json_data['tree']['children'][0]['children'][0]['children'][idx]['id'] == 'Agreeableness' :
            user_dict.update(update_entry(idx,1,parent_json_data))
        if parent_json_data['tree']['children'][0]['children'][0]['children'][idx]['id'] == 'Neuroticism' :
            user_dict.update(update_entry(idx,4,parent_json_data))
    return user_dict

def update_entry(idx,index,parent_json_data):
    trait_name = parent_json_data["tree"]["children"][0]["children"][0]["children"][idx]["children"][index]["id"]
    trait_percentage = parent_json_data["tree"]["children"][0]["children"][0]["children"][idx]["children"][index]["percentage"]
    return {trait_name:trait_percentage}


# convert_to_dict would properly format the statuses of the user (i.e. it would make more descriptive)
def convert_to_dict(obj):
    return {
        'userid': str(obj['user']['id']),
        'id': str(obj['id']),
        'sourceid': 'python-twitter',
        'contenttype': 'text/plain',
        'language': obj['lang'],
        'content': obj['text'],
        'reply': ((obj['in_reply_to_status_id']) == None),
        'forward': False
    }


# calculate_average would be called by reduceByKey() function to get the average personality trait for each location
def calculate_average(a,b):
    sum_dict = {}
    for key in a:
        sum_dict[key] = ((a[key] + b[key])/300) # divide by the total number of users in a location
    return sum_dict 


if __name__ == "__main__":
    tweet_list = get_all_tweets
    get_avg = calculate_average
    my_conf = (SparkConf()
         .setAppName("Spark-twitter-user")
         .set("spark.network.timeout", "1200s")) # set timeout to 20 minutes as spark job
         # would sit idle for 15 minutes in order to refresh the request window of twitter API
    sc = SparkContext(conf=my_conf)
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId","") #specify your AWSAccessKey here
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey","") #specify your AWSSecretKey here
    input_rdd = sc.textFile('s3n://perloc/twitterDB.txt/').map(lambda entry: tuple(entry.split(",")))
    print('input_rdd loaded')
    #print the start-time of the spark-job
    print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    ''' We'll create subsets from the main rdd based on the key, i.e. location
     and work upon each rdd for every window of 15 minutes
     (twitter restrictions apply here, see twitter-API-rate-limits)
     https://dev.twitter.com/rest/public/rate-limits
    '''
    rio_filter_rdd = input_rdd.filter(lambda (loc,user): loc=='Rio')
    rio_rdd = rio_filter_rdd.mapValues(lambda user : tweet_list(user))
    print('Rio RDD created')
    time.sleep(61*15) # required because of the API-rate limit
    chicago_filter_rdd = input_rdd.filter(lambda (loc,user): loc=='Chicago')
    chicago_rdd = chicago_filter_rdd.mapValues(lambda user : tweet_list(user))
    print('Chicago RDD created')
    time.sleep(61*15)
    newyork_filter_rdd = input_rdd.filter(lambda (loc,user): loc=='NewYork')
    newyork_rdd = newyork_filter_rdd.mapValues(lambda user : tweet_list(user))
    print('NewYork RDD created')
    time.sleep(61*15)
    california_filter_rdd = input_rdd.filter(lambda (loc,user): loc=='California')
    california_rdd = california_filter_rdd.mapValues(lambda user : tweet_list(user))
    print('California RDD created')
    # Once done with the personality analysis for each location, simply gather them all in one rdd
    finalRDD = sc.union([rio_rdd,chicago_rdd,newyork_rdd,california_rdd]) \
                 .reduceByKey(lambda user1,user2 : get_avg(user1,user2)) # get the average of each personality trait for each location
    finalRDD.repartition(1).saveAsTextFile("s3n://perloc/twitter-ibm/personality-profile.txt")
    print('All done,file created')
    print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    sys.exit()
Developer: GudduAbhishek, Project: perloc, Lines of code: 84, Source: get_personality.py

Example 11: run

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
def run(schema_file, data_path, script=None, spec_file=None, verbose=None, rout=None, yarn=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and optional script with mapper/reducer functions.
    """
    if  script:
        script = get_script(script)
    if  verbose:
        print("### schema: %s" % schema_file)
        print("### path  : %s" % data_path)
        print("### script: %s" % script)
        print("### spec  : %s" % spec_file)
    time0 = time.time()
    # pyspark modules
    from pyspark import SparkContext

    # define spark context, it's main object which allow
    # to communicate with spark
    ctx = SparkContext(appName="AvroKeyInputFormat", pyFiles=[script])
    logger = SparkLogger(ctx)
    if  not verbose:
        logger.set_level('ERROR')
    if yarn:
        logger.info("YARN client mode enabled")

    # load FWJR schema
    rdd = ctx.textFile(schema_file, 1).collect()

    # define input avro schema, the rdd is a list of lines (sc.textFile similar to readlines)
    avsc = reduce(lambda x, y: x + y, rdd) # merge all entries from rdd list
    schema = ''.join(avsc.split()) # remove spaces in avsc map
    conf = {"avro.schema.input.key": schema}

    # define newAPIHadoopFile parameters, java classes
    aformat="org.apache.avro.mapreduce.AvroKeyInputFormat"
    akey="org.apache.avro.mapred.AvroKey"
    awrite="org.apache.hadoop.io.NullWritable"
    aconv="org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter"

    # load data from HDFS
    if  isinstance(data_path, list):
        avro_rdd = ctx.union([ctx.newAPIHadoopFile(f, aformat, akey, awrite, aconv, conf=conf) for f in data_path])
    else:
        avro_rdd = ctx.newAPIHadoopFile(data_path, aformat, akey, awrite, aconv, conf=conf)

    # process data, here the map will read record from avro file
    # if we need a whole record we'll use lambda x: x[0], e.g.
    # output = avro_rdd.map(lambda x: x[0]).collect()
    #
    # if we need a particular key, e.g. jobid, we'll extract it
    # within lambda function, e.g. lambda x: x[0]['jobid'], e.g.
    # output = avro_rdd.map(lambda x: x[0]['jobid']).collect()
    #
    # in more general way we write mapper/reducer functions which will be
    # executed by Spark via collect call
    spec = None
    if  spec_file:
        if  os.path.isfile(spec_file):
            spec = json.load(open(spec_file))
        else:
            spec = json.loads(spec_file)
    if  verbose:
        spec['verbose'] = 1
        print("### spec %s" % json.dumps(spec))
    if  rout:
        spec['output'] = rout
    if  script:
        obj = import_(script)
        logger.info("Use user-based script %s" % obj)
        if  not hasattr(obj, 'MapReduce'):
            logger.error('Unable to find MapReduce class in %s, %s' \
                    % (script, obj))
            ctx.stop()
            return
        # we have a nested use case when one MR return WMArchive spec
        # we'll loop in that case until we get non-spec output
        count = 0
        while True:
            mro = obj.MapReduce(spec)
            mname = mro.__dict__.get('name', '').split('.')[0]
            print("### Load %s" % mname)
            if  mname.lower().endswith('counter'):
                out = avro_rdd.filter(mro.mapper).count()
                if  rout:
                    with open(rout, 'w') as ostream:
                        ostream.write(out)
                break
            # example of collecting records from mapper and
            # passing all of them to reducer function
            records = avro_rdd.filter(mro.mapper).collect()
            out = mro.reducer(records)
            if  verbose:
                print("### Loop count %s" % count)
            if  count > 3:
                print("### WARNING, loop counter exceed its limit")
                break
            if  is_spec(out):
                spec = out
            else:
                break
#......... part of the code is omitted here .........
Developer: yuyiguo, Project: WMArchive, Lines of code: 103, Source: myspark.py

Example 12: stripSPM

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
    return sc.textFile(f).map(lambda x: (f_date, x))

def stripSPM(f):
    field = str(f)
    while(len(field) > 0 and not is_number(field)):
        field = field[1:]
    return float(field) if len(field) > 0 else 0.0
udf = UserDefinedFunction(lambda x: stripSPM(x), FloatType())

s_type = sqlContext.read.format('jdbc').options(url=url, dbtable='public.st_type').load()

# Read in the Metadata files
path = 'meta/*/*/*'
# path = 'meta/2008/*/*'
allFiles = glob(path + ".txt")
meta_files = sc.union([date_row(f) for f in allFiles])

md_rows = meta_files.map(lambda l: [l[0]]+l[1].split('\t')) \
                    .map(lambda p: Row(effective_start=p[0], pems_id=p[1], Fwy=ZeroInt(p[2]), Dir=p[3],
                                        district_id=ZeroInt(p[4]), County=ZeroInt(p[5]), City=ZeroInt(p[6]),
                                        state_pm=p[7], abs_pm=ZeroFloat(p[8]), latitude=ZeroFloat(p[9]),
                                        longitude=ZeroFloat(p[10]), length=ZeroFloat(p[11]), Type=p[12],
                                        num_lanes=ZeroFloat(p[13]), name=p[14]))

station_meta = sqlContext.createDataFrame(md_rows)
st_cols = station_meta.columns # Get the initial list of columns
station_meta = station_meta.fillna({'City': -1})
station_meta = station_meta.select(*[udf(column).alias('state_pm') if column == 'state_pm' else column for column in station_meta.columns])

# Drop duplicates w/o including file data
station_meta = station_meta.dropDuplicates([c for c in st_cols if c != 'effective_start'])
Developer: conwaywong, Project: dse_capstone, Lines of code: 33, Source: traffic_etl_spark.py

Example 13: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
def main():
    conf = SparkConf().setAppName("Test")
    sc = SparkContext(conf=conf)

    # load tagged datasets as training data
    dataY0 = sc.wholeTextFiles('/home/xsran/IdeaProjects/hadoop1/data/Y-cut')
    dataN0 = sc.wholeTextFiles('/home/xsran/IdeaProjects/hadoop1/data/N-cut')

    # split text into words
    dataN = dataN0.map(lambda x: x[1].split(" "))
    dataY = dataY0.map(lambda x: x[1].split(" "))

    # merge the positive and negative into a single dataset
    dataA = dataY.union(dataN)

    # map words list into (word,1) tuple
    words = dataA.flatMap(lambda x: x).map(lambda x: (x, 1))
    # counting the number of words
    wordCount = words.reduceByKey(lambda x, y: x + y).map(lambda x: (x[1], x[0])).sortByKey(ascending=False)
    wordCount.cache()

    # saving this results
    # wordCount.map(lambda x:'%s,%s' % (x[1],x[0])).saveAsTextFile(dir+'wordCount')
    # wordCount.map(lambda x:(x[1],x[0])).saveAsTextFile(dir+'wordCount_rep')

    # filter this words list. Only keep the words with a certain frequency as features
    # feature_count: (features word, count)
    feature_count = wordCount.filter(lambda x: 150 < x[0] < 5000).map(lambda x: (x[1], x[0]))

    # count the word frequency in positive and negative case respectively.
    dataN1 = dataN0.flatMap(lambda x: [(w, 1) for w in set(x[1].split(" "))]).reduceByKey(lambda x, y: x + y)
    dataY1 = dataY0.flatMap(lambda x: [(w, 1) for w in set(x[1].split(" "))]).reduceByKey(lambda x, y: x + y)
    # dataA1: (word,(N num,Y num))
    dataA1 = dataN1.fullOuterJoin(dataY1).mapValues(lambda x: (x[0] if x[0] else 0, x[1] if x[1] else 0))

    fs = feature_count.map(lambda x: (x[0], 0))

    totalNnum = dataN0.count()
    totalYnum = dataY0.count()
    # only keep those words in the feature_count
    # dataA2:(word,(N num,Y num))
    dataA2 = dataA1.rightOuterJoin(fs).mapValues(lambda x: x[0]).filter(
        lambda x: x[1][0] != totalNnum and x[1][1] != totalYnum)

    # compute the chi square values
    dataA3 = dataA2.mapValues(lambda x: (x, (totalNnum - x[0], totalYnum - x[1]), totalNnum + totalYnum))
    dataX2 = dataA3.mapValues(lambda x: (float(x[0][0] * x[1][1] - x[0][1] * x[1][0]) ** 2 * x[2]) / (
        (x[0][0] + x[0][1]) * (x[1][0] + x[1][1]) * (x[0][0] + x[1][0]) * (x[0][1] + x[1][1])))
    # sorting
    dataX2 = dataX2.sortBy(lambda x: abs(x[1]), ascending=False)

    # only keep 100 features with highest chi square values
    # features: this variable only keep the 100 words.
    features = dataX2.map(lambda x: x[0]).collect()[:100]
    # features_x2: this variable record the chi square values of each features
    features_x2 = dataX2.collect()[:100]

    # broadcasting those data to spark's worker nodes.
    features = sc.broadcast(features)
    features_x2 = sc.broadcast(features_x2)

    # this function is used to extract features from a case
    def make_feature(doc):
        doc = doc.split(" ")
        f = []
        for i in features.value:
            f.append(doc.count(i))
        return f

    def make_feature2(doc):
        doc = doc.split(" ")
        f = []
        for k, v in features_x2.value:
            a = doc.count(k)
            a = v if a else 0
            f.append(a)
        return f

    # convert case into features
    fN = dataN0.mapValues(make_feature2)
    fY = dataY0.mapValues(make_feature2)

    # fN.repartition(1).map(lambda x:(x[0].split('/')[-1][:-4],x[1])).saveAsTextFile(dir+'VecN')
    # fY.repartition(1).map(lambda x:(x[0].split('/')[-1][:-4],x[1])).saveAsTextFile(dir+'VecY')

    fN = fN.map(lambda x: x[1])
    fY = fY.map(lambda x: x[1])

    # sc.stop()

    # convert features into LabeledPoint to train the model.
    fNtl = fN.map(lambda x: LabeledPoint(0, x))
    fYtl = fY.map(lambda x: LabeledPoint(1, x))

    # union the positive and negative data and train the NaiveBayes model.
    fTrain = fNtl.union(fYtl)
    bn = NaiveBayes.train(fTrain)

    # load the all untagged data
    inputs = [sc.wholeTextFiles('/home/xsran/tmp/BigData/data_c_' + str(i)) for i in range(10)]
#......... part of the code is omitted here .........
Developer: whyaysd233, Project: BigData, Lines of code: 103, Source: CaseClassifier.py

Example 14: map_to_item_user_rating

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
batchsize = 1000
list_dicts = []
i=12
for f in filenames:
    d = sc.textFile(f).zipWithIndex().map(lambda x: (x[1], x[0])).map(parseMovieFile).reduce(merge_dicts)
    list_dicts.append(d)
    i += 1
    if i == 1000:
        i = 0 
        rdds.append(sc.parallelize(list_dicts))
        
        list_dicts = []

rdds.append(sc.parallelize(list_dicts))

rdd = sc.union(rdds)
rdd.cache()

def map_to_item_user_rating(d):
    movie = d['movie']
    del d['movie']
    return (movie, d)

def flat_map_user_ratings(line):
    ret_arr = []
    for k in line[1]:
        ret_arr.append((k, {line[0]:line[1][k]}))
    return ret_arr

item_user_ratings = rdd.map(map_to_item_user_rating)
user_ratings = item_user_ratings.flatMap(flat_map_user_ratings)
Developer: mickeykedia, Project: Matrix-Factorization-ALS, Lines of code: 33, Source: matrix_factorization_spark.py

Example 15: run

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import union [as alias]
def run(schema_file, data_path, script=None, spec_file=None, verbose=None, yarn=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and optional script with mapper/reducer functions.
    """
    time0 = time.time()
    # pyspark modules
    from pyspark import SparkContext

    # define spark context, it's main object which allow
    # to communicate with spark
    ctx = SparkContext(appName="AvroKeyInputFormat", pyFiles=[script])
    logger = SparkLogger(ctx)
    if  not verbose:
        logger.set_level('ERROR')
    if yarn:
        logger.info("YARN client mode enabled")

    # load FWJR schema
    rdd = ctx.textFile(schema_file, 1).collect()

    # define input avro schema, the rdd is a list of lines (sc.textFile similar to readlines)
    avsc = reduce(lambda x, y: x + y, rdd) # merge all entries from rdd list
    schema = ''.join(avsc.split()) # remove spaces in avsc map
    conf = {"avro.schema.input.key": schema}

    # define newAPIHadoopFile parameters, java classes
    aformat="org.apache.avro.mapreduce.AvroKeyInputFormat"
    akey="org.apache.avro.mapred.AvroKey"
    awrite="org.apache.hadoop.io.NullWritable"
    aconv="org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter"

    # load data from HDFS
    if  isinstance(data_path, list):
        avro_rdd = ctx.union([ctx.newAPIHadoopFile(f, aformat, akey, awrite, aconv, conf=conf) for f in data_path])
    else:
        avro_rdd = ctx.newAPIHadoopFile(data_path, aformat, akey, awrite, aconv, conf=conf)

    # process data, here the map will read record from avro file
    # if we need a whole record we'll use lambda x: x[0], e.g.
    # output = avro_rdd.map(lambda x: x[0]).collect()
    #
    # if we need a particular key, e.g. jobid, we'll extract it
    # within lambda function, e.g. lambda x: x[0]['jobid'], e.g.
    # output = avro_rdd.map(lambda x: x[0]['jobid']).collect()
    #
    # in more general way we write mapper/reducer functions which will be
    # executed by Spark via collect call
    spec = None
    if  spec_file:
        if  os.path.isfile(spec_file):
            spec = json.load(open(spec_file))
        else:
            spec = json.loads(spec_file)
    if  script:
        obj = import_(script)
        logger.info("Use user-based script %s" % obj)
        if  not hasattr(obj, 'MapReduce'):
            logger.error('Unable to find MapReduce class in %s, %s' \
                    % (script, obj))
            ctx.stop()
            return
        mro = obj.MapReduce(spec)
        # example of collecting records from mapper and
        # passing all of them to reducer function
        records = avro_rdd.map(mro.mapper).collect()
        out = mro.reducer(records)

        # the map(f).reduce(f) example but it does not collect
        # intermediate records
        # out = avro_rdd.map(obj.mapper).reduce(obj.reducer).collect()
    else:
        records = avro_rdd.map(basic_mapper).collect()
        out = basic_reducer(records)
    ctx.stop()
    if  verbose:
        logger.info("Elapsed time %s" % htime(time.time()-time0))
    return out
Developer: sartaj10, Project: WMArchive, Lines of code: 80, Source: myspark.py


Note: The pyspark.SparkContext.union method examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The code snippets are drawn from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors. For distribution and use, please follow the License of the corresponding project; do not reproduce without permission.