本文整理汇总了Python中pyspark.context.SparkContext.broadcast方法的典型用法代码示例。如果您正苦于以下问题:Python SparkContext.broadcast方法的具体用法?Python SparkContext.broadcast怎么用?Python SparkContext.broadcast使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.context.SparkContext
的用法示例。
在下文中一共展示了SparkContext.broadcast方法的7个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: TestRDDFunctions
# 需要导入模块: from pyspark.context import SparkContext [as 别名]
# 或者: from pyspark.context.SparkContext import broadcast [as 别名]
class TestRDDFunctions(PySparkTestCase):
    """Regression and behaviour tests for core RDD operations (Python 2 era)."""

    def test_failed_sparkcontext_creation(self):
        # Regression test for SPARK-1550: a failed SparkContext construction
        # must not prevent creating a good one afterwards.
        self.sc.stop()
        self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name"))
        self.sc = SparkContext("local")

    def test_save_as_textfile_with_unicode(self):
        # Regression test for SPARK-970: unicode survives a text-file round trip.
        sample = u"\u00A1Hola, mundo!"
        rdd = self.sc.parallelize([sample])
        out = tempfile.NamedTemporaryFile(delete=True)
        out.close()  # only the now-unused path is needed as the save target
        rdd.saveAsTextFile(out.name)
        raw_contents = ''.join(input(glob(out.name + "/part-0000*")))
        self.assertEqual(sample, unicode(raw_contents.strip(), "utf-8"))

    def test_transforming_cartesian_result(self):
        # Regression test for SPARK-1034: map() over a cartesian product.
        left = self.sc.parallelize([1, 2])
        right = self.sc.parallelize([3, 4])
        product = left.cartesian(right)
        product.map(lambda pair: pair[0] + pair[1]).collect()

    def test_transforming_pickle_file(self):
        # Regression test for SPARK-2601: pickle files re-read and mapped.
        rdd = self.sc.parallelize(["Hello", "World!"])
        out = tempfile.NamedTemporaryFile(delete=True)
        out.close()
        rdd.saveAsPickleFile(out.name)
        reloaded = self.sc.pickleFile(out.name)
        reloaded.map(lambda x: x).collect()

    def test_cartesian_on_textfile(self):
        # Regression test: cartesian of a text file with itself.
        path = os.path.join(SPARK_HOME, "python/test_support/hello.txt")
        lines = self.sc.textFile(path)
        result = lines.cartesian(lines).collect()
        (x, y) = result[0]
        self.assertEqual("Hello World!", x.strip())
        self.assertEqual("Hello World!", y.strip())

    def test_deleting_input_files(self):
        # Regression test for SPARK-1025: once the input file is deleted,
        # recomputation must fail rather than return stale data.
        source = tempfile.NamedTemporaryFile(delete=False)
        source.write("Hello World!")
        source.close()
        rdd = self.sc.textFile(source.name)
        filtered = rdd.filter(lambda x: True)
        self.assertEqual(1, filtered.count())
        os.unlink(source.name)
        self.assertRaises(Exception, lambda: filtered.count())

    def testAggregateByKey(self):
        # aggregateByKey with a mutable zero value (a set) per key.
        pairs = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2)

        def seqOp(acc, value):
            acc.add(value)
            return acc

        def combOp(lhs, rhs):
            lhs |= rhs
            return lhs

        sets = dict(pairs.aggregateByKey(set(), seqOp, combOp).collect())
        self.assertEqual(3, len(sets))
        self.assertEqual(set([1]), sets[1])
        self.assertEqual(set([2]), sets[3])
        self.assertEqual(set([1, 3]), sets[5])

    def test_itemgetter(self):
        # operator.itemgetter must be serializable into a map() closure.
        rdd = self.sc.parallelize([range(10)])
        from operator import itemgetter
        self.assertEqual([1], rdd.map(itemgetter(1)).collect())
        self.assertEqual([(2, 3)], rdd.map(itemgetter(2, 3)).collect())

    def test_namedtuple_in_rdd(self):
        # namedtuple instances round-trip through RDD serialization.
        from collections import namedtuple
        Person = namedtuple("Person", "id firstName lastName")
        jon = Person(1, "Jon", "Doe")
        jane = Person(2, "Jane", "Doe")
        theDoes = self.sc.parallelize([jon, jane])
        self.assertEquals([jon, jane], theDoes.collect())

    def test_large_broadcast(self):
        # A ~270MB broadcast value must be readable from executor closures.
        N = 100000
        data = [[float(i) for i in range(300)] for i in range(N)]
        bdata = self.sc.broadcast(data)  # 270MB
        m = self.sc.parallelize(range(1), 1).map(lambda x: len(bdata.value)).sum()
        self.assertEquals(N, m)
示例2: TestRDDFunctions
# 需要导入模块: from pyspark.context import SparkContext [as 别名]
# 或者: from pyspark.context.SparkContext import broadcast [as 别名]
class TestRDDFunctions(PySparkTestCase):
    """RDD regression tests, including unicode and UTF-8 text-file round trips."""

    def _roundtrip_as_text(self, values):
        # Helper: save *values* as a text file under a fresh temp path and
        # return the concatenated raw contents of all part files.
        tmp = tempfile.NamedTemporaryFile(delete=True)
        tmp.close()  # only the now-free path is needed for saveAsTextFile
        self.sc.parallelize(values).saveAsTextFile(tmp.name)
        return ''.join(input(glob(tmp.name + "/part-0000*")))

    def test_failed_sparkcontext_creation(self):
        # Regression test for SPARK-1550.
        self.sc.stop()
        self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name"))
        self.sc = SparkContext("local")

    def test_save_as_textfile_with_unicode(self):
        # Regression test for SPARK-970.
        text = u"\u00A1Hola, mundo!"
        raw = self._roundtrip_as_text([text])
        self.assertEqual(text, unicode(raw.strip(), "utf-8"))

    def test_save_as_textfile_with_utf8(self):
        # Pre-encoded UTF-8 byte strings must decode back to the same text.
        text = u"\u00A1Hola, mundo!"
        raw = self._roundtrip_as_text([text.encode("utf-8")])
        self.assertEqual(text, unicode(raw.strip(), "utf-8"))

    def test_transforming_cartesian_result(self):
        # Regression test for SPARK-1034.
        first = self.sc.parallelize([1, 2])
        second = self.sc.parallelize([3, 4])
        first.cartesian(second).map(lambda pair: pair[0] + pair[1]).collect()

    def test_transforming_pickle_file(self):
        # Regression test for SPARK-2601.
        words = self.sc.parallelize(["Hello", "World!"])
        tmp = tempfile.NamedTemporaryFile(delete=True)
        tmp.close()
        words.saveAsPickleFile(tmp.name)
        self.sc.pickleFile(tmp.name).map(lambda x: x).collect()

    def test_cartesian_on_textfile(self):
        # Cartesian of a text file with itself: both slots hold the line.
        path = os.path.join(SPARK_HOME, "python/test_support/hello.txt")
        hello = self.sc.textFile(path)
        (x, y) = hello.cartesian(hello).collect()[0]
        self.assertEqual("Hello World!", x.strip())
        self.assertEqual("Hello World!", y.strip())

    def test_deleting_input_files(self):
        # Regression test for SPARK-1025.
        tmp = tempfile.NamedTemporaryFile(delete=False)
        tmp.write("Hello World!")
        tmp.close()
        filtered = self.sc.textFile(tmp.name).filter(lambda x: True)
        self.assertEqual(1, filtered.count())
        os.unlink(tmp.name)
        self.assertRaises(Exception, lambda: filtered.count())

    def testAggregateByKey(self):
        # aggregateByKey with a set as the (mutable) zero value.
        data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2)

        def add_to_set(acc, item):
            acc.add(item)
            return acc

        def merge_sets(a, b):
            a |= b
            return a

        grouped = dict(data.aggregateByKey(set(), add_to_set, merge_sets).collect())
        self.assertEqual(3, len(grouped))
        self.assertEqual(set([1]), grouped[1])
        self.assertEqual(set([2]), grouped[3])
        self.assertEqual(set([1, 3]), grouped[5])

    def test_itemgetter(self):
        # itemgetter callables work as map() functions.
        from operator import itemgetter
        row = self.sc.parallelize([range(10)])
        self.assertEqual([1], row.map(itemgetter(1)).collect())
        self.assertEqual([(2, 3)], row.map(itemgetter(2, 3)).collect())

    def test_namedtuple_in_rdd(self):
        # namedtuples survive RDD serialization.
        from collections import namedtuple
        Person = namedtuple("Person", "id firstName lastName")
        jon = Person(1, "Jon", "Doe")
        jane = Person(2, "Jane", "Doe")
        theDoes = self.sc.parallelize([jon, jane])
        self.assertEquals([jon, jane], theDoes.collect())

    def test_large_broadcast(self):
        # Roughly 270MB distributed through a broadcast variable.
        N = 100000
        data = [[float(i) for i in range(300)] for i in range(N)]
        bdata = self.sc.broadcast(data)  # 270MB
        m = self.sc.parallelize(range(1), 1).map(lambda x: len(bdata.value)).sum()
        self.assertEquals(N, m)
#.........这里部分代码省略.........
示例3: processTrainData
# 需要导入模块: from pyspark.context import SparkContext [as 别名]
# 或者: from pyspark.context.SparkContext import broadcast [as 别名]
# Parse the raw user/artist listening data.
userArtistDataFile = filePath + 'user_artist_data.txt'
rawUserArtistData = sc.textFile(userArtistDataFile)

# Parse the artist metadata file, dropping malformed rows (id parsed as -1).
artistDataFile = filePath + 'artist_data.txt'
rawArtistData = sc.textFile(artistDataFile)
artistById = rawArtistData.map(parseArtistByIdData).filter(lambda kv: kv[0] != -1)

# Parse the artist-alias file into a {misspelled_id: canonical_id} dict
# collected on the driver.
artistAliasDataFile = filePath + 'artist_alias.txt'
rawArtistAliasData = sc.textFile(artistAliasDataFile)
artistAlias = rawArtistAliasData.map(parseArtistAliasData).filter(lambda kv: kv[0] != -1).collectAsMap()

# Ship the alias map to every executor once, instead of once per task.
bArtistAlias = sc.broadcast(artistAlias)


def processTrainData(line):
    """Convert one 'userId artistId count' line into an ALS ``Rating``.

    Artist ids are canonicalized through the broadcast alias map; ids with
    no alias entry are kept unchanged.
    """
    (userId, artistId, count) = map(int, line.split(' '))
    # dict.get with a default replaces the original `== None` comparison
    # (identity/None checks should never use `==`).
    artistAliasId = bArtistAlias.value.get(artistId, artistId)
    return Rating(userId, artistAliasId, count)


trainData = rawUserArtistData.map(processTrainData).cache()
model = ALS.trainImplicit(trainData, 10)
print(model.productFeatures())
示例4: SparkContext
# 需要导入模块: from pyspark.context import SparkContext [as 别名]
# 或者: from pyspark.context.SparkContext import broadcast [as 别名]
from pyspark.sql.functions import udf, col
from pyspark.sql.types import *
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.mllib.linalg import SparseVector, DenseVector
sc = SparkContext(appName='sparking_your_interest')
SQLContext = HiveContext(sc)

# Speech-specific stopword list; `with` closes the file promptly (the
# original leaked the handle and wrapped the comprehension in a redundant
# list() call).
with open('speech_stopwords.txt', 'r') as stopword_file:
    speech_stopwords_list = [line.strip() for line in stopword_file]
speech_stopwords_broadcasted = sc.broadcast(speech_stopwords_list)

# NLTK's English stopwords, broadcast to the executors.
nltk_stopwords = set(stopwords.words('english'))
nltk_stopwords_broadcasted = sc.broadcast(nltk_stopwords)

# Extra project-specific stopwords.
with open('more_stopwords.txt', 'r') as stopword_file:
    more_stopwords = set(line.strip() for line in stopword_file)
more_stopwords_broadcasted = sc.broadcast(more_stopwords)
def clean_up(s):
    """Strip bracketed asides, double quotes and speech stopwords from *s*."""
    # Remove any (...) or [...] span, then literal double quotes.
    without_brackets = re.sub("[\(\[].*?[\)\]]", "", s)
    without_quotes = re.sub('"', "", without_brackets)
    # Apply each broadcast stopword as a regex substitution in turn.
    # NOTE(review): tokens are treated as regex patterns, not literals —
    # regex metacharacters in the stopword file would change behavior.
    cleaned = without_quotes
    for token in speech_stopwords_broadcasted.value:
        cleaned = re.sub(token, '', cleaned)
    return cleaned
def unicode_encode(s):
示例5: SparkContext
# 需要导入模块: from pyspark.context import SparkContext [as 别名]
# 或者: from pyspark.context.SparkContext import broadcast [as 别名]
######### Global variables #########
# The following values are broadcast to the Spark cluster and used by the
# feature-extraction functions below.
songTable = 'song_data'
sc = SparkContext('local[*]', 'lastfm_recommender')
sqlContext = SQLContext(sc)

### Database connection for artist metadata and similar-artist pairs.
# NOTE(review): broadcasting these frames is probably a bad idea since
# they are quite big.
artist_engine = create_engine('sqlite:///'+sys.argv[1])
sims = pd.read_sql_query(
    'SELECT * FROM similarity', artist_engine)
similars = sc.broadcast(sims.similar)
similar_groups = sc.broadcast(sims.groupby('target').groups)

# Make the first 500 tag names available across the cluster; `with`
# guarantees the file is closed even if a read raises (the original
# relied on a manual close after the comprehension).
with open('lastfm_unique_tags.txt', 'r') as tagFile:
    tags = [tagstr[0] for tagstr in map(lambda ts: ts.split('\t'),
                                        [next(tagFile) for x in xrange(500)])]
tagDictionary = sc.broadcast(tags)
######## Functions for feature extraction #########
# make a "vector" with indices corresponding to values in
# tagDictionary
def getTagVector(track):
return {tagDictionary.value[tag]:1 for [tag, f] in track.tags
示例6: values
# 需要导入模块: from pyspark.context import SparkContext [as 别名]
# 或者: from pyspark.context.SparkContext import broadcast [as 别名]
# values(): drop the keys of a pair RDD, keeping only the values.
m = sc.parallelize([(1, 2), (3, 4)]).values()
m.collect()
# variance() of a numeric RDD.
sc.parallelize([1, 2, 3]).variance()
# zip(other): pair up elements of two RDDs positionally.
x = sc.parallelize(range(0,5))
y = sc.parallelize(range(1000, 1005))
x.zip(y).collect()
# zipWithIndex(): attach each element's global index.
sc.parallelize(["a", "b", "c", "d"], 3).zipWithIndex().collect()
# zipWithUniqueId(): attach ids that are unique per element (presumably
# not contiguous across partitions — see the RDD API docs).
sc.parallelize(["a", "b", "c", "d", "e"], 3).zipWithUniqueId().collect()
### BROADCAST
from pyspark.context import SparkContext
sc = SparkContext('local', 'test')
# A broadcast value is shipped to each executor once and read via .value.
b = sc.broadcast([1, 2, 3, 4, 5])
b.value
sc.parallelize([0, 0]).flatMap(lambda x: b.value).collect()
# Release the executor-side copies once the value is no longer needed.
b.unpersist()
large_broadcast = sc.broadcast(range(10000))
示例7: MainApp
# 需要导入模块: from pyspark.context import SparkContext [as 别名]
# 或者: from pyspark.context.SparkContext import broadcast [as 别名]
class MainApp(object):
def __init__(self):
    """No state is created here; call init() to set up Spark contexts."""
def init(self):
    """Create a local SparkContext and SQLContext for this app."""
    os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
    # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
    # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
    # SparkConf setters are fluent, so the configuration can be chained.
    conf = SparkConf().setMaster("local").setAppName("PySparkShell")
    conf.set("spark.executor.memory", "2g")
    # conf.set("spark.driver.memory", "1g")
    self.sc = SparkContext(conf=conf)
    self.sqlContext = SQLContext(self.sc)
def loadData(self):
    """Read the Yelp review/business JSON dumps and register temp tables."""
    base = "../yelp_dataset_challenge_academic_dataset/"
    # For EMR runs swap in the S3 copies instead, e.g.
    # s3n://ds-emr-spark/data/yelp_academic_dataset_review.json
    # s3n://ds-emr-spark/data/yelp_academic_dataset_business.json
    self.df_review = self.sqlContext.read.json(
        base + "yelp_academic_dataset_review.json").cache()
    self.df_business = self.sqlContext.read.json(
        base + "yelp_academic_dataset_business.json").cache()
    self.df_review.registerTempTable("reviews")
    self.df_business.registerTempTable("business")
def createCheckInDataPerUser(self):
    """Attach per-user location and cluster-center columns, saving each stage as JSON."""
    review_user = self.sqlContext.sql("SELECT business_id, user_id FROM reviews")
    business_loc = self.sqlContext.sql("SELECT business_id, latitude, longitude FROM business")
    review_user.registerTempTable("reviews_user")
    business_loc.registerTempTable("business_loc")
    # Join reviews to business coordinates: one (user, lat, lon) row per review.
    self.df_join_reviewAndBusiness = self.sqlContext.sql(
        "SELECT r.user_id, b.latitude, b.longitude FROM reviews_user r JOIN business_loc b ON r.business_id = b.business_id"
    ).cache()
    self.df_join_reviewAndBusiness.registerTempTable("userBusiness")
    # Restricted to a single hard-coded user id, presumably for debugging —
    # TODO confirm before running at scale.
    self.df_unique_users = self.sqlContext.sql(
        'SELECT DISTINCT user_id FROM userBusiness where user_id = "SIfJLNMv7vBwo-fSipxNgg"'
    )
    self.df_unique_users.registerTempTable("users")
    # Renamed from `pd`, which shadowed the conventional pandas alias.
    joined_pd = self.df_join_reviewAndBusiness.toPandas()
    # NOTE(review): broadcasting a whole pandas frame can be very large.
    global_db = self.sc.broadcast(joined_pd)
    schema = StructType([StructField("latitude", FloatType()), StructField("longitude", FloatType())])
    # Bind the broadcast frame into the UDF via functools.partial.
    partialFunc = partial(getLocationsOfUser, business_db=global_db.value)
    self.get_locations = udf(partialFunc, ArrayType(schema))
    self.get_centers = udf(getCentersOfUser, ArrayType(schema))
    self.df_unique_users = self.df_unique_users.withColumn(
        "user_locations", self.get_locations(self.df_unique_users["user_id"])
    )
    self.df_unique_users.registerTempTable("users")
    self.df_unique_users.repartition(1).write.save("user.json", "json", "overwrite")
    print(getCentersOfUser(self.df_unique_users.toPandas().iloc[0]["user_locations"]))
    self.df_unique_users = self.df_unique_users.withColumn(
        "user_centers", self.get_centers(self.df_unique_users["user_locations"])
    )
    self.df_unique_users.registerTempTable("users")
    self.df_unique_users.repartition(1).write.save("center.json", "json", "overwrite")
    self.df_unique_users.show()
def distanceCalc(self):
# Reload the user-location frame written by createCheckInDataPerUser.
# NOTE(review): the part-file name is run-specific (contains a job UUID),
# so this path is fragile — confirm it is regenerated per run.
self.df_unique_users = self.sqlContext.read.json(
"user.json/part-r-00000-23a1b514-f5fe-4f61-9a64-01ebbc88c146"
).cache()
# Print how many centers the first user's location list produces.
print(len(getCentersOfUser(self.df_unique_users.toPandas().iloc[0]["user_locations"])))