

Python SparkContext.broadcast Method Code Examples

This article collects typical usage examples of the pyspark.context.SparkContext.broadcast method in Python. If you are wondering what SparkContext.broadcast does, how to call it, or what real code that uses it looks like, the curated examples below should help. You can also explore further usage examples of pyspark.context.SparkContext, the class this method belongs to.


Seven code examples of SparkContext.broadcast are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
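
For readers new to the API: SparkContext.broadcast(value) ships a read-only copy of value to each executor once, and tasks read it through Broadcast.value instead of having the object re-serialized with every closure. Before the examples, here is a minimal, self-contained sketch of that flow (the lookup table and local master are invented for illustration, not taken from the examples below):

from pyspark import SparkContext

sc = SparkContext("local[2]", "broadcast-sketch")

# Hypothetical lookup table; broadcasting ships it to the executors once.
country_codes = {"US": "United States", "DE": "Germany", "IN": "India"}
b_codes = sc.broadcast(country_codes)

rdd = sc.parallelize(["US", "IN", "US", "DE"])
# Tasks read the shared value through .value
named = rdd.map(lambda code: b_codes.value.get(code, "unknown")).collect()
print(named)  # ['United States', 'India', 'United States', 'Germany']

b_codes.unpersist()  # release executor-side copies when no longer needed
sc.stop()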

Example 1: TestRDDFunctions

# Required import: from pyspark.context import SparkContext [as alias]
# Or: from pyspark.context.SparkContext import broadcast [as alias]
class TestRDDFunctions(PySparkTestCase):

    def test_failed_sparkcontext_creation(self):
        # Regression test for SPARK-1550
        self.sc.stop()
        self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name"))
        self.sc = SparkContext("local")

    def test_save_as_textfile_with_unicode(self):
        # Regression test for SPARK-970
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsTextFile(tempFile.name)
        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))

    def test_transforming_cartesian_result(self):
        # Regression test for SPARK-1034
        rdd1 = self.sc.parallelize([1, 2])
        rdd2 = self.sc.parallelize([3, 4])
        cart = rdd1.cartesian(rdd2)
        result = cart.map(lambda (x, y): x + y).collect()

    def test_transforming_pickle_file(self):
        # Regression test for SPARK-2601
        data = self.sc.parallelize(["Hello", "World!"])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsPickleFile(tempFile.name)
        pickled_file = self.sc.pickleFile(tempFile.name)
        pickled_file.map(lambda x: x).collect()

    def test_cartesian_on_textfile(self):
        # Regression test for
        path = os.path.join(SPARK_HOME, "python/test_support/hello.txt")
        a = self.sc.textFile(path)
        result = a.cartesian(a).collect()
        (x, y) = result[0]
        self.assertEqual("Hello World!", x.strip())
        self.assertEqual("Hello World!", y.strip())

    def test_deleting_input_files(self):
        # Regression test for SPARK-1025
        tempFile = tempfile.NamedTemporaryFile(delete=False)
        tempFile.write("Hello World!")
        tempFile.close()
        data = self.sc.textFile(tempFile.name)
        filtered_data = data.filter(lambda x: True)
        self.assertEqual(1, filtered_data.count())
        os.unlink(tempFile.name)
        self.assertRaises(Exception, lambda: filtered_data.count())

    def testAggregateByKey(self):
        data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2)

        def seqOp(x, y):
            x.add(y)
            return x

        def combOp(x, y):
            x |= y
            return x

        sets = dict(data.aggregateByKey(set(), seqOp, combOp).collect())
        self.assertEqual(3, len(sets))
        self.assertEqual(set([1]), sets[1])
        self.assertEqual(set([2]), sets[3])
        self.assertEqual(set([1, 3]), sets[5])

    def test_itemgetter(self):
        rdd = self.sc.parallelize([range(10)])
        from operator import itemgetter
        self.assertEqual([1], rdd.map(itemgetter(1)).collect())
        self.assertEqual([(2, 3)], rdd.map(itemgetter(2, 3)).collect())

    def test_namedtuple_in_rdd(self):
        from collections import namedtuple
        Person = namedtuple("Person", "id firstName lastName")
        jon = Person(1, "Jon", "Doe")
        jane = Person(2, "Jane", "Doe")
        theDoes = self.sc.parallelize([jon, jane])
        self.assertEquals([jon, jane], theDoes.collect())

    def test_large_broadcast(self):
        N = 100000
        data = [[float(i) for i in range(300)] for i in range(N)]
        bdata = self.sc.broadcast(data)  # 270MB
        m = self.sc.parallelize(range(1), 1).map(lambda x: len(bdata.value)).sum()
        self.assertEquals(N, m)
Author: ArchangelSeraphim, Project: spark, Lines: 93, Source: tests.py

Example 2: TestRDDFunctions

# Required import: from pyspark.context import SparkContext [as alias]
# Or: from pyspark.context.SparkContext import broadcast [as alias]
class TestRDDFunctions(PySparkTestCase):

    def test_failed_sparkcontext_creation(self):
        # Regression test for SPARK-1550
        self.sc.stop()
        self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name"))
        self.sc = SparkContext("local")

    def test_save_as_textfile_with_unicode(self):
        # Regression test for SPARK-970
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsTextFile(tempFile.name)
        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))

    def test_save_as_textfile_with_utf8(self):
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x.encode("utf-8")])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsTextFile(tempFile.name)
        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))

    def test_transforming_cartesian_result(self):
        # Regression test for SPARK-1034
        rdd1 = self.sc.parallelize([1, 2])
        rdd2 = self.sc.parallelize([3, 4])
        cart = rdd1.cartesian(rdd2)
        result = cart.map(lambda (x, y): x + y).collect()

    def test_transforming_pickle_file(self):
        # Regression test for SPARK-2601
        data = self.sc.parallelize(["Hello", "World!"])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsPickleFile(tempFile.name)
        pickled_file = self.sc.pickleFile(tempFile.name)
        pickled_file.map(lambda x: x).collect()

    def test_cartesian_on_textfile(self):
        # Regression test for
        path = os.path.join(SPARK_HOME, "python/test_support/hello.txt")
        a = self.sc.textFile(path)
        result = a.cartesian(a).collect()
        (x, y) = result[0]
        self.assertEqual("Hello World!", x.strip())
        self.assertEqual("Hello World!", y.strip())

    def test_deleting_input_files(self):
        # Regression test for SPARK-1025
        tempFile = tempfile.NamedTemporaryFile(delete=False)
        tempFile.write("Hello World!")
        tempFile.close()
        data = self.sc.textFile(tempFile.name)
        filtered_data = data.filter(lambda x: True)
        self.assertEqual(1, filtered_data.count())
        os.unlink(tempFile.name)
        self.assertRaises(Exception, lambda: filtered_data.count())

    def testAggregateByKey(self):
        data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2)

        def seqOp(x, y):
            x.add(y)
            return x

        def combOp(x, y):
            x |= y
            return x

        sets = dict(data.aggregateByKey(set(), seqOp, combOp).collect())
        self.assertEqual(3, len(sets))
        self.assertEqual(set([1]), sets[1])
        self.assertEqual(set([2]), sets[3])
        self.assertEqual(set([1, 3]), sets[5])

    def test_itemgetter(self):
        rdd = self.sc.parallelize([range(10)])
        from operator import itemgetter
        self.assertEqual([1], rdd.map(itemgetter(1)).collect())
        self.assertEqual([(2, 3)], rdd.map(itemgetter(2, 3)).collect())

    def test_namedtuple_in_rdd(self):
        from collections import namedtuple
        Person = namedtuple("Person", "id firstName lastName")
        jon = Person(1, "Jon", "Doe")
        jane = Person(2, "Jane", "Doe")
        theDoes = self.sc.parallelize([jon, jane])
        self.assertEquals([jon, jane], theDoes.collect())

    def test_large_broadcast(self):
        N = 100000
        data = [[float(i) for i in range(300)] for i in range(N)]
        bdata = self.sc.broadcast(data)  # 270MB
        m = self.sc.parallelize(range(1), 1).map(lambda x: len(bdata.value)).sum()
        self.assertEquals(N, m)
#......... part of the code omitted here .........
Author: CodEnFisH, Project: cogngin, Lines: 103, Source: tests.py

Example 3: processTrainData

# Required import: from pyspark.context import SparkContext [as alias]
# Or: from pyspark.context.SparkContext import broadcast [as alias]
# parse raw user artist data
userArtistDataFile = filePath + 'user_artist_data.txt'
rawUserArtistData = sc.textFile(userArtistDataFile)

# parse Artist data file
artistDataFile = filePath + 'artist_data.txt'
rawArtistData = sc.textFile(artistDataFile)
artistById = rawArtistData.map(parseArtistByIdData).filter(lambda (k, v) : k != -1)

# parse artist alias file
artistAliasDataFile = filePath + 'artist_alias.txt'
rawArtistAliasData = sc.textFile(artistAliasDataFile)
artistAlias = rawArtistAliasData.map(parseArtistAliasData).filter(lambda (k, v) : k != -1).collectAsMap()

# broadcast variable
bArtistAlias = sc.broadcast(artistAlias)


def processTrainData(line):
    (userId, artistId, count) = map(int, line.split(' '))
    
    artistAliasId = bArtistAlias.value.get(artistId)
    if artistAliasId == None: 
        artistAliasId = artistId
    return Rating(userId, artistAliasId, count)

trainData = rawUserArtistData.map(processTrainData).cache()

model = ALS.trainImplicit(trainData, 10)
print model.productFeatures()
Author: piyjoshi, Project: machine_learning, Lines: 32, Source: audio_recommender.py
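
The core of processTrainData is a map-side lookup (sometimes called a broadcast join): the small artist-alias table is collected to the driver with collectAsMap(), broadcast once, and each task resolves aliases locally instead of shuffling both datasets for a join. A generic sketch of the same idea, with made-up IDs standing in for the Audioscrobbler files, might look like this:

from pyspark import SparkContext

sc = SparkContext("local[2]", "broadcast-join-sketch")

# Small dimension table: alias id -> canonical id (made-up values).
alias_map = sc.parallelize([(2, 1), (5, 4)]).collectAsMap()
b_alias = sc.broadcast(alias_map)

# Larger fact data: (user, item, count).
plays = sc.parallelize([(10, 2, 3), (11, 1, 7), (12, 5, 1)])

# Resolve aliases locally on each executor; no shuffle is needed.
resolved = plays.map(lambda rec: (rec[0], b_alias.value.get(rec[1], rec[1]), rec[2]))
print(resolved.collect())  # [(10, 1, 3), (11, 1, 7), (12, 4, 1)]
sc.stop()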

Example 4: SparkContext

# Required import: from pyspark.context import SparkContext [as alias]
# Or: from pyspark.context.SparkContext import broadcast [as alias]
from pyspark.sql.functions import udf, col
from pyspark.sql.types import *
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.mllib.linalg import SparseVector, DenseVector

sc = SparkContext(appName='sparking_your_interest')
SQLContext = HiveContext(sc)

speech_stopwords_list = list([line.strip() for line in open('speech_stopwords.txt', 'r')])
speech_stopwords_broadcasted = sc.broadcast(speech_stopwords_list)
nltk_stopwords = set(stopwords.words('english'))
nltk_stopwords_broadcasted = sc.broadcast(nltk_stopwords)
more_stopwords = set([line.strip() for line in open('more_stopwords.txt', 'r')])
more_stopwords_broadcasted = sc.broadcast(more_stopwords)

def clean_up(s):
    text_removing_brackets = re.sub("[\(\[].*?[\)\]]", "", s)
    text_removing_double_quotes = re.sub('"',"",text_removing_brackets)
    speech_stopwords = speech_stopwords_broadcasted.value
    text_removing_stopwords = text_removing_double_quotes
    for token in speech_stopwords:
        text_removing_stopwords = re.sub(token,'',text_removing_stopwords)
    return text_removing_stopwords

def unicode_encode(s):
Author: vikaasa, Project: Spark_Workshop, Lines: 33, Source: sparking_your_interest.py
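
This snippet is an excerpt: the imports for SparkContext, HiveContext, re, and NLTK's stopwords live in the omitted part of the file, and unicode_encode is cut off. The underlying pattern, broadcasting stopword collections once and reading them inside a function that is later wrapped as a UDF, can be sketched on its own roughly like this (the SparkSession API and the tiny stopword set are assumptions, not part of the original project):

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

spark = SparkSession.builder.master("local[2]").appName("stopword-udf-sketch").getOrCreate()
sc = spark.sparkContext

# Assumed stopword set; broadcasting avoids re-shipping it with every task.
b_stop = sc.broadcast({"the", "a", "and", "of"})

def strip_stopwords(text):
    # Drop broadcast stopwords from a free-text column.
    if text is None:
        return None
    return " ".join(w for w in text.split() if w.lower() not in b_stop.value)

strip_stopwords_udf = udf(strip_stopwords, StringType())

df = spark.createDataFrame([("the text of a speech",)], ["speech"])
df.select(strip_stopwords_udf(col("speech")).alias("cleaned")).show()
spark.stop()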

Example 5: SparkContext

# Required import: from pyspark.context import SparkContext [as alias]
# Or: from pyspark.context.SparkContext import broadcast [as alias]
######### Global variables ######### (gross)
# The following variables are broadcast to the spark
# cluster and can be used in the functions below
songTable = 'song_data'
sc = SparkContext('local[*]', 'lastfm_recommender')
sqlContext = SQLContext(sc)

### Set up database connections for metadata and similar artists
### This is starting to get really ugly.
### broadcasting this data is probably not a good idea
artist_engine = create_engine('sqlite:///'+sys.argv[1])
sims = pd.read_sql_query(
    'SELECT * FROM similarity', artist_engine)
# broadcasting these variables is probably a bad idea since
# they are quite big
similars = sc.broadcast(sims.similar)
similar_groups = sc.broadcast(sims.groupby('target').groups)

tagFile = open('lastfm_unique_tags.txt', 'r')
# make tag dictionary available across the cluster.
tags = [tagstr[0] for tagstr in map(lambda ts: ts.split('\t'),
                                    [next(tagFile) for x in xrange(500)])]
tagDictionary = sc.broadcast(tags)
tagFile.close()

######## Functions for feature extraction #########

# make a "vector" with indices corresoinding to values in 
# tagDictionary
def getTagVector(track):
    return {tagDictionary.value[tag]:1 for [tag, f] in track.tags
Author: xysmas, Project: million_songs, Lines: 33, Source: cbRecommender.py
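
The excerpt above is also cut off mid-expression, but the intent of tagDictionary is clear: broadcast a tag vocabulary once so that every task can translate tag identifiers into feature indices. A standalone sketch of that vocabulary-broadcast pattern, with an invented vocabulary in place of lastfm_unique_tags.txt (the original reads 500 tag names from that file), might be:

from pyspark import SparkContext

sc = SparkContext("local[2]", "tag-vocab-sketch")

# Invented vocabulary; broadcast a name -> index mapping built from it.
vocab = ["rock", "pop", "jazz", "metal"]
tag_index = sc.broadcast({name: i for i, name in enumerate(vocab)})

# (track_id, tag names) pairs, also invented for the sketch.
tracks = sc.parallelize([("t1", ["rock", "jazz"]), ("t2", ["pop"])])

def tag_vector(track):
    track_id, tags = track
    # Sparse "one-hot" dict keyed by vocabulary index.
    return track_id, {tag_index.value[t]: 1 for t in tags if t in tag_index.value}

print(tracks.map(tag_vector).collect())
sc.stop()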

示例6: values

# Required import: from pyspark.context import SparkContext [as alias]
# Or: from pyspark.context.SparkContext import broadcast [as alias]
# values()
m = sc.parallelize([(1, 2), (3, 4)]).values()
m.collect()

# variance()
sc.parallelize([1, 2, 3]).variance()

# zip(other)
x = sc.parallelize(range(0,5))
y = sc.parallelize(range(1000, 1005))
x.zip(y).collect()

# zipWithIndex()
sc.parallelize(["a", "b", "c", "d"], 3).zipWithIndex().collect()

# zipWithUniqueId()
sc.parallelize(["a", "b", "c", "d", "e"], 3).zipWithUniqueId().collect()


### BROADCAST
from pyspark.context import SparkContext
sc = SparkContext('local', 'test')
b = sc.broadcast([1, 2, 3, 4, 5])
b.value
sc.parallelize([0, 0]).flatMap(lambda x: b.value).collect()
b.unpersist()

large_broadcast = sc.broadcast(range(10000))


Author: DeepakSinghRawat, Project: Tutorials, Lines: 30, Source: Spark_tut.py

Example 7: MainApp

# Required import: from pyspark.context import SparkContext [as alias]
# Or: from pyspark.context.SparkContext import broadcast [as alias]
class MainApp(object):
    def __init__(self):
        pass

    def init(self):
        os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
        # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
        # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
        conf = SparkConf()
        conf.setMaster("local")
        conf.setAppName("PySparkShell")
        conf.set("spark.executor.memory", "2g")
        # conf.set("spark.driver.memory", "1g")
        self.sc = SparkContext(conf=conf)
        self.sqlContext = SQLContext(self.sc)

    def loadData(self):
        self.df_review = self.sqlContext.read.json(
            "../yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json"
        ).cache()
        # self.df_review = self.sqlContext.read.json("s3n://ds-emr-spark/data/yelp_academic_dataset_review.json").cache()
        self.df_business = self.sqlContext.read.json(
            "../yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json"
        ).cache()
        # self.df_business = self.sqlContext.read.json("s3n://ds-emr-spark/data/yelp_academic_dataset_business.json").cache()
        self.df_review.registerTempTable("reviews")
        self.df_business.registerTempTable("business")

    def createCheckInDataPerUser(self):
        review_user = self.sqlContext.sql("SELECT business_id, user_id FROM reviews")
        business_loc = self.sqlContext.sql("SELECT business_id, latitude, longitude FROM business")
        review_user.registerTempTable("reviews_user")
        business_loc.registerTempTable("business_loc")

        self.df_join_reviewAndBusiness = self.sqlContext.sql(
            "SELECT r.user_id, b.latitude, b.longitude FROM reviews_user r JOIN business_loc b ON r.business_id = b.business_id"
        ).cache()
        self.df_join_reviewAndBusiness.registerTempTable("userBusiness")

        self.df_unique_users = self.sqlContext.sql(
            'SELECT DISTINCT user_id FROM userBusiness where user_id = "SIfJLNMv7vBwo-fSipxNgg"'
        )
        self.df_unique_users.registerTempTable("users")

        pd = self.df_join_reviewAndBusiness.toPandas()
        global_db = self.sc.broadcast(pd)

        schema = StructType([StructField("latitude", FloatType()), StructField("longitude", FloatType())])
        partialFunc = partial(getLocationsOfUser, business_db=global_db.value)

        self.get_locations = udf(partialFunc, ArrayType(schema))
        self.get_centers = udf(getCentersOfUser, ArrayType(schema))

        self.df_unique_users = self.df_unique_users.withColumn(
            "user_locations", self.get_locations(self.df_unique_users["user_id"])
        )
        self.df_unique_users.registerTempTable("users")

        self.df_unique_users.repartition(1).write.save("user.json", "json", "overwrite")

        print(getCentersOfUser(self.df_unique_users.toPandas().iloc[0]["user_locations"]))

        self.df_unique_users = self.df_unique_users.withColumn(
            "user_centers", self.get_centers(self.df_unique_users["user_locations"])
        )
        self.df_unique_users.registerTempTable("users")

        self.df_unique_users.repartition(1).write.save("center.json", "json", "overwrite")
        self.df_unique_users.show()

    def distanceCalc(self):
        self.df_unique_users = self.sqlContext.read.json(
            "user.json/part-r-00000-23a1b514-f5fe-4f61-9a64-01ebbc88c146"
        ).cache()
        print(len(getCentersOfUser(self.df_unique_users.toPandas().iloc[0]["user_locations"])))
Author: abhinavrungta, Project: Yelp-Challenge, Lines: 77, Source: affinity.py
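
Example 7 broadcasts an entire pandas DataFrame (global_db = self.sc.broadcast(pd)) so a per-user lookup function can filter it locally; note that it then passes global_db.value into functools.partial on the driver, which binds the DataFrame into the UDF's closure rather than having executors read it from the broadcast. A reduced, self-contained sketch of the lookup-by-broadcast pattern itself (made-up check-in data, and a plain RDD map instead of a UDF) could be:

import pandas as pd
from pyspark import SparkContext

sc = SparkContext("local[2]", "pandas-broadcast-sketch")

# Made-up user/location table standing in for the Yelp review-business join.
checkins = pd.DataFrame({
    "user_id": ["u1", "u1", "u2"],
    "latitude": [40.7, 40.8, 34.0],
    "longitude": [-74.0, -73.9, -118.2],
})
b_checkins = sc.broadcast(checkins)

def locations_of(user_id):
    df = b_checkins.value  # read the broadcast copy on the executor
    rows = df[df["user_id"] == user_id]
    return [(float(lat), float(lon)) for lat, lon in zip(rows["latitude"], rows["longitude"])]

print(sc.parallelize(["u1", "u2"]).map(locations_of).collect())
sc.stop()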


Note: The pyspark.context.SparkContext.broadcast method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers, and copyright of the source code remains with the original authors. For distribution and use, please refer to the corresponding project's license; do not republish without permission.