

Python SparkContext.setCheckpointDir Method Code Examples

This article collects typical usage examples of the pyspark.SparkContext.setCheckpointDir method in Python. If you are wondering what SparkContext.setCheckpointDir does, how to call it, or want to see concrete examples, the curated code samples below should help. You can also explore further usage examples of the enclosing class, pyspark.SparkContext.


The following presents 12 code examples of SparkContext.setCheckpointDir, sorted by popularity by default.
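Before the individual examples, here is a minimal, self-contained sketch of the basic pattern (the local[2] master and the /tmp/spark-checkpoints directory are illustrative assumptions): setCheckpointDir must be called before checkpoint() is invoked on any RDD, and on a cluster the directory should live on a shared filesystem such as HDFS, as several of the examples below do.

from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local[2]").setAppName("CheckpointDemo")
sc = SparkContext(conf=conf)

# Must be set before any RDD is checkpointed; on a cluster use an HDFS path instead.
sc.setCheckpointDir("/tmp/spark-checkpoints")

rdd = sc.parallelize(range(10)).map(lambda x: x * x)
rdd.checkpoint()              # mark the RDD for checkpointing
rdd.count()                   # an action materializes the RDD and writes the checkpoint
print(rdd.isCheckpointed())   # True once the checkpoint has been written

sc.stop()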

Example 1: spark_context

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import setCheckpointDir [as alias]
def spark_context(request):
    """ fixture for creating a spark context
    Args:
    request: pytest.FixtureRequest object
    """
    conf = (SparkConf().setMaster("local[2]").setAppName("SparkTest"))
    sc = SparkContext(conf=conf)
    sc.setCheckpointDir('checkpoint')  # avoid a StackOverflowError from long RDD lineages
    request.addfinalizer(lambda: sc.stop())

    quiet_py4j()
    return sc
Developer: xiaohan2012, Project: snpp, Lines: 14, Source: conftest.py
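For context, here is a hedged sketch of how a fixture like this is typically declared and consumed in pytest (the @pytest.fixture decorator and the test function are assumptions added for illustration; quiet_py4j is omitted because it is defined elsewhere in the same conftest.py):

import pytest
from pyspark import SparkConf, SparkContext

@pytest.fixture(scope="session")
def spark_context(request):
    conf = SparkConf().setMaster("local[2]").setAppName("SparkTest")
    sc = SparkContext(conf=conf)
    sc.setCheckpointDir('checkpoint')  # avoid a StackOverflowError from long RDD lineages
    request.addfinalizer(lambda: sc.stop())
    return sc

# A test receives the fixture simply by naming it as a parameter:
def test_count(spark_context):
    assert spark_context.parallelize([1, 2, 3]).count() == 3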

Example 2: computeRmse

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import setCheckpointDir [as alias]
def computeRmse(model, evalSet):
    evalSetUserProduct = evalSet.map(lambda x: (x[0], x[1]))
    predictions = model.predictAll(evalSetUserProduct).map(lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds = evalSet.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
    validationRmse = math.sqrt(ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    return validationRmse


if __name__ == "__main__":
    conf = SparkConf() \
      .setAppName("YelpReviewALS") \
      .set("spark.executor.memory", "1g")
    sc = SparkContext('local', conf=conf)

    reviewRDD = sc.textFile("../../../data/review_large.txt")
    sc.setCheckpointDir("checkpoints/")

    if len(sys.argv) < 2:
        printUsage()
        exit(1)

    if sys.argv[1] == '-e':
        evalModel = True
    else:
        evalModel = False

    if sys.argv[1] == '-c':
        runValidation = True
    else:
        runValidation = False
Developer: keyrrae, Project: w16paraucsb, Lines: 32, Source: alsRecommend.py
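A hedged usage sketch for computeRmse (the ratings RDD, the split ratio, and the ALS parameters are illustrative assumptions, not values taken from the original project): train an ALS model on one split of the ratings and evaluate it on the held-out split.

from pyspark.mllib.recommendation import ALS

# ratings is assumed to be an RDD of (user, product, rating) tuples, parsed as in the script above
training, validation = ratings.randomSplit([0.8, 0.2], seed=42)
model = ALS.train(training, rank=10, iterations=10, lambda_=0.01)
print("Validation RMSE = %.4f" % computeRmse(model, validation))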

Example 3: SparkConf

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import setCheckpointDir [as alias]
    ##### Main Execution Code
    conf = SparkConf().setAppName("Subreddit extraction")
    conf.set("spark.python.worker.memory","10g")
    conf.set("spark.driver.memory","15g")
    conf.set("spark.executor.memory","10g")
    conf.set("spark.default.parallelism", "12")
    conf.set("spark.mesos.coarse", "true")
    conf.set("spark.driver.maxResultSize", "10g")
    # Added the core limit to avoid resource allocation overruns
    conf.set("spark.cores.max", "10")
    conf.setMaster("mesos://zk://scc-culture-slave4.lancs.ac.uk:2181/mesos")
    conf.set("spark.executor.uri", "hdfs://scc-culture-mind.lancs.ac.uk/lib/spark-1.3.0-bin-hadoop2.4.tgz")
    conf.set("spark.broadcast.factory", "org.apache.spark.broadcast.TorrentBroadcastFactory")

    sc = SparkContext(conf=conf)
    sc.setCheckpointDir("hdfs://scc-culture-mind.lancs.ac.uk/data/checkpointing")

    # get the HDFS url of the dataset
    dataset = "reddit"
    hdfsUrl = inlocation

    # broadcast the name of the dataset to the cluster
    print("----Broadcasting the name of the dataset being processed")
    datasetName = sc.broadcast(dataset)

    # run a map-reduce job to first compile the RDD for the dataset loaded from the file
    print("-----Dataset file: " + hdfsUrl)
    rawPostsFile = sc.textFile(hdfsUrl, minPartitions=12)

    # clean the posts and write them into HDFS from their respective partitions
    print("Writing to HDFS")
Developer: mattroweshow, Project: NER-Diff-Paper, Lines: 33, Source: thinner-to-brown.py

Example 4: loadMovieNames

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import setCheckpointDir [as alias]
import sys
from pyspark import SparkConf, SparkContext
from pyspark.mllib.recommendation import ALS, Rating

def loadMovieNames():
    movieNames = {}
    with open("ml-100k/u.ITEM") as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1].decode('ascii', 'ignore')
    return movieNames

conf = SparkConf().setMaster("local[*]").setAppName("MovieRecommendationsALS")
sc = SparkContext(conf = conf)
sc.setCheckpointDir('checkpoint')

print "\nLoading movie names..."
nameDict = loadMovieNames()
# umadeup.data: a small sample file created on top of u.data (3 rows)
data = sc.textFile("C:/Users/seeth_000/UdemySpark/ml-100k/umadeup.data")

ratings = data.map(lambda l: l.split()).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()

# Build the recommendation model using Alternating Least Squares
print "\nTraining recommendation model..."
rank = 10
# Lowered numIterations to ensure it works on lower-end systems
numIterations = 6
model = ALS.train(ratings, rank, numIterations)

userID = int(sys.argv[1])
Developer: seethap, Project: 04Udemy_Spark, Lines: 33, Source: movie-recommendations-als.py

Example 5: SparkContext

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import setCheckpointDir [as alias]
import findspark  # this needs to be the first import
findspark.init()

import networkx as nx
from pyspark import SparkConf
from pyspark import SparkContext
from snpp.cores.lowrank import partition_graph


conf = (SparkConf().setMaster("local[2]").setAppName("SparkTest"))
sc = SparkContext(conf=conf)
sc.setCheckpointDir('checkpoint')  # avoid a StackOverflowError from long RDD lineages

train_graph_path = 'data/{}/train_graph.pkl'.format('slashdot')
g = nx.read_gpickle(train_graph_path)
partition_graph(g, k=40, sc=sc,
                lambda_=0.1,
                iterations=20,
                seed=123456)

sc.stop()
Developer: xiaohan2012, Project: snpp, Lines: 23, Source: profile_lowrank_partitioning.py

Example 6: get_most_liked_courses

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import setCheckpointDir [as alias]

def get_most_liked_courses(n):
    """Get the top n courses with the most likes and ratings"""
    input = {
        'sort_mode': 'interesting',
        'count': n
    }
    return m.Course.search(params=input)[0]


def save_recommendations_to_mongo():
    log.info('Saving recommendations to database...')
    for user in m.User.objects:
        try:
            user.recommended_courses = engine.recommend_user(
                str(user.id),
                _PARAMS['num_courses'])
            user.save()
        except Exception as e:
            log.error(e)

if __name__ == '__main__':
    mongoengine.connect(c.MONGO_DB_RMC)
    sc = SparkContext()
    sc.setCheckpointDir('data/recommendation/checkpoint/')
    engine = RecommendationEngine(sc)
    engine.train()
    engine.load_data()
    save_recommendations_to_mongo()
Developer: JGulbronson, Project: rmc, Lines: 31, Source: engine.py

Example 7: computeExposureGraphForTop500Entities

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import setCheckpointDir [as alias]

#......... part of this code is omitted here .........
                                # count how many times the user was exposed to the entity (given that they had interacted with those users beforehand)
                                for prior_author in prior_post_authors:
                                    if prior_author in prior_users:
                                        exposure_count += 1

                                # log the exposure count
                                if exposure_count in activation_points:
                                    activation_points[exposure_count] += 1
                                else:
                                    activation_points[exposure_count] = 1

                                # Exit the posts dates loop
                                activated_users.add(user)
                                break

        # Return the mapping between the entity and the activation point distribution
        return (entity, activation_points)


    ###### Execution code
    conf = SparkConf().setAppName("NER Diffusion - Exposure Dynamics")
    conf.set("spark.python.worker.memory","10g")
    conf.set("spark.driver.memory","15g")
    conf.set("spark.executor.memory","10g")
    conf.set("spark.default.parallelism", "12")
    conf.set("spark.mesos.coarse", "true")
    conf.set("spark.driver.maxResultSize", "10g")
    conf.set("spark.cores.max", "15")
    conf.setMaster("mesos://zk://scc-culture-slave9.lancs.ac.uk:2181/mesos")
    conf.set("spark.executor.uri", "hdfs://scc-culture-mind.lancs.ac.uk/lib/spark-1.3.0-bin-hadoop2.4.tgz")
    conf.set("spark.broadcast.factory", "org.apache.spark.broadcast.TorrentBroadcastFactory")

    sc = SparkContext(conf=conf)
    sc.setCheckpointDir("hdfs://scc-culture-mind.lancs.ac.uk/data/checkpointing")

    # use sample directory for testing
    # annotationFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/annotated-sample")
    annotationFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/annotated")
    annotationFile.cache()
    thinnedFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/thinned-json")
    # thinnedFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/thinned-json-sample")
    thinnedFile.cache()

    # Top 500 entities file
    top500EntitiesFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/entities/top500_entities.csv")
    top500Entities = top500EntitiesFile.map(lambda x: str(x.encode("utf-8"))).collect()
    print("Top Entities Loaded. Total = " + str(len(top500Entities)))
    top_500_entities_broadcast = sc.broadcast(top500Entities)
    # print(str(len(top500Entities)))
    # print(top500Entities)

    # Load the reply graphs from the thinnedFile
    reply_map_rdd = thinnedFile\
        .flatMap(deriveReplyMap)
    reply_map_rdd.cache()

    reply_orig_map = reply_map_rdd\
        .collectAsMap()
    print("Reply Orig Map Size = " + str(len(reply_orig_map)))
    # print(reply_orig_map)
    # TODO: Convert code to HBase format as this fails to broadcast with Java heap space memory error
    reply_orig_map_broadcast = sc.broadcast(reply_orig_map)

    # # get the: {orig, [reply]} dictionary
    orig_replies_map = reply_map_rdd\
        .map(lambda x: (x[1], [x[0]]))\
Developer: mattroweshow, Project: NER-Diff-Paper, Lines: 70, Source: exposure_dynamics.py

Example 8: SparkContext

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import setCheckpointDir [as alias]
    location = "hdfs"
    try:
        if "avatar" in platform.node():
            location = "local"
    except:
        pass
    try:
        if "avatar" in socket.gethostname():
            location = "local"
    except:
        pass
    print "### location %s" % location

    sc = SparkContext(appName="CRF")
    sc.setCheckpointDir("/tmp")
    year = 2015
    mode = sys.argv[1]
    tag = sys.argv[2]
    month = int(sys.argv[3])
    day = int(sys.argv[4])
    hour = int(sys.argv[5])
    partNum = None
    try:
        partNum = int(sys.argv[6])
    except:
        pass
    limit = None
    try:
        limit = int(sys.argv[7])
    except:
Developer: cjsanjay, Project: dig-crf, Lines: 32, Source: mergedriver.py

Example 9: computeEntityTSData

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import setCheckpointDir [as alias]
def computeEntityTSData():

    ##### Spark Functions
    # input: json_line
    # [(entity: {date, count})]
    def deriveEntityToDate(json_line):
        entity_dates = []

        json_obj = json.loads(json_line)
        created_string = float(json_obj['created_utc'])
        created_date = datetime.fromtimestamp(created_string).date()
        annotations = json_obj['entity_texts']
        for annotation in annotations:
            date_count = {}
            date_count[created_date] = 1
            entity_dates.append((annotation, date_count))
        return entity_dates

    def combineDateCounts(date_counts1, date_counts2):
        date_counts = date_counts1
        for date in date_counts2:
            if date in date_counts:
                date_counts[date] += date_counts2[date]
            else:
                date_counts[date] = date_counts2[date]
        return date_counts


    ###### Execution code
    conf = SparkConf().setAppName("NER Diffusion - Exploratory Plots")
    conf.set("spark.python.worker.memory","10g")
    conf.set("spark.driver.memory","15g")
    conf.set("spark.executor.memory","10g")
    conf.set("spark.default.parallelism", "12")
    conf.set("spark.mesos.coarse", "true")
    conf.set("spark.driver.maxResultSize", "10g")
    # Added the core limit to avoid resource allocation overruns
    conf.set("spark.cores.max", "5")
    conf.setMaster("mesos://zk://scc-culture-slave9.lancs.ac.uk:2181/mesos")
    conf.set("spark.executor.uri", "hdfs://scc-culture-mind.lancs.ac.uk/lib/spark-1.3.0-bin-hadoop2.4.tgz")
    conf.set("spark.broadcast.factory", "org.apache.spark.broadcast.TorrentBroadcastFactory")

    sc = SparkContext(conf=conf)
    sc.setCheckpointDir("hdfs://scc-culture-mind.lancs.ac.uk/data/checkpointing")

    # use sample directory for testing
    # distFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/user/derczynskil/RC_2015-01")
    distFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/annotated")
    # Point to local file until data has finished uploading to HDFS
    # distFile = sc.textFile("/home/derczynskil/annotated/")
    distFile.cache()

    # Step 1: Derive the time-sensitive map of when entities appeared
    print("----Loading entity time-series")
    entity_citation_dates = distFile\
        .flatMap(deriveEntityToDate)\
        .reduceByKey(combineDateCounts)
    entity_citation_dates.cache()
    # print(entity_citation_dates.collect())

    print("----Deriving the count of entity citations")
    entity_citation_counts = entity_citation_dates\
        .map(lambda x: (x[0], len(x[1])))\
        .map(lambda x: (x[1], x[0]))\
        .sortByKey(False)\
        .map(lambda x: (x[1], x[0]))\
        .collect()

    # Write to local disk
    print("------Writing the output to a file")
    outputString = ""
    for (entity, count) in entity_citation_counts:
        outputString += str(entity.encode('utf-8')).replace("'", "") + "\t" + str(count) + "\n"
    # print(outputString)
    outputFile = open("data/entity_mention_frequencies.csv", "w")
    outputFile.write(outputString)
    outputFile.close()

    # Write the time-series output to local disk
    print("------Writing the ts output to a file")
    outputString = ""
    for (entity, date_to_count) in entity_citation_dates.collect():
        outputString += str(entity.encode('utf-8')).replace("'", "")
        for date in date_to_count:
            outputString += "\t" + str(date) + "|" + str(date_to_count[date])
        outputString += "\n"
    # print(outputString)
    outputFile = open("data/entity_mention_ts.csv", "w")
    outputFile.write(outputString)
    outputFile.close()

    # stop the Spark context from running
    sc.stop()
Developer: mattroweshow, Project: NER-Diff-Paper, Lines: 95, Source: cluster_exploratory_plots.py

Example 10: computeGlobalCascadeIsomorphicDistribution

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import setCheckpointDir [as alias]

#......... part of this code is omitted here .........
                    chain_posts.add(to_process_post)
                    # get the replies to this post
                    if to_process_post in orig_replies_map:
                        replies = orig_replies_map[to_process_post]
                        to_process += replies
                        for reply in replies:
                            chain.append(reply + "->" + to_process_post)
                    # get the post that this post relied to
                    if to_process_post in reply_orig_map:
                        orig_post = reply_orig_map[to_process_post]
                        to_process.append(orig_post)  # orig_post is a single post id, so append it rather than extending character by character
                        chain.append(to_process_post + "->" + orig_post)

                # log the chain for the entity
                entity_chains.append(chain)

        # Return the entity chains
        return (entity, entity_chains)

    ###### Execution code
    conf = SparkConf().setAppName("NER Diffusion - Cascade Pattern Mining")
    conf.set("spark.python.worker.memory","10g")
    conf.set("spark.driver.memory","15g")
    conf.set("spark.executor.memory","10g")
    conf.set("spark.default.parallelism", "12")
    conf.set("spark.mesos.coarse", "true")
    conf.set("spark.driver.maxResultSize", "10g")
    conf.set("spark.cores.max", "15")
    conf.setMaster("mesos://zk://scc-culture-slave9.lancs.ac.uk:2181/mesos")
    conf.set("spark.executor.uri", "hdfs://scc-culture-mind.lancs.ac.uk/lib/spark-1.3.0-bin-hadoop2.4.tgz")
    conf.set("spark.broadcast.factory", "org.apache.spark.broadcast.TorrentBroadcastFactory")

    sc = SparkContext(conf=conf)
    sc.setCheckpointDir("hdfs://scc-culture-mind.lancs.ac.uk/data/checkpointing")

    # use sample directory for testing
    annotationFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/annotated-sample")
    # annotationFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/annotated")
    annotationFile.cache()
    # thinnedFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/thinned-json")
    thinnedFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/thinned-json-sample")
    thinnedFile.cache()

    # Load the reply graphs from the thinnedFile
    print("Loading replies map")
    reply_map_rdd = thinnedFile\
        .flatMap(deriveReplyMap)

    # Collect as a map and broadcast this to the cluster
    reply_orig_map = reply_map_rdd\
        .collectAsMap()
    print("Reply Orig Map Size = " + str(len(reply_orig_map)))
    # print(reply_orig_map)
    reply_orig_map_broadcast = sc.broadcast(reply_orig_map)

    # # get the: {orig, [reply]} dictionary
    orig_replies_map = reply_map_rdd\
        .map(lambda x: (x[1], [x[0]]))\
        .reduceByKey(combineReplies)\
        .collectAsMap()
    print("Orig Replies Map Size = " + str(len(orig_replies_map)))
    orig_replies_map_broadcast = sc.broadcast(orig_replies_map)


    # Load the entity to post map
    # input: json_line of annotations of each post
Developer: mattroweshow, Project: NER-Diff-Paper, Lines: 70, Source: cascade_shape_mining.py

Example 11: int

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import setCheckpointDir [as alias]
        if error < min_error:
            min_error = error
            best_rank = rank
            best_lambda = lambda_i

print 'The best model was trained with rank %s, lambda %f' % (best_rank, best_lambda)

# Test 
model = ALS.train(training_RDD, best_rank, seed=seed, iterations=iterations,
                      lambda_=best_lambda)
predictions = model.predictAll(test_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
rates_and_preds = test_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())

print 'For testing data the RMSE is %s' % (error)
'''
# Using the complete dataset to build the final model; re-do the above
# Load the complete dataset file
complete_ratings_file = os.path.join('./datasets', 'ml-latest', 'ratings.csv')
complete_ratings_raw_data = sc.textFile(complete_ratings_file)
complete_ratings_raw_data_header = complete_ratings_raw_data.take(1)[0]
# Parse
complete_ratings_data = complete_ratings_raw_data.filter(lambda line: line!=complete_ratings_raw_data_header)\
    .map(lambda line: line.split(",")).map(lambda tokens: (int(tokens[0]),int(tokens[1]),float(tokens[2]))).cache()
print "There are %s recommendations in the complete dataset" % (complete_ratings_data.count())

# to avoid stack overflow caused by long RDD lineages during iterative ALS training
sc.setCheckpointDir('checkpoint/')

training_RDD, test_RDD = complete_ratings_data.randomSplit([7, 3], seed=0L)
complete_model = ALS.train(training_RDD, best_rank, seed=seed, 
Developer: Zhou42, Project: bt-hollywood, Lines: 33, Source: MovieRating.py

Example 12: init_spark_context

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import setCheckpointDir [as alias]
def init_spark_context():
    conf = SparkConf().setAppName("MovieRatings").set("spark.executor.memory", "4g")
    sc = SparkContext(conf=conf)
    sc.setCheckpointDir('/tmp/checkpoint/')
    return sc
Developer: MiguelPeralvo, Project: mongodb-spark, Lines: 7, Source: movie-recommendations.py


Note: The pyspark.SparkContext.setCheckpointDir examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are drawn from open-source projects contributed by various developers, and copyright of the source code belongs to the original authors. For distribution and use, please refer to the license of the corresponding project; do not reproduce without permission.