This article collects and summarizes typical usage examples of the Python method pyspark.SparkContext.setCheckpointDir. If you are wondering what SparkContext.setCheckpointDir does, how to call it, or want to see it in real code, the curated method examples below should help. You can also read further about the containing class, pyspark.SparkContext.
Shown below are 12 code examples of SparkContext.setCheckpointDir, sorted by popularity by default.
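Before the individual examples, here is a minimal sketch of the pattern they all build on: create a SparkContext, point setCheckpointDir at a directory, and checkpoint an RDD so its lineage is truncated, which is what prevents the StackOverflowError several examples mention during iterative jobs such as ALS training. The directory path and the toy RDD below are illustrative assumptions, not taken from any example on this page.

from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local[2]").setAppName("CheckpointDirDemo")
sc = SparkContext(conf=conf)
# assumed local path; on a real cluster this should be a reliable store such as HDFS
sc.setCheckpointDir("/tmp/spark-checkpoints")

rdd = sc.parallelize(range(10)).map(lambda x: x * x)
rdd.checkpoint()             # mark the RDD for checkpointing under the directory set above
rdd.count()                  # an action triggers the actual checkpoint write
print(rdd.isCheckpointed())  # True once the checkpoint has been materialised

sc.stop()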
Example 1: spark_context
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import setCheckpointDir [as alias]
def spark_context(request):
    """Fixture for creating a Spark context.

    Args:
        request: pytest.FixtureRequest object
    """
    conf = (SparkConf().setMaster("local[2]").setAppName("SparkTest"))
    sc = SparkContext(conf=conf)
    sc.setCheckpointDir('checkpoint')  # checkpointing avoids StackOverflowError on long lineages
    request.addfinalizer(lambda: sc.stop())
    quiet_py4j()
    return sc
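The fixture above also calls quiet_py4j(), which the snippet does not define. A common implementation simply raises the py4j gateway's log level so test output stays readable; the helper below is an assumed sketch, not part of the original example.

import logging

def quiet_py4j():
    # assumed helper: silence the chatty py4j gateway logger during tests
    logging.getLogger("py4j").setLevel(logging.WARN)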
Example 2: computeRmse
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import setCheckpointDir [as alias]
def computeRmse(model, evalSet):
    # predict a rating for every (user, product) pair in the evaluation set
    evalSetUserProduct = evalSet.map(lambda x: (x[0], x[1]))
    predictions = model.predictAll(evalSetUserProduct).map(lambda r: ((r[0], r[1]), r[2]))
    # join the actual ratings with the predictions and compute the root-mean-square error
    ratesAndPreds = evalSet.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
    validationRmse = math.sqrt(ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    return validationRmse
if __name__ == "__main__":
    conf = SparkConf() \
        .setAppName("YelpReviewALS") \
        .set("spark.executor.memory", "1g")
    sc = SparkContext('local', conf=conf)
    reviewRDD = sc.textFile("../../../data/review_large.txt")
    sc.setCheckpointDir("checkpoints/")
    if len(sys.argv) < 2:
        printUsage()
        exit(1)
    if sys.argv[1] == '-e':
        evalModel = True
    else:
        evalModel = False
    if sys.argv[1] == '-c':
        runValidation = True
    else:
        runValidation = False
Example 3: SparkConf
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import setCheckpointDir [as alias]
##### Main Execution Code
conf = SparkConf().setAppName("Subreddit extraction")
conf.set("spark.python.worker.memory","10g")
conf.set("spark.driver.memory","15g")
conf.set("spark.executor.memory","10g")
conf.set("spark.default.parallelism", "12")
conf.set("spark.mesos.coarse", "true")
conf.set("spark.driver.maxResultSize", "10g")
# Added the core limit to avoid resource allocation overruns
conf.set("spark.cores.max", "10")
conf.setMaster("mesos://zk://scc-culture-slave4.lancs.ac.uk:2181/mesos")
conf.set("spark.executor.uri", "hdfs://scc-culture-mind.lancs.ac.uk/lib/spark-1.3.0-bin-hadoop2.4.tgz")
conf.set("spark.broadcast.factory", "org.apache.spark.broadcast.TorrentBroadcastFactory")
sc = SparkContext(conf=conf)
sc.setCheckpointDir("hdfs://scc-culture-mind.lancs.ac.uk/data/checkpointing")
# get the HDFS url of the dataset
dataset = "reddit"
hdfsUrl = inlocation
# broadcast the name of the dataset to the cluster
print("----Broadcasting the name of the dataset being processed")
datasetName = sc.broadcast(dataset)
# run a map-reduce job to first compile the RDD for the dataset loaded from the file
print("-----Dataset file: " + hdfsUrl)
rawPostsFile = sc.textFile(hdfsUrl, minPartitions=12)
# clean the posts and write them into HDFS from their respective partitions
print("Writing to HDFS")
Example 4: loadMovieNames
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import setCheckpointDir [as alias]
import sys
from pyspark import SparkConf, SparkContext
from pyspark.mllib.recommendation import ALS, Rating
def loadMovieNames():
    movieNames = {}
    with open("ml-100k/u.ITEM") as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1].decode('ascii', 'ignore')
    return movieNames
conf = SparkConf().setMaster("local[*]").setAppName("MovieRecommendationsALS")
sc = SparkContext(conf = conf)
sc.setCheckpointDir('checkpoint')
print "\nLoading movie names..."
nameDict = loadMovieNames()
# umadeup.data: a small made-up dataset created on top of u.data (3 rows)
data = sc.textFile("C:/Users/seeth_000/UdemySpark/ml-100k/umadeup.data")
ratings = data.map(lambda l: l.split()).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()
# Build the recommendation model using Alternating Least Squares
print "\nTraining recommendation model..."
rank = 10
# Lowered numIterations to ensure it works on lower-end systems
numIterations = 6
model = ALS.train(ratings, rank, numIterations)
userID = int(sys.argv[1])
Example 5: SparkContext
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import setCheckpointDir [as alias]
import findspark # this needs to be the first import
findspark.init()
import networkx as nx
from pyspark import SparkConf
from pyspark import SparkContext
from snpp.cores.lowrank import partition_graph
conf = (SparkConf().setMaster("local[2]").setAppName("SparkTest"))
sc = SparkContext(conf=conf)
sc.setCheckpointDir('checkpoint')  # checkpointing avoids StackOverflowError on long lineages
train_graph_path = 'data/{}/train_graph.pkl'.format('slashdot')
g = nx.read_gpickle(train_graph_path)
partition_graph(g, k=40, sc=sc,
                lambda_=0.1,
                iterations=20,
                seed=123456)
sc.stop()
Example 6: get_most_liked_courses
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import setCheckpointDir [as alias]
def get_most_liked_courses(n):
    """Get the top n courses with the most likes and ratings"""
    input = {
        'sort_mode': 'interesting',
        'count': n
    }
    return m.Course.search(params=input)[0]

def save_recommendations_to_mongo():
    log.info('Saving recommendations to database...')
    for user in m.User.objects:
        try:
            user.recommended_courses = engine.recommend_user(
                str(user.id),
                _PARAMS['num_courses'])
            user.save()
        except Exception as e:
            log.error(e)

if __name__ == '__main__':
    mongoengine.connect(c.MONGO_DB_RMC)
    sc = SparkContext()
    sc.setCheckpointDir('data/recommendation/checkpoint/')
    engine = RecommendationEngine(sc)
    engine.train()
    engine.load_data()
    save_recommendations_to_mongo()
Example 7: computeExposureGraphForTop500Entities
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import setCheckpointDir [as alias]
#......... part of the code omitted here .........
            # count how many times the user was exposed to the entity (given that they had interacted with those users beforehand)
            for prior_author in prior_post_authors:
                if prior_author in prior_users:
                    exposure_count += 1
            # log the exposure count
            if exposure_count in activation_points:
                activation_points[exposure_count] += 1
            else:
                activation_points[exposure_count] = 1
            # Exit the posts dates loop
            activated_users.add(user)
            break
    # Return the mapping between the entity and the activation point distribution
    return (entity, activation_points)
###### Execution code
conf = SparkConf().setAppName("NER Diffusion - Exposure Dynamics")
conf.set("spark.python.worker.memory","10g")
conf.set("spark.driver.memory","15g")
conf.set("spark.executor.memory","10g")
conf.set("spark.default.parallelism", "12")
conf.set("spark.mesos.coarse", "true")
conf.set("spark.driver.maxResultSize", "10g")
conf.set("spark.cores.max", "15")
conf.setMaster("mesos://zk://scc-culture-slave9.lancs.ac.uk:2181/mesos")
conf.set("spark.executor.uri", "hdfs://scc-culture-mind.lancs.ac.uk/lib/spark-1.3.0-bin-hadoop2.4.tgz")
conf.set("spark.broadcast.factory", "org.apache.spark.broadcast.TorrentBroadcastFactory")
sc = SparkContext(conf=conf)
sc.setCheckpointDir("hdfs://scc-culture-mind.lancs.ac.uk/data/checkpointing")
# use sample directory for testing
# annotationFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/annotated-sample")
annotationFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/annotated")
annotationFile.cache()
thinnedFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/thinned-json")
# thinnedFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/thinned-json-sample")
thinnedFile.cache()
# Top 500 entities file
top500EntitiesFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/entities/top500_entities.csv")
top500Entities = top500EntitiesFile.map(lambda x: str(x.encode("utf-8"))).collect()
print("Top Entities Loaded. Total = " + str(len(top500Entities)))
top_500_entities_broadcast = sc.broadcast(top500Entities)
# print(str(len(top500Entities)))
# print(top500Entities)
# Load the reply graphs from the thinnedFile
reply_map_rdd = thinnedFile\
    .flatMap(deriveReplyMap)
reply_map_rdd.cache()
reply_orig_map = reply_map_rdd\
    .collectAsMap()
print("Reply Orig Map Size = " + str(len(reply_orig_map)))
# print(reply_orig_map)
# TODO: Convert code to HBase format as this fails to broadcast with Java heap space memory error
reply_orig_map_broadcast = sc.broadcast(reply_orig_map)
# get the {orig: [reply]} dictionary
orig_replies_map = reply_map_rdd\
    .map(lambda x: (x[1], [x[0]]))\
Example 8: SparkContext
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import setCheckpointDir [as alias]
location = "hdfs"
try:
    if "avatar" in platform.node():
        location = "local"
except:
    pass
try:
    if "avatar" in socket.gethostname():
        location = "local"
except:
    pass
print "### location %s" % location
sc = SparkContext(appName="CRF")
sc.setCheckpointDir("/tmp")
year = 2015
mode = sys.argv[1]
tag = sys.argv[2]
month = int(sys.argv[3])
day = int(sys.argv[4])
hour = int(sys.argv[5])
partNum = None
try:
    partNum = int(sys.argv[6])
except:
    pass
limit = None
try:
    limit = int(sys.argv[7])
except:
Example 9: computeEntityTSData
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import setCheckpointDir [as alias]
def computeEntityTSData():
    ##### Spark Functions
    # input: json_line
    # output: [(entity, {date: count})]
    def deriveEntityToDate(json_line):
        entity_dates = []
        json_obj = json.loads(json_line)
        created_string = float(json_obj['created_utc'])
        created_date = datetime.fromtimestamp(created_string).date()
        annotations = json_obj['entity_texts']
        for annotation in annotations:
            date_count = {}
            date_count[created_date] = 1
            entity_dates.append((annotation, date_count))
        return entity_dates

    def combineDateCounts(date_counts1, date_counts2):
        date_counts = date_counts1
        for date in date_counts2:
            if date in date_counts:
                date_counts[date] += date_counts2[date]
            else:
                date_counts[date] = date_counts2[date]
        return date_counts

    ###### Execution code
    conf = SparkConf().setAppName("NER Diffusion - Exploratory Plots")
    conf.set("spark.python.worker.memory","10g")
    conf.set("spark.driver.memory","15g")
    conf.set("spark.executor.memory","10g")
    conf.set("spark.default.parallelism", "12")
    conf.set("spark.mesos.coarse", "true")
    conf.set("spark.driver.maxResultSize", "10g")
    # Added the core limit to avoid resource allocation overruns
    conf.set("spark.cores.max", "5")
    conf.setMaster("mesos://zk://scc-culture-slave9.lancs.ac.uk:2181/mesos")
    conf.set("spark.executor.uri", "hdfs://scc-culture-mind.lancs.ac.uk/lib/spark-1.3.0-bin-hadoop2.4.tgz")
    conf.set("spark.broadcast.factory", "org.apache.spark.broadcast.TorrentBroadcastFactory")
    sc = SparkContext(conf=conf)
    sc.setCheckpointDir("hdfs://scc-culture-mind.lancs.ac.uk/data/checkpointing")
    # use sample directory for testing
    # distFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/user/derczynskil/RC_2015-01")
    distFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/annotated")
    # Point to local file until data has finished uploading to HDFS
    # distFile = sc.textFile("/home/derczynskil/annotated/")
    distFile.cache()
    # Step 1: Derive the time-sensitive map of when entities appeared
    print("----Loading entity time-series")
    entity_citation_dates = distFile\
        .flatMap(deriveEntityToDate)\
        .reduceByKey(combineDateCounts)
    entity_citation_dates.cache()
    # print(entity_citation_dates.collect())
    print("----Deriving the count of entity citations")
    entity_citation_counts = entity_citation_dates\
        .map(lambda x: (x[0], len(x[1])))\
        .map(lambda x: (x[1], x[0]))\
        .sortByKey(False)\
        .map(lambda x: (x[1], x[0]))\
        .collect()
    # Write to local disk
    print("------Writing the output to a file")
    outputString = ""
    for (entity, count) in entity_citation_counts:
        outputString += str(entity.encode('utf-8')).replace("'", "") + "\t" + str(count) + "\n"
    # print(outputString)
    outputFile = open("data/entity_mention_frequencies.csv", "w")
    outputFile.write(outputString)
    outputFile.close()
    # Write the time-series output to local disk
    print("------Writing the ts output to a file")
    outputString = ""
    for (entity, date_to_count) in entity_citation_dates.collect():
        outputString += str(entity.encode('utf-8')).replace("'", "")
        for date in date_to_count:
            outputString += "\t" + str(date) + "|" + str(date_to_count[date])
        outputString += "\n"
    # print(outputString)
    outputFile = open("data/entity_mention_ts.csv", "w")
    outputFile.write(outputString)
    outputFile.close()
    # stop the Spark context from running
    sc.stop()
Example 10: computeGlobalCascadeIsomorphicDistribution
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import setCheckpointDir [as alias]
#......... part of the code omitted here .........
            chain_posts.add(to_process_post)
            # get the replies to this post
            if to_process_post in orig_replies_map:
                replies = orig_replies_map[to_process_post]
                to_process += replies
                for reply in replies:
                    chain.append(reply + "->" + to_process_post)
            # get the post that this post replied to
            if to_process_post in reply_orig_map:
                orig_post = reply_orig_map[to_process_post]
                to_process += orig_post
                chain.append(to_process_post + "->" + orig_post)
        # log the chain for the entity
        entity_chains.append(chain)
    # Return the entity chains
    return (entity, entity_chains)
###### Execution code
conf = SparkConf().setAppName("NER Diffusion - Cascade Pattern Mining")
conf.set("spark.python.worker.memory","10g")
conf.set("spark.driver.memory","15g")
conf.set("spark.executor.memory","10g")
conf.set("spark.default.parallelism", "12")
conf.set("spark.mesos.coarse", "true")
conf.set("spark.driver.maxResultSize", "10g")
conf.set("spark.cores.max", "15")
conf.setMaster("mesos://zk://scc-culture-slave9.lancs.ac.uk:2181/mesos")
conf.set("spark.executor.uri", "hdfs://scc-culture-mind.lancs.ac.uk/lib/spark-1.3.0-bin-hadoop2.4.tgz")
conf.set("spark.broadcast.factory", "org.apache.spark.broadcast.TorrentBroadcastFactory")
sc = SparkContext(conf=conf)
sc.setCheckpointDir("hdfs://scc-culture-mind.lancs.ac.uk/data/checkpointing")
# use sample directory for testing
annotationFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/annotated-sample")
# annotationFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/annotated")
annotationFile.cache()
# thinnedFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/thinned-json")
thinnedFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/thinned-json-sample")
thinnedFile.cache()
# Load the reply graphs from the thinnedFile
print("Loading replies map")
reply_map_rdd = thinnedFile\
    .flatMap(deriveReplyMap)
# Collect as a map and broadcast this to the cluster
reply_orig_map = reply_map_rdd\
    .collectAsMap()
print("Reply Orig Map Size = " + str(len(reply_orig_map)))
# print(reply_orig_map)
reply_orig_map_broadcast = sc.broadcast(reply_orig_map)
# get the {orig: [reply]} dictionary
orig_replies_map = reply_map_rdd\
    .map(lambda x: (x[1], [x[0]]))\
    .reduceByKey(combineReplies)\
    .collectAsMap()
print("Orig Replies Map Size = " + str(len(orig_replies_map)))
orig_replies_map_broadcast = sc.broadcast(orig_replies_map)
# Load the entity to post map
# input: json_line of annotations of each post
Example 11: int
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import setCheckpointDir [as alias]
        if error < min_error:
            min_error = error
            best_rank = rank
            best_lambda = lambda_i

print 'The best model was trained with rank %s, lambda %f' % (best_rank, best_lambda)

# Test
model = ALS.train(training_RDD, best_rank, seed=seed, iterations=iterations,
                  lambda_=best_lambda)
predictions = model.predictAll(test_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
rates_and_preds = test_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
print 'For testing data the RMSE is %s' % (error)
'''
# Using the complete dataset to build the final model; re-do the above
# Load the complete dataset file
complete_ratings_file = os.path.join('./datasets', 'ml-latest', 'ratings.csv')
complete_ratings_raw_data = sc.textFile(complete_ratings_file)
complete_ratings_raw_data_header = complete_ratings_raw_data.take(1)[0]
# Parse
complete_ratings_data = complete_ratings_raw_data.filter(lambda line: line != complete_ratings_raw_data_header)\
    .map(lambda line: line.split(",")).map(lambda tokens: (int(tokens[0]), int(tokens[1]), float(tokens[2]))).cache()
print "There are %s recommendations in the complete dataset" % (complete_ratings_data.count())
# set a checkpoint directory to avoid stack overflow during the iterative ALS training
sc.setCheckpointDir('checkpoint/')
training_RDD, test_RDD = complete_ratings_data.randomSplit([7, 3], seed=0L)
complete_model = ALS.train(training_RDD, best_rank, seed=seed,
Example 12: init_spark_context
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import setCheckpointDir [as alias]
def init_spark_context():
    conf = SparkConf().setAppName("MovieRatings").set("spark.executor.memory", "4g")
    sc = SparkContext(conf=conf)
    sc.setCheckpointDir('/tmp/checkpoint/')
    return sc