This article collects typical usage examples of the Python method pyspark.SparkContext.textFile. If you are wondering exactly what SparkContext.textFile does, how to call it, or simply want to see it used in practice, the curated code samples below may help. You can also read further about the enclosing class, pyspark.SparkContext.
Below are 15 code examples of SparkContext.textFile, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
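Before diving into the examples, here is a minimal sketch of the basic call pattern. The file path, delimiter, and app name below are placeholders for illustration only and are not taken from any of the examples that follow.

from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local[*]").setAppName("textFileDemo")
sc = SparkContext(conf=conf)

# textFile returns an RDD with one string element per line of the file
lines = sc.textFile("data/sample.csv")
fields = lines.map(lambda line: line.split(","))
print(fields.take(5))
sc.stop()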
Example 1: __init__
# Required module import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
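# Presumably also needed for this snippet (not shown here): from pyspark import SparkConf; from numpy import array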
def __init__(self, file_path, train_file, test_file, real_file=None):
"""
file_path: the folder where data files reside
train_file: (user, item, rating) quote records
test_file: (user, item) records, preferences to be predicted
real_file: (user, option, value) real purchase records, can be none if it doesn't exist
For this specific project:
item here is the combination of options with their values,
e.g. item 10 denotes option A with choice 0; item 21 denotes option B with choice 1
rating is the number of quotes for a certain item by a user
"""
self.file_path = file_path
config = SparkConf().setMaster("local").setAppName("Kaggle")\
.set("spark.executor.memory", "2g")\
.set("spark.storage.memoryFraction", "1")
sc = SparkContext(conf=config)
self.train_data = sc.textFile("file:" + self.file_path + train_file).cache()\
.map(lambda line: array([float(x) for x in line.split(',')]))
self.test_data = sc.textFile("file:" + self.file_path + test_file).cache()\
.map(lambda line: [float(x) for x in line.split(',')])
if real_file:
self.real_data = sc.textFile("file:" + self.file_path + real_file).cache()\
.map(lambda line: [float(x) for x in line.split(',')]).map(lambda r: ((r[0], r[1]), r[2]))
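Given the encoding described in the docstring (item 10 is option A with choice 0, item 21 is option B with choice 1), an item id can be split back into its option and value with simple integer arithmetic. The helper below is a hedged sketch; the two-digit layout is inferred from those two examples only.

def split_item(item):
    # Assumed encoding: tens digit = option (1 = A, 2 = B, ...), units digit = chosen value
    item = int(item)
    return item // 10, item % 10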
Example 2: main
# Required module import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
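# Presumably also needed (not shown here): from pyspark.mllib.recommendation import ALS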
def main():
""" Train and evaluate an ALS recommender.
"""
# Set up environment
sc = SparkContext("local[*]", "RecSys")
# Load and parse the data
data = sc.textFile("./data/ratings.dat")
ratings = data.map(parse_rating)
# Build the recommendation model using Alternating Least Squares
rank = 10
iterations = 20
model = ALS.train(ratings, rank, iterations)
movies = sc.textFile("./data/movies.dat")\
.map(parse_movie)
# Evaluate the model on training data
testdata = ratings.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata)\
.map(lambda r: ((r[0], r[1]), r[2]))
rates_and_preds = ratings.map(lambda r: ((r[0], r[1]), r[2]))\
.join(predictions)
MSE = rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))
Example 3: main
# Required module import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
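# Presumably also needed (not shown here): import ast, sys; from pyspark import SparkConf; from pyspark.sql import SQLContext
# The rdd.saveToCassandra(...) calls assume a Spark-Cassandra connector package (e.g. pyspark-cassandra) is on the path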
def main():
reviews_parquet = sys.argv[1]
metadata_parquet = sys.argv[2]
users_ascores_file = sys.argv[3]
products_ascores_file = sys.argv[4]
conf = SparkConf().setAppName('Amazon Cassandra Injector').setMaster("local").set("spark.cassandra.connection.host", "localhost")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
sqlContext.read.parquet(reviews_parquet).registerTempTable('amazon_reviews')
reviews = sqlContext.sql("""SELECT * FROM amazon_reviews""").rdd.cache()
reviews_by_reviewer = reviews.map(process_review).map(lambda j: (j["reviewerid"], j))
users_ascores = sc.textFile(users_ascores_file).map(ast.literal_eval).map(lambda (r_id, score, histo): (r_id, (score, histo)))
reviews_joined = reviews_by_reviewer.join(users_ascores).map(lambda (reviewerid, (j, (score, histo))): fillin_review(j, score))
# join with meth2_users_ascores. join on reviewerid -> ascore is reviewer ascore
reviews_joined.saveToCassandra("amzdb", "reviews")
# reviewers need their alternative score
reviewers = reviews.map(process_reviewer).map(lambda j: (j["reviewerid"], j))
# join with meth2_user_ascores. Get ascore and overall_histogram
reviewers_joined = reviewers.join(users_ascores).map(lambda (reviewerid, (j, (score, histo))): fillin_reviewer(j, score, histo))
reviewers_joined.saveToCassandra("amzdb", "reviewers")
    # products need their overall score/histogram, and adjusted score/histogram
sqlContext.read.parquet(metadata_parquet).registerTempTable('amazon_metadata')
products = sqlContext.sql("""SELECT * FROM amazon_metadata""").rdd.map(process_product).map(lambda j: (j["asin"], j))
# join with meth2_product_ascores
products_ascores = sc.textFile(products_ascores_file).map(ast.literal_eval).map(lambda (asin, o_s, a_s, o_h, a_h, n): (asin, (o_s, o_h, a_s, a_h)))
products_joined = products.join(products_ascores).map(lambda (asin, (j, (o_s, o_h, a_s, a_h))): fillin_product(j, o_s, o_h, a_s, a_h))
products_joined.saveToCassandra("amzdb", "products")
Example 4: main
# Required module import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
def main():
# Set the configuration of the Spark Application
conf = (SparkConf().setMaster("local[*]").setAppName("advancedSparkJoin"))
# Creating a Spark context with the previous configuration
sc = SparkContext(conf = conf)
# Loading the data
show_views_file = sc.textFile("input/join2_gennum?.txt")
show_channel_file = sc.textFile("input/join2_genchan?.txt")
# Closures to parse the files (using the spark map transformation)
def split_show_views(line):
key_value = line.split(",")
return (key_value[0], int(key_value[1]))
def split_show_channel(line):
key_value = line.split(",")
return (key_value[0], key_value[1])
# Map
show_views = show_views_file.map(split_show_views)
show_channel = show_channel_file.map(split_show_channel)
# Join
joined_dataset = show_views.join(show_channel)
# Extract channel as key
channel_views = joined_dataset.map(lambda x: (x[1][1], x[1][0]))
# Sum across (reduce)
sumChannel = channel_views.reduceByKey(lambda a, b: a + b).collect()
    print(sumChannel)
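A quick way to sanity-check the pipeline above, assuming the gennum files hold "show,views" lines and the genchan files hold "show,channel" lines (the sample records here are hypothetical and reuse the closures defined in main):

views = sc.parallelize(["Hourly_Sports,21", "Hourly_Sports,5"]).map(split_show_views)
channels = sc.parallelize(["Hourly_Sports,DEF"]).map(split_show_channel)
result = (views.join(channels)
               .map(lambda x: (x[1][1], x[1][0]))
               .reduceByKey(lambda a, b: a + b)
               .collect())
print(result)  # expected: [('DEF', 26)]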
Example 5: run
# Required module import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
def run(mode):
sc = SparkContext()
clusters = open(os.path.realpath(__file__+'/../..') + '/clustersGrouped.csv', 'r')
if (mode == 'standard'):
numStuds = 0
else:
numStuds = 4
# RotoGuru stat histories
records = sc.textFile(os.path.realpath(__file__+'/..') + '/data-scraper/data')
kvpairs = records.map(keyAndParse)
# Counts for normalizing
cts = kvpairs.groupByKey().map(lambda (name, statList): (name, len(statList))).collectAsMap()
kvpairs = kvpairs.reduceByKey(combine)
kvpairs = kvpairs.map(lambda (name, statline): (name, normalize(statline, cts[name])))
# RDD of keyed DraftKings prices
prices = sc.textFile(os.path.realpath(__file__+'/../../DKSalaries.csv'))
dkprices_pos = prices.map(getPrice)
# point per dollar RDD
ppd = kvpairs.join(dkprices_pos).map(lambda (k,v): (k, getPpd(k,v)))
studList = getStuds(numStuds, clusters, dkprices_pos)
for stud in studList:
ppd = ppd.filter(lambda (k,v): k != stud[0])
sortedPpd = ppd.sortBy(lambda x: -x[1][0])
getRoster(sortedPpd.collect(), studList)
Example 6: main
# Required module import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
def main():
conf = SparkConf().setAppName("mm").set("spark.executor.memory", "2g")
sc = SparkContext(conf=conf)
RDDplayed = sc.textFile('train_visibleSmall.txt')
songs = sc.textFile('Song_PropertiesSmall.txt')
RDDnot_played = sc.textFile('notplayedsongs.txt')
features = songs.map(lambda x: songSplit(x))
played_flipped = RDDplayed.map(lambda x: userSplit(x))
played_joined = played_flipped.join(features)
flip_played_joined = played_joined.map(lambda x: joinFlip(x))
rated = flip_played_joined.reduceByKey(lambda x,y:keys(list(x),list(y)))
notplayed_flipped = RDDnot_played.map(lambda x: userSplit(x))
notplayed_joined = notplayed_flipped.join(features)
flip_notplayed_joined = notplayed_joined.map(lambda x: joinFlip(x))
unrated = flip_notplayed_joined.reduceByKey(lambda x,y:keys(list(x),list(y)))
joined_RDD = rated.join(unrated)
rates = joined_RDD.map(lambda x:comparison_function(x))
hope = rates.flatMap(hoping)
hope_for_better = hope.map(lambda x: str(x[0])+"\t"+str(x[1])+"\t"+str(x[2]))
    print(hope_for_better.collect())
hope_for_better.coalesce(1).saveAsTextFile('thehope')
Example 7: top_ten_movies
# Required module import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
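# Presumably also needed (not shown here): import sys; from operator import add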
def top_ten_movies():
sc = SparkContext(appName="rating")
lines = sc.textFile(sys.argv[1], 1)
line1 = lines.filter(lambda line: "movieId" not in line)
counts = line1.map(lambda x: (x.split(',')[1], float(x.split(',')[2]))) \
.reduceByKey(add)
output = counts.sortBy(lambda x: -x[1]).collect()
lines = sc.textFile(sys.argv[2], 1)
line1 = lines.filter(lambda line: "movieId" not in line)
counts = line1.map(lambda x: (x.split(',')[0], (x.split(',')[1],x.split(',')[2])) if "\"" not in x else (x.split(',')[0], (x.split('\"')[1],x.split(',')[-1])))
output1 = counts.collect()
ans={}
for (a,b) in output1:
ans[int(a)]=b;
i=1;
toprint={}
html = "<html><head><title>Top Ten Rated Movies</title></head><body>"
for (word, count) in output:
html = html + "<h5>" + str(i) +". "+ans[int(word)][0] + " ---------- " + ans[int(word)][1] + "</h5>"
toprint[i] = ans[int(word)][0] + "\t" + ans[int(word)][1]
if i==10:
break
i = i + 1;
sc.stop()
html = html + "</body></html>"
return html
Example 8: main
# Required module import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
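# Presumably also needed (not shown here): import pandas as pd; from pyspark.sql import SQLContext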
def main(inputFile,targetFile):
sc = SparkContext(appName="MLRandomForestTrain")
sqlContext = SQLContext(sc)
# df = sqlContext.read.load('/user/cloudera/DMLESPARK/Monamidata/brandFamily_manual_mapped.csv',
# format='com.databricks.spark.csv',
# header='true',
# inferSchema='true')
########################HierarchyInputWithBFMResult_brandFamily from Hdfs.
#'/user/hue/oozie/workspaces/DMLE_BFM_V1_MLRFTRAIN-Dev/lib/HierarchyInputWithBFMResult_brandFamily.csv'
inputLevelWithBFMrdd = sc.textFile(inputFile)
inputLevelWithBFMrdd = inputLevelWithBFMrdd.map(lambda line: line.split(","))
header = inputLevelWithBFMrdd.first()
inputLevelWithBFMrdd = inputLevelWithBFMrdd.filter(lambda line:line != header)
sparkdf = inputLevelWithBFMrdd.toDF()
df = sparkdf.toPandas()
df.columns = header
inputLevelWithBFM = df
inputLevelWithBFM = pd.DataFrame(inputLevelWithBFM)
########################Read target File from Hdfs.
#'/user/hue/oozie/workspaces/DMLE_BFM_V1_MLRFTRAIN-Dev/lib/brandFamily_target.csv'
targetLevelNamerdd = sc.textFile(targetFile)
targetLevelNamerdd = targetLevelNamerdd.map(lambda line: line.split(","))
header = targetLevelNamerdd.first()
targetLevelNamerdd = targetLevelNamerdd.filter(lambda line:line != header)
sparkdf = targetLevelNamerdd.toDF()
df = sparkdf.toPandas()
df.columns = header
targetLevelName = df
targetLevelName = pd.DataFrame(targetLevelName)
rf(inputLevelWithBFM,targetLevelName)
Example 9: main
# Required module import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
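# Presumably also needed (not shown here): import sys, math; from pyspark import SparkConf; from pyspark.mllib.tree import RandomForest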
def main():
input_train = sys.argv[1]
input_test = sys.argv[2]
conf = SparkConf().setAppName('Sentiment Analysis with Random Forest')
sc = SparkContext(conf=conf)
assert sc.version >= '1.5.1'
train = sc.textFile(input_train).cache()
test = sc.textFile(input_test).cache()
'''sbaronia - get training and testing labeled points'''
train_lp = train.map(to_labeledpoint).cache()
test_lp = test.map(to_labeledpoint).cache()
'''sbaronia - run RandomForest regression on our training data with
default options except numTrees = 5'''
rf_model = RandomForest.trainRegressor(train_lp,categoricalFeaturesInfo={},numTrees=5,featureSubsetStrategy="auto", impurity='variance', maxDepth=4, maxBins=32)
'''sbaronia - run predictions on testing data and calculate RMSE value'''
predictions = rf_model.predict(test_lp.map(lambda x: x.features))
labelsAndPredictions = test_lp.map(lambda lp: lp.label).zip(predictions)
rmse = math.sqrt(labelsAndPredictions.map(lambda (v, p): (v-p)**2).reduce(lambda x, y: x + y)/float(test_lp.count()))
print("RMSE = " + str(rmse))
Example 10: stackexchange_json_spark_job
# Required module import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
def stackexchange_json_spark_job():
"""
Spark job to convert json data from hdfs into ques and ans.
Result is written into elasticsearch for text based search from user.
"""
server = bluebook_conf.HDFS_FQDN
conf = SparkConf().setAppName("stackexchange_json_spark_job")
spark_context = SparkContext(conf=conf)
json_ques_folder_address = "hdfs://" + server + "/" +\
bluebook_conf.STACKEXCHANGE_JSON_QUES_FOLDER_NAME +\
"/part-*"
json_ans_folder_address = "hdfs://" + server + "/" +\
bluebook_conf.STACKEXCHANGE_JSON_ANS_FOLDER_NAME +\
"/part-*"
    # Ques and ans files are separately read from HDFS
ques_file = spark_context.textFile(json_ques_folder_address)
ans_file = spark_context.textFile(json_ans_folder_address)
ques_tups = ques_file.map(lambda line: stackexchange_json_mapper(line, 'ques'))
ans_tups = ans_file.map(lambda line: stackexchange_json_mapper(line, 'ans'))
# Join accepted answers with their respective questions
ques_ans = ques_tups.join(ans_tups).map(lambda x: (x[0], {'ques': x[1][0], 'ans': x[1][1]}))
ques_ans.saveAsNewAPIHadoopFile(
path='-',
outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
keyClass="org.apache.hadoop.io.NullWritable",
valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
conf=stackoverflow_es_write_conf)
Example 11: main
# Required module import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
def main(argv):
''' matrixDirectory: the hdfs directory where we find users profile matrix. It is assumed to be compressed
and split in several files.
streamFiles: the files used to update the matrix. In userId|country|artistId|trackId format
outputFile: optional output directory for the updated matrix. By default, we simply overwrite the current one'''
matrixDirectory, streamFiles, outputFile = getArguments(argv)
sc = SparkContext(appName="usersProfile")
# open both matrix and non processed stream_xxxxxxxx files
# Turn into (key, value) pair, where key = (user, track), to prepare the join
matrix = (sc.textFile(matrixDirectory + "*.gz")
.map(lambda line: map(int, line.split(" ")))
.map(lambda t: ((t[0], t[1]), t[2])))
streamData = (sc.textFile(streamFiles)
.map(lambda line: line.split("|"))
.map(lambda t: ((int(t[0]), int(t[3])), 1)))
outData = (matrix.join(streamData) # here the entries look like ((user, track), [count, 1, 1 ...])
.map(lambda t: (t[0], sum(t[1])) ) # compute new count => ((user, track), new_count)
.sortByKey()
.map(lambda t: " ".join(map(str, (t[0][0], t[0][1], t[1]))))) # prepare output file
saveAsTextFile(outData, path = outputFile, overwrite = True)
Example 12: test
# Required module import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
def test(self):
#sc = SparkContext("local[5]", "my pc 1")
sc = SparkContext("spark://nb.local:7077", "DeskTop11")
tmp = sc.textFile('/user/hsiung/data.csv')
tmp = sc.textFile('d:/ddt.txt')
print(tmp.count())
print(tmp.first())
print(sc._conf.getAll())
Example 13: main
# Required module import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
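# Presumably also needed (not shown here): import pickle; from pyspark.mllib.feature import HashingTF; from pyspark.mllib.regression import LabeledPoint; from pyspark.mllib.classification import LogisticRegressionWithSGD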
def main():
"""
Driver program for a spam filter using Spark and MLLib
"""
# Consolidate the individual email files into a single spam file
# and a single ham file
makeDataFileFromEmails( "data/spam_2/", "data/spam.txt")
makeDataFileFromEmails( "data/easy_ham_2/", "data/ham.txt" )
# Create the Spark Context for parallel processing
sc = SparkContext( appName="Spam Filter")
# Load the spam and ham data files into RDDs
spam = sc.textFile( "data/spam.txt" )
ham = sc.textFile( "data/ham.txt" )
# Create a HashingTF instance to map email text to vectors of 10,000 features.
tf = HashingTF(numFeatures = 10000)
# Each email is split into words, and each word is mapped to one feature.
spamFeatures = spam.map(lambda email: tf.transform(email.split(" ")))
hamFeatures = ham.map(lambda email: tf.transform(email.split(" ")))
# Create LabeledPoint datasets for positive (spam) and negative (ham) data points.
positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features))
negativeExamples = hamFeatures.map(lambda features: LabeledPoint(0, features))
# Combine positive and negative datasets into one
data = positiveExamples.union(negativeExamples)
# Split the data into 70% for training and 30% test data sets
( trainingData, testData ) = data.randomSplit( [0.7, 0.3] )
    # Cache the training data to optimize the Logistic Regression
trainingData.cache()
# Train the model with Logistic Regression using the SGD algorithm.
model = LogisticRegressionWithSGD.train(trainingData)
# Create tuples of actual and predicted values
labels_and_predictions = testData.map( lambda email: (email.label, model.predict( email.features) ) )
# Calculate the error rate as number wrong / total number
error_rate = labels_and_predictions.filter( lambda (val, pred): val != pred ).count() / float(testData.count() )
print( "*********** SPAM FILTER RESULTS **********" )
print( "\n" )
print( "Error Rate: " + str( error_rate ) )
print( "\n" )
    # Serialize the model for persistence
pickle.dump( model, open( "spamFilter.pkl", "wb" ) )
sc.stop()
Example 14: main
# Required module import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
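# Presumably also needed (not shown here): import sys, json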
def main(argv):
sc = SparkContext(appName="KaggleDato")
#parse labels as JSON
PATH_TO_TRAIN_LABELS = "/user/alexeys/KaggleDato/train_v2.csv"
PATH_TO_SUB_LABELS = "/user/alexeys/KaggleDato/sampleSubmission_v2.csv"
train_label_rdd = sc.textFile(PATH_TO_TRAIN_LABELS).filter(lambda x: 'file' not in x).map(lambda x: parse_input(x)).map(lambda x: json.dumps(x)).repartition(1).saveAsTextFile('/user/alexeys/KaggleDato/train_csv_json')
sub_label_rdd = sc.textFile(PATH_TO_SUB_LABELS).filter(lambda x: 'file' not in x).map(lambda x: parse_input(x)).map(lambda x: json.dumps(x)).repartition(1).saveAsTextFile('/user/alexeys/KaggleDato/sampleSub_csv_json/')
nbuckets = 6
for bucket in range(nbuckets):
for section in range(1,10):
print "Processing bucket ",bucket," section ", section
fIn_rdd = sc.wholeTextFiles("/user/alexeys/KaggleDato/"+str(bucket)+"/"+str(section)+"*_raw_html.txt",12).map(parse_page_rdd).map(lambda x: json.dumps(x))
fIn_rdd.repartition(1).saveAsTextFile('/user/alexeys/KaggleDato/'+str(bucket)+'_'+str(section)+'/')
Example 15: main
# Required module import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
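# Presumably also needed (not shown here): import argparse, os; from pyspark.mllib.linalg import Vectors; from pyspark.mllib.classification import LogisticRegressionModel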
def main():
parser = argparse.ArgumentParser(description='Park or Bird Prediction Engine')
parser.add_argument('--i','--input', type=str, required=True, default=None, help='Input file or directory of jpg images')
parser.add_argument('--m','--method', type=str, required=True, default=None, help='Model method, 1 or 2')
args = parser.parse_args()
outfile = '/gpfs/gpfsfpo/prediction/predict_me.txt.gz'
os.system('rm -f ' + outfile)
sc = SparkContext(appName="Park Bird Predction Model 1")
args.m = args.m if args.m in [1,2] else 2
model_path = '/gpfs/gpfsfpo/shared/model_1_LBFGS' if args.m == 1 else '/gpfs/gpfsfpo/shared/model_2'
CreateTestData(args.i, args.m, outfile)
raw_input = sc.textFile(outfile)
k = raw_input.map(lambda x: x.split(',')[0])
p = raw_input.map(lambda x: x.split(',')[1]).map(lambda x: x.split(' ')).map(lambda x: [float(y) for y in x]).map(lambda x: Vectors.dense(x))
model = LogisticRegressionModel.load(sc, model_path)
predictions = model.predict(p)
keyPredictions = k.zip(predictions.map(lambda x: "IT'S A BIRD!" if x==1 else "IT'S A PARK!"))
print("************* RESULTS *******************")
    print(keyPredictions.collect())
sc.stop()