

Python SparkContext.textFile Method Code Examples

This article collects typical usage examples of the pyspark.SparkContext.textFile method in Python. If you are unsure what SparkContext.textFile does, how to call it, or what real-world usage looks like, the curated examples below should help. You can also browse further usage examples for the containing class, pyspark.SparkContext.


The 15 code examples of SparkContext.textFile shown below are ordered by popularity.
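
Before the project examples, the following minimal sketch shows the basic call pattern of SparkContext.textFile; the application name, the placeholder path data/sample.txt, and the word-count logic are illustrative assumptions and are not taken from any of the projects below.

from pyspark import SparkConf, SparkContext

# Minimal sketch (assumed setup): read a plain-text file into an RDD of lines
# and count words. textFile also accepts HDFS/S3 URIs, compressed files and
# glob patterns, plus an optional minPartitions argument.
conf = SparkConf().setMaster("local[*]").setAppName("textFileSketch")
sc = SparkContext(conf=conf)

lines = sc.textFile("data/sample.txt")  # RDD of strings, one element per input line
word_counts = (lines.flatMap(lambda line: line.split())
                    .map(lambda word: (word, 1))
                    .reduceByKey(lambda a, b: a + b))

print(word_counts.take(10))
sc.stop()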

Example 1: __init__

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
    def __init__(self, file_path, train_file, test_file, real_file=None):
        """
        file_path: the folder where data files reside
        train_file: (user, item, rating) quote records
        test_file: (user, item) records, preferences to be predicted
        real_file: (user, option, value) real purchase records, can be none if it doesn't exist
        For this specific project:
        item here is the combination of options with their values,
            e.g. item 10 denotes option A with choice 0; item 21 denotes option B with choice 1
        rating is the number of quotes for a certain item by a user
        """
        self.file_path = file_path
        config = SparkConf().setMaster("local").setAppName("Kaggle")\
            .set("spark.executor.memory", "2g")\
            .set("spark.storage.memoryFraction", "1")

        sc = SparkContext(conf=config)

        self.train_data = sc.textFile("file:" + self.file_path + train_file).cache()\
            .map(lambda line: array([float(x) for x in line.split(',')]))

        self.test_data = sc.textFile("file:" + self.file_path + test_file).cache()\
            .map(lambda line: [float(x) for x in line.split(',')])

        if real_file:
            self.real_data = sc.textFile("file:" + self.file_path + real_file).cache()\
                .map(lambda line: [float(x) for x in line.split(',')]).map(lambda r: ((r[0], r[1]), r[2]))
Developer: farcryzry, Project: spala, Lines: 29, Source: MatrixFactorization.py

Example 2: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
def main():
    """ Train and evaluate an ALS recommender.
    """
    # Set up environment
    sc = SparkContext("local[*]", "RecSys")

    # Load and parse the data
    data = sc.textFile("./data/ratings.dat")
    ratings = data.map(parse_rating)

    # Build the recommendation model using Alternating Least Squares
    rank = 10
    iterations = 20
    model = ALS.train(ratings, rank, iterations)

    movies = sc.textFile("./data/movies.dat")\
               .map(parse_movie)
    # Evaluate the model on training data
    testdata = ratings.map(lambda p: (p[0], p[1]))
    predictions = model.predictAll(testdata)\
                       .map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = ratings.map(lambda r: ((r[0], r[1]), r[2]))\
                             .join(predictions)
    MSE = rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
    print("Mean Squared Error = " + str(MSE))
Developer: randidwiputra, Project: python-spark-recsys, Lines: 27, Source: recsys.py

Example 3: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
def main():
  reviews_parquet = sys.argv[1]
  metadata_parquet = sys.argv[2]
  users_ascores_file = sys.argv[3]
  products_ascores_file = sys.argv[4]

  conf = SparkConf().setAppName('Amazon Cassandra Injector').setMaster("local").set("spark.cassandra.connection.host", "localhost")
  sc = SparkContext(conf=conf)
  sqlContext = SQLContext(sc)

  sqlContext.read.parquet(reviews_parquet).registerTempTable('amazon_reviews')
  reviews = sqlContext.sql("""SELECT * FROM amazon_reviews""").rdd.cache()
  reviews_by_reviewer = reviews.map(process_review).map(lambda j: (j["reviewerid"], j))
  users_ascores = sc.textFile(users_ascores_file).map(ast.literal_eval).map(lambda (r_id, score, histo): (r_id, (score, histo)))
  reviews_joined = reviews_by_reviewer.join(users_ascores).map(lambda (reviewerid, (j, (score, histo))): fillin_review(j, score))
  # join with meth2_users_ascores. join on reviewerid -> ascore is reviewer ascore
  reviews_joined.saveToCassandra("amzdb", "reviews")

  # reviewers need their alternative score
  reviewers = reviews.map(process_reviewer).map(lambda j: (j["reviewerid"], j))
  # join with meth2_user_ascores. Get ascore and overall_histogram
  reviewers_joined = reviewers.join(users_ascores).map(lambda (reviewerid, (j, (score, histo))): fillin_reviewer(j, score, histo))
  reviewers_joined.saveToCassandra("amzdb", "reviewers")

  # products need their overall score/histogram, and adjusted score/histogram
  sqlContext.read.parquet(metadata_parquet).registerTempTable('amazon_metadata')
  products = sqlContext.sql("""SELECT * FROM amazon_metadata""").rdd.map(process_product).map(lambda j: (j["asin"], j))
  # join with meth2_product_ascores
  products_ascores = sc.textFile(products_ascores_file).map(ast.literal_eval).map(lambda (asin, o_s, a_s, o_h, a_h, n): (asin, (o_s, o_h, a_s, a_h)))
  products_joined = products.join(products_ascores).map(lambda (asin, (j, (o_s, o_h, a_s, a_h))): fillin_product(j, o_s, o_h, a_s, a_h))
  products_joined.saveToCassandra("amzdb", "products")
Developer: kyledemeule, Project: cmpt-732-amazon-review-analysis, Lines: 33, Source: cassandra_upload.py

Example 4: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
def main():
	# Set the configuration of the Spark Application
	conf = (SparkConf().setMaster("local[*]").setAppName("advancedSparkJoin"))

	# Creating a Spark context with the previous configuration
	sc = SparkContext(conf = conf)
	
	# Loading the data
	show_views_file = sc.textFile("input/join2_gennum?.txt")
	show_channel_file = sc.textFile("input/join2_genchan?.txt")

	# Closures to parse the files (using the spark map transformation)
	def split_show_views(line):
		key_value = line.split(",")
		return (key_value[0], int(key_value[1]))

	def split_show_channel(line):
		key_value = line.split(",")
		return (key_value[0], key_value[1])

	# Map
	show_views = show_views_file.map(split_show_views)
	show_channel = show_channel_file.map(split_show_channel)

	# Join
	joined_dataset = show_views.join(show_channel)

	# Extract channel as key
	channel_views = joined_dataset.map(lambda x: (x[1][1], x[1][0]))

	# Sum across (reduce)
	sumChannel = channel_views.reduceByKey(lambda a, b: a + b).collect()
	print sumChannel
Developer: Hguimaraes, Project: bigdata-ucsd, Lines: 35, Source: advancedSparkJoin.py

Example 5: run

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
def run(mode):
    sc = SparkContext()
    clusters = open(os.path.realpath(__file__+'/../..') + '/clustersGrouped.csv', 'r')
    if (mode == 'standard'):
        numStuds = 0
    else:
        numStuds = 4

    # RotoGuru stat histories
    records = sc.textFile(os.path.realpath(__file__+'/..') + '/data-scraper/data')
    kvpairs = records.map(keyAndParse)

    # Counts for normalizing
    cts = kvpairs.groupByKey().map(lambda (name, statList): (name, len(statList))).collectAsMap()
    kvpairs = kvpairs.reduceByKey(combine)
    kvpairs = kvpairs.map(lambda (name, statline): (name, normalize(statline, cts[name])))

    # RDD of keyed DraftKings prices
    prices = sc.textFile(os.path.realpath(__file__+'/../../DKSalaries.csv'))
    dkprices_pos = prices.map(getPrice)

    # point per dollar RDD
    ppd = kvpairs.join(dkprices_pos).map(lambda (k,v): (k, getPpd(k,v)))

    studList = getStuds(numStuds, clusters, dkprices_pos)
    for stud in studList:
        ppd = ppd.filter(lambda (k,v): k != stud[0])
    sortedPpd = ppd.sortBy(lambda x: -x[1][0])
    getRoster(sortedPpd.collect(), studList)
Developer: jmaxpugliese, Project: daily-fantasy-recommender, Lines: 31, Source: recommend.py

Example 6: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
def main():
	conf = SparkConf().setAppName("mm").set("spark.executor.memory", "2g")
	sc = SparkContext(conf=conf)
	RDDplayed = sc.textFile('train_visibleSmall.txt')
	songs = sc.textFile('Song_PropertiesSmall.txt')
	RDDnot_played = sc.textFile('notplayedsongs.txt')
	features = songs.map(lambda x: songSplit(x))
	played_flipped = RDDplayed.map(lambda x: userSplit(x))
	played_joined = played_flipped.join(features)
	flip_played_joined = played_joined.map(lambda x: joinFlip(x))
	rated = flip_played_joined.reduceByKey(lambda x,y:keys(list(x),list(y)))
	

	notplayed_flipped = RDDnot_played.map(lambda x: userSplit(x))
	notplayed_joined = notplayed_flipped.join(features)
	flip_notplayed_joined = notplayed_joined.map(lambda x: joinFlip(x))
	unrated = flip_notplayed_joined.reduceByKey(lambda x,y:keys(list(x),list(y)))
	


	joined_RDD = rated.join(unrated)
	

	rates = joined_RDD.map(lambda x:comparison_function(x))

	hope = rates.flatMap(hoping)
	hope_for_better = hope.map(lambda x: str(x[0])+"\t"+str(x[1])+"\t"+str(x[2]))
	print hope_for_better.collect()
	hope_for_better.coalesce(1).saveAsTextFile('thehope')
Developer: malmashhadani, Project: MusicRecommender, Lines: 31, Source: combined_filter.py

Example 7: top_ten_movies

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
def top_ten_movies():
    sc = SparkContext(appName="rating")    
    lines = sc.textFile(sys.argv[1], 1)
    line1 = lines.filter(lambda line: "movieId" not in line)    
    counts = line1.map(lambda x: (x.split(',')[1], float(x.split(',')[2]))) \
                  .reduceByKey(add)
    output = counts.sortBy(lambda x: -x[1]).collect()

    lines = sc.textFile(sys.argv[2], 1)
    line1 = lines.filter(lambda line: "movieId" not in line)    
    counts = line1.map(lambda x: (x.split(',')[0], (x.split(',')[1],x.split(',')[2])) if "\"" not in x else (x.split(',')[0], (x.split('\"')[1],x.split(',')[-1])))
    output1 = counts.collect()
    ans={}
    for (a,b) in output1:
        ans[int(a)]=b;
    i=1;
    toprint={}
    html = "<html><head><title>Top Ten Rated Movies</title></head><body>"
    for (word, count) in output:
        html = html + "<h5>" + str(i) +". "+ans[int(word)][0] + "  ----------  " + ans[int(word)][1] + "</h5>"
        toprint[i] = ans[int(word)][0] + "\t" + ans[int(word)][1]
        if i==10:
            break
        i = i  + 1;
    sc.stop()
    html = html + "</body></html>"
    return html 
Developer: ChilupuriAnilReddy, Project: Cloud-Major-Project-Team-18, Lines: 29, Source: flaskprogram.py

Example 8: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
def main(inputFile,targetFile):
    sc = SparkContext(appName="MLRandomForestTrain")
    sqlContext = SQLContext(sc)


# df = sqlContext.read.load('/user/cloudera/DMLESPARK/Monamidata/brandFamily_manual_mapped.csv', 
#                           format='com.databricks.spark.csv', 
#                           header='true', 
#                           inferSchema='true')   
########################HierarchyInputWithBFMResult_brandFamily from Hdfs. 
#'/user/hue/oozie/workspaces/DMLE_BFM_V1_MLRFTRAIN-Dev/lib/HierarchyInputWithBFMResult_brandFamily.csv'
    inputLevelWithBFMrdd = sc.textFile(inputFile)
    inputLevelWithBFMrdd = inputLevelWithBFMrdd.map(lambda line: line.split(","))
    header = inputLevelWithBFMrdd.first()
    inputLevelWithBFMrdd = inputLevelWithBFMrdd.filter(lambda line:line != header)
    sparkdf = inputLevelWithBFMrdd.toDF()
    df = sparkdf.toPandas()
    df.columns = header
    inputLevelWithBFM = df
    inputLevelWithBFM = pd.DataFrame(inputLevelWithBFM)
########################Read target File from Hdfs. 
#'/user/hue/oozie/workspaces/DMLE_BFM_V1_MLRFTRAIN-Dev/lib/brandFamily_target.csv'
    targetLevelNamerdd = sc.textFile(targetFile) 
    targetLevelNamerdd = targetLevelNamerdd.map(lambda line: line.split(","))
    header = targetLevelNamerdd.first()
    targetLevelNamerdd = targetLevelNamerdd.filter(lambda line:line != header)
    sparkdf = targetLevelNamerdd.toDF()
    df = sparkdf.toPandas()
    df.columns = header
    targetLevelName = df
    targetLevelName = pd.DataFrame(targetLevelName)
    
    rf(inputLevelWithBFM,targetLevelName)
Developer: rrrpatil90, Project: randomforest, Lines: 35, Source: randomforestexample.py

Example 9: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
def main():
    input_train = sys.argv[1]
    input_test = sys.argv[2]

    conf = SparkConf().setAppName('Sentiment Analysis with Random Forest')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'

    train = sc.textFile(input_train).cache()
    test = sc.textFile(input_test).cache()

    '''sbaronia - get training and testing labeled points'''
    train_lp = train.map(to_labeledpoint).cache()
    test_lp = test.map(to_labeledpoint).cache()

    '''sbaronia - run RandomForest regression on our training data with
    default options except numTrees = 5'''
    rf_model = RandomForest.trainRegressor(train_lp,categoricalFeaturesInfo={},numTrees=5,featureSubsetStrategy="auto", impurity='variance', maxDepth=4, maxBins=32)
    
    '''sbaronia - run predictions on testing data and calculate RMSE value'''
    predictions = rf_model.predict(test_lp.map(lambda x: x.features))
    labelsAndPredictions = test_lp.map(lambda lp: lp.label).zip(predictions)
    rmse = math.sqrt(labelsAndPredictions.map(lambda (v, p): (v-p)**2).reduce(lambda x, y: x + y)/float(test_lp.count()))

    print("RMSE = " + str(rmse))
Developer: gitofsid, Project: MyBigDataCode, Lines: 27, Source: randomforest.py

Example 10: stackexchange_json_spark_job

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
def stackexchange_json_spark_job():
    """
    Spark job to convert json data from hdfs into ques and ans.
    Result is written into elasticsearch for text based search from user.
    """
    server = bluebook_conf.HDFS_FQDN
    conf = SparkConf().setAppName("stackexchange_json_spark_job")
    spark_context = SparkContext(conf=conf)    
    json_ques_folder_address = "hdfs://" + server + "/" +\
                              bluebook_conf.STACKEXCHANGE_JSON_QUES_FOLDER_NAME +\
                              "/part-*"
    json_ans_folder_address = "hdfs://" + server + "/" +\
                              bluebook_conf.STACKEXCHANGE_JSON_ANS_FOLDER_NAME +\
                              "/part-*"
    
    # Ques and ans files are separately read from HDFS
    ques_file = spark_context.textFile(json_ques_folder_address)
    ans_file = spark_context.textFile(json_ans_folder_address)
    ques_tups = ques_file.map(lambda line: stackexchange_json_mapper(line, 'ques'))
    ans_tups = ans_file.map(lambda line: stackexchange_json_mapper(line, 'ans'))

    # Join accepted answers with their respective questions
    ques_ans = ques_tups.join(ans_tups).map(lambda x: (x[0], {'ques': x[1][0], 'ans': x[1][1]}))
    ques_ans.saveAsNewAPIHadoopFile(
        path='-', 
        outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
        keyClass="org.apache.hadoop.io.NullWritable", 
        valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable", 
        conf=stackoverflow_es_write_conf)
Developer: nave91, Project: rebot, Lines: 31, Source: joiner_app.py

Example 11: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
def main(argv):

    ''' matrixDirectory: the hdfs directory where we find users profile matrix. It is assumed to be compressed
                         and split in several files.
        streamFiles: the files used to update the matrix. In userId|country|artistId|trackId format
        outputFile: optional output directory for the updated matrix. By default, we simply overwrite the current one'''
    matrixDirectory, streamFiles, outputFile = getArguments(argv)

    sc = SparkContext(appName="usersProfile")

    # open both matrix and non processed stream_xxxxxxxx files
    # Turn into (key, value) pair, where key = (user, track), to prepare the join
    matrix = (sc.textFile(matrixDirectory + "*.gz")
                .map(lambda line: map(int, line.split(" ")))
                .map(lambda t: ((t[0], t[1]), t[2])))

    streamData = (sc.textFile(streamFiles)
                    .map(lambda line: line.split("|"))
                    .map(lambda t: ((int(t[0]), int(t[3])), 1)))

    outData = (matrix.join(streamData)  # here the entries look like ((user, track), [count, 1, 1 ...])
                     .map(lambda t: (t[0], sum(t[1])))  # compute new count => ((user, track), new_count)
                     .sortByKey()
                     .map(lambda t: " ".join(map(str, (t[0][0], t[0][1], t[1])))))  # prepare output file

    saveAsTextFile(outData, path=outputFile, overwrite=True)
Developer: KPayet, Project: DeezerTestTechnique, Lines: 29, Source: updateUsersProfiles.py

Example 12: test

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
    def test(self):
        # sc = SparkContext("local[5]", "my pc 1")
        sc = SparkContext("spark://nb.local:7077", "DeskTop11")
        tmp = sc.textFile('/user/hsiung/data.csv')
        tmp = sc.textFile('d:/ddt.txt')  # reassigns tmp, so only the local file is read below

        print(tmp.count())
        print(tmp.first())
        print(sc._conf.getAll())
Developer: ChienHsiung, Project: python, Lines: 11, Source: new.py

Example 13: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
def main():
	"""
	Driver program for a spam filter using Spark and MLLib
	"""

	# Consolidate the individual email files into a single spam file
	# and a single ham file
	makeDataFileFromEmails( "data/spam_2/", "data/spam.txt")
	makeDataFileFromEmails( "data/easy_ham_2/", "data/ham.txt" )

	# Create the Spark Context for parallel processing
	sc = SparkContext( appName="Spam Filter")

	# Load the spam and ham data files into RDDs
	spam = sc.textFile( "data/spam.txt" )
	ham = sc.textFile( "data/ham.txt" )

	# Create a HashingTF instance to map email text to vectors of 10,000 features.
	tf = HashingTF(numFeatures = 10000)

	# Each email is split into words, and each word is mapped to one feature.
	spamFeatures = spam.map(lambda email: tf.transform(email.split(" ")))
	hamFeatures = ham.map(lambda email: tf.transform(email.split(" ")))

	# Create LabeledPoint datasets for positive (spam) and negative (ham) data points.
	positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features))
	negativeExamples = hamFeatures.map(lambda features: LabeledPoint(0, features))

	# Combine positive and negative datasets into one
	data = positiveExamples.union(negativeExamples)

	# Split the data into 70% for training and 30% test data sets 
	( trainingData, testData ) = data.randomSplit( [0.7, 0.3] )

	# Cache the training data to optimize the Logistic Regression
	trainingData.cache() 

	# Train the model with Logistic Regression using the SGD algorithm.
	model = LogisticRegressionWithSGD.train(trainingData)

	# Create tuples of actual and predicted values
	labels_and_predictions = testData.map( lambda email: (email.label, model.predict( email.features) ) )

	# Calculate the error rate as number wrong / total number
	error_rate = labels_and_predictions.filter( lambda (val, pred): val != pred ).count() / float(testData.count() )
	print( "*********** SPAM FILTER RESULTS **********" )
	print( "\n" )
	print( "Error Rate: " + str( error_rate ) )
	print( "\n" )

	# Serialize the model for persistence
	pickle.dump( model, open( "spamFilter.pkl", "wb" ) )

	sc.stop()
Developer: badpaper, Project: coursework, Lines: 56, Source: spamFilter.py

Example 14: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
def main(argv):
    sc = SparkContext(appName="KaggleDato")

    #parse labels as JSON
    PATH_TO_TRAIN_LABELS = "/user/alexeys/KaggleDato/train_v2.csv"
    PATH_TO_SUB_LABELS = "/user/alexeys/KaggleDato/sampleSubmission_v2.csv"
    train_label_rdd = sc.textFile(PATH_TO_TRAIN_LABELS).filter(lambda x: 'file' not in x).map(lambda x: parse_input(x)).map(lambda x: json.dumps(x)).repartition(1).saveAsTextFile('/user/alexeys/KaggleDato/train_csv_json')
    sub_label_rdd = sc.textFile(PATH_TO_SUB_LABELS).filter(lambda x: 'file' not in x).map(lambda x: parse_input(x)).map(lambda x: json.dumps(x)).repartition(1).saveAsTextFile('/user/alexeys/KaggleDato/sampleSub_csv_json/')

    nbuckets =  6   
    for bucket in range(nbuckets):
        for section in range(1,10):
            print "Processing bucket ",bucket," section ", section
            fIn_rdd = sc.wholeTextFiles("/user/alexeys/KaggleDato/"+str(bucket)+"/"+str(section)+"*_raw_html.txt",12).map(parse_page_rdd).map(lambda x: json.dumps(x))
            fIn_rdd.repartition(1).saveAsTextFile('/user/alexeys/KaggleDato/'+str(bucket)+'_'+str(section)+'/')
Developer: kevglynn, Project: largeDataTeam, Lines: 17, Source: preprocess_step1.py

Example 15: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import textFile [as alias]
def main():

    parser = argparse.ArgumentParser(description='Park or Bird Prediction Engine')
    parser.add_argument('--i','--input', type=str, required=True, default=None, help='Input file or directory of jpg images')
    parser.add_argument('--m','--method', type=str, required=True, default=None, help='Model method, 1 or 2')
    args = parser.parse_args()

    outfile = '/gpfs/gpfsfpo/prediction/predict_me.txt.gz'
    os.system('rm -f ' + outfile)

    sc = SparkContext(appName="Park Bird Predction Model 1")

    # argparse parses --m as a string, so normalize it to an int before comparing
    args.m = int(args.m) if args.m in ['1', '2'] else 2
    model_path = '/gpfs/gpfsfpo/shared/model_1_LBFGS' if args.m == 1 else '/gpfs/gpfsfpo/shared/model_2'

    CreateTestData(args.i, args.m, outfile)

    raw_input = sc.textFile(outfile)
    k = raw_input.map(lambda x: x.split(',')[0])
    p = raw_input.map(lambda x: x.split(',')[1]).map(lambda x: x.split(' ')).map(lambda x: [float(y) for y in x]).map(lambda x: Vectors.dense(x))

    model = LogisticRegressionModel.load(sc, model_path)
    predictions = model.predict(p)
    keyPredictions = k.zip(predictions.map(lambda x: "IT'S A BIRD!" if x==1 else "IT'S A PARK!"))

    print("************* RESULTS *******************")
    print keyPredictions.collect()

    sc.stop()
Developer: samacart, Project: Park-Or-Bird, Lines: 31, Source: spark-Prediction.py


Note: The pyspark.SparkContext.textFile examples in this article were collected by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by their respective developers; copyright of the source code belongs to the original authors, and distribution or use should follow the corresponding project licenses. Do not reproduce without permission.