This article collects typical usage examples of the Python method pyspark.context.SparkContext.textFile. If you are wondering what SparkContext.textFile does, how to call it, or where to find usage examples, the curated code samples below should help. You can also read more about the containing class, pyspark.context.SparkContext.
The following shows 15 code examples of SparkContext.textFile, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
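Before the examples, here is a minimal sketch of what SparkContext.textFile does: it reads a plain-text file (a local path or an HDFS/S3 URI) into an RDD with one element per line, which you then transform with ordinary RDD operations. The input path and the word-count logic below are illustrative assumptions, not taken from any of the examples that follow.

from pyspark.conf import SparkConf
from pyspark.context import SparkContext

# Hypothetical input path -- point this at a text file that exists on your system.
conf = SparkConf().setMaster("local[2]").setAppName("textFileSketch")
sc = SparkContext(conf=conf)

lines = sc.textFile("/tmp/sample.txt")   # RDD of lines, one element per line
word_counts = (lines.flatMap(lambda l: l.split())
                    .map(lambda w: (w, 1))
                    .reduceByKey(lambda a, b: a + b))
print(word_counts.take(10))
sc.stop()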
Example 1: MainApp
# Required import: from pyspark.context import SparkContext [as alias]
# Or: from pyspark.context.SparkContext import textFile [as alias]
class MainApp(object):
    def __init__(self):
        pass

    def init(self):
        os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
        # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
        # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
        conf = SparkConf()
        conf.setMaster("local[10]")
        conf.setAppName("PySparkShell")
        conf.set("spark.executor.memory", "2g")
        conf.set("spark.driver.memory", "1g")
        self.sc = SparkContext(conf=conf)
        self.sqlContext = SQLContext(self.sc)

    def loadData(self):
        category_list = self.sc.textFile("/Users/abhinavrungta/Desktop/uf-study/snc/github/SNC-WEB/src/yahoo/ydata-ymovies-user-movie-ratings-train-v1_0.txt").map(lambda line: (int(line.split(',')[0]), int(line.split(',')[1]), float(line.split(',')[2]), long(line.split(',')[3])))
        category_schema = StructType([
            StructField("userid", IntegerType(), True),
            StructField("movieid", IntegerType(), True),
            StructField("rating", FloatType(), True),
            StructField("time", LongType(), True)
        ])
        category_list = self.sqlContext.createDataFrame(category_list, category_schema)
        category_list.registerTempTable("data")
        movie_list = self.sqlContext.sql("SELECT movieid, COUNT(movieid) AS ct FROM data GROUP BY movieid")
        movie_list.registerTempTable("movie")
        movieid = movie_list.sort(movie_list.ct.desc()).first().movieid
        # movieid = category_list.first().movieid
        category_list = self.sqlContext.sql("SELECT * FROM data WHERE movieid = {0}".format(movieid))
        category_list.registerTempTable("data")
        user_list = self.sqlContext.sql("SELECT DISTINCT userid FROM data LIMIT 50")
        print(user_list.count())
        user_list.show()
        user_list.registerTempTable("users")
        category_list = self.sqlContext.sql("SELECT d.userid AS userid, d.movieid AS movieid, d.rating AS rating, d.time AS time FROM data d, users u WHERE d.userid = u.userid").repartition(1)
        # category_list = self.sqlContext.createDataFrame(category_list, category_schema)
        category_list = category_list.map(lambda line: str(line.userid) + "," + str(line.movieid) + "," + str(line.rating) + "," + str(line.time))
        category_list = category_list.repartition(1)
        category_list.saveAsTextFile("data.txt")
Example 2: TestRDDFunctions
# Required import: from pyspark.context import SparkContext [as alias]
# Or: from pyspark.context.SparkContext import textFile [as alias]
class TestRDDFunctions(PySparkTestCase):
    def test_failed_sparkcontext_creation(self):
        # Regression test for SPARK-1550
        self.sc.stop()
        self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name"))
        self.sc = SparkContext("local")

    def test_save_as_textfile_with_unicode(self):
        # Regression test for SPARK-970
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsTextFile(tempFile.name)
        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))

    def test_transforming_cartesian_result(self):
        # Regression test for SPARK-1034
        rdd1 = self.sc.parallelize([1, 2])
        rdd2 = self.sc.parallelize([3, 4])
        cart = rdd1.cartesian(rdd2)
        result = cart.map(lambda (x, y): x + y).collect()

    def test_transforming_pickle_file(self):
        # Regression test for SPARK-2601
        data = self.sc.parallelize(["Hello", "World!"])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsPickleFile(tempFile.name)
        pickled_file = self.sc.pickleFile(tempFile.name)
        pickled_file.map(lambda x: x).collect()

    def test_cartesian_on_textfile(self):
        # Regression test for
        path = os.path.join(SPARK_HOME, "python/test_support/hello.txt")
        a = self.sc.textFile(path)
        result = a.cartesian(a).collect()
        (x, y) = result[0]
        self.assertEqual("Hello World!", x.strip())
        self.assertEqual("Hello World!", y.strip())

    def test_deleting_input_files(self):
        # Regression test for SPARK-1025
        tempFile = tempfile.NamedTemporaryFile(delete=False)
        tempFile.write("Hello World!")
        tempFile.close()
        data = self.sc.textFile(tempFile.name)
        filtered_data = data.filter(lambda x: True)
        self.assertEqual(1, filtered_data.count())
        os.unlink(tempFile.name)
        self.assertRaises(Exception, lambda: filtered_data.count())

    def testAggregateByKey(self):
        data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2)

        def seqOp(x, y):
            x.add(y)
            return x

        def combOp(x, y):
            x |= y
            return x

        sets = dict(data.aggregateByKey(set(), seqOp, combOp).collect())
        self.assertEqual(3, len(sets))
        self.assertEqual(set([1]), sets[1])
        self.assertEqual(set([2]), sets[3])
        self.assertEqual(set([1, 3]), sets[5])
Example 3: TestRDDFunctions
# Required import: from pyspark.context import SparkContext [as alias]
# Or: from pyspark.context.SparkContext import textFile [as alias]
class TestRDDFunctions(PySparkTestCase):
    def test_failed_sparkcontext_creation(self):
        # Regression test for SPARK-1550
        self.sc.stop()
        self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name"))
        self.sc = SparkContext("local")

    def test_save_as_textfile_with_unicode(self):
        # Regression test for SPARK-970
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsTextFile(tempFile.name)
        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))

    def test_save_as_textfile_with_utf8(self):
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x.encode("utf-8")])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsTextFile(tempFile.name)
        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))

    def test_transforming_cartesian_result(self):
        # Regression test for SPARK-1034
        rdd1 = self.sc.parallelize([1, 2])
        rdd2 = self.sc.parallelize([3, 4])
        cart = rdd1.cartesian(rdd2)
        result = cart.map(lambda (x, y): x + y).collect()

    def test_transforming_pickle_file(self):
        # Regression test for SPARK-2601
        data = self.sc.parallelize(["Hello", "World!"])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsPickleFile(tempFile.name)
        pickled_file = self.sc.pickleFile(tempFile.name)
        pickled_file.map(lambda x: x).collect()

    def test_cartesian_on_textfile(self):
        # Regression test for
        path = os.path.join(SPARK_HOME, "python/test_support/hello.txt")
        a = self.sc.textFile(path)
        result = a.cartesian(a).collect()
        (x, y) = result[0]
        self.assertEqual("Hello World!", x.strip())
        self.assertEqual("Hello World!", y.strip())

    def test_deleting_input_files(self):
        # Regression test for SPARK-1025
        tempFile = tempfile.NamedTemporaryFile(delete=False)
        tempFile.write("Hello World!")
        tempFile.close()
        data = self.sc.textFile(tempFile.name)
        filtered_data = data.filter(lambda x: True)
        self.assertEqual(1, filtered_data.count())
        os.unlink(tempFile.name)
        self.assertRaises(Exception, lambda: filtered_data.count())

    def testAggregateByKey(self):
        data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2)

        def seqOp(x, y):
            x.add(y)
            return x

        def combOp(x, y):
            x |= y
            return x

        sets = dict(data.aggregateByKey(set(), seqOp, combOp).collect())
        self.assertEqual(3, len(sets))
        self.assertEqual(set([1]), sets[1])
        self.assertEqual(set([2]), sets[3])
        self.assertEqual(set([1, 3]), sets[5])

    def test_itemgetter(self):
        rdd = self.sc.parallelize([range(10)])
        from operator import itemgetter
        self.assertEqual([1], rdd.map(itemgetter(1)).collect())
        self.assertEqual([(2, 3)], rdd.map(itemgetter(2, 3)).collect())

    def test_namedtuple_in_rdd(self):
        from collections import namedtuple
        Person = namedtuple("Person", "id firstName lastName")
        jon = Person(1, "Jon", "Doe")
        jane = Person(2, "Jane", "Doe")
        theDoes = self.sc.parallelize([jon, jane])
        self.assertEquals([jon, jane], theDoes.collect())

    def test_large_broadcast(self):
        N = 100000
        data = [[float(i) for i in range(300)] for i in range(N)]
        bdata = self.sc.broadcast(data)  # 270MB
        m = self.sc.parallelize(range(1), 1).map(lambda x: len(bdata.value)).sum()
        self.assertEquals(N, m)
# ... (the rest of the code is omitted here) ...
Example 4: SparkConf
# Required import: from pyspark.context import SparkContext [as alias]
# Or: from pyspark.context.SparkContext import textFile [as alias]
'''
Created on Oct 30, 2015

@author: dyerke
'''
from pyspark.context import SparkContext
from pyspark.conf import SparkConf

if __name__ == '__main__':
    m_hostname = "dyerke-Inspiron-7537"
    #
    conf = SparkConf()
    conf.setAppName("MyTestApp")
    conf.setMaster("spark://" + m_hostname + ":7077")
    conf.setSparkHome("/usr/local/spark")
    conf.set("spark.driver.host", m_hostname)
    logFile = "/usr/local/spark/README.md"  # Should be some file on your system
    #
    sc = SparkContext(conf=conf)
    logData = sc.textFile(logFile).cache()
    #
    countAs = logData.filter(lambda x: 'a' in x).count()
    countBs = logData.filter(lambda x: 'b' in x).count()
    #
    print("Lines with a: %i, lines with b: %i" % (countAs, countBs))
    sc.stop()
Example 5: run
# Required import: from pyspark.context import SparkContext [as alias]
# Or: from pyspark.context.SparkContext import textFile [as alias]
def run(host, database, collection, start_time=None, end_time=None, center=None, degree=None):
    response = tangelo.empty_response()

    # Bail with error if any of the required arguments is missing.
    missing = map(lambda x: x[0], filter(lambda x: x[1] is None, zip(["start_time", "end_time", "center", "degree"], [start_time, end_time, center, degree])))
    if len(missing) > 0:
        response["error"] = "missing required arguments: %s" % (", ".join(missing))
        return response

    # Cast the arguments to the right types.
    #
    # The degree is the degree of separation between the center element and the
    # retrieved nodes - an integer.
    try:
        degree = int(degree)
    except ValueError:
        response["error"] = "argument 'degree' must be an integer"
        return response

    # The start time is the number of milliseconds since the epoch (which is how
    # JavaScript dates are constructed, and therefore how dates are stored in
    # MongoDB) - an integer.
    try:
        start_time = datetime.datetime.strptime(start_time, "%Y-%m-%d")
    except ValueError:
        response["error"] = "argument 'start_time' must be in YYYY-MM-DD format"
        return response

    # The end time is another date - an integer.
    try:
        end_time = datetime.datetime.strptime(end_time, "%Y-%m-%d")
    except ValueError:
        response["error"] = "argument 'end_time' must be in YYYY-MM-DD format"
        return response

    # Get a handle to the database collection.
    if SparkContext._active_spark_context == None:
        sc = SparkContext('spark://impaladev.darpa.mil:7077', 'Enron Emailers')
    else:
        sc = SparkContext._active_spark_context
    enronData = sc.textFile('hdfs://localhost:8020/user/bigdata/pgill/enron/email_graph_fixed.txt').map(lambda line: line.split('\t')).cache()

    def withinTimespan(record):
        recordDate = datetime.datetime.strptime(record[2], "%Y-%m-%d")
        return recordDate >= start_time and recordDate < end_time

    def emptyRecords(record):
        return record[0] != "" and record[1] != ""

    def orderRecord(record):
        if record[1] < record[0]:
            record[0], record[1] = record[1], record[0]
        return record

    enronSpan = enronData.filter(withinTimespan).filter(emptyRecords).map(orderRecord).map(lambda rec: (rec[0], rec[1])).distinct().cache()

    # Start a set of all interlocutors we're interested in - that includes the
    # center emailer.
    talkers = set([center])

    # Also start a table of distances from the center.
    distance = {center: 0}

    current_talkers = list(talkers)
    all_results = []
    for i in range(degree):
        def emailsInvolved(record):
            return any(keyword in record for keyword in current_talkers)

        results = enronSpan.filter(emailsInvolved).collect()

        # Collect the names.
        current_talkers = list(itertools.chain(*map(lambda x: [x[1], x[0]], results)))
        current_talkers = list(set(current_talkers))
        talkers = talkers.union(current_talkers)

        # Compute updates to everyone's distance from center.
        for t in current_talkers:
            if t not in distance:
                distance[t] = i+1

        # save the cursor.
        all_results.append(results)

    # Construct a canonical graph structure from the set of talkers and the list
    # of emails.
    #
    # Start with an index map of the talkers.
    talkers = list(talkers)
    talker_index = {name: index for (index, name) in enumerate(talkers)}

    # Create a chained iterable from all the rewound partial results.
    all_results = itertools.chain(*all_results)

    # Create a list of graph edges suitable for use by D3 - replace each record
    # in the data with one that carries an index into the emailers list.
    edges = []
    ident = 0
# ... (the rest of the code is omitted here) ...
Example 6: TestRDDFunctions
# Required import: from pyspark.context import SparkContext [as alias]
# Or: from pyspark.context.SparkContext import textFile [as alias]
class TestRDDFunctions(PySparkTestCase):
    def test_failed_sparkcontext_creation(self):
        # Regression test for SPARK-1550
        self.sc.stop()
        self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name"))
        self.sc = SparkContext("local")

    def test_save_as_textfile_with_unicode(self):
        # Regression test for SPARK-970
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsTextFile(tempFile.name)
        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))

    def test_transforming_cartesian_result(self):
        # Regression test for SPARK-1034
        rdd1 = self.sc.parallelize([1, 2])
        rdd2 = self.sc.parallelize([3, 4])
        cart = rdd1.cartesian(rdd2)
        result = cart.map(lambda (x, y): x + y).collect()

    def test_transforming_pickle_file(self):
        # Regression test for SPARK-2601
        data = self.sc.parallelize(["Hello", "World!"])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsPickleFile(tempFile.name)
        pickled_file = self.sc.pickleFile(tempFile.name)
        pickled_file.map(lambda x: x).collect()

    def test_cartesian_on_textfile(self):
        # Regression test for
        path = os.path.join(SPARK_HOME, "python/test_support/hello.txt")
        a = self.sc.textFile(path)
        result = a.cartesian(a).collect()
        (x, y) = result[0]
        self.assertEqual("Hello World!", x.strip())
        self.assertEqual("Hello World!", y.strip())

    def test_deleting_input_files(self):
        # Regression test for SPARK-1025
        tempFile = tempfile.NamedTemporaryFile(delete=False)
        tempFile.write("Hello World!")
        tempFile.close()
        data = self.sc.textFile(tempFile.name)
        filtered_data = data.filter(lambda x: True)
        self.assertEqual(1, filtered_data.count())
        os.unlink(tempFile.name)
        self.assertRaises(Exception, lambda: filtered_data.count())

    def testAggregateByKey(self):
        data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2)

        def seqOp(x, y):
            x.add(y)
            return x

        def combOp(x, y):
            x |= y
            return x

        sets = dict(data.aggregateByKey(set(), seqOp, combOp).collect())
        self.assertEqual(3, len(sets))
        self.assertEqual(set([1]), sets[1])
        self.assertEqual(set([2]), sets[3])
        self.assertEqual(set([1, 3]), sets[5])

    def test_itemgetter(self):
        rdd = self.sc.parallelize([range(10)])
        from operator import itemgetter
        self.assertEqual([1], rdd.map(itemgetter(1)).collect())
        self.assertEqual([(2, 3)], rdd.map(itemgetter(2, 3)).collect())

    def test_namedtuple_in_rdd(self):
        from collections import namedtuple
        Person = namedtuple("Person", "id firstName lastName")
        jon = Person(1, "Jon", "Doe")
        jane = Person(2, "Jane", "Doe")
        theDoes = self.sc.parallelize([jon, jane])
        self.assertEquals([jon, jane], theDoes.collect())

    def test_large_broadcast(self):
        N = 100000
        data = [[float(i) for i in range(300)] for i in range(N)]
        bdata = self.sc.broadcast(data)  # 270MB
        m = self.sc.parallelize(range(1), 1).map(lambda x: len(bdata.value)).sum()
        self.assertEquals(N, m)
Example 7: SQLContext
# Required import: from pyspark.context import SparkContext [as alias]
# Or: from pyspark.context.SparkContext import textFile [as alias]
# read data as CSV for Dataframe analysis
# /Volumes/work/data/kaggle/ssi.csv
# read data n0rmally
"""
sqlContext = SQLContext(sc)
df = sqlContext.read.format('com.databricks.spark.csv').options(header='false').load(BASE_DATA_PATH + '/ssi.csv')
# summarize(df)
print df.show()
#points = df.map(lambda row: LabeledPoint(input[row.C4],[float(row.C0),float(row.C1),float(row.C2),float(row.C3)]))
values using Dataframe
Final weights: [-137.221167143,12.555647803,53.629362055,109.314252441]
Final intercept: 0.0
"""
points = sc.textFile(BASE_DATA_PATH + "/ssi.csv").map(parsePoint)
model = LogisticRegressionWithSGD.train(points, 10)
print("Final weights: " + str(model.weights))
print("Final intercept: " + str(model.intercept))
"""
Final weights: [-137.221167143,12.555647803,53.629362055,109.314252441]
Final intercept: 0.0
"""
sc.stop()
Example 8: SparkContext
# Required import: from pyspark.context import SparkContext [as alias]
# Or: from pyspark.context.SparkContext import textFile [as alias]
import os
import sys
import platform
import py4j
import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SQLContext, HiveContext
from pyspark.storagelevel import StorageLevel
# init - create SparkContext on Azure HDInsight
# on Azure HDInsight the master defaults to yarn
sc = SparkContext(appName="wc")
print "sys.argv[1]: ", sys.argv[1]
wc = sc.textFile(sys.argv[1]) \
       .map(lambda x: x.replace(',', ' ').replace('.', ' ').replace('-', ' ').lower()) \
       .flatMap(lambda x: x.split(" ")) \
       .map(lambda x: (x, 1)) \
       .reduceByKey(lambda x, y: x + y)
print wc.collect()
Example 9: SparkContext
# Required import: from pyspark.context import SparkContext [as alias]
# Or: from pyspark.context.SparkContext import textFile [as alias]
from pyspark.context import SparkContext
sc = SparkContext(...)
lines = sc.textFile(sys.argv[2],1)
counts = lines.flatMap(lambda x: x.split(' ')) \
              .map(lambda x: (x, 1)) \
              .reduceByKey(lambda x, y: x + y)
for (word, count) in counts.collect():
    print "%s:%i" % (word, count)
Example 10: int
# Required import: from pyspark.context import SparkContext [as alias]
# Or: from pyspark.context.SparkContext import textFile [as alias]
    else:
        try:
            id1 = int(string.strip(data[0]))
            id2 = int(string.strip(data[1]))
            return (id1, id2)
        except:
            return (-1, "error")
filePath = '/home/piyush/datasets/audioscrobbler/'
conf = SparkConf().setAppName("audio_scrobbler")
sc = SparkContext(conf=conf)
# parse raw user artist data
userArtistDataFile = filePath + 'user_artist_data.txt'
rawUserArtistData = sc.textFile(userArtistDataFile)
# parse Artist data file
artistDataFile = filePath + 'artist_data.txt'
rawArtistData = sc.textFile(artistDataFile)
artistById = rawArtistData.map(parseArtistByIdData).filter(lambda (k, v) : k != -1)
# parse artist alias file
artistAliasDataFile = filePath + 'artist_alias.txt'
rawArtistAliasData = sc.textFile(artistAliasDataFile)
artistAlias = rawArtistAliasData.map(parseArtistAliasData).filter(lambda (k, v) : k != -1).collectAsMap()
# broadcast variable
bArtistAlias = sc.broadcast(artistAlias)
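The excerpt stops right after creating the broadcast variable. A hedged sketch of how such a broadcast alias map is typically used next follows; the field layout of user_artist_data.txt and the canonicalize helper are assumptions, not part of the original code.

# Hypothetical continuation: rewrite aliased artist IDs to their canonical IDs.
# Assumes each line of user_artist_data.txt is "userid artistid playcount".
def canonicalize(line):
    tokens = line.split(' ')
    user_id, artist_id, count = int(tokens[0]), int(tokens[1]), int(tokens[2])
    # Replace an aliased artist ID with its canonical ID when one exists.
    final_artist_id = bArtistAlias.value.get(artist_id, artist_id)
    return (user_id, final_artist_id, count)

train_data = rawUserArtistData.map(canonicalize).cache()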
Example 11: toNumpy
# Required import: from pyspark.context import SparkContext [as alias]
# Or: from pyspark.context.SparkContext import textFile [as alias]
if args.format == "tfr":
    images = sc.newAPIHadoopFile(args.images, "org.tensorflow.hadoop.io.TFRecordFileInputFormat",
                                 keyClass="org.apache.hadoop.io.BytesWritable",
                                 valueClass="org.apache.hadoop.io.NullWritable")

    def toNumpy(bytestr):
        example = tf.train.Example()
        example.ParseFromString(bytestr)
        features = example.features.feature
        image = numpy.array(features['image'].int64_list.value)
        label = numpy.array(features['label'].int64_list.value)
        return (image, label)

    dataRDD = images.map(lambda x: toNumpy(str(x[0])))
else:
    if args.format == "csv":
        images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')])
        labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')])
    else:  # args.format == "pickle":
        images = sc.pickleFile(args.images)
        labels = sc.pickleFile(args.labels)
    print("zipping images and labels")
    dataRDD = images.zip(labels)

cluster = TFCluster.run(sc, mnist_dist.map_fun, args, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.SPARK)
if args.mode == "train":
    cluster.train(dataRDD, args.epochs)
else:
    labelRDD = cluster.inference(dataRDD)
    labelRDD.saveAsTextFile(args.output)
cluster.shutdown()
Example 12: SparkContext
# Required import: from pyspark.context import SparkContext [as alias]
# Or: from pyspark.context.SparkContext import textFile [as alias]
import os
import sys
from pprint import pprint
from operator import add
import pyspark
from pyspark.context import SparkContext
sc = SparkContext()
file = "SampleData3.txt"
wordcounts = sc.textFile(file) \
    .map(lambda l: ((l.split(" ")[0], len([x for x in l.split(" ")[1:] if ("gene_" in x or "disease_" in x)])), [x for x in l.split(" ")[1:] if ("gene_" in x or "disease_" in x)])) \
    .flatMap(lambda x: x.split()) \
    .map(lambda x: (x, 1)) \
    .reduceByKey(lambda x, y: x + y) \
    .map(lambda x: (x[1], x[0])) \
    .sortByKey(False)
Example 13: SparkContext
# Required import: from pyspark.context import SparkContext [as alias]
# Or: from pyspark.context.SparkContext import textFile [as alias]
import sys
import time
import math
import utils
from pyspark.context import SparkContext

if (len(sys.argv) > 1):
    hdfs_file_path = "/user/lsde02/data/%s/*.gz" % sys.argv[1]
else:
    hdfs_file_path = "/user/lsde02/data/1901/*.gz"
hdfs_results_path = "/user/lsde02/results/"
start_time = time.strftime("%Y-%m-%d-%H-%M-%S")

sc = SparkContext()
context = sc.textFile(hdfs_file_path)
stations = context.flatMap(lambda x: [utils.extract(record) for record in x.splitlines()])
stations = stations.filter(lambda x: 'longitude' in x[1] and 'latitude' in x[1])
stations.persist()

# Do computations on month level.
# Value tuple per record: (temp, wind-speed, sky-condition, visibility, wind-direction).
month_data = stations.map(lambda x: ((x[0][0], x[0][1], x[0][3]), (x[1]['temp'], x[1]['wind-speed'], x[1]['sky-condition'], x[1]['visibility'], x[1]['wind-direction'])))
# Accumulator layout: (temp_sum, temp_cnt, wind_sum, wind_cnt, sky_sum, sky_cnt, vis_sum, vis_cnt, sin_sum, cos_sum).
month_data = month_data.combineByKey(
    lambda value: (value[0], 1, value[1], 1, value[2], 1, value[3], 1,
                   math.sin(value[4]*math.pi/180.), math.cos(value[4]*math.pi/180.)),
    lambda x, value: (x[0]+value[0], x[1]+1, x[2]+value[1], x[3]+1, x[4]+value[2], x[5]+1,
                      x[6]+value[3], x[7]+1,
                      x[8]+math.sin(value[4]*math.pi/180.), x[9]+math.cos(value[4]*math.pi/180.)),
    lambda x, y: (x[0]+y[0], x[1]+y[1], x[2]+y[2], x[3]+y[3], x[4]+y[4], x[5]+y[5],
                  x[6]+y[6], x[7]+y[7], x[8]+y[8], x[9]+y[9]))
month_data = month_data.map(lambda (label, (x1, c1, x2, c2, x3, c3, x4, c4, x5a, x5b)): (label, (x1/c1, x2/c2, x3/c3, x4/c4, math.atan2(x5a, x5b))))
month_data = month_data.coalesce(1, True)
month_data.saveAsTextFile("%s%s-%s" % (hdfs_results_path, start_time, 'all'))
Example 14: str
# Required import: from pyspark.context import SparkContext [as alias]
# Or: from pyspark.context.SparkContext import textFile [as alias]
        directories += str(i)
        if i < int(sys.argv[2]):
            directories += ","
    directories += "}"
    hdfs_file_path = "/user/lsde02/data/%s/*.gz" % directories
    forced_partitions = (int(sys.argv[2])+1-int(sys.argv[1]))*12
else:
    hdfs_file_path = "/user/lsde02/data/*/*.gz"
    forced_partitions = 1500
hdfs_results_path = "/user/lsde02/results/"
start_time = time.strftime("%Y-%m-%d-%H-%M-%S")
print "Started processing: %s" % hdfs_file_path

sc = SparkContext()
context = sc.textFile(hdfs_file_path, forced_partitions)
stations = context.flatMap(lambda x: [utils.extract(record) for record in x.splitlines()])
#stations = stations.filter(lambda x: 'fixed-weather-station' in x[1] or )

# Do computations on month level
month_data = stations.map(lambda x: ((x[0][0], x[0][1], x[0][3]), (utils.get_attribute(x[1], 'temp'), utils.get_attribute(x[1], 'windspeed'), \
    utils.get_attribute(x[1], 'sky-condition'), utils.get_attribute(x[1], 'visibility'), utils.get_attribute(x[1], 'wind-direction'), \
    utils.get_attribute(x[1], 'latitude'), utils.get_attribute(x[1], 'longitude'))))
month_data = month_data.combineByKey(lambda value: (value[0] if value[0] != None else 0, 1 if value[0] != None else 0, \
    value[1] if value[1] != None else 0, 1 if value[1] != None else 0, \
    value[2] if value[2] != None else 0, 1 if value[2] != None else 0, \
    value[3] if value[3] != None else 0, 1 if value[3] != None else 0, \
    math.sin(value[4]*math.pi/180.0) if value[4] != None else 0, \
    math.cos(value[4]*math.pi/180.0) if value[4] != None else 0, \
    value[0]*value[0] if value[0] != None else 0, \
    value[1]*value[1] if value[1] != None else 0, \
示例15: print
# 需要导入模块: from pyspark.context import SparkContext [as 别名]
# 或者: from pyspark.context.SparkContext import textFile [as 别名]
print('\nRunning example of classification using GradientBoostedTrees\n')
testClassification(trainingData, testData)
print('\nRunning example of regression using GradientBoostedTrees\n')
testRegression(trainingData, testData)
sc.stop()
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint
train = sc.textFile("train.csv")
# Load and parse the data
def parsePoint(line):
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(values[-1], values[:-1])
#data = sc.textFile("data/mllib/sample_svm_data.txt")
parsedData = train.map(parsePoint)
# Build the model
model = SVMWithSGD.train(parsedData, iterations=100)
# Evaluating the model on training data
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))