This article collects typical usage examples of the Python method pyspark.SparkContext.newAPIHadoopRDD. If you are wondering what SparkContext.newAPIHadoopRDD does, how to call it, or what it looks like in real code, the hand-picked examples below may help. You can also look at further usage examples of the containing class, pyspark.SparkContext.
The following shows 15 code examples of SparkContext.newAPIHadoopRDD, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
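Before the examples, here is a minimal sketch of the general call pattern: fully qualified Java class names are passed as strings, and connector-specific settings travel in the conf dictionary. The input format, key/value writable classes, configuration key, and HDFS path below are placeholders to be swapped for the connector-specific values shown in the examples that follow.
from pyspark import SparkConf, SparkContext

sc = SparkContext(conf=SparkConf().setAppName("newAPIHadoopRDD sketch"))
# Generic pattern: read text files through the new Hadoop API.
rdd = sc.newAPIHadoopRDD(
    inputFormatClass="org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
    keyClass="org.apache.hadoop.io.LongWritable",
    valueClass="org.apache.hadoop.io.Text",
    conf={"mapreduce.input.fileinputformat.inputdir": "hdfs:///path/to/input"})
print(rdd.take(5))
sc.stop()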
Example 1: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
def main():
    conf = SparkConf().setAppName("pyspark test")
    sc = SparkContext(conf=conf)

    # Create an RDD backed by the MongoDB collection.
    # MongoInputFormat allows us to read from a live MongoDB instance.
    # We could also use BSONFileInputFormat to read BSON snapshots.
    rdd = sc.newAPIHadoopRDD(
        inputFormatClass='com.mongodb.hadoop.MongoInputFormat',
        keyClass='org.apache.hadoop.io.Text',
        valueClass='org.apache.hadoop.io.MapWritable',
        conf={
            'mongo.input.uri': 'mongodb://localhost:27017/db.collection'
        }
    )

    # Save this RDD as a Hadoop "file".
    # The path argument is unused; all documents will go to "mongo.output.uri".
    rdd.saveAsNewAPIHadoopFile(
        path='file:///this-is-unused',
        outputFormatClass='com.mongodb.hadoop.MongoOutputFormat',
        keyClass='org.apache.hadoop.io.Text',
        valueClass='org.apache.hadoop.io.MapWritable',
        conf={
            'mongo.output.uri': 'mongodb://localhost:27017/output.collection'
        }
    )

    # We can also save this back to a BSON file.
    rdd.saveAsNewAPIHadoopFile(
        path='hdfs://localhost:8020/user/spark/bson-demo',
        outputFormatClass='com.mongodb.hadoop.BSONFileOutputFormat',
        keyClass='org.apache.hadoop.io.Text',
        valueClass='org.apache.hadoop.io.MapWritable'
    )
Example 2: index
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
def index():
    input_fmt_cls_name = 'com.mongodb.hadoop.MongoInputFormat'
    output_fmt_cls_name = 'com.mongodb.spark.PySparkMongoOutputFormat'
    val_cls_name = key_cls_name = 'com.mongodb.hadoop.io.BSONWritable'
    val_converter = key_converter = 'com.mongodb.spark.pickle.NoopConverter'

    config = load_config()
    host, port = config.get('mongo', 'host'), config.get('mongo', 'port')
    dbname = config.get('mongo', 'dbname')
    dbpath_in = 'mongodb://{}:{}/{}.documents'.format(host, port, dbname)
    dbpath_out = 'mongodb://{}:{}/{}.indexes_raw'.format(host, port, dbname)

    sc = SparkContext('local', 'pyspark')
    doc_rdd_raw = sc.newAPIHadoopRDD(input_fmt_cls_name, key_cls_name,
                                     val_cls_name, None, None,
                                     {'mongo.input.uri': dbpath_in})
    doc_rdd = doc_rdd_raw.values()

    result = doc_rdd.flatMap(index_document)  # .reduceByKey(join_hits)
    # result.coalesce(1, True).saveAsTextFile('results')
    result.saveAsNewAPIHadoopFile(
        'file:///placeholder',
        outputFormatClass=output_fmt_cls_name,
        keyClass=key_cls_name,
        valueClass=val_cls_name,
        keyConverter=key_converter,
        valueConverter=val_converter,
        conf={'mongo.output.uri': dbpath_out})
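Note that load_config, index_document, and join_hits are project-specific helpers that are not part of this snippet. Purely as a hypothetical sketch (assuming each document exposes an _id and a text field), index_document could emit (term, postings) pairs suitable for the flatMap and the commented-out reduceByKey above:
import re

def index_document(doc):
    # Hypothetical: tokenize the document text and emit one (term, [doc_id]) pair per distinct term.
    doc_id = doc.get('_id')
    for term in set(re.findall(r"[a-z]+", str(doc.get('text', '')).lower())):
        yield (term, [doc_id])

def join_hits(a, b):
    # Hypothetical reducer: concatenate posting lists for the same term.
    return a + b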
Example 3: get_hbase_as_rdd
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
def get_hbase_as_rdd(host, tablename):
    sc = SparkContext(appName="hbase2rdd")
    conf = {"hbase.zookeeper.quorum": host, "hbase.mapreduce.inputtable": tablename}
    print "Connecting to host: " + conf["hbase.zookeeper.quorum"] + " table: " + conf["hbase.mapreduce.inputtable"]
    keyConv = "org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter"
    valueConv = "org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter"
    hbase_rdd = sc.newAPIHadoopRDD("org.apache.hadoop.hbase.mapreduce.TableInputFormat",
                                   "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
                                   "org.apache.hadoop.hbase.client.Result",
                                   keyConverter=keyConv, valueConverter=valueConv, conf=conf)
    return hbase_rdd
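A possible way to call this helper (the ZooKeeper host and table name below are placeholders):
rdd = get_hbase_as_rdd("zk-host.example.com", "my_table")
print(rdd.count())
print(rdd.take(3))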
Example 4: SparkConf
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
#pattern = r"((([01]?[0-9]?[0-9]|2[0-4][0-9]|25[0-5])[ (\[]?(\.|dot)[ )\]]?){3}([01]?[0-9]?[0-9]|2[0-4][0-9]|25[0-5]))"
pattern = r"((?:[0-9]{1,3}\.){3}[0-9]{1,3})"

if __name__ == "__main__":
    #string confstring = { 'es.nodes' : 'elasticsearch', 'es.port' : '9200', 'es.resource' : 'graylog2_0/message', 'es.query' : '""', "fields" : [ "message" ] } }}' }
    conf = SparkConf().setAppName("ESTest")
    sc = SparkContext(conf=conf)

    es_read_conf = {
        'es.nodes': 'elasticsearch',
        'es.port': '9200',
        'es.resource': 'graylog2_*/message',
        'es.query': '{"query": { "multi_match" : { "query" : " ", "fields" : [ "message" ] } }}'
    }
    es_read_conf['es.query'] = custom_query

    es_rdd = sc.newAPIHadoopRDD(
        inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat",
        keyClass="org.apache.hadoop.io.NullWritable",
        valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
        conf=es_read_conf)

    es_write_conf = {
        'es.nodes': 'elasticsearch',
        'es.port': '9200',
        'es.resource': 'spark_analytics/severe_analytics'
    }
    es_write_conf_ip = {
        'es.nodes': 'elasticsearch',
        'es.port': '9200',
        'es.resource': 'spark_analytics/analytics'
    }

    doc = es_rdd.first()[1]
    # print es_rdd.collect()
    # exit()
Example 5: SparkConf
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
from pyspark import SparkContext, SparkConf
import sys

#spark-submit /vagrant/spark.py <tablename> <tmp-dir>
if __name__ == "__main__":
    conf = SparkConf().setAppName('tweets spark aggregation')
    sc = SparkContext(conf=conf)

    print 'reading %s' % sys.argv[1]
    conf = {"hbase.zookeeper.quorum": "localhost", "hbase.mapreduce.inputtable": sys.argv[1]}
    keyConv = "org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter"
    valueConv = "org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter"
    hbase_rdd = sc.newAPIHadoopRDD("org.apache.hadoop.hbase.mapreduce.TableInputFormat",
                                   "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
                                   "org.apache.hadoop.hbase.client.Result",
                                   keyConverter=keyConv, valueConverter=valueConv, conf=conf)

    #following line doesn't work...probably due to https://issues.apache.org/jira/browse/SPARK-5361
    #hbase_rdd.saveAsHadoopFile("/usr/out.txt", outputFormatClass="org.apache.hadoop.mapred.TextOutputFormat", keyClass="org.apache.hadoop.io.Text", valueClass="org.apache.hadoop.io.Text")

    print 'writing to %s' % sys.argv[2]
    hbase_rdd.filter(lambda (x, y): len(y) > 0).map(lambda (_, y): y).saveAsTextFile(sys.argv[2])
Example 6: quiet_logs
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
def quiet_logs(sc):
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

sc = SparkContext("local", "Course Stats")
quiet_logs(sc)

config = {"mongo.input.uri": "mongodb://192.168.99.100:27017/wsl.learning_units"}
inputFormatClassName = "com.mongodb.hadoop.MongoInputFormat"
# these values worked but others might as well
keyClassName = "org.apache.hadoop.io.Text"
valueClassName = "org.apache.hadoop.io.MapWritable"
statsRawRDD = sc.newAPIHadoopRDD(inputFormatClassName, keyClassName, valueClassName, None, None, config)
#valuesRDD = statsRawRDD.values();
#resultRDD = valuesRDD.map(lambda doc: str(doc["type"]))

# configuration for output to MongoDB
config["mongo.output.uri"] = "mongodb://192.168.99.100:27017/wsl.stats"
outputFormatClassName = "com.mongodb.hadoop.MongoOutputFormat"

num = statsRawRDD.count()
# doesn't yet work - certain Java types like HashMap, String, GregorianCalendar
# don't serialize; version 1.5 of the hadoop mongo connector may fix this
#statsRawRDD.saveAsNewAPIHadoopFile("file:///placeholder", outputFormatClassName, None, None, None, None, config)
print "Count: ", num
Example 7: SparkConf
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
conf = SparkConf()
conf.setAppName("spark_app_wordcount_merge")
sc = SparkContext(conf=conf)

# "mapreduce.input.fileinputformat.split.minsize.per.node": "67108864"
# "mapreduce.input.fileinputformat.split.minsize.per.rack": "134217728"
hadoopConf = {"mapreduce.input.fileinputformat.inputdir": "/user/hdfs/rawlog/app_weibomobilekafka1234_topweiboaction/",
              "mapreduce.input.fileinputformat.input.dir.recursive": "true"}

# TextInputFormat + coalesce
# CombineTextInputFormat
source = sc.newAPIHadoopRDD(inputFormatClass="org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
                            keyClass="org.apache.hadoop.io.LongWritable",
                            valueClass="org.apache.hadoop.io.Text",
                            conf=hadoopConf)
source = source.coalesce(5000)

lines = source.map(lambda pair: pair[1])
words = lines.flatMap(lambda line: line.split(","))
pairs = words.map(lambda word: (word[0:10], 1))
counts = pairs.reduceByKey(lambda a, b: a + b, 30)
counts.saveAsTextFile("/user/yurun/spark/output/wordcount/")

sc.stop()
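The comments above mention CombineTextInputFormat as an alternative to reading with TextInputFormat and then calling coalesce. A hedged sketch of that variant, which merges many small files into fewer splits at read time, would replace the read above (the split size below is an arbitrary example value):
combineConf = dict(hadoopConf)
# Cap each combined split at roughly 256 MB (arbitrary example value).
combineConf["mapreduce.input.fileinputformat.split.maxsize"] = "268435456"
source = sc.newAPIHadoopRDD(inputFormatClass="org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat",
                            keyClass="org.apache.hadoop.io.LongWritable",
                            valueClass="org.apache.hadoop.io.Text",
                            conf=combineConf)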
Example 8: exit
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
    Run with example jar:
    ./bin/spark-submit --driver-class-path /path/to/example/jar /path/to/examples/cassandra_inputformat.py <host> <keyspace> <cf>
    Assumes you have some data in Cassandra already, running on <host>, in <keyspace> and <cf>
    """
    exit(-1)

host = sys.argv[1]
keyspace = sys.argv[2]
cf = sys.argv[3]
sc = SparkContext(appName="CassandraInputFormat")

conf = {"cassandra.input.thrift.address": host,
        "cassandra.input.thrift.port": "9160",
        "cassandra.input.keyspace": keyspace,
        "cassandra.input.columnfamily": cf,
        "cassandra.input.partitioner.class": "Murmur3Partitioner",
        "cassandra.input.page.row.size": "3"}
cass_rdd = sc.newAPIHadoopRDD(
    "org.apache.cassandra.hadoop.cql3.CqlPagingInputFormat",
    "java.util.Map",
    "java.util.Map",
    keyConverter="org.apache.spark.examples.pythonconverters.CassandraCQLKeyConverter",
    valueConverter="org.apache.spark.examples.pythonconverters.CassandraCQLValueConverter",
    conf=conf)
output = cass_rdd.collect()
for (k, v) in output:
    print (k, v)
sc.stop()
Example 9: chunks
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
# credit http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks-in-python
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i+n]

# set up context
sc = SparkContext("local[*]", "Simple App")
#sc = SparkContext("spark://url:7077", "Simple App")
sqlContext = SQLContext(sc)
sqlContext.setConf("spark.sql.shuffle.partitions", "5")

# issue movies query
conf = {"es.resource": "movies2/logs", "es.query": "?q=name:picture"}
movies = sc.newAPIHadoopRDD("org.elasticsearch.hadoop.mr.EsInputFormat",
                            "org.apache.hadoop.io.NullWritable",
                            "org.elasticsearch.hadoop.mr.LinkedMapWritable",
                            conf=conf)

# place results in table
moviesRows = movies.map(lambda p: Row(id=int(p[1]['id']), name=p[1]['name']))
moviesRowsList = moviesRows.collect()
schemaMovies = sqlContext.createDataFrame(moviesRowsList)
schemaMovies.registerTempTable("movies")
sqlContext.cacheTable("movies")

# get ids in order to form acted_in query
ids = []
for moviesRow in moviesRowsList:
    ids.append(moviesRow['id'])
movieIdSnippets = []
for id in ids:
    movieIdSnippets.append("movie_id:" + str(id))
Example 10: clean_logical
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
        }
    }
else:
    print 'must choose one option [--from-scratch; --last-week]'

# --
# Connecting to ES
resource_string = 'forms/3,4'
rdd = sc.newAPIHadoopRDD(
    inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat",
    keyClass="org.apache.hadoop.io.NullWritable",
    valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
    conf={
        "es.nodes": config['elasticsearch']['host'],
        "es.port": str(config['elasticsearch']['port']),
        "es.resource": resource_string,
        "es.query": json.dumps(query)
    }
)

# --
# Function definition
def clean_logical(x):
    if str(x).lower() == 'true':
        return 1
    if str(x).lower() == 'false':
        return 0
    else:
Example 11: len
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
es.update(index='spark-jobs', doc_type='job', id=task_id, body={
    'doc': {
        'current': 1,
        'status': 'Spark job started..'
    }
})

result_indices = len(es.indices.get_aliases(index="titanic-results-*"))
output_resource = "titanic-results-%s/value-counts" % (result_indices + 1)

conf = SparkConf().setAppName("ESTest")
sc = SparkContext(conf=conf)

es_rdd = sc.newAPIHadoopRDD(
    inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat",
    keyClass="org.apache.hadoop.io.NullWritable",
    valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
    conf={"es.resource": "titanic/passenger"})

doc = es_rdd.first()[1]
num_fields = len(doc)

for idx, field in enumerate(doc):
    es.update(index='spark-jobs', doc_type='job', id=task_id, body={
        'doc': {
            'current': (idx + 1) * 95 / num_fields,
            'status': 'Spark job underway..'
        }
    })
Example 12: inverse_ref
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
def inverse_ref(url_words):
    url = url_words[0]
    words = re.split("[^a-z]+", url_words[1].lower())
    for word in words:
        stemmed_word = stem(word.decode('utf-8'))
        if is_word_ok(stemmed_word):
            yield {stemmed_word: url}

hbaseConfig = {"hbase.mapreduce.inputtable": "wiki", "hbase.mapreduce.scan.columns": "cf:content"}
table = sc.newAPIHadoopRDD(
    'org.apache.hadoop.hbase.mapreduce.TableInputFormat',
    'org.apache.hadoop.hbase.io.ImmutableBytesWritable',
    'org.apache.hadoop.hbase.client.Result',
    keyConverter="org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter",
    valueConverter="org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter",
    conf=hbaseConfig)

words_occ = table.map(lambda l: inverse_ref(l)).filter(lambda l: l).map(lambda l: (l, 1))
words = words_occ.map(lambda l: (l, 1)).reduceByKey(lambda a, b: a + b).filter(lambda (a, b): b > 100)
word_counts = dict(words.collectAsMap())
for word in word_counts:
Example 13: SparkContext
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
#sc = SparkContext("spark://quickstart.cloudera:7077", "Test MongoDB Connector")
sc = SparkContext("local", "Test MongoDB Connector")

# MongoDB configuration
inputConfig = {"mongo.input.uri": "mongodb://localhost:27017/marketdata.stock_prices"}
outputConfig = {"mongo.output.uri": "mongodb://localhost:27017/marketdata.maxminprices"}

# Configuration for the RDD that reads the data from MongoDB
inputFormatClassName = "com.mongodb.hadoop.MongoInputFormat"
keyClassName = "java.lang.Object"
valueClassName = "org.bson.BSONObject"
stockPricesRDD = sc.newAPIHadoopRDD(inputFormatClassName, keyClassName, valueClassName, None, None, inputConfig)

# Configuration for the RDD that writes to MongoDB
outputFormatClassName = "com.mongodb.hadoop.MongoOutputFormat"

# The processing...
# ... over the whole data set
prices = stockPricesRDD.values()

# ... group by (symbol, day)
groupByRDD = prices.groupBy(lambda doc: (doc["Symbol"], doc["Day"]))
# .map(lambda tuple: (tuple[0], tuple[1])) \
# .collect()

# ... aggregate per key (take the max of High and the min of Low)
def maxMin(doc, groupedDocs):
Example 14: run_driver
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
def run_driver(keyspace):
    sc = SparkContext(appName="PySpark Cassandra Hadoop Example")

    # Reading from Cassandra
    conf = {
        "cassandra.input.thrift.address": "localhost",
        "cassandra.input.thrift.port": "9160",
        "cassandra.input.keyspace": keyspace,
        "cassandra.input.columnfamily": "users",
        "cassandra.input.partitioner.class": "Murmur3Partitioner",
        "cassandra.input.page.row.size": "5000"
    }
    cass_rdd = sc.newAPIHadoopRDD(
        # inputFormatClass
        "org.apache.cassandra.hadoop.cql3.CqlInputFormat",
        # keyClass
        "java.util.Map",
        # valueClass
        "java.util.Map",
        keyConverter=INPUT_KEY_CONVERTER,
        valueConverter=INPUT_VALUE_CONVERTER,
        conf=conf)
    print cass_rdd.collect()

    # Writing to Cassandra
    now = dt.datetime.now()
    users = (
        {
            "id": "keith",
            "created_at": now,
            "updated_at": now,
            "first_name": "Keith",
            "last_name": "Bourgoin",
            "emails": set(["[email protected]"]),
            "logins": [now.isoformat()],
            "settings": {
                "background_color": "blue",
            },
        },
        {
            "id": "toms",
            "created_at": now,
            "updated_at": now,
            "first_name": "Toms",
            "last_name": "Baugis",
            "emails": set(["[email protected]"]),
            "logins": [now.isoformat()],
            "settings": {
                "background_color": "green",
            },
        },
    )

    cql = """
    UPDATE users
    SET created_at=?, updated_at=?, first_name=?, last_name=?, emails=?,
        logins=?, settings=?
    """.strip()
    conf = {
        "cassandra.output.thrift.address": "localhost",
        "cassandra.output.thrift.port": "9160",
        "cassandra.output.keyspace": keyspace,
        "cassandra.output.partitioner.class": "Murmur3Partitioner",
        "cassandra.output.cql": cql,
        "mapreduce.output.basename": "users",
        "mapreduce.outputformat.class": "org.apache.cassandra.hadoop.cql3.CqlOutputFormat",
        "mapreduce.job.output.key.class": "java.util.Map",
        "mapreduce.job.output.value.class": "java.util.List"
    }
    users = sc.parallelize(users)
    users.map(to_cql_output_format)\
         .saveAsNewAPIHadoopDataset(conf=conf,
                                    keyConverter=OUTPUT_KEY_CONVERTER,
                                    valueConverter=OUTPUT_VALUE_CONVERTER)

    sc.stop()
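The converter constants and to_cql_output_format are defined elsewhere in the example project and are not shown here. Purely as a hypothetical sketch, the mapping function could turn each user dict into the (key map, bound-values list) pair that CqlOutputFormat expects, with the list entries matching the "?" placeholders in the CQL statement in order:
def to_cql_output_format(user):
    # Hypothetical: the partition key column goes in the key map,
    # the remaining columns are bound to the "?" placeholders in order.
    key = {"id": user["id"]}
    values = [
        user["created_at"], user["updated_at"],
        user["first_name"], user["last_name"],
        user["emails"], user["logins"], user["settings"],
    ]
    return (key, values)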
Example 15: row(s)
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
row2  column=f1:, timestamp=1401883415212, value=value2
row3  column=f1:, timestamp=1401883417858, value=value3
row4  column=f1:, timestamp=1401883420805, value=value4
4 row(s) in 0.0240 seconds
"""
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print >> sys.stderr, """
        Usage: hbase_inputformat <host> <table>
        Run with example jar:
        ./bin/spark-submit --driver-class-path /path/to/example/jar /path/to/examples/hbase_inputformat.py <host> <table>
        Assumes you have some data in HBase already, running on <host>, in <table>
        """
        exit(-1)

    host = sys.argv[1]
    table = sys.argv[2]
    sc = SparkContext(appName="HBaseInputFormat")

    conf = {"hbase.zookeeper.quorum": host, "hbase.mapreduce.inputtable": table}
    hbase_rdd = sc.newAPIHadoopRDD(
        "org.apache.hadoop.hbase.mapreduce.TableInputFormat",
        "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
        "org.apache.hadoop.hbase.client.Result",
        valueConverter="org.apache.spark.examples.pythonconverters.HBaseConverter",
        conf=conf)
    output = hbase_rdd.collect()
    for (k, v) in output:
        print (k, v)