

Python SparkContext.newAPIHadoopRDD Method Code Examples

This article collects typical usage examples of the pyspark.SparkContext.newAPIHadoopRDD method in Python. If you have been wondering how exactly SparkContext.newAPIHadoopRDD is used, how to call it, or what real code that uses it looks like, the curated method examples below should help. You can also explore further usage examples for the class it belongs to, pyspark.SparkContext.


A total of 15 code examples of SparkContext.newAPIHadoopRDD are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
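As a quick orientation before the examples, here is a minimal sketch of the typical call pattern. It is an illustrative assumption rather than code from any of the projects below: the application name and HDFS input path are placeholders. It reads plain text files through the new Hadoop API TextInputFormat and keeps only the line values, mirroring the arguments used throughout the examples (inputFormatClass, keyClass, valueClass, an optional conf dictionary, plus optional keyConverter/valueConverter).

from pyspark import SparkConf, SparkContext

# Minimal sketch (placeholder app name and HDFS path): read text files through
# the new Hadoop API and keep only the line values (keys are byte offsets).
conf = SparkConf().setAppName("newAPIHadoopRDD-sketch")
sc = SparkContext(conf=conf)

rdd = sc.newAPIHadoopRDD(
    inputFormatClass="org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
    keyClass="org.apache.hadoop.io.LongWritable",
    valueClass="org.apache.hadoop.io.Text",
    conf={"mapreduce.input.fileinputformat.inputdir": "hdfs:///tmp/input"}  # placeholder path
)

lines = rdd.map(lambda pair: pair[1])  # drop the LongWritable byte offsets
print(lines.take(5))

sc.stop()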

Example 1: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
def main():
    conf = SparkConf().setAppName("pyspark test")
    sc = SparkContext(conf=conf)

    # Create an RDD backed by the MongoDB collection.
    # MongoInputFormat allows us to read from a live MongoDB instance.
    # We could also use BSONFileInputFormat to read BSON snapshots.
    rdd = sc.newAPIHadoopRDD(
        inputFormatClass='com.mongodb.hadoop.MongoInputFormat',
        keyClass='org.apache.hadoop.io.Text',
        valueClass='org.apache.hadoop.io.MapWritable',
        conf={
            'mongo.input.uri': 'mongodb://localhost:27017/db.collection'
        }
    )

    # Save this RDD as a Hadoop "file".
    # The path argument is unused; all documents will go to "mongo.output.uri".
    rdd.saveAsNewAPIHadoopFile(
        path='file:///this-is-unused',
        outputFormatClass='com.mongodb.hadoop.MongoOutputFormat',
        keyClass='org.apache.hadoop.io.Text',
        valueClass='org.apache.hadoop.io.MapWritable',
        conf={
            'mongo.output.uri': 'mongodb://localhost:27017/output.collection'
        }
    )

    # We can also save this back to a BSON file.
    rdd.saveAsNewAPIHadoopFile(
        path='hdfs://localhost:8020/user/spark/bson-demo',
        outputFormatClass='com.mongodb.hadoop.BSONFileOutputFormat',
        keyClass='org.apache.hadoop.io.Text',
        valueClass='org.apache.hadoop.io.MapWritable'
    )
Developer ID: wingzero321, Project: py_code, Lines of code: 37, Source file: data_mongo-hadhoop.py

Example 2: index

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
def index():
    input_fmt_cls_name = 'com.mongodb.hadoop.MongoInputFormat'
    output_fmt_cls_name = 'com.mongodb.spark.PySparkMongoOutputFormat'
    val_cls_name = key_cls_name = 'com.mongodb.hadoop.io.BSONWritable'
    val_converter = key_converter = 'com.mongodb.spark.pickle.NoopConverter'

    config = load_config()
    host, port = config.get('mongo', 'host'), config.get('mongo', 'port')
    dbname = config.get('mongo', 'dbname')
    dbpath_in = 'mongodb://{}:{}/{}.documents'.format(host, port, dbname)
    dbpath_out = 'mongodb://{}:{}/{}.indexes_raw'.format(host, port, dbname)

    sc = SparkContext('local', 'pyspark')
    doc_rdd_raw = sc.newAPIHadoopRDD(input_fmt_cls_name, key_cls_name,
                                     val_cls_name, None, None,
                                     {'mongo.input.uri': dbpath_in})
    doc_rdd = doc_rdd_raw.values()

    result = doc_rdd.flatMap(index_document)#.reduceByKey(join_hits)
    #result.coalesce(1, True).saveAsTextFile('results')
    result.saveAsNewAPIHadoopFile(
        'file:///placeholder',
        outputFormatClass=output_fmt_cls_name,
        keyClass=key_cls_name,
        valueClass=val_cls_name,
        keyConverter=key_converter,
        valueConverter=val_converter,
        conf={'mongo.output.uri': dbpath_out})
Developer ID: tondzus, Project: python-searcher, Lines of code: 30, Source file: spark-indexer.py

Example 3: get_hbase_as_rdd

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
def get_hbase_as_rdd(host,tablename):

	sc = SparkContext(appName="hbase2rdd")
	conf = {"hbase.zookeeper.quorum": host, "hbase.mapreduce.inputtable": tablename}
	print "Connecting to host: " + conf["hbase.zookeeper.quorum"] + " table: " + conf["hbase.mapreduce.inputtable"]
	keyConv = "org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter"
	valueConv = "org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter"
	hbase_rdd = sc.newAPIHadoopRDD("org.apache.hadoop.hbase.mapreduce.TableInputFormat","org.apache.hadoop.hbase.io.ImmutableBytesWritable","org.apache.hadoop.hbase.client.Result",keyConverter=keyConv,valueConverter=valueConv,conf=conf)
	return hbase_rdd
Developer ID: usc-isi-i2, Project: dig-prep, Lines of code: 11, Source file: pyspark-hbase.py

Example 4: SparkConf

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
#pattern = r"((([01]?[0-9]?[0-9]|2[0-4][0-9]|25[0-5])[ (\[]?(\.|dot)[ )\]]?){3}([01]?[0-9]?[0-9]|2[0-4][0-9]|25[0-5]))"
pattern = r"((?:[0-9]{1,3}\.){3}[0-9]{1,3})"
if __name__ == "__main__":
    #string confstring = { 'es.nodes' : 'elasticsearch', 'es.port' : '9200', 'es.resource' : 'graylog2_0/message', 'es.query' : '""', "fields" : [ "message" ] } }}' }
    conf = SparkConf().setAppName("ESTest")
    sc = SparkContext(conf=conf)
    es_read_conf = {
        'es.nodes' : 'elasticsearch',
        'es.port' : '9200',
        'es.resource' : 'graylog2_*/message',
        'es.query' : '{"query": { "multi_match" : { "query" : ' ', "fields" : [ "message" ] } }}'
      } 
    es_read_conf['es.query'] = custom_query 
    es_rdd = sc.newAPIHadoopRDD(
        inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat",
        keyClass="org.apache.hadoop.io.NullWritable", 
        valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable", 
        conf=es_read_conf)
    es_write_conf = {
        'es.nodes' : 'elasticsearch',
        'es.port' : '9200',
        'es.resource' : 'spark_analytics/severe_analytics'
    }
    es_write_conf_ip = {
        'es.nodes' : 'elasticsearch',
        'es.port' : '9200',
        'es.resource' : 'spark_analytics/analytics'
    }
    doc = es_rdd.first()[1]
#    print es_rdd.collect()
#    exit()
Developer ID: mayurisapre, Project: CMPE272-CloudComplianceDeviation, Lines of code: 33, Source file: submit7.py

Example 5: SparkConf

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
from pyspark import SparkContext, SparkConf
import sys
#spark-submit /vagrant/spark.py <tablename> <tmp-dir>
if __name__ == "__main__":
		
	conf = SparkConf().setAppName('tweets spark aggregation')
	sc = SparkContext(conf=conf)
	print 'reading %s'%sys.argv[1]
	conf = {"hbase.zookeeper.quorum": "localhost", "hbase.mapreduce.inputtable": sys.argv[1]}
	keyConv = "org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter"
	valueConv = "org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter"
	hbase_rdd = sc.newAPIHadoopRDD("org.apache.hadoop.hbase.mapreduce.TableInputFormat","org.apache.hadoop.hbase.io.ImmutableBytesWritable","org.apache.hadoop.hbase.client.Result",keyConverter=keyConv, valueConverter=valueConv, conf=conf)
	#following line doesn't work...probably due to https://issues.apache.org/jira/browse/SPARK-5361
	#hbase_rdd.saveAsHadoopFile("/usr/out.txt", outputFormatClass="org.apache.hadoop.mapred.TextOutputFormat", keyClass="org.apache.hadoop.io.Text", valueClass="org.apache.hadoop.io.Text")
	print 'writing to %s'%sys.argv[2]
	hbase_rdd.filter(lambda (x,y): len(y)>0).map(lambda (_,y):y).saveAsTextFile(sys.argv[2])
Developer ID: Mattiaz88, Project: hadoop-hbase-spark-playground, Lines of code: 18, Source file: spark.py

Example 6: quiet_logs

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
def quiet_logs( sc ):
  logger = sc._jvm.org.apache.log4j
  logger.LogManager.getLogger("org"). setLevel( logger.Level.ERROR )
  logger.LogManager.getLogger("akka").setLevel( logger.Level.ERROR )

sc = SparkContext("local", "Course Stats")
quiet_logs(sc)

config = {"mongo.input.uri": "mongodb://192.168.99.100:27017/wsl.learning_units"}
inputFormatClassName = "com.mongodb.hadoop.MongoInputFormat"
# these values worked but others might as well
keyClassName = "org.apache.hadoop.io.Text"
valueClassName = "org.apache.hadoop.io.MapWritable"

statsRawRDD = sc.newAPIHadoopRDD(inputFormatClassName, keyClassName, valueClassName, None, None, config)
#valuesRDD = statsRawRDD.values();
#resultRDD = valuesRDD.map(lambda doc: str(doc["type"]))

# configuration for output to MongoDB
config["mongo.output.uri"] = "mongodb://192.168.99.100:27017/wsl.stats"
outputFormatClassName = "com.mongodb.hadoop.MongoOutputFormat"

num = statsRawRDD.count()

# doesn't yet work - certain Java types like HashMap, String, GregorianCalendar
# don't serialize; version 1.5 of the hadoop mongo connector may fix this
#statsRawRDD.saveAsNewAPIHadoopFile("file:///placeholder", outputFormatClassName, None, None, None, None, config)

print "Count: ", num
Developer ID: mpdevel, Project: wsl_data_proc, Lines of code: 31, Source file: spark_course_stats.py

Example 7: SparkConf

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
conf = SparkConf()

conf.setAppName("spark_app_wordcount_merge")

sc = SparkContext(conf=conf)

# "mapreduce.input.fileinputformat.split.minsize.per.node": "67108864"
# "mapreduce.input.fileinputformat.split.minsize.per.rack": "134217728"
hadoopConf = {"mapreduce.input.fileinputformat.inputdir": "/user/hdfs/rawlog/app_weibomobilekafka1234_topweiboaction/",
              "mapreduce.input.fileinputformat.input.dir.recursive": "true"}

# TextInputFormat + coalesce
# CombineTextInputFormat
source = sc.newAPIHadoopRDD(inputFormatClass="org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
                            keyClass="org.apache.hadoop.io.LongWritable",
                            valueClass="org.apache.hadoop.io.Text",
                            conf=hadoopConf)

source = source.coalesce(5000)

lines = source.map(lambda pair: pair[1])

words = lines.flatMap(lambda line: line.split(","))

pairs = words.map(lambda word: (word[0:10], 1))

counts = pairs.reduceByKey(lambda a, b: a + b, 30)

counts.saveAsTextFile("/user/yurun/spark/output/wordcount/")

sc.stop()
Developer ID: kytle, Project: pyspark, Lines of code: 33, Source file: spark_app_wordcount_merge.py

Example 8: exit

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
        Run with example jar:
        ./bin/spark-submit --driver-class-path /path/to/example/jar /path/to/examples/cassandra_inputformat.py <host> <keyspace> <cf>
        Assumes you have some data in Cassandra already, running on <host>, in <keyspace> and <cf>
        """
        exit(-1)

    host = sys.argv[1]
    keyspace = sys.argv[2]
    cf = sys.argv[3]
    sc = SparkContext(appName="CassandraInputFormat")

    conf = {"cassandra.input.thrift.address":host,
            "cassandra.input.thrift.port":"9160",
            "cassandra.input.keyspace":keyspace,
            "cassandra.input.columnfamily":cf,
            "cassandra.input.partitioner.class":"Murmur3Partitioner",
            "cassandra.input.page.row.size":"3"}
    cass_rdd = sc.newAPIHadoopRDD(
        "org.apache.cassandra.hadoop.cql3.CqlPagingInputFormat",
        "java.util.Map",
        "java.util.Map",
        keyConverter="org.apache.spark.examples.pythonconverters.CassandraCQLKeyConverter",
        valueConverter="org.apache.spark.examples.pythonconverters.CassandraCQLValueConverter",
        conf=conf)
    output = cass_rdd.collect()
    for (k, v) in output:
        print (k, v)

    sc.stop()
Developer ID: BigCrunsh, Project: spark, Lines of code: 31, Source file: cassandra_inputformat.py

Example 9: chunks

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
# credit http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks-in-python
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i+n]

# set up context
sc = SparkContext("local[*]", "Simple App")
#sc = SparkContext("spark://url:7077", "Simple App")
sqlContext = SQLContext(sc)
sqlContext.setConf("spark.sql.shuffle.partitions", "5")

# issue movies query
conf = {"es.resource" : "movies2/logs", "es.query" : "?q=name:picture"}
movies = sc.newAPIHadoopRDD("org.elasticsearch.hadoop.mr.EsInputFormat",\
    "org.apache.hadoop.io.NullWritable", "org.elasticsearch.hadoop.mr.LinkedMapWritable", conf=conf)

# place results in table
moviesRows = movies.map(lambda p: Row(id=int(p[1]['id']), name=p[1]['name']))
moviesRowsList = moviesRows.collect()
schemaMovies = sqlContext.createDataFrame(moviesRowsList)
schemaMovies.registerTempTable("movies")
sqlContext.cacheTable("movies")

# get ids in order to form acted_in query
ids = []
for moviesRow in moviesRowsList:
    ids.append(moviesRow['id'])
movieIdSnippets = []
for id in ids:
    movieIdSnippets.append("movie_id:" + str(id))
Developer ID: ryancutter, Project: bigdata, Lines of code: 33, Source file: sql_script.py

Example 10: clean_logical

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
          }
    }
else: 
    print 'must choose one option [--from-scratch; --last-week]'


# --
# Connecting to ES

resource_string = 'forms/3,4'
rdd = sc.newAPIHadoopRDD(
    inputFormatClass = "org.elasticsearch.hadoop.mr.EsInputFormat",
    keyClass         = "org.apache.hadoop.io.NullWritable",
    valueClass       = "org.elasticsearch.hadoop.mr.LinkedMapWritable",
    conf             = {
        "es.nodes"    : config['elasticsearch']['host'],
        "es.port"     : str(config['elasticseach']['port']),
        "es.resource" : resource_string,
        "es.query"    : json.dumps(query)
   }
)


# --
# Function definition
def clean_logical(x):
    if str(x).lower() == 'true':
        return 1
    if str(x).lower() == 'false':
        return 0
    else: 
Developer ID: gophronesis, Project: edward-os, Lines of code: 33, Source file: forms_3_4_map_ownership_changes.py

Example 11: len

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
    es.update(index='spark-jobs', doc_type='job', id=task_id, body={
        'doc': { 
            'current': 1,
            'status': 'Spark job started..' 
        }
    })

    result_indices = len(es.indices.get_aliases(index="titanic-results-*"))
    output_resource = "titanic-results-%s/value-counts" % (result_indices + 1)

    conf = SparkConf().setAppName("ESTest")
    sc = SparkContext(conf=conf)
    
    es_rdd = sc.newAPIHadoopRDD(
        inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat",
        keyClass="org.apache.hadoop.io.NullWritable", 
        valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable", 
        conf={ "es.resource" : "titanic/passenger" })

    doc = es_rdd.first()[1]

    num_fields = len(doc)

    for idx, field in enumerate(doc):

        es.update(index='spark-jobs', doc_type='job', id=task_id, body={
            'doc': { 
                'current': (idx+1) * 95 / num_fields,
                'status': 'Spark job underway..' 
            }
        })
Developer ID: Gwill, Project: qbox-blog-code, Lines of code: 33, Source file: es_spark_test.py

Example 12: inverse_ref

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
    
def inverse_ref(url_words):
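    # Split the page content into words, stem each one, and yield a {stemmed_word: url} dict for every word that passes is_word_ok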
    url = url_words[0]
    words = re.split("[^a-z]+",url_words[1].lower())
    for word in words:
        stemmed_word = stem(word.decode('utf-8'))
        if (is_word_ok(stemmed_word)):
            yield {stemmed_word:url}


hbaseConfig={"hbase.mapreduce.inputtable":"wiki","hbase.mapreduce.scan.columns":"cf:content"}

table=sc.newAPIHadoopRDD(
    'org.apache.hadoop.hbase.mapreduce.TableInputFormat',
    'org.apache.hadoop.hbase.io.ImmutableBytesWritable',
    'org.apache.hadoop.hbase.client.Result',
    keyConverter="org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter",
    valueConverter="org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter",
    conf=hbaseConfig)

words_occ=table.map(lambda l:inverse_ref(l)).filter(lambda l:l).map(lambda l:(l,1))





words=words_occ.map(lambda l:(l,1)).reduceByKey(lambda a,b:a+b).filter(lambda (a,b):b>100)
word_counts=dict(words.collectAsMap())


for word in word_counts:
Developer ID: Banaei, Project: ces-ds, Lines of code: 33, Source file: q_06.py

Example 13: SparkContext

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]

#sc = SparkContext("spark://quickstart.cloudera:7077", "Test MongoDB Connector")
sc = SparkContext("local", "Test MongoDB Connector")


# Config MongoDB
inputConfig = { "mongo.input.uri" : "mongodb://localhost:27017/marketdata.stock_prices" }
outputConfig = { "mongo.output.uri" : "mongodb://localhost:27017/marketdata.maxminprices" }

# Config for the RDD that will read the data from MongoDB
inputFormatClassName = "com.mongodb.hadoop.MongoInputFormat"
keyClassName = "java.lang.Object"
valueClassName = "org.bson.BSONObject"

stockPricesRDD = sc.newAPIHadoopRDD(inputFormatClassName, keyClassName, valueClassName, None, None, inputConfig)

# Config for the RDD that will write to MongoDB
outputFormatClassName = "com.mongodb.hadoop.MongoOutputFormat"

# The processing steps...
# ... over the whole data set
prices = stockPricesRDD.values()

# ... group by (symbol, day)
groupByRDD = prices.groupBy(lambda doc: (doc["Symbol"], doc["Day"]))
#                  .map(lambda tuple: (tuple[0], tuple[1])) \
#                  .collect()

# ... aggregate by key (take the max of High and the min of Low)
def maxMin(doc, groupedDocs):
Developer ID: bbonnin, Project: MUG-Nantes-Demo-Hadoop, Lines of code: 32, Source file: connector-demo.py

Example 14: run_driver

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
def run_driver(keyspace):
    sc = SparkContext(appName="PySpark Cassandra Hadoop Example")

    # Reading from Cassandra
    conf = {
        "cassandra.input.thrift.address": "localhost",
        "cassandra.input.thrift.port": "9160",
        "cassandra.input.keyspace": keyspace,
        "cassandra.input.columnfamily": "users",
        "cassandra.input.partitioner.class":"Murmur3Partitioner",
        "cassandra.input.page.row.size": "5000"
    }
    cass_rdd = sc.newAPIHadoopRDD(
        # inputFormatClass
        "org.apache.cassandra.hadoop.cql3.CqlInputFormat",
        # keyClass
        "java.util.Map",
        # valueClass
        "java.util.Map",
        keyConverter=INPUT_KEY_CONVERTER,
        valueConverter=INPUT_VALUE_CONVERTER,
        conf=conf)
    print cass_rdd.collect()

    # Writing to Cassandra
    now = dt.datetime.now()
    users = (
        {
            "id": "keith",
            "created_at": now,
            "updated_at": now,
            "first_name": "Keith",
            "last_name": "Bourgoin",
            "emails": set(["[email protected]"]),
            "logins": [now.isoformat()],
            "settings": {
                "background_color": "blue",
            },
        },
        {
            "id": "toms",
            "created_at": now,
            "updated_at": now,
            "first_name": "Toms",
            "last_name": "Baugis",
            "emails": set(["[email protected]"]),
            "logins": [now.isoformat()],
            "settings": {
                "background_color": "green",
            },
        },
    )

    cql = """
        UPDATE users
        SET created_at=?, updated_at=?, first_name=?, last_name=?, emails=?,
            logins=?, settings=?
    """.strip()
    conf = {
        "cassandra.output.thrift.address": "localhost",
        "cassandra.output.thrift.port": "9160",
        "cassandra.output.keyspace": keyspace,
        "cassandra.output.partitioner.class": "Murmur3Partitioner",
        "cassandra.output.cql": cql,
        "mapreduce.output.basename": "users",
        "mapreduce.outputformat.class": "org.apache.cassandra.hadoop.cql3.CqlOutputFormat",
        "mapreduce.job.output.key.class": "java.util.Map",
        "mapreduce.job.output.value.class": "java.util.List"
    }
    users = sc.parallelize(users)
    users.map(to_cql_output_format)\
         .saveAsNewAPIHadoopDataset(conf=conf,
                                    keyConverter=OUTPUT_KEY_CONVERTER,
                                    valueConverter=OUTPUT_VALUE_CONVERTER)

    sc.stop()
Developer ID: NunoEdgarGub1, Project: pyspark-cassandra, Lines of code: 78, Source file: pyspark_cassandra_hadoop_example.py

Example 15: row

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopRDD [as alias]
 row2                         column=f1:, timestamp=1401883415212, value=value2
 row3                         column=f1:, timestamp=1401883417858, value=value3
 row4                         column=f1:, timestamp=1401883420805, value=value4
4 row(s) in 0.0240 seconds
"""
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print >> sys.stderr, """
        Usage: hbase_inputformat <host> <table>

        Run with example jar:
        ./bin/spark-submit --driver-class-path /path/to/example/jar /path/to/examples/hbase_inputformat.py <host> <table>
        Assumes you have some data in HBase already, running on <host>, in <table>
        """
        exit(-1)

    host = sys.argv[1]
    table = sys.argv[2]
    sc = SparkContext(appName="HBaseInputFormat")

    conf = {"hbase.zookeeper.quorum": host, "hbase.mapreduce.inputtable": table}
    hbase_rdd = sc.newAPIHadoopRDD(
        "org.apache.hadoop.hbase.mapreduce.TableInputFormat",
        "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
        "org.apache.hadoop.hbase.client.Result",
        valueConverter="org.apache.spark.examples.pythonconverters.HBaseConverter",
        conf=conf)
    output = hbase_rdd.collect()
    for (k, v) in output:
        print (k, v)
Developer ID: 13111186, Project: spark, Lines of code: 32, Source file: hbase_inputformat.py


Note: The pyspark.SparkContext.newAPIHadoopRDD method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by many developers; copyright of the source code remains with the original authors. For distribution and use, please refer to the License of the corresponding project. Do not reproduce without permission.