This article collects typical code examples of the Python method pyspark.SparkContext.newAPIHadoopFile. If you have been wondering how exactly SparkContext.newAPIHadoopFile is used in Python, the curated method examples below may help. You can also explore further usage examples of its containing class, pyspark.SparkContext.
Below, 13 code examples of SparkContext.newAPIHadoopFile are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python examples.
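Before the examples, here is a minimal self-contained sketch of the call itself; the HDFS path is a placeholder, and the classes shown all reappear in the examples below:
from pyspark import SparkContext

sc = SparkContext(appName="newAPIHadoopFileDemo")
# Read a plain-text file through the new Hadoop API (mapreduce) TextInputFormat.
# Keys are byte offsets (LongWritable), values are lines (Text); keep only the values.
rdd = sc.newAPIHadoopFile(
    "hdfs:///path/to/input",                                  # placeholder path
    "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",  # InputFormat class
    "org.apache.hadoop.io.LongWritable",                      # key class
    "org.apache.hadoop.io.Text",                              # value class
    conf={"textinputformat.record.delimiter": "\n"})          # optional Hadoop conf
lines = rdd.map(lambda kv: kv[1])
print(lines.take(3))
sc.stop()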
Example 1: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopFile [as alias]
# (This example additionally uses: import json; from datetime import datetime)
def main(hdfs_uri):
"""Divolte Spark Example.
This example processes published Divolte log files at a given location.
It displays:
1. The total number of events in the log files.
2. An arbitrary event.
3. The ID of the session with the most events, along with the first 10
events in that session.
This is equivalent to the Scala example.
"""
sc = SparkContext()
# Hadoop files are always read as an RDD of key/value pairs. Avro files contain only keys, however,
# so we immediately map out the values.
events_rdd = sc.newAPIHadoopFile(
hdfs_uri,
'org.apache.avro.mapreduce.AvroKeyInputFormat',
'org.apache.avro.mapred.AvroKey',
'org.apache.hadoop.io.NullWritable',
keyConverter='io.divolte.spark.pyspark.avro.AvroWrapperToJavaConverter').map(lambda (k,v): k)
# We are going to process the RDD several times, so cache the original
# set in cluster memory so it doesn't have to be loaded each time.
events_rdd.cache()
# Calculate the total number of events.
total_event_count = events_rdd.count()
# Get the first event in our dataset (which isn't ordered yet).
an_event = events_rdd.take(1)
# Find the session with the most events.
(longest_session_id, longest_session_count) = events_rdd \
.map(lambda event: (event['sessionId'], 1)) \
.reduceByKey(lambda x,y: x + y) \
.reduce(lambda x,y: max(x, y, key=lambda (e, c): c))
# For the session with the most events, find the first 10 events.
first_events = events_rdd \
.filter(lambda event: event['sessionId'] == longest_session_id) \
.map(lambda event: (event['location'], event['timestamp'])) \
.takeOrdered(10, lambda event: event[1])
# Simple function for rendering timestamps.
def timestamp_to_string(ts):
return datetime.fromtimestamp(ts / 1000.0).strftime('%Y-%m-%d %H:%M:%S')
# Print the results we accumulated, with some whitespace at the
# front to separate this from the logging.
print "\n\n"
print "Number of events in data: %d" % total_event_count
print "An event:\n%s" % json.dumps(an_event, indent=2)
print "Session with id '%s' has the most events: %d" % (longest_session_id, longest_session_count)
print "First 10 events:"
print "\n".join([" %s: %s" % (timestamp_to_string(ts), location) for (location, ts) in first_events])
Example 2: len
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopFile [as alias]
if __name__ == "__main__":
if len(sys.argv) != 2:
print(
"""
Usage: parquet_inputformat.py <data_file>
Run with example jar:
./bin/spark-submit --driver-class-path /path/to/example/jar \\
/path/to/examples/parquet_inputformat.py <data_file>
Assumes you have Parquet data stored in <data_file>.
""",
file=sys.stderr,
)
exit(-1)
path = sys.argv[1]
sc = SparkContext(appName="ParquetInputFormat")
parquet_rdd = sc.newAPIHadoopFile(
path,
"parquet.avro.AvroParquetInputFormat",
"java.lang.Void",
"org.apache.avro.generic.IndexedRecord",
valueConverter="org.apache.spark.examples.pythonconverters.IndexedRecordToJavaConverter",
)
output = parquet_rdd.map(lambda x: x[1]).collect()
for k in output:
print(k)
sc.stop()
Example 3: order
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopFile [as alias]
# * List the 5 most common elements for each order (word, bigram, trigram...). For each element, list the sequence of words and the number of occurrences.
#
# Basically, you need to replace all punctuation with spaces and define as a word anything that sits between whitespace, or at the beginning or end of a sentence, and does not itself consist of whitespace (strings consisting only of whitespace should not be considered words). The important thing here is to be simple, not to be 100% correct in terms of parsing English. Evaluation will be based primarily on identifying the 5 most frequent n-grams, in the correct order, for all values of n. Some slack is allowed in the n-gram frequency values to permit flexibility in text processing.
#
# This text is short enough to process on a single core using standard Python. However, you are required to solve it using RDDs for the whole process. At the very end you can use `.take(5)` to bring the results to the central node for printing.
# The code for reading the file and splitting it into sentences is shown below:
# In[1]:
#path = '../Data/Moby-Dick.txt'
path = '/data/Moby-Dick.txt'
textRDD = sc.newAPIHadoopFile(path,
'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
'org.apache.hadoop.io.LongWritable',
'org.apache.hadoop.io.Text',
conf={'textinputformat.record.delimiter': "\r\n\r\n"}) \
.map(lambda x: x[1])
sentences=textRDD.flatMap(lambda x: x.split(". "))
# Note: For running the file on cluster, change the file path to `'/data/Moby-Dick.txt'`
# Let `freq_ngramRDD` be the final result RDD containing the n-grams sorted by their frequency in descending order. Use the following function to print your final output:
# In[2]:
def printOutput(n, freq_ngramRDD):
    top = freq_ngramRDD.take(5)
    print '\n============ %d most frequent %d-grams' % (5, n)
    # Assumed completion: each element is a (count, ngram) pair.
    for freq, ngram in top:
        print '%d:\t%s' % (freq, ' '.join(ngram))
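A minimal sketch of how `freq_ngramRDD` could be built for one value of `n`, using the `sentences` RDD defined above; the tokenization and the (count, ngram) element format are simplifying assumptions, not the graded solution:
import re

def to_words(sentence):
    # Replace punctuation with spaces and split on whitespace (simplified assumption).
    return [w for w in re.sub(r'[^a-z0-9\s]', ' ', sentence.lower()).split() if w]

n = 3  # e.g. trigrams
ngrams = sentences.map(to_words) \
    .flatMap(lambda words: [tuple(words[i:i + n]) for i in range(len(words) - n + 1)])
freq_ngramRDD = ngrams.map(lambda g: (g, 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .map(lambda kv: (kv[1], kv[0])) \
    .sortByKey(False)
printOutput(n, freq_ngramRDD)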
Example 4: convBytesToObjectPickle
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopFile [as alias]
avroRdd.map(lambda x: (x, None)).saveAsNewAPIHadoopFile(
fileAvroOut,
"org.apache.avro.mapreduce.AvroKeyOutputFormat",
"org.apache.avro.mapred.AvroKey",
"org.apache.hadoop.io.NullWritable",
keyConverter="irt.pythonconverters.Scheme1ToAvroKeyConverter",
conf=conf)
# ------------------------------------
# -- read data from avro --
# ------------------------------------
avroRdd2 = sc.newAPIHadoopFile(
fileAvroOut,
"org.apache.avro.mapreduce.AvroKeyInputFormat",
"org.apache.avro.mapred.AvroKey",
"org.apache.hadoop.io.NullWritable",
keyConverter="irt.pythonconverters.AvroWrapperToJavaConverter",
conf=conf)
crudeData = avroRdd2.collect()
output = crudeData[0][0]
for k in ['raw1', 'raw2', 'raw3']:
output[k] = convBytesToObjectPickle(output[k])
print 80 * '#'
print "input Record"
print 80 * '#'
pprint(record)
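The helper convBytesToObjectPickle is not shown in this excerpt; a plausible minimal implementation, assuming the 'raw1'..'raw3' Avro bytes fields hold Python objects pickled into byte strings, might be:
import pickle

def convBytesToObjectPickle(raw_bytes):
    # Hypothetical helper (not from the source): unpickle a byte string that was
    # written into an Avro 'bytes' field back into the original Python object.
    return pickle.loads(raw_bytes)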
Example 5: exit
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopFile [as alias]
Run with example jar:
./bin/spark-submit --driver-class-path /path/to/example/jar \
/path/to/examples/avro_inputformat.py <data_file> [reader_schema_file]
Assumes you have Avro data stored in <data_file>. Reader schema can be optionally specified
in [reader_schema_file].
""", file=sys.stderr)
exit(-1)
path = sys.argv[1]
sc = SparkContext(appName="AvroKeyInputFormat")
conf = None
if len(sys.argv) == 3:
schema_rdd = sc.textFile(sys.argv[2], 1).collect()
conf = {"avro.schema.input.key": reduce(lambda x, y: x + y, schema_rdd)}
avro_rdd = sc.newAPIHadoopFile(
path,
"org.apache.avro.mapreduce.AvroKeyInputFormat",
"org.apache.avro.mapred.AvroKey",
"org.apache.hadoop.io.NullWritable",
keyConverter="org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter",
conf=conf)
output = avro_rdd.map(lambda x: x[0]).collect()
for k in output:
print(k)
sc.stop()
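The optional [reader_schema_file] is simply an Avro schema in JSON; its concatenated lines end up in the avro.schema.input.key entry of the Hadoop conf, as the code above shows. A hypothetical reader schema, inlined here for illustration:
# Hypothetical schema; in the example it would live in [reader_schema_file].
reader_schema = '''{"namespace": "example.avro", "type": "record", "name": "User",
 "fields": [{"name": "name", "type": "string"},
            {"name": "favorite_color", "type": ["string", "null"]}]}'''
conf = {"avro.schema.input.key": reader_schema}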
Example 6: run
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopFile [as alias]
def run(schema_file, data_path, script=None, spec_file=None, verbose=None, yarn=None):
"""
Main function to run pyspark job. It requires a schema file, an HDFS directory
with data and optional script with mapper/reducer functions.
"""
time0 = time.time()
# pyspark modules
from pyspark import SparkContext
# define the Spark context; it is the main object that allows us
# to communicate with Spark
ctx = SparkContext(appName="AvroKeyInputFormat", pyFiles=[script])
logger = SparkLogger(ctx)
if not verbose:
logger.set_level('ERROR')
if yarn:
logger.info("YARN client mode enabled")
# load FWJR schema
rdd = ctx.textFile(schema_file, 1).collect()
# define input avro schema, the rdd is a list of lines (sc.textFile similar to readlines)
avsc = reduce(lambda x, y: x + y, rdd) # merge all entries from rdd list
schema = ''.join(avsc.split()) # remove spaces in avsc map
conf = {"avro.schema.input.key": schema}
# define newAPIHadoopFile parameters, java classes
aformat="org.apache.avro.mapreduce.AvroKeyInputFormat"
akey="org.apache.avro.mapred.AvroKey"
awrite="org.apache.hadoop.io.NullWritable"
aconv="org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter"
# load data from HDFS
if isinstance(data_path, list):
avro_rdd = ctx.union([ctx.newAPIHadoopFile(f, aformat, akey, awrite, aconv, conf=conf) for f in data_path])
else:
avro_rdd = ctx.newAPIHadoopFile(data_path, aformat, akey, awrite, aconv, conf=conf)
# process data, here the map will read record from avro file
# if we need a whole record we'll use lambda x: x[0], e.g.
# output = avro_rdd.map(lambda x: x[0]).collect()
#
# if we need a particular key, e.g. jobid, we'll extract it
# within lambda function, e.g. lambda x: x[0]['jobid'], e.g.
# output = avro_rdd.map(lambda x: x[0]['jobid']).collect()
#
# in more general way we write mapper/reducer functions which will be
# executed by Spark via collect call
spec = None
if spec_file:
if os.path.isfile(spec_file):
spec = json.load(open(spec_file))
else:
spec = json.loads(spec_file)
if script:
obj = import_(script)
logger.info("Use user-based script %s" % obj)
if not hasattr(obj, 'MapReduce'):
logger.error('Unable to find MapReduce class in %s, %s' \
% (script, obj))
ctx.stop()
return
mro = obj.MapReduce(spec)
# example of collecting records from mapper and
# passing all of them to reducer function
records = avro_rdd.map(mro.mapper).collect()
out = mro.reducer(records)
# the map(f).reduce(f) example but it does not collect
# intermediate records
# out = avro_rdd.map(obj.mapper).reduce(obj.reducer).collect()
else:
records = avro_rdd.map(basic_mapper).collect()
out = basic_reducer(records)
ctx.stop()
if verbose:
logger.info("Elapsed time %s" % htime(time.time()-time0))
return out
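The user script passed via `script` is expected to define a MapReduce class exposing mapper and reducer methods, which run() instantiates with the spec. A minimal sketch of such a script follows; the field names and counting logic are illustrative assumptions:
class MapReduce(object):
    """Minimal user script consumed by run() above (illustrative sketch)."""
    def __init__(self, spec=None):
        self.spec = spec or {}

    def mapper(self, record):
        # record is a (converted-AvroKey dict, None) pair from newAPIHadoopFile;
        # pull out one field, defaulting to 'jobid' as mentioned in the comments above.
        rec = record[0]
        return rec.get(self.spec.get('key', 'jobid'))

    def reducer(self, records):
        # Count how often each extracted value occurs in the collected records.
        counts = {}
        for value in records:
            counts[value] = counts.get(value, 0) + 1
        return counts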
Example 7: SparkConf
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopFile [as alias]
from pyspark import SparkConf, SparkContext
conf = SparkConf()
conf.setAppName("spark_app_wordcount_extend")
sc = SparkContext(conf=conf)
pairs = sc.newAPIHadoopFile(
"/user/yurun/spark/textfile/",
"org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat",
"org.apache.hadoop.io.LongWritable",
"org.apache.hadoop.io.Text")
words = pairs.map(lambda pair: pair[1]).flatMap(lambda line: line.split("\t"))
pairs = words.map(lambda word: (word, 1))
counts = pairs.reduceByKey(lambda a, b: a + b)
results = counts.collect()
for result in results:
print result
sc.stop()
Example 8: len
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopFile [as alias]
if __name__ == "__main__":
if len(sys.argv) != 3:
print >>sys.stderr, "Usage: terasort <HDFS_INPUT> <HDFS_OUTPUT>"
exit(-1)
sc = SparkContext(appName="PythonTeraSort")
reducer = int(IOCommon.getProperty("hibench.default.shuffle.parallelism"))
version_api = IOCommon.getProperty("hibench.hadoop.version")
# load
if version_api == "hadoop1":
lines = sc.textFile(sys.argv[1], 1).map(lambda x: (x[:10], x[10:]))
elif version_api == "hadoop2":
lines = sc.newAPIHadoopFile(
sys.argv[1],
"org.apache.hadoop.examples.terasort.TeraInputFormat",
"org.apache.hadoop.io.Text",
"org.apache.hadoop.io.Text",
)
# sort
sortedCount = lines.sortByKey(lambda x: x, numPartitions=reducer)
# save
if version_api == "hadoop1":
lines = sortedCount.map(lambda x: x[0] + x[1])
lines.saveAsTextFile(sys.argv[2])
elif version_api == "hadoop2":
sortedCount.saveAsNewAPIHadoopFile(
sys.argv[2],
"org.apache.hadoop.examples.terasort.TeraOutputFormat",
"org.apache.hadoop.io.Text",
Example 9: SparkConf
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopFile [as alias]
# Initialize the Spark configuration
conf = SparkConf()
conf.setAppName("Simple App")
conf.set("spark.executor.memory","1g")
sc = SparkContext(conf = conf)
logFile = "./log.log" # Should be some file on your system
logData = sc.textFile(logFile).cache()
#########################################################
""" TOP 100 """
kFileNum = 200
total_words = None
for file_id in range(1,kFileNum):
YOUR_FILE = "wet_data/CC-MAIN-20150728002301-%05d-ip-10-236-191-2.ec2.internal.warc.wet"%file_id
YOUR_DELIMITER = "WARC/1.0"
text_file= sc.newAPIHadoopFile(YOUR_FILE,"org.apache.hadoop.mapreduce.lib.input.TextInputFormat", "org.apache.hadoop.io.LongWritable", "org.apache.hadoop.io.Text", conf = {"textinputformat.record.delimiter":YOUR_DELIMITER}).map(lambda l:l[1])
# Open the file; the result of this statement is an RDD
def split_and_remove_no_meaning_word(line):
candidate_words = line.split()
# strip out non-alphabetic characters (punctuation, digits)
candidate_words = map(lambda word: filter(str.isalpha, str(word.encode('utf8')) ), candidate_words)
no_meanning_words = ['was','over','her','them','news','they','what','like','now','use','how','see','add','help','when','who','there','here','back','also','most','over','make','years','had','into','have','may','any','other','more','has','one','which','out','their','some','than','its','off','only','his','just','get','been','were','would','our','ago','not','the','and','with','for','your','you','the','from','are','that','all','will','this','can','but','about','warcdate','warctype','contentlength','warcrecordid','contenttype','warcblockdigest','warctargeturi','warcrefersto']
words = filter(lambda word: len(word) > 2 and word not in no_meanning_words, candidate_words)
return words
text_file1 = text_file.map(lambda line:line.lower()).flatMap(split_and_remove_no_meaning_word)
words = text_file1.map(lambda word: (word,1)).reduceByKey(lambda a,b: a+b)
if total_words == None:
total_words = words
else:
total_words = total_words.union(words)
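The loop above only unions the per-file word counts; to actually produce the most frequent words (the header says TOP 100), a final aggregation along these lines would still be needed (a sketch, not part of the original excerpt):
# Merge duplicate words across all files, then take the 100 most frequent ones.
top100 = total_words.reduceByKey(lambda a, b: a + b) \
                    .takeOrdered(100, key=lambda wc: -wc[1])
for word, count in top100:
    print word, count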
Example 10: run
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopFile [as alias]
def run(schema_file, data_path, script=None, spec_file=None, verbose=None, rout=None, yarn=None):
"""
Main function to run pyspark job. It requires a schema file, an HDFS directory
with data and optional script with mapper/reducer functions.
"""
if script:
script = get_script(script)
if verbose:
print("### schema: %s" % schema_file)
print("### path : %s" % data_path)
print("### script: %s" % script)
print("### spec : %s" % spec_file)
time0 = time.time()
# pyspark modules
from pyspark import SparkContext
# define the Spark context; it is the main object that allows us
# to communicate with Spark
ctx = SparkContext(appName="AvroKeyInputFormat", pyFiles=[script])
logger = SparkLogger(ctx)
if not verbose:
logger.set_level('ERROR')
if yarn:
logger.info("YARN client mode enabled")
# load FWJR schema
rdd = ctx.textFile(schema_file, 1).collect()
# define input avro schema, the rdd is a list of lines (sc.textFile similar to readlines)
avsc = reduce(lambda x, y: x + y, rdd) # merge all entries from rdd list
schema = ''.join(avsc.split()) # remove spaces in avsc map
conf = {"avro.schema.input.key": schema}
# define newAPIHadoopFile parameters, java classes
aformat="org.apache.avro.mapreduce.AvroKeyInputFormat"
akey="org.apache.avro.mapred.AvroKey"
awrite="org.apache.hadoop.io.NullWritable"
aconv="org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter"
# load data from HDFS
if isinstance(data_path, list):
avro_rdd = ctx.union([ctx.newAPIHadoopFile(f, aformat, akey, awrite, aconv, conf=conf) for f in data_path])
else:
avro_rdd = ctx.newAPIHadoopFile(data_path, aformat, akey, awrite, aconv, conf=conf)
# process data, here the map will read record from avro file
# if we need a whole record we'll use lambda x: x[0], e.g.
# output = avro_rdd.map(lambda x: x[0]).collect()
#
# if we need a particular key, e.g. jobid, we'll extract it
# within lambda function, e.g. lambda x: x[0]['jobid'], e.g.
# output = avro_rdd.map(lambda x: x[0]['jobid']).collect()
#
# in more general way we write mapper/reducer functions which will be
# executed by Spark via collect call
spec = None
if spec_file:
if os.path.isfile(spec_file):
spec = json.load(open(spec_file))
else:
spec = json.loads(spec_file)
if verbose:
spec['verbose'] = 1
print("### spec %s" % json.dumps(spec))
if rout:
spec['output'] = rout
if script:
obj = import_(script)
logger.info("Use user-based script %s" % obj)
if not hasattr(obj, 'MapReduce'):
logger.error('Unable to find MapReduce class in %s, %s' \
% (script, obj))
ctx.stop()
return
# we have a nested use case when one MR return WMArchive spec
# we'll loop in that case until we get non-spec output
count = 0
while True:
mro = obj.MapReduce(spec)
mname = mro.__dict__.get('name', '').split('.')[0]
print("### Load %s" % mname)
if mname.lower().endswith('counter'):
out = avro_rdd.filter(mro.mapper).count()
if rout:
with open(rout, 'w') as ostream:
ostream.write(out)
break
# example of collecting records from mapper and
# passing all of them to reducer function
records = avro_rdd.filter(mro.mapper).collect()
out = mro.reducer(records)
if verbose:
print("### Loop count %s" % count)
if count > 3:
print("### WARNING, loop counter exceed its limit")
break
if is_spec(out):
spec = out
else:
break
#......... part of the code is omitted here .........
Example 11: SparkContext
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopFile [as alias]
from operator import add
import sys
from pyspark import SparkContext
#Giving a Name and using the local Spark Master
sc = SparkContext(appName="LZO Wordcount")
if __name__ == "__main__":
if len(sys.argv) != 2:
print >> sys.stderr, """
Usage: wordcount_lzo_file.py <data_file>
Run with example jar:
spark-submit --driver-class-path /home/spark/lib/hadoop-lzo.jar /path/to/examples/wordcount_lzo_file.py <data_file>
"""
exit(-1)
path = sys.argv[1]
print path
conf = None
# Read a file from HDFS (use an absolute path)
csv = sc.newAPIHadoopFile(path,"com.hadoop.mapreduce.LzoTextInputFormat","org.apache.hadoop.io.LongWritable","org.apache.hadoop.io.Text").count()
print csv
#for k in output:
# print k
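As written, the example only counts records in the LZO file; an actual word count with the same input format would look roughly like this (a sketch reusing path, sc and the add imported above):
# Word count over the LZO text file (sketch; same classes as above).
lines = sc.newAPIHadoopFile(path,
                            "com.hadoop.mapreduce.LzoTextInputFormat",
                            "org.apache.hadoop.io.LongWritable",
                            "org.apache.hadoop.io.Text").map(lambda kv: kv[1])
counts = lines.flatMap(lambda line: line.split()) \
              .map(lambda word: (word, 1)) \
              .reduceByKey(add)
for word, count in counts.take(20):
    print word, count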
Example 12:
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopFile [as alias]
# A blank line is used as the delimiter.
# Note that the first record has an extra line "total number:#" as its first line.
hadoop_conf = {"textinputformat.record.delimiter": "\n\n"}
is_ec2 = True
# If the submission is on EC2, you need to set access key and secret access key
if is_ec2:
hadoop_conf["fs.s3n.awsAccessKeyId"] = "AKIaYOURaOWNaKEYaJPQ"
hadoop_conf["fs.s3n.awsSecretAccessKey"] = "v5vBmazYOURaOWNaSECRETaKEYaT8yX4jXC+mGLl"
master_add = "ec2-54-213-21-124.us-west-2.compute.amazonaws.com"
# Read the file with the function newAPIHadoopFile. The RDD object has elements like this: <lineNumber, textOfTweet>.
# With the textFile function used in the Word Count example, the Hadoop conf cannot be passed in.
lines = sc.newAPIHadoopFile(filepath, "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
"org.apache.hadoop.io.LongWritable", "org.apache.hadoop.io.Text",
conf=hadoop_conf)
# Print out records to understand the structure of the RDD.
# NOTE: you cannot print an RDD object directly; take a small sample from it instead.
#print(lines.take(5))
# A tweet has text "No Post Title" is considered as a bad record
bad_msg = "W\tNo Post Title"
# In class, MapReduce was introduced in a simple form. In Spark, map and reduce have more variants: a record does not have to map to a
# key-value pair <K, V>, it can map to a single value. This function maps each record to <0> or <1>.
flag = lines.map(lambda x: 0 if -1 == string.find(x[1], bad_msg) else 1)
# print mapped keys
#print(flag.take(5))
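To finish the thought, the flags can be summed to count how many records are bad (a sketch; it assumes the lines and flag RDDs defined above):
# Sum the flags: each 1 marks a "No Post Title" (bad) record.
bad_count = flag.reduce(lambda a, b: a + b)
total_count = lines.count()
print("bad records: %d of %d" % (bad_count, total_count))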
Example 13: exit
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import newAPIHadoopFile [as alias]
exit(-1)
conf = SparkConf().set("spark.default.parallelism", "3")
sc = SparkContext(appName="SequenceFile", conf=conf)
path = sys.argv[1]
out = sys.argv[2]
# Read a SequenceFile
#lines = sc.sequenceFile(path, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.LongWritable")
# Hadoop's old API (mapred):
#lines = sc.hadoopFile(path, "org.apache.hadoop.mapred.SequenceFileInputFormat",
# "org.apache.hadoop.io.Text", "org.apache.hadoop.io.LongWritable")
# Hadoop's new API (mapreduce):
lines = sc.newAPIHadoopFile(path, "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat",
"org.apache.hadoop.io.Text", "org.apache.hadoop.io.IntWritable")
# In Hadoop's new API, getSplits has the signature getSplits(JobContext context);
# in the old API it is getSplits(JobConf job, int numSplits).
# textFile uses the old API (the same as hadoopFile) and shows 2 partitions here,
# while newAPIHadoopFile shows a single partition.
print "sequence partitions: %s" % lines.getNumPartitions()
results = lines.mapValues(lambda x: long(x))
for i in results.take(10):
print i[0] , i[1]
print results.count()
# saveAsSequenceFile(path, compressionCodecClass=None)
lines.saveAsSequenceFile(out, "org.apache.hadoop.io.compress.GzipCodec")