This article collects typical usage examples of the Python method pyspark.SparkContext.hadoopFile. If you have been wondering what SparkContext.hadoopFile does, how to call it, or what real code that uses it looks like, the curated examples below may help. You can also explore the containing class, pyspark.SparkContext, for related methods.
Four code examples of SparkContext.hadoopFile are shown below, ordered by popularity by default.
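Before the examples, here is a minimal sketch of the method's general shape: reading plain text through the standard Hadoop TextInputFormat yields an RDD of (byte offset, line) pairs. The HDFS path below is a placeholder, not taken from any of the examples.

from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("hadoopFile_minimal_example")
sc = SparkContext(conf=conf)

# hadoopFile targets the old (mapred) InputFormat API; the format, key and
# value classes are passed as fully qualified Java class names.
rdd = sc.hadoopFile(path="hdfs://namenode:8020/user/example/data",
                    inputFormatClass="org.apache.hadoop.mapred.TextInputFormat",
                    keyClass="org.apache.hadoop.io.LongWritable",
                    valueClass="org.apache.hadoop.io.Text")

print(rdd.take(5))  # [(offset, line), ...]
sc.stop()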
Example 1: SparkConf
# Required module: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import hadoopFile [as alias]
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("spark_app_read_data_from_rcfile")
sc = SparkContext(conf=conf)

# Read an RCFile; the custom valueConverter (shipped in a user jar) turns
# each BytesRefArrayWritable value into a Python object array.
rowRDD = sc.hadoopFile(path="hdfs://dip.cdh5.dev:8020/user/yurun/rcfile",
                       inputFormatClass="org.apache.hadoop.hive.ql.io.RCFileInputFormat",
                       keyClass="org.apache.hadoop.io.LongWritable",
                       valueClass="org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable",
                       valueConverter="com.sina.dip.spark.converter.BytesRefArrayWritableToObjectArrayConverter")

pairs = rowRDD.collect()
for pair in pairs:
    print(pair[0], pair[1])

sc.stop()
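A usage note: collect() materializes the entire file in the driver, which can be prohibitive for large datasets. For inspecting output, a bounded take() is safer; a minimal variation of the loop above:

for pair in rowRDD.take(10):  # pull only the first 10 records to the driver
    print(pair[0], pair[1])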
Example 2: SparkConf
# Required module: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import hadoopFile [as alias]
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("spark_app_read_data_from_seqfile")
sc = SparkContext(conf=conf)

# Read a SequenceFile whose keys are a custom IntArrayWritable; the
# keyConverter turns each key into a Python object array.
lineRDD = sc.hadoopFile(path="hdfs://dip.cdh5.dev:8020/user/yurun/seqfile",
                        inputFormatClass="org.apache.hadoop.mapred.SequenceFileInputFormat",
                        keyClass="com.sina.dip.spark.converter.IntArrayWritable",
                        valueClass="org.apache.hadoop.io.NullWritable",
                        keyConverter="com.sina.dip.spark.converter.IntArrayWritableToObjectArrayConverter") \
    .map(lambda pair: pair[0])  # keep only the converted keys

lines = lineRDD.collect()
for line in lines:
    print(line)

sc.stop()
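For the reverse direction, PySpark can also write key-value RDDs back out as SequenceFiles via RDD.saveAsSequenceFile. A minimal sketch, assuming an RDD of picklable key-value pairs and a placeholder output path:

kvRDD = sc.parallelize([(1, "a"), (2, "b")])
# saveAsSequenceFile infers the Writable types from the Python values.
kvRDD.saveAsSequenceFile("hdfs://namenode:8020/user/example/seqfile_out")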
Example 3: len
# Required module: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import hadoopFile [as alias]
import sys

from pyspark import SparkContext
from pyspark.sql import SQLContext

# logger, MysqlDao and ConfigPortalSql are project-specific helpers from the
# application this example was excerpted from.

if __name__ == '__main__':
    if len(sys.argv) != 5:
        print("Usage: spark_streaming.py <master> <begin> <end> <input>", file=sys.stderr)
        sys.exit(-1)
    master, time_begin, time_end, input = sys.argv[1:]
    input_path = input + '/' + time_begin + '.csv'
    logger.info("--->" + master + " " + input_path)

    sc = SparkContext(master, 'wxcity_userlogin_repeat_app')
    sql_context = SQLContext(sc)

    # Read the raw text files; each record is a (byte offset, line) pair.
    lines = sc.hadoopFile(input,
                          'org.apache.hadoop.mapred.TextInputFormat',
                          'org.apache.hadoop.io.LongWritable',
                          'org.apache.hadoop.io.Text')

    # Build a gateway-id -> hospital-id lookup table from MySQL.
    rs_tuples = MysqlDao().findWithQuery(ConfigPortalSql.select_mysql_hos_gw_sup)
    gwid_hosid_dict = {}
    for r in rs_tuples:
        hos_id = str(r[0])
        gw_id = r[1]
        gwid_hosid_dict[gw_id] = hos_id
    logger.debug('-->gwid_hosid:' + str(len(gwid_hosid_dict)))

    # Split each CSV line into 17 fields, strip whitespace, and append the
    # hospital id looked up from the second field (the gateway id).
    users = lines.map(lambda x: x[1].split(',')).filter(lambda x: len(x) == 17) \
        .map(lambda p: (p[0].strip(), p[1].strip(), p[2].strip(), p[3].strip(), p[4].strip(),
                        p[5].strip(), p[6].strip(), p[7].strip(), p[8].strip(), p[9].strip(),
                        p[10].strip(), p[11].strip(), p[12].strip(), p[13].strip(), p[14].strip(),
                        p[15].strip(), p[16].strip(), gwid_hosid_dict.get(p[1].strip(), "")))
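A design note on the lookup above: gwid_hosid_dict is captured in the map closure and so is re-serialized with every task. If the table were large, broadcasting it once per executor would be the usual refinement; a minimal, equivalent sketch:

bc_dict = sc.broadcast(gwid_hosid_dict)
users = lines.map(lambda x: x[1].split(',')).filter(lambda x: len(x) == 17) \
    .map(lambda p: tuple(f.strip() for f in p) + (bc_dict.value.get(p[1].strip(), ""),))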
Example 4: main
# Required module: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import hadoopFile [as alias]
import json
import sys
from collections import defaultdict
from importlib import import_module

# _make_arg_parser, shlex_split and CounterAccumulator are helpers defined
# elsewhere in the module this function was excerpted from.

def main(cmd_line_args=None):
    if cmd_line_args is None:
        cmd_line_args = sys.argv[1:]

    parser = _make_arg_parser()
    args = parser.parse_args(cmd_line_args)

    if args.num_reducers is not None and args.num_reducers <= 0:
        raise ValueError('num_reducers must be a positive number.')

    # get job_class
    job_module_name, job_class_name = args.job_class.rsplit('.', 1)
    job_module = import_module(job_module_name)
    job_class = getattr(job_module, job_class_name)

    # load initial data
    from pyspark import SparkContext

    if args.job_args:
        job_args = shlex_split(args.job_args)
    else:
        job_args = []

    # determine hadoop_*_format, steps
    # try to avoid instantiating a job in the driver; see #2044
    job = None

    if args.hadoop_input_format is None:
        job = job or job_class(job_args)
        hadoop_input_format = job.hadoop_input_format()
    else:
        hadoop_input_format = args.hadoop_input_format or None

    if args.hadoop_output_format is None:
        job = job or job_class(job_args)
        hadoop_output_format = job.hadoop_output_format()
    else:
        hadoop_output_format = args.hadoop_output_format or None

    if args.sort_values is None:
        job = job or job_class(job_args)
        sort_values = job.sort_values()
    else:
        sort_values = args.sort_values

    if args.steps_desc is None:
        job = job or job_class(job_args)
        steps = [step.description(step_num)
                 for step_num, step in enumerate(job.steps())]
    else:
        steps = json.loads(args.steps_desc)

    # pick steps
    start = args.first_step_num or 0
    end = None if args.last_step_num is None else args.last_step_num + 1
    steps_to_run = list(enumerate(steps))[start:end]

    sc = SparkContext()

    # keep track of one set of counters per job step
    counter_accumulators = [
        sc.accumulator(defaultdict(dict), CounterAccumulator())
        for _ in steps_to_run
    ]

    def make_increment_counter(step_num):
        counter_accumulator = counter_accumulators[step_num - start]

        def increment_counter(group, name, amount=1):
            counter_accumulator.add({group: {name: amount}})

        return increment_counter

    def make_mrc_job(mrc, step_num):
        j = job_class(job_args + [
            '--%s' % mrc, '--step-num=%d' % step_num
        ])

        # patch increment_counter() to update the accumulator for this step
        j.increment_counter = make_increment_counter(step_num)

        return j

    try:
        if hadoop_input_format:
            rdd = sc.hadoopFile(
                args.input_path,
                inputFormatClass=hadoop_input_format,
                keyClass='org.apache.hadoop.io.Text',
                valueClass='org.apache.hadoop.io.Text')

            # hadoopFile loads each line as a key-value pair in which the
            # contents of the line are the key and the value is an empty
            # string. Convert to an rdd of just lines, encoded as bytes.
            rdd = rdd.map(lambda kv: kv[0].encode('utf-8'))
        else:
            rdd = sc.textFile(args.input_path, use_unicode=False)

        # run steps
        # ... remainder of the function omitted ...