Python SparkContext.hadoopFile Method Code Examples

This article collects typical usage examples of the pyspark.SparkContext.hadoopFile method in Python. If you are wondering what SparkContext.hadoopFile does, how to call it, or what working code using it looks like, the selected examples below should help. You can also explore further usage examples of the enclosing class, pyspark.SparkContext.


The following presents 4 code examples of the SparkContext.hadoopFile method, sorted by popularity by default.
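
For orientation, here is a minimal sketch of a hadoopFile call, assuming a plain-text file readable through the standard old-API TextInputFormat; the HDFS path and application name are placeholders, not taken from the examples below:

from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("hadoopfile_minimal_sketch")
sc = SparkContext(conf=conf)

# hadoopFile reads an old-API (org.apache.hadoop.mapred) InputFormat as an RDD of
# (key, value) pairs. With TextInputFormat the key is the byte offset (LongWritable)
# and the value is the line text (Text).
rdd = sc.hadoopFile(path="hdfs:///tmp/example.txt",  # placeholder path
                    inputFormatClass="org.apache.hadoop.mapred.TextInputFormat",
                    keyClass="org.apache.hadoop.io.LongWritable",
                    valueClass="org.apache.hadoop.io.Text")

for offset, line in rdd.take(5):
    print(offset, line)

sc.stop()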

Example 1: SparkConf

# Required module import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import hadoopFile [as alias]
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("spark_app_read_data_from_rcfile")

sc = SparkContext(conf=conf)

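# Read an RCFile through the old-API RCFileInputFormat. The valueConverter is a
# project-specific Java/Scala class (presumably made available to Spark, e.g. via
# spark-submit --jars) that turns each BytesRefArrayWritable row into a plain
# object array of column values.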
rowRDD = sc.hadoopFile(path="hdfs://dip.cdh5.dev:8020/user/yurun/rcfile",
                       inputFormatClass="org.apache.hadoop.hive.ql.io.RCFileInputFormat",
                       keyClass="org.apache.hadoop.io.LongWritable",
                       valueClass="org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable",
                       valueConverter="com.sina.dip.spark.converter.BytesRefArrayWritableToObjectArrayConverter")

pairs = rowRDD.collect()

for pair in pairs:
    print(pair[0], pair[1])

sc.stop()
Developer: Leaderman, Project: pyspark, Lines of code: 20, Source: spark_app_read_data_from_rcfile.py

Example 2: SparkConf

# Required module import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import hadoopFile [as alias]
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("spark_app_read_data_from_seqfile")

sc = SparkContext(conf=conf)

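# Read a SequenceFile whose key is a project-specific IntArrayWritable and whose
# value is NullWritable. The keyConverter (presumably a Java/Scala class on the
# Spark classpath) turns each key into a Python list, and map() keeps only the
# converted keys.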
lineRDD = sc.hadoopFile(path="hdfs://dip.cdh5.dev:8020/user/yurun/seqfile",
                        inputFormatClass="org.apache.hadoop.mapred.SequenceFileInputFormat",
                        keyClass="com.sina.dip.spark.converter.IntArrayWritable",
                        valueClass="org.apache.hadoop.io.NullWritable",
                        keyConverter="com.sina.dip.spark.converter.IntArrayWritableToObjectArrayConverter").map(lambda pair: pair[0])

lines = lineRDD.collect()

for line in lines:
    print(line)

sc.stop()
Developer: Leaderman, Project: pyspark, Lines of code: 20, Source: spark_app_read_data_from_seqfile.py

Example 3: len

# Required module import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import hadoopFile [as alias]
import sys

from pyspark import SparkContext
from pyspark.sql import SQLContext

# Project-specific helpers (logger, MysqlDao, ConfigPortalSql) are assumed to be
# provided by the surrounding project and are not shown in this excerpt.
if __name__ == '__main__':

    if len(sys.argv) != 5:
        print("Usage: spark_streaming.py <master> <begin> <end> <input>", file=sys.stderr)
        sys.exit(-1)

    master, time_begin, time_end, input = sys.argv[1:]
    input_path = input + '/' + time_begin + '.csv'
    logger.info("--->" + master + " " + input_path)

    sc = SparkContext(master, 'wxcity_userlogin_repeat_app')
    sql_context = SQLContext(sc)

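    # Read the raw input as (byte offset, line text) pairs via the old-API
    # TextInputFormat; only the line text (x[1]) is used below.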
    lines = sc.hadoopFile(input,
                          'org.apache.hadoop.mapred.TextInputFormat',
                          'org.apache.hadoop.io.LongWritable',
                          'org.apache.hadoop.io.Text'
                          )

    rs_tuples = MysqlDao().findWithQuery(ConfigPortalSql.select_mysql_hos_gw_sup)
    gwid_hosid_dict = {}
    for r in rs_tuples:
        hos_id = str(r[0])
        gw_id = r[1]
        gwid_hosid_dict[gw_id] = hos_id
    logger.debug('-->gwid_hosid:' + str(len(gwid_hosid_dict)))
    users = lines.map(lambda x: x[1].split(',')).filter(lambda x: len(x) == 17) \
        .map(lambda p: (p[0].strip(), p[1].strip(), p[2].strip(), p[3].strip(), p[4].strip(), \
                        p[5].strip(), p[6].strip(), p[7].strip(), p[8].strip(), p[9].strip(), \
                        p[10].strip(), p[11].strip(), p[12].strip(), p[13].strip(), p[14].strip(), \
                        p[15].strip(), p[16].strip(), gwid_hosid_dict.get(p[1].strip(), "")))
Developer: wangcunxin, Project: spark_py, Lines of code: 33, Source: userlogin_repeat.py

Example 4: main

# Required module import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import hadoopFile [as alias]
import json
import sys
from collections import defaultdict
from importlib import import_module

# This function is excerpted from mrjob's harness.py; helpers such as
# _make_arg_parser(), shlex_split() and CounterAccumulator are defined elsewhere
# in that module.
def main(cmd_line_args=None):
    if cmd_line_args is None:
        cmd_line_args = sys.argv[1:]

    parser = _make_arg_parser()
    args = parser.parse_args(cmd_line_args)

    if args.num_reducers is not None and args.num_reducers <= 0:
        raise ValueError(
            'num_reducers must be a positive number.')

    # get job_class
    job_module_name, job_class_name = args.job_class.rsplit('.', 1)
    job_module = import_module(job_module_name)
    job_class = getattr(job_module, job_class_name)

    # load initial data
    from pyspark import SparkContext

    if args.job_args:
        job_args = shlex_split(args.job_args)
    else:
        job_args = []

    # determine hadoop_*_format, steps
    # try to avoid instantiating a job in the driver; see #2044
    job = None

    if args.hadoop_input_format is None:
        job = job or job_class(job_args)
        hadoop_input_format = job.hadoop_input_format()
    else:
        hadoop_input_format = args.hadoop_input_format or None

    if args.hadoop_output_format is None:
        job = job or job_class(job_args)
        hadoop_output_format = job.hadoop_output_format()
    else:
        hadoop_output_format = args.hadoop_output_format or None

    if args.sort_values is None:
        job = job or job_class(job_args)
        sort_values = job.sort_values()
    else:
        sort_values = args.sort_values

    if args.steps_desc is None:
        job = job or job_class(job_args)
        steps = [step.description(step_num)
                 for step_num, step in enumerate(job.steps())]
    else:
        steps = json.loads(args.steps_desc)

    # pick steps
    start = args.first_step_num or 0
    end = None if args.last_step_num is None else args.last_step_num + 1
    steps_to_run = list(enumerate(steps))[start:end]

    sc = SparkContext()

    # keep track of one set of counters per job step
    counter_accumulators = [
        sc.accumulator(defaultdict(dict), CounterAccumulator())
        for _ in steps_to_run
    ]

    def make_increment_counter(step_num):
        counter_accumulator = counter_accumulators[step_num - start]

        def increment_counter(group, name, amount=1):
            counter_accumulator.add({group: {name: amount}})

        return increment_counter

    def make_mrc_job(mrc, step_num):
        j = job_class(job_args + [
            '--%s' % mrc, '--step-num=%d' % step_num
        ])

        # patch increment_counter() to update the accumulator for this step
        j.increment_counter = make_increment_counter(step_num)

        return j

    try:
        if hadoop_input_format:
            rdd = sc.hadoopFile(
                args.input_path,
                inputFormatClass=hadoop_input_format,
                keyClass='org.apache.hadoop.io.Text',
                valueClass='org.apache.hadoop.io.Text')

            # hadoopFile loads each line as a key-value pair in which the
            # contents of the line are the key and the value is an empty
            # string. Convert to an rdd of just lines, encoded as bytes.
            rdd = rdd.map(lambda kv: kv[0].encode('utf-8'))
        else:
            rdd = sc.textFile(args.input_path, use_unicode=False)

        # run steps
#......... remaining code omitted here .........
Developer: Yelp, Project: mrjob, Lines of code: 103, Source: harness.py


Note: The pyspark.SparkContext.hadoopFile examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are taken from open-source projects contributed by their respective developers, and copyright in the source code remains with the original authors. For distribution and use, please follow the corresponding project's license; do not reproduce without permission.