

Python UploadDirManager.uri Method Code Examples

This article collects typical usage examples of the Python method mrjob.setup.UploadDirManager.uri. If you are unsure what UploadDirManager.uri does or how to call it, the selected code examples below should help. You can also explore further usage examples of the containing class, mrjob.setup.UploadDirManager.


Thirteen code examples of the UploadDirManager.uri method are shown below, sorted by popularity.
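
To orient readers before the examples, here is a minimal usage sketch. The prefix and file paths in it are illustrative assumptions rather than values from any of the listed projects; only the API calls (add, uri, path_to_uri) mirror what the examples below exercise.

# Minimal usage sketch of mrjob.setup.UploadDirManager.uri.
# The prefix 'hdfs:///tmp/mrjob/files/' and the local paths are illustrative
# assumptions; the calls themselves match the behavior asserted in the tests below.
from mrjob.setup import UploadDirManager

mgr = UploadDirManager('hdfs:///tmp/mrjob/files/')  # upload-dir prefix
mgr.add('local/dir/script.py')                      # register a local file for upload

# uri() maps a registered local path to <prefix> + basename
mgr.uri('local/dir/script.py')    # -> 'hdfs:///tmp/mrjob/files/script.py'

# a path that is already a URI is returned unchanged and is not tracked
mgr.uri('hdfs://host/other.py')   # -> 'hdfs://host/other.py'

# path_to_uri() returns the full mapping for everything that was add()ed
mgr.path_to_uri()  # -> {'local/dir/script.py': 'hdfs:///tmp/mrjob/files/script.py'}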

Example 1: test_unknown_uri

# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import uri [as alias]
    def test_unknown_uri(self):
        sd = UploadDirManager("hdfs:///")
        sd.add("foo/bar.py")
        self.assertEqual(sd.path_to_uri(), {"foo/bar.py": "hdfs:///bar.py"})
        self.assertEqual(sd.uri("hdfs://host/path/to/bar.py"),
                         "hdfs://host/path/to/bar.py")
        # checking unknown URIs doesn't add them
        self.assertEqual(sd.path_to_uri(), {"foo/bar.py": "hdfs:///bar.py"})
Author: irskep, Project: mrjob, Lines: 9, Source file: test_setup.py

Example 2: test_unknown_uri

# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import uri [as alias]
    def test_unknown_uri(self):
        sd = UploadDirManager('hdfs:///')
        sd.add('foo/bar.py')
        self.assertEqual(sd.path_to_uri(), {'foo/bar.py': 'hdfs:///bar.py'})
        self.assertEqual(sd.uri('hdfs://host/path/to/bar.py'),
                         'hdfs://host/path/to/bar.py')
        # checking unknown URIs doesn't add them
        self.assertEqual(sd.path_to_uri(), {'foo/bar.py': 'hdfs:///bar.py'})
Author: anirudhreddy92, Project: mrjob, Lines: 10, Source file: test_setup.py

Example 3: HadoopJobRunner

# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import uri [as alias]

#......... (some code omitted here) .........

        self._check_input_exists()
        self._create_wrapper_script()
        self._add_job_files_for_upload()
        self._upload_local_files_to_hdfs()
        self._run_job_in_hadoop()

    def _check_input_exists(self):
        """Make sure all input exists before continuing with our job.
        """
        for path in self._input_paths:
            if path == '-':
                continue  # STDIN always exists

            if not self.path_exists(path):
                raise AssertionError(
                    'Input path %s does not exist!' % (path,))

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

    def _upload_local_files_to_hdfs(self):
        """Copy files managed by self._upload_mgr to HDFS
        """
        self._mkdir_on_hdfs(self._upload_mgr.prefix)

        log.info('Copying local files into %s' % self._upload_mgr.prefix)
        for path, uri in self._upload_mgr.path_to_uri().iteritems():
            self._upload_to_hdfs(path, uri)

    def _mkdir_on_hdfs(self, path):
        log.debug('Making directory %s on HDFS' % path)
        self.invoke_hadoop(['fs', '-mkdir', path])

    def _upload_to_hdfs(self, path, target):
        log.debug('Uploading %s -> %s on HDFS' % (path, target))
        self.invoke_hadoop(['fs', '-put', path, target])

    def _dump_stdin_to_local_file(self):
        """Dump sys.stdin to a local file, and return the path to it."""
        stdin_path = os.path.join(self._get_local_tmp_dir(), 'STDIN')
         # prompt user, so they don't think the process has stalled
        log.info('reading from STDIN')

        log.debug('dumping stdin to local file %s' % stdin_path)
        stdin_file = open(stdin_path, 'w')
        for line in self._stdin:
            stdin_file.write(line)

        return stdin_path

    def _run_job_in_hadoop(self):
        self._counters = []
        steps = self._get_steps()

        for step_num, step in enumerate(steps):
            log.debug('running step %d of %d' % (step_num + 1, len(steps)))

            streaming_args = self._streaming_args(step, step_num, len(steps))
Author: Infolaber, Project: mrjob, Lines: 69, Source file: hadoop.py

Example 4: HadoopJobRunner

# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import uri [as alias]
class HadoopJobRunner(MRJobRunner):
    """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster.
    Invoked when you run your job with ``-r hadoop``.

    Input and support files can be either local or on HDFS; use ``hdfs://...``
    URLs to refer to files on HDFS.
    """

    alias = "hadoop"

    OPTION_STORE_CLASS = HadoopRunnerOptionStore

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
        as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(HadoopJobRunner, self).__init__(**kwargs)

        self._hdfs_tmp_dir = fully_qualify_hdfs_path(posixpath.join(self._opts["hdfs_scratch_dir"], self._job_key))

        # Keep track of local files to upload to HDFS. We'll add them
        # to this manager just before we need them.
        hdfs_files_dir = posixpath.join(self._hdfs_tmp_dir, "files", "")
        self._upload_mgr = UploadDirManager(hdfs_files_dir)

        # Set output dir if it wasn't set explicitly
        self._output_dir = fully_qualify_hdfs_path(self._output_dir or posixpath.join(self._hdfs_tmp_dir, "output"))

        self._hadoop_log_dir = hadoop_log_dir(self._opts["hadoop_home"])

        # Running jobs via hadoop assigns a new timestamp to each job.
        # Running jobs via mrjob only adds steps.
        # Store both of these values to enable log parsing.
        self._job_timestamp = None
        self._start_step_num = 0

        # init hadoop version cache
        self._hadoop_version = None

    @property
    def fs(self):
        """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local
        filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem(HadoopFilesystem(self._opts["hadoop_bin"]), LocalFilesystem())
        return self._fs

    def get_hadoop_version(self):
        """Invoke the hadoop executable to determine its version"""
        if not self._hadoop_version:
            stdout = self.invoke_hadoop(["version"], return_stdout=True)
            if stdout:
                first_line = stdout.split("\n")[0]
                m = HADOOP_VERSION_RE.match(first_line)
                if m:
                    self._hadoop_version = m.group("version")
                    log.info("Using Hadoop version %s" % self._hadoop_version)
                    return self._hadoop_version
            self._hadoop_version = "0.20.203"
            log.info("Unable to determine Hadoop version. Assuming 0.20.203.")
        return self._hadoop_version

    def _run(self):
        self._check_input_exists()
        self._create_setup_wrapper_script()
        self._add_job_files_for_upload()
        self._upload_local_files_to_hdfs()
        self._run_job_in_hadoop()

    def _check_input_exists(self):
        """Make sure all input exists before continuing with our job.
        """
        for path in self._input_paths:
            if path == "-":
                continue  # STDIN always exists

            if self._opts["check_input_paths"]:
                if not self.path_exists(path):
                    raise AssertionError("Input path %s does not exist!" % (path,))

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

    def _upload_local_files_to_hdfs(self):
        """Copy files managed by self._upload_mgr to HDFS
        """
        self._mkdir_on_hdfs(self._upload_mgr.prefix)

        log.info("Copying local files into %s" % self._upload_mgr.prefix)
        for path, uri in self._upload_mgr.path_to_uri().items():
            self._upload_to_hdfs(path, uri)

#......... (some code omitted here) .........
Author: ZhouYunan, Project: mrjob, Lines: 103, Source file: hadoop.py

Example 5: HadoopJobRunner

# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import uri [as alias]

#......... (some code omitted here) .........
        self._check_input_exists()
        self._create_setup_wrapper_script()
        self._add_job_files_for_upload()
        self._upload_local_files_to_hdfs()
        self._run_job_in_hadoop()

    def _check_input_exists(self):
        """Make sure all input exists before continuing with our job.
        """
        for path in self._input_paths:
            if path == '-':
                continue  # STDIN always exists

            if self._opts['check_input_paths']:
                if not self.fs.exists(path):
                    raise AssertionError(
                        'Input path %s does not exist!' % (path,))

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

    def _upload_local_files_to_hdfs(self):
        """Copy files managed by self._upload_mgr to HDFS
        """
        self.fs.mkdir(self._upload_mgr.prefix)

        log.info('Copying local files into %s' % self._upload_mgr.prefix)
        for path, uri in self._upload_mgr.path_to_uri().items():
            self._upload_to_hdfs(path, uri)

    def _upload_to_hdfs(self, path, target):
        log.debug('Uploading %s -> %s on HDFS' % (path, target))
        self.fs._put(path, target)

    def _dump_stdin_to_local_file(self):
        """Dump sys.stdin to a local file, and return the path to it."""
        stdin_path = posixpath.join(self._get_local_tmp_dir(), 'STDIN')
         # prompt user, so they don't think the process has stalled
        log.info('reading from STDIN')

        log.debug('dumping stdin to local file %s' % stdin_path)
        stdin_file = open(stdin_path, 'wb')
        for line in self._stdin:
            stdin_file.write(line)

        return stdin_path

    def _run_job_in_hadoop(self):
        for step_num in range(self._num_steps()):
            step_args = self._args_for_step(step_num)

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d' %
                      (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)
Author: mmontagna, Project: mrjob, Lines: 69, Source file: hadoop.py

Example 6: uri_adds_trailing_slash

# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import uri [as alias]
    def uri_adds_trailing_slash(self):
        sd = UploadDirManager("s3://bucket/dir")
        sd.add("foo/bar.py")
        self.assertEqual(sd.uri("foo/bar.py"), "s3://bucket/dir/bar.py")
        self.assertEqual(sd.path_to_uri(),
                         {"foo/bar.py": "s3://bucket/dir/bar.py"})
Author: irskep, Project: mrjob, Lines: 7, Source file: test_setup.py

Example 7: test_uri

# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import uri [as alias]
    def test_uri(self):
        sd = UploadDirManager("hdfs:///")
        sd.add("foo/bar.py")
        self.assertEqual(sd.uri("foo/bar.py"), "hdfs:///bar.py")
Author: irskep, Project: mrjob, Lines: 6, Source file: test_setup.py

Example 8: DataprocJobRunner

# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import uri [as alias]

#......... (some code omitted here) .........
        # Example default - "mrjob-us-central1-RANDOMHEX"
        if not chosen_bucket_name:
            chosen_bucket_name = '-'.join(
                ['mrjob', gce_lower_location, random_identifier()])

        return 'gs://%s/tmp/' % chosen_bucket_name

    def _run(self):
        self._launch()
        self._run_steps()

    def _launch(self):
        self._prepare_for_launch()
        self._launch_cluster()

    def _prepare_for_launch(self):
        self._check_input_exists()
        self._check_output_not_exists()
        self._create_setup_wrapper_script()
        self._add_bootstrap_files_for_upload()
        self._add_job_files_for_upload()
        self._upload_local_files_to_fs()

    def _check_input_exists(self):
        """Make sure all input exists before continuing with our job.
        """
        if not self._opts['check_input_paths']:
            return

        for path in self._input_paths:
            if path == '-':
                continue  # STDIN always exists

            if is_uri(path) and not is_gcs_uri(path):
                continue  # can't check non-GCS URIs, hope for the best

            if not self.fs.exists(path):
                raise AssertionError(
                    'Input path %s does not exist!' % (path,))

    def _check_output_not_exists(self):
        """Verify the output path does not already exist. This avoids
        provisioning a cluster only to have Hadoop refuse to launch.
        """
        if self.fs.exists(self._output_dir):
            raise IOError(
                'Output path %s already exists!' % (self._output_dir,))

    def _add_bootstrap_files_for_upload(self):
        """Add files needed by the bootstrap script to self._upload_mgr.

        Tar up mrjob if bootstrap_mrjob is True.

        Create the master bootstrap script if necessary.

        """
        # lazily create mrjob.zip
        if self._bootstrap_mrjob():
            self._create_mrjob_zip()
            self._bootstrap_dir_mgr.add('file', self._mrjob_zip_path)

        # all other files needed by the script are already in
        # _bootstrap_dir_mgr
        for path in self._bootstrap_dir_mgr.paths():
            self._upload_mgr.add(path)
Author: davidmarin, Project: mrjob, Lines: 69, Source file: dataproc.py

Example 9: uri_adds_trailing_slash

# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import uri [as alias]
    def uri_adds_trailing_slash(self):
        sd = UploadDirManager('s3://bucket/dir')
        sd.add('foo/bar.py')
        self.assertEqual(sd.uri('foo/bar.py'), 's3://bucket/dir/bar.py')
        self.assertEqual(sd.path_to_uri(),
                         {'foo/bar.py': 's3://bucket/dir/bar.py'})
Author: anirudhreddy92, Project: mrjob, Lines: 8, Source file: test_setup.py

Example 10: DataprocJobRunner

# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import uri [as alias]

#......... (some code omitted here) .........

        # all other files needed by the script are already in
        # _bootstrap_dir_mgr
        for path in self._bootstrap_dir_mgr.paths():
            self._upload_mgr.add(path)

        # now that we know where the above files live, we can create
        # the master bootstrap script
        self._create_master_bootstrap_script_if_needed()
        if self._master_bootstrap_script_path:
            self._upload_mgr.add(self._master_bootstrap_script_path)
            self._upload_mgr.add(_MAX_MINS_IDLE_BOOTSTRAP_ACTION_PATH)

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

        # TODO - mtai @ davidmarin - hadoop_streaming_jar is currently ignored,
        # see _HADOOP_STREAMING_JAR_URI
        # if self._opts['hadoop_streaming_jar']:
        #     self._upload_mgr.add(self._opts['hadoop_streaming_jar'])

        for step in self._get_steps():
            if step.get('jar'):
                self._upload_mgr.add(step['jar'])

    def _upload_local_files_to_fs(self):
        """Copy local files tracked by self._upload_mgr to FS."""
        bucket_name, _ = parse_gcs_uri(self._job_tmpdir)
        self._create_fs_tmp_bucket(bucket_name)

        log.info('Copying non-input files into %s' % self._upload_mgr.prefix)

        for path, gcs_uri in self._upload_mgr.path_to_uri().items():
            log.debug('uploading %s -> %s' % (path, gcs_uri))

            # TODO - mtai @ davidmarin - Implement put function for other FSs
            self.fs.put(path, gcs_uri)

        self._wait_for_fs_sync()

    def _create_fs_tmp_bucket(self, bucket_name, location=None):
        """Create a temp bucket if missing

        Tie the temporary bucket to the same region as the GCE job and set a
        28-day TTL
        """
        # Return early if our bucket already exists
        try:
            self.fs.get_bucket(bucket_name)
            return
        except google_errors.HttpError as e:
            if not e.resp.status == 404:
                raise

        log.info('creating FS bucket %r' % bucket_name)

        location = location or self._gce_region

        # NOTE - By default, we create a bucket in the same GCE region as our
        # job (tmp buckets ONLY)
Author: okomestudio, Project: mrjob, Lines: 70, Source file: dataproc.py

Example 11: HadoopJobRunner

# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import uri [as alias]

#......... (some code omitted here) .........
        self._check_input_exists()
        self._create_setup_wrapper_script()
        self._add_job_files_for_upload()
        self._upload_local_files_to_hdfs()
        self._run_job_in_hadoop()

    def _check_input_exists(self):
        """Make sure all input exists before continuing with our job.
        """
        for path in self._input_paths:
            if path == '-':
                continue  # STDIN always exists

            if self._opts['check_input_paths']:
                if not self.fs.exists(path):
                    raise AssertionError(
                        'Input path %s does not exist!' % (path,))

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

    def _upload_local_files_to_hdfs(self):
        """Copy files managed by self._upload_mgr to HDFS
        """
        self.fs.mkdir(self._upload_mgr.prefix)

        log.info('Copying local files into %s' % self._upload_mgr.prefix)
        for path, uri in self._upload_mgr.path_to_uri().items():
            self._upload_to_hdfs(path, uri)

    def _upload_to_hdfs(self, path, target):
        log.debug('Uploading %s -> %s on HDFS' % (path, target))
        self.fs._put(path, target)

    def _dump_stdin_to_local_file(self):
        """Dump sys.stdin to a local file, and return the path to it."""
        stdin_path = posixpath.join(self._get_local_tmp_dir(), 'STDIN')
         # prompt user, so they don't think the process has stalled
        log.info('reading from STDIN')

        log.debug('dumping stdin to local file %s' % stdin_path)
        stdin_file = open(stdin_path, 'wb')
        for line in self._stdin:
            stdin_file.write(line)

        return stdin_path

    def _run_job_in_hadoop(self):
        self._counters = []

        for step_num in range(self._num_steps()):
            log.debug('running step %d of %d' %
                      (step_num + 1, self._num_steps()))

            step_args = self._args_for_step(step_num)

            log.debug('> %s' % cmd_line(step_args))

            # try to use a PTY if it's available
            try:
Author: alanhdu, Project: mrjob, Lines: 70, Source file: hadoop.py

Example 12: DataprocJobRunner

# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import uri [as alias]
class DataprocJobRunner(HadoopInTheCloudJobRunner, LogInterpretationMixin):
    """Runs an :py:class:`~mrjob.job.MRJob` on Google Cloud Dataproc.
    Invoked when you run your job with ``-r dataproc``.

    :py:class:`DataprocJobRunner` runs your job in a Dataproc cluster, which
    is basically a temporary Hadoop cluster.

    Input, support, and jar files can be either local or on GCS; use
    ``gs://...`` URLs to refer to files on GCS.

    This class has some useful utilities for talking directly to GCS and
    Dataproc, so you may find it useful to instantiate it without a script::

        from mrjob.dataproc import DataprocJobRunner
        ...
    """
    alias = 'dataproc'

    OPT_NAMES = HadoopInTheCloudJobRunner.OPT_NAMES | {
        'cluster_properties',
        'core_instance_config',
        'gcloud_bin',
        'master_instance_config',
        'network',
        'project_id',
        'service_account',
        'service_account_scopes',
        'subnet',
        'task_instance_config',
    }

    # no Spark support yet (see #1765)
    _STEP_TYPES = {'jar', 'streaming'}

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same
        arguments as
        :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(DataprocJobRunner, self).__init__(**kwargs)

        # check for library support
        if google is None:
            raise ImportError(
                'You must install google-cloud-logging and '
                'google-cloud-storage to connect to Dataproc')

        # Dataproc requires a master and >= 2 core instances
        # num_core_instances refers ONLY to number of CORE instances and does
        # NOT include the required 1 instance for master
        # In other words, minimum cluster size is 3 machines, 1 master and 2
        # "num_core_instances" workers
        if self._opts['num_core_instances'] < _DATAPROC_MIN_WORKERS:
            raise DataprocException(
                'Dataproc expects at LEAST %d workers' % _DATAPROC_MIN_WORKERS)

        if (self._opts['core_instance_type'] !=
                self._opts['task_instance_type']):
            raise DataprocException(
                'Dataproc v1 expects core/task instance types to be identical')

        # see #1820
        if self._opts['image_id']:
            log.warning('mrjob does not yet support custom machine images'
                        ' on Dataproc')

        # load credentials and project ID
        self._credentials, auth_project_id = google.auth.default(
            scopes=[_FULL_SCOPE])  # needed for $GOOGLE_APPLICATION_CREDENTIALS

        self._project_id = self._opts['project_id'] or auth_project_id

        if not self._project_id:
            raise DataprocException(
                'project_id must be set. Use --project_id or'
                ' set $GOOGLE_CLOUD_PROJECT')

        self._fix_zone_and_region_opts()

        if self._opts['service_account_scopes']:
            self._opts['service_account_scopes'] = [
                _fully_qualify_scope_uri(s)
                for s in self._opts['service_account_scopes']
            ]

        # cluster_id can be None here
        self._cluster_id = self._opts['cluster_id']

        self._api_client = None
        self._gcs_fs = None
        self._fs = None

        # BEGIN - setup directories
        base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir'])

        self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir)

        # use job key to make a unique tmp dir
        self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/'
#......... (some code omitted here) .........
Author: Yelp, Project: mrjob, Lines: 103, Source file: dataproc.py

Example 13: HadoopJobRunner

# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import uri [as alias]

#......... (some code omitted here) .........
        self._check_input_exists()
        self._create_setup_wrapper_script()
        self._add_job_files_for_upload()
        self._upload_local_files_to_hdfs()
        self._run_job_in_hadoop()

    def _check_input_exists(self):
        """Make sure all input exists before continuing with our job.
        """
        for path in self._input_paths:
            if path == '-':
                continue  # STDIN always exists

            if self._opts['check_input_paths']:
                if not self.fs.exists(path):
                    raise AssertionError(
                        'Input path %s does not exist!' % (path,))

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

    def _upload_local_files_to_hdfs(self):
        """Copy files managed by self._upload_mgr to HDFS
        """
        self.fs.mkdir(self._upload_mgr.prefix)

        log.info('Copying local files into %s' % self._upload_mgr.prefix)
        for path, uri in self._upload_mgr.path_to_uri().items():
            self._upload_to_hdfs(path, uri)

    def _upload_to_hdfs(self, path, target):
        log.debug('Uploading %s -> %s on HDFS' % (path, target))
        self.fs._put(path, target)

    def _dump_stdin_to_local_file(self):
        """Dump sys.stdin to a local file, and return the path to it."""
        stdin_path = posixpath.join(self._get_local_tmp_dir(), 'STDIN')
         # prompt user, so they don't think the process has stalled
        log.info('reading from STDIN')

        log.debug('dumping stdin to local file %s' % stdin_path)
        stdin_file = open(stdin_path, 'wb')
        for line in self._stdin:
            stdin_file.write(line)

        return stdin_path

    def _run_job_in_hadoop(self):
        for step_num in range(self._num_steps()):
            step_args = self._args_for_step(step_num)

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d' %
                      (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
Author: sebratt, Project: mrjob, Lines: 70, Source file: hadoop.py


Note: The mrjob.setup.UploadDirManager.uri method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors. Please consult each project's License before redistributing or using the code, and do not reproduce this article without permission.