

Python setup.WorkingDirManager Class: Code Examples

This article collects typical usage examples of the Python class mrjob.setup.WorkingDirManager. If you have been wondering what the WorkingDirManager class is for, how to use it, or what real-world usage looks like, the curated class code examples below should help.


Below are 14 code examples of the WorkingDirManager class, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
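
To give a quick feel for the API these examples exercise, here is a minimal usage sketch. It is based solely on the calls that appear in the examples below, and assumes mrjob is installed:

    from mrjob.setup import WorkingDirManager

    wd = WorkingDirManager()
    wd.add('file', 'foo/bar.py')                 # auto-named 'bar.py'
    wd.add('file', 'foo/bar.py', name='qux.py')  # explicit name for the same path
    wd.add('archive', 's3://bucket/baz.tar.gz')

    print(wd.name('file', 'foo/bar.py'))  # -> 'bar.py'
    print(wd.name_to_path('file'))
    # -> {'bar.py': 'foo/bar.py', 'qux.py': 'foo/bar.py'}
    print(wd.paths())
    # -> {'foo/bar.py', 's3://bucket/baz.tar.gz'}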

Example 1: __init__

    def __init__(self, **kwargs):
        super(HadoopInTheCloudJobRunner, self).__init__(**kwargs)

        # if *cluster_id* is not set, ``self._cluster_id`` will be
        # set when we create or join a cluster
        self._cluster_id = self._opts['cluster_id']

        # bootstrapping
        self._bootstrap = self._bootstrap_python() + self._parse_bootstrap()

        # add files to manager
        self._bootstrap_dir_mgr = WorkingDirManager()

        for cmd in self._bootstrap:
            for token in cmd:
                if isinstance(token, dict):
                    # convert dir archive tokens to archives
                    if token['type'] == 'dir':
                        token['path'] = self._dir_archive_path(token['path'])
                        token['type'] = 'archive'

                    self._bootstrap_dir_mgr.add(**token)

        # we'll create this script later, as needed
        self._master_bootstrap_script_path = None
Author: okomestudio | Project: mrjob | Lines: 25 | Source file: cloud.py
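
The loop above rewrites directory tokens into archive tokens before handing them to the WorkingDirManager. Here is a standalone sketch of just that conversion, with _dir_archive_path stubbed out and a hypothetical token (the dict shape is inferred from the code above):

    def _dir_archive_path(path):
        # stand-in for the real method, which picks a tmp-dir .tar.gz path
        return path.rstrip('/') + '.tar.gz'

    token = {'type': 'dir', 'path': 'my_deps/'}  # hypothetical bootstrap token
    if token['type'] == 'dir':
        token['path'] = _dir_archive_path(token['path'])
        token['type'] = 'archive'

    print(token)  # {'type': 'archive', 'path': 'my_deps.tar.gz'}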

Example 2: test_allow_hidden_files

    def test_allow_hidden_files(self):
        wd = WorkingDirManager()
        wd.add('archive', '_foo.tar.gz')
        wd.add('file', '.bazrc')

        self.assertEqual(wd.name('archive', '_foo.tar.gz'), '_foo.tar.gz')
        self.assertEqual(wd.name('file', '.bazrc'), '.bazrc')
Author: Affirm | Project: mrjob | Lines: 7 | Source file: test_setup.py

Example 3: test_simple

    def test_simple(self):
        wd = WorkingDirManager()
        wd.add('archive', 's3://bucket/path/to/baz.tar.gz')
        wd.add('file', 'foo/bar.py')
        self.assertEqual(wd.name_to_path('file'),
                         {'bar.py': 'foo/bar.py'})
        self.assertEqual(wd.name_to_path('archive'),
                         {'baz.tar.gz': 's3://bucket/path/to/baz.tar.gz'})
        self.assertEqual(
            wd.paths(),
            set(['foo/bar.py', 's3://bucket/path/to/baz.tar.gz']))
Author: Affirm | Project: mrjob | Lines: 11 | Source file: test_setup.py

Example 4: test_auto_names_are_different_from_assigned_names

    def test_auto_names_are_different_from_assigned_names(self):
        wd = WorkingDirManager()
        wd.add('file', 'foo/bar.py', name='qux.py')
        wd.add('file', 'foo/bar.py')  # use default name bar.py
        self.assertEqual(wd.name_to_path('file'),
                         {'qux.py': 'foo/bar.py',
                          'bar.py': 'foo/bar.py'})
        self.assertEqual(wd.paths(), set(['foo/bar.py']))
Author: Affirm | Project: mrjob | Lines: 8 | Source file: test_setup.py

Example 5: __init__

    def __init__(self, **kwargs):
        super(HadoopInTheCloudJobRunner, self).__init__(**kwargs)

        # if *cluster_id* is not set, ``self._cluster_id`` will be
        # set when we create or join a cluster
        self._cluster_id = self._opts['cluster_id']

        # bootstrapping
        self._bootstrap = self._bootstrap_python() + self._parse_bootstrap()

        # add files to manager
        self._bootstrap_dir_mgr = WorkingDirManager()

        for cmd in self._bootstrap:
            for token in cmd:
                if isinstance(token, dict):
                    # convert dir archive tokens to archives
                    if token['type'] == 'dir':
                        token['path'] = self._dir_archive_path(token['path'])
                        token['type'] = 'archive'

                    self._bootstrap_dir_mgr.add(**token)

        # we'll create this script later, as needed
        self._master_bootstrap_script_path = None

        # ssh state

        # the process for the SSH tunnel
        self._ssh_proc = None

        # if this is true, stop trying to launch the SSH tunnel
        self._give_up_on_ssh_tunnel = False

        # store the (tunneled) URL of the job tracker/resource manager
        self._ssh_tunnel_url = None
Author: Affirm | Project: mrjob | Lines: 36 | Source file: cloud.py

Example 6: test_cant_auto_name_unless_added_as_auto

    def test_cant_auto_name_unless_added_as_auto(self):
        wd = WorkingDirManager()
        wd.add('file', 'bar.py', name='qux.py')
        self.assertEqual(wd.name('file', 'bar.py', 'qux.py'), 'qux.py')
        self.assertRaises(ValueError,
                          wd.name, 'file', 'bar.py')
Author: anirudhreddy92 | Project: mrjob | Lines: 6 | Source file: test_setup.py

Example 7: MRJobRunner

class MRJobRunner(object):
    """Abstract base class for all runners"""

    #: alias for this runner; used for picking the section of
    #: :py:mod:`mrjob.conf` to load. One of ``'local'``, ``'emr'``,
    #: or ``'hadoop'``.
    alias = None

    # if this is true, when bootstrap_mrjob is true, add it through the
    # setup script
    BOOTSTRAP_MRJOB_IN_SETUP = True

    OPTION_STORE_CLASS = RunnerOptionStore

    ### methods to call from your batch script ###

    def __init__(self, mr_job_script=None, conf_paths=None,
                 extra_args=None, file_upload_args=None,
                 hadoop_input_format=None, hadoop_output_format=None,
                 input_paths=None, output_dir=None, partitioner=None,
                 stdin=None, **opts):
        """All runners take the following keyword arguments:

        :type mr_job_script: str
        :param mr_job_script: the path of the ``.py`` file containing the
                              :py:class:`~mrjob.job.MRJob`. If this is None,
                              you won't actually be able to :py:meth:`run` the
                              job, but other utilities (e.g. :py:meth:`ls`)
                              will work.
        :type conf_paths: None or list
        :param conf_paths: List of config files to combine and use, or None to
                           search for mrjob.conf in the default locations.
        :type extra_args: list of str
        :param extra_args: a list of extra cmd-line arguments to pass to the
                           mr_job script. This is a hook to allow jobs to take
                           additional arguments.
        :param file_upload_args: a list of tuples of ``('--ARGNAME', path)``.
                                 The file at the given path will be uploaded
                                 to the local directory of the mr_job script
                                 when it runs, and then passed into the script
                                 with ``--ARGNAME``. Useful for passing in
                                 SQLite DBs and other configuration files to
                                 your job.
        :type hadoop_input_format: str
        :param hadoop_input_format: name of an optional Hadoop ``InputFormat``
                                    class. Passed to Hadoop along with your
                                    first step with the ``-inputformat``
                                    option. Note that if you write your own
                                    class, you'll need to include it in your
                                    own custom streaming jar (see
                                    *hadoop_streaming_jar*).
        :type hadoop_output_format: str
        :param hadoop_output_format: name of an optional Hadoop
                                     ``OutputFormat`` class. Passed to Hadoop
                                     along with your first step with the
                                     ``-outputformat`` option. Note that if you
                                     write your own class, you'll need to
                                     include it in your own custom streaming
                                     jar (see *hadoop_streaming_jar*).
        :type input_paths: list of str
        :param input_paths: Input files for your job. Supports globs and
                            recursively walks directories (e.g.
                            ``['data/common/', 'data/training/*.gz']``). If
                            this is left blank, we'll read from stdin.
        :type output_dir: str
        :param output_dir: An empty/non-existent directory where Hadoop
                           streaming should put the final output from the job.
                           If you don't specify an output directory, we'll
                           output into a subdirectory of this job's temporary
                           directory. You can control this from the command
                           line with ``--output-dir``. This option cannot be
                           set from configuration files. If used with the
                           hadoop runner, this path does not need to be fully
                           qualified with ``hdfs://`` URIs because it's
                           understood that it has to be on HDFS.
        :type partitioner: str
        :param partitioner: Optional name of a Hadoop partitioner class, e.g.
                            ``'org.apache.hadoop.mapred.lib.HashPartitioner'``.
                            Hadoop streaming will use this to determine how
                            mapper output should be sorted and distributed
                            to reducers.
        :param stdin: an iterable (can be a ``BytesIO`` or even a list) to use
                      as stdin. This is a hook for testing; if you set
                      ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll
                      get passed through to the runner. If for some reason
                      your lines are missing newlines, we'll add them;
                      this makes it easier to write automated tests.
        """
        self._ran_job = False

        self._opts = self.OPTION_STORE_CLASS(self.alias, opts, conf_paths)
        self._fs = None

        self._working_dir_mgr = WorkingDirManager()

        self._script_path = mr_job_script
        if self._script_path:
            self._working_dir_mgr.add('file', self._script_path)

        # give this job a unique name
# ... (some code omitted here) ...
Author: parastoo-62 | Project: mrjob | Lines: 101 | Source file: runner.py
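
In practice you rarely construct a runner directly; an MRJob script builds the appropriate one for you. A hedged sketch of driving a job programmatically (word_count.py and MRWordCount are hypothetical; make_runner(), run(), cat_output(), and parse_output() follow the pattern documented for recent mrjob versions):

    from word_count import MRWordCount  # hypothetical MRJob subclass

    job = MRWordCount(args=['-r', 'inline', 'input.txt'])
    with job.make_runner() as runner:   # builds the runner matching -r inline
        runner.run()
        for key, value in job.parse_output(runner.cat_output()):
            print(key, value)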

Example 8: test_cant_auto_name_unless_added_as_auto

    def test_cant_auto_name_unless_added_as_auto(self):
        wd = WorkingDirManager()
        wd.add("file", "bar.py", name="qux.py")
        self.assertEqual(wd.name("file", "bar.py", "qux.py"), "qux.py")
        self.assertRaises(ValueError, wd.name, "file", "bar.py")
Author: irskep | Project: mrjob | Lines: 5 | Source file: test_setup.py

Example 9: test_lazy_naming

    def test_lazy_naming(self):
        wd = WorkingDirManager()
        wd.add("file", "qux.py")  # qux.py by default
        wd.add("file", "bar.py", name="qux.py")
        self.assertEqual(wd.name_to_path("file"), {"qux.py": "bar.py", "qux-1.py": "qux.py"})
Author: irskep | Project: mrjob | Lines: 5 | Source file: test_setup.py

Example 10: test_auto_names_are_different_from_assigned_names

    def test_auto_names_are_different_from_assigned_names(self):
        wd = WorkingDirManager()
        wd.add("file", "foo/bar.py", name="qux.py")
        wd.add("file", "foo/bar.py")  # use default name bar.py
        self.assertEqual(wd.name_to_path("file"), {"qux.py": "foo/bar.py", "bar.py": "foo/bar.py"})
Author: irskep | Project: mrjob | Lines: 5 | Source file: test_setup.py

Example 11: test_explicit_name_collision

    def test_explicit_name_collision(self):
        wd = WorkingDirManager()
        wd.add("file", "foo.py", name="qux.py")
        self.assertRaises(ValueError, wd.add, "file", "bar.py", name="qux.py")
Author: irskep | Project: mrjob | Lines: 4 | Source file: test_setup.py

Example 12: test_empty

    def test_empty(self):
        wd = WorkingDirManager()
        self.assertEqual(wd.name_to_path('archive'), {})
        self.assertEqual(wd.name_to_path('file'), {})
        self.assertEqual(wd.paths(), set())
Author: Affirm | Project: mrjob | Lines: 5 | Source file: test_setup.py

Example 13: MRJobRunner


# ... (some code omitted here) ...
        :type step_output_dir: str
        :param step_output_dir: An empty/non-existent directory where Hadoop
                                should put output from all steps other than
                                the last one (this only matters for multi-step
                                jobs). Currently ignored by local runners.
        """
        self._ran_job = False

        # opts are made from:
        #
        # empty defaults (everything set to None)
        # runner-specific defaults
        # opts from config file(s)
        # opts from command line
        self._opts = self._combine_confs(
            [(None, {key: None for key in self.OPT_NAMES})] +
            [(None, self._default_opts())] +
            load_opts_from_mrjob_confs(self.alias, conf_paths) +
            [('the command line', opts)]
        )

        log.debug('Active configuration:')
        log.debug(pprint.pformat({
            opt_key: self._obfuscate_opt(opt_key, opt_value)
            for opt_key, opt_value in self._opts.items()
        }))

        self._fs = None

        # a local tmp directory that will be cleaned up when we're done
        # access/make this using self._get_local_tmp_dir()
        self._local_tmp_dir = None

        self._working_dir_mgr = WorkingDirManager()

        # mapping from dir to path for corresponding archive. we pick
        # paths during init(), but don't actually create the archives
        # until self._create_dir_archives() is called
        self._dir_to_archive_path = {}
        # dir archive names (the filename minus ".tar.gz") already taken
        self._dir_archive_names_taken = set()
        # set of dir_archives that have actually been created
        self._dir_archives_created = set()

        # track (name, path) of files and archives to upload to spark.
        # these are a subset of those in self._working_dir_mgr
        self._spark_files = []
        self._spark_archives = []

        self._upload_mgr = None  # define in subclasses that use this

        self._script_path = mr_job_script
        if self._script_path:
            self._working_dir_mgr.add('file', self._script_path)

        # give this job a unique name
        self._job_key = self._make_unique_job_key(
            label=self._opts['label'], owner=self._opts['owner'])

        # extra args to our job
        self._extra_args = list(extra_args) if extra_args else []
        for extra_arg in self._extra_args:
            if isinstance(extra_arg, dict):
                if extra_arg.get('type') != 'file':
                    raise NotImplementedError
                self._working_dir_mgr.add(**extra_arg)
Author: okomestudio | Project: mrjob | Lines: 67 | Source file: runner.py

Example 14: HadoopInTheCloudJobRunner

class HadoopInTheCloudJobRunner(MRJobBinRunner):
    """Abstract base class for all Hadoop-in-the-cloud services."""

    alias = '_cloud'

    OPT_NAMES = MRJobBinRunner.OPT_NAMES | {
        'bootstrap',
        'bootstrap_python',
        'check_cluster_every',
        'cloud_fs_sync_secs',
        'cloud_tmp_dir',
        'cluster_id',
        'core_instance_type',
        'extra_cluster_params',
        'image_version',
        'instance_type',
        'master_instance_type',
        'max_mins_idle',
        'max_hours_idle',
        'num_core_instances',
        'num_task_instances',
        'region',
        'task_instance_type',
        'zone',
    }

    # so far, every service provides the ability to run bootstrap scripts
    _BOOTSTRAP_MRJOB_IN_SETUP = False

    def __init__(self, **kwargs):
        super(HadoopInTheCloudJobRunner, self).__init__(**kwargs)

        # if *cluster_id* is not set, ``self._cluster_id`` will be
        # set when we create or join a cluster
        self._cluster_id = self._opts['cluster_id']

        # bootstrapping
        self._bootstrap = self._bootstrap_python() + self._parse_bootstrap()

        # add files to manager
        self._bootstrap_dir_mgr = WorkingDirManager()

        for cmd in self._bootstrap:
            for token in cmd:
                if isinstance(token, dict):
                    # convert dir archive tokens to archives
                    if token['type'] == 'dir':
                        token['path'] = self._dir_archive_path(token['path'])
                        token['type'] = 'archive'

                    self._bootstrap_dir_mgr.add(**token)

        # we'll create this script later, as needed
        self._master_bootstrap_script_path = None

    ### Options ###

    def _fix_opts(self, opts, source=None):
        opts = super(HadoopInTheCloudJobRunner, self)._fix_opts(
            opts, source=source)

        # patch max_hours_idle into max_mins_idle (see #1663)
        if opts.get('max_hours_idle') is not None:
            log.warning(
                'max_hours_idle is deprecated and will be removed in v0.7.0.' +
                (' Please use max_mins_idle instead'
                 if opts.get('max_mins_idle') is None else ''))

        if opts.get('max_mins_idle') is None:
            if opts.get('max_hours_idle') is not None:
                opts['max_mins_idle'] = opts['max_hours_idle'] * 60
            else:
                opts['max_mins_idle'] = _DEFAULT_MAX_MINS_IDLE

        # warn about setting max_mins_idle below the default
        if opts['max_mins_idle'] < _DEFAULT_MAX_MINS_IDLE:
            log.warning('Setting max_mins_idle to less than %.1f may result'
                        ' in cluster shutting down before job can run' %
                        _DEFAULT_MAX_MINS_IDLE)

        return opts

    def _combine_opts(self, opt_list):
        """Propagate *instance_type* to other instance type opts, if not
        already set.

        Also propagate core instance type to task instance type, if it's
        not already set.
        """
        opts = super(HadoopInTheCloudJobRunner, self)._combine_opts(opt_list)

        if opts['instance_type']:
            # figure out how late in the configs opt was set (setting
            # --instance_type on the command line overrides core_instance_type
            # set in configs)
            opt_priority = {k: -1 for k in opts}

            for i, sub_opts in enumerate(opt_list):
                for k, v in sub_opts.items():
                    if v == opts[k]:
# ... (some code omitted here) ...
Author: okomestudio | Project: mrjob | Lines: 101 | Source file: cloud.py
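
The _fix_opts method above illustrates a reusable deprecation-shim pattern: warn when the deprecated option is set, derive the new option from it when the new one is unset, and otherwise fall back to a default. A generic sketch of the same idea (the function and its arguments are illustrative, not part of mrjob):

    import logging

    log = logging.getLogger(__name__)

    def shim_deprecated_opt(opts, old, new, convert=lambda v: v, default=None):
        # warn if the deprecated opt is in use
        if opts.get(old) is not None:
            log.warning('%s is deprecated; use %s instead' % (old, new))
        # prefer the new opt; fall back to the converted old value, then default
        if opts.get(new) is None:
            if opts.get(old) is not None:
                opts[new] = convert(opts[old])
            else:
                opts[new] = default
        return opts

    # mirrors max_hours_idle -> max_mins_idle above: 2 hours becomes 120 minutes
    opts = shim_deprecated_opt(
        {'max_hours_idle': 2, 'max_mins_idle': None},
        'max_hours_idle', 'max_mins_idle',
        convert=lambda h: h * 60, default=10)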


Note: The mrjob.setup.WorkingDirManager class examples on this page were collected by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright in the source code remains with the original authors. Refer to each project's license before redistributing or reusing the code; please do not republish this page without permission.