This article collects typical usage examples of the Python class mrjob.setup.WorkingDirManager: what the class is for, how it is used, and sample code drawn from real callers and tests.
Fifteen code examples of the WorkingDirManager class are shown below, sorted by popularity by default.
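Before the examples, here is a minimal usage sketch assembled from the methods the tests below exercise (add, name, name_to_path, paths); the file and archive paths are placeholders rather than values from any real project:

from mrjob.setup import WorkingDirManager

wd = WorkingDirManager()

# register files and archives that should appear in the job's working directory
wd.add('file', 'foo/bar.py')                          # auto-named 'bar.py'
wd.add('file', 'scripts/helper.py', name='util.py')   # explicitly named
wd.add('archive', 's3://bucket/path/to/baz.tar.gz')   # auto-named 'baz.tar.gz'

# look up the working-directory name chosen for a path
wd.name('file', 'foo/bar.py')    # -> 'bar.py'

# per-type mapping of working-directory names to source paths
wd.name_to_path('file')          # -> {'bar.py': 'foo/bar.py', 'util.py': 'scripts/helper.py'}

# every path that will need to be uploaded
wd.paths()                       # -> {'foo/bar.py', 'scripts/helper.py', 's3://bucket/path/to/baz.tar.gz'}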
Example 1: __init__
def __init__(self, **kwargs):
super(HadoopInTheCloudJobRunner, self).__init__(**kwargs)
# if *cluster_id* is not set, ``self._cluster_id`` will be
# set when we create or join a cluster
self._cluster_id = self._opts['cluster_id']
# bootstrapping
self._bootstrap = self._bootstrap_python() + self._parse_bootstrap()
# add files to manager
self._bootstrap_dir_mgr = WorkingDirManager()
for cmd in self._bootstrap:
for token in cmd:
if isinstance(token, dict):
# convert dir archive tokens to archives
if token['type'] == 'dir':
token['path'] = self._dir_archive_path(token['path'])
token['type'] = 'archive'
self._bootstrap_dir_mgr.add(**token)
# we'll create this script later, as needed
self._master_bootstrap_script_path = None
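The loop above rewrites every parsed bootstrap token of type 'dir' into an 'archive' token before handing it to the WorkingDirManager; add(**token) is then an ordinary keyword-argument call. A purely illustrative sketch of that hand-off (the token keys and paths are placeholders, not output of mrjob's bootstrap parser):

from mrjob.setup import WorkingDirManager

bootstrap_dir_mgr = WorkingDirManager()

# a parsed bootstrap token that refers to a directory
token = {'type': 'dir', 'path': 'my_src_tree/'}

# __init__ above swaps in the path of a .tar.gz of that directory
# (via self._dir_archive_path) and relabels the token as an archive
token = {'type': 'archive', 'path': 'my_src_tree.tar.gz'}

# the keyword expansion registers it like any other archive
bootstrap_dir_mgr.add(**token)
bootstrap_dir_mgr.name_to_path('archive')   # -> {'my_src_tree.tar.gz': 'my_src_tree.tar.gz'}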
Example 2: test_allow_hidden_files
def test_allow_hidden_files(self):
wd = WorkingDirManager()
wd.add('archive', '_foo.tar.gz')
wd.add('file', '.bazrc')
self.assertEqual(wd.name('archive', '_foo.tar.gz'), '_foo.tar.gz')
self.assertEqual(wd.name('file', '.bazrc'), '.bazrc')
Example 3: test_simple
def test_simple(self):
wd = WorkingDirManager()
wd.add('archive', 's3://bucket/path/to/baz.tar.gz')
wd.add('file', 'foo/bar.py')
self.assertEqual(wd.name_to_path('file'),
{'bar.py': 'foo/bar.py'})
self.assertEqual(wd.name_to_path('archive'),
{'baz.tar.gz': 's3://bucket/path/to/baz.tar.gz'})
self.assertEqual(
wd.paths(),
set(['foo/bar.py', 's3://bucket/path/to/baz.tar.gz']))
Example 4: test_auto_names_are_different_from_assigned_names
def test_auto_names_are_different_from_assigned_names(self):
wd = WorkingDirManager()
wd.add('file', 'foo/bar.py', name='qux.py')
wd.add('file', 'foo/bar.py') # use default name bar.py
self.assertEqual(wd.name_to_path('file'),
{'qux.py': 'foo/bar.py',
'bar.py': 'foo/bar.py'})
self.assertEqual(wd.paths(), set(['foo/bar.py']))
Example 5: __init__
def __init__(self, **kwargs):
super(HadoopInTheCloudJobRunner, self).__init__(**kwargs)
# if *cluster_id* is not set, ``self._cluster_id`` will be
# set when we create or join a cluster
self._cluster_id = self._opts['cluster_id']
# bootstrapping
self._bootstrap = self._bootstrap_python() + self._parse_bootstrap()
# add files to manager
self._bootstrap_dir_mgr = WorkingDirManager()
for cmd in self._bootstrap:
for token in cmd:
if isinstance(token, dict):
# convert dir archive tokens to archives
if token['type'] == 'dir':
token['path'] = self._dir_archive_path(token['path'])
token['type'] = 'archive'
self._bootstrap_dir_mgr.add(**token)
# we'll create this script later, as needed
self._master_bootstrap_script_path = None
# ssh state
# the process for the SSH tunnel
self._ssh_proc = None
# if this is true, stop trying to launch the SSH tunnel
self._give_up_on_ssh_tunnel = False
# store the (tunneled) URL of the job tracker/resource manager
self._ssh_tunnel_url = None
Example 6: test_cant_auto_name_unless_added_as_auto
def test_cant_auto_name_unless_added_as_auto(self):
wd = WorkingDirManager()
wd.add('file', 'bar.py', name='qux.py')
self.assertEqual(wd.name('file', 'bar.py', 'qux.py'), 'qux.py')
self.assertRaises(ValueError,
wd.name, 'file', 'bar.py')
Example 7: MRJobRunner
class MRJobRunner(object):
"""Abstract base class for all runners"""
#: alias for this runner; used for picking section of
    #: :py:mod:`mrjob.conf` to load one of ``'local'``, ``'emr'``,
#: or ``'hadoop'``
alias = None
# if this is true, when bootstrap_mrjob is true, add it through the
# setup script
BOOTSTRAP_MRJOB_IN_SETUP = True
OPTION_STORE_CLASS = RunnerOptionStore
### methods to call from your batch script ###
def __init__(self, mr_job_script=None, conf_paths=None,
extra_args=None, file_upload_args=None,
hadoop_input_format=None, hadoop_output_format=None,
input_paths=None, output_dir=None, partitioner=None,
stdin=None, **opts):
"""All runners take the following keyword arguments:
:type mr_job_script: str
:param mr_job_script: the path of the ``.py`` file containing the
:py:class:`~mrjob.job.MRJob`. If this is None,
you won't actually be able to :py:meth:`run` the
job, but other utilities (e.g. :py:meth:`ls`)
will work.
:type conf_paths: None or list
:param conf_paths: List of config files to combine and use, or None to
search for mrjob.conf in the default locations.
:type extra_args: list of str
:param extra_args: a list of extra cmd-line arguments to pass to the
mr_job script. This is a hook to allow jobs to take
additional arguments.
:param file_upload_args: a list of tuples of ``('--ARGNAME', path)``.
The file at the given path will be uploaded
to the local directory of the mr_job script
when it runs, and then passed into the script
with ``--ARGNAME``. Useful for passing in
SQLite DBs and other configuration files to
your job.
:type hadoop_input_format: str
:param hadoop_input_format: name of an optional Hadoop ``InputFormat``
class. Passed to Hadoop along with your
first step with the ``-inputformat``
option. Note that if you write your own
class, you'll need to include it in your
own custom streaming jar (see
*hadoop_streaming_jar*).
:type hadoop_output_format: str
:param hadoop_output_format: name of an optional Hadoop
``OutputFormat`` class. Passed to Hadoop
along with your first step with the
``-outputformat`` option. Note that if you
write your own class, you'll need to
include it in your own custom streaming
jar (see *hadoop_streaming_jar*).
:type input_paths: list of str
:param input_paths: Input files for your job. Supports globs and
recursively walks directories (e.g.
``['data/common/', 'data/training/*.gz']``). If
this is left blank, we'll read from stdin
:type output_dir: str
:param output_dir: An empty/non-existent directory where Hadoop
streaming should put the final output from the job.
If you don't specify an output directory, we'll
output into a subdirectory of this job's temporary
directory. You can control this from the command
line with ``--output-dir``. This option cannot be
set from configuration files. If used with the
hadoop runner, this path does not need to be fully
qualified with ``hdfs://`` URIs because it's
understood that it has to be on HDFS.
:type partitioner: str
        :param partitioner: Optional name of a Hadoop partitioner class, e.g.
``'org.apache.hadoop.mapred.lib.HashPartitioner'``.
Hadoop streaming will use this to determine how
mapper output should be sorted and distributed
to reducers.
:param stdin: an iterable (can be a ``BytesIO`` or even a list) to use
as stdin. This is a hook for testing; if you set
``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll
get passed through to the runner. If for some reason
your lines are missing newlines, we'll add them;
this makes it easier to write automated tests.
"""
self._ran_job = False
self._opts = self.OPTION_STORE_CLASS(self.alias, opts, conf_paths)
self._fs = None
self._working_dir_mgr = WorkingDirManager()
self._script_path = mr_job_script
if self._script_path:
self._working_dir_mgr.add('file', self._script_path)
# give this job a unique name
# ......... (part of the code omitted here) .........
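The constructor documented above is rarely called directly; a job normally subclasses MRJob and lets make_runner() build the appropriate runner and forward these keyword arguments. A minimal, hypothetical job showing how a few of the documented options (input paths, the runner alias, --output-dir) are usually supplied:

from mrjob.job import MRJob

class MRLineCount(MRJob):
    """Toy job used only to illustrate how the runner receives its options."""

    def mapper(self, _, line):
        yield 'lines', 1

    def reducer(self, key, values):
        yield key, sum(values)

if __name__ == '__main__':
    # Equivalent to invoking the script from the shell: the positional
    # arguments become input_paths, '-r local' selects the runner alias,
    # and '--output-dir' ends up as the runner's output_dir option.
    job = MRLineCount(args=[
        'data/training/*.gz',              # placeholder input path (globs allowed)
        '-r', 'local',
        '--output-dir', 'line_count_out/',
    ])
    with job.make_runner() as runner:
        runner.run()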
Example 8: test_cant_auto_name_unless_added_as_auto
def test_cant_auto_name_unless_added_as_auto(self):
wd = WorkingDirManager()
wd.add("file", "bar.py", name="qux.py")
self.assertEqual(wd.name("file", "bar.py", "qux.py"), "qux.py")
self.assertRaises(ValueError, wd.name, "file", "bar.py")
Example 9: test_lazy_naming
def test_lazy_naming(self):
wd = WorkingDirManager()
wd.add("file", "qux.py") # qux.py by default
wd.add("file", "bar.py", name="qux.py")
self.assertEqual(wd.name_to_path("file"), {"qux.py": "bar.py", "qux-1.py": "qux.py"})
Example 10: test_auto_names_are_different_from_assigned_names
def test_auto_names_are_different_from_assigned_names(self):
wd = WorkingDirManager()
wd.add("file", "foo/bar.py", name="qux.py")
wd.add("file", "foo/bar.py") # use default name bar.py
self.assertEqual(wd.name_to_path("file"), {"qux.py": "foo/bar.py", "bar.py": "foo/bar.py"})
Example 11: test_explicit_name_collision
def test_explicit_name_collision(self):
wd = WorkingDirManager()
wd.add("file", "foo.py", name="qux.py")
self.assertRaises(ValueError, wd.add, "file", "bar.py", name="qux.py")
Example 12: test_empty
def test_empty(self):
wd = WorkingDirManager()
self.assertEqual(wd.name_to_path('archive'), {})
self.assertEqual(wd.name_to_path('file'), {})
self.assertEqual(wd.paths(), set())
Example 13: MRJobRunner
# ......... (part of the code omitted here) .........
:type step_output_dir: str
:param step_output_dir: An empty/non-existent directory where Hadoop
should put output from all steps other than
the last one (this only matters for multi-step
jobs). Currently ignored by local runners.
"""
self._ran_job = False
# opts are made from:
#
# empty defaults (everything set to None)
# runner-specific defaults
# opts from config file(s)
# opts from command line
self._opts = self._combine_confs(
[(None, {key: None for key in self.OPT_NAMES})] +
[(None, self._default_opts())] +
load_opts_from_mrjob_confs(self.alias, conf_paths) +
[('the command line', opts)]
)
log.debug('Active configuration:')
log.debug(pprint.pformat({
opt_key: self._obfuscate_opt(opt_key, opt_value)
for opt_key, opt_value in self._opts.items()
}))
self._fs = None
# a local tmp directory that will be cleaned up when we're done
# access/make this using self._get_local_tmp_dir()
self._local_tmp_dir = None
self._working_dir_mgr = WorkingDirManager()
# mapping from dir to path for corresponding archive. we pick
# paths during init(), but don't actually create the archives
# until self._create_dir_archives() is called
self._dir_to_archive_path = {}
# dir archive names (the filename minus ".tar.gz") already taken
self._dir_archive_names_taken = set()
# set of dir_archives that have actually been created
self._dir_archives_created = set()
# track (name, path) of files and archives to upload to spark.
# these are a subset of those in self._working_dir_mgr
self._spark_files = []
self._spark_archives = []
self._upload_mgr = None # define in subclasses that use this
self._script_path = mr_job_script
if self._script_path:
self._working_dir_mgr.add('file', self._script_path)
# give this job a unique name
self._job_key = self._make_unique_job_key(
label=self._opts['label'], owner=self._opts['owner'])
# extra args to our job
self._extra_args = list(extra_args) if extra_args else []
for extra_arg in self._extra_args:
if isinstance(extra_arg, dict):
if extra_arg.get('type') != 'file':
raise NotImplementedError
self._working_dir_mgr.add(**extra_arg)
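The comment block near the top of this example describes how the final option dictionary is layered: empty defaults, then runner-specific defaults, then config files, then the command line, with later non-empty values winning. A rough sketch of that precedence (not mrjob's actual _combine_confs implementation; the option names and values are invented):

layers = [
    {'label': None, 'owner': None},   # empty defaults
    {'owner': 'default_owner'},       # runner-specific defaults
    {'label': 'conf_label'},          # from mrjob.conf
    {'label': 'cli_label'},           # from the command line
]

combined = {}
for layer in layers:
    # later layers win, but unset (None) values never clobber earlier ones
    combined.update({k: v for k, v in layer.items() if v is not None})

print(combined)   # {'owner': 'default_owner', 'label': 'cli_label'}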
Example 14: __init__
# ......... (part of the code omitted here) .........
:type step_output_dir: str
:param step_output_dir: An empty/non-existent directory where Hadoop
should put output from all steps other than
the last one (this only matters for multi-step
jobs). Currently ignored by local runners.
"""
self._ran_job = False
# opts are made from:
#
# empty defaults (everything set to None)
# runner-specific defaults
# opts from config file(s)
# opts from command line
self._opts = self._combine_confs(
[(None, {key: None for key in self.OPT_NAMES})] +
[(None, self._default_opts())] +
load_opts_from_mrjob_confs(self.alias, conf_paths) +
[('the command line', opts)]
)
log.debug('Active configuration:')
log.debug(pprint.pformat({
opt_key: self._obfuscate_opt(opt_key, opt_value)
for opt_key, opt_value in self._opts.items()
}))
self._fs = None
# a local tmp directory that will be cleaned up when we're done
# access/make this using self._get_local_tmp_dir()
self._local_tmp_dir = None
self._working_dir_mgr = WorkingDirManager()
# mapping from dir to path for corresponding archive. we pick
# paths during init(), but don't actually create the archives
# until self._create_dir_archives() is called
self._dir_to_archive_path = {}
# dir archive names (the filename minus ".tar.gz") already taken
self._dir_archive_names_taken = set()
# set of dir_archives that have actually been created
self._dir_archives_created = set()
# track (name, path) of files and archives to upload to spark.
# these are a subset of those in self._working_dir_mgr
self._spark_files = []
self._spark_archives = []
self._upload_mgr = None # define in subclasses that use this
self._script_path = mr_job_script
if self._script_path:
self._working_dir_mgr.add('file', self._script_path)
# give this job a unique name
self._job_key = self._make_unique_job_key(
label=self._opts['label'], owner=self._opts['owner'])
# extra args to our job
self._extra_args = list(extra_args) if extra_args else []
for extra_arg in self._extra_args:
if isinstance(extra_arg, dict):
if extra_arg.get('type') != 'file':
raise NotImplementedError
self._working_dir_mgr.add(**extra_arg)
Example 15: HadoopInTheCloudJobRunner
class HadoopInTheCloudJobRunner(MRJobBinRunner):
"""Abstract base class for all Hadoop-in-the-cloud services."""
alias = '_cloud'
OPT_NAMES = MRJobBinRunner.OPT_NAMES | {
'bootstrap',
'bootstrap_python',
'check_cluster_every',
'cloud_fs_sync_secs',
'cloud_tmp_dir',
'cluster_id',
'core_instance_type',
'extra_cluster_params',
'image_version',
'instance_type',
'master_instance_type',
'max_mins_idle',
'max_hours_idle',
'num_core_instances',
'num_task_instances',
'region',
'task_instance_type',
'zone',
}
# so far, every service provides the ability to run bootstrap scripts
_BOOTSTRAP_MRJOB_IN_SETUP = False
def __init__(self, **kwargs):
super(HadoopInTheCloudJobRunner, self).__init__(**kwargs)
# if *cluster_id* is not set, ``self._cluster_id`` will be
# set when we create or join a cluster
self._cluster_id = self._opts['cluster_id']
# bootstrapping
self._bootstrap = self._bootstrap_python() + self._parse_bootstrap()
# add files to manager
self._bootstrap_dir_mgr = WorkingDirManager()
for cmd in self._bootstrap:
for token in cmd:
if isinstance(token, dict):
# convert dir archive tokens to archives
if token['type'] == 'dir':
token['path'] = self._dir_archive_path(token['path'])
token['type'] = 'archive'
self._bootstrap_dir_mgr.add(**token)
# we'll create this script later, as needed
self._master_bootstrap_script_path = None
### Options ###
def _fix_opts(self, opts, source=None):
opts = super(HadoopInTheCloudJobRunner, self)._fix_opts(
opts, source=source)
# patch max_hours_idle into max_mins_idle (see #1663)
if opts.get('max_hours_idle') is not None:
log.warning(
'max_hours_idle is deprecated and will be removed in v0.7.0.' +
(' Please use max_mins_idle instead'
if opts.get('max_mins_idle') is None else ''))
if opts.get('max_mins_idle') is None:
if opts.get('max_hours_idle') is not None:
opts['max_mins_idle'] = opts['max_hours_idle'] * 60
else:
opts['max_mins_idle'] = _DEFAULT_MAX_MINS_IDLE
        # warn if max_mins_idle is low enough to risk the cluster
        # shutting down before the job can run
if opts['max_mins_idle'] < _DEFAULT_MAX_MINS_IDLE:
log.warning('Setting max_mins_idle to less than %.1f may result'
' in cluster shutting down before job can run' %
_DEFAULT_MAX_MINS_IDLE)
return opts
def _combine_opts(self, opt_list):
"""Propagate *instance_type* to other instance type opts, if not
already set.
Also propagate core instance type to task instance type, if it's
not already set.
"""
opts = super(HadoopInTheCloudJobRunner, self)._combine_opts(opt_list)
if opts['instance_type']:
# figure out how late in the configs opt was set (setting
# --instance_type on the command line overrides core_instance_type
# set in configs)
opt_priority = {k: -1 for k in opts}
for i, sub_opts in enumerate(opt_list):
for k, v in sub_opts.items():
if v == opts[k]:
# ......... (part of the code omitted here) .........
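According to its docstring, the truncated _combine_opts above propagates the generic instance_type option to the more specific instance-type options unless those were set at a higher priority (for example, on the command line), and likewise propagates the core instance type to the task instance type. A purely illustrative sketch of that priority bookkeeping, assuming opt_list is ordered from lowest to highest priority; this is not the code that was omitted above:

def propagate_instance_types(opt_list, opts):
    # remember the latest (highest-priority) layer in which each opt
    # received its final combined value
    opt_priority = {k: -1 for k in opts}
    for i, sub_opts in enumerate(opt_list):
        for k, v in sub_opts.items():
            if v == opts[k]:
                opt_priority[k] = i

    # a generic instance_type only overrides a specific type that was
    # set at lower priority
    if opts.get('instance_type'):
        for specific in ('core_instance_type', 'task_instance_type'):
            if opt_priority[specific] < opt_priority['instance_type']:
                opts[specific] = opts['instance_type']

    # similarly, core_instance_type can fill in task_instance_type
    if opts.get('core_instance_type'):
        if opt_priority['task_instance_type'] < opt_priority['core_instance_type']:
            opts['task_instance_type'] = opts['core_instance_type']

    return opts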