This article collects typical usage examples of the Python function mrjob.setup.parse_legacy_hash_path. If you are unsure what parse_legacy_hash_path does or how to call it, the curated examples below should help.
The following presents 15 code examples of parse_legacy_hash_path, sorted by popularity by default.
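Before the examples, here is a minimal sketch of the behavior they exercise (assuming mrjob is installed): parse_legacy_hash_path(type, path, must_name=None) returns a dict with 'type', 'path', and 'name' keys, where an optional '#name' suffix on the path sets the name. The file paths below are made up for illustration.

from mrjob.setup import parse_legacy_hash_path

# 'path#name' splits into the path to upload and the name it should
# have in the job's working directory:
d1 = parse_legacy_hash_path('file', 'data/lookup.sqlite#lookup.db')
# -> {'type': 'file', 'path': 'data/lookup.sqlite', 'name': 'lookup.db'}

# with no '#' (and no must_name), the name is left as None:
d2 = parse_legacy_hash_path('file', 'data/lookup.sqlite')
# -> {'type': 'file', 'path': 'data/lookup.sqlite', 'name': None}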
Example 1: _parse_setup
def _parse_setup(self):
    """Parse the *setup* option with
    :py:func:`mrjob.setup.parse_setup_cmd()`.

    If *bootstrap_mrjob* and ``self.BOOTSTRAP_MRJOB_IN_SETUP`` are both
    true, create mrjob.tar.gz (if it doesn't exist already) and
    prepend a setup command that adds it to PYTHONPATH.

    Also patch in the deprecated
    options *python_archives*, *setup_cmd*, and *setup_script*
    as setup commands.
    """
    setup = []

    # python_archives
    for path in self._opts['python_archives']:
        path_dict = parse_legacy_hash_path('archive', path)
        setup.append(['export PYTHONPATH=', path_dict, ':$PYTHONPATH'])

    # setup
    for cmd in self._opts['setup']:
        setup.append(parse_setup_cmd(cmd))

    # setup_cmds
    for cmd in self._opts['setup_cmds']:
        if not isinstance(cmd, basestring):
            cmd = cmd_line(cmd)
        setup.append([cmd])

    # setup_scripts
    for path in self._opts['setup_scripts']:
        path_dict = parse_legacy_hash_path('file', path)
        setup.append([path_dict])

    return setup
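For illustration, a minimal runnable sketch of what Example 1 builds, using made-up option values in place of self._opts (only the archive and script branches are exercised):

from mrjob.setup import parse_legacy_hash_path

opts = {'python_archives': ['deps.tar.gz'], 'setup_scripts': ['setup.sh']}

setup = []
for path in opts['python_archives']:
    # each archive becomes a PYTHONPATH export with a path dict spliced in
    setup.append(['export PYTHONPATH=',
                  parse_legacy_hash_path('archive', path),
                  ':$PYTHONPATH'])
for path in opts['setup_scripts']:
    # each setup script becomes a one-element command
    setup.append([parse_legacy_hash_path('file', path)])

# setup is now roughly:
# [['export PYTHONPATH=',
#   {'type': 'archive', 'path': 'deps.tar.gz', 'name': None},
#   ':$PYTHONPATH'],
#  [{'type': 'file', 'path': 'setup.sh', 'name': None}]]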
Example 2: test_must_name
def test_must_name(self):
    self.assertEqual(
        parse_legacy_hash_path("file", "foo#bar", must_name="it"), {"type": "file", "path": "foo", "name": "bar"}
    )
    # use basename if no hash
    self.assertEqual(
        parse_legacy_hash_path("file", "foo", must_name="it"), {"type": "file", "path": "foo", "name": "foo"}
    )
    # raise error on explicit empty name
    self.assertRaises(ValueError, parse_legacy_hash_path, "file", "foo#", must_name="it")
    # raise error if no basename
    self.assertRaises(ValueError, parse_legacy_hash_path, "file", "foo/", must_name="it")
Example 3: test_trailing_slash_in_name
def test_trailing_slash_in_name(self):
    self.assertRaises(ValueError, parse_legacy_hash_path, "file", "foo.tar.gz#bar/")
    self.assertRaises(ValueError, parse_legacy_hash_path, "file", "foo.tar.gz#/")
    # trailing slash is allowed for archives because that's the new
    # way of indicating archives
    self.assertEqual(
        parse_legacy_hash_path("archive", "foo.tar.gz#bar/"),
        {"type": "archive", "path": "foo.tar.gz", "name": "bar"},
    )
    self.assertEqual(
        parse_legacy_hash_path("archive", "foo.tar.gz#/"), {"type": "archive", "path": "foo.tar.gz", "name": None}
    )
Example 4: test_trailing_slash_in_name
def test_trailing_slash_in_name(self):
    self.assertRaises(
        ValueError,
        parse_legacy_hash_path, 'file', 'foo.tar.gz#bar/')
    self.assertRaises(
        ValueError,
        parse_legacy_hash_path, 'file', 'foo.tar.gz#/')
    # trailing slash is allowed for archives because that's the new
    # way of indicating archives
    self.assertEqual(
        parse_legacy_hash_path('archive', 'foo.tar.gz#bar/'),
        {'type': 'archive', 'path': 'foo.tar.gz', 'name': 'bar'})
    self.assertEqual(
        parse_legacy_hash_path('archive', 'foo.tar.gz#/'),
        {'type': 'archive', 'path': 'foo.tar.gz', 'name': None})
Example 5: test_must_name
def test_must_name(self):
    self.assertEqual(
        parse_legacy_hash_path('file', 'foo#bar', must_name='it'),
        {'type': 'file', 'path': 'foo', 'name': 'bar'})
    # use basename if no hash
    self.assertEqual(
        parse_legacy_hash_path('file', 'foo', must_name='it'),
        {'type': 'file', 'path': 'foo', 'name': 'foo'})
    # raise error on explicit empty name
    self.assertRaises(ValueError,
                      parse_legacy_hash_path, 'file', 'foo#',
                      must_name='it')
    # raise error if no basename
    self.assertRaises(ValueError,
                      parse_legacy_hash_path, 'file', 'foo/',
                      must_name='it')
Example 6: _parse_setup
def _parse_setup(self):
    """Parse the *setup* option with
    :py:func:`mrjob.setup.parse_setup_cmd()`.

    If *bootstrap_mrjob* and ``self.BOOTSTRAP_MRJOB_IN_SETUP`` are both
    true, create mrjob.tar.gz (if it doesn't exist already) and
    prepend a setup command that adds it to PYTHONPATH.

    Also patch in the deprecated
    options *python_archives*, *setup_cmd*, and *setup_script*
    as setup commands.
    """
    setup = []

    # python_archives
    for path in self._opts["python_archives"]:
        path_dict = parse_legacy_hash_path("archive", path)
        setup.append(["export PYTHONPATH=", path_dict, ":$PYTHONPATH"])

    # setup
    for cmd in self._opts["setup"]:
        setup.append(parse_setup_cmd(cmd))

    # setup_cmds
    if self._opts["setup_cmds"]:
        log.warning(
            "setup_cmds is deprecated since v0.4.2 and will be removed"
            " in v0.6.0. Consider using setup instead."
        )
    for cmd in self._opts["setup_cmds"]:
        if not isinstance(cmd, string_types):
            cmd = cmd_line(cmd)
        setup.append([cmd])

    # setup_scripts
    if self._opts["setup_scripts"]:
        log.warning(
            "setup_scripts is deprecated since v0.4.2 and will be removed"
            " in v0.6.0. Consider using setup instead."
        )
    for path in self._opts["setup_scripts"]:
        path_dict = parse_legacy_hash_path("file", path)
        setup.append([path_dict])

    return setup
Example 7: _parse_setup_and_py_files
def _parse_setup_and_py_files(self):
    """Parse the *setup* option with
    :py:func:`mrjob.setup.parse_setup_cmd()`, and patch in *py_files*.
    """
    setup = []

    # py_files
    for path in self._opts['py_files']:
        # Spark (at least v1.3.1) doesn't work with # and --py-files,
        # see #1375
        if '#' in path:
            raise ValueError("py_files cannot contain '#'")

        path_dict = parse_legacy_hash_path('file', path)
        setup.append(['export PYTHONPATH=', path_dict, ':$PYTHONPATH'])

    # setup
    for cmd in self._opts['setup']:
        setup.append(parse_setup_cmd(cmd))

    return setup
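A rough sketch of the py_files handling above, with a made-up path; a '#name' suffix would be rejected because Spark's --py-files has no rename syntax (see #1375):

from mrjob.setup import parse_legacy_hash_path

path = 'lib/helpers.zip'  # hypothetical py_files entry
if '#' in path:
    raise ValueError("py_files cannot contain '#'")

# the accepted path becomes a PYTHONPATH export with a path dict spliced in
entry = ['export PYTHONPATH=',
         parse_legacy_hash_path('file', path),
         ':$PYTHONPATH']
# -> ['export PYTHONPATH=',
#     {'type': 'file', 'path': 'lib/helpers.zip', 'name': None},
#     ':$PYTHONPATH']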
Example 8: _non_option_kwargs
def _non_option_kwargs(self):
    """Keyword arguments to runner constructor that can't be set
    in mrjob.conf.

    These should match the (named) arguments to
    :py:meth:`~mrjob.runner.MRJobRunner.__init__`.
    """
    # build extra_args
    raw_args = _parse_raw_args(self.arg_parser, self._cl_args)

    extra_args = []

    for dest, option_string, args in raw_args:
        if dest in self._file_arg_dests:
            extra_args.append(option_string)
            extra_args.append(parse_legacy_hash_path('file', args[0]))
        elif dest in self._passthru_arg_dests:
            # special case for --hadoop-arg=-verbose etc.
            if (option_string and len(args) == 1 and
                    args[0].startswith('-')):
                extra_args.append('%s=%s' % (option_string, args[0]))
            else:
                if option_string:
                    extra_args.append(option_string)
                extra_args.extend(args)

    # max_output_files is added by _add_runner_args() but can only
    # be set from the command line, so we add it here (see #2040)
    return dict(
        conf_paths=self.options.conf_paths,
        extra_args=extra_args,
        hadoop_input_format=self.hadoop_input_format(),
        hadoop_output_format=self.hadoop_output_format(),
        input_paths=self.options.args,
        max_output_files=self.options.max_output_files,
        mr_job_script=self._script_path,
        output_dir=self.options.output_dir,
        partitioner=self.partitioner(),
        stdin=self.stdin,
        step_output_dir=self.options.step_output_dir,
    )
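For illustration, here is roughly how the two branches above shape extra_args. The --database switch and its value are made up; --hadoop-arg=-verbose comes from the comment in the code:

from mrjob.setup import parse_legacy_hash_path

extra_args = []

# a file option's value becomes a path dict, so the runner can upload
# the file and substitute its working-directory name at run time:
extra_args.append('--database')
extra_args.append(parse_legacy_hash_path('file', 'db.sqlite3#db'))
# -> ['--database', {'type': 'file', 'path': 'db.sqlite3', 'name': 'db'}]

# a passthrough option whose single value starts with '-' is kept as one
# '--switch=value' string so the value isn't parsed as another option:
extra_args.append('--hadoop-arg=-verbose')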
Example 9: __init__
def __init__(self, mr_job_script=None, conf_paths=None,
             extra_args=None, file_upload_args=None,
             hadoop_input_format=None, hadoop_output_format=None,
             input_paths=None, output_dir=None, partitioner=None,
             stdin=None, **opts):
    """All runners take the following keyword arguments:

    :type mr_job_script: str
    :param mr_job_script: the path of the ``.py`` file containing the
                          :py:class:`~mrjob.job.MRJob`. If this is None,
                          you won't actually be able to :py:meth:`run` the
                          job, but other utilities (e.g. :py:meth:`ls`)
                          will work.
    :type conf_paths: None or list
    :param conf_paths: List of config files to combine and use, or None to
                       search for mrjob.conf in the default locations.
    :type extra_args: list of str
    :param extra_args: a list of extra cmd-line arguments to pass to the
                       mr_job script. This is a hook to allow jobs to take
                       additional arguments.
    :param file_upload_args: a list of tuples of ``('--ARGNAME', path)``.
                             The file at the given path will be uploaded
                             to the local directory of the mr_job script
                             when it runs, and then passed into the script
                             with ``--ARGNAME``. Useful for passing in
                             SQLite DBs and other configuration files to
                             your job.
    :type hadoop_input_format: str
    :param hadoop_input_format: name of an optional Hadoop ``InputFormat``
                                class. Passed to Hadoop along with your
                                first step with the ``-inputformat``
                                option. Note that if you write your own
                                class, you'll need to include it in your
                                own custom streaming jar (see
                                *hadoop_streaming_jar*).
    :type hadoop_output_format: str
    :param hadoop_output_format: name of an optional Hadoop
                                 ``OutputFormat`` class. Passed to Hadoop
                                 along with your first step with the
                                 ``-outputformat`` option. Note that if you
                                 write your own class, you'll need to
                                 include it in your own custom streaming
                                 jar (see *hadoop_streaming_jar*).
    :type input_paths: list of str
    :param input_paths: Input files for your job. Supports globs and
                        recursively walks directories (e.g.
                        ``['data/common/', 'data/training/*.gz']``). If
                        this is left blank, we'll read from stdin
    :type output_dir: str
    :param output_dir: An empty/non-existent directory where Hadoop
                       streaming should put the final output from the job.
                       If you don't specify an output directory, we'll
                       output into a subdirectory of this job's temporary
                       directory. You can control this from the command
                       line with ``--output-dir``. This option cannot be
                       set from configuration files. If used with the
                       hadoop runner, this path does not need to be fully
                       qualified with ``hdfs://`` URIs because it's
                       understood that it has to be on HDFS.
    :type partitioner: str
    :param partitioner: Optional name of a Hadoop partitioner class, e.g.
                        ``'org.apache.hadoop.mapred.lib.HashPartitioner'``.
                        Hadoop streaming will use this to determine how
                        mapper output should be sorted and distributed
                        to reducers.
    :param stdin: an iterable (can be a ``BytesIO`` or even a list) to use
                  as stdin. This is a hook for testing; if you set
                  ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll
                  get passed through to the runner. If for some reason
                  your lines are missing newlines, we'll add them;
                  this makes it easier to write automated tests.
    """
    self._ran_job = False

    self._opts = self.OPTION_STORE_CLASS(self.alias, opts, conf_paths)
    self._fs = None

    self._working_dir_mgr = WorkingDirManager()

    self._script_path = mr_job_script
    if self._script_path:
        self._working_dir_mgr.add('file', self._script_path)

    # give this job a unique name
    self._job_key = self._make_unique_job_key(
        label=self._opts['label'], owner=self._opts['owner'])

    # we'll create the wrapper script later
    self._setup_wrapper_script_path = None

    # extra args to our job
    self._extra_args = list(extra_args) if extra_args else []

    # extra file arguments to our job
    self._file_upload_args = []
    if file_upload_args:
        for arg, path in file_upload_args:
            arg_file = parse_legacy_hash_path('file', path)
            self._working_dir_mgr.add(**arg_file)
            self._file_upload_args.append((arg, arg_file))
    # ... (rest of the method body omitted) ...
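The docstring above spells out the ``('--ARGNAME', path)`` convention for *file_upload_args*. Below is a hedged sketch of how such a tuple is recorded; the switch name and path are hypothetical, and WorkingDirManager is assumed to come from mrjob.setup as in the runner code:

from mrjob.setup import WorkingDirManager, parse_legacy_hash_path

working_dir_mgr = WorkingDirManager()
file_upload_args_parsed = []

for arg, path in [('--lookup-db', 'data/lookup.sqlite3')]:
    arg_file = parse_legacy_hash_path('file', path)
    # register the file so it gets a name in the job's working dir...
    working_dir_mgr.add(**arg_file)
    # ...and remember which command-line switch it belongs to
    file_upload_args_parsed.append((arg, arg_file))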
Example 10: _add_python_archive
def _add_python_archive(self, path):
    python_archive = parse_legacy_hash_path('archive', path)
    self._working_dir_mgr.add(**python_archive)
    self._python_archives.append(python_archive)
Example 11: test_no_name
def test_no_name(self):
    self.assertEqual(parse_legacy_hash_path("file", "foo"), {"type": "file", "path": "foo", "name": None})
    self.assertEqual(parse_legacy_hash_path("file", "foo#"), {"type": "file", "path": "foo", "name": None})
Example 12: test_basic
def test_basic(self):
    self.assertEqual(parse_legacy_hash_path("file", "foo#bar"), {"type": "file", "path": "foo", "name": "bar"})
    self.assertEqual(
        parse_legacy_hash_path("file", "/dir/foo#bar"), {"type": "file", "path": "/dir/foo", "name": "bar"}
    )
Example 13: test_no_name
def test_no_name(self):
    self.assertEqual(parse_legacy_hash_path('file', 'foo'),
                     {'type': 'file', 'path': 'foo', 'name': None})
    self.assertEqual(parse_legacy_hash_path('file', 'foo#'),
                     {'type': 'file', 'path': 'foo', 'name': None})
Example 14: test_basic
def test_basic(self):
    self.assertEqual(parse_legacy_hash_path('file', 'foo#bar'),
                     {'type': 'file', 'path': 'foo', 'name': 'bar'})
    self.assertEqual(parse_legacy_hash_path('file', '/dir/foo#bar'),
                     {'type': 'file', 'path': '/dir/foo', 'name': 'bar'})
Example 15: __init__
    # ... (earlier part of the method body omitted) ...
    # set of dir_archives that have actually been created
    self._dir_archives_created = set()

    # track (name, path) of files and archives to upload to spark.
    # these are a subset of those in self._working_dir_mgr
    self._spark_files = []
    self._spark_archives = []

    self._upload_mgr = None  # define in subclasses that use this

    self._script_path = mr_job_script
    if self._script_path:
        self._working_dir_mgr.add('file', self._script_path)

    # give this job a unique name
    self._job_key = self._make_unique_job_key(
        label=self._opts['label'], owner=self._opts['owner'])

    # extra args to our job
    self._extra_args = list(extra_args) if extra_args else []

    for extra_arg in self._extra_args:
        if isinstance(extra_arg, dict):
            if extra_arg.get('type') != 'file':
                raise NotImplementedError
            self._working_dir_mgr.add(**extra_arg)
            self._spark_files.append(
                (extra_arg['name'], extra_arg['path']))

    # extra file arguments to our job
    if file_upload_args:
        log.warning('file_upload_args is deprecated and will be removed'
                    ' in v0.6.0. Pass dicts to extra_args instead.')
        for arg, path in file_upload_args:
            arg_file = parse_legacy_hash_path('file', path)
            self._working_dir_mgr.add(**arg_file)
            self._extra_args.extend([arg, arg_file])
            self._spark_files.append((arg_file['name'], arg_file['path']))

    # set up uploading
    for hash_path in self._opts['upload_files']:
        uf = parse_legacy_hash_path('file', hash_path,
                                    must_name='upload_files')
        self._working_dir_mgr.add(**uf)
        self._spark_files.append((uf['name'], uf['path']))

    for hash_path in self._opts['upload_archives']:
        ua = parse_legacy_hash_path('archive', hash_path,
                                    must_name='upload_archives')
        self._working_dir_mgr.add(**ua)
        self._spark_archives.append((ua['name'], ua['path']))

    for hash_path in self._opts['upload_dirs']:
        # pick name based on directory path
        ud = parse_legacy_hash_path('dir', hash_path,
                                    must_name='upload_archives')
        # but feed working_dir_mgr the archive's path
        archive_path = self._dir_archive_path(ud['path'])
        self._working_dir_mgr.add(
            'archive', archive_path, name=ud['name'])
        self._spark_archives.append((ud['name'], archive_path))

    # py_files

    # self._setup is a list of shell commands with path dicts
    # interleaved; see mrjob.setup.parse_setup_cmd() for details
    self._setup = self._parse_setup_and_py_files()
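Finally, a small sketch of the upload_files handling near the end of Example 15: passing must_name makes parse_legacy_hash_path fall back to the basename instead of None, so every uploaded file ends up with a usable working-directory name. The path is made up for illustration.

from mrjob.setup import parse_legacy_hash_path

uf = parse_legacy_hash_path('file', 'models/weights.bin',
                            must_name='upload_files')
# -> {'type': 'file', 'path': 'models/weights.bin', 'name': 'weights.bin'}

# (name, path) pairs like this are what get handed to Spark
spark_files = [(uf['name'], uf['path'])]
# -> [('weights.bin', 'models/weights.bin')]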