This article collects typical usage examples of the Python method mrjob.setup.UploadDirManager.path_to_uri. If you have been wondering what UploadDirManager.path_to_uri does and how to call it, the hand-picked code examples below should help. You can also read further about the class this method belongs to, mrjob.setup.UploadDirManager.
The following lists 15 code examples of UploadDirManager.path_to_uri, sorted by popularity by default. You can vote for the examples you like or find useful; your votes help the system recommend better Python code examples.
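Before diving into the individual examples, here is a minimal, self-contained sketch of the call pattern they all share: register local paths with add(), then read back the resulting path-to-URI mapping with path_to_uri(). This is only an illustrative sketch distilled from the tests shown below (same prefix and file names), not an excerpt from mrjob itself.

from mrjob.setup import UploadDirManager

# Manager for local files that will be uploaded under the given URI prefix.
mgr = UploadDirManager('hdfs:///')

# Register local paths; basename collisions are resolved automatically.
mgr.add('foo/bar.py')
mgr.add('bar.py')

# path_to_uri() returns a dict mapping each registered local path to its
# target URI under the prefix (see Example 3 for the expected values):
#   {'foo/bar.py': 'hdfs:///bar.py', 'bar.py': 'hdfs:///bar-1.py'}
for path, uri in sorted(mgr.path_to_uri().items()):
    print('%s -> %s' % (path, uri))

Runners such as HadoopJobRunner and DataprocJobRunner (Examples 11-15) iterate over exactly this mapping when they copy local files to HDFS or GCS.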
Example 1: test_unknown_uri
# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import path_to_uri [as alias]
def test_unknown_uri(self):
    sd = UploadDirManager("hdfs:///")
    sd.add("foo/bar.py")
    self.assertEqual(sd.path_to_uri(), {"foo/bar.py": "hdfs:///bar.py"})
    self.assertEqual(sd.uri("hdfs://host/path/to/bar.py"),
                     "hdfs://host/path/to/bar.py")
    # checking unknown URIs doesn't add them
    self.assertEqual(sd.path_to_uri(), {"foo/bar.py": "hdfs:///bar.py"})
Example 2: test_unknown_uri
# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import path_to_uri [as alias]
def test_unknown_uri(self):
    sd = UploadDirManager('hdfs:///')
    sd.add('foo/bar.py')
    self.assertEqual(sd.path_to_uri(), {'foo/bar.py': 'hdfs:///bar.py'})
    self.assertEqual(sd.uri('hdfs://host/path/to/bar.py'),
                     'hdfs://host/path/to/bar.py')
    # checking unknown URIs doesn't add them
    self.assertEqual(sd.path_to_uri(), {'foo/bar.py': 'hdfs:///bar.py'})
Example 3: test_name_collision
# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import path_to_uri [as alias]
def test_name_collision(self):
    sd = UploadDirManager('hdfs:///')
    sd.add('foo/bar.py')
    sd.add('bar.py')
    self.assertEqual(sd.path_to_uri(),
                     {'foo/bar.py': 'hdfs:///bar.py',
                      'bar.py': 'hdfs:///bar-1.py'})
Example 4: test_underscores_only
# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import path_to_uri [as alias]
def test_underscores_only(self):
    sd = UploadDirManager('hdfs:///')
    sd.add('_')
    sd.add('_.txt')
    self.assertEqual(sd.path_to_uri(),
                     {'_': 'hdfs:///1',
                      '_.txt': 'hdfs:///1.txt'})
Example 5: test_unhide_files
# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import path_to_uri [as alias]
def test_unhide_files(self):
    # avoid giving names to files that Hadoop will ignore as input
    sd = UploadDirManager('hdfs:///')
    sd.add('.foo.log')
    sd.add('_bar.txt')
    self.assertEqual(sd.path_to_uri(),
                     {'.foo.log': 'hdfs:///foo.log',
                      '_bar.txt': 'hdfs:///bar.txt'})
Example 6: test_dot_underscore
# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import path_to_uri [as alias]
def test_dot_underscore(self):
    sd = UploadDirManager('hdfs:///')
    sd.add('._')
    sd.add('._.txt')
    sd.add('._foo')
    self.assertEqual(sd.path_to_uri(),
                     {'._': 'hdfs:///1',
                      '._.txt': 'hdfs:///1.txt',
                      '._foo': 'hdfs:///foo'})
Example 7: test_add_is_idempotent
# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import path_to_uri [as alias]
def test_add_is_idempotent(self):
    sd = UploadDirManager('hdfs:///')
    sd.add('foo/bar.py')
    self.assertEqual(sd.path_to_uri(), {'foo/bar.py': 'hdfs:///bar.py'})
    sd.add('foo/bar.py')
    self.assertEqual(sd.path_to_uri(), {'foo/bar.py': 'hdfs:///bar.py'})
Example 8: test_name_collision
# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import path_to_uri [as alias]
def test_name_collision(self):
    sd = UploadDirManager("hdfs:///")
    sd.add("foo/bar.py")
    sd.add("bar.py")
    self.assertEqual(sd.path_to_uri(),
                     {"foo/bar.py": "hdfs:///bar.py",
                      "bar.py": "hdfs:///bar-1.py"})
Example 9: test_simple
# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import path_to_uri [as alias]
def test_simple(self):
    sd = UploadDirManager("hdfs:///")
    sd.add("foo/bar.py")
    self.assertEqual(sd.path_to_uri(), {"foo/bar.py": "hdfs:///bar.py"})
Example 10: test_empty
# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import path_to_uri [as alias]
def test_empty(self):
    sd = UploadDirManager("hdfs:///")
    self.assertEqual(sd.path_to_uri(), {})
Example 11: HadoopJobRunner
# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import path_to_uri [as alias]
#......... part of the code omitted here .........
        self._check_input_exists()
        self._create_setup_wrapper_script()
        self._add_job_files_for_upload()
        self._upload_local_files_to_hdfs()
        self._run_job_in_hadoop()

    def _check_input_exists(self):
        """Make sure all input exists before continuing with our job.
        """
        for path in self._input_paths:
            if path == '-':
                continue  # STDIN always exists

            if self._opts['check_input_paths']:
                if not self.fs.exists(path):
                    raise AssertionError(
                        'Input path %s does not exist!' % (path,))

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

    def _upload_local_files_to_hdfs(self):
        """Copy files managed by self._upload_mgr to HDFS
        """
        self.fs.mkdir(self._upload_mgr.prefix)

        log.info('Copying local files into %s' % self._upload_mgr.prefix)
        for path, uri in self._upload_mgr.path_to_uri().items():
            self._upload_to_hdfs(path, uri)

    def _upload_to_hdfs(self, path, target):
        log.debug('Uploading %s -> %s on HDFS' % (path, target))
        self.fs._put(path, target)

    def _dump_stdin_to_local_file(self):
        """Dump sys.stdin to a local file, and return the path to it."""
        stdin_path = posixpath.join(self._get_local_tmp_dir(), 'STDIN')
        # prompt user, so they don't think the process has stalled
        log.info('reading from STDIN')

        log.debug('dumping stdin to local file %s' % stdin_path)
        stdin_file = open(stdin_path, 'wb')
        for line in self._stdin:
            stdin_file.write(line)

        return stdin_path

    def _run_job_in_hadoop(self):
        self._counters = []

        for step_num in range(self._num_steps()):
            log.debug('running step %d of %d' %
                      (step_num + 1, self._num_steps()))

            step_args = self._args_for_step(step_num)

            log.debug('> %s' % cmd_line(step_args))

            # try to use a PTY if it's available
            try:
Example 12: HadoopJobRunner
# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import path_to_uri [as alias]
class HadoopJobRunner(MRJobRunner):
    """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster.
    Invoked when you run your job with ``-r hadoop``.

    Input and support files can be either local or on HDFS; use ``hdfs://...``
    URLs to refer to files on HDFS.
    """
    alias = "hadoop"

    OPTION_STORE_CLASS = HadoopRunnerOptionStore

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
        as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(HadoopJobRunner, self).__init__(**kwargs)

        self._hdfs_tmp_dir = fully_qualify_hdfs_path(
            posixpath.join(self._opts["hdfs_scratch_dir"], self._job_key))

        # Keep track of local files to upload to HDFS. We'll add them
        # to this manager just before we need them.
        hdfs_files_dir = posixpath.join(self._hdfs_tmp_dir, "files", "")
        self._upload_mgr = UploadDirManager(hdfs_files_dir)

        # Set output dir if it wasn't set explicitly
        self._output_dir = fully_qualify_hdfs_path(
            self._output_dir or posixpath.join(self._hdfs_tmp_dir, "output"))

        self._hadoop_log_dir = hadoop_log_dir(self._opts["hadoop_home"])

        # Running jobs via hadoop assigns a new timestamp to each job.
        # Running jobs via mrjob only adds steps.
        # Store both of these values to enable log parsing.
        self._job_timestamp = None
        self._start_step_num = 0

        # init hadoop version cache
        self._hadoop_version = None

    @property
    def fs(self):
        """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local
        filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem(
                HadoopFilesystem(self._opts["hadoop_bin"]),
                LocalFilesystem())
        return self._fs

    def get_hadoop_version(self):
        """Invoke the hadoop executable to determine its version"""
        if not self._hadoop_version:
            stdout = self.invoke_hadoop(["version"], return_stdout=True)
            if stdout:
                first_line = stdout.split("\n")[0]
                m = HADOOP_VERSION_RE.match(first_line)
                if m:
                    self._hadoop_version = m.group("version")
                    log.info("Using Hadoop version %s" % self._hadoop_version)
                    return self._hadoop_version
            self._hadoop_version = "0.20.203"
            log.info("Unable to determine Hadoop version. Assuming 0.20.203.")
        return self._hadoop_version

    def _run(self):
        self._check_input_exists()
        self._create_setup_wrapper_script()
        self._add_job_files_for_upload()
        self._upload_local_files_to_hdfs()
        self._run_job_in_hadoop()

    def _check_input_exists(self):
        """Make sure all input exists before continuing with our job.
        """
        for path in self._input_paths:
            if path == "-":
                continue  # STDIN always exists

            if self._opts["check_input_paths"]:
                if not self.path_exists(path):
                    raise AssertionError(
                        "Input path %s does not exist!" % (path,))

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

    def _upload_local_files_to_hdfs(self):
        """Copy files managed by self._upload_mgr to HDFS
        """
        self._mkdir_on_hdfs(self._upload_mgr.prefix)

        log.info("Copying local files into %s" % self._upload_mgr.prefix)
        for path, uri in self._upload_mgr.path_to_uri().items():
            self._upload_to_hdfs(path, uri)
#......... part of the code omitted here .........
Example 13: HadoopJobRunner
# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import path_to_uri [as alias]
#......... part of the code omitted here .........
        self._check_input_exists()
        self._create_setup_wrapper_script()
        self._add_job_files_for_upload()
        self._upload_local_files_to_hdfs()
        self._run_job_in_hadoop()

    def _check_input_exists(self):
        """Make sure all input exists before continuing with our job.
        """
        for path in self._input_paths:
            if path == '-':
                continue  # STDIN always exists

            if self._opts['check_input_paths']:
                if not self.fs.exists(path):
                    raise AssertionError(
                        'Input path %s does not exist!' % (path,))

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

    def _upload_local_files_to_hdfs(self):
        """Copy files managed by self._upload_mgr to HDFS
        """
        self.fs.mkdir(self._upload_mgr.prefix)

        log.info('Copying local files into %s' % self._upload_mgr.prefix)
        for path, uri in self._upload_mgr.path_to_uri().items():
            self._upload_to_hdfs(path, uri)

    def _upload_to_hdfs(self, path, target):
        log.debug('Uploading %s -> %s on HDFS' % (path, target))
        self.fs._put(path, target)

    def _dump_stdin_to_local_file(self):
        """Dump sys.stdin to a local file, and return the path to it."""
        stdin_path = posixpath.join(self._get_local_tmp_dir(), 'STDIN')
        # prompt user, so they don't think the process has stalled
        log.info('reading from STDIN')

        log.debug('dumping stdin to local file %s' % stdin_path)
        stdin_file = open(stdin_path, 'wb')
        for line in self._stdin:
            stdin_file.write(line)

        return stdin_path

    def _run_job_in_hadoop(self):
        for step_num in range(self._num_steps()):
            step_args = self._args_for_step(step_num)

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)
Example 14: uri_adds_trailing_slash
# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import path_to_uri [as alias]
def uri_adds_trailing_slash(self):
    sd = UploadDirManager("s3://bucket/dir")
    sd.add("foo/bar.py")
    self.assertEqual(sd.uri("foo/bar.py"), "s3://bucket/dir/bar.py")
    self.assertEqual(sd.path_to_uri(),
                     {"foo/bar.py": "s3://bucket/dir/bar.py"})
Example 15: DataprocJobRunner
# Required import: from mrjob.setup import UploadDirManager [as alias]
# Or: from mrjob.setup.UploadDirManager import path_to_uri [as alias]
#......... part of the code omitted here .........
        # now that we know where the above files live, we can create
        # the master bootstrap script
        self._create_master_bootstrap_script_if_needed()
        if self._master_bootstrap_script_path:
            self._upload_mgr.add(self._master_bootstrap_script_path)
            self._upload_mgr.add(_MAX_MINS_IDLE_BOOTSTRAP_ACTION_PATH)

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

        # TODO - mtai @ davidmarin - hadoop_streaming_jar is currently ignored,
        # see _HADOOP_STREAMING_JAR_URI
        # if self._opts['hadoop_streaming_jar']:
        #     self._upload_mgr.add(self._opts['hadoop_streaming_jar'])

        for step in self._get_steps():
            if step.get('jar'):
                self._upload_mgr.add(step['jar'])

    def _upload_local_files_to_fs(self):
        """Copy local files tracked by self._upload_mgr to FS."""
        bucket_name, _ = parse_gcs_uri(self._job_tmpdir)
        self._create_fs_tmp_bucket(bucket_name)

        log.info('Copying non-input files into %s' % self._upload_mgr.prefix)

        for path, gcs_uri in self._upload_mgr.path_to_uri().items():
            log.debug('uploading %s -> %s' % (path, gcs_uri))

            # TODO - mtai @ davidmarin - Implement put function for other FSs
            self.fs.put(path, gcs_uri)

        self._wait_for_fs_sync()

    def _create_fs_tmp_bucket(self, bucket_name, location=None):
        """Create a temp bucket if missing

        Tie the temporary bucket to the same region as the GCE job and set a
        28-day TTL
        """
        # Return early if our bucket already exists
        try:
            self.fs.get_bucket(bucket_name)
            return
        except google_errors.HttpError as e:
            if not e.resp.status == 404:
                raise

        log.info('creating FS bucket %r' % bucket_name)

        location = location or self._gce_region

        # NOTE - By default, we create a bucket in the same GCE region as our
        # job (tmp buckets ONLY)
        # https://cloud.google.com/storage/docs/bucket-locations
        self.fs.create_bucket(
            self._gcp_project, bucket_name, location=location,
            object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS)