本文整理汇总了Python中mrjob.setup.UploadDirManager类的典型用法代码示例。如果您正苦于以下问题:Python UploadDirManager类的具体用法?Python UploadDirManager怎么用?Python UploadDirManager使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了UploadDirManager类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_unhide_files
def test_unhide_files(self):
    """Files with '.'/'_' prefixes (which Hadoop ignores as input) get
    renamed to visible names."""
    mgr = UploadDirManager('hdfs:///')
    for hidden_name in ('.foo.log', '_bar.txt'):
        mgr.add(hidden_name)
    self.assertEqual(
        mgr.path_to_uri(),
        {'.foo.log': 'hdfs:///foo.log',
         '_bar.txt': 'hdfs:///bar.txt'})
示例2: __init__
def __init__(self, **kwargs):
    """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
    as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
    which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
    """
    super(HadoopJobRunner, self).__init__(**kwargs)

    # per-job scratch space on HDFS, namespaced by job name
    self._hdfs_tmp_dir = fully_qualify_hdfs_path(
        posixpath.join(self._opts['hdfs_scratch_dir'], self._job_name))

    # local files bound for HDFS are registered with this manager just
    # before they're needed
    staging_dir = posixpath.join(self._hdfs_tmp_dir, 'files', '')
    self._upload_mgr = UploadDirManager(staging_dir)

    # default the output dir into the scratch space when not set explicitly
    self._output_dir = fully_qualify_hdfs_path(
        self._output_dir
        or posixpath.join(self._hdfs_tmp_dir, 'output'))

    self._hadoop_log_dir = hadoop_log_dir(self._opts['hadoop_home'])

    # hadoop assigns a fresh timestamp per job, while mrjob only appends
    # steps; keep both so logs can be matched up later
    self._job_timestamp = None
    self._start_step_num = 0

    # cache for the detected hadoop version
    self._hadoop_version = None
示例3: __init__
def __init__(self, **kwargs):
    """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
    as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
    which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
    """
    super(HadoopJobRunner, self).__init__(**kwargs)

    # per-job temp space on HDFS, namespaced by job key
    self._hadoop_tmp_dir = fully_qualify_hdfs_path(
        posixpath.join(self._opts['hadoop_tmp_dir'], self._job_key))

    # local files destined for HDFS get registered here just before upload
    upload_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '')
    self._upload_mgr = UploadDirManager(upload_dir)

    # fall back to a dir inside the temp space if no explicit output dir
    self._output_dir = fully_qualify_hdfs_path(
        self._output_dir
        or posixpath.join(self._hadoop_tmp_dir, 'output'))

    # job ID and (YARN) application ID, tracked to enable log parsing
    self._application_id = None
    self._job_id = None

    # where the hadoop streaming jar lives, and whether we've already
    # gone looking for it
    self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar']
    self._searched_for_hadoop_streaming_jar = False

    # one status dict per step that ran; same keys as
    # mrjob.logs.parse._parse_hadoop_streaming_log()
    self._steps_info = []
示例4: __init__
def __init__(self, **kwargs):
    """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
    as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
    which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
    """
    super(HadoopJobRunner, self).__init__(**kwargs)

    # HDFS temp space unique to this job
    self._hadoop_tmp_dir = fully_qualify_hdfs_path(
        posixpath.join(self._opts['hadoop_tmp_dir'], self._job_key))

    # local files we'll need on HDFS; added to this manager right
    # before they're required
    files_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '')
    self._upload_mgr = UploadDirManager(files_dir)

    # output dir defaults to a subdir of the temp space
    self._output_dir = fully_qualify_hdfs_path(
        self._output_dir
        or posixpath.join(self._hadoop_tmp_dir, 'output'))

    # job ID and (YARN) application ID, recorded to support log parsing
    self._application_id = None
    self._job_id = None

    # hadoop streaming jar location, plus a flag so we only search once
    self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar']
    self._searched_for_hadoop_streaming_jar = False

    # one dict per step, potentially holding the keys 'history', 'step',
    # and 'task' ('step' is always filled because it comes from the
    # hadoop jar command output; the others as needed)
    self._log_interpretations = []
示例5: test_dot_underscore
def test_dot_underscore(self):
    """Leading '._' prefixes are stripped so Hadoop won't skip the files."""
    mgr = UploadDirManager('hdfs:///')
    for name in ('._', '._.txt', '._foo'):
        mgr.add(name)
    self.assertEqual(
        mgr.path_to_uri(),
        {'._': 'hdfs:///1',
         '._.txt': 'hdfs:///1.txt',
         '._foo': 'hdfs:///foo'})
示例6: test_unknown_uri
def test_unknown_uri(self):
    """uri() passes through URIs that were never added, without
    registering them."""
    mgr = UploadDirManager("hdfs:///")
    mgr.add("foo/bar.py")
    expected = {"foo/bar.py": "hdfs:///bar.py"}
    self.assertEqual(mgr.path_to_uri(), expected)

    unknown = "hdfs://host/path/to/bar.py"
    self.assertEqual(mgr.uri(unknown), unknown)

    # looking up an unknown URI must not add it to the manager
    self.assertEqual(mgr.path_to_uri(), expected)
示例7: test_name_collision
def test_name_collision(self):
    """Two files with the same basename get distinct target URIs."""
    mgr = UploadDirManager('hdfs:///')
    mgr.add('foo/bar.py')
    mgr.add('bar.py')
    self.assertEqual(
        mgr.path_to_uri(),
        {'foo/bar.py': 'hdfs:///bar.py',
         'bar.py': 'hdfs:///bar-1.py'})
示例8: test_underscores_only
def test_underscores_only(self):
    """Names that are nothing but '_' still map to non-hidden URIs."""
    mgr = UploadDirManager('hdfs:///')
    for name in ('_', '_.txt'):
        mgr.add(name)
    expected = {'_': 'hdfs:///1', '_.txt': 'hdfs:///1.txt'}
    self.assertEqual(mgr.path_to_uri(), expected)
示例9: test_unknown_uri
def test_unknown_uri(self):
    """Unregistered URIs pass through uri() unchanged and aren't recorded."""
    dir_mgr = UploadDirManager('hdfs:///')
    dir_mgr.add('foo/bar.py')
    self.assertEqual(dir_mgr.path_to_uri(), {'foo/bar.py': 'hdfs:///bar.py'})

    self.assertEqual(dir_mgr.uri('hdfs://host/path/to/bar.py'),
                     'hdfs://host/path/to/bar.py')

    # probing an unknown URI should leave the mapping untouched
    self.assertEqual(dir_mgr.path_to_uri(), {'foo/bar.py': 'hdfs:///bar.py'})
示例10: __init__
def __init__(self, **kwargs):
    """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
    as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
    which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
    """
    super(HadoopJobRunner, self).__init__(**kwargs)

    # job-specific scratch dir on HDFS
    self._hdfs_tmp_dir = fully_qualify_hdfs_path(
        posixpath.join(self._opts['hdfs_scratch_dir'], self._job_name))

    # local files to push to HDFS; registered just before they're needed
    staging_dir = posixpath.join(self._hdfs_tmp_dir, 'files', '')
    self._upload_mgr = UploadDirManager(staging_dir)

    # default the output dir into the scratch space when not given
    self._output_dir = fully_qualify_hdfs_path(
        self._output_dir
        or posixpath.join(self._hdfs_tmp_dir, 'output'))
示例11: test_uri
def test_uri(self):
    """uri() returns the destination URI for a path that was added."""
    mgr = UploadDirManager("hdfs:///")
    mgr.add("foo/bar.py")
    self.assertEqual(mgr.uri("foo/bar.py"), "hdfs:///bar.py")
示例12: test_add_is_idempotent
def test_add_is_idempotent(self):
    """Adding the same path twice leaves the mapping unchanged."""
    mgr = UploadDirManager("hdfs:///")
    expected = {"foo/bar.py": "hdfs:///bar.py"}
    for _ in range(2):
        mgr.add("foo/bar.py")
        self.assertEqual(mgr.path_to_uri(), expected)
示例13: test_name_collision
def test_name_collision(self):
    """Colliding basenames are disambiguated with a numeric suffix."""
    mgr = UploadDirManager("hdfs:///")
    for path in ("foo/bar.py", "bar.py"):
        mgr.add(path)
    self.assertEqual(
        mgr.path_to_uri(),
        {"foo/bar.py": "hdfs:///bar.py", "bar.py": "hdfs:///bar-1.py"})
示例14: DataprocJobRunner
class DataprocJobRunner(HadoopInTheCloudJobRunner):
"""Runs an :py:class:`~mrjob.job.MRJob` on Google Cloud Dataproc.
Invoked when you run your job with ``-r dataproc``.
:py:class:`DataprocJobRunner` runs your job in an Dataproc cluster, which
is basically a temporary Hadoop cluster.
Input, support, and jar files can be either local or on GCS; use
``gs://...`` URLs to refer to files on GCS.
This class has some useful utilities for talking directly to GCS and
Dataproc, so you may find it useful to instantiate it without a script::
from mrjob.dataproc import DataprocJobRunner
...
"""
alias = 'dataproc'
OPT_NAMES = HadoopInTheCloudJobRunner.OPT_NAMES | {
'gcp_project',
}
def __init__(self, **kwargs):
    """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same
    arguments as
    :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
    which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
    """
    super(DataprocJobRunner, self).__init__(**kwargs)

    # Dataproc requires a master and >= 2 core instances
    # num_core_instances refers ONLY to number of CORE instances and does
    # NOT include the required 1 instance for master
    # In other words, minimum cluster size is 3 machines, 1 master and 2
    # "num_core_instances" workers
    if self._opts['num_core_instances'] < _DATAPROC_MIN_WORKERS:
        raise DataprocException(
            'Dataproc expects at LEAST %d workers' % _DATAPROC_MIN_WORKERS)

    # Dataproc v1 makes no core/task distinction, so mismatched instance
    # types can't be honored
    if (self._opts['core_instance_type'] !=
            self._opts['task_instance_type']):
        raise DataprocException(
            'Dataproc v1 expects core/task instance types to be identical')

    # Lazy-load gcloud config as needed - invocations fail in PyCharm
    # debugging
    self._gcloud_config = None

    # Google Cloud Platform - project; the `or` short-circuit means
    # gcloud_config() is only invoked when the option is unset
    self._gcp_project = (
        self._opts['gcp_project'] or self.gcloud_config()['core.project'])

    # Google Compute Engine - Region / Zone (same lazy-fallback pattern)
    self._gce_region = (
        self._opts['region'] or self.gcloud_config()['compute.region'])
    self._gce_zone = (
        self._opts['zone'] or self.gcloud_config()['compute.zone'])

    # cluster_id can be None here
    self._cluster_id = self._opts['cluster_id']

    # lazily-constructed clients/filesystems — presumably built on first
    # use elsewhere in the class; not visible here, verify against callers
    self._api_client = None
    self._gcs_fs = None
    self._fs = None

    # BEGIN - setup directories
    base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir'])

    self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir)

    # use job key to make a unique tmp dir
    self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/'

    # pick/validate output dir
    if self._output_dir:
        self._output_dir = _check_and_fix_fs_dir(self._output_dir)
    else:
        self._output_dir = self._job_tmpdir + 'output/'
    # END - setup directories

    # manage local files that we want to upload to GCS. We'll add them
    # to this manager just before we need them.
    fs_files_dir = self._job_tmpdir + 'files/'
    self._upload_mgr = UploadDirManager(fs_files_dir)

    # when did our particular task start?
    self._dataproc_job_start = None

    # init hadoop, ami version caches
    self._image_version = None
    self._hadoop_version = None

    # This will be filled by _run_steps()
    # NOTE - log_interpretations will be empty except job_id until we
    # parse task logs
    self._log_interpretations = []
def _default_opts(self):
return combine_dicts(
super(DataprocJobRunner, self)._default_opts(),
#.........这里部分代码省略.........
示例15: HadoopJobRunner
class HadoopJobRunner(MRJobRunner):
"""Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster.
Input and support files can be either local or on HDFS; use ``hdfs://...``
URLs to refer to files on HDFS.
"""
alias = 'hadoop'
OPTION_STORE_CLASS = HadoopRunnerOptionStore
def __init__(self, **kwargs):
    """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
    as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
    which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
    """
    super(HadoopJobRunner, self).__init__(**kwargs)

    # scratch space on HDFS, unique to this job
    self._hdfs_tmp_dir = fully_qualify_hdfs_path(
        posixpath.join(self._opts['hdfs_scratch_dir'], self._job_name))

    # local files that must end up on HDFS are registered with this
    # manager right before they're needed
    upload_dir = posixpath.join(self._hdfs_tmp_dir, 'files', '')
    self._upload_mgr = UploadDirManager(upload_dir)

    # if no output dir was set explicitly, use one inside the scratch dir
    self._output_dir = fully_qualify_hdfs_path(
        self._output_dir
        or posixpath.join(self._hdfs_tmp_dir, 'output'))

    self._hadoop_log_dir = hadoop_log_dir(self._opts['hadoop_home'])

    # running jobs via hadoop assigns a new timestamp to each job, while
    # running via mrjob only adds steps; store both for log parsing
    self._job_timestamp = None
    self._start_step_num = 0

    # cache for get_hadoop_version()
    self._hadoop_version = None
@property
def fs(self):
    """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local
    filesystem.
    """
    # build the composite filesystem lazily, on first access
    if self._fs is None:
        hdfs_fs = HadoopFilesystem(self._opts['hadoop_bin'])
        self._fs = CompositeFilesystem(hdfs_fs, LocalFilesystem())
    return self._fs
def get_hadoop_version(self):
    """Invoke the hadoop executable to determine its version"""
    # cached after the first call, so we only shell out once
    if not self._hadoop_version:
        stdout = self.invoke_hadoop(['version'], return_stdout=True)
        if stdout:
            # the version is on the first line of output
            first_line = stdout.split('\n')[0]
            m = HADOOP_VERSION_RE.match(first_line)
            if m:
                self._hadoop_version = m.group('version')
                log.info("Using Hadoop version %s" % self._hadoop_version)
                return self._hadoop_version
        # no output or unparseable output: fall back to a default version
        self._hadoop_version = '0.20.203'
        log.info("Unable to determine Hadoop version. Assuming 0.20.203.")
    return self._hadoop_version
def _run(self):
    """Top-level job workflow: stage files, then launch on Hadoop."""
    # optionally ship a tarball of the mrjob library itself so the
    # cluster doesn't need it pre-installed
    if self._opts['bootstrap_mrjob']:
        self._add_python_archive(self._create_mrjob_tar_gz())
    # fail fast if any input path is missing
    self._check_input_exists()
    self._create_wrapper_script()
    # register job files, push them to HDFS, then kick off the job
    self._add_job_files_for_upload()
    self._upload_local_files_to_hdfs()
    self._run_job_in_hadoop()
def _check_input_exists(self):
    """Verify that every input path exists before continuing with our job."""
    for input_path in self._input_paths:
        # '-' stands for STDIN, which always "exists"
        if input_path != '-' and not self.path_exists(input_path):
            raise AssertionError(
                'Input path %s does not exist!' % (input_path,))
def _add_job_files_for_upload(self):
    """Add files needed for running the job (setup and input)
    to self._upload_mgr."""
    # both input paths and working-dir files go through the same manager
    for path_group in (self._get_input_paths(),
                       self._working_dir_mgr.paths()):
        for path in path_group:
            self._upload_mgr.add(path)
def _upload_local_files_to_hdfs(self):
"""Copy files managed by self._upload_mgr to HDFS
#.........这里部分代码省略.........