This article collects typical usage examples of the mrjob.parse.is_uri function in Python. If you have been wondering what is_uri does, how to call it, or what real-world uses look like, the hand-picked code examples below should help.
The sections below show 15 code examples of the is_uri function, sorted by popularity by default.
Example 1: test_uri_parsing
def test_uri_parsing(self):
    self.assertEqual(is_uri('notauri!'), False)
    self.assertEqual(is_uri('they://did/the/monster/mash'), True)
    self.assertEqual(is_s3_uri('s3://a/uri'), True)
    self.assertEqual(is_s3_uri('s3n://a/uri'), True)
    self.assertEqual(is_s3_uri('hdfs://a/uri'), False)
    self.assertEqual(parse_s3_uri('s3://bucket/loc'), ('bucket', 'loc'))
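These tests pin down the contract: a URI is anything with a scheme followed by '://'. The article never shows is_uri itself, but a minimal stand-in consistent with the assertions above (my sketch, not necessarily mrjob's actual implementation) could be:

import re

# Sketch only: a scheme (letter, then letters/digits/_/./+/-) followed by
# '://'. mrjob's real pattern may differ in detail.
_URI_RE = re.compile(r'^[A-Za-z][A-Za-z0-9_.+-]*://')

def is_uri(uri):
    return bool(_URI_RE.match(uri))

assert not is_uri('notauri!')
assert is_uri('they://did/the/monster/mash')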
Example 2: join
def join(self, dirname, filename):
    """Join *filename* onto *dirname* (which may be a URI)"""
    if is_uri(filename):
        return filename
    elif is_uri(dirname):
        return posixpath.join(dirname, filename)
    else:
        return os.path.join(dirname, filename)
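A quick illustration of the three branches (the filesystem object and all paths below are made up for the example):

# fs.join('/tmp/job', 's3://bucket/key')  -> 's3://bucket/key'        (filename already a URI)
# fs.join('hdfs:///tmp', 'part-00000')    -> 'hdfs:///tmp/part-00000' (URI dirname: posixpath)
# fs.join('/tmp/job', 'part-00000')       -> '/tmp/job/part-00000'    (both local: os.path)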
Example 3: test_spark_master_mesos
def test_spark_master_mesos(self):
    runner = SparkMRJobRunner(spark_master='mesos://host:12345')
    self.assertTrue(is_uri(runner._spark_tmp_dir))
    self.assertEqual(runner._spark_tmp_dir[:8], 'hdfs:///')
    self.assertIsNotNone(runner._upload_mgr)
Example 4: test_default
def test_default(self):
    runner = SparkMRJobRunner()
    self.assertFalse(is_uri(runner._spark_tmp_dir))
    self.assertIsNone(runner._upload_mgr)
    self.assertEqual(runner._spark_tmp_dir[-6:], '-spark')
Example 5: ls
def ls(self, path_glob):
    if not is_uri(path_glob):
        for path in super(HadoopJobRunner, self).ls(path_glob):
            yield path
        return

    components = urlparse(path_glob)
    hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

    stdout = self._invoke_hadoop(
        ['fs', '-lsr', path_glob],
        return_stdout=True,
        ok_stderr=[HADOOP_LSR_NO_SUCH_FILE])

    for line in StringIO(stdout):
        fields = line.rstrip('\r\n').split()

        # expect lines like:
        # -rw-r--r--   3 dave users       3276 2010-01-13 14:00 /foo/bar
        if len(fields) < 8:
            raise Exception('unexpected ls line from hadoop: %r' % line)

        # ignore directories
        if fields[0].startswith('d'):
            continue

        # not sure if you can have spaces in filenames; just to be safe
        path = ' '.join(fields[7:])

        yield hdfs_prefix + path
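To see why fields[7:] is rejoined with spaces, trace one -lsr line through the parsing (sample line adapted from the comment in the code above):

line = '-rw-r--r--   3 dave users       3276 2010-01-13 14:00 /foo/bar baz\n'
fields = line.rstrip('\r\n').split()
# fields[0:7] are permissions, replication, owner, group, size, date, time;
# everything from index 7 on belongs to the path, re-joined in case the
# filename itself contains spaces
print(' '.join(fields[7:]))  # -> '/foo/bar baz'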
Example 6: _cat_file
def _cat_file(self, filename):
    if is_uri(filename):
        # stream from HDFS
        cat_args = self._opts['hadoop_bin'] + ['fs', '-cat', filename]
        log.debug('> %s' % cmd_line(cat_args))

        cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

        def stream():
            for line in cat_proc.stdout:
                yield line

            # there shouldn't be any stderr
            for line in cat_proc.stderr:
                log.error('STDERR: ' + line)

            returncode = cat_proc.wait()
            if returncode != 0:
                raise CalledProcessError(returncode, cat_args)

        return read_file(filename, stream())
    else:
        # read from local filesystem
        return super(HadoopJobRunner, self)._cat_file(filename)
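Usage looks the same for both branches; the caller just iterates. A hedged sketch (the runner object and path are hypothetical):

# Because stream() is a generator, a nonzero exit from `hadoop fs -cat`
# surfaces as CalledProcessError only once the output has been consumed,
# i.e. at iteration time rather than at call time.
# for line in runner._cat_file('hdfs:///user/dave/output/part-00000'):
#     handle(line)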
Example 7: _setup_input
def _setup_input(self):
    """Copy local input files (if any) to a special directory on HDFS.

    Set self._hdfs_input_files
    """
    # winnow out HDFS files from local ones
    self._hdfs_input_files = []
    local_input_files = []

    for path in self._input_paths:
        if is_uri(path):
            # Don't even bother running the job if the input isn't there.
            if not self.ls(path):
                raise AssertionError(
                    'Input path %s does not exist!' % (path,))
            self._hdfs_input_files.append(path)
        else:
            local_input_files.append(path)

    # copy local files into an input directory, with names like
    # 00000-actual_name.ext
    if local_input_files:
        hdfs_input_dir = posixpath.join(self._hdfs_tmp_dir, 'input')
        log.info('Uploading input to %s' % hdfs_input_dir)
        self._mkdir_on_hdfs(hdfs_input_dir)

        for i, path in enumerate(local_input_files):
            if path == '-':
                path = self._dump_stdin_to_local_file()

            target = '%s/%05i-%s' % (
                hdfs_input_dir, i, os.path.basename(path))
            self._upload_to_hdfs(path, target)

        self._hdfs_input_files.append(hdfs_input_dir)
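Given the '%05i-%s' format string, local inputs land in HDFS with zero-padded, collision-free names. For example (directory and filenames made up):

# With local inputs ['data/a.txt', 'data/b.txt'] and
# self._hdfs_tmp_dir == 'hdfs:///tmp/job', the uploads would land at:
#
#   hdfs:///tmp/job/input/00000-a.txt
#   hdfs:///tmp/job/input/00001-b.txt
#
# and self._hdfs_input_files gains the single directory
# 'hdfs:///tmp/job/input' rather than the individual files.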
Example 8: test_spark_master_yarn
def test_spark_master_yarn(self):
    runner = SparkMRJobRunner(spark_master='yarn')
    self.assertTrue(is_uri(runner._spark_tmp_dir))
    self.assertEqual(runner._spark_tmp_dir[:8], 'hdfs:///')
    self.assertIsNotNone(runner._upload_mgr)
Example 9: _create_input_manifest_if_needed
def _create_input_manifest_if_needed(self):
    """Create a file with a list of URIs of input files."""
    if self._input_manifest_path or not self._uses_input_manifest():
        return

    uris = []

    log.info('finding input files to add to manifest...')

    for path in self._get_input_paths():
        log.debug('  in %s' % path)
        if is_uri(path):
            # URIs might be globs
            for uri in self.fs.ls(path):
                uris.append(uri)
        else:
            # local paths are expected to be single files
            # (shell would resolve globs)
            if self._upload_mgr:
                uris.append(self._upload_mgr.uri(path))
            else:
                # just make sure job can find files from its working dir
                uris.append(os.path.abspath(path))

    log.info('found %d input files' % len(uris))

    path = os.path.join(self._get_local_tmp_dir(), 'input-manifest.txt')
    self._write_script(uris, path, 'input manifest')

    self._input_manifest_path = path
    if self._upload_mgr:
        self._upload_mgr.add(self._input_manifest_path)
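The uris list is handed to _write_script, so the manifest is presumably one URI or absolute path per line. Illustrative contents (all paths invented):

# input-manifest.txt (hypothetical):
#
#   s3://walrus/logs/part-00000.gz    <- remote glob expanded via fs.ls()
#   hdfs:///data/events/part-00001    <- already a URI, passed through
#   /home/dave/local-input.txt        <- local file, via os.path.abspath()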
Example 10: fully_qualify_hdfs_path
def fully_qualify_hdfs_path(path):
    """If path isn't an ``hdfs://`` URL, turn it into one."""
    if is_uri(path):
        return path
    elif path.startswith('/'):
        return 'hdfs://' + path
    else:
        return 'hdfs:///user/%s/%s' % (getpass.getuser(), path)
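The three branches in action (the last result depends on getpass.getuser(); 'dave' is illustrative):

# fully_qualify_hdfs_path('s3://bucket/key')  -> 's3://bucket/key'  (already a URI)
# fully_qualify_hdfs_path('/data/foo')        -> 'hdfs:///data/foo'
# fully_qualify_hdfs_path('data/foo')         -> 'hdfs:///user/dave/data/foo'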
Example 11: uri
def uri(self, path):
    """Get the URI for the given path. If *path* is a URI, just return it."""
    if (not os.path.exists(path)) and is_uri(path):
        return path

    if path in self._path_to_name:
        return posixpath.join(self.prefix, self._path_to_name[path])
    else:
        raise ValueError('%r is not a URI or a known local file' % (path,))
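This is the upload-manager side of path handling: registered local paths map to prefix/name via _path_to_name. A hedged sketch of the behavior, assuming a manager whose registration step populated that mapping (manager, prefix, and names are all hypothetical):

# With mgr.prefix == 'hdfs:///tmp/job/files' and '/home/dave/wordcount.py'
# registered under the name 'wordcount.py':
#
# mgr.uri('s3://bucket/key')          -> 's3://bucket/key'  (URI passthrough)
# mgr.uri('/home/dave/wordcount.py')  -> 'hdfs:///tmp/job/files/wordcount.py'
# mgr.uri('/not/registered')          -> raises ValueError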
Example 12: ls
def ls(self, path_glob):
    components = urlparse(path_glob)
    hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

    version = self.get_hadoop_version()

    # use ls -R on Hadoop 2 (see #1152)
    if uses_yarn(version):
        args = ['fs', '-ls', '-R', path_glob]
    else:
        args = ['fs', '-lsr', path_glob]

    try:
        stdout = self.invoke_hadoop(args, return_stdout=True,
                                    ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])
    except CalledProcessError:
        raise IOError("Could not ls %s" % path_glob)

    for line in BytesIO(stdout):
        line = line.rstrip(b'\r\n')

        # ignore total item count
        if line.startswith(b'Found '):
            continue

        fields = line.split(b' ')

        # throw out directories
        if fields[0].startswith(b'd'):
            continue

        # Try to figure out which part of the line is the path.
        # Expected lines:
        #
        # HDFS:
        # -rw-r--r--   3 dave users       3276 2010-01-13 14:00 /foo/bar
        #
        # S3:
        # -rwxrwxrwx   1          3276 2010-01-13 14:00 /foo/bar
        path_index = None
        for index, field in enumerate(fields):
            # look for time field, and pick one after that
            # (can't use field[2] because that's an int in Python 3)
            if len(field) == 5 and field[2:3] == b':':
                path_index = (index + 1)

        if not path_index:
            raise IOError("Could not locate path in string %r" % line)

        path = to_unicode(line.split(b' ', path_index)[-1])

        # handle fully qualified URIs from newer versions of Hadoop ls
        # (see Pull Request #577)
        if is_uri(path):
            yield path
        else:
            yield hdfs_prefix + path
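The time-field heuristic is easiest to see on the HDFS sample line from the comment above:

hdfs_line = b'-rw-r--r--   3 dave users       3276 2010-01-13 14:00 /foo/bar'
fields = hdfs_line.split(b' ')
# The only 5-byte field with b':' at offset 2 is b'14:00'; the loop sets
# path_index to the field index just past it, so
# line.split(b' ', path_index)[-1] is everything after the time: the path.
print([f for f in fields if len(f) == 5 and f[2:3] == b':'])  # -> [b'14:00']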
Example 13: _endpoint_url
def _endpoint_url(host_or_uri):
    """If *host_or_uri* is non-empty and isn't a URI, prepend ``'https://'``.

    Otherwise, pass through as-is.
    """
    if not host_or_uri:
        return host_or_uri
    elif is_uri(host_or_uri):
        return host_or_uri
    else:
        return 'https://' + host_or_uri
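Behavior at a glance (hosts are illustrative):

# _endpoint_url('')                            -> ''  (empty passthrough)
# _endpoint_url('emr.us-west-2.amazonaws.com') -> 'https://emr.us-west-2.amazonaws.com'
# _endpoint_url('http://localhost:4444')       -> 'http://localhost:4444'  (already a URI)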
Example 14: path_exists
def path_exists(self, path_glob):
    """Does the given path exist?

    If dest is a directory (ends with a "/"), we check if there are
    any files starting with that path.
    """
    if not is_uri(path_glob):
        return super(HadoopJobRunner, self).path_exists(path_glob)

    return bool(self._invoke_hadoop(['fs', '-test', '-e', path_glob],
                                    ok_returncodes=(0, 1)))
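Under the hood this leans on the Hadoop CLI's test command: exit code 0 means the path exists, 1 means it does not, and ok_returncodes=(0, 1) stops _invoke_hadoop from treating "not found" as a failure. A rough sketch of the equivalent shell check (path is made up):

# Shell equivalent of the URI branch:
#
#   hadoop fs -test -e hdfs:///user/dave/output   # exit 0 if it exists, 1 if not
#
# Presumably _invoke_hadoop maps the exit status to a truthy/falsy value
# that bool() then normalizes; its exact return convention isn't shown here.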
Example 15: test_copy_files_with_rename_to_remote_wd_mirror
def test_copy_files_with_rename_to_remote_wd_mirror(self):
    self.add_mock_s3_data({'walrus': {'fish': b'salmon',
                                      'fowl': b'goose'}})

    foe_path = self.makefile('foe', b'giant')

    run_spark_submit = self.start(patch(
        'mrjob.bin.MRJobBinRunner._run_spark_submit',
        return_value=0))

    job = MRSparkOSWalk(['-r', 'spark',
                         '--spark-master', 'mesos://host:9999',
                         '--spark-tmp-dir', 's3://walrus/tmp',
                         '--file', 's3://walrus/fish#ghoti',
                         '--file', 's3://walrus/fowl',
                         '--file', foe_path])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()

        # check working dir mirror
        wd_mirror = runner._wd_mirror()
        fs = runner.fs

        self.assertIsNotNone(wd_mirror)
        self.assertTrue(is_uri(wd_mirror))

        self.assertTrue(fs.exists(wd_mirror))
        # uploaded for rename
        self.assertTrue(fs.exists(fs.join(wd_mirror, 'ghoti')))
        # wrong name
        self.assertFalse(fs.exists(fs.join(wd_mirror, 'fish')))
        # no need to upload, already visible
        self.assertFalse(fs.exists(fs.join(wd_mirror, 'fowl')))
        # need to upload from local to remote
        self.assertTrue(fs.exists(fs.join(wd_mirror, 'foe')))

        run_spark_submit.assert_called_once_with(
            ANY, ANY, record_callback=ANY)

        spark_submit_args = run_spark_submit.call_args[0][0]
        self.assertIn('--files', spark_submit_args)
        files_arg = spark_submit_args[
            spark_submit_args.index('--files') + 1]

        self.assertEqual(
            files_arg, ','.join([
                fs.join(wd_mirror, 'foe'),
                's3://walrus/fowl',
                fs.join(wd_mirror, 'ghoti'),
                fs.join(wd_mirror, 'mr_spark_os_walk.py'),
            ]))