This article collects typical usage examples of the urlparse function from Python's mrjob.parse module. If you have been wondering what exactly Python's urlparse function does and how to use it, the curated examples below may help.
The following 15 code examples of urlparse are listed in order of popularity.
Example 1: hadoop_fs_ls
def hadoop_fs_ls(stdout, stderr, environ, *args):
    """Implements hadoop fs -ls."""
    hdfs_path_globs = args or ['']

    failed = False
    for hdfs_path_glob in hdfs_path_globs:
        parsed = urlparse(hdfs_path_glob)
        scheme = parsed.scheme
        netloc = parsed.netloc

        real_path_glob = hdfs_path_to_real_path(hdfs_path_glob, environ)
        real_paths = glob.glob(real_path_glob)

        paths = []
        max_size = 0

        if not real_paths:
            print >> stderr, (
                'ls: Cannot access %s: No such file or directory.' %
                hdfs_path_glob)
            failed = True
        else:
            for real_path in real_paths:
                paths.append((real_path, scheme, netloc, 0))

        for path in paths:
            print >> stdout, _hadoop_ls_line(*path + (max_size, environ))

    if failed:
        return -1
    else:
        return 0
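For reference, here is a minimal sketch of the scheme/netloc split this mock relies on. It uses the standard library's urlparse as a stand-in for mrjob.parse.urlparse, which the tests in Example 15 suggest returns the same tuples for inputs like these:

from urllib.parse import urlparse  # stand-in for mrjob.parse.urlparse

parsed = urlparse('hdfs://namenode:8020/user/dave/output')
print(parsed.scheme)   # 'hdfs'
print(parsed.netloc)   # 'namenode:8020'
print(parsed.path)     # '/user/dave/output'

# a bare path has no scheme or netloc, so the mock treats it as local
print(urlparse('output/part-00000')[:3])  # ('', '', 'output/part-00000')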
Example 2: ls
def ls(self, path_glob):
    components = urlparse(path_glob)
    hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

    try:
        stdout = self.invoke_hadoop(
            ['fs', '-lsr', path_glob],
            return_stdout=True,
            ok_stderr=[HADOOP_LSR_NO_SUCH_FILE])
    except CalledProcessError:
        raise IOError("Could not ls %s" % path_glob)

    path_index = None
    for line in StringIO(stdout):
        fields = line.rstrip('\r\n').split()

        # Throw out directories
        if fields[0].startswith('d'):
            continue

        # Try to figure out which part of the line is the path
        # Expected lines:
        # -rw-r--r--   3 dave users  3276 2010-01-13 14:00 /foo/bar  # HDFS
        # -rwxrwxrwx   1        3276 2010-01-13 14:00 /foo/bar       # S3
        if not path_index:
            for index, field in enumerate(fields):
                if len(field) == 5 and field[2] == ':':
                    path_index = (index + 1)
            if not path_index:
                raise IOError("Could not locate path in string '%s'" % line)

        path = ' '.join(fields[path_index:])
        yield hdfs_prefix + path
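The path-detection heuristic above looks for the HH:MM time field and takes everything after it. A simplified, self-contained sketch of the same idea (the sample lines are invented for illustration):

def find_path_index(fields):
    """Return the index of the field after the HH:MM time, or None."""
    path_index = None
    for index, field in enumerate(fields):
        if len(field) == 5 and field[2] == ':':
            path_index = index + 1
    return path_index

hdfs_line = '-rw-r--r--   3 dave users  3276 2010-01-13 14:00 /foo/bar'
s3_line = '-rwxrwxrwx   1  3276 2010-01-13 14:00 /foo/bar'

print(find_path_index(hdfs_line.split()))  # 7 (HDFS lines have user/group)
print(find_path_index(s3_line.split()))    # 5 (S3 lines omit them)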
Example 3: ls
def ls(self, path_glob):
    if not is_uri(path_glob):
        for path in super(HadoopJobRunner, self).ls(path_glob):
            yield path
        return

    components = urlparse(path_glob)
    hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

    stdout = self._invoke_hadoop(
        ['fs', '-lsr', path_glob],
        return_stdout=True,
        ok_stderr=[HADOOP_LSR_NO_SUCH_FILE])

    for line in StringIO(stdout):
        fields = line.rstrip('\r\n').split()
        # expect lines like:
        # -rw-r--r--   3 dave users  3276 2010-01-13 14:00 /foo/bar
        if len(fields) < 8:
            raise Exception('unexpected ls line from hadoop: %r' % line)
        # ignore directories
        if fields[0].startswith('d'):
            continue
        # not sure if you can have spaces in filenames; just to be safe
        path = ' '.join(fields[7:])
        yield hdfs_prefix + path
Example 4: hadoop_fs_lsr
def hadoop_fs_lsr(stdout, stderr, environ, *args):
    """Implements hadoop fs -lsr."""
    hdfs_path_globs = args or ['']

    def ls_line(real_path, scheme, netloc):
        hdfs_path = real_path_to_hdfs_path(real_path, environ)

        # we could actually implement ls here, but mrjob only cares about
        # the path
        if os.path.isdir(real_path):
            file_type = 'd'
        else:
            file_type = '-'

        if scheme in ('s3', 's3n'):
            # no user and group on S3 (see Pull Request #573)
            user_and_group = ''
        else:
            user_and_group = 'dave supergroup'

        # newer Hadoop returns fully qualified URIs (see Pull Request #577)
        if scheme and environ.get('MOCK_HADOOP_LS_RETURNS_FULL_URIS'):
            hdfs_path = '%s://%s%s' % (scheme, netloc, hdfs_path)

        return (
            '%srwxrwxrwx - %s 18321 2010-10-01 15:16 %s' %
            (file_type, user_and_group, hdfs_path))

    failed = False
    for hdfs_path_glob in hdfs_path_globs:
        parsed = urlparse(hdfs_path_glob)
        scheme = parsed.scheme
        netloc = parsed.netloc

        real_path_glob = hdfs_path_to_real_path(hdfs_path_glob, environ)
        real_paths = glob.glob(real_path_glob)

        if not real_paths:
            print >> stderr, (
                'lsr: Cannot access %s: No such file or directory.' %
                hdfs_path_glob)
            failed = True
        else:
            for real_path in real_paths:
                if os.path.isdir(real_path):
                    for dirpath, dirnames, filenames in os.walk(real_path):
                        print >> stdout, ls_line(dirpath, scheme, netloc)
                        for filename in filenames:
                            path = os.path.join(dirpath, filename)
                            print >> stdout, ls_line(path, scheme, netloc)
                else:
                    print >> stdout, ls_line(real_path, scheme, netloc)

    if failed:
        return -1
    else:
        return 0
Example 5: _ls_detailed
def _ls_detailed(self, path_glob):
    """Recursively list files on GCS and include some metadata about them:
    - object name
    - size
    - md5 hash
    - _uri

    *path_glob* can include ``?`` to match single characters or
    ``*`` to match 0 or more characters. Both ``?`` and ``*`` can match
    ``/``.
    """
    scheme = urlparse(path_glob).scheme
    bucket_name, base_name = _path_glob_to_parsed_gcs_uri(path_glob)

    # allow subdirectories of the path/glob
    if path_glob and not path_glob.endswith('/'):
        dir_glob = path_glob + '/*'
    else:
        dir_glob = path_glob + '*'

    list_request = self.api_client.objects().list(
        bucket=bucket_name, prefix=base_name, fields=_LS_FIELDS_TO_RETURN)

    uri_prefix = '%s://%s' % (scheme, bucket_name)
    while list_request:
        try:
            resp = list_request.execute()
        except google_errors.HttpError as e:
            if e.resp.status == 404:
                return
            raise

        resp_items = resp.get('items') or []
        for item in resp_items:
            # We generate the item URI by adding the "gs://" prefix
            uri = "%s/%s" % (uri_prefix, item['name'])

            # enforce globbing
            if not (fnmatch.fnmatchcase(uri, path_glob) or
                    fnmatch.fnmatchcase(uri, dir_glob)):
                continue

            # filter out folders
            if uri.endswith('/'):
                continue

            item['_uri'] = uri
            item['bucket'] = bucket_name
            item['size'] = int(item['size'])
            yield item

        list_request = self.api_client.objects().list_next(
            list_request, resp)
Example 6: hdfs_path_to_real_path
def hdfs_path_to_real_path(hdfs_path, environ):
    components = urlparse(hdfs_path)

    scheme = components.scheme
    path = components.path

    if not scheme and not path.startswith("/"):
        path = "/user/%s/%s" % (environ["USER"], path)

    return os.path.join(environ["MOCK_HDFS_ROOT"], path.lstrip("/"))
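A quick illustration of the mapping, assuming the function above plus import os and a compatible urlparse (the environment values here are made up):

environ = {'USER': 'dave', 'MOCK_HDFS_ROOT': '/tmp/mock_hdfs'}

# an absolute URI keeps its path under the mock root
print(hdfs_path_to_real_path('hdfs://namenode/data/input', environ))
# /tmp/mock_hdfs/data/input

# a bare relative path resolves under /user/<USER>/ first
print(hdfs_path_to_real_path('input', environ))
# /tmp/mock_hdfs/user/dave/input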
Example 7: ls
def ls(self, path_glob):
    components = urlparse(path_glob)
    hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

    version = self.get_hadoop_version()

    # use ls -R on Hadoop 2 (see #1152)
    if uses_yarn(version):
        args = ['fs', '-ls', '-R', path_glob]
    else:
        args = ['fs', '-lsr', path_glob]

    try:
        stdout = self.invoke_hadoop(args, return_stdout=True,
                                    ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])
    except CalledProcessError:
        raise IOError("Could not ls %s" % path_glob)

    for line in BytesIO(stdout):
        line = line.rstrip(b'\r\n')

        # ignore total item count
        if line.startswith(b'Found '):
            continue

        fields = line.split(b' ')

        # Throw out directories
        if fields[0].startswith(b'd'):
            continue

        # Try to figure out which part of the line is the path
        # Expected lines:
        #
        # HDFS:
        # -rw-r--r--   3 dave users  3276 2010-01-13 14:00 /foo/bar
        #
        # S3:
        # -rwxrwxrwx   1        3276 2010-01-13 14:00 /foo/bar
        path_index = None
        for index, field in enumerate(fields):
            # look for time field, and pick one after that
            # (can't use field[2] because that's an int in Python 3)
            if len(field) == 5 and field[2:3] == b':':
                path_index = (index + 1)
        if not path_index:
            raise IOError("Could not locate path in string %r" % line)

        path = to_unicode(line.split(b' ', path_index)[-1])

        # handle fully qualified URIs from newer versions of Hadoop ls
        # (see Pull Request #577)
        if is_uri(path):
            yield path
        else:
            yield hdfs_prefix + path
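The field[2:3] == b':' comparison in this example is deliberate: in Python 3, indexing a bytes object yields an int, while slicing yields bytes. A short sketch of the gotcha it works around:

field = b'14:00'

print(field[2])            # 58 -- indexing bytes gives an int in Python 3
print(field[2:3])          # b':' -- slicing gives bytes
print(field[2:3] == b':')  # True, so this works on both Python 2 and 3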
Example 8: copy_from_local
def copy_from_local(self, path, local_file):
    # Ensure that local_file has a file:/// at the beginning...
    local_file = urlparse(local_file)
    assert local_file.scheme in ('', 'test'), "local_file must be local"
    assert os.path.exists(local_file.path), "local_file must exist"
    local_file = urlunparse(['file'] + list(local_file[1:]))

    try:
        self.invoke_hadoop(['fs', '-put', local_file, path])
    except CalledProcessError as e:
        raise OSError("Could not create file: %s" % e)
Example 9: hdfs_uri_to_real_path
def hdfs_uri_to_real_path(hdfs_uri, environ):
    """Map an HDFS URI to a path on the filesystem."""
    components = urlparse(hdfs_uri)

    scheme = components.scheme
    path = components.path

    if not scheme and not path.startswith('/'):
        path = '/user/%s/%s' % (environ['USER'], path)

    return os.path.join(get_mock_hdfs_root(environ=environ), path.lstrip('/'))
Example 10: _hadoop_fs_ls
def _hadoop_fs_ls(cmd_name, stdout, stderr, environ, path_args, recursive):
    """Helper for hadoop_fs_ls() and hadoop_fs_lsr()."""
    hdfs_path_globs = path_args or ['']

    failed = False
    for hdfs_path_glob in hdfs_path_globs:
        parsed = urlparse(hdfs_path_glob)
        scheme = parsed.scheme
        netloc = parsed.netloc

        real_path_glob = hdfs_path_to_real_path(hdfs_path_glob, environ)
        real_paths = glob.glob(real_path_glob)

        paths = []

        if not real_paths:
            print('%s: Cannot access %s: No such file or directory.' %
                  (cmd_name, hdfs_path_glob), file=stderr)
            failed = True
        else:
            for real_path in real_paths:
                if os.path.isdir(real_path):
                    if recursive:
                        for dirpath, dirnames, filenames in os.walk(real_path):
                            paths.append((dirpath, scheme, netloc, 0))
                            for filename in filenames:
                                path = os.path.join(dirpath, filename)
                                size = os.path.getsize(path)
                                paths.append((path, scheme, netloc, size))
                    else:
                        for filename in os.listdir(real_path):
                            path = os.path.join(real_path, filename)
                            if os.path.isdir(path):
                                size = 0
                            else:
                                size = os.path.getsize(path)
                            paths.append((path, scheme, netloc, size))
                else:
                    size = os.path.getsize(real_path)
                    paths.append((real_path, scheme, netloc, size))

        if paths:
            print('Found %d items' % len(paths), file=stdout)
            max_size = max(size for _, __, ___, size in paths)
            for path in paths:
                print(_hadoop_ls_line(*path + (max_size, environ)),
                      file=stdout)

    if failed:
        return -1
    else:
        return 0
Example 11: parse_gcs_uri
def parse_gcs_uri(uri):
    """Parse a GCS URI into (bucket, key)

    >>> parse_gcs_uri("gs://walrus/tmp/")
    ('walrus', 'tmp/')

    If ``uri`` is not a GCS URI, raise a ValueError
    """
    components = urlparse(uri)
    if components.scheme != "gs" or '/' not in components.path:
        raise ValueError('Invalid GCS URI: %s' % uri)

    return components.netloc, components.path[1:]
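Usage of the function above, shown doctest-style (the bucket and key names are invented):

print(parse_gcs_uri('gs://walrus/data/part-00000'))
# ('walrus', 'data/part-00000')

try:
    parse_gcs_uri('s3://walrus/tmp/')  # wrong scheme
except ValueError as e:
    print(e)  # Invalid GCS URI: s3://walrus/tmp/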
Example 12: join
def join(self, path, *paths):
    """Join *paths* onto *path* (which may be a URI)"""
    all_paths = (path,) + paths

    # if there's a URI, we only care about it and what follows
    for i in range(len(all_paths), 0, -1):
        if is_uri(all_paths[i - 1]):
            scheme, netloc, uri_path = urlparse(all_paths[i - 1])[:3]
            return '%s://%s%s' % (
                scheme, netloc, posixpath.join(
                    uri_path or '/', *all_paths[i:]))
    else:
        return os.path.join(*all_paths)
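A behavior sketch, assuming fs is an instance of the filesystem class that defines the method above: the rightmost URI wins and anything to its left is discarded; with no URI at all, it falls back to os.path.join.

print(fs.join('hdfs://namenode/tmp', 'job', 'part-00000'))
# hdfs://namenode/tmp/job/part-00000

print(fs.join('/local/dir', 'hdfs://namenode/tmp', 'out'))
# hdfs://namenode/tmp/out  -- '/local/dir' is dropped

print(fs.join('/local/dir', 'out'))
# /local/dir/out  -- no URI, plain os.path.join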
Example 13: hadoop_fs_put
def hadoop_fs_put(stdout, stderr, environ, *args):
    """Implements hadoop fs -put"""
    if len(args) < 2:
        stderr.write('Usage: java FsShell [-put <localsrc> ... <dst>]')
        return -1

    srcs = args[:-1]
    dst = args[-1]

    real_dst = hdfs_path_to_real_path(dst, environ)
    dst_dir = os.path.isdir(real_dst)
    real_dir = os.path.dirname(real_dst)
    # dst could be a dir or a filename; we don't know
    if not dst_dir and not os.path.isdir(real_dir):
        os.makedirs(real_dir)

    skipped = False

    for src in srcs:
        # If the destination is a directory then we put the source into it
        # under its basename. If the destination is a file or does not exist
        # then this is where we wish to write to.
        target = os.path.join(real_dst, os.path.basename(src)) \
            if dst_dir else real_dst
        if os.path.exists(target):
            if os.path.isdir(src):
                stderr.write("Target %s is a directory" %
                             real_path_to_hdfs_path(target, environ))
            else:
                stderr.write("Target %s already exists" %
                             real_path_to_hdfs_path(target, environ))
            skipped = True
            continue

        src_url = urlparse(src)
        if src_url.scheme in ('file', ''):
            src = src_url.path
        else:
            raise ValueError("hadoop fs -put mock supports only empty or "
                             "'file' schemes for input: %s" % src)

        shutil.copy(src, real_dst)

    return 255 if skipped else 0
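The urlparse check near the end of this example is what lets both bare local paths and file:// URIs through while rejecting remote schemes. A small sketch using the standard library's urlparse as a stand-in:

from urllib.parse import urlparse  # stand-in for mrjob.parse.urlparse

for src in ('/tmp/data.txt', 'file:///tmp/data.txt', 's3://bucket/data.txt'):
    scheme = urlparse(src).scheme
    print('%-25s -> %s' % (src, 'ok' if scheme in ('file', '') else 'rejected'))
# /tmp/data.txt             -> ok
# file:///tmp/data.txt      -> ok
# s3://bucket/data.txt      -> rejected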
Example 14: ls
def ls(self, path_glob):
    """Recursively list files on S3.

    *path_glob* can include ``?`` to match single characters or
    ``*`` to match 0 or more characters. Both ``?`` and ``*`` can match
    ``/``.

    .. versionchanged:: 0.5.0

        You no longer need a trailing slash to list "directories" on S3;
        both ``ls('s3://b/dir')`` and ``ls('s3://b/dir/')`` will list
        all keys starting with ``dir/``.
    """
    # clean up the base uri to ensure we have an equal uri to boto (s3://)
    # just in case we get passed s3n://
    scheme = urlparse(path_glob).scheme

    # support globs
    glob_match = GLOB_RE.match(path_glob)

    # we're going to search for all keys starting with base_uri
    if glob_match:
        # cut it off at first wildcard
        base_uri = glob_match.group(1)
    else:
        base_uri = path_glob

    bucket_name, base_name = parse_s3_uri(base_uri)

    # allow subdirectories of the path/glob
    if path_glob and not path_glob.endswith('/'):
        dir_glob = path_glob + '/*'
    else:
        dir_glob = path_glob + '*'

    bucket = self.get_bucket(bucket_name)
    for key in bucket.list(base_name):
        uri = "%s://%s/%s" % (scheme, bucket_name, key.name)

        # enforce globbing
        if not (fnmatch.fnmatchcase(uri, path_glob) or
                fnmatch.fnmatchcase(uri, dir_glob)):
            continue

        yield uri
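The two-pattern match is what makes the trailing slash optional, as the versionchanged note describes: a key is accepted if it matches the glob exactly or as a child of it. A sketch with invented names (note that fnmatch wildcards happily match '/'):

import fnmatch

path_glob = 's3://b/dir'       # no trailing slash
dir_glob = path_glob + '/*'    # also accept keys under dir/

uri = 's3://b/dir/part-00000'
print(fnmatch.fnmatchcase(uri, path_glob))  # False -- not an exact match
print(fnmatch.fnmatchcase(uri, dir_glob))   # True  -- matches as a child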
Example 15: test_urlparse
def test_urlparse(self):
    self.assertEqual(urlparse('http://www.yelp.com/lil_brudder'),
                     ('http', 'www.yelp.com', '/lil_brudder', '', '', ''))
    self.assertEqual(urlparse('cant://touch/this'),
                     ('cant', 'touch', '/this', '', '', ''))
    self.assertEqual(urlparse('s3://bucket/path'),
                     ('s3', 'bucket', '/path', '', '', ''))
    self.assertEqual(urlparse('s3://bucket/path#customname'),
                     ('s3', 'bucket', '/path', '', '', 'customname'))
    self.assertEqual(urlparse('s3://bucket'),
                     ('s3', 'bucket', '', '', '', ''))
    self.assertEqual(urlparse('s3://bucket/'),
                     ('s3', 'bucket', '/', '', '', ''))
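These assertions also hint at why mrjob wraps the standard library at all: older Pythons (reportedly before 2.7.4) only split the #fragment for schemes they recognized, so an S3 URI's fragment would have stayed glued to the path. On a modern interpreter the standard library already agrees with the test above:

from urllib.parse import urlparse as stdlib_urlparse

print(stdlib_urlparse('s3://bucket/path#customname'))
# ParseResult(scheme='s3', netloc='bucket', path='/path', params='',
#             query='', fragment='customname')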