This article collects typical usage examples of the Python function mrjob.py2.to_string. If you have been wondering what to_string does, how to call it, and what real-world uses look like, the selected examples below should help.
Shown below are 15 code examples of the to_string function, ordered by popularity by default.
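Before diving into the examples, here is a minimal sketch of the contract to_string is relied on for throughout the code below: decode bytes to a native str and pass str through unchanged. This is an illustration only (assuming UTF-8 and Python 3 semantics), not a copy of the actual implementation in mrjob/py2.py.

def to_string(s):
    # illustrative sketch; mrjob.py2.to_string also handles Python 2
    if isinstance(s, bytes):
        return s.decode('utf_8')
    elif isinstance(s, str):
        return s
    else:
        raise TypeError('expected bytes or str, got %r' % (s,))

to_string(b'Launched map tasks')  # -> 'Launched map tasks'
to_string('already a str')        # -> 'already a str'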
Example 1: _parse_counters_0_18
def _parse_counters_0_18(counter_string):
    # 0.18 counters look like this:
    # GroupName.CounterName:Value,Group1.Crackers:3,Group2.Nerf:243,...
    groups = _COUNTER_RE_0_18.finditer(counter_string)
    if groups is None:
        log.warning("Cannot parse Hadoop counter string: %s" % counter_string)

    for m in groups:
        yield (to_string(m.group("group")),
               to_string(m.group("name")),
               int(m.group("value")))
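To see what the generator yields, here is a hedged usage sketch; the regular expression below is an assumption standing in for mrjob's private _COUNTER_RE_0_18, which is not shown on this page.

import re

# assumed stand-in pattern, for illustration only
_COUNTER_RE_0_18 = re.compile(
    rb'(?P<group>[^,]+?)\.(?P<name>[^,]+?):(?P<value>\d+)')

counter_string = b'GroupName.CounterName:1,Group2.Nerf:243'
for group, name, value in _parse_counters_0_18(counter_string):
    print(group, name, value)
# GroupName CounterName 1
# Group2 Nerf 243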
Example 2: invoke_hadoop
def invoke_hadoop(self, args, ok_returncodes=None, ok_stderr=None,
                  return_stdout=False):
    """Run the given hadoop command, raising an exception on non-zero
    return code. This only works for commands whose output we don't
    care about.

    Args:
    ok_returncodes -- a list/tuple/set of return codes we expect to
        get back from hadoop (e.g. [0, 1]). By default, we only expect 0.
        If we get an unexpected return code, we raise a CalledProcessError.
    ok_stderr -- don't log STDERR or raise CalledProcessError if stderr
        matches a regex in this list (even if the returncode is bad)
    return_stdout -- return the stdout from the hadoop command rather
        than logging it. If this is False, we return the returncode
        instead.
    """
    args = self._hadoop_bin + args

    log.debug('> %s' % cmd_line(args))

    proc = Popen(args, stdout=PIPE, stderr=PIPE)
    stdout, stderr = proc.communicate()

    log_func = log.debug if proc.returncode == 0 else log.error
    if not return_stdout:
        for line in BytesIO(stdout):
            log_func('STDOUT: ' + to_string(line.rstrip(b'\r\n')))

    # check if STDERR is okay
    stderr_is_ok = False
    if ok_stderr:
        for stderr_re in ok_stderr:
            if stderr_re.match(stderr):
                stderr_is_ok = True
                break

    if not stderr_is_ok:
        for line in BytesIO(stderr):
            log_func('STDERR: ' + to_string(line.rstrip(b'\r\n')))

    ok_returncodes = ok_returncodes or [0]

    if not stderr_is_ok and proc.returncode not in ok_returncodes:
        raise CalledProcessError(proc.returncode, args)

    if return_stdout:
        return stdout
    else:
        return proc.returncode
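A hedged usage sketch; fs stands for whatever object defines invoke_hadoop() above (in mrjob, a filesystem/runner class with _hadoop_bin configured), and the paths are placeholders.

# return code only (STDOUT is logged, not returned)
returncode = fs.invoke_hadoop(['fs', '-mkdir', '-p', '/tmp/mrjob-scratch'])

# capture STDOUT, and accept return code 1 as well as 0
listing = fs.invoke_hadoop(['fs', '-ls', '/tmp/mrjob-scratch'],
                           return_stdout=True, ok_returncodes=[0, 1])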
Example 3: find_hadoop_java_stack_trace
def find_hadoop_java_stack_trace(lines):
    """Scan a log file or other iterable for a java stack trace from Hadoop,
    and return it as a list of lines (bytes).

    In logs from EMR, we find java stack traces in ``task-attempts/*/syslog``

    Sample stack trace::

        2010-07-27 18:25:48,397 WARN org.apache.hadoop.mapred.TaskTracker (main): Error running child
        java.lang.OutOfMemoryError: Java heap space
                at org.apache.hadoop.mapred.IFile$Reader.readNextBlock(IFile.java:270)
                at org.apache.hadoop.mapred.IFile$Reader.next(IFile.java:332)
                at org.apache.hadoop.mapred.Merger$Segment.next(Merger.java:147)
                at org.apache.hadoop.mapred.Merger$MergeQueue.adjustPriorityQueue(Merger.java:238)
                at org.apache.hadoop.mapred.Merger$MergeQueue.next(Merger.java:255)
                at org.apache.hadoop.mapred.Merger.writeFile(Merger.java:86)
                at org.apache.hadoop.mapred.Merger$MergeQueue.merge(Merger.java:377)
                at org.apache.hadoop.mapred.Merger.merge(Merger.java:58)
                at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:277)
                at org.apache.hadoop.mapred.TaskTracker$Child.main(TaskTracker.java:2216)

    (We omit the "Error running child" line from the results)
    """
    for line in lines:
        if line.rstrip(b'\r\n').endswith(b"Error running child"):
            st_lines = []
            # grab the exception message on the line after the warning
            for line in lines:
                st_lines.append(line)
                break
            # then collect the indented "at ..." frames that follow
            for line in lines:
                if not line.startswith(b'        at '):
                    break
                st_lines.append(line)
            return [to_string(line) for line in st_lines]
    else:
        return None
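A hedged usage sketch follows. Because the function keeps reading from the same iterable after it spots the warning line, it should be handed an iterator (or file object) rather than a plain list, which the inner loops would re-scan from the start; the log lines are made up for illustration.

log_lines = iter([
    b'2010-07-27 18:25:48,397 WARN org.apache.hadoop.mapred.TaskTracker (main): Error running child\n',
    b'java.lang.OutOfMemoryError: Java heap space\n',
    b'        at org.apache.hadoop.mapred.IFile$Reader.readNextBlock(IFile.java:270)\n',
    b'        at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:277)\n',
    b'2010-07-27 18:25:49,001 INFO org.apache.hadoop.mapred.TaskTracker (main): done\n',
])
trace = find_hadoop_java_stack_trace(log_lines)
# trace == ['java.lang.OutOfMemoryError: Java heap space\n',
#           '        at org.apache.hadoop.mapred.IFile$Reader.readNextBlock(IFile.java:270)\n',
#           '        at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:277)\n']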
Example 4: _run_job_in_hadoop
def _run_job_in_hadoop(self):
    self._counters = []

    for step_num in range(self._num_steps()):
        log.debug("running step %d of %d" %
                  (step_num + 1, self._num_steps()))

        step_args = self._args_for_step(step_num)

        log.debug("> %s" % cmd_line(step_args))

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen
            step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

            self._process_stderr_from_streaming(step_proc.stderr)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                log.error("STDOUT: " + to_string(line.strip(b"\n")))

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:  # we are the child process
                os.execvp(step_args[0], step_args)
            else:
                with os.fdopen(master_fd, "rb") as master:
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    self._process_stderr_from_streaming(master)
                    _, returncode = os.waitpid(pid, 0)

        if returncode == 0:
            # parsing needs step number for whole job
            self._fetch_counters([step_num + self._start_step_num])
            # printing needs step number relevant to this run of mrjob
            self.print_counters([step_num + 1])
        else:
            msg = ("Job failed with return code %d: %s" %
                   (returncode, step_args))
            log.error(msg)

            # look for a Python traceback
            cause = self._find_probable_cause_of_failure(
                [step_num + self._start_step_num])
            if cause:
                # log cause, and put it in exception
                cause_msg = []  # lines to log and put in exception

                cause_msg.append("Probable cause of failure (from %s):" %
                                 cause["log_file_uri"])
                cause_msg.extend(line.strip("\n") for line in cause["lines"])
                if cause["input_uri"]:
                    cause_msg.append("(while reading from %s)" %
                                     cause["input_uri"])

                for line in cause_msg:
                    log.error(line)

                # add cause_msg to exception message
                msg += "\n" + "\n".join(cause_msg) + "\n"

            raise CalledProcessError(returncode, step_args)
Example 5: _process_stderr_from_streaming
def _process_stderr_from_streaming(self, stderr):

    def treat_eio_as_eof(iter):
        # on Linux, the PTY gives us a specific IOError when the
        # child process exits, rather than EOF
        while True:
            try:
                yield next(iter)  # okay for StopIteration to bubble up
            except IOError as e:
                if e.errno == errno.EIO:
                    return
                else:
                    raise

    for line in treat_eio_as_eof(stderr):
        line = HADOOP_STREAMING_OUTPUT_RE.match(line).group(2)
        log.info("HADOOP: " + to_string(line))

        if b"Streaming Job Failed!" in line:
            raise Exception(line)

        # The job identifier is printed to stderr. We only want to parse it
        # once because we know how many steps we have and just want to know
        # what Hadoop thinks the first step's number is.
        m = HADOOP_JOB_TIMESTAMP_RE.match(line)
        if m and self._job_timestamp is None:
            self._job_timestamp = m.group("timestamp")
            self._start_step_num = int(m.group("step_num"))
Example 6: _cat_log
def _cat_log(fs, path):
    """fs.cat() the given log, converting lines to strings, and logging
    errors."""
    try:
        for line in fs.cat(path):
            yield to_string(line)
    except IOError as e:
        log.warning("couldn't cat() %s: %r" % (path, e))
Example 7: stderr_to_log
def stderr_to_log(lines):
    for line in lines:
        line = to_string(line)
        if _HADOOP_NON_LOG_LINE_RE.match(line):
            # use error because this is usually "Streaming Command Failed!"
            _log_line_from_hadoop(line, level=logging.ERROR)
        else:
            yield line
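A hedged usage sketch: wrap a Hadoop subprocess's stderr stream so that non-log4j lines (typically "Streaming Command Failed!") get logged right away while ordinary log lines are passed through as str. The command line and handle_line() are placeholders, and the module-level helpers stderr_to_log depends on (_HADOOP_NON_LOG_LINE_RE, _log_line_from_hadoop) are assumed to be in scope.

from subprocess import Popen, PIPE

proc = Popen(['hadoop', 'jar', 'hadoop-streaming.jar'],  # placeholder command
             stdout=PIPE, stderr=PIPE)
for line in stderr_to_log(proc.stderr):
    handle_line(line)  # hypothetical downstream parsing (counters, errors, ...)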
Example 8: parse_mr_job_stderr
def parse_mr_job_stderr(stderr, counters=None):
    """Parse counters and status messages out of MRJob output.

    :param stderr: a filehandle, a list of lines (bytes), or bytes
    :param counters: Counters so far, to update; a map from group (string)
                     to counter name (string) to count.

    Returns a dictionary with the keys *counters*, *statuses*, *other*:

    - *counters*: counters so far; same format as above
    - *statuses*: a list of status messages encountered
    - *other*: lines (strings) that aren't either counters or status messages
    """
    # For the corresponding code in Hadoop Streaming, see ``incrCounter()`` in
    # http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/contrib/streaming/src/java/org/apache/hadoop/streaming/PipeMapRed.java?view=markup # noqa
    if isinstance(stderr, bytes):
        stderr = BytesIO(stderr)

    if counters is None:
        counters = {}
    statuses = []
    other = []

    for line in stderr:
        m = _COUNTER_RE.match(line.rstrip(b'\r\n'))
        if m:
            group, counter, amount_str = m.groups()

            # don't leave these as bytes on Python 3
            group = to_string(group)
            counter = to_string(counter)

            counters.setdefault(group, {})
            counters[group].setdefault(counter, 0)
            counters[group][counter] += int(amount_str)
            continue

        m = _STATUS_RE.match(line.rstrip(b'\r\n'))
        if m:
            # don't leave as bytes on Python 3
            statuses.append(to_string(m.group(1)))
            continue

        other.append(to_string(line))

    return {'counters': counters, 'statuses': statuses, 'other': other}
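The counter and status lines this function looks for follow Hadoop Streaming's reporter protocol (reporter:counter:group,name,amount and reporter:status:message), which is what _COUNTER_RE and _STATUS_RE are expected to match here. A short usage sketch:

stderr = (b'reporter:counter:words,lines_seen,3\n'
          b'reporter:status:processing chunk 2\n'
          b'warning: something unrelated\n')
result = parse_mr_job_stderr(stderr)
# result == {'counters': {'words': {'lines_seen': 3}},
#            'statuses': ['processing chunk 2'],
#            'other': ['warning: something unrelated\n']}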
Example 9: _cat_log
def _cat_log(fs, path):
    """fs.cat() the given log, converting lines to strings, and logging
    errors."""
    try:
        if not fs.exists(path):
            return
        for line in fs.cat(path):
            yield to_string(line)
    except (IOError, OSError) as e:
        log.warning("couldn't cat() %s: %r" % (path, e))
Example 10: yield_lines
def yield_lines():
    try:
        for line in stderr:
            yield to_string(line)
    except IOError as e:
        # this is just the PTY's way of saying goodbye
        if e.errno == errno.EIO:
            return
        else:
            raise
Example 11: ls
def ls(self, path_glob):
    components = urlparse(path_glob)
    hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

    version = self.get_hadoop_version()

    # use ls -R on Hadoop 2 (see #1152)
    if uses_yarn(version):
        args = ['fs', '-ls', '-R', path_glob]
    else:
        args = ['fs', '-lsr', path_glob]

    try:
        stdout = self.invoke_hadoop(args, return_stdout=True,
                                    ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])
    except CalledProcessError:
        raise IOError("Could not ls %s" % path_glob)

    for line in BytesIO(stdout):
        line = line.rstrip(b'\r\n')

        # ignore total item count
        if line.startswith(b'Found '):
            continue

        fields = line.split(b' ')

        # Throw out directories
        if fields[0].startswith(b'd'):
            continue

        # Try to figure out which part of the line is the path
        # Expected lines:
        #
        # HDFS:
        # -rw-r--r-- 3 dave users 3276 2010-01-13 14:00 /foo/bar
        #
        # S3:
        # -rwxrwxrwx 1 3276 010-01-13 14:00 /foo/bar
        path_index = None
        for index, field in enumerate(fields):
            # look for time field, and pick one after that
            # (can't use field[2] because that's an int in Python 3)
            if len(field) == 5 and field[2:3] == b':':
                path_index = (index + 1)

        if not path_index:
            raise IOError("Could not locate path in string %r" % line)

        path = to_string(line.split(b' ', path_index)[-1])
        # handle fully qualified URIs from newer versions of Hadoop ls
        # (see Pull Request #577)
        if is_uri(path):
            yield path
        else:
            yield hdfs_prefix + path
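A hedged usage sketch; fs is assumed to be the Hadoop filesystem object the ls() method above belongs to, and the glob is a placeholder.

for uri in fs.ls('hdfs:///user/hadoop/output/part-*'):
    print(uri)  # e.g. hdfs://namenode:8020/user/hadoop/output/part-00000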
Example 12: find_python_traceback
def find_python_traceback(lines):
    """Scan a log file or other iterable for a Python traceback,
    and return it as a list of lines (bytes).

    In logs from EMR, we find python tracebacks in ``task-attempts/*/stderr``
    """
    # Essentially, we detect the start of the traceback, and continue
    # until we find a non-indented line, with some special rules for
    # exceptions from subprocesses.

    # Lines to pass back representing entire error found
    all_tb_lines = []

    # This is used to store a working list of lines in a single traceback
    tb_lines = []

    # This is used to store a working list of non-traceback lines between
    # the current traceback and the previous one
    non_tb_lines = []

    # Track whether or not we are in a traceback rather than consuming the
    # iterator
    in_traceback = False

    for line in lines:
        # don't return bytes in Python 3
        line = to_string(line)

        if in_traceback:
            tb_lines.append(line)

            # If no indentation, this is the last line of the traceback
            if line.lstrip() == line:
                in_traceback = False

                if line.startswith('subprocess.CalledProcessError'):
                    # CalledProcessError may mean that the subprocess printed
                    # errors to stderr which we can show the user
                    all_tb_lines += non_tb_lines

                all_tb_lines += tb_lines

                # Reset all working lists
                tb_lines = []
                non_tb_lines = []
        else:
            if line.startswith('Traceback (most recent call last):'):
                tb_lines.append(line)
                in_traceback = True
            else:
                non_tb_lines.append(line)

    if all_tb_lines:
        return all_tb_lines
    else:
        return None
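A short usage sketch; the stderr lines are made up, and a plain list is fine here because the function iterates it only once.

stderr_lines = [
    b'ignorable warning\n',
    b'Traceback (most recent call last):\n',
    b'  File "my_job.py", line 10, in <module>\n',
    b'    main()\n',
    b'ValueError: bad input line\n',
    b'more output\n',
]
tb = find_python_traceback(stderr_lines)
# tb == ['Traceback (most recent call last):\n',
#        '  File "my_job.py", line 10, in <module>\n',
#        '    main()\n',
#        'ValueError: bad input line\n']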
Example 13: cleanup
def cleanup():
    # this does sometimes happen; see #1396
    for line in cat_proc.stderr:
        log.error('STDERR: ' + to_string(line.rstrip(b'\r\n')))

    cat_proc.stdout.close()
    cat_proc.stderr.close()

    returncode = cat_proc.wait()

    if returncode != 0:
        raise IOError("Could not stream %s" % filename)
Example 14: ssh_terminate_single_job
def ssh_terminate_single_job(ssh_bin, address, ec2_key_pair_file):
    """Terminate the only job running on the Hadoop cluster with master node
    *address* using 'hadoop job -kill JOB_ID'. Return the string output of
    the command, or None if there was no job to terminate. Raise
    :py:class:`IOError` if some other error occurred.

    :param ssh_bin: Path to ``ssh`` binary
    :param address: Address of your job's master node (obtained via
                    :py:meth:`boto.emr.EmrConnection.describe_jobflow`)
    :param ec2_key_pair_file: Path to the key pair file (argument to ``-i``)

    :return: string output of ``hadoop job -kill``, or ``None`` if no job
             was running
    """
    job_list_out = to_string(check_output(*ssh_run(
        ssh_bin, address, ec2_key_pair_file, ['hadoop', 'job', '-list'])))
    job_list_lines = job_list_out.splitlines()

    def job_list_output_error():
        raise IOError('Could not read results of "hadoop job -list" and so'
                      ' could not terminate job:\n%s' % job_list_out)

    num_jobs_match = HADOOP_JOB_LIST_NUM_RE.match(job_list_lines[0])
    if not num_jobs_match:
        job_list_output_error()

    if int(num_jobs_match.group(1)) > 1:
        raise IOError('More than one job is running; unclear which one to'
                      ' terminate, so not terminating any jobs')

    if int(num_jobs_match.group(1)) == 0:
        return None

    job_info_match = HADOOP_JOB_LIST_INFO_RE.match(job_list_lines[2])
    if not job_info_match:
        job_list_output_error()

    job_id = to_string(job_info_match.group(1))

    job_kill_out = to_string(check_output(*ssh_run(
        ssh_bin, address, ec2_key_pair_file,
        ['hadoop', 'job', '-kill', job_id])))

    return job_kill_out
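A hedged usage sketch; the ssh path, host name and key-pair path are placeholders.

output = ssh_terminate_single_job(
    '/usr/bin/ssh',
    'ec2-203-0-113-10.compute-1.amazonaws.com',  # placeholder master address
    '/path/to/EMR.pem')

if output is None:
    print('no job was running')
else:
    print(output)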
Example 15: _run_on_all_nodes
def _run_on_all_nodes(runner, output_dir, cmd_args, print_stderr=True):
    """Given an :py:class:`EMRJobRunner`, run the command specified by
    *cmd_args* on all nodes in the cluster and save the stdout and stderr of
    each run to subdirectories of *output_dir*.

    You should probably have run :py:meth:`_enable_slave_ssh_access()` on the
    runner before calling this function.
    """
    master_addr = runner._address_of_master()
    addresses = [master_addr]

    ssh_bin = runner._opts['ssh_bin']
    ec2_key_pair_file = runner._opts['ec2_key_pair_file']

    keyfile = None
    slave_addrs = runner.fs.ssh_slave_hosts(master_addr)

    if slave_addrs:
        addresses += ['%s!%s' % (master_addr, slave_addr)
                      for slave_addr in slave_addrs]
        # copying key file like a boss (name of keyfile doesn't really matter)
        keyfile = 'mrboss-%s.pem' % random_identifier()
        _ssh_copy_key(ssh_bin, master_addr, ec2_key_pair_file, keyfile)

    for addr in addresses:

        stdout, stderr = _ssh_run_with_recursion(
            ssh_bin,
            addr,
            ec2_key_pair_file,
            keyfile,
            cmd_args,
        )

        if print_stderr:
            print('---')
            print('Command completed on %s.' % addr)
            print(to_string(stderr), end=' ')

        if '!' in addr:
            base_dir = os.path.join(output_dir, 'slave ' + addr.split('!')[1])
        else:
            base_dir = os.path.join(output_dir, 'master')

        if not os.path.exists(base_dir):
            os.makedirs(base_dir)

        with open(os.path.join(base_dir, 'stdout'), 'wb') as f:
            f.write(stdout)

        with open(os.path.join(base_dir, 'stderr'), 'wb') as f:
            f.write(stderr)
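A hedged usage sketch; runner is assumed to be an already-configured EMRJobRunner with SSH access to the worker nodes set up.

_run_on_all_nodes(runner, '/tmp/mrboss-uptime', ['uptime'])
# each node's output ends up under /tmp/mrboss-uptime/master and
# /tmp/mrboss-uptime/slave <addr>/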