

Python py2.to_string Function Code Examples

This article collects typical usage examples of the mrjob.py2.to_string function in Python. If you are wondering how to_string is used in practice, how to call it, or where to find real examples of it, the hand-picked code samples below may help.


The 15 code examples of the to_string function shown below are sorted by popularity by default.
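Before diving into the examples, a quick note on what to_string does: in mrjob.py2 it converts bytes to a native str (decoding on Python 3) and passes strings through unchanged, which is why it shows up wherever subprocess output, counters, or log lines are handled. A minimal sketch of that intended behavior (assuming the usual UTF-8 decoding):

from mrjob.py2 import to_string

# bytes read from a subprocess or a log file become a native string
line = b'Streaming Command Failed!\r\n'
print(to_string(line.rstrip(b'\r\n')))   # -> 'Streaming Command Failed!'

# a value that is already a string passes through unchanged
print(to_string('already a string'))     # -> 'already a string'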

Example 1: _parse_counters_0_18

def _parse_counters_0_18(counter_string):
    # 0.18 counters look like this:
    # GroupName.CounterName:Value,Group1.Crackers:3,Group2.Nerf:243,...
    groups = _COUNTER_RE_0_18.finditer(counter_string)
    if groups is None:
        log.warning("Cannot parse Hadoop counter string: %s" % counter_string)

    for m in groups:
        yield (to_string(m.group("group")), to_string(m.group("name")), int(m.group("value")))
Developer: tempcyc, Project: mrjob, Lines: 9, Source: parse.py
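A hedged usage sketch (not from the original article): the counter-string format comes from the comment in the code above, and the snippet assumes _COUNTER_RE_0_18 is a bytes regex with named groups group, name, and value, as the to_string() calls imply.

# illustrative only; _parse_counters_0_18 and _COUNTER_RE_0_18 live in mrjob/parse.py
counter_string = b'Group1.Crackers:3,Group2.Nerf:243'

for group, name, value in _parse_counters_0_18(counter_string):
    print(group, name, value)
# expected, if the regex matches as described:
#   Group1 Crackers 3
#   Group2 Nerf 243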

Example 2: invoke_hadoop

    def invoke_hadoop(self, args, ok_returncodes=None, ok_stderr=None,
                      return_stdout=False):
        """Run the given hadoop command, raising an exception on non-zero
        return code. This only works for commands whose output we don't
        care about.

        Args:
        ok_returncodes -- a list/tuple/set of return codes we expect to
            get back from hadoop (e.g. [0,1]). By default, we only expect 0.
            If we get an unexpected return code, we raise a CalledProcessError.
        ok_stderr -- don't log STDERR or raise CalledProcessError if stderr
            matches a regex in this list (even if the returncode is bad)
        return_stdout -- return the stdout from the hadoop command rather
            than logging it. If this is False, we return the returncode
            instead.
        """
        args = self._hadoop_bin + args

        log.debug('> %s' % cmd_line(args))

        proc = Popen(args, stdout=PIPE, stderr=PIPE)
        stdout, stderr = proc.communicate()

        log_func = log.debug if proc.returncode == 0 else log.error
        if not return_stdout:
            for line in BytesIO(stdout):
                log_func('STDOUT: ' + to_string(line.rstrip(b'\r\n')))

        # check if STDERR is okay
        stderr_is_ok = False
        if ok_stderr:
            for stderr_re in ok_stderr:
                if stderr_re.match(stderr):
                    stderr_is_ok = True
                    break

        if not stderr_is_ok:
            for line in BytesIO(stderr):
                log_func('STDERR: ' + to_string(line.rstrip(b'\r\n')))

        ok_returncodes = ok_returncodes or [0]

        if not stderr_is_ok and proc.returncode not in ok_returncodes:
            raise CalledProcessError(proc.returncode, args)

        if return_stdout:
            return stdout
        else:
            return proc.returncode
Developer: DanisHack, Project: mrjob, Lines: 49, Source: hadoop.py
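A sketch of how this method might be called from inside the runner; the paths and constants below are illustrative, not taken from the original article.

# inside a runner/filesystem method (illustrative only)

# capture a recursive listing as bytes, tolerating "no such file" noise on stderr
stdout = self.invoke_hadoop(
    ['fs', '-ls', '-R', 'hdfs:///user/hadoop/output'],
    return_stdout=True,
    ok_stderr=[_HADOOP_LS_NO_SUCH_FILE],
)

# run a command where only the return code matters, accepting both 0 and 1
returncode = self.invoke_hadoop(['fs', '-rm', '-r', '/tmp/mrjob-scratch'],
                                ok_returncodes=[0, 1])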

Example 3: find_hadoop_java_stack_trace

def find_hadoop_java_stack_trace(lines):
    """Scan a log file or other iterable for a java stack trace from Hadoop,
    and return it as a list of lines (bytes).

    In logs from EMR, we find java stack traces in ``task-attempts/*/syslog``

    Sample stack trace::

        2010-07-27 18:25:48,397 WARN org.apache.hadoop.mapred.TaskTracker (main): Error running child
        java.lang.OutOfMemoryError: Java heap space
                at org.apache.hadoop.mapred.IFile$Reader.readNextBlock(IFile.java:270)
                at org.apache.hadoop.mapred.IFile$Reader.next(IFile.java:332)
                at org.apache.hadoop.mapred.Merger$Segment.next(Merger.java:147)
                at org.apache.hadoop.mapred.Merger$MergeQueue.adjustPriorityQueue(Merger.java:238)
                at org.apache.hadoop.mapred.Merger$MergeQueue.next(Merger.java:255)
                at org.apache.hadoop.mapred.Merger.writeFile(Merger.java:86)
                at org.apache.hadoop.mapred.Merger$MergeQueue.merge(Merger.java:377)
                at org.apache.hadoop.mapred.Merger.merge(Merger.java:58)
                at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:277)
                at org.apache.hadoop.mapred.TaskTracker$Child.main(TaskTracker.java:2216)

    (We omit the "Error running child" line from the results)
    """
    for line in lines:
        if line.rstrip(b'\r\n').endswith(b"Error running child"):
            st_lines = []
            for line in lines:
                st_lines.append(line)
                for line in lines:
                    if not line.startswith(b'        at '):
                        break
                    st_lines.append(line)
                return [to_string(line) for line in st_lines]
    else:
        return None
Developer: Roguelazer, Project: mrjob, Lines: 35, Source: parse.py
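A hedged sketch of feeding a syslog to this function. The byte lines mirror the sample stack trace in the docstring; note that the argument must be a single shared iterator (or a file object), since the function keeps pulling from it after the match.

# illustrative only; in practice `lines` would be a log file opened in binary mode
log_lines = iter([
    b'2010-07-27 18:25:48,397 WARN org.apache.hadoop.mapred.TaskTracker (main): Error running child\n',
    b'java.lang.OutOfMemoryError: Java heap space\n',
    b'        at org.apache.hadoop.mapred.Merger.merge(Merger.java:58)\n',
])

trace = find_hadoop_java_stack_trace(log_lines)
if trace is not None:
    print(''.join(trace), end='')   # the OutOfMemoryError line plus the "at ..." line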

Example 4: _run_job_in_hadoop

    def _run_job_in_hadoop(self):
        self._counters = []

        for step_num in range(self._num_steps()):
            log.debug("running step %d of %d" % (step_num + 1, self._num_steps()))

            step_args = self._args_for_step(step_num)

            log.debug("> %s" % cmd_line(step_args))

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen
                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

                self._process_stderr_from_streaming(step_proc.stderr)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    log.error("STDOUT: " + to_string(line.strip(b"\n")))

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvp(step_args[0], step_args)
                else:
                    with os.fdopen(master_fd, "rb") as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        self._process_stderr_from_streaming(master)
                        _, returncode = os.waitpid(pid, 0)

            if returncode == 0:
                # parsing needs step number for whole job
                self._fetch_counters([step_num + self._start_step_num])
                # printing needs step number relevant to this run of mrjob
                self.print_counters([step_num + 1])
            else:
                msg = "Job failed with return code %d: %s" % (returncode, step_args)
                log.error(msg)
                # look for a Python traceback
                cause = self._find_probable_cause_of_failure([step_num + self._start_step_num])
                if cause:
                    # log cause, and put it in exception
                    cause_msg = []  # lines to log and put in exception
                    cause_msg.append("Probable cause of failure (from %s):" % cause["log_file_uri"])
                    cause_msg.extend(line.strip("\n") for line in cause["lines"])
                    if cause["input_uri"]:
                        cause_msg.append("(while reading from %s)" % cause["input_uri"])

                    for line in cause_msg:
                        log.error(line)

                    # add cause_msg to exception message
                    msg += "\n" + "\n".join(cause_msg) + "\n"

                raise CalledProcessError(returncode, step_args)
Developer: senseb, Project: mrjob, Lines: 60, Source: hadoop.py

Example 5: _process_stderr_from_streaming

    def _process_stderr_from_streaming(self, stderr):
        def treat_eio_as_eof(iter):
            # on Linux, the PTY gives us a specific IOError when the
            # child process exits, rather than EOF.
            while True:
                try:
                    yield next(iter)  # okay for StopIteration to bubble up
                except IOError as e:
                    if e.errno == errno.EIO:
                        return
                    else:
                        raise

        for line in treat_eio_as_eof(stderr):
            line = HADOOP_STREAMING_OUTPUT_RE.match(line).group(2)
            log.info("HADOOP: " + to_string(line))

            if b"Streaming Job Failed!" in line:
                raise Exception(line)

            # The job identifier is printed to stderr. We only want to parse it
            # once because we know how many steps we have and just want to know
            # what Hadoop thinks the first step's number is.
            m = HADOOP_JOB_TIMESTAMP_RE.match(line)
            if m and self._job_timestamp is None:
                self._job_timestamp = m.group("timestamp")
                self._start_step_num = int(m.group("step_num"))
Developer: ZhouYunan, Project: mrjob, Lines: 27, Source: hadoop.py

Example 6: _cat_log

def _cat_log(fs, path):
    """fs.cat() the given log, converting lines to strings, and logging
    errors."""
    try:
        for line in fs.cat(path):
            yield to_string(line)
    except IOError as e:
        log.warning("couldn't cat() %s: %r" % (path, e))
Developer: parastoo-62, Project: mrjob, Lines: 8, Source: interpret.py

Example 7: stderr_to_log

def stderr_to_log(lines):
    for line in lines:
        line = to_string(line)
        if _HADOOP_NON_LOG_LINE_RE.match(line):
            # use error because this is usually "Streaming Command Failed!"
            _log_line_from_hadoop(line, level=logging.ERROR)
        else:
            yield line
Developer: BeeswaxIO, Project: mrjob, Lines: 8, Source: hadoop.py

Example 8: parse_mr_job_stderr

def parse_mr_job_stderr(stderr, counters=None):
    """Parse counters and status messages out of MRJob output.

    :param stderr: a filehandle, a list of lines (bytes), or bytes
    :param counters: Counters so far, to update; a map from group (string) to
                     counter name (string) to count.

    Returns a dictionary with the keys *counters*, *statuses*, *other*:

    - *counters*: counters so far; same format as above
    - *statuses*: a list of status messages encountered
    - *other*: lines (strings) that aren't either counters or status messages
    """
    # For the corresponding code in Hadoop Streaming, see ``incrCounter()`` in
    # http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/contrib/streaming/src/java/org/apache/hadoop/streaming/PipeMapRed.java?view=markup  # noqa
    if isinstance(stderr, bytes):
        stderr = BytesIO(stderr)

    if counters is None:
        counters = {}
    statuses = []
    other = []

    for line in stderr:
        m = _COUNTER_RE.match(line.rstrip(b'\r\n'))
        if m:
            group, counter, amount_str = m.groups()

            # don't leave these as bytes on Python 3
            group = to_string(group)
            counter = to_string(counter)

            counters.setdefault(group, {})
            counters[group].setdefault(counter, 0)
            counters[group][counter] += int(amount_str)
            continue

        m = _STATUS_RE.match(line.rstrip(b'\r\n'))
        if m:
            # don't leave as bytes on Python 3
            statuses.append(to_string(m.group(1)))
            continue

        other.append(to_string(line))

    return {'counters': counters, 'statuses': statuses, 'other': other}
Developer: Roguelazer, Project: mrjob, Lines: 46, Source: parse.py
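A short usage sketch. The reporter:counter:... and reporter:status:... line formats follow the Hadoop Streaming convention that the docstring's ``incrCounter()`` reference points to; treat the exact formats as an assumption here rather than something the article states.

stderr = (b'reporter:counter:profile,bytes_read,1024\n'
          b'reporter:status:processing split 3\n'
          b'some unrelated output\n')

result = parse_mr_job_stderr(stderr)
# expected shape, assuming the standard Streaming line formats:
#   result['counters'] -> {'profile': {'bytes_read': 1024}}
#   result['statuses'] -> ['processing split 3']
#   result['other']    -> ['some unrelated output\n']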

Example 9: _cat_log

def _cat_log(fs, path):
    """fs.cat() the given log, converting lines to strings, and logging
    errors."""
    try:
        if not fs.exists(path):
            return
        for line in fs.cat(path):
            yield to_string(line)
    except (IOError, OSError) as e:
        log.warning("couldn't cat() %s: %r" % (path, e))
Developer: davidmarin, Project: mrjob, Lines: 10, Source: wrap.py

Example 10: yield_lines

def yield_lines():
    try:
        for line in stderr:
            yield to_string(line)
    except IOError as e:
        # this is just the PTY's way of saying goodbye
        if e.errno == errno.EIO:
            return
        else:
            raise
Developer: imtiaz39, Project: mrjob, Lines: 10, Source: step.py

Example 11: ls

    def ls(self, path_glob):
        components = urlparse(path_glob)
        hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

        version = self.get_hadoop_version()

        # use ls -R on Hadoop 2 (see #1152)
        if uses_yarn(version):
            args = ['fs', '-ls', '-R', path_glob]
        else:
            args = ['fs', '-lsr', path_glob]

        try:
            stdout = self.invoke_hadoop(args, return_stdout=True,
                                        ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])
        except CalledProcessError:
            raise IOError("Could not ls %s" % path_glob)

        for line in BytesIO(stdout):
            line = line.rstrip(b'\r\n')

            # ignore total item count
            if line.startswith(b'Found '):
                continue

            fields = line.split(b' ')

            # Throw out directories
            if fields[0].startswith(b'd'):
                continue

            # Try to figure out which part of the line is the path
            # Expected lines:
            #
            # HDFS:
            # -rw-r--r--   3 dave users       3276 2010-01-13 14:00 /foo/bar
            #
            # S3:
            # -rwxrwxrwx   1          3276 2010-01-13 14:00 /foo/bar
            path_index = None
            for index, field in enumerate(fields):
                # look for time field, and pick one after that
                # (can't use field[2] because that's an int in Python 3)
                if len(field) == 5 and field[2:3] == b':':
                    path_index = (index + 1)
            if not path_index:
                raise IOError("Could not locate path in string %r" % line)

            path = to_string(line.split(b' ', path_index)[-1])
            # handle fully qualified URIs from newer versions of Hadoop ls
            # (see Pull Request #577)
            if is_uri(path):
                yield path
            else:
                yield hdfs_prefix + path
Developer: kodizant, Project: mrjob, Lines: 55, Source: hadoop.py
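A hedged call sketch; the glob below is illustrative, and `fs` stands in for the Hadoop filesystem object this method is defined on.

# illustrative only
for uri in fs.ls('hdfs:///user/hadoop/output/part-*'):
    print(uri)
# each yielded value is a fully qualified URI, e.g.
#   hdfs:///user/hadoop/output/part-00000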

Example 12: find_python_traceback

def find_python_traceback(lines):
    """Scan a log file or other iterable for a Python traceback,
    and return it as a list of lines (bytes).

    In logs from EMR, we find python tracebacks in ``task-attempts/*/stderr``
    """
    # Essentially, we detect the start of the traceback, and continue
    # until we find a non-indented line, with some special rules for exceptions
    # from subprocesses.

    # Lines to pass back representing entire error found
    all_tb_lines = []

    # This is used to store a working list of lines in a single traceback
    tb_lines = []

    # This is used to store a working list of non-traceback lines between the
    # current traceback and the previous one
    non_tb_lines = []

    # Track whether or not we are in a traceback rather than consuming the
    # iterator
    in_traceback = False

    for line in lines:
        # don't return bytes in Python 3
        line = to_string(line)

        if in_traceback:
            tb_lines.append(line)

            # If no indentation, this is the last line of the traceback
            if line.lstrip() == line:
                in_traceback = False

                if line.startswith('subprocess.CalledProcessError'):
                    # CalledProcessError may mean that the subprocess printed
                    # errors to stderr which we can show the user
                    all_tb_lines += non_tb_lines

                all_tb_lines += tb_lines

                # Reset all working lists
                tb_lines = []
                non_tb_lines = []
        else:
            if line.startswith('Traceback (most recent call last):'):
                tb_lines.append(line)
                in_traceback = True
            else:
                non_tb_lines.append(line)
    if all_tb_lines:
        return all_tb_lines
    else:
        return None
Developer: Roguelazer, Project: mrjob, Lines: 55, Source: parse.py
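A usage sketch with a fabricated stderr log; the file name and error in it are hypothetical.

stderr_lines = [
    b'some setup noise\n',
    b'Traceback (most recent call last):\n',
    b'  File "mr_word_count.py", line 18, in <module>\n',
    b'    main()\n',
    b"NameError: name 'main' is not defined\n",
]

tb = find_python_traceback(stderr_lines)
if tb:
    print(''.join(tb), end='')   # prints the four traceback lines as strings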

Example 13: cleanup

        def cleanup():
            # this does sometimes happen; see #1396
            for line in cat_proc.stderr:
                log.error('STDERR: ' + to_string(line.rstrip(b'\r\n')))

            cat_proc.stdout.close()
            cat_proc.stderr.close()

            returncode = cat_proc.wait()

            if returncode != 0:
                raise IOError("Could not stream %s" % filename)
Developer: davidmarin, Project: mrjob, Lines: 12, Source: hadoop.py

Example 14: ssh_terminate_single_job

def ssh_terminate_single_job(ssh_bin, address, ec2_key_pair_file):
    """Terminate the only job running the Hadoop cluster with master node
    *address* using 'hadoop job -kill JOB_ID'. Return string output of command
    or None if there was no job to terminate. Raise :py:class:`IOError` if some
    other error occurred.

    :param ssh_bin: Path to ``ssh`` binary
    :param address: Address of your job's master node (obtained via
                    :py:meth:`boto.emr.EmrConnection.describe_jobflow`)
    :param ec2_key_pair_file: Path to the key pair file (argument to ``-i``)

    :return: the output of ``hadoop job -kill`` if a job was terminated, or
             ``None`` if no job was running
    """
    job_list_out = to_string(check_output(*ssh_run(
        ssh_bin, address, ec2_key_pair_file, ['hadoop', 'job', '-list'])))
    job_list_lines = job_list_out.splitlines()

    def job_list_output_error():
        raise IOError('Could not read results of "hadoop job -list" and so'
                      ' could not terminate job:\n%s' % job_list_out)

    num_jobs_match = HADOOP_JOB_LIST_NUM_RE.match(job_list_lines[0])
    if not num_jobs_match:
        job_list_output_error()
    if int(num_jobs_match.group(1)) > 1:
        raise IOError('More than one job is running; unclear which one to'
                      ' terminate, so not terminating any jobs')
    if int(num_jobs_match.group(1)) == 0:
        return None

    job_info_match = HADOOP_JOB_LIST_INFO_RE.match(job_list_lines[2])
    if not job_info_match:
        job_list_output_error()
    job_id = to_string(job_info_match.group(1))

    job_kill_out = to_string(check_output(*ssh_run(
        ssh_bin, address, ec2_key_pair_file,
        ['hadoop', 'job', '-kill', job_id])))

    return job_kill_out
Developer: DanisHack, Project: mrjob, Lines: 40, Source: ssh.py
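A call sketch with placeholder arguments; all three values here are hypothetical, and ssh_bin is assumed to be a command list, matching how the function passes it on to ``ssh_run``.

output = ssh_terminate_single_job(
    ssh_bin=['ssh'],
    address='ec2-203-0-113-1.compute-1.amazonaws.com',
    ec2_key_pair_file='/path/to/EMR.pem',
)

if output is None:
    print('no job was running')
else:
    print(output)   # output of the "hadoop job -kill JOB_ID" command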

Example 15: _run_on_all_nodes

def _run_on_all_nodes(runner, output_dir, cmd_args, print_stderr=True):
    """Given an :py:class:`EMRJobRunner`, run the command specified by
    *cmd_args* on all nodes in the cluster and save the stdout and stderr of
    each run to subdirectories of *output_dir*.

    You should probably have run :py:meth:`_enable_slave_ssh_access()` on the
    runner before calling this function.
    """
    master_addr = runner._address_of_master()
    addresses = [master_addr]

    ssh_bin = runner._opts['ssh_bin']
    ec2_key_pair_file = runner._opts['ec2_key_pair_file']

    keyfile = None
    slave_addrs = runner.fs.ssh_slave_hosts(master_addr)

    if slave_addrs:
        addresses += ['%s!%s' % (master_addr, slave_addr)
                      for slave_addr in slave_addrs]
        # copying key file like a boss (name of keyfile doesn't really matter)
        keyfile = 'mrboss-%s.pem' % random_identifier()
        _ssh_copy_key(ssh_bin, master_addr, ec2_key_pair_file, keyfile)

    for addr in addresses:

        stdout, stderr = _ssh_run_with_recursion(
            ssh_bin,
            addr,
            ec2_key_pair_file,
            keyfile,
            cmd_args,
        )

        if print_stderr:
            print('---')
            print('Command completed on %s.' % addr)
            print(to_string(stderr), end=' ')

        if '!' in addr:
            base_dir = os.path.join(output_dir, 'slave ' + addr.split('!')[1])
        else:
            base_dir = os.path.join(output_dir, 'master')

        if not os.path.exists(base_dir):
            os.makedirs(base_dir)

        with open(os.path.join(base_dir, 'stdout'), 'wb') as f:
            f.write(stdout)

        with open(os.path.join(base_dir, 'stderr'), 'wb') as f:
            f.write(stderr)
Developer: davidmarin, Project: mrjob, Lines: 52, Source: mrboss.py


Note: The mrjob.py2.to_string examples in this article were collected from open-source projects hosted on GitHub and other code/documentation platforms. The source code is copyrighted by its original authors; please consult each project's license before redistributing or reusing it.