This page collects typical usage examples of the unique function from the Python mrjob.util module: what unique does, how to call it, and what it looks like in real code. The curated examples below may help answer those questions.
The following shows 14 code examples of the unique function, sorted by popularity by default.
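Before the examples, it helps to know the contract of unique, which the tests at the bottom of this page (Examples 11-14) pin down: it takes any iterable, preserves first-seen order, and drops later duplicates. A minimal sketch consistent with that behavior (not necessarily mrjob's exact implementation, and assuming hashable items) could look like this:

def unique(items):
    """Yield each item the first time it appears, preserving order."""
    seen = set()
    for item in items:
        if item not in seen:  # assumes items are hashable
            seen.add(item)
            yield item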
Example 1: _find_hadoop_bin
def _find_hadoop_bin(self):
    """Look for the hadoop binary in any plausible place. If all
    else fails, return ``['hadoop']``.
    """
    def yield_paths():
        for name in 'HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL':
            path = os.environ.get(name)
            if path:
                yield os.path.join(path, 'bin')

        # They use $HADOOP_INSTALL/hadoop/bin here:
        # https://wiki.apache.org/hadoop/GettingStartedWithHadoop
        if os.environ.get('HADOOP_INSTALL'):
            yield os.path.join(
                os.environ['HADOOP_INSTALL'], 'hadoop', 'bin')

        yield None  # use $PATH

        # Maybe it's in $HADOOP_MAPRED_HOME? $HADOOP_YARN_HOME? Don't give
        # up. Don't worry about duplicates; they're de-duplicated below
        for name, path in sorted(os.environ.items()):
            if name.startswith('HADOOP_') and name.endswith('_HOME'):
                yield os.path.join(path, 'bin')

    for path in unique(yield_paths()):
        log.info('Looking for hadoop binary in %s...' % (path or '$PATH'))

        hadoop_bin = which('hadoop', path=path)

        if hadoop_bin:
            log.info('Found hadoop binary: %s' % hadoop_bin)
            return [hadoop_bin]
    else:
        # for/else: this clause only runs if the loop never returned a match
        log.info("Falling back to 'hadoop'")
        return ['hadoop']
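Since several HADOOP_* environment variables often point at the same installation, unique() keeps the probe loop from checking the same bin/ directory twice. A quick illustration, with hypothetical paths invented for the example:

import os
from mrjob.util import unique

# Hypothetical: two env vars pointing at the same install.
os.environ['HADOOP_HOME'] = '/opt/hadoop'
os.environ['HADOOP_PREFIX'] = '/opt/hadoop'

candidates = [os.path.join(os.environ[name], 'bin')
              for name in ('HADOOP_PREFIX', 'HADOOP_HOME')]
print(list(unique(candidates)))  # ['/opt/hadoop/bin']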
Example 2: _stream_history_log_dirs
def _stream_history_log_dirs(self, output_dir=None):
    """Yield lists of directories to look for the history log in."""
    for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)):
        if _logs_exist(self.fs, log_dir):
            log.info('Looking for history log in %s...' % log_dir)
            # logs aren't always in a subdir named history/
            yield [log_dir]
Example 3: stream_history_log_dirs
def stream_history_log_dirs():
    for log_dir in unique(
            self._hadoop_log_dirs(
                output_dir=step_interpretation.get('output_dir'))):
        if self.fs.exists(log_dir):
            log.info('Looking for history log in %s' % log_dir)
            yield [log_dir]
Example 4: _pick_error_attempt_ids
def _pick_error_attempt_ids(log_interpretation):
    """Pick error attempt IDs, so we know which task logs to look at."""
    errors = _pick_errors(log_interpretation)

    errors.sort(key=_is_probably_task_error, reverse=True)

    return list(unique(
        error['attempt_id'] for error in errors
        if error.get('attempt_id')))
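Because unique() preserves first-seen order, the attempt IDs come out ranked exactly as the stable sort left them: probable task errors first, each ID reported once. A small hypothetical run (attempt IDs invented for illustration):

from mrjob.util import unique

errors = [
    {'attempt_id': 'attempt_001_m_000001_3'},
    {'attempt_id': 'attempt_001_m_000002_0'},
    {'attempt_id': 'attempt_001_m_000001_3'},  # duplicate, dropped
    {},                                        # no attempt_id, filtered out
]
print(list(unique(
    e['attempt_id'] for e in errors if e.get('attempt_id'))))
# ['attempt_001_m_000001_3', 'attempt_001_m_000002_0']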
Example 5: _stream_task_log_dirs
def _stream_task_log_dirs(self, application_id=None, output_dir=None):
    """Yield lists of directories to look for the task logs in."""
    # Note: this is unlikely to be super-helpful on "real" (multi-node)
    # pre-YARN Hadoop because task logs aren't generally shipped to a
    # local directory. It's a start, anyways. See #1201.
    for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)):
        if application_id:
            path = self.fs.join(log_dir, 'userlogs', application_id)
        else:
            path = self.fs.join(log_dir, 'userlogs')

        if _logs_exist(self.fs, path):
            log.info('Looking for task syslogs in %s...' % path)
            yield [path]
Example 6: _find_spark_submit_bin
def _find_spark_submit_bin(self):
    # TODO: this is very similar to _find_hadoop_bin() (in fs)
    for path in unique(self._spark_submit_bin_dirs()):
        log.info('Looking for spark-submit binary in %s...' % (
            path or '$PATH'))

        spark_submit_bin = which('spark-submit', path=path)

        if spark_submit_bin:
            log.info('Found spark-submit binary: %s' % spark_submit_bin)
            return [spark_submit_bin]
    else:
        # for/else: this clause only runs if the loop never returned a match
        log.info("Falling back to 'spark-submit'")
        return ['spark-submit']
Example 7: stream_task_log_dirs
def stream_task_log_dirs():
    for log_dir in unique(
            self._hadoop_log_dirs(output_dir=output_dir)):
        if yarn:
            path = self.fs.join(log_dir, 'userlogs', application_id)
        else:
            # sometimes pre-YARN attempt logs are organized by job_id,
            # sometimes not. Play it safe
            path = self.fs.join(log_dir, 'userlogs')

        if self.fs.exists(path):
            log.info('looking for logs in %s' % path)
            yield [path]
Example 8: stream_task_log_dirs
def stream_task_log_dirs():
    for log_dir in unique(
            self._hadoop_log_dirs(
                output_dir=step_interpretation.get('output_dir'))):
        if yarn:
            path = self.fs.join(
                log_dir, 'userlogs', application_id)
        else:
            # sometimes pre-YARN attempt logs are organized by job_id,
            # sometimes not. Play it safe
            path = self.fs.join(log_dir, 'userlogs')

        if self.fs.exists(path):
            log.info('Scanning task syslogs in %s' % path)
            yield [path]
Example 9: _find_spark_submit_bin
def _find_spark_submit_bin(self):
    """Attempt to find the spark binary. Returns a list of arguments.
    Defaults to ``['spark-submit']``.

    Re-define this in your subclass if you already know where
    to find spark-submit (e.g. on cloud services).
    """
    for path in unique(self._spark_submit_bin_dirs()):
        log.info('Looking for spark-submit binary in %s...' % (
            path or '$PATH'))

        spark_submit_bin = which('spark-submit', path=path)

        if spark_submit_bin:
            log.info('Found spark-submit binary: %s' % spark_submit_bin)
            return [spark_submit_bin]
    else:
        # for/else: this clause only runs if the loop never returned a match
        log.info("Falling back to 'spark-submit'")
        return ['spark-submit']
Example 10: _find_hadoop_streaming_jar
def _find_hadoop_streaming_jar(self):
    """Search for the hadoop streaming jar. See
    :py:meth:`_hadoop_streaming_jar_dirs` for where we search."""
    for path in unique(self._hadoop_streaming_jar_dirs()):
        log.info('Looking for Hadoop streaming jar in %s...' % path)

        streaming_jars = []
        # use a distinct name so we don't shadow the directory being listed
        for jar_path in self.fs.ls(path):
            if _HADOOP_STREAMING_JAR_RE.match(posixpath.basename(jar_path)):
                streaming_jars.append(jar_path)

        if streaming_jars:
            # prefer shorter names and shallower paths
            def sort_key(p):
                return (len(p.split('/')),
                        len(posixpath.basename(p)),
                        p)

            streaming_jars.sort(key=sort_key)

            return streaming_jars[0]

    return None
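The sort_key tuple prefers shallower paths first, then shorter file names, then lexicographic order, so a jar in a top-level contrib/ directory beats one buried under share/. A quick check with invented paths:

import posixpath

# Invented jar locations: one deep, one shallow.
jars = ['/opt/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.7.3.jar',
        '/opt/hadoop/contrib/streaming/hadoop-streaming.jar']

def sort_key(p):
    return (len(p.split('/')), len(posixpath.basename(p)), p)

print(sorted(jars, key=sort_key)[0])
# '/opt/hadoop/contrib/streaming/hadoop-streaming.jar'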
Example 11: test_mixed_types_ok
def test_mixed_types_ok(self):
    self.assertEqual(list(unique(['a', None, 33, 'a'])),
                     ['a', None, 33])
Example 12: test_preserves_order
def test_preserves_order(self):
    self.assertEqual(list(unique([6, 7, 2, 0, 7, 1])),
                     [6, 7, 2, 0, 1])
Example 13: test_de_duplication
def test_de_duplication(self):
    self.assertEqual(list(unique([1, 2, 1, 5, 1])),
                     [1, 2, 5])
Example 14: test_empty
def test_empty(self):
    self.assertEqual(list(unique([])), [])