本文整理汇总了 Python 中 mrjob.local.LocalMRJobRunner._get_file_splits 方法的典型用法代码示例。如果您正在寻找以下问题的答案：LocalMRJobRunner._get_file_splits 方法的具体用法是什么？该方法怎么用？有哪些使用示例？那么这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 mrjob.local.LocalMRJobRunner 的用法示例。
在下文中一共展示了 LocalMRJobRunner._get_file_splits 方法的 6 个代码示例，这些示例默认按受欢迎程度排序。您可以为喜欢或者觉得有用的代码点赞，您的评价将有助于系统推荐出更好的 Python 代码示例。
示例1: test_get_file_splits_sorted_test
# 需要导入模块: from mrjob.local import LocalMRJobRunner [as 别名]
# 注意: _get_file_splits 是 LocalMRJobRunner 的实例方法, 不能单独导入, 需通过 runner 实例调用
def test_get_file_splits_sorted_test(self):
    """Check that splitting with ``keep_sorted=True`` preserves line order.

    Writes nine tab-separated lines (three each for keys 1, 2 and 3) to a
    file under ``self.tmp_dir``, requests 3 splits from the runner, and
    verifies that reading the split files back in sorted file-name order
    yields the original lines in their original order.
    """
    # set up input paths
    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'w') as input_file:
        input_file.write(
            '1\tbar\n1\tbar\n1\tbar\n2\tfoo\n2\tfoo\n2\tfoo\n3\tqux\n'
            '3\tqux\n3\tqux\n')

    runner = LocalMRJobRunner(conf_paths=[])

    file_splits = runner._get_file_splits([input_path], 3,
                                          keep_sorted=True)

    # make sure we get 3 files
    self.assertEqual(len(file_splits), 3)

    # make sure all the data is preserved in sorted order
    content = []
    for file_name in sorted(file_splits.keys()):
        # context manager closes each split file instead of leaking the
        # handle until garbage collection
        with open(file_name, 'r') as f:
            content.extend(f.readlines())

    self.assertEqual(content,
                     ['1\tbar\n', '1\tbar\n', '1\tbar\n',
                      '2\tfoo\n', '2\tfoo\n', '2\tfoo\n',
                      '3\tqux\n', '3\tqux\n', '3\tqux\n'])
示例2: test_get_file_splits_test
# 需要导入模块: from mrjob.local import LocalMRJobRunner [as 别名]
# 注意: _get_file_splits 是 LocalMRJobRunner 的实例方法, 不能单独导入, 需通过 runner 实例调用
def test_get_file_splits_test(self):
    """Check that splitting two input files into 3 splits loses no lines.

    Creates two small text files under ``self.tmp_dir``, asks the runner
    for 3 splits, and verifies that the lines read back from all split
    files, once sorted, equal the sorted lines of both inputs.
    """
    # set up input paths
    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'w') as input_file:
        input_file.write('bar\nqux\nfoo\nbar\nqux\nfoo\n')

    input_path2 = os.path.join(self.tmp_dir, 'input2')
    with open(input_path2, 'w') as input_file:
        input_file.write('foo\nbar\nbar\n')

    runner = LocalMRJobRunner(conf_paths=[])

    # split into 3 files
    file_splits = runner._get_file_splits([input_path, input_path2], 3)

    # make sure we get 3 files
    self.assertEqual(len(file_splits), 3)

    # make sure all the data is preserved
    content = []
    for file_name in file_splits:
        # context manager closes each split file instead of leaking the
        # handle until garbage collection
        with open(file_name) as f:
            content.extend(f.readlines())

    # order across splits is not checked, so compare sorted lines
    self.assertEqual(sorted(content),
                     ['bar\n', 'bar\n', 'bar\n', 'bar\n', 'foo\n',
                      'foo\n', 'foo\n', 'qux\n', 'qux\n'])
示例3: test_get_file_splits_sorted_test
# 需要导入模块: from mrjob.local import LocalMRJobRunner [as 别名]
# 注意: _get_file_splits 是 LocalMRJobRunner 的实例方法, 不能单独导入, 需通过 runner 实例调用
def test_get_file_splits_sorted_test(self):
    """Verify that ``keep_sorted=True`` splitting keeps lines in order.

    Nine tab-separated byte lines (three per key) are written to one
    input file; the runner is asked for 3 splits, and the concatenation
    of the split files, taken in sorted file-name order, must equal the
    lines that were written.
    """
    # the lines we write are also the lines we expect to read back
    expected_lines = (
        [b"1\tbar\n"] * 3
        + [b"2\tfoo\n"] * 3
        + [b"3\tqux\n"] * 3
    )

    # set up input paths
    source_path = os.path.join(self.tmp_dir, "input")
    with open(source_path, "wb") as source_file:
        source_file.write(b"".join(expected_lines))

    runner = LocalMRJobRunner(conf_paths=[])
    splits = runner._get_file_splits([source_path], 3, keep_sorted=True)

    # make sure we get 3 files
    self.assertEqual(len(splits), 3)

    # make sure all the data is preserved in sorted order
    seen_lines = []
    for split_path in sorted(splits.keys()):
        with open(split_path, "rb") as split_file:
            seen_lines += split_file.readlines()

    self.assertEqual(seen_lines, expected_lines)
示例4: test_get_file_splits_test
# 需要导入模块: from mrjob.local import LocalMRJobRunner [as 别名]
# 注意: _get_file_splits 是 LocalMRJobRunner 的实例方法, 不能单独导入, 需通过 runner 实例调用
def test_get_file_splits_test(self):
    """Check that splitting two input files into 3 splits loses no lines.

    Creates two small files under ``self.tmp_dir``, asks the runner for
    3 splits, and verifies that the byte lines read back from all split
    files, once sorted, equal the sorted lines of both inputs.
    """
    # set up input paths
    # Both inputs are written in binary mode: the splits are read back
    # and compared as bytes, and text mode could translate "\n" on some
    # platforms, making the byte comparison below fail.
    input_path = os.path.join(self.tmp_dir, "input")
    with open(input_path, "wb") as input_file:
        input_file.write(b"bar\nqux\nfoo\nbar\nqux\nfoo\n")

    input_path2 = os.path.join(self.tmp_dir, "input2")
    with open(input_path2, "wb") as input_file:
        input_file.write(b"foo\nbar\nbar\n")

    runner = LocalMRJobRunner(conf_paths=[])

    # split into 3 files
    file_splits = runner._get_file_splits([input_path, input_path2], 3)

    # make sure we get 3 files
    self.assertEqual(len(file_splits), 3)

    # make sure all the data is preserved
    content = []
    for file_name in file_splits:
        with open(file_name, "rb") as f:
            content.extend(f.readlines())

    # order across splits is not checked, so compare sorted lines
    self.assertEqual(
        sorted(content),
        [
            b"bar\n", b"bar\n", b"bar\n", b"bar\n",
            b"foo\n", b"foo\n", b"foo\n",
            b"qux\n", b"qux\n",
        ],
    )
示例5: gz_test
# 需要导入模块: from mrjob.local import LocalMRJobRunner [as 别名]
# 注意: _get_file_splits 是 LocalMRJobRunner 的实例方法, 不能单独导入, 需通过 runner 实例调用
def gz_test(self, dir_path_name):
    """Check file splitting when one of the inputs is gzip-compressed.

    Creates ``input.gz`` (gzipped) and ``input2`` (plain) inside
    *dir_path_name*, requests 3 splits, and verifies that:

    * any split whose ``orig_name`` is the gzipped file starts at offset
      0 and has the full on-disk size of that file,
    * exactly 3 splits are returned, and
    * the lines read back from all splits, sorted, equal the sorted
      lines of both inputs.

    :param dir_path_name: directory in which the input files are created
    """
    contents_gz = [b'bar\n', b'qux\n', b'foo\n', b'bar\n',
                   b'qux\n', b'foo\n']
    contents_normal = [b'foo\n', b'bar\n', b'bar\n']
    all_contents_sorted = sorted(contents_gz + contents_normal)

    input_gz_path = join(dir_path_name, 'input.gz')
    # context manager guarantees the gzip stream is flushed and closed
    # even if the write raises
    with gzip.GzipFile(input_gz_path, 'wb') as input_gz:
        input_gz.write(b''.join(contents_gz))

    input_path2 = join(dir_path_name, 'input2')
    with open(input_path2, 'wb') as input_file:
        input_file.write(b''.join(contents_normal))

    runner = LocalMRJobRunner(conf_paths=[])

    # split into 3 files
    file_splits = runner._get_file_splits([input_gz_path, input_path2], 3)

    # Make sure that input.gz occurs in a single split that starts at
    # its beginning and ends at its end
    for split_info in file_splits.values():
        if split_info['orig_name'] == input_gz_path:
            self.assertEqual(split_info['start'], 0)
            self.assertEqual(split_info['length'],
                             os.stat(input_gz_path)[stat.ST_SIZE])

    # make sure we get 3 files
    self.assertEqual(len(file_splits), 3)

    # make sure all the data is preserved
    content = []
    for file_name in file_splits:
        with open(file_name, 'rb') as f:
            lines = list(to_lines(decompress(f, file_name)))

        # make sure the input_gz split got its entire contents
        if file_name == input_gz_path:
            self.assertEqual(lines, contents_gz)

        content.extend(lines)

    self.assertEqual(sorted(content),
                     all_contents_sorted)
# 需要导入模块: from mrjob.local import LocalMRJobRunner [as 别名]
# 注意: _get_file_splits 是 LocalMRJobRunner 的实例方法, 不能单独导入, 需通过 runner 实例调用
def gz_test(self, dir_path_name):
    """Check file splitting when one of the inputs is gzip-compressed.

    Creates ``input.gz`` (gzipped) and ``input2`` (plain) inside
    *dir_path_name*, requests 3 splits, and verifies that:

    * any split whose ``orig_name`` is the gzipped file starts at offset
      0 and has the full on-disk size of that file,
    * exactly 3 splits are returned, and
    * the lines obtained via ``read_file`` from all splits, sorted, equal
      the sorted lines of both inputs.

    :param dir_path_name: directory in which the input files are created
    """
    contents_gz = [b"bar\n", b"qux\n", b"foo\n", b"bar\n", b"qux\n", b"foo\n"]
    contents_normal = [b"foo\n", b"bar\n", b"bar\n"]
    all_contents_sorted = sorted(contents_gz + contents_normal)

    input_gz_path = os.path.join(dir_path_name, "input.gz")
    # context manager guarantees the gzip stream is flushed and closed
    # even if the write raises
    with gzip.GzipFile(input_gz_path, "wb") as input_gz:
        input_gz.write(b"".join(contents_gz))

    input_path2 = os.path.join(dir_path_name, "input2")
    with open(input_path2, "wb") as input_file:
        input_file.write(b"".join(contents_normal))

    runner = LocalMRJobRunner(conf_paths=[])

    # split into 3 files
    file_splits = runner._get_file_splits([input_gz_path, input_path2], 3)

    # Make sure that input.gz occurs in a single split that starts at
    # its beginning and ends at its end
    for split_info in file_splits.values():
        if split_info["orig_name"] == input_gz_path:
            self.assertEqual(split_info["start"], 0)
            self.assertEqual(split_info["length"], os.stat(input_gz_path)[stat.ST_SIZE])

    # make sure we get 3 files
    self.assertEqual(len(file_splits), 3)

    # make sure all the data is preserved
    content = []
    for file_name in file_splits:
        lines = list(read_file(file_name))

        # make sure the input_gz split got its entire contents
        if file_name == input_gz_path:
            self.assertEqual(lines, contents_gz)

        content.extend(lines)

    self.assertEqual(sorted(content), all_contents_sorted)