本文整理汇总了Python中apache_beam.io.textio._TextSource.read_records方法的典型用法代码示例。如果您正苦于以下问题:Python _TextSource.read_records方法的具体用法?Python _TextSource.read_records怎么用?Python _TextSource.read_records使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类apache_beam.io.textio._TextSource
的用法示例。
在下文中一共展示了_TextSource.read_records方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: from apache_beam.io.textio import _TextSource [as 别名]
# 或者: from apache_beam.io.textio._TextSource import read_records [as 别名]
def __init__(self,
file_name,
range_tracker,
file_pattern,
compression_type,
allow_malformed_records,
**kwargs):
self._header_lines = []
self._last_record = None
self._file_name = file_name
self._allow_malformed_records = allow_malformed_records
text_source = TextSource(
file_pattern,
0, # min_bundle_size
compression_type,
True, # strip_trailing_newlines
coders.StrUtf8Coder(), # coder
validate=False,
header_processor_fns=(lambda x: x.startswith('#'),
self._store_header_lines),
**kwargs)
self._text_lines = text_source.read_records(self._file_name,
range_tracker)
try:
self._vcf_reader = vcf.Reader(fsock=self._create_generator())
except SyntaxError as e:
# Throw the exception inside the generator to ensure file is properly
# closed (it's opened inside TextSource.read_records).
self._text_lines.throw(
ValueError('An exception was raised when reading header from VCF '
'file %s: %s' % (self._file_name,
traceback.format_exc(e))))
示例2: test_header_processing
# 需要导入模块: from apache_beam.io.textio import _TextSource [as 别名]
# 或者: from apache_beam.io.textio._TextSource import read_records [as 别名]
def test_header_processing(self):
file_name, expected_data = write_data(10)
assert len(expected_data) == 10
def header_matcher(line):
return line in expected_data[:5]
header_lines = []
def store_header(lines):
for line in lines:
header_lines.append(line)
source = TextSource(file_name, 0, CompressionTypes.UNCOMPRESSED, True,
coders.StrUtf8Coder(),
header_processor_fns=(header_matcher, store_header))
splits = list(source.split(desired_bundle_size=100000))
assert len(splits) == 1
range_tracker = splits[0].source.get_range_tracker(
splits[0].start_position, splits[0].stop_position)
read_data = list(source.read_records(file_name, range_tracker))
self.assertCountEqual(expected_data[:5], header_lines)
self.assertCountEqual(expected_data[5:], read_data)