Python textio._TextSource类代码示例

本文整理汇总了Python中apache_beam.io.textio._TextSource类的典型用法代码示例。如果您正苦于以下问题：Python _TextSource类的具体用法？Python _TextSource怎么用？Python _TextSource使用的例子？那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。

在下文中一共展示了_TextSource类的14个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_progress

  def test_progress(self):
    file_name, expected_data = write_data(10)
    assert len(expected_data) == 10
    source = TextSource(file_name, 0, CompressionTypes.UNCOMPRESSED, True,
                        coders.StrUtf8Coder())
    splits = list(source.split(desired_bundle_size=100000))
    assert len(splits) == 1
    fraction_consumed_report = []
    split_points_report = []
    range_tracker = splits[0].source.get_range_tracker(
        splits[0].start_position, splits[0].stop_position)
    for _ in splits[0].source.read(range_tracker):
      fraction_consumed_report.append(range_tracker.fraction_consumed())
      split_points_report.append(range_tracker.split_points())

    self.assertEqual(
        [float(i) / 10 for i in range(0, 10)], fraction_consumed_report)
    expected_split_points_report = [
        ((i - 1), iobase.RangeTracker.SPLIT_POINTS_UNKNOWN)
        for i in range(1, 10)]

    # At last split point, the remaining split points callback returns 1 since
    # the expected position of next record becomes equal to the stop position.
    expected_split_points_report.append((9, 1))

    self.assertEqual(
        expected_split_points_report, split_points_report)

开发者ID:dpmills，项目名称:incubator-beam，代码行数:27，代码来源:textio_test.py

示例2: init

    def __init__(self,
                 file_name,
                 range_tracker,
                 file_pattern,
                 compression_type,
                 allow_malformed_records,
                 **kwargs):
      self._header_lines = []
      self._last_record = None
      self._file_name = file_name
      self._allow_malformed_records = allow_malformed_records

      text_source = TextSource(
          file_pattern,
          0,  # min_bundle_size
          compression_type,
          True,  # strip_trailing_newlines
          coders.StrUtf8Coder(),  # coder
          validate=False,
          header_processor_fns=(lambda x: x.startswith('#'),
                                self._store_header_lines),
          **kwargs)

      self._text_lines = text_source.read_records(self._file_name,
                                                  range_tracker)
      try:
        self._vcf_reader = vcf.Reader(fsock=self._create_generator())
      except SyntaxError as e:
        # Throw the exception inside the generator to ensure file is properly
        # closed (it's opened inside TextSource.read_records).
        self._text_lines.throw(
            ValueError('An exception was raised when reading header from VCF '
                       'file %s: %s' % (self._file_name,
                                        traceback.format_exc(e))))

开发者ID:lukecwik，项目名称:incubator-beam，代码行数:34，代码来源:vcfio.py

示例3: test_dynamic_work_rebalancing

 def test_dynamic_work_rebalancing(self):
   file_name, expected_data = write_data(5)
   assert len(expected_data) == 5
   source = TextSource(file_name, 0, CompressionTypes.UNCOMPRESSED, True,
                       coders.StrUtf8Coder())
   splits = list(source.split(desired_bundle_size=100000))
   assert len(splits) == 1
   source_test_utils.assert_split_at_fraction_exhaustive(
       splits[0].source, splits[0].start_position, splits[0].stop_position)

开发者ID:dpmills，项目名称:incubator-beam，代码行数:9，代码来源:textio_test.py

示例4: test_read_reentrant_after_splitting

 def test_read_reentrant_after_splitting(self):
   file_name, expected_data = write_data(10)
   assert len(expected_data) == 10
   source = TextSource(file_name, 0, CompressionTypes.UNCOMPRESSED, True,
                       coders.StrUtf8Coder())
   splits = list(source.split(desired_bundle_size=100000))
   assert len(splits) == 1
   source_test_utils.assert_reentrant_reads_succeed(
       (splits[0].source, splits[0].start_position, splits[0].stop_position))

开发者ID:dpmills，项目名称:incubator-beam，代码行数:9，代码来源:textio_test.py

示例5: test_dynamic_work_rebalancing_windows_eol

 def test_dynamic_work_rebalancing_windows_eol(self):
   file_name, expected_data = write_data(15, eol=EOL.CRLF)
   assert len(expected_data) == 15
   source = TextSource(file_name, 0, CompressionTypes.UNCOMPRESSED, True,
                       coders.StrUtf8Coder())
   splits = list(source.split(desired_bundle_size=100000))
   assert len(splits) == 1
   source_test_utils.assert_split_at_fraction_exhaustive(
       splits[0].source, splits[0].start_position, splits[0].stop_position,
       perform_multi_threaded_test=False)

开发者ID:dpmills，项目名称:incubator-beam，代码行数:10，代码来源:textio_test.py

示例6: test_read_single_file_without_striping_eol_crlf

  def test_read_single_file_without_striping_eol_crlf(self):
    file_name, written_data = write_data(TextSourceTest.DEFAULT_NUM_RECORDS,
                                         eol=EOL.CRLF)
    assert len(written_data) == TextSourceTest.DEFAULT_NUM_RECORDS
    source = TextSource(file_name, 0, CompressionTypes.UNCOMPRESSED,
                        False, coders.StrUtf8Coder())

    range_tracker = source.get_range_tracker(None, None)
    read_data = list(source.read(range_tracker))
    self.assertCountEqual([line + '\r\n' for line in written_data], read_data)

开发者ID:dpmills，项目名称:incubator-beam，代码行数:10，代码来源:textio_test.py

示例7: test_dynamic_work_rebalancing_mixed_eol

 def test_dynamic_work_rebalancing_mixed_eol(self):
   file_name, expected_data = write_data(5, eol=EOL.MIXED)
   assert len(expected_data) == 5
   source = TextSource(file_name, 0, CompressionTypes.UNCOMPRESSED, True,
                       coders.StrUtf8Coder())
   splits = [split for split in source.split(desired_bundle_size=100000)]
   assert len(splits) == 1
   source_test_utils.assertSplitAtFractionExhaustive(
       splits[0].source, splits[0].start_position, splits[0].stop_position,
       perform_multi_threaded_test=False)

开发者ID:amitsela，项目名称:incubator-beam，代码行数:10，代码来源:textio_test.py

示例8: _run_read_test

 def _run_read_test(self, file_or_pattern, expected_data,
                    buffer_size=DEFAULT_NUM_RECORDS,
                    compression=CompressionTypes.UNCOMPRESSED):
   # Since each record usually takes more than 1 byte, default buffer size is
   # smaller than the total size of the file. This is done to
   # increase test coverage for cases that hit the buffer boundary.
   source = TextSource(file_or_pattern, 0, compression,
                       True, coders.StrUtf8Coder(), buffer_size)
   range_tracker = source.get_range_tracker(None, None)
   read_data = list(source.read(range_tracker))
   self.assertCountEqual(expected_data, read_data)

开发者ID:dpmills，项目名称:incubator-beam，代码行数:11，代码来源:textio_test.py

示例9: _read_skip_header_lines

  def _read_skip_header_lines(self, file_or_pattern, skip_header_lines):
    """Simple wrapper function for instantiating TextSource."""
    source = TextSource(
        file_or_pattern,
        0,
        CompressionTypes.UNCOMPRESSED,
        True,
        coders.StrUtf8Coder(),
        skip_header_lines=skip_header_lines)

    range_tracker = source.get_range_tracker(None, None)
    return list(source.read(range_tracker))

开发者ID:dpmills，项目名称:incubator-beam，代码行数:12，代码来源:textio_test.py

示例10: test_read_after_splitting

  def test_read_after_splitting(self):
    file_name, expected_data = write_data(10)
    assert len(expected_data) == 10
    source = TextSource(file_name, 0, CompressionTypes.UNCOMPRESSED, True,
                        coders.StrUtf8Coder())
    splits = list(source.split(desired_bundle_size=33))

    reference_source_info = (source, None, None)
    sources_info = ([
        (split.source, split.start_position, split.stop_position) for
        split in splits])
    source_test_utils.assert_sources_equal_reference_source(
        reference_source_info, sources_info)

开发者ID:dpmills，项目名称:incubator-beam，代码行数:13，代码来源:textio_test.py

示例11: test_progress

  def test_progress(self):
    file_name, expected_data = write_data(10)
    assert len(expected_data) == 10
    source = TextSource(file_name, 0, CompressionTypes.UNCOMPRESSED, True,
                        coders.StrUtf8Coder())
    splits = [split for split in source.split(desired_bundle_size=100000)]
    assert len(splits) == 1
    fraction_consumed_report = []
    range_tracker = splits[0].source.get_range_tracker(
        splits[0].start_position, splits[0].stop_position)
    for _ in splits[0].source.read(range_tracker):
      fraction_consumed_report.append(range_tracker.fraction_consumed())

    self.assertEqual(
        [float(i) / 10 for i in range(0, 10)], fraction_consumed_report)

开发者ID:jasonkuster，项目名称:incubator-beam，代码行数:15，代码来源:textio_test.py

示例12: test_read_after_splitting_skip_header

  def test_read_after_splitting_skip_header(self):
    file_name, expected_data = write_data(100)
    assert len(expected_data) == 100
    source = TextSource(file_name, 0, CompressionTypes.UNCOMPRESSED, True,
                        coders.StrUtf8Coder(), skip_header_lines=2)
    splits = list(source.split(desired_bundle_size=33))

    reference_source_info = (source, None, None)
    sources_info = ([
        (split.source, split.start_position, split.stop_position) for
        split in splits])
    self.assertGreater(len(sources_info), 1)
    reference_lines = source_test_utils.read_from_source(*reference_source_info)
    split_lines = []
    for source_info in sources_info:
      split_lines.extend(source_test_utils.read_from_source(*source_info))

    self.assertEqual(expected_data[2:], reference_lines)
    self.assertEqual(reference_lines, split_lines)

开发者ID:dpmills，项目名称:incubator-beam，代码行数:19，代码来源:textio_test.py

示例13: test_read_gzip_large_after_splitting

  def test_read_gzip_large_after_splitting(self):
    _, lines = write_data(10000)
    file_name = self._create_temp_file()
    with gzip.GzipFile(file_name, 'wb') as f:
      f.write('\n'.join(lines))

    source = TextSource(file_name, 0, CompressionTypes.GZIP, True,
                        coders.StrUtf8Coder())
    splits = [split for split in source.split(desired_bundle_size=1000)]

    if len(splits) > 1:
      raise ValueError('FileBasedSource generated more than one initial split '
                       'for a compressed file.')

    reference_source_info = (source, None, None)
    sources_info = ([
        (split.source, split.start_position, split.stop_position) for
        split in splits])
    source_test_utils.assert_sources_equal_reference_source(
        reference_source_info, sources_info)

开发者ID:eljefe6a，项目名称:incubator-beam，代码行数:20，代码来源:textio_test.py

示例14: test_header_processing

  def test_header_processing(self):
    file_name, expected_data = write_data(10)
    assert len(expected_data) == 10

    def header_matcher(line):
      return line in expected_data[:5]

    header_lines = []

    def store_header(lines):
      for line in lines:
        header_lines.append(line)

    source = TextSource(file_name, 0, CompressionTypes.UNCOMPRESSED, True,
                        coders.StrUtf8Coder(),
                        header_processor_fns=(header_matcher, store_header))
    splits = list(source.split(desired_bundle_size=100000))
    assert len(splits) == 1
    range_tracker = splits[0].source.get_range_tracker(
        splits[0].start_position, splits[0].stop_position)
    read_data = list(source.read_records(file_name, range_tracker))

    self.assertCountEqual(expected_data[:5], header_lines)
    self.assertCountEqual(expected_data[5:], read_data)

开发者ID:dpmills，项目名称:incubator-beam，代码行数:24，代码来源:textio_test.py

注：本文中的apache_beam.io.textio._TextSource类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。