当前位置: 首页>>代码示例>>Python>>正文


Python pyarrow.array方法代码示例

本文整理汇总了Python中pyarrow.array方法的典型用法代码示例。如果您正苦于以下问题:Python pyarrow.array方法的具体用法?Python pyarrow.array怎么用?Python pyarrow.array使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pyarrow的用法示例。


在下文中一共展示了pyarrow.array方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_stats_pipeline_with_sample_rate

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def test_stats_pipeline_with_sample_rate(self):
    """Runs GenerateStatistics with sample_rate=1.0 over a single batch.

    The pipeline output is compared against
    `self._sampling_test_expected_result`.
    """
    # 3000 evenly spaced int32 values from 1 to 3000, wrapped in a one-element
    # list and stored as column 'c'.
    values = np.linspace(1, 3000, 3000, dtype=np.int32)
    column = pa.array([values])
    record_batches = [pa.RecordBatch.from_arrays([column], ['c'])]

    with beam.Pipeline() as p:
      options = stats_options.StatsOptions(
          sample_rate=1.0,
          num_top_values=2,
          num_rank_histogram_buckets=2,
          num_values_histogram_buckets=2,
          num_histogram_buckets=2,
          num_quantiles_histogram_buckets=2,
          epsilon=0.001)
      batches = p | beam.Create(record_batches)
      result = batches | stats_api.GenerateStatistics(options)
      matcher = test_util.make_dataset_feature_stats_list_proto_equal_fn(
          self, self._sampling_test_expected_result)
      util.assert_that(result, matcher)
开发者ID:tensorflow,项目名称:data-validation,代码行数:24,代码来源:stats_api_test.py

示例2: test_get_feature_value_slicer_bytes_feature_valid_utf8

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def test_get_feature_value_slicer_bytes_feature_valid_utf8(self):
    """Slices a two-row batch on bytes feature 'b' and checks each slice."""
    column_names = ['a', 'b']
    # NOTE(review): a None value presumably means "slice on every value of
    # this feature" -- confirm against slicing_util.
    features = {'b': None}

    input_record_batch = pa.RecordBatch.from_arrays(
        [pa.array([[1], [2, 1]]), pa.array([[b'dog'], [b'cat']])],
        column_names)

    # One (slice key, record batch) pair is expected per row of the input.
    dog_slice = pa.RecordBatch.from_arrays(
        [pa.array([[1]]), pa.array([[b'dog']])], column_names)
    cat_slice = pa.RecordBatch.from_arrays(
        [pa.array([[2, 1]]), pa.array([[b'cat']])], column_names)
    expected_result = [(u'b_dog', dog_slice), (u'b_cat', cat_slice)]

    slicer = slicing_util.get_feature_value_slicer(features)
    self._check_results(slicer(input_record_batch), expected_result)
开发者ID:tensorflow,项目名称:data-validation,代码行数:21,代码来源:slicing_util_test.py

示例3: testGetArrayReturnExampleIndices

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def testGetArrayReturnExampleIndices(self):
    """Checks get_array's values and example indices for a nested path."""
    # Column "f": the first example carries two "sf" entries, the second one.
    first_example = [{"sf": [{"ssf": [1]}, {"ssf": [2]}]}]
    second_example = [{"sf": [{"ssf": [3, 4]}]}]
    f_column = pa.array([first_example, second_example])
    w_column = pa.array([["one"], ["two"]])
    record_batch = pa.RecordBatch.from_arrays([f_column, w_column], ["f", "w"])

    feature = types.FeaturePath(["f", "sf", "ssf"])
    actual_arr, actual_indices = arrow_util.get_array(
        record_batch, feature, return_example_indices=True)

    expected_arr = pa.array([[1], [2], [3, 4]])
    expected_indices = np.array([0, 0, 1])

    arrays_match = actual_arr.equals(expected_arr)
    failure_message = "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
        feature, expected_arr, actual_arr)
    self.assertTrue(arrays_match, failure_message)
    np.testing.assert_array_equal(expected_indices, actual_indices)
开发者ID:tensorflow,项目名称:data-validation,代码行数:27,代码来源:arrow_util_test.py

示例4: test_basic_stats_generator_no_runtime_warnings_close_to_max_int

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def test_basic_stats_generator_no_runtime_warnings_close_to_max_int(self):
    """Accumulates and merges near-int64-max inputs with overflow errors on.

    Two references to the same batch, holding the single value
    `np.iinfo(np.int64).max - 1`, are added to fresh accumulators and merged
    while NumPy is configured to raise on floating-point overflow. The test
    passes when no exception is raised.
    """
    # input has batches with values that are slightly smaller than the maximum
    # integer value.
    less_than_max_int_value = np.iinfo(np.int64).max - 1
    batches = ([
        pa.RecordBatch.from_arrays([pa.array([[less_than_max_int_value]])],
                                   ['a'])
    ] * 2)
    generator = basic_stats_generator.BasicStatsGenerator()
    # np.errstate restores the previous error settings on exit even if the
    # body raises, so a failure here cannot leak over='raise' into other tests
    # (the previous seterr/seterr pair only restored on the success path).
    with np.errstate(over='raise'):
      accumulators = [
          generator.add_input(generator.create_accumulator(), batch)
          for batch in batches
      ]
      generator.merge_accumulators(accumulators)
开发者ID:tensorflow,项目名称:data-validation,代码行数:19,代码来源:basic_stats_generator_test.py

示例5: test_time_stats_generator_values_threshold_check

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def test_time_stats_generator_values_threshold_check(self):
    """Checks that stats are only emitted once values_threshold is reached."""
    # Six date strings in one format spread over two batches, plus a batch of
    # nulls; expected to give 6 matches with the same format.
    date = '2018-11-30'
    input_batches = [
        pa.array([[date, date, date], [date]]),
        pa.array([[date, date]]),
        pa.array([None, None]),
    ]

    # With values_threshold=7 the generator should not create stats.
    generator = time_stats_generator.TimeStatsGenerator(values_threshold=7)
    empty_stats = statistics_pb2.FeatureNameStatistics()
    self.assertCombinerOutputEqual(input_batches, generator, empty_stats)

    # With values_threshold=6 the generator should create stats.
    generator = time_stats_generator.TimeStatsGenerator(values_threshold=6)
    domain_info = statistics_pb2.CustomStatistic(
        name='domain_info', str="time_domain {string_format: '%Y-%m-%d'}")
    match_ratio = statistics_pb2.CustomStatistic(
        name='time_match_ratio', num=1.0)
    expected_stats = statistics_pb2.FeatureNameStatistics(
        custom_stats=[domain_info, match_ratio])
    self.assertCombinerOutputEqual(input_batches, generator, expected_stats)
开发者ID:tensorflow,项目名称:data-validation,代码行数:25,代码来源:time_stats_generator_test.py

示例6: test_time_stats_generator_invalidated_exits_add_input_early

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def test_time_stats_generator_invalidated_exits_add_input_early(
      self, mock_update):
    """add_input must not update an accumulator flagged as invalidated.

    `mock_update` is supplied by a patch decorator that is not visible in this
    snippet.
    """
    generator = time_stats_generator.TimeStatsGenerator()
    accumulator = generator.create_accumulator()
    input_batch = pa.array([['2018-11-30']])

    def add_batch():
      # Feeds the same batch under a fresh single-step, empty-named path.
      generator.add_input(accumulator, types.FeaturePath(['']), input_batch)

    # Invalidated accumulator: adding a batch leaves the mock untouched.
    accumulator.invalidated = True
    add_batch()
    self.assertFalse(mock_update.called)

    # Valid accumulator: adding a batch reaches the mocked update.
    accumulator.invalidated = False
    add_batch()
    self.assertTrue(mock_update.called)
开发者ID:tensorflow,项目名称:data-validation,代码行数:19,代码来源:time_stats_generator_test.py

示例7: test_time_stats_generator_no_values_exits_add_input_early

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def test_time_stats_generator_no_values_exits_add_input_early(
      self, mock_update):
    """add_input must skip the update for null or empty input batches.

    `mock_update` is supplied by a patch decorator that is not visible in this
    snippet.
    """
    generator = time_stats_generator.TimeStatsGenerator()
    accumulator = generator.create_accumulator()

    def add_batch(input_batch):
      # Feeds `input_batch` under a fresh single-step, empty-named path.
      generator.add_input(accumulator, types.FeaturePath(['']), input_batch)

    # A batch whose only entry is None does not update the accumulator.
    add_batch(pa.array([None]))
    self.assertFalse(mock_update.called)

    # An empty batch does not update the accumulator either.
    add_batch(pa.array([]))
    self.assertFalse(mock_update.called)

    # A batch with an actual value does.
    add_batch(pa.array([['2018-11-30']]))
    self.assertTrue(mock_update.called)
开发者ID:tensorflow,项目名称:data-validation,代码行数:23,代码来源:time_stats_generator_test.py

示例8: test_time_stats_generator_match_ratio_with_same_valid_format

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def test_time_stats_generator_match_ratio_with_same_valid_format(self):
    """Checks the match_ratio cut-off when all valid values share a format."""
    # Five values in the same date format and five invalid strings.
    valid, invalid = '2018-11-30', 'not-valid'
    input_batches = [
        pa.array([[valid, valid, valid], [valid, valid]]),
        pa.array([[invalid, invalid, invalid], [invalid, invalid]]),
    ]

    # match_ratio 0.51 should not create stats.
    generator = time_stats_generator.TimeStatsGenerator(
        match_ratio=0.51, values_threshold=5)
    empty_stats = statistics_pb2.FeatureNameStatistics()
    self.assertCombinerOutputEqual(input_batches, generator, empty_stats)

    # match_ratio 0.49 should create stats.
    generator = time_stats_generator.TimeStatsGenerator(
        match_ratio=0.49, values_threshold=5)
    domain_info = statistics_pb2.CustomStatistic(
        name='domain_info', str="time_domain {string_format: '%Y-%m-%d'}")
    match_ratio = statistics_pb2.CustomStatistic(
        name='time_match_ratio', num=0.50)
    expected_stats = statistics_pb2.FeatureNameStatistics(
        custom_stats=[domain_info, match_ratio])
    self.assertCombinerOutputEqual(input_batches, generator, expected_stats)
开发者ID:tensorflow,项目名称:data-validation,代码行数:26,代码来源:time_stats_generator_test.py

示例9: test_time_stats_generator_combined_string_formats

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def test_time_stats_generator_combined_string_formats(self):
    """Checks that a combined date+time format is reported as one format."""
    # The combined format is expected to be the most common: the generator
    # should count it only as the combined format, not as its component parts.
    first_batch = pa.array([['2018/11/30 23:59', '2018/12/01 23:59']])
    second_batch = pa.array([['2018/11/30 23:59', '23:59']])
    third_batch = pa.array([['2018/11/30', '2018/11/30']])
    input_batches = [first_batch, second_batch, third_batch]

    generator = time_stats_generator.TimeStatsGenerator(
        match_ratio=0.1, values_threshold=1)

    domain_info = statistics_pb2.CustomStatistic(
        name='domain_info',
        str="time_domain {string_format: '%Y/%m/%d %H:%M'}")
    match_ratio = statistics_pb2.CustomStatistic(
        name='time_match_ratio', num=0.5)
    expected_stats = statistics_pb2.FeatureNameStatistics(
        custom_stats=[domain_info, match_ratio])
    self.assertCombinerOutputEqual(input_batches, generator, expected_stats)
开发者ID:tensorflow,项目名称:data-validation,代码行数:21,代码来源:time_stats_generator_test.py

示例10: test_time_stats_generator_integer_formats

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def test_time_stats_generator_integer_formats(self):
    """Checks that integer-valued time formats are recognised."""
    # Three of the values are within the valid range for Unix seconds, one is
    # within the valid range for Unix milliseconds, and the remaining two are
    # outside the valid range of every integer time format.
    first_batch = pa.array([[631152001, 631152002]])
    second_batch = pa.array([[631152003, 631152000001]])
    third_batch = pa.array([[1, 2]])
    input_batches = [first_batch, second_batch, third_batch]

    generator = time_stats_generator.TimeStatsGenerator(
        match_ratio=0.1, values_threshold=1)

    # The expected text proto hard-codes the enum number, so pin it here.
    assert schema_pb2.TimeDomain.UNIX_SECONDS == 1
    domain_info = statistics_pb2.CustomStatistic(
        name='domain_info', str='time_domain {integer_format: 1}')
    match_ratio = statistics_pb2.CustomStatistic(
        name='time_match_ratio', num=0.5)
    expected_stats = statistics_pb2.FeatureNameStatistics(
        custom_stats=[domain_info, match_ratio])
    self.assertCombinerOutputEqual(input_batches, generator, expected_stats)
开发者ID:tensorflow,项目名称:data-validation,代码行数:24,代码来源:time_stats_generator_test.py

示例11: from_scalars

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def from_scalars(cls, values):
        """Build an instance of `cls` from `values`.

        `values` is converted with `np.asarray`, turned into a pyarrow array,
        and wrapped in a single-chunk `pa.chunked_array` that is passed to the
        `cls` constructor.
        """
        as_ndarray = np.asarray(values)
        single_chunk = pa.array(as_ndarray)
        return cls(pa.chunked_array([single_chunk]))
开发者ID:Frank-qlu,项目名称:recruit,代码行数:5,代码来源:bool.py

示例12: test_stats_pipeline_with_sample_count

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def test_stats_pipeline_with_sample_count(self):
    """Runs GenerateStatistics with sample_count=3000 over three batches.

    The pipeline output is compared against
    `self._sampling_test_expected_result`.
    """
    # Three independently built batches, each storing 3000 evenly spaced int32
    # values from 1 to 3000 (wrapped in a one-element list) as column 'c'.
    record_batches = [
        pa.RecordBatch.from_arrays(
            [pa.array([np.linspace(1, 3000, 3000, dtype=np.int32)])], ['c'])
        for _ in range(3)
    ]

    with beam.Pipeline() as p:
      options = stats_options.StatsOptions(
          sample_count=3000,
          num_top_values=2,
          num_rank_histogram_buckets=2,
          num_values_histogram_buckets=2,
          num_histogram_buckets=2,
          num_quantiles_histogram_buckets=2,
          epsilon=0.001,
          desired_batch_size=3000)
      batches = p | beam.Create(record_batches)
      result = batches | stats_api.GenerateStatistics(options)
      matcher = test_util.make_dataset_feature_stats_list_proto_equal_fn(
          self, self._sampling_test_expected_result)
      util.assert_that(result, matcher)
开发者ID:tensorflow,项目名称:data-validation,代码行数:29,代码来源:stats_api_test.py

示例13: test_validate_instance_invalid_options

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def test_validate_instance_invalid_options(self):
    """validate_instance raises ValueError when options is a plain dict."""
    feature_column = pa.array([['A']])
    instance = pa.RecordBatch.from_arrays([feature_column], ['feature'])
    expected_error = 'options must be a StatsOptions object.'
    with self.assertRaisesRegexp(ValueError, expected_error):
      validation_api.validate_instance(instance, {})
开发者ID:tensorflow,项目名称:data-validation,代码行数:7,代码来源:validation_api_test.py

示例14: test_validate_instance_stats_options_without_schema

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def test_validate_instance_stats_options_without_schema(self):
    """validate_instance raises ValueError when options carry no schema."""
    feature_column = pa.array([['A']])
    instance = pa.RecordBatch.from_arrays([feature_column], ['feature'])
    # Default-constructed StatsOptions: no schema is supplied.
    options = stats_options.StatsOptions()
    expected_error = 'options must include a schema.'
    with self.assertRaisesRegexp(ValueError, expected_error):
      validation_api.validate_instance(instance, options)
开发者ID:tensorflow,项目名称:data-validation,代码行数:8,代码来源:validation_api_test.py

示例15: test_identify_anomalous_examples_options_of_wrong_type

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def test_identify_anomalous_examples_options_of_wrong_type(self):
    """IdentifyAnomalousExamples raises ValueError for non-StatsOptions input.

    The integer 1 is passed as `options`; building the pipeline is expected to
    fail with a ValueError whose message matches the pattern below.
    """
    # `np.object` was only an alias of the builtin `object` and has been
    # removed from NumPy; the builtin yields the same object dtype on every
    # NumPy version.
    examples = [{'annotated_enum': np.array(['D'], dtype=object)}]
    options = 1
    with self.assertRaisesRegexp(ValueError, 'options must be a `StatsOptions` '
                                 'object.'):
      with beam.Pipeline() as p:
        _ = (
            p | beam.Create(examples)
            | validation_api.IdentifyAnomalousExamples(options))
开发者ID:tensorflow,项目名称:data-validation,代码行数:11,代码来源:validation_api_test.py


注:本文中的pyarrow.array方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。