本文整理汇总了Python中pyarrow.array方法的典型用法代码示例。如果您正苦于以下问题：Python pyarrow.array方法的具体用法？Python pyarrow.array怎么用？Python pyarrow.array使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块pyarrow的用法示例。
在下文中一共展示了pyarrow.array方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_stats_pipeline_with_sample_rate
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def test_stats_pipeline_with_sample_rate(self):
    """Run GenerateStatistics with sample_rate=1.0 on one record batch.

    The single batch has one column 'c' holding one list of the int32
    values produced by np.linspace(1, 3000, 3000). The pipeline output is
    compared against self._sampling_test_expected_result.
    """
    column_values = np.linspace(1, 3000, 3000, dtype=np.int32)
    single_batch = pa.RecordBatch.from_arrays(
        [pa.array([column_values])], ['c'])
    record_batches = [single_batch]

    with beam.Pipeline() as p:
        options = stats_options.StatsOptions(
            sample_rate=1.0,
            num_top_values=2,
            num_rank_histogram_buckets=2,
            num_values_histogram_buckets=2,
            num_histogram_buckets=2,
            num_quantiles_histogram_buckets=2,
            epsilon=0.001)
        source = p | beam.Create(record_batches)
        result = source | stats_api.GenerateStatistics(options)
        matcher = test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, self._sampling_test_expected_result)
        util.assert_that(result, matcher)
示例2: test_get_feature_value_slicer_bytes_feature_valid_utf8
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def test_get_feature_value_slicer_bytes_feature_valid_utf8(self):
    """Slice a two-row batch on bytes feature 'b' and check both slices.

    The slicer is built from {'b': None}; the expected output pairs the
    slice keys u'b_dog' and u'b_cat' with single-row record batches.
    """

    def make_batch(a_rows, b_rows):
        # Every batch in this test has the same two columns, 'a' and 'b'.
        return pa.RecordBatch.from_arrays(
            [pa.array(a_rows), pa.array(b_rows)], ['a', 'b'])

    features = {'b': None}
    input_record_batch = make_batch([[1], [2, 1]], [[b'dog'], [b'cat']])
    expected_result = [
        (u'b_dog', make_batch([[1]], [[b'dog']])),
        (u'b_cat', make_batch([[2, 1]], [[b'cat']])),
    ]

    slicer = slicing_util.get_feature_value_slicer(features)
    self._check_results(slicer(input_record_batch), expected_result)
示例3: testGetArrayReturnExampleIndices
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def testGetArrayReturnExampleIndices(self):
    """Check arrow_util.get_array with return_example_indices=True.

    Looks up the nested path f/sf/ssf in a two-row record batch and
    compares both the returned array and the returned example indices
    with hand-written expectations.
    """
    first_example = [{"sf": [{"ssf": [1]}, {"ssf": [2]}]}]
    second_example = [{"sf": [{"ssf": [3, 4]}]}]
    record_batch = pa.RecordBatch.from_arrays(
        [
            pa.array([first_example, second_example]),
            pa.array([["one"], ["two"]]),
        ],
        ["f", "w"])
    feature = types.FeaturePath(["f", "sf", "ssf"])

    actual_arr, actual_indices = arrow_util.get_array(
        record_batch, feature, return_example_indices=True)

    expected_arr = pa.array([[1], [2], [3, 4]])
    expected_indices = np.array([0, 0, 1])

    arrays_match = actual_arr.equals(expected_arr)
    failure_message = "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
        feature, expected_arr, actual_arr)
    self.assertTrue(arrays_match, failure_message)
    np.testing.assert_array_equal(expected_indices, actual_indices)
示例4: test_basic_stats_generator_no_runtime_warnings_close_to_max_int
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def test_basic_stats_generator_no_runtime_warnings_close_to_max_int(self):
    """Feed near-int64-max values through BasicStatsGenerator.

    NumPy overflow is configured to raise while the batches are added and
    the accumulators merged, so any overflow surfaces as an exception
    instead of a warning.
    """
    # Input has batches with values that are slightly smaller than the
    # maximum integer value.
    less_than_max_int_value = np.iinfo(np.int64).max - 1
    batches = ([
        pa.RecordBatch.from_arrays([pa.array([[less_than_max_int_value]])],
                                   ['a'])
    ] * 2)
    generator = basic_stats_generator.BasicStatsGenerator()
    # np.errstate restores the previous error settings on exit, including
    # when the body raises; a bare seterr()/seterr(**old) pair would leave
    # over='raise' active for later tests if this one failed.
    with np.errstate(over='raise'):
        accumulators = [
            generator.add_input(generator.create_accumulator(), batch)
            for batch in batches
        ]
        generator.merge_accumulators(accumulators)
示例5: test_time_stats_generator_values_threshold_check
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def test_time_stats_generator_values_threshold_check(self):
    """Check the effect of values_threshold on TimeStatsGenerator output."""
    # The batches are expected to give 6 matches with the same format.
    date_value = '2018-11-30'
    input_batches = [
        pa.array([[date_value, date_value, date_value], [date_value]]),
        pa.array([[date_value, date_value]]),
        pa.array([None, None]),
    ]

    # With values_threshold=7 no stats should be created.
    no_stats = statistics_pb2.FeatureNameStatistics()
    generator = time_stats_generator.TimeStatsGenerator(values_threshold=7)
    self.assertCombinerOutputEqual(input_batches, generator, no_stats)

    # With values_threshold=6 stats should be created.
    generator = time_stats_generator.TimeStatsGenerator(values_threshold=6)
    domain_info = statistics_pb2.CustomStatistic(
        name='domain_info',
        str="time_domain {string_format: '%Y-%m-%d'}")
    match_ratio = statistics_pb2.CustomStatistic(
        name='time_match_ratio', num=1.0)
    with_stats = statistics_pb2.FeatureNameStatistics(
        custom_stats=[domain_info, match_ratio])
    self.assertCombinerOutputEqual(input_batches, generator, with_stats)
示例6: test_time_stats_generator_invalidated_exits_add_input_early
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def test_time_stats_generator_invalidated_exits_add_input_early(
        self, mock_update):
    """Check that add_input skips the update for an invalidated accumulator.

    mock_update is supplied by the caller (presumably a mock.patch
    decorator outside this snippet -- confirm against the full file).
    """
    generator = time_stats_generator.TimeStatsGenerator()
    accumulator = generator.create_accumulator()
    input_batch = pa.array([['2018-11-30']])

    def add_batch():
        # A fresh FeaturePath is built for every call, as in the original.
        generator.add_input(
            accumulator, types.FeaturePath(['']), input_batch)

    # An invalidated accumulator must not be updated by a new input batch.
    accumulator.invalidated = True
    add_batch()
    self.assertFalse(mock_update.called)

    # Once it is no longer invalidated, adding a batch triggers the update.
    accumulator.invalidated = False
    add_batch()
    self.assertTrue(mock_update.called)
示例7: test_time_stats_generator_no_values_exits_add_input_early
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def test_time_stats_generator_no_values_exits_add_input_early(
        self, mock_update):
    """Check that add_input skips the update for batches with no values.

    mock_update is supplied by the caller (presumably a mock.patch
    decorator outside this snippet -- confirm against the full file).
    """
    generator = time_stats_generator.TimeStatsGenerator()
    accumulator = generator.create_accumulator()

    def add_batch(batch):
        # A fresh FeaturePath is built for every call, as in the original.
        generator.add_input(accumulator, types.FeaturePath(['']), batch)

    # No update when the values list in the input batch is None.
    add_batch(pa.array([None]))
    self.assertFalse(mock_update.called)

    # No update when the input batch is empty.
    add_batch(pa.array([]))
    self.assertFalse(mock_update.called)

    # A non-empty input batch does update the accumulator.
    add_batch(pa.array([['2018-11-30']]))
    self.assertTrue(mock_update.called)
示例8: test_time_stats_generator_match_ratio_with_same_valid_format
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def test_time_stats_generator_match_ratio_with_same_valid_format(self):
    """Check match_ratio handling when all valid values share one format."""
    valid, invalid = '2018-11-30', 'not-valid'
    # Five valid date strings and five invalid strings in total.
    input_batches = [
        pa.array([[valid] * 3, [valid] * 2]),
        pa.array([[invalid] * 3, [invalid] * 2]),
    ]

    # match_ratio=0.51 should not create stats.
    generator = time_stats_generator.TimeStatsGenerator(
        match_ratio=0.51, values_threshold=5)
    no_stats = statistics_pb2.FeatureNameStatistics()
    self.assertCombinerOutputEqual(input_batches, generator, no_stats)

    # match_ratio=0.49 should create stats.
    generator = time_stats_generator.TimeStatsGenerator(
        match_ratio=0.49, values_threshold=5)
    domain_info = statistics_pb2.CustomStatistic(
        name='domain_info',
        str="time_domain {string_format: '%Y-%m-%d'}")
    match_ratio = statistics_pb2.CustomStatistic(
        name='time_match_ratio', num=0.50)
    with_stats = statistics_pb2.FeatureNameStatistics(
        custom_stats=[domain_info, match_ratio])
    self.assertCombinerOutputEqual(input_batches, generator, with_stats)
示例9: test_time_stats_generator_combined_string_formats
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def test_time_stats_generator_combined_string_formats(self):
    """Check that combined date-and-time string formats are handled."""
    # The combined format is expected to be the most common one: the
    # generator should count it only as the combined format and not as
    # its component parts.
    rows = (
        ['2018/11/30 23:59', '2018/12/01 23:59'],
        ['2018/11/30 23:59', '23:59'],
        ['2018/11/30', '2018/11/30'],
    )
    input_batches = [pa.array([row]) for row in rows]

    generator = time_stats_generator.TimeStatsGenerator(
        match_ratio=0.1, values_threshold=1)

    domain_info = statistics_pb2.CustomStatistic(
        name='domain_info',
        str="time_domain {string_format: '%Y/%m/%d %H:%M'}")
    match_ratio = statistics_pb2.CustomStatistic(
        name='time_match_ratio', num=0.5)
    expected = statistics_pb2.FeatureNameStatistics(
        custom_stats=[domain_info, match_ratio])
    self.assertCombinerOutputEqual(input_batches, generator, expected)
示例10: test_time_stats_generator_integer_formats
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def test_time_stats_generator_integer_formats(self):
    """Check that integer time formats are handled."""
    # Three of the values are within the valid range for Unix seconds, one
    # is within the valid range for Unix milliseconds, and the other two
    # are not within the valid range for any integer time format.
    rows = (
        [631152001, 631152002],
        [631152003, 631152000001],
        [1, 2],
    )
    input_batches = [pa.array([row]) for row in rows]

    generator = time_stats_generator.TimeStatsGenerator(
        match_ratio=0.1, values_threshold=1)

    # The expected domain_info string hard-codes integer_format 1, so make
    # sure that is still the UNIX_SECONDS enum value.
    assert schema_pb2.TimeDomain.UNIX_SECONDS == 1

    domain_info = statistics_pb2.CustomStatistic(
        name='domain_info',
        str=('time_domain {integer_format: 1}'))
    match_ratio = statistics_pb2.CustomStatistic(
        name='time_match_ratio', num=0.5)
    expected = statistics_pb2.FeatureNameStatistics(
        custom_stats=[domain_info, match_ratio])
    self.assertCombinerOutputEqual(input_batches, generator, expected)
示例11: from_scalars
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def from_scalars(cls, values):
    """Build an instance of cls from a sequence of scalar values.

    The values are converted with np.asarray, wrapped in a pyarrow array,
    and placed as the only chunk of a pyarrow chunked array that is
    handed to the cls constructor.
    """
    numpy_values = np.asarray(values)
    single_chunk = pa.array(numpy_values)
    return cls(pa.chunked_array([single_chunk]))
示例12: test_stats_pipeline_with_sample_count
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def test_stats_pipeline_with_sample_count(self):
    """Run GenerateStatistics with sample_count=3000 on three batches.

    Each batch has one column 'c' holding one list of the int32 values
    produced by np.linspace(1, 3000, 3000). The pipeline output is
    compared against self._sampling_test_expected_result.
    """

    def make_batch():
        # Each call builds an independent batch with identical contents.
        values = np.linspace(1, 3000, 3000, dtype=np.int32)
        return pa.RecordBatch.from_arrays([pa.array([values])], ['c'])

    record_batches = [make_batch() for _ in range(3)]

    with beam.Pipeline() as p:
        options = stats_options.StatsOptions(
            sample_count=3000,
            num_top_values=2,
            num_rank_histogram_buckets=2,
            num_values_histogram_buckets=2,
            num_histogram_buckets=2,
            num_quantiles_histogram_buckets=2,
            epsilon=0.001,
            desired_batch_size=3000)
        source = p | beam.Create(record_batches)
        result = source | stats_api.GenerateStatistics(options)
        matcher = test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, self._sampling_test_expected_result)
        util.assert_that(result, matcher)
示例13: test_validate_instance_invalid_options
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def test_validate_instance_invalid_options(self):
    """Check that validate_instance rejects options that are not StatsOptions.

    An empty dict is passed as options; a ValueError whose message
    matches 'options must be a StatsOptions object.' is expected.
    """
    instance = pa.RecordBatch.from_arrays([pa.array([['A']])], ['feature'])
    # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
    # which was removed from unittest in Python 3.12.
    with self.assertRaisesRegex(ValueError,
                                'options must be a StatsOptions object.'):
        _ = validation_api.validate_instance(instance, {})
示例14: test_validate_instance_stats_options_without_schema
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def test_validate_instance_stats_options_without_schema(self):
    """Check that validate_instance rejects StatsOptions that lack a schema.

    A ValueError whose message matches 'options must include a schema.'
    is expected.
    """
    instance = pa.RecordBatch.from_arrays([pa.array([['A']])], ['feature'])
    # This instance of StatsOptions has no schema.
    options = stats_options.StatsOptions()
    # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
    # which was removed from unittest in Python 3.12.
    with self.assertRaisesRegex(ValueError, 'options must include a schema.'):
        _ = validation_api.validate_instance(instance, options)
示例15: test_identify_anomalous_examples_options_of_wrong_type
# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import array [as 别名]
def test_identify_anomalous_examples_options_of_wrong_type(self):
    """Check that IdentifyAnomalousExamples rejects non-StatsOptions options.

    The integer 1 is passed as options; a ValueError whose message
    matches 'options must be a `StatsOptions` object.' is expected.
    """
    # The builtin `object` is used as the dtype: the `np.object` alias was
    # deprecated in NumPy 1.20 and removed in 1.24.
    examples = [{'annotated_enum': np.array(['D'], dtype=object)}]
    options = 1
    # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
    # which was removed from unittest in Python 3.12.
    with self.assertRaisesRegex(ValueError, 'options must be a `StatsOptions` '
                                'object.'):
        with beam.Pipeline() as p:
            _ = (
                p | beam.Create(examples)
                | validation_api.IdentifyAnomalousExamples(options))