This article collects typical usage examples of the Python method moztelemetry.dataset.Dataset.records. If you are wondering what Dataset.records does and how to call it, the hand-picked examples below should help; you can also look further into the containing class, moztelemetry.dataset.Dataset.
Eight code examples of Dataset.records are shown below, ordered by popularity.
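All of the examples are excerpts from a pytest suite and rely on shared imports and fixtures that the snippets themselves do not show. The following is a minimal sketch of those assumed pieces; the import location of InMemoryStore and the conftest-style spark_context fixture are inferred from how the tests use them, not confirmed by this page:

import json
import pytest
from pyspark import SparkContext
from moztelemetry.dataset import Dataset
from moztelemetry.store import InMemoryStore  # assumed location of the in-memory test store

@pytest.fixture(scope='session')
def spark_context():
    # A local SparkContext shared by all tests; stopped when the session ends.
    sc = SparkContext(master='local[*]', appName='dataset-tests')
    yield sc
    sc.stop()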
Example 1: test_records_print_output
# Required import: from moztelemetry.dataset import Dataset
def test_records_print_output(spark_context, capsys):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, 100 + 1):
        key = 'dir{}/subdir{}/key{}'.format(*[i] * 3)
        value = 'value{}'.format(i)
        store.store[key] = value
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    dataset.records(spark_context, decode=lambda x: x)
    out, err = capsys.readouterr()
    assert out.rstrip() == "fetching 0.00066MB in 100 files..."
Example 2: test_records_limit_and_sample
# Required import: from moztelemetry.dataset import Dataset
def test_records_limit_and_sample(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, 100 + 1):
        key = 'dir{}/subdir{}/key{}'.format(*[i] * 3)
        value = 'value{}'.format(i)
        store.store[key] = value
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x, limit=5, sample=0.9)
    assert records.count() == 5
Example 3: test_records
# Required import: from moztelemetry.dataset import Dataset
def test_records(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x)
    records = sorted(records.collect())
    assert records == [b'value1', b'value2']
Example 4: test_records_summaries
# Required import: from moztelemetry.dataset import Dataset
def test_records_summaries(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store, max_concurrency=1)
    records = dataset.records(spark_context, decode=lambda x: x,
                              summaries=[{'key': 'dir1/subdir1/key1', 'size': len('value1')}])
    records = records.collect()
    assert records == [b'value1']
Example 5: test_records_object
# Required import: from moztelemetry.dataset import Dataset
def test_records_object(spark_context):
    expect = {"uid": 1}
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['key'] = json.dumps(expect)
    ds = Dataset(bucket_name, None, store=store, max_concurrency=1)
    row = ds.records(spark_context, decode=decode).first()
    assert isinstance(row, dict)
    assert row == expect
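Examples 5 and 7 pass a decode helper that the excerpts never define. Judging from how it is used (a stored JSON payload in, a dict out), a plausible sketch is the following; the bytes-vs-str handling is an assumption about what the store hands back:

def decode(raw):
    # Parse a stored JSON payload into a Python dict.
    if isinstance(raw, bytes):
        raw = raw.decode('utf-8')  # assumed: the store may return raw bytes
    return json.loads(raw)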
Example 6: test_records_many_groups
# Required imports: from moztelemetry.dataset import Dataset
#                   import moztelemetry.dataset  (module reference used by monkeypatch below)
def test_records_many_groups(spark_context, monkeypatch):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, spark_context.defaultParallelism + 2):
        store.store['dir1/subdir1/key{}'.format(i)] = 'value{}'.format(i)
    # Create one group per item, so there are more groups than default partitions.
    monkeypatch.setattr(moztelemetry.dataset, '_group_by_size', lambda x: [[y] for y in x])
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x)
    records = records.collect()
    assert records == ['value{}'.format(i) for i in range(1, spark_context.defaultParallelism + 2)]
Example 7: test_records_selection
# Required import: from moztelemetry.dataset import Dataset
def test_records_selection(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    key = 'dir1/subdir1/key1'
    value = '{"a": {"b": {"c": "value"}}}'
    store.store[key] = value
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store).select(field='a.b.c')
    records = dataset.records(spark_context, decode=decode)
    assert records.collect() == [{'field': 'value'}]
    # Check that chaining `select` calls works as expected.
    records = dataset.select(field2='a.b').records(spark_context, decode=decode)
    assert records.collect() == [{'field': 'value', 'field2': {'c': 'value'}}]
Example 8: test_records_sample
# Required import: from moztelemetry.dataset import Dataset
def test_records_sample(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, 100 + 1):
        key = 'dir{}/subdir{}/key{}'.format(*[i] * 3)
        value = 'value{}'.format(i)
        store.store[key] = value
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records_1 = dataset.records(spark_context, decode=lambda x: x, sample=0.1, seed=None).collect()
    assert len(records_1) == 10
    records_2 = dataset.records(spark_context, decode=lambda x: x, sample=0.1, seed=None).collect()
    # With seed=None a fresh random seed is drawn per call, so the two samples differ.
    assert sorted(records_1) != sorted(records_2)
    records_1 = dataset.records(spark_context, decode=lambda x: x, sample=0.1).collect()
    records_2 = dataset.records(spark_context, decode=lambda x: x, sample=0.1).collect()
    # Same (default) seed, same sample.
    assert sorted(records_1) == sorted(records_2)
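Outside the test suite, the same seed semantics give reproducible samples. A minimal usage sketch, reusing the store built in Example 8 and assuming the seed parameter accepts an explicit integer (the tests above only show seed=None and the fixed default):

dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
sample_a = dataset.records(spark_context, decode=lambda x: x, sample=0.1, seed=42).collect()
sample_b = dataset.records(spark_context, decode=lambda x: x, sample=0.1, seed=42).collect()
# Identical seeds select the identical subset of files (seed value 42 is illustrative).
assert sorted(sample_a) == sorted(sample_b)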