本文整理汇总了Python中odps.df.DataFrame.bloom_filter方法的典型用法代码示例。如果您正苦于以下问题:Python DataFrame.bloom_filter方法的具体用法?Python DataFrame.bloom_filter怎么用?Python DataFrame.bloom_filter使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类odps.df.DataFrame
的用法示例。
在下文中一共展示了DataFrame.bloom_filter方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: Test
# 需要导入模块: from odps.df import DataFrame [as 别名]
# 或者: from odps.df.DataFrame import bloom_filter [as 别名]
#.........这里部分代码省略.........
def testHeadAndTail(self):
    """head()/tail() on both ODPS-backed and pandas-backed frames return
    the requested number of rows, and head() populates the expression cache."""
    # ODPS-backed frame
    fetched = self.odps_df.head(2)
    self.assertEqual(len(fetched), 2)

    filtered = self.odps_df[self.odps_df['name'] == 'name1']
    fetched = filtered.head(1)
    self.assertEqual(len(fetched), 1)
    # executing head() should have left cached data on the filtered expression
    self.assertIsNotNone(filtered._cache_data)

    fetched = self.odps_df.tail(2)
    self.assertEqual(len(fetched), 2)
    self.assertTrue(all(it > 1 for it in fetched.values['id']))

    # head()/tail() also work on a single column expression
    self.assertEqual(len(self.odps_df.name.head(2)), 2)
    self.assertEqual(len(self.odps_df.name.tail(2)), 2)

    # same checks against the pandas-backed frame
    fetched = self.pd_df.head(1)
    self.assertEqual(len(fetched), 1)

    filtered = self.pd_df[self.pd_df['name'] == 'name1']
    fetched = filtered.head(1)
    self.assertEqual(len(fetched), 1)
    self.assertIsNotNone(filtered._cache_data)

    fetched = self.pd_df.tail(1)
    self.assertEqual(len(fetched), 1)
    self.assertEqual(fetched.values['id'][0], 6)

    self.assertEqual(len(self.pd_df.name.head(1)), 1)
    self.assertEqual(len(self.pd_df.name.tail(1)), 1)
def testMapReduceWithResource(self):
    """map_reduce with a DataFrame passed as a reducer resource.

    Runs the same reducer across all four engine combinations
    (pandas/ODPS expression x pandas/ODPS resource) and checks that each
    produces the same aggregate id sum.
    """
    pd_df2 = self.odps_df.to_pandas(wrap=True)

    @output(['name', 'id'], ['string', 'int'])
    def reducer(resources):
        # Pre-seed the accumulator dict from the resource table: one
        # entry per name, summing ids of duplicate names.
        d = dict()
        for r in resources[0]:
            if r.name in d:
                d[r.name] += r.id
            else:
                d[r.name] = r.id

        def inner(keys):
            def h(row, done):
                # Fold each grouped row's id into the shared dict; only
                # emit once the group is finished (done is True).
                if row.name in d:
                    d[row.name] += row.id
                else:
                    d[row.name] = row.id
                if done:
                    yield row.name, d[row.name]
            return h
        return inner

    # pandas expression + pandas resource
    expr = pd_df2.map_reduce(reducer=reducer, reducer_resources=[self.pd_df], group='name')
    result = expr.execute()
    self.assertEqual(result.values['id'].sum(), 17)

    # NOTE(review): persisted without tn() unlike other tables in this
    # suite — presumably intentional; verify against the test fixtures.
    odps_df2 = self.pd_df.persist('pyodps_df_mixed2', odps=self.odps)
    try:
        # ODPS expression + ODPS resource
        expr = self.odps_df.map_reduce(reducer=reducer, reducer_resources=[odps_df2], group='name')
        result = expr.execute()
        self.assertEqual(result.values['id'].sum(), 17)

        # ODPS expression + pandas resource
        expr = self.odps_df.map_reduce(reducer=reducer, reducer_resources=[self.pd_df], group='name')
        result = expr.execute()
        self.assertEqual(result.values['id'].sum(), 17)

        # pandas expression + ODPS resource
        expr = pd_df2.map_reduce(reducer=reducer, reducer_resources=[odps_df2], group='name')
        result = expr.execute()
        self.assertEqual(result.values['id'].sum(), 17)
    finally:
        # drop the table backing the persisted frame
        next(odps_df2.data_source()).drop()
def testBloomFilter(self):
    """bloom_filter() against a small side table prunes keys that cannot match."""
    import numpy as np

    seed_rows = [
        ['name1'],
        ['name3'],
    ]
    bf_table_name = tn('pyodps_test_mixed_engine_bf_table2')
    self.odps.delete_table(bf_table_name, if_exists=True)
    bf_table = self.odps.create_table(
        name=bf_table_name,
        schema=Schema.from_lists(['name'], ['string']))
    bf_frame = DataFrame(bf_table)
    records = [bf_table.new_record(values=row) for row in seed_rows]
    self.odps.write_table(bf_table, 0, records)
    try:
        # build the bloom filter from only the first row of the side frame
        filtered = self.odps_df.bloom_filter('name', bf_frame[:1].name, capacity=10)
        executed = self.engine.execute(filtered)
        # 'name2' never occurs in the seed data, so no surviving row may carry it
        self.assertTrue(np.all(executed['name'] != 'name2'))
    finally:
        bf_table.drop()
示例2: Test
# 需要导入模块: from odps.df import DataFrame [as 别名]
# 或者: from odps.df.DataFrame import bloom_filter [as 别名]
#.........这里部分代码省略.........
def inner(keys):
def h(row, done):
if row.name in d:
d[row.name] += row.id
else:
d[row.name] = row.id
if done:
yield row.name, d[row.name]
return h
return inner
expr = pd_df2.map_reduce(reducer=reducer, reducer_resources=[self.pd_df], group="name")
result = expr.execute()
self.assertEqual(result.values["id"].sum(), 17)
odps_df2 = self.pd_df.persist(tn("pyodps_df_mixed2"), odps=self.odps)
try:
expr = self.odps_df.map_reduce(reducer=reducer, reducer_resources=[odps_df2], group="name")
result = expr.execute()
self.assertEqual(result.values["id"].sum(), 17)
expr = self.odps_df.map_reduce(reducer=reducer, reducer_resources=[self.pd_df], group="name")
result = expr.execute()
self.assertEqual(result.values["id"].sum(), 17)
expr = pd_df2.map_reduce(reducer=reducer, reducer_resources=[odps_df2], group="name")
result = expr.execute()
self.assertEqual(result.values["id"].sum(), 17)
finally:
next(odps_df2.data_source()).drop()
def testBloomFilter(self):
    """bloom_filter() built from a tiny side table must drop non-matching keys."""
    import numpy as np

    side_rows = [["name1"], ["name3"]]
    side_table_name = tn("pyodps_test_mixed_engine_bf_table2")
    self.odps.delete_table(side_table_name, if_exists=True)
    side_table = self.odps.create_table(
        name=side_table_name,
        schema=Schema.from_lists(["name"], ["string"]),
    )
    side_frame = DataFrame(side_table)
    self.odps.write_table(side_table, 0, side_rows)
    try:
        # filter by a bloom filter seeded from the first side-frame row only
        expr = self.odps_df.bloom_filter("name", side_frame[:1].name, capacity=10)
        executed = self.engine.execute(expr)
        # "name2" is absent from the side data, so it must be filtered out
        self.assertTrue(np.all(executed["name"] != "name2"))
    finally:
        side_table.drop()
def testCachePersist(self):
    """Persisting a cached join into a partitioned table writes all rows.

    Builds a second source table, joins a cached apply() result against
    it, persists the join into a partitioned output table and verifies
    the row count.

    Fix: the original dropped the output table only on success, leaking
    both the output table and the auxiliary source table whenever the
    persist or the assertion failed. Cleanup now lives in a ``finally``
    block, consistent with the try/finally style of the sibling tests
    (e.g. testBloomFilter).
    """
    expr = self.odps_df

    data2 = [["name1", 3.2], ["name3", 2.4]]

    table_name = tn("pyodps_test_mixed_engine_cp_table2")
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(
        name=table_name, schema=Schema.from_lists(["name", "fid"], ["string", "double"])
    )
    expr2 = DataFrame(table2)
    self.odps.write_table(table2, 0, data2)

    @output(expr.schema.names, expr.schema.types)
    def h(row):
        yield row

    # cache() marks the intermediate results for reuse across executions
    l = expr.filter(expr.id > 0).apply(h, axis=1).cache()
    r = expr2.filter(expr2.fid > 0)
    joined = l.join(r, on=["name", r.fid < 4])["id", "fid"].cache()

    output_table = tn("pyodps_test_mixed_engine_cp_output_table")
    self.odps.delete_table(output_table, if_exists=True)
    schema = Schema.from_lists(["id", "fid"], ["bigint", "double"], ["ds"], ["string"])
    output_t = self.odps.create_table(output_table, schema, if_not_exists=True)

    try:
        t = joined.persist(output_table, partition="ds=today", create_partition=True)
        self.assertEqual(len(t.execute()), 2)
    finally:
        # drop both tables even when persist/verify fails, so reruns
        # start from a clean state
        output_t.drop()
        table2.drop()
def testBigintPartitionedCache(self):
    """A cached expression over a table partitioned by a bigint column
    should still be usable as map_reduce input."""
    table = tn("pyodps_test_bigint_partitioned_cache")
    self.odps.delete_table(table, if_exists=True)
    # persist with partitions=["id"] creates a table partitioned by the
    # bigint id column
    expr = self.odps_df.persist(table, partitions=["id"])

    @output(["id", "name"], ["int", "string"])
    def handle(row):
        return row.id + 1, row.name

    # project + cache, then feed the cached expression into map_reduce
    expr = expr["tt" + expr.name, expr.id].cache()
    new_expr = expr.map_reduce(mapper=handle)
    res = self.engine.execute(new_expr)
    # NOTE(review): expects 3 rows — presumably matches the fixture data
    # loaded into self.odps_df; confirm against the test setUp.
    self.assertEqual(len(res), 3)