This page collects typical usage examples of the dask.compute function in Python. If you have been wondering what dask.compute does, how to call it, or what real-world uses look like, the curated code examples below may help. You can also explore further usage examples from the dask module, where this function is defined.
A total of 15 code examples of the dask.compute function are shown below, sorted by popularity by default.
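Before diving into the examples, here is a minimal, self-contained sketch of the pattern most of them follow: wrap function calls in dask.delayed to build a lazy task graph, then trigger execution with dask.compute. The functions inc and add below are placeholders invented purely for illustration; they are not part of any of the projects quoted later.

import dask
from dask import delayed

@delayed
def inc(x):
    # stand-in for an expensive computation
    return x + 1

@delayed
def add(x, y):
    return x + y

# Build a small task graph lazily; nothing has executed yet.
tasks = [add(inc(i), inc(i + 1)) for i in range(4)]

# dask.compute runs the graph and returns a tuple of concrete results.
results = dask.compute(*tasks)
print(results)  # (3, 5, 7, 9)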
Example 1: detect_outliers
# Required module import: import dask [as alias]
# Or: from dask import compute [as alias]
def detect_outliers(request):
    """
    Detect outliers end point
    """
    dataset_id = request.GET.get("dataset_id")
    if dataset_id is None:
        return JsonResponse({"status": "failure", "message": "Dataset id is not provided"})
    dataset = Dataset.objects.get(pk=int(dataset_id))
    file_path = dataset.path
    delete_features = json.loads(dataset.deleted_features)
    # Create a detection experiment and start outlier detection
    process = Process.objects.get(name='Detection')
    process_status = ProcessStatus.objects.get(name='Running')
    experiment = Experiment(dataset=dataset, process=process, process_status=process_status)
    experiment.save()
    results = delayed(detect_all)(os.path.join(settings.MEDIA_ROOT, file_path), experiment.id,
                                  settings.RESULTS_ROOT, delete_features)
    dask.compute(results)
    return JsonResponse(
        {'status': 'success', 'message': 'Detection started successfully', 'experiment_id': experiment.id})
Example 2: _load
# Required module import: import dask [as alias]
# Or: from dask import compute [as alias]
def _load(self, files_in, files_out, urlpath, meta=True):
    """Download a set of files"""
    import dask
    out = []
    outnames = []
    for file_in, file_out in zip(files_in, files_out):
        cache_path = file_out.path
        outnames.append(cache_path)
        # If `_munge_path` did not find a match we want to avoid
        # writing to the urlpath.
        if cache_path == urlpath:
            continue
        if not os.path.isfile(cache_path):
            logger.debug("Caching file: {}".format(file_in.path))
            logger.debug("Original path: {}".format(urlpath))
            logger.debug("Cached at: {}".format(cache_path))
            if meta:
                self._log_metadata(urlpath, file_in.path, cache_path)
            ddown = dask.delayed(_download)
            out.append(ddown(file_in, file_out, self.blocksize,
                             self.output))
    dask.compute(*out)
    return outnames
Example 3: _data_to_source
# Required module import: import dask [as alias]
# Or: from dask import compute [as alias]
def _data_to_source(b, path, encoder=None, storage_options=None, **kwargs):
    import dask.bag as db
    import posixpath
    from fsspec import open_files
    import dask
    import pickle
    import json
    from intake.source.textfiles import TextFilesSource

    encoder = {None: str, 'str': str, 'json': json.dumps,
               'pickle': pickle.dumps}.get(encoder, encoder)
    if not hasattr(b, 'to_textfiles'):
        try:
            b = db.from_sequence(b, npartitions=1)
        except TypeError:
            raise NotImplementedError
    files = open_files(posixpath.join(path, 'part.*'), mode='wt',
                       num=b.npartitions, **(storage_options or {}))
    dwrite = dask.delayed(write_file)
    out = [dwrite(part, f, encoder)
           for part, f in zip(b.to_delayed(), files)]
    dask.compute(out)
    s = TextFilesSource(posixpath.join(path, 'part.*'), storage_options=storage_options)
    return s
Example 4: persist
# Required module import: import dask [as alias]
# Or: from dask import compute [as alias]
def persist(self, columns=None):
    """
    Return a CatalogSource, where the selected columns are
    computed and persisted in memory.
    """
    import dask.array as da
    if columns is None:
        columns = self.columns

    r = {}
    for key in columns:
        r[key] = self[key]

    # dask.compute returns a tuple even for a single argument, hence the [0]
    r = da.compute(r)[0]

    from nbodykit.source.catalog.array import ArrayCatalog
    c = ArrayCatalog(r, comm=self.comm)
    c.attrs.update(self.attrs)
    return c
Example 5: calculate_stats
# Required module import: import dask [as alias]
# Or: from dask import compute [as alias]
def calculate_stats(cls, df, target_var):
    """Calculates descriptive stats of the dataframe required for cleaning.

    Arguments:
        df : dask dataframe, The dataframe at hand
        target_var : string, Dependent variable for the analysis

    Returns:
        mean : dask series, mean of each column
        median : dask series, median of each column
        dict(zip(categorical_cols, mode)) : dict, Dictionary containing
            categorical columns as keys and their modes as values
        std : dask series, standard deviation of each column
    """
    categorical_columns = [
        col for col in df.columns if col != target_var and df[col].dtype == 'object']
    mean_op = df.mean()
    std_op = df.std()
    median_op = df.quantile(0.5)
    mode_op = [df[col].value_counts().idxmax()
               for col in categorical_columns]
    # Evaluate all of the lazy statistics in a single pass
    mean, median, mode, std = dask.compute(
        mean_op, median_op, mode_op, std_op)
    return mean, median, dict(zip(categorical_columns, mode)), std
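A note on the example above: mean_op, std_op, median_op, and mode_op are all lazy dask objects, and handing them to a single dask.compute call lets dask evaluate them as one task graph, so shared work such as scanning the dataframe partitions is not repeated per statistic. A minimal sketch of the same idea, with a small hypothetical dataframe standing in for the real data:

import dask
import dask.dataframe as dd
import pandas as pd

# Hypothetical data; in the example above the dataframe comes from a CSV read.
pdf = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": [10.0, 20.0, 30.0, 40.0]})
df = dd.from_pandas(pdf, npartitions=2)

# One compute call evaluates all three reductions over the same partitions.
mean, std, median = dask.compute(df.mean(), df.std(), df.quantile(0.5))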
Example 6: impute
# Required module import: import dask [as alias]
# Or: from dask import compute [as alias]
def impute(cls, df, target_var, median, mode):
    """Imputes missing values using the median for continuous columns and the
    mode for categorical columns.

    Arguments:
        df : dask dataframe, The dataframe at hand
        target_var : string, Dependent variable for the analysis
        median : list, median of all columns in data
        mode : list, mode of all columns in data

    Returns:
        df : dask dataframe, Dataframe without missing values
    """
    missing_stats = df.isna().sum().compute()
    cols = [col for col in df.columns if col != target_var]
    for col in cols:
        if missing_stats[col] > 0 and df[col].dtype == 'object':
            df[col] = df[col].fillna(mode[col])
        elif missing_stats[col] > 0:
            df[col] = df[col].fillna(median[col])
    return df
Example 7: kmeans_input_fn
# Required module import: import dask [as alias]
# Or: from dask import compute [as alias]
def kmeans_input_fn(self, name, csv_path=None):
    """Input function for kmeans

    Arguments:
        name : string, Name of the data [Train or Eval]
        csv_path : The path of the csv on any storage system

    Returns:
        A batch of features
    """
    pattern = self._get_pattern(name, csv_path)
    tf.logging.info('The pattern of files is: %s', pattern)
    df = dd.read_csv(pattern)
    vectors = dask.compute(df.values)
    return tf.train.limit_epochs(
        tf.convert_to_tensor(vectors[0], dtype=tf.float32), num_epochs=1)
Example 8: test_clean_data
# Required module import: import dask [as alias]
# Or: from dask import compute [as alias]
def test_clean_data(self):
    """
    Testing function clean_csv
    """
    copyfile(CSV_PATH, '/tmp/data.csv')
    iread = self.init_inputreader()
    stats = self.init_basicstats()
    ddf, _ = iread._parse_csv()
    data, mean, std_dev, csv_defaults = stats.clean_data(
        df=ddf,
        task_type=TASK_TYPE,
        target_var=TARGET_VAR,
        name=NAME
    )
    self_computed_mean = dask.compute(ddf.mean())
    self.assertListEqual(list(mean), list(self_computed_mean[0]))
    self_computed_std_dev = dask.compute(ddf.std(axis=0, skipna=True))
    self.assertListEqual(list(std_dev), list(self_computed_std_dev[0]))
    self.assertIsInstance(data, dask.dataframe.core.DataFrame)
    self.assertIsInstance(mean, pd.core.series.Series)
    self.assertIsInstance(std_dev, pd.core.series.Series)
    self.assertIsInstance(csv_defaults, list)
Example 9: test_calculate_stats
# Required module import: import dask [as alias]
# Or: from dask import compute [as alias]
def test_calculate_stats(self):
    """
    Testing function calculate_stats
    """
    iread = self.init_inputreader()
    stats = self.init_basicstats()
    ddf, _ = iread._parse_csv()
    mean, median, mode_dict, std_dev = stats.calculate_stats(
        df=ddf,
        target_var=TARGET_VAR
    )
    self_computed_mean = dask.compute(ddf.mean())
    self.assertListEqual(list(mean), list(self_computed_mean[0]))
    self_computed_std_dev = dask.compute(ddf.std(axis=0, skipna=True))
    self.assertListEqual(list(std_dev), list(self_computed_std_dev[0]))
    self_computed_median = dask.compute(ddf.quantile(0.5))
    self.assertListEqual(list(median), list(self_computed_median[0]))
    self.assertIsInstance(mean, pd.core.series.Series)
    self.assertIsInstance(std_dev, pd.core.series.Series)
    self.assertIsInstance(median, pd.core.series.Series)
    self.assertIsInstance(mode_dict, dict)
Example 10: test_impute
# Required module import: import dask [as alias]
# Or: from dask import compute [as alias]
def test_impute(self):
    """
    Testing function impute
    """
    iread = self.init_inputreader()
    stats = self.init_basicstats()
    ddf, _ = iread._parse_csv()
    _, median, _, _ = stats.calculate_stats(
        df=ddf,
        target_var=TARGET_VAR
    )
    data = stats.impute(
        df=ddf,
        target_var=TARGET_VAR,
        median=median,
        mode=MODE
    )
    imputed_data = dask.compute(data.isnull().sum())
    # Every column should have zero missing values after imputation
    for col in ddf.columns:
        self.assertEqual(imputed_data[0][col], 0)
    self.assertIsInstance(data, dask.dataframe.core.DataFrame)
Example 11: calculate_centroids_old
# Required module import: import dask [as alias]
# Or: from dask import compute [as alias]
def calculate_centroids_old(cnmds, window, grp_dim=['animal', 'session']):
    print("computing centroids")
    cnt_list = []
    for anm, cur_anm in cnmds.groupby('animal'):
        for ss, cur_ss in cur_anm.groupby('session'):
            # cnt = centroids(cur_ss['A_shifted'], window.sel(animal=anm))
            cnt = da.delayed(centroids)(
                cur_ss['A_shifted'], window.sel(animal=anm))
            cnt_list.append(cnt)
    with ProgressBar():
        cnt_list, = da.compute(cnt_list)
    cnts_ds = pd.concat(cnt_list, ignore_index=True)
    cnts_ds.height = cnts_ds.height.astype(float)
    cnts_ds.width = cnts_ds.width.astype(float)
    cnts_ds.unit_id = cnts_ds.unit_id.astype(int)
    cnts_ds.animal = cnts_ds.animal.astype(str)
    cnts_ds.session = cnts_ds.session.astype(str)
    cnts_ds.session_id = cnts_ds.session_id.astype(str)
    return cnts_ds
Example 12: centroids_distance_old
# Required module import: import dask [as alias]
# Or: from dask import compute [as alias]
def centroids_distance_old(cents,
                           A,
                           window,
                           shift,
                           hamming,
                           corr,
                           tile=(50, 50)):
    sessions = cents['session'].unique()
    dim_h = (np.min(cents['height']), np.max(cents['height']))
    dim_w = (np.min(cents['width']), np.max(cents['width']))
    dist_list = []
    for ssA, ssB in itt.combinations(sessions, 2):
        # dist = _calc_cent_dist(ssA, ssB, cents, cnmds, window, tile, dim_h, dim_w)
        dist = da.delayed(_calc_cent_dist)(ssA, ssB, cents, A, window,
                                           tile, dim_h, dim_w, shift, hamming,
                                           corr)
        dist_list.append(dist)
    with ProgressBar():
        dist_list, = da.compute(dist_list)
    dists = pd.concat(dist_list, ignore_index=True)
    return dists
Example 13: get_noise_welch
# Required module import: import dask [as alias]
# Or: from dask import compute [as alias]
def get_noise_welch(varr,
                    noise_range=(0.25, 0.5),
                    noise_method='logmexp',
                    compute=True):
    print("estimating noise")
    sn = xr.apply_ufunc(
        noise_welch,
        varr.chunk(dict(frame=-1)),
        input_core_dims=[['frame']],
        dask='parallelized',
        vectorize=True,
        kwargs=dict(noise_range=noise_range, noise_method=noise_method),
        output_dtypes=[varr.dtype])
    if compute:
        sn = sn.compute()
    return sn
Example 14: run
# Required module import: import dask [as alias]
# Or: from dask import compute [as alias]
def run(self):
    self._validate_setup()

    write_locks = {}
    for times in self._times:
        filename = self._get_output_filename(times)
        self.setup_netcdf_output(filename, times)
        write_locks[filename] = combine_locks([NETCDFC_LOCK, get_write_lock(filename)])
    self.logger.info('Starting {} chunks...'.format(len(self.slices)))

    delayed_objs = [wrap_run_slice(self.params, write_locks, dslice)
                    for dslice in self.slices]
    persisted = dask.persist(delayed_objs, num_workers=self.params['num_workers'])
    self.progress_bar(persisted)
    dask.compute(persisted)
    self.logger.info('Cleaning up...')

    try:
        self._client.cluster.close()
        self._client.close()
        if self.params['verbose'] == logging.DEBUG:
            print()
            print('closed dask cluster/client')
    except Exception:
        pass
Example 15: benchmark
# Required module import: import dask [as alias]
# Or: from dask import compute [as alias]
def benchmark(datasets=None, datasets_path=None, distributed=True, timeout=None):
    if datasets is None:
        if datasets_path is None:
            datasets = get_available_demos().name
        else:
            datasets = os.listdir(datasets_path)

    if distributed:
        import dask
        global score_dataset
        score_dataset = dask.delayed(score_dataset)

    scores = list()
    for dataset in datasets:
        scores.append(score_dataset(dataset, datasets_path, timeout))

    if distributed:
        scores = dask.compute(*scores)

    return pd.DataFrame(scores)
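One detail worth noting across these examples: dask.compute always returns a tuple. Passing a list as a single argument (as in Examples 3 and 11) gives a one-element tuple whose element is the list of results, which is why Example 11 unpacks the result with a trailing comma (cnt_list, = da.compute(cnt_list)), while star-unpacking the list (as in Examples 2 and 15) gives one tuple element per task. A short sketch with a hypothetical delayed function square:

import dask
from dask import delayed

square = delayed(lambda x: x * x)
tasks = [square(i) for i in range(3)]

print(dask.compute(tasks))   # ([0, 1, 4],) -- one-element tuple holding a list
print(dask.compute(*tasks))  # (0, 1, 4)    -- one element per task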