This page collects typical usage examples of the Python method dask.dataframe.read_csv. If you are puzzling over questions like: what exactly does dataframe.read_csv do, how is it used, and what do working examples look like? Then the hand-picked code samples below may help. You can also explore further usage examples from the containing module, dask.dataframe.
The following shows 15 code examples of dataframe.read_csv, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
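Before the examples, here is a minimal, self-contained sketch of the basic call pattern; the file paths and the "value" column are hypothetical:

import dask.dataframe as dd

# Lazily read one file (or a glob of files) into a partitioned DataFrame;
# nothing is loaded until a compute step is triggered.
df = dd.read_csv("data/events-*.csv", blocksize="64MB")  # hypothetical paths
print(df.npartitions)
print(df.head())                      # head() only reads the first partition
total = df["value"].sum().compute()   # assumes a "value" column exists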
Example 1: tf_csv_dataset

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def tf_csv_dataset(csv_path, label_col, col_defaults, shuffle=False, batch_size=32):
    df = dd.read_csv(csv_path)
    # use col_defaults for a column if specified, else fall back to a default based on the column dtype
    type_defaults = {np.int64: 0, np.float64: 0.0, np.object_: ""}
    record_defaults = [[col_defaults.get(col_name, type_defaults.get(col_type.type, ""))]
                       for col_name, col_type in df.dtypes.items()]

    def parse_csv(value):
        columns = tf.decode_csv(value, record_defaults)
        features = dict(zip(df.columns.tolist(), columns))
        label = features[label_col]
        return features, label

    # read, parse, shuffle and batch the dataset
    dataset = tf.data.TextLineDataset(csv_path).skip(1)  # skip header
    if shuffle:
        dataset = dataset.shuffle(buffer_size=1024)
    dataset = dataset.map(parse_csv, num_parallel_calls=8)
    dataset = dataset.batch(batch_size)
    return dataset
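A minimal usage sketch for the function above, assuming TensorFlow 1.x (tf.decode_csv was removed from the top-level namespace in TF 2.x) and a hypothetical train.csv whose label column is named "label":

# Hypothetical inputs; tf_csv_dataset is the function defined above.
dataset = tf_csv_dataset("train.csv", label_col="label",
                         col_defaults={"label": 0}, shuffle=True, batch_size=64)
features, label = dataset.make_one_shot_iterator().get_next()  # TF 1.x iterator API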
Example 2: read_feature_file

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def read_feature_file(args):
    fname, feature_file_format = os.path.splitext(args.inputNucleiFeatureFile)
    if feature_file_format == '.csv':
        ddf = dd.read_csv(args.inputNucleiFeatureFile)
    elif feature_file_format == '.h5':
        ddf = dd.read_hdf(args.inputNucleiFeatureFile, 'Features')
    else:
        raise ValueError('Extension of the feature file must be .csv or .h5')
    return ddf
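Since the function only reads a single attribute off args, it can be exercised without a full CLI by passing an argparse.Namespace directly; the file name below is hypothetical:

import argparse

args = argparse.Namespace(inputNucleiFeatureFile="nuclei_features.csv")  # hypothetical path
ddf = read_feature_file(args)
print(ddf.head())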
Example 3: split_list

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def split_list(_list=LATEST):
    print('Please wait while scene metadata is split')
    try:
        csv = read_csv(_list, dtype={'PRODUCT_ID': object, 'COLLECTION_NUMBER': object,
                                     'COLLECTION_CATEGORY': object},
                       blocksize=25e6, parse_dates=True)
    except EmptyDataError:
        print('Metadata has already been updated for the day.')
        return None
    csv = csv[csv.COLLECTION_NUMBER != 'PRE']
    sats = unique(csv.SPACECRAFT_ID).tolist()
    for sat in sats:
        print(sat)
        df = csv[csv.SPACECRAFT_ID == sat]
        dst = os.path.join(SCENES, sat)
        if os.path.isfile(dst):
            os.remove(dst)
        if not os.path.isdir(dst):
            os.mkdir(dst)
        df.to_parquet(dst)
    return None
Example 4: kmeans_input_fn

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def kmeans_input_fn(self, name, csv_path=None):
    """Input function for kmeans.

    Arguments:
        name : string, name of the data split [Train or Eval]
        csv_path : path of the csv on any storage system

    Returns:
        A batch of features
    """
    pattern = self._get_pattern(name, csv_path)
    tf.logging.info('The pattern of files is: %s', pattern)
    df = dd.read_csv(pattern)
    vectors = dask.compute(df.values)
    return tf.train.limit_epochs(
        tf.convert_to_tensor(vectors[0], dtype=tf.float32), num_epochs=1)
Example 5: test_regress_newsread

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def test_regress_newsread(client, listen_port):
    data = dd.read_csv('./system_tests/data/*.gz', compression='gzip', blocksize=None)
    dX = data.iloc[:, 1:]
    dy = data.iloc[:, 0]
    d_regress = dlgbm.LGBMRegressor(n_estimators=50, local_listen_port=listen_port)
    d_regress.fit(dX, dy)
    dy_pred = d_regress.predict(dX, client=client)
    # dask_ml.metrics.r2_score fails with dataframes, so compute the R2 score manually
    numerator = ((dy - dy_pred) ** 2).sum()
    denominator = ((dy - dy.mean()) ** 2).sum()
    r2_score = 1 - numerator / denominator
    r2_score = r2_score.compute()
    print(r2_score)
    assert r2_score > 0.8
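If you would rather reuse dask_ml.metrics.r2_score despite the dataframe limitation noted above, one possible workaround (a sketch, assuming dask-ml is installed) is converting the dask Series to dask arrays with known chunk sizes first:

from dask_ml.metrics import r2_score as ml_r2_score

# to_dask_array(lengths=True) materializes chunk sizes, which dask-ml's metrics need
score = ml_r2_score(dy.to_dask_array(lengths=True),
                    dy_pred.to_dask_array(lengths=True))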
Example 6: main

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def main():
    client = Client()  # noqa
    categories = ["category_%d" % i for i in range(26)]
    columns = ["click"] + ["numeric_%d" % i for i in range(13)] + categories
    df = dd.read_csv("day_1", sep="\t", names=columns, header=None)
    encoding = {c: "bytes" for c in categories}
    fixed = {c: 8 for c in categories}
    df.to_parquet(
        "day-1-bytes.parquet",
        object_encoding=encoding,
        fixed_text=fixed,
        compression="SNAPPY",
    )
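object_encoding and fixed_text are fastparquet-specific write options that dask forwards to the parquet engine, so this snippet presumably relies on fastparquet being the engine in use. A sketch that makes that assumption explicit:

# Assumes fastparquet is installed; object_encoding/fixed_text are
# fastparquet options forwarded by dask's to_parquet.
df.to_parquet(
    "day-1-bytes.parquet",
    engine="fastparquet",
    object_encoding=encoding,
    fixed_text=fixed,
    compression="SNAPPY",
)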
Example 7: __init__

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def __init__(self, file_path, sep=",", header="infer", block_size=10e6, random_seed=None):
    """Initializes the loader.

    Args:
        file_path (str): Path to the delimited file.
        sep (str, optional): Delimiter. Defaults to ",".
        header (str, optional): Row number(s) to use as the header.
            See pandas.read_csv(). Defaults to "infer".
        block_size (int, optional): Size of each partition in bytes.
            See dask.dataframe.read_csv(). Defaults to 10e6.
        random_seed (int, optional): Random seed. See random.seed().
            Defaults to None.
    """
    self.df = dd.read_csv(file_path, sep=sep, header=header, blocksize=block_size)
    self.random_seed = random_seed
    random.seed(random_seed)
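The enclosing class is not shown in this snippet; assuming it is a loader class named, say, CsvLoader, instantiation would look like the following (class and file names are hypothetical):

# Hypothetical class and file names; only __init__ is shown above.
loader = CsvLoader("reviews.tsv", sep="\t", block_size=25e6, random_seed=42)
print(loader.df.npartitions)  # number of partitions created from ~25 MB blocks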
Example 8: open_gssha

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def open_gssha(filename):
    """
    Reads various file types produced by GSSHA.
    """
    # Read metadata
    ftype = filename.split('.')[-1]
    if ftype in ['fgd', 'asc']:
        with open(filename, 'r') as f:
            c, r, xlc, ylc, gsize, nanval = [
                t(f.readline().split(' ')[-1].split('\n')[0])
                for t in [int, int, float, float, float, float]
            ]
        xs = np.linspace(xlc+gsize/2., xlc+c*gsize-gsize/2., c+1)
        ys = np.linspace(ylc+gsize/2., ylc+r*gsize-gsize/2., r)
    else:
        header_df = pd.read_table(filename, engine='python',
                                  names=['meta_key', 'meta_val'],
                                  sep=' ', nrows=6)
        bounds = header_df.loc[:3, 'meta_val'].values.astype(float)
        r, c = header_df.loc[4:6, 'meta_val'].values.astype(int)
        xs, ys = get_sampling(bounds, (r, c))

    # Read the data grid using dask
    ddf = dd.read_csv(filename, skiprows=6, header=None, sep=' ')
    darr = ddf.values.compute()
    if ftype == 'fgd':
        darr[darr == nanval] = np.NaN
    return xr.DataArray(darr[::-1], coords={'x': xs, 'y': ys},
                        name='z', dims=['y', 'x'])
Example 9: load_data

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def load_data(src_dir="data/ml-100k"):
    data = {item: dd.read_csv(str(Path(src_dir, conf["filename"])), sep=conf["sep"],
                              header=None, names=conf["columns"], encoding="latin-1")
            for item, conf in DATA_CONFIG.items()}
    logger.info("data loaded.")
    return data
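DATA_CONFIG is defined elsewhere in the source repository. A hypothetical definition that would satisfy this function, using the MovieLens 100k file layout suggested by the default src_dir:

# Hypothetical sketch of the expected structure; the real DATA_CONFIG
# lives elsewhere in the repository.
DATA_CONFIG = {
    "ratings": {
        "filename": "u.data",
        "sep": "\t",
        "columns": ["user_id", "item_id", "rating", "timestamp"],
    },
    "users": {
        "filename": "u.user",
        "sep": "|",
        "columns": ["user_id", "age", "gender", "occupation", "zip_code"],
    },
}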
Example 10: read_csv

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def read_csv(urlpath, *args, **kwargs):
    def rename_dask_index(df, name):
        df.index.name = name
        return df

    index_col = index_name = None
    if "index" in kwargs:
        del kwargs["index"]
    if "index_col" in kwargs:
        index_col = kwargs["index_col"]
        if isinstance(index_col, list):
            index_col = index_col[0]
        del kwargs["index_col"]
    if "index_name" in kwargs:
        index_name = kwargs["index_name"]
        del kwargs["index_name"]

    df = dd.read_csv(urlpath, *args, **kwargs)
    if index_col is not None:
        df = df.set_index(index_col)
    if index_name is not None:
        df = df.map_partitions(rename_dask_index, index_name)
    return df
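This wrapper emulates pandas-style index_col handling, which dask.dataframe.read_csv itself does not accept. A usage sketch with a hypothetical file:

# Hypothetical file: trades.csv contains a "timestamp" column.
df = read_csv("trades.csv", index_col="timestamp", index_name="ts",
              parse_dates=["timestamp"])
print(df.index.name)  # "ts"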
Example 11: test_classify_newsread

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def test_classify_newsread(client, listen_port):
    data = dd.read_csv('./system_tests/data/*.gz', compression='gzip', blocksize=None)
    dX = data.iloc[:, :-1]
    dy = data.iloc[:, -1]
    d_classif = dlgbm.LGBMClassifier(n_estimators=50, local_listen_port=listen_port)
    d_classif.fit(dX, dy)
    dy_pred = d_classif.predict(dX, client=client)
    acc_score = (dy == dy_pred).sum() / len(dy)
    acc_score = acc_score.compute()
    print(acc_score)
    assert acc_score > 0.8
Example 12: __init__

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def __init__(self, filename, filename_info, filetype_info):
    """Make sure the file path is valid, then read the data into a dask DataFrame.

    Args:
        filename: Filename
        filename_info: Filename information
        filetype_info: Filetype information
    """
    skip_rows = filetype_info.get('skip_rows', 15)
    columns = filetype_info['columns']
    self.file_content = dd.read_csv(filename, skiprows=skip_rows, header=None, names=columns)
    super(VIIRSActiveFiresTextFileHandler, self).__init__(filename, filename_info, filetype_info)
    self.platform_name = PLATFORM_MAP.get(self.filename_info['satellite_name'].upper(), "unknown")
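This __init__ belongs to a Satpy file handler, so filetype_info normally comes from the reader's YAML configuration. A purely hypothetical dict showing the keys this code reads:

# Hypothetical values; in Satpy these come from the reader's YAML config.
filetype_info = {
    "skip_rows": 15,  # header lines to skip before the data rows
    "columns": ["latitude", "longitude", "T13", "along_scan",
                "along_track", "confidence_cat", "power"],
}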
Example 13: main

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', required=True)
    args = parser.parse_args()
    data_dir = Path(args.input)
    name = data_dir.parent.name
    path = str(data_dir / '*.csv')
    df = dd.read_csv(path).compute()
    df = df.groupby(['model', 'n_trials', 'model_train_frac']).agg({
        'CV AUC': ['mean', 'std'],
        'duration[s]': ['mean', 'std'],
    })
    df.columns = ['%s_%s' % (a, b) for a, b in df.columns]
    df = df.reset_index()
    print(df)

    # plot
    fracs = sorted(df['model_train_frac'].unique())
    plt.figure(figsize=(12, 5))
    for j, frac in enumerate(fracs):
        for i, n_trials in enumerate([1, 10, 20, 30]):
            idx = (df['model'] == 'auto') & \
                  (df['n_trials'] == n_trials) & \
                  (df['model_train_frac'] == frac)
            x = df.loc[idx, 'duration[s]_mean']
            y = df.loc[idx, 'CV AUC_mean']
            xerr = df.loc[idx, 'duration[s]_std']
            yerr = df.loc[idx, 'CV AUC_std']
            fmt = '%sC%d' % (['x', 'o', 's', 'D'][i], j)
            label = 'n_trials=%d, model_train_frac=%.2f' % (n_trials, frac)  # noqa
            plt.errorbar(x, y, xerr=xerr, yerr=yerr, fmt=fmt, label=label)
    plt.title('Parameter Comparison (dataset=%s)' % (name))
    plt.xlabel('Training Time[s]')
    plt.ylabel('CV AUC')
    plt.legend()
    plt.savefig(str(data_dir / 'frac-and-n_trials.png'))
Example 14: main

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', required=True)
    args = parser.parse_args()
    data_dir = Path(args.input)
    path = str(data_dir / '*.csv')
    df = dd.read_csv(path).compute()
    df['model'] = df['model'].map(const.models)
    df = df.sort_values(['dataset', 'model']).reset_index(drop=True)
    df = df.groupby(['model', 'dataset']).agg({
        'CV AUC': ['mean', 'std'],
        'duration[s]': ['mean', 'std'],
    })
    df.columns = ['%s_%s' % (a, b) for a, b in df.columns]
    df = df.reset_index()
    df['model'] = df['model'].apply(lambda d: d[1])
    print(df)

    # plot
    plt.figure(figsize=(8, 6))
    for i, (_, model) in enumerate(const.models.values()):
        for j, dset in enumerate(['airline', 'amazon', 'bank']):
            idx = (df['model'] == model) & \
                  (df['dataset'] == dset)
            x = df.loc[idx, 'duration[s]_mean']
            y = df.loc[idx, 'CV AUC_mean']
            xerr = df.loc[idx, 'duration[s]_std']
            yerr = df.loc[idx, 'CV AUC_std']
            fmt = '%sC%d' % (['o', 's', 'D', '^'][j], i)
            label = 'model=%s, dataset=%s' % (model, dset)  # noqa
            plt.errorbar(x, y, xerr=xerr, yerr=yerr, fmt=fmt, label=label)
    plt.title('Model Comparison')
    plt.xlabel('Training Time[s]')
    plt.ylabel('CV AUC')
    plt.legend(loc='lower right')
    plt.savefig(data_dir / 'model_and_task.png')
Example 15: main

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', required=True)
    args = parser.parse_args()
    path = str(Path(args.input) / '*.csv')
    df = dd.read_csv(
        path,
        dtype={
            'n_trials': 'float64',
        },
    ).compute()
    df = df.groupby(['model', 'dataset']).agg({
        'duration[s]': ['mean', 'std'],
        'CV AUC': ['mean', 'std'],
    })
    dummy = pd.DataFrame(index=[
        ('xgb', 'avazu'),
        ('lgb', 'avazu'),
    ])
    df = df.append(dummy)
    df.columns = ['%s_%s' % (a, b) for a, b in df.columns]
    df = df.reset_index()
    for c in ['duration[s]', 'CV AUC']:
        mean = '%s_mean' % (c)
        std = '%s_std' % (c)
        df[std] = df[std].apply(lambda d: '±%.3f' % (d))
        df[mean] = df[mean].apply(lambda d: '%.3f' % (d))
        df[c] = (df[mean] + df[std]).apply(_handle_nan)
    df['dataset'] = df['dataset'].map(const.competitions)
    df['model'] = df['model'].map(const.models)
    df = df[['dataset', 'model', 'duration[s]', 'CV AUC']]
    df = df.sort_values(['dataset', 'model'])
    df = df.reset_index(drop=True)
    df['model'] = df['model'].apply(lambda d: d[1])
    for dset, grp in df.groupby('dataset'):
        grp.pop('dataset')
        md = tabulate(grp.values, grp.columns, tablefmt='pipe', floatfmt='.3f')
        print('#### %s\n' % (dset))
        print(md + '\n')