Python dataframe.read_csv Method Code Examples

This article collects typical usage examples of the Python method dask.dataframe.read_csv, gathered from open-source projects. If you are wondering what dask.dataframe.read_csv does, how to call it, or what real usage looks like, the curated examples below should help. You can also explore other usage examples from the dask.dataframe module.


The 15 code examples below demonstrate dataframe.read_csv, ordered by popularity by default.
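
Before the examples, a minimal sketch of the basic pattern (the file name example.csv is hypothetical): dask.dataframe.read_csv builds a lazy, partitioned DataFrame, and no data is read until a result is materialized.

import dask.dataframe as dd

# Build a lazy, partitioned DataFrame; glob patterns like "data/*.csv" also work
df = dd.read_csv("example.csv")

print(df.head())                 # head() eagerly reads only the first partition
print(df.describe().compute())   # .compute() materializes a pandas result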

Example 1: tf_csv_dataset

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def tf_csv_dataset(csv_path, label_col, col_defaults, shuffle=False, batch_size=32):
    df = dd.read_csv(csv_path)
    # use col_defaults if specified for col, else use defaults based on col type
    type_defaults = {np.int64: 0, np.float64: 0.0, np.object_: ""}
    record_defaults = [[col_defaults.get(col_name, type_defaults.get(col_type.type, ""))]
                       for col_name, col_type in df.dtypes.items()]

    def parse_csv(value):
        columns = tf.decode_csv(value, record_defaults)
        features = dict(zip(df.columns.tolist(), columns))
        label = features[label_col]
        return features, label

    # read, parse, shuffle and batch dataset
    dataset = tf.data.TextLineDataset(csv_path).skip(1)  # skip header
    if shuffle:
        dataset = dataset.shuffle(buffer_size=1024)
    dataset = dataset.map(parse_csv, num_parallel_calls=8)
    dataset = dataset.batch(batch_size)
    return dataset 
Author: yxtay, Project: recommender-tensorflow, Lines: 22, Source: tf_utils.py
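
A hypothetical invocation of the function above (file and column names invented; this assumes TensorFlow 1.x, where tf.decode_csv is a top-level function):

dataset = tf_csv_dataset("ratings.csv", label_col="rating",
                         col_defaults={"rating": 0.0}, shuffle=True, batch_size=64)
features, label = dataset.make_one_shot_iterator().get_next()  # TF 1.x iterator API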

Example 2: read_feature_file

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def read_feature_file(args):

    fname, feature_file_format = os.path.splitext(args.inputNucleiFeatureFile)

    if feature_file_format == '.csv':

        ddf = dd.read_csv(args.inputNucleiFeatureFile)

    elif feature_file_format == '.h5':

        ddf = dd.read_hdf(args.inputNucleiFeatureFile, 'Features')

    else:
        raise ValueError('Extension of output feature file must be .csv or .h5')

    return ddf 
Author: DigitalSlideArchive, Project: HistomicsTK, Lines: 18, Source: NucleiClassification.py
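
Outside the CLI wrapper that normally supplies args, the object can be faked for testing (file name hypothetical):

from argparse import Namespace

ddf = read_feature_file(Namespace(inputNucleiFeatureFile="nuclei_features.csv"))
print(ddf.head())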

Example 3: split_list

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def split_list(_list=LATEST):

    print('Please wait while scene metadata is split')
    try:
        csv = read_csv(_list, dtype={'PRODUCT_ID': object, 'COLLECTION_NUMBER': object,
                                     'COLLECTION_CATEGORY': object}, blocksize=25e6,
                       parse_dates=True)
    except EmptyDataError:
        print('Metadata has already been updated for the day.')
        return None

    csv = csv[csv.COLLECTION_NUMBER != 'PRE']

    sats = unique(csv.SPACECRAFT_ID).tolist()
    for sat in sats:
        print(sat)
        df = csv[csv.SPACECRAFT_ID == sat]
        dst = os.path.join(SCENES, sat)
        if os.path.isfile(dst):
            os.remove(dst)
        if not os.path.isdir(dst):
            os.mkdir(dst)
        df.to_parquet('{}'.format(dst))

    return None 
Author: dgketchum, Project: Landsat578, Lines: 27, Source: update_landsat_metadata.py

Example 4: kmeans_input_fn

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def kmeans_input_fn(self, name, csv_path=None):
        """Input function for kmeans

        Arguments:
                name : string, Name of the data [Train or Eval]
                csv_path : The path of the csv on any storage system

        Returns:
                A batch of features
        """
        pattern = self._get_pattern(name, csv_path)
        tf.logging.info('The Pattern of files is : %s', pattern)
        df = dd.read_csv(pattern)
        vectors = dask.compute(df.values)
        return tf.train.limit_epochs(
            tf.convert_to_tensor(vectors[0], dtype=tf.float32), num_epochs=1) 
Author: GoogleCloudPlatform, Project: professional-services, Lines: 18, Source: input_pipeline_dask.py
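
The core pattern here, independent of the estimator, is materializing a dask DataFrame into a single in-memory tensor. A standalone sketch (glob path hypothetical; only sensible when the data fits in memory):

import dask
import dask.dataframe as dd
import tensorflow as tf

df = dd.read_csv("data/train-*.csv")
(values,) = dask.compute(df.values)  # concatenate all partitions into one NumPy array
tensor = tf.convert_to_tensor(values, dtype=tf.float32)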

Example 5: test_regress_newsread

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def test_regress_newsread(client, listen_port):
    data = dd.read_csv('./system_tests/data/*.gz', compression='gzip', blocksize=None)
    dX = data.iloc[:, 1:]
    dy = data.iloc[:, 0]

    d_regress = dlgbm.LGBMRegressor(n_estimators=50, local_listen_port=listen_port)
    d_regress.fit(dX, dy)

    dy_pred = d_regress.predict(dX, client=client)

    # The dask_ml.metrics.r2_score method fails with dataframes so we compute the R2 score ourselves
    numerator = ((dy - dy_pred) ** 2).sum()
    denominator = ((dy - dy.mean()) ** 2).sum()
    r2_score = 1 - numerator / denominator
    r2_score = r2_score.compute()
    print(r2_score)

    assert r2_score > 0.8 
Author: dask, Project: dask-lightgbm, Lines: 20, Source: test_fit_predict.py

Example 6: main

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def main():
    client = Client()  # noqa

    categories = ["category_%d" % i for i in range(26)]
    columns = ["click"] + ["numeric_%d" % i for i in range(13)] + categories

    df = dd.read_csv("day_1", sep="\t", names=columns, header=None)

    encoding = {c: "bytes" for c in categories}
    fixed = {c: 8 for c in categories}
    df.to_parquet(
        "day-1-bytes.parquet",
        object_encoding=encoding,
        fixed_text=fixed,
        compression="SNAPPY",
    ) 
Author: dask, Project: dask-ml, Lines: 18, Source: make_parquet.py
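
Note that object_encoding and fixed_text are fastparquet-specific write options. Reading the output back needs nothing special (a sketch, assuming fastparquet is installed):

df2 = dd.read_parquet("day-1-bytes.parquet")
print(df2.dtypes)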

Example 7: __init__

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def __init__(self, file_path, sep=",", header="infer", block_size=10e6, random_seed=None):
        """Initializes the loader.

        Args:
            file_path (str): Path to delimited file.
            sep (str, optional): Delimiter. Defaults to ",".
            header (str, optional): Number of rows to be used as the header.
                See pandas.read_csv()
                Defaults to "infer".
            block_size (int, optional): Size of partition in bytes.
                See dask.dataframe.read_csv()
                Defaults to 10e6.
            random_seed (int, optional): Random seed. See random.seed().
                Defaults to None.
        """

        self.df = dd.read_csv(file_path, sep=sep, header=header, blocksize=block_size)

        self.random_seed = random_seed
        random.seed(random_seed) 
Author: microsoft, Project: nlp-recipes, Lines: 22, Source: data_loaders.py
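
The enclosing class is not shown above; assuming it were named CSVLoader (a hypothetical name for illustration), construction would look like:

loader = CSVLoader("reviews.tsv", sep="\t", block_size=5e6, random_seed=42)  # class name hypothetical
print(loader.df.npartitions)  # number of partitions dask derived from block_size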

Example 8: open_gssha

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def open_gssha(filename):
    """
    Reads various filetypes produced by GSSHA
    """
    # Read metadata
    ftype = filename.split('.')[-1]
    if ftype in ['fgd', 'asc']:
        f = open(filename, 'r')
        c, r, xlc, ylc, gsize, nanval = [
            t(f.readline().split(' ')[-1].split('\n')[0])
            for t in [int, int, float, float, float, float]
        ]
        xs = np.linspace(xlc+gsize/2., xlc+c*gsize-gsize/2., c+1)
        ys = np.linspace(ylc+gsize/2., ylc+r*gsize-gsize/2., r)
    else:
        header_df = pd.read_table(filename, engine='python',
                                  names=['meta_key', 'meta_val'],
                                  sep=' ', nrows=6)
        bounds = header_df.loc[:3, 'meta_val'].values.astype(float)
        r, c = header_df.loc[4:6, 'meta_val'].values.astype(int)
        xs, ys = get_sampling(bounds, (r, c))
    
    # Read data using dask
    ddf = dd.read_csv(filename, skiprows=6, header=None,
                      sep=' ')
    darr = ddf.values.compute()
        
    if ftype == 'fgd':
        darr[darr==nanval] = np.NaN
    
    return xr.DataArray(darr[::-1], coords={'x': xs, 'y': ys},
                        name='z', dims=['y', 'x']) 
Author: pyviz-topics, Project: EarthSim, Lines: 34, Source: io.py

Example 9: load_data

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def load_data(src_dir="data/ml-100k"):
    data = {item: dd.read_csv(str(Path(src_dir, conf["filename"])), sep=conf["sep"],
                              header=None, names=conf["columns"], encoding="latin-1")
            for item, conf in DATA_CONFIG.items()}

    logger.info("data loaded.")
    return data 
Author: yxtay, Project: recommender-tensorflow, Lines: 9, Source: ml_100k.py

Example 10: read_csv

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def read_csv(urlpath, *args, **kwargs):
    def rename_dask_index(df, name):
        df.index.name = name
        return df

    index_col = index_name = None

    if "index" in kwargs:
        del kwargs["index"]
    if "index_col" in kwargs:
        index_col = kwargs["index_col"]
        if isinstance(index_col, list):
            index_col = index_col[0]
        del kwargs["index_col"]
    if "index_name" in kwargs:
        index_name = kwargs["index_name"]
        del kwargs["index_name"]

    df = dd.read_csv(urlpath, *args, **kwargs)

    if index_col is not None:
        df = df.set_index(index_col)

    if index_name is not None:
        df = df.map_partitions(rename_dask_index, index_name)

    return df 
Author: ranaroussi, Project: pystore, Lines: 29, Source: utils.py
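
A usage sketch for this wrapper (path hypothetical): index_col and index_name are consumed here, while the remaining keyword arguments pass straight through to dd.read_csv.

df = read_csv("trades/*.csv", index_col="timestamp", index_name="time",
              parse_dates=["timestamp"])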

Example 11: test_classify_newsread

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def test_classify_newsread(client, listen_port):
    data = dd.read_csv('./system_tests/data/*.gz', compression='gzip', blocksize=None)
    dX = data.iloc[:, :-1]
    dy = data.iloc[:, -1]

    d_classif = dlgbm.LGBMClassifier(n_estimators=50, local_listen_port=listen_port)
    d_classif.fit(dX, dy)

    dy_pred = d_classif.predict(dX, client=client)

    acc_score = (dy == dy_pred).sum() / len(dy)
    acc_score = acc_score.compute()
    print(acc_score)

    assert acc_score > 0.8 
Author: dask, Project: dask-lightgbm, Lines: 17, Source: test_fit_predict.py

Example 12: __init__

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def __init__(self, filename, filename_info, filetype_info):
        """Make sure filepath is valid and then reads data into a Dask DataFrame.

        Args:
            filename: Filename
            filename_info: Filename information
            filetype_info: Filetype information

        """
        skip_rows = filetype_info.get('skip_rows', 15)
        columns = filetype_info['columns']
        self.file_content = dd.read_csv(filename, skiprows=skip_rows, header=None, names=columns)
        super(VIIRSActiveFiresTextFileHandler, self).__init__(filename, filename_info, filetype_info)
        self.platform_name = PLATFORM_MAP.get(self.filename_info['satellite_name'].upper(), "unknown") 
Author: pytroll, Project: satpy, Lines: 16, Source: viirs_edr_active_fires.py

Example 13: main

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', required=True)
    args = parser.parse_args()

    data_dir = Path(args.input)
    name = data_dir.parent.name
    path = str(data_dir/'*.csv')
    df = dd.read_csv(path).compute()
    df = df.groupby(['model', 'n_trials', 'model_train_frac']).agg({
        'CV AUC': ['mean', 'std'],
        'duration[s]': ['mean', 'std'],
    })
    df.columns = ['%s_%s' % (a, b) for a, b in df.columns]
    df = df.reset_index()
    print(df)

    # plot
    fracs = sorted(df['model_train_frac'].unique())
    plt.figure(figsize=(12, 5))
    for j, frac in enumerate(fracs):
        for i, n_trials in enumerate([1, 10, 20, 30]):
            idx = (df['model'] == 'auto') &\
                  (df['n_trials'] == n_trials) &\
                  (df['model_train_frac'] == frac)
            x = df.loc[idx, 'duration[s]_mean']
            y = df.loc[idx, 'CV AUC_mean']
            xerr = df.loc[idx, 'duration[s]_std']
            yerr = df.loc[idx, 'CV AUC_std']
            fmt = '%sC%d' % (['x', 'o', 's', 'D'][i], j)
            label = 'n_trials=%d, model_train_frac=%.2f' % (n_trials, frac)  # noqa
            plt.errorbar(x, y, xerr=xerr, yerr=yerr, fmt=fmt, label=label)

    plt.title('Parameter Comparison (dataset=%s)' % (name))
    plt.xlabel('Training Time[s]')
    plt.ylabel('CV AUC')
    plt.legend()
    plt.savefig(str(data_dir/'frac-and-n_trials.png')) 
Author: pfnet-research, Project: autogbt-alt, Lines: 40, Source: vis_frac_and_duration.py
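
Run it as, for example, python vis_frac_and_duration.py -i results/exp1 (directory hypothetical); dd.read_csv then globs every CSV in that directory into one frame, and .compute() collapses it to pandas before the groupby.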

Example 14: main

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', required=True)
    args = parser.parse_args()

    data_dir = Path(args.input)
    path = str(data_dir/'*.csv')
    df = dd.read_csv(path).compute()
    df['model'] = df['model'].map(const.models)
    df = df.sort_values(['dataset', 'model']).reset_index(drop=True)
    df = df.groupby(['model', 'dataset']).agg({
        'CV AUC': ['mean', 'std'],
        'duration[s]': ['mean', 'std'],
    })
    df.columns = ['%s_%s' % (a, b) for a, b in df.columns]
    df = df.reset_index()
    df['model'] = df['model'].apply(lambda d: d[1])
    print(df)

    # plot
    plt.figure(figsize=(8, 6))
    for i, (_, model) in enumerate(const.models.values()):
        for j, dset in enumerate(['airline', 'amazon', 'bank']):
            idx = (df['model'] == model) &\
                  (df['dataset'] == dset)
            x = df.loc[idx, 'duration[s]_mean']
            y = df.loc[idx, 'CV AUC_mean']
            xerr = df.loc[idx, 'duration[s]_std']
            yerr = df.loc[idx, 'CV AUC_std']
            fmt = '%sC%d' % (['o', 's', 'D', '^'][j], i)
            label = 'model=%s, dataset=%s' % (model, dset)  # noqa
            plt.errorbar(x, y, xerr=xerr, yerr=yerr, fmt=fmt, label=label)

    plt.title('Model Comparison')
    plt.xlabel('Training Time[s]')
    plt.ylabel('CV AUC')
    plt.legend(loc='lower right')
    plt.savefig(data_dir/'model_and_task.png') 
Author: pfnet-research, Project: autogbt-alt, Lines: 40, Source: vis_model_and_task.py

Example 15: main

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_csv [as alias]
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', required=True)
    args = parser.parse_args()

    path = str(Path(args.input)/'*.csv')
    df = dd.read_csv(
        path,
        dtype={
            'n_trials': 'float64',
        },
    ).compute()
    df = df.groupby(['model', 'dataset']).agg({
        'duration[s]': ['mean', 'std'],
        'CV AUC': ['mean', 'std'],
    })
    dummy = pd.DataFrame(index=[
        ('xgb', 'avazu'),
        ('lgb', 'avazu'),
    ])
    df = df.append(dummy)
    df.columns = ['%s_%s' % (a, b) for a, b in df.columns]
    df = df.reset_index()
    for c in ['duration[s]', 'CV AUC']:
        mean = '%s_mean' % (c)
        std = '%s_std' % (c)
        df[std] = df[std].apply(lambda d: '±%.3f' % (d))
        df[mean] = df[mean].apply(lambda d: '%.3f' % (d))
        df[c] = (df[mean] + df[std]).apply(_handle_nan)
    df['dataset'] = df['dataset'].map(const.competitions)
    df['model'] = df['model'].map(const.models)
    df = df[['dataset', 'model', 'duration[s]', 'CV AUC']]
    df = df.sort_values(['dataset', 'model'])
    df = df.reset_index(drop=True)
    df['model'] = df['model'].apply(lambda d: d[1])

    for dset, grp in df.groupby('dataset'):
        grp.pop('dataset')
        md = tabulate(grp.values, grp.columns, tablefmt='pipe', floatfmt='.3f')
        print('#### %s\n' % (dset))
        print(md + '\n') 
Author: pfnet-research, Project: autogbt-alt, Lines: 43, Source: print.py


Note: The dask.dataframe.read_csv examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets are taken from open-source projects contributed by their respective developers, and copyright remains with the original authors; consult each project's license before using or redistributing the code, and do not republish without permission.