本文整理汇总了Python中pandas.read_hdf方法的典型用法代码示例。如果您正苦于以下问题：Python pandas.read_hdf方法的具体用法？Python pandas.read_hdf怎么用？Python pandas.read_hdf使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块pandas的用法示例。
在下文中一共展示了pandas.read_hdf方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: iMain
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_hdf [as 别名]
def iMain():
    """
    Read an hdf file generated by us to make sure
    we can recover its content and structure.
    Give the name of an hdf5 file as a command-line argument.

    Prints the store's groups, the table stored under
    '/recipe/servings/mSignals', and the 'sUrl' entry of the first
    metadata item attached to the '/recipe' node.
    """
    # sys.argv always contains the script name, so it is never empty;
    # check that a real argument was supplied and show the usage text if not.
    assert len(sys.argv) > 1, iMain.__doc__
    sFile = sys.argv[1]
    assert os.path.isfile(sFile)
    oHdfStore = pandas.HDFStore(sFile, mode='r')
    try:
        print(oHdfStore.groups())
        # NOTE: the original author reported that
        # pandas.read_hdf(oHdfStore, '/servings/signals') gave no return
        # value here, so the table is fetched with select() instead.
        mSignals = oHdfStore.select('/recipe/servings/mSignals', auto_close=False)
        print(mSignals)
        print(oHdfStore.get_node('/recipe')._v_attrs.metadata[0]['sUrl'])
    finally:
        # auto_close=False keeps the store open across select(); release it
        # explicitly so the file handle is not leaked.
        oHdfStore.close()
示例2: test_write_fspath_hdf5
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_hdf [as 别名]
def test_write_fspath_hdf5(self):
    """Writing through a CustomFSPath must store the same frame as a plain path.

    HDF5 files are not necessarily byte-for-byte identical for a given
    dataframe, so both files are read back and the frames compared.
    """
    pytest.importorskip('tables')
    frame = pd.DataFrame({"A": [1, 2]})

    with tm.ensure_clean('string') as string, tm.ensure_clean('fspath') as fspath:
        # Write once via the path-like wrapper and once via the raw string.
        frame.to_hdf(CustomFSPath(fspath), key='bar')
        frame.to_hdf(string, key='bar')

        from_fspath = pd.read_hdf(fspath, key='bar')
        from_string = pd.read_hdf(string, key='bar')

    tm.assert_frame_equal(from_fspath, from_string)
示例3: load_hdf5_data
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_hdf [as 别名]
def load_hdf5_data(file_path, **kwargs):
    """Load one dataset from an HDF5 file.

    Keyword arguments:
        key: name of the dataset to read (default ``None``).
        pandas_format: when true (the default) the file is read with
            ``pd.read_hdf``; otherwise it is opened with ``h5py`` and
            ``f[key][()]`` is returned.
        mode: file open mode (default ``'r'``).

    Returns:
        The loaded data.

    Raises:
        exceptions.FileLoadError: if the key is missing or any other error
            occurs while reading.
    """
    key = kwargs.get('key', None)
    pandas_format = kwargs.get('pandas_format', True)
    mode = kwargs.get('mode', 'r')
    logger.info("Opening HDF5 file {} to read...".format(file_path))
    try:
        if pandas_format:
            data = pd.read_hdf(file_path, key=key, mode=mode)
        else:
            with h5py.File(file_path, mode) as f:
                data = f[key][()]
    except KeyError:
        # Report the key that was actually requested; the previous code
        # formatted an undefined name and died with NameError here.
        logger.exception("Dataset {} does not exist".format(key))
        raise exceptions.FileLoadError("Dataset does not exist")
    except Exception as e:
        logger.exception("Problem loading dataset: {0}".format(e))
        raise exceptions.FileLoadError
    logger.info("Successfully loaded HDF5 data")
    return data
示例4: test_write_data_frame
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_hdf [as 别名]
def test_write_data_frame(hdf_file_path):
    """Round-trip a table through ``hdf._write_pandas_data`` and ``pd.read_hdf``.

    Checks both an unfiltered read and a read restricted to ``draw == 0``.
    """
    entity_key = hdf.EntityKey('cause.test.prevalence')
    value_spec = [lambda *args, **kwargs: random.choice([0, 1]), "Kenya", 1]
    table = build_table(value_spec, 2005, 2010,
                        columns=('age', 'year', 'sex', 'draw', 'location', 'value'))

    # Everything except the value column becomes the index.
    index_columns = list(table.columns.difference({'value'}))
    table = table.set_index(index_columns)

    hdf._write_pandas_data(hdf_file_path, entity_key, table)

    round_tripped = pd.read_hdf(hdf_file_path, entity_key.path)
    assert round_tripped.equals(table)

    only_draw_zero = pd.read_hdf(hdf_file_path, entity_key.path, where=['draw == 0'])
    assert only_draw_zero.equals(table.xs(0, level='draw', drop_level=False))
示例5: participation_to_list
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_hdf [as 别名]
def participation_to_list(h5in, outlist):
    """Append a dict of trade-volume participation percentages to *outlist*.

    Reads the 'trades' table from *h5in*, derives ``trader_id`` as the part of
    ``resting_order_id`` before the first underscore, sums ``quantity`` per
    trader and expresses each sum as a percentage of the total.

    The appended dict holds 'MCRun', 'MM_Participation' (the value for trader
    'm0') and, when trader 'j0' appears in the data, 'PJ_Participation'.

    NOTE(review): 'MCRun' is taken from a name ``j`` that is not defined in
    this function - presumably a module-level run counter; confirm at the
    call site.
    """
    trade_df = pd.read_hdf(h5in, 'trades')
    trade_df = trade_df.assign(trader_id=trade_df.resting_order_id.str.split('_').str[0])

    ltsum_df = pd.DataFrame(trade_df.groupby(['trader_id']).quantity.sum())
    ltsum_df.rename(columns={'quantity': 'trade_vol'}, inplace=True)
    ltsum_df = ltsum_df.assign(Participation=100*ltsum_df.trade_vol/ltsum_df.trade_vol.sum())

    providers = ltsum_df.index.unique()
    market_makers = [x for x in providers if x.startswith('m')]
    market_makers.append('j0')
    # DataFrame.ix no longer exists in current pandas; reindex keeps the old
    # label-based behaviour, including an all-NaN row when 'j0' is absent.
    ltsum_df = ltsum_df.reindex(market_makers)

    part_dict = {'MCRun': j, 'MM_Participation': ltsum_df.loc['m0', 'Participation']}
    if 'j0' in providers:
        part_dict.update({'PJ_Participation': ltsum_df.loc['j0', 'Participation']})
    outlist.append(part_dict)
示例6: profit_to_list
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_hdf [as 别名]
def profit_to_list(h5in, outlist):
    """Append a per-trader net cash flow table to *outlist*.

    Reads the 'trades' table from *h5in* and derives ``trader_id`` as the part
    of ``resting_order_id`` before the first underscore. Buy and sell trades
    are accumulated separately per trader (running volume and running cash
    flow, with sell cash flow negated), then joined on rows where a trader's
    cumulative buy volume equals the cumulative sell volume. The last such
    row per trader gives 'NetCashFlow' and 'NetCFPerShare'
    (NetCashFlow / BuyVol); a DataFrame with those two columns, indexed by
    trader_id, is appended.
    """
    trade_df = pd.read_hdf(h5in, 'trades')
    trade_df = trade_df.assign(trader_id=trade_df.resting_order_id.str.split('_').str[0])

    buy_trades = trade_df[trade_df.side == 'buy']
    buy_trades = buy_trades.assign(BuyCashFlow=buy_trades.price*buy_trades.quantity)
    buy_trades = buy_trades.assign(
        BuyVol=buy_trades.groupby('trader_id').quantity.cumsum(),
        CumulBuyCF=buy_trades.groupby('trader_id').BuyCashFlow.cumsum(),
    )
    buy_trades.rename(columns={'timestamp': 'buytimestamp'}, inplace=True)

    sell_trades = trade_df[trade_df.side == 'sell']
    sell_trades = sell_trades.assign(SellCashFlow=-sell_trades.price*sell_trades.quantity)
    sell_trades = sell_trades.assign(
        SellVol=sell_trades.groupby('trader_id').quantity.cumsum(),
        CumulSellCF=sell_trades.groupby('trader_id').SellCashFlow.cumsum(),
    )
    sell_trades.rename(columns={'timestamp': 'selltimestamp'}, inplace=True)

    buy_trades = buy_trades[['trader_id', 'BuyVol', 'CumulBuyCF', 'buytimestamp']]
    sell_trades = sell_trades[['trader_id', 'SellVol', 'CumulSellCF', 'selltimestamp']]

    # Keep only the points where a trader's bought and sold volumes match.
    cash_flow = pd.merge(buy_trades, sell_trades,
                         left_on=['trader_id', 'BuyVol'],
                         right_on=['trader_id', 'SellVol'])
    cash_flow = cash_flow.assign(NetCashFlow=cash_flow.CumulBuyCF + cash_flow.CumulSellCF)

    # Select several columns with a list: the bare-tuple form
    # groupby(...)['a', 'b'] is rejected by current pandas.
    temp_df = cash_flow.groupby('trader_id')[['NetCashFlow', 'BuyVol']].last()
    temp_df = temp_df.assign(NetCFPerShare=temp_df.NetCashFlow/temp_df.BuyVol)
    temp_df = temp_df[['NetCashFlow', 'NetCFPerShare']]
    outlist.append(temp_df)
示例7: tradesrets_to_list
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_hdf [as 别名]
def tradesrets_to_list(h5in, outlist):
    """Append a dict of trade-return summary statistics to *outlist*.

    Reads the 'trades' table from *h5in*, computes percentage price returns
    and their absolute values, and the lag-1..50 autocorrelation of each.
    The 'ClusteringConstant' entry is the absolute ratio of the summed
    absolute-return autocorrelations to the summed return autocorrelations.

    NOTE(review): 'MCRun' comes from a name ``j`` that is not defined in this
    function - presumably a module-level run counter; confirm at the caller.
    """
    trades_df = pd.read_hdf(h5in, 'trades')
    n_trades = trades_df.price.count()
    lowest_price = trades_df.price.min()
    highest_price = trades_df.price.max()

    trades_df = trades_df.assign(ret=100*trades_df.price.pct_change())
    trades_df = trades_df.assign(abs_ret=np.abs(trades_df.ret))

    lag_values = list(range(1, 51))
    ar_df = pd.DataFrame({
        'lag': lag_values,
        'autocorrelation': [trades_df.ret.autocorr(lag=k) for k in lag_values],
        'autocorrelation_abs': [trades_df.abs_ret.autocorr(lag=k) for k in lag_values],
    })
    ar_df.set_index('lag', inplace=True)

    abs_total = ar_df.autocorrelation_abs.sum()
    raw_total = ar_df.autocorrelation.sum()
    clustering_constant = np.abs(abs_total/raw_total)

    outlist.append({
        'Trades': n_trades,
        'MinPrice': lowest_price,
        'MaxPrice': highest_price,
        'ClusteringConstant': clustering_constant,
        'MeanRet': trades_df.ret.mean(),
        'StdRet': trades_df.ret.std(),
        'SkewRet': trades_df.ret.skew(),
        'KurtosisRet': trades_df.ret.kurtosis(),
        'MCRun': j,
    })
示例8: tradesrets_to_list
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_hdf [as 别名]
def tradesrets_to_list(h5in, outlist):
    """Summarise the returns of the 'trades' table in *h5in* into *outlist*.

    One dict is appended with the trade count, min/max price, mean, standard
    deviation, skew and kurtosis of the percentage returns, and a
    'ClusteringConstant': the absolute ratio between the summed lag-1..50
    autocorrelations of absolute returns and of plain returns.

    NOTE(review): the 'MCRun' value is read from ``j``, which this function
    does not define - presumably a module-level run index; verify.
    """
    frame = pd.read_hdf(h5in, 'trades')
    prices = frame.price
    trade_count = prices.count()
    price_floor = prices.min()
    price_ceiling = prices.max()

    frame = frame.assign(ret=100*frame.price.pct_change())
    frame = frame.assign(abs_ret=np.abs(frame.ret))

    # One (lag, autocorr, abs-autocorr) triple per lag, then split by column.
    triples = [(lag, frame.ret.autocorr(lag=lag), frame.abs_ret.autocorr(lag=lag))
               for lag in range(1, 51)]
    lag_col, ac_col, abs_ac_col = (list(column) for column in zip(*triples))

    ar_df = pd.DataFrame({'lag': lag_col,
                          'autocorrelation': ac_col,
                          'autocorrelation_abs': abs_ac_col})
    ar_df = ar_df.set_index('lag')
    clustering_constant = np.abs(ar_df.autocorrelation_abs.sum()/ar_df.autocorrelation.sum())

    summary = {'Trades': trade_count, 'MinPrice': price_floor, 'MaxPrice': price_ceiling,
               'ClusteringConstant': clustering_constant,
               'MeanRet': frame.ret.mean(), 'StdRet': frame.ret.std(),
               'SkewRet': frame.ret.skew(), 'KurtosisRet': frame.ret.kurtosis(),
               'MCRun': j}
    outlist.append(summary)
示例9: _load_table
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_hdf [as 别名]
def _load_table(self, src, fmt, dst=None, post=None, *args, **kwargs):
""" Load a data frame from table formats: csv, hdf5, feather """
if fmt == 'csv':
_data = pd.read_csv(src, *args, **kwargs)
elif fmt == 'feather':
_data = feather.read_dataframe(src, *args, **kwargs)
elif fmt == 'hdf5':
_data = pd.read_hdf(src, *args, **kwargs)
# Put into this batch only part of it (defined by index)
if isinstance(_data, pd.DataFrame):
_data = _data.loc[self.indices]
elif isinstance(_data, dd.DataFrame):
# dask.DataFrame.loc supports advanced indexing only with lists
_data = _data.loc[list(self.indices)].compute()
if callable(post):
_data = post(_data, src=src, fmt=fmt, dst=dst, **kwargs)
self.load(src=_data, dst=dst)
示例10: write_to_hdf5
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_hdf [as 别名]
def write_to_hdf5(self, list_buildings, locator):
    """Combine the per-building temporary HDF results into the total-demand HDF file.

    For every name in *list_buildings* the file
    ``locator.get_temporary_file('<name>T.hdf')`` is read (key 'dataset');
    the frames are concatenated in order and written to
    ``locator.get_total_demand('hdf')`` under key 'dataset'.

    Returns a tuple ``(df, monthly_data_buildings)``: the concatenated frame
    and a list with one frame per building, read from
    ``locator.get_demand_results_file(name, 'hdf')`` under the building's
    own name as key.

    Raises ValueError if *list_buildings* yields no buildings.
    """
    frames = [pd.read_hdf(locator.get_temporary_file('%sT.hdf' % (name,)), key='dataset')
              for name in list_buildings]
    if not frames:
        raise ValueError("list_buildings is empty: there are no temporary results to combine")

    # A single concat replaces repeated DataFrame.append calls, which copied
    # the accumulated frame on every iteration and are gone from current pandas.
    df = pd.concat(frames)
    df.to_hdf(locator.get_total_demand('hdf'), key='dataset')

    # read saved data of monthly values and return as totals
    monthly_data_buildings = [pd.read_hdf(locator.get_demand_results_file(building_name, 'hdf'), key=building_name)
                              for building_name in list_buildings]
    return df, monthly_data_buildings
示例11: _iter_native_dataset
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_hdf [as 别名]
def _iter_native_dataset(self, native_filters=None):
current_fname = None
for meta_tract in self._metadata:
for meta_patch in meta_tract['patches']:
tract_patch = {'tract': meta_tract['tract'], 'patch': meta_patch['patch']}
if native_filters and not native_filters.check_scalar(tract_patch):
continue
if current_fname != meta_tract['filename']:
current_fname = meta_tract['filename']
df = pd.read_hdf(os.path.join(self.base_dir, current_fname), 'df')
slice_this = slice(*meta_patch['slice'])
def native_quantity_getter(native_quantity):
# pylint: disable=W0640
# variables (df and slice_this) intentionally defined in loop
if native_quantity == '_FULL_PDF':
return df.iloc[slice_this, :self._n_pdf_bins].values
return df[native_quantity].values[slice_this]
yield native_quantity_getter
# Native quantity names in the photo-z catalog are too uninformative
# Since native quantities will become regular quantities in composite catalog,
# let us hide them all.
示例12: resample_eICU_patient
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_hdf [as 别名]
def resample_eICU_patient(pid, resample_factor_in_min, variables, upto_in_minutes):
    """
    Resample a *single* patient.

    Reads the rows of ``vitalPeriodic.h5`` whose ``patientunitstayid`` equals
    *pid* (columns: observationoffset, patientunitstayid and *variables*),
    indexes them by observation offset and takes the median over windows of
    *resample_factor_in_min* minutes.

    Returns None when the patient has no rows; otherwise the resampled frame
    with ``patientunitstayid`` renamed to ``pid`` (cast to int32) and an
    extra ``offset`` column holding the index in whole minutes.
    """
    pat_df = pd.read_hdf(paths.eICU_hdf_dir + '/vitalPeriodic.h5',
                         where='patientunitstayid = ' + str(pid),
                         columns=['observationoffset', 'patientunitstayid'] + variables,
                         mode='r')
    # sometimes it's empty
    if pat_df.empty:
        return None
    if upto_in_minutes is not None:
        # NOTE(review): this slices by the frame's stored index labels, not by
        # observationoffset (which only becomes the index below) - confirm
        # that the stored index is what the *60 cut-off is meant for.
        pat_df = pat_df.loc[0:upto_in_minutes*60]
    # convert the offset to timedeltas (necessary for resampling);
    # to_timedelta replaces the deprecated TimedeltaIndex(..., unit=...) form
    pat_df['observationoffset'] = pd.to_timedelta(pat_df['observationoffset'], unit='m')
    pat_df.set_index('observationoffset', inplace=True)
    pat_df.sort_index(inplace=True)
    # resample by time; 'min' is the non-deprecated spelling of the 'T' alias
    # (pandas ignores NA in median by default)
    pat_df_resampled = pat_df.resample(str(resample_factor_in_min) + 'min').median()
    # rename pid, cast to int
    pat_df_resampled.rename(columns={'patientunitstayid': 'pid'}, inplace=True)
    pat_df_resampled['pid'] = np.int32(pat_df_resampled['pid'])
    # get offsets in minutes from index
    pat_df_resampled['offset'] = np.int32(pat_df_resampled.index.total_seconds()/60)
    return pat_df_resampled
示例13: get_twitter_sentiment_multilabel_classification_dataset
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_hdf [as 别名]
def get_twitter_sentiment_multilabel_classification_dataset():
    """Return a (train, test) split of a 10% sample of the airline-sentiment tweets.

    The data is read from the local HDF file ``tests/twitter_sentiment.h5``.
    If that read fails for any reason, the error is printed, the CSV is
    downloaded instead and then saved to that HDF file for later calls.
    """
    file_name = os.path.join('tests', 'twitter_sentiment.h5')

    try:
        tweets = pd.read_hdf(file_name)
    except Exception as e:
        print('Error')
        print(e)
        # Fall back to the remote CSV and store it locally.
        dataset_url = 'https://raw.githubusercontent.com/ClimbsRocks/sample_datasets/master/twitter_airline_sentiment.csv'
        tweets = pd.read_csv(dataset_url, encoding='latin-1')
        tweets.to_hdf(file_name, key='df', format='fixed')

    # Grab only 10% of the dataset - runs much faster this way
    tweets = tweets.sample(frac=0.1)
    tweets['tweet_created'] = pd.to_datetime(tweets.tweet_created)

    train_part, test_part = train_test_split(tweets, test_size=0.33, random_state=42)
    return train_part, test_part
示例14: test_hdf5
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_hdf [as 别名]
def test_hdf5():
    """Check that an HDF5-backed TemporalNetwork keeps its file in sync.

    The network is created from a two-edge frame, then one edge is added and
    dropped again; after each step the HDF5 file is re-read and compared.
    """
    h5_path = './teneto_temporalnetwork.h5'
    edges = pd.DataFrame({'i': [0, 0], 'j': [1, 2], 't': [0, 1]})
    tnet = teneto.TemporalNetwork(from_df=edges, hdf5=True)

    # With hdf5=True the network attribute is expected to hold the file path.
    if not tnet.network == h5_path:
        raise AssertionError()

    after_create = pd.read_hdf(h5_path)
    if not (edges == after_create).all().all():
        raise AssertionError()

    tnet.add_edge([0, 2, 2])
    after_add = pd.read_hdf(h5_path)
    if not (after_add.iloc[2].values == [0, 2, 2]).all():
        raise AssertionError()

    tnet.drop_edge([0, 2, 2])
    after_drop = pd.read_hdf(h5_path)
    if not (edges == after_drop).all().all():
        raise AssertionError()
示例15: _hdf2csv
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_hdf [as 别名]
def _hdf2csv(path):
    """Convert ``.hdf5`` files to ``.csv`` files written next to them.

    *path* is either a directory (all ``*.hdf5`` files directly inside it are
    converted) or a glob pattern. Matches whose extension is not ``.hdf5``
    are skipped.
    """
    from glob import glob
    import pandas as pd
    from tqdm import tqdm as _tqdm
    from os.path import isdir

    pattern = path + "/*.hdf5" if isdir(path) else path
    paths = glob(pattern)

    if paths:
        import os.path

        for hdf_path in _tqdm(paths):
            base, ext = os.path.splitext(hdf_path)
            if ext != ".hdf5":
                continue
            print("Converting {}".format(hdf_path))
            locs = pd.read_hdf(hdf_path)
            print("A total of {} rows loaded".format(len(locs)))
            locs.to_csv(base + ".csv", sep=",", encoding="utf-8")
    print("Complete.")