本文整理汇总了Python中pandas.read_parquet方法的典型用法代码示例。如果您正苦于以下问题：Python pandas.read_parquet方法的具体用法？Python pandas.read_parquet怎么用？Python pandas.read_parquet使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块pandas的用法示例。
在下文中一共展示了pandas.read_parquet方法的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: get_split
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_parquet [as 别名]
def get_split(self, k):
    """Return the stored train/validation split for fold ``k``.

    The row labels of each partition are loaded from the ``.npy`` files
    ``fold_<k>_train_indices.npy`` and ``fold_<k>_validation_indices.npy``
    inside the ``folds`` sub-directory of ``self._results_path``. The
    features and the ``"target"`` column are read from the parquet files at
    ``self._X_train_path`` and ``self._y_train_path`` and selected with
    ``.loc``.

    Args:
        k: Fold number used to build the index file names.

    Returns:
        A ``(train, validation)`` tuple of dicts, each with keys ``"X"``
        and ``"y"``.
    """

    def _load_fold_file(file_name):
        # Every index file lives under <results_path>/folds/.
        return np.load(os.path.join(self._results_path, "folds", file_name))

    train_rows = _load_fold_file(f"fold_{k}_train_indices.npy")
    validation_rows = _load_fold_file(f"fold_{k}_validation_indices.npy")

    features = pd.read_parquet(self._X_train_path)
    target = pd.read_parquet(self._y_train_path)["target"]

    train_part = {"X": features.loc[train_rows], "y": target.loc[train_rows]}
    validation_part = {
        "X": features.loc[validation_rows],
        "y": target.loc[validation_rows],
    }
    return (train_part, validation_part)
示例2: get_split
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_parquet [as 别名]
def get_split(self, k=0):
    """Split the training data into a single train/validation pair.

    Features are read from the parquet file at ``self._X_train_path`` and
    the ``"target"`` column from the parquet file at ``self._y_train_path``;
    both are handed to ``train_test_split``.

    Args:
        k: Not read by this method (presumably kept so the signature
            matches fold-based splitters -- confirm against callers).

    Returns:
        A ``(train, validation)`` tuple of dicts, each with keys ``"X"``
        and ``"y"``.
    """
    X = pd.read_parquet(self._X_train_path)
    y = pd.read_parquet(self._y_train_path)["target"]

    # Stratification only makes sense together with shuffling; scikit-learn
    # requires ``stratify`` to be None when ``shuffle`` is off. Truthiness is
    # used instead of ``== False`` so that other falsy values of
    # ``self.shuffle`` (e.g. None) also disable stratification.
    stratify = y if (self.stratify and self.shuffle) else None

    X_train, X_validation, y_train, y_validation = train_test_split(
        X,
        y,
        train_size=self.train_ratio,
        test_size=1.0 - self.train_ratio,
        shuffle=self.shuffle,
        stratify=stratify,
        random_state=self.random_seed,
    )
    return {"X": X_train, "y": y_train}, {"X": X_validation, "y": y_validation}
示例3: test_dataframe_parquet_materialization
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_parquet [as 别名]
def test_dataframe_parquet_materialization():
    """Check that a DataFrame output configured with a parquet path is written.

    A one-solid pipeline returns a small frame; the run config asks for the
    ``result`` output to be stored as parquet at a temporary path, and the
    file is then read back with ``pd.read_parquet`` and compared.
    """
    check_parquet_support()

    expected_columns = {'num1': [1, 3], 'num2': [2, 4]}

    @solid(output_defs=[OutputDefinition(DataFrame)])
    def return_df(_context):
        return pd.DataFrame(expected_columns)

    @pipeline
    def return_df_pipeline():
        return_df()

    with get_temp_file_name() as filename:
        parquet_output = {'result': {'parquet': {'path': filename}}}
        run_config = {'solids': {'return_df': {'outputs': [parquet_output]}}}

        result = execute_pipeline(return_df_pipeline, run_config)
        assert result.success

        round_tripped = pd.read_parquet(filename)
        assert round_tripped.to_dict('list') == expected_columns
示例4: read_df_detect
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_parquet [as 别名]
def read_df_detect(path):
    """
    Read a Pandas data frame, auto-detecting the file format based on filename suffix.
    The suffix is matched case-insensitively.  The following file types are supported:

    CSV
        File has suffix ``.csv``, read with :py:func:`pandas.read_csv`.
    Parquet
        File has suffix ``.parquet``, ``.parq``, or ``.pq``, read with
        :py:func:`pandas.read_parquet`.

    Args:
        path: the file to read, as a string or :py:class:`pathlib.Path`.

    Returns:
        The data frame loaded from ``path``.

    Raises:
        ValueError: if the suffix is not one of the supported ones.
    """
    import pandas as pd

    if not isinstance(path, pathlib.Path):
        path = pathlib.Path(path)

    # Compare in lower case so that e.g. ``DATA.CSV`` is recognised too.
    suffix = path.suffix.lower()
    if suffix == '.csv':
        return pd.read_csv(path)
    if suffix in ('.parquet', '.parq', '.pq'):
        return pd.read_parquet(path)

    # Previously the function fell off the end and returned None here, which
    # only surfaced later as an unrelated AttributeError in the caller.
    raise ValueError(f"unsupported file suffix {path.suffix!r} for {path}")
示例5: test_parquet_write
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_parquet [as 别名]
def test_parquet_write(self):
    """Round-trip ``self.test_pdf`` through partitioned parquet writes.

    The frame is written with ``to_parquet`` in overwrite mode, first
    partitioned by one column and then by two, and read back with
    ``ks.read_parquet`` each time.
    """
    with self.temp_dir() as tmp:
        expected = ks.DataFrame(self.test_pdf)

        # First pass partitions by one column, second pass by two columns.
        for partition_cols in ("i32", ["i32", "bhello"]):
            expected.to_parquet(tmp, mode="overwrite", partition_cols=partition_cols)

            # Reset column order, as once the data is written out, Spark
            # rearranges partition columns to appear first.
            actual = ks.read_parquet(tmp)[self.test_column_order]

            actual_sorted = actual.sort_values(by="f").to_spark().toPandas()
            expected_sorted = expected.sort_values(by="f").to_spark().toPandas()
            self.assert_eq(actual_sorted, expected_sorted)
示例6: test_sort_parallel
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_parquet [as 别名]
def test_sort_parallel(self):
    """Sort a column read from ``kde.parquet`` through the jitted function.

    ``sdc.hiframes.sort.MIN_SAMPLES`` is temporarily set to 10 for the run
    and restored afterwards; the returned values must be non-decreasing.
    """
    # create `kde.parquet` file
    ParquetGenerator.gen_kde_pq()

    # TODO: better parallel sort test
    # The body of test_impl is left untouched: the jit ``locals`` key below
    # refers to its local variable ``res`` by name.
    def test_impl():
        df = pd.read_parquet('kde.parquet')
        df['A'] = df.points.astype(np.float64)
        df.sort_values('points', inplace=True)
        res = df.A.values
        return res

    compiled_sort = self.jit(locals={'res:return': 'distributed'})(test_impl)

    original_min_samples = sdc.hiframes.sort.MIN_SAMPLES
    try:
        sdc.hiframes.sort.MIN_SAMPLES = 10
        sorted_values = compiled_sort()
        is_non_decreasing = (np.diff(sorted_values) >= 0).all()
        self.assertTrue(is_non_decreasing)
    finally:
        # restore global val
        sdc.hiframes.sort.MIN_SAMPLES = original_min_samples
示例7: make_subgen
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_parquet [as 别名]
def make_subgen(self, chunk):
    """Build a ``SequenceForecastBatchGenerator`` for one parquet chunk.

    Args:
        chunk: Passed to ``pd.read_parquet``; the resulting frame has its
            index reset before being handed to the generator.

    Returns:
        The generator, configured with this object's batch, sequence,
        forecast and time settings.
    """
    chunk_frame = pd.read_parquet(chunk).reset_index()

    # These settings are forwarded unchanged under the same keyword names.
    forwarded_settings = (
        "batch_size",
        "sequence_length",
        "id_column",
        "sequence_columns",
        "sequence_prefix",
        "last_step_columns",
        "last_step_prefix",
        "forecast_steps_min",
        "forecast_steps_max",
        "batch_offset",
        "batch_offset_period",
        "dt_column",
        "start_time",
    )
    generator_kwargs = {name: getattr(self, name) for name in forwarded_settings}

    return single_sequence.SequenceForecastBatchGenerator(
        df=chunk_frame, **generator_kwargs
    )
示例8: read_parquet
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_parquet [as 别名]
def read_parquet(cls, path, engine, columns, **kwargs):
    """Load a parquet object with pandas and wrap it via ``cls.from_pandas``.

    ``ErrorMessage.default_to_pandas`` is called for ``read_parquet`` before
    the read, i.e. this implementation defers to pandas.

    Args:
        path: Forwarded to ``pandas.read_parquet`` as the first positional
            argument.
        engine: Forwarded as the second positional argument.
        columns: Forwarded as the third positional argument.
        kwargs: Forwarded unchanged to ``pandas.read_parquet``.

    Returns:
        Whatever ``cls.from_pandas`` returns for the frame that was read.
    """
    ErrorMessage.default_to_pandas("`read_parquet`")
    pandas_frame = pandas.read_parquet(path, engine, columns, **kwargs)
    return cls.from_pandas(pandas_frame)
示例9: parse
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_parquet [as 别名]
def parse(fname, **kwargs):
    """Read a parquet file and, if requested, split it for a partitioned reader.

    Args:
        fname: Passed as the first argument to ``pandas.read_parquet``.
        **kwargs: Forwarded to ``pandas.read_parquet``. ``num_splits`` is
            removed before forwarding; ``columns`` (optional) is also used
            to select columns after reading.

    Returns:
        When ``num_splits`` is absent or None, the DataFrame from
        ``pandas.read_parquet``. Otherwise the list returned by
        ``_split_result_for_readers(0, num_splits, df)`` with two extra
        items appended: the index (or just its length when it is a
        ``RangeIndex``) and ``df.dtypes``.
    """
    num_splits = kwargs.pop("num_splits", None)
    columns = kwargs.get("columns", None)
    if num_splits is None:
        return pandas.read_parquet(fname, **kwargs)

    kwargs["use_pandas_metadata"] = True
    df = pandas.read_parquet(fname, **kwargs)

    if isinstance(df.index, pandas.RangeIndex):
        idx = len(df.index)
    else:
        idx = df.index

    # Only filter when columns were actually requested. The original iterated
    # over ``columns`` before its None check, so ``columns=None`` raised
    # TypeError and the check itself could never be false.
    if columns is not None:
        # Drop requested names that ended up in the index or are absent.
        columns = [c for c in columns if c not in df.index.names and c in df.columns]
        df = df[columns]

    # Append the length of the index here to build it externally
    return _split_result_for_readers(0, num_splits, df) + [idx, df.dtypes]
示例10: test_datetime_coercion_explicitly
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_parquet [as 别名]
def test_datetime_coercion_explicitly():
    """
    Sanity check that we're using a version of pyarrow that allows us to
    truncate timestamps: a value carrying one microsecond is written with
    millisecond coercion and must read back without the microsecond.
    """
    original_dt = _datetime.datetime(
        year=2017, month=1, day=1, hour=1, minute=1, second=1, microsecond=1
    )
    truncated_dt = original_dt.replace(microsecond=0)

    frame = _pd.DataFrame.from_records([(original_dt,)], columns=['testname'])
    assert frame['testname'][0] == original_dt

    with _utils.AutoDeletingTempDir('test') as tmpdir:
        parquet_path = tmpdir.get_named_tempfile('repro.parquet')
        frame.to_parquet(
            parquet_path, coerce_timestamps='ms', allow_truncated_timestamps=True
        )
        reloaded = _pd.read_parquet(parquet_path)

    assert reloaded['testname'][0] == truncated_dt
示例11: read_as_dataframe
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_parquet [as 别名]
def read_as_dataframe(input_path: str):
    """Load a CSV/parquet file, or every such file under a directory.

    Args:
        input_path: Path to a single ``.csv`` / ``.parquet`` file, or to a
            directory that is searched recursively.

    Returns:
        For a file: the frame read from it. For a directory: the
        concatenation (with a fresh integer index) of all ``*.csv`` files
        found; if there are none, of all ``*.parquet`` files. Files are
        read in sorted path order so the row order is reproducible.

    Raises:
        ValueError: If the path is a file with another extension, or no
            matching files are found.
    """
    if os.path.isfile(input_path):
        if input_path.endswith(".csv"):
            return pd.read_csv(input_path)
        if input_path.endswith(".parquet"):
            return pd.read_parquet(input_path)
    else:
        dir_path = pathlib.Path(input_path)
        # CSV takes precedence; parquet is only considered without any CSV.
        readers = (("**/*.csv", pd.read_csv), ("**/*.parquet", pd.read_parquet))
        for pattern, reader in readers:
            # glob() yields paths in arbitrary order; sort so the
            # concatenated rows come out the same on every filesystem.
            matches = sorted(dir_path.glob(pattern))
            if matches:
                return pd.concat((reader(f) for f in matches), ignore_index=True)

    raise ValueError(f"Failed to read path: {input_path}")
示例12: load
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_parquet [as 别名]
def load(self, file):
    """Read the parquet file that ``file`` refers to.

    Args:
        file: Object with a ``name`` attribute; only that attribute is
            used, the object itself is not read from.

    Returns:
        The result of ``pd.read_parquet`` on ``file.name``.
    """
    # MEMO: the original author notes that read_parquet only supports a
    # filepath as string (not a file handle), hence the path is passed on.
    # NOTE(review): confirm whether that still holds for the pandas in use.
    parquet_path = file.name
    return pd.read_parquet(parquet_path)
示例13: add_ticker
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_parquet [as 别名]
def add_ticker(data: pd.DataFrame, ticker: str) -> pd.DataFrame:
    """
    Put the ticker on top of the columns as the first level of a MultiIndex

    The column ``numEvents`` is labelled ``num_trds`` in the new index.
    ``data`` is modified in place and the same object is returned.

    Args:
        data: raw data
        ticker: ticker

    Returns:
        pd.DataFrame

    Examples:
        >>> (
        ...     pd.read_parquet('xbbg/tests/data/sample_bdib.parq')
        ...     .pipe(add_ticker, ticker='SPY US Equity')
        ...     .pipe(get_series, col='close')
        ... )
        SPY US Equity
        2018-12-28 09:30:00-05:00 249.67
        2018-12-28 09:31:00-05:00 249.54
        2018-12-28 09:32:00-05:00 249.22
        2018-12-28 09:33:00-05:00 249.01
        2018-12-28 09:34:00-05:00 248.86
    """
    # Only the renamed labels are needed, so rename a small head() slice
    # rather than the whole frame.
    renamed_labels = data.head().rename(columns={'numEvents': 'num_trds'}).columns
    data.columns = pd.MultiIndex.from_product([[ticker], renamed_labels])
    return data
示例14: get_stored_data
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_parquet [as 别名]
def get_stored_data(
    self, func: Callable, *args: Any, **kwargs: Any
) -> Union[pd.DataFrame, pathlib.Path, io.BytesIO]:
    """Load the stored output of ``func`` called with the given arguments.

    Positional arguments are mapped onto ``func``'s parameter names, the
    keyword set is completed via ``WebvizStorage.complete_kwargs``, and the
    storage path is derived from the function and those keyword arguments.
    How the file is loaded depends on ``func``'s ``return`` annotation.

    Args:
        func: The function whose stored result is wanted; it must carry a
            ``return`` annotation.
        *args: Positional arguments of the call.
        **kwargs: Keyword arguments of the call.

    Returns:
        A DataFrame read from ``<path>.parquet``, the first path matching
        ``<path>*``, or a ``BytesIO`` with the bytes stored at ``<path>``.

    Raises:
        ValueError: If the return annotation is none of the supported types.
        OSError: If the stored file cannot be found or read.
    """
    argspec = inspect.getfullargspec(func)
    # Fold positional arguments into kwargs under their parameter names.
    kwargs.update(zip(argspec.args, args))

    WebvizStorage.complete_kwargs(func, kwargs)

    return_type = argspec.annotations["return"]
    path = self._unique_path(func, WebvizStorage._dict_to_tuples(kwargs))

    try:
        if return_type == pd.DataFrame:
            return pd.read_parquet(f"{path}.parquet")
        if return_type == pathlib.Path:
            # An empty glob result raises IndexError; handled below.
            return pathlib.Path(glob.glob(f"{path}*")[0])
        if return_type == io.BytesIO:
            return io.BytesIO(pathlib.Path(path).read_bytes())
        raise ValueError(f"Unknown return type {return_type}")
    except (OSError, IndexError) as err:
        # IndexError is included so that a missing pathlib.Path resource is
        # reported with the same message as the other return types; the
        # original cause is kept for debugging.
        raise OSError(
            f"Could not find file {path}, which should be the "
            "stored output of the function call "
            f"{WebvizStorage.string(func, kwargs)}."
        ) from err
示例15: read_df
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import read_parquet [as 别名]
def read_df(self, f: BinaryIO) -> pd.DataFrame:
    """Read the configured columns from parquet, indexed by UTC datetimes.

    Args:
        f: Source handed to ``pd.read_parquet`` (pyarrow engine).

    Returns:
        The frame restricted to ``self.time_series_columns.columns``, with
        ``self.time_series_columns.datetime_column`` as its index, converted
        by ``pd.to_datetime(..., utc=True)``.
    """
    wanted_columns = self.time_series_columns.columns
    index_column = self.time_series_columns.datetime_column

    frame = pd.read_parquet(f, engine="pyarrow", columns=wanted_columns)
    frame = frame.set_index(index_column)
    frame.index = pd.to_datetime(frame.index, utc=True)
    return frame