This article collects and summarizes typical usage examples of the pyarrow.parquet module in Python. If you are wondering how to use pyarrow.parquet, or what it can do, the curated examples below may help. You can also explore further usage examples of the parent module, pyarrow.
The following presents 15 code examples of pyarrow.parquet, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
Example 1: validate_dataframe
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def validate_dataframe(df):
    # DataFrame and string_types come from pandas internals
    # (in older pandas: from pandas import DataFrame;
    #  from pandas.compat import string_types)
    if not isinstance(df, DataFrame):
        raise ValueError("to_parquet only supports IO with DataFrames")
    # must have value column names (strings only)
    if df.columns.inferred_type not in {'string', 'unicode'}:
        raise ValueError("parquet must have string column names")
    # index level names must be strings
    valid_names = all(
        isinstance(name, string_types)
        for name in df.index.names
        if name is not None
    )
    if not valid_names:
        raise ValueError("Index level names must be strings")
Example 2: __init__
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def __init__(self):
    # since pandas is a dependency of pyarrow
    # we need to import on first use
    try:
        import pyarrow
        import pyarrow.parquet
    except ImportError:
        raise ImportError(
            "pyarrow is required for parquet support\n\n"
            "you can install via conda\n"
            "conda install pyarrow -c conda-forge\n"
            "\nor via pip\n"
            "pip install -U pyarrow\n"
        )
    # LooseVersion comes from distutils.version
    if LooseVersion(pyarrow.__version__) < '0.9.0':
        raise ImportError(
            "pyarrow >= 0.9.0 is required for parquet support\n\n"
            "you can install via conda\n"
            "conda install pyarrow -c conda-forge\n"
            "\nor via pip\n"
            "pip install -U pyarrow\n"
        )
    self.api = pyarrow
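The same import-on-first-use and version-gate pattern can be reproduced standalone; a minimal sketch (note that distutils.version is deprecated on modern Python):

from distutils.version import LooseVersion

import pyarrow

# Refuse to proceed with a pyarrow too old for the features used below
if LooseVersion(pyarrow.__version__) < '0.9.0':
    raise ImportError("pyarrow >= 0.9.0 is required for parquet support")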
Example 3: write
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def write(self, df, path, compression='snappy',
          coerce_timestamps='ms', index=None, partition_cols=None,
          **kwargs):
    self.validate_dataframe(df)
    # get_filepath_or_buffer comes from pandas.io.common
    path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
    if index is None:
        from_pandas_kwargs = {}
    else:
        from_pandas_kwargs = {'preserve_index': index}
    table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
    if partition_cols is not None:
        self.api.parquet.write_to_dataset(
            table, path, compression=compression,
            coerce_timestamps=coerce_timestamps,
            partition_cols=partition_cols, **kwargs)
    else:
        self.api.parquet.write_table(
            table, path, compression=compression,
            coerce_timestamps=coerce_timestamps, **kwargs)
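When partition_cols is given, the method above delegates to pyarrow.parquet.write_to_dataset, which writes one subdirectory per distinct partition value. Calling the pyarrow API directly looks like this (a minimal sketch; the column names and output path are made up):

import pandas as pd
import pyarrow
import pyarrow.parquet

df = pd.DataFrame({"year": [2020, 2020, 2021], "value": [1.0, 2.0, 3.0]})
table = pyarrow.Table.from_pandas(df)

# Produces out/year=2020/... and out/year=2021/...
pyarrow.parquet.write_to_dataset(table, "out", partition_cols=["year"])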
Example 4: write
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def write(self, df, path, compression='snappy',
          coerce_timestamps='ms', **kwargs):
    self.validate_dataframe(df)
    if self._pyarrow_lt_070:
        self._validate_write_lt_070(df)
    path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
    if self._pyarrow_lt_060:
        table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
        self.api.parquet.write_table(
            table, path, compression=compression, **kwargs)
    else:
        table = self.api.Table.from_pandas(df)
        self.api.parquet.write_table(
            table, path, compression=compression,
            coerce_timestamps=coerce_timestamps, **kwargs)
Example 5: read
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def read(self, path, columns=None, **kwargs):
    path, _, _, should_close = get_filepath_or_buffer(path)
    if self._pyarrow_lt_070:
        result = self.api.parquet.read_pandas(path, columns=columns,
                                              **kwargs).to_pandas()
    else:
        kwargs['use_pandas_metadata'] = True
        result = self.api.parquet.read_table(path, columns=columns,
                                             **kwargs).to_pandas()
    if should_close:
        try:
            path.close()
        except:  # noqa: flake8
            pass
    return result
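The use_pandas_metadata flag set above lets read_table restore the index information stored by Table.from_pandas. Reading a column subset with pyarrow directly mirrors this (a minimal sketch; the file name is assumed):

import pyarrow.parquet

df = pyarrow.parquet.read_table(
    "example.parquet", columns=["value"], use_pandas_metadata=True
).to_pandas()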
Example 6: to_parquet
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
    """
    Write a DataFrame to the parquet format.

    Parameters
    ----------
    df : DataFrame
    path : string
        File path
    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet library to use. If 'auto', then the option
        ``io.parquet.engine`` is used. The default ``io.parquet.engine``
        behavior is to try 'pyarrow', falling back to 'fastparquet' if
        'pyarrow' is unavailable.
    compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
        Name of the compression to use. Use ``None`` for no compression.
    kwargs
        Additional keyword arguments passed to the engine
    """
    impl = get_engine(engine)
    return impl.write(df, path, compression=compression, **kwargs)
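In user code this path is normally reached through the DataFrame method rather than the module-level function; a minimal usage sketch (the file name is assumed):

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
df.to_parquet("example.parquet", engine="pyarrow", compression="snappy")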
Example 7: write
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def write(self, df, path, compression='snappy',
          coerce_timestamps='ms', **kwargs):
    self.validate_dataframe(df)
    if self._pyarrow_lt_070:
        self._validate_write_lt_070(df)
    path, _, _ = get_filepath_or_buffer(path)
    if self._pyarrow_lt_060:
        table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
        self.api.parquet.write_table(
            table, path, compression=compression, **kwargs)
    else:
        table = self.api.Table.from_pandas(df)
        self.api.parquet.write_table(
            table, path, compression=compression,
            coerce_timestamps=coerce_timestamps, **kwargs)
Example 8: to_parquet
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
    """
    Write a DataFrame to the parquet format.

    Parameters
    ----------
    df : DataFrame
    path : string
        File path
    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet library to use. If 'auto', then the option
        ``io.parquet.engine`` is used; the first of the listed
        libraries that is installed is used.
    compression : str, optional, default 'snappy'
        Compression method; one of {'gzip', 'snappy', 'brotli'}
    kwargs
        Additional keyword arguments passed to the engine
    """
    impl = get_engine(engine)
    return impl.write(df, path, compression=compression, **kwargs)
Example 9: parquet_file
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
# In the original source this generator is presumably wrapped with
# @contextlib.contextmanager (implied by the yield and the ContextManager
# return annotation); tempfile_context is a project-local helper that
# yields a temporary file path.
def parquet_file(
    table: Union[Dict[str, List[Any]], pyarrow.Table],
    dir: Optional[pathlib.Path] = None,
) -> ContextManager[pathlib.Path]:
    """
    Yield a filename with `table` written to a Parquet file.
    """
    if isinstance(table, dict):
        table = pyarrow.table(table)
    with tempfile_context(dir=dir) as parquet_path:
        pyarrow.parquet.write_table(
            table,
            parquet_path,
            version="2.0",
            compression="SNAPPY",
            use_dictionary=[
                name.encode("utf-8")
                for name, column in zip(table.column_names, table.columns)
                if pyarrow.types.is_dictionary(column.type)
            ],
        )
        yield parquet_path
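Because parquet_file yields from inside tempfile_context, it is used as a context manager and the temporary file disappears when the block exits; a hypothetical usage sketch:

import pyarrow.parquet

with parquet_file({"a": [1, 2, 3]}) as path:
    print(pyarrow.parquet.read_table(path).to_pandas())
# the temporary Parquet file has been removed here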
Example 10: get_engine
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def get_engine(engine):
    """ return our implementation """
    if engine == 'auto':
        engine = get_option('io.parquet.engine')
    if engine == 'auto':
        # try engines in this order
        try:
            return PyArrowImpl()
        except ImportError:
            pass
        try:
            return FastParquetImpl()
        except ImportError:
            pass
    # note: if neither engine imports, 'auto' falls through to the
    # ValueError below; Example 14 adds an explicit ImportError instead
    if engine not in ['pyarrow', 'fastparquet']:
        raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")
    if engine == 'pyarrow':
        return PyArrowImpl()
    elif engine == 'fastparquet':
        return FastParquetImpl()
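The 'auto' branch above consults pandas' option machinery; forcing a specific engine is a one-liner using pandas' public options API (a sketch):

import pandas as pd

# Make 'auto' resolve to pyarrow for subsequent to_parquet/read_parquet calls
pd.set_option("io.parquet.engine", "pyarrow")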
Example 11: __init__
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def __init__(self):
    # since pandas is a dependency of pyarrow
    # we need to import on first use
    try:
        import pyarrow
        import pyarrow.parquet
    except ImportError:
        raise ImportError("pyarrow is required for parquet support\n\n"
                          "you can install via conda\n"
                          "conda install pyarrow -c conda-forge\n"
                          "\nor via pip\n"
                          "pip install -U pyarrow\n")
    if LooseVersion(pyarrow.__version__) < '0.4.1':
        raise ImportError("pyarrow >= 0.4.1 is required for parquet "
                          "support\n\n"
                          "you can install via conda\n"
                          "conda install pyarrow -c conda-forge\n"
                          "\nor via pip\n"
                          "pip install -U pyarrow\n")
    self._pyarrow_lt_050 = LooseVersion(pyarrow.__version__) < '0.5.0'
    self._pyarrow_lt_060 = LooseVersion(pyarrow.__version__) < '0.6.0'
    self.api = pyarrow
Example 12: read_parquet
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def read_parquet(path, engine='auto', **kwargs):
    """
    Load a parquet object from the file path, returning a DataFrame.

    .. versionadded:: 0.21.0

    Parameters
    ----------
    path : string
        File path
    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet reader library to use. If 'auto', then the option
        ``io.parquet.engine`` is used; the first of the listed
        libraries that is installed is used.
    kwargs
        Additional keyword arguments passed to the engine

    Returns
    -------
    DataFrame
    """
    impl = get_engine(engine)
    # pass the keyword arguments through to the engine, as documented
    return impl.read(path, **kwargs)
Example 13: df_from_bytes_parquet_
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def df_from_bytes_parquet_(bytes_: bytes) -> pd.DataFrame:
    """
    Since pyabc 0.9.14, pandas DataFrames are converted using
    pyarrow parquet. If the conversion to a DataFrame fails,
    `df_from_bytes_msgpack_`, the formerly used method, is tried;
    this is particularly useful for databases that still employ the
    old format. If errors occur here, it may be necessary to use a
    pandas version prior to 0.25.0.
    """
    try:
        b = BytesIO(bytes_)  # io.BytesIO
        table = parquet.read_table(b)
        df = table.to_pandas()
    except pyarrow.lib.ArrowIOError:
        df = df_from_bytes_msgpack_(bytes_)
    return df
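The reverse direction (DataFrame to parquet bytes) writes through an in-memory buffer the same way; a sketch of such a helper (the name df_to_bytes_parquet_ mirrors the reader above and is an assumption here):

from io import BytesIO

import pyarrow
from pyarrow import parquet


def df_to_bytes_parquet_(df) -> bytes:
    # Hypothetical counterpart to df_from_bytes_parquet_ above
    b = BytesIO()
    table = pyarrow.Table.from_pandas(df)
    parquet.write_table(table, b)
    return b.getvalue()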
Example 14: get_engine
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def get_engine(engine):
    """ return our implementation """
    if engine == 'auto':
        engine = get_option('io.parquet.engine')
    if engine == 'auto':
        # try engines in this order
        try:
            return PyArrowImpl()
        except ImportError:
            pass
        try:
            return FastParquetImpl()
        except ImportError:
            pass
        raise ImportError("Unable to find a usable engine; "
                          "tried using: 'pyarrow', 'fastparquet'.\n"
                          "pyarrow or fastparquet is required for parquet "
                          "support")
    if engine not in ['pyarrow', 'fastparquet']:
        raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")
    if engine == 'pyarrow':
        return PyArrowImpl()
    elif engine == 'fastparquet':
        return FastParquetImpl()
Example 15: read
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def read(self, path, columns=None, **kwargs):
    path, _, _, should_close = get_filepath_or_buffer(path)
    kwargs['use_pandas_metadata'] = True
    result = self.api.parquet.read_table(path, columns=columns,
                                         **kwargs).to_pandas()
    if should_close:
        try:
            path.close()
        except:  # noqa: flake8
            pass
    return result
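At the user level this reader is reached through pandas.read_parquet; a minimal usage sketch (the file name is assumed):

import pandas as pd

df = pd.read_parquet("example.parquet", engine="pyarrow", columns=["a"])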