This article collects typical usage examples of pyarrow.parquet in Python. If you have been wondering what pyarrow.parquet is for, how to use it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples of pyarrow, the package this module belongs to.
The following 15 code examples of pyarrow.parquet are sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
Example 1: validate_dataframe
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def validate_dataframe(df):
    if not isinstance(df, DataFrame):
        raise ValueError("to_parquet only supports IO with DataFrames")

    # must have value column names (strings only)
    if df.columns.inferred_type not in {'string', 'unicode'}:
        raise ValueError("parquet must have string column names")

    # index level names must be strings
    valid_names = all(
        isinstance(name, string_types)
        for name in df.index.names
        if name is not None
    )
    if not valid_names:
        raise ValueError("Index level names must be strings")
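For context, a rough standalone sketch of what the check rejects and accepts (assumes only pandas; `DataFrame` and `string_types` in the excerpt come from pandas' compat layer):

import pandas as pd

df = pd.DataFrame({0: [1, 2], 1: [3, 4]})  # integer column names
df.columns.inferred_type                    # 'integer' -> would raise ValueError

df.columns = ['a', 'b']                     # string column names
df.columns.inferred_type                    # 'string'  -> passes the check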
Example 2: __init__
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def __init__(self):
    # since pandas is a dependency of pyarrow
    # we need to import on first use
    try:
        import pyarrow
        import pyarrow.parquet
    except ImportError:
        raise ImportError(
            "pyarrow is required for parquet support\n\n"
            "you can install via conda\n"
            "conda install pyarrow -c conda-forge\n"
            "\nor via pip\n"
            "pip install -U pyarrow\n"
        )
    if LooseVersion(pyarrow.__version__) < '0.9.0':
        raise ImportError(
            "pyarrow >= 0.9.0 is required for parquet support\n\n"
            "you can install via conda\n"
            "conda install pyarrow -c conda-forge\n"
            "\nor via pip\n"
            "pip install -U pyarrow\n"
        )
    self.api = pyarrow
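The version gate is plain distutils machinery; a minimal standalone equivalent of the comparison above (assumes pyarrow is installed):

from distutils.version import LooseVersion

import pyarrow

# Same comparison the constructor performs:
if LooseVersion(pyarrow.__version__) < '0.9.0':
    raise ImportError("pyarrow >= 0.9.0 is required for parquet support")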
Example 3: write
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def write(self, df, path, compression='snappy',
          coerce_timestamps='ms', index=None, partition_cols=None,
          **kwargs):
    self.validate_dataframe(df)
    path, _, _, _ = get_filepath_or_buffer(path, mode='wb')

    if index is None:
        from_pandas_kwargs = {}
    else:
        from_pandas_kwargs = {'preserve_index': index}

    table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
    if partition_cols is not None:
        self.api.parquet.write_to_dataset(
            table, path, compression=compression,
            coerce_timestamps=coerce_timestamps,
            partition_cols=partition_cols, **kwargs)
    else:
        self.api.parquet.write_table(
            table, path, compression=compression,
            coerce_timestamps=coerce_timestamps, **kwargs)
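The two branches map directly onto pyarrow's own API; a small sketch using pyarrow directly (the file names and column values are made up for illustration):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({'year': [2019, 2019, 2020], 'value': [1.0, 2.0, 3.0]})
table = pa.Table.from_pandas(df)

# Without partitioning: a single file.
pq.write_table(table, 'values.parquet', compression='snappy')

# With partitioning: one subdirectory per distinct 'year' value.
pq.write_to_dataset(table, 'values_dataset', partition_cols=['year'])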
Example 4: write
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def write(self, df, path, compression='snappy',
          coerce_timestamps='ms', **kwargs):
    self.validate_dataframe(df)
    if self._pyarrow_lt_070:
        self._validate_write_lt_070(df)
    path, _, _, _ = get_filepath_or_buffer(path, mode='wb')

    if self._pyarrow_lt_060:
        table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
        self.api.parquet.write_table(
            table, path, compression=compression, **kwargs)
    else:
        table = self.api.Table.from_pandas(df)
        self.api.parquet.write_table(
            table, path, compression=compression,
            coerce_timestamps=coerce_timestamps, **kwargs)
Example 5: read
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def read(self, path, columns=None, **kwargs):
    path, _, _, should_close = get_filepath_or_buffer(path)
    if self._pyarrow_lt_070:
        result = self.api.parquet.read_pandas(path, columns=columns,
                                              **kwargs).to_pandas()
    else:
        kwargs['use_pandas_metadata'] = True
        result = self.api.parquet.read_table(path, columns=columns,
                                             **kwargs).to_pandas()
    if should_close:
        try:
            path.close()
        except:  # noqa: flake8
            pass

    return result
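The two branches correspond to the following pyarrow calls; a minimal sketch (assumes 'values.parquet' was written earlier):

import pyarrow.parquet as pq

# pyarrow < 0.7.0: read_pandas() implies the pandas-specific metadata.
df = pq.read_pandas('values.parquet', columns=['value']).to_pandas()

# Newer pyarrow: request the pandas metadata explicitly.
df = pq.read_table('values.parquet', columns=['value'],
                   use_pandas_metadata=True).to_pandas()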
Example 6: to_parquet
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
    """
    Write a DataFrame to the parquet format.

    Parameters
    ----------
    df : DataFrame
    path : string
        File path
    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet library to use. If 'auto', then the option
        ``io.parquet.engine`` is used. The default ``io.parquet.engine``
        behavior is to try 'pyarrow', falling back to 'fastparquet' if
        'pyarrow' is unavailable.
    compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
        Name of the compression to use. Use ``None`` for no compression.
    kwargs
        Additional keyword arguments passed to the engine
    """
    impl = get_engine(engine)
    return impl.write(df, path, compression=compression, **kwargs)
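A hypothetical call, assuming this module-level helper is in scope (recent pandas exposes the same functionality as DataFrame.to_parquet):

import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})
to_parquet(df, 'out.parquet', engine='pyarrow', compression='gzip')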
Example 7: write
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def write(self, df, path, compression='snappy',
          coerce_timestamps='ms', **kwargs):
    self.validate_dataframe(df)
    if self._pyarrow_lt_070:
        self._validate_write_lt_070(df)
    path, _, _ = get_filepath_or_buffer(path)

    if self._pyarrow_lt_060:
        table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
        self.api.parquet.write_table(
            table, path, compression=compression, **kwargs)
    else:
        table = self.api.Table.from_pandas(df)
        self.api.parquet.write_table(
            table, path, compression=compression,
            coerce_timestamps=coerce_timestamps, **kwargs)
Example 8: to_parquet
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
    """
    Write a DataFrame to the parquet format.

    Parameters
    ----------
    df : DataFrame
    path : string
        File path
    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet library to use. If 'auto', then the option
        'io.parquet.engine' is used; if that is also 'auto', the first
        available library ('pyarrow', then 'fastparquet') is used.
    compression : str, optional, default 'snappy'
        compression method, one of {'gzip', 'snappy', 'brotli'}
    kwargs
        Additional keyword arguments passed to the engine
    """
    impl = get_engine(engine)
    return impl.write(df, path, compression=compression, **kwargs)
Example 9: parquet_file
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def parquet_file(
    table: Union[Dict[str, List[Any]], pyarrow.Table],
    dir: Optional[pathlib.Path] = None,
) -> ContextManager[pathlib.Path]:
    """
    Yield a filename with `table` written to a Parquet file.
    """
    if isinstance(table, dict):
        table = pyarrow.table(table)

    with tempfile_context(dir=dir) as parquet_path:
        pyarrow.parquet.write_table(
            table,
            parquet_path,
            version="2.0",
            compression="SNAPPY",
            use_dictionary=[
                name.encode("utf-8")
                for name, column in zip(table.column_names, table.columns)
                if pyarrow.types.is_dictionary(column.type)
            ],
        )
        yield parquet_path
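Hypothetical usage, assuming the decorator that turns this generator into a context manager (e.g. contextlib.contextmanager) is applied outside the excerpt:

import pyarrow.parquet as pq

with parquet_file({"x": [1, 2, 3]}) as path:
    print(pq.read_table(path).to_pandas())
# The temporary file is removed when the block exits.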
Example 10: get_engine
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def get_engine(engine):
    """ return our implementation """

    if engine == 'auto':
        engine = get_option('io.parquet.engine')

    if engine == 'auto':
        # try engines in this order
        try:
            return PyArrowImpl()
        except ImportError:
            pass

        try:
            return FastParquetImpl()
        except ImportError:
            pass

    if engine not in ['pyarrow', 'fastparquet']:
        raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")

    if engine == 'pyarrow':
        return PyArrowImpl()
    elif engine == 'fastparquet':
        return FastParquetImpl()
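A usage sketch; the option name comes from the code above, and set_option is standard pandas:

import pandas as pd

pd.set_option('io.parquet.engine', 'pyarrow')  # pin the engine globally
impl = get_engine('auto')                      # now resolves to PyArrowImpl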
Example 11: __init__
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def __init__(self):
    # since pandas is a dependency of pyarrow
    # we need to import on first use
    try:
        import pyarrow
        import pyarrow.parquet
    except ImportError:
        raise ImportError("pyarrow is required for parquet support\n\n"
                          "you can install via conda\n"
                          "conda install pyarrow -c conda-forge\n"
                          "\nor via pip\n"
                          "pip install -U pyarrow\n")

    if LooseVersion(pyarrow.__version__) < '0.4.1':
        raise ImportError("pyarrow >= 0.4.1 is required for parquet "
                          "support\n\n"
                          "you can install via conda\n"
                          "conda install pyarrow -c conda-forge\n"
                          "\nor via pip\n"
                          "pip install -U pyarrow\n")

    self._pyarrow_lt_050 = LooseVersion(pyarrow.__version__) < '0.5.0'
    self._pyarrow_lt_060 = LooseVersion(pyarrow.__version__) < '0.6.0'
    self.api = pyarrow
Example 12: read_parquet
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def read_parquet(path, engine='auto', **kwargs):
    """
    Load a parquet object from the file path, returning a DataFrame.

    .. versionadded:: 0.21.0

    Parameters
    ----------
    path : string
        File path
    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet reader library to use. If 'auto', then the option
        'io.parquet.engine' is used; if that is also 'auto', the first
        available library ('pyarrow', then 'fastparquet') is used.
    kwargs
        Additional keyword arguments passed to the engine

    Returns
    -------
    DataFrame
    """
    impl = get_engine(engine)
    return impl.read(path, **kwargs)
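A hypothetical round trip with the two module-level helpers defined in this file (the column selection rides along in **kwargs):

import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]})
to_parquet(df, 'out.parquet', engine='pyarrow')
df2 = read_parquet('out.parquet', engine='pyarrow', columns=['a'])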
Example 13: df_from_bytes_parquet_
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def df_from_bytes_parquet_(bytes_: bytes) -> pd.DataFrame:
    """
    Since pyabc 0.9.14, pandas DataFrames are converted using
    pyarrow parquet. If the conversion to DataFrame fails,
    then `df_from_bytes_msgpack_` is tried, which was the formerly
    used method. This is in particular useful for databases that
    still employ the old format. In case errors occur here, it may
    be necessary to use a pandas version prior to 0.25.0.
    """
    try:
        b = BytesIO(bytes_)
        table = parquet.read_table(b)
        df = table.to_pandas()
    except pyarrow.lib.ArrowIOError:
        df = df_from_bytes_msgpack_(bytes_)
    return df
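A standalone sketch of the round trip this helper reverses; the serializer side is not shown in the excerpt, so the bytes are produced directly with pyarrow here:

from io import BytesIO

import pandas as pd
import pyarrow
from pyarrow import parquet

df = pd.DataFrame({'a': [1, 2]})
buf = BytesIO()
parquet.write_table(pyarrow.Table.from_pandas(df), buf)

restored = df_from_bytes_parquet_(buf.getvalue())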
Example 14: get_engine
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def get_engine(engine):
    """ return our implementation """

    if engine == 'auto':
        engine = get_option('io.parquet.engine')

    if engine == 'auto':
        # try engines in this order
        try:
            return PyArrowImpl()
        except ImportError:
            pass

        try:
            return FastParquetImpl()
        except ImportError:
            pass

        raise ImportError("Unable to find a usable engine; "
                          "tried using: 'pyarrow', 'fastparquet'.\n"
                          "pyarrow or fastparquet is required for parquet "
                          "support")

    if engine not in ['pyarrow', 'fastparquet']:
        raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")

    if engine == 'pyarrow':
        return PyArrowImpl()
    elif engine == 'fastparquet':
        return FastParquetImpl()
Example 15: read
# Required import: import pyarrow [as alias]
# Or: from pyarrow import parquet [as alias]
def read(self, path, columns=None, **kwargs):
    path, _, _, should_close = get_filepath_or_buffer(path)

    kwargs['use_pandas_metadata'] = True
    result = self.api.parquet.read_table(path, columns=columns,
                                         **kwargs).to_pandas()
    if should_close:
        try:
            path.close()
        except:  # noqa: flake8
            pass

    return result