This article collects typical usage examples of the Python method pandas.io.parsers.TextFileReader. If you have been wondering what exactly parsers.TextFileReader does, how to use it, and where to see it in real code, the hand-picked examples below should help. You can also read further about its enclosing module, pandas.io.parsers.

Seven code examples of parsers.TextFileReader are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python examples.
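Before diving into the examples, here is a minimal sketch of how TextFileReader usually enters user code: pandas.read_csv returns one (instead of a DataFrame) whenever chunksize or iterator=True is passed. The column names and data below are illustrative, not taken from any example on this page.

from io import StringIO

import pandas as pd
from pandas.io.parsers import TextFileReader

data = "a,b\n1,2\n3,4\n5,6\n"

# With chunksize, read_csv returns a TextFileReader rather than a DataFrame.
reader = pd.read_csv(StringIO(data), chunksize=2)
assert isinstance(reader, TextFileReader)

# Iterating the reader yields DataFrames of at most `chunksize` rows each.
for chunk in reader:
    print(chunk.shape)  # (2, 2), then (1, 2)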
Example 1: chunked_write

# Required imports: from pandas.io import parsers [as alias]
# Or: from pandas.io.parsers import TextFileReader [as alias]
def chunked_write(df_iterator: TextFileReader, parquet_writer: pq.ParquetWriter, date_cols: List[str]):
    """
    Writes a Parquet version of the chunked dataframe input.
    Arrow table creation and the Parquet writes take up around 25% of the time
    in this function; the CSV read takes around 75%.
    """
    rows_processed = 0
    for df in df_iterator:
        rows_processed += min(BUFFER_SIZE_ROWS, len(df))
        for col_name in date_cols:
            df[col_name] = pd.to_datetime(df[col_name], unit="ms")
        pa_table = pa.Table.from_pandas(df=df, schema=parquet_writer.schema)
        parquet_writer.write_table(pa_table)
        print("Rows processed: {}".format(rows_processed), end="\r", flush=True)
    print()
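A hedged sketch of how chunked_write might be wired up. The schema, file names, and the BUFFER_SIZE_ROWS value below are assumptions for illustration, not taken from the original module.

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

BUFFER_SIZE_ROWS = 100_000  # assumed; defined elsewhere in the original module

# Assumed schema: an integer key plus one timestamp column that is stored
# as epoch milliseconds in the source CSV.
schema = pa.schema([("id", pa.int64()), ("created", pa.timestamp("ns"))])

with pq.ParquetWriter("out.parquet", schema=schema) as writer:
    df_iterator = pd.read_csv("in.csv", chunksize=BUFFER_SIZE_ROWS)
    chunked_write(df_iterator, writer, date_cols=["created"])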
Example 2: test_integer_thousands_alt

# Required imports: from pandas.io import parsers [as alias]
# Or: from pandas.io.parsers import TextFileReader [as alias]
def test_integer_thousands_alt(self):
    data = '123.456\n12.500'

    reader = TextFileReader(StringIO(data), delimiter=':',
                            thousands='.', header=None)
    result = reader.read()

    expected = DataFrame([123456, 12500])
    tm.assert_frame_equal(result, expected)
Example 3: test_empty_csv_input

# Required imports: from pandas.io import parsers [as alias]
# Or: from pandas.io.parsers import TextFileReader [as alias]
def test_empty_csv_input(self):
    # see gh-14867
    df = read_csv(StringIO(), chunksize=20, header=None,
                  names=['a', 'b', 'c'])
    assert isinstance(df, TextFileReader)
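The contract being pinned down above (gh-14867) is that an empty stream plus chunksize must still produce a TextFileReader rather than raising at construction time. A small sketch of the same contract; the exact result of iterating an empty reader can vary across pandas versions, so treat this as illustrative:

reader = read_csv(StringIO(""), chunksize=20, header=None,
                  names=['a', 'b', 'c'])
chunks = list(reader)                # iterating should simply produce no data:
assert all(c.empty for c in chunks)  # either zero chunks or only empty ones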
Example 4: write_files

# Required imports: from pandas.io import parsers [as alias]
# Or: from pandas.io.parsers import TextFileReader [as alias]
def write_files(metadata: AlchemyMetadata) -> None:
    """
    Creates a Parquet file for each table in the schema.
    """
    tables: Iterator[AlchemyTable] = metadata.tables.values()
    for table in tables:
        name = table.name
        print(name)

        def get_path(prefix: Path, suffix: str):
            parent_dir = prefix.joinpath(metadata.schema)
            parent_dir.mkdir(exist_ok=True, parents=True)
            return parent_dir.joinpath(name).with_suffix(suffix)

        extract_file = get_path(EXTRACT_PATH_PREFIX, ".csv.zst")
        parquet_file = get_path(PARQUET_PREFIX, ".parquet")

        pandas_fields = get_pandas_fields(table)
        arrow_fields = get_arrow_fields(table)
        arrow_schema = pa.schema(arrow_fields)
        column_names = [name for name, dtype in pandas_fields]
        date_cols = [name for name, dtype in arrow_fields if "timestamp" in dtype]

        # Using both Arrow and pandas lets each library cover the other's current
        # shortcomings: pandas's read_csv can handle chunked/complex reads, while
        # Arrow's ParquetWriter can handle chunked writes, and Arrow's input
        # streams can decompress zstd files, which pandas hasn't implemented yet.
        in_buf = pa.OSFile(str(extract_file), mode="r")
        reader = pa.CompressedInputStream(in_buf, compression="zstd")

        # Have to use the snappy codec for Parquet because Drill doesn't read zstd.
        parquet_writer = pq.ParquetWriter(parquet_file, schema=arrow_schema, compression='snappy',
                                          version="2.0", use_dictionary=True)
        df_iterator: TextFileReader = pd.read_csv(reader, header=None, names=column_names, dtype=dict(pandas_fields),
                                                  true_values=map_to_bytes('T'), false_values=map_to_bytes('F'),
                                                  chunksize=BUFFER_SIZE_ROWS, parse_dates=date_cols)
        chunked_write(df_iterator, parquet_writer, date_cols)
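One caveat on the excerpt above: parquet_writer is never closed here, and pyarrow's ParquetWriter only writes the Parquet footer on close. Presumably the full module closes it (for instance via a with block or a finally clause) after chunked_write returns; as excerpted, the output file would be left unfinalized.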
Example 5: test_override_set_noconvert_columns

# Required imports: from pandas.io import parsers [as alias]
# Or: from pandas.io.parsers import TextFileReader [as alias]
def test_override_set_noconvert_columns():
    # see gh-17351
    #
    # usecols needs to be sorted in _set_noconvert_columns, based
    # on the test_usecols_with_parse_dates test from test_usecols.py
    class MyTextFileReader(TextFileReader):
        def __init__(self):
            self._currow = 0
            self.squeeze = False

    class MyCParserWrapper(CParserWrapper):
        def _set_noconvert_columns(self):
            if self.usecols_dtype == "integer":
                # self.usecols is a set, which is documented as unordered,
                # but in practice a CPython set of integers is sorted.
                # In other implementations this assumption does not hold.
                # The following code simulates a different order, which
                # before gh-17351 would cause the wrong columns to be
                # converted via the parse_dates parameter.
                self.usecols = list(self.usecols)
                self.usecols.reverse()
            return CParserWrapper._set_noconvert_columns(self)

    data = """a,b,c,d,e
0,1,20140101,0900,4
0,1,20140102,1000,4"""
    parse_dates = [[1, 2]]
    cols = {
        "a": [0, 0],
        "c_d": [
            Timestamp("2014-01-01 09:00:00"),
            Timestamp("2014-01-02 10:00:00")
        ]
    }
    expected = DataFrame(cols, columns=["c_d", "a"])

    parser = MyTextFileReader()
    parser.options = {"usecols": [0, 2, 3],
                      "parse_dates": parse_dates,
                      "delimiter": ","}
    parser._engine = MyCParserWrapper(StringIO(data), **parser.options)

    result = parser.read()
    tm.assert_frame_equal(result, expected)
Example 6: test_iterator

# Required imports: from pandas.io import parsers [as alias]
# Or: from pandas.io.parsers import TextFileReader [as alias]
def test_iterator(self):
    # see gh-6607
    reader = self.read_csv(StringIO(self.data1), index_col=0,
                           iterator=True)
    df = self.read_csv(StringIO(self.data1), index_col=0)

    chunk = reader.read(3)
    tm.assert_frame_equal(chunk, df[:3])

    last_chunk = reader.read(5)
    tm.assert_frame_equal(last_chunk, df[3:])

    # pass list
    lines = list(csv.reader(StringIO(self.data1)))
    parser = TextParser(lines, index_col=0, chunksize=2)
    df = self.read_csv(StringIO(self.data1), index_col=0)

    chunks = list(parser)
    tm.assert_frame_equal(chunks[0], df[:2])
    tm.assert_frame_equal(chunks[1], df[2:4])
    tm.assert_frame_equal(chunks[2], df[4:])

    # pass skiprows
    parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1])
    chunks = list(parser)
    tm.assert_frame_equal(chunks[0], df[1:3])

    treader = self.read_table(StringIO(self.data1), sep=',', index_col=0,
                              iterator=True)
    assert isinstance(treader, TextFileReader)

    # gh-3967: stopping iteration when chunksize is specified
    data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
    reader = self.read_csv(StringIO(data), iterator=True)
    result = list(reader)

    expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[3, 6, 9]),
                         index=['foo', 'bar', 'baz'])
    tm.assert_frame_equal(result[0], expected)

    # chunksize = 1; same expected frame as above
    reader = self.read_csv(StringIO(data), chunksize=1)
    result = list(reader)
    assert len(result) == 3
    tm.assert_frame_equal(pd.concat(result), expected)

    # skipfooter is not supported with the C parser yet
    if self.engine == 'python':
        # test bad parameter (skipfooter)
        reader = self.read_csv(StringIO(self.data1), index_col=0,
                               iterator=True, skipfooter=1)
        pytest.raises(ValueError, reader.read, 3)
Example 7: test_override__set_noconvert_columns

# Required imports: from pandas.io import parsers [as alias]
# Or: from pandas.io.parsers import TextFileReader [as alias]
def test_override__set_noconvert_columns(self):
    # gh-17351: usecols needs to be sorted in _set_noconvert_columns,
    # based on the test_usecols_with_parse_dates test from usecols.py
    from pandas.io.parsers import CParserWrapper, TextFileReader

    s = """a,b,c,d,e
0,1,20140101,0900,4
0,1,20140102,1000,4"""
    parse_dates = [[1, 2]]
    cols = {
        'a': [0, 0],
        'c_d': [
            Timestamp('2014-01-01 09:00:00'),
            Timestamp('2014-01-02 10:00:00')
        ]
    }
    expected = DataFrame(cols, columns=['c_d', 'a'])

    class MyTextFileReader(TextFileReader):
        def __init__(self):
            self._currow = 0
            self.squeeze = False

    class MyCParserWrapper(CParserWrapper):
        def _set_noconvert_columns(self):
            if self.usecols_dtype == 'integer':
                # self.usecols is a set, which is documented as unordered,
                # but in practice a CPython set of integers is sorted.
                # In other implementations this assumption does not hold.
                # The following code simulates a different order, which
                # before gh-17351 would cause the wrong columns to be
                # converted via the parse_dates parameter.
                self.usecols = list(self.usecols)
                self.usecols.reverse()
            return CParserWrapper._set_noconvert_columns(self)

    parser = MyTextFileReader()
    parser.options = {'usecols': [0, 2, 3],
                      'parse_dates': parse_dates,
                      'delimiter': ','}
    parser._engine = MyCParserWrapper(StringIO(s), **parser.options)
    df = parser.read()
    tm.assert_frame_equal(df, expected)