当前位置: 首页>>代码示例>>Python>>正文


Python parsers.TextFileReader类代码示例

本文整理汇总了Python中pandas.io.parsers.TextFileReader类的典型用法代码示例。如果您正苦于以下问题:Python parsers.TextFileReader类的具体用法?Python parsers.TextFileReader怎么用?Python parsers.TextFileReader使用的例子?那么恭喜您,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块pandas.io.parsers的用法示例。


在下文中一共展示了parsers.TextFileReader类的7个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: chunked_write

# 需要导入模块: from pandas.io import parsers [as 别名]
# 或者: from pandas.io.parsers import TextFileReader [as 别名]
def chunked_write(df_iterator: TextFileReader, parquet_writer: pq.ParquetWriter, date_cols: List[str]):
    """
    Writes a Parquet version of the chunked dataframe input.

    Arrow table creation and Parquet-writes take up around 25% of the time on this function.
    The CSV read takes around 75%.

    :param df_iterator: iterable that yields one pandas DataFrame per chunk
    :param parquet_writer: open Parquet writer; its ``schema`` is used to build every Arrow table
    :param date_cols: names of the columns converted with ``pd.to_datetime(..., unit="ms")``
        before the chunk is written
    """
    rows_processed = 0
    for df in df_iterator:
        # Count the rows the chunk actually holds, so the progress figure stays
        # correct whatever chunk size the reader was created with.
        rows_processed += len(df)
        for col_name in date_cols:
            df[col_name] = pd.to_datetime(df[col_name], unit="ms")
        pa_table = pa.Table.from_pandas(df=df, schema=parquet_writer.schema)
        parquet_writer.write_table(pa_table)

        # "\r" rewinds to the start of the line so the counter updates in place.
        print("Rows processed: {}".format(rows_processed), end="\r", flush=True)
    # Terminate the in-place progress line once every chunk has been written.
    print()
开发者ID:droher,项目名称:boxball,代码行数:19,代码来源:parquet.py

示例2: test_integer_thousands_alt

# 需要导入模块: from pandas.io import parsers [as 别名]
# 或者: from pandas.io.parsers import TextFileReader [as 别名]
def test_integer_thousands_alt(self):
        """A '.' thousands separator is stripped when ':' is the delimiter.

        Neither input line contains the delimiter, so each line is expected to
        parse into a single integer column.
        """
        raw = '123.456\n12.500'

        parsed = TextFileReader(
            StringIO(raw),
            delimiter=':',
            thousands='.',
            header=None,
        ).read()

        tm.assert_frame_equal(parsed, DataFrame([123456, 12500]))
开发者ID:Frank-qlu,项目名称:recruit,代码行数:11,代码来源:test_textreader.py

示例3: test_empty_csv_input

# 需要导入模块: from pandas.io import parsers [as 别名]
# 或者: from pandas.io.parsers import TextFileReader [as 别名]
def test_empty_csv_input(self):
        """Chunked ``read_csv`` on an empty buffer still returns a TextFileReader."""
        # GH14867
        column_names = ['a', 'b', 'c']
        empty_buffer = StringIO()

        reader = read_csv(empty_buffer, chunksize=20, header=None,
                          names=column_names)

        assert isinstance(reader, TextFileReader)
开发者ID:Frank-qlu,项目名称:recruit,代码行数:7,代码来源:test_textreader.py

示例4: write_files

# 需要导入模块: from pandas.io import parsers [as 别名]
# 或者: from pandas.io.parsers import TextFileReader [as 别名]
def write_files(metadata: AlchemyMetadata) -> None:
    """
    Creates a Parquet file for each table in the schema.

    For every table the ``.csv.zst`` extract under ``EXTRACT_PATH_PREFIX`` is read through
    pandas in chunks of ``BUFFER_SIZE_ROWS`` rows and written to a snappy-compressed
    ``.parquet`` file under ``PARQUET_PREFIX``. The input file and the Parquet writer are
    closed explicitly once a table is finished, including when the conversion raises.

    :param metadata: schema metadata; ``metadata.tables`` supplies the tables and
        ``metadata.schema`` names the sub-directory created under both path prefixes
    """
    def get_path(prefix: Path, table_name: str, suffix: str):
        # Builds <prefix>/<schema>/<table_name><suffix>, creating the directory if needed.
        parent_dir = prefix.joinpath(metadata.schema)
        parent_dir.mkdir(exist_ok=True, parents=True)
        return parent_dir.joinpath(table_name).with_suffix(suffix)

    for table in metadata.tables.values():
        name = table.name
        print(name)

        extract_file = get_path(EXTRACT_PATH_PREFIX, name, ".csv.zst")
        parquet_file = get_path(PARQUET_PREFIX, name, ".parquet")

        pandas_fields = get_pandas_fields(table)
        arrow_fields = get_arrow_fields(table)
        # NOTE(review): get_arrow_fields is deliberately called a second time, as in the
        # original; reuse arrow_fields only after confirming it is not a one-shot iterable.
        arrow_schema = pa.schema(get_arrow_fields(table))
        column_names = [col_name for col_name, _ in pandas_fields]
        date_cols = [col_name for col_name, dtype in arrow_fields if "timestamp" in dtype]

        # Using both Arrow and Pandas allows each library to cover the other's current shortcomings.
        # Pandas's read_csv can handle chunked/complex reads, while Arrow's WriteParquet can handle chunked writes.
        # Arrow's input streams are capable of handling zstd files, which Pandas hasn't implemented yet.
        in_buf = pa.OSFile(str(extract_file), mode="r")
        try:
            reader = pa.CompressedInputStream(in_buf, compression="zstd")

            # Have to use snappy codec for Parquet because Drill doesn't read zstd
            parquet_writer = pq.ParquetWriter(parquet_file, schema=arrow_schema, compression='snappy',
                                              version="2.0", use_dictionary=True)
            try:
                df_iterator: TextFileReader = pd.read_csv(reader, header=None, names=column_names,
                                                          dtype=dict(pandas_fields),
                                                          true_values=map_to_bytes('T'),
                                                          false_values=map_to_bytes('F'),
                                                          chunksize=BUFFER_SIZE_ROWS, parse_dates=date_cols)

                chunked_write(df_iterator, parquet_writer, date_cols)
            finally:
                # Close the writer explicitly so the output file is finalized here
                # rather than whenever the writer object happens to be collected.
                parquet_writer.close()
        finally:
            # Release the extract's file handle before moving on to the next table.
            in_buf.close()
开发者ID:droher,项目名称:boxball,代码行数:39,代码来源:parquet.py

示例5: test_override_set_noconvert_columns

# 需要导入模块: from pandas.io import parsers [as 别名]
# 或者: from pandas.io.parsers import TextFileReader [as 别名]
def test_override_set_noconvert_columns():
    """Date parsing must pick the right columns even if ``usecols`` is reordered.

    See gh-17351. Usecols needs to be sorted in _set_noconvert_columns; this is
    based on the test_usecols_with_parse_dates test from test_usecols.py.
    """
    csv_text = """a,b,c,d,e
0,1,20140101,0900,4
0,1,20140102,1000,4"""

    class MyCParserWrapper(CParserWrapper):
        def _set_noconvert_columns(self):
            # self.usecols is a set, which is documented as unordered, but in
            # practice a CPython set of integers is sorted. Other
            # implementations do not guarantee this, so simulate a different
            # order; before GH 17351 that made parse_dates convert the wrong
            # columns.
            if self.usecols_dtype == "integer":
                self.usecols = list(self.usecols)[::-1]
            return CParserWrapper._set_noconvert_columns(self)

    class MyTextFileReader(TextFileReader):
        def __init__(self):
            # Skip the normal constructor; only the attributes set here and
            # the ones assigned by the test below are provided.
            self._currow = 0
            self.squeeze = False

    date_spec = [[1, 2]]
    expected = DataFrame(
        {
            "a": [0, 0],
            "c_d": [
                Timestamp("2014-01-01 09:00:00"),
                Timestamp("2014-01-02 10:00:00"),
            ],
        },
        columns=["c_d", "a"],
    )

    parser = MyTextFileReader()
    parser.options = {
        "usecols": [0, 2, 3],
        "parse_dates": date_spec,
        "delimiter": ",",
    }
    parser._engine = MyCParserWrapper(StringIO(csv_text), **parser.options)

    tm.assert_frame_equal(parser.read(), expected)
开发者ID:Frank-qlu,项目名称:recruit,代码行数:47,代码来源:test_common.py

示例6: test_iterator

# 需要导入模块: from pandas.io import parsers [as 别名]
# 或者: from pandas.io.parsers import TextFileReader [as 别名]
def test_iterator(self):
        """Exercise iterator/chunked reading through read_csv, read_table and TextParser."""
        # See gh-6607
        csv_text = self.data1

        stream = self.read_csv(StringIO(csv_text), index_col=0,
                               iterator=True)
        whole = self.read_csv(StringIO(csv_text), index_col=0)

        # Two explicit reads: the first three rows, then whatever is left.
        tm.assert_frame_equal(stream.read(3), whole[:3])
        tm.assert_frame_equal(stream.read(5), whole[3:])

        # pass list
        rows = list(csv.reader(StringIO(csv_text)))
        list_parser = TextParser(rows, index_col=0, chunksize=2)

        whole = self.read_csv(StringIO(csv_text), index_col=0)

        pieces = list(list_parser)
        expected_slices = [slice(None, 2), slice(2, 4), slice(4, None)]
        for position, bounds in enumerate(expected_slices):
            tm.assert_frame_equal(pieces[position], whole[bounds])

        # pass skiprows
        skipping_parser = TextParser(rows, index_col=0, chunksize=2,
                                     skiprows=[1])
        first_piece = list(skipping_parser)[0]
        tm.assert_frame_equal(first_piece, whole[1:3])

        table_stream = self.read_table(StringIO(csv_text), sep=',',
                                       index_col=0, iterator=True)
        assert isinstance(table_stream, TextFileReader)

        # gh-3967: stopping iteration when chunksize is specified
        data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
        expected = DataFrame({'A': [1, 4, 7], 'B': [2, 5, 8], 'C': [3, 6, 9]},
                             index=['foo', 'bar', 'baz'])

        frames = list(self.read_csv(StringIO(data), iterator=True))
        tm.assert_frame_equal(frames[0], expected)

        # chunksize = 1
        frames = list(self.read_csv(StringIO(data), chunksize=1))
        assert len(frames) == 3
        tm.assert_frame_equal(pd.concat(frames), expected)

        # skipfooter is not supported with the C parser yet
        if self.engine == 'python':
            # test bad parameter (skipfooter)
            footer_stream = self.read_csv(StringIO(csv_text), index_col=0,
                                          iterator=True, skipfooter=1)
            with pytest.raises(ValueError):
                footer_stream.read(3)
开发者ID:birforce,项目名称:vnpy_crypto,代码行数:60,代码来源:common.py

示例7: test_override__set_noconvert_columns

# 需要导入模块: from pandas.io import parsers [as 别名]
# 或者: from pandas.io.parsers import TextFileReader [as 别名]
def test_override__set_noconvert_columns(self):
        """Date parsing must pick the right columns even if ``usecols`` is reordered.

        GH 17351 - usecols needs to be sorted in _setnoconvert_columns;
        based on the test_usecols_with_parse_dates test from usecols.py.
        """
        from pandas.io.parsers import CParserWrapper, TextFileReader

        class MyCParserWrapper(CParserWrapper):
            def _set_noconvert_columns(self):
                # self.usecols is a set, which is documented as unordered, but
                # in practice a CPython set of integers is sorted. Other
                # implementations do not guarantee this, so simulate a
                # different order; before GH 17351 that made parse_dates
                # convert the wrong columns.
                if self.usecols_dtype == 'integer':
                    self.usecols = list(self.usecols)[::-1]
                return CParserWrapper._set_noconvert_columns(self)

        class MyTextFileReader(TextFileReader):
            def __init__(self):
                # Skip the normal constructor; only the attributes set here
                # and the ones assigned by the test below are provided.
                self._currow = 0
                self.squeeze = False

        # The leading spaces on the data rows are part of the input text.
        csv_text = """a,b,c,d,e
        0,1,20140101,0900,4
        0,1,20140102,1000,4"""

        date_spec = [[1, 2]]
        expected = DataFrame(
            {
                'a': [0, 0],
                'c_d': [
                    Timestamp('2014-01-01 09:00:00'),
                    Timestamp('2014-01-02 10:00:00'),
                ],
            },
            columns=['c_d', 'a'],
        )

        parser = MyTextFileReader()
        parser.options = {
            'usecols': [0, 2, 3],
            'parse_dates': date_spec,
            'delimiter': ',',
        }
        parser._engine = MyCParserWrapper(StringIO(csv_text), **parser.options)

        tm.assert_frame_equal(parser.read(), expected)
开发者ID:birforce,项目名称:vnpy_crypto,代码行数:47,代码来源:test_parsers.py


注:本文中的pandas.io.parsers.TextFileReader类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。