当前位置: 首页>>代码示例>>Python>>正文


Python pyarrow.string方法代码示例

本文整理汇总了Python中pyarrow.string方法的典型用法代码示例。如果您正苦于以下问题：Python pyarrow.string方法的具体用法？Python pyarrow.string怎么用？Python pyarrow.string使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pyarrow的用法示例。


在下文中一共展示了pyarrow.string方法的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_redshift_spectrum_long_string

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def test_redshift_spectrum_long_string(path, glue_table, glue_database, redshift_external_schema):
    """Write two 300-character strings to Parquet and read them back via Redshift.

    The frame is written with ``wr.s3.to_parquet`` under the given database and
    table, then selected through the ``aws-data-wrangler-redshift`` connection
    using the external schema. The result must have as many rows and columns
    as the frame.
    """
    long_strings = [
        "".join(random.choice(string.ascii_letters) for _ in range(300))
        for _ in range(2)
    ]
    df = pd.DataFrame({"id": [1, 2], "col_str": long_strings})

    written = wr.s3.to_parquet(
        df=df,
        path=path,
        database=glue_database,
        table=glue_table,
        mode="overwrite",
        index=False,
        dataset=True,
    )
    paths = written["paths"]
    # Block until the written objects are visible before querying them.
    wr.s3.wait_objects_exist(paths=paths, use_threads=False)

    engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift")
    query = f"SELECT * FROM {redshift_external_schema}.{glue_table}"
    with engine.connect() as con:
        rows = con.execute(query).fetchall()
        assert len(rows) == len(df.index)
        for row in rows:
            assert len(row) == len(df.columns)
开发者ID:awslabs,项目名称:aws-data-wrangler,代码行数:23,代码来源:test_db.py

示例2: __init__

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def __init__(self, name, dataType, nullable=True, metadata=None):
        """
        Store the field's name, data type, nullability and metadata.

        >>> (StructField("f1", StringType(), True)
        ...      == StructField("f1", StringType(), True))
        True
        >>> (StructField("f1", StringType(), True)
        ...      == StructField("f2", StringType(), True))
        False
        """
        assert isinstance(dataType, DataType),\
            "dataType %s should be an instance of %s" % (dataType, DataType)
        assert isinstance(name, basestring), "field name %s should be string" % (name)
        # A name that passed the basestring check but is not a ``str`` is
        # stored UTF-8 encoded.
        field_name = name if isinstance(name, str) else name.encode('utf-8')
        self.name = field_name
        self.dataType = dataType
        self.nullable = nullable
        # Any falsy metadata (None, empty mapping) is replaced by a new dict.
        self.metadata = metadata if metadata else {}
开发者ID:runawayhorse001,项目名称:LearningApacheSpark,代码行数:20,代码来源:types.py

示例3: test_dataframe_to_arrow_dict_sequence_schema

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def test_dataframe_to_arrow_dict_sequence_schema(module_under_test):
    """Check ``dataframe_to_arrow`` with a schema given as a list of dicts."""
    dataframe = pandas.DataFrame(
        {"field01": [u"hello", u"world"], "field02": [True, False]}
    )
    dict_schema = [
        {"name": "field01", "type": "STRING", "mode": "REQUIRED"},
        {"name": "field02", "type": "BOOL", "mode": "NULLABLE"},
    ]

    arrow_schema = module_under_test.dataframe_to_arrow(dataframe, dict_schema).schema

    # The REQUIRED column is expected as non-nullable, the NULLABLE one as nullable.
    assert list(arrow_schema) == [
        pyarrow.field("field01", "string", nullable=False),
        pyarrow.field("field02", "bool", nullable=True),
    ]
开发者ID:googleapis,项目名称:python-bigquery,代码行数:20,代码来源:test__pandas_helpers.py

示例4: test_num_bytes_getter

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def test_num_bytes_getter(self):
        """Exercise the table's ``num_bytes`` property with unset, int, string and invalid values."""
        table_ref = DatasetReference(self.PROJECT, self.DS_ID).table(self.TABLE_NAME)
        table = self._make_one(table_ref)

        # Nothing stored yet: the property reports None.
        self.assertIsNone(table.num_bytes)

        expected = 1337
        # The stored value may be an int or its string form; both must read
        # back equal to the int.
        for stored in (expected, str(expected)):
            table._properties = {"numBytes": stored}
            self.assertEqual(table.num_bytes, expected)

        # A non-numeric string must raise ValueError when the property is read.
        table._properties = {"numBytes": "x"}
        with self.assertRaises(ValueError):
            getattr(table, "num_bytes")
开发者ID:googleapis,项目名称:python-bigquery,代码行数:23,代码来源:test_table.py

示例5: test_text_zfill

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def test_text_zfill(data, fletcher_variant):
    """Compare ``fr_text.zfill`` on a fletcher-backed series with pandas' ``str.zfill``.

    Both are padded to one character more than the longest string in ``data``
    (or to width 1 when no length is available).
    """
    if any("\x00" in x for x in data if x):
        # pytest.skip("pandas cannot handle \\x00 characters in tests")
        # Skip is not working properly with hypothesis
        return

    ser_pd = pd.Series(data, dtype=str)
    longest = ser_pd.map(_optional_len).max()
    # An NA maximum is treated as length 0.
    width = (0 if pd.isna(longest) else longest) + 1

    arrow_data = pa.array(data, type=pa.string())
    array_type = (
        fr.FletcherChunkedArray
        if fletcher_variant == "chunked"
        else fr.FletcherContinuousArray
    )
    ser_fr = pd.Series(array_type(arrow_data))

    result_pd = ser_pd.str.zfill(width)
    result_fr = ser_fr.fr_text.zfill(width).astype(object)
    # Pandas returns np.nan for NA values in cat, keep this in line
    result_fr[result_fr.isna()] = np.nan
    tm.assert_series_equal(result_fr, result_pd)
开发者ID:xhochy,项目名称:fletcher,代码行数:24,代码来源:test_text.py

示例6: test_string_builder_simple

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def test_string_builder_simple(data):
    """Feed ``data`` into the builder character by character and compare buffers.

    The builder's offsets and data must equal those extracted from the
    equivalent ``pa.array(data, pa.string())``.
    """
    builder = NumbaStringArrayBuilder(2, 6)

    for item in data:
        if item is not None:
            for char in item:
                builder.put_byte(ord(char))
            builder.finish_string()
        else:
            builder.finish_null()

    builder.finish()

    expected = pa.array(data, pa.string())
    # The first returned buffer is not compared here.
    _missing, expected_offsets, expected_data = buffers_as_arrays(expected)

    np.testing.assert_array_equal(builder.offsets, expected_offsets)
    np.testing.assert_array_equal(builder.data, expected_data)
开发者ID:xhochy,项目名称:fletcher,代码行数:22,代码来源:test_numba_integration.py

示例7: __eq__

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def __eq__(self, other) -> bool:
        """Compare this dtype with 'other'.

        'other' is equal when it is
        * a string that matches 'self.name', or
        * an instance of this type whose 'arrow_dtype' equals ours.

        Anything else compares unequal.

        Parameters
        ----------
        other : Any

        Returns
        -------
        bool
        """
        if isinstance(other, str):
            return other == self.name
        if isinstance(other, type(self)):
            return self.arrow_dtype == other.arrow_dtype
        return False
开发者ID:xhochy,项目名称:fletcher,代码行数:23,代码来源:base.py

示例8: _text_cat

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def _text_cat(a: pa.Array, b: pa.Array) -> pa.Array:
    """Merge two equal-length string arrays into a new string array.

    The string buffers of both inputs are extracted, a combined validity
    bitmap is computed, and ``_merge_string_data`` fills the freshly allocated
    offsets and data buffers.

    Returns ``a`` unchanged when the arrays are empty.

    Raises
    ------
    ValueError
        If ``a`` and ``b`` differ in length.
    """
    length = len(a)
    if length != len(b):
        raise ValueError("Lengths of arrays don't match")

    offsets_a, data_a = _extract_string_buffers(a)
    offsets_b, data_b = _extract_string_buffers(b)
    if length == 0:
        return a

    valid = _merge_valid_bitmaps(a, b)
    result_offsets = np.empty(length + 1, dtype=np.int32)
    result_offsets[0] = 0
    # Size the output by the byte span each input's offsets cover.
    size_a = offsets_a[-1] - offsets_a[0]
    size_b = offsets_b[-1] - offsets_b[0]
    result_data = np.empty(size_a + size_b, dtype=np.uint8)
    _merge_string_data(
        length,
        valid,
        offsets_a,
        data_a,
        offsets_b,
        data_b,
        result_offsets,
        result_data,
    )
    buffers = [pa.py_buffer(buf) for buf in (valid, result_offsets, result_data)]
    return pa.Array.from_buffers(pa.string(), length, buffers)
开发者ID:xhochy,项目名称:fletcher,代码行数:27,代码来源:string.py

示例9: get_pa_translated_schema

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def get_pa_translated_schema(self):
        """Translate the BigQuery schema in ``self.bq_schema`` to a pyarrow schema.

        Only the field types listed in ``type_conversions`` are handled; any
        other ``field_type`` raises ``KeyError`` from the lookup.

        Returns: Translated parquet schema in pyarrow.Schema format.
        """
        type_conversions = {'STRING': pa.string(), 'NUMERIC': pa.int64()}

        # TODO(annarudy@google.com): add support for nested fields
        pa_fields = []
        for bq_field in self.bq_schema:
            arrow_type = type_conversions[bq_field.field_type]
            pa_fields.append(pa.field(bq_field.name, arrow_type))

        return pa.schema(pa_fields)
开发者ID:GoogleCloudPlatform,项目名称:professional-services,代码行数:22,代码来源:parquet_util.py

示例10: test_dataframe_all_null_category_column

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def test_dataframe_all_null_category_column(self):
        """An all-null categorical text column converts to a dictionary array with no values."""
        dataframe = pd.DataFrame({"A": [None]}, dtype=str).astype("category")
        result = dataframe_to_arrow_table(
            dataframe,
            [Column("A", ColumnType.TEXT())],
            self.path,
        )

        # Expected: one null index and an empty string dictionary.
        indices = pyarrow.array([None], type=pyarrow.int8())
        dictionary = pyarrow.array([], type=pyarrow.string())
        expected = arrow_table(
            {"A": pyarrow.DictionaryArray.from_arrays(indices, dictionary)}
        )

        assert_arrow_table_equals(result, expected)
开发者ID:CJWorkbench,项目名称:cjworkbench,代码行数:18,代码来源:test_types.py

示例11: load_from_buffer

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def load_from_buffer(
        buf, store: KeyValueStore, format: str = "json"
    ) -> "DatasetMetadata":
        """
        Load a dataset from a (string) buffer.

        Parameters
        ----------
        buf:
            Input to be parsed.
        store:
            Object that implements the .get method for file/object loading.
        format:
            Serialization format of ``buf``; either ``"json"`` (default) or
            ``"msgpack"``.

        Returns
        -------
        dataset_metadata:
            Parsed metadata.

        Raises
        ------
        ValueError
            If ``format`` is neither ``"json"`` nor ``"msgpack"``.
        """
        if format == "json":
            metadata = load_json(buf)
        elif format == "msgpack":
            metadata = msgpack.unpackb(buf)
        else:
            # Without this branch ``metadata`` would be unbound and the call
            # would fail with an opaque UnboundLocalError below.
            raise ValueError(
                "Unknown format {!r}; expected 'json' or 'msgpack'".format(format)
            )
        return DatasetMetadata.load_from_dict(metadata, store)
开发者ID:JDASoftwareGroup,项目名称:kartothek,代码行数:25,代码来源:dataset.py

示例12: test_iterate_over_string_chunk

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def test_iterate_over_string_chunk():
    """Run ``iterate_over_test_chunk`` over two string columns.

    Cell values are string forms of random integers in [-100, 100]; both
    columns carry ``{"logicalType": "TEXT"}`` metadata.
    """
    # random.seed() only supports None, int, float, str, bytes and bytearray;
    # passing a datetime object raises TypeError on Python 3.11+, so seed with
    # the float timestamp instead.
    random.seed(datetime.datetime.now().timestamp())
    column_meta = [
            {"logicalType": "TEXT"},
            {"logicalType": "TEXT"}
    ]
    field_foo = pyarrow.field("column_foo", pyarrow.string(), True, column_meta[0])
    field_bar = pyarrow.field("column_bar", pyarrow.string(), True, column_meta[1])
    # The schema object is discarded.
    # NOTE(review): presumably kept only to check the fields form a valid schema - confirm.
    pyarrow.schema([field_foo, field_bar])

    def str_generator():
        return str(random.randint(-100, 100))

    iterate_over_test_chunk([pyarrow.string(), pyarrow.string()],
                            column_meta, str_generator)
开发者ID:snowflakedb,项目名称:snowflake-connector-python,代码行数:17,代码来源:test_unit_arrow_chunk_iterator.py

示例13: get_pyarrow_types

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def get_pyarrow_types():
    """Return a new dict mapping type-name strings to the module's ``PA_*`` constants."""
    return dict(
        bool=PA_BOOL,
        float32=PA_FLOAT32,
        float64=PA_FLOAT64,
        int8=PA_INT8,
        int16=PA_INT16,
        int32=PA_INT32,
        int64=PA_INT64,
        string=PA_STRING,
        timestamp=PA_TIMESTAMP,
        base64=PA_BINARY,
    )

# pylint: disable=too-many-branches,too-many-statements 
开发者ID:cldellow,项目名称:csv2parquet,代码行数:17,代码来源:csv2parquet.py

示例14: test_argparse_types

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def test_argparse_types():
    """``--type`` values are captured as (column, pyarrow type, flag) triples.

    The entry written with a trailing ``?`` is expected with the flag set to
    True (presumably "nullable" - confirm against the parser).
    """
    captured = {}
    argv = ['foo.csv', '--type', '0=string', '0=int8?']
    csv2parquet.main_with_args(capture_args(captured), argv)

    expected = [('0', pa.string(), False), ('0', pa.int8(), True)]
    assert captured['raw_types'] == expected
开发者ID:cldellow,项目名称:csv2parquet,代码行数:6,代码来源:test_argparse.py

示例15: testIsBinaryLike

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import string [as 别名]
def testIsBinaryLike(self):
    """Binary and string types are binary-like; lists of them are not."""
    binary_like = (pa.binary(), pa.large_binary(), pa.string(), pa.large_string())
    for arrow_type in binary_like:
      self.assertTrue(arrow_util.is_binary_like(arrow_type))

    not_binary_like = (pa.list_(pa.binary()), pa.large_list(pa.string()))
    for arrow_type in not_binary_like:
      self.assertFalse(arrow_util.is_binary_like(arrow_type))
开发者ID:tensorflow,项目名称:data-validation,代码行数:8,代码来源:arrow_util_test.py


注:本文中的pyarrow.string方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。