

Python types.LongType method code examples

This article collects typical usage examples of the pyspark.sql.types.LongType method in Python. If you are wondering what exactly types.LongType does, how to use it, or what working examples look like, the curated code samples below may help. You can also explore further usage examples of its containing module, pyspark.sql.types.


A total of 15 code examples of the types.LongType method are shown below, sorted by popularity by default.
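
Before diving into the examples, here is a minimal, self-contained sketch of the two uses of LongType that recur throughout: declaring a 64-bit integer column in an explicit schema, and casting an expression to LongType. The session setup, column names, and data are illustrative assumptions, not taken from any of the projects below.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import LongType, StringType, StructField, StructType

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Declare a 64-bit integer column explicitly with LongType.
schema = StructType([
    StructField("id", LongType(), False),
    StructField("name", StringType(), True),
])
df = spark.createDataFrame([(1, "a"), (2, "b")], schema=schema)

# Cast an arithmetic expression to LongType; without the cast the result type
# follows Spark's usual numeric promotion rules.
df = df.withColumn("id_plus_one", (F.col("id") + 1).cast(LongType()))
df.printSchema()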

Example 1: _numpy_to_spark_mapping

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def _numpy_to_spark_mapping():
    """Returns a mapping from numpy to pyspark.sql type. Caches the mapping dictionary inorder to avoid instantiation
    of multiple objects in each call."""

    # Refer to the function attribute used to cache the map through a variable holding its name, rather than
    # 'dot' notation, to avoid copy/paste/typo mistakes
    cache_attr_name = 'cached_numpy_to_pyspark_types_map'
    if not hasattr(_numpy_to_spark_mapping, cache_attr_name):
        import pyspark.sql.types as T

        setattr(_numpy_to_spark_mapping, cache_attr_name,
                {
                    np.int8: T.ByteType(),
                    np.uint8: T.ShortType(),
                    np.int16: T.ShortType(),
                    np.uint16: T.IntegerType(),
                    np.int32: T.IntegerType(),
                    np.int64: T.LongType(),
                    np.float32: T.FloatType(),
                    np.float64: T.DoubleType(),
                    np.string_: T.StringType(),
                    np.str_: T.StringType(),
                    np.unicode_: T.StringType(),
                    np.bool_: T.BooleanType(),
                })

    return getattr(_numpy_to_spark_mapping, cache_attr_name)


# TODO: Changing fields in this class or the UnischemaField will break reading due to the schema being pickled next to
# the dataset on disk 
Developer: uber, Project: petastorm, Lines: 33, Source: unischema.py
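
For orientation, a hypothetical call to the private helper above could look like the following; the lookup simply returns the cached pyspark.sql type instance for a given numpy dtype (the call assumes _numpy_to_spark_mapping is in scope, since it is private to petastorm's unischema module).

import numpy as np
import pyspark.sql.types as T

spark_type = _numpy_to_spark_mapping()[np.int64]
assert isinstance(spark_type, T.LongType)   # np.int64 maps to a 64-bit Spark LongType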

Example 2: test_invalid_schema_field

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def test_invalid_schema_field(synthetic_dataset, reader_factory):
    # Let's assume we are selecting columns using a schema which is different from the one
    # stored in the dataset. We would expect to get a reasonable error message.
    BogusSchema = Unischema('BogusSchema', [
        UnischemaField('partition_key', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False),
        UnischemaField('bogus_key', np.int32, (), ScalarCodec(ShortType()), False)])

    expected_values = {'bogus_key': 11, 'id': 1}
    with pytest.raises(ValueError, match='bogus_key'):
        reader_factory(synthetic_dataset.url, schema_fields=BogusSchema.fields.values(),
                       shuffle_row_groups=False,
                       predicate=EqualPredicate(expected_values)) 
Developer: uber, Project: petastorm, Lines: 15, Source: test_end_to_end.py

Example 3: test_create_schema_view_fails_validate

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def test_create_schema_view_fails_validate():
    """ Exercises code paths unischema.create_schema_view ValueError, and unischema.__str__."""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])
    with pytest.raises(ValueError, match='does not belong to the schema'):
        TestSchema.create_schema_view([UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False)]) 
Developer: uber, Project: petastorm, Lines: 10, Source: test_unischema.py

Example 4: encode

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def encode(self, unischema_field, value):
        # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
        # (currently works only with make_batch_reader). We should move all pyspark related code into a separate module
        import pyspark.sql.types as sql_types

        # We treat ndarrays with shape=() as scalars
        unsized_numpy_array = isinstance(value, np.ndarray) and value.shape == ()
        # Validate the input to be a scalar (or an unsized numpy array)
        if not unsized_numpy_array and hasattr(value, '__len__') and (not isinstance(value, str)):
            raise TypeError('Expected a scalar as a value for field \'{}\'. '
                            'Got a non-numpy type \'{}\''.format(unischema_field.name, type(value)))

        if unischema_field.shape:
            raise ValueError('The shape field of unischema_field \'{}\' must be an empty tuple (i.e. \'()\') '
                             'to indicate a scalar. However, the actual shape is {}'
                             .format(unischema_field.name, unischema_field.shape))
        if isinstance(self._spark_type, (sql_types.ByteType, sql_types.ShortType, sql_types.IntegerType,
                                         sql_types.LongType)):
            return int(value)
        if isinstance(self._spark_type, (sql_types.FloatType, sql_types.DoubleType)):
            return float(value)
        if isinstance(self._spark_type, sql_types.BooleanType):
            return bool(value)
        if isinstance(self._spark_type, sql_types.StringType):
            if not isinstance(value, str):
                raise ValueError(
                    'Expected a string value for field {}. Got type {}'.format(unischema_field.name, type(value)))
            return str(value)

        return value 
Developer: uber, Project: petastorm, Lines: 32, Source: codecs.py
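
The essence of the codec above is a plain isinstance dispatch on the stored Spark type: integral types (ByteType through LongType) are coerced with int(), floating types with float(), booleans with bool(), and strings with str(). A stripped-down, standalone sketch of that dispatch (not petastorm's actual API, just an illustration) follows.

import numpy as np
import pyspark.sql.types as sql_types

def coerce_scalar(spark_type, value):
    # Coerce a Python or numpy scalar to the primitive Python type the Spark type expects.
    if isinstance(spark_type, (sql_types.ByteType, sql_types.ShortType,
                               sql_types.IntegerType, sql_types.LongType)):
        return int(value)
    if isinstance(spark_type, (sql_types.FloatType, sql_types.DoubleType)):
        return float(value)
    if isinstance(spark_type, sql_types.BooleanType):
        return bool(value)
    if isinstance(spark_type, sql_types.StringType):
        return str(value)
    return value

print(coerce_scalar(sql_types.LongType(), np.int64(42)))  # 42, as a plain Python int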

Example 5: year

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def year(self) -> "ks.Series":
        """
        The year of the datetime.
        """
        return column_op(lambda c: F.year(c).cast(LongType()))(self._data).alias(self._data.name) 
Developer: databricks, Project: koalas, Lines: 7, Source: datetimes.py
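
Examples 5 through 10 all follow the same pattern: apply a pyspark.sql.functions datetime helper (year, month, dayofmonth, hour, minute, weekofyear) and cast its IntegerType result to LongType, so the resulting Koalas Series is backed by an int64 column as in pandas. Stripped of the column_op wrapper, the equivalent plain PySpark expression is roughly the following (the DataFrame contents are made up for illustration).

import datetime

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import LongType

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(datetime.datetime(2020, 5, 17, 13, 45),)], ["ts"])

# F.year returns an IntegerType column; the cast widens it to LongType (int64).
df.select(F.year("ts").cast(LongType()).alias("year")).show()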

Example 6: month

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def month(self) -> "ks.Series":
        """
        The month of the timestamp as January = 1, December = 12.
        """
        return column_op(lambda c: F.month(c).cast(LongType()))(self._data).alias(self._data.name) 
Developer: databricks, Project: koalas, Lines: 7, Source: datetimes.py

Example 7: day

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def day(self) -> "ks.Series":
        """
        The days of the datetime.
        """
        return column_op(lambda c: F.dayofmonth(c).cast(LongType()))(self._data).alias(
            self._data.name
        ) 
Developer: databricks, Project: koalas, Lines: 9, Source: datetimes.py

Example 8: hour

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def hour(self) -> "ks.Series":
        """
        The hours of the datetime.
        """
        return column_op(lambda c: F.hour(c).cast(LongType()))(self._data).alias(self._data.name) 
Developer: databricks, Project: koalas, Lines: 7, Source: datetimes.py

Example 9: minute

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def minute(self) -> "ks.Series":
        """
        The minutes of the datetime.
        """
        return column_op(lambda c: F.minute(c).cast(LongType()))(self._data).alias(self._data.name) 
Developer: databricks, Project: koalas, Lines: 7, Source: datetimes.py

Example 10: week

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def week(self) -> "ks.Series":
        """
        The week ordinal of the year.
        """
        return column_op(lambda c: F.weekofyear(c).cast(LongType()))(self._data).alias(
            self._data.name
        ) 
Developer: databricks, Project: koalas, Lines: 9, Source: datetimes.py

Example 11: numpy_column_op

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def numpy_column_op(f):
    @wraps(f)
    def wrapper(self, *args):
        # PySpark does not support NumPy type out of the box. For now, we convert NumPy types
        # into some primitive types understandable in PySpark.
        new_args = []
        for arg in args:
            # TODO: This is a quick hack to support NumPy type. We should revisit this.
            if isinstance(self.spark.data_type, LongType) and isinstance(arg, np.timedelta64):
                new_args.append(float(arg / np.timedelta64(1, "s")))
            else:
                new_args.append(arg)
        return column_op(f)(self, *new_args)

    return wrapper 
Developer: databricks, Project: koalas, Lines: 17, Source: base.py
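
The special case in the wrapper above relies on numpy's timedelta arithmetic: dividing a timedelta64 by np.timedelta64(1, "s") yields the duration in seconds as a float, which PySpark can consume as an ordinary primitive. In isolation:

import numpy as np

delta = np.timedelta64(90, "s")                    # 1 minute 30 seconds
seconds = float(delta / np.timedelta64(1, "s"))    # timedelta64 / timedelta64 -> float64
print(seconds)                                     # 90.0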

Example 12: _select_rows_by_iterable

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def _select_rows_by_iterable(
        self, rows_sel: Iterable
    ) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
        sdf = self._internal.spark_frame

        if any(isinstance(key, (int, np.int, np.int64, np.int32)) and key < 0 for key in rows_sel):
            offset = sdf.count()
        else:
            offset = 0

        new_rows_sel = []
        for key in list(rows_sel):
            if not isinstance(key, (int, np.int, np.int64, np.int32)):
                raise TypeError(
                    "cannot do positional indexing with these indexers [{}] of {}".format(
                        key, type(key)
                    )
                )
            if key < 0:
                key = key + offset
            new_rows_sel.append(key)

        if len(new_rows_sel) != len(set(new_rows_sel)):
            raise NotImplementedError(
                "Duplicated row selection is not currently supported; "
                "however, normalised index was [%s]" % new_rows_sel
            )

        sequence_scol = sdf[self._sequence_col]
        cond = []
        for key in new_rows_sel:
            cond.append(sequence_scol == F.lit(int(key)).cast(LongType()))

        if len(cond) == 0:
            cond = [F.lit(False)]
        return reduce(lambda x, y: x | y, cond), None, None 
Developer: databricks, Project: koalas, Lines: 38, Source: indexing.py
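
The selection above reduces to OR-ing together equality tests between a LongType row-sequence column and the requested positions, each literal cast to LongType so both sides of the comparison share a type. A simplified, hypothetical PySpark-only version of the idea follows; note that Koalas maintains its own sequence column, and monotonically_increasing_id is only consecutive here because the example runs in a single partition.

from functools import reduce

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import LongType

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(v,) for v in "abcde"], ["value"]).coalesce(1)

# Attach a LongType row id, then keep only positions 1 and 3.
df = df.withColumn("row_id", F.monotonically_increasing_id())
wanted = [1, 3]
cond = reduce(lambda x, y: x | y,
              [df["row_id"] == F.lit(int(k)).cast(LongType()) for k in wanted],
              F.lit(False))
df.filter(cond).show()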

Example 13: as_spark_type

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes,):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date,):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray,):
        # TODO: support other child types
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe) 
Developer: databricks, Project: koalas, Lines: 39, Source: typehints.py
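
A few hypothetical calls against the function above make the mapping concrete; Spark DataType instances compare equal by value, so the asserts hold.

import numpy as np
from pyspark.sql import types

assert as_spark_type(int) == types.IntegerType()       # Python int -> IntegerType
assert as_spark_type(np.int64) == types.LongType()     # 64-bit ints -> LongType
assert as_spark_type("bigint") == types.LongType()     # string aliases are accepted too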

Example 14: len

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def len(self) -> "ks.Series":
        """
        Computes the length of each element in the Series.

        The element may be a sequence (such as a string, tuple or list).

        Returns
        -------
        Series of int
            A Series of integer values indicating the length of each element in
            the Series.

        Examples
        --------
        Returns the length (number of characters) in a string. Returns the
        number of entries for lists or tuples.

        >>> s1 = ks.Series(['dog', 'monkey'])
        >>> s1.str.len()
        0    3
        1    6
        Name: 0, dtype: int64

        >>> s2 = ks.Series([["a", "b", "c"], []])
        >>> s2.str.len()
        0    3
        1    0
        Name: 0, dtype: int64
        """
        if isinstance(self._data.spark.data_type, (ArrayType, MapType)):
            return column_op(lambda c: F.size(c).cast(LongType()))(self._data).alias(
                self._data.name
            )
        else:
            return column_op(lambda c: F.length(c).cast(LongType()))(self._data).alias(
                self._data.name
            ) 
Developer: databricks, Project: koalas, Lines: 39, Source: strings.py

Example 15: read

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def read(self, file_path, spark_session, indexcol=0, schema=None):
        """
        Creates a dataframe from the csv file

        :param indexcol: if 1, create a tuple id column as auto increment
        :param schema: optional schema of file if known
        :param spark_session: The spark_session we created in Holoclean object
        :param file_path: The path to the file

        :return: dataframe
        """
        if schema is None:
            df = spark_session.read.csv(file_path, header=True)
        else:
            df = spark_session.read.csv(file_path, header=True, schema=schema)

        if indexcol == 0:
            return df

        index_name = GlobalVariables.index_name

        new_cols = df.schema.names + [index_name]
        list_schema = []
        for index_attribute in range(len(df.schema.names)):
            list_schema.append(StructField("_" + str(index_attribute),
                                           df.schema[
                                               index_attribute].dataType,
                                           True))
        list_schema.append(
            StructField("_" + str(len(new_cols)), LongType(), True))

        schema = StructType(list_schema)
        ix_df = df.rdd.zipWithIndex().map(
            lambda row_ix: row_ix[0] + (row_ix[1] + 1,)).toDF(schema)
        tmp_cols = ix_df.schema.names
        new_df = reduce(lambda data, idx: data.withColumnRenamed(tmp_cols[idx],
                        new_cols[idx]),
                        range(len(tmp_cols)), ix_df)
        new_df = self.checking_string_size(new_df)
        return new_df 
Developer: HoloClean, Project: HoloClean-Legacy-deprecated, Lines: 42, Source: reader.py
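
The core trick in the reader above, attaching an auto-increment tuple id by zipping the RDD with its index and widening the schema with a LongType field, can be distilled into the following self-contained sketch; the session, column names, and data here are illustrative assumptions.

from pyspark.sql import SparkSession
from pyspark.sql.types import LongType, StructField, StructType

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("a",), ("b",), ("c",)], ["value"])

# zipWithIndex pairs each row with its 0-based position; a LongType field holds the id.
schema = StructType(df.schema.fields + [StructField("index", LongType(), True)])
indexed = df.rdd.zipWithIndex().map(
    lambda row_ix: tuple(row_ix[0]) + (row_ix[1] + 1,)
).toDF(schema)
indexed.show()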


Note: The pyspark.sql.types.LongType method examples in this article were compiled by 纯净天空 from GitHub/MSDocs and other open-source code and documentation platforms. The code snippets are selected from open-source projects contributed by various developers, and the copyright of the source code belongs to the original authors. Please refer to each project's License for distribution and use; do not reproduce without permission.