This article collects typical usage examples of pyspark.sql.types.IntegerType in Python. If you are wondering what types.IntegerType is for, how it is used, or what real code that uses it looks like, the curated examples below should help. You can also explore the enclosing module, pyspark.sql.types, for related types.
The following presents 15 code examples of types.IntegerType, sorted by popularity by default.
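Before turning to the project examples, here is a minimal, self-contained sketch (illustrative code, not taken from any of the examples below) of the three patterns that recur throughout them: declaring an IntegerType column in a StructType schema, casting an existing column to IntegerType, and using IntegerType as the return type of a UDF. The session setup and column names are assumptions for the sketch only.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

spark = SparkSession.builder.master("local[2]").appName("integertype-demo").getOrCreate()

# 1. Declare a 32-bit integer column in an explicit schema.
schema = StructType([
    StructField("name", StringType(), False),
    StructField("score", IntegerType(), False),
])
df = spark.createDataFrame([("a", 1), ("b", 2)], schema)

# 2. Cast an existing column to IntegerType.
df = df.withColumn("score_int", col("score").cast(IntegerType()))

# 3. Use IntegerType as the declared return type of a UDF.
double_udf = udf(lambda x: x * 2, IntegerType())
df.withColumn("doubled", double_udf(col("score"))).show()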
Example 1: test_featurizer_in_pipeline
# Required module import: from pyspark.sql import types [as alias]
# Alternatively: from pyspark.sql.types import IntegerType [as alias]
def test_featurizer_in_pipeline(self):
    """
    Tests that featurizer fits into an MLlib Pipeline.
    Does not test how good the featurization is for generalization.
    """
    featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                     modelName=self.name)
    lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
    pipeline = Pipeline(stages=[featurizer, lr])

    # add arbitrary labels to run logistic regression
    # TODO: it's weird that the test fails on some combinations of labels. check why.
    label_udf = udf(lambda x: abs(hash(x)) % 2, IntegerType())
    train_df = self.imageDF.withColumn("label", label_udf(self.imageDF["image"]["origin"]))

    lrModel = pipeline.fit(train_df)

    # see if we at least get the training examples right.
    # with 5 examples and e.g. 131k features (for InceptionV3), it ought to.
    pred_df_collected = lrModel.transform(train_df).collect()
    for row in pred_df_collected:
        self.assertEqual(int(row.prediction), row.label)
Example 2: test_serialize_filesystem_factory
# Required module import: from pyspark.sql import types [as alias]
# Alternatively: from pyspark.sql.types import IntegerType [as alias]
def test_serialize_filesystem_factory(tmpdir):
    SimpleSchema = Unischema('SimpleSchema', [
        UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('foo', np.int32, (), ScalarCodec(IntegerType()), False),
    ])

    class BogusFS(pyarrow.LocalFileSystem):
        def __getstate__(self):
            raise RuntimeError("can not serialize")

    rows_count = 10
    output_url = "file://{0}/fs_factory_test".format(tmpdir)
    rowgroup_size_mb = 256
    spark = SparkSession.builder.config('spark.driver.memory', '2g').master('local[2]').getOrCreate()
    sc = spark.sparkContext
    with materialize_dataset(spark, output_url, SimpleSchema, rowgroup_size_mb, filesystem_factory=BogusFS):
        rows_rdd = sc.parallelize(range(rows_count))\
            .map(lambda x: {'id': x, 'foo': x})\
            .map(lambda x: dict_to_spark_row(SimpleSchema, x))

        spark.createDataFrame(rows_rdd, SimpleSchema.as_spark_schema()) \
            .write \
            .parquet(output_url)
Example 3: test_as_spark_schema
# Required module import: from pyspark.sql import types [as alias]
# Alternatively: from pyspark.sql.types import IntegerType [as alias]
def test_as_spark_schema():
    """Try using 'as_spark_schema' function"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('string_field_implicit', np.string_, ()),
    ])

    spark_schema = TestSchema.as_spark_schema()
    assert spark_schema.fields[0].name == 'int_field'

    assert spark_schema.fields[1].name == 'string_field'
    assert spark_schema.fields[1].dataType == StringType()

    assert spark_schema.fields[2].name == 'string_field_implicit'
    assert spark_schema.fields[2].dataType == StringType()

    assert TestSchema.fields['int_field'].name == 'int_field'
    assert TestSchema.fields['string_field'].name == 'string_field'
Example 4: main
# Required module import: from pyspark.sql import types [as alias]
# Alternatively: from pyspark.sql.types import IntegerType [as alias]
def main():
    schema = StructType([
        StructField('subreddit', StringType(), False),
        StructField('score', IntegerType(), False),
    ])
    inputs = sqlContext.read.json(inputs1, schema=schema)

    # Uncomment this when no schema is supplied
    # inputs = sqlContext.read.json(inputs1)

    # Uncomment these when there are 2 input dirs
    # comments_input1 = sqlContext.read.json(inputs1, schema=schema)
    # comments_input2 = sqlContext.read.json(inputs2, schema=schema)
    # inputs = comments_input1.unionAll(comments_input2)

    df = get_avg(inputs)
    df.write.save(output, format='json', mode='overwrite')
Example 5: read_groundtruth
# Required module import: from pyspark.sql import types [as alias]
# Alternatively: from pyspark.sql.types import IntegerType [as alias]
def read_groundtruth(self):
    """
    Create a dataframe from the ground truth csv file
    Takes as argument the full path name of the csv file
    and the spark_session
    """
    filereader = Reader(self.spark_session)
    groundtruth_schema = StructType([
        StructField("tid", IntegerType(), False),
        StructField("attr_name", StringType(), False),
        StructField("attr_val", StringType(), False)])
    self.ground_truth_flat = filereader.read(self.path_to_grand_truth, 0,
                                             groundtruth_schema).\
        drop(GlobalVariables.index_name)
    self.dataengine.add_db_table(
        'Groundtruth', self.ground_truth_flat, self.dataset)
Example 6: _join_results_single
# Required module import: from pyspark.sql import types [as alias]
# Alternatively: from pyspark.sql.types import IntegerType [as alias]
def _join_results_single(self, scaffolds_df, sampled_df):
    def _join_scaffold(scaff, decs):
        mol = usc.join_joined_attachments(scaff, decs)
        if mol:
            return usc.to_smiles(mol)
    join_scaffold_udf = psf.udf(_join_scaffold, pst.StringType())

    def _create_decorations_map(decorations_smi, attachment_points):
        decorations = decorations_smi.split(usc.ATTACHMENT_SEPARATOR_TOKEN)
        return {idx: _cleanup_decoration(dec) for dec, idx in zip(decorations, attachment_points)}
    create_decorations_map_udf = psf.udf(_create_decorations_map, pst.MapType(pst.IntegerType(), pst.StringType()))

    return sampled_df.join(scaffolds_df, on="id")\
        .select(
            join_scaffold_udf("randomized_scaffold", "decoration_smi").alias("smiles"),
            create_decorations_map_udf("decoration_smi", "attachment_points").alias("decorations"),
            "scaffold")
Example 7: format_to_file_path
# Required module import: from pyspark.sql import types [as alias]
# Alternatively: from pyspark.sql.types import IntegerType [as alias]
def format_to_file_path(spark_session):
    rows = [
        Row(8, 32, "bat"),
        Row(64, 40, "mouse"),
        Row(-27, 55, "horse")
    ]
    schema = StructType([
        StructField("number2", IntegerType()),
        StructField("number1", IntegerType()),
        StructField("word", StringType())
    ])
    rdd = spark_session.sparkContext.parallelize(rows)
    df = spark_session.createDataFrame(rdd, schema)
    res = {}
    tempdir = tempfile.mkdtemp()
    for data_format in ["csv", "parquet", "json"]:
        res[data_format] = os.path.join(tempdir, "test-data-%s" % data_format)

    for data_format, file_path in res.items():
        df.write.option("header", "true").format(data_format).save(file_path)
    yield res
    shutil.rmtree(tempdir)
Example 8: train
# Required module import: from pyspark.sql import types [as alias]
# Alternatively: from pyspark.sql.types import IntegerType [as alias]
def train(df, dbn_config):
    """Generate relevance labels for the provided dataframe.

    Process the provided data frame to generate relevance scores for
    all provided pairs of (wikiid, norm_query_id, hit_page_id). The input
    DataFrame must have a row per hit_page_id that was seen by a session.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        User click logs with columns wikiid, norm_query_id, session_id,
        hit_page_id, hit_position, clicked.
    dbn_config : dict
        Configuration needed by the DBN. See scala implementation docs
        for more information.

    Returns
    -------
    spark.sql.DataFrame
        DataFrame with columns wikiid, norm_query_id, hit_page_id, relevance.
    """
    df = (
        df
        .withColumn('hit_page_id', F.col('hit_page_id').cast(T.IntegerType()))
        .withColumn('hit_position', F.col('hit_position').cast(T.IntegerType())))

    jvm = df._sc._jvm
    # jvm side expects Map[String, String]
    j_config = jvm.PythonUtils.toScalaMap({str(k): str(v) for k, v in dbn_config.items()})
    assert j_config.size() == len(dbn_config)
    j_df = jvm.org.wikimedia.search.mjolnir.DBN.train(df._jdf, j_config)
    return pyspark.sql.DataFrame(j_df, df.sql_ctx)
Example 9: cluster_within_norm_query_groups
# Required module import: from pyspark.sql import types [as alias]
# Alternatively: from pyspark.sql.types import IntegerType [as alias]
def cluster_within_norm_query_groups(df: DataFrame) -> DataFrame:
    make_groups = F.udf(_make_query_groups, T.ArrayType(T.StructType([
        T.StructField('query', T.StringType(), nullable=False),
        T.StructField('norm_query_group_id', T.IntegerType(), nullable=False),
    ])))
    return (
        df
        .groupBy('wikiid', 'norm_query')
        .agg(F.collect_list(F.struct('query', 'hit_page_ids')).alias('source'))
        .select(
            'wikiid', 'norm_query',
            F.explode(make_groups('source')).alias('group'))
        .select('wikiid', 'norm_query', 'group.query', 'group.norm_query_group_id'))
Example 10: test_dataframe_with_schema
# Required module import: from pyspark.sql import types [as alias]
# Alternatively: from pyspark.sql.types import IntegerType [as alias]
def test_dataframe_with_schema(dataset, spark):
    schema = StructType([StructField("foo", IntegerType(), True)])
    df = dataset.dataframe(spark, decode=decode, schema=schema, table_name='bar')

    assert type(df) == DataFrame
    assert df.columns == ['foo']
    assert df.orderBy(["foo"]).collect() == [Row(foo=1), Row(foo=2)]
Example 11: get_petastorm_column
# Required module import: from pyspark.sql import types [as alias]
# Alternatively: from pyspark.sql.types import IntegerType [as alias]
def get_petastorm_column(df_column):
    column_type = df_column.type
    column_name = df_column.name
    column_is_nullable = df_column.is_nullable
    column_array_dimensions = df_column.array_dimensions

    # Reference:
    # https://github.com/uber/petastorm/blob/master/petastorm/
    # tests/test_common.py
    petastorm_column = None
    if column_type == ColumnType.INTEGER:
        petastorm_column = UnischemaField(column_name,
                                          np.int32,
                                          (),
                                          ScalarCodec(IntegerType()),
                                          column_is_nullable)
    elif column_type == ColumnType.FLOAT:
        petastorm_column = UnischemaField(column_name,
                                          np.float64,
                                          (),
                                          ScalarCodec(FloatType()),
                                          column_is_nullable)
    elif column_type == ColumnType.TEXT:
        petastorm_column = UnischemaField(column_name,
                                          np.string_,
                                          (),
                                          ScalarCodec(StringType()),
                                          column_is_nullable)
    elif column_type == ColumnType.NDARRAY:
        petastorm_column = UnischemaField(column_name,
                                          np.uint8,
                                          column_array_dimensions,
                                          NdarrayCodec(),
                                          column_is_nullable)
    else:
        LoggingManager().log("Invalid column type: " + str(column_type),
                             LoggingLevel.ERROR)

    return petastorm_column
Example 12: _numpy_to_spark_mapping
# Required module import: from pyspark.sql import types [as alias]
# Alternatively: from pyspark.sql.types import IntegerType [as alias]
def _numpy_to_spark_mapping():
    """Returns a mapping from numpy to pyspark.sql type. Caches the mapping dictionary in order to avoid instantiation
    of multiple objects in each call."""

    # Refer to the attribute of the function we use to cache the map using a name in the variable instead of a 'dot'
    # notation to avoid copy/paste/typo mistakes
    cache_attr_name = 'cached_numpy_to_pyspark_types_map'
    if not hasattr(_numpy_to_spark_mapping, cache_attr_name):
        import pyspark.sql.types as T
        setattr(_numpy_to_spark_mapping, cache_attr_name,
                {
                    np.int8: T.ByteType(),
                    np.uint8: T.ShortType(),
                    np.int16: T.ShortType(),
                    np.uint16: T.IntegerType(),
                    np.int32: T.IntegerType(),
                    np.int64: T.LongType(),
                    np.float32: T.FloatType(),
                    np.float64: T.DoubleType(),
                    np.string_: T.StringType(),
                    np.str_: T.StringType(),
                    np.unicode_: T.StringType(),
                    np.bool_: T.BooleanType(),
                })

    return getattr(_numpy_to_spark_mapping, cache_attr_name)


# TODO: Changing fields in this class or the UnischemaField will break reading due to the schema being pickled next to
# the dataset on disk
Example 13: test_predicate_on_partitioned_dataset
# Required module import: from pyspark.sql import types [as alias]
# Alternatively: from pyspark.sql.types import IntegerType [as alias]
def test_predicate_on_partitioned_dataset(tmpdir):
    """
    Generates a partitioned dataset and ensures that readers evaluate the type of the partition
    column according to the type given in the Unischema.
    """
    TestSchema = Unischema('TestSchema', [
        UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('test_field', np.int32, (), ScalarCodec(IntegerType()), False),
    ])

    def test_row_generator(x):
        """Returns a single entry in the generated dataset."""
        return {'id': x,
                'test_field': x*x}

    rowgroup_size_mb = 256
    dataset_url = "file://{0}/partitioned_test_dataset".format(tmpdir)

    spark = SparkSession.builder.config('spark.driver.memory', '2g').master('local[2]').getOrCreate()
    sc = spark.sparkContext

    rows_count = 10
    with materialize_dataset(spark, dataset_url, TestSchema, rowgroup_size_mb):
        rows_rdd = sc.parallelize(range(rows_count))\
            .map(test_row_generator)\
            .map(lambda x: dict_to_spark_row(TestSchema, x))

        spark.createDataFrame(rows_rdd, TestSchema.as_spark_schema()) \
            .write \
            .partitionBy('id') \
            .parquet(dataset_url)

    with make_reader(dataset_url, predicate=in_lambda(['id'], lambda x: x == 3)) as reader:
        assert next(reader).id == 3
    with make_reader(dataset_url, predicate=in_lambda(['id'], lambda x: x == '3')) as reader:
        with pytest.raises(StopIteration):
            # Predicate should have selected none, so a StopIteration should be raised.
            next(reader)
Example 14: test_fields
# Required module import: from pyspark.sql import types [as alias]
# Alternatively: from pyspark.sql.types import IntegerType [as alias]
def test_fields():
    """Try using 'fields' getter"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    assert len(TestSchema.fields) == 2
    assert TestSchema.fields['int_field'].name == 'int_field'
    assert TestSchema.fields['string_field'].name == 'string_field'
Example 15: test_create_schema_view_using_invalid_type
# Required module import: from pyspark.sql import types [as alias]
# Alternatively: from pyspark.sql.types import IntegerType [as alias]
def test_create_schema_view_using_invalid_type():
    """Exercises code paths unischema.create_schema_view ValueError, and unischema.__str__."""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])
    with pytest.raises(ValueError, match='must be either a string'):
        TestSchema.create_schema_view([42])