This article collects typical usage examples of the pyspark.sql.types.Row method in Python. If you are wondering what types.Row does, how to use it, or what real code that uses it looks like, the curated examples below should help. You can also explore further usage examples from the module it belongs to, pyspark.sql.types.
Below are 12 code examples of types.Row, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
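Before the examples, here is a minimal sketch of what pyspark.sql.types.Row is: a named-tuple-like record used both as the result type of DataFrame queries and as a convenient way to build DataFrames. The session setup below is purely illustrative.

from pyspark.sql import SparkSession
from pyspark.sql.types import Row

spark = SparkSession.builder.master("local[1]").appName("row-demo").getOrCreate()

# A Row behaves like a named tuple; fields are accessible as attributes.
person = Row(name="Alice", age=5)
print(person.name, person.age)

# Note: before Spark 3.0, Row(**kwargs) sorted its fields alphabetically by name.
df = spark.createDataFrame([Row(name="Alice", age=5), Row(name="Bob", age=7)])
df.show()
spark.stop()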
Example 1: _create_train_image_uris_and_labels
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import Row [as alias]
def _create_train_image_uris_and_labels(self, repeat_factor=1, cardinality=100, dense=True):
    image_uris = getSampleImagePaths() * repeat_factor
    # Create image categorical labels (integer IDs)
    local_rows = []
    for uri in image_uris:
        label = np.random.randint(low=0, high=cardinality, size=1)[0]
        if dense:
            label_inds = np.zeros(cardinality)
            label_inds[label] = 1.0
            label_inds = label_inds.ravel()
            assert label_inds.shape[0] == cardinality, label_inds.shape
            one_hot_vec = spla.Vectors.dense(label_inds.tolist())
        else:  # sparse
            one_hot_vec = spla.Vectors.sparse(cardinality, {label: 1})
        _row_struct = {self.input_col: uri, self.one_hot_col: one_hot_vec,
                       self.one_hot_label_col: float(label)}
        row = sptyp.Row(**_row_struct)
        local_rows.append(row)
    image_uri_df = self.session.createDataFrame(local_rows)
    return image_uri_df
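This test helper builds one Row per image URI together with an MLlib label vector; spla and sptyp are presumably aliases for pyspark.ml.linalg and pyspark.sql.types, and getSampleImagePaths comes from the project's own test utilities. A reduced, self-contained sketch of the same pattern with placeholder URIs:

from pyspark.sql import SparkSession, Row
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.master("local[1]").getOrCreate()

cardinality = 3
rows = []
for i, uri in enumerate(["img_0.png", "img_1.png", "img_2.png", "img_3.png"]):
    label = i % cardinality
    one_hot = Vectors.sparse(cardinality, {label: 1.0})  # sparse one-hot vector
    rows.append(Row(uri=uri, one_hot=one_hot, label=float(label)))

df = spark.createDataFrame(rows)
df.show(truncate=False)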
Example 2: range
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import Row [as alias]
def range(self, start, end=None, step=1, numPartitions=None):
    """
    Create a :class:`DataFrame` with a single :class:`pyspark.sql.types.LongType` column named
    ``id``, containing elements in a range from ``start`` to ``end`` (exclusive) with
    step value ``step``.

    :param start: the start value
    :param end: the end value (exclusive)
    :param step: the incremental step (default: 1)
    :param numPartitions: the number of partitions of the DataFrame
    :return: :class:`DataFrame`

    >>> sqlContext.range(1, 7, 2).collect()
    [Row(id=1), Row(id=3), Row(id=5)]

    If only one argument is specified, it will be used as the end value.

    >>> sqlContext.range(3).collect()
    [Row(id=0), Row(id=1), Row(id=2)]
    """
    return self.sparkSession.range(start, end, step, numPartitions)
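SQLContext.range delegates to SparkSession.range, so the modern entry point can be used directly; a small sketch assuming a local session:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
print(spark.range(1, 7, 2).collect())  # [Row(id=1), Row(id=3), Row(id=5)]
print(spark.range(3).collect())        # [Row(id=0), Row(id=1), Row(id=2)]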
Example 3: tables
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import Row [as alias]
def tables(self, dbName=None):
    """Returns a :class:`DataFrame` containing names of tables in the given database.

    If ``dbName`` is not specified, the current database will be used.

    The returned DataFrame contains a ``tableName`` column and an ``isTemporary`` column
    (a column with :class:`BooleanType` indicating if a table is a temporary one or not).

    :param dbName: string, name of the database to use.
    :return: :class:`DataFrame`

    >>> sqlContext.registerDataFrameAsTable(df, "table1")
    >>> df2 = sqlContext.tables()
    >>> df2.filter("tableName = 'table1'").first()
    Row(database=u'', tableName=u'table1', isTemporary=True)
    """
    if dbName is None:
        return DataFrame(self._ssql_ctx.tables(), self)
    else:
        return DataFrame(self._ssql_ctx.tables(dbName), self)
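The Catalog API on SparkSession exposes similar information in newer Spark versions; a sketch assuming a local session:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
spark.range(3).createOrReplaceTempView("table1")
# Each entry reports the table name, its database and whether it is temporary,
# mirroring the Row(database=..., tableName=..., isTemporary=...) output above.
for table in spark.catalog.listTables():
    print(table.name, table.database, table.isTemporary)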
Example 4: toNDArray
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import Row [as alias]
def toNDArray(self, image):
    """
    Converts an image to an array with metadata.

    :param `Row` image: A row that contains the image to be converted. It should
        have the attributes specified in `ImageSchema.imageSchema`.
    :return: a `numpy.ndarray` that is an image.

    .. versionadded:: 2.3.0
    """
    if not isinstance(image, Row):
        raise TypeError(
            "image argument should be pyspark.sql.types.Row; however, "
            "it got [%s]." % type(image))
    if any(not hasattr(image, f) for f in self.imageFields):
        raise ValueError(
            "image argument should have attributes specified in "
            "ImageSchema.imageSchema [%s]." % ", ".join(self.imageFields))
    height = image.height
    width = image.width
    nChannels = image.nChannels
    return np.ndarray(
        shape=(height, width, nChannels),
        dtype=np.uint8,
        buffer=image.data,
        strides=(width * nChannels, nChannels, 1))
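A hedged usage sketch: images read through Spark's built-in image data source arrive as a struct column whose rows carry exactly the attributes toNDArray expects (the "images/" path is a placeholder):

from pyspark.sql import SparkSession
from pyspark.ml.image import ImageSchema

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.read.format("image").load("images/")  # placeholder directory of images
image_row = df.first()["image"]                  # a Row with origin/height/width/nChannels/mode/data
arr = ImageSchema.toNDArray(image_row)           # numpy array of shape (height, width, nChannels)
print(arr.shape)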
Example 5: toImage
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import Row [as alias]
def toImage(self, array, origin=""):
    """
    Converts an array with metadata to a two-dimensional image.

    :param `numpy.ndarray` array: The array to convert to image.
    :param str origin: Path to the image, optional.
    :return: a :class:`Row` that is a two dimensional image.

    .. versionadded:: 2.3.0
    """
    if not isinstance(array, np.ndarray):
        raise TypeError(
            "array argument should be numpy.ndarray; however, it got [%s]." % type(array))
    if array.ndim != 3:
        raise ValueError("Invalid array shape")
    height, width, nChannels = array.shape
    ocvTypes = ImageSchema.ocvTypes
    if nChannels == 1:
        mode = ocvTypes["CV_8UC1"]
    elif nChannels == 3:
        mode = ocvTypes["CV_8UC3"]
    elif nChannels == 4:
        mode = ocvTypes["CV_8UC4"]
    else:
        raise ValueError("Invalid number of channels")
    # Running `bytearray(numpy.array([1]))` fails in specific Python versions
    # with a specific NumPy version, for example Python 3.6.0 with NumPy 1.13.3.
    # Converting to bytes first avoids the issue.
    data = bytearray(array.astype(dtype=np.uint8).ravel().tobytes())
    # Create the new Row with _create_row(), because Row(name=value, ...)
    # orders fields by name, which conflicts with the expected schema order
    # when the new DataFrame is created by a UDF.
    return _create_row(self.imageFields,
                       [origin, height, width, nChannels, mode, data])
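And the reverse direction, building an image Row from a synthetic array (a sketch; an active SparkSession is needed because ImageSchema resolves the OpenCV type codes through the JVM):

import numpy as np
from pyspark.sql import SparkSession
from pyspark.ml.image import ImageSchema

spark = SparkSession.builder.master("local[1]").getOrCreate()
array = np.zeros((4, 4, 3), dtype=np.uint8)                 # height=4, width=4, 3 channels
image_row = ImageSchema.toImage(array, origin="synthetic")  # origin is just a label here
print(image_row.height, image_row.width, image_row.nChannels)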
Example 6: __init__
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import Row [as alias]
def __init__(self, sparkContext, sparkSession=None, jsqlContext=None):
    """Creates a new SQLContext.

    >>> from datetime import datetime
    >>> sqlContext = SQLContext(sc)
    >>> allTypes = sc.parallelize([Row(i=1, s="string", d=1.0, l=1,
    ...     b=True, list=[1, 2, 3], dict={"s": 0}, row=Row(a=1),
    ...     time=datetime(2014, 8, 1, 14, 1, 5))])
    >>> df = allTypes.toDF()
    >>> df.createOrReplaceTempView("allTypes")
    >>> sqlContext.sql('select i+1, d+1, not b, list[1], dict["s"], time, row.a '
    ...     'from allTypes where b and i > 0').collect()
    [Row((i + CAST(1 AS BIGINT))=2, (d + CAST(1 AS DOUBLE))=2.0, (NOT b)=False, list[1]=2, \
        dict[s]=0, time=datetime.datetime(2014, 8, 1, 14, 1, 5), a=1)]
    >>> df.rdd.map(lambda x: (x.i, x.s, x.d, x.l, x.b, x.time, x.row.a, x.list)).collect()
    [(1, u'string', 1.0, 1, True, datetime.datetime(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])]
    """
    self._sc = sparkContext
    self._jsc = self._sc._jsc
    self._jvm = self._sc._jvm
    if sparkSession is None:
        sparkSession = SparkSession.builder.getOrCreate()
    if jsqlContext is None:
        jsqlContext = sparkSession._jwrapped
    self.sparkSession = sparkSession
    self._jsqlContext = jsqlContext
    _monkey_patch_RDD(self.sparkSession)
    install_exception_handler()
    if SQLContext._instantiatedContext is None:
        SQLContext._instantiatedContext = self
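As the constructor shows, SQLContext is a thin wrapper over a SparkSession and is kept mainly for backward compatibility. A minimal construction sketch:

from pyspark.sql import SparkSession, SQLContext, Row

spark = SparkSession.builder.master("local[1]").getOrCreate()
sqlContext = SQLContext(spark.sparkContext)   # reuses the active SparkSession
df = sqlContext.createDataFrame([Row(i=1, s="string", b=True)])
df.createOrReplaceTempView("allTypes")
sqlContext.sql("select i + 1, s from allTypes where b").show()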
Example 7: _inferSchema
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import Row [as alias]
def _inferSchema(self, rdd, samplingRatio=None):
    """
    Infer schema from an RDD of Row or tuple.

    :param rdd: an RDD of Row or tuple
    :param samplingRatio: sampling ratio, or no sampling (default)
    :return: :class:`pyspark.sql.types.StructType`
    """
    return self.sparkSession._inferSchema(rdd, samplingRatio)
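Schema inference is usually triggered indirectly through createDataFrame rather than by calling _inferSchema yourself; a small sketch:

from pyspark.sql import SparkSession, Row

spark = SparkSession.builder.master("local[1]").getOrCreate()
rdd = spark.sparkContext.parallelize([Row(field1=1, field2="row1"),
                                      Row(field1=2, field2="row2")])
# createDataFrame infers a StructType from the Rows (the path _inferSchema serves).
df = spark.createDataFrame(rdd)
print(df.schema)  # StructType with a LongType field1 and a StringType field2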
Example 8: _test
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import Row [as alias]
def _test():
    import os
    import sys  # needed for sys.exit() below
    import doctest
    import tempfile
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.context

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.context.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['tempfile'] = tempfile
    globs['os'] = os
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    globs['rdd'] = rdd = sc.parallelize(
        [Row(field1=1, field2="row1"),
         Row(field1=2, field2="row2"),
         Row(field1=3, field2="row3")]
    )
    globs['df'] = rdd.toDF()
    jsonStrings = [
        '{"field1": 1, "field2": "row1", "field3":{"field4":11}}',
        '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},'
        '"field6":[{"field7": "row2"}]}',
        '{"field1" : null, "field2": "row3", '
        '"field3":{"field4":33, "field5": []}}'
    ]
    globs['jsonStrings'] = jsonStrings
    globs['json'] = sc.parallelize(jsonStrings)
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.context, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    globs['sc'].stop()
    if failure_count:
        sys.exit(-1)
Example 9: read_images
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import Row [as alias]
def read_images(spark, filenames):
    filenames_rdd = spark.sparkContext.parallelize(filenames)
    schema = StructType(
        [StructField("filename", StringType(), True), StructField("image", StringType(), True)])
    return filenames_rdd.map(lambda x: Row(filename=x,
                                           image=read_image_bytes_base64(x))).toDF(schema=schema)
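This snippet additionally assumes `from pyspark.sql.types import Row, StructType, StructField, StringType`. The read_image_bytes_base64 helper is defined elsewhere in that project; a plausible, purely hypothetical stand-in that matches how it is used above:

import base64

def read_image_bytes_base64(path):
    """Hypothetical helper: return the file's contents as a base64-encoded string."""
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")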
Example 10: evaluate_communities
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import Row [as alias]
def evaluate_communities(args):
    log = logging.getLogger("evalcc")
    model = CommunitiesModel().load(args.input)
    configure(args)
    spark = create_spark("evalcc-%s" % uuid4(), **filter_kwargs(args.__dict__, create_spark))
    log.info("Preparing the communities' RDD")
    items = []
    for i, c in progress_bar(enumerate(model.communities), log,
                             expected_size=len(model.communities)):
        for m in c:
            if m < len(model.id_to_element):
                items.append(Row(sha1=model.id_to_element[m], community=i))
    log.info("Running")
    items_in_spark = spark.sparkContext.parallelize(items).toDF()
    bags = spark \
        .read \
        .format("org.apache.spark.sql.cassandra") \
        .options(table=args.tables["bags"], keyspace=args.keyspace) \
        .load()
    log.info("Loaded the bags, calculating the vocabulary")
    vocabulary = bags.drop("sha1", "value").distinct().rdd.map(lambda x: x.item).collect()
    vocabulary = {v: i for i, v in enumerate(vocabulary)}
    log.info("Vocabulary size: %d", len(vocabulary))
    element_to_id = {e: i for i, e in enumerate(model.id_to_element)}
    metrics = items_in_spark.join(bags, "sha1").rdd \
        .map(lambda r: (r.community, (element_to_id[r.sha1], vocabulary[r.item], r.value))) \
        .groupByKey() \
        .map(CommunityEvaluator(args.threshold, len(vocabulary))) \
        .reduce(lambda v1, v2: [v1[i] + v2[i] for i in range(4)])
    log.info("Total misses: %d", metrics[0])
    log.info("Average normalized misses: %f", metrics[1] / len(model.communities))
    log.info("Total loss: %f", metrics[2])
    log.info("Average normalized loss: %f", numpy.sqrt(metrics[3] / len(model.communities)))
Example 11: __call__
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import Row [as alias]
def __call__(self, record):
    key, wmh = record
    for hti in range(self.htnum):
        yield Row(sha1=key, hashtable=hti,
                  value=bytearray(wmh[hti * self.band_size:(hti + 1) * self.band_size].data))
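A callable like this is typically handed to RDD.flatMap so that each (key, hash array) record expands into one Row per hash table; a standalone sketch with assumed htnum and band_size values:

import numpy as np
from pyspark.sql import SparkSession, Row

spark = SparkSession.builder.master("local[1]").getOrCreate()

def explode_hashtables(record, htnum=2, band_size=4):
    key, wmh = record
    for hti in range(htnum):
        yield Row(sha1=key, hashtable=hti,
                  value=bytearray(wmh[hti * band_size:(hti + 1) * band_size].tobytes()))

rdd = spark.sparkContext.parallelize([("k1", np.arange(8, dtype=np.uint8))])
print(rdd.flatMap(explode_hashtables).collect())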
Example 12: __call__
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import Row [as alias]
def __call__(self, head):
    rows = head.map(lambda row: Row(sha1=row.document,
                                    item=row.token,
                                    value=float(row.value)))
    if self.explained:
        self._log.info("toDebugString():\n%s", rows.toDebugString().decode())
    rows.toDF() \
        .write \
        .format("org.apache.spark.sql.cassandra") \
        .mode("append") \
        .options(table=self.table, keyspace=self.keyspace) \
        .save()
    return head
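The Cassandra sink above requires the spark-cassandra-connector package; the same map-to-Row-then-write pattern can be sketched against a local Parquet path instead (the path and column names are illustrative):

from pyspark.sql import SparkSession, Row

spark = SparkSession.builder.master("local[1]").getOrCreate()
head = spark.sparkContext.parallelize([("doc1", "tok1", 1.0), ("doc2", "tok2", 2.0)])
rows = head.map(lambda t: Row(sha1=t[0], item=t[1], value=float(t[2])))
rows.toDF().write.mode("append").parquet("/tmp/bags_parquet")  # illustrative output path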