This article collects typical usage examples of the Python method pyspark.ml.linalg.VectorUDT. If you are wondering what linalg.VectorUDT does, how to use it, or want to see it in working code, the curated examples below may help. You can also follow up on the containing module, pyspark.ml.linalg, for more details.
The following shows 15 code examples of linalg.VectorUDT, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
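Before the examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of why VectorUDT matters: it is the SQL data type you declare when a UDF returns a pyspark.ml.linalg Vector. The dataframe and column names are purely illustrative.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.linalg import Vectors, VectorUDT

spark = SparkSession.builder.getOrCreate()

# A toy dataframe with a plain array column.
df = spark.createDataFrame([(1, [1.0, 2.0, 3.0])], ["id", "values"])

# A UDF that returns an ML Vector must declare VectorUDT() as its return type;
# otherwise Spark cannot serialize the Vector objects it produces.
to_vector = F.udf(lambda xs: Vectors.dense(xs), VectorUDT())

df.withColumn("features", to_vector("values")).printSchema()
# root
#  |-- id: long (nullable = true)
#  |-- values: array (nullable = true)
#  |    |-- element: double (containsNull = true)
#  |-- features: vector (nullable = true)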
Example 1: append_features

# Module to import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def append_features(df, *cols):
    """Append features from columns to the features vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    cols : list of str

    Returns
    -------
    pyspark.sql.DataFrame
    """
    def add_features(feat, *other):
        raw = feat.toArray()
        return Vectors.dense(np.append(raw, list(map(float, other))))

    add_features_udf = F.udf(add_features, VectorUDT())
    # cols is a tuple; convert it so it concatenates with the metadata list.
    new_feat_list = df.schema['features'].metadata['features'] + list(cols)
    return df.withColumn('features', mjolnir.spark.add_meta(
        df._sc, add_features_udf('features', *cols), {'features': new_feat_list}))
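As a quick orientation (not part of the original project code), a hypothetical call could look like the sketch below, assuming df already carries mjolnir-style feature-name metadata on its 'features' column and that the two column names are illustrative numeric columns.

# Hypothetical usage sketch; column names are illustrative.
df2 = append_features(df, 'page_views', 'incoming_links')
df2.schema['features'].metadata['features']
# -> the original feature names followed by 'page_views' and 'incoming_links'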
Example 2: zero_features

# Module to import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def zero_features(df, *feature_names):
    """Zero out features in the feature vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    feature_names : list of str

    Returns
    -------
    pyspark.sql.DataFrame
    """
    features = df.schema['features'].metadata['features']
    idxs = [features.index(name) for name in feature_names]

    def zero_features(feat):
        raw = feat.toArray()
        for idx in idxs:
            raw[idx] = 0.
        return Vectors.dense(raw)

    zero_features_udf = F.udf(zero_features, VectorUDT())
    return df.withColumn('features', mjolnir.spark.add_meta(
        df._sc, zero_features_udf('features'), {'features': features}))
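zero_features follows the same UDF-with-VectorUDT pattern as Example 1. A hypothetical ablation call, with illustrative feature names, might be:

# Hypothetical usage sketch: disable two features while keeping the vector
# layout and the feature-name metadata intact.
df_ablated = zero_features(df, 'title_match', 'popularity_score')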
Example 3: transform

# Module to import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def transform(self, X_rdd, y_rdd=None):
    '''
    Given an X RDD (and optionally a y RDD), output a dataframe with a
    term-frequency feature vector and labels.
    '''
    # check input type
    if type(X_rdd) != RDD:
        raise TypeError("Arguments must be pySpark RDDs")
    if y_rdd and type(y_rdd) != RDD:
        raise TypeError("Arguments must be pySpark RDDs")

    # get term frequencies
    X = X_rdd.map(self._term_frequency).cache()

    # convert to sparse vectors (tuple unpacking in lambdas is Python 2 only,
    # so unpack by index instead)
    X = X.map(lambda hf: (hf[0], SparseVector(
        self.num_features, np.nonzero(hf[1])[0], hf[1][hf[1] > 0])))

    # check if labels exist
    if y_rdd:
        # combine X and y into a single dataframe
        X = X.zipWithIndex().map(lambda r: (r[1], r[0]))
        y = y_rdd.zipWithIndex().map(lambda r: (r[1], r[0]))
        # joined rows look like (idx, ((hash, features), label))
        data = X.join(y).map(lambda r: (r[1][0][0], r[1][0][1], r[1][1]))
        schema = StructType([StructField('hash', StringType(), True),
                             StructField('features', VectorUDT(), True),
                             StructField('label', StringType(), True)])
        data = data.toDF(schema)
        data = data.withColumn('label', data.label.cast(DoubleType()))
    else:
        schema = StructType([StructField('hash', StringType(), True),
                             StructField('features', VectorUDT(), True)])
        data = X.toDF(schema)
    return data
Example 4: transform

# Module to import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def transform(self, X_rdd, y_rdd=None, train=True):
    '''
    Given an X RDD (and optionally a y RDD), output a dataframe with a
    term-frequency feature vector and labels.
    '''
    # check input type
    if type(X_rdd) != RDD:
        raise TypeError("Arguments must be pySpark RDDs")
    if y_rdd and type(y_rdd) != RDD:
        raise TypeError("Arguments must be pySpark RDDs")

    # word tokenization
    X = X_rdd.map(self._tokenize).cache()

    # create dictionary of words kept after the min_df filter
    if train:
        self.dictionary = (X.map(lambda row: row[1])
                            .flatMap(lambda word: word)
                            .map(lambda word: (word, 1))
                            .reduceByKey(lambda acc, w: acc + w)
                            .filter(lambda x: x[1] >= self.min_df)
                            .collectAsMap())
        self.dictionary = dict(zip(self.dictionary, range(len(self.dictionary))))

    # create word vectors
    X = X.map(self._term_frequency)

    # check if labels exist
    if y_rdd:
        # combine X and y into a single dataframe
        X = X.zipWithIndex().map(lambda r: (r[1], r[0]))
        y = y_rdd.zipWithIndex().map(lambda r: (r[1], r[0]))
        # joined rows look like (idx, ((hash, features), label));
        # unpack by index since tuple unpacking in lambdas is Python 2 only
        data = X.join(y).map(lambda r: (r[1][0][0], r[1][0][1], r[1][1]))
        schema = StructType([StructField('hash', StringType(), True),
                             StructField('features', VectorUDT(), True),
                             StructField('label', StringType(), True)])
        data = data.toDF(schema)
        data = data.withColumn('label', data.label.cast(DoubleType()))
    else:
        schema = StructType([StructField('hash', StringType(), True),
                             StructField('features', VectorUDT(), True)])
        data = X.toDF(schema)
    return data
Example 5: read_partition

# Module to import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def read_partition(
    spark: SparkSession,
    table: str,
    partition_spec: Mapping[str, str],
    schema: Optional[T.StructType] = None,
    direct_parquet_read: bool = False
) -> DataFrame:
    """Read a single partition from a hive table.

    Verifies the partition specification describes a complete partition,
    that the partition exists, and optionally that the table is compatible
    with an expected schema. The partition could still be empty.
    """
    # We don't need to do anything with the result; the goal is to
    # trigger AnalysisException when the arguments are invalid.
    spark.sql(_describe_partition_ql(table, partition_spec)).collect()

    partition_cond = F.lit(True)
    for k, v in partition_spec.items():
        partition_cond &= F.col(k) == v
    df = spark.read.table(table).where(partition_cond)
    # The df we have now has types defined by the hive table, but this downgrades
    # non-standard types like VectorUDT() to its sql equivalent. Use the first
    # df to find the files, then read them directly.
    if direct_parquet_read:
        input_files = list(df._jdf.inputFiles())  # type: ignore
        input_dirs = set(os.path.dirname(path) for path in input_files)
        if len(input_dirs) != 1:
            raise Exception('Expected single directory containing partition data: [{}]'.format(
                '],['.join(input_files)))
        df = spark.read.parquet(list(input_dirs)[0])
    if schema is not None:
        # TODO: This only allows extra top level columns; anything
        # nested must be exactly the same. Fine for now.
        _verify_schema_compatability(schema, df.schema)
        df = df.select(*(field.name for field in schema))
    # Drop partitioning columns. These are not part of the mjolnir transformations, and
    # are only an implementation detail of putting them on disk and tracking history.
    return df.drop(*partition_spec.keys())
Example 6: _transform

# Module to import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def _transform(self, dataset):
    inp = self.getOrDefault(self.inputCol)
    out = self.getOrDefault(self.predictionCol)
    mod_str = self.getOrDefault(self.modStr)
    use_vector_out = self.getOrDefault(self.useVectorOut)

    # Deserialize the pickled PyTorch model and broadcast it to the executors.
    model = dill.loads(codecs.decode(mod_str.encode(), "base64"))
    model_broadcast = dataset._sc.broadcast(model)

    def predict_vec(data):
        features = data.toArray().reshape((1, len(data)))
        x_data = torch.from_numpy(features).float()
        model = model_broadcast.value
        model.eval()
        return Vectors.dense(model(x_data).detach().numpy().flatten())

    def predict_float(data):
        features = data.toArray().reshape((1, len(data)))
        x_data = torch.from_numpy(features).float()
        model = model_broadcast.value
        model.eval()
        raw_prediction = model(x_data).detach().numpy().flatten()
        if len(raw_prediction) > 1:
            return float(np.argmax(raw_prediction))
        return float(raw_prediction[0])

    if use_vector_out:
        udfGenerateCode = F.udf(predict_vec, VectorUDT())
    else:
        udfGenerateCode = F.udf(predict_float, DoubleType())

    return dataset.withColumn(out, udfGenerateCode(inp))
Example 7: _convert_vector

# Module to import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def _convert_vector(df, dtype):
    from pyspark.ml.linalg import VectorUDT
    from pyspark.mllib.linalg import VectorUDT as OldVectorUDT

    for field in df.schema:
        col_name = field.name
        if isinstance(field.dataType, VectorUDT) or \
                isinstance(field.dataType, OldVectorUDT):
            df = df.withColumn(col_name,
                               vector_to_array(df[col_name], dtype))
    return df
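For context, vector_to_array used above is a helper in pyspark.ml.functions (Spark 3.0+) that converts a VectorUDT column into a plain array column. A minimal sketch of _convert_vector in action, assuming a Spark 3.x session named spark:

from pyspark.ml.functions import vector_to_array  # Spark 3.0+
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([(Vectors.dense([1.0, 2.0]),)], ["features"])
df.printSchema()                                # features column has type 'vector'
_convert_vector(df, "float32").printSchema()    # features column becomes array<float>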
Example 8: _average_feature_vectors

# Module to import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def _average_feature_vectors(self, data, outputCol):
    '''Average the feature vectors.

    Parameters
    ----------
    data : DataFrame
        input dataframe
    outputCol : str
        name of the output column
    '''
    session = SparkSession.builder.getOrCreate()

    def _averager(v1, v2, v3):
        f1 = v1.toArray()
        f2 = v2.toArray()
        f3 = v3.toArray()
        length = min(len(f1), len(f2), len(f3))
        average = []
        for i in range(length):
            average.append((f1[i] + f2[i] + f3[i]) / 3.0)
        return Vectors.dense(average)

    session.udf.register("averager", _averager, VectorUDT())
    data.createOrReplaceTempView("table")
    sql = f"SELECT *, averager(feature0, feature1, feature2) AS {self.outputCol} from table"
    data = session.sql(sql)
    return data
Example 9: toPandas

# Module to import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def toPandas(self, df):
    """
    This is similar to the Spark DataFrame built-in toPandas() method, but it handles
    MLlib Vector columns differently. It converts MLlib Vectors into rows of
    scipy.sparse.csr_matrix, which is generally friendlier for PyData tools like scikit-learn.

    .. note:: Experimental: This will likely be replaced in later releases with improved APIs.

    :param df: Spark DataFrame
    :return: Pandas dataframe
    """
    cols = df.columns
    # Convert any MLlib Vector columns to scipy.sparse.csr_matrix
    matrixCols = []

    def toscipy(v):
        if isinstance(v, DenseVector):
            return csr_matrix((v.values, np.array(range(v.size)), np.array([0, v.size])),
                              shape=(1, v.size))
        elif isinstance(v, SparseVector):
            return csr_matrix((v.values, v.indices, np.array([0, len(v.indices)])),
                              shape=(1, v.size))
        else:
            raise TypeError("Converter.toPandas found unknown Vector type: %s" % type(v))

    tosparse = udf(lambda v: toscipy(v), CSRVectorUDT())
    for i in range(len(cols)):
        c = cols[i]
        if isinstance(df.schema.fields[i].dataType, VectorUDT):
            cols[i] = tosparse(df[c]).alias(c)
            matrixCols.append(c)
        else:
            cols[i] = df[c]
    return df.select(*cols).toPandas()
Example 10: dataframe_to_nparray

# Module to import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def dataframe_to_nparray(df):
    from pyspark.ml.linalg import VectorUDT

    schema = df.schema
    npcols = []
    for i in range(0, len(df.columns)):
        if isinstance(schema.fields[i].dataType, VectorUDT):
            # NOTE: DataFrame.as_matrix() was removed in pandas 1.0;
            # on newer pandas use .to_numpy() instead.
            npcols.append(df.select(df.columns[i]).toPandas().apply(
                lambda x: numpy.array(x[0].toArray())).as_matrix().reshape(-1, 1))
        else:
            npcols.append(df.select(df.columns[i]).collect())
    return numpy.array(npcols)
Example 11: _getNumpyFeaturesAndLabels

# Module to import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def _getNumpyFeaturesAndLabels(self, dataset):
    """
    We assume the training data fits in memory on a single server.
    The input dataframe is converted to numerical image features and
    broadcast to all the worker nodes.
    """
    image_uri_col = self.getInputCol()
    label_col = None
    if self.isDefined(self.labelCol) and self.getLabelCol() != "":
        label_col = self.getLabelCol()

    tmp_image_col = self._loadedImageCol()
    image_df = self.loadImagesInternal(dataset, image_uri_col).dropna(subset=[tmp_image_col])

    # Extract features
    localFeatures = []
    rows = image_df.collect()
    for row in rows:
        spimg = row[tmp_image_col]
        features = imageStructToArray(spimg)
        localFeatures.append(features)
    if not localFeatures:  # NOTE(phi-dbq): pep-8 recommended against testing 0 == len(array)
        raise ValueError("Cannot extract any feature from dataset!")
    X = np.stack(localFeatures, axis=0)

    # Extract labels
    y = None
    if label_col is not None:
        label_schema = image_df.schema[label_col]
        label_dtype = label_schema.dataType
        assert isinstance(label_dtype, spla.VectorUDT), \
            "must encode labels in one-hot vector format, but got {}".format(label_dtype)

        localLabels = []
        for row in rows:
            try:
                _keras_label = row[label_col].toArray()
            except ValueError:
                raise ValueError("Cannot extract encoded label array")
            localLabels.append(_keras_label)
        if not localLabels:
            raise ValueError("Failed to load any labels from dataset, but labels are required")
        y = np.stack(localLabels, axis=0)
        assert y.shape[0] == X.shape[0], \
            "number of features {} != number of labels {}".format(X.shape[0], y.shape[0])
    return X, y
Example 12: collect_from_ltr_plugin_and_kafka

# Module to import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def collect_from_ltr_plugin_and_kafka(df, brokers, model, feature_names_accu, indices=None):
    """Collect feature vectors from elasticsearch via kafka.

    Pushes queries into a kafka topic and retrieves results from a second kafka topic.
    A daemon must be running on relforge to collect the queries and produce results.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Source dataframe containing wikiid, query and hit_page_id fields
        to collect feature vectors for.
    brokers : list of str
        List of kafka brokers used to bootstrap access into the kafka cluster.
    model : string
        Definition of the model/featureset: "featureset:name", "model:name"
        or "featureset:name@storeName".
    feature_names_accu : Accumulator
        Used to collect feature names.
    indices : dict, optional
        Map from wikiid to elasticsearch index to query. If a wikiid is
        not present, the wikiid will be used as the index name. (Default: None)
    """
    mjolnir.spark.assert_columns(df, ['wikiid', 'query', 'hit_page_id'])
    if indices is None:
        indices = {}
    eltType, name, store = mjolnir.utils.explode_ltr_model_definition(model)
    log_query = LtrLoggingQuery(eltType, name, store)

    def kafka_handle_response(record):
        assert record['status_code'] == 200
        parsed = json.loads(record['text'])
        response = parsed['responses'][0]
        meta = record['meta']
        for hit_page_id, features in extract_ltr_log_feature_values(response, feature_names_accu):
            yield [meta['wikiid'], meta['query'], hit_page_id, features]

    rdd = mjolnir.kafka.client.msearch(
        df.groupBy('wikiid', 'query').agg(F.collect_set('hit_page_id').alias('hit_page_ids')),
        client_config=brokers,
        meta_keys=['wikiid', 'query'],
        create_es_query=lambda row: log_query.make_msearch(row, indices),
        handle_response=kafka_handle_response)

    return df.sql_ctx.createDataFrame(rdd, T.StructType([
        df.schema['wikiid'], df.schema['query'], df.schema['hit_page_id'],
        T.StructField('features', VectorUDT(), nullable=False)
        # We could have gotten duplicate data from kafka. Clean them up.
    ])).drop_duplicates(['wikiid', 'query', 'hit_page_id'])
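The return statement above builds a schema that mixes fields copied from the source dataframe with an explicit VectorUDT() field. Stripped of the mjolnir/kafka machinery, the pattern reduces to the sketch below (column names and values are illustrative; spark is an existing SparkSession):

import pyspark.sql.types as T
from pyspark.ml.linalg import Vectors, VectorUDT

schema = T.StructType([
    T.StructField('wikiid', T.StringType(), nullable=False),
    T.StructField('features', VectorUDT(), nullable=False),
])
# A VectorUDT column accepts both dense and sparse vectors.
rows = [('enwiki', Vectors.dense([0.1, 0.7])),
        ('dewiki', Vectors.sparse(2, [0], [1.0]))]
features_df = spark.createDataFrame(rows, schema)
features_df.show(truncate=False)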
Example 13: one_hot_encode

# Module to import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def one_hot_encode(self, data=None, inputCol=None, outputCol=None):
    '''One-hot encodes a protein sequence. The one-hot encoding
    covers the 20 natural amino acids, plus X for any other
    residue, for a total of 21 elements per residue.

    Parameters
    ----------
    data : DataFrame
        input data to be encoded [None]
    inputCol : str
        name of the input column [None]
    outputCol : str
        name of the output column [None]
    '''
    # Setting class variables
    if data is not None:
        self.data = data
    if inputCol is not None:
        self.inputCol = inputCol
    if outputCol is not None:
        self.outputCol = outputCol
    if self.data is None:
        raise ValueError("Class variable data is not defined, please pass "
                         "in a dataframe into the data parameter")

    session = SparkSession.builder.getOrCreate()
    AMINO_ACIDS21 = self.AMINO_ACIDS21

    # Encoder function to be passed as a User Defined Function (UDF)
    def _encoder(s):
        values = [0] * len(AMINO_ACIDS21) * len(s)
        for i in range(len(s)):
            if s[i] in AMINO_ACIDS21:
                index = AMINO_ACIDS21.index(s[i])
            else:
                index = AMINO_ACIDS21.index('X')
            values[i * len(AMINO_ACIDS21) + index] = 1
        return Vectors.dense(values)

    session.udf.register("encoder", _encoder, VectorUDT())
    self.data.createOrReplaceTempView("table")
    sql = f"SELECT *, encoder({self.inputCol}) AS {self.outputCol} from table"
    data = session.sql(sql)
    return data
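This example, together with Examples 14 and 15 below, follows the same core pattern: register a Python encoder as a SQL UDF whose return type is VectorUDT(), then apply it with a SELECT over a temp view. A minimal standalone sketch of that pattern (the seq_length_vec function and table name are purely illustrative):

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors, VectorUDT

spark = SparkSession.builder.getOrCreate()

# Register a Vector-returning function for use from Spark SQL.
spark.udf.register("seq_length_vec", lambda s: Vectors.dense([float(len(s))]), VectorUDT())

spark.createDataFrame([("ARNDX",)], ["sequence"]).createOrReplaceTempView("seqs")
spark.sql("SELECT sequence, seq_length_vec(sequence) AS features FROM seqs").show()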
Example 14: property_encode

# Module to import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def property_encode(self, data=None, inputCol=None, outputCol=None):
    '''Encodes a protein sequence by 7 physicochemical properties.

    References
    ----------
    Meiler, J., Müller, M., Zeidler, A. et al. J Mol Model (2001)
    https://link.springer.com/article/10.1007/s008940100038

    Parameters
    ----------
    data : DataFrame
        input data to be encoded [None]
    inputCol : str
        name of the input column [None]
    outputCol : str
        name of the output column [None]

    Returns
    -------
    dataset
        dataset with feature vector appended
    '''
    # Setting class variables
    if data is not None:
        self.data = data
    if inputCol is not None:
        self.inputCol = inputCol
    if outputCol is not None:
        self.outputCol = outputCol
    if self.data is None:
        raise ValueError("Class variable data is not defined, please pass "
                         "in a dataframe into the data parameter")

    session = SparkSession.builder.getOrCreate()
    properties = self.properties

    # Encoder function to be passed as a User Defined Function (UDF)
    def _encoder(s):
        values = []
        for i in range(len(s)):
            if s[i] in properties:
                values += properties[s[i]]
        return Vectors.dense(values)

    session.udf.register("encoder", _encoder, VectorUDT())
    self.data.createOrReplaceTempView("table")
    sql = f"SELECT *, encoder({self.inputCol}) AS {self.outputCol} from table"
    data = session.sql(sql)
    return data
Example 15: blosum62_encode

# Module to import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def blosum62_encode(self, data=None, inputCol=None, outputCol=None):
    '''Encodes a protein sequence with the BLOSUM62 substitution matrix.

    References
    ----------
    BLOSUM matrix
    https://ftp.ncbi.nih.gov/repository/blocks/unix/blosum/BLOSUM/blosum62.blast.new

    Parameters
    ----------
    data : DataFrame
        input data to be encoded [None]
    inputCol : str
        name of the input column [None]
    outputCol : str
        name of the output column [None]

    Returns
    -------
    dataset
        dataset with feature vector appended
    '''
    if data is not None:
        self.data = data
    if inputCol is not None:
        self.inputCol = inputCol
    if outputCol is not None:
        self.outputCol = outputCol
    if self.data is None:
        raise ValueError("Class variable data is not defined, please pass "
                         "in a dataframe into the data parameter")

    session = SparkSession.builder.getOrCreate()
    blosum62 = self.blosum62

    # Encoder function to be passed as a User Defined Function (UDF)
    def _encoder(s):
        values = []
        for i in range(len(s)):
            if s[i] in blosum62:
                values += blosum62[s[i]]
        return Vectors.dense(values)

    session.udf.register("encoder", _encoder, VectorUDT())
    self.data.createOrReplaceTempView("table")
    sql = f"SELECT *, encoder({self.inputCol}) AS {self.outputCol} from table"
    data = session.sql(sql)
    return data