This article collects typical usage examples of the Python method pyspark.ml.feature.StringIndexer. If you have been wondering what feature.StringIndexer does, how to use it, or what working code looks like, the curated examples here may help. You can also explore the containing module, pyspark.ml.feature, for further usage examples.
Shown below are 10 code examples of feature.StringIndexer, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
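
Before the collected examples, here is a minimal, self-contained sketch of the basic StringIndexer fit/transform flow. The SparkSession setup, data, and column names are illustrative assumptions, not taken from the examples below.

from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer

# Illustrative data: StringIndexer maps string categories to numeric indices.
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(0, "a"), (1, "b"), (2, "c"), (3, "a")], ["id", "category"])
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
indexed = indexer.fit(df).transform(df)  # fit() learns the label-to-index mapping
indexed.show()
# By default, labels are indexed by descending frequency, so "a" gets index 0.0.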
Example 1: test_index_to_string

# Required import: from pyspark.ml import feature [as alias]
# Or: from pyspark.ml.feature import StringIndexer [as alias]
def test_index_to_string(self):
    original_data = self.spark.createDataFrame(
        [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
        ["id", "category"])
    string_indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    string_indexer_model = string_indexer.fit(original_data)
    data = string_indexer_model.transform(original_data)
    model = IndexToString(inputCol="categoryIndex", outputCol="originalCategory",
                          labels=['A', 'B', 'C'])
    # the input name should match IndexToString's inputCol
    model_onnx = convert_sparkml(model, 'Sparkml IndexToString',
                                 [('categoryIndex', Int64TensorType([1, 1]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.select("originalCategory").toPandas().values
    data_np = data.select('categoryIndex').toPandas().values.astype(numpy.int64)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlIndexToString")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['originalCategory'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example 2: main

# Required import: from pyspark.ml import feature [as alias]
# Or: from pyspark.ml.feature import StringIndexer [as alias]
def main(sc, spark):
    # Load and vectorize the corpus
    corpus = load_corpus(sc, spark)
    vector = make_vectorizer().fit(corpus)
    # Index the labels of the classification
    labelIndex = StringIndexer(inputCol="label", outputCol="indexedLabel")
    labelIndex = labelIndex.fit(corpus)
    # Split the data into training and test sets
    training, test = corpus.randomSplit([0.8, 0.2])
    # Create the classifier
    clf = LogisticRegression(
        maxIter=10, regParam=0.3, elasticNetParam=0.8,
        family="multinomial", labelCol="indexedLabel", featuresCol="tfidf")
    # Create the model
    model = Pipeline(stages=[
        vector, labelIndex, clf
    ]).fit(training)
    # Make predictions
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)
    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))
    # the third pipeline stage is the fitted LogisticRegressionModel
    lrModel = model.stages[2]
    print(lrModel)  # summary only
Example 3: get_labels

# Required import: from pyspark.ml import feature [as alias]
# Or: from pyspark.ml.feature import StringIndexer [as alias]
def get_labels(rf_pipeline: pyspark.ml.PipelineModel) -> List[str]:
    """
    Returns the labels from the StringIndexer stage at index 0 of an RF pipeline model.

    :param rf_pipeline: Input pipeline
    :return: labels
    """
    return rf_pipeline.stages[0].labels
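
As a sketch of how get_labels might be called (the pipeline below is a hypothetical illustration, not part of the original source): after fitting, the first stage is a StringIndexerModel, whose labels attribute holds the original string labels in index order.

from typing import List
import pyspark.ml
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Hypothetical data and column names, assuming an active SparkSession `spark`.
df = spark.createDataFrame(
    [("cat", 1.0, 0.5), ("dog", 0.0, 1.5), ("cat", 1.0, 2.5)],
    ["animal", "x1", "x2"])
pipeline = Pipeline(stages=[
    StringIndexer(inputCol="animal", outputCol="label"),  # must be stage 0
    VectorAssembler(inputCols=["x1", "x2"], outputCol="features"),
    RandomForestClassifier(labelCol="label", featuresCol="features"),
])
fitted = pipeline.fit(df)
print(get_labels(fitted))  # e.g. ['cat', 'dog'], ordered by descending frequency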
Example 4: test_index_to_string_throws

# Required import: from pyspark.ml import feature [as alias]
# Or: from pyspark.ml.feature import StringIndexer [as alias]
def test_index_to_string_throws(self):
    original_data = self.spark.createDataFrame(
        [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
        ["id", "category"])
    string_indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    string_indexer_model = string_indexer.fit(original_data)
    data = string_indexer_model.transform(original_data)
    model = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
    # the input name should match IndexToString's inputCol; without explicit
    # labels the conversion is expected to fail
    model_onnx = None
    with pytest.raises(SparkMlConversionError):
        model_onnx = convert_sparkml(model, 'Sparkml IndexToString',
                                     [('categoryIndex', Int64TensorType([1, 1]))])
Example 5: test_model_pipeline_3_stage

# Required import: from pyspark.ml import feature [as alias]
# Or: from pyspark.ml.feature import StringIndexer [as alias]
def test_model_pipeline_3_stage(self):
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
    full_data = self.spark.read.format('csv')\
        .options(header='true', inferschema='true').load(input_path)
    cols = ['workclass', 'education', 'marital_status']
    training_data, test_data = full_data.select(*cols).limit(1000).randomSplit([0.9, 0.1], seed=1)
    stages = []
    for col in cols:
        stages.append(StringIndexer(inputCol=col, outputCol=col+'_index', handleInvalid='skip'))
        # we need dropLast=False; otherwise, once the one-hot vectors are
        # assembled together (below), we cannot expand the features cleanly
        stages.append(OneHotEncoderEstimator(inputCols=[col+'_index'], outputCols=[col+'_vec'], dropLast=False))
    stages.append(VectorAssembler(inputCols=[c+'_vec' for c in cols], outputCol='features'))
    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(training_data)
    model_onnx = convert_sparkml(model, 'Sparkml Pipeline', [
        ('workclass', StringTensorType([1, 1])),
        ('education', StringTensorType([1, 1])),
        ('marital_status', StringTensorType([1, 1]))
    ])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)
    # run the model
    predicted = model.transform(test_data)
    data_np = {
        'workclass': test_data.select('workclass').toPandas().values,
        'education': test_data.select('education').toPandas().values,
        'marital_status': test_data.select('marital_status').toPandas().values
    }
    expected = predicted.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlPipeline_3Stage")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['features'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example 6: test_random_forrest_regression

# Required import: from pyspark.ml import feature [as alias]
# Or: from pyspark.ml.feature import StringIndexer [as alias]
def test_random_forrest_regression(self):
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "sample_libsvm_data.txt")
    original_data = self.spark.read.format("libsvm").load(input_path)
    #
    # truncate the features
    #
    feature_count = 5
    self.spark.udf.register("truncateFeatures",
                            lambda x: SparseVector(feature_count, range(0, feature_count), x.toArray()[125:130]),
                            VectorUDT())
    data = original_data.selectExpr("cast(label as string) as label", "truncateFeatures(features) as features")
    label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    feature_indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                    maxCategories=10, handleInvalid='error')
    rf = RandomForestRegressor(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)
    pipeline = Pipeline(stages=[label_indexer, feature_indexer, rf])
    model = pipeline.fit(data)
    model_onnx = convert_sparkml(model, 'Sparkml RandomForest Regressor', [
        ('label', StringTensorType([1, 1])),
        ('features', FloatTensorType([1, feature_count]))
    ], spark_session=self.spark)
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data.limit(1))
    data_np = {
        'label': data.limit(1).toPandas().label.values,
        'features': data.limit(1).toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    }
    expected = [
        predicted.toPandas().indexedLabel.values.astype(numpy.int64),
        predicted.toPandas().prediction.values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlRandomForestRegressor")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['indexedLabel', 'prediction'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example 7: test_gbt_classifier

# Required import: from pyspark.ml import feature [as alias]
# Or: from pyspark.ml.feature import StringIndexer [as alias]
def test_gbt_classifier(self):
    raw_data = self.spark.createDataFrame([
        (1.0, Vectors.dense(1.0)),
        (0.0, Vectors.sparse(1, [], []))
    ], ["label", "features"])
    string_indexer = StringIndexer(inputCol="label", outputCol="indexed")
    si_model = string_indexer.fit(raw_data)
    data = si_model.transform(raw_data)
    gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed", seed=42)
    model = gbt.fit(data)
    feature_count = data.first()[1].size
    model_onnx = convert_sparkml(model, 'Sparkml GBT Classifier', [
        ('features', FloatTensorType([1, feature_count]))
    ], spark_session=self.spark)
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
        predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlGBTClassifier")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction', 'probability'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example 8: test_random_forrest_classification

# Required import: from pyspark.ml import feature [as alias]
# Or: from pyspark.ml.feature import StringIndexer [as alias]
def test_random_forrest_classification(self):
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "sample_libsvm_data.txt")
    original_data = self.spark.read.format("libsvm").load(input_path)
    #
    # truncate the features
    #
    feature_count = 5
    self.spark.udf.register("truncateFeatures",
                            lambda x: SparseVector(feature_count, range(0, feature_count), x.toArray()[125:130]),
                            VectorUDT())
    data = original_data.selectExpr("cast(label as string) as label", "truncateFeatures(features) as features")
    label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    feature_indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                    maxCategories=10, handleInvalid='keep')
    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)
    pipeline = Pipeline(stages=[label_indexer, feature_indexer, rf])
    model = pipeline.fit(data)
    model_onnx = convert_sparkml(model, 'Sparkml RandomForest Classifier', [
        ('label', StringTensorType([1, 1])),
        ('features', FloatTensorType([1, feature_count]))
    ], spark_session=self.spark)
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    data_np = {
        'label': data.toPandas().label.values,
        'features': data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    }
    expected = [
        predicted.toPandas().indexedLabel.values.astype(numpy.int64),
        predicted.toPandas().prediction.values.astype(numpy.float32),
        predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlRandomForestClassifier")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['indexedLabel', 'prediction', 'probability'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example 9: test_model_pipeline_4_stage

# Required import: from pyspark.ml import feature [as alias]
# Or: from pyspark.ml.feature import StringIndexer [as alias]
def test_model_pipeline_4_stage(self):
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
    full_data = self.spark.read.format('csv')\
        .options(header='true', inferschema='true').load(input_path)
    cols = ['workclass', 'education', 'marital_status']
    training_data, test_data = full_data.select('income', *cols).limit(1000).randomSplit([0.9, 0.1], seed=1)
    stages = []
    for col in cols:
        stages.append(StringIndexer(inputCol=col, outputCol=col+'_index', handleInvalid='skip'))
        stages.append(OneHotEncoderEstimator(inputCols=[col+'_index'], outputCols=[col+'_vec'], dropLast=False))
    stages.append(VectorAssembler(inputCols=[c+'_vec' for c in cols], outputCol='features'))
    stages.append(StringIndexer(inputCol='income', outputCol='label', handleInvalid='skip'))
    stages.append(LogisticRegression(maxIter=100, tol=0.0001))
    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(training_data)
    model_onnx = convert_sparkml(model, 'Sparkml Pipeline', [
        ('income', StringTensorType([1, 1])),
        ('workclass', StringTensorType([1, 1])),
        ('education', StringTensorType([1, 1])),
        ('marital_status', StringTensorType([1, 1]))
    ])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)
    # run the model
    predicted = model.transform(test_data)
    data_np = {
        'income': test_data.select('income').toPandas().values,
        'workclass': test_data.select('workclass').toPandas().values,
        'education': test_data.select('education').toPandas().values,
        'marital_status': test_data.select('marital_status').toPandas().values
    }
    expected = [
        predicted.toPandas().label.values.astype(numpy.float32),
        predicted.toPandas().prediction.values.astype(numpy.float32),
        predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlPipeline_4Stage")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['label', 'prediction', 'probability'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example 10: test_model_pipeline_2_stage

# Required import: from pyspark.ml import feature [as alias]
# Or: from pyspark.ml.feature import StringIndexer [as alias]
def test_model_pipeline_2_stage(self):
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
    full_data = self.spark.read.format('csv')\
        .options(header='true', inferschema='true').load(input_path)
    cols = ['workclass', 'education', 'marital_status']
    training_data, test_data = full_data.select(*cols).limit(1000).randomSplit([0.9, 0.1], seed=1)
    stages = []
    for col in cols:
        stages.append(StringIndexer(inputCol=col, outputCol=col+'_index', handleInvalid='skip'))
        stages.append(OneHotEncoderEstimator(inputCols=[col+'_index'], outputCols=[col+'_vec']))
    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(training_data)
    model_onnx = convert_sparkml(model, 'Sparkml Pipeline', [
        ('workclass', StringTensorType([1, 1])),
        ('education', StringTensorType([1, 1])),
        ('marital_status', StringTensorType([1, 1]))
    ])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)
    # run the model
    predicted = model.transform(test_data)
    data_np = {
        'workclass': test_data.select('workclass').toPandas().values,
        'education': test_data.select('education').toPandas().values,
        'marital_status': test_data.select('marital_status').toPandas().values
    }
    predicted_np = [
        predicted.toPandas().workclass_vec.apply(lambda x: pandas.Series(x.toArray())).values,
        predicted.toPandas().education_vec.apply(lambda x: pandas.Series(x.toArray())).values,
        predicted.toPandas().marital_status_vec.apply(lambda x: pandas.Series(x.toArray())).values
    ]
    expected = [numpy.asarray([expand_one_hot_vec(x) for x in row]) for row in predicted_np]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlPipeline_2Stage")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['workclass_vec', 'education_vec', 'marital_status_vec'],
                                           data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
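
The helper expand_one_hot_vec used in Example 10 is defined elsewhere in the original test suite and is not reproduced on this page. Purely as an assumption about what it might do: Spark's OneHotEncoderEstimator drops the last category by default (dropLast=True), while an ONNX OneHotEncoder emits the full encoding, so the Spark vector plausibly needs the dropped bit appended before comparison. A minimal sketch under that assumption:

import numpy

def expand_one_hot_vec(vec):
    # Hypothetical reconstruction; the original helper is not shown on this page.
    # With dropLast=True, the dropped last category is "on" exactly when every
    # kept position is zero, so append that implied bit.
    row = numpy.asarray(vec, dtype=numpy.float32)
    last = numpy.float32(1.0) if row.sum() == 0 else numpy.float32(0.0)
    return numpy.append(row, last)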