本文整理匯總了Python中pyspark.ml.feature.StringIndexer.getOutputCol方法的典型用法代碼示例。如果您正苦於以下問題：Python StringIndexer.getOutputCol方法的具體用法？Python StringIndexer.getOutputCol怎麼用？Python StringIndexer.getOutputCol使用的例子？那麼，這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類pyspark.ml.feature.StringIndexer的用法示例。
在下文中一共展示了StringIndexer.getOutputCol方法的5個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: StringIndexer
# 需要導入模塊: from pyspark.ml.feature import StringIndexer [as 別名]
# 或者: from pyspark.ml.feature.StringIndexer import getOutputCol [as 別名]
if __name__ == "__main__":
    # Script entry point: index a string column with StringIndexer, then map
    # the indices back to strings with IndexToString, printing each step.
    spark = (
        SparkSession
        .builder
        .appName("IndexToStringExample")
        .getOrCreate()
    )

    try:
        # $example on$
        df = spark.createDataFrame(
            [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
            ["id", "category"])

        # Fit the indexer on "category" and append the "categoryIndex" column.
        indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
        model = indexer.fit(df)
        indexed = model.transform(df)

        print("Transformed string column '%s' to indexed column '%s'"
              % (indexer.getInputCol(), indexer.getOutputCol()))
        indexed.show()

        print("StringIndexer will store labels in output column metadata\n")

        # Convert "categoryIndex" back into a string column; as the message
        # below states, the labels come from the column metadata.
        converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
        converted = converter.transform(indexed)

        print("Transformed indexed column '%s' back to original string column '%s' using "
              "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
        converted.select("id", "categoryIndex", "originalCategory").show()
        # $example off$
    finally:
        # Always release the session, even when a step above raises.
        spark.stop()
示例2: SQLContext
# 需要導入模塊: from pyspark.ml.feature import StringIndexer [as 別名]
# 或者: from pyspark.ml.feature.StringIndexer import getOutputCol [as 別名]
# print() with a single argument behaves the same on Python 2 and Python 3,
# unlike the Python 2-only print statement.
print("Text is cleaned")

# Build a two-column ("review", "label") DataFrame from `rdd`.
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ["review", "label"])

# Random split with weights 0.8 / 0.2.
dfTrain, dfTest = df.randomSplit([0.8, 0.2])
print("Random split is done")

# Feature stages: tokenise "review" (with English stop words passed to the
# tokenizer), hash to term frequencies, then apply IDF.
tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review",
    outputCol="wordsNoSw",
    stopwords=set(nltk.corpus.stopwords.words("english")),
)
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol="reviews_tf")
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")

# Index the "label" column; the classifier and the evaluator both read the
# indexer's output column so the column name is declared only once.
string_indexer = StringIndexer(inputCol="label", outputCol="target_indexed")
dt = DecisionTreeClassifier(
    featuresCol=idf.getOutputCol(),
    labelCol=string_indexer.getOutputCol(),
    maxDepth=10,
)

pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])

# ****************************************************************
# *********************CROSS VALIDATION: 80%/20%******************
# *******************Model: DecisionTreeClassifier*****************
# *****************************************************************
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol=string_indexer.getOutputCol(),
    metricName="precision",
)

# Parameter grid over the tree depth (10 and 20).
# NOTE(review): baseOn receives a single [param, value] list here - confirm
# this matches ParamGridBuilder.baseOn in the Spark version in use.
grid = ParamGridBuilder().baseOn([evaluator.metricName, "precision"]).addGrid(dt.maxDepth, [10, 20]).build()
示例3: StringIndexer
# 需要導入模塊: from pyspark.ml.feature import StringIndexer [as 別名]
# 或者: from pyspark.ml.feature.StringIndexer import getOutputCol [as 別名]
# Imports used by this fragment, grouped together instead of being scattered
# between statements.
from time import time

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder

# Vectorise every row of dfBigram with vectorizeBi (using the broadcast
# dictionary's value) and convert the result to a DataFrame with `schema`.
features = dfBigram.map(partial(vectorizeBi, dico=dict_broad.value)).toDF(schema)
# print() with a single argument behaves the same on Python 2 and Python 3.
print("Features from bigrams created")

# Index the "label" column into "target_indexed".
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(features)
featIndexed = string_indexer_model.transform(features)
print("labels indexed")

# Classifier and evaluator both read the indexer's output column so the
# column name is declared only once.
dt = DecisionTreeClassifier(
    featuresCol='bigramVectors',
    labelCol=string_indexer.getOutputCol(),
    maxDepth=10,
)
evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol=string_indexer.getOutputCol(),
    metricName='precision',
)

# Parameter grid over the tree depth (10 and 20).
# NOTE(review): baseOn receives a single [param, value] list here - confirm
# this matches ParamGridBuilder.baseOn in the Spark version in use.
grid = (ParamGridBuilder()
        .baseOn([evaluator.metricName, 'precision'])
        .addGrid(dt.maxDepth, [10, 20])
        .build())
cv = CrossValidator(estimator=dt, estimatorParamMaps=grid, evaluator=evaluator)
示例4: StructType
# 需要導入模塊: from pyspark.ml.feature import StringIndexer [as 別名]
# 或者: from pyspark.ml.feature.StringIndexer import getOutputCol [as 別名]
# Imports used by this fragment, grouped together instead of being scattered
# between statements.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer

# Two nullable columns: a double "label" and a vector column "Vectors".
schema = StructType([
    StructField('label', DoubleType(), True),
    StructField('Vectors', VectorUDT(), True),
])

# Vectorise the tokenised training rows with the broadcast dictionary's value.
features = dfTrainTok.map(partial(vectorize, dico=dict_broad.value)).toDF(schema)
# print() with a single argument behaves the same on Python 2 and Python 3.
print("Features created")

# Index the "label" column into "target_indexed".
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(features)
featIndexed = string_indexer_model.transform(features)
print("labels indexed")

# Classifier and evaluator both read the indexer's output column so the
# column name is declared only once.
lr = LogisticRegression(featuresCol='Vectors', labelCol=string_indexer.getOutputCol())
evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol=string_indexer.getOutputCol(),
    metricName='precision',
)
lr_model = lr.fit(featIndexed)

# Push the test split through the same steps: tokenise, vectorise, index the
# labels with the already-fitted indexer model, then predict.
dfTestTok = tokenizer.transform(dfTest)
featuresTest = dfTestTok.map(partial(vectorize, dico=dict_broad.value)).toDF(schema)
testIndexed = string_indexer_model.transform(featuresTest)
df_test_pred = lr_model.transform(testIndexed)

# Evaluate the predictions with the configured metric and print the result.
res = evaluator.evaluate(df_test_pred)
print(res)
示例5: SQLContext
# 需要導入模塊: from pyspark.ml.feature import StringIndexer [as 別名]
# 或者: from pyspark.ml.feature.StringIndexer import getOutputCol [as 別名]
# print() with a single argument behaves the same on Python 2 and Python 3,
# unlike the Python 2-only print statement.
print("Text is cleaned")

# Build a two-column ("review", "label") DataFrame from `rdd`.
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])

# Random split with weights 0.8 / 0.2.
dfTrain, dfTest = df.randomSplit([0.8, 0.2])
print("Random split is done")

# Feature stages: tokenise "review" (with English stop words passed to the
# tokenizer), hash to term frequencies, then apply IDF.
tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review",
    outputCol="wordsNoSw",
    stopwords=set(nltk.corpus.stopwords.words('english')),
)
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")

# Index the "label" column; the classifier and the evaluator both read the
# indexer's output column so the column name is declared only once.
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')

# NOTE(review): despite its name, `dt` holds a LogisticRegression here. The
# name is kept because code outside this fragment may refer to it.
dt = LogisticRegression(
    featuresCol=idf.getOutputCol(),
    labelCol=string_indexer.getOutputCol(),
    maxIter=30,
    regParam=0.01,
)

pipeline = Pipeline(stages=[
    tokenizerNoSw,
    hashing_tf,
    idf,
    string_indexer,
    dt,
])

evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol=string_indexer.getOutputCol(),
    metricName='precision',
)

# Disabled cross-validation setup, kept for reference.
# NOTE(review): the grid below refers to dt.maxDepth, but `dt` is a
# LogisticRegression in this fragment - confirm the intended parameter
# before re-enabling.
# grid=(ParamGridBuilder()
#     .baseOn([evaluator.metricName,'precision'])
#     .addGrid(dt.maxDepth, [10,20])
#     .build())
#cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid,evaluator=evaluator)