This article collects typical usage examples of the Python method pyspark.ml.feature.StringIndexer.fit. If you are wondering what StringIndexer.fit does, how to use it, or how it looks in real code, the curated examples below may help. You can also explore further usage of its containing class, pyspark.ml.feature.StringIndexer.
The following presents 15 code examples of StringIndexer.fit, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
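Before the collected examples, here is a minimal self-contained sketch of the fit/transform pattern that all of them follow. The DataFrame, app name, and column names are illustrative, not taken from any example below.

from pyspark.ml.feature import StringIndexer
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("StringIndexerFitSketch").getOrCreate()
# a toy DataFrame with one categorical string column (illustrative data)
df = spark.createDataFrame([(0, "cat"), (1, "dog"), (2, "cat")], ["id", "animal"])
indexer = StringIndexer(inputCol="animal", outputCol="animalIndex")
# fit() scans the column and returns a StringIndexerModel holding the label-to-index mapping
model = indexer.fit(df)
# transform() appends the numeric index column; the most frequent label gets index 0.0
model.transform(df).show()
spark.stop()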
Example 1: mapClickCategoricalFeatures
# Required import: from pyspark.ml.feature import StringIndexer
def mapClickCategoricalFeatures():
    df = getDataFrame(CLICKS_HDPFILEPATH)
    df.persist(StorageLevel.DISK_ONLY)
    print(df.columns)
    # select the categorical columns to be mapped
    click_cols = ["C2", "C3", "C4", "C5", "C7", "C8"]
    indexed = df
    for col in click_cols:
        outcol = col + "Index"
        indexer = StringIndexer(inputCol=col, outputCol=outcol)
        # each pass appends one <col>Index column to the running DataFrame
        indexed = indexer.fit(indexed).transform(indexed)
    indexed.show()
    indexed.persist(StorageLevel.DISK_ONLY)
    #indexed.select('C0', 'C1', 'C2Index', 'C3Index', 'C4Index', 'C5Index', 'C6', 'C7Index', 'C8Index').write.format('com.databricks.spark.csv').save(PATH+"extraction/clicks1.csv")
    indexed.select('C0', 'C1', 'C2Index', 'C3Index', 'C4Index', 'C5Index', 'C6', 'C7Index', 'C8Index').write.format('com.databricks.spark.csv').save(HADOOPDIR+"data/click_fraud/extraction/clicks_23feb12.csv")
Example 2: train_random_forest
# Required import: from pyspark.ml.feature import StringIndexer
def train_random_forest(df):
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    td = si_model.transform(df)
    # note: the original used seed=int(random.random()), which always yields 0;
    # randint produces an actually varying seed
    rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="indexed",
                                seed=random.randint(0, 1 << 30))
    return rf, rf.fit(td)
Example 3: build_decisionTree
# Required import: from pyspark.ml.feature import StringIndexer
def build_decisionTree(path):
    df = load_data(path)
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)
    df = df.drop('Cabin').drop('Ticket').drop('Name')
    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show(truncate=False)
    dt = DecisionTreeClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(dt.maxDepth, [1, 2, 3, 5, 6, 8, 10]).build()
    # evaluate against the indexed label column rather than the default 'label'
    evaluator = BinaryClassificationEvaluator(labelCol='indexed')
    cv = CrossValidator(estimator=dt, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)
    prediction = cvModel.transform(df)
    prediction.show(truncate=False)
    print("classification evaluation:", evaluator.evaluate(prediction))
    return cvModel, avg_age
Example 4: build_randomForest
# Required import: from pyspark.ml.feature import StringIndexer
def build_randomForest(path):
    df = load_data(path)
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)
    df = df.drop('Cabin').drop('Ticket').drop('Name')
    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show()
    rdf = RandomForestClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(rdf.maxDepth, [1, 2, 3, 5, 6, 8, 10])\
                             .addGrid(rdf.numTrees, [1, 5, 10, 30, 50, 100, 200]).build()
    # evaluate against the indexed label column rather than the default 'label'
    evaluator = BinaryClassificationEvaluator(labelCol='indexed')
    cv = CrossValidator(estimator=rdf, estimatorParamMaps=grid, evaluator=evaluator)
    # fit the CrossValidator (the original called rdf.fit(df), leaving cv unused)
    cvModel = cv.fit(df)
    prediction = cvModel.transform(df)
    prediction.show()
    print("classification evaluation:", evaluator.evaluate(prediction))
    return cvModel, avg_age
Example 5: main
# Required import: from pyspark.ml.feature import StringIndexer
def main(sc, spark):
    # Load and vectorize the corpus
    corpus = load_corpus(sc, spark)
    vector = make_vectorizer().fit(corpus)
    # Index the labels of the classification
    labelIndex = StringIndexer(inputCol="label", outputCol="indexedLabel")
    labelIndex = labelIndex.fit(corpus)
    # Split the data into training and test sets
    training, test = corpus.randomSplit([0.8, 0.2])
    # Create the classifier
    clf = LogisticRegression(
        maxIter=10, regParam=0.3, elasticNetParam=0.8,
        family="multinomial", labelCol="indexedLabel", featuresCol="tfidf")
    # Create the model
    model = Pipeline(stages=[
        vector, labelIndex, clf
    ]).fit(training)
    # Make predictions
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)
    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))
    lrModel = model.stages[2]  # the fitted LogisticRegression stage
    print(lrModel)  # summary only
Example 6: mapPublisherCategoricalFeatures
# Required import: from pyspark.ml.feature import StringIndexer
def mapPublisherCategoricalFeatures():
    df = getDataFrame(PUBLISHERS_HDPFILEPATH)
    df.persist(StorageLevel.DISK_ONLY)
    print(df.columns)
    publisher_cols = ["C0", "C1", "C2", "C3"]
    indexed = df
    for col in publisher_cols:
        outcol = col + "Index"
        # StringIndexer maps each value in the input column to a double index
        # and creates a new column in the DataFrame
        indexer = StringIndexer(inputCol=col, outputCol=outcol)
        # fit and transform the column using the indexer
        indexed = indexer.fit(indexed).transform(indexed)
    indexed.show()
    indexed.persist(StorageLevel.DISK_ONLY)
    indexed.select('C0Index', 'C1Index', 'C2Index', 'C3Index').write.format('com.databricks.spark.csv').save(HADOOPDIR+"data/click_fraud/extraction/publishers_23feb12.csv")
Example 7: testClassification
# Required import: from pyspark.ml.feature import StringIndexer
def testClassification(data):
    # Train a RandomForest model on an indexed label column.
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexLabel")
    si_model = stringIndexer.fit(data)
    td = si_model.transform(data)
    rf = RandomForestClassifier(numTrees=5, maxDepth=4, labelCol="indexLabel", seed=13)
    trainData, testData = td.randomSplit([0.8, 0.2], 13)
    predictionDF = rf.fit(trainData).transform(testData)
    selected = predictionDF\
        .select('label', 'indexLabel', 'prediction', 'rawPrediction', 'probability')
    for row in selected.collect():
        print(row)
    # .rdd is required on Spark 2.0+, where DataFrame no longer exposes map()
    scoresAndLabels = predictionDF\
        .rdd.map(lambda x: (float(x.probability.toArray()[1]), x.indexLabel))
    for sl in scoresAndLabels.collect():
        print(sl)
    evaluator = BinaryClassificationEvaluator(labelCol='indexLabel', metricName='areaUnderROC')
    metric = evaluator.evaluate(selected)
    print(metric)
Example 8: label
# Required import: from pyspark.ml.feature import StringIndexer
def label(df, column):
    """
    Create a labeled column.
    """
    indexer = StringIndexer(inputCol=column, outputCol=column + '_label')
    df = indexer.fit(df).transform(df)
    return df
Example 9: indexStringColumns
# Required import: from pyspark.ml.feature import StringIndexer
def indexStringColumns(df, cols):
    # the variable newdata will be updated several times
    newdata = df
    for c in cols:
        si = StringIndexer(inputCol=c, outputCol=c + "-x")
        sm = si.fit(newdata)
        newdata = sm.transform(newdata).drop(c)
        newdata = newdata.withColumnRenamed(c + "-x", c)
    return newdata
Example 10: events
# Required import: from pyspark.ml.feature import StringIndexer
def events(df, column_name):
    i = column_name + "I"
    v = column_name + "V"
    # index the string column, then one-hot encode the resulting indices
    stringIndexer = StringIndexer(inputCol=column_name, outputCol=i)
    model = stringIndexer.fit(df)
    indexed = model.transform(df)
    # note: on Spark 3.x OneHotEncoder is an Estimator, so this would become
    # encoder.fit(indexed).transform(indexed)
    encoder = OneHotEncoder(inputCol=i, outputCol=v)
    encoded = encoder.transform(indexed)
    return encoded
Example 11: indexStringColumns
# Required import: from pyspark.ml.feature import StringIndexer
def indexStringColumns(df, cols):
    from pyspark.ml.feature import StringIndexer
    # the variable newdf will be updated several times
    newdf = df
    for c in cols:
        si = StringIndexer(inputCol=c, outputCol=c + "-num")
        sm = si.fit(newdf)
        newdf = sm.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c + "-num", c)
    return newdf
Example 12: oneHotEncoding
# Required import: from pyspark.ml.feature import StringIndexer
def oneHotEncoding(self, df, input_col):
    stringInd = StringIndexer(inputCol=input_col, outputCol="indexed")
    model = stringInd.fit(df)
    td = model.transform(df)
    encoder = OneHotEncoder(inputCol="indexed", outputCol="features", dropLast=False)
    final_encoding = encoder.transform(td).select(df.id, 'features').cache()
    # convert the sparse one-hot vectors to plain Python lists
    conv_udf = udf(lambda line: Vectors.dense(line).tolist())
    final_encoding = final_encoding.select(df.id, conv_udf(final_encoding.features).alias("num_" + input_col)).cache()
    return final_encoding
Example 13: test_string_indexer_handle_invalid
# Required import: from pyspark.ml.feature import StringIndexer
def test_string_indexer_handle_invalid(self):
    df = self.spark.createDataFrame([
        (0, "a"),
        (1, "d"),
        (2, None)], ["id", "label"])

    # handleInvalid="keep" gives invalid (here: null) values their own index
    si1 = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="keep",
                        stringOrderType="alphabetAsc")
    model1 = si1.fit(df)
    td1 = model1.transform(df)
    actual1 = td1.select("id", "indexed").collect()
    expected1 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0), Row(id=2, indexed=2.0)]
    self.assertEqual(actual1, expected1)

    # handleInvalid="skip" drops rows with invalid values instead
    si2 = si1.setHandleInvalid("skip")
    model2 = si2.fit(df)
    td2 = model2.transform(df)
    actual2 = td2.select("id", "indexed").collect()
    expected2 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0)]
    self.assertEqual(actual2, expected2)
Example 14: time
# Required import: from pyspark.ml.feature import StringIndexer
print("Creating sparse vectors for all data based on this new dictionary")
t0 = time()
dfTrainSelect = dfTrain.map(partial(vectorizeBi, dico=dictSel_broad.value)).toDF(schema)
dfTestSelect = dfTest.map(partial(vectorizeBi, dico=dictSel_broad.value)).toDF(schema)
dfTrainSelect.take(1)
dfTestSelect.take(1)
tt = time() - t0
print("Done in {} seconds".format(round(tt, 3)))

# In[328]:
from pyspark.ml.feature import StringIndexer
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainSelect)
dfTrainIndexed = string_indexer_model.transform(dfTrainSelect)

# In[329]:
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol='bigramVectors', labelCol='target_indexed', maxDepth=10)

# In[330]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# note: 'precision' is a Spark 1.x metric name; newer versions use 'accuracy' or 'weightedPrecision'
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')
Example 15: StringIndexer
# Required import: from pyspark.ml.feature import StringIndexer
# $example on$
from pyspark.ml.feature import IndexToString, StringIndexer
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("IndexToStringExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame(
        [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
        ["id", "category"])

    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    model = indexer.fit(df)
    indexed = model.transform(df)

    print("Transformed string column '%s' to indexed column '%s'"
          % (indexer.getInputCol(), indexer.getOutputCol()))
    indexed.show()

    print("StringIndexer will store labels in output column metadata\n")

    converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
    converted = converter.transform(indexed)

    print("Transformed indexed column '%s' back to original string column '%s' using "
          "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
    converted.select("id", "categoryIndex", "originalCategory").show()
    # $example off$