Python StringIndexer.fit方法代码示例

本文整理汇总了Python中pyspark.ml.feature.StringIndexer.fit方法的典型用法代码示例。如果您正苦于以下问题：Python StringIndexer.fit方法的具体用法？Python StringIndexer.fit怎么用？Python StringIndexer.fit使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.ml.feature.StringIndexer的用法示例。


示例1: mapClickCategoricalFeatures

# 需要导入模块: from pyspark.ml.feature import StringIndexer [as 别名]
# 注意: fit 是 StringIndexer 实例的方法，无法单独导入；请先创建 StringIndexer(...) 再调用 .fit(df)
def mapClickCategoricalFeatures():
    """Index the categorical click columns and save the result as CSV.

    Loads the clicks DataFrame through ``getDataFrame(CLICKS_HDPFILEPATH)``,
    adds a ``<column>Index`` column for each of C2, C3, C4, C5, C7 and C8 with
    a ``StringIndexer``, and writes the selected columns using the
    ``com.databricks.spark.csv`` format to a path under ``HADOOPDIR``.

    Returns:
        None. The function prints intermediate frames and writes the output.
    """
    df = getDataFrame(CLICKS_HDPFILEPATH)

    print(df.columns)
    # Columns to be mapped to index columns.
    click_cols = ["C2", "C3", "C4", "C5", "C7", "C8"]

    # Start from the loaded frame; every pass appends one "<column>Index"
    # column, so later indexers are fitted on the already extended frame.
    indexed = df
    for name in click_cols:
        print(indexed)
        indexer = StringIndexer(inputCol=name, outputCol=name + "Index")
        indexed = indexer.fit(indexed).transform(indexed)

    indexed.select(
        'C0', 'C1', 'C2Index', 'C3Index', 'C4Index', 'C5Index', 'C6',
        'C7Index', 'C8Index'
    ).write.format('com.databricks.spark.csv').save(
        HADOOPDIR + "data/click_fraud/extraction/clicks_23feb12.csv")

示例2: train_random_forest

# 需要导入模块: from pyspark.ml.feature import StringIndexer [as 别名]
# 注意: fit 是 StringIndexer 实例的方法，无法单独导入；请先创建 StringIndexer(...) 再调用 .fit(df)
def train_random_forest(df):
    """Fit a small random forest on ``df`` after indexing its label column.

    The "label" column is indexed into "indexed" with a ``StringIndexer`` and
    the classifier is trained against that indexed column.

    Args:
        df: DataFrame containing a "label" column; the feature column is the
            classifier's default since none is set here.

    Returns:
        A ``(classifier, fitted_model)`` tuple.
    """
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    td = si_model.transform(df)
    # NOTE(review): the original call was cut off after ``labelCol`` and never
    # closed; any further argument that followed (e.g. a seed) is unknown and
    # has not been re-invented here.
    rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="indexed")
    return rf, rf.fit(td)

示例3: build_decisionTree

# 需要导入模块: from pyspark.ml.feature import StringIndexer [as 别名]
# 注意: fit 是 StringIndexer 实例的方法，无法单独导入；请先创建 StringIndexer(...) 再调用 .fit(df)
def build_decisionTree(path):
    """Cross-validate a decision tree over several depths and report its score.

    The data is loaded with ``load_data`` and cleaned with
    ``data_preparation``; the 'Cabin', 'Ticket' and 'Name' columns are dropped
    and "Survived" is indexed into "indexed", which the tree uses as label.

    Args:
        path: Passed straight to ``load_data``.

    Returns:
        A ``(cvModel, avg_age)`` tuple: the fitted cross-validator model and
        the ``avg_age`` value used during data preparation.
    """
    df = load_data(path)
    # NOTE(review): ``avg_age`` is never assigned inside this function, so it
    # has to resolve to a module-level name - confirm it is defined there.
    df = data_preparation(df, avg_age)

    df = df.drop('Cabin').drop('Ticket').drop('Name')

    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)

    dt = DecisionTreeClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(dt.maxDepth, [1, 2, 3, 5, 6, 8, 10]).build()

    # NOTE(review): the evaluator is created with its default label column
    # while the tree is trained on 'indexed' - confirm this is intended.
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=dt, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)

    # Evaluated on the same frame that was used for fitting.
    prediction = cvModel.transform(df)

    # Single-argument print() emits the same text on Python 2 and Python 3.
    print("classification evaluation : %s" % (evaluator.evaluate(prediction),))

    return cvModel, avg_age

示例4: build_randomForest

# 需要导入模块: from pyspark.ml.feature import StringIndexer [as 别名]
# 注意: fit 是 StringIndexer 实例的方法，无法单独导入；请先创建 StringIndexer(...) 再调用 .fit(df)
def build_randomForest(path):
    """Cross-validate a random forest over several depths and report its score.

    The data is loaded with ``load_data`` and cleaned with
    ``data_preparation``; the 'Cabin', 'Ticket' and 'Name' columns are dropped
    and "Survived" is indexed into "indexed", which the forest uses as label.

    Args:
        path: Passed straight to ``load_data``.

    Returns:
        A ``(cvModel, avg_age)`` tuple: the fitted cross-validator model and
        the ``avg_age`` value used during data preparation.
    """
    df = load_data(path)
    # NOTE(review): ``avg_age`` is never assigned inside this function, so it
    # has to resolve to a module-level name - confirm it is defined there.
    df = data_preparation(df, avg_age)

    df = df.drop('Cabin').drop('Ticket').drop('Name')

    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)

    rdf = RandomForestClassifier(labelCol='indexed')
    # The original statement ended in a dangling line continuation, so the
    # builder was never turned into a list of param maps. Any additional
    # ``addGrid`` call that followed is unknown and is not reconstructed.
    grid = ParamGridBuilder().addGrid(rdf.maxDepth, [1, 2, 3, 5, 6, 8, 10]).build()

    # NOTE(review): the evaluator is created with its default label column
    # while the forest is trained on 'indexed' - confirm this is intended.
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=rdf, estimatorParamMaps=grid, evaluator=evaluator)
    # Fit the cross-validator (the original fitted the bare classifier and
    # left ``cv`` and ``grid`` unused), matching build_decisionTree.
    cvModel = cv.fit(df)

    # Evaluated on the same frame that was used for fitting.
    prediction = cvModel.transform(df)

    # Single-argument print() emits the same text on Python 2 and Python 3.
    print("classification evaluation : %s" % (evaluator.evaluate(prediction),))

    return cvModel, avg_age

示例5: main

# 需要导入模块: from pyspark.ml.feature import StringIndexer [as 别名]
# 注意: fit 是 StringIndexer 实例的方法，无法单独导入；请先创建 StringIndexer(...) 再调用 .fit(df)
def main(sc, spark):
    """Train a multinomial logistic-regression pipeline and print its test error.

    Args:
        sc: Passed to ``load_corpus``.
        spark: Passed to ``load_corpus``.

    The corpus is vectorized, its "label" column is indexed into
    "indexedLabel", the data is split 80/20, and the pipeline is fitted on the
    training part and evaluated for accuracy on the test part.
    """
    # Load and vectorize the corpus
    corpus = load_corpus(sc, spark)
    vector = make_vectorizer().fit(corpus)

    # Index the labels of the classification
    labelIndex = StringIndexer(inputCol="label", outputCol="indexedLabel")
    labelIndex = labelIndex.fit(corpus)

    # Split the data into training and test sets
    training, test = corpus.randomSplit([0.8, 0.2])

    # Create the classifier
    clf = LogisticRegression(
        maxIter=10, regParam=0.3, elasticNetParam=0.8,
        family="multinomial", labelCol="indexedLabel", featuresCol="tfidf")

    # Create the model. The original stage list was never closed or fitted;
    # ``model.transform`` and ``model.stages`` below need a fitted pipeline,
    # so it is fitted on the training split here.
    model = Pipeline(stages=[
        vector, labelIndex, clf
    ]).fit(training)

    # Make predictions
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    # Stage 2 is the fitted classifier (third entry of the stage list).
    lrModel = model.stages[2]
    print(lrModel)  # summary only

示例6: mapPublisherCategoricalFeatures

# 需要导入模块: from pyspark.ml.feature import StringIndexer [as 别名]
# 注意: fit 是 StringIndexer 实例的方法，无法单独导入；请先创建 StringIndexer(...) 再调用 .fit(df)
def mapPublisherCategoricalFeatures():
    """Index the categorical publisher columns and save the result as CSV.

    Adds a ``<column>Index`` column for each of C0, C1, C2 and C3 with a
    ``StringIndexer`` and writes the four index columns using the
    ``com.databricks.spark.csv`` format to a path under ``HADOOPDIR``.

    Returns:
        None. The function prints intermediate frames and writes the output.
    """
    # NOTE(review): ``df`` is never assigned inside this function; the line
    # that loaded the publishers DataFrame appears to be missing. As written,
    # ``df`` must resolve to a module-level name - confirm or restore the load.
    print(df.columns)
    publisher_cols = ["C0", "C1", "C2", "C3"]

    # Start from the source frame; every pass appends one "<column>Index"
    # column, so later indexers are fitted on the already extended frame.
    indexed = df
    for name in publisher_cols:
        print(indexed)
        # StringIndexer maps each value of the input column to a double index
        # and stores it in a new column of the DataFrame.
        indexer = StringIndexer(inputCol=name, outputCol=name + "Index")
        # Fit the indexer and transform the frame with the fitted model.
        indexed = indexer.fit(indexed).transform(indexed)

    indexed.select(
        'C0Index', 'C1Index', 'C2Index', "C3Index"
    ).write.format('com.databricks.spark.csv').save(
        HADOOPDIR + "data/click_fraud/extraction/publishers_23feb12.csv")

示例7: testClassification

# 需要导入模块: from pyspark.ml.feature import StringIndexer [as 别名]
# 注意: fit 是 StringIndexer 实例的方法，无法单独导入；请先创建 StringIndexer(...) 再调用 .fit(df)
def testClassification(data):
    """Train a random forest on ``data`` and print predictions and ROC AUC.

    Args:
        data: DataFrame with a "label" column; it is indexed into
            "indexLabel", which the classifier uses as its label column.

    The indexed data is split 80/20 with seed 13. Every prediction row, every
    (score, label) pair and finally the areaUnderROC metric are printed.
    """
    # Train a RandomForestClassifier model.
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexLabel")
    si_model = stringIndexer.fit(data)
    td = si_model.transform(data)

    rf = RandomForestClassifier(numTrees=5, maxDepth=4, labelCol="indexLabel", seed=13)

    trainData, testData = td.randomSplit([0.8, 0.2], 13)

    predictionDF = rf.fit(trainData).transform(testData)

    # NOTE(review): the original statement ended in a dangling line
    # continuation, so whatever projection followed ``predictionDF`` was lost.
    # All columns are kept here so the evaluator below still finds its inputs.
    selected = predictionDF
    for row in selected.collect():
        print(row)

    # Go through the underlying RDD: ``.rdd.map`` is available on DataFrames
    # regardless of whether the DataFrame itself still exposes ``map``.
    scoresAndLabels = predictionDF.rdd.map(
        lambda x: (float(x.probability.toArray()[1]), x.indexLabel))
    for sl in scoresAndLabels.collect():
        print(sl)

    evaluator = BinaryClassificationEvaluator(labelCol='indexLabel', metricName='areaUnderROC')
    metric = evaluator.evaluate(selected)
    print(metric)

示例8: label

# 需要导入模块: from pyspark.ml.feature import StringIndexer [as 别名]
# 注意: fit 是 StringIndexer 实例的方法，无法单独导入；请先创建 StringIndexer(...) 再调用 .fit(df)
def label(df, column):
    """Create a labeled column.

    Args:
        df: DataFrame to extend.
        column: Name of the column to index.

    Returns:
        ``df`` transformed by a ``StringIndexer`` fitted on it, with the
        indexed values stored in a new ``<column>_label`` column.
    """
    indexer = StringIndexer(inputCol=column, outputCol=column + '_label')
    return indexer.fit(df).transform(df)

示例9: indexStringColumns

# 需要导入模块: from pyspark.ml.feature import StringIndexer [as 别名]
# 注意: fit 是 StringIndexer 实例的方法，无法单独导入；请先创建 StringIndexer(...) 再调用 .fit(df)
def indexStringColumns(df, cols):
    """Replace each column named in ``cols`` with its StringIndexer output.

    For every name a ``StringIndexer`` is fitted on the current frame, the
    original column is dropped and the temporary ``<name>-x`` index column is
    renamed to take its place.

    Args:
        df: Source DataFrame.
        cols: Iterable of column names to index.

    Returns:
        The resulting DataFrame; ``df`` itself when ``cols`` is empty.
    """
    result = df  # rebound once per processed column
    for name in cols:
        tmp_name = name + "-x"
        fitted = StringIndexer(inputCol=name, outputCol=tmp_name).fit(result)
        result = (
            fitted.transform(result)
            .drop(name)
            .withColumnRenamed(tmp_name, name)
        )
    return result

示例10: events

# 需要导入模块: from pyspark.ml.feature import StringIndexer [as 别名]
# 注意: fit 是 StringIndexer 实例的方法，无法单独导入；请先创建 StringIndexer(...) 再调用 .fit(df)
def events(df,column_name):
    """Index ``column_name`` and run the index through a ``OneHotEncoder``.

    Args:
        df: Source DataFrame.
        column_name: Column to index; the index is written to
            ``<column_name>I`` and the encoder output to ``<column_name>V``.

    Returns:
        The DataFrame produced by the encoder's ``transform``.
    """
    index_col = column_name + "I"
    vector_col = column_name + "V"

    indexer_model = StringIndexer(inputCol=column_name, outputCol=index_col).fit(df)
    with_index = indexer_model.transform(df)

    # The encoder is applied directly through ``transform``, without a fit step.
    one_hot = OneHotEncoder(inputCol=index_col, outputCol=vector_col)
    return one_hot.transform(with_index)

示例11: indexStringColumns

# 需要导入模块: from pyspark.ml.feature import StringIndexer [as 别名]
# 注意: fit 是 StringIndexer 实例的方法，无法单独导入；请先创建 StringIndexer(...) 再调用 .fit(df)
def indexStringColumns(df, cols):
    """Swap every column listed in ``cols`` for its indexed counterpart.

    Each column is indexed into a temporary ``<name>-num`` column by a
    ``StringIndexer`` fitted on the current frame; the original column is then
    dropped and the temporary column renamed back to the original name.

    Args:
        df: Source DataFrame.
        cols: Iterable of column names to index.

    Returns:
        The DataFrame after all listed columns have been replaced.
    """
    from pyspark.ml.feature import StringIndexer

    current = df  # updated once for every column processed
    for column in cols:
        indexed_name = column + "-num"
        model = StringIndexer(inputCol=column, outputCol=indexed_name).fit(current)
        without_original = model.transform(current).drop(column)
        current = without_original.withColumnRenamed(indexed_name, column)
    return current

示例12: oneHotEncoding

# 需要导入模块: from pyspark.ml.feature import StringIndexer [as 别名]
# 注意: fit 是 StringIndexer 实例的方法，无法单独导入；请先创建 StringIndexer(...) 再调用 .fit(df)
    def oneHotEncoding(self, df, input_col):
        """Encode ``input_col`` and return it next to the ``id`` column.

        Args:
            df: Source DataFrame; its ``id`` column is carried into the result.
            input_col: Name of the column to index and encode.

        Returns:
            A cached DataFrame holding ``df.id`` and a ``num_<input_col>``
            column produced by converting the encoder output through a UDF.
        """
        indexer_model = StringIndexer(inputCol=input_col, outputCol="indexed").fit(df)
        indexed_df = indexer_model.transform(df)

        # dropLast=False is passed explicitly to the encoder.
        one_hot = OneHotEncoder(inputCol="indexed", outputCol="features", dropLast=False)
        encoded = one_hot.transform(indexed_df).select(df.id, 'features').cache()

        # The UDF densifies the encoder output and turns it into a Python list;
        # no return type is declared for it.
        to_list = udf(lambda vec: Vectors.dense(vec).tolist())
        converted = to_list(encoded.features).alias("num_" + input_col)

        return encoded.select(df.id, converted).cache()

示例13: test_string_indexer_handle_invalid

# 需要导入模块: from pyspark.ml.feature import StringIndexer [as 别名]
# 注意: fit 是 StringIndexer 实例的方法，无法单独导入；请先创建 StringIndexer(...) 再调用 .fit(df)
    def test_string_indexer_handle_invalid(self):
        """Check how ``handleInvalid`` "keep" and "skip" treat a null label.

        With "keep" the null row is expected to receive an extra index after
        the real labels; with "skip" the null row is expected to be dropped.
        """
        df = self.spark.createDataFrame([
            (0, "a"),
            (1, "d"),
            (2, None)], ["id", "label"])

        # The original call was cut off after ``handleInvalid`` and never
        # closed. The expected indices below (a -> 0.0, d -> 1.0) require a
        # deterministic alphabetical ordering, so that ordering is requested
        # explicitly here.
        si1 = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="keep",
                            stringOrderType="alphabetAsc")
        model1 = si1.fit(df)
        td1 = model1.transform(df)
        actual1 = td1.select("id", "indexed").collect()
        expected1 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0), Row(id=2, indexed=2.0)]
        self.assertEqual(actual1, expected1)

        # Reuse the same indexer with the invalid-handling mode switched.
        si2 = si1.setHandleInvalid("skip")
        model2 = si2.fit(df)
        td2 = model2.transform(df)
        actual2 = td2.select("id", "indexed").collect()
        expected2 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0)]
        self.assertEqual(actual2, expected2)

示例14: time

# 需要导入模块: from pyspark.ml.feature import StringIndexer [as 别名]
# 注意: fit 是 StringIndexer 实例的方法，无法单独导入；请先创建 StringIndexer(...) 再调用 .fit(df)
# Single-argument print() emits the same text on Python 2 and Python 3.
print("Creating sparse vectors for all data based on this new dictionary")
t0 = time()
# NOTE(review): nothing runs between the two time() calls, so ``tt`` only
# measures the call overhead; the timed step appears to be missing here.
tt = time() - t0
print("Done in {} second".format(round(tt, 3)))

# In[328]:

# Index the string 'label' column into the numeric 'target_indexed' column.
from pyspark.ml.feature import StringIndexer
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainSelect)
dfTrainIndexed = string_indexer_model.transform(dfTrainSelect)

# In[329]:

# Decision tree reading 'bigramVectors' as features and the indexed label.
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol='bigramVectors', labelCol='target_indexed', maxDepth=10)

# In[330]:

# NOTE(review): the 'precision' metric name may not be accepted by every Spark
# release - confirm against the version in use.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')


示例15: StringIndexer

# 需要导入模块: from pyspark.ml.feature import StringIndexer [as 别名]
# 注意: fit 是 StringIndexer 实例的方法，无法单独导入；请先创建 StringIndexer(...) 再调用 .fit(df)
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    # Build (or reuse) a session. The original statement stopped right after
    # ``SparkSession`` with a dangling line continuation, which left ``spark``
    # bound to the class instead of a session object.
    spark = SparkSession\
        .builder\
        .appName("IndexToStringExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame(
        [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
        ["id", "category"])

    # Map the string column "category" to the numeric "categoryIndex" column.
    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    model = indexer.fit(df)
    indexed = model.transform(df)

    print("Transformed string column '%s' to indexed column '%s'"
          % (indexer.getInputCol(), indexer.getOutputCol()))

    print("StringIndexer will store labels in output column metadata\n")

    # Map the indices back to strings in the "originalCategory" column.
    converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
    converted = converter.transform(indexed)

    print("Transformed indexed column '%s' back to original string column '%s' using "
          "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
    converted.select("id", "categoryIndex", "originalCategory").show()
    # $example off$

    # Release the session's resources once the example has finished.
    spark.stop()
