This article collects and summarizes typical usage examples of the Python method pyspark.ml.feature.StringIndexer.transform. If you are wondering what StringIndexer.transform does or how to use it, the curated code examples below may help. You can also explore further usage examples of its enclosing class, pyspark.ml.feature.StringIndexer.
The following shows 3 code examples of the StringIndexer.transform method, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
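Before diving into the collected examples, here is a minimal, self-contained sketch of the fit/transform pattern they all rely on (the toy data and the column names category/categoryIndex are illustrative, not taken from the examples below):

from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer

spark = SparkSession.builder.appName("string-indexer-demo").getOrCreate()

# A toy DataFrame with a single categorical column.
df = spark.createDataFrame(
    [("a",), ("b",), ("c",), ("a",), ("a",), ("c",)], ["category"])

# fit() learns the string-to-index mapping (the most frequent label gets 0.0);
# transform() appends the numeric index column to the DataFrame.
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = indexer.fit(df)
indexed = model.transform(df)
indexed.show()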
Example 1: run
# Required import: from pyspark.ml.feature import StringIndexer [as alias]
# Or alternatively: from pyspark.ml.feature.StringIndexer import transform [as alias]
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorIndexer

# get_labeled_points, get_model, val, get_cur and dfToTableWithPar are
# project-local helpers from the original codebase.
def run(start1, end1, start2, end2, df, sc, sql_context, is_pred):
    lp_data = get_labeled_points(start1, end2, df, sc, sql_context)
    print(lp_data.count())

    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(lp_data)
    td = labelIndexer.transform(lp_data)

    # Build a mapping from each original label to the index StringIndexer assigned it.
    label2index = {}
    for each in sorted(set([(i[0], i[1]) for i in td.select(td.label, td.indexedLabel).distinct().collect()]),
                       key=lambda x: x[0]):
        label2index[int(each[0])] = int(each[1])
    print(label2index)

    # Features with more than 4 distinct values are treated as continuous.
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(lp_data)

    rf = get_model()
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])

    lp_train = lp_data.filter(lp_data.date3 < end1).filter(lp_data.is_labeled == 1)
    model = pipeline.fit(lp_train)

    lp_check = lp_data.filter(lp_data.date2 > start2)
    predictions = model.transform(lp_check)
    predictions = val(predictions, label2index, sql_context)

    if is_pred:
        predictions = predictions.filter(predictions.is_labeled == 0) \
                                 .filter(predictions.date2 == get_cur()) \
                                 .sort(predictions.prob.desc())
        dfToTableWithPar(sql_context, predictions, "predictions", get_cur())
        for each in predictions.take(10):
            print(each)
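Note that the label2index mapping built by hand above can also be recovered from the fitted model itself: StringIndexerModel exposes a labels attribute listing the original labels in index order (available since PySpark 1.5). A hedged one-liner equivalent, assuming numeric labels as in this example:

# labels[k] is the original label that was mapped to index k; labels are
# stored as strings, hence the float() round trip for numeric labels.
label2index = {int(float(label)): idx
               for idx, label in enumerate(labelIndexer.labels)}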
Example 2: applyModel
# Required import: from pyspark.ml.feature import StringIndexer [as alias]
# Or alternatively: from pyspark.ml.feature.StringIndexer import transform [as alias]
#......... part of the code omitted .........
print('Loaded and prepared %d entries' % df.count())
#########
# keep only needed features
#########
features = ['ADLOADINGTIME',
'PLACEMENTID',
'TIMESTAMP',
'CREATIVETYPE',
'UA_HARDWARETYPE',
'UA_VENDOR',
'UA_MODEL',
'UA_BROWSER',
'UA_BROWSERVERSION',
'FILESJSON',
'ERRORSJSON',
'TOPMOSTREACHABLEWINDOWAREA',
'FILESJSON_SIZE',
'COMBINEDID',
'COMBINEDEXTERNALID',
'PLATFORMCOMBINED',
'UA_OSCOMB',
'SDK',
'EXTERNALADSERVER'
]
df = df.select(features)
#########
# Convert categorical features to numerical
#########
featuresCat = [
'PLACEMENTID',
'CREATIVETYPE',
'UA_HARDWARETYPE',
'UA_VENDOR',
'UA_MODEL',
'UA_BROWSER',
'UA_BROWSERVERSION',
'FILESJSON',
'ERRORSJSON',
'COMBINEDID',
'COMBINEDEXTERNALID',
'PLATFORMCOMBINED',
'UA_OSCOMB',
'SDK',
'EXTERNALADSERVER'
]
for i in range(len(featuresCat)):
    # Index each categorical column, skipping rows with unseen values,
    # then drop the raw string column.
    indexer = StringIndexer(inputCol=featuresCat[i], outputCol='_'+featuresCat[i]).setHandleInvalid("skip").fit(df)
    df = indexer.transform(df).drop(featuresCat[i])
    # Persist the fitted indexer via the private JVM handle.
    writer = indexer._call_java("write")
    writer.overwrite().save("indexer_" + featuresCat[i])
featuresCat = [ '_' + featuresCat[i] for i in range(len(featuresCat))]
features = featuresCat[:]
features.append('TIMESTAMP')
features.append('FILESJSON_SIZE')
features.append('TOPMOSTREACHABLEWINDOWAREA')
#########
# Assemble features
#########
assembler = VectorAssembler(
    inputCols=features,
    outputCol="features")
df = assembler.transform(df)
#########
# Convert to labeled point
#########
lp = (df.select(func.col("ADLOADINGTIME").alias("label"), func.col("features"))
.map(lambda row: LabeledPoint(row.label, row.features)))
lp.cache()
#########
# Load trained model
#########
model = RandomForestModel.load(sc, loadModelName)
print('Model loaded!')
predictions = model.predict(lp.map(lambda x: x.features)).collect()
return predictions
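A side note on the indexer._call_java("write") trick in this example: it reaches into the underlying JVM object because early PySpark releases exposed no public persistence API for feature models. On PySpark 2.0 and later, the same save/load round trip is available publicly; a sketch under that assumption:

from pyspark.ml.feature import StringIndexerModel

# Save the fitted indexer through the public MLWritable API (PySpark 2.0+).
indexer.write().overwrite().save("indexer_" + featuresCat[i])

# Later, restore it and reuse the exact same string-to-index mapping.
restored = StringIndexerModel.load("indexer_" + featuresCat[i])
df = restored.transform(df)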
Example 3: VectorAssembler
# Required import: from pyspark.ml.feature import StringIndexer [as alias]
# Or alternatively: from pyspark.ml.feature.StringIndexer import transform [as alias]
train_feature_df = feature_df.filter(feature_df['time'] <= split_time)
test_feature_df = feature_df.filter(feature_df['time'] > split_time)
train_feature_df = train_feature_df.drop('time')
test_feature_df = test_feature_df.drop('time')
assembler = VectorAssembler(
    inputCols=list(set(train_feature_df.columns) - set(['result', 'home_name', 'away_name'])),
    outputCol="features")
train_df = assembler.transform(train_feature_df)
test_df = assembler.transform(test_feature_df)
labelIndexer = StringIndexer(inputCol="result", outputCol="indexedResult").fit(feature_df)
train_df = labelIndexer.transform(train_df)
test_df = labelIndexer.transform(test_df)
label_mapping = dict(enumerate(labelIndexer.labels))
reverse_mapping = {}
for key in label_mapping:
    reverse_mapping[label_mapping[key]] = key
# ## Dimensionality reduction
#
# Feature selection is not really supported in MLlib yet, so we simply apply
# dimensionality reduction with PCA instead.
pca = PCA(inputCol="features", outputCol="pca", k=15).fit(train_df)
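The fitted PCAModel is then applied with transform(), the same fit/transform pattern used by StringIndexer above; a short continuation sketch, assuming the variables defined in this example:

# Project both splits onto the 15 principal components learned on the training set.
train_df = pca.transform(train_df)
test_df = pca.transform(test_df)
train_df.select("pca").show(5, truncate=False)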