This article collects typical usage examples of the Python method pyspark.ml.feature.IDF.getOutputCol. If you are unsure what IDF.getOutputCol does or how to use it, the hand-picked code examples below should help. You can also read up on the containing class, pyspark.ml.feature.IDF, for more background.
The sections below show 5 code examples of IDF.getOutputCol, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system surface better Python code examples.
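For orientation before the examples: getOutputCol simply returns the name of the column an IDF stage is configured to write its TF-IDF vectors to, which makes it convenient for wiring stages together without hard-coding column names. Here is a minimal, self-contained sketch; the toy data, app name, and column names are illustrative, not taken from the examples below.
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

spark = SparkSession.builder.appName("idf-getoutputcol-demo").getOrCreate()
docs = spark.createDataFrame(
    [(0, "spark makes big data simple"),
     (1, "spark ml pipelines chain feature stages")],
    ["id", "text"])

tokenizer = Tokenizer(inputCol="text", outputCol="words")
tf = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="raw_features", numFeatures=32)
idf = IDF(inputCol=tf.getOutputCol(), outputCol="features")

print(idf.getOutputCol())  # -> "features": the column the fitted IDFModel will produce

tokenized = tf.transform(tokenizer.transform(docs))
rescaled = idf.fit(tokenized).transform(tokenized)
rescaled.select("id", idf.getOutputCol()).show(truncate=False)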
Example 1: fit_kmeans
# Required import: from pyspark.ml.feature import IDF [as alias]
# Or: from pyspark.ml.feature.IDF import getOutputCol [as alias]
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import HashingTF, IDF, Normalizer, StopWordsRemover, Tokenizer

def fit_kmeans(spark, products_df):
    step = 0
    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")
    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol=str(step) + "_stopwords")
    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(), outputCol=str(step) + "_tf", numFeatures=16)
    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")
    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol=str(step) + "_normalizer")
    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(), predictionCol=str(step) + "_kmeans", k=2, seed=20)

    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])
    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    model.save("./kmeans")  # the entire fitted pipeline (PipelineModel) is saved to a folder
    return model, words_prediction
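Since the function persists the fitted pipeline with model.save("./kmeans"), a natural follow-up is reloading it to score new rows. A minimal sketch, assuming a DataFrame new_products_df with a "title" column (that name and the DataFrame are illustrative, not part of the original example):
from pyspark.ml import PipelineModel

# Reload the fitted pipeline saved by fit_kmeans and score new rows.
reloaded = PipelineModel.load("./kmeans")
clustered = reloaded.transform(new_products_df)
# The KMeans prediction column is the last column added by the pipeline.
clustered.select("title", clustered.columns[-1]).show()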
Example 2: SQLContext
# Required import: from pyspark.ml.feature import IDF [as alias]
# Or: from pyspark.ml.feature.IDF import getOutputCol [as alias]
print "Text is cleaned"
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ["review", "label"])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])
print "Random split is done"
tokenizerNoSw = tr.NLTKWordPunctTokenizer(
inputCol="review", outputCol="wordsNoSw", stopwords=set(nltk.corpus.stopwords.words("english"))
)
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol="reviews_tf")
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol="label", outputCol="target_indexed")
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)
pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])
# ****************************************************************
# *********************CROSS VALIDATION: 80%/20%******************
# *******************Model: DecisionTreeClassifier*****************
# *****************************************************************
evaluator = MulticlassClassificationEvaluator(
predictionCol="prediction", labelCol="target_indexed", metricName="precision"
)
grid = ParamGridBuilder().baseOn([evaluator.metricName, "precision"]).addGrid(dt.maxDepth, [10, 20]).build()
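The excerpt stops after building the parameter grid. A minimal sketch of how the grid, evaluator, and pipeline would typically feed the cross-validation announced in the banner, assuming dfTrain/dfTest from the split above and a Spark version that still accepts the "precision" metric:
from pyspark.ml.tuning import CrossValidator

cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid, evaluator=evaluator, numFolds=3)
cv_model = cv.fit(dfTrain)               # fits one model per grid point per fold, keeps the best
predictions = cv_model.transform(dfTest)  # score the held-out 20%
print("Test metric:", evaluator.evaluate(predictions))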
Example 3: SQLContext
# Required import: from pyspark.ml.feature import IDF [as alias]
# Or: from pyspark.ml.feature.IDF import getOutputCol [as alias]
print "Text is cleaned"
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8,0.2])
print "Random split is done"
tokenizerNoSw = tr.NLTKWordPunctTokenizer(
inputCol="review", outputCol="wordsNoSw",
stopwords=set(nltk.corpus.stopwords.words('english')))
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = LogisticRegression(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(),maxIter=30, regParam=0.01)
pipeline = Pipeline(stages=[tokenizerNoSw,
hashing_tf,
idf,
string_indexer,
dt])
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')
# grid=(ParamGridBuilder()
# .baseOn([evaluator.metricName,'precision'])
# .addGrid(dt.maxDepth, [10,20])
# .build())
#cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid,evaluator=evaluator)
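With the cross-validation lines commented out, the pipeline would typically be fitted and evaluated directly. A minimal sketch under that assumption, reusing the dfTrain/dfTest split, evaluator, and idf stage defined above:
model = pipeline.fit(dfTrain)
predictions = model.transform(dfTest)
print("Test metric:", evaluator.evaluate(predictions))

# getOutputCol() also tells us which column holds the TF-IDF features:
predictions.select(idf.getOutputCol(), "prediction").show(5, truncate=False)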
Example 4: HashingTF
# Required import: from pyspark.ml.feature import IDF [as alias]
# Or: from pyspark.ml.feature.IDF import getOutputCol [as alias]
from pyspark.ml.feature import HashingTF, IDF
# The H2O estimators below come from Sparkling Water's pysparkling.ml package;
# `stopWordsRemover` and `algo` are defined earlier in the original script.
from pysparkling.ml import H2OAutoML, H2ODeepLearning, H2OGBM

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)
## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)

if algo == "gbm":
    ## Create GBM model
    algoStage = H2OGBM(ratio=0.8,
                       seed=1,
                       featuresCols=[idf.getOutputCol()],
                       predictionCol="label")
elif algo == "dl":
    ## Create H2ODeepLearning model
    algoStage = H2ODeepLearning(epochs=10,
                                seed=1,
                                l1=0.001,
                                l2=0.0,
                                hidden=[200, 200],
                                featuresCols=[idf.getOutputCol()],
                                predictionCol="label")
elif algo == "automl":
    ## Create H2OAutoML model
    algoStage = H2OAutoML(convertUnknownCategoricalLevelsToNa=True,
                          maxRuntimeSecs=60,  # 1 minute
                          maxModels=3,
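The excerpt is cut off inside the H2OAutoML constructor. Whichever estimator algoStage ends up being, it would typically be chained after the feature stages in a standard Pipeline. A minimal sketch, assuming the tokenizer, stopWordsRemover, and a training DataFrame trainData exist earlier in the original script (those names are assumptions here):
from pyspark.ml import Pipeline

# tokenizer, stopWordsRemover, and trainData are assumed from the original script.
pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, hashingTF, idf, algoStage])
model = pipeline.fit(trainData)
# The tf_idf column reported by idf.getOutputCol() shows up in the scored schema.
model.transform(trainData).printSchema()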
Example 5: RegexTokenizer
# Required import: from pyspark.ml.feature import IDF [as alias]
# Or: from pyspark.ml.feature.IDF import getOutputCol [as alias]
from pyspark.ml.feature import RegexTokenizer
tokenizer = RegexTokenizer().setInputCol("text").setOutputCol("words").setPattern("\\W+")
# COMMAND ----------
# MAGIC %md
# MAGIC Create a `HashingTF` transformer to hash words to buckets with counts, then use an `IDF` estimator to compute inverse-document frequency for buckets based on how frequently words have hashed to those buckets in the given documents. Next, normalize the tf-idf values so that the \\( l^2 \\) norm is one for each row.
# COMMAND ----------
from pyspark.ml.feature import IDF, HashingTF, Normalizer
hashingTF = HashingTF().setNumFeatures(10000).setInputCol(tokenizer.getOutputCol()).setOutputCol("hashingTF")
idf = IDF().setMinDocFreq(10).setInputCol(hashingTF.getOutputCol()).setOutputCol("idf")
normalizer = Normalizer().setInputCol(idf.getOutputCol()).setOutputCol("features")
# COMMAND ----------
# MAGIC %md
# MAGIC Now, let's build the `KMeans` estimator and a `Pipeline` that will contain all of the stages. We'll then call fit on the `Pipeline` which will give us back a `PipelineModel`. This will take about a minute to run.
# COMMAND ----------
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
kmeans = KMeans().setFeaturesCol("features").setPredictionCol("prediction").setK(5).setSeed(0)
pipeline = Pipeline().setStages([tokenizer, hashingTF, idf, normalizer, kmeans])
model = pipeline.fit(parsed)
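Once the PipelineModel is fitted, the KMeans stage can be pulled back out to inspect the learned centroids and the cluster assignments. A minimal sketch continuing from the notebook above (only the inspection code is new; `model` and `parsed` come from the fit call):
# COMMAND ----------
# The fitted KMeansModel is the last stage of the PipelineModel.
kmeans_model = model.stages[-1]
for i, center in enumerate(kmeans_model.clusterCenters()):
    print("cluster", i, "center norm:", float(center.dot(center)) ** 0.5)

# Assign every parsed document to a cluster and count cluster sizes.
model.transform(parsed).groupBy("prediction").count().show()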