当前位置: 首页>>代码示例>>Python>>正文


Python IDF.getOutputCol方法代码示例

本文整理汇总了Python中pyspark.ml.feature.IDF.getOutputCol方法的典型用法代码示例。如果您正苦于以下问题:Python IDF.getOutputCol方法的具体用法?Python IDF.getOutputCol怎么用?Python IDF.getOutputCol使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pyspark.ml.feature.IDF的用法示例。


在下文中一共展示了IDF.getOutputCol方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: fit_kmeans

# 需要导入模块: from pyspark.ml.feature import IDF [as 别名]
# 或者: from pyspark.ml.feature.IDF import getOutputCol [as 别名]
def fit_kmeans(spark, products_df):
    step = 0

    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")

    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol=str(step) + "_stopwords")

    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(), outputCol=str(step) + "_tf", numFeatures=16)

    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")

    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol=str(step) + "_normalizer")

    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(), predictionCol=str(step) + "_kmeans", k=2, seed=20)

    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])

    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    model.save("./kmeans")  # the whole machine learning instance is saved in a folder
    return model, words_prediction
开发者ID:ohliumliu,项目名称:flash_deals_c9,代码行数:29,代码来源:kmean_model.py

示例2: SQLContext

# 需要导入模块: from pyspark.ml.feature import IDF [as 别名]
# 或者: from pyspark.ml.feature.IDF import getOutputCol [as 别名]
print "Text is cleaned"

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ["review", "label"])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])

print "Random split is done"

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review", outputCol="wordsNoSw", stopwords=set(nltk.corpus.stopwords.words("english"))
)
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol="reviews_tf")
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol="label", outputCol="target_indexed")
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)

pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])


# ****************************************************************
# *********************CROSS VALIDATION: 80%/20%******************
# *******************Model: DecisionTreeClassifier*****************
# *****************************************************************

evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction", labelCol="target_indexed", metricName="precision"
)

grid = ParamGridBuilder().baseOn([evaluator.metricName, "precision"]).addGrid(dt.maxDepth, [10, 20]).build()
开发者ID:pifouuu,项目名称:ProjetBigData,代码行数:31,代码来源:script1.py

示例3: SQLContext

# 需要导入模块: from pyspark.ml.feature import IDF [as 别名]
# 或者: from pyspark.ml.feature.IDF import getOutputCol [as 别名]
print "Text is cleaned"

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8,0.2])

print "Random split is done"

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review", outputCol="wordsNoSw",  
    stopwords=set(nltk.corpus.stopwords.words('english')))
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = LogisticRegression(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(),maxIter=30, regParam=0.01)

pipeline = Pipeline(stages=[tokenizerNoSw,
                            hashing_tf,
                            idf,
                            string_indexer,
                            dt])

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')

# grid=(ParamGridBuilder()
#      .baseOn([evaluator.metricName,'precision'])
#      .addGrid(dt.maxDepth, [10,20])
#      .build())

#cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid,evaluator=evaluator)
开发者ID:pifouuu,项目名称:ProjetBigData,代码行数:32,代码来源:script1_nocv.py

示例4: HashingTF

# 需要导入模块: from pyspark.ml.feature import IDF [as 别名]
# 或者: from pyspark.ml.feature.IDF import getOutputCol [as 别名]
## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)


if algo == "gbm":
    ## Create GBM model
    algoStage = H2OGBM(ratio=0.8,
                 seed=1,
                 featuresCols=[idf.getOutputCol()],
                 predictionCol="label")
elif algo == "dl":
    ## Create H2ODeepLearning model
    algoStage = H2ODeepLearning(epochs=10,
                         seed=1,
                         l1=0.001,
                         l2=0.0,
                         hidden=[200, 200],
                         featuresCols=[idf.getOutputCol()],
                         predictionCol="label")
elif algo == "automl":
    ## Create H2OAutoML model
    algoStage = H2OAutoML(convertUnknownCategoricalLevelsToNa=True,
                       maxRuntimeSecs=60, # 1 minutes
                       maxModels=3,
开发者ID:h2oai,项目名称:sparkling-water,代码行数:33,代码来源:ham_or_spam_multi_algo.py

示例5: RegexTokenizer

# 需要导入模块: from pyspark.ml.feature import IDF [as 别名]
# 或者: from pyspark.ml.feature.IDF import getOutputCol [as 别名]
tokenizer = RegexTokenizer().setInputCol("text").setOutputCol("words").setPattern("\\W+")

# COMMAND ----------

# MAGIC %md
# MAGIC Create a `HashingTF` transformer to hash words to buckets with counts, then use an `IDF` estimator to compute inverse-document frequency for buckets based on how frequently words have hashed to those buckets in the given documents.  Next, normalize the tf-idf values so that the \\( l^2 \\) norm is one for each row.

# COMMAND ----------

from pyspark.ml.feature import IDF, HashingTF, Normalizer

hashingTF = HashingTF().setNumFeatures(10000).setInputCol(tokenizer.getOutputCol()).setOutputCol("hashingTF")

idf = IDF().setMinDocFreq(10).setInputCol(hashingTF.getOutputCol()).setOutputCol("idf")

normalizer = Normalizer().setInputCol(idf.getOutputCol()).setOutputCol("features")

# COMMAND ----------

# MAGIC %md
# MAGIC Now, let's build the `KMeans` estimator and a `Pipeline` that will contain all of the stages.  We'll then call fit on the `Pipeline` which will give us back a `PipelineModel`.  This will take about a minute to run.

# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans

kmeans = KMeans().setFeaturesCol("features").setPredictionCol("prediction").setK(5).setSeed(0)

pipeline = Pipeline().setStages([tokenizer, hashingTF, idf, normalizer, kmeans])
model = pipeline.fit(parsed)
开发者ID:Inscrutive,项目名称:spark,代码行数:33,代码来源:TFIDF.py


注:本文中的pyspark.ml.feature.IDF.getOutputCol方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。