当前位置: 首页>>代码示例>>Python>>正文

Python HashingTF.getOutputCol方法代码示例

本文整理汇总了Python中pyspark.ml.feature.HashingTF.getOutputCol方法的典型用法代码示例。如果您正苦于以下问题：Python HashingTF.getOutputCol方法的具体用法？Python HashingTF.getOutputCol怎么用？Python HashingTF.getOutputCol使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.ml.feature.HashingTF的用法示例。


示例1: train_lg

# 需要导入模块: from pyspark.ml.feature import HashingTF [as 别名]
# 或者: from pyspark.ml.feature.HashingTF import getOutputCol [as 别名]
    def train_lg(training_data, collection):
        """Cross-validate a TF-IDF + logistic-regression pipeline and return the fitted model.

        The listing this block came from was truncated: the ``.build()`` call on
        the parameter grid and every ``CrossValidator`` argument after
        ``estimator`` were missing.  They are restored here following the
        cross-validation example of the Spark ML tuning guide, whose comments
        and grid values this code mirrors.

        Args:
            training_data: DataFrame providing the ``filtered`` token column
                read by ``HashingTF``; presumably also a ``label`` column (the
                ``LogisticRegression`` default) -- confirm against the caller.
            collection: Not used by the training itself; only the disabled
                model-saving snippet below reads it.

        Returns:
            The ``CrossValidatorModel`` produced by fitting on ``training_data``.
        """
        # Function-scope import so the block stays self-contained.
        from pyspark.ml.evaluation import BinaryClassificationEvaluator

        # Configure an ML pipeline, which consists of the following stages: hashingTF, idf, and lr.
        hashingTF = HashingTF(inputCol="filtered", outputCol="TF_features")
        idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
        lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

        # All stages live in ONE unfitted pipeline.  Previously hashingTF and idf
        # were fitted up front and the fitted model was wrapped in a second
        # pipeline; the grid over hashingTF.numFeatures then re-hashed the text
        # while the IDF weights stayed those computed for the original hashing,
        # and the IDF statistics were computed on all folds at once.
        pipeline = Pipeline(stages=[hashingTF, idf, lr])

        paramGrid = (
            ParamGridBuilder()
            .addGrid(hashingTF.numFeatures, [10, 100, 1000, 10000])
            .addGrid(lr.regParam, [0.1, 0.01])
            .build()
        )

        # NOTE(review): the evaluator was lost in the truncated listing;
        # BinaryClassificationEvaluator is what the Spark tuning guide uses for
        # this pipeline -- confirm the labels are binary.  numFolds is left at
        # the CrossValidator default.
        crossval = CrossValidator(
            estimator=pipeline,
            estimatorParamMaps=paramGrid,
            evaluator=BinaryClassificationEvaluator(),
        )

        # Run cross-validation, and choose the best set of parameters.
        cvModel = crossval.fit(training_data)

        # Disabled model persistence, kept for reference:
        #     model_path = os.path.join(models_dir , time.strftime("%Y%m%d-%H%M%S") + '_'
        #                             + collection["Id"] + '_'
        #                             + collection["name"])
        #     cvModel.save(sc, model_path)
        return cvModel

示例2: fit_kmeans

# 需要导入模块: from pyspark.ml.feature import HashingTF [as 别名]
# 或者: from pyspark.ml.feature.HashingTF import getOutputCol [as 别名]
def fit_kmeans(spark, products_df):
    """Fit a k-means pipeline on product titles, save it, and return it with its predictions.

    The pipeline chains Tokenizer -> StopWordsRemover -> HashingTF -> IDF ->
    Normalizer -> KMeans.  Each stage writes to a column named
    ``<position>_<stage>`` (``1_tokenizer`` ... ``6_kmeans``) and reads the
    output column of the stage before it.

    Args:
        spark: Not referenced inside this function.
        products_df: DataFrame with a ``title`` column.

    Returns:
        A ``(model, words_prediction)`` tuple: the fitted pipeline model and
        the result of applying it to ``products_df``.
    """

    def column(position, stage_name):
        # Output columns are prefixed with the 1-based position of the stage.
        return str(position) + "_" + stage_name

    tokenizer = Tokenizer(inputCol="title", outputCol=column(1, "tokenizer"))
    stopwords = StopWordsRemover(
        inputCol=tokenizer.getOutputCol(),
        outputCol=column(2, "stopwords"),
    )
    tf = HashingTF(
        inputCol=stopwords.getOutputCol(),
        outputCol=column(3, "tf"),
        numFeatures=16,
    )
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=column(4, "idf"))
    normalizer = Normalizer(
        inputCol=idf.getOutputCol(),
        outputCol=column(5, "normalizer"),
    )
    kmeans = KMeans(
        featuresCol=normalizer.getOutputCol(),
        predictionCol=column(6, "kmeans"),
        k=2,
        seed=20,
    )

    ordered_stages = [tokenizer, stopwords, tf, idf, normalizer, kmeans]
    kmeans_pipeline = Pipeline(stages=ordered_stages)

    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    # The whole fitted pipeline is written to the ./kmeans folder.
    model.save("./kmeans")
    return model, words_prediction

示例3: create_pipeline

# 需要导入模块: from pyspark.ml.feature import HashingTF [as 别名]
# 或者: from pyspark.ml.feature.HashingTF import getOutputCol [as 别名]
def create_pipeline(model_type, num_features=10000):
    """Define the pipeline from bag-of-words to prediction.

    The listing this block came from was truncated (docstring quotes, the end
    of the ``IDF`` call and the end of the ``Pipeline`` call were missing);
    the calls are closed here with the minimum needed to run.

    Args:
        model_type: One of ``'log_reg'``, ``'gbt'``, ``'naive_bayes'``, ``'rf'``.
        num_features: Number of hash buckets passed to ``HashingTF``.

    Returns:
        An unfitted ``Pipeline``: stop-word removal, term hashing, IDF
        weighting, then the selected classifier.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names
            (previously this surfaced as an unbound local variable).
    """
    classifiers = {
        'log_reg': LogisticRegression,
        'gbt': GBTClassifier,
        'naive_bayes': NaiveBayes,
        'rf': RandomForestClassifier,
    }
    classifier_cls = classifiers.get(model_type)
    if classifier_cls is None:
        raise ValueError(
            "unknown model_type %r; expected one of %s"
            % (model_type, sorted(classifiers))
        )

    remover = StopWordsRemover(inputCol="bow", outputCol="words")
    hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts", numFeatures=num_features)
    # "features" is the default featuresCol of the classifiers above, so the
    # classifier picks this column up without further configuration.
    # NOTE(review): any further IDF arguments (e.g. minDocFreq) were lost in
    # the truncated listing -- confirm against the upstream source.
    tfidf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")

    model = classifier_cls()

    # NOTE(review): the stage list was cut off after ``tfidf``; additional
    # stages (such as a label indexer) may have preceded the classifier.
    return Pipeline(stages=[remover, hashingTF, tfidf, model])

示例4: BaselinePipelineEngine

# 需要导入模块: from pyspark.ml.feature import HashingTF [as 别名]
# 或者: from pyspark.ml.feature.HashingTF import getOutputCol [as 别名]
class BaselinePipelineEngine(PipelineEngine):
    """Pipeline engine: HTML parsing, tokenising, TF-IDF, logistic regression.

    On construction it builds the five stages, wraps them in a ``Pipeline``
    and prepares a parameter grid over ``HashingTF.numFeatures`` and
    ``LogisticRegression.regParam``.
    """

    def __init__(self, cv):
        super(BaselinePipelineEngine, self).__init__(cv)
        # Candidate values explored by the parameter grid.
        self.hashing_tf_map = [2 ** 20]
        self.lr_map = [0.1, 0.01]
        self.stages = self._build_stages()
        self.pipeline = Pipeline(
            stages=[
                self.bs_parser,
                self.tokenizer,
                self.hashing_tf,
                self.idf_model,
                self.lr,
            ]
        )
        self.param_grid = self._build_param_grid()

    def _build_stages(self):
        """Create the stages, store each on ``self`` and return them in order."""
        bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
        self.bs_parser = bs_parser

        tokenizer = Tokenizer(inputCol=bs_parser.getOutputCol(), outputCol="words")
        self.tokenizer = tokenizer

        hashing_tf = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="raw_features")
        self.hashing_tf = hashing_tf

        idf_model = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="features")
        self.idf_model = idf_model

        lr = LogisticRegression(maxIter=10, regParam=0.01)
        self.lr = lr

        return [bs_parser, tokenizer, hashing_tf, idf_model, lr]

    def _build_param_grid(self):
        """Return the grid built from ``hashing_tf_map`` and ``lr_map``."""
        return (
            ParamGridBuilder()
            .addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
            .addGrid(self.lr.regParam, self.lr_map)
            .build()
        )

示例5: SQLContext

# 需要导入模块: from pyspark.ml.feature import HashingTF [as 别名]
# 或者: from pyspark.ml.feature.HashingTF import getOutputCol [as 别名]
# Apply cleanLower to the text (first element) of every record; the second
# element is carried through unchanged.
rdd = labeledRdd.map(lambda doc: (cleanLower(doc[0]), doc[1]))

# print(...) with a single string argument behaves the same on Python 2 and 3.
print("Text is cleaned")

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ["review", "label"])
# 80% of the rows for training, 20% for testing.
dfTrain, dfTest = df.randomSplit([0.8, 0.2])

print("Random split is done")

# Each stage reads the output column of the one before it.
# (The closing parenthesis of this call was missing in the listing.)
tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review", outputCol="wordsNoSw", stopwords=set(nltk.corpus.stopwords.words("english"))
)
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol="reviews_tf")
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol="label", outputCol="target_indexed")
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)

pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])

# ****************************************************************
# *********************CROSS VALIDATION: 80%/20%******************
# *******************Model: DecisionTreeClassifier*****************
# *****************************************************************

# (The closing parenthesis of this call was missing in the listing.)
# NOTE(review): metricName="precision" is only accepted by Spark 1.x; later
# releases expect e.g. "weightedPrecision" or "accuracy" -- confirm the
# target Spark version before changing it.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction", labelCol="target_indexed", metricName="precision"
)

示例6: SQLContext

# 需要导入模块: from pyspark.ml.feature import HashingTF [as 别名]
# 或者: from pyspark.ml.feature.HashingTF import getOutputCol [as 别名]
    sqc = SQLContext(sc)
    sm = SparkModel(sc, conn, rdd_path='meta_rdd.pkl')

    logging.basicConfig(format='%(asctime)s %(message)s')
    grid_search = logging.getLogger('main')
    handler = logging.FileHandler('../logs/grid_search.txt')
    # Attach the file handler and lower the threshold to DEBUG: without both,
    # the debug() calls below are filtered out by the default WARNING level
    # and nothing is ever written to the log file.
    grid_search.addHandler(handler)
    grid_search.setLevel(logging.DEBUG)

    # Records after the join look like (key, (bow, label)); keep (label, bow).
    # Index-based access replaces the Python-2-only tuple-parameter lambda.
    # (An earlier assignment to bow_rdd that was immediately overwritten has
    # been dropped.)
    bow_rdd = sm.RDD.join(sm.target).map(lambda kv: (kv[1][1], kv[1][0]))

    remover = StopWordsRemover(inputCol="raw", outputCol="words")
    # The HashingTF call was truncated in the listing.  numFeatures=10000
    # matches the input layer size of the perceptron configured below.
    hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts",
                          numFeatures=10000)
    # NOTE(review): further IDF arguments (e.g. minDocFreq) were lost in the
    # truncated listing -- confirm against the upstream source.
    tfidf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
    indexer = StringIndexer(inputCol="string_label", outputCol="label")

    for model in [GBTClassifier(), RandomForestClassifier(), MultilayerPerceptronClassifier()]:

        # The perceptron needs explicit layer sizes, so the default-constructed
        # instance is replaced by a configured one.
        if isinstance(model, MultilayerPerceptronClassifier):
            layers = [10000, 100, 2]
            model = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128)

        pipeline = Pipeline(stages=[remover, hashingTF, tfidf, # scaler,
                                    indexer, model])
        scores = cross_val_score(pipeline, bow_rdd)
        # Arguments are passed separately so formatting is left to logging.
        grid_search.debug('Model: %s\nscores: %s\nAverage: %s',
                          type(model), scores, scores.mean())

示例7: StopWordsRemover

# 需要导入模块: from pyspark.ml.feature import HashingTF [as 别名]
# 或者: from pyspark.ml.feature.HashingTF import getOutputCol [as 别名]
    # Records after the join look like (key, (bow, label)); keep (label, bow)
    # and draw a 50% sample without replacement.  Index-based access replaces
    # the Python-2-only tuple-parameter lambda.
    bow_rdd = (
        sm.RDD.join(sm.target)
        .map(lambda kv: (kv[1][1], kv[1][0]))
        .sample(withReplacement=False, fraction=.5, seed=1)
    )
    df = sqc.createDataFrame(bow_rdd, ['string_label', 'raw'])
    train_rdd, test_rdd = df.randomSplit([.8, .2], seed=1)
    results = []

    num_features = 5000
    min_doc_freq = 20
    # Candidate network shapes; every one starts with num_features inputs and
    # ends with 2 outputs.
    layers = [[5000, 2056, 512, 128, 2], [5000, 1000, 128, 2], [5000, 100, 2], [5000, 5000, 2]]

    for l in layers:
        remover = StopWordsRemover(inputCol="raw", outputCol="words")
        # The HashingTF call was truncated in the listing; num_features is the
        # otherwise unused setting defined above and equals the input layer size.
        hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts",
                              numFeatures=num_features)
        tfidf = IDF(inputCol=hashingTF.getOutputCol(),
                    outputCol="features", minDocFreq=min_doc_freq)
        indexer = StringIndexer(inputCol="string_label", outputCol="label")

        # The classifier call was truncated in the listing; the loop variable
        # supplies the layer sizes.
        # NOTE(review): other arguments (e.g. blockSize) may have been lost --
        # confirm against the upstream source.
        mlpc = MultilayerPerceptronClassifier(maxIter=100,
                                              layers=l)

        pipeline = Pipeline(stages=[remover, hashingTF, tfidf,
                                    indexer, mlpc])

        model = pipeline.fit(train_rdd)
        df_output = model.transform(train_rdd)
        test_output = model.transform(test_rdd).select("label", "prediction")
        # Mean of the per-row "prediction equals label" booleans.
        score = test_output.rdd.map(lambda row: row.label == row.prediction).mean()
        # Report the configuration just evaluated (l), not the whole list of
        # candidates, so each log line can be attributed to one network shape.
        nn_gridsearch.debug("Layers: %s, Accuracy: %s", l, score)

示例8: StopWordsRemover

# 需要导入模块: from pyspark.ml.feature import HashingTF [as 别名]
# 或者: from pyspark.ml.feature.HashingTF import getOutputCol [as 别名]

## NOTE(review): this listing was truncated -- several calls below had lost
## their trailing keyword arguments and closing parentheses.  Each call is
## closed with only the arguments that survived; restore the missing ones
## (output column names, model settings) from the upstream source.

## Remove ignored words
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"])

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol())

if algo == "gbm":
    ## Create GBM model
    ## NOTE(review): only ``ratio`` survived the truncation; the feature and
    ## label column settings still need to be supplied.
    algoStage = H2OGBM(ratio=0.8)
elif algo == "dl":
    ## Create H2ODeepLearning model
    ## NOTE(review): only ``epochs`` survived the truncation; the feature and
    ## label column settings still need to be supplied.
    algoStage = H2ODeepLearning(epochs=10)

示例9: RegexTokenizer

# 需要导入模块: from pyspark.ml.feature import HashingTF [as 别名]
# 或者: from pyspark.ml.feature.HashingTF import getOutputCol [as 别名]
from pyspark.ml.feature import RegexTokenizer

# Split the "text" column into the "words" column on the pattern below.
tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W+")

# COMMAND ----------

# MAGIC %md
# MAGIC Create a `HashingTF` transformer to hash words to buckets with counts, then use an `IDF` estimator to compute inverse-document frequency for buckets based on how frequently words have hashed to those buckets in the given documents.  Next, normalize the tf-idf values so that the \\( l^2 \\) norm is one for each row.

# COMMAND ----------

from pyspark.ml.feature import IDF, HashingTF, Normalizer

# Each stage consumes the output column of the previous one.
hashingTF = HashingTF(
    numFeatures=10000,
    inputCol=tokenizer.getOutputCol(),
    outputCol="hashingTF",
)

idf = IDF(
    minDocFreq=10,
    inputCol=hashingTF.getOutputCol(),
    outputCol="idf",
)

normalizer = Normalizer(
    inputCol=idf.getOutputCol(),
    outputCol="features",
)

# COMMAND ----------

# MAGIC %md
# MAGIC Now, let's build the `KMeans` estimator and a `Pipeline` that will contain all of the stages.  We'll then call fit on the `Pipeline` which will give us back a `PipelineModel`.  This will take about a minute to run.

# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans

# Five clusters over the normalised "features" column, with a fixed seed.
kmeans = KMeans(featuresCol="features", predictionCol="prediction", k=5, seed=0)

示例10: Normalizer

# 需要导入模块: from pyspark.ml.feature import HashingTF [as 别名]
# 或者: from pyspark.ml.feature.HashingTF import getOutputCol [as 别名]
# L2-normalise the bigram vectors, after applying the unigram normaliser.
normalizerBi = Normalizer(inputCol="bigrams", outputCol='normBigrams', p=2.0)
dfNorm = normalizerUni.transform(dfVect2)
dfNorm2 = normalizerBi.transform(dfNorm)
# print(...) with a single string argument behaves the same on Python 2 and 3.
print("DataFrame(bi-gram): normalisé")
# The difference does not show up in the displayed table, because there is
# only room to see the indices of the non-zero elements, not their values.
# Next step: TF-IDF.
# By picking the appropriate DataFrame among those above, these computations
# can be applied to any column (bigrams, with or without stop words...).
from pyspark.ml.feature import HashingTF
htf = HashingTF(inputCol='words', outputCol='wordsTF', numFeatures=10000)
dfTrainTF = htf.transform(dfTrainTokNoSw)
# Inverse document frequency
from pyspark.ml.feature import IDF
idf = IDF(inputCol=htf.getOutputCol(), outputCol="wordsTFIDF")
idfModel = idf.fit(dfTrainTF)
dfTrainTFIDF = idfModel.transform(dfTrainTF)

# Author's note: this step proved useful once; here it does not seem to be.
from pyspark.ml.feature import StringIndexer
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainTFIDF)
dfTrainFinal = string_indexer_model.transform(dfTrainTFIDF)

#-----------Training the model for prediction--------------------------
