This article collects typical usage examples of the Python method pyspark.ml.feature.HashingTF.getOutputCol. If you have been wondering how exactly to use HashingTF.getOutputCol, what it does, or what it looks like in practice, the curated examples below may help. You can also read more about the class the method belongs to, pyspark.ml.feature.HashingTF.
A total of 10 code examples of HashingTF.getOutputCol are shown below, sorted by popularity by default.
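Before the examples, here is a minimal sketch of the recurring pattern (column names here are illustrative, not taken from any example below): getOutputCol() returns the value of the transformer's outputCol parameter, which lets you wire pipeline stages together without repeating column-name strings.
from pyspark.ml.feature import HashingTF, IDF

hashingTF = HashingTF(inputCol="words", outputCol="raw_features", numFeatures=1 << 18)
print(hashingTF.getOutputCol())  # -> "raw_features"

# Feed the hashed term frequencies into IDF without hard-coding the column name again.
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")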
Example 1: train_lg
# Required import: from pyspark.ml.feature import HashingTF [as alias]
# Or: from pyspark.ml.feature.HashingTF import getOutputCol [as alias]
def train_lg(training_data, collection):
    # Configure an ML pipeline consisting of the following stages: hashingTF, idf, and lr.
    hashingTF = HashingTF(inputCol="filtered", outputCol="TF_features")
    idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
    pipeline1 = Pipeline(stages=[hashingTF, idf])
    # Fit pipeline1 to the training documents.
    model1 = pipeline1.fit(training_data)
    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
    pipeline2 = Pipeline(stages=[model1, lr])
    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [10, 100, 1000, 10000]) \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .build()
    crossval = CrossValidator(estimator=pipeline2,
                              estimatorParamMaps=paramGrid,
                              evaluator=BinaryClassificationEvaluator(),
                              numFolds=5)
    # Run cross-validation and choose the best set of parameters.
    cvModel = crossval.fit(training_data)
    # model_path = os.path.join(models_dir, time.strftime("%Y%m%d-%H%M%S") + '_'
    #                           + collection["Id"] + '_'
    #                           + collection["name"])
    # cvModel.save(sc, model_path)
    return cvModel
Example 2: fit_kmeans
# Required import: from pyspark.ml.feature import HashingTF [as alias]
# Or: from pyspark.ml.feature.HashingTF import getOutputCol [as alias]
def fit_kmeans(spark, products_df):
    step = 0
    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")
    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol=str(step) + "_stopwords")
    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(), outputCol=str(step) + "_tf", numFeatures=16)
    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")
    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol=str(step) + "_normalizer")
    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(), predictionCol=str(step) + "_kmeans", k=2, seed=20)
    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])
    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    model.save("./kmeans")  # the whole fitted pipeline is saved to a folder
    return model, words_prediction
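The model saved by fit_kmeans can later be reloaded and reused. A minimal sketch of that step, assuming the "./kmeans" path from the example and a DataFrame with the same schema as products_df:
from pyspark.ml import PipelineModel

reloaded = PipelineModel.load("./kmeans")
clusters = reloaded.transform(products_df)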
Example 3: create_pipeline
# Required import: from pyspark.ml.feature import HashingTF [as alias]
# Or: from pyspark.ml.feature.HashingTF import getOutputCol [as alias]
def create_pipeline(model_type, num_features=10000):
    """
    Defines a pipeline from BOW to prediction.
    """
    remover = StopWordsRemover(inputCol="bow", outputCol="words")
    hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts", numFeatures=num_features)
    tfidf = IDF(inputCol=hashingTF.getOutputCol(),
                outputCol="features")
    if model_type == 'log_reg':
        model = LogisticRegression()
    elif model_type == 'gbt':
        model = GBTClassifier()
    elif model_type == 'naive_bayes':
        model = NaiveBayes()
    elif model_type == 'rf':
        model = RandomForestClassifier()
    else:
        raise ValueError("Unknown model_type: %s" % model_type)
    return Pipeline(stages=[remover, hashingTF, tfidf,
                            model])
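A hedged usage sketch for create_pipeline: assuming DataFrames train_df and test_df (names not shown in the original) with a "bow" array column and a "label" column, the returned Pipeline is fitted and applied like any other estimator.
pipeline = create_pipeline('log_reg', num_features=5000)
model = pipeline.fit(train_df)
predictions = model.transform(test_df).select("label", "prediction")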
Example 4: BaselinePipelineEngine
# Required import: from pyspark.ml.feature import HashingTF [as alias]
# Or: from pyspark.ml.feature.HashingTF import getOutputCol [as alias]
class BaselinePipelineEngine(PipelineEngine):
    @keyword_only
    def __init__(self, cv):
        super(BaselinePipelineEngine, self).__init__(cv)
        self.hashing_tf_map = [pow(2, 20)]
        self.lr_map = [0.1, 0.01]
        self.stages = self._build_stages()
        self.pipeline = Pipeline(stages=[self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr])
        self.param_grid = self._build_param_grid()

    def _build_stages(self):
        self.bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
        self.tokenizer = Tokenizer(inputCol=self.bs_parser.getOutputCol(), outputCol="words")
        self.hashing_tf = HashingTF(inputCol=self.tokenizer.getOutputCol(), outputCol="raw_features")
        self.idf_model = IDF(inputCol=self.hashing_tf.getOutputCol(), outputCol="features")
        self.lr = LogisticRegression(maxIter=10, regParam=0.01)
        return [self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr]

    def _build_param_grid(self):
        param_grid_builder = ParamGridBuilder()
        param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
        param_grid_builder.addGrid(self.lr.regParam, self.lr_map)
        return param_grid_builder.build()
Example 5: SQLContext
# Required import: from pyspark.ml.feature import HashingTF [as alias]
# Or: from pyspark.ml.feature.HashingTF import getOutputCol [as alias]
rdd = labeledRdd.map(lambda doc: (cleanLower(doc[0]), doc[1]))
print("Text is cleaned")

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ["review", "label"])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])
print("Random split is done")

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review", outputCol="wordsNoSw",
    stopwords=set(nltk.corpus.stopwords.words("english"))
)
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol="reviews_tf")
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol="label", outputCol="target_indexed")
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)
pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])

# *****************************************************************
# ********************* CROSS VALIDATION: 80%/20% *****************
# ******************* Model: DecisionTreeClassifier ***************
# *****************************************************************
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction", labelCol="target_indexed", metricName="precision"
)
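The banner comments announce a cross-validation step that the excerpt does not show. A minimal sketch of how the pipeline and evaluator defined above would typically be combined (the grid values and fold count are illustrative assumptions, not from the original):
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

grid = ParamGridBuilder() \
    .addGrid(hashing_tf.numFeatures, [1000, 10000]) \
    .build()
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid,
                    evaluator=evaluator, numFolds=5)
cvModel = cv.fit(dfTrain)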
Example 6: SQLContext
# Required import: from pyspark.ml.feature import HashingTF [as alias]
# Or: from pyspark.ml.feature.HashingTF import getOutputCol [as alias]
sqc = SQLContext(sc)
sm = SparkModel(sc, conn, rdd_path='meta_rdd.pkl')
logging.basicConfig(format='%(asctime)s %(message)s')
grid_search = logging.getLogger('main')
grid_search.setLevel(logging.DEBUG)
handler = logging.FileHandler('../logs/grid_search.txt')
grid_search.addHandler(handler)

# (key, (bow, meta)) -> (key, bow)
bow_rdd = sm.RDD.map(lambda kv: (kv[0], kv[1][0]))
# join with the labels: (key, (bow, label)) -> (label, bow)
bow_rdd = sm.RDD.join(sm.target).map(lambda kv: (kv[1][1], kv[1][0]))

remover = StopWordsRemover(inputCol="raw", outputCol="words")
hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts",
                      numFeatures=10000)
tfidf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features",
            minDocFreq=20)
indexer = StringIndexer(inputCol="string_label", outputCol="label")

for model in [GBTClassifier(), RandomForestClassifier(), MultilayerPerceptronClassifier()]:
    if type(model) == MultilayerPerceptronClassifier:
        layers = [10000, 100, 2]
        model = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128)
    pipeline = Pipeline(stages=[remover, hashingTF, tfidf,  # scaler,
                                indexer, model])
    scores = cross_val_score(pipeline, bow_rdd)
    grid_search.debug('Model: %s\nscores: %s\nAverage: %s'
                      % (type(model), scores, scores.mean()))
Example 7: StopWordsRemover
# Required import: from pyspark.ml.feature import HashingTF [as alias]
# Or: from pyspark.ml.feature.HashingTF import getOutputCol [as alias]
# (key, (bow, label)) -> (label, bow), then take a 50% sample
bow_rdd = sm.RDD.join(sm.target).map(lambda kv: (kv[1][1], kv[1][0])) \
    .sample(withReplacement=False, fraction=.5, seed=1)
df = sqc.createDataFrame(bow_rdd, ['string_label', 'raw'])
train_rdd, test_rdd = df.randomSplit([.8, .2], seed=1)
results = []
num_features = 5000
min_doc_freq = 20
layers = [[5000, 2056, 512, 128, 2], [5000, 1000, 128, 2], [5000, 100, 2], [5000, 5000, 2]]
for l in layers:
    remover = StopWordsRemover(inputCol="raw", outputCol="words")
    hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts",
                          numFeatures=num_features)
    tfidf = IDF(inputCol=hashingTF.getOutputCol(),
                outputCol="features", minDocFreq=min_doc_freq)
    indexer = StringIndexer(inputCol="string_label", outputCol="label")
    mlpc = MultilayerPerceptronClassifier(maxIter=100,
                                          layers=l,
                                          blockSize=128)
    pipeline = Pipeline(stages=[remover, hashingTF, tfidf,
                                indexer, mlpc])
    model = pipeline.fit(train_rdd)
    df_output = model.transform(train_rdd)
    test_output = model.transform(test_rdd).select("label", "prediction")
    score = test_output.rdd.map(lambda row: row.label == row.prediction).mean()
    # log the layer configuration tried in this iteration
    nn_gridsearch.debug("Layers: %s, Accuracy: %s" % (l, score))
Example 8: StopWordsRemover
# Required import: from pyspark.ml.feature import HashingTF [as alias]
# Or: from pyspark.ml.feature.HashingTF import getOutputCol [as alias]
                         gaps=False,
                         pattern="[a-zA-Z]+")
## Remove ignored words
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    outputCol="filtered",
                                    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
                                    caseSensitive=False)
## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)
## Create the inverse document frequency model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)

if algo == "gbm":
    ## Create GBM model
    algoStage = H2OGBM(ratio=0.8,
                       seed=1,
                       featuresCols=[idf.getOutputCol()],
                       predictionCol="label")
elif algo == "dl":
    ## Create H2ODeepLearning model
    algoStage = H2ODeepLearning(epochs=10,
                                seed=1,
                                l1=0.001,
Example 9: RegexTokenizer
# Required import: from pyspark.ml.feature import HashingTF [as alias]
# Or: from pyspark.ml.feature.HashingTF import getOutputCol [as alias]
from pyspark.ml.feature import RegexTokenizer
tokenizer = RegexTokenizer().setInputCol("text").setOutputCol("words").setPattern("\\W+")
# COMMAND ----------
# MAGIC %md
# MAGIC Create a `HashingTF` transformer to hash words to buckets with counts, then use an `IDF` estimator to compute inverse-document frequency for buckets based on how frequently words have hashed to those buckets in the given documents. Next, normalize the tf-idf values so that the \\( l^2 \\) norm is one for each row.
# COMMAND ----------
from pyspark.ml.feature import IDF, HashingTF, Normalizer
hashingTF = HashingTF().setNumFeatures(10000).setInputCol(tokenizer.getOutputCol()).setOutputCol("hashingTF")
idf = IDF().setMinDocFreq(10).setInputCol(hashingTF.getOutputCol()).setOutputCol("idf")
normalizer = Normalizer().setInputCol(idf.getOutputCol()).setOutputCol("features")
# COMMAND ----------
# MAGIC %md
# MAGIC Now, let's build the `KMeans` estimator and a `Pipeline` that will contain all of the stages. We'll then call fit on the `Pipeline` which will give us back a `PipelineModel`. This will take about a minute to run.
# COMMAND ----------
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
kmeans = KMeans().setFeaturesCol("features").setPredictionCol("prediction").setK(5).setSeed(0)
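The notebook cell above says to assemble all of the stages into a Pipeline and call fit, but the excerpt stops at the KMeans definition. A hedged sketch of that step, where corpusDF stands in for the notebook's input DataFrame (its real name is not shown in the excerpt):
pipeline = Pipeline().setStages([tokenizer, hashingTF, idf, normalizer, kmeans])
model = pipeline.fit(corpusDF)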
Example 10: Normalizer
# Required import: from pyspark.ml.feature import HashingTF [as alias]
# Or: from pyspark.ml.feature.HashingTF import getOutputCol [as alias]
normalizerBi = Normalizer(inputCol="bigrams", outputCol='normBigrams', p=2.0)
dfNorm = normalizerUni.transform(dfVect2)
dfNorm2 = normalizerBi.transform(dfNorm)
print("DataFrame (bi-grams): normalized")
dfNorm2.select('words', 'normWords').show()
# The difference does not show up in the table, because there is only room to display the indices
# of the non-zero elements, not their values.

# Moving on to TF-IDF.
# Of course, by picking the right DataFrame among those above, these computations can be applied
# to any column (bigrams, with or without stop words, ...).
from pyspark.ml.feature import HashingTF
htf = HashingTF(inputCol='words', outputCol='wordsTF', numFeatures=10000)
dfTrainTF = htf.transform(dfTrainTokNoSw)

# Inverse document frequency
from pyspark.ml.feature import IDF
idf = IDF(inputCol=htf.getOutputCol(), outputCol="wordsTFIDF")
idfModel = idf.fit(dfTrainTF)
dfTrainTFIDF = idfModel.transform(dfTrainTF)
dfTrainTFIDF.select('review', 'wordsTF', 'wordsTFIDF').show()

# I know this step was useful to me once; here it does not seem to matter much.
from pyspark.ml.feature import StringIndexer
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainTFIDF)
dfTrainFinal = string_indexer_model.transform(dfTrainTFIDF)
dfTrainFinal.select('review', 'label', 'target_indexed').show()

# **********************************************************************
# ----------- Training the model for prediction ------------------------