This article compiles typical usage examples of the Python method pyspark.ml.feature.Tokenizer.getOutputCol. If you are wondering what exactly Tokenizer.getOutputCol does and how to use it, the hand-picked code examples below should help. You can also read more about the class this method belongs to, pyspark.ml.feature.Tokenizer.
The following shows 13 code examples of Tokenizer.getOutputCol, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
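Before the examples, here is a minimal sketch of the pattern they all share: getOutputCol() returns the name of the column a stage writes to, so the next stage can be wired to it without hard-coding column names. The column names and the pipeline below are illustrative assumptions, not taken from any of the examples.

from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF

# Tokenizer writes its result to the column named by outputCol ...
tokenizer = Tokenizer(inputCol="text", outputCol="words")
# ... and the downstream stage reads from whatever column that is.
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
print(tokenizer.getOutputCol())  # -> "words"
pipeline = Pipeline(stages=[tokenizer, hashingTF])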
Example 1: fit_kmeans
# Module to import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import getOutputCol [as alias]
def fit_kmeans(spark, products_df):
    step = 0

    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")

    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol=str(step) + "_stopwords")

    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(), outputCol=str(step) + "_tf", numFeatures=16)

    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")

    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol=str(step) + "_normalizer")

    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(), predictionCol=str(step) + "_kmeans", k=2, seed=20)

    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])
    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    model.save("./kmeans")  # the whole machine learning instance is saved in a folder
    return model, words_prediction
Example 2: main
# Module to import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import getOutputCol [as alias]
def main():
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # no parameter tuning
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(stages=[tokenizer, hashingTF_notuning, lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)
    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)

    # for cross validation
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)
    paramGrid = ParamGridBuilder()\
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])\
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\
        .build()
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=2)
    cvModel = cv.fit(trainDF)

    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)

    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output)
Example 3: getPipeline
# Module to import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import getOutputCol [as alias]
def getPipeline(self, df):
    # notify pipeline
    self.success('Initializing ML Pipeline ...')
    # initialize our tokenizer; we're going to tokenize the features
    tokenizer = Tokenizer(inputCol='tag_features', outputCol='words')
    # convert the tokenized data to vectorized data
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol='features')
    # initialize the logistic regression algorithm
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    # create / initialize the ml pipeline
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    # fit the pipeline on our training dataframe
    model = pipeline.fit(df)
    return model
Example 4: main
# Module to import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import getOutputCol [as alias]
def main():
    '''
    Takes one input argument: the location of the directory with the training and test data files.
    :return: prints the area under the ROC curve to the console.
    '''
    conf = SparkConf().setAppName("MLPipeline")
    sc = SparkContext(conf=conf)

    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet("20news_train.parquet")

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to training data.
    model = pipeline.fit(trainDF)

    numFeatures = (1000, 5000, 10000)
    regParam = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, numFeatures).addGrid(lr.regParam, regParam).build()
    cv = CrossValidator().setEstimator(pipeline).setEvaluator(BinaryClassificationEvaluator()).setEstimatorParamMaps(paramGrid).setNumFolds(2)

    # Evaluate the model on testing data
    testDF = sqlCt.read.parquet("20news_test.parquet")
    prediction = model.transform(testDF)
    evaluator = BinaryClassificationEvaluator()

    model_cv = cv.fit(trainDF)
    prediction_cv = model_cv.transform(testDF)
    print(evaluator.evaluate(prediction))
    print(evaluator.evaluate(prediction_cv))
Author: PranavGoel; Project: Python-Spark---Matrix-Multiplication---ML-pipeline; Lines of code: 41; Source file: ml_pipeline.py
Example 5: BaselinePipelineEngine
# Module to import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import getOutputCol [as alias]
class BaselinePipelineEngine(PipelineEngine):
    @keyword_only
    def __init__(self, cv):
        super(BaselinePipelineEngine, self).__init__(cv)
        self.hashing_tf_map = [pow(2, 20)]
        self.lr_map = [0.1, 0.01]
        self.stages = self._build_stages()
        self.pipeline = Pipeline(stages=[self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr])
        self.param_grid = self._build_param_grid()

    def _build_stages(self):
        self.bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
        self.tokenizer = Tokenizer(inputCol=self.bs_parser.getOutputCol(), outputCol="words")
        self.hashing_tf = HashingTF(inputCol=self.tokenizer.getOutputCol(), outputCol="raw_features")
        self.idf_model = IDF(inputCol=self.hashing_tf.getOutputCol(), outputCol="features")
        self.lr = LogisticRegression(maxIter=10, regParam=0.01)
        return [self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr]

    def _build_param_grid(self):
        param_grid_builder = ParamGridBuilder()
        param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
        param_grid_builder.addGrid(self.lr.regParam, self.lr_map)
        return param_grid_builder.build()
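The class above only builds the pipeline and parameter grid; how it is driven depends on the surrounding PipelineEngine code, which is not shown. The sketch below is one plausible way such an engine could be used with Spark's CrossValidator; the constructor argument, the evaluator choice, and the train_df DataFrame are all assumptions.

from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

engine = BaselinePipelineEngine(cv=None)  # the constructor argument is an assumption
cross_validator = CrossValidator(
    estimator=engine.pipeline,
    estimatorParamMaps=engine.param_grid,
    evaluator=BinaryClassificationEvaluator(),
    numFolds=3,
)
cv_model = cross_validator.fit(train_df)  # train_df is assumed to have "review" and "label" columns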
Example 6: float
# Module to import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import getOutputCol [as alias]
# (This excerpt starts in the middle of a loop over the category directories.)
distinct_labels[curr_cat] = category_dir
next_docs = sc.wholeTextFiles(('/').join([input_dir, category_dir]))
docs = docs.union(next_docs.map(lambda doc_lines: (format_text(doc_lines[1]), float(curr_cat))))
curr_cat += 1

training_rows = docs.sample(False, train_fraction)
testing_rows = docs.subtract(training_rows)

# Prepare training and test documents, which are labeled.
LabeledDocument = Row("text", "label")
train = training_rows.map(lambda x: LabeledDocument(*x)).toDF()
test = testing_rows.map(lambda x: LabeledDocument(*x)).toDF()

# Configure an ML pipeline: tokenizer, hashingTF, idf, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="rawFeatures")  # outputCol="features")
idf = IDF(inputCol="rawFeatures", outputCol="features")
lr = LogisticRegression(maxIter=1000, regParam=0.001)
# pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
p0 = Pipeline(stages=[tokenizer, hashingTF, idf, lr])
# m0 = p0.fit(train)
# pipeline = Pipeline(stages=[m0, lr])
pipeline = p0

# Fit the pipeline to training documents.
model = pipeline.fit(train)
print('\n\n --------------- RESULT ----------------------\n\n')
print(model.transform(test).head())
print('\n\n ---------------------------------------------\n\n')
Example 7: cleanLower
# Module to import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import getOutputCol [as alias]
def cleanLower(doc):
    return doc.replace("<br /><br />", " ").lower()

rdd = labeledRdd.map(lambda doc: (cleanLower(doc[0]), doc[1]))
print("Text is cleaned")

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])
print("Random split is done")

tokenizer = Tokenizer(inputCol='review', outputCol='reviews_words')
hashing_tf = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)

pipeline = Pipeline(stages=[tokenizer,
                            hashing_tf,
                            idf,
                            string_indexer,
                            dt])

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')

# grid = (ParamGridBuilder()
#         .baseOn([evaluator.metricName, 'precision'])
#         .addGrid(dt.maxDepth, [10, 20])
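The snippet breaks off in the middle of the commented-out parameter grid. A hedged sketch of how that grid would typically be completed and fed to a CrossValidator follows; the closing of the builder, the fold count, and the fit call are assumptions, not the original author's code.

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# finish the grid the comment above started
grid = (ParamGridBuilder()
        .baseOn([evaluator.metricName, 'precision'])
        .addGrid(dt.maxDepth, [10, 20])
        .build())
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid, evaluator=evaluator, numFolds=3)
cv_model = cv.fit(dfTrain)
print(evaluator.evaluate(cv_model.transform(dfTest)))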
Example 8: main
# Module to import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import getOutputCol [as alias]
def main(args):
    textFiles = sc.wholeTextFiles(maindir + '4').map(readContents)
    # print "READ second {} check ".format(textFiles.take(10))
    '''
    Filter the rows based on all the indexes available in the
    training file, else drop them.
    http://stackoverflow.com/questions/24718697/pyspark-drop-rows
    '''
    htmldf = sqlContext.createDataFrame(textFiles)
    htmldf.cache()
    traindf = getCleanedRDD(maindir + 'train_v2.csv', ["id", "images", "links", "text", "label"], htmldf)
    traindf.write.save(maindir + "output/train_4.parquet", format="parquet")

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20, regParam=0.01)
    rf = GBTClassifier(maxIter=30, maxDepth=4, labelCol="label")
    rf = RandomForestClassifier(labelCol="features", numTrees=3, maxDepth=4)
    # https://databricks.com/blog/2015/07/29/new-features-in-machine-learning-pipelines-in-spark-1-4.html
    # http://spark.apache.org/docs/latest/api/python/pyspark.ml.html
    # w2v = Word2Vec(inputCol="text", outputCol="w2v")
    rfc = RandomForestClassifier(labelCol="label", numTrees=3, maxDepth=4)
    # (rf and rfc are defined above but never added to the pipeline; only lr is used)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to training documents.
    model = pipeline.fit(traindf)
    print('-----------------------------------------------------------------------------')

    testdf = getCleanedRDD(maindir + 'test.csv', ["id", "images", "links", "text", "label"], htmldf)
    # print testdf.count()

    # Make predictions on test documents and print columns of interest.
    prediction = model.transform(testdf)
    # print('prediction', prediction)
    '''
    pand = prediction.toPandas()
    pand.to_csv('testpanda.csv', sep='\t', encoding='utf-8')
    print "Done!!! CSV"
    '''
    # prediction.select('id', 'probability', 'prediction').write.format('com.databricks.spark.csv').option("header", "true").save(maindir + 'output/result_lr0.csv')
    # ('prediction', DataFrame[id: string, images: bigint, links: bigint, text: string, label: double,
    #  words: array<string>, features: vector, rawPrediction: vector, probability: vector, prediction: double])
    '''
    # write in scala
    selected = prediction.select("id", "probability", "prediction")
    for row in selected.collect():
        print row
    '''
    sc.stop()
Example 9: Tokenizer
# Module to import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import getOutputCol [as alias]
spark = SparkSession\
    .builder\
    .appName("SimpleTextClassificationPipeline")\
    .getOrCreate()

# Prepare training documents, which are labeled.
training = spark.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0)
], ["id", "text", "label"])

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(numFeatures=1000, inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the pipeline to training documents.
model = pipeline.fit(training)

# Prepare test documents, which are unlabeled.
test = spark.createDataFrame([
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "spark hadoop spark"),
    (7, "apache hadoop")
], ["id", "text"])

# Make predictions on test documents and print columns of interest.
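The excerpt stops at the final comment. In the SimpleTextClassificationPipeline example shipped with Spark, which this snippet closely follows, the prediction step typically looks like the sketch below; the exact column selection is an assumption.

prediction = model.transform(test)
selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
    print(row)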
Example 10: Row
# Module to import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import getOutputCol [as alias]
# This example uses the legacy Spark 1.x API (sqlCtx.inferSchema) and Python 2 long literals (0L).
# Prepare training documents, which are labeled.
LabeledDocument = Row('id', 'text', 'label')
training = sqlCtx.inferSchema(
    sc.parallelize([(0L, "a b c d e spark", 1.0),
                    (1L, "b d", 0.0),
                    (2L, "spark f g h", 1.0),
                    (3L, "hadoop mapreduce", 0.0)])
      .map(lambda x: LabeledDocument(*x)))

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer() \
    .setInputCol("text") \
    .setOutputCol("words")
hashingTF = HashingTF() \
    .setInputCol(tokenizer.getOutputCol()) \
    .setOutputCol("features")
lr = LogisticRegression() \
    .setMaxIter(10) \
    .setRegParam(0.01)
pipeline = Pipeline() \
    .setStages([tokenizer, hashingTF, lr])

# Fit the pipeline to training documents.
model = pipeline.fit(training)

# Prepare test documents, which are unlabeled.
Document = Row('id', 'text')
test = sqlCtx.inferSchema(
    sc.parallelize([(4L, "spark i j k"),
                    (5L, "l m n"),
Example 11: Tokenizer
# Module to import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import getOutputCol [as alias]
# Split training and testing
(trainingData, testData) = smsDf.randomSplit([0.9, 0.1])
print(trainingData.count())
print(testData.count())
testData.collect()

# Setup pipeline
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.feature import IDF

tokenizer = Tokenizer(inputCol="message", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                      outputCol="tempfeatures")
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
nbClassifier = NaiveBayes()

pipeline = Pipeline(stages=[tokenizer, hashingTF,
                            idf, nbClassifier])

nbModel = pipeline.fit(trainingData)
prediction = nbModel.transform(testData)

# prediction.where(prediction.prediction == 1.0).show()
prediction.groupBy("label", "prediction").count().show()
Example 12: filter
# Module to import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import getOutputCol [as alias]
positiveTrainTmp = posTrainTmp1.select(posTrainTmp1.Id, posTrainTmp1.Flag)

positiveTest = positive.join(positiveTrainTmp, positive.Id == positiveTrainTmp.Id, "LeftOuter").\
    filter("Flag is null").\
    select(positive.Id, positive.Text, positive.Label)

testing = negativeTest.unionAll(positiveTest)

# CREATE MODEL
numFeatures = 20000
numEpochs = 20
regParam = 0.02

tokenizer = Tokenizer().setInputCol("Text").setOutputCol("Words")
hashingTF = HashingTF().setNumFeatures(numFeatures).\
    setInputCol(tokenizer.getOutputCol()).setOutputCol("Features")
lr = LogisticRegression().setMaxIter(numEpochs).setRegParam(regParam).\
    setFeaturesCol("Features").setLabelCol("Label").\
    setRawPredictionCol("Score").setPredictionCol("Prediction")
pipeline = Pipeline().setStages([tokenizer, hashingTF, lr])

# this command takes a while
model = pipeline.fit(training)

testTitle = "Easiest way to merge a release into one JAR file"
testBody = """Is there a tool or script which easily merges a bunch of
<a href="http://en.wikipedia.org/wiki/JAR_%28file_format%29"
>JAR</a> files into one JAR file? A bonus would be to easily set the main-file manifest
and make it executable. I would like to run it with something like:
</p>

<blockquote>
  <p>java -jar
rst.jar</p>
</blockquote>

<p>
Example 13: Tokenizer
# Module to import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import getOutputCol [as alias]
# ...maybe pipelines are doable here. To be seen.
df_test_words = tokenizer.transform(dfTest)
df_test_tf = htf.transform(df_test_words)
df_test_tfidf = idfModel.transform(df_test_tf)
df_test_final = string_indexer_model.transform(df_test_tfidf)

# The predictions
df_test_pred = dt_model.transform(df_test_final)
df_test_pred.select('review', 'target_indexed', 'prediction', 'probability').show(5)

# Build a very basic pipeline
from pyspark.ml import Pipeline

# Instantiate all the necessary Estimators and Transformers
tokenizer = Tokenizer(inputCol='review', outputCol='reviews_words')
hashing_tf = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol='reviews_tf', numFeatures=10000)
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)

# Instantiate a Pipeline
pipeline = Pipeline(stages=[tokenizer, hashing_tf, idf, string_indexer, dt])
pipeline_model = pipeline.fit(dfTrain)
df_test_pred = pipeline_model.transform(dfTest)
df_test_pred.select('review', 'target_indexed', 'prediction', 'probability').show()

# An automatic tool to compute the rate of correct classification.
# Again, not that useful in practice.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
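The excerpt ends right after the evaluator import. A hedged sketch of how the evaluation would typically be finished follows; the column names match the earlier part of this example, but the metric choice and the exact call are assumptions.

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='target_indexed',
                                              metricName='accuracy')  # older Spark versions used metricName='precision'
accuracy = evaluator.evaluate(df_test_pred)
print(accuracy)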