本文整理汇总了Python中pyspark.mllib.classification.LogisticRegressionWithLBFGS.train方法的典型用法代码示例。如果您正苦于以下问题:Python LogisticRegressionWithLBFGS.train方法的具体用法?Python LogisticRegressionWithLBFGS.train怎么用?Python LogisticRegressionWithLBFGS.train使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.mllib.classification.LogisticRegressionWithLBFGS
的用法示例。
在下文中一共展示了LogisticRegressionWithLBFGS.train方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: cross_validation_lr
# 需要导入模块: from pyspark.mllib.classification import LogisticRegressionWithLBFGS [as 别名]
# 或者: from pyspark.mllib.classification.LogisticRegressionWithLBFGS import train [as 别名]
def cross_validation_lr(Data_1, Data_2, Data_3, regType, num_iter):
    """3-fold cross-validation of a 5-class logistic-regression classifier.

    Each fold trains LogisticRegressionWithLBFGS on the union of two folds
    and scores the held-out fold.  The per-fold error is a shifted squared
    error, ((label + 0.5) - prediction)^2 averaged over the held-out fold
    -- the +0.5 offset is preserved from the original scoring scheme
    (presumably a rating-bucket midpoint correction; TODO confirm intent).

    :param Data_1: RDD of LabeledPoint, fold 1
    :param Data_2: RDD of LabeledPoint, fold 2
    :param Data_3: RDD of LabeledPoint, fold 3
    :param regType: MLlib regularization type ('l1', 'l2' or None)
    :param num_iter: number of LBFGS iterations
    :return: mean shifted-MSE across the three folds
    """
    def _fold_mse(train_rdd, test_rdd):
        # Train on two folds, evaluate on the held-out fold.
        model = LogisticRegressionWithLBFGS.train(
            train_rdd, regType=regType, iterations=num_iter, numClasses=5)
        predictions = model.predict(test_rdd.map(lambda x: x.features))
        labels_and_preds = test_rdd.map(lambda lp: lp.label).zip(predictions)
        # Indexed lambda instead of the Python-2-only tuple-unpacking form
        # (removed by PEP 3113), so the code also runs on Python 3.
        return labels_and_preds.map(
            lambda vp: (vp[0] + 0.5 - vp[1]) ** 2).sum() / float(test_rdd.count())

    # The three copy-pasted folds of the original collapse into one helper.
    testMSE_1 = _fold_mse(Data_1.union(Data_2), Data_3)
    testMSE_2 = _fold_mse(Data_2.union(Data_3), Data_1)
    testMSE_3 = _fold_mse(Data_3.union(Data_1), Data_2)
    return (testMSE_1 + testMSE_2 + testMSE_3) / 3
示例2: test_train
# 需要导入模块: from pyspark.mllib.classification import LogisticRegressionWithLBFGS [as 别名]
# 或者: from pyspark.mllib.classification.LogisticRegressionWithLBFGS import train [as 别名]
def test_train(self, df, target, train_split, test_split, regularization=None, num_of_iterations=100):
    """Train a logistic-regression model on a random split of *df* and log test accuracy.

    The pandas-style DataFrame is converted to a Spark DataFrame, split into
    train/test with a fixed seed, and a multiclass LBFGS model is fit on the
    training portion; mean test accuracy is logged, not returned.

    :param df: DataFrame holding feature columns plus the *target* column
    :param target: name of the label column
    :param train_split: relative weight of the training partition
    :param test_split: relative weight of the test partition
    :param regularization: MLlib regType ('l1', 'l2' or None)
    :param num_of_iterations: LBFGS iteration budget
    """
    # The original wrapped everything in `try: ... except Exception as e:
    # raise e`, which handles nothing and only obscures the traceback; the
    # no-op handler is removed.
    LOGGER.info("Generation logistic regression")
    spark_df = self.sql_context.createDataFrame(df)
    feature_columns = spark_df.columns
    feature_columns.remove(target)
    # Fixed seed keeps the split reproducible across runs.
    train, test = spark_df.randomSplit([train_split, test_split], seed=1000000)
    # Rebuild (label, features) pairs as LabeledPoints for MLlib.
    X_train = train.select(*feature_columns).map(lambda x: list(x))
    y_train = train.select(target).map(lambda x: x[0])
    train_data = y_train.zip(X_train).map(lambda x: LabeledPoint(x[0], x[1]))
    numOfClasses = len(df[target].unique())
    logistic_model = LogisticRegressionWithLBFGS.train(train_data,
                                                       numClasses=numOfClasses, regParam=0,
                                                       regType=regularization, intercept=True,
                                                       iterations=num_of_iterations,
                                                       validateData=False)
    X_test = test.select(*feature_columns).map(lambda x: list(x))
    y_test = test.select(target).map(lambda x: x[0])
    prediction = X_test.map(lambda lp: float(logistic_model.predict(lp)))
    prediction_and_label = prediction.zip(y_test)
    # Mean of the boolean (prediction == label) RDD == accuracy.
    LOGGER.info(prediction_and_label.map(lambda labelAndPred: labelAndPred[0] == labelAndPred[1]).mean())
示例3: lrTest
# 需要导入模块: from pyspark.mllib.classification import LogisticRegressionWithLBFGS [as 别名]
# 或者: from pyspark.mllib.classification.LogisticRegressionWithLBFGS import train [as 别名]
def lrTest(sqlContext, dataset_rdd, positive_negotive_rate):
    """Train a binary logistic-regression model and report evaluation metrics.

    Rows are tuples where e[1] is the label (> 0.5 counts as positive) and
    e[2:] are the features.  An 80/20 train/test split is sampled
    independently for each class.

    :param sqlContext: unused, kept for caller compatibility
    :param dataset_rdd: RDD of row tuples as described above
    :param positive_negotive_rate: unused, kept for caller compatibility
    :return: (train size, test size, train positives, test positives,
              predicted positives, true positives, precision, recall,
              accuracy, F1, model)
    """
    dataset_positive = dataset_rdd.filter(lambda e: e[1] > 0.5)
    dataset_negotive = dataset_rdd.filter(lambda e: e[1] < 0.5)
    # Per-class 80/20 split (sampling without replacement).
    train_positive = dataset_positive.sample(False, 0.8)
    test_positive = dataset_positive.subtract(train_positive)
    train_negotive = dataset_negotive.sample(False, 0.8)
    test_negotive = dataset_negotive.subtract(train_negotive)
    trainset = train_positive.union(train_negotive).map(lambda e: LabeledPoint(e[1], e[2:]))
    trainset_nums = trainset.count()
    testset = test_positive.union(test_negotive).map(lambda e: LabeledPoint(e[1], e[2:]))
    testset_nums = testset.count()
    trainset_positive = train_positive.count()
    testset_positive = test_positive.count()
    model = LogisticRegressionWithLBFGS.train(trainset, iterations=100)
    predict = testset.map(lambda p: (p.label, model.predict(p.features)))
    hitALL = predict.filter(lambda e: e[0] == e[1]).count()
    hitPositive = predict.filter(lambda e: e[0] == e[1] and (e[0] > 0.5)).count()
    positive = predict.filter(lambda e: e[1] > 0.5).count()
    # Guard empty denominators: the original raised ZeroDivisionError when
    # the model predicted no positives or a split contained none.
    recallPositive = hitPositive / float(testset_positive) if testset_positive else 0.0
    precision = hitPositive / float(positive) if positive else 0.0
    accuracy = hitALL / float(testset_nums) if testset_nums else 0.0
    # F1 is the harmonic mean of precision and recall; defined as 0 when
    # either term is 0 (the original divided by zero in that case).
    if precision and recallPositive:
        F_Value = 2 / (1 / precision + 1 / recallPositive)
    else:
        F_Value = 0.0
    return (trainset_nums, testset_nums, trainset_positive, testset_positive, positive,
            hitPositive, precision, recallPositive, accuracy, F_Value, model)
示例4: seg_model_lr
# 需要导入模块: from pyspark.mllib.classification import LogisticRegressionWithLBFGS [as 别名]
# 或者: from pyspark.mllib.classification.LogisticRegressionWithLBFGS import train [as 别名]
def seg_model_lr(train_data, test_data, regType, num_iter):
    """Train a 5-class LR model on *train_data* and predict star ratings for *test_data*.

    Feature vectors are assembled from every column except the identifier /
    label columns; NaNs are zero-filled before assembly.

    :param train_data: DataFrame with a 'stars' label column plus feature columns
    :param test_data: DataFrame with a 'review_id' column plus feature columns
    :param regType: MLlib regularization type ('l1', 'l2' or None)
    :param num_iter: LBFGS iteration budget
    :return: RDD of (review_id, predicted_rating) pairs
    """
    # Identifier / label columns that must not be used as features.
    removelist_train = set(['stars', 'business_id', 'bus_id', 'b_id', 'review_id', 'user_id'])
    newlist_train = [v for v in train_data.columns if v not in removelist_train]
    # Put data in vector-assembler form.
    assembler_train = VectorAssembler(inputCols=newlist_train, outputCol="features")
    transformed_train = assembler_train.transform(train_data.fillna(0))
    # (features, stars) rows become LabeledPoints for MLlib training.
    data_train = (transformed_train.select("features", "stars")
                  .map(lambda row: LabeledPoint(row.stars, row.features)))
    # repartition(5) replaces the original `sc.parallelize(collect(), 5)`
    # round-trip, which pulled the whole training set onto the driver and
    # risked OOM for no behavioral benefit.
    model_train = LogisticRegressionWithLBFGS.train(data_train.repartition(5),
                                                    regType=regType, iterations=num_iter,
                                                    numClasses=5)
    # Same feature selection for the test side, but keep 'stars' out of the
    # remove list (test data has no label).
    removelist_final = set(['business_id', 'bus_id', 'b_id', 'review_id', 'user_id'])
    newlist_final = [v for v in test_data.columns if v not in removelist_final]
    assembler_final = VectorAssembler(inputCols=newlist_final, outputCol="features")
    transformed_final = assembler_final.transform(test_data.fillna(0))
    data_final = transformed_final.select("features", "review_id")
    # Predict ratings, then pair each review_id with its prediction
    # (zip relies on both RDDs keeping the same partition order).
    predictions = model_train.predict(data_final.map(lambda x: x.features))
    labelsAndPredictions = data_final.map(lambda row: row.review_id).zip(predictions)
    return labelsAndPredictions
示例5: train
# 需要导入模块: from pyspark.mllib.classification import LogisticRegressionWithLBFGS [as 别名]
# 或者: from pyspark.mllib.classification.LogisticRegressionWithLBFGS import train [as 别名]
def train(self, df, target, regularization=None, num_of_iterations=100):
    """Fit a logistic-regression (LBFGS) model on all rows of *df*; store it on self.model.

    :param df: DataFrame holding feature columns plus the *target* column
    :param target: name of the label column
    :param regularization: MLlib regType ('l1', 'l2' or None)
    :param num_of_iterations: LBFGS iteration budget
    """
    # The original's `try: ... except Exception as e: raise e` handled
    # nothing and only mangled the traceback; the no-op handler is removed.
    LOGGER.info("Generation logistic regression")
    spark_df = self.sql_context.createDataFrame(df)
    feature_columns = spark_df.columns
    feature_columns.remove(target)
    # Rebuild (label, features) pairs as LabeledPoints for MLlib.
    X_train = spark_df.select(*feature_columns).map(lambda x: list(x))
    y_train = spark_df.select(target).map(lambda x: x[0])
    train_data = y_train.zip(X_train).map(lambda x: LabeledPoint(x[0], x[1]))
    numOfClasses = len(df[target].unique())
    self.model = LogisticRegressionWithLBFGS.train(train_data,
                                                   numClasses=numOfClasses, regParam=0,
                                                   regType=regularization, intercept=True,
                                                   iterations=num_of_iterations,
                                                   validateData=False)
示例6: RunLogit
# 需要导入模块: from pyspark.mllib.classification import LogisticRegressionWithLBFGS [as 别名]
# 或者: from pyspark.mllib.classification.LogisticRegressionWithLBFGS import train [as 别名]
def RunLogit(tf):
rdd = tf.map(parseAsLabeledPoints)
train, test = rdd.randomSplit([.8, .2])
numCat = len(genCats)
model = LogisticRegressionWithLBFGS.train(train, numClasses=numCat, iterations=100)
predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
print 'Accuracy of Logit = ', accuracy * 100
print "Test Error = ", (1.0 - accuracy) * 100
示例7: train_model
# 需要导入模块: from pyspark.mllib.classification import LogisticRegressionWithLBFGS [as 别名]
# 或者: from pyspark.mllib.classification.LogisticRegressionWithLBFGS import train [as 别名]
def train_model(training_rdd, **kwargs):
    """Fit a LogisticRegressionWithLBFGS classifier on *training_rdd*.

    Regularization and intercept settings come from the module-level
    _REGULARIZATION and _INTERCEPT constants; any extra MLlib training
    options may be forwarded through **kwargs.

    :param training_rdd: the RDD of the training dataset
    :param kwargs: additional key-value params for the training (if any)
    :return: the trained model
    """
    return LogisticRegressionWithLBFGS.train(
        training_rdd, intercept=_INTERCEPT, regType=_REGULARIZATION, **kwargs)
示例8: regression
# 需要导入模块: from pyspark.mllib.classification import LogisticRegressionWithLBFGS [as 别名]
# 或者: from pyspark.mllib.classification.LogisticRegressionWithLBFGS import train [as 别名]
def regression(reg_data):
(trainingData, testData) = reg_data.randomSplit([0.7, 0.3])
lrmodel = LogisticRegressionWithLBFGS.train(trainingData)
labelsAndPreds = testData.map(lambda p: (p.label, lrmodel.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(testData.count())
falsePos = labelsAndPreds.filter(lambda (v, p): v != p and v == 0.0).count() / float(testData.filter(lambda lp: lp.label == 0.0).count())
falseNeg = labelsAndPreds.filter(lambda (v, p): v != p and v == 1.0).count() / float(testData.filter(lambda lp: lp.label == 1.0).count())
print "*** Error Rate: %f ***" % trainErr
print "*** False Positive Rate: %f ***" % falsePos
print "*** False Negative Rate: %f ***" % falseNeg
示例9: validation_lr
# 需要导入模块: from pyspark.mllib.classification import LogisticRegressionWithLBFGS [as 别名]
# 或者: from pyspark.mllib.classification.LogisticRegressionWithLBFGS import train [as 别名]
def validation_lr(trainingData, testData, regType, num_iter):
    """Train a 5-class LR model on *trainingData* and return its test MSE.

    BUG FIX: the original referenced an undefined name
    'labelsAndPredictions_1' before 'labelsAndPredictions' was built, so it
    always raised NameError.  Both returned values are now computed from the
    properly-built (label, prediction) RDD; the two-element return shape is
    kept so existing callers that unpack two values keep working.

    :param trainingData: RDD of LabeledPoint training examples
    :param testData: RDD of LabeledPoint held-out examples
    :param regType: MLlib regularization type ('l1', 'l2' or None)
    :param num_iter: LBFGS iteration budget
    :return: (testMSE, testMSE) -- the same MSE twice, preserving arity
    """
    model_train = LogisticRegressionWithLBFGS.train(trainingData, regType=regType,
                                                    iterations=num_iter, numClasses=5)
    # Evaluate the model on the test instances and compute squared error.
    predictions = model_train.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / \
        float(testData.count())
    return testMSE, testMSE
示例10: training
# 需要导入模块: from pyspark.mllib.classification import LogisticRegressionWithLBFGS [as 别名]
# 或者: from pyspark.mllib.classification.LogisticRegressionWithLBFGS import train [as 别名]
def training(path):
    """Load the text dataset at *path*, split it 60/40, and fit two models.

    Returns a (lrModel, nbModel, labeled_points_test) triple: a trained
    logistic-regression model, a trained naive-Bayes model, and the
    labeled-point RDD for the held-out split.
    """
    # Parse each raw line into a (label, bag-of-words) pair.
    parsed = sc.textFile(path).map(lambda line: parse_line(line))
    # Seed 17 keeps the 60/40 train/test split reproducible across runs.
    train_split, test_split = parsed.randomSplit([0.6, 0.4], 17)
    # The feature set (vocabulary) is derived from the training split only.
    feats = feature_extraction(train_split)
    train_points = train_split.map(lambda line: construct_labeled_point(line, feats))
    test_points = test_split.map(lambda line: construct_labeled_point(line, feats))
    lr_model = LogisticRegressionWithLBFGS.train(train_points)
    nb_model = NaiveBayes.train(train_points)
    return lr_model, nb_model, test_points
示例11: logisticRegression
# 需要导入模块: from pyspark.mllib.classification import LogisticRegressionWithLBFGS [as 别名]
# 或者: from pyspark.mllib.classification.LogisticRegressionWithLBFGS import train [as 别名]
def logisticRegression(features, sc, output_n):
    """Train LR on the first 70 collected rows and predict on the remainder.

    Each collected row is a (label, feature-vector) pair.  The first 70 rows
    form the training set; the rest are held out.  (*output_n* is unused but
    kept for caller compatibility.)

    :return: RDD of (true_label, predicted_label) pairs for the held-out rows
    """
    rows = features.collect()
    # Fixed 70-row training prefix; everything after it is the test set.
    train_rows, test_rows = rows[:70], rows[70:]
    labeled_training = [LabeledPoint(r[0], r[1]) for r in train_rows]
    labeled_testing = [LabeledPoint(r[0], r[1]) for r in test_rows]
    test = sc.parallelize(labeled_testing)
    model = LogisticRegressionWithLBFGS.train(labeled_training)
    return test.map(lambda line: (line.label, float(model.predict(line.features))))
示例12: create_model
# 需要导入模块: from pyspark.mllib.classification import LogisticRegressionWithLBFGS [as 别名]
# 或者: from pyspark.mllib.classification.LogisticRegressionWithLBFGS import train [as 别名]
def create_model(name, training):
if name == 'logistic':
print_box()
print "Logistic Regression Model"
print_box()
model = LogisticRegressionWithLBFGS.train(training)
elif name == 'tree':
print_box()
print "Decision Tree Model"
print_box()
model = DecisionTree.trainClassifier(training, numClasses=2, categoricalFeaturesInfo={},
impurity='gini', maxDepth=5, maxBins=32)
elif name == 'rf':
print_box()
print "Random Forest Model"
print_box()
model = RandomForest.trainClassifier(training, numClasses=2, categoricalFeaturesInfo={},
numTrees=15, featureSubsetStrategy="auto", impurity='gini', maxDepth=5, maxBins=50)
return model
示例13: main
# 需要导入模块: from pyspark.mllib.classification import LogisticRegressionWithLBFGS [as 别名]
# 或者: from pyspark.mllib.classification.LogisticRegressionWithLBFGS import train [as 别名]
def main(input_file_path):
    """End-to-end prediction pipeline: train a 100-class LR model on labeled
    CSV rows and write predictions for the unlabeled rows to a hard-coded
    output path.

    NOTE(review): relies on module-level sc / sqlContext and on helpers
    (create_pddf, rdd_to_labeled_point, rdd_to_index_featurs) defined
    elsewhere in the file.
    """
    print('=====>>>>>')
    print('ddd')
    data = sc.textFile(input_file_path)
    # Rows with a value in column 4 (and that are not the header row) are
    # training data; rows with an empty column 4 are the unseen set.
    traning_data_RDD = data.filter(lambda line: line.split(',')[4] != '' and line.split(',')[0] != 'INDEX')
    unseen_data_RDD = data.filter(lambda line: line.split(',')[4] == '')
    traning_data_pddf = create_pddf(traning_data_RDD)
    traning_data_df = sqlContext.createDataFrame(traning_data_pddf)
    print(traning_data_df.head())
    parsed_data = rdd_to_labeled_point(traning_data_df.rdd)
    parsed_data.persist()
    # Correct print: [LabeledPoint(1.0, [1.0,8.6662186586,6.98047693487])]
    logisticRegressionWithLBFGS = LogisticRegressionWithLBFGS.train(parsed_data, iterations=500, numClasses=100)
    labels_and_preds = parsed_data.map(lambda lp: [lp.label, logisticRegressionWithLBFGS.predict(lp.features)])
    # NOTE(review): this "Accuracy" is not an accuracy.  filter() keeps the
    # MISclassified pairs (a nonzero squared difference is truthy), reduce()
    # then concatenates the surviving [label, pred] lists, and [0] grabs a
    # single label from the concatenation -- the printed number is
    # essentially meaningless.  Verify intent before trusting this metric.
    Accuracy = float(labels_and_preds.filter(lambda ele: (int(ele[0]) - int(ele[1])) ** 2).reduce(lambda x, y: x + y)[0]) / float(parsed_data.count())
    print("Training Accuracy on training data = " + str(Accuracy))
    unseen_data_pddf = create_pddf(unseen_data_RDD)
    unseen_data_df = sqlContext.createDataFrame(unseen_data_pddf)
    unseen_parsed_data = rdd_to_index_featurs(unseen_data_df.rdd)
    unseen_parsed_data.persist()
    # NOTE(review): hard-coded absolute output path, and the handle shadows
    # the builtin 'file' and is never close()d -- consider a `with` block.
    file = open('/Users/1002720/Documents/workspace/SNU-project/data/BDA2Project/1-GenderPrediction/result2.csv', 'w',
                encoding='utf-8')
    file.write('INDEX,AGE\n')
    # One prediction per unseen row: "<index>,<predicted class>".
    for data in unseen_parsed_data.collect():
        file.write(str(data[0]) + ',' + str(logisticRegressionWithLBFGS.predict(data[1])) + '\n')
    # print(labels_and_preds.collect())
    parsed_data.unpersist()
    unseen_parsed_data.unpersist()
    print('=====>>>>>')
    print('=====>>>>>')
    print('=====>>>>>')
    print('=====>>>>>')
示例14: train
# 需要导入模块: from pyspark.mllib.classification import LogisticRegressionWithLBFGS [as 别名]
# 或者: from pyspark.mllib.classification.LogisticRegressionWithLBFGS import train [as 别名]
def train(self, feat='tfidf'):
    """Train the classifier selected by self.model_type on TFIDF features.

    'naive_bayes' fits a multinomial NaiveBayes model; 'log_reg' fits a
    multiclass LogisticRegressionWithLBFGS model.  (*feat* is currently
    unused -- presumably meant to pick the feature column; TODO confirm.)

    Reads from self: lp_path, labeled_points, y_train, test_size, n_part,
    model_type, plus the make_labeled_points / extract_features /
    make_train_test / unique_ratings helpers.

    Returns
    -------
    self, with self.model set to the trained model.  (The original
    docstring claimed a (model, test_score) return; no score is computed
    here.)
    """
    # Build labeled points from raw features unless they were preloaded
    # from lp_path.
    if not self.lp_path:
        self.labeled_points = self.make_labeled_points(self.extract_features())
    self.make_train_test(self.test_size)
    # Keep only the training-split labeled points (inner join on key),
    # then repartition and cache for the iterative trainers below.
    train_rdd = self.labeled_points.join(self.y_train) \
        .map(lambda (key, (lp, label)): lp) \
        .repartition(self.n_part).cache()
    if self.model_type == 'naive_bayes':
        nb = NaiveBayes()
        self.model = nb.train(train_rdd)
    elif self.model_type == 'log_reg':
        n_classes = len(self.unique_ratings())
        # Densify the sparse feature vectors before LBFGS training.
        features = train_rdd.map(lambda lp: LabeledPoint(lp.label, lp.features.toArray()))
        logreg = LogisticRegressionWithLBFGS.train(features, numClasses=n_classes)
        self.model = logreg
    # elif self
    return self
示例15: processData
# 需要导入模块: from pyspark.mllib.classification import LogisticRegressionWithLBFGS [as 别名]
# 或者: from pyspark.mllib.classification.LogisticRegressionWithLBFGS import train [as 别名]
def processData(sc):
#load and parse the data
raw_data = sc.textFile(DATA_FILE)
raw_data.persist()
print "Train data size {}".format(raw_data.count())
# map data to a format needed for logistic regression
parsedData = raw_data.map(mapper)
print "Sample of input to algorithm ", parsedData.take(10)
# Train model
t0 = time()
model = LogisticRegressionWithLBFGS.train(parsedData)
t1 = time() - t0
print "Classifier trained in {} seconds".format(round(t1, 3))
labelsAndPreds = parsedData.map(lambda point: (point.label, model.predict(point.features)))
# Evaluating the model on training data
trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
# Print some stuff
print("Training Error = " + str(trainErr))