This article collects typical usage examples of the Python class pyspark.mllib.classification.SVMWithSGD. If you have been wondering what the SVMWithSGD class is for and how to use it, the curated class code examples below may help.
Fifteen SVMWithSGD code examples are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
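Before the examples, here is a minimal, self-contained sketch of the typical SVMWithSGD workflow; the toy data and parameter values are illustrative assumptions, not taken from any example below:

from pyspark import SparkContext
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.regression import LabeledPoint

sc = SparkContext(appName='svm-minimal')

# toy, linustratively separable data (illustrative values)
data = sc.parallelize([
    LabeledPoint(0.0, [0.0, 1.0]),
    LabeledPoint(1.0, [1.0, 0.0]),
    LabeledPoint(0.0, [0.5, 2.0]),
    LabeledPoint(1.0, [2.0, 0.5]),
])

# train a linear SVM via stochastic gradient descent
model = SVMWithSGD.train(data, iterations=100, step=1.0, regParam=0.01)

print(model.predict([1.5, 0.0]))  # returns a 0/1 label by default

model.clearThreshold()            # make predict() return raw margin scores
print(model.predict([1.5, 0.0]))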
Example 1: test_classification
def test_classification(self):
from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
data = [
LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
]
rdd = self.sc.parallelize(data)
features = [p.features for p in data]
lr_model = LogisticRegressionWithSGD.train(rdd)
self.assertTrue(lr_model.predict(features[0]) <= 0)
self.assertTrue(lr_model.predict(features[1]) > 0)
self.assertTrue(lr_model.predict(features[2]) <= 0)
self.assertTrue(lr_model.predict(features[3]) > 0)
svm_model = SVMWithSGD.train(rdd)
self.assertTrue(svm_model.predict(features[0]) <= 0)
self.assertTrue(svm_model.predict(features[1]) > 0)
self.assertTrue(svm_model.predict(features[2]) <= 0)
self.assertTrue(svm_model.predict(features[3]) > 0)
nb_model = NaiveBayes.train(rdd)
self.assertTrue(nb_model.predict(features[0]) <= 0)
self.assertTrue(nb_model.predict(features[1]) > 0)
self.assertTrue(nb_model.predict(features[2]) <= 0)
self.assertTrue(nb_model.predict(features[3]) > 0)
Example 2: test_classification
def test_classification(self):
from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
data = [
LabeledPoint(0.0, [1, 0, 0]),
LabeledPoint(1.0, [0, 1, 1]),
LabeledPoint(0.0, [2, 0, 0]),
LabeledPoint(1.0, [0, 2, 1])
]
rdd = self.sc.parallelize(data)
features = [p.features.tolist() for p in data]
lr_model = LogisticRegressionWithSGD.train(rdd)
self.assertTrue(lr_model.predict(features[0]) <= 0)
self.assertTrue(lr_model.predict(features[1]) > 0)
self.assertTrue(lr_model.predict(features[2]) <= 0)
self.assertTrue(lr_model.predict(features[3]) > 0)
svm_model = SVMWithSGD.train(rdd)
self.assertTrue(svm_model.predict(features[0]) <= 0)
self.assertTrue(svm_model.predict(features[1]) > 0)
self.assertTrue(svm_model.predict(features[2]) <= 0)
self.assertTrue(svm_model.predict(features[3]) > 0)
nb_model = NaiveBayes.train(rdd)
self.assertTrue(nb_model.predict(features[0]) <= 0)
self.assertTrue(nb_model.predict(features[1]) > 0)
self.assertTrue(nb_model.predict(features[2]) <= 0)
self.assertTrue(nb_model.predict(features[3]) > 0)
Example 3: modelWithSVM
def modelWithSVM(trainingData, validationData):
## Train SVM models over a range of regularization parameters.
## Return the model with the best validation accuracy, plus (regParam, accuracy) pairs.
#eta = [0.1, 0.3, 0.5, 1.0, 5.0]
regularizationParameter = [.0000001, 1., 5000., 10000., 200000.]
bestSVMModel = None
bestAccuracy = 0
numOfIterations = 100
visualizationData = []
for regularizer in regularizationParameter:
model = SVMWithSGD.train(trainingData, numOfIterations, 1.0, regParam=regularizer)
predict = validationData.map(lambda ad: (ad.label, model.predict(ad.features)))
totalValidationAds = validationData.count()
correctlyPredicted = predict.filter(lambda x: x[0] == x[1]).count()
accuracy = float(correctlyPredicted)/totalValidationAds
visualizationData += [(regularizer, accuracy)]
if accuracy > bestAccuracy:
bestAccuracy = accuracy
bestSVMModel = model
return bestSVMModel, visualizationData
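A hedged usage sketch for modelWithSVM above; the input RDD name, split ratio, and seed are assumptions:

# assumed pipeline: labeledData is an RDD[LabeledPoint]
trainingData, validationData = labeledData.randomSplit([0.7, 0.3], seed=42)
bestModel, curve = modelWithSVM(trainingData, validationData)
for regularizer, accuracy in curve:
    print(regularizer, accuracy)  # inspect validation accuracy across regParam values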
Example 4: main
def main():
stock_file = sys.argv[1]
output_predict_file = sys.argv[2]
conf = SparkConf().setAppName('Stock Prediction Machine Learning with Twitter')
sc = SparkContext(conf=conf)
assert sc.version >= '1.5.1'
''' extracting the header of CSV file'''
file_data_all = sc.textFile(stock_file)
file_header = file_data_all.first()
file_data = file_data_all.filter(lambda line: line != file_header).cache()
''' for five different predictions getting data '''
parsedFileData_NextDayActualOpening = file_data.map(parseNextDayActualOpening)
parsedFileData_NextDayActualHigh = file_data.map(parseNextDayActualHigh)
parsedFileData_NextDayActualLow = file_data.map(parseNextDayActualLow)
parsedFileData_NextDayActualClose = file_data.map(parseNextDayActualClose)
parsedFileData_NextDayActualVolume = file_data.map(parseNextDayActualVolume)
print(parsedFileData_NextDayActualOpening.collect())
''' calling SVM with Stochastic Gradient Descent and
training using our data set '''
svm_model_nxtdayactopn = SVMWithSGD.train(parsedFileData_NextDayActualOpening, iterations=10)
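# note: lpreds below scores the same data the model was trained on (a sanity check, not a held-out evaluation)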
lpreds = parsedFileData_NextDayActualOpening.map(lambda line: (line.label, svm_model_nxtdayactopn.predict(line.features)))
print(lpreds.collect())
Example 5: trainSVMModel
def trainSVMModel(data):
"""
Train an SVM model and return it
:param data: RDD[LabeledPoint]
:return: svm classification model
"""
from pyspark.mllib.classification import SVMWithSGD, SVMModel
model = SVMWithSGD.train(data, iterations=100)
return model
Example 6: main
def main():
# prepare training data
# RDDTrainData = sc.textFile('2007_100.csv')
RDDTrainData = sc.textFile(','.join([
# '1987.csv',
# '1988.csv',
# '1989.csv',
# '1990.csv',
# '1991.csv',
# '1992.csv',
# '1993.csv',
# '1994.csv',
# '1995.csv',
# '1996.csv',
# '1997.csv',
# '1998.csv',
# '1999.csv',
# '2000.csv',
# '2001.csv',
# '2002.csv',
# '2003.csv',
# '2004.csv',
# '2005.csv',
# '2006.csv',
'2007.csv',
]))
RDDTrainHeader = RDDTrainData.take(1)[0]
trainData = RDDTrainData.filter(lambda line: line != RDDTrainHeader)\
.map(split)\
.map(parseTrain)
# prepare testing data
RDDTestData = sc.textFile('2008.csv')
RDDTestHeader = RDDTestData.take(1)[0]
testData = RDDTestData.filter(lambda line: line != RDDTestHeader)\
.map(split)\
.map(parseTest)
# do prediction
# SVM
model = SVMWithSGD.train(trainData, iterations=100)
# Logistic Regression
# model = LogisticRegressionWithLBFGS.train(trainData)
predictionData = testData.map(lambda d:
(int(d.label), model.predict(d.features))
)
# evaluate error rate
errorCount = predictionData.filter(lambda d: int(d[0]) != int(d[1])).count()
totalCount = predictionData.count()
print 'error rate =', errorCount, '/', totalCount, '=', float(errorCount) / float(totalCount)
Example 7: train
def train(sc, file_positive, files_negative, file_output):
"""
Trains a binary classification model using positive samples in file_positive and
negative samples in file_negative. It writes the resulting model to file_output
:param sc: The spark context
:type sc: SparkContext
:param file_positive: The file with positive tweets (relevant ones)
:type file_positive: str
:param files_negative: The file with negative tweets (non-relevant ones)
:type files_negative: list[str]
:param file_output: The output where to store the trained model
:type file_output: str
"""
positive_tweets = sc.textFile(file_positive).map(parse_json).filter(is_valid)
negative_tweets = [sc.textFile(file_negative).map(parse_json).filter(is_valid) for file_negative in files_negative]
positive = positive_tweets.map(parse_positive)
negatives = [nt.map(parse_negative) for nt in negative_tweets]
data = positive
for negative in negatives:
data = data.union(negative)
try:
print("Training classification model")
model = SVMWithSGD.train(data, iterations=150, step=1000.0, regType='l1', regParam=1e-7)
print("Saving classification model to file")
pickle.dump(model, open(file_output, 'wb'))
print("Done!")
except Exception as e:
print("Error:")
print(e)
Example 8: gen_predictors
def gen_predictors(training_data):
classifiers = dict()
for item in label_map.iteritems():
print "Gen predictor for label '{0}' ...".format(item[0])
global processed_label
processed_label = item[1]
svm = SVMWithSGD.train(training_data.map(transform_label))
classifiers[item[1]] = svm
return classifiers
Example 9: SVM_module
def SVM_module(training):
"""This function returns a SVM model from your training data.
:param training: (REQUIRED) - the training data
:return: SVM model
Use it as (Be sure to call split_data() to get the training data):
>>> model = SVM_module(trainingData)
"""
# Train a SVM model
return SVMWithSGD.train(training, iterations=300)
Example 10: run_iterations
def run_iterations(parsedData, iter, seed):
fp_rates = []
tp_rates = []
# thld_arr = []
for i in range(0, 10):
trainingData, testingData = parsedData.randomSplit([70, 30], seed)
print("For " + str(iter) + " iterations:")
# Build the model
model = SVMWithSGD.train(trainingData, iterations=100)
# Evaluating the model on training data
labelsAndPreds = trainingData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(trainingData.count())
MSE = labelsAndPreds.map(lambda(v,p): (v-p)**2).reduce(lambda x, y: x + y)/labelsAndPreds.count()
print("Training Error = " + str(trainErr))
print("MSE = " + str(MSE))
labelsAndPreds = testingData.map(lambda p: (p.label, model.predict(p.features)))
testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(testingData.count())
MSE = labelsAndPreds.map(lambda(v,p): (v-p)**2).reduce(lambda x, y: x + y)/labelsAndPreds.count()
print("Testing Error = " + str(testErr))
print("MSE = " + str(MSE))
info = labelsAndPreds.collect()
actual = [int(i[0]) for i in info]
predictions = [i[1] for i in info]
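# note: as computed below, the first ratio is the false-negative rate (label 1
# predicted as 0 among actual 1s) and the second is the true-negative rate
# (label 0 predicted as 0 among actual 0s); the variable names are misleading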
false_positive_rate = labelsAndPreds.filter(lambda (v, p): v == 1 and p == 0).count() / float(labelsAndPreds.filter(lambda (v, p): v == 1).count())
true_positive_rate = labelsAndPreds.filter(lambda (v, p): v == 0 and p == 0).count() / float(labelsAndPreds.filter(lambda (v, p): v == 0).count())
fpr, tpr, thresholds = roc_curve(actual, predictions)
# roc_auc = auc(false_positive_rate, true_positive_rate)
print false_positive_rate
print true_positive_rate
fp_rates.append(false_positive_rate)
tp_rates.append(true_positive_rate)
print fp_rates
print tp_rates
roc_auc = auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fp_rates, tp_rates, 'b',
label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.savefig('fig.png')  # save before show(); otherwise savefig can write an empty figure
plt.show()
Example 11: test_classification
def test_classification(self):
from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
data = [
LabeledPoint(0.0, [1, 0, 0]),
LabeledPoint(1.0, [0, 1, 1]),
LabeledPoint(0.0, [2, 0, 0]),
LabeledPoint(1.0, [0, 2, 1])
]
rdd = self.sc.parallelize(data)
features = [p.features.tolist() for p in data]
lr_model = LogisticRegressionWithSGD.train(rdd)
self.assertTrue(lr_model.predict(features[0]) <= 0)
self.assertTrue(lr_model.predict(features[1]) > 0)
self.assertTrue(lr_model.predict(features[2]) <= 0)
self.assertTrue(lr_model.predict(features[3]) > 0)
svm_model = SVMWithSGD.train(rdd)
self.assertTrue(svm_model.predict(features[0]) <= 0)
self.assertTrue(svm_model.predict(features[1]) > 0)
self.assertTrue(svm_model.predict(features[2]) <= 0)
self.assertTrue(svm_model.predict(features[3]) > 0)
nb_model = NaiveBayes.train(rdd)
self.assertTrue(nb_model.predict(features[0]) <= 0)
self.assertTrue(nb_model.predict(features[1]) > 0)
self.assertTrue(nb_model.predict(features[2]) <= 0)
self.assertTrue(nb_model.predict(features[3]) > 0)
categoricalFeaturesInfo = {0: 3} # feature 0 has 3 categories
dt_model = DecisionTree.trainClassifier(
rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
self.assertTrue(dt_model.predict(features[0]) <= 0)
self.assertTrue(dt_model.predict(features[1]) > 0)
self.assertTrue(dt_model.predict(features[2]) <= 0)
self.assertTrue(dt_model.predict(features[3]) > 0)
rf_model = RandomForest.trainClassifier(
rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
self.assertTrue(rf_model.predict(features[0]) <= 0)
self.assertTrue(rf_model.predict(features[1]) > 0)
self.assertTrue(rf_model.predict(features[2]) <= 0)
self.assertTrue(rf_model.predict(features[3]) > 0)
gbt_model = GradientBoostedTrees.trainClassifier(
rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
self.assertTrue(gbt_model.predict(features[0]) <= 0)
self.assertTrue(gbt_model.predict(features[1]) > 0)
self.assertTrue(gbt_model.predict(features[2]) <= 0)
self.assertTrue(gbt_model.predict(features[3]) > 0)
Example 12: main
def main(sc):
train_data='/usr/local/spark/data/mllib/sample_svm_data.txt'
data=sc.textFile(train_data).map(parse)
if os.path.exists('model'):
model=SVMModel.load(sc, 'model')
else:
model=SVMWithSGD.train(data, iterations=100)
model.save(sc, 'model')
labelsAndPreds=data.map(lambda p: (p.label, model.predict(p.features)))
# trainErr=labelsAndPreds.filter(lambda (v, p): v != p).count() / float(data.count())
# print('Training Error =' + str(trainErr))
labelsAndPreds.map(lambda x:str(x[0])+'\t'+str(x[1])).saveAsTextFile('labelsAndPreds')
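A note on the load-or-train pattern above: model.save(sc, 'model') writes the model's weights and metadata to a directory named model, which is why the example can check os.path.exists('model') before retraining; SVMModel.load(sc, 'model') restores the same model on a later run.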
Example 13: training
def training(path):
#import dataset into RDD
raw_data = sc.textFile(path)
#parse raw data into label bag-of-words pairs
parsed_data = raw_data.map(lambda line: parse_line(line))
#separate into training set and test set
training_set, test_set = parsed_data.randomSplit([0.6, 0.4], 17)
#get features for model training
features = feature_extraction(training_set)
labeled_points_training = training_set.map(lambda line: construct_labeled_point(line, features))
labeled_points_test = test_set.map(lambda line: construct_labeled_point(line, features))
#train logistic regression model
lrModel = LogisticRegressionWithLBFGS.train(labeled_points_training)
#train naive bayes model
nbModel = NaiveBayes.train(labeled_points_training)
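#train SVM model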
svmModel = SVMWithSGD.train(labeled_points_training)
return lrModel, nbModel, svmModel, labeled_points_test, features
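A possible follow-up for example 13 (a sketch assuming 0/1 labels and an input file path of your own):

lrModel, nbModel, svmModel, testPoints, features = training(path)  # path is an assumed input
for name, m in [('LR', lrModel), ('NB', nbModel), ('SVM', svmModel)]:
    preds = testPoints.map(lambda p: (p.label, m.predict(p.features)))
    accuracy = preds.filter(lambda x: x[0] == x[1]).count() / float(testPoints.count())
    print(name, accuracy)  # compare held-out accuracy of the three models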
Example 14: main
def main(sc):
inputFile=sys.argv[1]
modelPath=sys.argv[2]
data = sc.textFile(inputFile)
parsedData = data.map(parsePoint)
# Build the model
model = SVMWithSGD.train(parsedData, iterations=100)
# Evaluating the model on training data
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))
# Save and load model
model.save(sc, modelPath)
# sameModel = SVMModel.load(sc, "svm_model")
sc.stop()
Example 15: train_trend_model
def train_trend_model(self, model, data, i):
self.logger.info('Start to train the direction model')
rdd_data = self.sc.parallelize(data)
if self.trend_prediction_method == self.RANDOM_FOREST:
model = RandomForest.trainClassifier(rdd_data, numClasses=2, categoricalFeaturesInfo={}, numTrees=40,
featureSubsetStrategy="auto", impurity='gini', maxDepth=20,
maxBins=32)
elif self.trend_prediction_method == self.NAIVE_BAYES:
model = NaiveBayes.train(rdd_data)
elif self.trend_prediction_method == self.LOGISTIC_REGRESSION:
model = LogisticRegressionWithSGD.train(rdd_data, iterations=10000, step=0.001,
initialWeights=None if model is None else model.weights)
elif self.trend_prediction_method == self.SVM:
model = SVMWithSGD.train(rdd_data, iterations=10000, step=0.001,
initialWeights=None if model is None else model.weights)
return model
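Note on example 15: passing initialWeights warm-starts SGD from an existing model's weights, so repeated calls refine the previous SVM (or logistic regression) model instead of training from scratch each time.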