This article collects typical usage examples of the Python class pyspark.mllib.tree.RandomForestModel. If you are wondering what the RandomForestModel class does, how to use it, or what working code with it looks like, the curated examples below should help.
The following sections show 15 code examples of the RandomForestModel class, sorted by popularity by default.
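Before the individual examples, here is a minimal sketch of the usual round trip: train a forest with RandomForest, persist it, reload it with RandomForestModel.load, and predict with the reloaded model. The data path and hyperparameters are placeholders (borrowed from the stock Spark example), not taken from any specific project below.

from pyspark import SparkContext
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils

sc = SparkContext(appName="RandomForestModelQuickStart")

# Train a small classifier on the LibSVM sample data shipped with Spark.
data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
model = RandomForest.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=3, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=32)

# Persist the trained forest, then reload it through RandomForestModel.load.
model.save(sc, "/tmp/myRandomForestModel")
sameModel = RandomForestModel.load(sc, "/tmp/myRandomForestModel")

# The reloaded model predicts exactly like the original one.
predictions = sameModel.predict(data.map(lambda x: x.features))
print(predictions.take(5))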
Example 1: load_parameters
def load_parameters(self):
    self.amount_prediction_method = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL,
                                                             file_name='amount_method')
    self.trend_prediction_method = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL,
                                                            file_name='trend_method')
    self.data_features = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='features')
    self.stock_symbol = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='symbol')
    self.data_parser = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='data_parser')
    amount_model_path = os.path.join(os.path.abspath(self.model_path), 'amount_model')
    trend_model_path = os.path.join(os.path.abspath(self.model_path), 'trend_model')

    if self.amount_prediction_method == self.RANDOM_FOREST:
        amount_model = RandomForestModel.load(sc=self.sc, path=amount_model_path)
    elif self.amount_prediction_method == self.LINEAR_REGRESSION:
        amount_model = LinearRegressionModel.load(sc=self.sc, path=amount_model_path)
    else:
        amount_model = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='amount_model')

    if self.trend_prediction_method == self.RANDOM_FOREST:
        trend_model = RandomForestModel.load(sc=self.sc, path=trend_model_path)
    elif self.trend_prediction_method == self.LOGISTIC_REGRESSION:
        trend_model = LogisticRegressionModel.load(sc=self.sc, path=trend_model_path)
    elif self.trend_prediction_method == self.NAIVE_BAYES:
        trend_model = NaiveBayesModel.load(sc=self.sc, path=trend_model_path)
    elif self.trend_prediction_method == self.SVM:
        trend_model = SVMModel.load(sc=self.sc, path=trend_model_path)
    else:
        trend_model = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='trend_model')

    return trend_model, amount_model
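All of the loaded models expose the same predict interface, so callers do not need to know which algorithm was chosen. A hedged sketch of how the return value might be consumed (predictor and feature_vector are hypothetical names, not from the original project):

# Hypothetical caller of load_parameters(); the models are interchangeable here.
trend_model, amount_model = predictor.load_parameters()
trend = trend_model.predict(feature_vector)    # e.g. predicted trend class
amount = amount_model.predict(feature_vector)  # e.g. predicted amount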
Example 2: evaluate_model
def evaluate_model(type):
    if type == 'logistic':
        model = LogisticRegressionModel.load(sc, "logit_model.model")
    elif type == 'tree':
        model = DecisionTreeModel.load(sc, "dt_model.model")
    elif type == 'rf':
        model = RandomForestModel.load(sc, "rf_model.model")
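This listing is truncated here. As a hedged sketch only (not the original code), the body might continue by scoring a held-out RDD of LabeledPoint rows, here called testData, and reporting accuracy:

    # Assumed continuation (testData is a hypothetical LabeledPoint RDD).
    predictions = model.predict(testData.map(lambda p: p.features))
    labels_and_preds = testData.map(lambda p: p.label).zip(predictions)
    accuracy = labels_and_preds.filter(lambda vp: vp[0] == vp[1]).count() / float(testData.count())
    print "Accuracy ({}): {}".format(type, accuracy)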
Example 3: main
def main(sc, filename):
    '''
    The driver for the spark scoring application, it generates predictions for
    a given file of features and target variables
    '''
    rawDataRdd = sc.textFile(filename)
    print "Data Size: {}".format(rawDataRdd.count())

    labeledPointsRdd = rawDataRdd.map(parse_lines)

    # load models
    logit_model = LogisticRegressionModel.load(sc, "logit_model.model")
    dt_model = DecisionTreeModel.load(sc, "dt_model.model")
    rf_model = RandomForestModel.load(sc, "rf_model.model")

    # logistic predictions
    labels_and_preds = labeledPointsRdd.map(lambda p: (float(logit_model.predict(p.features)), p.label))
    labels_and_preds_collected = labels_and_preds.collect()
    print "\n"
    print "Predictions: Logistic Regression"
    y_true = []
    y_pred = []
    for row in labels_and_preds_collected:
        y_true.append(row[1])
        y_pred.append(row[0])
        # print "predicted: {0} - actual: {1}\n".format(row[0], row[1])

    accuracy = labels_and_preds.filter(lambda (v, p): v == p).count() / float(labeledPointsRdd.count())
    print_box()
    print "Prediction Accuracy (Logistic): {}".format(round(accuracy, 4))
    print_box()
    print "\n"

    # decision tree predictions
    predictions = dt_model.predict(labeledPointsRdd.map(lambda p: p.features))
    labels_and_preds_dt = labeledPointsRdd.map(lambda p: p.label).zip(predictions)
    labels_and_preds_dt_collected = labels_and_preds_dt.collect()
    accuracy_dt = labels_and_preds_dt.filter(lambda (v, p): v == p).count() / float(labeledPointsRdd.count())
    print_box()
    print "Prediction Accuracy (Decision Tree): {}".format(round(accuracy_dt, 4))
    print_box()
    print "\n"

    # random forest predictions
    predictions_rf = rf_model.predict(labeledPointsRdd.map(lambda p: p.features))
    labels_and_preds_rf = labeledPointsRdd.map(lambda p: p.label).zip(predictions_rf)
    accuracy_rf = labels_and_preds_rf.filter(lambda (v, p): v == p).count() / float(labeledPointsRdd.count())
    print_box()
    print "Prediction Accuracy (Random Forest): {}".format(round(accuracy_rf, 4))
    print_box()
Example 4: main
def main():
    parser = OptionParser()
    parser.add_option('', '--enriched_data_path', action='store', dest='enriched_data_path', help='path to write enriched data')
    parser.add_option('', '--model_path', action='store', dest='model_path', help='path for model data')
    parser.add_option('', '--kafka_zookeeper_hosts', action='store', dest='kafka_zookeeper_hosts', help='list of Zookeeper hosts (host:port)')
    parser.add_option('', '--kafka_broker_list', action='store', dest='kafka_broker_list', help='list of Kafka brokers (host:port)')
    parser.add_option('', '--kafka_message_topic', action='store', dest='kafka_message_topic', help='topic to consume input messages from')
    parser.add_option('', '--kafka_alert_topic', action='store', dest='kafka_alert_topic', help='topic to produce alert messages to')
    parser.add_option('', '--kafka_enriched_data_topic', action='store', dest='kafka_enriched_data_topic', help='topic to produce enriched data to')
    parser.add_option('', '--streaming_batch_duration_sec', type='float', default=15.0,
                      action='store', dest='streaming_batch_duration_sec', help='Streaming batch duration in seconds')
    parser.add_option('', '--max_batches', type='int', default=0,
                      action='store', dest='max_batches', help='Number of batches to process (0 means forever)')
    options, args = parser.parse_args()

    sc = SparkContext()
    ssc = StreamingContext(sc, options.streaming_batch_duration_sec)
    sqlContext = getSqlContextInstance(sc)

    # Load saved model.
    model = None
    if options.model_path:
        model = RandomForestModel.load(sc, options.model_path)
    else:
        print('No model loaded.')

    # Create Kafka stream to receive new messages.
    kvs = KafkaUtils.createDirectStream(ssc, [options.kafka_message_topic], {
        'metadata.broker.list': options.kafka_broker_list,
        'group.id': 'spark_streaming_processor.py'})

    # Take only the 2nd element of the tuple.
    messages = kvs.map(lambda x: x[1])

    # Convert RDD of JSON strings to RDD of Rows.
    rows = messages.map(json_to_row)

    # Process messages.
    rows.foreachRDD(lambda time, rdd:
        process_messages(time, rdd,
                         ssc=ssc,
                         model=model,
                         enriched_data_path=options.enriched_data_path,
                         zookeeper_hosts=options.kafka_zookeeper_hosts,
                         kafka_alert_topic=options.kafka_alert_topic,
                         kafka_enriched_data_topic=options.kafka_enriched_data_topic,
                         max_batches=options.max_batches))

    ssc.start()
    ssc.awaitTermination()
Example 5: test
def test(sc):
    files = ["sounds/flushing/20150227_193109-flushing-04.wav",
             "sounds/bike/20150227_193806-bici-14.wav",
             "sounds/blender/20150227_193606-licuadora-14.wav"
             ]

    rfmodel = RandomForestModel.load(sc, RF_PATH)
    dtmodel = DecisionTreeModel.load(sc, DT_PATH)

    print dtmodel.toDebugString()
    for f in files:
        vec = audio.showFeatures(f)
        testfeatures = Vectors.dense([float(x) for x in vec.split(' ')])
        print(vec)
        pred = dtmodel.predict(testfeatures)
        print("DT Prediction is " + str(pred), classes[int(pred)])
        pred = rfmodel.predict(testfeatures)
        print("RF Prediction is " + str(pred), classes[int(pred)])
Example 6: predict
def predict(sc, data):
    #sc = SparkContext(appName="PythonRandomForestClassificationExample")
    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    #data = MLUtils.loadLibSVMFile(sc, 'deathproject/test1.txt')
    #data = [1, 0, 5, 1, 1, 65, 1, 2, 0, 1, 0, 6, 3450]

    # Split the data into training and test sets (30% held out for testing)
    # (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose.
    print('Starting...')
    #model = RandomForest.trainClassifier(trainingData, numClasses=8, categoricalFeaturesInfo={},
    #                                     numTrees=5, featureSubsetStrategy="auto",
    #                                     impurity='gini', maxDepth=4, maxBins=32)

    sameModel = RandomForestModel.load(sc, "target/tmp/myRandomForestClassificationModel2")
    print('Predicting...')

    # Evaluate model on test instances and compute test error
    #predictions = sameModel.predict(data.map(lambda x: x.features))
    predictions = sameModel.predict(data)
    #labelsAndPredictions = data.map(lambda lp: lp.label).zip(predictions)
    #testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
    #print('Test Error = ' + str(testErr))
    #print('Learned classification forest model:')
    #print(sameModel.toDebugString())

    print(predictions)
    return int(predictions)
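A hypothetical call site, reusing the single pre-encoded feature vector that appears commented out above (the values are illustrative only):

from pyspark.mllib.linalg import Vectors

# Hypothetical usage of predict(): one dense, already-encoded feature vector.
sample = Vectors.dense([1, 0, 5, 1, 1, 65, 1, 2, 0, 1, 0, 6, 3450])
label = predict(sc, sample)
print("Predicted class: %d" % label)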
Example 7: get_rfc_model
def get_rfc_model(filename, bucket, client, sc):
    # get download_url
    objects_dict = client.list_objects(Bucket=BUCKET_NAME)
    filenames = map(lambda x: x["Key"], objects_dict["Contents"])
    filenames = filter(lambda x: True if re.match(filename + '-v' + "[0-9]+", x) else False, filenames)
    if len(filenames) == 0:
        print "NO RFC MODELS FOUND"
        return False
    else:
        versions = map(lambda x: int(re.search("[0-9]+", x).group()), filenames)
        download_url = filename + "-v" + str(max(versions)) + '.zip'

    # download zip and unzip it
    local_url = '/tmp/' + download_url
    os.system("rm -r " + local_url)       # remove zip just in case
    os.system("rm -r " + local_url[:-4])  # remove unzipped folder just in case
    if download_file(download_url, bucket):
        with zipfile.ZipFile(local_url, "r") as z:
            z.extractall(local_url[:-4])
        return RandomForestModel.load(sc, local_url[:-4])  # -4 for zip
    else:
        print "ZIP DOWNLOAD FAILED"
        return False
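A hypothetical call site for this helper, assuming a boto3 S3 client and an existing SparkContext (BUCKET_NAME, the bucket handle, and sample_features are placeholders, as is the download_file helper used above):

# Hypothetical usage sketch: fetch the newest versioned model archive and score one vector.
import boto3
client = boto3.client('s3')
bucket = boto3.resource('s3').Bucket(BUCKET_NAME)   # placeholder bucket handle
rfc_model = get_rfc_model("rfc", bucket, client, sc)
if rfc_model:
    print(rfc_model.predict(sample_features))       # sample_features: placeholder dense vector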
Example 8: run
def run(jobNm, sc, sqlContext, inputFile, dictFile,
        bByDate=False,
        inputPartitions=-1,
        sNum=30,
        modelPath=None,
        bWriteMonitor=False,
        writeFileOutput=False):

    # import monitoring if needed
    if bWriteMonitor==True:
        import plotting

    #Create monitoring plot and associated vectors
    mPX = range(8)
    mPY = [0.]*8
    mSL = ["Create Feature Map", "Read in Data", "Aggregate for M.L.", "Read in Model", "Apply Model", "Output Results"]
    mInd = 0

    t0 = time.time()
    #Find the word document frequency for the corpus
    #this is used for an idf score used in feature vector formation
    t1 = time.time()
    revLookup = []
    fDict = None
    if dictFile[:3] == 's3:' or dictFile[:5] == 'hdfs:':
        # read dict file from hdfs
        fDict = sc.textFile(dictFile).collect()
    else:
        # read from local file
        fDict = open(dictFile,"r")
    for line in fDict:
        terms = line.split("\t")
        revLookup.append(terms[0])

    nVecLen = len(revLookup)
    t2 = time.time()
    diff = t2-t1
    print "Time to read dict", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Read in data and filter out entries with no valid words
    t1 = time.time()
    print 'inputFile ',inputFile
    print 'inputPartitions ',inputPartitions
    records = aggregatedComparison.loadPoint(sc, sqlContext, inputFile, inputPartitions)
    nGoodTweets = records.count()
    t2 = time.time()
    print "Number of good tweets:",nGoodTweets
    diff = t2-t1
    print "Time to read in data", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Format data for ML input
    t1 = time.time()
    mlApply = None
    if bByDate:
        mlApply = records.map(lambda x: (x.key, [LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize, x.dt])).cache()
    else:
        mlApply = records.map(lambda x: (x.key, [LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    nApp = mlApply.count()
    t2 = time.time()
    print "Number of collapsed points:", nApp
    diff = t2-t1
    print "Time to map points", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Read in Model
    t1 = time.time()
    model_Tree = RandomForestModel.load(sc, modelPath)
    t2 = time.time()
    diff = t2-t1
    print "Time to read in model", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # apply model
    t1 = time.time()
    predictions_Tree = model_Tree.predict(mlApply.map(lambda x: x[1][0].features))
    vecAndPredictions = mlApply.zip(predictions_Tree)
    vecAndPredictions.cache()
    vecAndPredictions.count()
    t2 = time.time()
    diff = t2-t1
    print "Time to apply model: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)
#......... the rest of the code is omitted here .........
Example 9: applyModel
#......... part of the code is omitted here .........
    print('Loaded and prepared %d entries' % df.count())

    #########
    # keep only needed features
    #########
    features = ['ADLOADINGTIME',
                'PLACEMENTID',
                'TIMESTAMP',
                'CREATIVETYPE',
                'UA_HARDWARETYPE',
                'UA_VENDOR',
                'UA_MODEL',
                'UA_BROWSER',
                'UA_BROWSERVERSION',
                'FILESJSON',
                'ERRORSJSON',
                'TOPMOSTREACHABLEWINDOWAREA',
                'FILESJSON_SIZE',
                'COMBINEDID',
                'COMBINEDEXTERNALID',
                'PLATFORMCOMBINED',
                'UA_OSCOMB',
                'SDK',
                'EXTERNALADSERVER'
                ]
    df = df.select(features)

    #########
    # Convert categorical features to numerical
    #########
    featuresCat = [
        'PLACEMENTID',
        'CREATIVETYPE',
        'UA_HARDWARETYPE',
        'UA_VENDOR',
        'UA_MODEL',
        'UA_BROWSER',
        'UA_BROWSERVERSION',
        'FILESJSON',
        'ERRORSJSON',
        'COMBINEDID',
        'COMBINEDEXTERNALID',
        'PLATFORMCOMBINED',
        'UA_OSCOMB',
        'SDK',
        'EXTERNALADSERVER'
    ]

    for i in range(len(featuresCat)):
        indexer = StringIndexer(inputCol=featuresCat[i], outputCol='_'+featuresCat[i]).setHandleInvalid("skip").fit(df)
        df = indexer.transform(df).drop(featuresCat[i])
        writer = indexer._call_java("write")
        writer.overwrite().save("indexer_" + featuresCat[i])

    featuresCat = ['_' + featuresCat[i] for i in range(len(featuresCat))]

    features = featuresCat[:]
    features.append('TIMESTAMP')
    features.append('FILESJSON_SIZE')
    features.append('TOPMOSTREACHABLEWINDOWAREA')

    #########
    # Assemble features
    #########
    assembler = VectorAssembler(
        inputCols=features,
        outputCol="features")
    df = assembler.transform(df)

    #########
    # Convert to labeled point
    #########
    lp = (df.select(func.col("ADLOADINGTIME").alias("label"), func.col("features"))
          .map(lambda row: LabeledPoint(row.label, row.features)))
    lp.cache()

    #########
    # Load trained model
    #########
    model = RandomForestModel.load(sc, loadModelName)
    print('Model loaded!')

    predictions = model.predict(lp.map(lambda x: x.features)).collect()

    return predictions
Example 10: SparkContext
if __name__ == "__main__":
    sc = SparkContext(appName="PythonRandomForestRegressionExample")
    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                        numTrees=3, featureSubsetStrategy="auto",
                                        impurity='variance', maxDepth=4, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\
        float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression forest model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myRandomForestRegressionModel")
    sameModel = RandomForestModel.load(sc, "target/tmp/myRandomForestRegressionModel")
    # $example off$
Example 11: open
    dict_add = pickle.load(handle)
with open('dict_city.pickle', 'rb') as handle:
    dict_city = pickle.load(handle)
with open('dict_fac.pickle', 'rb') as handle:
    dict_fac = pickle.load(handle)
with open('dict_status.pickle', 'rb') as handle:
    dict_status = pickle.load(handle)
with open('dict_bor.pickle', 'rb') as handle:
    dict_bor = pickle.load(handle)

model = RandomForestModel.load(sc, "modelCritical")
predictList = []
f = open("outFile3.csv", 'rU')
for line in csv.reader(f, delimiter=","):
    predictRow = list(line)
    '''
    agency = predictRow[agency_index]
    complaint = predictRow[comp_index]
    location = predictRow[loc_index]
    incident = predictRow[incident_index]
    address = predictRow[add_index]
    city = predictRow[city_index]
    facility = predictRow[fac_index]
Example 12: test_classification
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
        RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    temp_dir = tempfile.mkdtemp()

    lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd, iterations=10)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)
    dt_model_dir = os.path.join(temp_dir, "dt")
    dt_model.save(self.sc, dt_model_dir)
    same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
    self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString())

    rf_model = RandomForest.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
        maxBins=4, seed=1)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)
    rf_model_dir = os.path.join(temp_dir, "rf")
    rf_model.save(self.sc, rf_model_dir)
    same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
    self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString())

    gbt_model = GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)
    gbt_model_dir = os.path.join(temp_dir, "gbt")
    gbt_model.save(self.sc, gbt_model_dir)
    same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
    self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString())

    try:
        rmtree(temp_dir)
    except OSError:
        pass
Example 13: writeLumbarTrainingReadings
def writeLumbarTrainingReadings(time, rddTraining):
    try:
        # Convert RDDs of the words DStream to DataFrame and run SQL query
        connectionProperties = MySQLConnection.getDBConnectionProps('/home/erik/mysql_credentials.txt')
        sqlContext = SQLContext(rddTraining.context)
        if rddTraining.isEmpty() == False:
            lumbarTrainingReading = sqlContext.jsonRDD(rddTraining)
            lumbarTrainingReadingFinal = lumbarTrainingReading.selectExpr("deviceID","metricTypeID","uomID","positionID","actual.y AS actualYaw","actual.p AS actualPitch","actual.r AS actualRoll","setPoints.y AS setPointYaw","setPoints.p AS setPointPitch","setPoints.r AS setPointRoll")
            lumbarTrainingReadingFinal.write.jdbc("jdbc:mysql://localhost/biosensor", "SensorTrainingReadings", properties=connectionProperties)
    except:
        pass

if __name__ == "__main__":
    sc = SparkContext(appName="Process Lumbar Sensor Readings")
    ssc = StreamingContext(sc, 2)  # 2 second batches
    loadedModel = RandomForestModel.load(sc, "../machine_learning/models/IoTBackBraceRandomForest.model")

    # Process Readings
    streamLumbarSensor = KafkaUtils.createDirectStream(ssc, ["LumbarSensorReadings"], {"metadata.broker.list": "localhost:9092"})
    lineSensorReading = streamLumbarSensor.map(lambda x: x[1])
    lineSensorReading.foreachRDD(writeLumbarReadings)

    # Process Training Readings
    streamLumbarSensorTraining = KafkaUtils.createDirectStream(ssc, ["LumbarSensorTrainingReadings"], {"metadata.broker.list": "localhost:9092"})
    lineSensorTrainingReading = streamLumbarSensorTraining.map(lambda x: x[1])
    lineSensorTrainingReading.foreachRDD(writeLumbarTrainingReadings)

    # Run and then wait for termination signal
    ssc.start()
    ssc.awaitTermination()
Example 14: print
    from pyspark.mllib.linalg import Vectors
    print("Successfully imported Spark Modules")
except ImportError as e:
    print("Can not import Spark Modules", e)
    sys.exit(1)

import functools
import itertools

from kafka import KafkaConsumer

def parseData(line):
    splittedLine = line.split(",")
    values = [float(s) for s in splittedLine[4:-1]]
    label = splittedLine[-1]
    featuresVector = Vectors.dense(values)
    return LabeledPoint(label, featuresVector)

if __name__ == "__main__":
    conf = SparkConf().setAppName("RandomForest_Anomaly_Detection_Kafka_Consumer")
    sc = SparkContext(conf=conf)
    savedModel = RandomForestModel.load(sc, "../train_model/model")

    consumer = KafkaConsumer('test', group_id='my_group', bootstrap_servers=['localhost:9092'])
    print("Waiting for messages...")
    for message in consumer:
        print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition, message.offset, message.key, message.value))
        data = sc.parallelize([message.value])
        testData = data.map(parseData)
        predictions = savedModel.predict(testData.map(lambda x: x.features))
        print("Prediction: ")
        print(predictions.first())
Author: MarioPerezEsteso, Project: Network-Anomaly-Detection-Apache-Spark-Kafka, Lines: 31, Source file: kafkaconsumerrandomforest.py
Example 15: SparkContext
from pyspark import SparkContext
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import MLUtils
sc = SparkContext("local", "Amadeus Random Forest Run", pyFiles=['run_model.py'])
data = sc.textFile("data_timbre.csv")
zipped_data = data.zipWithIndex()
keyed_data = zipped_data.map(lambda line: (line[-1], line[:-1]))
target = sc.textFile("target_timbre.csv")
zipped_target = target.zipWithIndex()
keyed_target = zipped_target.map(lambda line: (line[-1], line[:-1]))
target_data = keyed_data.join(keyed_target)
labled_point_data = target_data.map(lambda tup: LabeledPoint(tup[1][1][0], tup[1][0][0].split(',')))
# Split the data into training and test sets (30% held out for testing)
print("Creating Training and Test Data Split")
(trainingData, testData) = labled_point_data.randomSplit([0.7, 0.3])
model = RandomForestModel.load(sc, "myRFModel")
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
testAccuracy = labelsAndPredictions.map(lambda (v, p): 1 if (abs(v - p) < 10) else 0).sum() / float(testData.count())
print('Total Accuracy = ' + str(testAccuracy))