

Python tree.RandomForestModel Class Code Examples

This article collects typical usage examples of the Python class pyspark.mllib.tree.RandomForestModel. If you are wondering what the RandomForestModel class does, how it is used, or what real-world examples of it look like, the selected class code examples below should help.


The following 15 code examples of the RandomForestModel class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
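
Before the individual examples, here is a minimal, self-contained sketch of the pattern most of them share: train a random forest, save it, and load it back with RandomForestModel.load before calling predict. The data file and model directory below are placeholder paths, and the training parameters are illustrative only.

from pyspark import SparkContext
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils

sc = SparkContext(appName="RandomForestModelExample")

# Load LIBSVM-formatted data into an RDD of LabeledPoint (path is a placeholder).
data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a classifier; an empty categoricalFeaturesInfo means all features are continuous.
model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=3, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=32)

# Persist the trained model, then reload it the way most examples below do.
model.save(sc, "target/tmp/myRandomForestModel")
sameModel = RandomForestModel.load(sc, "target/tmp/myRandomForestModel")

# Apply the reloaded model to the held-out features.
predictions = sameModel.predict(testData.map(lambda x: x.features))
print(predictions.take(5))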

Example 1: load_parameters

    def load_parameters(self):
        self.amount_prediction_method = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL,
                                                                 file_name='amount_method')
        self.trend_prediction_method = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL,
                                                                file_name='trend_method')
        self.data_features = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='features')
        self.stock_symbol = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='symbol')
        self.data_parser = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='data_parser')
        amount_model_path = os.path.join(os.path.abspath(self.model_path), 'amount_model')
        trend_model_path = os.path.join(os.path.abspath(self.model_path), 'trend_model')

        if self.amount_prediction_method == self.RANDOM_FOREST:
            amount_model = RandomForestModel.load(sc=self.sc, path=amount_model_path)
        elif self.amount_prediction_method == self.LINEAR_REGRESSION:
            amount_model = LinearRegressionModel.load(sc=self.sc, path=amount_model_path)
        else:
            amount_model = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='amount_model')

        if self.trend_prediction_method == self.RANDOM_FOREST:
            trend_model = RandomForestModel.load(sc=self.sc, path=trend_model_path)
        elif self.trend_prediction_method == self.LOGISTIC_REGRESSION:
            trend_model = LogisticRegressionModel.load(sc=self.sc, path=trend_model_path)
        elif self.trend_prediction_method == self.NAIVE_BAYES:
            trend_model = NaiveBayesModel.load(sc=self.sc, path=trend_model_path)
        elif self.trend_prediction_method == self.SVM:
            trend_model = SVMModel.load(sc=self.sc, path=trend_model_path)
        else:
            trend_model = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='trend_model')

        return trend_model, amount_model
Author: WarnWang, Project: Dissertation, Lines: 30, Source: composition_prediction_system.py

Example 2: evaluate_model

def evaluate_model(type):
    if type == 'logistic':
        model = LogisticRegressionModel.load(sc, "logit_model.model")
    elif type == 'tree':
        model = DecisionTreeModel.load(sc, "dt_model.model")
    elif type == 'rf':
        model = RandomForestModel.load(sc, "rf_model.model")
Author: ayushsagar, Project: big-data-analytics, Lines: 7, Source: score.py

Example 3: main

def main(sc, filename):
    '''
    The driver for the spark scoring application, it generates predictions for
    a given file of features and target variables
    '''

    rawDataRdd = sc.textFile(filename)
    print "Data Size: {}".format(rawDataRdd.count())

    labeledPointsRdd = rawDataRdd.map(parse_lines)

    #load models
    logit_model = LogisticRegressionModel.load(sc, "logit_model.model")
    dt_model = DecisionTreeModel.load(sc, "dt_model.model")
    rf_model = RandomForestModel.load(sc, "rf_model.model")

    #logistic predictions
    labels_and_preds = labeledPointsRdd.map(lambda p: (float(logit_model.predict(p.features)), p.label  ))
    labels_and_preds_collected = labels_and_preds.collect()
    print "\n"
    print "Predictions: Logistic Regression"
    y_true = []
    y_pred = []
    for row in labels_and_preds_collected:
        y_true.append(row[1])
        y_pred.append(row[0])
        # print "predicted: {0} - actual: {1}\n".format(row[0], row[1])


    accuracy = labels_and_preds.filter(lambda (v,p): v == p).count() / float(labeledPointsRdd.count())

    print_box()
    print "Prediction Accuracy (Logistic): {}".format(round(accuracy, 4))
    print_box()
    print "\n"

    #decision tree predictions
    predictions = dt_model.predict(labeledPointsRdd.map(lambda p: p.features))
    labels_and_preds_dt = labeledPointsRdd.map(lambda p: p.label).zip(predictions)
    labels_and_preds_dt_collected = labels_and_preds_dt.collect()


    accuracy_dt = labels_and_preds_dt.filter(lambda (v, p): v == p).count() / float(labeledPointsRdd.count())

    print_box()
    print "Prediction Accuracy (Decision Tree): {}".format(round(accuracy_dt, 4))
    print_box()
    print "\n"

    #random forest predictions
    predictions_rf = rf_model.predict(labeledPointsRdd.map(lambda p: p.features))
    labels_and_preds_rf = labeledPointsRdd.map(lambda p: p.label).zip(predictions_rf)
    accuracy_rf = labels_and_preds_rf.filter(lambda (v, p): v == p).count() / float(labeledPointsRdd.count())
    print_box()
    print "Prediction Accuracy (Random Forest): {}".format(round(accuracy_rf, 4))
    print_box()
Author: ayushsagar, Project: big-data-analytics, Lines: 56, Source: score.py

Example 4: main

def main():
    parser = OptionParser()
    parser.add_option('', '--enriched_data_path', action='store', dest='enriched_data_path', help='path to write enriched data')
    parser.add_option('', '--model_path', action='store', dest='model_path', help='path for model data')
    parser.add_option('', '--kafka_zookeeper_hosts', action='store', dest='kafka_zookeeper_hosts', help='list of Zookeeper hosts (host:port)')
    parser.add_option('', '--kafka_broker_list', action='store', dest='kafka_broker_list', help='list of Kafka brokers (host:port)')
    parser.add_option('', '--kafka_message_topic', action='store', dest='kafka_message_topic', help='topic to consume input messages from')
    parser.add_option('', '--kafka_alert_topic', action='store', dest='kafka_alert_topic', help='topic to produce alert messages to')
    parser.add_option('', '--kafka_enriched_data_topic', action='store', dest='kafka_enriched_data_topic', help='topic to produce enriched data to')
    parser.add_option('', '--streaming_batch_duration_sec', type='float', default=15.0,
        action='store', dest='streaming_batch_duration_sec', help='Streaming batch duration in seconds')
    parser.add_option('', '--max_batches', type='int', default=0,
        action='store', dest='max_batches', help='Number of batches to process (0 means forever)')
    options, args = parser.parse_args()

    sc = SparkContext()
    ssc = StreamingContext(sc, options.streaming_batch_duration_sec)
    sqlContext = getSqlContextInstance(sc)

    # Load saved model.
    model = None
    if options.model_path:
        model = RandomForestModel.load(sc, options.model_path)
    else:
        print('No model loaded.')

    # Create Kafka stream to receive new messages.
    kvs = KafkaUtils.createDirectStream(ssc, [options.kafka_message_topic], {
        'metadata.broker.list': options.kafka_broker_list,
        'group.id': 'spark_streaming_processor.py'})

    # Take only the 2nd element of the tuple.
    messages = kvs.map(lambda x: x[1])

    # Convert RDD of JSON strings to RDD of Rows.
    rows = messages.map(json_to_row)

    # Process messages.
    rows.foreachRDD(lambda time, rdd: 
        process_messages(time, rdd,
            ssc=ssc,
            model=model,
            enriched_data_path=options.enriched_data_path,
            zookeeper_hosts=options.kafka_zookeeper_hosts,
            kafka_alert_topic=options.kafka_alert_topic,
            kafka_enriched_data_topic=options.kafka_enriched_data_topic,
            max_batches=options.max_batches))

    ssc.start()
    ssc.awaitTermination()
Author: claudiofahey, Project: global_anomaly_detection_demo, Lines: 50, Source: spark_streaming_processor.py

Example 5: test

def test(sc):
    files = ["sounds/flushing/20150227_193109-flushing-04.wav",
             "sounds/bike/20150227_193806-bici-14.wav",
             "sounds/blender/20150227_193606-licuadora-14.wav"
             ]

    rfmodel = RandomForestModel.load(sc, RF_PATH)
    dtmodel = DecisionTreeModel.load(sc, DT_PATH)

    print dtmodel.toDebugString()
    for f in files:
        vec = audio.showFeatures(f)
        testfeatures = Vectors.dense([float(x) for x in vec.split(' ')])
        print(vec)
        pred = dtmodel.predict(testfeatures)
        print("DT Prediction is " + str(pred), classes[int(pred)])
        pred = rfmodel.predict(testfeatures)
        print("RF Prediction is " + str(pred), classes[int(pred)])
Author: LoadedCoders, Project: iHear, Lines: 18, Source: main.py

Example 6: predict

def predict(sc, data):

    #sc = SparkContext(appName="PythonRandomForestClassificationExample")
    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    #data = MLUtils.loadLibSVMFile(sc, 'deathproject/test1.txt')
    #data = [1, 0, 5, 1, 1, 65, 1, 2, 0, 1, 0, 6, 3450]
    # Split the data into training and test sets (30% held out for testing)
    # (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose.
    print('Starting...')

    #model = RandomForest.trainClassifier(trainingData, numClasses=8, categoricalFeaturesInfo={},
    #                                     numTrees=5, featureSubsetStrategy="auto",
    #                                     impurity='gini', maxDepth=4, maxBins=32)



    sameModel = RandomForestModel.load(sc, "target/tmp/myRandomForestClassificationModel2")

    print('Predicting...')
    # Evaluate model on test instances and compute test error
   
    #predictions = sameModel.predict(data.map(lambda x: x.features))
    predictions = sameModel.predict(data)

    #labelsAndPredictions = data.map(lambda lp: lp.label).zip(predictions)
    #testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
    #print('Test Error = ' + str(testErr))
    #print('Learned classification forest model:')
    #print(sameModel.toDebugString())
    print(predictions)
    return int(predictions)
Author: adarsh-murthy, Project: AdvancedBigDataProject, Lines: 37, Source: pf.py

Example 7: get_rfc_model

def get_rfc_model(filename, bucket, client, sc):

    # get download_url
    objects_dict = client.list_objects(Bucket=BUCKET_NAME)
    filenames = map(lambda x: x["Key"], objects_dict["Contents"])
    filenames = filter(lambda x: True if re.match(filename + '-v' + "[0-9]+", x) else False, filenames)
    if len(filenames) == 0:
        print "NO RFC MODELS FOUND"
        return False
    else:
        versions = map(lambda x: int(re.search("[0-9]+", x).group()), filenames)
        download_url = filename + "-v" + str(max(versions)) + '.zip'

    # download zip and unzip it
    local_url = '/tmp/' + download_url
    os.system("rm -r " + local_url) # remove zip just in case
    os.system("rm -r " + local_url[:-4]) # remove unzipped folder just in case
    if download_file(download_url, bucket):
        with zipfile.ZipFile(local_url, "r") as z:
            z.extractall(local_url[:-4])
        return RandomForestModel.load(sc, local_url[:-4]) #-4 for zip
    else:
        print "ZIP DOWNLOAD FAILED"
        return False
Author: TimothySeah, Project: PhotoClassifierBackend, Lines: 24, Source: constants.py

Example 8: run

def run(jobNm, sc, sqlContext, inputFile, dictFile,
        bByDate=False,
        inputPartitions=-1,
        sNum=30,
        modelPath=None,
        bWriteMonitor=False,
        writeFileOutput=False):

    # import monitoring if needed
    if bWriteMonitor==True:
        import plotting

    #Create monitoring plot and associated vectors
    mPX = range(8)
    mPY = [0.]*8
    mSL = ["Create Feature Map", "Read in Data", "Aggregate for M.L.", "Read in Model", "Apply Model", "Output Results"]
    mInd = 0

    t0 = time.time()
    #Find the word document frequency for the corpus
    #this is used for an idf score used in feature vector formation
    t1 = time.time()
    revLookup = []
    fDict = None
    if dictFile[:3] == 's3:' or dictFile[:5] == 'hdfs:':
        # read dict file from hdfs
        fDict = sc.textFile(dictFile).collect()
    else:
        # read from local file
        fDict = open(dictFile,"r")
    for line in fDict:
        terms = line.split("\t")
        revLookup.append(terms[0])

    nVecLen = len(revLookup)
    t2 = time.time()
    diff = t2-t1
    print "Time to read dict", diff

    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Read in data and filter out entries with no valid words
    t1 = time.time()
    print 'inputFile ',inputFile
    print 'inputPartitions ',inputPartitions
    records = aggregatedComparison.loadPoint(sc, sqlContext, inputFile, inputPartitions)
    nGoodTweets = records.count()
    t2 = time.time()
    print "Number of good tweets:",nGoodTweets
    diff = t2-t1
    print "Time to read in data", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Format data for ML input
    t1 = time.time()
    mlApply = None
    if bByDate:
        mlApply = records.map(lambda x: (x.key, [LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize, x.dt])).cache()
    else:
        mlApply = records.map(lambda x: (x.key, [LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    nApp = mlApply.count()
    t2 = time.time()
    print "Number of collapsed points:", nApp
    diff = t2-t1
    print "Time to map points", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Read in Model
    t1 = time.time()
    model_Tree = RandomForestModel.load(sc, modelPath)
    t2 = time.time()
    diff = t2-t1
    print "Time to read in model", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # apply model
    t1 = time.time()
    predictions_Tree = model_Tree.predict(mlApply.map(lambda x: x[1][0].features))
    vecAndPredictions = mlApply.zip(predictions_Tree)
    vecAndPredictions.cache()
    vecAndPredictions.count()
    t2 = time.time()
    diff = t2-t1
    print "Time to apply model: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)
#......... part of the code omitted here .........
Author: theseusyang, Project: GEQE, Lines: 101, Source: refindSimilarPlaces.py

Example 9: applyModel


#......... part of the code omitted here .........

    print('Loaded and prepared %d entries' % df.count())

    #########
    # keep only needed features
    #########   

    features = ['ADLOADINGTIME',
     'PLACEMENTID',
     'TIMESTAMP',
     'CREATIVETYPE',
     'UA_HARDWARETYPE',
     'UA_VENDOR',
     'UA_MODEL',
     'UA_BROWSER',
     'UA_BROWSERVERSION',
     'FILESJSON',
     'ERRORSJSON',
     'TOPMOSTREACHABLEWINDOWAREA',
     'FILESJSON_SIZE',
     'COMBINEDID',
     'COMBINEDEXTERNALID',
     'PLATFORMCOMBINED',
     'UA_OSCOMB',
     'SDK',
     'EXTERNALADSERVER'
       ]

    df = df.select(features)

    #########
    # Convert categorical features to numerical
    #########   


    featuresCat = [
     'PLACEMENTID',
     'CREATIVETYPE',
     'UA_HARDWARETYPE',
     'UA_VENDOR',
     'UA_MODEL',
     'UA_BROWSER',
     'UA_BROWSERVERSION',
     'FILESJSON',
     'ERRORSJSON',
     'COMBINEDID',
     'COMBINEDEXTERNALID',
     'PLATFORMCOMBINED',
     'UA_OSCOMB',
     'SDK',
     'EXTERNALADSERVER'
       ]

    for i in range(len(featuresCat)):

        indexer = StringIndexer(inputCol=featuresCat[i], outputCol='_'+featuresCat[i]).setHandleInvalid("skip").fit(df)
        df = indexer.transform(df).drop(featuresCat[i])
        writer = indexer._call_java("write")
        writer.overwrite().save("indexer_" + featuresCat[i])    

    featuresCat = [ '_' + featuresCat[i] for i in range(len(featuresCat))]    

    features = featuresCat[:]
    features.append('TIMESTAMP')    
    features.append('FILESJSON_SIZE')
    features.append('TOPMOSTREACHABLEWINDOWAREA')


    #########
    # Assemble features
    #########   


    assembler = VectorAssembler(
        inputCols=features,
        outputCol="features")

    df = assembler.transform(df)

    #########
    # Convert to labeled point
    #########   


    lp = (df.select(func.col("ADLOADINGTIME").alias("label"), func.col("features"))
      .map(lambda row: LabeledPoint(row.label, row.features)))
    lp.cache()


    #########
    # Load trained model
    #########
    
    model = RandomForestModel.load(sc, loadModelName)
    
    print('Model loaded!')
    
    predictions = model.predict(lp.map(lambda x: x.features)).collect()
    
    return predictions
Author: timjerman, Project: AdLoadingMiner, Lines: 101, Source: applyModelSpark.py

Example 10: SparkContext

if __name__ == "__main__":
    sc = SparkContext(appName="PythonRandomForestRegressionExample")
    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                        numTrees=3, featureSubsetStrategy="auto",
                                        impurity='variance', maxDepth=4, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\
        float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression forest model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myRandomForestRegressionModel")
    sameModel = RandomForestModel.load(sc, "target/tmp/myRandomForestRegressionModel")
    # $example off$
Author: lhfei, Project: spark-in-action, Lines: 29, Source: random_forest_regression_example.py

Example 11: open

  dict_add = pickle.load(handle)

with open('dict_city.pickle', 'rb') as handle:
  dict_city = pickle.load(handle)

with open('dict_fac.pickle', 'rb') as handle:
  dict_fac = pickle.load(handle)

with open('dict_status.pickle', 'rb') as handle:
  dict_status = pickle.load(handle)

with open('dict_bor.pickle', 'rb') as handle:
  dict_bor = pickle.load(handle)


model = RandomForestModel.load(sc, "modelCritical")

predictList = []

f = open("outFile3.csv", 'rU')

for line in csv.reader(f, delimiter=","):
	predictRow = list(line)
'''
agency = predictRow[agency_index]
complaint = predictRow[comp_index]
location = predictRow[loc_index]
incident = predictRow[incident_index]
address = predictRow[add_index]
city = predictRow[city_index]
facility = predictRow[fac_index]
Author: aishwarya-r-g, Project: NYC-311, Lines: 31, Source: predicter.py

Example 12: test_classification

    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
            RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        temp_dir = tempfile.mkdtemp()

        lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd, iterations=10)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        dt_model_dir = os.path.join(temp_dir, "dt")
        dt_model.save(self.sc, dt_model_dir)
        same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
        self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString())

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
            maxBins=4, seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        rf_model_dir = os.path.join(temp_dir, "rf")
        rf_model.save(self.sc, rf_model_dir)
        same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
        self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString())

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        gbt_model_dir = os.path.join(temp_dir, "gbt")
        gbt_model.save(self.sc, gbt_model_dir)
        same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
        self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString())

        try:
            rmtree(temp_dir)
        except OSError:
            pass
Author: HodaAlemi, Project: spark, Lines: 75, Source: tests.py

Example 13: writeLumbarTrainingReadings

def writeLumbarTrainingReadings(time, rddTraining):
	try:
		# Convert RDDs of the words DStream to DataFrame and run SQL query
		connectionProperties = MySQLConnection.getDBConnectionProps('/home/erik/mysql_credentials.txt')
		sqlContext = SQLContext(rddTraining.context)
		if rddTraining.isEmpty() == False:
			lumbarTrainingReading = sqlContext.jsonRDD(rddTraining)
			lumbarTrainingReadingFinal = lumbarTrainingReading.selectExpr("deviceID","metricTypeID","uomID","positionID","actual.y AS actualYaw","actual.p AS actualPitch","actual.r AS actualRoll","setPoints.y AS setPointYaw","setPoints.p AS setPointPitch","setPoints.r AS setPointRoll")
			lumbarTrainingReadingFinal.write.jdbc("jdbc:mysql://localhost/biosensor", "SensorTrainingReadings", properties=connectionProperties)
	except:
		pass
		
if __name__ == "__main__":
	sc = SparkContext(appName="Process Lumbar Sensor Readings")
	ssc = StreamingContext(sc, 2) # 2 second batches
	loadedModel = RandomForestModel.load(sc, "../machine_learning/models/IoTBackBraceRandomForest.model")

	#Process Readings
	streamLumbarSensor = KafkaUtils.createDirectStream(ssc, ["LumbarSensorReadings"], {"metadata.broker.list": "localhost:9092"})
	lineSensorReading = streamLumbarSensor.map(lambda x: x[1])
	lineSensorReading.foreachRDD(writeLumbarReadings)
	
	#Process Training Readings
	streamLumbarSensorTraining = KafkaUtils.createDirectStream(ssc, ["LumbarSensorTrainingReadings"], {"metadata.broker.list": "localhost:9092"})
	lineSensorTrainingReading = streamLumbarSensorTraining.map(lambda x: x[1])
	lineSensorTrainingReading.foreachRDD(writeLumbarTrainingReadings)

	# Run and then wait for termination signal
	ssc.start()
	ssc.awaitTermination()
 
Author: kringen, Project: IOT-Back-Brace, Lines: 30, Source: ProcessSensorReadings.py

示例14: print

    from pyspark.mllib.linalg import Vectors

    print ("Successfully imported Spark Modules")
except ImportError as e:
    print ("Can not import Spark Modules", e)
    sys.exit(1)

import functools
import itertools
from kafka import KafkaConsumer

def parseData(line):
    splittedLine = line.split(",")
    values = [float(s) for s in splittedLine[4:-1]]
    label = splittedLine[-1]
    featuresVector = Vectors.dense(values)
    return LabeledPoint(label, featuresVector)

if __name__ == "__main__":
    conf = SparkConf().setAppName("RandomForest_Anomaly_Detection_Kafka_Consumer")
    sc = SparkContext(conf=conf)
    savedModel = RandomForestModel.load(sc, "../train_model/model")
    consumer = KafkaConsumer('test', group_id='my_group', bootstrap_servers=['localhost:9092'])
    print("Waiting for messages...")
    for message in consumer:
    	print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition, message.offset, message.key, message.value))
        data = sc.parallelize([message.value])
        testData = data.map(parseData)
        predictions = savedModel.predict(testData.map(lambda x: x.features))
        print("Prediction: ")
        print(predictions.first())
Author: MarioPerezEsteso, Project: Network-Anomaly-Detection-Apache-Spark-Kafka, Lines: 31, Source: kafkaconsumerrandomforest.py

Example 15: SparkContext

from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import MLUtils

sc = SparkContext("local", "Amadeus Random Forest Run", pyFiles=['run_model.py'])

data = sc.textFile("data_timbre.csv")
zipped_data = data.zipWithIndex()
keyed_data = zipped_data.map(lambda line: (line[-1], line[:-1]))

target = sc.textFile("target_timbre.csv")
zipped_target = target.zipWithIndex()
keyed_target = zipped_target.map(lambda line: (line[-1], line[:-1]))

target_data = keyed_data.join(keyed_target)
labled_point_data = target_data.map(lambda tup: LabeledPoint(tup[1][1][0], tup[1][0][0].split(',')))

# Split the data into training and test sets (30% held out for testing)
print("Creating Training and Test Data Split")
(trainingData, testData) = labled_point_data.randomSplit([0.7, 0.3])

model = RandomForestModel.load(sc, "myRFModel")

predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))

testAccuracy = labelsAndPredictions.map(lambda (v, p): 1 if (abs(v - p) < 10) else 0).sum() / float(testData.count())
print('Total Accuracy = ' + str(testAccuracy))
Author: kcparashar, Project: amadeus, Lines: 30, Source: run_model.py


Note: The pyspark.mllib.tree.RandomForestModel class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors, and any distribution or use should follow the corresponding project's License. Do not reproduce without permission.