This article collects typical usage examples of the Python method disco.core.Job.pipeline. If you are unsure what Job.pipeline does or how to use it, the curated code samples below should help; you can also explore further examples for the containing class, disco.core.Job.
15 code examples of Job.pipeline are shown below, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Python code samples.
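Before the individual examples, here is a minimal sketch of the pattern they all share: create a Job with the pipeline Worker, assign job.pipeline a list of (grouping, Stage) tuples, run the job, and read results back with result_iterator. The stage functions, their (interface, state, label, inp) signature, and the tag://example_data input are illustrative assumptions that follow the conventions visible in the examples below, not code from any specific library module; a running Disco master is assumed.

from disco.core import Job, result_iterator
from disco.worker.pipeline.worker import Worker, Stage

def sketch_map(interface, state, label, inp):
    # emit a (word, 1) pair for every whitespace-separated token in the input
    out = interface.output(0)
    for line in inp:
        for word in line.split():
            out.add(word, 1)

def sketch_reduce(interface, state, label, inp):
    # sum the sorted (word, count) pairs produced by the map stage
    totals = {}
    for word, count in inp:
        totals[word] = totals.get(word, 0) + count
    out = interface.output(0)
    for word, total in totals.items():
        out.add(word, total)

job = Job(worker=Worker(save_results=True))  # save_results=True persists output to DDFS
job.pipeline = [("split", Stage("map", process=sketch_map)),
                ("group_all", Stage("reduce", process=sketch_reduce, sort=True, combine=True))]
job.run(name="pipeline_sketch", input=["tag://example_data"])  # placeholder input tag
for word, total in result_iterator(job.wait(show=False)):
    print(word, total)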
Example 1: measure
# Required imports: from disco.core import Job [as alias]
# Or: from disco.core.Job import pipeline [as alias]
def measure(test_data, predictions, measure="ca", save_results=True, show=False):
from disco.worker.pipeline.worker import Worker, Stage
from disco.core import Job, result_iterator
from disco.worker.task_io import task_input_stream, chain_reader
if measure not in ["ca", "mse"]:
raise Exception("measure should be ca or mse.")
if test_data.params["id_index"] == -1:
raise Exception("ID index should be defined.")
if predictions == []:
return "No predictions", None
# define a job and set save of results to ddfs
    job = Job(worker=Worker(save_results=save_results))
job.pipeline = [
("split", Stage("map", input_chain=test_data.params["input_chain"], init=simple_init, process=map_test_data))]
job.params = test_data.params
job.run(name="ma_parse_testdata", input=test_data.params["data_tag"])
parsed_testdata = job.wait(show=show)
    reduce_process = reduce_ca if measure == "ca" else reduce_mse
    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split", Stage("map", init=simple_init, input_chain=[task_input_stream, chain_reader],
                                    process=map_predictions)),
                    ('group_all', Stage("reduce", init=simple_init, process=reduce_process, sort=True, combine=True))]
job.run(name="ma_measure_accuracy", input=parsed_testdata + predictions)
    measure, acc = list(result_iterator(job.wait(show=show)))[0]
return measure, acc
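The reducers referenced here (reduce_ca and reduce_mse) are not shown on this page. As a purely local illustration of what the two measures compute once true labels and predictions have been joined by sample id, here is a plain-Python sketch; the function names and the (truth, prediction) pairing are assumptions for the example, not discomll code.

def classification_accuracy(pairs):
    # pairs: iterable of (true_label, predicted_label)
    pairs = list(pairs)
    correct = sum(1 for truth, pred in pairs if truth == pred)
    return float(correct) / len(pairs)

def mean_squared_error(pairs):
    # pairs: iterable of (true_value, predicted_value)
    pairs = list(pairs)
    return sum((float(truth) - float(pred)) ** 2 for truth, pred in pairs) / len(pairs)

print(classification_accuracy([("a", "a"), ("a", "b"), ("b", "b")]))  # -> 0.666...
print(mean_squared_error([(1.0, 1.5), (2.0, 2.0)]))                   # -> 0.125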
Example 2: predict
# Required imports: from disco.core import Job [as alias]
# Or: from disco.core.Job import pipeline [as alias]
def predict(dataset, fitmodel_url, save_results=True, show=False):
"""
    Function starts a job that makes predictions on input data with a given model.
    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
fitmodel_url - model created in fit phase
save_results - save results to ddfs
show - show info about job execution
Returns
-------
Urls with predictions on ddfs
"""
from disco.worker.pipeline.worker import Worker, Stage
from disco.core import Job, result_iterator
if "linsvm_fitmodel" not in fitmodel_url:
raise Exception("Incorrect fit model.")
job = Job(worker=Worker(save_results=save_results))
# job parallelizes execution of mappers
job.pipeline = [
("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))]
job.params = dataset.params
job.params["fit_params"] = [v for _, v in result_iterator(fitmodel_url["linsvm_fitmodel"])][0]
job.run(name="linsvm_predict", input=dataset.params["data_tag"])
return job.wait(show=show)
Example 3: fit
# Required imports: from disco.core import Job [as alias]
# Or: from disco.core.Job import pipeline [as alias]
def fit(dataset, save_results=True, show=False):
"""
Function builds a model for Naive Bayes. It executes multiple map functions and one reduce function which aggregates intermediate results and returns a model.
Parameters
----------
    dataset - dataset object with input urls and other parameters
save_results - save results to ddfs
show - show info about job execution
Returns
-------
Urls of fit model results on ddfs
"""
from disco.worker.pipeline.worker import Worker, Stage
from disco.core import Job
# define a job and set save of results to ddfs
job = Job(worker=Worker(save_results=save_results))
# job parallelizes mappers, sorts intermediate pairs and joins them with one reducer
job.pipeline = [
("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_fit)),
('group_all', Stage("reduce", init=simple_init, process=reduce_fit, sort=True, combine=True))]
job.params = dataset.params # job parameters (dataset object)
# define name of a job and input data urls
job.run(name="naivebayes_fit", input=dataset.params["data_tag"])
fitmodel_url = job.wait(show=show)
return {"naivebayes_fitmodel": fitmodel_url} # return results url
Example 4: predict
# Required imports: from disco.core import Job [as alias]
# Or: from disco.core.Job import pipeline [as alias]
def predict(dataset, fitmodel_url, voting=False, save_results=True, show=False):
from disco.worker.pipeline.worker import Worker, Stage
from disco.core import Job, result_iterator
import discomll
path = "/".join(discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])
if "drf_fitmodel" not in fitmodel_url:
raise Exception("Incorrect fit model.")
job = Job(worker=Worker(save_results=save_results))
job.pipeline = [("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init,
process=map_predict_voting if voting else map_predict_dist))]
job.params = dataset.params
for k, v in result_iterator(fitmodel_url["drf_fitmodel"]):
job.params[k] = v
if len(job.params["forest"]) == 0:
print "Warning: There is no decision trees in forest"
return []
job.run(name="distributed_random_forest_predict", input=dataset.params["data_tag"],
required_files=[path + "decision_tree.py"])
return job.wait(show=show)
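The two mappers selected by the voting flag (map_predict_voting and map_predict_dist) are not shown on this page. Conceptually, voting takes the majority class over the trees' individual predictions, while the distribution variant accumulates the trees' class-probability estimates; the following library-free sketch only illustrates that difference, and its names and data layout are assumptions:

from collections import Counter

def predict_by_voting(tree_predictions):
    # tree_predictions: one predicted class label per tree
    return Counter(tree_predictions).most_common(1)[0][0]

def predict_by_distribution(tree_distributions):
    # tree_distributions: one {class: probability} dict per tree
    totals = {}
    for dist in tree_distributions:
        for label, p in dist.items():
            totals[label] = totals.get(label, 0.0) + p
    return max(totals, key=totals.get)

print(predict_by_voting(["a", "b", "a"]))                                     # -> a
print(predict_by_distribution([{"a": 0.4, "b": 0.6}, {"a": 0.9, "b": 0.1}]))  # -> a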
Example 5: predict
# Required imports: from disco.core import Job [as alias]
# Or: from disco.core.Job import pipeline [as alias]
def predict(dataset, fitmodel_url, coeff=0.5, save_results=True, show=False):
from disco.worker.pipeline.worker import Worker, Stage
from disco.core import Job, result_iterator
import discomll
path = "/".join(discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])
job = Job(worker=Worker(save_results=save_results))
job.pipeline = [
("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))]
if "dwfr_fitmodel" not in fitmodel_url:
raise Exception("Incorrect fit model.")
try:
coeff = float(coeff)
if coeff < 0:
raise Exception("Parameter coeff should be greater than 0.")
except ValueError:
raise Exception("Parameter coeff should be numerical.")
job.params = dataset.params
job.params["coeff"] = coeff
for k, v in result_iterator(fitmodel_url["dwfr_fitmodel"]):
job.params[k] = v
if len(job.params["forest"]) == 0:
print "Warning: There is no decision trees in forest"
return []
job.run(name="distributed_weighted_forest_rand_predict", input=dataset.params["data_tag"],
required_files=[path + "decision_tree.py"])
return job.wait(show=show)
Example 6: fit_predict
# Required imports: from disco.core import Job [as alias]
# Or: from disco.core.Job import pipeline [as alias]
def fit_predict(training_data, fitting_data, tau=1, samples_per_job=0, save_results=True, show=False):
    """
    training_data - training samples
    fitting_data - dataset to be fitted to the training data
    tau - controls how quickly the weight of a training sample falls off with the distance of its x(i) from the query point x
    samples_per_job - number of samples processed in a single MapReduce job; if 0, the algorithm calculates the number of samples per job
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.core import Disco
try:
tau = float(tau)
if tau <= 0:
raise Exception("Parameter tau should be >= 0.")
except ValueError:
raise Exception("Parameter tau should be numerical.")
if fitting_data.params["id_index"] == -1:
raise Exception("Predict data should have id_index set.")
job = Job(worker=Worker(save_results=save_results))
job.pipeline = [
("split", Stage("map", input_chain=fitting_data.params["input_chain"], init=simple_init, process=map_predict))
]
job.params = fitting_data.params
job.run(name="lwlr_read_data", input=fitting_data.params["data_tag"])
samples = {}
results = []
tau = float(2 * tau ** 2) # calculate tau once
counter = 0
for test_id, x in result_iterator(job.wait(show=show)):
        if samples_per_job == 0:
            # calculate the number of samples per job
            if len(x) <= 100:  # if there are 100 or fewer attributes
                samples_per_job = 100  # at most 100 samples per job
            else:
                # more than 100 attributes: shrink the batch linearly with the attribute count
                samples_per_job = int(len(x) * -25 / 900.0 + 53)  # integer so the counter comparison can match
samples[test_id] = x
if counter == samples_per_job:
results.append(_fit_predict(training_data, samples, tau, save_results, show))
counter = 0
samples = {}
counter += 1
    if len(samples) > 0:  # if there are samples left in the dictionary
results.append(_fit_predict(training_data, samples, tau, save_results, show))
# merge results of every iteration into a single tag
ddfs = Disco().ddfs
ddfs.tag(job.name, [[list(ddfs.blobs(tag))[0][0]] for tag in results])
return ["tag://" + job.name]
Example 7: predict
# Required imports: from disco.core import Job [as alias]
# Or: from disco.core.Job import pipeline [as alias]
def predict(dataset, fitmodel_url, m=1, save_results=True, show=False):
"""
    Function starts a job that makes predictions on input data with a given model.
    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
fitmodel_url - model created in fit phase
m - m estimate is used with discrete features
save_results - save results to ddfs
show - show info about job execution
Returns
-------
Urls of predictions on ddfs
"""
from disco.worker.pipeline.worker import Worker, Stage
from disco.core import Job, result_iterator
import numpy as np
try:
m = float(m)
except ValueError:
raise Exception("Parameter m should be numerical.")
if "naivebayes_fitmodel" in fitmodel_url:
# fit model is loaded from ddfs
fit_model = dict((k, v) for k, v in result_iterator(fitmodel_url["naivebayes_fitmodel"]))
if len(fit_model["y_labels"]) < 2:
print "There is only one class in training data."
return []
else:
raise Exception("Incorrect fit model.")
if dataset.params["X_meta"].count("d") > 0: # if there are discrete features in the model
        # logarithms are precomputed here to optimize the predict phase, rather than being calculated by every mapper
np.seterr(divide='ignore')
for iv in fit_model["iv"]:
dist = [fit_model.pop((y,) + iv, 0) for y in fit_model["y_labels"]]
fit_model[iv] = np.nan_to_num(
np.log(np.true_divide(np.array(dist) + m * fit_model["prior"], np.sum(dist) + m))) - fit_model[
"prior_log"]
del (fit_model["iv"])
# define a job and set save of results to ddfs
job = Job(worker=Worker(save_results=save_results))
# job parallelizes execution of mappers
job.pipeline = [
("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))]
job.params = dataset.params # job parameters (dataset object)
job.params["fit_model"] = fit_model
# define name of a job and input data urls
job.run(name="naivebayes_predict", input=dataset.params["data_tag"])
results = job.wait(show=show)
return results
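The loop over fit_model["iv"] above turns the raw per-class counts of each discrete feature value into smoothed log-probabilities with the m-estimate, roughly log((count + m * prior) / (total + m)) minus the log prior, so the mappers only need to add precomputed terms. A standalone NumPy sketch of that transformation; the function and variable names are illustrative, not discomll's:

import numpy as np

def m_estimate_log_probs(counts, prior, m):
    # counts: per-class counts of one discrete feature value
    # prior: per-class prior probabilities, m: smoothing strength
    counts = np.asarray(counts, dtype=float)
    prior = np.asarray(prior, dtype=float)
    smoothed = (counts + m * prior) / (counts.sum() + m)
    return np.log(smoothed) - np.log(prior)

print(m_estimate_log_probs([3, 1], prior=[0.5, 0.5], m=2.0))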
Example 8: fit
# Required imports: from disco.core import Job [as alias]
# Or: from disco.core.Job import pipeline [as alias]
def fit(dataset, alpha=1e-8, max_iterations=10, save_results=True, show=False):
"""
Function starts a job for calculation of theta parameters
Parameters
----------
    dataset - dataset object with input urls and other parameters
    alpha - convergence threshold
    max_iterations - maximum number of iterations
save_results - save results to ddfs
show - show info about job execution
Returns
-------
Urls of fit model results on ddfs
"""
from disco.worker.pipeline.worker import Worker, Stage
from disco.core import Job, result_iterator
import numpy as np
if dataset.params["y_map"] == []:
raise Exception("Logistic regression requires a target label mapping parameter.")
try:
alpha = float(alpha)
max_iterations = int(max_iterations)
if max_iterations < 1:
raise Exception("Parameter max_iterations should be greater than 0.")
except ValueError:
raise Exception("Parameters should be numerical.")
# initialize thetas to 0 and add intercept term
thetas = np.zeros(len(dataset.params["X_indices"]) + 1)
J = [0] # J cost function values for every iteration
for i in range(max_iterations):
job = Job(worker=Worker(save_results=save_results))
# job parallelizes mappers and joins them with one reducer
job.pipeline = [
("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_fit)),
('group_all', Stage("reduce", init=simple_init, process=reduce_fit, combine=True))]
job.params = dataset.params # job parameters (dataset object)
job.params["thetas"] = thetas # every iteration set new thetas
job.run(name="logreg_fit_iter_%d" % (i + 1), input=dataset.params["data_tag"])
fitmodel_url = job.wait(show=show)
for k, v in result_iterator(fitmodel_url):
if k == "J": #
J.append(v) # save value of J cost function
else:
thetas = v # save new thetas
if np.abs(J[-2] - J[-1]) < alpha: # check for convergence
if show:
print("Converged at iteration %d" % (i + 1))
break
return {"logreg_fitmodel": fitmodel_url} # return results url
Example 9: fit
# Required imports: from disco.core import Job [as alias]
# Or: from disco.core.Job import pipeline [as alias]
def fit(dataset, save_results=True, show=False):
from disco.worker.pipeline.worker import Worker, Stage
from disco.core import Job
job = Job(worker=Worker(save_results=save_results))
job.pipeline = [
("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_fit)),
('group_all', Stage("reduce", init=simple_init, process=reduce_fit, combine=True))]
job.params = dataset.params
job.run(name="linreg_fit", input=dataset.params["data_tag"])
fitmodel_url = job.wait(show=show)
return {"linreg_fitmodel": fitmodel_url} # return results url
Example 10: fit
# Required imports: from disco.core import Job [as alias]
# Or: from disco.core.Job import pipeline [as alias]
def fit(dataset, trees_per_chunk=1, bootstrap=True, max_tree_nodes=50, min_samples_leaf=10, min_samples_split=5,
class_majority=1, separate_max=True, measure="info_gain", accuracy=1, random_state=None, save_results=True,
show=False):
from disco.worker.pipeline.worker import Worker, Stage
from disco.core import Job
import discomll
path = "/".join(discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])
try:
trees_per_chunk = int(trees_per_chunk)
        max_tree_nodes = int(max_tree_nodes) if max_tree_nodes is not None else max_tree_nodes
min_samples_leaf = int(min_samples_leaf)
min_samples_split = int(min_samples_split)
class_majority = float(class_majority)
accuracy = int(accuracy)
separate_max = separate_max
if trees_per_chunk > 1 and bootstrap == False:
raise Exception("Parameter trees_per_chunk (or Trees per subset) should be 1 to disable bootstrap.")
        if trees_per_chunk <= 0 or min_samples_leaf <= 0 or class_majority <= 0 or min_samples_split <= 0 \
                or accuracy < 0 or not isinstance(bootstrap, bool):
raise Exception("Parameters should be greater than 0.")
except ValueError:
raise Exception("Parameters should be numerical.")
if measure not in ["info_gain", "mdl"]:
raise Exception("measure should be set to info_gain or mdl.")
job = Job(worker=Worker(save_results=save_results))
job.pipeline = [
("split", Stage("map", input_chain=dataset.params["input_chain"], init=map_init,
process=map_fit_bootstrap if bootstrap else map_fit)),
('group_all', Stage("reduce", init=simple_init, process=reduce_fit, combine=True))]
job.params = dataset.params
job.params["trees_per_chunk"] = trees_per_chunk
job.params["max_tree_nodes"] = max_tree_nodes
job.params["min_samples_leaf"] = min_samples_leaf
job.params["min_samples_split"] = min_samples_split
job.params["class_majority"] = class_majority
job.params["measure"] = measure
job.params["bootstrap"] = bootstrap
job.params["accuracy"] = accuracy
job.params["separate_max"] = separate_max
job.params['seed'] = random_state
job.run(name="forest_distributed_decision_trees_fit", input=dataset.params["data_tag"],
required_files=[path + "decision_tree.py", path + "measures.py"])
fitmodel_url = job.wait(show=show)
return {"fddt_fitmodel": fitmodel_url} # return results url
Example 11: predict
# Required imports: from disco.core import Job [as alias]
# Or: from disco.core.Job import pipeline [as alias]
def predict(dataset, fitmodel_url, save_results=True, show=False):
from disco.worker.pipeline.worker import Worker, Stage
from disco.core import Job, result_iterator
if "linreg_fitmodel" not in fitmodel_url:
raise Exception("Incorrect fit model.")
job = Job(worker=Worker(save_results=save_results))
job.pipeline = [
("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))]
job.params = dataset.params
job.params["thetas"] = [v for _, v in result_iterator(fitmodel_url["linreg_fitmodel"])][0]
job.run(name="linreg_predict", input=dataset.params["data_tag"])
return job.wait(show=show)
Example 12: measure
# Required imports: from disco.core import Job [as alias]
# Or: from disco.core.Job import pipeline [as alias]
def measure(input, save_results=True, show=False):
from disco.worker.pipeline.worker import Worker, Stage
from disco.core import Job
# define a job and set save of results to ddfs
job = Job(worker=Worker(save_results=save_results))
job.pipeline = [
("split", Stage("map", input_chain=input.params["input_chain"], init=simple_init, process=map_fit)),
('group_all', Stage("reduce", init=simple_init, process=reduce_fit, combine=True))]
job.params = input.params # job parameters (dataset object)
job.run(name="Distribution", input=input.params["data_tag"])
return job.wait(show=show) # return results url
Example 13: _fit_predict
# Required imports: from disco.core import Job [as alias]
# Or: from disco.core.Job import pipeline [as alias]
def _fit_predict(fit_data, samples, tau, save_results, show):
from disco.worker.pipeline.worker import Worker, Stage
from disco.core import Job
job = Job(worker=Worker(save_results=save_results))
job.pipeline = [
("split", Stage("map", input_chain=fit_data.params["input_chain"], init=simple_init, process=map_fit)),
("group_all", Stage("reduce", init=simple_init, process=reduce_fit, sort=True, combine=True)),
]
job.params = fit_data.params
job.params["tau"] = tau
job.params["samples"] = samples
job.run(name="lwlr_fit_predict", input=fit_data.params["data_tag"])
return job.wait(show=show)
Example 14: fit
# Required imports: from disco.core import Job [as alias]
# Or: from disco.core.Job import pipeline [as alias]
def fit(dataset, nu=0.1, save_results=True, show=False):
"""
Function starts a job for calculation of model parameters
Parameters
----------
    dataset - dataset object with input urls and other parameters
nu - parameter to adjust the classifier
save_results - save results to ddfs
show - show info about job execution
Returns
-------
Urls of fit model results on ddfs
"""
from disco.worker.pipeline.worker import Worker, Stage
from disco.core import Job
if dataset.params["y_map"] == []:
raise Exception("Linear proximal SVM requires a target label mapping parameter.")
try:
nu = float(nu)
if nu <= 0:
raise Exception("Parameter nu should be greater than 0")
except ValueError:
raise Exception("Parameter should be numerical.")
job = Job(worker=Worker(save_results=save_results))
# job parallelizes mappers and joins them with one reducer
job.pipeline = [
("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_fit)),
('group_all', Stage("reduce", init=simple_init, process=reduce_fit, combine=True))]
job.params = dataset.params
job.params["nu"] = nu
job.run(name="linearsvm_fit", input=dataset.params["data_tag"])
fitmodel_url = job.wait(show=show)
return {"linsvm_fitmodel": fitmodel_url} # return results url
Example 15: fit
# Required imports: from disco.core import Job [as alias]
# Or: from disco.core.Job import pipeline [as alias]
def fit(
dataset,
trees_per_chunk=3,
max_tree_nodes=50,
min_samples_leaf=10,
min_samples_split=5,
class_majority=1,
measure="info_gain",
k="sqrt",
accuracy=1,
random_state=None,
separate_max=True,
save_results=True,
show=False,
):
from disco.worker.pipeline.worker import Worker, Stage
from disco.core import Job
import discomll
path = "/".join(discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])
try:
trees_per_chunk = int(trees_per_chunk)
        max_tree_nodes = int(max_tree_nodes) if max_tree_nodes is not None else max_tree_nodes
min_samples_leaf = int(min_samples_leaf)
min_samples_split = int(min_samples_split)
class_majority = float(class_majority)
separate_max = separate_max
accuracy = int(accuracy)
if (
trees_per_chunk <= 0
or min_samples_leaf <= 0
or min_samples_split <= 0
or class_majority <= 0
or accuracy < 0
):
raise Exception("Parameters should be greater than 0.")
except ValueError:
raise Exception("Parameters should be numerical.")
if measure not in ["info_gain", "mdl"]:
raise Exception("measure should be set to info_gain or mdl.")
job = Job(worker=Worker(save_results=save_results))
job.pipeline = [
("split", Stage("map", input_chain=dataset.params["input_chain"], init=map_init, process=map_fit)),
("group_all", Stage("reduce", init=simple_init, process=reduce_fit, combine=True)),
]
job.params = dataset.params
job.params["trees_per_chunk"] = trees_per_chunk
job.params["max_tree_nodes"] = max_tree_nodes
job.params["min_samples_leaf"] = min_samples_leaf
job.params["min_samples_split"] = min_samples_split
job.params["class_majority"] = class_majority
job.params["measure"] = measure
job.params["accuracy"] = accuracy
job.params["k"] = k
job.params["seed"] = random_state
job.params["separate_max"] = separate_max
job.run(
name="distributed_weighted_forest_fit",
input=dataset.params["data_tag"],
required_files=[path + "decision_tree.py", path + "measures.py", path + "k_medoids.py"],
)
fitmodel_url = job.wait(show=show)
return {"dwf_fitmodel": fitmodel_url} # return results url