This article collects typical usage examples of the Python class disco.core.Job. If you have been wondering what the Job class is for, how to call it, or what real-world code that uses it looks like, the curated class examples here may help.
The following shows 15 code examples of the Job class, sorted by popularity by default.
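Before the individual examples, here is a minimal sketch of the typical Job workflow, modeled on the classic Disco word-count tutorial; the input URL is only a placeholder, and the map and reduce functions are illustrative rather than taken from any example below.

from disco.core import Job, result_iterator

def map(line, params):
    # emit (word, 1) for every word in the input line
    for word in line.split():
        yield word, 1

def reduce(iter, params):
    # group the intermediate pairs by word and sum the counts
    from disco.util import kvgroup
    for word, counts in kvgroup(sorted(iter)):
        yield word, sum(counts)

if __name__ == '__main__':
    # placeholder input URL; any http://, file:// or ddfs tag URL works as input
    job = Job().run(input=["http://example.com/input.txt"], map=map, reduce=reduce)
    for word, count in result_iterator(job.wait(show=True)):
        print("%s\t%d" % (word, count))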
Example 1: auth
def auth(clazz, province, input, output, date):
    dirList = os.listdir(input)
    ptime = datetime.strptime(date, "%Y%m%d")
    file_filter = ptime.strftime('%Y-%m-%d')
    input = ["file:///" + input + "/" + file for file in dirList
             if (re.search(date, file) or re.search(file_filter, file))]
    if input:
        if clazz == 'c+w':
            if cw_map_funs.has_key(province):
                mapfun = cw_map_funs[province]
            else:
                mapfun = cw_map
        else:
            if fixed_map_funs.has_key(province):
                mapfun = fixed_map_funs[province]
            else:
                mapfun = fixed_map
        job = Job().run(input=input, map=mapfun)
        file = open(output + "/" + clazz + "-" + date + ".ctl", "w")
        sqldr_header(file)
        for user, line in result_iterator(job.wait(show=True)):
            print >>file, line
        file.close()
    else:
        print 'resolve.py: Can not find any auth files.'
Example 2: main
def main():
    args = parse_args()
    news_file = args.news_file
    job = Job().run(
        input=news_file,
        map_reader=disco.worker.classic.func.chain_reader,
        map=read_twitter,
        reduce=reduce)
    with open("output_result", 'w') as out:
        for word, count in result_iterator(job.wait(show=False)):
            out.write(word + "\t" + str(count))
Example 3: predict
def predict(input, loglikelihoods, ys, splitter=" ", map_reader=chain_reader):
    ys = dict([(id, 1) for id in ys])
    job = Job(name="naive_bayes_predict")
    job.run(
        input=input,
        map_reader=map_reader,
        map=predict_map,
        params=Params(loglikelihoods=loglikelihoods, ys=ys, splitter=splitter),
        clean=False,
    )
    return job.wait()
Example 4: predict
def predict(dataset, fitmodel_url, coeff=0.5, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll

    path = "/".join(discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))]

    if "dwfr_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    try:
        coeff = float(coeff)
        if coeff < 0:
            raise Exception("Parameter coeff should be greater than 0.")
    except ValueError:
        raise Exception("Parameter coeff should be numerical.")

    job.params = dataset.params
    job.params["coeff"] = coeff
    for k, v in result_iterator(fitmodel_url["dwfr_fitmodel"]):
        job.params[k] = v

    if len(job.params["forest"]) == 0:
        print "Warning: There is no decision trees in forest"
        return []

    job.run(name="distributed_weighted_forest_rand_predict", input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])
    return job.wait(show=show)
Example 5: __init__
def __init__(self, config, map, reduce):
    self.config = DiscoJob.DEFAULT_CONFIG.copy()
    self.config.update(config)
    self.map = map
    self.reduce = reduce
    self.job = Job()
    self.params = Params(**self.config)
Example 6: predict
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Function starts a job that makes predictions on input data with a given model.

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    fitmodel_url - model created in the fit phase
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls with predictions on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    if "linsvm_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    # the job parallelizes execution of the mappers
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))]

    job.params = dataset.params
    job.params["fit_params"] = [v for _, v in result_iterator(fitmodel_url["linsvm_fitmodel"])][0]
    job.run(name="linsvm_predict", input=dataset.params["data_tag"])
    return job.wait(show=show)
Example 7: predict
def predict(dataset, fitmodel_url, voting=False, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll

    path = "/".join(discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    if "drf_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init,
                                    process=map_predict_voting if voting else map_predict_dist))]

    job.params = dataset.params
    for k, v in result_iterator(fitmodel_url["drf_fitmodel"]):
        job.params[k] = v

    if len(job.params["forest"]) == 0:
        print "Warning: There is no decision trees in forest"
        return []

    job.run(name="distributed_random_forest_predict", input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])
    return job.wait(show=show)
Example 8: process_prediction_data_featurization_with_disco
def process_prediction_data_featurization_with_disco(input_list, params, partitions=4):
    '''
    Called from within featurize_prediction_data_in_parallel.
    Returns disco.core.result_iterator.

    Arguments:
        input_list: path to file listing filename,unused_string for each
            individual time series data file.
        params: dictionary of parameters to be passed to each map & reduce function.
        partitions: number of nodes/partitions in system.
    '''
    from disco.core import Job, result_iterator

    job = Job().run(input=input_list,
                    map=pred_map,
                    partitions=partitions,
                    reduce=pred_featurize_reduce,
                    params=params)
    result = result_iterator(job.wait(show=True))
    return result
Example 9: main
def main():
    job = Job().run(input=[TRAIN_IN], map=mapper, reduce=reducer, sort=True)
    category_options = defaultdict(dict)
    category_values = defaultdict(int)
    for cat_id, counter in result_iterator(job.wait(show=True)):
        if len(counter) > MAX_CATEGORICAL_OPTIONS:
            continue
        for cat_value in counter:
            if cat_value not in category_options[cat_id]:
                category_options[cat_id][cat_value] = category_values[cat_id]
                category_values[cat_id] += 1

    # save possible categorical data
    with open(CATEGORY_MAPPING_OUT, 'w') as f:
        f.write(dumps(category_options))
    with open(CATEGORY_STATUS_OUT, 'w') as f:
        f.write(dumps(category_values))
Example 10: predict
def predict(dataset, fitmodel_url, m=1, save_results=True, show=False):
    """
    Function starts a job that makes predictions on input data with a given model.

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    fitmodel_url - model created in the fit phase
    m - m-estimate, used with discrete features
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of predictions on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import numpy as np

    try:
        m = float(m)
    except ValueError:
        raise Exception("Parameter m should be numerical.")

    if "naivebayes_fitmodel" in fitmodel_url:
        # the fit model is loaded from ddfs
        fit_model = dict((k, v) for k, v in result_iterator(fitmodel_url["naivebayes_fitmodel"]))
        if len(fit_model["y_labels"]) < 2:
            print "There is only one class in training data."
            return []
    else:
        raise Exception("Incorrect fit model.")

    if dataset.params["X_meta"].count("d") > 0:  # if there are discrete features in the model
        # the logarithms are calculated here once to optimize the predict phase,
        # instead of being recalculated by every mapper
        np.seterr(divide='ignore')
        for iv in fit_model["iv"]:
            dist = [fit_model.pop((y,) + iv, 0) for y in fit_model["y_labels"]]
            fit_model[iv] = np.nan_to_num(
                np.log(np.true_divide(np.array(dist) + m * fit_model["prior"],
                                      np.sum(dist) + m))) - fit_model["prior_log"]
        del (fit_model["iv"])

    # define a job and enable saving of results to ddfs
    job = Job(worker=Worker(save_results=save_results))
    # the job parallelizes execution of the mappers
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))]

    job.params = dataset.params  # job parameters (dataset object)
    job.params["fit_model"] = fit_model
    # define the name of the job and the input data urls
    job.run(name="naivebayes_predict", input=dataset.params["data_tag"])
    results = job.wait(show=show)
    return results
Example 11: fit
def fit(dataset, alpha=1e-8, max_iterations=10, save_results=True, show=False):
    """
    Function starts a job for calculation of theta parameters.

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    alpha - convergence value
    max_iterations - maximum number of iterations
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of fit model results on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import numpy as np

    if dataset.params["y_map"] == []:
        raise Exception("Logistic regression requires a target label mapping parameter.")
    try:
        alpha = float(alpha)
        max_iterations = int(max_iterations)
        if max_iterations < 1:
            raise Exception("Parameter max_iterations should be greater than 0.")
    except ValueError:
        raise Exception("Parameters should be numerical.")

    # initialize thetas to 0 and add the intercept term
    thetas = np.zeros(len(dataset.params["X_indices"]) + 1)

    J = [0]  # J cost function values for every iteration
    for i in range(max_iterations):
        job = Job(worker=Worker(save_results=save_results))
        # the job parallelizes mappers and joins them with one reducer
        job.pipeline = [
            ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_fit)),
            ('group_all', Stage("reduce", init=simple_init, process=reduce_fit, combine=True))]

        job.params = dataset.params  # job parameters (dataset object)
        job.params["thetas"] = thetas  # every iteration sets new thetas

        job.run(name="logreg_fit_iter_%d" % (i + 1), input=dataset.params["data_tag"])
        fitmodel_url = job.wait(show=show)

        for k, v in result_iterator(fitmodel_url):
            if k == "J":
                J.append(v)  # save the value of the J cost function
            else:
                thetas = v  # save the new thetas

        if np.abs(J[-2] - J[-1]) < alpha:  # check for convergence
            if show:
                print("Converged at iteration %d" % (i + 1))
            break

    return {"logreg_fitmodel": fitmodel_url}  # return results url
Example 12: fit_model_disco
def fit_model_disco(data_dict, featureset_key, model_type):
    """Fit a model of the given type as a Disco reduce job and return the
    fitted object."""
    from disco.core import Job, result_iterator

    params = {"data_dict": data_dict,
              "featureset_key": featureset_key,
              "model_type": model_type}
    input_list = [("placeholder")]
    job = Job('with_modules').run(
        input=input_list,
        reduce=reduce,
        params=params,
        required_modules=[("mltsp",
                           os.path.dirname(os.path.dirname(__file__))),
                          "sklearn"])
    result_iter = result_iterator(job.wait(show=True))
    rf_fit = None
    for rf_obj, dummy_str in result_iter:
        rf_fit = rf_obj
    return rf_fit
Example 13: start
def start(self):
    """Starts the entire process of querying twitter and classification.

    This method is responsible for running MapReduce for feature
    extraction.
    """
    def range_reader(stream, size, url):
        page_num = stream.getvalue()
        # Map readers should return a list of values, so page_num is
        # explicitly converted to an integer and then wrapped into a
        # list. By doing this each mapper instance gets exactly one
        # page number.
        # If we don't do this, the mapper API reads the number
        # character by character and we end up fetching the same 10
        # pages (digits 0-9) over and over, since each character of a
        # number is one of those 10 digits.
        return [int(page_num)]

    job = Job()
    inputs = [('raw://%d' % (i)) for i in range(1, self.num_pages)]

    job.run(input=inputs, map=mapper, reduce=reducer,
            map_reader=range_reader, params=Params(
                query=self.query,
                trained_vectorizer=self.vectorizer
            ),
            required_modules=[
                ('vectorizer', os.path.join(datasettings.PROJECT_ROOT,
                                            'analyzer',
                                            'vectorizer.py'),),
                ('models', os.path.join(datasettings.PROJECT_ROOT,
                                        'webui', 'fatninja',
                                        'models.py'),),
            ])

    self.feature_vector, self.row_num_to_tweet_id_map = \
        self.vectorizer.build_feature_matrix(job)

    self.classify()
Example 14: fit
def fit(dataset, save_results=True, show=False):
    """
    Function builds a model for Naive Bayes. It executes multiple map functions
    and one reduce function, which aggregates intermediate results and returns a model.

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of fit model results on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    # define a job and enable saving of results to ddfs
    job = Job(worker=Worker(save_results=save_results))

    # the job parallelizes mappers, sorts intermediate pairs and joins them with one reducer
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_fit)),
        ('group_all', Stage("reduce", init=simple_init, process=reduce_fit, sort=True, combine=True))]

    job.params = dataset.params  # job parameters (dataset object)

    # define the name of the job and the input data urls
    job.run(name="naivebayes_fit", input=dataset.params["data_tag"])
    fitmodel_url = job.wait(show=show)
    return {"naivebayes_fitmodel": fitmodel_url}  # return results url
Example 15: process_featurization_with_disco
def process_featurization_with_disco(input_list, params, partitions=4):
    """Featurize time-series data in parallel as a Disco job.

    Called from within the `featurize_in_parallel` function.

    Parameters
    ----------
    input_list : str
        Path to file listing the file name and class name
        (comma-separated) for each individual time series data file,
        one per line.
    params : dict
        Dictionary of parameters to be passed to each map & reduce
        function.
    partitions : int, optional
        Number of nodes/partitions in system. Defaults to 4.

    Returns
    -------
    iterator
        disco.core.result_iterator(), an iterator of two-element
        tuples, each containing the file name of the original time
        series data file and a dictionary of the associated features
        generated.
    """
    from disco.core import Job, result_iterator

    job = Job('with_modules').run(
        input=input_list,
        map_reader=custom_reader,
        map=map,
        partitions=partitions,
        reduce=featurize_reduce,
        params=params,
        required_modules=[("mltsp",
                           os.path.dirname(os.path.dirname(__file__)))])
    result = result_iterator(job.wait(show=True))
    return result
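Finally, a hedged usage sketch for the function above; `input_list` and `params` are assumed to be prepared by the surrounding mltsp code (a header file path and a parameter dictionary), and the loop simply follows the Returns section of the docstring.

# `input_list` and `params` are assumed to be prepared elsewhere in mltsp.
results = process_featurization_with_disco(input_list, params, partitions=4)
for ts_filename, features in results:
    # each item is (original time-series file name, dict of generated features)
    print(ts_filename, sorted(features.keys()))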