This article collects typical usage examples of the Python class disco.core.Job. If you have been wondering what the Job class is for, how to call it, or what real-world code that uses it looks like, the curated class examples here may help.
The following shows 15 code examples of the Job class, sorted by popularity by default.
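Before the individual examples, here is a minimal sketch of the typical Job workflow, modeled on the classic Disco word-count tutorial; the input URL is only a placeholder, and the map and reduce functions are illustrative rather than taken from any example below.

from disco.core import Job, result_iterator

def map(line, params):
    # emit (word, 1) for every word in the input line
    for word in line.split():
        yield word, 1

def reduce(iter, params):
    # group the intermediate pairs by word and sum the counts
    from disco.util import kvgroup
    for word, counts in kvgroup(sorted(iter)):
        yield word, sum(counts)

if __name__ == '__main__':
    # placeholder input URL; any http://, file:// or ddfs tag URL works as input
    job = Job().run(input=["http://example.com/input.txt"], map=map, reduce=reduce)
    for word, count in result_iterator(job.wait(show=True)):
        print("%s\t%d" % (word, count))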
Example 1: auth
def auth(clazz, province, input, output, date):
    dirList = os.listdir(input)
    ptime = datetime.strptime(date, "%Y%m%d")
    file_filter = ptime.strftime('%Y-%m-%d')
    input = ["file:///" + input + "/" + file for file in dirList
             if (re.search(date, file) or re.search(file_filter, file))]
    if input:
        if clazz == 'c+w':
            if cw_map_funs.has_key(province):
                mapfun = cw_map_funs[province]
            else:
                mapfun = cw_map
        else:
            if fixed_map_funs.has_key(province):
                mapfun = fixed_map_funs[province]
            else:
                mapfun = fixed_map
        job = Job().run(input=input, map=mapfun)
        file = open(output + "/" + clazz + "-" + date + ".ctl", "w")
        sqldr_header(file)
        for user, line in result_iterator(job.wait(show=True)):
            print >>file, line
        file.close()
    else:
        print 'resolve.py: Can not find any auth files.'
Example 2: main
def main():
    args = parse_args()
    news_file = args.news_file
    job = Job().run(
        input=news_file,
        map_reader=disco.worker.classic.func.chain_reader,
        map=read_twitter,
        reduce=reduce)
    with open("output_result", 'w') as out:
        for word, count in result_iterator(job.wait(show=False)):
            out.write(word + "\t" + str(count))
Example 3: predict
def predict(input, loglikelihoods, ys, splitter=" ", map_reader=chain_reader):
    ys = dict([(id, 1) for id in ys])
    job = Job(name="naive_bayes_predict")
    job.run(
        input=input,
        map_reader=map_reader,
        map=predict_map,
        params=Params(loglikelihoods=loglikelihoods, ys=ys, splitter=splitter),
        clean=False,
    )
    return job.wait()
Example 4: predict
def predict(dataset, fitmodel_url, coeff=0.5, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll

    path = "/".join(discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))]

    if "dwfr_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    try:
        coeff = float(coeff)
        if coeff < 0:
            raise Exception("Parameter coeff should be greater than 0.")
    except ValueError:
        raise Exception("Parameter coeff should be numerical.")

    job.params = dataset.params
    job.params["coeff"] = coeff
    for k, v in result_iterator(fitmodel_url["dwfr_fitmodel"]):
        job.params[k] = v

    if len(job.params["forest"]) == 0:
        print "Warning: There is no decision trees in forest"
        return []

    job.run(name="distributed_weighted_forest_rand_predict", input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])
    return job.wait(show=show)
Example 5: __init__
def __init__(self, config, map, reduce):
    self.config = DiscoJob.DEFAULT_CONFIG.copy()
    self.config.update(config)
    self.map = map
    self.reduce = reduce
    self.job = Job()
    self.params = Params(**self.config)
Example 6: predict
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Function starts a job that makes predictions on input data with a given model.

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    fitmodel_url - model created in the fit phase
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls with predictions on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    if "linsvm_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    # the job parallelizes execution of the mappers
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))]

    job.params = dataset.params
    job.params["fit_params"] = [v for _, v in result_iterator(fitmodel_url["linsvm_fitmodel"])][0]
    job.run(name="linsvm_predict", input=dataset.params["data_tag"])
    return job.wait(show=show)
Example 7: predict
def predict(dataset, fitmodel_url, voting=False, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll

    path = "/".join(discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    if "drf_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init,
                                    process=map_predict_voting if voting else map_predict_dist))]

    job.params = dataset.params
    for k, v in result_iterator(fitmodel_url["drf_fitmodel"]):
        job.params[k] = v

    if len(job.params["forest"]) == 0:
        print "Warning: There is no decision trees in forest"
        return []

    job.run(name="distributed_random_forest_predict", input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])
    return job.wait(show=show)
Example 8: process_prediction_data_featurization_with_disco
def process_prediction_data_featurization_with_disco(input_list, params, partitions=4):
    '''
    Called from within featurize_prediction_data_in_parallel.
    Returns disco.core.result_iterator.

    Arguments:
        input_list: path to file listing filename,unused_string for each
            individual time series data file.
        params: dictionary of parameters to be passed to each map & reduce function.
        partitions: number of nodes/partitions in system.
    '''
    from disco.core import Job, result_iterator

    job = Job().run(input=input_list,
                    map=pred_map,
                    partitions=partitions,
                    reduce=pred_featurize_reduce,
                    params=params)
    result = result_iterator(job.wait(show=True))
    return result
Example 9: main
def main():
    job = Job().run(input=[TRAIN_IN], map=mapper, reduce=reducer, sort=True)
    category_options = defaultdict(dict)
    category_values = defaultdict(int)
    for cat_id, counter in result_iterator(job.wait(show=True)):
        if len(counter) > MAX_CATEGORICAL_OPTIONS:
            continue
        for cat_value in counter:
            if cat_value not in category_options[cat_id]:
                category_options[cat_id][cat_value] = category_values[cat_id]
                category_values[cat_id] += 1

    # save possible categorical data
    with open(CATEGORY_MAPPING_OUT, 'w') as f:
        f.write(dumps(category_options))
    with open(CATEGORY_STATUS_OUT, 'w') as f:
        f.write(dumps(category_values))
Example 10: predict
def predict(dataset, fitmodel_url, m=1, save_results=True, show=False):
    """
    Function starts a job that makes predictions on input data with a given model.

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    fitmodel_url - model created in the fit phase
    m - m-estimate, used with discrete features
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of predictions on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import numpy as np

    try:
        m = float(m)
    except ValueError:
        raise Exception("Parameter m should be numerical.")

    if "naivebayes_fitmodel" in fitmodel_url:
        # the fit model is loaded from ddfs
        fit_model = dict((k, v) for k, v in result_iterator(fitmodel_url["naivebayes_fitmodel"]))
        if len(fit_model["y_labels"]) < 2:
            print "There is only one class in training data."
            return []
    else:
        raise Exception("Incorrect fit model.")

    if dataset.params["X_meta"].count("d") > 0:  # if there are discrete features in the model
        # the logarithms are calculated here once to optimize the predict phase,
        # instead of being recalculated by every mapper
        np.seterr(divide='ignore')
        for iv in fit_model["iv"]:
            dist = [fit_model.pop((y,) + iv, 0) for y in fit_model["y_labels"]]
            fit_model[iv] = np.nan_to_num(
                np.log(np.true_divide(np.array(dist) + m * fit_model["prior"],
                                      np.sum(dist) + m))) - fit_model["prior_log"]
        del (fit_model["iv"])

    # define a job and enable saving of results to ddfs
    job = Job(worker=Worker(save_results=save_results))
    # the job parallelizes execution of the mappers
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))]

    job.params = dataset.params  # job parameters (dataset object)
    job.params["fit_model"] = fit_model
    # define the name of the job and the input data urls
    job.run(name="naivebayes_predict", input=dataset.params["data_tag"])
    results = job.wait(show=show)
    return results
Example 11: fit
def fit(dataset, alpha=1e-8, max_iterations=10, save_results=True, show=False):
    """
    Function starts a job for calculation of theta parameters.

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    alpha - convergence value
    max_iterations - maximum number of iterations
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of fit model results on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import numpy as np

    if dataset.params["y_map"] == []:
        raise Exception("Logistic regression requires a target label mapping parameter.")
    try:
        alpha = float(alpha)
        max_iterations = int(max_iterations)
        if max_iterations < 1:
            raise Exception("Parameter max_iterations should be greater than 0.")
    except ValueError:
        raise Exception("Parameters should be numerical.")

    # initialize thetas to 0 and add the intercept term
    thetas = np.zeros(len(dataset.params["X_indices"]) + 1)

    J = [0]  # J cost function values for every iteration
    for i in range(max_iterations):
        job = Job(worker=Worker(save_results=save_results))
        # the job parallelizes mappers and joins them with one reducer
        job.pipeline = [
            ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_fit)),
            ('group_all', Stage("reduce", init=simple_init, process=reduce_fit, combine=True))]

        job.params = dataset.params  # job parameters (dataset object)
        job.params["thetas"] = thetas  # every iteration sets new thetas

        job.run(name="logreg_fit_iter_%d" % (i + 1), input=dataset.params["data_tag"])
        fitmodel_url = job.wait(show=show)

        for k, v in result_iterator(fitmodel_url):
            if k == "J":
                J.append(v)  # save the value of the J cost function
            else:
                thetas = v  # save the new thetas

        if np.abs(J[-2] - J[-1]) < alpha:  # check for convergence
            if show:
                print("Converged at iteration %d" % (i + 1))
            break

    return {"logreg_fitmodel": fitmodel_url}  # return results url
Example 12: fit_model_disco
def fit_model_disco(data_dict, featureset_key, model_type):
    """Fit a model of the given type as a Disco reduce job and return the
    fitted object."""
    from disco.core import Job, result_iterator

    params = {"data_dict": data_dict,
              "featureset_key": featureset_key,
              "model_type": model_type}
    input_list = [("placeholder")]
    job = Job('with_modules').run(
        input=input_list,
        reduce=reduce,
        params=params,
        required_modules=[("mltsp",
                           os.path.dirname(os.path.dirname(__file__))),
                          "sklearn"])
    result_iter = result_iterator(job.wait(show=True))
    rf_fit = None
    for rf_obj, dummy_str in result_iter:
        rf_fit = rf_obj
    return rf_fit
Example 13: start
def start(self):
    """Starts the entire process of querying twitter and classification.

    This method is responsible for running MapReduce for feature
    extraction.
    """
    def range_reader(stream, size, url):
        page_num = stream.getvalue()
        # Map readers should return a list of values, so page_num is
        # explicitly converted to an integer and then wrapped into a
        # list. By doing this each mapper instance gets exactly one
        # page number.
        # If we don't do this, the mapper API reads the number
        # character by character and we end up fetching the same 10
        # pages (digits 0-9) over and over, since each character of a
        # number is one of those 10 digits.
        return [int(page_num)]

    job = Job()
    inputs = [('raw://%d' % (i)) for i in range(1, self.num_pages)]

    job.run(input=inputs, map=mapper, reduce=reducer,
            map_reader=range_reader, params=Params(
                query=self.query,
                trained_vectorizer=self.vectorizer
            ),
            required_modules=[
                ('vectorizer', os.path.join(datasettings.PROJECT_ROOT,
                                            'analyzer',
                                            'vectorizer.py'),),
                ('models', os.path.join(datasettings.PROJECT_ROOT,
                                        'webui', 'fatninja',
                                        'models.py'),),
            ])

    self.feature_vector, self.row_num_to_tweet_id_map = \
        self.vectorizer.build_feature_matrix(job)

    self.classify()
Example 14: fit
def fit(dataset, save_results=True, show=False):
    """
    Function builds a model for Naive Bayes. It executes multiple map functions
    and one reduce function, which aggregates intermediate results and returns a model.

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of fit model results on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    # define a job and enable saving of results to ddfs
    job = Job(worker=Worker(save_results=save_results))

    # the job parallelizes mappers, sorts intermediate pairs and joins them with one reducer
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_fit)),
        ('group_all', Stage("reduce", init=simple_init, process=reduce_fit, sort=True, combine=True))]

    job.params = dataset.params  # job parameters (dataset object)

    # define the name of the job and the input data urls
    job.run(name="naivebayes_fit", input=dataset.params["data_tag"])
    fitmodel_url = job.wait(show=show)
    return {"naivebayes_fitmodel": fitmodel_url}  # return results url
Example 15: process_featurization_with_disco
def process_featurization_with_disco(input_list, params, partitions=4):
    """Featurize time-series data in parallel as a Disco job.

    Called from within the `featurize_in_parallel` function.

    Parameters
    ----------
    input_list : str
        Path to file listing the file name and class name
        (comma-separated) for each individual time series data file,
        one per line.
    params : dict
        Dictionary of parameters to be passed to each map & reduce
        function.
    partitions : int, optional
        Number of nodes/partitions in system. Defaults to 4.

    Returns
    -------
    iterator
        disco.core.result_iterator(), an iterator of two-element
        tuples, each containing the file name of the original time
        series data file and a dictionary of the associated features
        generated.
    """
    from disco.core import Job, result_iterator

    job = Job('with_modules').run(
        input=input_list,
        map_reader=custom_reader,
        map=map,
        partitions=partitions,
        reduce=featurize_reduce,
        params=params,
        required_modules=[("mltsp",
                           os.path.dirname(os.path.dirname(__file__)))])
    result = result_iterator(job.wait(show=True))
    return result
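Finally, a hedged usage sketch for the function above; `input_list` and `params` are assumed to be prepared by the surrounding mltsp code (a header file path and a parameter dictionary), and the loop simply follows the Returns section of the docstring.

# `input_list` and `params` are assumed to be prepared elsewhere in mltsp.
results = process_featurization_with_disco(input_list, params, partitions=4)
for ts_filename, features in results:
    # each item is (original time-series file name, dict of generated features)
    print(ts_filename, sorted(features.keys()))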