本文整理匯總了Python中disco.core.Disco類的典型用法代碼示例。如果您正苦於以下問題：Python Disco類的具體用法？Python Disco怎麼用？Python Disco使用的例子？那麼，這裡精選的類代碼示例或許可以為您提供幫助。
在下文中一共展示了Disco類的15個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: fit_predict
def fit_predict(training_data, fitting_data, tau=1, samples_per_job=0, save_results=True, show=False):
    """
    Read the fitting data with a map-only job, then fit it against the
    training data in batches, one _fit_predict call per batch.

    training_data - training samples
    fitting_data - dataset to be fitted to training data.
    tau - controls how quickly the weight of a training sample falls off with distance of its x(i) from the query point x. Must be a number greater than 0.
    samples_per_job - define a number of samples that will be processed in single mapreduce job. If 0, algorithm will calculate number of samples per job from the number of attributes of the first sample.
    save_results - forwarded to the Worker and to every _fit_predict call.
    show - forwarded to job.wait and to every _fit_predict call.

    Returns a one-element list holding the "tag://" URL of the tag that
    collects the results of all batches.
    Raises Exception if tau is not a positive number or if the fitting data
    has no id_index set.
    """
    # Validate the arguments first so that bad input fails before any job is started.
    try:
        tau = float(tau)
    except (TypeError, ValueError):
        raise Exception("Parameter tau should be numerical.")
    if tau <= 0:
        raise Exception("Parameter tau should be > 0.")

    if fitting_data.params["id_index"] == -1:
        raise Exception("Predict data should have id_index set.")

    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.core import Disco

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split", Stage("map", input_chain=fitting_data.params["input_chain"], init=simple_init, process=map_predict))
    ]
    job.params = fitting_data.params
    job.run(name="lwlr_read_data", input=fitting_data.params["data_tag"])

    samples = {}
    results = []
    tau = float(2 * tau ** 2)  # calculate tau once

    for test_id, x in result_iterator(job.wait(show=show)):
        if samples_per_job == 0:
            samples_per_job = _default_samples_per_job(len(x))

        samples[test_id] = x
        # Flush as soon as the batch is full. Comparing the batch length with
        # >= (instead of a separate counter with ==) keeps every batch the
        # same size and still works if the caller passed a non-integer size.
        if len(samples) >= samples_per_job:
            results.append(_fit_predict(training_data, samples, tau, save_results, show))
            samples = {}

    if samples:  # if there is some samples left in the dictionary
        results.append(_fit_predict(training_data, samples, tau, save_results, show))

    # merge results of every iteration into a single tag
    ddfs = Disco().ddfs
    ddfs.tag(job.name, [[list(ddfs.blobs(tag))[0][0]] for tag in results])
    return ["tag://" + job.name]


def _default_samples_per_job(n_attributes):
    """Return the automatic batch size for samples with n_attributes attributes."""
    if n_attributes <= 100:  # if there is less than 100 attributes
        return 100  # 100 samples is max per one job
    # More than 100 attributes: linear function, truncated to an integer and
    # kept at 1 or above so that a batch can always be completed.
    return max(1, int(n_attributes * -25 / 900.0 + 53))
示例2: submit
def submit(master, jobpack):
    """
    Post a job pack to a Disco master and print the server's reply.

    master - address of the Disco master, passed to Disco().
    jobpack - payload sent to '/disco/job/new'.

    The reply is decoded as JSON into a (status, response) pair; errmsg is
    called when the status is not 'ok'.
    """
    from disco.settings import DiscoSettings
    from disco.core import Disco

    # NOTE(review): this object is never read afterwards; it is kept because
    # the visible code does not show whether constructing it has side effects.
    settings = DiscoSettings()
    client = Disco(master)
    # Two spaces on purpose: the original literal ends with a space and the
    # print statement adds its own separator.
    print("Submitting job to  %s" % (master,))
    reply = client.request('/disco/job/new', jobpack)
    status, response = json.loads(reply)
    if status != 'ok':
        errmsg('Failed to start job. Server replied: %s' % response)
    print(response)
示例3: __init__
def __init__(self, spec, discodex, disco_addr="disco://localhost", profile=False):
    """
    Store the job configuration and create the Disco client.

    NOTE(review): disco_addr is accepted but not used; the master address
    is read from DiscoSettings()["DISCO_MASTER"] instead.
    """
    # TODO(sqs): refactoring potential with PagerankJob
    self.profile = profile
    self.nr_partitions = 8
    self.spec = spec
    self.discodex = discodex
    self.docset = Docset(spec.docset_name)
    master_address = DiscoSettings()["DISCO_MASTER"]
    self.disco = Disco(master_address)
示例4: IndexJob
class IndexJob(object):
    """Runs the "index_tfidf" Disco job for a docset and indexes its output with discodex."""

    def __init__(self, spec, discodex, disco_addr="disco://localhost", profile=False):
        # TODO(sqs): refactoring potential with PagerankJob
        # NOTE(review): disco_addr is accepted but not used; the master
        # address comes from DiscoSettings()["DISCO_MASTER"].
        self.profile = profile
        self.nr_partitions = 8
        self.spec = spec
        self.discodex = discodex
        self.docset = Docset(spec.docset_name)
        master_address = DiscoSettings()["DISCO_MASTER"]
        self.disco = Disco(master_address)

    def start(self):
        """Run the index job, then feed its results to discodex."""
        job = self.__index_job()
        job_results = self.__run_job(job)
        self.__run_discodex_index(job_results)

    def __run_job(self, job):
        """Wait for the job and return its results, profiling it when enabled."""
        job_results = job.wait()
        if self.profile:
            self.__profile_job(job)
        return job_results

    def __index_job(self):
        """Create the "index_tfidf" job over the docset's DDFS tag."""
        job_options = dict(
            name="index_tfidf",
            input=["tag://" + self.docset.ddfs_tag],
            map_reader=docparse,
            map=TfIdf.map,
            reduce=TfIdf.reduce,
            sort=True,
            partitions=self.nr_partitions,
            partition=TfIdf.partition,
            merge_partitions=False,
            profile=self.profile,
            params=dict(doc_count=self.docset.doc_count),
        )
        return self.disco.new_job(**job_options)

    def __run_discodex_index(self, results):
        """Index the job results with discodex and clone the index to the spec's name."""
        index_options = {
            "parser": "disco.func.chain_reader",
            "demuxer": "freequery.index.tf_idf.TfIdf_demux",
            "nr_ichunks": 1,  # TODO(sqs): after disco#181 fixed, increase this
        }
        dataset = DataSet(input=results, options=index_options)
        origname = self.discodex.index(dataset)
        # origname is also the disco job name
        self.disco.wait(origname)
        self.discodex.clone(origname, self.spec.invindex_name)
示例5: __init__
def __init__(self, spec, disco_addr="disco://localhost", alpha=0.15, niter=2, profile=False):
    """
    Store the job configuration and create the Disco client.

    spec - object providing docset_name.
    disco_addr - address of the Disco master to connect to.
    alpha, niter, profile - stored unchanged on the instance.
    """
    self.spec = spec
    self.docset = Docset(spec.docset_name)
    # Honour the caller's address; it used to be ignored in favour of a
    # hard-coded value identical to the default.
    self.disco = Disco(disco_addr)
    self.alpha = alpha
    self.niter = niter
    self.nr_partitions = 16
    self.merge_partitions = False
    self.profile = profile
示例6: LinkParseJob
class LinkParseJob(object):
    """Runs the "linkparse" Disco job over a docset and stores the output blobs under the docset's link-file tag."""

    def __init__(self, spec, verbose=False, **kwargs):
        # Extra keyword arguments are accepted and ignored.
        self.verbose = verbose
        self.spec = spec
        self.docset = Docset(self.spec.docset_name)
        self.disco = Disco("disco://localhost")

    def start(self):
        """Run the job, tag its results and optionally print them."""
        from disco import func

        job_options = dict(
            name="linkparse",
            input=self.docset.dump_uris(),
            map_reader=docparse,
            map=linkparse_map,
            map_output_stream=(func.map_output_stream,
                               func.disco_output_stream,
                               LinkFileOutputStream.disco_output_stream),
            partitions=0,
            save=True,
        )
        job_results = self.disco.new_job(**job_options).wait()
        self.__tag_results(job_results)
        if self.verbose:
            self.__print_results(job_results)

    def __tag_results(self, results):
        """Copy the blobs of the first result tag to the link-file tag, then delete the result tag."""
        from disco.ddfs import DDFS

        client = DDFS()
        temporary_tag = results[0]
        blobs = list(client.blobs(temporary_tag))
        client.put(self.docset.ddfs_link_file_tag, blobs)
        # remove old, temporary tag
        client.delete(temporary_tag)

    def __print_results(self, results):
        """Print every document URI followed by its tab-indented link URIs."""
        for doc in result_iterator(results, tempdir=False, reader=doclinksparse):
            print("%s\n\t%s" % (doc.uri, "\n\t".join(doc.link_uris)))
示例7: fun_reduce
return [(w, 1) for w in re.sub("\W", " ", e).lower().split()]
def fun_reduce(iter, out, params):
    """
    Sum the integer value of every key and emit one (key, total) pair per key.

    iter - iterable of (key, value) pairs; each value is converted with int().
    out - object whose add(key, value) method receives the totals.
    params - unused.
    """
    totals = {}
    for key, value in iter:
        totals[key] = totals.get(key, 0) + int(value)
    # items() instead of iteritems(): works on both Python 2 and Python 3.
    for key, total in totals.items():
        out.add(key, total)
# Start the test server, run a map/reduce job over 50,000 inputs and compare
# the reduced totals with the expected values in ANS.
tserver.run_server(data_gen)

job = Disco(sys.argv[1]).new_job(name="test_50k",
                                 input=tserver.makeurl([""] * int(5e4)),
                                 map=fun_map,
                                 reduce=fun_reduce,
                                 nr_reduces=300,
                                 sort=False)

# Expected total for each key produced by the job.
ANS = {"gutta": int(5e6), "cavat": int(1e7), "capidem": int(5e6)}
i = 0
for key, value in result_iterator(job.wait()):
    i += 1
    if ANS[key] == int(value):
        print("Correct: %s %s" % (key, value))
    else:
        # Raise a real exception: raising a plain string is itself a
        # TypeError on Python 2.6+ and would hide this message.
        raise Exception("Results don't match")

if i != 3:
    raise Exception("Wrong number of results: Got %d expected 3" % i)

job.purge()
示例8: range
if v != results[k]:
raise "%s: Invalid result for key %s, got %s, "\
"expected %s" % (job.name, k, v, results[k])
# Start the test server and build N groups of ten "i:j" inputs; results maps
# str(i) to the sum of the j values of group i.
tserver.run_server(data_gen)

N = 10
results = {}
inputs = []
for i in range(N):
    b = range(i, i + 10)
    inputs.extend("%d:%d" % (i, j) for j in b)
    results[str(i)] = sum(b)

disco = Disco(sys.argv[1])

# map results in individual files, one per input file (default mode)
job1 = disco.new_job(name="test_partfile1",
                     input=tserver.makeurl(inputs),
                     map=fun_map)

# map results in one big partition file per host
job2 = disco.new_job(name="test_partfile2",
                     input=tserver.makeurl(inputs),
                     map=fun_map,
                     nr_reduces=1)

check_results(job1)
示例9: map
import sys
from disco.core import Disco, result_iterator
from disco.settings import DiscoSettings
def map(entry, params):
    """Yield a (word, 1) pair for every whitespace-separated word of entry; params is unused."""
    words = entry.split()
    for token in words:
        yield token, 1
def reduce(iter, out, params):
    """
    Sum the frequencies of every word and emit one (word, total) pair per word.

    iter - iterable of (word, frequency) pairs; each frequency is converted with int().
    out - object whose add(word, total) method receives the totals.
    params - unused.
    """
    totals = {}
    for word, freq in iter:
        totals[word] = totals.get(word, 0) + int(freq)
    # items() instead of iteritems(): works on both Python 2 and Python 3.
    for word, total in totals.items():
        out.add(word, total)
# Connect to the master named in the Disco settings, run the word-count job
# and print every (word, frequency) pair of its results.
disco = Disco(DiscoSettings()['DISCO_MASTER'])
print("Starting Disco job..")
print("Go to %s to see status of the job." % disco.master)

wordcount_job = disco.new_job(name="wordcount",
                              input=["http://discoproject.org/chekhov.txt"],
                              map=map,
                              reduce=reduce)
results = wordcount_job.wait()

print("Job done. Results:")
for word, freq in result_iterator(results):
    print("%s %s" % (word, freq))
示例10: fun_map
import sys
from disco.core import Disco, result_iterator
def fun_map(e, params):
    """Send three numbered marker strings through msg, then return [(e, "")]; params is unused."""
    markers = ["--special_test_string_%d--" % number for number in range(3)]
    for marker in markers:
        msg(marker)
    return [(e, "")]
# Run a map-only job on one raw input, then check its result, the name in its
# jobspec and that every marker string sent by fun_map shows up in the events.
inputs = ["raw://discoapi"]
job = Disco(sys.argv[1]).new_job(name="test_discoapi",
                                 input=inputs,
                                 map=fun_map)

r = list(result_iterator(job.wait()))
if r != [("discoapi", "")]:
    raise Exception("Invalid result: <%s> " % r)

n = job.jobspec()["name"]
if not n.startswith("test_discoapi"):
    raise Exception("Invalid jobspec: Expected name prefix test_discoapi, got %s" % n)

# The third field of every event is the text that is searched for the markers.
events = [ev[2] for offs, ev in job.events()]
for i in range(3):
    m = "--special_test_string_%d--" % i
    if not any(m in text for text in events):
        raise Exception("Message '%s' not found in events" % m)

job.purge()
示例11: Disco
import sys
from disco.core import Disco
# NOTE(review): OK_STATUS is not referenced by the code visible here.
OK_STATUS = ["job_ready", "job_died"]
# Client for the Disco master given as the first command-line argument.
disco = Disco(sys.argv[1])
def op_show(n, s):
    """Print the job name n; the status s is unused."""
    print(n)
def op_kill(n, s):
    """Kill job n through the module-level disco client, but only when its status s is "job_active"."""
    if s != "job_active":
        return
    print("Killing %s" % (n,))
    disco.kill(n)
def op_clean(n, s):
    """Announce and clean job n through the module-level disco client; the status s is unused."""
    print("Cleaning %s" % (n,))
    disco.clean(n)
def op_purge(n, s):
    """Announce and purge job n through the module-level disco client; the status s is unused."""
    print("Purging %s" % (n,))
    disco.purge(n)
# Apply the operation named by the second command-line argument to every job
# whose name contains the third command-line argument.
for t, s, name in disco.joblist():
    if sys.argv[3] not in name:
        continue
    # The handler is the module-level function called "op_<operation>".
    globals()["op_" + sys.argv[2]](name, s)
示例12: fun_reduce
def fun_reduce(iter, out, params):
    """Emit every (key, value) pair of iter through out.add with "red:" prepended to the key; params is unused."""
    for key, value in iter:
        prefixed_key = "red:" + key
        out.add(prefixed_key, value)
def data_gen(path):
    """Return path without its first character."""
    remainder = path[1:]
    return remainder
# Start the test server, run a job with custom input and output streams, and
# check that every result key is "red:cba" followed by one not-yet-seen input.
tserver.run_server(data_gen)

inputs = ["apple", "orange", "pear"]
job = Disco(sys.argv[1]).new_job(
    name="test_streams",
    input=tserver.makeurl(inputs),
    map=fun_map,
    reduce=fun_reduce,
    nr_reduces=1,
    map_reader=map_reader,
    map_input_stream=[map_input_stream, map_input1, map_input2, map_input3],
    reduce_output_stream=[reduce_output1, reduce_output2])

for k, v in result_iterator(job.wait(),
                            input_stream=[resultiter_input1, map_input_stream]):
    if not k.startswith("red:cba"):
        raise Exception("Invalid prefix in key. Got '%s' expected prefix 'red:cba'" % k)
    # Everything after the seven-character prefix must be a remaining input.
    if k[7:] not in inputs:
        raise Exception("Invalid result '%s'" % k)
    inputs.remove(k[7:])
示例13: Disco
Results,
Query)
from disco.core import Disco
from disco.ddfs import DDFS
from disco.error import DiscoError
from disco.util import ddfs_name, flatten, parse_dir
from discodb import Q
# Values read once from the discodex settings.
discodex_settings = settings.DiscodexSettings()
disco_master_url = discodex_settings["DISCODEX_DISCO_MASTER"]
disco_prefix = discodex_settings["DISCODEX_DISCO_PREFIX"]
index_prefix = discodex_settings["DISCODEX_INDEX_PREFIX"]
purge_file = discodex_settings["DISCODEX_PURGE_FILE"]

# Module-level clients, both created from the same master URL.
disco_master = Disco(disco_master_url)
ddfs = DDFS(disco_master_url)

# Status strings.
NOT_FOUND = "unknown job"
OK = "ready"
ACTIVE = "active"
DEAD = "dead"
class IndexCollection(Collection):
    """Collection that allows GET and POST and hands named requests over to an IndexResource."""

    allowed_methods = ('GET', 'POST')

    def delegate(self, request, *args, **kwargs):
        """Remove 'name' from kwargs and forward the request to the IndexResource of that name."""
        index_name = str(kwargs.pop('name'))
        resource = IndexResource(index_name)
        return resource(request, *args, **kwargs)

    @property
    def names(self):
        """Result of listing the module-level ddfs client with the index prefix."""
        return ddfs.list(index_prefix)
示例14: fun_map
import sys
from disco.core import Disco, result_iterator
def fun_map(e, params):
    """Return a single pair with an empty key and e followed by ":map"; params is unused."""
    tagged_value = e + ":map"
    return [("", tagged_value)]
# Run a map-only job over four raw inputs and check that every expected value
# comes back exactly once.
inputs = ["raw://eeny", "raw://meeny", "raw://miny", "raw://moe"]
job = Disco(sys.argv[1]).new_job(name="test_raw",
                                 input=inputs,
                                 map=fun_map)

# Expected values: each input without its six-character "raw://" prefix, plus ":map".
res = dict((x[6:] + ":map", True) for x in inputs)

for x in result_iterator(job.wait()):
    if x[1] not in res:
        # Raise a real exception: raising a plain string is itself a
        # TypeError on Python 2.6+ and would hide this message.
        raise Exception("Invalid result: <%s> " % x[1])
    del res[x[1]]

if res:
    raise Exception("Invalid number of results %d" %
                    (len(inputs) - len(res)))

job.purge()

print("ok")
示例15: __init__
def __init__(self, spec, verbose=False, **kwargs):
    """Store spec and verbose, build the Docset named by the spec and connect to the local Disco master."""
    # Extra keyword arguments are accepted and ignored.
    self.verbose = verbose
    self.spec = spec
    self.docset = Docset(self.spec.docset_name)
    self.disco = Disco("disco://localhost")