This article collects typical usage examples of the Python function pydoop.hdfs.open. If you have been wondering how to use open, what it does, or what real calls look like in practice, the hand-picked code samples below should help.
A total of 15 code examples of the open function are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps surface better Python code samples.
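Before diving into the examples, here is a minimal round-trip sketch of pydoop.hdfs.open itself. It is only an illustration: it assumes a reachable HDFS instance and a Pydoop version that accepts explicit text modes ("wt"/"rt", as used in Example 8 below), and the path is made up.

import pydoop.hdfs as hdfs

path = "/user/me/example.txt"  # hypothetical path: adjust for your cluster
data = "hello from pydoop\n"

with hdfs.open(path, "wt") as f:  # open an HDFS file for writing, text mode
    f.write(data)

with hdfs.open(path, "rt") as f:  # reopen it for reading, text mode
    assert f.read() == data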
Example 1: open
def open(self):
    for test_path in self.hdfs_paths[0], self.local_paths[0]:
        with hdfs.open(test_path, "w") as f:
            f.write(self.data)
        f.fs.close()
        with hdfs.open(test_path) as f:
            self.assertEqual(f.read(), self.data)
        f.fs.close()
Example 2: dump
def dump(self):
    for test_path in self.hdfs_paths[0], self.local_paths[0]:
        hdfs.dump(self.data, test_path)
        with hdfs.open(test_path) as fi:
            rdata = fi.read()
        fi.fs.close()
        self.assertEqual(rdata, self.data)
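Example 2 exercises hdfs.dump, the one-shot write helper that wraps hdfs.open internally. Its read counterpart is hdfs.load; a small hedged sketch follows (same made-up path as above; the return type may be bytes or str depending on the Pydoop version and mode defaults):

import pydoop.hdfs as hdfs

hdfs.dump("some text\n", "/user/me/example.txt")  # write the whole payload in one call
content = hdfs.load("/user/me/example.txt")       # read the whole file back in one call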
Example 3: xml_from_hdfs
def xml_from_hdfs(url):
    with hdfs.open(url, "r") as f:
        lines = f.read().strip().split('\n')
    docs, doc = [], None
    for line in lines:
        if line.startswith('<doc'):
            doc = line
        elif line.startswith('</doc>'):
            docs.append(doc + line)
        else:
            # line = line.replace('&', '').replace('"', "'")
            doc += line.replace('"', "'")
    for doc in docs:
        dom = bs(doc).find('doc')
        doc = {}
        try:
            doc['id'] = dom.attrs['id']
            doc['url'] = dom.attrs['url']
            doc['title'] = dom.attrs['title']
        except AttributeError:
            continue
        doc['content'] = dom.text
        doc['md5'] = hashlib.md5(str(doc)).hexdigest()
        yield doc
Example 4: map
def map(self, ctx):
    p = BioImgPlane(ctx.value)
    pixels = p.get_xy()
    bn = '%s-z%04d-c%04d-t%04d.npy' % (p.name, p.z, p.c, p.t)
    fn = hdfs.path.join(self.out_dir, p.name, bn)
    with hdfs.open(fn, 'w') as fo:
        np.save(fo, pixels)
    ctx.emit(fn, '%s\t%s' % (p.dimension_order, pixels.shape))
Example 5: __init__
def __init__(self, ctx):
    super(AvroReader, self).__init__(ctx)
    isplit = ctx.input_split
    self.region_start = isplit.offset
    self.region_end = isplit.offset + isplit.length
    self.reader = SeekableDataFileReader(hdfs.open(isplit.filename),
                                         DatumReader())
    self.reader.align_after(isplit.offset)
Example 6: put
def put(self):
    src = hdfs.path.split(self.local_paths[0])[-1]
    dest = self.hdfs_paths[0]
    with open(src, "w") as f:
        f.write(self.data)
    hdfs.put(src, dest)
    with hdfs.open(dest) as fi:
        rdata = fi.read()
    self.assertEqual(rdata, self.data)
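Example 6 uploads a local file to HDFS with hdfs.put and then verifies it through hdfs.open. The opposite direction is handled by hdfs.get, which copies an HDFS file to the local filesystem; a brief sketch with made-up paths:

import pydoop.hdfs as hdfs

hdfs.put("report.txt", "/user/me/report.txt")       # local -> HDFS
hdfs.get("/user/me/report.txt", "report_copy.txt")  # HDFS -> local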
Example 7: __init__
def __init__(self, context):
    super(Reader, self).__init__()
    self.isplit = pp.InputSplit(context.getInputSplit())
    self.file = hdfs.open(self.isplit.filename)
    self.file.seek(self.isplit.offset)
    self.bytes_read = 0
    if self.isplit.offset > 0:
        discarded = self.file.readline()  # read by reader of previous split
        self.bytes_read += len(discarded)
Example 8: __init__
def __init__(self, context):
    super(Writer, self).__init__(context)
    self.logger = LOGGER.getChild("Writer")
    jc = context.job_conf
    outfn = context.get_default_work_file()
    self.logger.info("writing to %s", outfn)
    hdfs_user = jc.get("pydoop.hdfs.user", None)
    self.sep = jc.get("mapreduce.output.textoutputformat.separator", "\t")
    self.file = hdfs.open(outfn, "wt", user=hdfs_user)
Example 9: __init__
def __init__(self, context):
    super(Writer, self).__init__(context)
    self.logger = logging.getLogger("Writer")
    jc = context.getJobConf()
    jc_configure_int(self, jc, "mapred.task.partition", "part")
    jc_configure(self, jc, "mapred.work.output.dir", "outdir")
    jc_configure(self, jc, "mapred.textoutputformat.separator", "sep", "\t")
    jc_configure(self, jc, "pydoop.hdfs.user", "hdfs_user", None)
    self.outfn = "%s/part-%05d" % (self.outdir, self.part)
    self.file = hdfs.open(self.outfn, "w", user=self.hdfs_user)
Example 10: json_from_hdfs
def json_from_hdfs(url):
    assert hdfs.path.isdir(url)
    file_lists = hdfs.ls(url)
    for fi in file_lists:
        with hdfs.open(fi, "r") as f:
            items = f.read().strip().split('\n')
        for it in items:
            it = loads(it)
            it['md5'] = hashlib.md5(str(it)).hexdigest()
            yield it
Example 11: __init__
def __init__(self, context):
    super(Writer, self).__init__(context)
    self.logger = LOGGER.getChild("Writer")
    jc = context.job_conf
    part = jc.get_int("mapred.task.partition")
    out_dir = jc["mapred.work.output.dir"]
    outfn = "%s/part-%05d" % (out_dir, part)
    hdfs_user = jc.get("pydoop.hdfs.user", None)
    self.file = hdfs.open(outfn, "w", user=hdfs_user)
    self.sep = jc.get("mapred.textoutputformat.separator", "\t")
Example 12: _choose_break_points
def _choose_break_points(cls, args):
    n_records, n_breakpoints, path = args
    block_size = n_records * RECORD_LENGTH
    with hdfs.open(path, 'r') as f:
        data = f.read(block_size)
    assert len(data) == block_size
    step = max(n_records // n_breakpoints, 1)
    keys = sorted([data[k:k + KEY_LENGTH]
                   for k in range(0, block_size, RECORD_LENGTH)])
    return [_ for _ in it.islice(keys, step, n_records, step)]
Example 13: processLine
def processLine(myfile, topic):
    with hdfs.open(myfile["name"]) as handle:
        for i, line in enumerate(handle):
            # strip line
            line = line.strip()
            # Submit data (my function)
            submitLine(topic, line, trials=3)
            if i % 20000 == 0 and i != 0:
                logger.info("%s lines submitted for %s" % (i, myfile["name"]))
Example 14: __init__
def __init__(self, context):
    super(AvroWriter, self).__init__(context)
    self.logger = LOGGER.getChild('AvroWriter')
    job_conf = context.job_conf
    part = int(job_conf['mapreduce.task.partition'])
    outdir = job_conf["mapreduce.task.output.dir"]
    outfn = "%s/part-r-%05d.avro" % (outdir, part)
    wh = hdfs.open(outfn, "w")
    self.logger.debug('created hdfs file %s', outfn)
    self.writer = DataFileWriter(wh, DatumWriter(), self.schema)
    self.logger.debug('opened AvroWriter')
Example 15: __init__
def __init__(self, context):
    super(Writer, self).__init__(context)
    self.logger = LOGGER.getChild("Writer")
    jc = context.job_conf
    part = jc.get_int("mapred.task.partition")
    out_dir = jc["mapred.work.output.dir"]
    self.logger.debug("part: %d", part)
    self.logger.debug("outdir: %s", out_dir)
    outfn = "%s/part-%05d" % (out_dir, part)
    hdfs_user = jc.get("pydoop.hdfs.user", None)
    self.file = hdfs.open(outfn, "wb", user=hdfs_user)