This article collects typical usage examples of Python's gzip.open method. If you have been wondering how exactly gzip.open works, or how to use it, the curated code examples below may help. You can also explore further usage examples from the gzip module itself.
The following presents 15 code examples of gzip.open, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
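Before diving into the examples, here is a minimal sketch of the two most common gzip.open patterns; the data.txt.gz filename is a hypothetical placeholder:

import gzip

# write text into a gzip-compressed file ('wt' = text mode + compression)
with gzip.open("data.txt.gz", "wt", encoding="utf8") as f:
    f.write("hello\n")

# read it back; 'rt' decompresses and decodes transparently
with gzip.open("data.txt.gz", "rt", encoding="utf8") as f:
    for line in f:
        print(line.rstrip())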
Example 1: saveState
# Required module: import gzip [as alias]
# Or: from gzip import open [as alias]
def saveState(dataHub):
    import pickle
    import gzip
    # serialize the dataHub into a gzip-compressed pickle; the with-block closes the file
    with gzip.open(dataHub.args.save_state, "wb") as f:
        pickle.dump(dataHub, f)
    logging.warning("^"*20 + " saving state to pickle and exiting " + "^"*20)
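A matching loader is not shown in the source; a minimal sketch of one (the loadState name and the symmetric file layout are assumptions for illustration):

def loadState(path):
    import pickle
    import gzip
    # read back the gzip-compressed pickle written by saveState
    with gzip.open(path, "rb") as f:
        return pickle.load(f)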
Example 2: main
# Required module: import gzip [as alias]
# Or: from gzip import open [as alias]
def main(_):
    """Runs `text_utils.simplify_nq_example` over all shards of a split.

    Prints simplified examples to a single gzipped file in the same directory
    as the input shards.
    """
    split = os.path.basename(FLAGS.data_dir)
    outpath = os.path.join(FLAGS.data_dir,
                           "simplified-nq-{}.jsonl.gz".format(split))
    with gzip.open(outpath, "wb") as fout:
        num_processed = 0
        start = time.time()
        for inpath in glob.glob(os.path.join(FLAGS.data_dir, "nq-*-??.jsonl.gz")):
            print("Processing {}".format(inpath))
            with gzip.open(inpath, "rb") as fin:
                for l in fin:
                    utf8_in = l.decode("utf8", "strict")
                    utf8_out = json.dumps(
                        text_utils.simplify_nq_example(json.loads(utf8_in))) + u"\n"
                    fout.write(utf8_out.encode("utf8"))
                    num_processed += 1
                    if not num_processed % 100:
                        print("Processed {} examples in {}.".format(
                            num_processed, time.time() - start))
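To consume the resulting simplified-nq-{split}.jsonl.gz file, one gzip.open in text mode is enough; a sketch assuming the train split:

import gzip
import json

# iterate over the gzipped JSONL output one example at a time
with gzip.open("simplified-nq-train.jsonl.gz", "rt", encoding="utf8") as f:
    for line in f:
        example = json.loads(line)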
Example 3: loadW2V
# Required module: import gzip [as alias]
# Or: from gzip import open [as alias]
def loadW2V(self, emb_path, type="bin"):
    print("Loading W2V data...")
    num_keys = 0
    if type == "textgz":
        # this seems faster than gensim non-binary load
        for line in gzip.open(emb_path):
            l = line.strip().split()
            st = l[0].lower()
            self.pre_emb[st] = np.asarray(l[1:])
        num_keys = len(self.pre_emb)
    elif type == "text":  # elif: otherwise the "textgz" case would also fall into the else branch
        # this seems faster than gensim non-binary load
        for line in open(emb_path):
            l = line.strip().split()
            st = l[0].lower()
            self.pre_emb[st] = np.asarray(l[1:])
        num_keys = len(self.pre_emb)
    else:
        self.pre_emb = Word2Vec.load_word2vec_format(emb_path, binary=True)
        self.pre_emb.init_sims(replace=True)
        num_keys = len(self.pre_emb.vocab)
    print("loaded word2vec len ", num_keys)
    gc.collect()
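Note that on Python 3 a bare gzip.open(emb_path) yields bytes lines. A hedged, self-contained variant of the textgz branch using text mode (the function name and float32 dtype are assumptions):

def load_textgz_embeddings(emb_path):
    import gzip
    import numpy as np
    pre_emb = {}
    # 'rt' yields str lines on Python 3; the default 'rb' would yield bytes
    with gzip.open(emb_path, "rt", encoding="utf8") as f:
        for line in f:
            parts = line.strip().split()
            pre_emb[parts[0].lower()] = np.asarray(parts[1:], dtype=np.float32)
    return pre_emb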
Example 4: getTsvData
# Required module: import gzip [as alias]
# Or: from gzip import open [as alias]
def getTsvData(self, filepath):
    print("Loading training data from " + filepath)
    x1 = []
    x2 = []
    y = []
    # positive samples from file
    for line in open(filepath):
        l = line.strip().split("\t")
        if len(l) < 3:  # need two texts plus a label; l[2] is read below
            continue
        # randomly swap the sentence pair to avoid order bias
        if random() > 0.5:
            x1.append(l[0].lower())
            x2.append(l[1].lower())
        else:
            x1.append(l[1].lower())
            x2.append(l[0].lower())
        y.append(int(l[2]))
    return np.asarray(x1), np.asarray(x2), np.asarray(y)
Example 5: dumpValidation
# Required module: import gzip [as alias]
# Or: from gzip import open [as alias]
def dumpValidation(self, x1_text, x2_text, y, shuffled_index, dev_idx, i):
    print("dumping validation " + str(i))
    x1_shuffled = x1_text[shuffled_index]
    x2_shuffled = x2_text[shuffled_index]
    y_shuffled = y[shuffled_index]
    x1_dev = x1_shuffled[dev_idx:]
    x2_dev = x2_shuffled[dev_idx:]
    y_dev = y_shuffled[dev_idx:]
    del x1_shuffled
    del y_shuffled
    with open('validation.txt' + str(i), 'w') as f:
        for text1, text2, label in zip(x1_dev, x2_dev, y_dev):
            f.write(str(label) + "\t" + text1 + "\t" + text2 + "\n")
    # no explicit f.close() needed: the with-statement closes the file
    del x1_dev
    del y_dev
# Data Preparation
# ==================================================
Example 6: assemble_batch
# Required module: import gzip [as alias]
# Or: from gzip import open [as alias]
def assemble_batch(story_fns, num_answer_words, format_spec):
    stories = []
    for sfn in story_fns:
        with gzip.open(sfn, 'rb') as f:
            cvtd_story, _, _, _ = pickle.load(f)
        stories.append(cvtd_story)
    sents, graphs, queries, answers = zip(*stories)
    cvtd_sents = np.array(sents, np.int32)
    cvtd_queries = np.array(queries, np.int32)
    max_ans_len = max(len(a) for a in answers)
    cvtd_answers = np.stack([convert_answer(answer, num_answer_words, format_spec, max_ans_len)
                             for answer in answers])
    num_new_nodes, new_node_strengths, new_node_ids, next_edges = zip(*graphs)
    num_new_nodes = np.stack(num_new_nodes)
    new_node_strengths = np.stack(new_node_strengths)
    new_node_ids = np.stack(new_node_ids)
    next_edges = np.stack(next_edges)
    return cvtd_sents, cvtd_queries, cvtd_answers, num_new_nodes, new_node_strengths, new_node_ids, next_edges
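The story files read here are gzip-compressed pickles; a hypothetical writer producing the format assemble_batch expects (the 4-tuple layout is taken from the unpacking above; save_story and the meta* parameters are invented names):

import gzip
import pickle

def save_story(sfn, cvtd_story, meta1=None, meta2=None, meta3=None):
    # store the 4-tuple that assemble_batch unpacks, gzip-compressed
    with gzip.open(sfn, 'wb') as f:
        pickle.dump((cvtd_story, meta1, meta2, meta3), f)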
Example 7: create_mnist
# Required module: import gzip [as alias]
# Or: from gzip import open [as alias]
def create_mnist(tfrecord_dir, mnist_dir):
    print('Loading MNIST from "%s"' % mnist_dir)
    import gzip
    with gzip.open(os.path.join(mnist_dir, 'train-images-idx3-ubyte.gz'), 'rb') as file:
        images = np.frombuffer(file.read(), np.uint8, offset=16)  # skip 16-byte IDX header
    with gzip.open(os.path.join(mnist_dir, 'train-labels-idx1-ubyte.gz'), 'rb') as file:
        labels = np.frombuffer(file.read(), np.uint8, offset=8)  # skip 8-byte IDX header
    images = images.reshape(-1, 1, 28, 28)
    images = np.pad(images, [(0,0), (0,0), (2,2), (2,2)], 'constant', constant_values=0)
    assert images.shape == (60000, 1, 32, 32) and images.dtype == np.uint8
    assert labels.shape == (60000,) and labels.dtype == np.uint8
    assert np.min(images) == 0 and np.max(images) == 255
    assert np.min(labels) == 0 and np.max(labels) == 9
    onehot = np.zeros((labels.size, np.max(labels) + 1), dtype=np.float32)
    onehot[np.arange(labels.size), labels] = 1.0
    with TFRecordExporter(tfrecord_dir, images.shape[0]) as tfr:
        order = tfr.choose_shuffled_order()
        for idx in range(order.size):
            tfr.add_image(images[order[idx]])
        tfr.add_labels(onehot[order])
#----------------------------------------------------------------------------
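The offset=16 and offset=8 arguments above skip the IDX headers blindly; a hedged sketch that parses the image header instead (assuming the standard IDX layout with big-endian uint32 fields):

import gzip
import struct
import numpy as np

def load_idx_images(path):
    with gzip.open(path, 'rb') as f:
        # IDX image header: magic (2051), count, rows, cols
        magic, num, rows, cols = struct.unpack('>IIII', f.read(16))
        assert magic == 2051, 'not an IDX image file'
        return np.frombuffer(f.read(), np.uint8).reshape(num, rows, cols)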
Example 8: create_mnistrgb
# Required module: import gzip [as alias]
# Or: from gzip import open [as alias]
def create_mnistrgb(tfrecord_dir, mnist_dir, num_images=1000000, random_seed=123):
    print('Loading MNIST from "%s"' % mnist_dir)
    import gzip
    with gzip.open(os.path.join(mnist_dir, 'train-images-idx3-ubyte.gz'), 'rb') as file:
        images = np.frombuffer(file.read(), np.uint8, offset=16)
    images = images.reshape(-1, 28, 28)
    images = np.pad(images, [(0,0), (2,2), (2,2)], 'constant', constant_values=0)
    assert images.shape == (60000, 32, 32) and images.dtype == np.uint8
    assert np.min(images) == 0 and np.max(images) == 255
    with TFRecordExporter(tfrecord_dir, num_images) as tfr:
        rnd = np.random.RandomState(random_seed)
        for idx in range(num_images):
            # stack 3 random digits as the RGB channels of one image
            tfr.add_image(images[rnd.randint(images.shape[0], size=3)])
#----------------------------------------------------------------------------
Example 9: create_cifar100
# Required module: import gzip [as alias]
# Or: from gzip import open [as alias]
def create_cifar100(tfrecord_dir, cifar100_dir):
    print('Loading CIFAR-100 from "%s"' % cifar100_dir)
    import pickle
    with open(os.path.join(cifar100_dir, 'train'), 'rb') as file:
        data = pickle.load(file, encoding='latin1')
    images = data['data'].reshape(-1, 3, 32, 32)
    labels = np.array(data['fine_labels'])
    assert images.shape == (50000, 3, 32, 32) and images.dtype == np.uint8
    assert labels.shape == (50000,) and labels.dtype == np.int32
    assert np.min(images) == 0 and np.max(images) == 255
    assert np.min(labels) == 0 and np.max(labels) == 99
    onehot = np.zeros((labels.size, np.max(labels) + 1), dtype=np.float32)
    onehot[np.arange(labels.size), labels] = 1.0
    with TFRecordExporter(tfrecord_dir, images.shape[0]) as tfr:
        order = tfr.choose_shuffled_order()
        for idx in range(order.size):
            tfr.add_image(images[order[idx]])
        tfr.add_labels(onehot[order])
#----------------------------------------------------------------------------
Example 10: create_celeba
# Required module: import gzip [as alias]
# Or: from gzip import open [as alias]
def create_celeba(tfrecord_dir, celeba_dir, cx=89, cy=121):
    print('Loading CelebA from "%s"' % celeba_dir)
    glob_pattern = os.path.join(celeba_dir, 'img_align_celeba_png', '*.png')
    image_filenames = sorted(glob.glob(glob_pattern))
    expected_images = 202599
    if len(image_filenames) != expected_images:
        error('Expected to find %d images' % expected_images)
    with TFRecordExporter(tfrecord_dir, len(image_filenames)) as tfr:
        order = tfr.choose_shuffled_order()
        for idx in range(order.size):
            img = np.asarray(PIL.Image.open(image_filenames[order[idx]]))
            assert img.shape == (218, 178, 3)
            img = img[cy - 64 : cy + 64, cx - 64 : cx + 64]  # 128x128 crop around (cx, cy)
            img = img.transpose(2, 0, 1)  # HWC => CHW
            tfr.add_image(img)
#----------------------------------------------------------------------------
Example 11: create_gsa_mapping
# Required module: import gzip [as alias]
# Or: from gzip import open [as alias]
def create_gsa_mapping(path, metadata, sample_name, shuffle):
    """
    Creates the binning gold standard/gsa mapping
    """
    to_genome = name_to_genome(metadata)
    gsa_path = os.path.join(path, "anonymous_gsa.fasta")
    count = 0
    if not os.path.exists(gsa_path):
        gsa_path = os.path.join(path, "anonymous_gsa.fasta.gz")  # if zipped
        with gzip.open(gsa_path, 'rt') as gsa:  # 'rt': text mode, so startswith('>') matches str lines on Python 3
            for line in gsa:
                if line.startswith('>'):
                    count += 1
        with gzip.open(gsa_path, 'rt') as gsa:
            gsa_temp = shuffle_anonymize(gsa, path, to_genome, metadata, sample_name, count, shuffle)
    else:
        with open(gsa_path, 'r') as gsa:
            for line in gsa:
                if line.startswith('>'):
                    count += 1
        with open(gsa_path, 'r') as gsa:
            gsa_temp = shuffle_anonymize(gsa, path, to_genome, metadata, sample_name, count, shuffle)
    os.rename(gsa_temp, gsa_path)
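The two nearly identical branches could be collapsed with a small helper that picks gzip.open or open by extension; a sketch (open_maybe_gzip is an invented name for illustration):

import gzip

def open_maybe_gzip(path, mode='rt'):
    # dispatch on extension: gzip.open for .gz files, plain open otherwise
    if path.endswith('.gz'):
        return gzip.open(path, mode)
    return open(path, mode)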
Example 12: read_genomes_list
# Required module: import gzip [as alias]
# Or: from gzip import open [as alias]
def read_genomes_list(genomes_path, additional_file=None):
    genomes_map = {}
    total_genomes = 0
    if additional_file is not None:
        with open(additional_file, 'r') as add:
            for line in add:
                ncbi_id, sci_name, path, novelty = line.strip().split('\t')
                if ncbi_id in genomes_map:
                    genomes_map[ncbi_id][1].append(path)
                else:
                    genomes_map[ncbi_id] = (sci_name, [path], novelty)  # this might not be a http path
                total_genomes += 1
    with open(genomes_path, 'r') as genomes:
        for line in genomes:
            ncbi_id, sci_name, ftp = line.strip().split('\t')
            http = ftp.replace("ftp://", "http://")  # not using ftp address but http (proxies)
            if ncbi_id in genomes_map:
                genomes_map[ncbi_id][1].append(http)
            else:
                genomes_map[ncbi_id] = (sci_name, [http], 'known_strain')  # sci_name is always the same for same taxid (?)
            total_genomes += 1
    return genomes_map, total_genomes
Example 13: download_genome
# Required module: import gzip [as alias]
# Or: from gzip import open [as alias]
def download_genome(genome, out_path):
    genome_path = os.path.join(out_path, "genomes")
    out_name = genome.rstrip().split('/')[-1]
    http_address = os.path.join(genome, out_name + "_genomic.fna.gz")
    opened = urllib2.urlopen(http_address)  # urllib2 is Python 2; see the Python 3 sketch below
    out = os.path.join(genome_path, out_name + ".fa")
    tmp_out = os.path.join(genome_path, out_name + "tmp.fa")
    out_gz = out + ".gz"
    # save the compressed download to disk
    with open(out_gz, 'wb') as outF:
        outF.write(opened.read())
    # decompress it with gzip.open
    with gzip.open(out_gz, 'rb') as gf, open(tmp_out, 'wb') as new_out:
        new_out.write(gf.read())
    os.remove(out_gz)
    split_by_N(tmp_out, out)
    return out
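On Python 3, urllib2 is gone and the intermediate .gz file can be avoided entirely; a hedged equivalent using urllib.request and gzip.decompress (the function name is an assumption):

import gzip
import urllib.request

def download_genome_py3(url, tmp_out):
    # fetch and decompress in memory; no intermediate .gz file on disk
    with urllib.request.urlopen(url) as resp:
        raw = gzip.decompress(resp.read())
    with open(tmp_out, 'wb') as out:
        out.write(raw)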
Example 14: parse_data
# Required module: import gzip [as alias]
# Or: from gzip import open [as alias]
def parse_data(path, dataset, flatten):
    if dataset != 'train' and dataset != 't10k':
        raise NameError('dataset must be train or t10k')
    label_file = os.path.join(path, dataset + '-labels-idx1-ubyte')
    with open(label_file, 'rb') as file:
        _, num = struct.unpack(">II", file.read(8))
        labels = np.fromfile(file, dtype=np.int8)  # int8
    new_labels = np.zeros((num, 10))
    new_labels[np.arange(num), labels] = 1
    img_file = os.path.join(path, dataset + '-images-idx3-ubyte')
    with open(img_file, 'rb') as file:
        _, num, rows, cols = struct.unpack(">IIII", file.read(16))
        imgs = np.fromfile(file, dtype=np.uint8).reshape(num, rows, cols)  # uint8
    imgs = imgs.astype(np.float32) / 255.0
    if flatten:
        imgs = imgs.reshape([num, -1])
    return imgs, new_labels
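MNIST files are frequently distributed gzipped; a hedged variant of the label read that accepts either form (the extension-dispatch rule is an assumption about the local file layout):

import gzip
import struct
import numpy as np

def read_labels(label_file):
    # gzip.open for .gz files, plain open otherwise; read()+frombuffer works for both
    opener = gzip.open if label_file.endswith('.gz') else open
    with opener(label_file, 'rb') as f:
        _, num = struct.unpack(">II", f.read(8))
        labels = np.frombuffer(f.read(), dtype=np.uint8)
    return num, labels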
Example 15: load_json
# Required module: import gzip [as alias]
# Or: from gzip import open [as alias]
def load_json(filename, to='auto'):
    '''
    load_json(filename) yields the object represented by the json file or stream object
    filename. The optional argument to may be set to None to indicate that the JSON data
    should be returned verbatim rather than parsed by neuropythy's denormalize system.
    '''
    from neuropythy.util import denormalize as denorm
    if pimms.is_str(filename):
        # try gzip first; fall back to plain text if the file is not gzip-compressed
        try:
            with gzip.open(filename, 'rt') as fl:
                dat = json.load(fl)
        except Exception:
            with open(filename, 'rt') as fl:
                dat = json.load(fl)
    else:
        dat = json.load(filename)
        filename = '<stream>'
    if to is None: return dat
    elif to == 'auto': return denorm(dat)
    else: raise ValueError('unrecognized to option: %s' % to)
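A symmetric writer is not part of this example; a minimal sketch of one (the save_json name and the gzip-by-extension rule are assumptions, and neuropythy's normalization step is omitted):

import gzip
import json

def save_json(obj, filename):
    # gzip-compress when the target ends in .gz, mirroring load_json's gzip-first fallback
    if filename.endswith('.gz'):
        with gzip.open(filename, 'wt') as f:
            json.dump(obj, f)
    else:
        with open(filename, 'wt') as f:
            json.dump(obj, f)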