本文整理汇总了Python中smart_open.smart_open方法的典型用法代码示例。如果您正苦于以下问题:Python smart_open.smart_open方法的具体用法?Python smart_open.smart_open怎么用?Python smart_open.smart_open使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块smart_open的用法示例。
在下文中一共展示了smart_open.smart_open方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __iter__
# 需要导入模块: import smart_open [as 别名]
# 或者: from smart_open import smart_open [as 别名]
def __iter__(self):
    """Stream relations from ``self.file_path`` as pairs of unicode strings.

    Every CSV row must unpack into exactly two values, and the two values
    must differ.

    Yields
    ------
    2-tuple (unicode, unicode)
        Relation from the input file, emitted as ``(v, u)`` when
        ``self.reverse`` is truthy and as ``(u, v)`` otherwise.

    """
    is_py2 = sys.version_info[0] < 3
    with smart_open(self.file_path) as stream:
        # csv.reader requires bytestring input in python2, unicode input in python3
        if is_py2:
            text_lines = stream
        else:
            text_lines = (raw.decode(self.encoding) for raw in stream)
        for fields in csv.reader(text_lines, delimiter=self.delimiter):
            if is_py2:
                fields = [field.decode(self.encoding) for field in fields]
            u, v = tuple(fields)
            assert u != v
            # Flip the pair when the opposite edge direction was requested.
            yield (v, u) if self.reverse else (u, v)
示例2: _get_file
# 需要导入模块: import smart_open [as 别名]
# 或者: from smart_open import smart_open [as 别名]
def _get_file(self):
    """Return the file to write to, rolling over to a new one when needed.

    A new file is started when no file is open yet or when
    ``self.bytes_written`` has reached ``self.max_file_size``; the previous
    file, if any, is closed first.

    Raises:
        ValueError: if the output path is a URI and ``smart_open`` is None.
    """
    must_roll_over = (
        not self.cur_file or self.bytes_written >= self.max_file_size)
    if not must_roll_over:
        return self.cur_file

    if self.cur_file:
        self.cur_file.close()

    stamp = datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
    file_name = "output-{}_worker-{}_{}.json".format(
        stamp, self.ioctx.worker_index, self.file_index)
    path = os.path.join(self.path, file_name)

    if not self.path_is_uri:
        self.cur_file = open(path, "w")
    elif smart_open is None:
        raise ValueError(
            "You must install the `smart_open` module to write "
            "to URIs like {}".format(path))
    else:
        self.cur_file = smart_open(path, "w")

    self.file_index += 1
    self.bytes_written = 0
    logger.info("Writing to new output file {}".format(self.cur_file))
    return self.cur_file
示例3: _next_line
# 需要导入模块: import smart_open [as 别名]
# 或者: from smart_open import smart_open [as 别名]
def _next_line(self):
    """Return the next non-empty line, switching files when one runs dry.

    Opens a file via ``self._next_file()`` if none is current. When the
    current file yields an empty line, it is closed and another file is
    tried, up to 100 times.

    Raises:
        ValueError: if no line could be read after all attempts.
    """
    if not self.cur_file:
        self.cur_file = self._next_file()
    line = self.cur_file.readline()

    # Bounded retry: at most 100 switches to another file.
    for _ in range(100):
        if line:
            break
        if hasattr(self.cur_file, "close"):  # legacy smart_open impls
            self.cur_file.close()
        self.cur_file = self._next_file()
        line = self.cur_file.readline()
        if not line:
            logger.debug("Ignoring empty file {}".format(self.cur_file))

    if line:
        return line
    raise ValueError("Failed to read next line from files: {}".format(
        self.files))
示例4: fit
# 需要导入模块: import smart_open [as 别名]
# 或者: from smart_open import smart_open [as 别名]
def fit(self, data):
    """Load the GloVe vectors for ``self.dimensions`` onto this encoder.

    The word -> vector table and its inverse are cached on the ``Glove``
    class, so the embedding file is downloaded and parsed only while that
    cache is empty. ``data`` is accepted but not read here.
    """
    require(lore.dependencies.SMART_OPEN)
    from smart_open import smart_open

    with timer('fit %s' % self.name, logging.DEBUG):
        self.missing_value = numpy.asarray([0.0] * self.dimensions, dtype=numpy.float32)

        if not Glove.map:
            path = os.path.join('encoders', 'glove.6B.%dd.txt.gz' % self.dimensions)
            local = lore.io.download(path)

            # Parse into locals and publish to the class only on success, so
            # a failure midway cannot leave a truncated table cached.
            vectors = {}
            words_by_vector = {}
            # The context manager closes the handle; the bare loop leaked it.
            with smart_open(local) as embedding_file:
                for line in embedding_file:
                    values = line.split()
                    word = values[0]
                    parameters = numpy.asarray(values[1:], dtype=numpy.float32)
                    vectors[word] = parameters
                    words_by_vector[tuple(parameters.tolist())] = word

            Glove.map = vectors
            Glove.inverse = words_by_vector

        self.map = Glove.map
        self.inverse = Glove.inverse
示例5: __init__
# 需要导入模块: import smart_open [as 别名]
# 或者: from smart_open import smart_open [as 别名]
def __init__(self, args):
    """Set up the vocabulary tables and the optional set of embedding words.

    When ``args.create_vocab`` is falsy the token/index tables are read from
    ``tok2ind.json`` and ``ind2tok.json`` under ``args.vocab_dir``; otherwise
    they start out holding only the NULL and UNK entries.

    When both ``args.pretrained_words`` and ``args.embedding_file`` are set,
    ``self.valid_words`` becomes the set of normalized first tokens of each
    line of the embedding file; otherwise it is None.
    """
    self.args = args
    if not args.create_vocab:
        logger.info('[ Reading vocab files from {}]'.format(args.vocab_dir))
        # Use context managers so the vocab files are closed right after
        # parsing instead of leaking the handles.
        with open(args.vocab_dir+'tok2ind.json') as tok2ind_file:
            self.tok2ind = json.load(tok2ind_file)
        with open(args.vocab_dir+'ind2tok.json') as ind2tok_file:
            self.ind2tok = json.load(ind2tok_file)
        # NOTE(review): JSON object keys are always strings, so ind2tok
        # loaded here is keyed by str while the table created below is keyed
        # by int -- confirm callers cope with both.
    else:
        self.tok2ind = {self.NULL: 0, self.UNK: 1}
        self.ind2tok = {0: self.NULL, 1: self.UNK}
    self.oov_words = {}

    # Index words in embedding file
    if args.pretrained_words and args.embedding_file:
        logger.info('[ Indexing words in embedding file... ]')
        self.valid_words = set()
        with smart_open(args.embedding_file) as f:
            for line in f:
                # The word is everything before the first space on the line.
                w = self.normalize(line.decode('utf-8').rstrip().split(' ')[0])
                self.valid_words.add(w)
        logger.info('[ Num words in set = %d ]' % len(self.valid_words))
    else:
        self.valid_words = None
示例6: smart_open
# 需要导入模块: import smart_open [as 别名]
# 或者: from smart_open import smart_open [as 别名]
def smart_open(fname, mode='rb'):
    """Open ``fname`` with an opener chosen by its file extension.

    ``.bz2`` files go through ``BZ2File`` and ``.gz`` files through
    ``GzipFile`` (each wrapped by ``make_closing``); anything else uses the
    built-in ``open``. The compression modules are imported only when needed.
    """
    extension = os.path.splitext(fname)[1]
    if extension == '.bz2':
        from bz2 import BZ2File
        opener = make_closing(BZ2File)
    elif extension == '.gz':
        from gzip import GzipFile
        opener = make_closing(GzipFile)
    else:
        opener = open
    return opener(fname, mode)
# noinspection PyUnresolvedReferences
示例7: glove2word2vec
# 需要导入模块: import smart_open [as 别名]
# 或者: from smart_open import smart_open [as 别名]
def glove2word2vec(glove_vector_file, output_model_file):
    """Convert GloVe vectors into word2vec C format.

    Writes ``output_model_file`` as a "<num_vectors> <num_dims>" header line
    followed by the unchanged contents of ``glove_vector_file``, then loads
    the result with gensim as a demo and logs a few similarity queries.

    Returns the path of the written model file.
    """

    def get_info(glove_file_name):
        """Return the number of vectors and dimensions in a file in GloVe format."""
        # Single pass: the first line gives the dimensionality, the remaining
        # lines only need to be counted.
        with smart_open.smart_open(glove_file_name) as f:
            first_line = f.readline()
            num_dims = len(first_line.split()) - 1
            num_lines = (1 if first_line else 0) + sum(1 for _ in f)
        return num_lines, num_dims

    def prepend_line(infile, outfile, line):
        """Copy ``infile`` to ``outfile`` with ``line`` inserted as the first line."""
        # The output is opened in binary mode, so the header must be bytes;
        # writing a str here raises TypeError on Python 3.
        header = (str(line.strip()) + "\n").encode('utf-8')
        with smart_open.smart_open(infile, 'rb') as old:
            with smart_open.smart_open(outfile, 'wb') as new:
                new.write(header)
                for existing_line in old:
                    new.write(existing_line)
        return outfile

    num_lines, dims = get_info(glove_vector_file)

    logger.info('%d lines with %s dimensions' % (num_lines, dims))

    gensim_first_line = "{} {}".format(num_lines, dims)
    model_file = prepend_line(glove_vector_file, output_model_file, gensim_first_line)

    logger.info('Model %s successfully created !!'%output_model_file)

    # Demo: Loads the newly created glove_model.txt into gensim API.
    model = gensim.models.Word2Vec.load_word2vec_format(model_file, binary=False) #GloVe Model

    logger.info('Most similar to king are: %s' % model.most_similar(positive=['king'], topn=10))
    logger.info('Similarity score between woman and man is %s ' % model.similarity('woman', 'man'))

    # NOTE(review): `program` is not defined in this function; presumably a
    # module-level name set by the script entry point -- confirm it exists
    # when this function is imported and called from elsewhere.
    logger.info("Finished running %s", program)

    return model_file
示例8: smart_open
# 需要导入模块: import smart_open [as 别名]
# 或者: from smart_open import smart_open [as 别名]
def smart_open(fname, mode='rb'):
    """Open ``fname`` with an opener chosen by its file extension.

    ``.bz2`` files go through ``BZ2File`` and ``.gz`` files through
    ``GzipFile`` (each wrapped by ``make_closing``); anything else uses the
    built-in ``open``. The compression modules are imported only when needed.
    """
    extension = os.path.splitext(fname)[1]
    if extension == '.bz2':
        from bz2 import BZ2File
        opener = make_closing(BZ2File)
    elif extension == '.gz':
        from gzip import GzipFile
        opener = make_closing(GzipFile)
    else:
        opener = open
    return opener(fname, mode)
示例9: file_or_filename
# 需要导入模块: import smart_open [as 别名]
# 或者: from smart_open import smart_open [as 别名]
def file_or_filename(input):
    """
    Yield a file-like object ready to be read from the beginning. `input` is either
    a filename (gz/bz2 also supported) or a file-like object supporting seek.

    A filename is opened with `smart_open`; this function does not close the
    handle it opened.
    """
    if not isinstance(input, string_types):
        # input already a file-like object; just reset to the beginning
        input.seek(0)
        stream = input
    else:
        # input was a filename: open as file
        stream = smart_open(input)
    yield stream
示例10: write
# 需要导入模块: import smart_open [as 别名]
# 或者: from smart_open import smart_open [as 别名]
def write(self, sample_batch):
    """Serialize ``sample_batch`` to JSON and append it as one line of output.

    The record is written to the file returned by ``self._get_file()``,
    followed by a newline, and ``self.bytes_written`` grows by the length of
    the serialized record.
    """
    began = time.time()
    payload = _to_json(sample_batch, self.compress_columns)
    out = self._get_file()
    out.write(payload)
    out.write("\n")
    if hasattr(out, "flush"):  # legacy smart_open impls
        out.flush()
    payload_size = len(payload)
    self.bytes_written += payload_size
    elapsed = time.time() - began
    logger.debug("Wrote {} bytes to {} in {}s".format(
        payload_size, out, elapsed))
示例11: _next_file
# 需要导入模块: import smart_open [as 别名]
# 或者: from smart_open import smart_open [as 别名]
def _next_file(self):
    """Open a randomly chosen entry of ``self.files`` for reading.

    Local paths are opened with the built-in ``open``; anything that parses
    with a real URI scheme is opened with ``smart_open``.

    Raises:
        ValueError: if the chosen path is a URI and ``smart_open`` is None.
    """
    path = random.choice(self.files)
    scheme = urlparse(path).scheme
    # An empty scheme is a plain path. A one-letter "scheme" is a Windows
    # drive letter ("c:\\data", "d:\\data", ...), not a URI, so every drive
    # is treated as local rather than only "c".
    if len(scheme) <= 1:
        return open(path, "r")
    if smart_open is None:
        raise ValueError(
            "You must install the `smart_open` module to read "
            "from URIs like {}".format(path))
    return smart_open(path, "r")
示例12: _from_json
# 需要导入模块: import smart_open [as 别名]
# 或者: from smart_open import smart_open [as 别名]
def _from_json(batch):
    """Decode one JSON record into a ``SampleBatch`` or ``MultiAgentBatch``.

    ``batch`` may be ``str`` or UTF-8 encoded ``bytes``. The record's "type"
    field selects which batch class is built; every column value is passed
    through ``unpack_if_needed``.

    Raises:
        ValueError: if the "type" field is missing or has an unknown value.
    """
    if isinstance(batch, bytes):  # smart_open S3 doesn't respect "r"
        batch = batch.decode("utf-8")
    data = json.loads(batch)

    if "type" not in data:
        raise ValueError("JSON record missing 'type' field")
    data_type = data.pop("type")

    if data_type == "SampleBatch":
        return SampleBatch(
            {key: unpack_if_needed(value) for key, value in data.items()})

    if data_type == "MultiAgentBatch":
        policy_batches = {
            policy_id: SampleBatch({
                key: unpack_if_needed(value)
                for key, value in policy_batch.items()
            })
            for policy_id, policy_batch in data["policy_batches"].items()
        }
        return MultiAgentBatch(policy_batches, data["count"])

    raise ValueError(
        "Type field must be one of ['SampleBatch', 'MultiAgentBatch']",
        data_type)
示例13: main
# 需要导入模块: import smart_open [as 别名]
# 或者: from smart_open import smart_open [as 别名]
def main():
    """Run the randomness analysis on the input file and save the SVG output.

    Reads the parsed command-line arguments, runs ``RandomnessGenerator`` on
    the opened ``diversity_file``, saves the visualization to ``svg_file``
    and prints the number of over-represented pixels.
    """
    args = parse_arguments()
    # open file
    file_handler = smart_open(args.diversity_file)
    try:
        # create analysis object
        default_class = RandomnessGenerator(file_handler)
        default_class.run()
        default_class.viz.save(filename=args.svg_file)
        print(f"{default_class.viz.over_called_pixels} pixels overrepresented.")
    finally:
        # Release the input handle even if the analysis fails; it was
        # previously never closed.
        file_handler.close()
示例14: load_embeddings
# 需要导入模块: import smart_open [as 别名]
# 或者: from smart_open import smart_open [as 别名]
def load_embeddings(args, word_dict):
    """Return the embedding table for ``word_dict``, building it if not cached.

    If ``args.embedding_table`` exists on disk it is loaded and returned.
    Otherwise a ``len(word_dict) x args.embedding_dim_orig`` table is filled
    with N(0, 1) noise (row 0 zeroed), rows of words found in
    ``args.embedding_file`` are overwritten with their vectors, and the
    result is saved to ``args.embedding_table``.
    """
    if os.path.isfile(args.embedding_table):
        logger.info('Loading embeddings from saved embeddings table')
        return torch.load(args.embedding_table)

    # Allocate only on the build path; the cached path above used to create
    # (and immediately discard) a full vocabulary-sized tensor.
    embeddings = torch.Tensor(len(word_dict), args.embedding_dim_orig)
    logger.info("Initializing embedding table randomly...")
    embeddings.normal_(0, 1)
    embeddings[0].fill_(0)

    # Fill in embeddings
    with smart_open(args.embedding_file) as f:
        for line in f:
            line = line.decode('utf-8')
            # Each line is "<word> <v1> ... <vN>" separated by single spaces.
            parsed = line.rstrip().split(' ')
            assert (len(parsed) == args.embedding_dim_orig + 1)
            w = word_dict.normalize(parsed[0])
            if w in word_dict:
                vec = torch.Tensor([float(i) for i in parsed[1:]])
                embeddings[word_dict[w]].copy_(vec)

    # save the embedding table
    logger.info('Saving the embedding table')
    torch.save(embeddings, args.embedding_table)
    return embeddings
#
# ------------------------------------------------------------------------------
# Utility classes
# ------------------------------------------------------------------------------
示例15: _parse
# 需要导入模块: import smart_open [as 别名]
# 或者: from smart_open import smart_open [as 别名]
def _parse(self, path: Union[Path, str]) -> None:
"""Read the CSV sample sheet at ``path`` and load its contents into this object.

The file is opened with ``self._encoding`` and read fully before any row
is processed. Rows are then dispatched on the most recently seen section
header: ``Reads`` rows are appended to ``self.Reads`` as integers,
``Data`` rows become samples (the first ``Data`` row is the column
header), and rows of any other section with at least two cells are stored
as key/value pairs on that section.

Raises:
    ValueError: if a row contains a character outside ``VALID_ASCII``, or
        if the ``Data`` header row contains an empty field.
"""
# Name of the section currently being read; empty until a header is seen.
section_name: str = ''
# Column names of the [Data] section; None until its first row is read.
sample_header: Optional[List[str]] = None
with open(path, encoding=self._encoding) as handle:
# Materialize every row so the file can be closed before processing.
lines = list(csv.reader(handle, skipinitialspace=True))
for i, line in enumerate(lines):
# Skip to next line if this line is empty to support formats of
# sample sheets with multiple newlines as section separators.
#
# https://github.com/clintval/sample-sheet/issues/46
#
if not ''.join(line).strip():
continue
# Raise exception if we encounter invalid characters.
if any(
character not in VALID_ASCII
for character in set(''.join(line))
):
raise ValueError(
f'Sample sheet contains invalid characters on line '
f'{i + 1}: {"".join(line)}'
)
header_match = self._section_header_re.match(line[0])
# If we enter a section, save its name and continue to next line.
if header_match:
section_name, *_ = header_match.groups()
if (
section_name not in self._sections
and section_name not in REQUIRED_SECTIONS
):
self.add_section(section_name)
continue
# [Reads] - vertical list of integers.
if section_name == 'Reads':
self.Reads.append(int(line[0]))
continue
# [Data] - delimited data with the first line a header.
elif section_name == 'Data':
if sample_header is not None:
self.add_sample(Sample(dict(zip(sample_header, line))))
elif any(key == '' for key in line):
raise ValueError(
f'Header for [Data] section is not allowed to '
f'have empty fields: {line}'
)
else:
sample_header = line
continue
# [<Other>] - keys in first column and values in second column.
elif len(line) >= 2:
key, value = (line[0], line[1])
section: Section = getattr(self, section_name)
section[key] = value