This article collects typical usage examples of vocab.Vocab in Python. If you have been wondering how vocab.Vocab is used in practice, or are simply looking for concrete examples, the curated code samples below may help; they also give a feel for the surrounding vocab module.
The following shows 13 code examples of vocab.Vocab, sorted by popularity by default. Note that in each example vocab is a project-local module of the repository being excerpted, not a single shared package, so the constructor arguments and available methods differ from one example to the next.
Example 1: build_vocabulary
# Module to import: import vocab [as alias]
# Or: from vocab import Vocab [as alias]
def build_vocabulary(datadir, outdir, glove_path):
    """Construct the vocabulary object used throughout."""
    # We're not going to backprop through the word vectors
    # both train and dev words end up in the vocab.
    counter = Counter()
    for split in splits:
        if split == "augmented":
            continue
        datapath = os.path.join(datadir, split + ".json")
        for question, context, _, _ in data_stream(datapath):
            for word in ciseau.tokenize(question, normalize_ascii=False):
                counter[normalize(word)] += 1
            for word in ciseau.tokenize(context, normalize_ascii=False):
                counter[normalize(word)] += 1
    common_words = [UNK, SOS, EOS, PAD] + [w for w, _ in counter.most_common()]
    vocab_path = os.path.join(outdir, "vocab.txt")
    with open(vocab_path, "wt") as handle:
        handle.write("\n".join(common_words))
    return Vocab(outdir)
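The Vocab class itself is not shown in this example; the call Vocab(outdir) only tells us it can be rebuilt from the vocab.txt written above (and, per Example 3 below, that it exposes a size and word_embeddings). As a rough, hypothetical sketch of the interface this implies, assuming the class simply reads the word list back and maps tokens to ids (the names below are illustrative, not the original implementation):

import os

class SimpleVocab:
    """Illustrative stand-in, not the original class: rebuilds the word list from vocab.txt."""

    def __init__(self, path):
        with open(os.path.join(path, "vocab.txt")) as handle:
            self.words = handle.read().split("\n")
        self.word2idx = {word: idx for idx, word in enumerate(self.words)}

    @property
    def size(self):
        return len(self.words)

    def encode(self, tokens):
        # build_vocabulary writes UNK at position 0, so unseen tokens map to id 0 here
        return [self.word2idx.get(token, 0) for token in tokens]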
Example 2: main
# Module to import: import vocab [as alias]
# Or: from vocab import Vocab [as alias]
def main():
    if not os.path.exists(opt.model_save_file):
        os.makedirs(opt.model_save_file)
    vocab = Vocab(opt.emb_filename)
    log.info(f'Loading {opt.dataset} Datasets...')
    log.info(f'Domains: {opt.domains}')
    train_sets, dev_sets, test_sets, unlabeled_sets = {}, {}, {}, {}
    for domain in opt.domains:
        train_sets[domain], dev_sets[domain], test_sets[domain], unlabeled_sets[domain] = \
            get_fdu_mtl_datasets(vocab, opt.fdu_mtl_dir, domain, opt.max_seq_len)
    opt.num_labels = FduMtlDataset.num_labels
    log.info(f'Done Loading {opt.dataset} Datasets.')
    cv = train(vocab, train_sets, dev_sets, test_sets, unlabeled_sets)
    log.info(f'Training done...')
    acc = sum(cv['valid'].values()) / len(cv['valid'])
    log.info(f'Validation Set Domain Average\t{acc}')
    test_acc = sum(cv['test'].values()) / len(cv['test'])
    log.info(f'Test Set Domain Average\t{test_acc}')
    return cv
Example 3: create
# Module to import: import vocab [as alias]
# Or: from vocab import Vocab [as alias]
def create(name, question_layers, document_layers, pick_end_word_layers,
           layer_size, beam_size, embedding_dropout, hidden_dropout,
           learning_rate, anneal_every, anneal_rate, clip_norm,
           l2_scale, weight_noise, replace, vocab_path, seed):
    """Create a new QA model."""
    vocab = Vocab(vocab_path)
    random.seed(seed)
    config = gnr.ModelConfig(
        vocab_size=vocab.size,
        question_layers=question_layers,
        document_layers=document_layers,
        pick_end_word_layers=pick_end_word_layers,
        layer_size=layer_size,
        beam_size=beam_size,
        embedding_dropout_prob=embedding_dropout,
        hidden_dropout_prob=hidden_dropout,
        learning_rate=learning_rate,
        anneal_every=anneal_every,
        anneal_rate=anneal_rate,
        clip_norm=clip_norm,
        l2_scale=l2_scale,
        weight_noise=weight_noise)
    create_model("gnr", name, config, gnr, vocab.word_embeddings, replace)
Example 4: predict
# Module to import: import vocab [as alias]
# Or: from vocab import Vocab [as alias]
def predict(name, data, vocab_path, output, batch_size):
    """Generate predictions for a trained model."""
    with session_with_model("gnr", name) as (session, qa_model):
        vocab = Vocab(vocab_path)
        eval_batches = []
        current_batch = []
        for question, context, _, qa_id in featurize.data_stream(data):
            current_batch.append((question, context, qa_id))
            if len(current_batch) == batch_size:
                eval_batches.append(current_batch)
                current_batch = []
        if len(current_batch) > 0:
            eval_batches.append(current_batch)
        predictions = {}
        for idx, batch in enumerate(eval_batches):
            questions, contexts, ids = list(zip(*batch))
            preds = evaluate_batch(
                session, qa_model, zip(questions, contexts), vocab)
            for i, pred in enumerate(preds):
                predictions[ids[i]] = pred
        with open(output, "wt") as handle:
            json.dump(predictions, handle)
        # run the evaluation code
        subprocess.check_call(["python", "evaluate.py", data, output])
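As a side note, the manual batching above (append items until batch_size accumulate, flush, then keep the final partial batch) can be factored into a small generic helper. A sketch under that reading of the loop; the helper name chunked is ours, not something from the repo:

from itertools import islice

def chunked(iterable, size):
    """Yield consecutive lists of up to `size` items, keeping the final partial chunk."""
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, size))
        if not chunk:
            return
        yield chunk

# Roughly equivalent to the batching loop above:
# stream = ((q, c, qa_id) for q, c, _, qa_id in featurize.data_stream(data))
# eval_batches = list(chunked(stream, batch_size))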
Example 5: prepare
# Module to import: import vocab [as alias]
# Or: from vocab import Vocab [as alias]
def prepare(args):
    """
    Checks the data, creates the directories, and prepares the vocabulary and embeddings.
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)
    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files, args.test_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)
    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filtering {} tokens, the final vocab size is {}'.format(filtered_num, vocab.size()))
    logger.info('Assigning embeddings...')
    vocab.randomly_init_embeddings(args.embed_size)
    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)
    logger.info('Done with preparing!')
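The pickled vocab.data produced here is what the training and evaluation scripts later load back. A minimal sketch of the round trip, assuming only what the snippet itself shows (pickle serialization and the size() accessor); note that unpickling needs the same vocab module importable, and the directory below is a placeholder for args.vocab_dir:

import os
import pickle

vocab_dir = 'data/vocab'  # placeholder for args.vocab_dir used above
with open(os.path.join(vocab_dir, 'vocab.data'), 'rb') as fin:
    vocab = pickle.load(fin)
print('Loaded vocab with {} tokens'.format(vocab.size()))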
Example 6: prepare
# Module to import: import vocab [as alias]
# Or: from vocab import Vocab [as alias]
def prepare(logger, args):
    """
    Checks the data, creates the directories, and prepares the vocabulary and embeddings.
    """
    logger.info('Checking the data files...')
    for data_path in args.trainset + args.devset + args.testset:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)
    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.save_dir, args.result_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.trainset, args.devset, args.testset)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)
    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filtering {} tokens, the final vocab size is {}'.format(filtered_num, vocab.size()))
    logger.info('Assigning embeddings...')
    vocab.randomly_init_embeddings(args.embed_size)
    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)
    logger.info('Done with preparing!')
Example 7: load_data
# Module to import: import vocab [as alias]
# Or: from vocab import Vocab [as alias]
def load_data(small=True, char_based=False, batch_size=20, vocab_size=10000,
              history_len=5, max_tokens=50, null_mark=False):
    vocab_path = os.path.join(resource_dir, "ptb.train.txt")
    valid_path = os.path.join(resource_dir, "ptb.valid.txt")
    if small:
        train_path = os.path.join(resource_dir, "ptb.train.10k.txt")
    else:
        train_path = os.path.join(resource_dir, "ptb.train.txt")
    vocab = Vocab(char_based=char_based, null_mark=null_mark)
    vocab.load(vocab_path, max_size=vocab_size)
    lmdata = LMDataset(vocab, train_path, valid_path, history_len=-1,
                       char_based=char_based, max_tokens=max_tokens)
    batch = BunchSequences(lmdata, batch_size=batch_size, fragment_length=history_len)
    return vocab, batch
Example 8: load_word_vectors
# Module to import: import vocab [as alias]
# Or: from vocab import Vocab [as alias]
def load_word_vectors(path):
    if os.path.isfile(path + '.pth') and os.path.isfile(path + '.vocab'):
        print('==> File found, loading to memory')
        vectors = torch.load(path + '.pth')
        vocab = Vocab(filename=path + '.vocab')
        return vocab, vectors
    # saved file not found, read from txt file
    # and create tensors for word vectors
    print('==> File not found, preparing, be patient')
    count = sum(1 for line in open(path + '.txt'))
    with open(path + '.txt', 'r') as f:
        contents = f.readline().rstrip('\n').split(' ')
        dim = len(contents[1:])
    words = [None] * count
    vectors = torch.zeros(count, dim)
    with open(path + '.txt', 'r') as f:
        idx = 0
        for line in f:
            contents = line.rstrip('\n').split(' ')
            words[idx] = contents[0]
            vectors[idx] = torch.Tensor(list(map(float, contents[1:])))
            idx += 1
    with open(path + '.vocab', 'w') as f:
        for word in words:
            f.write(word + '\n')
    vocab = Vocab(filename=path + '.vocab')
    torch.save(vectors, path + '.pth')
    return vocab, vectors
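The vectors tensor returned above lines up row-for-row with the words written to the .vocab file, so it can seed an embedding layer directly. A small usage sketch, assuming PyTorch and an illustrative path prefix (the GloVe file name is an assumption, not fixed by the snippet):

import torch
import torch.nn as nn

vocab, vectors = load_word_vectors('data/glove/glove.840B.300d')  # expects data/glove/glove.840B.300d.txt
embedding = nn.Embedding(vectors.size(0), vectors.size(1))
embedding.weight.data.copy_(vectors)      # initialise with the pretrained vectors
embedding.weight.requires_grad = False    # freeze them if they should not be fine-tuned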
Example 9: load_word_vectors
# Module to import: import vocab [as alias]
# Or: from vocab import Vocab [as alias]
def load_word_vectors(path):
    if os.path.isfile(path+'.pth') and os.path.isfile(path+'.vocab'):
        print('==> File found, loading to memory')
        vectors = torch.load(path+'.pth')
        vocab = Vocab(filename=path+'.vocab')
        return vocab, vectors
    # saved file not found, read from txt file
    # and create tensors for word vectors
    print('==> File not found, preparing, be patient')
    count = sum(1 for line in open(path+'.txt'))
    with open(path+'.txt', 'r') as f:
        contents = f.readline().rstrip('\n').split(' ')
        dim = len(contents[1:])
    words = [None] * count
    vectors = torch.zeros(count, dim)
    with open(path+'.txt', 'r') as f:
        idx = 0
        for line in f:
            contents = line.rstrip('\n').split(' ')
            words[idx] = contents[0]
            # vectors[idx] = torch.Tensor(map(float, contents[1:]))
            vectors[idx] = torch.Tensor(list(map(float, contents[1:])))
            idx += 1
    with open(path+'.vocab', 'w') as f:
        for word in words:
            f.write(word+'\n')
    vocab = Vocab(filename=path+'.vocab')
    torch.save(vectors, path+'.pth')
    return vocab, vectors
Example 10: main
# Module to import: import vocab [as alias]
# Or: from vocab import Vocab [as alias]
def main():
    if not os.path.exists(opt.model_save_file):
        os.makedirs(opt.model_save_file)
    log.info('Running the S-MAN + P-MoE + C-MoE model...')
    vocabs = {}
    tag_vocab = TagVocab()
    assert opt.use_wordemb or opt.use_charemb, "At least one of word or char embeddings must be used!"
    char_vocab = Vocab(opt.charemb_size)
    log.info(f'Loading Datasets...')
    log.info(f'Languages {opt.langs}')
    log.info('Loading Embeddings...')
    train_sets, dev_sets, test_sets, unlabeled_sets = {}, {}, {}, {}
    for lang in opt.all_langs:
        log.info(f'Building Vocab for {lang}...')
        vocabs[lang] = Vocab(opt.emb_size, opt.emb_filenames[lang])
        assert not opt.train_on_translation or not opt.test_on_translation
        if opt.dataset.lower() == 'conll':
            get_dataset_fn = get_conll_ner_datasets
            if opt.train_on_translation:
                get_dataset_fn = get_train_on_translation_conll_ner_datasets
            if opt.test_on_translation:
                get_dataset_fn = get_test_on_translation_conll_ner_datasets
            train_sets[lang], dev_sets[lang], test_sets[lang], unlabeled_sets[lang] = \
                get_dataset_fn(vocabs[lang], char_vocab, tag_vocab, opt.conll_dir, lang)
        else:
            raise Exception(f"Unknown dataset {opt.dataset}")
    opt.num_labels = len(tag_vocab)
    log.info(f'Tagset: {tag_vocab.id2tag}')
    log.info(f'Done Loading Datasets.')
    cv = train(vocabs, char_vocab, tag_vocab, train_sets, dev_sets, test_sets, unlabeled_sets)
    log.info(f'Training done...')
    acc = sum(cv['valid'].values()) / len(cv['valid'])
    log.info(f'Validation Set Domain Average\t{acc}')
    test_acc = sum(cv['test'].values()) / len(cv['test'])
    log.info(f'Test Set Domain Average\t{test_acc}')
    return cv
Example 11: main
# Module to import: import vocab [as alias]
# Or: from vocab import Vocab [as alias]
def main():
    if not os.path.exists(opt.model_save_file):
        os.makedirs(opt.model_save_file)
    log.info('Running the S-MAN + P-MoE + C-MoE model...')
    vocabs = {}
    assert opt.use_wordemb or opt.use_charemb, "At least one of word or char embeddings must be used!"
    char_vocab = Vocab(opt.charemb_size) if opt.use_charemb else None
    log.info(f'Loading Datasets...')
    log.info(f'Domain: {opt.domain}')
    log.info(f'Languages {opt.langs}')
    log.info('Loading Embeddings...')
    train_sets, dev_sets, test_sets, unlabeled_sets = {}, {}, {}, {}
    for lang in opt.all_langs:
        log.info(f'Building Vocab for {lang}...')
        vocabs[lang] = Vocab(opt.emb_size, opt.emb_filenames[lang])
        assert not opt.train_on_translation or not opt.test_on_translation
        train_sets[lang], dev_sets[lang], test_sets[lang], unlabeled_sets[lang] = \
            get_multi_lingual_amazon_datasets(vocabs[lang], char_vocab, opt.amazon_dir,
                                              opt.domain, lang, opt.max_seq_len)
    opt.num_labels = MultiLangAmazonDataset.num_labels
    log.info(f'Done Loading Datasets.')
    cv = train(vocabs, char_vocab, train_sets, dev_sets, test_sets, unlabeled_sets)
    log.info(f'Training done...')
    acc = sum(cv['valid'].values()) / len(cv['valid'])
    log.info(f'Validation Set Domain Average\t{acc}')
    test_acc = sum(cv['test'].values()) / len(cv['test'])
    log.info(f'Test Set Domain Average\t{test_acc}')
    return cv
Example 12: prepro
# Module to import: import vocab [as alias]
# Or: from vocab import Vocab [as alias]
def prepro(args):
    logger = logging.getLogger("QANet")
    logger.info("====== preprocessing ======")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)
    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
    logger.info('Building vocabulary...')
    dataloader = DataLoader(args.max_p_num, args.max_p_len, args.max_q_len, args.max_ch_len,
                            args.train_files, args.dev_files, args.test_files)
    vocab = Vocab(lower=True)
    for word in dataloader.word_iter('train'):
        vocab.add_word(word)
        [vocab.add_char(ch) for ch in word]
    unfiltered_vocab_size = vocab.word_size()
    vocab.filter_words_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.word_size()
    logger.info('After filtering {} words, the final word vocab size is {}, char size is {}'.format(
        filtered_num, vocab.word_size(), vocab.char_size()))
    unfiltered_vocab_char_size = vocab.char_size()
    vocab.filter_chars_by_cnt(min_cnt=2)
    filtered_char_num = unfiltered_vocab_char_size - vocab.char_size()
    logger.info('After filtering {} chars, the final char vocab size is {}'.format(
        filtered_char_num, vocab.char_size()))
    logger.info('Assigning embeddings...')
    if args.pretrained_word_path is not None:
        vocab.load_pretrained_word_embeddings(args.pretrained_word_path)
    else:
        vocab.randomly_init_word_embeddings(args.word_embed_size)
    if args.pretrained_char_path is not None:
        vocab.load_pretrained_char_embeddings(args.pretrained_char_path)
    else:
        vocab.randomly_init_char_embeddings(args.char_embed_size)
    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)
    logger.info('====== Done with preparing! ======')
Example 13: prepare
# Module to import: import vocab [as alias]
# Or: from vocab import Vocab [as alias]
def prepare(args):
    logger = logging.getLogger('SegEDU')
    logger.info('Randomly sample 10% of the training data for validation...')
    raw_train_dir = os.path.join(args.rst_dir, 'TRAINING')
    raw_dev_dir = os.path.join(args.rst_dir, 'DEV')
    if not os.path.exists(raw_dev_dir):
        os.makedirs(raw_dev_dir)
    raw_train_doc_ids = [file.split('.')[0] for file in os.listdir(raw_train_dir) if file.endswith('.out')]
    random.shuffle(raw_train_doc_ids)
    dev_doc_ids = raw_train_doc_ids[: int(len(raw_train_doc_ids) * 0.1)]
    for doc_id in dev_doc_ids:
        p = subprocess.call('mv {}/{}* {}'.format(raw_train_dir, doc_id, raw_dev_dir), shell=True)
    preprocessed_train_dir = os.path.join(args.rst_dir, 'preprocessed/train/')
    preprocessed_dev_dir = os.path.join(args.rst_dir, 'preprocessed/dev/')
    preprocessed_test_dir = os.path.join(args.rst_dir, 'preprocessed/test/')
    logger.info('Preprocessing Train data...')
    preprocess_rst_data(os.path.join(args.rst_dir, 'TRAINING'), preprocessed_train_dir)
    logger.info('Preprocessing Dev data...')
    preprocess_rst_data(os.path.join(args.rst_dir, 'DEV'), preprocessed_dev_dir)
    logger.info('Preprocessing Test data...')
    preprocess_rst_data(os.path.join(args.rst_dir, 'TEST'), preprocessed_test_dir)
    # logger.info('Building Vocab...')
    # train_files = [os.path.join(preprocessed_train_dir, filename)
    #                for filename in sorted(os.listdir(preprocessed_train_dir)) if filename.endswith('.preprocessed')]
    # dev_files = [os.path.join(preprocessed_dev_dir, filename)
    #              for filename in sorted(os.listdir(preprocessed_dev_dir)) if filename.endswith('.preprocessed')]
    # test_files = [os.path.join(preprocessed_test_dir, filename)
    #               for filename in sorted(os.listdir(preprocessed_test_dir)) if filename.endswith('.preprocessed')]
    # rst_data = RSTData(train_files=train_files, dev_files=dev_files, test_files=test_files)
    # word_vocab = Vocab(lower=False)
    # for word in rst_data.gen_all_words():
    #     word_vocab.add(word)
    #
    # logger.info('Loading pretrained embeddings for words...')
    # if args.word_embed_path:
    #     word_vocab.load_pretrained_embeddings(args.word_embed_path)
    # else:
    #     word_vocab.embed_dim = args.word_embed_size
    #
    # logger.info('Saving vocab...')
    # if not os.path.exists(os.path.dirname(args.word_vocab_path)):
    #     os.makedirs(os.path.dirname(args.word_vocab_path))
    # with open(args.word_vocab_path, 'wb') as fout:
    #     pickle.dump(word_vocab, fout)