This article collects typical code examples of the word2vec.load method in Python. If you are unsure what word2vec.load does or how to call it, the curated examples below should help; you can also explore the rest of the word2vec module for further usage examples.
The sections below show 15 code examples of word2vec.load, ordered by popularity.
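Before the examples, a minimal sketch of the word2vec.load API that all of them rely on (the model path here is a placeholder; kind='bin' selects the binary format):

import word2vec

# Hypothetical model path; any word2vec binary or text model behaves the same way.
model = word2vec.load('zhihu-word2vec.bin-100', kind='bin')
print(model.vocab[:5])           # numpy array of vocabulary words
print(model.vectors.shape)       # (vocab_size, embedding_size)
print(model.vocab_hash['w111'])  # word -> row index (assuming 'w111' is in the vocabulary)
print(model['w111'][:5])         # embedding vector for a word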
Example 1: transfer
# Required import: import word2vec [as alias]
# Or: from word2vec import load [as alias]
def transfer(model_path, embedding_size):
    start_time = time.time()
    model = word2vec.load(model_path)
    word2id_dic = {}
    init_0 = [0.0 for i in range(embedding_size)]
    id2vec_dic = [init_0]
    for i in range(len(model.vocab)):
        id = i + 1
        word2id_dic[model.vocab[i]] = id
        id2vec_dic.append(model[model.vocab[i]].tolist())
    end_time = time.time()
    print('word-to-id and id-to-vector conversion finished')
    print(end_time - start_time)
    return word2id_dic, id2vec_dic
# save the result to a JSON file
Developer ID: yuhaitao1994, Project: AIchallenger2018_MachineReadingComprehension, Lines of code: 18, Source file: data_process_aug.py
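A minimal usage sketch for the "save to a JSON file" step mentioned above (the model path, embedding size, and output file names are placeholders):

import json

word2id_dic, id2vec_dic = transfer('zhihu-word2vec.bin-100', 100)
with open('word2id.json', 'w', encoding='utf-8') as f:
    json.dump(word2id_dic, f, ensure_ascii=False)
with open('id2vec.json', 'w', encoding='utf-8') as f:
    json.dump(id2vec_dic, f)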
Example 2: main
# Required import: import word2vec [as alias]
# Or: from word2vec import load [as alias]
def main(em_file, em_result):
    '''
    embedding -> numpy
    '''
    em = word2vec.load(em_file)
    vec = em.vectors
    word2id = em.vocab_hash
    # d = dict(vector = vec, word2id = word2id)
    # t.save(d, em_result)
    np.savez_compressed(em_result, vector=vec, word2id=word2id)
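A minimal sketch of reading the saved archive back (assuming NumPy; the file name is hypothetical; word2id is stored as a 0-d object array, hence .item() and allow_pickle=True):

import numpy as np

data = np.load('embedding.npz', allow_pickle=True)  # hypothetical output path of main()
vectors = data['vector']
word2id = data['word2id'].item()  # recover the dict
print(vectors.shape, len(word2id))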
Example 3: emb2npz
# Required import: import word2vec [as alias]
# Or: from word2vec import load [as alias]
def emb2npz(emb_file_path, emb_dict_path):
    """Convert a txt-format embedding into dict form and add <PAD> and <UNK>."""
    emb = word2vec.load(emb_file_path)
    vec = emb.vectors
    word2id = emb.vocab_hash
    word2id['<PAD>'] = len(word2id)
    pad_row = [0] * vec.shape[1]
    vec = np.row_stack((vec, pad_row))
    np.savez_compressed(emb_dict_path, vec=vec, word2id=word2id)
    print('word size: {}'.format(len(word2id)))
    print('emb shape: {}'.format(vec.shape))
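The docstring mentions <UNK>, but the body only appends <PAD>. A hedged sketch of the analogous <UNK> handling that could be added inside emb2npz before savez_compressed (a zero row is used here; a random or mean vector is also common):

word2id['<UNK>'] = len(word2id)
unk_row = [0] * vec.shape[1]   # placeholder vector for unknown words
vec = np.row_stack((vec, unk_row))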
Example 4: data2npz
# Required import: import word2vec [as alias]
# Or: from word2vec import load [as alias]
def data2npz(src_path, dst_path):
    """src_path txt: label+\t+title+\t+content
    e.g.: 40,6 w6061,w26959,w109 w23255,w728,w12768,w58588,w11,w1442,w855,w36791"""
    data = np.load(conf.emb_path)
    word2id = data['word2id'].item()
    del data
    labels = []
    titles = []
    contents = []
    with open(src_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            label, title, content = line.replace('\n', '').split('\t')
            label = [int(lab) for lab in label.split(',')]
            label_mat = np.zeros(conf.n_classes, dtype='int32')
            label_mat[label] = 1
            labels.append(label_mat)
            # word2id
            title = [word2id[word if word in word2id else '</s>'] for word in title.split(',') if word.rstrip()]
            content = [word2id[word if word in word2id else '</s>'] for word in content.split(',') if word.rstrip()]
            # padding
            titles.append(padding(title, conf.title_seq_len, pad=word2id['<PAD>']))
            contents.append(padding(content, conf.content_seq_len, pad=word2id['<PAD>']))
    print('data size: {}'.format(len(labels)))
    np.savez_compressed(dst_path, label=labels, title=titles, content=contents)
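Example 4 calls a padding helper that is not shown in the snippet; a minimal sketch of what it might look like (hypothetical, assuming truncate-or-pad to a fixed length):

def padding(ids, seq_len, pad=0):
    # Hypothetical helper: truncate to seq_len, or right-pad with the pad id.
    return ids[:seq_len] + [pad] * max(0, seq_len - len(ids))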
Example 5: load_word2vec
# Required import: import word2vec [as alias]
# Or: from word2vec import load [as alias]
def load_word2vec(self, word2vec_file):
    model = word2vec.load(word2vec_file)
    embedding_matrix = []
    for word in self.vocab:
        try:
            embedding_matrix.append(model[word])
        except KeyError:
            # print(word)
            embedding_matrix.append(model['the'])
    embedding_matrix = np.array(embedding_matrix)
    return embedding_matrix
Example 6: transfer
# Required import: import word2vec [as alias]
# Or: from word2vec import load [as alias]
def transfer(model_path, embedding_size):
    start_time = time.time()
    model = word2vec.load(model_path)
    word2id_dic = {}
    init_0 = [0.0 for i in range(embedding_size)]
    id2vec_dic = [init_0]
    for i in range(len(model.vocab)):
        id = i + 1
        word2id_dic[model.vocab[i]] = id
        id2vec_dic.append(model[model.vocab[i]].tolist())
    end_time = time.time()
    print('word-to-id and id-to-vector conversion finished')
    print(end_time - start_time)
    return word2id_dic, id2vec_dic
Example 7: __init__
# Required import: import word2vec [as alias]
# Or: from word2vec import load [as alias]
def __init__(self, file_path):
    # w2v_file = os.path.join(base_path, "vectors_poem.bin")
    self.model = word2vec.load(file_path)
    self.add_word('<unknown>')
    self.add_word('<pad>')
    # self.vocab_size = len(self.model.vocab)
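The add_word method is not part of the snippet above; a hedged sketch of one possible implementation that registers an extra token with an all-zero embedding (hypothetical, assuming numpy is imported as np and that mutating the loaded model's vocab, vocab_hash, and vectors in place is acceptable):

def add_word(self, word):
    # Hypothetical: append the new token and a matching zero vector.
    if word not in self.model.vocab_hash:
        self.model.vocab_hash[word] = len(self.model.vocab)
        self.model.vocab = np.append(self.model.vocab, word)
        zero_row = np.zeros((1, self.model.vectors.shape[1]))
        self.model.vectors = np.vstack([self.model.vectors, zero_row])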
Example 8: __init__
# Required import: import word2vec [as alias]
# Or: from word2vec import load [as alias]
def __init__(self, vector_path):
    self.vector_path = vector_path
    if vector_path.endswith('.csv'):
        self.vectors = pd.read_csv(vector_path).set_index('pfam_id')
    elif vector_path.endswith('.tsv'):
        self.vectors = pd.read_csv(vector_path, sep='\t').set_index('pfam_id')
    elif vector_path.endswith('.pkl') or vector_path.endswith('.pickle'):
        self.vectors = pd.read_pickle(vector_path)
    elif vector_path.endswith('.bin'):
        import word2vec
        model = word2vec.load(vector_path, kind='bin')
        self.vectors = pd.DataFrame(model.vectors, index=model.vocab)
    else:
        raise ValueError("File type {} not supported for Pfam2Vec, use .csv, .pkl, .pickle or .bin".format(vector_path))
    vectors_min = self.vectors.min()
    too_low_idx = vectors_min < -1
    cols_too_low = list(self.vectors.columns[too_low_idx])
    if cols_too_low:
        raise ValueError('Pfam2vec vectors should be >= -1, got {} in {}'.format(list(vectors_min[too_low_idx]), cols_too_low))
    vectors_max = self.vectors.max()
    too_high_idx = vectors_max > 1
    cols_too_high = list(self.vectors.columns[too_high_idx])
    if cols_too_high:
        raise ValueError('Pfam2vec vectors should be <= 1, got {} in {}'.format(list(vectors_max[too_high_idx]), cols_too_high))
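A minimal usage sketch (the class name Pfam2Vec is taken from the error messages above; the file path and the Pfam accession are placeholders):

pfam2vec = Pfam2Vec('pfam2vec.bin')      # hypothetical path to a binary word2vec model
print(pfam2vec.vectors.shape)            # one row per Pfam domain
print(pfam2vec.vectors.loc['PF00001'])   # hypothetical Pfam accession present in the model vocabulary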
Example 9: create_voabulary
# Required import: import word2vec [as alias]
# Or: from word2vec import load [as alias]
def create_voabulary(simple=None, word2vec_model_path='../zhihu-word2vec-title-desc.bin-100', name_scope=''):  # zhihu-word2vec-multilabel.bin-100
    cache_path = '../cache_vocabulary_label_pik/' + name_scope + "_word_voabulary.pik"
    print("cache_path:", cache_path, "file_exists:", os.path.exists(cache_path))
    if os.path.exists(cache_path):  # if the cache file exists, load it directly
        with open(cache_path, 'rb') as data_f:  # binary mode, as required by pickle
            vocabulary_word2index, vocabulary_index2word = pickle.load(data_f)
        return vocabulary_word2index, vocabulary_index2word
    else:
        vocabulary_word2index = {}
        vocabulary_index2word = {}
        if simple is not None:
            word2vec_model_path = '../zhihu-word2vec.bin-100'
        print("create vocabulary. word2vec_model_path:", word2vec_model_path)
        model = word2vec.load(word2vec_model_path, kind='bin')
        vocabulary_word2index['PAD_ID'] = 0
        vocabulary_index2word[0] = 'PAD_ID'
        special_index = 0
        if 'biLstmTextRelation' in name_scope:
            vocabulary_word2index['EOS'] = 1  # a special token for the biLstmTextRelation model, used between two sentences.
            vocabulary_index2word[1] = 'EOS'
            special_index = 1
        for i, vocab in enumerate(model.vocab):
            vocabulary_word2index[vocab] = i + 1 + special_index
            vocabulary_index2word[i + 1 + special_index] = vocab
        # save to the file system if the vocabulary does not exist yet.
        if not os.path.exists(cache_path):  # if it does not exist, write it to the cache file
            with open(cache_path, 'wb') as data_f:  # binary mode, as required by pickle
                pickle.dump((vocabulary_word2index, vocabulary_index2word), data_f)
        return vocabulary_word2index, vocabulary_index2word
# create vocabulary of labels. labels are sorted: 1 is high frequency, 2 is low frequency.
Example 10: create_voabulary_labelO
# Required import: import word2vec [as alias]
# Or: from word2vec import load [as alias]
def create_voabulary_labelO():
    model = word2vec.load('zhihu-word2vec-multilabel.bin-100', kind='bin')  # zhihu-word2vec.bin-100
    count = 0
    vocabulary_word2index_label = {}
    vocabulary_index2word_label = {}
    label_unique = {}
    for i, vocab in enumerate(model.vocab):
        if '__label__' in vocab:  # '__label__-2051131023989903826
            label = vocab[vocab.index('__label__') + len('__label__'):]
            if label_unique.get(label, None) is None:  # if the label has not been seen before, keep it in the dict
                vocabulary_word2index_label[label] = count
                vocabulary_index2word_label[count] = label  # ADD
                count = count + 1
                label_unique[label] = label
    return vocabulary_word2index_label, vocabulary_index2word_label
Example 11: load_data
# Required import: import word2vec [as alias]
# Or: from word2vec import load [as alias]
def load_data(vocabulary_word2index, vocabulary_word2index_label, valid_portion=0.05, max_training_data=1000000, training_data_path='train-zhihu4-only-title-all.txt'):  # n_words=100000,
    """
    input: a file path
    :return: train, test, valid. where train=(trainX, trainY). where
    trainX: is a list of lists, each inner list representing a sentence. trainY: is a list of labels, each label a number
    """
    # 1. load the zhihu data from file
    # example: "w305 w6651 w3974 w1005 w54 w109 w110 w3974 w29 w25 w1513 w3645 w6 w111 __label__-400525901828896492"
    print("load_data.started...")
    zhihu_f = codecs.open(training_data_path, 'r', 'utf8')  # -zhihu4-only-title.txt
    lines = zhihu_f.readlines()
    # 2. transform X into indices
    # 3. transform y into a scalar
    X = []
    Y = []
    for i, line in enumerate(lines):
        x, y = line.split('__label__')  # x='w17314 w5521 w7729 w767 w10147 w111'
        y = y.replace('\n', '')
        x = x.replace("\t", ' EOS ').strip()
        if i < 5:
            print("x0:", x)  # raw x
        # x_ = process_one_sentence_to_get_ui_bi_tri_gram(x)
        # if i < 5:
        #     print("x1:", x_)
        x = x.split(" ")
        x = [vocabulary_word2index.get(e, 0) for e in x]  # if the word cannot be found, use index 0 (equal to PAD_ID = 0).
        if i < 5:
            print("x1:", x)  # word to index
        y = vocabulary_word2index_label[y]  # np.abs(hash(y))
        X.append(x)
        Y.append(y)
    # 4. split into train, test and valid data
    number_examples = len(X)
    print("number_examples:", number_examples)
    train = (X[0:int((1 - valid_portion) * number_examples)], Y[0:int((1 - valid_portion) * number_examples)])
    test = (X[int((1 - valid_portion) * number_examples) + 1:], Y[int((1 - valid_portion) * number_examples) + 1:])
    # 5. return
    print("load_data.ended...")
    return train, test, test
# convert a sentence into a string of (unigram, bigram, trigram) tokens
Example 12: assign_pretrained_word_embedding
# Required import: import word2vec [as alias]
# Or: from word2vec import load [as alias]
def assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size, model, word2vec_model_path=None):
    print("using pre-trained word embedding. started. word2vec_model_path:", word2vec_model_path)
    # word2vecc = word2vec.load('word_embedding.txt')  # load the vocab-vector file. word2vecc['w91874']
    word2vec_model = word2vec.load(word2vec_model_path, kind='bin')
    word2vec_dict = {}
    for word, vector in zip(word2vec_model.vocab, word2vec_model.vectors):
        word2vec_dict[word] = vector
    word_embedding_2dlist = [[]] * vocab_size  # create an empty word_embedding list.
    word_embedding_2dlist[0] = np.zeros(FLAGS.embed_size)  # assign an all-zero vector to the first word: 'PAD'
    bound = np.sqrt(6.0) / np.sqrt(vocab_size)  # bound for the random values.
    count_exist = 0
    count_not_exist = 0
    for i in range(1, vocab_size):  # loop over each word
        word = vocabulary_index2word[i]  # get a word
        embedding = None
        try:
            embedding = word2vec_dict[word]  # try to get its vector: it is an array.
        except Exception:
            embedding = None
        if embedding is not None:  # the word has a pre-trained embedding
            word_embedding_2dlist[i] = embedding
            count_exist = count_exist + 1  # assign the array to this word.
        else:  # no embedding for this word
            word_embedding_2dlist[i] = np.random.uniform(-bound, bound, FLAGS.embed_size)
            count_not_exist = count_not_exist + 1  # initialize a random value for the word.
    word_embedding_final = np.array(word_embedding_2dlist)  # convert to a 2d array.
    word_embedding = tf.constant(word_embedding_final, dtype=tf.float32)  # convert to a tensor
    t_assign_embedding = tf.assign(model.Embedding, word_embedding)  # assign this value to the embedding variable of our model.
    sess.run(t_assign_embedding)
    print("words with an embedding:", count_exist, "; words without an embedding:", count_not_exist)
    print("using pre-trained word embedding. ended...")
# run validation on the validation set and report loss and accuracy
Example 13: create_voabulary
# Required import: import word2vec [as alias]
# Or: from word2vec import load [as alias]
def create_voabulary(simple=None, word2vec_model_path='zhihu-word2vec-title-desc.bin-100', name_scope=''):
    cache_path = 'cache_vocabulary_label_pik/' + name_scope + "_word_voabulary.pik"
    print("cache_path:", cache_path, "file_exists:", os.path.exists(cache_path))
    if os.path.exists(cache_path):  # if the cache exists, load it; otherwise create it.
        with open(cache_path, 'rb') as data_f:  # binary mode, as required by pickle
            vocabulary_word2index, vocabulary_index2word = pickle.load(data_f)
        return vocabulary_word2index, vocabulary_index2word
    else:
        vocabulary_word2index = {}
        vocabulary_index2word = {}
        if simple is not None:
            word2vec_model_path = 'zhihu-word2vec.bin-100'
        print("create vocabulary. word2vec_model_path:", word2vec_model_path)
        model = word2vec.load(word2vec_model_path, kind='bin')
        vocabulary_word2index['PAD_ID'] = 0
        vocabulary_index2word[0] = 'PAD_ID'
        special_index = 0
        if 'biLstmTextRelation' in name_scope:
            vocabulary_word2index['EOS'] = 1  # a special token for the biLstmTextRelation model, used between two sentences.
            vocabulary_index2word[1] = 'EOS'
            special_index = 1
        for i, vocab in enumerate(model.vocab):
            vocabulary_word2index[vocab] = i + 1 + special_index
            vocabulary_index2word[i + 1 + special_index] = vocab
        # save to the file system if the vocabulary does not exist yet.
        if not os.path.exists(cache_path):  # if it does not exist, write it to the cache file
            with open(cache_path, 'wb') as data_f:  # binary mode, as required by pickle
                pickle.dump((vocabulary_word2index, vocabulary_index2word), data_f)
        return vocabulary_word2index, vocabulary_index2word
# create vocabulary of labels. labels are sorted: 1 is high frequency, 2 is low frequency.
Example 14: process_one_sentence_to_get_ui_bi_tri_gram
# Required import: import word2vec [as alias]
# Or: from word2vec import load [as alias]
def process_one_sentence_to_get_ui_bi_tri_gram(sentence, n_gram=3):
    """
    :param sentence: string. example: 'w17314 w5521 w7729 w767 w10147 w111'
    :param n_gram:
    :return: string. example: 'w17314 w17314w5521 w17314w5521w7729 w5521 w5521w7729 w5521w7729w767 w7729 w7729w767 w7729w767w10147 w767 w767w10147 w767w10147w111 w10147 w10147w111 w111'
    """
    result = []
    word_list = sentence.split(" ")  # [sentence[i] for i in range(len(sentence))]
    unigram = ''; bigram = ''; trigram = ''; fourgram = ''
    length_sentence = len(word_list)
    for i, word in enumerate(word_list):
        unigram = word  # uni-gram
        word_i = unigram
        if n_gram >= 2 and i + 2 <= length_sentence:  # bi-gram
            bigram = "".join(word_list[i:i + 2])
            word_i = word_i + ' ' + bigram
        if n_gram >= 3 and i + 3 <= length_sentence:  # tri-gram
            trigram = "".join(word_list[i:i + 3])
            word_i = word_i + ' ' + trigram
        if n_gram >= 4 and i + 4 <= length_sentence:  # four-gram
            fourgram = "".join(word_list[i:i + 4])
            word_i = word_i + ' ' + fourgram
        if n_gram >= 5 and i + 5 <= length_sentence:  # five-gram
            fivegram = "".join(word_list[i:i + 5])
            word_i = word_i + ' ' + fivegram
        result.append(word_i)
    result = " ".join(result)
    return result
# load data where each sample carries multiple labels
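A quick usage sketch reproducing the docstring example of Example 14:

s = 'w17314 w5521 w7729 w767 w10147 w111'
print(process_one_sentence_to_get_ui_bi_tri_gram(s, n_gram=3))
# -> 'w17314 w17314w5521 w17314w5521w7729 w5521 w5521w7729 ... w10147 w10147w111 w111'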
Example 15: assign_pretrained_word_embedding
# Required import: import word2vec [as alias]
# Or: from word2vec import load [as alias]
def assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size, textCNN, word2vec_model_path=None):
    print("using pre-trained word embedding. started. word2vec_model_path:", word2vec_model_path)
    # word2vecc = word2vec.load('word_embedding.txt')  # load the vocab-vector file. word2vecc['w91874']
    word2vec_model = word2vec.load(word2vec_model_path, kind='bin')
    word2vec_dict = {}
    for word, vector in zip(word2vec_model.vocab, word2vec_model.vectors):
        word2vec_dict[word] = vector
    word_embedding_2dlist = [[]] * vocab_size  # create an empty word_embedding list.
    word_embedding_2dlist[0] = np.zeros(FLAGS.embed_size)  # assign an all-zero vector to the first word: 'PAD'
    bound = np.sqrt(6.0) / np.sqrt(vocab_size)  # bound for the random values.
    count_exist = 0
    count_not_exist = 0
    for i in range(1, vocab_size):  # loop over each word
        word = vocabulary_index2word[i]  # get a word
        embedding = None
        try:
            embedding = word2vec_dict[word]  # try to get its vector: it is an array.
        except Exception:
            embedding = None
        if embedding is not None:  # the word has a pre-trained embedding
            word_embedding_2dlist[i] = embedding
            count_exist = count_exist + 1  # assign the array to this word.
        else:  # no embedding for this word
            word_embedding_2dlist[i] = np.random.uniform(-bound, bound, FLAGS.embed_size)
            count_not_exist = count_not_exist + 1  # initialize a random value for the word.
    word_embedding_final = np.array(word_embedding_2dlist)  # convert to a 2d array.
    word_embedding = tf.constant(word_embedding_final, dtype=tf.float32)  # convert to a tensor
    t_assign_embedding = tf.assign(textCNN.Embedding, word_embedding)  # assign this value to the embedding variable of our model.
    sess.run(t_assign_embedding)
    print("words with an embedding:", count_exist, "; words without an embedding:", count_not_exist)
    print("using pre-trained word embedding. ended...")
# run validation on the validation set and report loss and accuracy