This article collects typical usage examples of the Python method gensim.models.FastText. If you are wondering what models.FastText does and how to use it in practice, the curated code examples below may help. You can also explore further usage examples from the gensim.models module.
The following shows 15 code examples of models.FastText, sorted by popularity.
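Before the examples, a minimal self-contained sketch of the method itself may help orient the reader (toy sentences; this sketch and the examples below use the old gensim 3.x keyword names such as `size` and `iter`, which became `vector_size` and `epochs` in gensim 4.x):
from gensim.models import FastText

# Toy corpus: each sentence is a list of tokens.
sentences = [["hello", "world"], ["hello", "gensim"]]

# Train a tiny model (gensim 3.x API).
model = FastText(sentences, size=10, min_count=1, iter=5)

print(model.wv["hello"].shape)  # (10,) - vector for an in-vocabulary word
print(model.wv["helo"].shape)   # (10,) - OOV word, composed from char n-grams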
Example 1: test_average_train_np_ft
# Required import: from gensim import models [as alias]
# Or: from gensim.models import FastText [as alias]
def test_average_train_np_ft(self):
ft = FastText(min_count=1, size=DIM)
ft.build_vocab(SENTENCES)
m = Average(ft)
m.prep.prepare_vectors(
sv=m.sv, total_sentences=len(self.sentences), update=False
)
m._pre_train_calls()
m.wv.vectors = m.wv.vectors_vocab = np.ones_like(m.wv.vectors, dtype=np.float32)
m.wv.vectors_ngrams = np.full_like(m.wv.vectors_ngrams, 2, dtype=np.float32)
mem = m._get_thread_working_mem()
output = train_average_np(m, self.sentences, m.sv.vectors, mem)
self.assertEqual((4, 10), output)
self.assertTrue((1.0 == m.sv[0]).all())
self.assertTrue((1.5 == m.sv[2]).all())
self.assertTrue((2 == m.sv[3]).all())
# "go" -> [1,1...]
# oov: "12345" -> (14 hashes * 2) / 14 = 2
# (2 + 1) / 2 = 1.5
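The arithmetic in these comments can be checked in isolation; a minimal numpy sketch assuming the same setup as the test (vocabulary vectors filled with 1, n-gram vectors filled with 2):
import numpy as np

vocab_vec = np.ones(10, dtype=np.float32)     # in-vocab "go": all ones
oov_vec = np.full(10, 2.0, dtype=np.float32)  # OOV "12345": 14 n-gram vectors of 2 average to 2
sentence_vec = (vocab_vec + oov_vec) / 2      # average over the two tokens
assert np.allclose(sentence_vec, 1.5)         # matches m.sv[2] above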
Example 2: train_emb
# Required import: from gensim import models [as alias]
# Or: from gensim.models import FastText [as alias]
def train_emb(self, sent_words, word2id, id2word, emb_dim, min_count, ft_iters, use_subword, min_n, max_n):
"""因为fasttext的词频筛选策略(>=5),word2id和id2word会发生改变,但是要保持按照词频的排序
:return: emb_mat, word2id, id2word
- emb_mat: np.array [num_entities, emb_dim]
- word2id
- id2word
"""
print("Training fasttext")
model = FastText(sent_words, size=emb_dim, min_count=min_count,
iter=ft_iters, word_ngrams=int(use_subword), min_n=min_n, max_n=max_n)
id2word = [wd for wd in id2word if wd in model.wv.vocab]
word2id = {wd: i for (i, wd) in enumerate(id2word)}
emb_mat = np.zeros((len(id2word), emb_dim))
for i, wd in enumerate(id2word):
emb_mat[i, :] = model.wv[wd]
return emb_mat, word2id, id2word
# clustering
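The frequency filtering that the docstring warns about is easy to reproduce; a small sketch with toy data, using the same gensim 3.x API as the snippet above:
from gensim.models import FastText

sents = [["apple", "banana"], ["apple", "cherry"], ["apple", "banana"]]
m = FastText(sents, size=10, min_count=2, iter=1)
print("apple" in m.wv.vocab)   # True  - count 3 passes min_count=2
print("cherry" in m.wv.vocab)  # False - count 1 is filtered out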
Example 3: train_fasttext
# Required import: from gensim import models [as alias]
# Or: from gensim.models import FastText [as alias]
def train_fasttext(input_file, output_file, skipgram, loss, size, epochs):
"""
train_fasttext(args**) -> Takes the input file, the
output file and the model
hyperparameters as arguments
and trains the model accordingly.
The model is saved at the output location.
Arguments
---------
input_file : Input pre-processed wiki dump
output_file : Output directory to save the model.
skipgram : Layers of the model (0 - CBOW, 1 - Skipgram)
loss : Loss Function (0 - Negative Sampling, 1 - Heirarichal Loss)
size : Embedding size (100 ~ 300)
epochs : Number of epochs
"""
sentence = LineSentence(input_file)
model = FastText(sentence, sg=skipgram, hs=loss, size=size,
alpha=0.05, window=5, min_count=5, min_n=2,
max_n=5, workers=3, iter=epochs)
model.save(output_file)
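A hypothetical invocation of the function above (file names are illustrative only):
# Train a skip-gram model with hierarchical softmax on a pre-processed dump.
train_fasttext("wiki_preprocessed.txt", "fasttext.model",
               skipgram=1, loss=1, size=300, epochs=5)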
Example 4: test_average_train_cy_ft
# Required import: from gensim import models [as alias]
# Or: from gensim.models import FastText [as alias]
def test_average_train_cy_ft(self):
ft = FastText(min_count=1, size=DIM)
ft.build_vocab(SENTENCES)
m = Average(ft)
m.prep.prepare_vectors(
sv=m.sv, total_sentences=len(self.sentences), update=False
)
m._pre_train_calls()
m.wv.vectors = m.wv.vectors_vocab = np.ones_like(m.wv.vectors, dtype=np.float32)
m.wv.vectors_ngrams = np.full_like(m.wv.vectors_ngrams, 2, dtype=np.float32)
mem = m._get_thread_working_mem()
from fse.models.average_inner import train_average_cy
output = train_average_cy(m, self.sentences, m.sv.vectors, mem)
self.assertEqual((4, 10), output)
self.assertTrue((1.0 + EPS == m.sv[0]).all())
self.assertTrue(np.allclose(1.5, m.sv[2]))
self.assertTrue(np.allclose(2, m.sv[3]))
Example 5: test_map_all_vectors_to_disk
# Required import: from gensim import models [as alias]
# Or: from gensim.models import FastText [as alias]
def test_map_all_vectors_to_disk(self):
ft = FastText(min_count=1, size=5)
ft.build_vocab(SENTENCES)
p = Path("fse/test/test_data/test_emb")
p_vecs = Path("fse/test/test_data/test_emb_wv.vectors")
p_ngrams = Path("fse/test/test_data/test_emb_ngrams.vectors")
p_vocab = Path("fse/test/test_data/test_emb_vocab.vectors")
se = BaseSentence2VecModel(ft, wv_mapfile_path=str(p))
self.assertTrue(p_vecs.exists())
self.assertTrue(p_ngrams.exists())
self.assertTrue(p_vocab.exists())
for p in [p_vecs, p_ngrams, p_vocab]:
p.unlink()
Example 6: test_check_pre_train_statistics
# Required import: from gensim import models [as alias]
# Or: from gensim.models import FastText [as alias]
def test_check_pre_train_statistics(self):
ft = FastText(min_count=1, size=5)
ft.build_vocab(SENTENCES)
se = BaseSentence2VecModel(ft)
for v in se.wv.vocab:
se.wv.vocab[v].count = 1
# Just emits multiple warnings
se._check_pre_training_sanity(1, 1, 1)
with self.assertRaises(ValueError):
se._check_pre_training_sanity(0, 1, 1)
with self.assertRaises(ValueError):
se._check_pre_training_sanity(1, 0, 1)
with self.assertRaises(ValueError):
se._check_pre_training_sanity(1, 1, 0)
Example 7: __init__
# Required import: from gensim import models [as alias]
# Or: from gensim.models import FastText [as alias]
def __init__(self, documents: List[List[str]], cluster_size: int, sparsity_percentage: float, gaussian_mixture_kwargs: Dict[Any, Any],
dictionary: gensim.corpora.Dictionary, w2v: Union[FastText, Word2Vec]) -> None:
"""
:param documents: documents for training.
:param cluster_size: word cluster size.
:param sparsity_percentage: sparsity percentage. This must be in [0, 1].
:param gaussian_mixture_kwargs: Arguments to build `sklearn.mixture.GaussianMixture` except cluster_size. Please see `sklearn.mixture.GaussianMixture.__init__` for details.
:param dictionary: `gensim.corpora.Dictionary`.
:param w2v: trained `FastText` or `Word2Vec` model providing the word vectors.
"""
logger.info('_build_dictionary...')
self._dictionary = dictionary
vocabulary_size = len(self._dictionary.token2id)
embedding_size = w2v.wv.vector_size
logger.info('_build_word_embeddings...')
self._word_embeddings = self._build_word_embeddings(self._dictionary, w2v)
assert self._word_embeddings.shape == (vocabulary_size, embedding_size)
logger.info('_build_word_cluster_probabilities...')
self._word_cluster_probabilities = self._build_word_cluster_probabilities(self._word_embeddings, cluster_size, gaussian_mixture_kwargs)
assert self._word_cluster_probabilities.shape == (vocabulary_size, cluster_size)
logger.info('_build_idf...')
self._idf = self._build_idf(self._dictionary)
assert self._idf.shape == (vocabulary_size, )
logger.info('_build_word_cluster_vectors...')
word_cluster_vectors = self._build_word_cluster_vectors(self._word_embeddings, self._word_cluster_probabilities)
assert word_cluster_vectors.shape == (vocabulary_size, cluster_size, embedding_size)
logger.info('_build_word_topic_vectors...')
word_topic_vectors = self._build_word_topic_vectors(self._idf, word_cluster_vectors)
assert word_topic_vectors.shape == (vocabulary_size, (cluster_size * embedding_size))
logger.info('_build_sparsity_threshold...')
self._sparse_threshold = self._build_sparsity_threshold(word_topic_vectors, self._dictionary, documents, sparsity_percentage)
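The asserts above pin down a chain of shapes; the following numpy sketch shows one way those shapes fit together (an assumption about what the `_build_*` helpers compute, in the spirit of SCDV-style topic vectors, not the class's actual code):
import numpy as np

V, K, D = 100, 4, 8                              # vocab size, clusters, embedding dim
emb = np.random.rand(V, D)                       # word_embeddings        (V, D)
probs = np.random.dirichlet(np.ones(K), size=V)  # cluster probabilities  (V, K)
idf = np.random.rand(V)                          # idf per word           (V,)

wcv = probs[:, :, None] * emb[:, None, :]           # word_cluster_vectors (V, K, D)
wtv = (idf[:, None, None] * wcv).reshape(V, K * D)  # word_topic_vectors   (V, K*D)
assert wtv.shape == (V, K * D)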
Example 8: _build_word_embeddings
# Required import: from gensim import models [as alias]
# Or: from gensim.models import FastText [as alias]
def _build_word_embeddings(dictionary: gensim.corpora.Dictionary, w2v: Union[FastText, Word2Vec]) -> np.ndarray:
embeddings = np.zeros((len(dictionary.token2id), w2v.vector_size))
for token, idx in dictionary.token2id.items():
if token in w2v.wv:
embeddings[idx] = w2v.wv[token]
return sklearn.preprocessing.normalize(embeddings, axis=1, norm='l2')
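One property this code relies on: tokens absent from `w2v.wv` keep their all-zero rows, and `sklearn.preprocessing.normalize` leaves zero rows untouched (it substitutes 1 for zero norms to avoid division by zero). A quick standalone check:
import numpy as np
import sklearn.preprocessing

emb = np.array([[3.0, 4.0],   # a token with a word vector
                [0.0, 0.0]])  # a token missing from w2v.wv
print(sklearn.preprocessing.normalize(emb, axis=1, norm='l2'))
# [[0.6 0.8]
#  [0.  0. ]] - the zero row survives normalization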
Example 9: specific_setup
# Required import: from gensim import models [as alias]
# Or: from gensim.models import FastText [as alias]
def specific_setup(self):
self.name = 'LangModelFeaturizer'
self.emb_size = 10
self.all_attrs = self.ds.get_attributes()
self.attrs_number = len(self.all_attrs)
self.attr_language_model = {}
raw_data = self.ds.get_raw_data()
for attr in self.all_attrs:
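# zip over the single column yields 1-tuples: each cell value becomes a one-token sentence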
attr_corpus = list(zip(raw_data[attr].tolist()))
model = FastText(attr_corpus, min_count=1, size=self.emb_size)
self.attr_language_model[attr] = model
Example 10: train_fasttext
# Required import: from gensim import models [as alias]
# Or: from gensim.models import FastText [as alias]
def train_fasttext(corpus: List[List[str]],
vocabulary: Dict[str, int],
zero_init_indices: Union[int, List[int]] = 0,
rand_init_indices: Union[int, List[int]] = 1,
embedding_dim: int = 300) -> np.ndarray:
"""Use fasttext to train on corpus to obtain embedding
Args:
corpus: List of List of str. List of tokenized texts, the corpus to train on, like ``[['我',
'是', '中', '国', '人'], ...]``.
vocabulary: Dict[str, int']. A mapping of words to indices
zero_init_indices: int or a List of int. The indices which use zero-initialization. These
indices usually represent padding token.
rand_init_indices: int or a List of int. The indices which use randomly-initialization.These
indices usually represent other special tokens, such as "unk" token.
embedding_dim: int. Dimensionality of embedding
Returns: np.ndarray, a word embedding matrix, shaped [vocab_size, embedding_dim].
"""
model = FastText(size=embedding_dim, min_count=1, window=5, sg=1, word_ngrams=1)
model.build_vocab(sentences=corpus)
model.train(sentences=corpus, total_examples=len(corpus), epochs=10)
emb = np.zeros(shape=(len(vocabulary), embedding_dim), dtype='float32')
for w, i in vocabulary.items():
emb[i, :] = model.wv[w] # note that oov words can still have word vectors
if isinstance(zero_init_indices, int):
zero_init_indices = [zero_init_indices]
if isinstance(rand_init_indices, int):
rand_init_indices = [rand_init_indices]
for idx in zero_init_indices:
emb[idx] = np.zeros(embedding_dim)
for idx in rand_init_indices:
emb[idx] = np.random.normal(0, 0.05, embedding_dim)
return emb
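An illustrative call (toy corpus and vocabulary; assuming index 0 is the padding token and index 1 the unknown token):
corpus = [["我", "是", "中", "国", "人"]]
vocab = {"<pad>": 0, "<unk>": 1, "我": 2, "是": 3, "中": 4, "国": 5, "人": 6}
emb = train_fasttext(corpus, vocab, zero_init_indices=0,
                     rand_init_indices=1, embedding_dim=50)
print(emb.shape)  # (7, 50)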
Example 11: test_cy_equal_np_ft_random
# Required import: from gensim import models [as alias]
# Or: from gensim.models import FastText [as alias]
def test_cy_equal_np_ft_random(self):
ft = FastText(size=20, min_count=1)
ft.build_vocab(SENTENCES)
m1 = Average(ft)
m1.prep.prepare_vectors(
sv=m1.sv, total_sentences=len(self.sentences), update=False
)
m1._pre_train_calls()
from fse.models.average_inner import MAX_NGRAMS_IN_BATCH
m1.batch_ngrams = MAX_NGRAMS_IN_BATCH
mem1 = m1._get_thread_working_mem()
o1 = train_average_np(m1, self.sentences[:2], m1.sv.vectors, mem1)
m2 = Average(ft)
m2.prep.prepare_vectors(
sv=m2.sv, total_sentences=len(self.sentences), update=False
)
m2._pre_train_calls()
mem2 = m2._get_thread_working_mem()
from fse.models.average_inner import train_average_cy
o2 = train_average_cy(m2, self.sentences[:2], m2.sv.vectors, mem2)
self.assertEqual(o1, o2)
self.assertTrue(np.allclose(m1.sv.vectors, m2.sv.vectors, atol=1e-6))
Example 12: test_init_w_ft_model_wo_vecs
# Required import: from gensim import models [as alias]
# Or: from gensim.models import FastText [as alias]
def test_init_w_ft_model_wo_vecs(self):
ft = FastText(SENTENCES, size=5)
with self.assertRaises(RuntimeError):
ft.wv.vectors_vocab = None
BaseSentence2VecModel(ft)
with self.assertRaises(RuntimeError):
ft.wv.vectors_ngrams = None
BaseSentence2VecModel(ft)
Example 13: test_init_w_empty_ft_model
# Required import: from gensim import models [as alias]
# Or: from gensim.models import FastText [as alias]
def test_init_w_empty_ft_model(self):
ft = FastText(min_count=1, size=DIM)
ft.wv.vectors = np.zeros(10)
ft.wv.vectors_ngrams = None
with self.assertRaises(RuntimeError):
BaseSentence2VecModel(ft)
Example 14: test_save_load_with_memmap
# Required import: from gensim import models [as alias]
# Or: from gensim.models import FastText [as alias]
def test_save_load_with_memmap(self):
ft = FastText(min_count=1, size=5)
ft.build_vocab(SENTENCES)
shape = (1000, 1000)
ft.wv.vectors = np.zeros(shape, np.float32)
p = Path("fse/test/test_data/test_emb")
p_vecs = Path("fse/test/test_data/test_emb_wv.vectors")
p_ngrams = Path("fse/test/test_data/test_emb_ngrams.vectors")
p_vocab = Path("fse/test/test_data/test_emb_vocab.vectors")
p_not_exists = Path("fse/test/test_data/test_emb.wv.vectors.npy")
se = BaseSentence2VecModel(ft, wv_mapfile_path=str(p))
self.assertTrue(p_vecs.exists())
self.assertTrue(p_ngrams.exists())
self.assertTrue(p_vocab.exists())
se.save(str(p.absolute()))
self.assertTrue(p.exists())
self.assertFalse(p_not_exists.exists())
se = BaseSentence2VecModel.load(str(p.absolute()))
self.assertFalse(se.wv.vectors_vocab.flags.writeable)
self.assertEqual(shape, se.wv.vectors.shape)
self.assertEqual((2000000, 5), se.wv.vectors_ngrams.shape)
for p in [p, p_vecs, p_ngrams, p_vocab]:
p.unlink()
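The read-only assertion above matches how numpy memmaps behave when opened with mode="r"; a standalone illustration (the file name is made up):
import numpy as np

np.zeros(10, dtype=np.float32).tofile("demo.vectors")  # write a dummy vector file
mm = np.memmap("demo.vectors", dtype=np.float32, mode="r", shape=(10,))
assert not mm.flags.writeable  # the same flag se.wv.vectors_vocab reports after load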
Example 15: test_estimate_memory
# Required import: from gensim import models [as alias]
# Or: from gensim.models import FastText [as alias]
def test_estimate_memory(self):
ft = FastText(min_count=1, size=5)
ft.build_vocab(SENTENCES)
se = BaseSentence2VecModel(ft)
self.assertEqual(2040025124, se.estimate_memory(int(1e8))["Total"])
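The asserted figure can be sanity-checked by hand, assuming the estimate is dominated by the float32 sentence-vector matrix plus the 2,000,000 x 5 FastText n-gram bucket matrix seen in Example 14:
sentence_vectors = int(1e8) * 5 * 4  # 1e8 sentences * 5 dims * 4 bytes = 2,000,000,000
ngram_vectors = 2000000 * 5 * 4      # n-gram bucket matrix            =    40,000,000
print(sentence_vectors + ngram_vectors)  # 2,040,000,000 - close to the asserted 2,040,025,124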