This article collects typical usage examples of the Python method gensim.utils.simple_preprocess. If you have been wondering how to use utils.simple_preprocess, how it works, or what it looks like in real code, the curated examples below may help. You can also explore further usage examples of the module it belongs to, gensim.utils.
The following presents 11 code examples of utils.simple_preprocess, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
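Before the examples, a minimal sketch (not taken from any of the snippets below; the input sentence is made up) of what simple_preprocess does on its own: it lowercases the text, tokenizes it, and keeps only tokens between 2 and 15 characters long (accent stripping is optional via deacc=True).

from gensim.utils import simple_preprocess

# Lowercase, tokenize, and keep tokens of 2-15 characters.
print(simple_preprocess("Hello, World! Gensim makes preprocessing easy."))
# ['hello', 'world', 'gensim', 'makes', 'preprocessing', 'easy']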
Example 1: fit
# Required imports: from gensim import utils [as alias]
# Or: from gensim.utils import simple_preprocess [as alias]
def fit(self, X, y=None):
    # Format text for processing, by creating a list of strings
    observations = self.prepare_input(X)

    # Preprocess & tokenize
    observations = list(map(lambda x: simple_preprocess(x), observations))

    # Generate embedding_sequence_length, if necessary
    if self.max_sequence_length is None:
        self.max_sequence_length = self.generate_embedding_sequence_length(observations)

    # Update index_lookup
    tokens = set()
    for observation in observations:
        tokens.update(observation)
    logging.debug('Fitting with tokens: {}'.format(tokens))

    current_max_index = max(self.token_index_lookup.values())
    index_range = range(current_max_index, len(tokens) + current_max_index)
    learned_token_index_lookup = dict(zip(tokens, index_range))
    self.token_index_lookup.update(learned_token_index_lookup)
    new_max_token_index = max(self.token_index_lookup.values())
    logging.info('Learned tokens, new_max_token_index: {}'.format(new_max_token_index))
    return self
Example 2: read_corpus
# Required imports: from gensim import utils [as alias]
# Or: from gensim.utils import simple_preprocess [as alias]
def read_corpus(path='.', exclude=[], targets=None):
    i = 0
    for file in os.listdir(path):
        if file[-4:] == '.txt' and file not in exclude and 'no_en' not in file:  # ensure file is an English .txt file
            print(file)
            with open(os.path.join(path, file), encoding="utf8") as document_text:
                for line in document_text:
                    count = 0
                    words = simple_preprocess(line)
                    for word in words:  # count the number of words with <= 3 characters
                        if len(word) <= 3:
                            count += 1
                    # keep lines with more than 10 words in which fewer than half
                    # the words are 3 characters or shorter
                    if count < len(words) / 2 and len(words) > 10:
                        yield doc2vec.TaggedDocument(words, [i])
                        i += 1
    if targets:
        for key, val in targets.items():
            yield doc2vec.TaggedDocument(simple_preprocess(val), [i])
            i += 1
Example 3: inferVector1
# Required imports: from gensim import utils [as alias]
# Or: from gensim.utils import simple_preprocess [as alias]
def inferVector1(self, line):
    '''
    Given a new line, infer a custom vector representation using the corpus tf-idf.

    Args:
        line : new sentence to be inferred
    Returns:
        numpy.ndarray : vector representation of the line
    '''
    line = ' '.join(simple_preprocess(line))  # pre-process the line
    line_tf_idf = self.tf_idf_obj.transform([line])  # infer the tf-idf values for the words in the line
    rows, cols = line_tf_idf.nonzero()
    new_vec = np.zeros(self.dimensions)
    # Apply the same sentence-to-vector conversion as above.
    for col in cols:
        try:
            new_vec += self.word2vec_model[self.word_index[col]] * line_tf_idf[0, col]
        except KeyError:  # skip words missing from the word2vec vocabulary
            continue
    return np.asarray(new_vec)
Example 4: fit
# Required imports: from gensim import utils [as alias]
# Or: from gensim.utils import simple_preprocess [as alias]
def fit(self, texts):
    model_param = {
        "vector_size": self.vector_size,
        "epochs": self.epochs,
        "min_count": self.min_count,
        "workers": self.n_jobs,
        "window": self.window,
        "dm_concat": self.dm_concat,
        "dbow_words": self.dbow_words,
    }
    corpus = [TaggedDocument(simple_preprocess(text), [i])
              for i, text in enumerate(texts)]

    # If self.dm is 2, train both models and concatenate the feature
    # vectors later. Resulting vector size should be the same.
    if self.dm == 2:
        model_param["vector_size"] = int(model_param["vector_size"] / 2)
        self.model_dm = _train_model(corpus, **model_param, dm=1)
        self.model_dbow = _train_model(corpus, **model_param, dm=0)
    else:
        self.model = _train_model(corpus, **model_param, dm=self.dm)
Example 5: __iter__
# Required imports: from gensim import utils [as alias]
# Or: from gensim.utils import simple_preprocess [as alias]
def __iter__(self):
    with open(datapath('lee_background.cor')) as f:
        for line in f:
            yield utils.simple_preprocess(line)
Example 6: process_string
# Required imports: from gensim import utils [as alias]
# Or: from gensim.utils import simple_preprocess [as alias]
def process_string(self, input_string):
    """
    Turn a string into padded sequences, consistent with Keras's Embedding layer

     - Simple preprocess & tokenize
     - Convert tokens to indices
     - Pad sequence to be the correct length

    :param input_string: A string, to be converted into a padded sequence of token indices
    :type input_string: str
    :return: A padded, fixed-length array of token indices
    :rtype: [int]
    """
    logging.debug('Processing string: {}'.format(input_string))

    # Convert to tokens
    tokens = simple_preprocess(input_string)
    logging.debug('Tokens: {}'.format(tokens))

    # Convert to indices
    indices = list(map(lambda x: self.token_index_lookup[x], tokens))
    logging.debug('Indices: {}'.format(indices))

    # Pad indices
    padding_index = self.token_index_lookup['__PAD__']
    padding_length = self.max_sequence_length
    padded_indices = self.pad(indices, length=padding_length, pad_char=padding_index)
    logging.debug('Padded indices: {}'.format(padded_indices))

    return padded_indices
Example 7: inferVector2
# Required imports: from gensim import utils [as alias]
# Or: from gensim.utils import simple_preprocess [as alias]
def inferVector2(self, line):
    '''
    Given a new line, infer a custom vector representation using the ground truth tf-idf.

    Args:
        line : new sentence to be inferred
    Returns:
        numpy.ndarray : vector representation of the line
    '''
    line = ' '.join(simple_preprocess(line))  # pre-process the line

    # For words missing from the ground-truth tf-idf vocabulary, substitute the most
    # similar in-vocabulary word according to the word2vec model.
    replacement_words = []
    for word in line.split():
        if word not in self.extra_tf_idf_obj.vocabulary_:
            try:
                similar_words = self.word2vec_model.similar_by_word(word, topn=10, restrict_vocab=None)
                for sim in similar_words:
                    if sim[0] in self.extra_tf_idf_obj.vocabulary_:
                        replacement_words.append((word, sim[0]))
                        break
            except KeyError:  # skip words missing from the word2vec vocabulary
                continue
    for old, new in replacement_words:
        line = line.replace(old, new)

    line_tf_idf = self.extra_tf_idf_obj.transform([line])  # infer the tf-idf values for the words in the line
    rows, cols = line_tf_idf.nonzero()
    new_vec = np.zeros(self.dimensions)
    # Apply the same sentence-to-vector conversion as above.
    for col in cols:
        try:
            new_vec += self.word2vec_model[self.extra_word_index[col]] * line_tf_idf[0, col]
        except KeyError:  # skip words missing from the word2vec vocabulary
            continue
    return np.asarray(new_vec)
Example 8: test_doc2vec_inference
# Required imports: from gensim import utils [as alias]
# Or: from gensim.utils import simple_preprocess [as alias]
def test_doc2vec_inference():
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("scientists")
    assert result[0] == 1
Example 9: test_doc2vec_inference_saveload
# Required imports: from gensim import utils [as alias]
# Or: from gensim.utils import simple_preprocess [as alias]
def test_doc2vec_inference_saveload():
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1, vector_size=10)
    model.save(TEST_FILE)
    del model
    model = Doc2Vec.load(TEST_FILE)
    os.remove(TEST_FILE)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("scientists")
    assert result[0] == 1
Example 10: transform
# Required imports: from gensim import utils [as alias]
# Or: from gensim.utils import simple_preprocess [as alias]
def transform(self, texts):
    corpus = [TaggedDocument(simple_preprocess(text), [i])
              for i, text in enumerate(texts)]

    if self.dm == 2:
        X_dm = _transform_text(self.model_dm, corpus)
        X_dbow = _transform_text(self.model_dbow, corpus)
        X = np.concatenate((X_dm, X_dbow), axis=1)
    else:
        X = _transform_text(self.model, corpus)
    return X
Example 11: tokenizer
# Required imports: from gensim import utils [as alias]
# Or: from gensim.utils import simple_preprocess [as alias]
def tokenizer(sentence: str) -> List[str]:
    """Tokenize with gensim's `simple_preprocess`, drop tokens in its `STOPWORDS` list, and stem the rest."""
    return [stem(token) for token in simple_preprocess(sentence) if token not in STOPWORDS]