本文整理汇总了Python中gensim.models.doc2vec.LabeledSentence方法的典型用法代码示例。如果您正苦于以下问题：Python doc2vec.LabeledSentence方法的具体用法？Python doc2vec.LabeledSentence怎么用？Python doc2vec.LabeledSentence使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类gensim.models.doc2vec的用法示例。
在下文中一共展示了doc2vec.LabeledSentence方法的8个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _doc2vec_doc_stream
# 需要导入模块: from gensim.models import doc2vec [as 别名]
# 或者: from gensim.models.doc2vec import LabeledSentence [as 别名]
def _doc2vec_doc_stream(paths, n, tokenizer=word_tokenize, sentences=True):
    """
    Generator feeding labeled token lists to the doc2vec model.

    Every line of every file in ``paths`` is lower-cased and tokenized with
    ``tokenizer``. When ``sentences`` is true the line is first split with
    ``sent_tokenize`` and each sentence is yielded separately; otherwise the
    whole line is yielded as one unit. All units coming from the same line
    carry the tag ``'SENT_<k>'``, where ``k`` is a line counter that keeps
    running across files. Progress is reported as ``k/n`` after each line.
    """
    progress = Progress()
    line_no = 0
    for path in paths:
        with open(path, 'r') as handle:
            for raw in handle:
                line_no += 1
                progress.print_progress(line_no/n)
                # Only lower-casing is applied, so that the model can still
                # learn punctuation.
                lowered = raw.lower()
                units = sent_tokenize(lowered) if sentences else [lowered]
                for unit in units:
                    yield LabeledSentence(tokenizer(unit),
                                          ['SENT_{}'.format(line_no)])
示例2: __iter__
# 需要导入模块: from gensim.models import doc2vec [as 别名]
# 或者: from gensim.models.doc2vec import LabeledSentence [as 别名]
def __iter__(self):
    """Iterate over the rows of ``self.series``.

    Yields:
        When ``self.labels`` is truthy, one ``LabeledSentence`` per row whose
        words are the row text split on single spaces and whose only label
        is ``'SENT_<index>'``. Otherwise the plain list of space-split
        tokens for each row.
    """
    if self.labels:
        for index, line in zip(self.series.index, self.series.values):
            label = ['SENT_%s' % str(index)]
            yield LabeledSentence(line.split(' '), label)
    else:
        # Only the text is needed here, so walk the values directly rather
        # than pairing them with the index.
        for line in self.series.values:
            yield line.split(' ')
示例3: __iter__
# 需要导入模块: from gensim.models import doc2vec [as 别名]
# 或者: from gensim.models.doc2vec import LabeledSentence [as 别名]
def __iter__(self):
    """Yield one ``LabeledSentence`` per cell of every column in ``self.columns``.

    The first time a sentence is seen, ``self.cnt`` is incremented and the
    tag ``"SENT_<cnt>"`` is stored for it in ``self.sent_label``; later
    occurrences of the same sentence reuse that stored tag.
    """
    for col_name in self.columns:
        for text in self.df[col_name]:
            if text not in self.sent_label:
                self.cnt += 1
                self.sent_label[text] = "SENT_%d"%self.cnt
            words = nlp_utils._tokenize(text, token_pattern)
            tag = self.sent_label[text]
            yield LabeledSentence(words=words, tags=[tag])
示例4: __iter__
# 需要导入模块: from gensim.models import doc2vec [as 别名]
# 或者: from gensim.models.doc2vec import LabeledSentence [as 别名]
def __iter__(self):
    """Stream one ``LabeledSentence`` per titled ``<doc ...>`` section.

    Every file in ``self.sources`` is opened as UTF-8 text. A line containing
    ``<doc`` opens a new document and provides its tag through the
    ``title="..."`` attribute (lower-cased, NFKD-normalized, spaces replaced
    by underscores). Subsequent lines are split on single spaces and their
    stripped pieces collected as words until a line containing ``</doc``
    closes the document.

    Yields:
        ``LabeledSentence(words=..., tags=[title])`` for every closed
        document that has a non-empty title; documents without one are
        skipped.
    """
    # Begin with an empty, untitled document so that text appearing before
    # the first "<doc" marker is collected harmlessly instead of touching
    # names that were never bound.
    found = ""
    values = []
    for source in self.sources:
        with codecs.open(source, "r", "utf-8") as fin:
            for line in fin:
                if "<doc" in line:  # Every new document starts with this format
                    found = ""
                    values = []
                    # The title attribute carries the Wikipedia document title.
                    m = re.search('title="(.*)">', line)
                    if m:
                        found = m.group(1).lower()
                        found = unicodedata.normalize("NFKD", found)
                        found = found.replace(" ", "_")
                        # NOTE(review): the tag is emitted as UTF-8 encoded
                        # bytes; confirm that downstream consumers expect
                        # bytes rather than text.
                        found = found.encode('utf-8')
                elif "</doc" in line:
                    # End of document. Truthiness is used so that an empty
                    # title is skipped whether it is text or encoded bytes.
                    if found:
                        yield LabeledSentence(words = values, tags = [found])
                else:
                    # Still inside the same document: keep collecting words.
                    for word in line.split(" "):
                        values.append(word.strip())
示例5: label_sentences
# 需要导入模块: from gensim.models import doc2vec [as 别名]
# 或者: from gensim.models.doc2vec import LabeledSentence [as 别名]
def label_sentences(corpus, label_type):
    """
    Attach a label to every document in ``corpus``.

    Gensim's Doc2Vec implementation requires each document/paragraph to
    carry a label, so each entry is wrapped in a ``LabeledSentence`` whose
    words are the whitespace-split text and whose single label is
    ``"<label_type>_<i>"`` (for example "TRAIN_i" or "TEST_i"), with ``i``
    being the position of the entry in ``corpus``.

    Returns the wrapped documents as a list, in input order.
    """
    return [
        doc2vec.LabeledSentence(text.split(), [label_type + '_' + str(pos)])
        for pos, text in enumerate(corpus)
    ]
示例6: _gen_sentence
# 需要导入模块: from gensim.models import doc2vec [as 别名]
# 或者: from gensim.models.doc2vec import LabeledSentence [as 别名]
def _gen_sentence(self, assetid_body_tuple):
    '''
    Takes an assetid_body_tuple and returns a Doc2Vec LabeledSentence

    Args:
        assetid_body_tuple (tuple): (assetid, bodytext) pair

    Returns:
        LabeledSentence whose words are ``self._process(bodytext)`` and
        whose single label is ``'DOC_<assetid>'``.
    '''
    asset_id, body = assetid_body_tuple
    text = self._process(body)
    # The label list is passed positionally because the keyword name of the
    # second constructor argument is not the same in every gensim release
    # (``labels`` in older ones, ``tags`` in newer ones).
    return LabeledSentence(text, ['DOC_%s' % str(asset_id)])
示例7: getCleanLabeledReviews
# 需要导入模块: from gensim.models import doc2vec [as 别名]
# 或者: from gensim.models.doc2vec import LabeledSentence [as 别名]
def getCleanLabeledReviews(reviews):
    """Turn a reviews table into a list of ``LabeledSentence`` objects.

    Every entry of ``reviews["review"]`` is first converted to a word list
    with ``KaggleWord2VecUtility.review_to_wordlist``. Each entry of
    ``reviews["id"]`` is then paired, by position, with the corresponding
    word list and used as its single label.
    """
    word_lists = [
        KaggleWord2VecUtility.review_to_wordlist(text)
        for text in reviews["review"]
    ]
    return [
        LabeledSentence(word_lists[pos], [review_id])
        for pos, review_id in enumerate(reviews["id"])
    ]
示例8: getCleanLabeledReviews
# 需要导入模块: from gensim.models import doc2vec [as 别名]
# 或者: from gensim.models.doc2vec import LabeledSentence [as 别名]
def getCleanLabeledReviews(reviews):
    """Turn a reviews table into a list of ``LabeledSentence`` objects.

    Every entry of ``reviews["review"]`` is first converted to a word list
    with ``KaggleWord2VecUtility.review_to_wordlist(review, True)``. Each
    entry of ``reviews["id"]`` is then paired, by position, with the
    corresponding word list and used as its single label.
    """
    # NOTE(review): the second positional argument (True) is presumably a
    # stop-word removal switch; confirm against KaggleWord2VecUtility.
    word_lists = [
        KaggleWord2VecUtility.review_to_wordlist(text, True)
        for text in reviews["review"]
    ]
    return [
        LabeledSentence(word_lists[pos], [review_id])
        for pos, review_id in enumerate(reviews["id"])
    ]