当前位置: 首页>>代码示例>>Python>>正文


Python doc2vec.LabeledSentence方法代码示例

本文整理汇总了Python中gensim.models.doc2vec.LabeledSentence方法的典型用法代码示例。如果您正苦于以下问题：Python doc2vec.LabeledSentence方法的具体用法？Python doc2vec.LabeledSentence怎么用？Python doc2vec.LabeledSentence使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块gensim.models.doc2vec的用法示例。


在下文中一共展示了doc2vec.LabeledSentence方法的8个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: _doc2vec_doc_stream

# 需要导入模块: from gensim.models import doc2vec [as 别名]
# 或者: from gensim.models.doc2vec import LabeledSentence [as 别名]
def _doc2vec_doc_stream(paths, n, tokenizer=word_tokenize, sentences=True):
    """
    Stream labeled token lists to the doc2vec model, one file line at a time.

    Every line of every file in `paths` is lower-cased and tokenized with
    `tokenizer`. When `sentences` is true the line is first split with
    `sent_tokenize` and each sentence is yielded on its own; otherwise the
    whole line is yielded as a single item. All items produced from the same
    line share the label 'SENT_<i>', where <i> is the 1-based running line
    count across all files.

    After each line is read, `i/n` is passed to `Progress.print_progress`
    (presumably `n` is the expected total number of lines -- confirm with
    the caller).
    """
    line_no = 0
    progress = Progress()
    for path in paths:
        with open(path, 'r') as handle:
            for raw in handle:
                line_no += 1
                progress.print_progress(line_no/n)

                # Lower-casing is the only normalisation applied, so that
                # punctuation stays visible to the model.
                text = raw.lower()

                # With sentence splitting disabled, the unsplit line is
                # handled as if it were the only sentence.
                pieces = sent_tokenize(text) if sentences else [text]
                for piece in pieces:
                    words = tokenizer(piece)
                    yield LabeledSentence(words, ['SENT_{}'.format(line_no)])
开发者ID:frnsys,项目名称:broca,代码行数:25,代码来源:doc2vec.py

示例2: __iter__

# 需要导入模块: from gensim.models import doc2vec [as 别名]
# 或者: from gensim.models.doc2vec import LabeledSentence [as 别名]
def __iter__(self):
    """
    Iterate over the rows of ``self.series``, tokenizing each line.

    Lines are split on single spaces. When ``self.labels`` is truthy every
    row is yielded as a ``LabeledSentence`` whose only label is
    ``'SENT_<index>'``, built from the matching entry of
    ``self.series.index``. Otherwise each row is yielded as a plain list of
    tokens.
    """
    if self.labels:
        for index, line in zip(self.series.index, self.series.values):
            label = ['SENT_%s' % str(index)]
            yield LabeledSentence(line.split(' '), label)
    else:
        # Only the text is needed in this branch. Iterating the values
        # directly avoids unpacking the (index, values) pair itself, which
        # fails for any series that does not have exactly two rows.
        for line in self.series.values:
            yield line.split(' ')
开发者ID:cemoody,项目名称:Document2Vec,代码行数:11,代码来源:corpora.py

示例3: __iter__

# 需要导入模块: from gensim.models import doc2vec [as 别名]
# 或者: from gensim.models.doc2vec import LabeledSentence [as 别名]
def __iter__(self):
    """
    Yield a ``LabeledSentence`` for every cell of the configured columns.

    Walks ``self.df[column]`` for each entry of ``self.columns``. The first
    time a given sentence is seen, ``self.cnt`` is incremented and the
    sentence is assigned the tag ``"SENT_<cnt>"`` in ``self.sent_label``;
    repeated sentences reuse their existing tag. The words come from
    ``nlp_utils._tokenize(sentence, token_pattern)``.
    """
    for col in self.columns:
        for text in self.df[col]:
            if text not in self.sent_label:
                # First occurrence: allocate the next sequential tag.
                self.cnt += 1
                self.sent_label[text] = "SENT_%d" % self.cnt
            words = nlp_utils._tokenize(text, token_pattern)
            tag = self.sent_label[text]
            yield LabeledSentence(words=words, tags=[tag])
开发者ID:ChenglongChen,项目名称:kaggle-HomeDepot,代码行数:10,代码来源:embedding_trainer.py

示例4: __iter__

# 需要导入模块: from gensim.models import doc2vec [as 别名]
# 或者: from gensim.models.doc2vec import LabeledSentence [as 别名]
def __iter__(self):
    """
    Yield one ``LabeledSentence`` per ``<doc ...> ... </doc>`` section.

    Each file in ``self.sources`` is read as UTF-8, line by line:

    * A line containing ``<doc`` starts a new document. Its tag is taken
      from the ``title="..."`` attribute: lower-cased, NFKD-normalized,
      spaces replaced by underscores, then UTF-8 encoded. If no title is
      found the tag is empty.
    * Any following line that does not contain ``</doc`` is split on single
      spaces and its stripped words are appended to the current document.
    * A line containing ``</doc`` ends the document, which is yielded only
      if its tag is non-empty.
    """
    # Bound up front so that text appearing before the first "<doc" line
    # cannot raise NameError. Such text is never yielded (empty tag) and is
    # discarded when the next document starts.
    found = ""
    values = []

    for source in self.sources:
        with codecs.open(source, "r", "utf-8") as fin:
            for line in fin:
                if "<doc" in line:
                    # Start of a new document: reset the word buffer and
                    # derive the tag from the document title.
                    values = []
                    m = re.search('title="(.*)">', line)
                    if m:
                        found = m.group(1).lower()
                        found = unicodedata.normalize("NFKD", found)
                        found = found.replace(" ", "_")
                        # NOTE(review): the tag is emitted UTF-8 encoded,
                        # i.e. as ``bytes`` on Python 3 -- confirm that the
                        # consumer of these tags expects that.
                        found = found.encode('utf-8')
                    else:
                        found = ""
                elif "</doc" not in line:
                    # Body line of the current document.
                    for word in line.split(" "):
                        values.append(word.strip())
                elif found:
                    # End of document: skip documents without a title.
                    yield LabeledSentence(words=values, tags=[found])
开发者ID:sb1992,项目名称:NETL-Automatic-Topic-Labelling-,代码行数:29,代码来源:doc2vectrain.py

示例5: label_sentences

# 需要导入模块: from gensim.models import doc2vec [as 别名]
# 或者: from gensim.models.doc2vec import LabeledSentence [as 别名]
def label_sentences(corpus, label_type):
    """
    Wrap every document of `corpus` in a gensim `LabeledSentence`.

    Gensim's Doc2Vec implementation needs a label attached to each
    document/paragraph. Each document is split on whitespace and given the
    single label "<label_type>_<i>" (e.g. "TRAIN_0" or "TEST_3"), where <i>
    is simply the document's position in `corpus`, counted from zero.

    Returns the labeled documents as a list, in corpus order.
    """
    def _wrap(position, text):
        # Build the tag before touching the text, one document at a time.
        tag = label_type + '_' + str(position)
        return doc2vec.LabeledSentence(text.split(), [tag])

    return [_wrap(position, text) for position, text in enumerate(corpus)]
开发者ID:ibrahimsharaf,项目名称:doc2vec,代码行数:15,代码来源:doc2vec_model.py

示例6: _gen_sentence

# 需要导入模块: from gensim.models import doc2vec [as 别名]
# 或者: from gensim.models.doc2vec import LabeledSentence [as 别名]
def _gen_sentence(self, assetid_body_tuple):
    '''
    Takes an assetid_body_tuple and returns a Doc2Vec LabeledSentence

    The body text is run through ``self._process`` and labeled with
    ``'DOC_<asset_id>'``.

    Args:
        assetid_body_tuple (tuple): (assetid, bodytext) pair

    Returns:
        LabeledSentence: the processed body with its single document label
    '''
    asset_id, body = assetid_body_tuple
    text = self._process(body)
    # The label list is passed positionally: the second field was renamed
    # from ``labels`` to ``tags`` across gensim releases, so the keyword
    # form only works with the oldest API.
    return LabeledSentence(text, ['DOC_%s' % str(asset_id)])
开发者ID:frnsys,项目名称:broca,代码行数:13,代码来源:doc2vec.py

示例7: getCleanLabeledReviews

# 需要导入模块: from gensim.models import doc2vec [as 别名]
# 或者: from gensim.models.doc2vec import LabeledSentence [as 别名]
def getCleanLabeledReviews(reviews):
    """
    Turn a reviews table into a list of ``LabeledSentence`` objects.

    Every entry of ``reviews["review"]`` is passed through
    ``KaggleWord2VecUtility.review_to_wordlist`` (presumably producing the
    review's word list -- confirm against that helper). The i-th cleaned
    review is then paired with the i-th entry of ``reviews["id"]``, which
    becomes its single label.
    """
    cleaned = [
        KaggleWord2VecUtility.review_to_wordlist(text)
        for text in reviews["review"]
    ]
    return [
        LabeledSentence(cleaned[pos], [review_id])
        for pos, review_id in enumerate(reviews["id"])
    ]
开发者ID:tjflexic,项目名称:kaggle-word2vec-movie-reviews,代码行数:11,代码来源:generate_d2v.py

示例8: getCleanLabeledReviews

# 需要导入模块: from gensim.models import doc2vec [as 别名]
# 或者: from gensim.models.doc2vec import LabeledSentence [as 别名]
def getCleanLabeledReviews(reviews):
    """
    Turn a reviews table into a list of ``LabeledSentence`` objects.

    Every entry of ``reviews["review"]`` is passed through
    ``KaggleWord2VecUtility.review_to_wordlist`` with ``True`` as the second
    argument (NOTE(review): the meaning of that flag is defined by the
    helper and is not visible here -- confirm). The i-th cleaned review is
    then paired with the i-th entry of ``reviews["id"]``, which becomes its
    single label.
    """
    cleaned = [
        KaggleWord2VecUtility.review_to_wordlist(text, True)
        for text in reviews["review"]
    ]
    return [
        LabeledSentence(cleaned[pos], [review_id])
        for pos, review_id in enumerate(reviews["id"])
    ]
开发者ID:tjflexic,项目名称:kaggle-word2vec-movie-reviews,代码行数:11,代码来源:predict.py


注:本文中的gensim.models.doc2vec.LabeledSentence方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。