當前位置: 首頁>>代碼示例>>Python>>正文


Python doc2vec.LabeledSentence方法代碼示例

本文整理匯總了Python中gensim.models.doc2vec.LabeledSentence方法的典型用法代碼示例。如果您正苦於以下問題:Python doc2vec.LabeledSentence方法的具體用法?Python doc2vec.LabeledSentence怎麼用?Python doc2vec.LabeledSentence使用的例子?那麼, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在gensim.models.doc2vec的用法示例。


在下文中一共展示了doc2vec.LabeledSentence方法的8個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: _doc2vec_doc_stream

# 需要導入模塊: from gensim.models import doc2vec [as 別名]
# 或者: from gensim.models.doc2vec import LabeledSentence [as 別名]
def _doc2vec_doc_stream(paths, n, tokenizer=word_tokenize, sentences=True):
    """
    Generator feeding LabeledSentence items to the doc2vec model.

    Every file in ``paths`` is read line by line. Each line is lower-cased
    and tokenized with ``tokenizer``. When ``sentences`` is true the line is
    first split with ``sent_tokenize`` and each sentence is yielded on its
    own; otherwise the whole line is yielded as a single item.

    All items produced from one line share the tag ``SENT_<k>``, where
    ``k`` is the 1-based running line count across all files. Progress is
    reported as ``k / n`` (presumably ``n`` is the total number of lines;
    confirm against the caller).
    """
    progress = Progress()
    line_no = 0
    for path in paths:
        with open(path, 'r') as handle:
            for raw in handle:
                line_no += 1
                progress.print_progress(line_no/n)

                # Lower-casing is the only pre-processing applied, so that
                # punctuation stays available for the model to learn.
                text = raw.lower()
                tag = 'SENT_{}'.format(line_no)

                # Either the individual sentences of the line, or the line
                # itself as the single chunk.
                chunks = sent_tokenize(text) if sentences else [text]
                for chunk in chunks:
                    yield LabeledSentence(tokenizer(chunk), [tag])
開發者ID:frnsys,項目名稱:broca,代碼行數:25,代碼來源:doc2vec.py

示例2: __iter__

# 需要導入模塊: from gensim.models import doc2vec [as 別名]
# 或者: from gensim.models.doc2vec import LabeledSentence [as 別名]
def __iter__(self):
    """
    Iterate over the rows of ``self.series``.

    When ``self.labels`` is truthy, each row is yielded as a
    ``LabeledSentence`` whose words are the row text split on single
    spaces and whose only label is ``'SENT_<index>'`` (the row's index
    value). Otherwise each row is yielded as the plain list of
    space-split tokens.
    """
    if self.labels:
        for index, line in zip(self.series.index, self.series.values):
            label = ['SENT_%s' % str(index)]
            yield LabeledSentence(line.split(' '), label)
    else:
        # Only the values are needed here. Iterating over the pair
        # ``(index, values)`` without zip() would walk the 2-tuple itself
        # and try to unpack each whole column into ``index, line``.
        for line in self.series.values:
            yield line.split(' ')
開發者ID:cemoody,項目名稱:Document2Vec,代碼行數:11,代碼來源:corpora.py

示例3: __iter__

# 需要導入模塊: from gensim.models import doc2vec [as 別名]
# 或者: from gensim.models.doc2vec import LabeledSentence [as 別名]
def __iter__(self):
    """
    Yield a LabeledSentence for every entry of every configured column.

    Walks ``self.df[column]`` for each column in ``self.columns``. The
    first time a given sentence is seen, ``self.cnt`` is incremented and
    the sentence is registered in ``self.sent_label`` under the tag
    ``"SENT_<cnt>"``; repeated sentences reuse their existing tag. Words
    come from ``nlp_utils._tokenize(sentence, token_pattern)``.
    """
    for name in self.columns:
        for text in self.df[name]:
            if text not in self.sent_label:
                # New sentence: allocate the next sequential tag.
                self.cnt += 1
                self.sent_label[text] = "SENT_%d"%self.cnt
            tag = self.sent_label[text]
            words = nlp_utils._tokenize(text, token_pattern)
            yield LabeledSentence(words=words, tags=[tag])
開發者ID:ChenglongChen,項目名稱:kaggle-HomeDepot,代碼行數:10,代碼來源:embedding_trainer.py

示例4: __iter__

# 需要導入模塊: from gensim.models import doc2vec [as 別名]
# 或者: from gensim.models.doc2vec import LabeledSentence [as 別名]
def __iter__(self):
    """
    Yield one LabeledSentence per ``<doc ...> ... </doc>`` section.

    Each file in ``self.sources`` is read as UTF-8, line by line:

    * A line containing ``<doc`` starts a new document. Its tag is taken
      from the ``title="..."`` attribute (per the original author, the
      Wikipedia document title), lower-cased, NFKD-normalized, with spaces
      replaced by underscores, then UTF-8 encoded. If no title is found
      the document gets an empty tag and is skipped.
    * A line containing ``</doc`` ends the document; the accumulated words
      are yielded if the tag is non-empty.
    * Any other line is split on single spaces and its stripped pieces
      are appended to the current document's word list.
    """
    # Compiled once instead of on every line; raw string for the pattern.
    title_pattern = re.compile(r'title="(.*)">')

    # Initialised up front so content or a closing tag appearing before
    # the first "<doc" line cannot hit an unbound name.
    found = ""
    values = []

    for source in self.sources:
        with codecs.open(source, "r", "utf-8") as fin:
            for line in fin:
                if "<doc" in line:
                    # Opening tag: reset the word buffer and derive the tag.
                    values = []
                    m = title_pattern.search(line)
                    if m:
                        found = m.group(1).lower()
                        found = unicodedata.normalize("NFKD", found)
                        found = found.replace(" ", "_")
                        # NOTE(review): on Python 3 this makes the tag a
                        # bytes object; kept for compatibility with
                        # existing consumers - confirm whether str tags
                        # are preferred.
                        found = found.encode('utf-8')
                    else:
                        found = ""
                elif "</doc" in line:
                    # Truthiness works for both str and bytes tags, whereas
                    # comparing bytes against "" is always unequal on Py3.
                    if found:
                        yield LabeledSentence(words=values, tags=[found])
                else:
                    # Still inside the current document body.
                    for word in line.split(" "):
                        values.append(word.strip())
開發者ID:sb1992,項目名稱:NETL-Automatic-Topic-Labelling-,代碼行數:29,代碼來源:doc2vectrain.py

示例5: label_sentences

# 需要導入模塊: from gensim.models import doc2vec [as 別名]
# 或者: from gensim.models.doc2vec import LabeledSentence [as 別名]
def label_sentences(corpus, label_type):
    """
    Wrap every document of ``corpus`` in a ``doc2vec.LabeledSentence``.

    Gensim's Doc2Vec implementation requires each document/paragraph to
    carry a label. Each document is whitespace-split into words and given
    the single label ``"<label_type>_<i>"`` (for example "TRAIN_i" or
    "TEST_i"), where ``i`` is the document's position in ``corpus``.

    Returns the labeled documents as a list, in corpus order.
    """
    def _tagged(position, text):
        # Build the label first, then split the text into words.
        tag = label_type + '_' + str(position)
        return doc2vec.LabeledSentence(text.split(), [tag])

    return [_tagged(position, text) for position, text in enumerate(corpus)]
開發者ID:ibrahimsharaf,項目名稱:doc2vec,代碼行數:15,代碼來源:doc2vec_model.py

示例6: _gen_sentence

# 需要導入模塊: from gensim.models import doc2vec [as 別名]
# 或者: from gensim.models.doc2vec import LabeledSentence [as 別名]
def _gen_sentence(self, assetid_body_tuple):
    '''
    Takes an assetid_body_tuple and returns a Doc2Vec LabeledSentence.

    Args:
        assetid_body_tuple (tuple): (assetid, bodytext) pair

    Returns:
        LabeledSentence: built from ``self._process(bodytext)`` and the
        single tag ``'DOC_<assetid>'``.
    '''
    asset_id, body = assetid_body_tuple
    text = self._process(body)
    # The tag list is passed positionally rather than as ``labels=``: the
    # second field was renamed from ``labels`` to ``tags`` in later gensim
    # releases, and a positional argument is accepted by both signatures.
    return LabeledSentence(text, ['DOC_%s' % str(asset_id)])
開發者ID:frnsys,項目名稱:broca,代碼行數:13,代碼來源:doc2vec.py

示例7: getCleanLabeledReviews

# 需要導入模塊: from gensim.models import doc2vec [as 別名]
# 或者: from gensim.models.doc2vec import LabeledSentence [as 別名]
def getCleanLabeledReviews(reviews):
    """
    Turn ``reviews`` into a list of LabeledSentence objects.

    Every entry of ``reviews["review"]`` is converted with
    ``KaggleWord2VecUtility.review_to_wordlist``; the i-th converted review
    is then paired with the i-th entry of ``reviews["id"]`` as its only
    label.
    """
    cleaned = [
        KaggleWord2VecUtility.review_to_wordlist(text)
        for text in reviews["review"]
    ]
    # Indexing by position (rather than zip) keeps the pairing strictly
    # driven by the "id" column.
    return [
        LabeledSentence(cleaned[position], [review_id])
        for position, review_id in enumerate(reviews["id"])
    ]
開發者ID:tjflexic,項目名稱:kaggle-word2vec-movie-reviews,代碼行數:11,代碼來源:generate_d2v.py

示例8: getCleanLabeledReviews

# 需要導入模塊: from gensim.models import doc2vec [as 別名]
# 或者: from gensim.models.doc2vec import LabeledSentence [as 別名]
def getCleanLabeledReviews(reviews):
    """
    Turn ``reviews`` into a list of LabeledSentence objects.

    Every entry of ``reviews["review"]`` is converted with
    ``KaggleWord2VecUtility.review_to_wordlist(review, True)``; the i-th
    converted review is then paired with the i-th entry of
    ``reviews["id"]`` as its only label.
    """
    cleaned = [
        KaggleWord2VecUtility.review_to_wordlist(text, True)
        for text in reviews["review"]
    ]
    # Indexing by position (rather than zip) keeps the pairing strictly
    # driven by the "id" column.
    return [
        LabeledSentence(cleaned[position], [review_id])
        for position, review_id in enumerate(reviews["id"])
    ]
開發者ID:tjflexic,項目名稱:kaggle-word2vec-movie-reviews,代碼行數:11,代碼來源:predict.py


注:本文中的gensim.models.doc2vec.LabeledSentence方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。