This article collects typical usage examples of the Python method gensim.utils.simple_preprocess. If you have been wondering how to use utils.simple_preprocess, how it works, or what it looks like in real code, the curated examples below may help. You can also explore further usage examples of the module it belongs to, gensim.utils.
The following presents 11 code examples of utils.simple_preprocess, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
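Before the examples, a minimal sketch (not taken from any of the snippets below; the input sentence is made up) of what simple_preprocess does on its own: it lowercases the text, tokenizes it, and keeps only tokens between 2 and 15 characters long (accent stripping is optional via deacc=True).

from gensim.utils import simple_preprocess

# Lowercase, tokenize, and keep tokens of 2-15 characters.
print(simple_preprocess("Hello, World! Gensim makes preprocessing easy."))
# ['hello', 'world', 'gensim', 'makes', 'preprocessing', 'easy']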
Example 1: fit
# Required imports: from gensim import utils [as alias]
# Or: from gensim.utils import simple_preprocess [as alias]
def fit(self, X, y=None):
    # Format text for processing, by creating a list of strings
    observations = self.prepare_input(X)

    # Preprocess & tokenize
    observations = list(map(lambda x: simple_preprocess(x), observations))

    # Generate embedding_sequence_length, if necessary
    if self.max_sequence_length is None:
        self.max_sequence_length = self.generate_embedding_sequence_length(observations)

    # Update index_lookup
    tokens = set()
    for observation in observations:
        tokens.update(observation)
    logging.debug('Fitting with tokens: {}'.format(tokens))

    current_max_index = max(self.token_index_lookup.values())
    index_range = range(current_max_index, len(tokens) + current_max_index)
    learned_token_index_lookup = dict(zip(tokens, index_range))
    self.token_index_lookup.update(learned_token_index_lookup)
    new_max_token_index = max(self.token_index_lookup.values())
    logging.info('Learned tokens, new_max_token_index: {}'.format(new_max_token_index))
    return self
Example 2: read_corpus
# Required imports: from gensim import utils [as alias]
# Or: from gensim.utils import simple_preprocess [as alias]
def read_corpus(path='.', exclude=[], targets=None):
    i = 0
    for file in os.listdir(path):
        if file[-4:] == '.txt' and file not in exclude and 'no_en' not in file:  # ensure file is an English .txt file
            print(file)
            with open(os.path.join(path, file), encoding="utf8") as document_text:
                for line in document_text:
                    count = 0
                    words = simple_preprocess(line)
                    for word in words:  # count the number of words with <= 3 characters
                        if len(word) <= 3:
                            count += 1
                    # keep lines with more than 10 words in which fewer than half
                    # the words are 3 characters or shorter
                    if count < len(words) / 2 and len(words) > 10:
                        yield doc2vec.TaggedDocument(words, [i])
                        i += 1
    if targets:
        for key, val in targets.items():
            yield doc2vec.TaggedDocument(simple_preprocess(val), [i])
            i += 1
Example 3: inferVector1
# Required imports: from gensim import utils [as alias]
# Or: from gensim.utils import simple_preprocess [as alias]
def inferVector1(self, line):
    '''
    Given a new line, infer a custom vector representation using the corpus tf-idf.

    Args:
        line : new sentence to be inferred
    Returns:
        numpy.ndarray : vector representation of the line
    '''
    line = ' '.join(simple_preprocess(line))  # pre-process the line
    line_tf_idf = self.tf_idf_obj.transform([line])  # infer the tf-idf values for the words in the line
    rows, cols = line_tf_idf.nonzero()
    new_vec = np.zeros(self.dimensions)
    # Apply the same sentence-to-vector conversion as above.
    for col in cols:
        try:
            new_vec += self.word2vec_model[self.word_index[col]] * line_tf_idf[0, col]
        except KeyError:  # skip words missing from the word2vec vocabulary
            continue
    return np.asarray(new_vec)
Example 4: fit
# Required imports: from gensim import utils [as alias]
# Or: from gensim.utils import simple_preprocess [as alias]
def fit(self, texts):
    model_param = {
        "vector_size": self.vector_size,
        "epochs": self.epochs,
        "min_count": self.min_count,
        "workers": self.n_jobs,
        "window": self.window,
        "dm_concat": self.dm_concat,
        "dbow_words": self.dbow_words,
    }
    corpus = [TaggedDocument(simple_preprocess(text), [i])
              for i, text in enumerate(texts)]

    # If self.dm is 2, train both models and concatenate the feature
    # vectors later. Resulting vector size should be the same.
    if self.dm == 2:
        model_param["vector_size"] = int(model_param["vector_size"] / 2)
        self.model_dm = _train_model(corpus, **model_param, dm=1)
        self.model_dbow = _train_model(corpus, **model_param, dm=0)
    else:
        self.model = _train_model(corpus, **model_param, dm=self.dm)
Example 5: __iter__
# Required imports: from gensim import utils [as alias]
# Or: from gensim.utils import simple_preprocess [as alias]
def __iter__(self):
    with open(datapath('lee_background.cor')) as f:
        for line in f:
            yield utils.simple_preprocess(line)
Example 6: process_string
# Required imports: from gensim import utils [as alias]
# Or: from gensim.utils import simple_preprocess [as alias]
def process_string(self, input_string):
    """
    Turn a string into padded sequences, consistent with Keras's Embedding layer

     - Simple preprocess & tokenize
     - Convert tokens to indices
     - Pad sequence to be the correct length

    :param input_string: A string, to be converted into a padded sequence of token indices
    :type input_string: str
    :return: A padded, fixed-length array of token indices
    :rtype: [int]
    """
    logging.debug('Processing string: {}'.format(input_string))

    # Convert to tokens
    tokens = simple_preprocess(input_string)
    logging.debug('Tokens: {}'.format(tokens))

    # Convert to indices
    indices = list(map(lambda x: self.token_index_lookup[x], tokens))
    logging.debug('Indices: {}'.format(indices))

    # Pad indices
    padding_index = self.token_index_lookup['__PAD__']
    padding_length = self.max_sequence_length
    padded_indices = self.pad(indices, length=padding_length, pad_char=padding_index)
    logging.debug('Padded indices: {}'.format(padded_indices))

    return padded_indices
Example 7: inferVector2
# Required imports: from gensim import utils [as alias]
# Or: from gensim.utils import simple_preprocess [as alias]
def inferVector2(self, line):
    '''
    Given a new line, infer a custom vector representation using the ground truth tf-idf.

    Args:
        line : new sentence to be inferred
    Returns:
        numpy.ndarray : vector representation of the line
    '''
    line = ' '.join(simple_preprocess(line))  # pre-process the line

    # For words missing from the ground-truth tf-idf vocabulary, substitute the most
    # similar in-vocabulary word according to the word2vec model.
    replacement_words = []
    for word in line.split():
        if word not in self.extra_tf_idf_obj.vocabulary_:
            try:
                similar_words = self.word2vec_model.similar_by_word(word, topn=10, restrict_vocab=None)
                for sim in similar_words:
                    if sim[0] in self.extra_tf_idf_obj.vocabulary_:
                        replacement_words.append((word, sim[0]))
                        break
            except KeyError:  # skip words missing from the word2vec vocabulary
                continue
    for old, new in replacement_words:
        line = line.replace(old, new)

    line_tf_idf = self.extra_tf_idf_obj.transform([line])  # infer the tf-idf values for the words in the line
    rows, cols = line_tf_idf.nonzero()
    new_vec = np.zeros(self.dimensions)
    # Apply the same sentence-to-vector conversion as above.
    for col in cols:
        try:
            new_vec += self.word2vec_model[self.extra_word_index[col]] * line_tf_idf[0, col]
        except KeyError:  # skip words missing from the word2vec vocabulary
            continue
    return np.asarray(new_vec)
Example 8: test_doc2vec_inference
# Required imports: from gensim import utils [as alias]
# Or: from gensim.utils import simple_preprocess [as alias]
def test_doc2vec_inference():
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("scientists")
    assert result[0] == 1
Example 9: test_doc2vec_inference_saveload
# Required imports: from gensim import utils [as alias]
# Or: from gensim.utils import simple_preprocess [as alias]
def test_doc2vec_inference_saveload():
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1, vector_size=10)
    model.save(TEST_FILE)
    del model
    model = Doc2Vec.load(TEST_FILE)
    os.remove(TEST_FILE)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("scientists")
    assert result[0] == 1
Example 10: transform
# Required imports: from gensim import utils [as alias]
# Or: from gensim.utils import simple_preprocess [as alias]
def transform(self, texts):
    corpus = [TaggedDocument(simple_preprocess(text), [i])
              for i, text in enumerate(texts)]

    if self.dm == 2:
        X_dm = _transform_text(self.model_dm, corpus)
        X_dbow = _transform_text(self.model_dbow, corpus)
        X = np.concatenate((X_dm, X_dbow), axis=1)
    else:
        X = _transform_text(self.model, corpus)
    return X
Example 11: tokenizer
# Required imports: from gensim import utils [as alias]
# Or: from gensim.utils import simple_preprocess [as alias]
def tokenizer(sentence: str) -> List[str]:
    """Tokenize with gensim's `simple_preprocess`, drop tokens in its `STOPWORDS` list, and stem the rest."""
    return [stem(token) for token in simple_preprocess(sentence) if token not in STOPWORDS]