Python utils.tokenize方法代碼示例

本文整理匯總了Python中gensim.utils.tokenize方法的典型用法代碼示例。如果您正苦於以下問題：Python utils.tokenize方法的具體用法？Python utils.tokenize怎麼用？Python utils.tokenize使用的例子？那麼，這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類gensim.utils的用法示例。

在下文中一共展示了utils.tokenize方法的9個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: get_texts

# 需要導入模塊: from gensim import utils [as 別名]
# 或者: from gensim.utils import tokenize [as 別名]
def get_texts(self):
        """
        Stream the corpus one document at a time.

        This sample implementation (provided instead of raising
        NotImplementedError) treats every line obtained from `self.getstream()`
        as one document and passes it to `utils.tokenize(..., lowercase=True)`;
        the tokenizer's result is yielded as-is. When `self.metadata` is truthy,
        each item is a `(tokens, (lineno,))` pair instead, where `lineno` is the
        0-based index of the line.

        Override this method to parse a different input format or to apply other
        preprocessing (lowercasing, tokenizing etc.); nothing further is done
        here to the words that are yielded.

        Once the stream is exhausted, `self.length` is set to the number of
        lines that were read.
        """
        # Stays at -1 when the stream is empty, so the final length becomes 0.
        last_lineno = -1
        with self.getstream() as stream:
            for last_lineno, raw_line in enumerate(stream):
                if not self.metadata:
                    yield utils.tokenize(raw_line, lowercase=True)
                    continue
                yield utils.tokenize(raw_line, lowercase=True), (last_lineno,)
            self.length = last_lineno + 1

開發者ID:largelymfs，項目名稱:topical_word_embeddings，代碼行數:22，代碼來源:textcorpus.py

示例2: simple_preprocess

# 需要導入模塊: from gensim import utils [as 別名]
# 或者: from gensim.utils import tokenize [as 別名]
def simple_preprocess(
    doc: str,
    lower: bool = False,
    deacc: bool = False,
    min_len: int = 2,
    max_len: int = 15,
) -> List[str]:
    r"""
    Variant of Gensim's ``simple_preprocess`` with an extra ``lower`` parameter
    that controls whether the tokens are lower-cased.

    Args:
        doc: text to tokenize.
        lower: forwarded to ``tokenize`` as its ``lower`` argument.
        deacc: forwarded to ``tokenize`` as its ``deacc`` argument.
        min_len: minimum token length (inclusive); shorter tokens are dropped.
        max_len: maximum token length (inclusive); longer tokens are dropped.

    Returns:
        The tokens of ``doc`` whose length lies in ``[min_len, max_len]`` and
        that do not start with an underscore.

    For more information see: https://radimrehurek.com/gensim/utils.html
    """
    # `lower` must be passed through: it used to be hard-coded to False, which
    # silently ignored the caller's choice.
    return [
        token
        for token in tokenize(doc, lower=lower, deacc=deacc, errors="ignore")
        if min_len <= len(token) <= max_len and not token.startswith("_")
    ]

開發者ID:jrzaurin，項目名稱:pytorch-widedeep，代碼行數:21，代碼來源:text_utils.py

示例3: tokenize

# 需要導入模塊: from gensim import utils [as 別名]
# 或者: from gensim.utils import tokenize [as 別名]
def tokenize(self, content):
        """
        Placeholder tokenizer: always raises NotImplementedError.

        Presumably meant to be overridden by concrete subclasses (the error
        message reads 'Abstract Base Class').
        """
        message = 'Abstract Base Class'
        raise NotImplementedError(message)

開發者ID:largelymfs，項目名稱:topical_word_embeddings，代碼行數:4，代碼來源:sources.py

示例4: characters

# 需要導入模塊: from gensim import utils [as 別名]
# 或者: from gensim.utils import tokenize [as 別名]
def characters(self, text):
            """
            Collect tokens from character data found directly inside a <p> tag.

            Text seen while the last element of `self.path` is anything other
            than 'p' is ignored. Otherwise `text` is tokenized with
            `utils.tokenize(text, errors='ignore')`, purely numeric tokens are
            dropped, and the remaining tokens are utf8-encoded and appended to
            `self.tokens`.
            """
            # for text, we only care about tokens directly within the <p> tag
            if self.path[-1] != 'p':
                return
            encoded = []
            for word in utils.tokenize(text, errors='ignore'):
                if word.isdigit():
                    continue
                encoded.append(word.encode('utf8'))
            self.tokens.extend(encoded)
    #endclass ArxmlivHandler

開發者ID:largelymfs，項目名稱:topical_word_embeddings，代碼行數:8，代碼來源:sources.py

示例5: process_article

# 需要導入模塊: from gensim import utils [as 別名]
# 或者: from gensim.utils import tokenize [as 別名]
def process_article(args):
    """
    Parse one wikipedia article and return its tokens with its identifiers.

    `args` is a 4-item sequence `(text, lemmatize, title, pageid)`. The text is
    first passed through `filter_wiki`; the result is then handed to
    `utils.lemmatize` when `lemmatize` is truthy, or to `tokenize` otherwise.

    Returns the tuple `(result, title, pageid)`.
    """
    raw_text, use_lemmatizer, title, pageid = args
    cleaned = filter_wiki(raw_text)
    tokens = utils.lemmatize(cleaned) if use_lemmatizer else tokenize(cleaned)
    return tokens, title, pageid

開發者ID:largelymfs，項目名稱:topical_word_embeddings，代碼行數:14，代碼來源:wikicorpus.py

示例6: tokenize

# 需要導入模塊: from gensim import utils [as 別名]
# 或者: from gensim.utils import tokenize [as 別名]
def tokenize(content):
    """
    Tokenize a piece of text from wikipedia. The input string `content` is
    assumed to be mark-up free (see `filter_wiki()`).

    Return the list of tokens as utf8 bytestrings. Words shorter than 2 or
    longer than 15 characters (not bytes!) are ignored, as are words that start
    with an underscore.
    """
    # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
    kept = []
    for word in utils.tokenize(content, lower=True, errors='ignore'):
        if len(word) < 2 or len(word) > 15:
            continue
        if word.startswith('_'):
            continue
        kept.append(word.encode('utf8'))
    return kept

開發者ID:largelymfs，項目名稱:topical_word_embeddings，代碼行數:13，代碼來源:wikicorpus.py

示例7: get_texts

# 需要導入模塊: from gensim import utils [as 別名]
# 或者: from gensim.utils import tokenize [as 別名]
def get_texts(self, metadata=None):
        """Iterate over the lines of "The Complete Works of William Shakespeare".

        This yields lists of strings (**texts**) rather than vectors (vectorized bags-of-words).
        And the **texts** yielded are lines rather than entire plays or sonnets.
        If you want vectors, use the corpus interface instead of this method.

        Only lines whose index falls inside a ``[start, stop)`` range listed in
        ``self.book_meta['volumes']`` are tokenized and yielded. The line that
        closes a volume is skipped, and iteration ends once every listed volume
        has been passed.

        Each call reopens ``self.input_file_path`` with ``gzip.GzipFile`` and
        stores the handle in ``self.input_file``.

        Args:
            metadata: when truthy, yield ``(tokens, (lineno,))`` pairs instead of
                bare token lists. ``None`` (the default) falls back to
                ``self.metadata``.

        Yields:
            ``self.tokenize(line, lowercase=self.lowercase)`` for each selected
            line, optionally paired with ``(lineno,)`` as described above.

        >>> shakes = ShakesCorpus(lowercase=True)
        >>> for i, tokens in enumerate(shakes.get_texts()):
        ...     print(i, tokens)
        ...     if i >= 4:
        ...         break
        (0, [])
        (1, [])
        (2, [u'the', u'sonnets'])
        (3, [])
        (4, [u'by', u'william', u'shakespeare'])
        """
        if metadata is None:
            metadata = self.metadata
        self.input_file = gzip.GzipFile(self.input_file_path)
        volumes = self.book_meta['volumes']
        volume_num = 0
        with self.input_file as lines:
            for lineno, line in enumerate(lines):
                if volume_num >= len(volumes):
                    # All volumes consumed. A plain `return` ends the generator;
                    # `raise StopIteration()` here is turned into RuntimeError
                    # by PEP 479 (Python 3.7+).
                    return
                volume = volumes[volume_num]
                if lineno < volume['start']:
                    continue
                if lineno >= volume['stop']:
                    volume_num += 1  # don't yield the "THE END" line?
                    continue
                # act_num, scene_num = 0, 0  # FIXME: use self.book_meta['volumes'][volume_num]['sections']
                # FIXME: use self.lemmatize
                toks = self.tokenize(line, lowercase=self.lowercase)
                if metadata:
                    yield (toks, (lineno,))
                else:
                    yield toks

開發者ID:totalgood，項目名稱:twip，代碼行數:41，代碼來源:shakescorpus.py

示例8: tokenize

# 需要導入模塊: from gensim import utils [as 別名]
# 或者: from gensim.utils import tokenize [as 別名]
def tokenize(self, line, **kwargs):
        """
        Tokenize `line` with `utils.tokenize` and return the tokens as a list.

        Any keyword arguments are forwarded unchanged to `utils.tokenize`.
        """
        token_iter = utils.tokenize(line, **kwargs)
        return list(token_iter)

開發者ID:totalgood，項目名稱:twip，代碼行數:4，代碼來源:shakescorpus.py

示例9: tokenize_tr

# 需要導入模塊: from gensim import utils [as 別名]
# 或者: from gensim.utils import tokenize [as 別名]
def tokenize_tr(content, token_min_len=2, token_max_len=50, lower=True):
    """
    Tokenize Turkish text, optionally lower-casing it with Turkish rules.

    Args:
        content: unicode text to tokenize.
        token_min_len: minimum token length (inclusive); shorter tokens are dropped.
        token_max_len: maximum token length (inclusive); longer tokens are dropped.
        lower: when truthy, lower-case `content` before tokenizing, mapping
            'I' to dotless 'ı' and 'İ' to 'i'.

    Returns:
        A list with `utils.to_unicode(token)` for every token produced by
        `utils.tokenize(content, lower=False, errors='ignore')` whose length is
        within the limits and which does not start with an underscore.
    """
    if lower:
        # Only the dotted/dotless I pair needs Turkish-specific handling, so
        # translate it first and let `.lower()` handle every other letter.
        # The previous hand-written table left Q, W, X and any non-Turkish
        # capital untouched (e.g. "NEW" became "neW").
        turkish_i = {ord(u'I'): u'ı', ord(u'İ'): u'i'}
        content = content.translate(turkish_i).lower()
    return [
        utils.to_unicode(token)
        for token in utils.tokenize(content, lower=False, errors='ignore')
        if token_min_len <= len(token) <= token_max_len and not token.startswith('_')
    ]

開發者ID:akoksal，項目名稱:Turkish-Word2Vec，代碼行數:10，代碼來源:preprocess.py

注：本文中的gensim.utils.tokenize方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台，相關代碼片段篩選自各路編程大神貢獻的開源項目，源碼版權歸原作者所有，傳播和使用請參考對應項目的License；未經允許，請勿轉載。