本文整理汇总了Python中gensim.utils.tokenize方法的典型用法代码示例。如果您正苦于以下问题：Python utils.tokenize方法的具体用法？Python utils.tokenize怎么用？Python utils.tokenize使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块gensim.utils的用法示例。
在下文中一共展示了utils.tokenize方法的9个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: get_texts
# 需要导入模块: from gensim import utils [as 别名]
# 或者: from gensim.utils import tokenize [as 别名]
def get_texts(self):
    """
    Stream the collection one document at a time.

    This sample implementation treats every line obtained from
    `self.getstream()` as one document and yields the result of
    `utils.tokenize(line, lowercase=True)` for it. When `self.metadata` is
    truthy, a `(tokens, (lineno,))` pair is yielded instead, where `lineno`
    is the zero-based line index.

    Override this method to match your own input format (parsing,
    preprocessing, lowercasing, tokenizing, ...). Nothing else is done to
    the words produced here.

    Once the stream has been consumed completely, `self.length` is set to
    the number of lines seen (0 for an empty stream).
    """
    seen = 0
    with self.getstream() as stream:
        for index, raw_line in enumerate(stream):
            seen = index + 1
            tokens = utils.tokenize(raw_line, lowercase=True)
            if not self.metadata:
                yield tokens
            else:
                yield tokens, (index,)
    # Only reached when the stream was exhausted, not when iteration stops early.
    self.length = seen
示例2: simple_preprocess
# 需要导入模块: from gensim import utils [as 别名]
# 或者: from gensim.utils import tokenize [as 别名]
def simple_preprocess(
    doc: str,
    lower: bool = False,
    deacc: bool = False,
    min_len: int = 2,
    max_len: int = 15,
) -> List[str]:
    r"""
    Variant of gensim's `simple_preprocess` with an explicit `lower` switch.

    Args:
        doc: Input text to tokenize.
        lower: Forwarded to `tokenize`; when true the tokens are lower-cased.
        deacc: Forwarded to `tokenize` (accent removal).
        min_len: Minimum token length (inclusive) to keep.
        max_len: Maximum token length (inclusive) to keep.

    Returns:
        The tokens of `doc` whose length lies in `[min_len, max_len]` and
        that do not start with an underscore.

    For more information see: https://radimrehurek.com/gensim/utils.html
    """
    # `lower` used to be hard-coded to False here, which silently ignored the
    # caller's argument; forward it so the parameter actually takes effect.
    return [
        token
        for token in tokenize(doc, lower=lower, deacc=deacc, errors="ignore")
        if min_len <= len(token) <= max_len and not token.startswith("_")
    ]
示例3: tokenize
# 需要导入模块: from gensim import utils [as 别名]
# 或者: from gensim.utils import tokenize [as 别名]
def tokenize(self, content):
    """
    Abstract tokenizer hook; concrete subclasses are expected to override it.

    Raises:
        NotImplementedError: always, since this base version has no behavior.
    """
    message = 'Abstract Base Class'
    raise NotImplementedError(message)
示例4: characters
# 需要导入模块: from gensim import utils [as 别名]
# 或者: from gensim.utils import tokenize [as 别名]
def characters(self, text):
    """
    Collect tokens from character data located directly inside a <p> tag.

    Text is ignored unless the last entry of `self.path` is 'p'
    (presumably `self.path` is the stack of currently open element names;
    confirm against the rest of the handler). Purely numeric tokens are
    dropped; the remaining ones are utf8-encoded and appended to
    `self.tokens`.
    """
    if self.path[-1] != 'p':
        return
    words = (word for word in utils.tokenize(text, errors='ignore') if not word.isdigit())
    # Materialize the full list first so `self.tokens` is extended in one step.
    encoded = [word.encode('utf8') for word in words]
    self.tokens.extend(encoded)
#endclass ArxmlivHandler
示例5: process_article
# 需要导入模块: from gensim import utils [as 别名]
# 或者: from gensim.utils import tokenize [as 别名]
def process_article(args):
    """
    Turn one wikipedia article into tokens.

    `args` is a 4-tuple `(text, lemmatize, title, pageid)`. The text is first
    passed through `filter_wiki`; it is then processed with `utils.lemmatize`
    when the `lemmatize` flag is truthy, and with `tokenize` otherwise.

    Returns a `(tokens, title, pageid)` tuple.
    """
    raw_text, use_lemmatizer, title, pageid = args
    cleaned = filter_wiki(raw_text)
    tokens = utils.lemmatize(cleaned) if use_lemmatizer else tokenize(cleaned)
    return tokens, title, pageid
示例6: tokenize
# 需要导入模块: from gensim import utils [as 别名]
# 或者: from gensim.utils import tokenize [as 别名]
def tokenize(content):
    """
    Tokenize a piece of text from wikipedia. The input string `content` is
    assumed to be mark-up free (see `filter_wiki()`).

    Return a list of tokens as utf8 bytestrings. Words shorter than 2 or
    longer than 15 characters (not bytes!) are ignored, as are words that
    start with an underscore.
    """
    # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
    kept = []
    for word in utils.tokenize(content, lower=True, errors='ignore'):
        if len(word) < 2 or len(word) > 15:
            continue
        if word.startswith('_'):
            continue
        kept.append(word.encode('utf8'))
    return kept
示例7: get_texts
# 需要导入模块: from gensim import utils [as 别名]
# 或者: from gensim.utils import tokenize [as 别名]
def get_texts(self, metadata=None):
    """Iterate over the lines of "The Complete Works of William Shakespeare".

    This yields lists of strings (**texts**) rather than vectors (vectorized bags-of-words).
    And the **texts** yielded are lines rather than entire plays or sonnets.
    If you want vectors, use the corpus interface instead of this method.

    Only lines whose number falls in the `[start, stop)` range of one of the
    entries in `self.book_meta['volumes']` are yielded. Iteration ends as soon
    as a line is reached after the last volume has been passed.

    Args:
        metadata: When truthy, yield `(tokens, (lineno,))` pairs instead of
            bare token lists. Defaults to `self.metadata` when None.

    Example (output shown in Python 2 form):

    >>> shakes = ShakesCorpus(lowercase=True)
    >>> for i, tokens in enumerate(shakes.get_texts()):
    ...     print(i, tokens)
    ...     if i >= 4:
    ...         break
    (0, [])
    (1, [])
    (2, [u'the', u'sonnets'])
    (3, [])
    (4, [u'by', u'william', u'shakespeare'])
    """
    if metadata is None:
        metadata = self.metadata
    self.input_file = gzip.GzipFile(self.input_file_path)
    volumes = self.book_meta['volumes']
    volume_num = 0
    with self.input_file as lines:
        for lineno, line in enumerate(lines):
            if volume_num >= len(volumes):
                # A plain `return` ends the generator. Raising StopIteration
                # inside a generator is converted to RuntimeError (PEP 479).
                return
            volume = volumes[volume_num]
            if lineno < volume['start']:
                continue
            if lineno >= volume['stop']:
                volume_num += 1  # don't yield the "THE END" line?
                continue
            # TODO: use self.book_meta['volumes'][volume_num]['sections'] for act/scene numbers
            # TODO: use self.lemmatize
            toks = self.tokenize(line, lowercase=self.lowercase)
            if metadata:
                yield (toks, (lineno,))
            else:
                yield toks
示例8: tokenize
# 需要导入模块: from gensim import utils [as 别名]
# 或者: from gensim.utils import tokenize [as 别名]
def tokenize(self, line, **kwargs):
    """
    Tokenize `line` with `utils.tokenize` and return the tokens as a list.

    Any keyword arguments are passed through to `utils.tokenize` unchanged.
    """
    token_stream = utils.tokenize(line, **kwargs)
    return list(token_stream)
示例9: tokenize_tr
# 需要导入模块: from gensim import utils [as 别名]
# 或者: from gensim.utils import tokenize [as 别名]
def tokenize_tr(content, token_min_len=2, token_max_len=50, lower=True):
    """
    Tokenize Turkish text, optionally lower-casing it with Turkish rules.

    When `lower` is truthy, `content` is first translated with an explicit
    upper-to-lower table for the Turkish alphabet, so that 'I' becomes the
    dotless 'ı' and 'İ' becomes 'i'. Letters that are not in the table
    (for example Q, W and X) are left as they are.

    Returns a list of tokens, each passed through `utils.to_unicode`, whose
    length lies in `[token_min_len, token_max_len]` and that do not start
    with an underscore.
    """
    if lower:
        # The two strings are position-aligned: upper_letters[i] -> lower_letters[i].
        upper_letters = u'ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZ'
        lower_letters = u'abcçdefgğhıijklmnoöprsştuüvyz'
        lowerMap = {ord(up): low for up, low in zip(upper_letters, lower_letters)}
        content = content.translate(lowerMap)

    selected = []
    # Case folding was already done above, so the tokenizer must not lower again.
    for token in utils.tokenize(content, lower=False, errors='ignore'):
        if len(token) < token_min_len or len(token) > token_max_len:
            continue
        if token.startswith('_'):
            continue
        selected.append(utils.to_unicode(token))
    return selected