This article collects typical usage examples of the Tokenizer.analyze method from the Python tokenizer module. If you have been wondering what exactly Tokenizer.analyze does, how to call it, or what real uses of Tokenizer.analyze look like, the curated method examples here may help. You can also read more about the class the method belongs to, tokenizer.Tokenizer.
Below are 2 code examples of the Tokenizer.analyze method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
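Before the full examples, here is a minimal sketch of the calling pattern, assuming only what the two examples below demonstrate: Tokenizer.analyze(text) takes a string and returns a stream/list of tokens that a project-specific Parser then consumes. The zero-argument constructor (Example 1 instead passes a metadata dict) and the sample input are illustrative assumptions, not code taken from this page.

# Minimal usage sketch (assumptions noted above; not from the examples verbatim).
from tokenizer import Tokenizer

tokenizer = Tokenizer()                     # Example 1 passes a metadata dict instead
raw_text = "Some input text to tokenize."   # placeholder input, not from the source
tokens = tokenizer.analyze(raw_text)        # assumed to return an iterable of tokens
for token in tokens:                        # e.g. inspect the tokens one by one
    print(token)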
Example 1: Tokenizer
# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import analyze [as alias]
# At this point, we have a valid text file for a case that does not exist on Wikisource
logger.info("Parsing {0}.".format(metadict['title']))
tokenizer = Tokenizer(metadict)
parser = Parser(metadict)
try:
    os.mkdir('wikitext')
except OSError:
    pass
out_filename = 'wikitext/' + re.sub(r'[^a-zA-Z0-9_]', '', metadict['title'])
postprocessor = Postprocessor(out_filename)
with open(file, 'r', encoding='utf-8') as input_file:
    raw_text = input_file.read()
try:
    token_stream = tokenizer.analyze(raw_text)
except IllegalCharacter as e:
    logger.error("Illegal character encountered: \"{0}\" at {1}. More: {2}"
                 .format(raw_text[e.value], e.value,
                         (raw_text[e.value:e.value+20] + "...").replace('\n', '\\n')))
with open(out_filename, 'w', encoding='utf-8') as output_file:
    parser.parse(token_stream, output_file)
postprocessor.process()
# Begin the bot parsing
try:
    os.mkdir('botfiles')
except OSError:
    pass
try:
    os.mkdir('botfiles/pdfs')
except OSError:
    pass
Example 2: Tokenizer
# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import analyze [as alias]
# Open and read files
tokenizer = Tokenizer()
progress = util.ProgressChecker()
parser = Parser(progress)
if not os.path.exists(os.curdir + '/latex'):
    os.mkdir(os.curdir + '/latex')
#folders = sorted(os.listdir(path=(os.curdir + '/text')), key=int)
folders = ['0', '1', '2', '3']
for folder in folders:
    files = sorted(os.listdir(path=(os.curdir + '/text/' + folder)), key=lambda x: int(x[0]))
    if folder == '3':
        files = ['0.txt', '1.txt']
    with codecs.open(os.curdir + '/latex/' + folder + '.tex', 'w+', 'utf-8') as outputfile:
        last_open = os.curdir + '/latex/' + folder + '.tex'
        for file in files:
            logger.debug("Parsing " + folder + "/" + file + " to " + folder + ".tex.")
            with codecs.open(os.curdir + '/text/' + folder + '/' + file, 'r', 'utf-8') as f:
                data = f.read()
                token_list = tokenizer.analyze(data)
                parser.begin(outputfile)
                parser.dispatch(token_list)
print("Total number of pages included in main pages: " + str(doc.num_pages))
progress.get_statistics()
# with codecs.open(last_open, 'a', 'utf-8') as outputfile:
# contributors = doc.attribute()
# parser.end_matter(contributors, outputfile)
logger.debug("Parsing complete.")