This article collects typical usage examples of the Python class calibre.ebooks.conversion.utils.HeuristicProcessor. If you have been wondering what the HeuristicProcessor class is for, or how to use it in practice, the curated class examples below may help.
Nine code examples of the HeuristicProcessor class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
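Before the examples, here is a minimal usage sketch based on the patterns that appear below: construct a HeuristicProcessor with a log object and apply individual heuristics to an HTML string. The sample HTML and the use of calibre's default_log are assumptions added for illustration; the constructor call and the helper methods fix_nbsp_indents and get_word_count are taken from the examples themselves.

# Minimal sketch (illustrative assumptions noted above), not part of the
# collected examples: run two HeuristicProcessor helpers over an HTML string.
from calibre.ebooks.conversion.utils import HeuristicProcessor
from calibre.utils.logging import default_log  # stand-in for a plugin/pipeline log

html = u'<html><body><p>&nbsp;&nbsp;&nbsp;An indented, hard-wrapped paragraph.</p></body></html>'

preprocessor = HeuristicProcessor(log=default_log)
words = preprocessor.get_word_count(html)   # rough word count used by the heuristics
html = preprocessor.fix_nbsp_indents(html)  # convert &nbsp; indents to a text-indent style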
Example 1: smarten_punctuation
def smarten_punctuation(self):
    """Convert standard punctuation to "smart" punctuation."""
    preprocessor = HeuristicProcessor(log=self.log)
    for name in self.html_names():
        self.log.info("Smartening punctuation for file {0}".format(name))
        html = self.get_raw(name, force_unicode=True)
        if html is None:
            continue
        # Fix non-breaking space indents
        html = preprocessor.fix_nbsp_indents(html)
        # Smarten punctuation
        html = smartyPants(html)
        # Ellipsis to HTML entity
        html = ELLIPSIS_RE.sub("…", html)
        # Double-dash and unicode char code to em-dash
        html = string.replace(html, "---", " – ")
        html = string.replace(html, "\x97", " – ")
        html = string.replace(html, "\u2013", " – ")
        html = string.replace(html, "--", " — ")
        html = string.replace(html, "\u2014", " — ")
        # Fix comment nodes that got mangled
        html = string.replace(html, "<! — ", "<!-- ")
        html = string.replace(html, " — >", " -->")
        self.dirty(name)
        self.flush_cache()
Example 2: smarten_punctuation
def smarten_punctuation(self):
    preprocessor = HeuristicProcessor(log=self.log)
    for name in self.get_html_names():
        self.log.info("Smartening punctuation for file {0}".format(name))
        html = self.get_raw(name)
        html = html.encode("UTF-8")
        # Fix non-breaking space indents
        html = preprocessor.fix_nbsp_indents(html)
        # Smarten punctuation
        html = smartyPants(html)
        # Ellipsis to HTML entity
        html = re.sub(ur'(?u)(?<=\w)\s?(\.\s+?){2}\.', '…', html, flags=re.UNICODE | re.MULTILINE)
        # Double-dash and unicode char code to em-dash
        html = string.replace(html, '---', ' – ')
        html = string.replace(html, u"\x97", ' – ')
        html = string.replace(html, '--', ' — ')
        html = string.replace(html, u"\u2014", ' — ')
        html = string.replace(html, u"\u2013", ' – ')
        # Fix comment nodes that got mangled
        html = string.replace(html, u'<! — ', u'<!-- ')
        html = string.replace(html, u' — >', u' -->')
        # Remove Unicode replacement characters
        html = string.replace(html, u"\uFFFD", "")
        self.dirty(name)
        self.flush_cache()
Example 3: smarten_punctuation
def smarten_punctuation(self):
    preprocessor = HeuristicProcessor(log = self.log)
    for name in self.get_html_names():
        html = self.get_raw(name)
        html = html.encode("UTF-8")
        # Fix non-breaking space indents
        html = preprocessor.fix_nbsp_indents(html)
        # Smarten punctuation
        html = smartyPants(html)
        # Ellipsis to HTML entity
        html = re.sub(r'(?u)(?<=\w)\s?(\.\s+?){2}\.', '…', html)
        # Double-dash and unicode char code to em-dash
        html = string.replace(html, '---', ' – ')
        html = string.replace(html, u"\x97", ' – ')
        html = string.replace(html, '--', ' — ')
        html = string.replace(html, u"\u2014", ' — ')
        html = string.replace(html, u"\u2013", ' – ')
        html = string.replace(html, u"...", "…")
        # Remove Unicode replacement characters
        html = string.replace(html, u"\uFFFD", "")
        self.set(name, html)
Example 4: clean_markup
def clean_markup(self):
    preprocessor = HeuristicProcessor(log = self.log)
    for name in self.get_html_names():
        html = self.get_raw(name)
        html = html.encode("UTF-8")
        html = string.replace(html, u"\u2014", ' -- ')
        html = string.replace(html, u"\u2013", ' --- ')
        html = string.replace(html, u"\x97", ' --- ')
        html = preprocessor.cleanup_markup(html)
        # Remove Unicode replacement characters
        html = string.replace(html, u"\uFFFD", "")
        self.set(name, html)
Example 5: smarten_punctuation
def smarten_punctuation(html, log=None):
    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    preprocessor = HeuristicProcessor(log=log)
    from uuid import uuid4
    start = 'calibre-smartypants-'+str(uuid4())
    stop = 'calibre-smartypants-'+str(uuid4())
    html = html.replace('<!--', start)
    html = html.replace('-->', stop)
    html = preprocessor.fix_nbsp_indents(html)
    html = smartyPants(html)
    html = html.replace(start, '<!--')
    html = html.replace(stop, '-->')
    return substitute_entites(html)
Example 6: smarten_punctuation
def smarten_punctuation(html, log):
    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    preprocessor = HeuristicProcessor(log=log)
    from uuid import uuid4
    start = 'calibre-smartypants-'+str(uuid4())
    stop = 'calibre-smartypants-'+str(uuid4())
    html = html.replace('<!--', start)
    html = html.replace('-->', stop)
    html = preprocessor.fix_nbsp_indents(html)
    html = smartyPants(html)
    html = html.replace(start, '<!--')
    html = html.replace(stop, '-->')
    # convert ellipsis to entities to prevent wrapping
    html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html)
    # convert double dashes to em-dash
    html = re.sub(r'\s--\s', u'\u2014', html)
    return substitute_entites(html)
Example 7: smarten_punctuation
def smarten_punctuation(html, log):
    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    preprocessor = HeuristicProcessor(log=log)
    from uuid import uuid4
    start = "calibre-smartypants-" + str(uuid4())
    stop = "calibre-smartypants-" + str(uuid4())
    html = html.replace("<!--", start)
    html = html.replace("-->", stop)
    html = preprocessor.fix_nbsp_indents(html)
    html = smartyPants(html)
    html = html.replace(start, "<!--")
    html = html.replace(stop, "-->")
    # convert ellipsis to entities to prevent wrapping
    html = re.sub(r"(?u)(?<=\w)\s?(\.\s?){2}\.", "&hellip;", html)
    # convert double dashes to em-dash
    html = re.sub(r"\s--\s", u"\u2014", html)
    return substitute_entites(html)
Example 8: __call__
#......... part of the code omitted here .........
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
end_rules.append(
    # Un wrap using punctuation
    (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),  # noqa
)

for rule in self.PREPROCESS + start_rules:
    html = rule[0].sub(rule[1], html)

if self.regex_wizard_callback is not None:
    self.regex_wizard_callback(self.current_href, html)

if get_preprocess_html:
    return html

def dump(raw, where):
    import os
    dp = getattr(self.extra_opts, 'debug_pipeline', None)
    if dp and os.path.exists(dp):
        odir = os.path.join(dp, 'input')
        if os.path.exists(odir):
            odir = os.path.join(odir, where)
            if not os.path.exists(odir):
                os.makedirs(odir)
            name, i = None, 0
            while not name or os.path.exists(os.path.join(odir, name)):
                i += 1
                name = '%04d.html'%i
            with open(os.path.join(odir, name), 'wb') as f:
                f.write(raw.encode('utf-8'))

# dump(html, 'pre-preprocess')

for rule in rules + end_rules:
    try:
        html = rule[0].sub(rule[1], html)
    except Exception as e:
        if rule in user_sr_rules:
            self.log.error(
                'User supplied search & replace rule: %s -> %s '
                'failed with error: %s, ignoring.'%(
                    user_sr_rules[rule], rule[1], e))
        else:
            raise

if is_pdftohtml and length > -1:
    # Dehyphenate
    dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
    html = dehyphenator(html,'html', length)

if is_pdftohtml:
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    pdf_markup = HeuristicProcessor(self.extra_opts, None)
    totalwords = 0
    if pdf_markup.get_word_count(html) > 7000:
        html = pdf_markup.markup_chapters(html, totalwords, True)

# dump(html, 'post-preprocess')

# Handle broken XHTML w/ SVG (ugh)
if 'svg:' in html and SVG_NS not in html:
    html = html.replace(
        '<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
if 'xlink:' in html and XLINK_NS not in html:
    html = html.replace(
        '<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)

html = XMLDECL_RE.sub('', html)

if getattr(self.extra_opts, 'asciiize', False):
    from calibre.utils.localization import get_udc
    from calibre.utils.mreplace import MReplace
    unihandecoder = get_udc()
    mr = MReplace(data={u'«':u'&lt;'*3, u'»':u'&gt;'*3})
    html = mr.mreplace(html)
    html = unihandecoder.decode(html)

if getattr(self.extra_opts, 'enable_heuristics', False):
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    preprocessor = HeuristicProcessor(self.extra_opts, self.log)
    html = preprocessor(html)

if is_pdftohtml:
    html = html.replace('<!-- created by calibre\'s pdftohtml -->', '')

if getattr(self.extra_opts, 'smarten_punctuation', False):
    html = smarten_punctuation(html, self.log)

try:
    unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
except AttributeError:
    unsupported_unicode_chars = u''
if unsupported_unicode_chars:
    from calibre.utils.localization import get_udc
    unihandecoder = get_udc()
    for char in unsupported_unicode_chars:
        asciichar = unihandecoder.decode(char)
        html = html.replace(char, asciichar)

return html
Example 9: convert
def convert(self, stream, options, file_ext, log,
            accelerators):
    from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
    from calibre.ebooks.chardet import detect
    from calibre.utils.zipfile import ZipFile
    from calibre.ebooks.txt.processor import (convert_basic,
            convert_markdown_with_metadata, separate_paragraphs_single_line,
            separate_paragraphs_print_formatted, preserve_spaces,
            detect_paragraph_type, detect_formatting_type,
            normalize_line_endings, convert_textile, remove_indents,
            block_to_single_line, separate_hard_scene_breaks)

    self.log = log
    txt = ''
    log.debug('Reading text from file...')
    length = 0

    # Extract content from zip archive.
    if file_ext == 'txtz':
        zf = ZipFile(stream)
        zf.extractall('.')

        for x in walk('.'):
            if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                with open(x, 'rb') as tf:
                    txt += tf.read() + '\n\n'
    else:
        txt = stream.read()
        if file_ext in {'md', 'textile', 'markdown'}:
            options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
            log.info('File extension indicates particular formatting. '
                     'Forcing formatting type to: %s'%options.formatting_type)
            options.paragraph_type = 'off'

    # Get the encoding of the document.
    if options.input_encoding:
        ienc = options.input_encoding
        log.debug('Using user specified input encoding of %s' % ienc)
    else:
        det_encoding = detect(txt[:4096])
        det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
        if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
                'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
                'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
            # Microsoft Word exports to HTML with encoding incorrectly set to
            # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
            det_encoding = 'gbk'
        ienc = det_encoding
        log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, confidence * 100))
    if not ienc:
        ienc = 'utf-8'
        log.debug('No input encoding specified and could not auto detect using %s' % ienc)

    # Remove BOM from start of txt as its presence can confuse markdown
    import codecs
    for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8, codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
        if txt.startswith(bom):
            txt = txt[len(bom):]
            break

    txt = txt.decode(ienc, 'replace')

    # Replace entities
    txt = _ent_pat.sub(xml_entity_to_unicode, txt)

    # Normalize line endings
    txt = normalize_line_endings(txt)

    # Determine the paragraph type of the document.
    if options.paragraph_type == 'auto':
        options.paragraph_type = detect_paragraph_type(txt)
        if options.paragraph_type == 'unknown':
            log.debug('Could not reliably determine paragraph type using block')
            options.paragraph_type = 'block'
        else:
            log.debug('Auto detected paragraph type as %s' % options.paragraph_type)

    # Detect formatting
    if options.formatting_type == 'auto':
        options.formatting_type = detect_formatting_type(txt)
        log.debug('Auto detected formatting as %s' % options.formatting_type)

    if options.formatting_type == 'heuristic':
        setattr(options, 'enable_heuristics', True)
        setattr(options, 'unwrap_lines', False)
        setattr(options, 'smarten_punctuation', True)

    # Reformat paragraphs to block formatting based on the detected type.
    # We don't check for block because the processor assumes block.
    # single and print at transformed to block for processing.
    if options.paragraph_type == 'single':
        txt = separate_paragraphs_single_line(txt)
    elif options.paragraph_type == 'print':
        txt = separate_hard_scene_breaks(txt)
        txt = separate_paragraphs_print_formatted(txt)
        txt = block_to_single_line(txt)
    elif options.paragraph_type == 'unformatted':
        from calibre.ebooks.conversion.utils import HeuristicProcessor
        # unwrap lines based on punctuation
        docanalysis = DocAnalysis('txt', txt)
        length = docanalysis.line_length(.5)
        preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
        #......... rest of the code omitted here .........