本文整理汇总了 Python 中 tidy.parseString 函数的典型用法代码示例。如果您正苦于以下问题：Python parseString 函数的具体用法？Python parseString 怎么用？Python parseString 使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了 parseString 函数的 15 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: test_bad_option_values
def test_bad_option_values(self):
    """Check that unusable option values make ``tidy.parseString`` fail.

    Every keyword set in ``badopts`` is passed to ``tidy.parseString``
    together with ``self.input2``; each call must raise
    ``tidy.OptionArgError`` with a message matching
    "missing or malformed argument".
    """
    # ``assertRaisesRegexp`` is a deprecated alias that was removed from
    # unittest in Python 3.12. Prefer the current name and only fall back
    # to the alias on interpreters that do not provide it.
    assert_raises_regex = getattr(self, "assertRaisesRegex", None)
    if assert_raises_regex is None:
        assert_raises_regex = self.assertRaisesRegexp

    badopts = [{"indent": "---"}, {"indent_spaces": None}]
    for opts in badopts:
        with assert_raises_regex(
            tidy.OptionArgError, "missing or malformed argument"
        ):
            tidy.parseString(self.input2, **opts)
示例2: test_bad_options
def test_bad_options(self):
    """Check that unknown option names make ``tidy.parseString`` fail.

    Every keyword set in ``badopts`` is passed to ``tidy.parseString``
    together with ``self.input2``; each call must raise
    ``tidy.InvalidOptionError`` with a message matching
    "not a valid Tidy option".
    """
    # ``assertRaisesRegexp`` is a deprecated alias that was removed from
    # unittest in Python 3.12. Prefer the current name and only fall back
    # to the alias on interpreters that do not provide it.
    assert_raises_regex = getattr(self, "assertRaisesRegex", None)
    if assert_raises_regex is None:
        assert_raises_regex = self.assertRaisesRegexp

    badopts = [{"foo": 1}]
    for opts in badopts:
        with assert_raises_regex(
            tidy.InvalidOptionError, "not a valid Tidy option"
        ):
            tidy.parseString(self.input2, **opts)
示例3: test_encodings
def test_encodings(self):
    """Check that ``output_encoding`` controls the bytes tidy produces.

    ``foo.htm`` is read as UTF-8 and re-encoded to pure ASCII with XML
    character references, then tidied twice: once to Latin-1 and once to
    UTF-8. The test looks for ``'\\xe9'`` (Latin-1 "e acute") in the first
    result and ``'\\xc3\\xa9'`` (the same character in UTF-8) in the second.
    """
    # Read through a context manager so the file handle is closed right
    # away instead of being left to the garbage collector; binary mode
    # keeps the bytes untouched before the explicit decode.
    with open('foo.htm', 'rb') as source:
        raw = source.read()
    foo = raw.decode('utf8').encode('ascii', 'xmlcharrefreplace')

    doc1u = tidy.parseString(foo, input_encoding='ascii',
                             output_encoding='latin1')
    # assertTrue replaces the long-deprecated failUnless alias.
    self.assertTrue(str(doc1u).find('\xe9') >= 0)

    doc2u = tidy.parseString(foo, input_encoding='ascii',
                             output_encoding='utf8')
    self.assertTrue(str(doc2u).find('\xc3\xa9') >= 0)
示例4: test_badOptions
def test_badOptions(self):
    """Check that each bad option set makes ``tidy.parseString`` raise.

    For every keyword set below, parsing ``self.input2`` must raise
    ``tidy.TidyLibError``; if a call returns normally the test fails and
    names the offending option set.
    """
    candidates = [{'foo': 1}, {'indent': '---'}, {'indent_spaces': None}]
    for bad in candidates:
        rejected = False
        try:
            tidy.parseString(self.input2, **bad)
        except tidy.TidyLibError:
            rejected = True
        if not rejected:
            self.fail("Invalid option %s should have raised an error" %
                      repr(bad))
示例5: test_encodings
def test_encodings(self):
    """Check that ``output_encoding`` controls the bytes tidy produces.

    The file at ``self.test_file`` is read as UTF-8 and re-encoded to pure
    ASCII with XML character references, then tidied twice: once to
    Latin-1 and once to UTF-8. The test looks for ``b"\\xe9"`` (Latin-1
    "e acute") in the first result and ``b"\\xc3\\xa9"`` (the same
    character in UTF-8) in the second.
    """
    # Use a context manager so the handle is closed deterministically
    # rather than leaking until garbage collection (ResourceWarning).
    with open(self.test_file, "rb") as source:
        text = source.read().decode("utf8").encode("ascii", "xmlcharrefreplace")

    doc1u = tidy.parseString(text, input_encoding="ascii", output_encoding="latin1")
    # assertIn reports both operands on failure, unlike assertTrue(find() >= 0).
    self.assertIn(b"\xe9", doc1u.getvalue())

    doc2u = tidy.parseString(text, input_encoding="ascii", output_encoding="utf8")
    self.assertIn(b"\xc3\xa9", doc2u.getvalue())
示例6: test_options
def test_options(self):
    """Exercise several tidy options through ``parseString`` and ``parse``.

    * ``self.input1`` tidied as XHTML must contain "CDATA".
    * A bare ``<Html>`` tidied with the same options must start with an
      XML declaration, report at least one error, and contain no "\\n"
      (the options request ``newline="CR"``).
    * ``self.test_file`` parsed with ``alt_text="foo"`` must contain
      ``alt="foo"`` and the character "é".
    """
    xhtml_opts = dict(add_xml_decl=1, show_errors=1, newline="CR", output_xhtml=1)

    first = tidy.parseString(self.input1, **xhtml_opts)
    self.assertIn("CDATA", str(first))

    second = tidy.parseString("<Html>", **xhtml_opts)
    second_text = str(second)
    self.assertTrue(second_text.startswith("<?xml"))
    error_count = len(second.errors)
    self.assertFalse(error_count == 0)
    self.assertNotIn("\n", second_text)

    third = tidy.parse(self.test_file, char_encoding="utf8", alt_text="foo")
    third_text = third.gettext()
    self.assertIn('alt="foo"', third_text)
    self.assertIn("é", third_text)
示例7: load_doc_file
def load_doc_file(filename, f):
    """Tidy one documentation file and insert it into the ``docs`` table.

    The contents of ``f`` are decoded as Latin-1, the title is extracted
    with ``re_titlematch`` (empty string when there is no match), the
    document is tidied from its UTF-8 encoding, and one row is written via
    the module-level cursor ``curs``. The module-level ``pagecount`` is
    incremented by one.

    @param filename: name stored in the ``file`` column and shown in the
        progress line
    @param f: readable object whose ``read()`` yields the raw page bytes
    """
    global pagecount

    raw_page = unicode(f.read(), 'latin1')

    title_match = re_titlematch.search(raw_page)
    title = title_match.group(1) if title_match else ""

    if not quiet:
        print("--- file: %s (%s) ---" % (filename, title))

    tidy_settings = {
        'drop_proprietary_attributes': 1,
        'alt_text': '',
        'hide_comments': 1,
        'output_xhtml': 1,
        'show_body_only': 1,
        'clean': 1,
        'char_encoding': 'utf8',
        'indent': 'auto',
    }
    tidied = tidy.parseString(raw_page.encode('utf-8'), **tidy_settings)

    row = {'f': filename, 'v': ver, 't': title, 'c': str(tidied)}
    curs.execute("INSERT INTO docs (file, version, title, content) VALUES (%(f)s, %(v)s, %(t)s, %(c)s)", row)

    pagecount += 1
示例8: get_page_title
def get_page_title(content):
    """Return the text of the page's ``<title>``, or "" when none is found.

    The content is tidied to XHTML, rewritten with ``ENTITY.sub`` and
    parsed with ``etree``; the title is read from the XHTML-namespaced
    ``head/title`` element. If any step raises, the error is printed and
    the title is searched for with the ``R_TITLE`` regex instead.
    """
    xhtml_ns = "{http://www.w3.org/1999/xhtml}"
    try:
        tidied = tidy.parseString(
            content,
            output_xhtml=True,
            add_xml_decl=True,
            indent=False,
            tidy_mark=False,
        )
        # Rebind ``content`` step by step: the regex fallback below works
        # on whatever stage was reached before the failure.
        content = str(tidied)
        content = ENTITY.sub(ENTITY_REP, content)
        document = etree.fromstring(content)
        head_node = document.find(xhtml_ns + "head")
        title_node = head_node.find(xhtml_ns + "title")
        found = title_node.text
        # NOTE(review): the half-second pause happens only on the success
        # path; its purpose is not visible here (presumably throttling).
        time.sleep(0.5)
        return found
    except Exception as e:
        print(" ".join(("\tHTML Parser Error:", str(e))))
        fallback = R_TITLE.search(content)
        return fallback.group(1) if fallback is not None else ""
示例9: issue
def issue(answers_xml):
    """Transform the answers document with the XSLT at ``XSLT_SOURCE``.

    ``answers_xml`` is handed to ``validateAnswers``; whatever that returns
    is run through the stylesheet and the serialised result is returned.

    NOTE(review): the original body had a tidy post-processing step
    (``tidy.parseString(..., output_xml=1, input_xml=1, tidy_mark=0,
    indent=1)`` with a fallback to the raw output) placed *after* an
    unconditional ``return``, so it never ran. That dead code is removed
    here and the actual behaviour (untidied output) is unchanged. Confirm
    with the owners whether tidy should be re-enabled.
    """
    # validateAnswers presumably validates and returns the parsed answers
    # document that the transform consumes -- verify against its definition.
    ctxt = validateAnswers(answers_xml)

    # Compile the stylesheet and apply it to the answers document.
    transform = lxml.etree.XSLT(
        lxml.etree.parse(XSLT_SOURCE)
    )
    result = transform.apply(ctxt)

    # Return the transformed document exactly as the stylesheet produced it.
    return transform.tostring(result)
示例10: tidyhtml
def tidyhtml(html):
    """Tidy *html* as XHTML and return the result as a string.

    ``unicode`` input is encoded to UTF-8 first; tidy is told that both
    its input and its output are UTF-8.
    """
    source = html.encode("utf-8") if isinstance(html, unicode) else html
    tidied = tidy.parseString(
        source,
        output_xhtml=1,
        tidy_mark=0,
        input_encoding="utf8",
        output_encoding="utf8",
    )
    return str(tidied)
示例11: clean
def clean(txt):
    """Tidy *txt* as strict XHTML and return it decoded from UTF-8.

    Tidy is run with no XML declaration, no indentation, no tidy mark and
    ``wrap`` set to 0.
    """
    tidied = tidy.parseString(
        txt,
        output_xhtml=1,
        add_xml_decl=0,
        indent=0,
        tidy_mark=0,
        doctype="strict",
        wrap=0,
    )
    return unicode(str(tidied), 'utf8')
示例12: tidy_html
def tidy_html(html_buffer, cleaning_lib='utidylib'):
    """
    Tidy up the input HTML using one of the installed cleaning
    libraries.

    Cleaning is best-effort: if the chosen library raises while
    processing, the input is returned unchanged.

    @param html_buffer: the input HTML to clean up
    @type html_buffer: string
    @param cleaning_lib: chose the preferred library to clean the HTML. One of:
        - utidylib
        - beautifulsoup
    @return: a cleaned version of the input HTML
    @note: requires uTidylib or BeautifulSoup to be installed. If the chosen
        library is missing, the input X{html_buffer} is returned I{as is}.
    """
    if cleaning_lib == 'utidylib' and CFG_TIDY_INSTALLED:
        options = dict(output_xhtml=1,
                       show_body_only=1,
                       merge_divs=0,
                       wrap=0)
        try:
            output = str(tidy.parseString(html_buffer, **options))
        except Exception:
            # Deliberate fallback to the raw input, but only for ordinary
            # errors: a bare ``except`` would also swallow
            # KeyboardInterrupt and SystemExit.
            output = html_buffer
    elif cleaning_lib == 'beautifulsoup' and CFG_BEAUTIFULSOUP_INSTALLED:
        try:
            output = str(BeautifulSoup(html_buffer).prettify())
        except Exception:
            # Same best-effort policy as the uTidylib branch.
            output = html_buffer
    else:
        # Unknown library name, or the requested library is not installed.
        output = html_buffer
    return output
示例13: _tidy2
def _tidy2(text):
    """Tidy *text* as XHTML with uTidyLib and hand the result to ``_in_tag``.

    Tidy is run with no XML declaration, no indentation and no tidy mark.
    The string form of the result is passed to ``_in_tag`` with ``'body'``
    (presumably to keep only the body content -- confirm against
    ``_in_tag``).
    """
    tidied = tidy.parseString(
        text,
        output_xhtml=1,
        add_xml_decl=0,
        indent=0,
        tidy_mark=0,
    )
    return _in_tag(str(tidied), 'body')
示例14: to_xhtml
def to_xhtml(self, stylesheet_url='', settings=DEFAULT_HTML_OVERRIDES,
             tidy_settings=DEFAULT_TIDY_XHTML_OPTIONS, *args, **kwargs):
    """Render via ``to_html`` and tidy the markup with ``tidy_settings``.

    Any caller-supplied ``tidy_output`` keyword is discarded because
    ``to_html`` is always invoked with ``tidy_output=False``; tidying is
    done here instead. ``settings`` is accepted but not used in this body.

    Returns a 2-tuple: the tidied document as a string, and an empty list.
    """
    kwargs.pop('tidy_output', None)
    html_string, _unused = self.to_html(stylesheet_url, tidy_output=False,
                                        *args, **kwargs)
    tidied = tidy.parseString(html_string, **tidy_settings)
    return str(tidied), []
示例15: run
def run(self, text):
    """Tidy *text* using ``self.markdown.tidy_options`` and return unicode.

    Tidy works on encoded strings, so the text is encoded with the
    configured ``char_encoding`` (``'utf8'`` when unset) before the call
    and the result is decoded with that same encoding afterwards.
    """
    enc = self.markdown.tidy_options.get('char_encoding', 'utf8')
    encoded_input = text.encode(enc)
    tidied = tidy.parseString(encoded_input, **self.markdown.tidy_options)
    return unicode(tidied, encoding=enc)