本文整理汇总了Python中invenio.htmlutils.HTMLWasher.wash方法的典型用法代码示例。如果您正苦于以下问题：Python HTMLWasher.wash方法的具体用法？Python HTMLWasher.wash怎么用？Python HTMLWasher.wash使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类invenio.htmlutils.HTMLWasher的用法示例。
在下文中一共展示了HTMLWasher.wash方法的10个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: escape_email_quoted_text
# 需要导入模块: from invenio.htmlutils import HTMLWasher [as 别名]
# 或者: from invenio.htmlutils.HTMLWasher import wash [as 别名]
def escape_email_quoted_text(text, indent_txt='>>', linebreak_txt='\n'):
    """Escape text using an email-like indenting rule.

    Every line of ``text`` is stripped, its leading run of ``indent_txt``
    markers is counted and removed, the remainder is passed through
    ``HTMLWasher.wash(..., render_unallowed_tags=True)`` and the markers are
    put back in front of the washed line.

    As an example, this text:

    >>Brave Sir Robin ran away...
    <img src="malicious_script />*No!*
    >>bravely ran away away...
    I didn't!*<script>malicious code</script>
    >>When danger reared its ugly head, he bravely turned his tail and fled.
    <form onload="malicious"></form>*I never did!*

    will be escaped like this (illustrative; the exact escaping is whatever
    the washer produces for unallowed tags):

    >>Brave Sir Robin ran away...
    &lt;img src="malicious_script /&gt;*No!*
    >>bravely ran away away...
    I didn't!*&lt;script&gt;malicious code&lt;/script&gt;
    >>When danger reared its ugly head, he bravely turned his tail and fled.
    &lt;form onload="malicious"&gt;&lt;/form&gt;*I never did!*

    @param text: the quoted text to escape
    @param indent_txt: quote marker found at the start of quoted lines
                       (default: '>>')
    @param linebreak_txt: line separator used both to split ``text`` and to
                          join the result (default: newline)
    @return: the washed lines joined with ``linebreak_txt``, without a
             trailing separator
    """
    washer = HTMLWasher()
    marker_len = len(indent_txt)
    washed_lines = []
    for line in text.split(linebreak_txt):
        line = line.strip()
        nb_indent = 0
        # An empty marker would match forever, so never loop on it.
        while marker_len and line.startswith(indent_txt):
            nb_indent += 1
            line = line[marker_len:]
        washed_lines.append(nb_indent * indent_txt +
                            washer.wash(line, render_unallowed_tags=True))
    # Joining (instead of appending the separator and slicing off one
    # character) stays correct when linebreak_txt is longer than one
    # character, e.g. '\r\n'.
    return linebreak_txt.join(washed_lines)
示例2: XSSEscapingTest
# 需要导入模块: from invenio.htmlutils import HTMLWasher [as 别名]
# 或者: from invenio.htmlutils.HTMLWasher import wash [as 别名]
class XSSEscapingTest(unittest.TestCase):
    """Test functions related to the prevention of XSS attacks."""

    def setUp(self):
        """Create the washer used by every test method."""
        # Built in setUp rather than in an overridden __init__: the old
        # constructor defaulted methodName to 'test', which names no method
        # of this class.
        self.washer = HTMLWasher()

    def test_forbidden_formatting_tags(self):
        """htmlutils - washing of tags altering formatting of a page (e.g. </html>)"""
        test_str = """</html></body></pre>"""
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         '')
        # With render_unallowed_tags the forbidden tags are expected to be
        # shown escaped instead of being dropped.
        self.assertEqual(self.washer.wash(html_buffer=test_str,
                                          render_unallowed_tags=True),
                         '&lt;/html&gt;&lt;/body&gt;&lt;/pre&gt;')

    def test_forbidden_script_tags(self):
        """htmlutils - washing of tags defining scripts (e.g. <script>)"""
        test_str = """<script>malicious_function();</script>"""
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         '')
        self.assertEqual(self.washer.wash(html_buffer=test_str,
                                          render_unallowed_tags=True),
                         '&lt;script&gt;malicious_function();&lt;/script&gt;')

    def test_forbidden_attributes(self):
        """htmlutils - washing of forbidden attributes in allowed tags (e.g. onLoad)"""
        # onload
        test_str = """<p onload="javascript:malicious_functtion();">"""
        self.assertEqual(self.washer.wash(html_buffer=test_str), '<p>')

        # tricky: css calling a javascript
        test_str = """<p style="background: url('http://malicious_site.com/malicious_script.js');">"""
        self.assertEqual(self.washer.wash(html_buffer=test_str), '<p>')

    def test_fake_url(self):
        """htmlutils - washing of fake URLs which execute scripts"""
        test_str = """<a href="javascript:malicious_function();">link</a>"""
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         '<a href="">link</a>')

        # Pirates could encode ascii values, or use uppercase letters...
        test_str = """<a href="javasCRipt:malicious_function();">link</a>"""
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         '<a href="">link</a>')

        # MSIE treats 'java\ns\ncript:' the same way as 'javascript:'
        # Here we test with:
        # j
        # avas
        # crIPt :
        test_str = """<a href="j\n avas\n crIPt :malicious_function();">link</a>"""
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         '<a href="">link</a>')
示例3: HTMLAutomaticLinksTransformation
# 需要导入模块: from invenio.htmlutils import HTMLWasher [as 别名]
# 或者: from invenio.htmlutils.HTMLWasher import wash [as 别名]
class HTMLAutomaticLinksTransformation(unittest.TestCase):
    """Test functions related to transforming links into HTML context"""

    def setUp(self):
        """Create the washer used by every test method."""
        # Built in setUp rather than in an overridden __init__: the old
        # constructor defaulted methodName to 'test', which names no method
        # of this class.
        self.washer = HTMLWasher()

    def test_transform_link(self):
        """htmlutils - transforming a link"""
        body_input = 'https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es'
        body_expected = '<a href="https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es">https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es</a>'
        self.assertEqual(self.washer.wash(html_buffer=body_input,
                                          automatic_link_transformation=True),
                         body_expected)

    def test_transform_several_links(self):
        """htmlutils - transforming several links"""
        body_input = 'some text https://cds.cern.ch/collection/Videos?ln=es more text https://cds.cern.ch/search?p=%27CERN+News'
        body_expected = 'some text <a href="https://cds.cern.ch/collection/Videos?ln=es">https://cds.cern.ch/collection/Videos?ln=es</a> more text <a href="https://cds.cern.ch/search?p=%27CERN">https://cds.cern.ch/search?p=%27CERN</a>+News'
        self.assertEqual(self.washer.wash(html_buffer=body_input,
                                          automatic_link_transformation=True),
                         body_expected)

    def test_transform_just_valid_links(self):
        """htmlutils - transforming just valid links"""
        # The second URL has an invalid host ('cds..cern') and must be left
        # as plain text.
        body_input = 'some text https://cds.cern.ch/collection/Videos?ln=es more text https://cds..cern/search?p=%27CERN+News'
        body_expected = 'some text <a href="https://cds.cern.ch/collection/Videos?ln=es">https://cds.cern.ch/collection/Videos?ln=es</a> more text https://cds..cern/search?p=%27CERN+News'
        self.assertEqual(self.washer.wash(html_buffer=body_input,
                                          automatic_link_transformation=True),
                         body_expected)

    def test_not_transform_link(self):
        """htmlutils - not transforming a link"""
        # A URL that is already the href of an anchor must not be wrapped
        # a second time.
        body_input = '<a href="https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es">Multimedia</a>'
        body_expected = '<a href="https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es">Multimedia</a>'
        self.assertEqual(self.washer.wash(html_buffer=body_input,
                                          automatic_link_transformation=True),
                         body_expected)
示例4: HTMLWashingTest
# 需要导入模块: from invenio.htmlutils import HTMLWasher [as 别名]
# 或者: from invenio.htmlutils.HTMLWasher import wash [as 别名]
class HTMLWashingTest(unittest.TestCase):
    """Test functions related to general washing of HTML source"""

    def setUp(self):
        """Create the washer used by every test method."""
        # Built in setUp rather than in an overridden __init__: the old
        # constructor defaulted methodName to 'test', which names no method
        # of this class.
        self.washer = HTMLWasher()

    def test_wash_html(self):
        """htmlutils - washing HTML tags"""

        # Simple test case
        test_str = 'Spam and <b><blink>eggs</blink></b>'
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         'Spam and <b>eggs</b>')

        # Show 'escaped' tags
        self.assertEqual(self.washer.wash(html_buffer=test_str,
                                          render_unallowed_tags=True),
                         'Spam and <b>&lt;blink&gt;eggs&lt;/blink&gt;</b>')

        # Keep entity and character references
        test_str = '<b> a &lt; b &gt; c </b> &#247;'
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         '<b> a &lt; b &gt; c </b> &#247;')

        # Remove content of <script> tags
        test_str = '<script type="text/javacript">alert("foo")</script>bar'
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         'bar')
        test_str = '<script type="text/javacript"><!--alert("foo")--></script>bar'
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         'bar')

        # Remove content of <style> tags
        test_str = '<style>.myclass {color:#f00}</style><span class="myclass">styled text</span>'
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         'styled text')
        test_str = '<style><!-- .myclass {color:#f00} --></style><span class="myclass">styled text</span>'
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         'styled text')
示例5: format_element
# 需要导入模块: from invenio.htmlutils import HTMLWasher [as 别名]
# 或者: from invenio.htmlutils.HTMLWasher import wash [as 别名]
def format_element(bfo, note_suffix, note_prefix='Note: ', separator='; '):
    """
    Displays notes (various note fields)

    The values of field 500__a are washed with automatic link
    transformation (so all links become clickable), joined with
    ``separator`` into one group, and the group is wrapped in
    ``note_prefix`` / ``note_suffix``.

    @param note_prefix: a prefix before each group of notes
    @param note_suffix: a suffix after each group of notes
    @param separator: a separator between notes of a group
    @return: the formatted group of notes, or '' when 500__a has no value
    """
    washer = HTMLWasher()
    # Look the field up once instead of once for the test and once more
    # for the processing.
    values = bfo.fields('500__a')
    if not values:
        return ""
    washed = [washer.wash(value, automatic_link_transformation=True)
              for value in values]
    return note_prefix + separator.join(washed) + note_suffix
示例6: format_element
# 需要导入模块: from invenio.htmlutils import HTMLWasher [as 别名]
# 或者: from invenio.htmlutils.HTMLWasher import wash [as 别名]
def format_element(bfo, note_suffix, note_prefix='Note: ', separator='; '):
    """
    Displays notes (various note fields)

    Every value of field 500__a is washed with automatic link
    transformation (so all links become clickable) and individually wrapped
    in ``note_prefix`` / ``note_suffix``; the wrapped notes are then
    concatenated without any separator.

    @param note_prefix: a prefix before each note
    @param note_suffix: a suffix after each note
    @param separator: accepted for interface compatibility; this
                      implementation does not use it
    @return: the concatenated notes, or '' when 500__a has no value
    """
    washer = HTMLWasher()
    # Decode '&amp;' before washing so that URLs stored with escaped
    # ampersands are recognised as links. The previous call replaced '&'
    # with '&', which did nothing.
    # NOTE(review): '&amp;' is reconstructed from that no-op - confirm
    # against the upstream element.
    notes = [washer.wash(field.replace("&amp;", "&"),
                         automatic_link_transformation=True)
             for field in bfo.fields('500__a')]
    return "".join(note_prefix + note + note_suffix for note in notes)
示例7: _get_feature_text
# 需要导入模块: from invenio.htmlutils import HTMLWasher [as 别名]
# 或者: from invenio.htmlutils.HTMLWasher import wash [as 别名]
def _get_feature_text(record, language):
    """
    Looks for a text (header) that can be featured on the article overview
    page.

    Candidates are tried in this order, stopping at the first that works:
      1. the header field (590__a for "fr", otherwise 520__a, each falling
         back to the other when blank or a bare HTML placeholder), washed
         with empty tag and attribute whitelists;
      2. the "header" group matched by header_pattern / header_pattern2 in
         the first article body (590__b / 520__b), washed keeping only
         <a> tags;
      3. the "paragraph" group matched by para_pattern, washed with empty
         whitelists and shortened when longer than 250 characters;
      4. the first sentence (or part) of the whole washed article.

    header_pattern, header_pattern2, para_pattern, remove_html_markup and
    _get_first_sentence_or_part are defined outside this function.

    @param record: object providing field(tag) and fields(tag) accessors
    @param language: language code; only "fr" changes the field priority
    @return: the featured text, or '' when the record has no article body
    """
    washer = HTMLWasher()
    header_text = ""
    # Check if there is a header
    if language == "fr":
        header = record.field('590__a')
        if header.strip() in \
               ['', '<br/>', '<!--HTML--><br />', '<!--HTML-->']:
            header = record.field('520__a')
    else:
        header = record.field('520__a')
        if header.strip() in \
               ['', '<br/>', '<!--HTML--><br />', '<!--HTML-->']:
            header = record.field('590__a')
    # Empty whitelists: no tag and no attribute is allowed in the header.
    header = washer.wash(html_buffer=header,
                         allowed_tag_whitelist=[],
                         allowed_attribute_whitelist=[])
    if header != "":
        header_text = header
    else:
        # No usable header field: fall back to the article body, with the
        # same language-dependent priority as above.
        if language == "fr":
            article = record.fields('590__b')
            if not article or \
                   (len(article) == 1 and \
                    article[0].strip() in \
                    ['', '<br />', '<!--HTML--><br />', '<!--HTML-->']):
                article = record.fields('520__b')
        else:
            article = record.fields('520__b')
            if not article or \
                   (len(article) == 1 and \
                    article[0].strip() in \
                    ['', '<br />', '<!--HTML--><br />', '<!--HTML-->']):
                article = record.fields('590__b')
        try:
            # Only the first body is considered.
            article = article[0]
        except:
            # No body at all: nothing to feature.
            return ''
        match_obj = re.search(header_pattern, article)
        if not match_obj:
            match_obj = re.search(header_pattern2, article)
        try:
            # When neither pattern matched, match_obj is None and the
            # attribute access below raises, which sends control to the
            # paragraph fallback just like an empty header does.
            header_text = match_obj.group("header")
            header_text = washer.wash(html_buffer=header_text,
                                      allowed_tag_whitelist=['a'],
                                      allowed_attribute_whitelist=['href',
                                                                   'target',
                                                                   'class'])
            if header_text == "":
                raise Exception
        except:
            # header_text is '' here unless the wash above raised after a
            # successful match; removing it keeps it out of the paragraph
            # search below.
            article = article.replace(header_text, '')
            article = article.replace('<p/>', '')
            # NOTE(review): this literal may originally have been
            # '<p>&nbsp;</p>' - confirm against upstream.
            article = article.replace('<p> </p>', '')
            match_obj = re.search(para_pattern, article)
            try:
                # get the first paragraph
                header_text = match_obj.group("paragraph")
                try:
                    header_text = washer.wash(html_buffer=header_text,
                                              allowed_tag_whitelist=[],
                                              allowed_attribute_whitelist=[])
                except:
                    # was not able to parse correctly the HTML. Use
                    # this safer function, but producing less good
                    # results
                    header_text = remove_html_markup(header_text)
                if header_text.strip() == "":
                    raise Exception
                else:
                    # Long paragraphs are cut down by the helper.
                    if len(header_text) > 250:
                        header_text = _get_first_sentence_or_part(header_text)
            except:
                # in a last instance get the first sentence
                try:
                    article = washer.wash(article,
                                          allowed_tag_whitelist=[],
                                          allowed_attribute_whitelist=[])
                except:
                    # was not able to parse correctly the HTML. Use
                    # this safer function, but producing less good
                    # results
                    article = remove_html_markup(article)
                header_text = _get_first_sentence_or_part(article)
    return header_text
示例8: email_quoted_txt2html
# 需要导入模块: from invenio.htmlutils import HTMLWasher [as 别名]
# 或者: from invenio.htmlutils.HTMLWasher import wash [as 别名]
def email_quoted_txt2html(text,
                          tabs_before=0,
                          indent_txt='>>',
                          linebreak_txt="\n",
                          indent_html=('<div class="commentbox">', "</div>"),
                          linebreak_html='<br/>'):
    """
    Takes a typical mail quoted text, e.g.:
        hello,
        you told me:
        >> Your mother was a hamster and your father smelt of elderberries
        I must tell you that I'm not convinced. Then in this discussion:
        >>>> Is there someone else up there we could talk to?
        >> No. Now, go away, or I shall taunt you a second time-a!
        I think we're not going to be friends!
    and return an html formatted output, e.g.:
        hello,<br/>
        you told me:<br/>
        <div>
            Your mother was a hamster and your father smelt of elderberries
        </div>
        I must tell you that I'm not convinced. Then in this discussion:
        <div>
            <div>
                Is there someone else up there we could talk to?
            </div>
            No. Now, go away, or I shall taunt you a second time-a!
        </div>
        I think we're not going to be friends!

    Every line is passed through HTMLWasher.wash() after its quote markers
    have been removed. Quote levels still open after the last line are
    closed with the closing tag of ``indent_html``.

    @param text: the text in quoted format
    @param tabs_before: number of tabulations before each line
    @param indent_txt: quote separator in email (default:'>>')
    @param linebreak_txt: line separator in email (default: '\\n')
    @param indent_html: tuple of (opening, closing) html tags.
                        default: ('<div class="commentbox">', "</div>")
    @param linebreak_html: line separator in html (default: '<br/>')
    @return: string containing html formatted output
    """
    washer = HTMLWasher()
    opening_tag = indent_html[0]
    closing_tag = indent_html[1]
    marker_len = len(indent_txt)
    chunks = []
    nb_indent = 0
    for line in text.strip('\n').split(linebreak_txt):
        # Count and remove the quote markers at the start of the line.
        # An empty marker would match forever, so never loop on it.
        new_nb_indent = 0
        while marker_len and line.startswith(indent_txt):
            new_nb_indent += 1
            line = line[marker_len:]
        if new_nb_indent > nb_indent:
            # Deeper quote level: open one block per extra level.
            for dummy in range(nb_indent, new_nb_indent):
                chunks.append(tabs_before * "\t" + opening_tag + "\n")
                tabs_before += 1
        elif new_nb_indent < nb_indent:
            # Shallower quote level: close one block per dropped level.
            for dummy in range(new_nb_indent, nb_indent):
                tabs_before -= 1
                chunks.append(tabs_before * "\t" + closing_tag + "\n")
        else:
            # Same level: the line gets this extra run of tabs in addition
            # to the one below (kept so that the output stays unchanged).
            chunks.append(tabs_before * "\t")
        chunks.append(tabs_before * "\t" + washer.wash(line) +
                      linebreak_html + "\n")
        nb_indent = new_nb_indent
    # Close the levels still open, using the caller's closing tag instead
    # of a hard-coded "</div>" so that custom indent_html stays balanced.
    for dummy in range(nb_indent):
        tabs_before -= 1
        chunks.append(tabs_before * "\t" + closing_tag + "\n")
    return "".join(chunks)
示例9: format_element
# 需要导入模块: from invenio.htmlutils import HTMLWasher [as 别名]
# 或者: from invenio.htmlutils.HTMLWasher import wash [as 别名]
def format_element(bfo, separator='<br/>'):
"""
Display article body
@param separator: separator between each body
"""
# Retrieve context (journal, issue and category) from URI
args = parse_url_string(bfo.user_info['uri'])
ln = args["ln"]
_ = gettext_set_language(ln)
if ln == "fr":
article = bfo.fields('590__b')
if not article or \
(len(article) == 1 and \
(article[0].strip() in ['', '<br />', '<!--HTML--><br />'])):
article = bfo.fields('520__b')
else:
article = bfo.fields('520__b')
if not article or \
(len(article) == 1 and \
(article[0].strip() in ['', '<br />', '<!--HTML--><br />'])):
article = bfo.fields('590__b')
if not CFG_CERN_SITE or \
not bfo.field('980__a').startswith('BULLETIN'):
return separator.join(article)
################################################################
# CERN Bulletin-specific code #
################################################################
# We need a compatibility layer for old CERN Bulletin
# articles. Identify them and process them if needed.
is_old_cern_bulletin_article = False
if bfo.field('980__a').startswith('BULLETIN'):
try:
year = int(bfo.fields('260__c')[0])
except IndexError:
year = 2000
if year < 2009 or \
(bfo.field('980__a').startswith('BULLETINSTAFF') and \
("CERN EDS" in bfo.field('595__a'))):
is_old_cern_bulletin_article = True
header_out = ''
if not is_old_cern_bulletin_article:
# Return the same as any other journal article
return separator.join(article)
# Old CERN articles
if year < 2007 or bfo.field('980__a').startswith('BULLETINSTAFF'):
# Really old CERN articles
if len(article) > 0:
# CERN-only: old CERN Bulletin articles
return __backward_compatible_HTML(article[0]) + \
(bfo.field('980__a').startswith('BULLETINSTAFF') and \
('<br/><br/>' + bfe_fulltext.format_element(bfo, style="", show_icons='yes')) \
or '')
else:
return ''
# Not-so-old CERN articles follow:
# 2. prepare regex's for the elements
#=====================================================
from invenio.webjournal_utils import \
image_pattern, \
para_pattern, \
header_pattern
page_elements = {}
# 3. get the header (either from marc xml or regex)
#=====================================================
if bfo.lang == "fr":
header = bfo.field('590__a')
if header == '':
header = bfo.field('520__a')
else:
header = bfo.field('520__a')
if header == '':
header = bfo.field('590__a')
if not header:
try:
header_obj = re.search(header_pattern, article[0])
header_text = header_obj.group("header")
except:
header_text = ""
else:
header_text = header
washer = HTMLWasher()
header_text_clean = washer.wash(html_buffer=header_text,
allowed_tag_whitelist=['a'],
allowed_attribute_whitelist=['href'])
header_out = '<p class="articleHeader">' + header_text_clean + '</p>'
#.........这里部分代码省略.........
示例10: email_quoted_txt2html
# 需要导入模块: from invenio.htmlutils import HTMLWasher [as 别名]
# 或者: from invenio.htmlutils.HTMLWasher import wash [as 别名]
def email_quoted_txt2html(
text,
tabs_before=0,
indent_txt=">>",
linebreak_txt="\n",
indent_html=('<div class="commentbox">', "</div>"),
linebreak_html="<br/>",
indent_block=True,
):
"""
Takes a typical mail quoted text, e.g.::
hello,
you told me:
>> Your mother was a hamster and your father smelt of elderberries
I must tell you that I'm not convinced. Then in this discussion:
>>>> Is there someone else up there we could talk to?
>> No. Now, go away, or I shall taunt you a second time-a!
I think we're not going to be friends!
and return an html formatted output, e.g.::
hello,<br/>
you told me:<br/>
<div>
Your mother was a hamster and your father smelt of elderberries
</div>
I must tell you that I'm not convinced. Then in this discussion:
<div>
<div>
Is there someone else up there we could talk to?
</div>
No. Now, go away, or I shall taunt you a second time-a!
</div>
I think we're not going to be friends!
The behaviour is different when C{indent_block} is C{True} or C{False}.
When C{True} the when C{indent_html} is only added at each change of
level of indentation, while it is added for each line when C{False}.
For eg::
>> a
>> b
>>>> c
would result in (if C{True})::
<div class="commentbox">
a<br/>
b<br/>
<div class="commentbox">
c<br/>
</div>
</div>
or would be (if C{False})::
<div class="commentbox"> a</div><br/>
<div class="commentbox"> b</div><br/>
<div class="commentbox"><div class="commentbox"> c</div></div><br/>
@param text: the text in quoted format
@param tabs_before: number of tabulations before each line
@param indent_txt: quote separator in email (default:'>>')
@param linebreak_txt: line separator in email
@param indent_html: tuple of (opening, closing) html tags.
default: ('<div class="commentbox">', "</div>")
@param linebreak_html: line separator in html (default: '<br/>')
@param indent_block: if indentation should be done per 'block'
i.e. only at changes of indentation level
(+1, -1) or at each line.
@return: string containing html formatted output
"""
washer = HTMLWasher()
final_body = ""
nb_indent = 0
text = text.strip("\n")
lines = text.split(linebreak_txt)
for line in lines:
new_nb_indent = 0
while True:
if line.startswith(indent_txt):
new_nb_indent += 1
line = line[len(indent_txt) :]
else:
break
if indent_block:
if new_nb_indent > nb_indent:
for dummy in range(nb_indent, new_nb_indent):
final_body += tabs_before * "\t" + indent_html[0] + "\n"
tabs_before += 1
elif new_nb_indent < nb_indent:
for dummy in range(new_nb_indent, nb_indent):
tabs_before -= 1
final_body += (tabs_before) * "\t" + indent_html[1] + "\n"
else:
final_body += (tabs_before) * "\t"
else:
final_body += tabs_before * "\t" + new_nb_indent * indent_html[0]
try:
line = washer.wash(line)
except HTMLParseError:
# Line contained something like "foo<bar"
line = cgi.escape(line)
#.........这里部分代码省略.........