本文整理汇总了 Python 中 lxml.html.tostring 方法的典型用法代码示例。如果您正苦于以下问题：Python html.tostring 方法的具体用法？Python html.tostring 怎么用？Python html.tostring 使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块 lxml.html 的用法示例。
在下文中一共展示了 html.tostring 方法的 15 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: __init__
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def __init__(self, file_name, user_id):
    """Load an exported bookmarks file and set up the import state.

    The file is parsed with lxml, re-serialized, and only then handed to
    BeautifulSoup. Existing, non-deleted bookmarks of the user are indexed
    by ``main_url`` so duplicates can be looked up later.

    :param file_name: Path of the export file to read.
    :param user_id: Value compared against ``Bookmark.user`` and stored
        as ``self.user``.
    """
    self.user = user_id
    self.urls = dict()
    self.tags_dict = dict()
    self.tags_set = set()

    with open(file_name, 'r') as self.opened_file:
        # Instapaper leaves <li> tags unclosed, which caused infinite
        # recursion when the markup was fed to BeautifulSoup directly.
        # Round-tripping through lxml closes those tags first.
        parsed_document = html.document_fromstring(self.opened_file.read())
    self.html = html.tostring(parsed_document)
    self.soup = BeautifulSoup4(self.html)

    # ``== False`` is kept on purpose: it is a query expression evaluated
    # by the ORM, not a plain Python boolean comparison.
    self.check_duplicates_query = Bookmark.query.filter(
        Bookmark.user == self.user,
        Bookmark.deleted == False,  # noqa: E712
    ).all()
    self.check_duplicates = {
        existing.main_url: existing
        for existing in self.check_duplicates_query
    }

    # scheme :// (domain name | dotted IPv4 | bracketed IPv6) [:port] [path]
    self.valid_url = re.compile(
        r'^(?:[a-z0-9\.\-]*)://'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
        r'(?::\d+)?'
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
示例2: content
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def content(self):
    """
    :returns: The text body of the message.

    The text is rebuilt from the message element: its leading text with
    the two-space prefix removed, followed by every child serialized as
    HTML with ``<br>`` turned into a newline. One trailing space, if
    present, is dropped.
    """
    # Rebuilding the text from markup is lossy. It seems like it might be
    # impossible to completely replicate the text of the original message
    # if it has trailing whitespace.
    message = self._content_xpb.one_(self._message_element)

    # lxml reports ``None`` rather than '' for an element without leading
    # text (e.g. one that starts directly with a child element).
    first_line = message.text or ''

    # The prefix test must agree with the two characters that get sliced
    # off. Comparing a two-character slice against a single space could
    # only ever match a message whose whole text was one space.
    if first_line.startswith('  '):
        first_line = first_line[2:]
    else:
        log.debug("message did not have expected leading whitespace")

    subsequent_lines = ''.join(
        html.tostring(child, encoding='unicode').replace('<br>', '\n')
        for child in message.iterchildren()
    )
    message_text = first_line + subsequent_lines

    if message_text.endswith(' '):
        message_text = message_text[:-1]
    else:
        # This branch is about the *trailing* space; the message used to
        # repeat the "leading" wording of the check above.
        log.debug("message did not have expected trailing whitespace")
    return message_text
示例3: from_text
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def from_text(txt):
    """Turn plain text into a ``<p>`` fragment that keeps its layout.

    The text is passed through ``escape``, parsed as a paragraph, run
    through ``fix_links`` and serialized again. Line breaks then become
    ``<br>`` tags and significant spaces become ``&nbsp;`` entities.

    :param txt: Plain text to convert.
    :returns: HTML string of the form ``<p>...</p>``.
    """
    def replace(match):
        chunk = match.group()
        if '\n' in chunk:
            return '<br>' * chunk.count('\n')
        # Runs of spaces and a leading space are collapsed by HTML
        # rendering, so they must become non-breaking entities. Returning
        # plain spaces here would make the whole branch a no-op.
        return '&nbsp;' * chunk.count(' ')

    tpl = '<p>%s</p>'
    htm = fromstring(tpl % escape(txt))
    fix_links(htm)
    htm = tostring(htm, encoding='unicode')
    # Drop the surrounding "<p>" (3 chars) and "</p>" (4 chars) so the
    # substitution below only sees the paragraph's inner markup.
    htm = htm[3:-4]
    # Matches: one or more line breaks | two or more spaces | a space at
    # the start of a line.
    htm = re.sub('(?m)((\r?\n)+| [ ]+|^ )', replace, htm)
    return tpl % htm
示例4: try_justext
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def try_justext(tree, url, target_language):
    '''Fallback extraction using the generic justext algorithm.

    Serializes ``tree``, lets justext classify its paragraphs and collects
    the non-boilerplate ones as ``<p>`` children of a new ``<body>``.

    :param tree: Parsed HTML tree to extract from.
    :param url: Only used in the error log message.
    :param target_language: Key into ``JUSTEXT_LANGUAGES``; ``None`` or an
        unknown key selects the default ``JT_STOPLIST``.
    :returns: The populated ``<body>`` element, or ``None`` when justext
        raised ``ValueError``.
    '''
    serialized_tree = html.tostring(tree, pretty_print=False, encoding='utf-8')

    # Pick a language-specific stoplist when one is configured.
    if target_language is None or target_language not in JUSTEXT_LANGUAGES:
        stoplist = JT_STOPLIST
    else:
        stoplist = justext.get_stoplist(JUSTEXT_LANGUAGES[target_language])

    try:
        paragraphs = justext.justext(serialized_tree, stoplist, 50, 200, 0.1, 0.2, 0.2, 200, True)
    except ValueError as err:  # not an XML element: HtmlComment
        LOGGER.error('justext %s %s', err, url)
        return None

    result_body = etree.Element('body')
    for paragraph in paragraphs:
        if paragraph.is_boilerplate:
            continue
        kept = etree.SubElement(result_body, 'p')
        kept.text = paragraph.text
    return result_body
示例5: ingest
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def ingest(self, file_path):
    """Parse an XML file, render it through the XSLT and emit the result.

    :param file_path: Path of the XML file to ingest.
    :raises ProcessingException: If the file is larger than ``MAX_SIZE``
        or cannot be parsed as XML.
    """
    # Prefer the size already recorded on the result; stat the file only
    # when that value is missing or zero.
    size_in_bytes = self.result.size or os.path.getsize(file_path)
    if size_in_bytes > self.MAX_SIZE:
        raise ProcessingException("XML file is too large.")

    try:
        document = etree.parse(file_path)
    except (ParserError, ParseError):
        raise ProcessingException("XML could not be parsed.")

    plain_text = self.extract_html_text(document.getroot())
    rendered = etree.XSLT(self.XSLT)(document)
    markup = html.tostring(rendered, encoding=str, pretty_print=True)

    self.result.flag(self.result.FLAG_HTML)
    self.result.emit_html_body(markup, plain_text)
示例6: clean_html
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def clean_html(context, data):
    """Strip configured subtrees from an HTML document and store the result.

    Every XPath listed under the ``remove_paths`` parameter is evaluated
    against the document and each match is dropped. The serialized result
    is stored and its hash written to ``data['content_hash']``. ``data``
    is emitted in every case, unchanged when there is no document.
    """
    doc = _get_html_document(context, data)
    if doc is not None:
        for xpath_expr in ensure_list(context.params.get('remove_paths')):
            for matched in doc.xpath(xpath_expr):
                matched.drop_tree()
        serialized = html.tostring(doc, pretty_print=True)
        data['content_hash'] = context.store_data(serialized)
    context.emit(data=data)
示例7: download_page
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def download_page(url, cookie_jar):
    """
    Fetch a page with authenticated cookies and save it for the browser view.

    The browser directory is emptied first, then the page is requested,
    parsed with lxml and written there under a random ``<uuid>.html`` name.

    :param url: Address of the page to request.
    :param cookie_jar: Cookies handed to ``cookie_request``.
    :returns: ``(None, filename)`` on success, ``(exception, None)`` when
        the request raised ``requests.RequestException``.
    """
    target_dir = os.path.join(server_path, 'static/browser')
    delete_directory_files(target_dir)

    filename = f'{uuid.uuid4()}.html'
    target_path = os.path.join(target_dir, filename)

    try:
        response = cookie_request(url, cookie_jar)
    except requests.RequestException as e:
        return e, None

    # ``html.tostring`` yields bytes by default, hence the binary mode.
    serialized = html.tostring(html.document_fromstring(response.text))
    with open(target_path, 'wb') as out_file:
        out_file.write(serialized)
    return None, filename
示例8: gdoc_view
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def gdoc_view(request, doc_id):
    """Render a published Google Doc inside the ``gdoc.html`` template.

    :param request: The incoming request, passed through to ``render``.
    :param doc_id: Key into ``settings.GDOC_DOCS``.
    :raises Http404: If ``doc_id`` is not configured.
    :returns: The rendered response; the template receives the document's
        ``<style>`` elements followed by its body children as ``content``.
    """
    try:
        gdoc_id = settings.GDOC_DOCS[doc_id]
    except KeyError:
        raise Http404("No doc named %s" % doc_id)

    url = "https://docs.google.com/document/d/%s/pub?embedded=true" % gdoc_id
    page = requests.get(url)
    tree = html.fromstring(page.text)

    # Each serialized element already carries its own <style>...</style>
    # tags. Wrapping them in another <style> pair nests the tags, and the
    # inner opening tag is then read as CSS text, corrupting the first
    # rule. The path is relative (".//") so only styles inside <head> are
    # collected; a leading "//" searches the whole document and would
    # repeat styles that the body loop below emits anyway.
    style_markup = "".join(
        html.tostring(style_el).decode("utf8")
        for style_el in tree.head.xpath(".//style")
    )
    body_markup = "".join(
        html.tostring(child).decode("utf8") for child in tree.body
    )

    context = {"content": style_markup + body_markup}
    return render(request, "gdoc.html", context)
示例9: __str__
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def __str__(self):
    """Serialize ``self._tree`` to HTML text and wrap it with ``mark_safe``."""
    markup = html.tostring(self._tree, encoding='unicode')
    return mark_safe(markup)
示例10: fragment_tree_to_str
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def fragment_tree_to_str(tree):
    """Serialize ``tree`` and cut off its outer wrapper tags.

    The slice removes exactly ``len('<div>')`` leading and
    ``len('</div>')`` trailing characters, so it assumes the root is an
    attribute-less ``<div>``.
    """
    serialized = html.tostring(tree, encoding='unicode')
    prefix_length = len('<div>')
    suffix_length = len('</div>')
    return serialized[prefix_length:-suffix_length]
示例11: url_trim
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def url_trim(html):
    """Shorten the text of every link-carrying element in an HTML string.

    Each element reported by ``iterlinks()`` has its leading text replaced
    with ``trim_url`` applied to its full text content. The length limit
    is decided by ``trim_url`` (70 characters according to the original
    docstring; not visible here).

    :param html: HTML markup to process.
    :returns: The re-serialized markup wrapped with ``mark_safe``.
    """
    fragment = fromstring(html)
    # iterlinks() yields (element, attribute, link, pos); only the element
    # is needed.
    for link_entry in fragment.iterlinks():
        element = link_entry[0]
        element.text = trim_url(element.text_content())
    serialized = tostring(fragment, encoding="unicode")
    return mark_safe(serialized)
示例12: fix_privacy
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def fix_privacy(htm, only_proxy=False):
    """Rewrite HTML so remote resources are not loaded directly.

    Absolute and protocol-relative image sources get an ``https:`` scheme
    where missing and, when ``conf['USE_PROXY']`` is set, a
    ``/proxy?url=`` prefix. Unless ``only_proxy`` is true, ``src`` is
    moved to ``data-src`` and every ``style`` attribute to ``data-style``.

    :param htm: HTML markup; blank input is returned unchanged.
    :param only_proxy: Only rewrite ``src`` through the proxy and leave
        attributes in place. Without a configured proxy this returns the
        input unchanged.
    :returns: The rewritten markup without an outer ``<div>`` wrapper.
    """
    if not htm.strip():
        return htm

    use_proxy = conf['USE_PROXY']
    if only_proxy and not use_proxy:
        return htm

    tree = fromstring(htm)
    for img in tree.xpath('//img[@src]'):
        src = img.attrib['src']
        if re.match('^(https?://|//).*', src):
            if src.startswith('//'):
                src = 'https:' + src
            if use_proxy:
                src = '/proxy?url=' + src
        if only_proxy:
            img.attrib['src'] = src
            continue
        img.attrib['data-src'] = src
        del img.attrib['src']

    if not only_proxy:
        # style could contain "background-image", etc.
        for styled in tree.xpath('//*[@style]'):
            styled.attrib['data-style'] = styled.attrib.pop('style')

    serialized = tostring(tree, encoding='unicode').strip()
    return re.sub('(^<div>|</div>$)', '', serialized)
示例13: get_deck_list
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def get_deck_list(deckid):
    """
    For a given HearthPwn deck ID, return a list of Cards that belong to that
    deck.

    Parameters:

    - 'deckid' - a HearthPwn deck ID

    Returns a list of ``Card(name, amount)`` objects, class cards first and
    neutral cards after them. A card whose amount cannot be found is kept
    with an amount of 0.
    """
    # http://www.hearthpwn.com/decks/listing/ + deckid + /neutral or /class
    url = 'http://www.hearthpwn.com/decks/listing/'
    css = '#cards > tbody > tr > td.col-name'
    # Raw string: ``\s`` and ``\d`` are not valid escapes in a normal
    # string literal and trigger a warning on current Python versions.
    regex = re.compile(r'×\s+(\d+)')

    # Class cards first, then neutral cards.
    cardelements = []
    for section in ('/class', '/neutral'):
        htmlelement = get_htmlelement_from_url(url + str(deckid) + section)
        cardelements += htmlelement.cssselect(css)

    deck = []
    for element in cardelements:
        # cssselect always returns a list, but here it only ever holds
        # one element.
        cardname = element.cssselect('a')[0].text.strip()
        elementtext = html.tostring(element).decode('UTF-8')
        # There's probably a better way to get the amount, but we currently
        # look for the "x #" in the raw text of the element.
        match = regex.search(elementtext)
        if match:
            amount = int(match.group(1))
        else:
            # This shouldn't happen, but when it does, just continue on
            # after reporting the problem.
            print('ERROR: Unable to get amount for card ' + cardname)
            amount = 0
        deck.append(Card(cardname, amount))
    return deck
示例14: convert_json_to_html
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def convert_json_to_html(elements):
    """Build an HTML string from a list of JSON node descriptions.

    Each item is converted with ``_recursive_convert_json`` and collected
    under a temporary ``<div>``. Links are made absolute, ``<span>`` tags
    are unwrapped, and line breaks are replaced with ``<br/>`` by
    ``replace_line_breaks_except_pre`` before the wrapper is cut off.

    NOTE(review): ``base_url`` is not a parameter; it is presumably a
    module-level name, confirm against the full file.

    :param elements: Iterable of node descriptions to convert.
    :returns: Inner HTML of the wrapper as a string.
    """
    wrapper = html.fragment_fromstring('<div></div>')
    wrapper.extend(_recursive_convert_json(node) for node in elements)
    wrapper.make_links_absolute(base_url=base_url)

    for span in wrapper.xpath('.//span'):
        span.drop_tag()

    serialized = html.tostring(wrapper, encoding='unicode')
    serialized = replace_line_breaks_except_pre(serialized, '<br/>')
    # Strip the temporary wrapper: "<div>" in front, "</div>" at the end.
    return serialized[len('<div>'):-len('</div>')]
示例15: convert_html_to_telegraph_format
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def convert_html_to_telegraph_format(html_string, clean_html=True, output_format="json_string"):
    """Convert an HTML string into Telegraph content.

    :param html_string: HTML markup to convert.
    :param clean_html: When true, the markup is cleaned with
        ``clean_article_html`` and the tree is preprocessed (media tags,
        ``move_to_top``, ``post_process``). When false, the fragments are
        parsed as they are and their parent is used as the body.
    :param output_format: ``'json_string'`` (default), ``'python_list'``
        or ``'html_string'``.
    :returns: A JSON string, a list of converted nodes, or the serialized
        body, depending on ``output_format``. Without a body these are
        ``'[]'``, ``[]`` and ``''``. Any other ``output_format`` returns
        ``None``.
    """
    if clean_html:
        html_string = clean_article_html(html_string)
        body = preprocess_fragments(
            _fragments_from_string(html_string)
        )
        if body is not None:
            # Snapshot the descendants before processing them; presumably
            # the preprocessing step may modify the tree while it runs.
            for tag in list(body.iterdescendants()):
                preprocess_media_tags(tag)
            move_to_top(body)
            post_process(body)
    else:
        fragments = _fragments_from_string(html_string)
        body = fragments[0].getparent() if len(fragments) else None

    content = []
    if body is not None:
        content = [_recursive_convert(x) for x in body.iterchildren()]

    if output_format == 'json_string':
        return json.dumps(content, ensure_ascii=False)
    elif output_format == 'python_list':
        return content
    elif output_format == 'html_string':
        # Serializing ``None`` raises; an empty document yields an empty
        # string instead, matching the empty list of the other formats.
        if body is None:
            return ''
        return html.tostring(body, encoding='unicode')
    # Unknown formats keep returning None, as before.
    return None