当前位置: 首页>>代码示例>>Python>>正文


Python html.tostring方法代码示例

本文整理汇总了Python中lxml.html.tostring方法的典型用法代码示例。如果您正苦于以下问题:Python html.tostring方法的具体用法?Python html.tostring怎么用?Python html.tostring使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在lxml.html的用法示例。


在下文中一共展示了html.tostring方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: __init__

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def __init__(self, file_name, user_id):
        """Read a bookmarks export file and set up the parser's working state.

        The file is round-tripped through lxml before being handed to
        BeautifulSoup, the user's existing non-deleted bookmarks are indexed
        by ``main_url``, and the URL validation pattern is compiled.

        :param file_name: path of the HTML export to read.
        :param user_id: value compared against ``Bookmark.user`` and stored
            as ``self.user``.
        """
        with open(file_name, 'r') as self.opened_file:
            # Instapaper leaves its <li> tags unclosed, which sent
            # BeautifulSoup into infinite recursion when it was fed the raw
            # file. Parsing and re-serializing with lxml closes the tags.
            parsed_document = html.document_fromstring(self.opened_file.read())
            self.html = html.tostring(parsed_document)
        self.soup = BeautifulSoup4(self.html)
        self.user = user_id
        self.urls = {}
        self.check_duplicates_query = Bookmark.query.filter(
            Bookmark.user == self.user,
            # "== False" is deliberate: it builds a SQL expression.
            Bookmark.deleted == False,  # noqa: E712
        ).all()
        # Existing bookmarks keyed by URL, for duplicate detection.
        self.check_duplicates = {
            existing.main_url: existing
            for existing in self.check_duplicates_query
        }
        self.tags_dict = {}
        self.tags_set = set()
        self.valid_url = re.compile(
            # scheme://
            r'^(?:[a-z0-9\.\-]*)://'
            # domain name ...
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
            # ... or dotted-quad IPv4 ...
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
            # ... or a colon-separated hex (IPv6-style) host
            r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
            # optional port
            r'(?::\d+)?'
            # optional path / query
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
开发者ID:dhamaniasad,项目名称:crestify,代码行数:26,代码来源:parsers.py

示例2: content

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def content(self):
        """
        Rebuild the message text from the message element.

        The element's own text supplies the first line; each child element
        is serialized and its ``<br>`` markup turned into a newline. Two
        leading spaces and one trailing space are removed when present.

        :returns: The text body of the message.
        """
        # The code that follows is obviously pretty disgusting.
        # It seems like it might be impossible to completely replicate
        # the text of the original message if it has trailing whitespace
        message = self._content_xpb.one_(self._message_element)
        # lxml reports "no text before the first child" as None, not '',
        # so normalise before slicing.
        first_line = message.text or ''
        if first_line.startswith('  '):
            first_line = first_line[2:]
        else:
            log.debug("message did not have expected leading whitespace")
        subsequent_lines = ''.join(
            html.tostring(child, encoding='unicode').replace('<br>', '\n')
            for child in message.iterchildren()
        )
        message_text = first_line + subsequent_lines
        if message_text.endswith(' '):
            message_text = message_text[:-1]
        else:
            # This branch is about the trailing space; the message used to
            # say "leading" by copy-paste mistake.
            log.debug("message did not have expected trailing whitespace")

        return message_text
开发者ID:IvanMalison,项目名称:okcupyd,代码行数:26,代码来源:messaging.py

示例3: from_text

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def from_text(txt):
    """Turn plain text into a single ``<p>...</p>`` HTML string.

    The text is escaped, wrapped in a paragraph, parsed, passed through
    ``fix_links`` and serialized again. Whitespace in the paragraph's inner
    markup is then made visible: runs of newlines become ``<br>`` tags, and
    runs of two or more spaces (or a single space at the start of a line)
    become ``&nbsp;`` entities.
    """
    def whitespace_to_markup(match):
        # One <br> per newline, otherwise one &nbsp; per space.
        run = match.group()
        if '\n' not in run:
            return '&nbsp;' * run.count(' ')
        return '<br>' * run.count('\n')

    wrapper = '<p>%s</p>'
    paragraph = fromstring(wrapper % escape(txt))
    fix_links(paragraph)
    markup = tostring(paragraph, encoding='unicode')
    # Strip the enclosing <p> and </p> so only the inner markup is rewritten.
    inner = markup[len('<p>'):-len('</p>')]
    inner = re.sub('(?m)((\r?\n)+| [ ]+|^ )', whitespace_to_markup, inner)
    return wrapper % inner
开发者ID:naspeh,项目名称:mailur,代码行数:19,代码来源:html.py

示例4: try_justext

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def try_justext(tree, url, target_language):
    '''Second safety net: fall back to the generic justext algorithm.

    Returns a ``body`` element holding one ``p`` per paragraph that justext
    does not classify as boilerplate, or ``None`` when justext raises
    ``ValueError`` (the error is logged together with ``url``).
    '''
    result_body = etree.Element('body')
    serialized_tree = html.tostring(tree, pretty_print=False, encoding='utf-8')
    # Pick the stoplist: language-specific when the language is known,
    # otherwise the module's generic list.
    language_known = (
        target_language is not None and target_language in JUSTEXT_LANGUAGES
    )
    if language_known:
        stoplist = justext.get_stoplist(JUSTEXT_LANGUAGES[target_language])
    else:
        stoplist = JT_STOPLIST
    # Run the extraction.
    try:
        paragraphs = justext.justext(
            serialized_tree, stoplist, 50, 200, 0.1, 0.2, 0.2, 200, True
        )
    except ValueError as err:  # not an XML element: HtmlComment
        LOGGER.error('justext %s %s', err, url)
        return None
    for paragraph in paragraphs:
        if paragraph.is_boilerplate:
            continue
        kept = etree.Element('p')
        kept.text = paragraph.text
        result_body.append(kept)
    return result_body
开发者ID:adbar,项目名称:trafilatura,代码行数:27,代码来源:external.py

示例5: ingest

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def ingest(self, file_path):
        """Ingestor implementation.

        Parses the XML file at ``file_path``, extracts its text, renders it
        to HTML with the ``self.XSLT`` stylesheet and emits both on
        ``self.result``.

        :param file_path: path of the XML file to ingest.
        :raises ProcessingException: if the file is larger than
            ``self.MAX_SIZE`` or cannot be parsed.
        """
        # Prefer the size already recorded on the result; fall back to disk.
        file_size = self.result.size or os.path.getsize(file_path)
        if file_size > self.MAX_SIZE:
            raise ProcessingException("XML file is too large.")

        try:
            doc = etree.parse(file_path)
        except (ParserError, ParseError) as exc:
            # Chain explicitly so the parser's own message stays attached
            # as the cause of the ingest failure.
            raise ProcessingException("XML could not be parsed.") from exc

        text = self.extract_html_text(doc.getroot())
        transform = etree.XSLT(self.XSLT)
        html_doc = transform(doc)
        html_body = html.tostring(html_doc, encoding=str, pretty_print=True)
        self.result.flag(self.result.FLAG_HTML)
        self.result.emit_html_body(html_body, text)
开发者ID:occrp-attic,项目名称:ingestors,代码行数:19,代码来源:xml.py

示例6: clean_html

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def clean_html(context, data):
    """Strip configured subtrees from an HTML document and store the result.

    Every element matched by an XPath in the ``remove_paths`` parameter is
    dropped, the document is re-serialized and stored, and the new
    ``content_hash`` is written into ``data``. ``data`` is emitted in all
    cases; when no document is available it is emitted unchanged.
    """
    document = _get_html_document(context, data)
    if document is not None:
        xpaths = ensure_list(context.params.get('remove_paths'))
        for xpath in xpaths:
            for node in document.xpath(xpath):
                node.drop_tree()

        serialized = html.tostring(document, pretty_print=True)
        data['content_hash'] = context.store_data(serialized)
    context.emit(data=data)
开发者ID:alephdata,项目名称:memorious,代码行数:18,代码来源:clean.py

示例7: download_page

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def download_page(url, cookie_jar):
    """
    Fetch a page with the given cookie jar and save it for the browser view.

    The browser directory under the server path is emptied first, then the
    page is requested, parsed and written there under a fresh UUID-based
    file name.

    Returns an ``(error, filename)`` pair: ``(exception, None)`` when the
    request fails with ``requests.RequestException``, otherwise
    ``(None, filename)``.
    """
    target_dir = os.path.join(server_path, 'static/browser')
    delete_directory_files(target_dir)
    filename = str(uuid.uuid4()) + '.html'
    destination = os.path.join(target_dir, filename)
    try:
        response = cookie_request(url, cookie_jar)
    except requests.RequestException as request_error:
        return request_error, None
    else:
        document = html.document_fromstring(response.text)
        # html.tostring() returns bytes here, hence the binary mode.
        with open(destination, 'wb') as out_file:
            out_file.write(html.tostring(document))
        return None, filename
开发者ID:TeamHG-Memex,项目名称:autologin,代码行数:20,代码来源:server.py

示例8: gdoc_view

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def gdoc_view(request, doc_id):
    """Render a published Google Doc inside the ``gdoc.html`` template.

    ``doc_id`` is looked up in ``settings.GDOC_DOCS``; an unknown id raises
    ``Http404``. The published document is fetched, and the serialized
    ``<style>`` elements plus the serialized children of ``<body>`` are
    passed to the template as ``content``.
    """
    try:
        gdoc_id = settings.GDOC_DOCS[doc_id]
    except KeyError:
        raise Http404("No doc named %s" % doc_id)

    def _serialize(node):
        # html.tostring() yields bytes by default; decode for concatenation.
        return html.tostring(node).decode("utf8")

    url = "https://docs.google.com/document/d/%s/pub?embedded=true" % gdoc_id
    # NOTE(review): no timeout is passed to requests.get here.
    page = requests.get(url)
    tree = html.fromstring(page.text)

    # NOTE(review): "//style" is an absolute XPath, so it matches <style>
    # elements anywhere in the document, not only under <head>.
    style_markup = "".join(
        _serialize(node) for node in tree.head.xpath("//style")
    )
    body_markup = "".join(_serialize(node) for node in tree.body)
    context = {"content": "<style>" + style_markup + "</style>" + body_markup}
    return render(request, "gdoc.html", context)
开发者ID:ebmdatalab,项目名称:openprescribing,代码行数:24,代码来源:views.py

示例9: __str__

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def __str__(self):
        """Serialize the wrapped tree to a unicode string and mark it safe."""
        serialized = html.tostring(self._tree, encoding='unicode')
        return mark_safe(serialized)
开发者ID:DMOJ,项目名称:online-judge,代码行数:4,代码来源:lxml_tree.py

示例10: fragment_tree_to_str

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def fragment_tree_to_str(tree):
    """Serialize ``tree`` and cut off the first 5 and last 6 characters.

    Those lengths are exactly ``<div>`` and ``</div>``; the slice is
    positional and does not check that the markup really starts and ends
    with that wrapper.
    """
    markup = html.tostring(tree, encoding='unicode')
    opening_length = len('<div>')
    closing_length = len('</div>')
    return markup[opening_length:-closing_length]
开发者ID:DMOJ,项目名称:online-judge,代码行数:4,代码来源:__init__.py

示例11: url_trim

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def url_trim(html):
    """Shorten link texts in an HTML fragment and return it marked safe.

    For every entry yielded by ``iterlinks()`` the element's text is
    replaced with ``trim_url`` applied to its full text content. The
    original docstring states a 70 character limit; the actual limit is
    decided by ``trim_url`` (not shown here).
    """
    fragment = fromstring(html)
    for link in fragment.iterlinks():
        # Each entry is (element, attribute, link, pos); only the element
        # is needed.
        element = link[0]
        element.text = trim_url(element.text_content())

    serialized = tostring(fragment, encoding="unicode")
    return mark_safe(serialized)
开发者ID:evernote,项目名称:zing,代码行数:10,代码来源:cleanhtml.py

示例12: fix_privacy

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def fix_privacy(htm, only_proxy=False):
    """Rewrite remote image sources and inline styles in an HTML string.

    Images whose ``src`` starts with ``http://``, ``https://`` or ``//`` get
    an explicit ``https:`` scheme where missing and, when
    ``conf['USE_PROXY']`` is set, a ``/proxy?url=`` prefix. With
    ``only_proxy`` the rewritten URL stays in ``src``; otherwise it is moved
    to ``data-src``, and every ``style`` attribute is moved to
    ``data-style``.

    Blank input is returned untouched, as is any input when ``only_proxy``
    is requested but the proxy is disabled. A ``<div>`` at the very start
    and a ``</div>`` at the very end of the serialized result are removed.
    """
    if not htm.strip():
        return htm

    use_proxy = conf['USE_PROXY']
    if only_proxy and not use_proxy:
        return htm

    root = fromstring(htm)
    for image in root.xpath('//img[@src]'):
        source = image.attrib['src']
        if not re.match('^(https?://|//).*', source):
            continue
        if source.startswith('//'):
            source = 'https:' + source
        if use_proxy:
            source = '/proxy?url=' + source
        if only_proxy:
            image.attrib['src'] = source
            continue
        image.attrib['data-src'] = source
        del image.attrib['src']

    if not only_proxy:
        # style could contain "background-image", etc.
        for styled in root.xpath('//*[@style]'):
            styled.attrib['data-style'] = styled.attrib['style']
            del styled.attrib['style']

    markup = tostring(root, encoding='unicode').strip()
    return re.sub('(^<div>|</div>$)', '', markup)
开发者ID:naspeh,项目名称:mailur,代码行数:33,代码来源:html.py

示例13: get_deck_list

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def get_deck_list(deckid):
    """
    For a given HearthPwn deck ID, return a list of Cards that belong to that
    deck.

    Parameters:

    - 'deckid' - a HearthPwn deck ID

    Returns a list of ``Card(name, amount)`` objects, class cards first and
    neutral cards after. A card whose amount cannot be found in the page
    markup is reported on stdout and added with an amount of 0.
    """
    # http://www.hearthpwn.com/decks/listing/ + deckid + /neutral or /class
    url = 'http://www.hearthpwn.com/decks/listing/'
    css = '#cards > tbody > tr > td.col-name'
    # Raw string: "\s" and "\d" are invalid escape sequences in a normal
    # string literal and trigger a warning on current Python versions.
    # "&#215;" is the numeric character reference for the multiplication
    # sign as it appears in the serialized markup.
    regex = re.compile(r'&#215;\s+(\d+)')

    # Class cards first, then neutral cards.
    cardelements = []
    for section in ('/class', '/neutral'):
        htmlelement = get_htmlelement_from_url(url + str(deckid) + section)
        cardelements += htmlelement.cssselect(css)

    deck = []
    for element in cardelements:
        # cssselect always returns an array, but in our case the result is
        # always just one element.
        cardname = element.cssselect('a')[0].text.strip()
        elementtext = html.tostring(element).decode('UTF-8')
        # There's probably a better way to get the amount, but we currently
        # look for the "x #" in the raw text of the element
        match = regex.search(elementtext)
        if match:
            amount = int(match.group(1))
        else:
            print('ERROR: Unable to get amount for card ' + cardname)
            # This shouldn't happen, but when it does, just continue on after
            # logging an error.
            amount = 0
        deck.append(Card(cardname, amount))

    return deck
开发者ID:waymanglover,项目名称:hearthstats,代码行数:43,代码来源:hearth.py

示例14: convert_json_to_html

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def convert_json_to_html(elements):
    """Build an HTML string from a list of JSON-style node descriptions.

    Each entry of ``elements`` is converted with ``_recursive_convert_json``
    and appended to a temporary ``<div>``. Links are made absolute against
    the ``base_url`` name defined outside this function, ``<span>`` tags are
    unwrapped, line breaks are handled by
    ``replace_line_breaks_except_pre``, and the temporary ``<div>`` wrapper
    is sliced off the result.
    """
    container = html.fragment_fromstring('<div></div>')
    for node_description in elements:
        container.append(_recursive_convert_json(node_description))
    container.make_links_absolute(base_url=base_url)
    for span in container.xpath('.//span'):
        span.drop_tag()
    markup = replace_line_breaks_except_pre(
        html.tostring(container, encoding='unicode'), '<br/>'
    )
    # Positional slice: 5 characters for "<div>", 6 for "</div>".
    return markup[len('<div>'):-len('</div>')]
开发者ID:mercuree,项目名称:html-telegraph-poster,代码行数:13,代码来源:html_to_telegraph.py

示例15: convert_html_to_telegraph_format

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import tostring [as 别名]
def convert_html_to_telegraph_format(html_string, clean_html=True, output_format="json_string"):
    """Convert an HTML string into the requested output representation.

    :param html_string: source HTML.
    :param clean_html: when true, the HTML is cleaned and the resulting tree
        is preprocessed (media tags, ``move_to_top``, ``post_process``)
        before conversion; when false, the parsed fragments are converted
        as they are.
    :param output_format: ``'json_string'`` (JSON text), ``'python_list'``
        (the converted node list) or ``'html_string'`` (serialized body).
    :returns: the converted content in the requested format. If no body
        could be built, the result is ``'[]'``, ``[]`` or ``''``
        respectively. An unrecognised ``output_format`` returns ``None``,
        as before.
    """
    if clean_html:
        html_string = clean_article_html(html_string)

        body = preprocess_fragments(
            _fragments_from_string(html_string)
        )
        if body is not None:
            # Snapshot the descendants before processing them
            # (presumably because the helper can modify the tree; verify).
            for tag in list(body.iterdescendants()):
                preprocess_media_tags(tag)
            move_to_top(body)
            post_process(body)
    else:
        fragments = _fragments_from_string(html_string)
        body = fragments[0].getparent() if len(fragments) else None

    content = []
    if body is not None:
        content = [_recursive_convert(x) for x in body.iterchildren()]

    if output_format == 'json_string':
        return json.dumps(content, ensure_ascii=False)
    if output_format == 'python_list':
        return content
    if output_format == 'html_string':
        # html.tostring(None) raises TypeError; an empty document has an
        # empty serialization, matching the empty list of the other formats.
        if body is None:
            return ''
        return html.tostring(body, encoding='unicode')
    # Unknown format: keep the historical behaviour of returning None.
    return None
开发者ID:mercuree,项目名称:html-telegraph-poster,代码行数:29,代码来源:html_to_telegraph.py


注:本文中的lxml.html.tostring方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。