当前位置: 首页>>代码示例>>Python>>正文


Python tidylib.tidy_fragment函数代码示例

本文整理汇总了Python中tidylib.tidy_fragment函数的典型用法代码示例。如果您正苦于以下问题:Python tidy_fragment函数的具体用法?Python tidy_fragment怎么用?Python tidy_fragment使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了tidy_fragment函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_frag_with_entity

 def test_frag_with_entity(self):
     # The input has to contain a real named entity; a literal "é" would not
     # exercise entity handling at all and would make both expectations
     # below identical.
     h = "&eacute;"

     # With the default options the named entity is passed through untouched.
     expected = "&eacute;"
     doc, err = tidy_fragment(h)
     self.assertEqual(doc, expected)

     # With numeric-entities enabled the named entity is rewritten as its
     # numeric character reference.
     expected = "&#233;"
     doc, err = tidy_fragment(h, {'numeric-entities': 1})
     self.assertEqual(doc, expected)
开发者ID:18600597055,项目名称:hue,代码行数:9,代码来源:FragsTest.py

示例2: parse_book_file

def parse_book_file(href, book):
    """Fill *book* with data scraped from the HTML file *href*.

    The file is looked up relative to ``books_dir`` and parsed with the
    module-level ``parser``.

    * ``book['page_count']`` is set (only when not already present) to the
      first group of ``patterns[0][1]`` matched against the element that
      mentions 'страниц' inside the table cell containing the book title.
      It is left unset when no such cell, element or match exists.
    * ``book['annotation']`` / ``book['contents']`` are set to the tidied
      markup of the siblings following the last table that mentions
      'Аннотация' / 'Содержание', stopping at the next ``<table>``.

    :param href: path of the book page, relative to ``books_dir``.
    :param book: dict describing the book; must contain ``'title'`` when
        ``'page_count'`` is missing.  It is modified in place.
    :return: the same *book* dict.
    """
    book_tree = lxml.html.parse(join(books_dir, href), parser)

    if 'page_count' not in book:
        title_cells = book_tree.xpath(
                "//td[descendant::*[contains(text(), '{}')]]".format(
                    book['title'])
                )
        if title_cells:
            page_info = title_cells[0].xpath(
                    "descendant::*[contains(text(), 'страниц')]")
            if page_info:
                match = patterns[0][1].search(
                        tostring(page_info[0], encoding='unicode'))
                # The element can mention the word without matching the
                # pattern; treat that like "no page info" instead of
                # crashing on ``None.groups()``.
                if match:
                    book['page_count'] = match.groups()[0]

    sections = {
        'annotation': book_tree.xpath(
            r"//table[descendant::*[contains(text(), 'Аннотация')]]"),
        'contents': book_tree.xpath(
            r"//table[descendant::*[contains(text(), 'Содержание')]]"),
    }
    for key, marks in sections.items():
        if not marks:
            continue
        parts = []
        # The section body is everything after the last heading table, up to
        # (not including) the next table.
        for element in marks[-1].itersiblings():
            if element.tag == "table":
                break
            # Project helpers applied to every element before serialising;
            # presumably they strip links and attributes - confirm.
            drop_a(element)
            remove_attr(element)
            parts.append(tostring(element, encoding='unicode'))
        book[key] = tidy_fragment(clean("".join(parts)))[0]
    return book
开发者ID:a-iv,项目名称:practica.ru,代码行数:31,代码来源:oldsite_parser.py

示例3: test_frag_with_unclosed_tag

    def test_frag_with_unclosed_tag(self):
        # An unclosed <p> must come back closed, with its text on its own
        # indented line.
        fragment = "<p>hello"
        tidied, _errors = tidy_fragment(fragment)
        self.assertEqual(tidied, "<p>\n  hello\n</p>")
开发者ID:waylan,项目名称:pytidylib,代码行数:7,代码来源:FragsTest.py

示例4: sanitize_html

def sanitize_html(value):
    """Reduce *value* to the markup a Vodafone Live data container accepts.

    Comments are removed, every tag loses its attributes, tags outside the
    allowed set are hidden (their contents are kept), ``em``/``strong`` are
    translated to ``i``/``b``, the result is run through ``tidy_fragment``
    and a fixed set of HTML entities is replaced by plain characters.

    :param value: HTML text, or ``None``.
    :return: the sanitised fragment; ``""`` when *value* is ``None``.
    """
    # FIXME: 'None' should never be saved as text
    if value is None:
        return ""

    # Imported only once there is something to parse.
    from BeautifulSoup import BeautifulSoup, Comment, Tag

    # allowed tags for a Vodafone Live <CONTAINER type="data" />
    # this doubles up as a translation table. CKEditor does new-ish
    # HTML than Vodafone Live will accept. We have to translate 'em' back
    # to 'i', and 'strong' back to 'b'.
    #
    # NOTE: Order is important since <strong>'s can be inside <p>'s.
    tags = (
        ("em", "i"),  # when creating them in the editor they're EMs
        ("strong", "b"),
        ("i", "i"),  # when loading them as I's the editor leaves them
        ("b", "b"),  # we keep them here to prevent them from being removed
        ("u", "u"),
        ("br", "br"),
        ("p", "p"),
    )
    # entities tidy may leave behind, and the plain text they become
    entity_replacements = (
        ("&nbsp;", " "),
        ("&rsquo;", "'"),
        ("&lsquo;", "'"),
        ("&quot;", '"'),
        ("&ldquo;", '"'),
        ("&rdquo;", '"'),
        ("&bull;", "- "),
        ("&eacute;", "e"),
        ("&Eacute;", "E"),
        ("&ndash;", "-"),
    )
    valid_tags = set(name for name, _replacement in tags)
    soup = BeautifulSoup(value)

    # remove all comments from the HTML
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # hide all tags that aren't in the allowed list, but keep
    # their contents
    for tag in soup.findAll(True):
        # Vodafone Live allows for no tag attributes
        tag.attrs = []
        if tag.name not in valid_tags:
            tag.hidden = True

    # replace tags with Vlive equivalents
    for element, replacement_element in tags:
        # Compare by value: an identity test on strings only gives the right
        # answer when the interpreter happens to intern both literals.
        if element != replacement_element:
            for tag in soup.findAll(element):
                replacement_tag = Tag(soup, replacement_element)
                replacement_tag.insert(0, tag.text)
                tag.replaceWith(replacement_tag)

    xml = soup.renderContents().decode("utf8")
    fragment, errors = tidy_fragment(xml, {"char-encoding": "utf8"})

    for entity, plain in entity_replacements:
        fragment = fragment.replace(entity, plain)
    return fragment
开发者ID:praekelt,项目名称:ummeli,代码行数:60,代码来源:vlive_tags.py

示例5: clean

def clean(html):
    """Sanitise *html* against the configured whitelists, then tidy it.

    Falsy input (``None``, ``""`` ...) is handed back unchanged.
    """
    if html:
        sanitised = bleach.clean(
            html,
            tags=local_config.TAG_WHITELIST,
            attributes=local_config.ATTRIBUTE_WHITELIST,
        )
        # tidylib catches some additional problems; its warnings are dropped
        tidied, _warnings = tidylib.tidy_fragment(sanitised)
        return tidied
    return html
开发者ID:mrwonko,项目名称:homepage,代码行数:7,代码来源:just_sanitize.py

示例6: link_title_uid_txt

def link_title_uid_txt(i):
    """Extract ``(link, title, rss_uid, txt)`` from the feed item *i*.

    Returns ``None`` when the item has neither a summary nor a content
    entry, when that entry's ``'content'`` is empty, or when the converted
    text ends up empty.
    """
    link = i['alternate'][0]['href'] if 'alternate' in i else ''
    title = unescape(i['title']) if 'title' in i else '无题'
    rss_uid = i.get('id') or 1

    # Prefer the summary, fall back to the full content.
    snippet = i.get('summary') or i.get('content')
    if not snippet:
        return None

    raw_html = snippet['content']
    if not raw_html:
        return None

    # Normalise the markup, then turn it into plain text.
    markup = txttidy(raw_html)
    markup = txt_map('<pre', '</pre>', markup, pre_br)
    markup = tidy_fragment(markup, {'indent': 0})[0]
    markup = markup.replace('<br />', '\n')
    txt = htm2txt(markup)

    if txt:
        return link, title, rss_uid, txt
    return None
开发者ID:immissile,项目名称:42qu_github_mirror,代码行数:31,代码来源:rss_update.py

示例7: get_article_text

    def get_article_text(self, body):
        """
        Extract the article's main text from the ``div.article-body`` element.

        :param body: parsed page object exposing ``find``
        :return: tuple ``(tidied_html, plain_text)``; when the element is
                 missing, the html is the tidied empty string and
                 ``plain_text`` is ``None``
        """
        container = body.find("div", {"class": "article-body"})

        if container is None:
            plain_text = None
            markup = ''
        else:
            # The plain text is taken first, before zap_tag_contents is run
            # on the same element.
            plain_text = self.gremlin_zapper.zap_string(container.get_text())
            self.zap_tag_contents(container)
            markup = ''.join(str(child) for child in container.contents)

        markup, _errors = tidy_fragment(markup, options={'numeric-entities': 1})

        return markup, plain_text
开发者ID:ucsc,项目名称:slug-news,代码行数:25,代码来源:scraper.py

示例8: test_frag_with_unicode_subclass

    def test_frag_with_unicode_subclass(self):
        # tidy_fragment must accept an instance of a subclass of the text
        # type and hand back equal content.
        class TextSubclass(utype):
            pass

        source = TextSubclass("unicode string ß")
        tidied, _errors = tidy_fragment(source)
        self.assertEqual(tidied, source)
开发者ID:waylan,项目名称:pytidylib,代码行数:8,代码来源:FragsTest.py

示例9: object_for_typepad_object

def object_for_typepad_object(tp_obj):
    """Find or build the local ``Object`` for the TypePad asset *tp_obj*.

    An ``Object`` already stored for service ``'typepad.com'`` with the
    asset's ``url_id`` is reused and returned as ``(False, obj)``.
    Otherwise a new ``Object`` is built from the asset (its body is run
    through ``tidy_fragment``) and its ``in_reply_to`` is resolved from
    ``in_reply_to``, ``reblog_of`` or ``reblog_of_url``, whichever is set
    first.  A reblog whose body is empty once the reblog boilerplate has
    been removed returns ``(True, obj.in_reply_to)``.

    NOTE(review): the visible code stops inside the last ``except`` handler;
    what happens to ``in_reply_to`` and the final return value are not shown
    here.
    """
    try:
        obj = Object.objects.get(service='typepad.com', foreign_id=tp_obj.url_id)
    except Object.DoesNotExist:
        # Nothing stored yet: fall through and build a new object below.
        pass
    else:
        log.debug("Reusing typepad object %r for asset %s", obj, tp_obj.url_id)
        return False, obj

    log.debug("Making new object for TypePad post %s by %s", tp_obj.url_id, tp_obj.author.display_name)

    author = account_for_typepad_user(tp_obj.author)
    # Prefer the pre-rendered content; fall back to the raw content.
    body = tp_obj.rendered_content
    if not body and tp_obj.content:
        if tp_obj.text_format == 'html_convert_linebreaks':
            # Blank-line separated chunks each become their own paragraph.
            body = '\n\n'.join(u'<p>%s</p>' % t for t in tp_obj.content.split('\n\n'))
        else:
            body = tp_obj.content
    if body:
        # Only the tidied markup is kept; tidy's messages are not used.
        body, errors = tidy_fragment(body)
    else:
        body = ''

    obj = Object(
        service='typepad.com',
        foreign_id=tp_obj.url_id,
        render_mode='mixed',
        title=tp_obj.title,
        body=body,
        time=tp_obj.published,
        permalink_url=tp_obj.permalink_url,
        author=author,
    )

    if getattr(tp_obj, 'in_reply_to', None) is not None:
        # This post is in reply, so we don't care if our referent was
        # really a share. Be transitively in reply to the shared obj.
        really_a_share, obj.in_reply_to = object_for_typepad_object(tp_obj.in_reply_to)
    elif getattr(tp_obj, 'reblog_of', None) is not None:
        # Assets are public so it's okay if we use an anonymous typd here.
        t = typd.TypePad(endpoint='http://api.typepad.com/')
        reblog_of = t.assets.get(tp_obj.reblog_of.url_id)

        really_a_share, obj.in_reply_to = object_for_typepad_object(reblog_of)
        remove_reblog_boilerplate_from_obj(obj)
        # Nothing left after removing the boilerplate: report the reblogged
        # object itself, flagged True.
        if not obj.body:
            return True, obj.in_reply_to
    elif getattr(tp_obj, 'reblog_of_url', None) is not None:
        reblog_url = tp_obj.reblog_of_url
        # NOTE(review): ``except X, exc`` is Python 2 only syntax.
        try:
            in_reply_to = leapfrog.poll.embedlam.object_for_url(reblog_url)
        except leapfrog.poll.embedlam.RequestError, exc:
            # A failed request just leaves the post without a referent.
            in_reply_to = None
        except ValueError, exc:
            in_reply_to = None
            log.error("Error making object from referent %s of %s's post %s", reblog_url, author.display_name, tp_obj.url_id)
            log.exception(exc)
开发者ID:apparentlymart,项目名称:leapfrog,代码行数:57,代码来源:typepad.py

示例10: tidy_html

def tidy_html(html):
    """
    Process an input string containing HTML and return a tuple (xhtml,
    errors, warnings) containing the output of tidylib and lists of
    validation errors and warnings.

    Input must be unicode.
    Output will be valid XHTML.

    Characters matched by CONTROL_CHAR_RE are replaced by spaces before
    tidying; when that happens, the first entry of ``warnings`` reports how
    many were replaced and which code points they were.

    Raises ValueError when ``html`` is not a unicode string.
    """
    if not isinstance(html, unicode):
        raise ValueError("tidyhtml must be called with a Unicode string!")

    errors = []
    warnings = []

    # First, deal with embedded control codes.  The offending characters are
    # collected *before* the substitution; afterwards there is nothing left
    # to find and the report would always be empty.
    stripped = CONTROL_CHAR_RE.findall(html)
    html, sub_count = CONTROL_CHAR_RE.subn(" ", html)
    if sub_count:
        warnings.append("Stripped %d control characters from body: %s" % (
            sub_count,
            sorted(set(ord(char) for match in stripped for char in match))
        ))

    html, messages = tidylib.tidy_fragment(
        html.strip(),
        {
            "char-encoding":               "utf8",
            "clean":                        False,
            "drop-empty-paras":             False,
            "drop-font-tags":               True,
            "drop-proprietary-attributes":  False,
            "fix-backslash":                True,
            "indent":                       True,
            "output-xhtml":                 True,
        }
    )

    # One message per line; blank lines carry no information.
    messages = [line.strip() for line in messages.split("\n") if line.strip()]

    # postprocess warnings to avoid HTML fragments being reported as lacking
    # doctype and title:
    ignored = (
        "Warning: missing <!DOCTYPE> declaration",
        "Warning: inserting missing 'title' element",
        "Warning: inserting implicit <body>",
    )

    # Append to the lists created above so the control-character warning is
    # kept rather than thrown away.
    for msg in messages:
        if any(fragment in msg for fragment in ignored):
            continue

        if "Error:" in msg:
            errors.append(msg)
        else:
            warnings.append(msg)

    return html, errors, warnings
开发者ID:akaihola,项目名称:feincms,代码行数:57,代码来源:tidy.py

示例11: cleanupText

def cleanupText(text):
    """Clean up the text of the report with libtidy and return it as ``str``.

    The text is passed through ``unescape``, encoded as UTF-8 and tidied
    with ``tidy_fragment``.
    """
    tidy_options = {
        "output_xhtml": 1,
        "add_xml_decl": 1,
        "indent": 1,
        "tidy_mark": 0,
        "char_encoding": "utf8",
        "quote_nbsp": 0,
    }
    # remove html entities, then hand tidy the UTF-8 encoded text
    utf8_body = unescape(text).encode("utf8")
    tidied, _errors = tidy_fragment(utf8_body, tidy_options, keep_doc=False)
    # whatever tidy returned is coerced to a plain str for the caller
    return str(tidied)
开发者ID:BenoitTalbot,项目名称:bungeni-portal,代码行数:10,代码来源:downloaddocument.py

示例12: html

 def html(self, string):
     """Return *string* as HTML, subject to the ``allow_html`` setting.

     Raises ``Exception`` when the setting is missing or equal to ``False``.
     When it is ``"tidy"`` the text is run through tidylib first; any other
     value returns the text unchanged.
     """
     if "allow_html" not in INGIniousConfiguration:
         raise Exception("HTML is not allowed")

     mode = INGIniousConfiguration["allow_html"]
     if mode == False:
         raise Exception("HTML is not allowed")

     if mode == "tidy":
         # Imported lazily: tidylib is only needed in this mode.
         import tidylib
         tidied, _messages = tidylib.tidy_fragment(string)
         return tidied

     return string
开发者ID:GuillaumeDerval,项目名称:INGInious,代码行数:10,代码来源:parsable_text.py

示例13: html2xhtml

def html2xhtml(html, **options):
    """Convert the fragment *html* with tidy and return the result.

    Extra keyword arguments are passed to tidy as options; ``doctype``,
    ``show_warnings``, ``indent`` and ``output_xml`` are always overridden.

    Raises ``Exception`` when tidy reports anything.
    """
    forced_options = (
        ('doctype', 'omit'),
        ('show_warnings', 0),
        ('indent', 0),
        ('output_xml', 1),
    )
    for option_name, option_value in forced_options:
        options[option_name] = option_value

    document, errors = tidy_fragment(html, options=options)
    if not errors:
        return document
    raise Exception("Errors while processing %s\n==========\n%s" % (html, errors))
开发者ID:MaxTyutyunnikov,项目名称:lino,代码行数:10,代码来源:html2xhtml.py

示例14: fix_open_tags

def fix_open_tags(source):
    """Return the html fragment *source* with missing tags fixed by tidylib.

    Falsy input is handed back unchanged.  With ``settings.DEBUG`` on, the
    tidylib errors that survive ``filter_tidylib_errors`` are logged at
    debug level.
    """
    if not source:
        return source

    repaired, problems = tidy_fragment(source)
    if not (settings.DEBUG and problems):
        return repaired

    remaining = filter_tidylib_errors(problems)
    if remaining:
        log.debug('Tidylib errors:\n{}'.format(remaining))
    return repaired
开发者ID:welbornprod,项目名称:wp_site,代码行数:11,代码来源:htmltools.py

示例15: POST

        def POST(self):
            """Grade the submission carried by the POST body.

            The raw body is read with ``web.data()`` and must be JSON with an
            ``"xqueue_body"`` entry, itself JSON, whose ``"grader_payload"``
            (JSON again) names the task in ``"tid"``.  The task is fetched
            from ``course``, the input is checked against it and the job is
            run through ``job_manager_sync``.

            :return: a JSON string with ``"correct"`` (``True`` on success,
                ``None`` otherwise), ``"score"`` and ``"msg"`` (the tidied
                feedback).  Every failure yields ``correct=None``,
                ``score=0`` and an "Internal grader error" message.
            """
            def failure(message):
                # All failures share one envelope; only the message differs.
                return json.dumps({"correct": None, "score": 0, "msg": message})

            web.header('Content-Type', 'application/json')

            post_input = web.data()

            # ``except Exception`` rather than a bare ``except``: interpreter
            # exits and keyboard interrupts must not be turned into a grader
            # error.
            try:
                decoded_input = json.loads(post_input)
            except Exception:
                return failure("<p>Internal grader error: cannot decode POST</p>")

            if "xqueue_body" not in decoded_input:
                return failure("<p>Internal grader error: no xqueue_body in POST</p>")
            try:
                edx_input = json.loads(decoded_input["xqueue_body"])
                taskid = json.loads(edx_input["grader_payload"])["tid"]
            except Exception:
                return failure("<p>Internal grader error: cannot decode JSON</p>")

            try:
                task = course.get_task(taskid)
            except Exception:
                return failure("<p>Internal grader error: unknown task {}</p>".format(taskid))

            if not task.input_is_consistent(edx_input):
                return failure("<p>Internal grader error: input not consistent with task</p>")

            try:
                job_return = job_manager_sync.new_job(task, edx_input, "Plugin - EDX")
            except Exception:
                return failure("<p>Internal grader error: error while grading submission</p>")

            try:
                # Global feedback first, then one titled section per problem.
                text = ""
                if "text" in job_return:
                    text = job_return["text"]
                if "problems" in job_return:
                    for prob in job_return["problems"]:
                        text += "<br/><h4>" + job_return["task"].get_problems()[prob].get_name() + "</h4>" + job_return["problems"][prob]

                succeeded = job_return["result"] == "success"
                # An explicit score wins over the 1/0 derived from the result.
                score = 1 if succeeded else 0
                if "score" in job_return:
                    score = job_return["score"]

                import tidylib

                out, dummy = tidylib.tidy_fragment(text, options={'output-xhtml': 1, 'enclose-block-text': 1, 'enclose-text': 1})
                return json.dumps({"correct": (True if succeeded else None), "score": score, "msg": out})
            except Exception:
                return failure("<p>Internal grader error: error converting submission result</p>")
开发者ID:jonsan21,项目名称:INGInious,代码行数:50,代码来源:edx.py


注:本文中的tidylib.tidy_fragment函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。