Python encoding.html_to_unicode函数代码示例

本文整理汇总了Python中w3lib.encoding.html_to_unicode函数的典型用法代码示例。如果您正苦于以下问题：Python html_to_unicode函数的具体用法？Python html_to_unicode怎么用？Python html_to_unicode使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。

在下文中一共展示了html_to_unicode函数的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_replace_wrong_encoding

    def test_replace_wrong_encoding(self):
        """Test invalid chars are replaced properly.

        The bodies are written as explicit byte literals (``b'...'``) so the
        same raw bytes are passed on Python 2 and Python 3; a plain ``'...'``
        literal would be text, not bytes, on Python 3.
        """
        # Bytes that do not form valid UTF-8, placed between two ASCII markers.
        encoding, body_unicode = html_to_unicode(ct('utf-8'),
                b'PREFIX\xe3\xabSUFFIX')
        # XXX: Policy for replacing invalid chars may suffer minor variations
        # but it should always contain the unicode replacement char (u'\ufffd')
        assert u'\ufffd' in body_unicode, repr(body_unicode)
        # The valid text on both sides of the bad bytes must survive.
        assert u'PREFIX' in body_unicode, repr(body_unicode)
        assert u'SUFFIX' in body_unicode, repr(body_unicode)

        # Do not destroy html tags due to encoding bugs
        encoding, body_unicode = html_to_unicode(ct('utf-8'),
            b'\xf0<span>value</span>')
        assert u'<span>value</span>' in body_unicode, repr(body_unicode)

开发者ID:Dior222，项目名称:w3lib，代码行数:14，代码来源:test_encoding.py

示例2: test_gunzip_illegal_eof

 def test_gunzip_illegal_eof(self):
     """Gunzip the ``unexpected-eof.gz`` sample and compare the decoded text.

     The decompressed bytes are decoded via ``html_to_unicode`` with a
     ``cp1252`` charset and must match the UTF-8 reference file
     ``unexpected-eof-output.txt`` both in length and in content.
     """
     with open(join(SAMPLEDIR, "unexpected-eof.gz"), "rb") as gz_file:
         compressed = gz_file.read()
     # Element 1 of the result is the decoded text.
     text = html_to_unicode("charset=cp1252", gunzip(compressed))[1]

     with open(join(SAMPLEDIR, "unexpected-eof-output.txt"), "rb") as ref_file:
         expected_text = ref_file.read().decode("utf-8")

     # Length is checked first so a size mismatch is reported on its own.
     self.assertEqual(len(text), len(expected_text))
     self.assertEqual(text, expected_text)

开发者ID:lopuhin，项目名称:scrapy，代码行数:7，代码来源:test_utils_gz.py

示例3: extract

    def extract(self, html='', **kwargs):
        """
        Extract data regions from raw html or from a url.

        :param html: raw html to process. When it is empty and a ``url``
            keyword argument is given, the page is downloaded instead.
        :param kwargs: ``url`` -- page to fetch when ``html`` is empty;
            ``verbose`` -- when this key is present (whatever its value),
            every region and its records are printed.
        :return: the regions found, each with its ``items`` attribute set to
            the aligned fields of its records.
        """
        if not html and 'url' in kwargs:
            info = urlopen(kwargs.pop('url'))
            try:
                # The HTTP header is spelled "Content-Type"; looking up
                # 'content_type' never matches, which silently discarded the
                # charset declared by the server.
                _, html = html_to_unicode(info.headers.get('Content-Type'), info.read())
            finally:
                # Do not leak the connection once the body has been read.
                info.close()

        builder = DomTreeBuilder(html)
        root = builder.build()

        region_finder = MiningDataRegion(root, self.k, self.threshold)
        regions = region_finder.find_regions(root)

        record_finder = MiningDataRecord(self.threshold)
        field_finder = MiningDataField()

        for region in regions:
            records = record_finder.find_records(region)
            items, _ = field_finder.align_records(records)
            region.items = items
            if 'verbose' in kwargs:
                # Single-argument print calls emit the same text under both
                # the Python 2 print statement and the Python 3 function.
                print(region)
                for record in records:
                    print('\t%s' % (record,))

        return regions

开发者ID:tpeng，项目名称:pydepta，代码行数:27，代码来源:depta.py

示例4: _assert_encoding

 def _assert_encoding(self, content_type, body, expected_encoding,
                      expected_unicode):
     """Decode ``body`` with ``html_to_unicode`` and verify the outcome.

     Checks that the decoded body is a ``unicode`` object, that the reported
     encoding matches ``expected_encoding`` once both are passed through
     ``norm_encoding``, and that the decoded body equals
     ``expected_unicode``.
     """
     detected, decoded = html_to_unicode(ct(content_type), body)

     decoded_is_text = isinstance(decoded, unicode)
     self.assertTrue(decoded_is_text)

     actual_name = norm_encoding(detected)
     wanted_name = norm_encoding(expected_encoding)
     self.assertEqual(actual_name, wanted_name)

     self.assertEqual(decoded, expected_unicode)

开发者ID:Dior222，项目名称:w3lib，代码行数:7，代码来源:test_encoding.py

示例5: test_unicode_body

 def test_unicode_body(self):
     """A cp1251-encoded body must decode back to the original text."""
     expected = u'\u043a\u0438\u0440\u0438\u043b\u043b\u0438\u0447\u0435\u0441\u043a\u0438\u0439 \u0442\u0435\u043a\u0441\u0442'
     raw_body = expected.encode('cp1251')

     _, decoded = html_to_unicode(ct('cp1251'), raw_body)

     # The result has to be a unicode object equal to the source text.
     self.assertTrue(isinstance(decoded, unicode))
     self.assertEqual(decoded, expected)

开发者ID:Dior222，项目名称:w3lib，代码行数:7，代码来源:test_encoding.py

示例6: url_to_page

def url_to_page(url, encoding=None, default_encoding='utf-8'):
    """Fetch a URL, using python urllib2, and return an HtmlPage object.

    The `url` may be a string, or a `urllib2.Request` object. The `encoding`
    argument can be used to force the interpretation of the page encoding.

    Redirects are followed, and the `url` property of the returned HtmlPage object
    is the url of the final page redirected to.

    If the encoding of the page is known, it can be passed as a keyword argument. If
    unspecified, the encoding is guessed using `w3lib.encoding.html_to_unicode`.
    `default_encoding` is used if the encoding cannot be determined.

    The response opened for the download is always closed before returning,
    including when reading or decoding the body raises.
    """
    fh = urlopen(url)
    try:
        info = fh.info()
        body_str = fh.read()
        # guess content encoding if not specified
        if encoding is None:
            try:
                # Python 3.x
                content_type_header = fh.getheader("content-type")
            except AttributeError:
                # Python 2.x
                content_type_header = info.getheader("content-type")
            encoding, body = html_to_unicode(content_type_header, body_str,
                    default_encoding=default_encoding)
        else:
            # Caller forced the encoding: decode directly, no detection.
            body = body_str.decode(encoding)
        return HtmlPage(fh.geturl(), headers=dict(info.items()), body=body, encoding=encoding)
    finally:
        # Release the underlying connection; the original never closed it.
        fh.close()

开发者ID:abudulemusa，项目名称:scrapely，代码行数:29，代码来源:htmlpage.py

示例7: factory

    def factory(self, data, parser_cls, url):
        """Parse ``data`` into an element tree rooted at ``url``.

        ``data`` is decoded through ``html_to_unicode`` with a fixed utf-8
        charset hint and re-encoded as utf8. An empty result is replaced by a
        minimal ``<html/>`` document. The parser is an instance of
        ``parser_cls`` created in recover mode.
        """
        content_type = 'charset=%s' % 'utf-8'
        _, text = html_to_unicode(content_type, data)

        encoded = text.encode('utf8')
        if not encoded:
            # Nothing left after decoding: fall back to a stub document.
            encoded = '<html/>'

        xml_parser = parser_cls(recover=True, encoding='utf8')
        return etree.fromstring(encoded, parser=xml_parser, base_url=url)

开发者ID:ysc8620，项目名称:redant_spider，代码行数:8，代码来源:index.py

示例8: text

 def text(self):
     """Return the body decoded to unicode, decoding it at most once."""
     # Read self.encoding first, even when a cached body exists, so that
     # _body_inferred_encoding gets the chance to run.
     body_encoding = self.encoding
     if self._cached_ubody is not None:
         return self._cached_ubody

     # Element 1 of the html_to_unicode result is the decoded body.
     header = 'charset=%s' % body_encoding
     decoded = html_to_unicode(header, self.body)[1]
     self._cached_ubody = decoded
     return self._cached_ubody

开发者ID:wusy1209，项目名称:scrapy，代码行数:9，代码来源:text.py

示例9: _body_inferred_encoding

 def _body_inferred_encoding(self):
     if self._cached_benc is None:
         content_type = to_native_str(self.headers.get(b'Content-Type', b''))
         benc, ubody = html_to_unicode(content_type, self.body,
                 auto_detect_fun=self._auto_detect_fun,
                 default_encoding=self._DEFAULT_ENCODING)
         self._cached_benc = benc
         self._cached_ubody = ubody
     return self._cached_benc

开发者ID:wusy1209，项目名称:scrapy，代码行数:9，代码来源:text.py

示例10: body_as_unicode

 def body_as_unicode(self):
     """Return the body as unicode, reusing the cached value if present."""
     # self.encoding is read before looking at _cached_ubody, in case
     # reading it triggers _body_inferred_encoding.
     declared = self.encoding
     needs_decoding = self._cached_ubody is None
     if needs_decoding:
         header = 'charset=%s' % declared
         result = html_to_unicode(header, self.body)
         # Index 1 of the result holds the decoded body.
         self._cached_ubody = result[1]
     return self._cached_ubody

开发者ID:AugustLONG，项目名称:scrapy，代码行数:9，代码来源:text.py

示例11: body_as_unicode

 def body_as_unicode(self):
     """Return body as unicode.

     ``self.content`` is decoded with ``html_to_unicode`` using
     ``self.encoding`` as the charset hint. The result is stored in
     ``self._cached_ubody`` and returned; it is recomputed on every call.
     """
     # The docstring must be the first statement of the function; placed
     # after the import it was only a discarded string expression.
     # Only the name actually used is imported.
     from w3lib.encoding import html_to_unicode

     benc = self.encoding
     charset = 'charset=%s' % benc
     # Element 1 of the result is the decoded body.
     self._cached_ubody = html_to_unicode(charset, self.content)[1]
     return self._cached_ubody

开发者ID:deniyes，项目名称:pyspider，代码行数:10，代码来源:response.py

示例12: response2unicode

def response2unicode(resp):
    """
    Decode the body of a requests.Response into unicode.

    In contrast to ``response.text``, <meta> tags found in the response
    content are taken into account.
    """
    header = resp.headers.get("Content-Type")
    # html_to_unicode returns a pair; only the decoded html (index 1) is
    # returned to the caller.
    return html_to_unicode(
        content_type_header=header,
        html_body_str=resp.content,
        auto_detect_fun=_autodetect_encoding,
    )[1]

开发者ID:RaoUmer，项目名称:Formasaurus，代码行数:11，代码来源:utils.py

示例13: encoding

    def encoding(self) -> str:
        """Return the encoding string to use for this document.

        A previously stored encoding wins. Otherwise, when HTML is
        available, the encoding reported by ``html_to_unicode`` is stored
        and used. ``default_encoding`` is the final fallback.
        """
        known = self._encoding
        if known:
            return known

        # Scan meta tags for charset.
        if self._html:
            detected, _ = html_to_unicode(self.default_encoding, self._html)
            self._encoding = detected

        return self._encoding or self.default_encoding

开发者ID:666King999，项目名称:requests-html，代码行数:12，代码来源:requests_html.py

示例14: infer

    def infer(self, html='', **kwargs):
        """
        Extract data with seed region and the data you expect to scrape from there.

        :param html: raw html to process. It is replaced by the downloaded
            page whenever a ``url`` keyword argument is given.
        :param kwargs: ``url`` -- page to fetch and use as the html.
        :return: whatever ``self.scraper.scrape_page`` returns for the page
            rebuilt from the parsed document.
        """
        if 'url' in kwargs:
            info = urlopen(kwargs.pop('url'))
            try:
                # The HTTP header is spelled "Content-Type"; looking up
                # 'content_type' never matches, which silently discarded the
                # charset declared by the server.
                _, html = html_to_unicode(info.headers.get('Content-Type'), info.read())
            finally:
                # Do not leak the connection once the body has been read.
                info.close()

        builder = DomTreeBuilder(html)
        doc = builder.build()
        # Serialize the parsed tree back to unicode html for the scraper.
        page = HtmlPage(body=tostring(doc, encoding=unicode, method='html'))

        return self.scraper.scrape_page(page)

开发者ID:tpeng，项目名称:pydepta，代码行数:13，代码来源:depta.py

示例15: _assert_encoding

    def _assert_encoding(self, content_type, body, expected_encoding,
                         expected_unicode):
        """Decode ``body`` with ``html_to_unicode`` and verify the outcome.

        ``expected_unicode`` is either a single string, which the decoded
        body must equal, or a container in which the decoded body must be
        found.
        """
        detected, decoded = html_to_unicode(ct(content_type), body)
        self.assertTrue(isinstance(decoded, unicode))
        self.assertEqual(norm_encoding(detected),
                         norm_encoding(expected_encoding))

        if not isinstance(expected_unicode, basestring):
            # Several outputs are acceptable: membership is enough.
            is_member = decoded in expected_unicode
            self.assertTrue(
                is_member,
                "%s is not in %s" % (decoded, expected_unicode)
            )
            return

        self.assertEqual(decoded, expected_unicode)

开发者ID:TontonMax，项目名称:w3lib，代码行数:14，代码来源:test_encoding.py

注：本文中的w3lib.encoding.html_to_unicode函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。