当前位置: 首页>>代码示例>>Python>>正文


Python utils.get_encodings_from_content函数代码示例

本文整理汇总了Python中requests.utils.get_encodings_from_content函数的典型用法代码示例。如果您正苦于以下问题：Python get_encodings_from_content函数的具体用法？Python get_encodings_from_content怎么用？Python get_encodings_from_content使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了get_encodings_from_content函数的13个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_precedence

 def test_precedence(self):
     """The HTML5 charset is listed first, then the HTML4 pragma, then the XML declaration."""
     markup = '''
     <?xml version="1.0" encoding="XML"?>
     <meta charset="HTML5">
     <meta http-equiv="Content-type" content="text/html;charset=HTML4" />
     '''.strip()
     # The expected order differs from the order of appearance in the markup.
     expected = ['HTML5', 'HTML4', 'XML']
     found = get_encodings_from_content(markup)
     assert found == expected
开发者ID:PoNote,项目名称:requests,代码行数:7,代码来源:test_utils.py

示例2: find_encoding

def find_encoding(content, headers=None):
    """Work out which encoding to use for ``content``.

    Sources are consulted in order: the response headers (an
    ``ISO-8859-1`` answer is discarded), the charset declared inside the
    content, and finally ``chardet`` when that module is available.
    ``gb2312`` is reported as ``gb18030``.

    :param content: the payload to inspect.
    :param headers: optional response headers; skipped when falsy.
    :return: ``'unicode'`` if ``content`` is already a unicode object,
        otherwise the encoding name, or ``'latin_1'`` when no source
        produced one.
    """
    if isinstance(content, unicode):
        # Already decoded text: nothing to detect.
        return 'unicode'

    detected = None

    # 1. charset announced in the headers
    if headers:
        detected = get_encoding_from_headers(headers)
        if detected == 'ISO-8859-1':
            detected = None

    # 2. charset declared in the content itself (first declaration wins)
    if not detected:
        declared = get_encodings_from_content(content)
        detected = (declared[0] if declared else None) or None

    # 3. auto-detection, only when chardet could be imported
    if not detected and chardet is not None:
        detected = chardet.detect(content)['encoding']

    if detected and detected.lower() == 'gb2312':
        detected = 'gb18030'

    if detected:
        return detected
    return 'latin_1'
开发者ID:Crooky,项目名称:qiandao,代码行数:26,代码来源:utils.py

示例3: guess_response_encoding

def guess_response_encoding(resp):
    '''
    Guess the content encoding of a requests response.

    Candidates are tried in this order:

    1. every encoding declared inside the content,
    2. the encoding taken from the response headers,
    3. whatever ``detect`` reports for the content.

    For steps 1 and 2 a candidate is only returned if ``resp.content``
    really decodes with it; the result of step 3 is returned unverified.

    Note: there's a performance issue due to chardet.

    :param resp: response object exposing ``content`` and ``headers``.
    :return: the guessed encoding (the detector's ``'encoding'`` value in
        step 3).
    '''
    # A declared charset can be wrong for the payload (UnicodeDecodeError)
    # or name a codec Python does not know (LookupError).  Both mean "this
    # candidate does not work", so skip it rather than abort the guess.
    unusable = (UnicodeDecodeError, LookupError)

    # first try the encodings declared in the response content
    encs = get_encodings_from_content(resp.content) or []
    for enc in encs:
        try:
            resp.content.decode(enc)
        except unusable:
            LOG.debug('Encoding from response content doesn\'t work.')
        else:
            LOG.info('Detected encoding %s from response content.', enc)
            return enc

    # then the encoding supplied by the response header
    enc = get_encoding_from_headers(resp.headers)
    if enc:
        try:
            resp.content.decode(enc)
        except unusable:
            LOG.debug('Encoding from response header doesn\'t work.')
        else:
            LOG.info('Detected encoding %s from response header.', enc)
            return enc

    # neither encoding works, we have to go the hard way.
    start = clock()
    g = detect(resp.content)
    LOG.info('Detected encoding %s with cofidence of %g in %gs.' % (g['encoding'], g['confidence'], clock() - start))
    return g['encoding']
开发者ID:dirtysalt,项目名称:dirtysalt.github.io,代码行数:30,代码来源:utils.py

示例4: encoding

    def encoding(self):
        """Return the encoding of ``self.content``, computing it at most once.

        The answer is stored in ``self._encoding`` and reused on later
        calls.  Sources, in order: the headers (``ISO-8859-1`` is
        ignored), the charset declared in the content, then ``chardet``
        when available.  ``gb2312`` is reported as ``gb18030`` and
        ``'utf-8'`` is the final default.  ``'unicode'`` is returned,
        without being cached, when the content is already unicode.
        """
        # Reuse the cached answer when there is one.
        if hasattr(self, '_encoding'):
            return self._encoding

        if isinstance(self.content, unicode):
            return 'unicode'

        # Charset from the headers.
        guess = get_encoding_from_headers(self.headers)
        if guess == 'ISO-8859-1':
            guess = None

        # Charset declared inside the content; the first declaration wins.
        if not guess:
            declared = get_encodings_from_content(self.content)
            guess = (declared[0] if declared else None) or None

        # Last resort: auto-detection.
        if not guess and chardet is not None:
            guess = chardet.detect(self.content)['encoding']

        if guess and guess.lower() == 'gb2312':
            guess = 'gb18030'

        if not guess:
            guess = 'utf-8'
        self._encoding = guess
        return self._encoding
开发者ID:5aket,项目名称:pyspider,代码行数:27,代码来源:response.py

示例5: encoding

def encoding(rsp):
    """
    Determine the encoding of ``rsp.content``.

    The charset from the headers is tried first (``ISO-8859-1`` is
    ignored), then the charset declared in the content, then chardet
    when it is available.  ``gb2312`` is reported as ``gb18030`` and
    ``'utf-8'`` is the final default.  ``'unicode'`` is returned when
    the content is already text.
    """
    if isinstance(rsp.content, six.text_type):
        # Nothing to detect: the content is already decoded.
        return 'unicode'

    # Charset from the headers.
    result = get_encoding_from_headers(rsp.headers)
    if result == 'ISO-8859-1':
        result = None

    # Charset declared inside the content; skipped when the helper
    # itself is falsy.
    if not result and get_encodings_from_content:
        declared = get_encodings_from_content(rsp.content)
        result = (declared[0] if declared else None) or None

    # Auto-detection as the last resort.
    if not result and chardet is not None:
        result = chardet.detect(rsp.content)['encoding']

    if result and result.lower() == 'gb2312':
        result = 'gb18030'

    return result or 'utf-8'
开发者ID:zymtech,项目名称:parse_newspage,代码行数:30,代码来源:parserstandalone.py

示例6: _fetchContent

    def _fetchContent(self):
        """Download ``self.url`` and return its body decoded to text.

        The encoding is the first charset declared inside the page.  When
        the page declares none, the URL is opened a second time and the
        ``charset`` parameter of its response headers is used, with UTF-8
        as the fallback if that is missing too.  The chosen encoding is
        stored in ``self.encoding``.

        Side effect: the interpreter-wide default encoding is switched to
        the chosen encoding.
        """
        r = requests.get(self.url)

        # Scan the body once and reuse the result.
        declared = get_encodings_from_content(r.content)
        if declared:
            self.encoding = declared[0]
        else:
            from contextlib import closing
            from urllib2 import urlopen
            with closing(urlopen(self.url)) as f:
                # The header may carry no charset parameter; fall back to
                # UTF-8 so the calls below never receive an empty value.
                self.encoding = f.info().getparam("charset") or 'utf-8'

        # Set the system default encoding
        reload(sys)
        sys.setdefaultencoding(self.encoding)

        content = r.content.decode(self.encoding)

        return content
开发者ID:Lab-317,项目名称:NewsParser,代码行数:18,代码来源:NewsParser.py

示例7: encoding

    def encoding(self):
        """
        Encoding of ``self.content``, computed at most once.

        The answer is stored in ``self._encoding`` and reused on later
        calls.  Sources, in order: the headers (``ISO-8859-1`` is
        ignored), the charset declared in the content, then chardet if
        available.  ``gb2312`` is reported as ``gb18030`` and ``'utf-8'``
        is the final default.  ``'unicode'`` is returned, without being
        cached, when the content is already text.
        """
        # Reuse the cached answer when there is one.
        if hasattr(self, '_encoding'):
            return self._encoding

        if isinstance(self.content, six.text_type):
            return 'unicode'

        # Charset from the headers.
        guess = get_encoding_from_headers(self.headers)
        if guess == 'ISO-8859-1':
            guess = None

        # Charset declared inside the content; skipped when the helper
        # itself is falsy.
        if not guess and get_encodings_from_content:
            if six.PY3:
                # On Python 3 only the first 100 items of the content are
                # converted with pretty_unicode and scanned.
                head = utils.pretty_unicode(self.content[:100])
                declared = get_encodings_from_content(head)
            else:
                declared = get_encodings_from_content(self.content)
            guess = (declared[0] if declared else None) or None

        # Last resort: auto-detection on the first 600 items of the content.
        if not guess and chardet is not None:
            guess = chardet.detect(self.content[:600])['encoding']

        if guess and guess.lower() == 'gb2312':
            guess = 'gb18030'

        if not guess:
            guess = 'utf-8'
        self._encoding = guess
        return self._encoding
开发者ID:01jiagnwei01,项目名称:pyspider,代码行数:36,代码来源:response.py

示例8: procdata_getencoding

def procdata_getencoding(seed,headers,content):

	code = utils.get_encoding_from_headers(headers)
	if code:
		if code.lower() == 'gbk' or code.lower() == 'gb2312':
			code = 'gbk'
		elif code.lower() == 'utf-8':
			code = 'utf-8'
		else:
			code = None

	if code == None:
		code = utils.get_encodings_from_content(content)
		print "content",seed,code
		if code:
			code = code[0]
			if code.lower() == 'gbk' or code.lower() == 'gb2312':
				code = 'gbk'

	return code
开发者ID:salmonx,项目名称:fengbei,代码行数:20,代码来源:daemon.py

示例9: guess_content_encoding

def guess_content_encoding(content):
    '''
    Guess the encoding for plain content.

    Every encoding declared inside the content is tried first, and the
    first one that really decodes ``content`` is returned.  If none
    works, the result of ``detect`` is returned unverified.

    Note: there's a performance issue due to chardet.

    :param content: the payload to inspect; it must provide ``decode``.
    :return: the guessed encoding (the detector's ``'encoding'`` value
        when falling back to detection).
    '''
    # first try the encodings supplied by the content
    encs = get_encodings_from_content(content) or []
    for enc in encs:
        try:
            content.decode(enc)
        except (UnicodeDecodeError, LookupError):
            # UnicodeDecodeError: the declared charset does not fit the
            # payload.  LookupError: the declared charset is not a codec
            # Python knows.  Either way, move on to the next candidate.
            LOG.debug('Encoding from content doesn\'t work.')
        else:
            LOG.info('Detected encoding %s from content.', enc)
            return enc

    # neither encoding works, we have to go the hard way.
    start = clock()
    g = detect(content)
    LOG.info('Detected encoding %s with cofidence of %g in %gs.' % (g['encoding'], g['confidence'], clock() - start))
    return g['encoding']
开发者ID:dirtysalt,项目名称:dirtysalt.github.io,代码行数:21,代码来源:utils.py

示例10: filter_encoding

    def filter_encoding(self,seed, headers,content):

        code = utils.get_encoding_from_headers(headers)
        if code:
            if code.lower() == 'gbk' or code.lower() == 'gb2312':
                code = 'gbk'
                return True
            elif code.lower() == 'utf-8' or code.lower() == 'utf8':
                code = 'utf8'
                # as for utf8, we should check the content
            else: #  'ISO-8859-1' and so on, 
                code = None

        # chinese website may also miss the content-encoding header, so detect the content
        if code == None:
            codes = utils.get_encodings_from_content(content)
            if codes:
                for code in codes:
                    if code.lower() in [ 'gbk','gb2312']:
                        return True
                    elif code.lower() == 'utf8' or code.lower() == 'utf-8':
                        code = 'utf8'
                        break
       
        if code != 'utf8':
            return False
 
        # here handle utf8
        # to detect any chinese char win
        try:
            ucon = content.decode('utf8')
            for uchar in ucon:
                i = ord(uchar)
                if i >= 0x4e00 and i <= 0x9fa5:
                    return True
        except Exception, e:
            print url, e
            pass
开发者ID:salmonx,项目名称:fengbei,代码行数:38,代码来源:worker_filter.py

示例11: test_pragmas

 def test_pragmas(self, content):
     """The given content must declare exactly one encoding, 'UTF-8'."""
     detected = get_encodings_from_content(content)
     count = len(detected)
     assert count == 1
     first = detected[0]
     assert first == 'UTF-8'
开发者ID:PoNote,项目名称:requests,代码行数:4,代码来源:test_utils.py

示例12: test_none

 def test_none(self):
     """Empty content must yield no encodings at all."""
     assert len(get_encodings_from_content('')) == 0
开发者ID:PoNote,项目名称:requests,代码行数:3,代码来源:test_utils.py

示例13: on_incoming

	def on_incoming(self, msg):
		"""Announce information about every URL found in a channel message.

		Messages whose type is not ``msg.CHANNEL`` are ignored.  For each
		URL matched by ``self.url_re`` a HEAD request (following
		redirects) is made.  HTML pages are then fetched and their first
		``<title>`` or ``<h1>`` is sent to the channel; for any other
		resource the file name, content type and size are sent instead.

		Any exception is caught and printed; it also ends processing of
		the remaining URLs in the message.
		"""
		if not msg.type == msg.CHANNEL:
			return

		# Same limit for the HEAD and the GET request, so a silent server
		# cannot block the handler forever.
		timeout = 5

		# Catching all exceptions without alerting, as there is just so much crap that can go wrong with web stuff. Also, I'm lazy.
		try:
			urls = self.url_re.findall(msg.body)
			for url in urls:
				# Catch edge case where url is in brackets
				while url.startswith('(') and url.endswith(')'):
					url = url[1:-1]

				head = requests.head(url, allow_redirects=True, timeout=timeout)
				# work on the URL we were redirected to, if any
				url = head.url

				message = ""
				content_type = head.headers['content-type']

				# HTML websites
				if 'text/html' in content_type:
					# Set up any required request headers
					req_headers = {}
					# TODO: Accept-Language header from config

					req = requests.get(url, headers=req_headers, timeout=timeout)

					if 'charset' not in content_type:
						# requests only looks at headers to detect the encoding, we must find the charset ourselves
						# we can't use req.content because regex doesn't work on bytestrings apparently
						encodings = get_encodings_from_content(req.text)
						if encodings:
							req.encoding = encodings[0]

					soup = BeautifulSoup(req.text)

					# Look for the <title> tag or an <h1>, whichever is first
					title = soup.find(['title', 'h1'])
					if title is None:
						# Nothing to report for this page; still handle the
						# other URLs of the message.
						continue
					title = self.utils.tag_to_string(title)
					# Collapse all runs of whitespace into single spaces
					title = ' '.join(title.split())
					message = "Title: " + title

				# Other resources
				else:
					content_length = head.headers.get('content-length', '')
					if content_length.isdigit():
						size = self.sizeof_fmt(int(content_length))
					else:
						size = "Unknown size"

					# Searches for the last segment of the URL (the filename)
					filename = re.search(r'/([^/]+)/?$', url).group(1)

					message = "{}: {} ({})".format(filename, content_type, size)

				self.bot.privmsg(msg.channel, message)

		except Exception as exception:
			print("Link Info Exception!")
			print(type(exception), exception)
开发者ID:ackwell,项目名称:ninjabot,代码行数:62,代码来源:linkinfo.py


注:本文中的requests.utils.get_encodings_from_content函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。