本文整理汇总了 Python 中 chardet.detect 方法的典型用法代码示例。如果您正苦于以下问题：Python chardet.detect 方法的具体用法？Python chardet.detect 怎么用？Python chardet.detect 使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块 chardet 的用法示例。
在下文中一共展示了 chardet.detect 方法的 15 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: get_title
# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def get_title(url):
    """Fetch ``url`` and return a one-line summary containing its page title.

    Args:
        url: Address requested through the module-level ``req`` client.

    Returns:
        ``"<url> | <title>"`` when the page has a non-empty ``<title>``,
        otherwise ``"<url> | Status_code: <code>"`` where ``<code>`` is the
        HTTP status, or ``0`` if the request itself failed.
    """
    code = 0
    try:
        r = req.get(url)
        code = r.status_code
        # The detected encoding may be None; fall back to UTF-8 instead of
        # letting ``decode(None)`` raise and discard the title.
        coding = chardet.detect(r.content).get('encoding') or 'utf-8'
        # Only the head of the document is needed for <title>. The byte slice
        # can cut a multi-byte character in half, so decode leniently.
        text = r.content[:10000].decode(coding, 'replace')
        title = etree.HTML(text).xpath('//title/text()')
        if title:
            return url + ' | ' + title[0]
    except Exception:
        # Best effort: any network, decoding or parsing failure degrades to
        # the status-code summary below. KeyboardInterrupt/SystemExit are no
        # longer swallowed.
        pass
    return url + ' | Status_code: ' + str(code)
示例2: webtables_iter
# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def webtables_iter(path):
    """Yield ``(index, table)`` pairs parsed from a gzip file of JSON lines.

    Each line is decoded as UTF-8 first. If that raises
    ``UnicodeDecodeError`` the encoding guessed by ``chardet`` is tried, and
    the line is skipped when that second attempt fails for any reason.
    ``index`` counts only the tables that were actually yielded.
    """
    with gzip.open(path, 'rb') as stream:
        emitted = 0  # number of tables successfully yielded so far
        for raw_line in stream:
            try:
                table = json.loads(raw_line.decode('utf-8'))
                yield (emitted, table)
                emitted += 1
            except UnicodeDecodeError:
                guessed = chardet.detect(raw_line)['encoding']
                try:
                    table = json.loads(raw_line.decode(guessed))
                    yield (emitted, table)
                    emitted += 1
                except Exception:
                    # Unparseable even with the guessed encoding: drop it.
                    pass
示例3: main
# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def main(src_file, dst_file, **kwargs):
    """Convert the policy file ``src_file`` to YAML written at ``dst_file``.

    The source is read as bytes and decoded with the encoding guessed by
    ``chardet``. If one of its lines is exactly ``[Unicode]`` the lines go
    through ``_convert_secedit``, otherwise through ``_convert_regpol``.
    The result is dumped with ``yaml.safe_dump`` in block style.
    ``kwargs`` is accepted but not used here.
    """
    with open(src_file, mode='rb') as source:
        raw_bytes = source.read()

    guessed = chardet.detect(raw_bytes)['encoding']
    lines = raw_bytes.decode(guessed).splitlines()

    convert = _convert_secedit if '[Unicode]' in lines else _convert_regpol
    policies = convert(lines)

    with open(dst_file, mode='w') as target:
        yaml.safe_dump(policies, target, default_flow_style=False)
示例4: _render_and_compare_dot_files
# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def _render_and_compare_dot_files(self, directory):
    """Render each ``.dot`` file in ``directory`` twice and compare results.

    Every file is passed to ``self._render_with_pydot`` and
    ``self._render_with_graphviz`` with the same encoding, and the two
    return values must be equal. A ``#`` is written to stdout per file as a
    progress marker.
    """
    # Encodings pinned by hand for files that confuse `chardet`.
    pinned = {'Latin1.dot': 'latin-1'}
    names = [entry for entry in os.listdir(directory)
             if entry.endswith('.dot')]
    for name in names:
        full_path = os.path.join(directory, name)
        with open(full_path, 'rb') as handle:
            raw = handle.read()
        guess = chardet.detect(raw)
        encoding = pinned.get(name, guess['encoding'])

        out = os.sys.stdout
        out.write('#')
        out.flush()

        shas = (self._render_with_pydot(full_path, encoding),
                self._render_with_graphviz(full_path, encoding))
        assert shas[0] == shas[1], shas
示例5: detect_codec
# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def detect_codec(bytedata, filename):
    """Return the first codec in ``DEFAULT_CONF['codec_chain']`` that fits.

    A chain entry is either the literal ``'chardet'``, meaning "ask chardet
    and accept its answer when the confidence is not below
    ``DEFAULT_CONF['confi_thres']``", or a codec name that is accepted when
    ``bytedata`` decodes strictly with it. ``filename`` is used only in
    debug log messages. Returns ``None`` when no entry fits.
    """
    for codec in DEFAULT_CONF['codec_chain']:
        if codec != 'chardet':
            # Plain codec name: accepted only if a strict decode succeeds.
            try:
                bytedata.decode(codec, 'strict')
            except UnicodeDecodeError:
                log.debug(f"{filename} is not {codec}-encoded.")
                continue
            return codec

        chr_res = chardet.detect(bytedata)
        if chr_res['encoding'] and not chr_res['confidence'] < DEFAULT_CONF['confi_thres']:
            return normalize_codec_name(chr_res['encoding'])
        log.debug(f"The codec of {filename} is unable to detect, the result is {chr_res}.")
    # Chain exhausted without a match.
    return None
# end of detect_codec(bytedata, filename):
示例6: to_unicode
# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def to_unicode(original, *args):
    """Convert ``original`` to text, trying progressively looser strategies.

    Strategies, in order:
      1. ``original`` is already ``str``: returned unchanged.
      2. ``six.text_type(original, *args)``.
      3. ``original.decode(<detected encoding>)`` when the detector reports
         a confidence above 0.8.
      4. ``ek(original, *args)``.

    Returns:
        The converted text, or the literal ``'ERROR DECODING STRING'`` when
        every strategy failed (the failure is logged with its traceback).
    """
    logger.debug('def to_unicode started')
    try:
        if isinstance(original, str):
            return original

        try:
            return six.text_type(original, *args)
        except Exception:
            pass

        # A failure of the detector itself is treated as a hard error and
        # handled by the outer ``except``, as before.
        detected = detect(original)
        try:
            if detected.get('confidence') > 0.8:
                return original.decode(detected.get('encoding'))
        except Exception:
            # Low-quality guess or undecodable bytes: use the last resort.
            pass
        return ek(original, *args)
    except Exception:
        import traceback
        # Pass the values as separate logging arguments. The original passed
        # a single tuple for two ``%s`` placeholders, so the message could
        # not be formatted and was never logged.
        logger.error('Unable to decode value "%s..." : %s ',
                     repr(original)[:20], traceback.format_exc())
        return 'ERROR DECODING STRING'
示例7: read_content
# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def read_content(dir_file):
    """
    Read the file as bytes and return its content decoded to text.

    The encoding is guessed with ``chardet``. When strict decoding fails a
    warning is issued and undecodable bytes are dropped instead.

    Arguments:
        dir_file {[str]} -- path of the file to read

    Returns:
        [str] -- decoded content of the file
    """
    with open(dir_file, "rb") as rb:
        content = rb.read()
    # The detected encoding may be None; default to UTF-8 so that neither
    # decode attempt is handed ``None``.
    encoder_code = chardet.detect(content)["encoding"] or "utf-8"
    try:
        return content.decode(encoder_code)
    except LookupError:
        # The detected name is not a codec Python knows; retrying with it
        # would fail again, so switch to UTF-8 for the lenient decode.
        encoder_code = "utf-8"
    except UnicodeDecodeError:
        pass
    message = "This file code {} is error, and ignored".format(dir_file)
    warnings.warn(message)
    return content.decode(encoder_code, "ignore")
示例8: finalize
# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def finalize(self):
    """Finalize this Report and send it off as an email.

    The formatter's closing output is written, the accumulated report text
    is attached inline and, when ``self._attach_logfile`` and
    ``self._logfile`` are both set, the log file is attached too. Failure
    to read the log file is non-fatal: the report is sent without it.
    """
    self.write(self._formatter.finalize())
    report = ezmail.MIMEText.MIMEText(self._fo.getvalue(),
                                      self._formatter.MIMETYPE.split("/")[1])
    report["Content-Disposition"] = "inline"
    self._message.attach(report)
    if self._attach_logfile and self._logfile:
        try:
            # ``with`` closes the handle; the original leaked it.
            with open(self._logfile, "rb") as logfile:
                lfd = logfile.read()
        except Exception:
            pass  # non-fatal
        else:
            # chardet.detect() returns a result dict; the charset name is
            # its 'encoding' entry, not the dict itself.
            # NOTE(review): 'encoding' may be None for undetectable data;
            # confirm how ezmail.MIMEText treats charset=None.
            logmsg = ezmail.MIMEText.MIMEText(
                lfd, charset=chardet.detect(lfd)["encoding"])
            logmsg["Content-Disposition"] = 'attachment; filename=%s' % (
                os.path.basename(self._logfile), )
            self._message.attach(logmsg)
    ezmail.mail(self._message)
示例9: skipwrap
# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def skipwrap(para):
    """Return True when paragraph ``para`` must not be re-wrapped.

    Code blocks (four leading spaces or a leading tab) and list-like
    paragraphs are left alone. A paragraph starting with exactly two
    dashes is treated as an em dash and may be wrapped.
    """
    # If the text begins with four spaces or one tab, it's a code block; don't wrap.
    # ``startswith`` compares against the full four-space prefix and is safe
    # for an empty string, where ``para[0]`` would raise IndexError.
    if para.startswith(('    ', '\t')):
        return True
    # If the text begins with only two "--", possibly preceded by whitespace, that's
    # an emdash; so wrap.
    stripped = para.lstrip()
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False
    # I'm not sure what this is for; I thought it was to detect lists, but there's
    # a <br>-inside-<span> case in one of the tests that also depends upon it.
    if stripped[0:1] == '-' or stripped[0:1] == '*':
        return True
    # If the text begins with a single -, *, or +, followed by a space, or an integer,
    # followed by a ., followed by a space (in either case optionally preceded by
    # whitespace), it's a list; don't wrap.
    if ordered_list_matcher.match(stripped) or unordered_list_matcher.match(stripped):
        return True
    return False
示例10: check_encoding
# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def check_encoding():
    """Print the encoding chardet detects for every regular file in ``./data``."""
    base_path = os.path.abspath("./data")
    onlyfiles = [f for f in listdir(base_path) if isfile(join(base_path, f))]
    # for each of the file
    for filepath in onlyfiles:
        # form the full file path
        refFile_path = os.path.join(base_path, filepath)
        # Read raw bytes: encoding detection works on the undecoded data,
        # and ``with`` closes the handle the original left open.
        with open(refFile_path, "rb") as ref_file:
            rawdata = ref_file.read()
        result = chardet.detect(rawdata)
        charenc = result['encoding']
        # Call form prints the same text on Python 2 and works on Python 3.
        print(charenc)
# check_for_differences() - read all the public facility csv columns and look for
# 1) Common column headers among all csv files
# 2) Unique column headers among all csv files
示例11: readPFM
# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def readPFM(file):
    """Load a PFM image file.

    Args:
        file: Path of the PFM file to read.

    Returns:
        ``(data, scale)``: ``data`` is an array of 32-bit floats with shape
        ``(height, width, 3)`` for a ``PF`` header or ``(height, width)``
        for a ``Pf`` header, flipped top-to-bottom with ``np.flipud``;
        ``scale`` is the absolute value of the header's scale field.

    Raises:
        Exception: if the first line is neither ``PF`` nor ``Pf``, or the
            dimension line is malformed.
    """
    # ``with`` guarantees the handle is closed, also when a header check
    # raises. The original never closed it.
    with open(file, 'rb') as stream:
        # The three header lines are plain ASCII text, so no encoding
        # detection is needed. 'replace' turns stray bytes into characters
        # that simply fail the checks below instead of raising decode errors.
        header = stream.readline().rstrip().decode('ascii', 'replace')
        if header == 'PF':
            color = True
        elif header == 'Pf':
            color = False
        else:
            raise Exception('Not a PFM file.')

        dim_line = stream.readline().decode('ascii', 'replace')
        dim_match = re.match(r'^(\d+)\s(\d+)\s$', dim_line)
        if not dim_match:
            raise Exception('Malformed PFM header.')
        width, height = map(int, dim_match.groups())

        scale = float(stream.readline().rstrip().decode('ascii', 'replace'))
        if scale < 0:  # little-endian
            endian = '<'
            scale = -scale
        else:
            endian = '>'  # big-endian

        # Everything after the header is the raw float payload.
        data = np.fromfile(stream, endian + 'f')

    shape = (height, width, 3) if color else (height, width)
    data = np.reshape(data, shape)
    data = np.flipud(data)
    return data, scale
示例12: get_title
# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def get_title(url):
    """Fetch ``url`` and describe the applications fingerprinted on it.

    Args:
        url: Address requested through the module-level ``req`` client.

    Returns:
        ``"URL: <url> | Fingerprint: <app> , <app> ..."`` when
        ``WebPage(...).info()`` reports a non-empty ``apps`` entry;
        ``None`` otherwise, including when the request or parsing fails.
    """
    try:
        r = req.get(url)
        # The detected encoding may be None; fall back to UTF-8 instead of
        # letting ``decode(None)`` raise and drop the result.
        coding = chardet.detect(r.content).get('encoding') or 'utf-8'
        # The byte slice may end inside a multi-byte character, so decode
        # leniently rather than failing on the truncated tail.
        text = r.content[:10000].decode(coding, 'replace')
        apps = WebPage(r.url, text, r.headers).info().get('apps')
        if apps:
            return 'URL: ' + url + ' | Fingerprint: ' + ' , '.join(apps)
    except Exception:
        # Best effort: a failed lookup yields no fingerprint.
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        pass
    return None
示例13: web_info
# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def web_info(url):
    """Collect host, WAF, fingerprint and OS information for ``url``.

    Returns:
        A 3-tuple ``(data, apps, title)``. ``data`` maps the parsed host to
        a dict with the keys ``WAF``, ``Ipaddr``, ``Address``, ``Webinfo``
        and ``OS``; ``apps`` and ``title`` are taken from the page info
        (``None`` when the page could not be analysed).
    """
    host = parse_host(url)
    ipaddr = parse_ip(host)
    url = url.strip('/')
    address = geoip(ipaddr)
    wafresult = checkwaf(url)
    req = Requests()
    # noinspection PyBroadException
    try:
        response = req.get(url)
        # Let the response decode its text with the detected encoding.
        response.encoding = chardet.detect(response.content).get('encoding')
        webinfo = WebPage(response.url, response.text, response.headers).info()
    except Exception as e:
        logging.exception(e)
        webinfo = {}

    if not webinfo:
        # Nothing could be fingerprinted, so the WAF result is blanked too.
        webinfo = {}
        wafresult = 'None'
    else:
        console('Webinfo', host, 'title: {}\n'.format(webinfo.get('title')))
        console('Webinfo', host, 'Fingerprint: {}\n'.format(webinfo.get('apps')))
        console('Webinfo', host, 'Server: {}\n'.format(webinfo.get('server')))
        console('Webinfo', host, 'WAF: {}\n'.format(wafresult))

    osname = osdetect(host) if iscdn(host) else None

    summary = {
        'WAF': wafresult,
        'Ipaddr': ipaddr,
        'Address': address,
        'Webinfo': webinfo,
        'OS': osname,
    }
    return {host: summary}, webinfo.get('apps'), webinfo.get('title')
示例14: check404
# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def check404(self, url):
    """Probe a random, presumably non-existent page to profile "not found" responses.

    Requests ``<url>/<random>.html`` and records on ``self``:
      * ``notstr``: ``'404page'``, or ``self.parse_html(text)`` of the body
        when the server answers 200 and the encoding can be detected;
      * ``notlen``: the ``Content-Length`` header, or the body length when
        the header is absent;
      * ``goto``: the ``Location`` header when the response is a redirect.

    Network errors are ignored; any other error is logged.
    """
    # Request a random page and record the length and content of the 404 page.
    key = str(random.random() * 100)
    random_url = base64.b64encode(key.encode('utf-8'))
    url = url + '/' + random_url.decode('utf-8') + '.html'
    try:
        self.notstr = '404page'
        r = self.req.get(url)
        # The status code is an integer. The original compared it with the
        # string '200', which is never equal, so this branch was dead.
        if r.status_code == 200:
            coding = chardet.detect(r.content[:10000]).get('encoding')
            if coding:
                # The slice may end inside a multi-byte character; decode
                # leniently so the truncated tail cannot abort the probe.
                text = r.content[:20000].decode(coding, 'replace')
                self.notstr = self.parse_html(text)
        self.notlen = r.headers.get('Content-Length')
        if not self.notlen:
            self.notlen = len(r.content)
        if r.is_redirect:
            self.goto = r.headers['Location']
    except (requests.exceptions.ConnectTimeout, requests.exceptions.ReadTimeout, requests.exceptions.Timeout,
            requests.exceptions.SSLError, requests.exceptions.ConnectionError, ssl.SSLError, AttributeError,
            ConnectionRefusedError, socket.timeout, urllib3.exceptions.ReadTimeoutError,
            urllib3.exceptions.ProtocolError, OpenSSL.SSL.WantReadError):
        pass
    except UnboundLocalError:
        pass
    except Exception as e:
        logging.exception(e)
示例15: encoding
# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def encoding(self):
    """
    Name of the encoding of ``self.content``.

    A value already stored in ``self._encoding`` is returned as is. Text
    content reports ``'unicode'``. Otherwise the name comes from
    ``get_encoding(headers, content)``, then from chardet (when available)
    run on the first 600 bytes, and finally defaults to ``'utf-8'``; the
    result is cached in ``self._encoding``.
    """
    if hasattr(self, '_encoding'):
        return self._encoding

    if isinstance(self.content, six.text_type):
        # Content is already text; nothing is cached for this case.
        return 'unicode'

    # Charset declared in the content-type header or in the content itself.
    guessed = get_encoding(self.headers, self.content)

    # Nothing declared: let chardet look at the head of the content.
    if not guessed and chardet is not None:
        guessed = chardet.detect(self.content[:600])['encoding']

    # A gb2312 result is replaced by gb18030.
    if guessed and guessed.lower() == 'gb2312':
        guessed = 'gb18030'

    self._encoding = guessed if guessed else 'utf-8'
    return self._encoding