当前位置: 首页>>代码示例>>Python>>正文


Python chardet.detect方法代码示例

本文整理汇总了 Python 中 chardet.detect 方法的典型用法代码示例。如果您正苦于以下问题:Python chardet.detect 方法的具体用法是什么?Python chardet.detect 怎么用?有哪些 Python chardet.detect 的使用例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块 chardet 的用法示例。


在下文中一共展示了chardet.detect方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: get_title

# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def get_title(url):
    """Fetch *url* and return a one-line summary with the page title.

    Args:
        url: Address to request through ``req.get``.

    Returns:
        ``"<url> | <title>"`` when the page has a ``<title>`` text node,
        otherwise ``"<url> | Status_code: <code>"``. ``<code>`` stays ``0``
        when no response was obtained.
    """
    code = 0

    try:
        r = req.get(url)
        code = r.status_code
        # chardet reports None when it cannot guess; fall back to UTF-8.
        coding = chardet.detect(r.content).get('encoding') or 'utf-8'
        # The 10000-byte cut can land inside a multi-byte character, so decode
        # leniently instead of losing the title to a UnicodeDecodeError.
        text = r.content[:10000].decode(coding, 'replace')
        html = etree.HTML(text)
        title = html.xpath('//title/text()')
        if title:
            return url + ' | ' + title[0]
    except Exception:
        # Best effort: any failure falls through to the status-code summary.
        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
        pass

    return url + ' | Status_code: ' + str(code)
开发者ID:al0ne,项目名称:Vxscan,代码行数:20,代码来源:get_title.py

示例2: webtables_iter

# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def webtables_iter(path):
    """Yield ``(index, table)`` pairs parsed from a gzip file of JSON lines.

    Each line is decoded as UTF-8 first. If that raises
    ``UnicodeDecodeError``, the encoding guessed by ``chardet`` is tried, and
    the line is skipped when that second attempt fails for any reason.
    ``index`` counts only the lines that were successfully yielded.

    Args:
        path: Path of the gzip-compressed input file.
    """
    emitted = 0  # number of tables yielded so far
    with gzip.open(path, 'rb') as stream:
        for raw_line in stream:
            try:
                table = json.loads(raw_line.decode('utf-8'))
            except UnicodeDecodeError:
                guessed = chardet.detect(raw_line)['encoding']
                try:
                    table = json.loads(raw_line.decode(guessed))
                except Exception:
                    # Second attempt failed too: drop this line.
                    continue
            yield (emitted, table)
            emitted += 1
开发者ID:megagonlabs,项目名称:sato,代码行数:21,代码来源:read_raw_data.py

示例3: main

# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def main(src_file, dst_file, **kwargs):
    """Convert a policy source file to YAML.

    The source is read as bytes, decoded with the encoding guessed by
    ``chardet`` and split into lines. If one line is exactly ``[Unicode]``
    the lines go to ``_convert_secedit``, otherwise to ``_convert_regpol``.
    The result is dumped to *dst_file* with ``yaml.safe_dump``.

    Args:
        src_file: Path of the file to read.
        dst_file: Path of the YAML file to write.
        **kwargs: Accepted for call compatibility; not used here.
    """
    with open(src_file, mode='rb') as f:
        raw = f.read()

    # chardet reports None for input it cannot classify (for example an empty
    # file); decoding with None would raise TypeError, so fall back to UTF-8.
    encoding = chardet.detect(raw)['encoding'] or 'utf-8'
    src = raw.decode(encoding).splitlines()

    if '[Unicode]' in src:
        policies = _convert_secedit(src)
    else:
        policies = _convert_regpol(src)

    with open(dst_file, mode='w') as dh_:
        yaml.safe_dump(policies, dh_, default_flow_style=False)
开发者ID:plus3it,项目名称:ash-windows-formula,代码行数:18,代码来源:convert-lgpo-policy.py

示例4: _render_and_compare_dot_files

# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def _render_and_compare_dot_files(self, directory):
        # files that confuse `chardet`
        encodings = {
            'Latin1.dot': 'latin-1'}
        dot_files = [
            fname for fname in os.listdir(directory)
            if fname.endswith('.dot')]
        for fname in dot_files:
            fpath = os.path.join(directory, fname)
            with open(fpath, 'rb') as f:
                s = f.read()
            estimate = chardet.detect(s)
            encoding = encodings.get(fname, estimate['encoding'])
            os.sys.stdout.write('#')
            os.sys.stdout.flush()
            pydot_sha = self._render_with_pydot(fpath, encoding)
            graphviz_sha = self._render_with_graphviz(fpath, encoding)
            assert pydot_sha == graphviz_sha, (pydot_sha, graphviz_sha) 
开发者ID:pydot,项目名称:pydot,代码行数:20,代码来源:pydot_unittest.py

示例5: detect_codec

# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def detect_codec(bytedata, filename):
    """Return the first codec from the configured chain that fits *bytedata*.

    Entries of ``DEFAULT_CONF['codec_chain']`` are tried in order. The special
    entry ``'chardet'`` asks ``chardet`` for a guess and accepts it only when
    an encoding is reported and its confidence is not below
    ``DEFAULT_CONF['confi_thres']``. Any other entry is accepted when the
    bytes decode strictly with it.

    Args:
        bytedata: Raw bytes to classify.
        filename: Name used only in debug log messages.

    Returns:
        The matching codec name (normalized with ``normalize_codec_name`` for
        a chardet guess), or ``None`` when nothing in the chain fits.
    """
    for codec in DEFAULT_CONF['codec_chain']:
        if codec != 'chardet':
            try:
                bytedata.decode(codec, 'strict')
            except UnicodeDecodeError:
                log.debug(f"{filename} is not {codec}-encoded.")
                continue
            else:
                return codec

        chr_res = chardet.detect(bytedata)
        if chr_res['encoding'] and not (chr_res['confidence'] < DEFAULT_CONF['confi_thres']):
            return normalize_codec_name(chr_res['encoding'])
        log.debug(f"The codec of {filename} is unable to detect, the result is {chr_res}.")

    # Chain exhausted without a usable codec.
    return None
# end of detect_codec(bytedata, filename):
开发者ID:x1angli,项目名称:cvt2utf,代码行数:22,代码来源:main.py

示例6: to_unicode

# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def to_unicode(original, *args):
    """Return *original* as text, trying several decoding strategies.

    Order of attempts for a non-``str`` value:
      1. ``six.text_type(original, *args)``;
      2. decoding with the encoding guessed by ``detect`` when an encoding is
         reported with confidence above 0.8;
      3. ``ek(original, *args)``.

    Args:
        original: Value to convert.
        *args: Extra positional arguments forwarded to ``six.text_type`` and
            ``ek`` (for example an encoding name).

    Returns:
        The text value, or the literal ``'ERROR DECODING STRING'`` when every
        strategy fails. The failure is logged with its traceback.
    """
    logger.debug('def to_unicode started')
    try:
        if isinstance(original, str):
            return original

        try:
            return six.text_type(original, *args)
        except Exception:
            pass  # fall through to charset detection

        detected = detect(original)
        encoding = detected.get('encoding')
        # Confidence may be missing or None; treat that as "no confidence".
        confidence = detected.get('confidence') or 0
        if encoding and confidence > 0.8:
            try:
                return original.decode(encoding)
            except Exception:
                pass  # detection was wrong; use the last resort below

        return ek(original, *args)
    except Exception:
        import traceback
        # Pass the values as separate logging arguments: a single tuple
        # cannot fill two %s placeholders and the record fails to format.
        logger.error('Unable to decode value "%s..." : %s ', repr(original)[:20], traceback.format_exc())
        return 'ERROR DECODING STRING'
开发者ID:piejanssens,项目名称:premiumizer,代码行数:26,代码来源:premiumizer.py

示例7: read_content

# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def read_content(dir_file):
    """Read a file as bytes and return its content decoded to text.

    The encoding is guessed with ``chardet``. When strict decoding fails, a
    warning is issued and undecodable bytes are dropped.

    Arguments:
        dir_file {str} -- path of the file to read

    Returns:
        str -- decoded file content
    """
    with open(dir_file, "rb") as rb:
        content = rb.read()

    # chardet reports None for empty or unclassifiable input; decoding with
    # None raises TypeError even in "ignore" mode, so fall back to UTF-8.
    encoder_code = chardet.detect(content)["encoding"] or "utf-8"
    try:
        return content.decode(encoder_code)
    except (UnicodeDecodeError, LookupError):
        message = "This file code {} is error, and ignored".format(dir_file)
        warnings.warn(message)

    try:
        return content.decode(encoder_code, "ignore")
    except LookupError:
        # The guessed codec name is unknown to Python.
        return content.decode("utf-8", "ignore")
开发者ID:Sohone-Guo,项目名称:Pointer-Generator,代码行数:25,代码来源:os.py

示例8: finalize

# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def finalize(self):
        """finalizing this Report sends off the email.

        The formatted report body is attached inline. When log attachment is
        enabled and a log file is set, the file is attached as well; a log
        file that cannot be read is skipped silently.
        """
        self.write(self._formatter.finalize())
        report = ezmail.MIMEText.MIMEText(self._fo.getvalue(),
                        self._formatter.MIMETYPE.split("/")[1])
        report["Content-Disposition"] = "inline"
        self._message.attach(report)
        if self._attach_logfile and self._logfile:
            try:
                # Context manager so the handle is closed even if read() fails.
                with open(self._logfile, "rb") as logfile:
                    lfd = logfile.read()
            except (IOError, OSError):
                pass # non-fatal
            else:
                # chardet.detect returns a dict; only its "encoding" entry is
                # a charset name. It is None when detection fails, in which
                # case the MIMEText default charset is used.
                charset = chardet.detect(lfd).get("encoding")
                if charset:
                    logmsg = ezmail.MIMEText.MIMEText(lfd, charset=charset)
                else:
                    logmsg = ezmail.MIMEText.MIMEText(lfd)
                logmsg["Content-Disposition"] = 'attachment; filename=%s' % (
                        os.path.basename(self._logfile), )
                self._message.attach(logmsg)
        ezmail.mail(self._message)
开发者ID:kdart,项目名称:pycopia,代码行数:20,代码来源:Email.py

示例9: skipwrap

# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
# 注意: 下面摘录的 skipwrap 函数体本身并未直接调用 chardet.detect。
def skipwrap(para):
    """Return True when paragraph *para* must not be re-wrapped.

    Args:
        para: Paragraph text.

    Returns:
        True for code blocks (four leading spaces or a leading tab) and for
        list-like lines; False for an empty paragraph, an em-dash line and
        everything else.
    """
    # An empty paragraph has nothing to wrap; this also avoids indexing into
    # an empty string below.
    if not para:
        return False
    # If the text begins with four spaces or one tab, it's a code block; don't wrap
    if para[0:4] == '    ' or para[0:1] == '\t':
        return True
    # If the text begins with only two "--", possibly preceded by whitespace, that's
    # an emdash; so wrap.
    stripped = para.lstrip()
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False
    # I'm not sure what this is for; I thought it was to detect lists, but there's
    # a <br>-inside-<span> case in one of the tests that also depends upon it.
    if stripped[0:1] == '-' or stripped[0:1] == '*':
        return True
    # If the text begins with a single -, *, or +, followed by a space, or an integer,
    # followed by a ., followed by a space (in either case optionally preceded by
    # whitespace), it's a list; don't wrap.
    if ordered_list_matcher.match(stripped) or unordered_list_matcher.match(stripped):
        return True
    return False
开发者ID:schollz,项目名称:extract_recipe,代码行数:21,代码来源:extract_recipe.py

示例10: check_encoding

# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def check_encoding():
    """Print the chardet-detected encoding of each regular file in ``./data``.

    Sub-directories are skipped. One encoding name (or ``None`` when chardet
    cannot decide) is printed per file.
    """
    base_path = os.path.abspath("./data")
    # for each of the file
    for filename in listdir(base_path):
        # form the full file path
        ref_file_path = join(base_path, filename)
        if not isfile(ref_file_path):
            continue

        # chardet works on raw bytes, so read in binary mode; the context
        # manager closes the handle that was previously leaked.
        with open(ref_file_path, "rb") as ref_file:
            rawdata = ref_file.read()
        result = chardet.detect(rawdata)
        charenc = result['encoding']
        # Single-argument call form prints the same under Python 2 and 3.
        print(charenc)


# check_for_differences() - read all the public facility csv columns and look for
#  1) Common column headers among all csv files
#  2) Unique column headers among all csv files 
开发者ID:gazetteerhk,项目名称:census_explorer,代码行数:20,代码来源:getPubFacility_GeoJSON_with_CACODE.py

示例11: readPFM

# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def readPFM(file):
    """Read a PFM image and return ``(data, scale)``.

    The header encoding is guessed with ``chardet`` from the first line and
    reused for the dimension and scale lines.

    Args:
        file: Path of the PFM file to open.

    Returns:
        A tuple ``(data, scale)``. ``data`` is a float array reshaped to
        ``(height, width, 3)`` for a ``PF`` header or ``(height, width)`` for
        ``Pf`` and then flipped vertically; ``scale`` is the absolute value
        of the scale line.

    Raises:
        Exception: If the first line is neither ``PF`` nor ``Pf``, or the
            dimension line does not match ``<int> <int>``.
    """
    # Context manager: the handle used to stay open, also on error paths.
    with open(file, 'rb') as pfm:
        header = pfm.readline().rstrip()
        encode_type = chardet.detect(header)
        # chardet reports None for an empty header; decode it as ASCII so the
        # caller gets the 'Not a PFM file.' error instead of a TypeError.
        encoding = encode_type['encoding'] or 'ascii'
        header = header.decode(encoding)
        if header == 'PF':
            color = True
        elif header == 'Pf':
            color = False
        else:
            raise Exception('Not a PFM file.')

        dim_match = re.match(r'^(\d+)\s(\d+)\s$', pfm.readline().decode(encoding))
        if dim_match:
            width, height = map(int, dim_match.groups())
        else:
            raise Exception('Malformed PFM header.')

        scale = float(pfm.readline().rstrip().decode(encoding))
        if scale < 0: # little-endian
            endian = '<'
            scale = -scale
        else:
            endian = '>' # big-endian

        # Remaining bytes are the raw float samples.
        data = np.fromfile(pfm, endian + 'f')

    shape = (height, width, 3) if color else (height, width)

    data = np.reshape(data, shape)
    data = np.flipud(data)
    return data, scale
开发者ID:JiaRenChang,项目名称:PSMNet,代码行数:40,代码来源:readpfm.py

示例12: get_title

# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def get_title(url):
    """Fetch *url* and return a fingerprint summary line, if any.

    Args:
        url: Address to request through ``req.get``.

    Returns:
        ``'URL: <url> | Fingerprint: <apps>'`` when ``WebPage(...).info()``
        reports a non-empty ``'apps'`` entry, otherwise ``None`` (also on any
        request or parsing failure).
    """
    try:
        r = req.get(url)
        # chardet reports None when it cannot guess; fall back to UTF-8.
        coding = chardet.detect(r.content).get('encoding') or 'utf-8'
        # The 10000-byte cut can split a multi-byte character, so decode
        # leniently rather than discarding the whole page.
        text = r.content[:10000].decode(coding, 'replace')
        webinfo = WebPage(r.url, text, r.headers).info()
        apps = webinfo.get('apps')
        if apps:
            return 'URL: ' + url + ' | Fingerprint: ' + ' , '.join(apps)
    except Exception:
        # Best effort: failures yield None. KeyboardInterrupt/SystemExit are
        # deliberately not swallowed.
        pass
    return None
开发者ID:al0ne,项目名称:Vxscan,代码行数:12,代码来源:fingerprint.py

示例13: web_info

# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def web_info(url):
    """Collect host-level information for *url*.

    Resolves host, IP and geo address, runs the WAF check, fetches the page
    and fingerprints it with ``WebPage``. Page details are echoed through
    ``console`` when fingerprinting produced a result.

    Args:
        url: Target address; surrounding ``/`` characters are stripped before
            it is requested.

    Returns:
        A tuple ``(data, apps, title)`` where ``data`` maps the host to a
        dict with the keys ``WAF``, ``Ipaddr``, ``Address``, ``Webinfo`` and
        ``OS``, and ``apps`` / ``title`` come from the fingerprint result
        (``None`` when absent).
    """
    host = parse_host(url)
    ipaddr = parse_ip(host)
    url = url.strip('/')
    address = geoip(ipaddr)
    wafresult = checkwaf(url)
    req = Requests()
    # noinspection PyBroadException
    try:
        response = req.get(url)
        response.encoding = chardet.detect(response.content).get('encoding')
        webinfo = WebPage(response.url, response.text, response.headers).info()
    except Exception as e:
        logging.exception(e)
        webinfo = {}

    if not webinfo:
        # No fingerprint result: normalise to an empty dict and blank the WAF.
        webinfo = {}
        wafresult = 'None'
    else:
        report_lines = (
            ('title: {}\n', webinfo.get('title')),
            ('Fingerprint: {}\n', webinfo.get('apps')),
            ('Server: {}\n', webinfo.get('server')),
            ('WAF: {}\n', wafresult),
        )
        for template, value in report_lines:
            console('Webinfo', host, template.format(value))

    osname = osdetect(host) if iscdn(host) else None

    details = {
        'WAF': wafresult,
        'Ipaddr': ipaddr,
        'Address': address,
        'Webinfo': webinfo,
        'OS': osname,
    }
    data = {host: details}

    return data, webinfo.get('apps'), webinfo.get('title')
开发者ID:al0ne,项目名称:Vxscan,代码行数:42,代码来源:web_info.py

示例14: check404

# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def check404(self, url):
        """Probe a random page to learn what a missing page looks like.

        Sets ``self.notstr`` (parsed content of the probe page, or the
        placeholder ``'404page'``), ``self.notlen`` (the ``Content-Length``
        header, or the body length when the header is absent) and, for a
        redirect, ``self.goto`` (the ``Location`` header).

        Args:
            url: Base address; a random ``.html`` name is appended to it.
        """
        # Request a random page and record the length and content of the 404 page
        key = str(random.random() * 100)
        random_url = base64.b64encode(key.encode('utf-8'))
        url = url + '/' + random_url.decode('utf-8') + '.html'
        try:
            self.notstr = '404page'
            r = self.req.get(url)
            # Compare as text so an integer status code matches too; the
            # former check against '200' could never match an int.
            if str(r.status_code) == '200':
                coding = chardet.detect(r.content[:10000]).get('encoding')
                if coding:
                    # The 20000-byte cut can split a multi-byte character, so
                    # drop undecodable bytes instead of failing.
                    text = r.content[:20000].decode(coding, 'ignore')
                    self.notstr = self.parse_html(text)
            self.notlen = r.headers.get('Content-Length')
            if not self.notlen:
                self.notlen = len(r.content)
            if r.is_redirect:
                self.goto = r.headers['Location']
        except (requests.exceptions.ConnectTimeout, requests.exceptions.ReadTimeout, requests.exceptions.Timeout,
                requests.exceptions.SSLError, requests.exceptions.ConnectionError, ssl.SSLError, AttributeError,
                ConnectionRefusedError, socket.timeout, urllib3.exceptions.ReadTimeoutError,
                urllib3.exceptions.ProtocolError, OpenSSL.SSL.WantReadError):
            # Network-level failures are expected while scanning; ignore them.
            pass

        except UnboundLocalError:
            pass

        except Exception as e:
            logging.exception(e)
开发者ID:al0ne,项目名称:Vxscan,代码行数:31,代码来源:dir_scan.py

示例15: encoding

# 需要导入模块: import chardet [as 别名]
# 或者: from chardet import detect [as 别名]
def encoding(self):
        """
        Encoding of Response.content.

        The result is cached in ``self._encoding``. Text content returns
        ``'unicode'`` without being cached. Otherwise the encoding comes from
        ``get_encoding`` (headers and content), then from chardet when it is
        available, and finally defaults to ``'utf-8'``.
        """
        if not hasattr(self, '_encoding'):
            # content is unicode
            if isinstance(self.content, six.text_type):
                return 'unicode'

            # Try charset from content-type or content
            guessed = get_encoding(self.headers, self.content)

            # Fallback to auto-detected encoding.
            if not guessed and chardet is not None:
                guessed = chardet.detect(self.content[:600])['encoding']

            # A reported gb2312 is decoded as gb18030 instead.
            if guessed and guessed.lower() == 'gb2312':
                guessed = 'gb18030'

            self._encoding = guessed or 'utf-8'

        return self._encoding
开发者ID:binux,项目名称:pyspider,代码行数:28,代码来源:response.py


注:本文中的chardet.detect方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。