當前位置: 首頁>>代碼示例>>Python>>正文


Python chardet.detect方法代碼示例

本文整理匯總了Python中chardet.detect方法的典型用法代碼示例。如果您正苦於以下問題:Python chardet.detect方法的具體用法?Python chardet.detect怎麼用?Python chardet.detect使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在chardet的用法示例。


在下文中一共展示了chardet.detect方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: get_title

# 需要導入模塊: import chardet [as 別名]
# 或者: from chardet import detect [as 別名]
def get_title(url):
    """Fetch *url* and return a one-line summary containing the page title.

    The response body is sniffed with ``chardet``, its first 10000 bytes are
    decoded and parsed with ``etree.HTML``, and the first ``//title/text()``
    match is appended to the URL.

    Args:
        url: Address handed to ``req.get``.

    Returns:
        ``"<url> | <title>"`` when a title is found, otherwise
        ``"<url> | Status_code: <code>"``. ``<code>`` stays ``0`` when the
        request itself failed before a status was read.
    """
    code = 0

    try:
        r = req.get(url)
        code = r.status_code
        # chardet reports None when it cannot guess; decode(None) would raise.
        coding = chardet.detect(r.content).get('encoding') or 'utf-8'
        # The 10000-byte cut can land inside a multi-byte character, so drop
        # undecodable bytes instead of losing the whole title.
        text = r.content[:10000].decode(coding, 'ignore')
        html = etree.HTML(text)
        title = html.xpath('//title/text()')
        if title:
            return url + ' | ' + title[0]
    except Exception:
        # Best effort: any failure falls back to the status-code summary,
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        pass

    return url + ' | Status_code: ' + str(code)
開發者ID:al0ne,項目名稱:Vxscan,代碼行數:20,代碼來源:get_title.py

示例2: webtables_iter

# 需要導入模塊: import chardet [as 別名]
# 或者: from chardet import detect [as 別名]
def webtables_iter(path):
    """Yield ``(index, table)`` pairs from a gzip file of JSON lines.

    Each line is decoded as UTF-8 first. If that raises
    ``UnicodeDecodeError`` the encoding is guessed with ``chardet`` and the
    line is retried; a line that still cannot be parsed is skipped.

    Args:
        path: Location of the gzip-compressed file.

    Yields:
        Tuples of a running counter (incremented only for lines that were
        yielded) and the parsed JSON value.
    """
    with gzip.open(path, 'rb') as gz_lines:
        yielded = 0  # counts only the tables actually handed out
        for raw_line in gz_lines:
            try:
                table = json.loads(raw_line.decode('utf-8'))
            except UnicodeDecodeError:
                guessed = chardet.detect(raw_line)['encoding']
                try:
                    table = json.loads(raw_line.decode(guessed))
                except Exception:
                    # still unreadable with the guessed encoding: skip the line
                    continue
            yield (yielded, table)
            yielded += 1
開發者ID:megagonlabs,項目名稱:sato,代碼行數:21,代碼來源:read_raw_data.py

示例3: main

# 需要導入模塊: import chardet [as 別名]
# 或者: from chardet import detect [as 別名]
def main(src_file, dst_file, **kwargs):
    """Convert a policy export to YAML.

    The source file is read as bytes, decoded with the encoding ``chardet``
    detects, and split into lines. If one of those lines is exactly
    ``[Unicode]`` the lines go to ``_convert_secedit``; otherwise they go to
    ``_convert_regpol``. The result is written with ``yaml.safe_dump``.

    Args:
        src_file: Path of the file to convert.
        dst_file: Path of the YAML file to write.
        **kwargs: Accepted and ignored.
    """
    with open(src_file, mode='rb') as f:
        raw = f.read()

    # chardet returns None when it has no guess (e.g. an empty file);
    # decode(None) would raise TypeError, so fall back to UTF-8.
    encoding = chardet.detect(raw)['encoding'] or 'utf-8'
    src = raw.decode(encoding).splitlines()

    if '[Unicode]' in src:
        policies = _convert_secedit(src)
    else:
        policies = _convert_regpol(src)

    with open(dst_file, mode='w') as dh_:
        yaml.safe_dump(policies, dh_, default_flow_style=False)
開發者ID:plus3it,項目名稱:ash-windows-formula,代碼行數:18,代碼來源:convert-lgpo-policy.py

示例4: _render_and_compare_dot_files

# 需要導入模塊: import chardet [as 別名]
# 或者: from chardet import detect [as 別名]
def _render_and_compare_dot_files(self, directory):
        # files that confuse `chardet`
        encodings = {
            'Latin1.dot': 'latin-1'}
        dot_files = [
            fname for fname in os.listdir(directory)
            if fname.endswith('.dot')]
        for fname in dot_files:
            fpath = os.path.join(directory, fname)
            with open(fpath, 'rb') as f:
                s = f.read()
            estimate = chardet.detect(s)
            encoding = encodings.get(fname, estimate['encoding'])
            os.sys.stdout.write('#')
            os.sys.stdout.flush()
            pydot_sha = self._render_with_pydot(fpath, encoding)
            graphviz_sha = self._render_with_graphviz(fpath, encoding)
            assert pydot_sha == graphviz_sha, (pydot_sha, graphviz_sha) 
開發者ID:pydot,項目名稱:pydot,代碼行數:20,代碼來源:pydot_unittest.py

示例5: detect_codec

# 需要導入模塊: import chardet [as 別名]
# 或者: from chardet import detect [as 別名]
def detect_codec(bytedata, filename):
    """Return the first codec from the configured chain that fits *bytedata*.

    The entries of ``DEFAULT_CONF['codec_chain']`` are tried in order. The
    special entry ``'chardet'`` asks chardet for a guess and accepts it only
    when an encoding was reported and its confidence is not below
    ``DEFAULT_CONF['confi_thres']``. Any other entry is accepted when a
    strict decode with that codec succeeds.

    Args:
        bytedata: Raw bytes to examine.
        filename: Only used in debug log messages.

    Returns:
        The matching codec name (passed through ``normalize_codec_name`` for
        a chardet hit), or ``None`` when nothing in the chain matched.
    """
    for codec in DEFAULT_CONF['codec_chain']:
        if codec != 'chardet':
            try:
                bytedata.decode(codec, 'strict')
            except UnicodeDecodeError:
                log.debug(f"{filename} is not {codec}-encoded.")
                continue
            return codec

        chr_res = chardet.detect(bytedata)
        if chr_res['encoding'] and not chr_res['confidence'] < DEFAULT_CONF['confi_thres']:
            return normalize_codec_name(chr_res['encoding'])
        log.debug(f"The codec of {filename} is unable to detect, the result is {chr_res}.")

    # chain exhausted without a match
    return None
# end of detect_codec(bytedata, filename):
開發者ID:x1angli,項目名稱:cvt2utf,代碼行數:22,代碼來源:main.py

示例6: to_unicode

# 需要導入模塊: import chardet [as 別名]
# 或者: from chardet import detect [as 別名]
def to_unicode(original, *args):
    """Return *original* as text, trying several decoding strategies.

    Order of attempts:
      1. ``str`` input is returned unchanged.
      2. ``six.text_type(original, *args)``.
      3. The encoding reported by ``detect`` when its confidence exceeds 0.8.
      4. ``ek(original, *args)`` (helper defined outside this block).

    Args:
        original: Value to convert.
        *args: Extra positional arguments forwarded to ``six.text_type`` and
            ``ek``.

    Returns:
        The converted text, or the literal ``'ERROR DECODING STRING'`` when
        every strategy failed (the failure is logged).
    """
    logger.debug('def to_unicode started')
    try:
        if isinstance(original, str):
            return original

        try:
            return six.text_type(original, *args)
        except Exception:
            # plain conversion failed; fall through to detection
            pass

        detected = detect(original)
        try:
            if detected.get('confidence') > 0.8:
                return original.decode(detected.get('encoding'))
        except Exception:
            # missing confidence/encoding or a failed decode: try the last resort
            pass

        return ek(original, *args)
    except Exception:
        import traceback
        # Arguments are passed separately: wrapping them in one tuple left the
        # two-placeholder format string short of arguments.
        logger.error('Unable to decode value "%s..." : %s ', repr(original)[:20], traceback.format_exc())
        return 'ERROR DECODING STRING'
開發者ID:piejanssens,項目名稱:premiumizer,代碼行數:26,代碼來源:premiumizer.py

示例7: read_content

# 需要導入模塊: import chardet [as 別名]
# 或者: from chardet import detect [as 別名]
def read_content(dir_file):
    """Read a file as bytes and return its decoded text.

    The encoding is guessed with ``chardet``. If a strict decode fails, a
    warning is issued and the content is decoded again with undecodable
    bytes dropped.

    Arguments:
        dir_file {str} -- path of the file to read

    Returns:
        str -- the decoded file content
    """

    with open(dir_file, "rb") as rb:
        content = rb.read()

    # chardet returns None when it has no guess (e.g. an empty file). Both
    # decode calls below would then raise TypeError, so fall back to UTF-8.
    encoder_code = chardet.detect(content)["encoding"] or "utf-8"
    try:
        content = content.decode(encoder_code)
    except UnicodeDecodeError:
        message = "This file code {} is error, and ignored".format(dir_file)
        warnings.warn(message)
        content = content.decode(encoder_code, "ignore")
    return content
開發者ID:Sohone-Guo,項目名稱:Pointer-Generator,代碼行數:25,代碼來源:os.py

示例8: finalize

# 需要導入模塊: import chardet [as 別名]
# 或者: from chardet import detect [as 別名]
def finalize(self):
        """Finalize this Report and send it off as an email.

        The formatter's closing output is written, the accumulated report
        text is attached inline, the logfile is attached when
        ``self._attach_logfile`` and ``self._logfile`` are both set, and the
        message is handed to ``ezmail.mail``.
        """
        self.write(self._formatter.finalize())
        report = ezmail.MIMEText.MIMEText(self._fo.getvalue(),
                        self._formatter.MIMETYPE.split("/")[1])
        report["Content-Disposition"] = "inline"
        self._message.attach(report)
        if self._attach_logfile and self._logfile:
            try:
                # context manager so the logfile handle is closed again
                with open(self._logfile, "rb") as logfile:
                    lfd = logfile.read()
            except (IOError, OSError):
                pass # non-fatal: send the report without the logfile
            else:
                # NOTE(review): chardet.detect() returns a dict, not an
                # encoding name; confirm what ezmail expects for `charset`.
                logmsg = ezmail.MIMEText.MIMEText(lfd, charset=chardet.detect(lfd))
                logmsg["Content-Disposition"] = 'attachment; filename=%s' % (
                        os.path.basename(self._logfile), )
                self._message.attach(logmsg)
        ezmail.mail(self._message)
開發者ID:kdart,項目名稱:pycopia,代碼行數:20,代碼來源:Email.py

示例9: skipwrap

# 需要導入模塊: import chardet [as 別名]
# 或者: from chardet import detect [as 別名]
def skipwrap(para):
    """Tell whether paragraph *para* must be left unwrapped.

    Args:
        para: Paragraph text. An empty string is accepted.

    Returns:
        ``True`` when the paragraph should not be wrapped (code block, text
        starting with ``-`` or ``*``, list item), ``False`` otherwise.
    """
    # If the text begins with four spaces or one tab, it's a code block; don't wrap.
    # startswith() is also safe for an empty paragraph, where para[0] raised IndexError.
    if para.startswith(('    ', '\t')):
        return True
    # If the text begins with only two "--", possibly preceded by whitespace, that's
    # an emdash; so wrap.
    stripped = para.lstrip()
    if stripped.startswith('--') and len(stripped) > 2 and stripped[2] != '-':
        return False
    # Kept from the original: possibly meant to detect lists, but a
    # <br>-inside-<span> test case is also said to depend on it.
    if stripped.startswith(('-', '*')):
        return True
    # If the text begins with a single -, *, or +, followed by a space, or an integer,
    # followed by a ., followed by a space (in either case optionally preceded by
    # whitespace), it's a list; don't wrap.
    if ordered_list_matcher.match(stripped) or unordered_list_matcher.match(stripped):
        return True
    return False
開發者ID:schollz,項目名稱:extract_recipe,代碼行數:21,代碼來源:extract_recipe.py

示例10: check_encoding

# 需要導入模塊: import chardet [as 別名]
# 或者: from chardet import detect [as 別名]
def check_encoding():
    """Print the chardet-detected encoding of every regular file in ./data.

    Only direct children of the directory are examined; one encoding name
    (or ``None`` when chardet has no guess) is printed per file.
    """
    base_path = os.path.abspath("./data")
    onlyfiles = [f for f in listdir(base_path) if isfile(join(base_path, f))]
    # for each of the files
    for filepath in onlyfiles:
        # form the full file path
        refFile_path = os.path.join(base_path, filepath)

        # chardet needs the raw bytes, so read in binary mode; the context
        # manager closes the handle that was previously left open.
        with open(refFile_path, "rb") as ref_file:
            rawdata = ref_file.read()
        result = chardet.detect(rawdata)
        charenc = result['encoding']
        # single-argument call form prints the same on Python 2 and 3
        print(charenc)


# check_for_differences() - read all the public facility csv columns and look for
#  1) Common column headers among all csv files
#  2) Unique column headers among all csv files 
開發者ID:gazetteerhk,項目名稱:census_explorer,代碼行數:20,代碼來源:getPubFacility_GeoJSON_with_CACODE.py

示例11: readPFM

# 需要導入模塊: import chardet [as 別名]
# 或者: from chardet import detect [as 別名]
def readPFM(file):
    """Load a PFM image from disk.

    The three text header lines are decoded with the encoding chardet
    detects for the first line; the remainder of the file is read as
    32-bit floats.

    Args:
        file: Path of the PFM file.

    Returns:
        A ``(data, scale)`` tuple. ``data`` is reshaped to
        ``(height, width, 3)`` for a ``PF`` header or ``(height, width)`` for
        a ``Pf`` header and flipped vertically; ``scale`` is the absolute
        value of the header's scale field.

    Raises:
        Exception: If the first line is neither ``PF`` nor ``Pf``, or the
            dimension line does not match ``<width> <height>``.
    """
    # The context manager closes the handle, which was previously leaked.
    with open(file, 'rb') as pfm:
        header = pfm.readline().rstrip()
        # chardet may return None as the encoding; decode(None) would raise
        # TypeError instead of the intended 'Not a PFM file.' error.
        encoding = chardet.detect(header)['encoding'] or 'ascii'
        header = header.decode(encoding)
        if header == 'PF':
            color = True
        elif header == 'Pf':
            color = False
        else:
            raise Exception('Not a PFM file.')

        dim_match = re.match(r'^(\d+)\s(\d+)\s$', pfm.readline().decode(encoding))
        if dim_match:
            width, height = map(int, dim_match.groups())
        else:
            raise Exception('Malformed PFM header.')

        scale = float(pfm.readline().rstrip().decode(encoding))
        if scale < 0:  # little-endian
            endian = '<'
            scale = -scale
        else:
            endian = '>'  # big-endian

        # reads from the current position, i.e. everything after the header
        data = np.fromfile(pfm, endian + 'f')

    shape = (height, width, 3) if color else (height, width)

    data = np.reshape(data, shape)
    data = np.flipud(data)
    return data, scale
開發者ID:JiaRenChang,項目名稱:PSMNet,代碼行數:40,代碼來源:readpfm.py

示例12: get_title

# 需要導入模塊: import chardet [as 別名]
# 或者: from chardet import detect [as 別名]
def get_title(url):
    """Fetch *url* and return a fingerprint summary line.

    The first 10000 bytes of the response are decoded with the encoding
    chardet detects and handed, together with the final URL and the response
    headers, to ``WebPage(...).info()``.

    Args:
        url: Address handed to ``req.get``.

    Returns:
        ``"URL: <url> | Fingerprint: <app> , <app> ..."`` when the info dict
        has a non-empty ``'apps'`` entry; ``None`` otherwise or on any error.
    """
    try:
        r = req.get(url)
        # chardet reports None when it cannot guess; decode(None) would raise.
        coding = chardet.detect(r.content).get('encoding') or 'utf-8'
        # The 10000-byte cut can split a multi-byte character; drop the
        # undecodable tail instead of discarding the whole page.
        text = r.content[:10000].decode(coding, 'ignore')
        webinfo = WebPage(r.url, text, r.headers).info()
        apps = webinfo.get('apps')
        if apps:
            return 'URL: ' + url + ' | Fingerprint: ' + ' , '.join(apps)
    except Exception:
        # Best effort: failures yield None, but KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        pass
    return None
開發者ID:al0ne,項目名稱:Vxscan,代碼行數:12,代碼來源:fingerprint.py

示例13: web_info

# 需要導入模塊: import chardet [as 別名]
# 或者: from chardet import detect [as 別名]
def web_info(url):
    """Collect host, WAF and fingerprint information for *url*.

    The page is fetched, its encoding is set from chardet's guess, and
    ``WebPage(...).info()`` provides the fingerprint dict. When that dict is
    non-empty its title, apps and server entries plus the WAF result are
    reported through ``console``; when it is empty the WAF result is
    replaced by the string ``'None'``.

    Args:
        url: Target address; leading and trailing ``/`` are stripped before
            use.

    Returns:
        A tuple ``(data, apps, title)`` where ``data`` maps the parsed host
        to a dict with the keys ``WAF``, ``Ipaddr``, ``Address``, ``Webinfo``
        and ``OS``, and ``apps`` / ``title`` come from the fingerprint dict
        (``None`` when absent).
    """
    host = parse_host(url)
    ipaddr = parse_ip(host)
    url = url.strip('/')
    address = geoip(ipaddr)
    wafresult = checkwaf(url)
    req = Requests()
    # noinspection PyBroadException
    try:
        response = req.get(url)
        response.encoding = chardet.detect(response.content).get('encoding')
        webinfo = WebPage(response.url, response.text, response.headers).info()
    except Exception as e:
        logging.exception(e)
        webinfo = {}

    if not webinfo:
        # nothing usable came back: normalise to an empty dict
        webinfo = {}
        wafresult = 'None'
    else:
        console('Webinfo', host, 'title: {}\n'.format(webinfo.get('title')))
        console('Webinfo', host, 'Fingerprint: {}\n'.format(webinfo.get('apps')))
        console('Webinfo', host, 'Server: {}\n'.format(webinfo.get('server')))
        console('Webinfo', host, 'WAF: {}\n'.format(wafresult))

    osname = osdetect(host) if iscdn(host) else None

    details = {
        'WAF': wafresult,
        'Ipaddr': ipaddr,
        'Address': address,
        'Webinfo': webinfo,
        'OS': osname,
    }
    data = {host: details}

    return data, webinfo.get('apps'), webinfo.get('title')
開發者ID:al0ne,項目名稱:Vxscan,代碼行數:42,代碼來源:web_info.py

示例14: check404

# 需要導入模塊: import chardet [as 別名]
# 或者: from chardet import detect [as 別名]
def check404(self, url):
        """Request a random page under *url* and record how the site answers.

        Sets ``self.notstr`` (``'404page'``, or the result of
        ``self.parse_html`` on the body when the server answers 200),
        ``self.notlen`` (the ``Content-Length`` header, or the body length
        when the header is missing) and, for redirects, ``self.goto`` (the
        ``Location`` header).

        Args:
            url: Base address; ``/<random>.html`` is appended to it.
        """
        # Visit a random page and record the length and content of the 404 page
        key = str(random.random() * 100)
        random_url = base64.b64encode(key.encode('utf-8'))
        url = url + '/' + random_url.decode('utf-8') + '.html'
        try:
            self.notstr = '404page'
            r = self.req.get(url)
            # Compare as text so an integer status code matches too; the
            # previous comparison against '200' never matched an int.
            if str(r.status_code) == '200':
                coding = chardet.detect(r.content[:10000]).get('encoding')
                if coding:
                    # The byte cut can split a multi-byte character; ignore
                    # the undecodable tail instead of raising.
                    text = r.content[:20000].decode(coding, 'ignore')
                    self.notstr = self.parse_html(text)
            self.notlen = r.headers.get('Content-Length')
            if not self.notlen:
                self.notlen = len(r.content)
            if r.is_redirect:
                self.goto = r.headers['Location']
        except (requests.exceptions.ConnectTimeout, requests.exceptions.ReadTimeout, requests.exceptions.Timeout,
                requests.exceptions.SSLError, requests.exceptions.ConnectionError, ssl.SSLError, AttributeError,
                ConnectionRefusedError, socket.timeout, urllib3.exceptions.ReadTimeoutError,
                urllib3.exceptions.ProtocolError, OpenSSL.SSL.WantReadError):
            # these errors are deliberately ignored
            pass

        except UnboundLocalError:
            pass

        except Exception as e:
            logging.exception(e)
開發者ID:al0ne,項目名稱:Vxscan,代碼行數:31,代碼來源:dir_scan.py

示例15: encoding

# 需要導入模塊: import chardet [as 別名]
# 或者: from chardet import detect [as 別名]
def encoding(self):
        """
        Name of the encoding of ``self.content``.

        A value already stored in ``self._encoding`` is returned as is.
        Text content reports ``'unicode'`` without storing anything.
        Otherwise the name comes from ``get_encoding`` (headers and content),
        then from chardet on the first 600 items of the content when chardet
        is available, and finally defaults to ``'utf-8'``. ``gb2312`` is
        replaced by ``gb18030``. The result is stored in ``self._encoding``.
        """
        if hasattr(self, '_encoding'):
            return self._encoding

        # content is unicode
        if isinstance(self.content, six.text_type):
            return 'unicode'

        # charset from content-type or content comes first
        guess = get_encoding(self.headers, self.content)

        # otherwise let chardet sniff the start of the content
        if chardet is not None and not guess:
            guess = chardet.detect(self.content[:600])['encoding']

        if guess and guess.lower() == 'gb2312':
            guess = 'gb18030'

        if not guess:
            guess = 'utf-8'
        self._encoding = guess
        return self._encoding
開發者ID:binux,項目名稱:pyspider,代碼行數:28,代碼來源:response.py


注:本文中的chardet.detect方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。