

Python bs4.UnicodeDammit Code Examples

This article collects typical usage examples of Python's bs4.UnicodeDammit class, gathered from real projects. If you are unsure what bs4.UnicodeDammit does, how to call it, or what idiomatic usage looks like, the curated examples below should help. You can also explore other usage examples from the bs4 package.


The sections below present 15 code examples of bs4.UnicodeDammit, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python examples.
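
Before the project-specific examples, here is a minimal sketch of the core pattern (the byte string is illustrative): pass raw bytes in, get decoded text and the detected encoding back.

from bs4 import UnicodeDammit

raw = b'Sacr\xc3\xa9 bleu!'            # UTF-8 bytes (illustrative input)
dammit = UnicodeDammit(raw)
print(dammit.unicode_markup)           # 'Sacré bleu!' - the decoded str
print(dammit.original_encoding)        # 'utf-8' - the detected encoding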

Example 1: parse_rsc_html

# Required import: import bs4 [as alias]
# Or: from bs4 import UnicodeDammit [as alias]
# The excerpt also relies on lxml (fromstring, HTMLParser, Element) and a
# project-level BLOCK_ELEMENTS constant.
def parse_rsc_html(htmlstring):
    """Messy RSC HTML needs this special parser to fix problems before creating selector."""
    converted = UnicodeDammit(htmlstring)
    if not converted.unicode_markup:
        raise ValueError('Failed to detect encoding, tried [%s]' % converted.tried_encodings)
    root = fromstring(htmlstring, parser=HTMLParser(recover=True, encoding=converted.original_encoding))
    # Add p.otherpara tags around orphan text
    newp = None
    for child in root.get_element_by_id('wrapper'):
        if newp is not None:
            if child.tag in BLOCK_ELEMENTS or child.get('id', '').startswith('sect') or child.getnext() is None:
                child.addprevious(newp)
                newp = None
            else:
                newp.append(child)
        if newp is None and child.tag in BLOCK_ELEMENTS and child.tail and child.tail.strip():
            newp = Element('p', **{'class': 'otherpara'})
            newp.text = child.tail
            child.tail = ''
    return root 
Developer: mcs07, Project: ChemDataExtractor, Lines: 22, Source: rsc.py

Example 2: search_txt

# Required import: import bs4 [as alias]
# Or: from bs4 import UnicodeDammit [as alias]
def search_txt(filename, word):
    '''
    Search the word in a text file
    '''
    # Detect the encoding
    with open(filename, 'rb') as file:
        content = file.read(1024)

    suggestion = UnicodeDammit(content)
    encoding = suggestion.original_encoding

    # Open and read
    with open(filename, encoding=encoding) as file:
        for line in file:
            if word.lower() in line.lower():
                return True

    return False 
Developer: PacktPublishing, Project: Python-Automation-Cookbook, Lines: 20, Source: scan.py
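
A hypothetical call, assuming a notes.txt file exists next to the script:

# Hypothetical usage; 'notes.txt' is an assumed file name.
if search_txt('notes.txt', 'Unicode'):
    print('match found')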

Example 3: force_unicode

# Required import: import bs4 [as alias]
# Or: from bs4 import UnicodeDammit [as alias]
# The excerpt also uses the chardet package.
def force_unicode(s):
    """
    Ensure a string is unicode, not encoded; used for enforcing file paths to be unicode upon saving a subtitle,
    to prevent encoding issues when saving a subtitle to a non-ascii path.
    :param s: string
    :return: unicode string
    """
    if not isinstance(s, str):
        try:
            s = s.decode("utf-8")
        except UnicodeDecodeError:
            t = chardet.detect(s)
            try:
                s = s.decode(t["encoding"])
            except UnicodeDecodeError:
                s = UnicodeDammit(s).unicode_markup
    return s 
Developer: morpheus65535, Project: bazarr, Lines: 19, Source: helper.py
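
Two illustrative calls showing the fallback chain (UTF-8 first, then chardet, then UnicodeDammit as a last resort):

force_unicode(b'caf\xc3\xa9')   # UTF-8 bytes decode on the first attempt -> 'café'
force_unicode('café')           # an already-decoded str is returned unchanged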

Example 4: parse

# Required import: import bs4 [as alias]
# Or: from bs4 import UnicodeDammit [as alias]
# Note: this excerpt is Python 2 Scrapy code (HtmlXPathSelector, log.msg
# and urllib.unquote_plus come from the old Python 2 / Scrapy APIs).
def parse(self, response):
        """
        default parse method, rule is not useful now
        """
        # import pdb; pdb.set_trace()
        response = response.replace(url=HtmlParser.remove_url_parameter(response.url))
        hxs = HtmlXPathSelector(response)
        index_level = self.determine_level(response)
        log.msg("Parse: index level:" + str(index_level))
        if index_level in [1, 2, 3, 4]:
            self.save_to_file_system(index_level, response)
            relative_urls = self.get_follow_links(index_level, hxs)
            if relative_urls is not None:
                for url in relative_urls:
                    log.msg('yield process, url:' + url)
                    yield Request(url, callback=self.parse)
        elif index_level == 5:
            personProfile = HtmlParser.extract_person_profile(hxs)
            linkedin_id = self.get_linkedin_id(response.url)
            linkedin_id = UnicodeDammit(urllib.unquote_plus(linkedin_id)).markup
            if linkedin_id:
                personProfile['_id'] = linkedin_id
                personProfile['url'] = UnicodeDammit(response.url).markup
                yield personProfile 
Developer: openslack, Project: openslack-crawler, Lines: 26, Source: LinkedinSpider.py

Example 5: generate_rows

# Required import: import bs4 [as alias]
# Or: from bs4 import UnicodeDammit [as alias]
# The excerpt also uses urlopen (urllib) and a module-level logger.
def generate_rows(self, dataset_schema=None, dataset_partitioning=None,
                            partition_id=None, records_limit = -1):
        """
        The main reading method.
        """

        url_book = self.mirror
        lid = len(str(self.book_id))
        fullbid = str(self.book_id)
        rootbid = fullbid  # sometimes the id used to access a file varies, e.g. fullbid=14285-8 for book 14285

        stopit = 0
        for i in range(lid-1):
            if (fullbid[i+1] != "-") and (stopit==0):
                url_book += '/'+fullbid[i]
            else:
                stopit=1
                rootbid=fullbid[0:i]
        url_book += '/'+ rootbid  + '/'+ fullbid + '.txt'

        response = urlopen(url_book)
        raw = response.read()   #.decode('utf8')
        converted = UnicodeDammit(raw)
        raw = converted.unicode_markup
        start_book = raw.find("START OF")
        end_book = raw.rfind('END OF')
        preamb = raw[:start_book]

        author = [ i.split(':')[1].strip() for i in preamb.split("\r\n\r\n") if i.find('Author') != -1][0]
        title = [ i.split(':')[1].strip() for i in preamb.split("\r\n\r\n") if i.find('Title') != -1][0]
        date = [ i.split(':')[1].strip() for i in preamb.split("\r\n\r\n") if i.find('Release Date') != -1][0]
        book_paraph =  raw[start_book:end_book].split("\r\n\r\n")

        logger.info("Book length %s" % len(raw))
        logger.info("N paragraphs:", len(book_paraph))

        for id_p, p in enumerate(book_paraph):
            yield {'id':id_p, 'author': author, 'title': title, 'text': p} 
Developer: dataiku, Project: dataiku-contrib, Lines: 40, Source: connector.py
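
The preamble parsing in isolation (the sample text is illustrative):

preamb = 'Title: Sample Book\r\n\r\nAuthor: Jane Doe\r\n\r\nRelease Date: 2001'
# Each header lives in its own blank-line-separated chunk of the preamble.
author = [i.split(':')[1].strip() for i in preamb.split('\r\n\r\n') if i.find('Author') != -1][0]
print(author)   # 'Jane Doe'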

Example 6: getItem

# Required import: import bs4 [as alias]
# Or: from bs4 import UnicodeDammit [as alias]
# The excerpt also relies on a project jsLiteralParse helper and a self.wg web client.
def getItem(self, itemUrl, addlHeaders=None):

		content, handle = self.wg.getpage(itemUrl, returnMultiple=True, addlHeaders={'Referer': self.refererUrl})
		if not content or not handle:
			raise ValueError("Failed to retreive file from page '%s'!" % itemUrl)



		info = handle.info()
		if not 'Content-Disposition' in info:
			info['Content-Disposition'] = ''

		fileN = jsLiteralParse.parseContentDispositon(info['Content-Disposition'], itemUrl)
		fileN = bs4.UnicodeDammit(fileN).unicode_markup

		mType = handle.info()['Content-Type']

		# If there is an encoding in the content-type (or any other info), strip it out.
		# We don't care about the encoding, since WebRequest will already have handled that,
		# and returned a decoded unicode object.

		if mType and ";" in mType:
			mType = mType.split(";")[0].strip()


		self.log.info("Retreived file of type '%s', name of '%s' with a size of %0.3f K", mType, fileN, len(content)/1000.0)
		return content, fileN, mType 
Developer: fake-name, Project: ReadableWebProxy, Lines: 29, Source: gDocParse.py
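
The same filename-normalizing trick in isolation (the byte string is illustrative):

import bs4

# UnicodeDammit converts a raw filename to clean text; str input passes through.
print(bs4.UnicodeDammit(b'r\xc3\xa9sum\xc3\xa9.pdf').unicode_markup)   # 'résumé.pdf'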

Example 7: beautify

# Required import: import bs4 [as alias]
# Or: from bs4 import UnicodeDammit [as alias]
def beautify(self, data, charset):
        dammit = UnicodeDammit(data, [charset, "utf-8", "gb2312", "gbk", "big5", "gb18030"], smart_quotes_to="html")
        data = dammit.unicode_markup
        return data 
Developer: pingf, Project: falsy, Lines: 6, Source: chromeboy.py
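
UnicodeDammit tries the caller's candidate encodings in order before falling back to its own detection; a short sketch with illustrative GB2312 bytes:

from bs4 import UnicodeDammit

dammit = UnicodeDammit(b'\xc4\xe3\xba\xc3', ['gb2312'])  # '你好' in GB2312
print(dammit.unicode_markup)       # '你好'
print(dammit.original_encoding)    # 'gb2312'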

Example 8: pycurl_get_resp

# Required import: import bs4 [as alias]
# Or: from bs4 import UnicodeDammit [as alias]
# The excerpt also uses re, BeautifulSoup and module-level helpers (get_title, get_links, ...).
def pycurl_get_resp(data_buf, headers, payload, resp):
    charset = None
    if 'content-type' in headers:
        content_type = headers['content-type'].lower()
        match = re.search(r'charset=(\S+)', content_type)
        if match:
            charset = match.group(1)
            print('Decoding using %s' % charset)
    body = data_buf.getvalue()
    if len(body) == 0:
        data = ''
        charset = 'utf-8'
    else:
        if charset is None:
            dammit = UnicodeDammit(body, ["utf-8", "gb2312", "gbk", "big5", "gb18030"], smart_quotes_to="html")
            data = dammit.unicode_markup
            charset = dammit.original_encoding
        else:
            data = body.decode(charset, 'ignore')
    # headers.remove({})
    headers['content'] = [h for h in headers['content'] if len(h) > 0]
    soup_lxml = BeautifulSoup(data, 'lxml')
    soup_html = BeautifulSoup(data, 'html.parser')
    resp.update({
        'url': payload.get('url'),
        # 'soup': soup,
        'title': get_title(soup_lxml),
        'links': get_links(soup_lxml),
        'links2': get_links2(soup_lxml),
        'metas': get_metas(soup_lxml),
        'images': get_images(soup_lxml),
        'scripts': get_scripts(soup_lxml),
        'text': get_text(soup_html),
        'data': data,
        'headers': headers,
        'charset': charset,
        'spider': 'pycurl',
        'payload': payload,
    }) 
Developer: pingf, Project: falsy, Lines: 41, Source: one.py

Example 9: _get_encoding

# Required import: import bs4 [as alias]
# Or: from bs4 import UnicodeDammit [as alias]
def _get_encoding(cls, input_string, encoding):
        converted = UnicodeDammit(input_string, [encoding] if encoding else [])
        # Not worth raising exception? lxml will raise if parse fails.
        # if not converted.unicode_markup:
        #     raise UnicodeDecodeError('Failed to detect encoding')
        return converted.original_encoding 
Developer: mcs07, Project: ChemDataExtractor, Lines: 8, Source: selector.py

Example 10: get_encoding

# Required import: import bs4 [as alias]
# Or: from bs4 import UnicodeDammit [as alias]
def get_encoding(input_string, guesses=None, is_html=False):
    """Return the encoding of a byte string. Uses bs4 UnicodeDammit.

    :param string input_string: Encoded byte string.
    :param list[string] guesses: (Optional) List of encoding guesses to prioritize.
    :param bool is_html: Whether the input is HTML.
    """
    converted = UnicodeDammit(input_string, override_encodings=guesses or [], is_html=is_html)
    return converted.original_encoding 
Developer: mcs07, Project: ChemDataExtractor, Lines: 11, Source: __init__.py
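
Two hypothetical calls (the byte strings are illustrative):

get_encoding(b'\xc4\xe3\xba\xc3', guesses=['gb2312'])   # -> 'gb2312' (guess tried first)
get_encoding(b'Sacr\xc3\xa9 bleu!')                     # -> 'utf-8' (pure detection)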

Example 11: scan

# Required import: import bs4 [as alias]
# Or: from bs4 import UnicodeDammit [as alias]
# The excerpt also uses typing.Dict and stoq framework types (Payload, Request, WorkerResponse).
def scan(self, payload: Payload, request: Request) -> WorkerResponse:

        normalize: bool = True
        ioctype: str = 'all'
        results: Dict = {}

        if ioctype == 'all':
            for ioc in self.compiled_re:
                if self.compiled_re[ioc]:
                    matches = self.compiled_re[ioc].findall(UnicodeDammit(payload.content).unicode_markup)
                    if matches:
                        results[ioc] = list(set(matches))
        elif self.compiled_re[ioctype]:
            matches = self.compiled_re[ioctype].findall(UnicodeDammit(payload.content).unicode_markup)
            if matches:
                results[ioctype] = list(set(matches))

        if 'ipv6' in results:
            results['ipv6'] = [
                address for address in results['ipv6'] if self._validate_ipv6(address)
            ]
            if not results['ipv6']:
                results.pop('ipv6')

        if normalize:
            results = self._normalize(results)

        return WorkerResponse(results) 
Developer: PUNCH-Cyber, Project: stoq-plugins-public, Lines: 30, Source: iocextract.py
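
The decode-before-matching idiom from this plugin in isolation (the payload bytes and pattern are illustrative):

import re
from bs4 import UnicodeDammit

# Decode arbitrary payload bytes to text before regex matching,
# so the pattern never hits a bytes/str type mismatch.
text = UnicodeDammit(b'beacon to 10.0.0.1 and 10.0.0.1').unicode_markup
matches = re.findall(r'\d{1,3}(?:\.\d{1,3}){3}', text)
print(sorted(set(matches)))   # deduplicated: ['10.0.0.1']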

Example 12: scan

# Required import: import bs4 [as alias]
# Or: from bs4 import UnicodeDammit [as alias]
# The excerpt also uses a TNEF parser class and stoq framework types.
def scan(self, payload: Payload, request: Request) -> WorkerResponse:
        extracted: List[ExtractedPayload] = []
        tnef_results = TNEF(payload.content)
        if tnef_results.attachments:
            for tnef_attachment in tnef_results.attachments:
                try:
                    filename = UnicodeDammit(tnef_attachment.name).unicode_markup
                except Exception:
                    filename = "None"
                tnef_meta = PayloadMeta(extra_data={'filename': filename})
                extracted.append(ExtractedPayload(tnef_attachment.data, tnef_meta))
        return WorkerResponse(extracted=extracted) 
Developer: PUNCH-Cyber, Project: stoq-plugins-public, Lines: 14, Source: tnef.py

Example 13: discover_mirror_selection

# Required import: import bs4 [as alias]
# Or: from bs4 import UnicodeDammit [as alias]
# The excerpt also uses apt-smart project helpers (fetch_url, CandidateMirror, Timer, pluralize).
def discover_mirror_selection():
    """Discover "geographically suitable" Ubuntu mirrors."""
    timer = Timer()
    logger.info("Identifying fast Ubuntu mirrors using %s ..", MIRROR_SELECTION_URL)
    data = fetch_url(MIRROR_SELECTION_URL, timeout=3, retry=True, max_attempts=5)
    # shorter timeout with more retries is good for unstable connections to MIRROR_SELECTION_URL
    dammit = UnicodeDammit(data)
    mirrors = set(
        CandidateMirror(mirror_url=mirror_url.strip())
        for mirror_url in dammit.unicode_markup.splitlines()
        if mirror_url and not mirror_url.isspace() and mirror_url.startswith(('http://', 'https://'))
    )
    logger.debug("Found %s in %s.", pluralize(len(mirrors), "fast Ubuntu mirror"), timer)
    return mirrors 
Developer: martin68, Project: apt-smart, Lines: 16, Source: ubuntu.py

Example 14: guess_encoding

# Required import: import bs4 [as alias]
# Or: from bs4 import UnicodeDammit [as alias]
# The excerpt also uses the chardet package.
def guess_encoding(self):
        # override default subtitle guess_encoding method to not include language-specific encodings guessing
        # chardet encoding detection seem to yield better results
        """Guess encoding using chardet.

        :return: the guessed encoding.
        :rtype: str

        """
        if self._guessed_encoding:
            return self._guessed_encoding

        logger.info('Guessing encoding for language %s', self.language)

        # guess/detect encoding using chardet
        encoding = chardet.detect(self.content)['encoding']
        logger.info('Chardet found encoding %s', encoding)

        if not encoding:
            # fallback on bs4
            logger.info('Falling back to bs4 detection')
            a = UnicodeDammit(self.content)

            logger.info("bs4 detected encoding: %s", a.original_encoding)

            if a.original_encoding:
                self._guessed_encoding = a.original_encoding
                return a.original_encoding
            raise ValueError(u"Couldn't guess the proper encoding for %s", self)

        self._guessed_encoding = encoding
        return encoding 
Developer: morpheus65535, Project: bazarr, Lines: 34, Source: nekur.py

Example 15: extract

# Required import: import bs4 [as alias]
# Or: from bs4 import UnicodeDammit [as alias]
# The excerpt also uses io, zipfile, mimetypes and a self.wg web client.
def extract(self):
		try:
			arch, fName = self.wg.getFileAndName(self.url, addlHeaders={'Referer': self.refererUrl})
		except IndexError:
			print("ERROR: Failure retrieving page!")
			return None, []

		baseName = fName.split(".")[0]

		if not isinstance(arch, bytes):
			if 'You need permission' in arch or 'Sign in to continue to Docs' in arch:
				self.log.critical("Retrieving zip archive failed?")
				self.log.critical("Retreived content type: '%s'", type(arch))
				raise TypeError("Cannot access document? Is it protected?")
			else:
				with open("tmp_page.html", "w") as fp:
					fp.write(arch)
				raise ValueError("Doc not valid?")

		zp = io.BytesIO(arch)
		zfp = zipfile.ZipFile(zp)

		resources = []
		baseFile = None

		for item in zfp.infolist():
			if not "/" in item.filename and not baseFile:
				contents = zfp.open(item).read()
				contents = bs4.UnicodeDammit(contents).unicode_markup

				baseFile = (item.filename, contents)

			elif baseName in item.filename and baseName:
				raise ValueError("Multiple base file items?")

			else:
				resources.append((item.filename, mimetypes.guess_type(item.filename)[0], zfp.open(item).read()))

		if not baseFile:
			raise ValueError("No base file found!")

		return baseFile, resources 
Developer: fake-name, Project: ReadableWebProxy, Lines: 44, Source: gDocParse.py


Note: the bs4.UnicodeDammit examples in this article were compiled by 纯净天空 from GitHub, MSDocs and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by many developers, and copyright in each snippet remains with its original author; consult the corresponding project's license before redistributing or reusing the code. Please do not republish without permission.