当前位置: 首页>>代码示例>>Python>>正文


Python BeautifulSoup.BeautifulSoup类代码示例

本文整理汇总了 Python 中 calibre.ebooks.BeautifulSoup.BeautifulSoup 的典型用法代码示例。如果您正苦于以下问题:Python BeautifulSoup 类的具体用法?Python BeautifulSoup 怎么用?Python BeautifulSoup 使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。


在下文中一共展示了 BeautifulSoup 类的 15 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Python 代码示例。

示例1: save_soup

def save_soup(soup, target):
    """Serialize ``soup`` to the file ``target`` as UTF-8 encoded HTML.

    Before writing, the document is adjusted in place:

    * every ``<meta>`` whose ``content`` attribute mentions a charset is
      replaced by a UTF-8 ``Content-Type`` meta tag; when no such tag
      exists the UTF-8 meta tag is inserted at the start of ``<head>``
      (if the document has a head at all);
    * ``src``/``href`` attributes of ``<img>``, ``<link>`` and ``<a>`` tags
      that name an existing absolute file path are rewritten to be relative
      to the directory containing ``target``, using ``/`` as separator.
    """
    utf8_meta = BeautifulSoup(
        '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />'
    ).find('meta')

    replaced_existing = False
    for candidate in soup.findAll('meta', content=True):
        declared = candidate.get('content', '').lower()
        if 'charset' not in declared:
            continue
        candidate.replaceWith(utf8_meta)
        replaced_existing = True

    if not replaced_existing:
        head_tag = soup.find('head')
        if head_tag is not None:
            head_tag.insert(0, utf8_meta)

    target_dir = os.path.dirname(target)

    for element in soup.findAll(['img', 'link', 'a']):
        for attr in ('src', 'href'):
            location = element.get(attr, None)
            if not location:
                continue
            on_disk = os.path.isfile(location) and os.path.exists(location)
            if not (on_disk and os.path.isabs(location)):
                continue
            relative = relpath(location, target_dir).replace(os.sep, '/')
            element[attr] = unicode_path(relative)

    markup = unicode(soup)
    with open(target, 'wb') as out:
        out.write(markup.encode('utf-8'))
开发者ID:Coi-l,项目名称:calibre,代码行数:25,代码来源:simple.py

示例2: search_for_asin_on_amazon

    def search_for_asin_on_amazon(self, query):
        '''Search Amazon for ``query`` and return the first ASIN found.

        Returns None when the page does not exist, when Amazon reports no
        matching products (and offers neither a "Did you mean" suggestion nor
        an "All Departments" fallback), or when no results column containing
        a 1-Click buy button yields a match for ``AMAZON_ASIN_PAT``.
        '''
        encoded = urlencode({'keywords': query})
        # encoded[9:] skips the first nine characters ("keywords=") so that
        # only the quoted search terms are appended to the rh filter.
        path = '/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A' + encoded[9:] + '&' + encoded
        try:
            page = open_url(self._connections['amazon'], path)
        except PageDoesNotExist:
            return None

        # check to make sure there are results
        nothing_matched = 'did not match any products' in page
        has_fallback = ('Did you mean:' in page or
                        'so we searched in All Departments' in page)
        if nothing_matched and not has_fallback:
            return None

        for column in BeautifulSoup(page).findAll('div', {'id': 'resultsCol'}):
            markup = str(column)
            if 'Buy now with 1-Click' not in markup:
                continue
            found = AMAZON_ASIN_PAT.search(markup)
            if found:
                return found.group(1)

        return None
开发者ID:szarroug3,项目名称:X-Ray_Calibre_Plugin,代码行数:27,代码来源:book_settings.py

示例3: existing_annotations

def existing_annotations(parent, field, return_all=False):
    """
    Return the ids of books in the current library whose ``field`` contains
    a ``<div class="user_annotations">`` element.

    ``field`` is either "Comments" (the book's comments are inspected; books
    without comments are skipped) or the name of a user metadata field whose
    "#value#" entry is inspected. With ``return_all`` False the scan stops at
    the first annotated book, so the list has at most one entry. A falsy
    ``field`` yields an empty list.
    """
    _log_location(field)
    annotation_map = []
    if not field:
        _log("no active field")
        return annotation_map

    db = parent.opts.gui.current_db
    id_column = db.FIELD_MAP["id"]
    for record in db.data.iterall():
        mi = db.get_metadata(record[id_column], index_is_id=True)
        if field != "Comments":
            markup = mi.get_user_metadata(field, False)["#value#"]
        elif mi.comments:
            markup = mi.comments
        else:
            # Nothing to parse for this book
            continue
        soup = BeautifulSoup(markup)
        if soup.find("div", "user_annotations") is None:
            continue
        annotation_map.append(mi.id)
        if not return_all:
            break

    if return_all:
        _log("Identified %d annotated books of %d total books" % (len(annotation_map), len(db.data)))

    _log("annotation_map: %s" % repr(annotation_map))
    return annotation_map
开发者ID:kbw1,项目名称:calibre-marvin-manager,代码行数:31,代码来源:common_utils.py

示例4: existing_annotations

def existing_annotations(parent, field, return_all=False):
    '''
    Return the ids of books in the current library whose ``field`` contains
    a <div class="user_annotations"> element.

    ``field`` is either 'Comments' (books without comments are skipped) or
    the name of a user metadata field whose '#value#' entry is inspected.
    When ``return_all`` is False the scan stops at the first hit.
    Note that a falsy ``field`` returns None rather than an empty list.
    '''
    import calibre_plugins.annotations.config as cfg
    if not field:
        return None

    found = []
    db = parent.opts.gui.current_db
    id_column = db.FIELD_MAP['id']
    for record in db.data.iterall():
        mi = db.get_metadata(record[id_column], index_is_id=True)
        if field != 'Comments':
            markup = mi.get_user_metadata(field, False)['#value#']
        elif mi.comments:
            markup = mi.comments
        else:
            continue
        if BeautifulSoup(markup).find('div', 'user_annotations') is None:
            continue
        found.append(mi.id)
        if not return_all:
            break

    if return_all:
        _log_location("Identified %d annotated books of %d total books" %
            (len(found), len(db.data)))
    return found
开发者ID:mkarpiarz,项目名称:calibre-annotations,代码行数:26,代码来源:common_utils.py

示例5: get_annotations_date_range

    def get_annotations_date_range(self):
        '''
        Update self.oldest_annotation / self.newest_annotation from the 'uts'
        timestamps of every <div class="annotation"> in the books listed in
        self.annotation_map.

        The two attributes are expected to start out reversed so that the
        first timestamp seen replaces both; if no annotation is found at all
        they are swapped back to their correct order.
        '''
        saw_annotation = False

        for book_id in self.annotation_map:
            metadata = self.cdb.get_metadata(book_id, index_is_id=True)
            if self.field == 'Comments':
                markup = metadata.comments
            else:
                markup = metadata.get_user_metadata(self.field, False)['#value#']

            for entry in BeautifulSoup(markup).findAll('div', 'annotation'):
                saw_annotation = True
                stamp = float(entry.find('td', 'timestamp')['uts'])
                if stamp < self.oldest_annotation:
                    self.oldest_annotation = stamp
                if stamp > self.newest_annotation:
                    self.newest_annotation = stamp

        if not saw_annotation:
            # Nothing was compared, so undo the reversed initial values
            self.oldest_annotation, self.newest_annotation = (
                self.newest_annotation, self.oldest_annotation)
开发者ID:DuskyRose,项目名称:calibre-marvin-manager,代码行数:28,代码来源:config.py

示例6: extract_calibre_cover

def extract_calibre_cover(raw, base, log):
    """Return the bytes of the cover image referenced by a cover page.

    ``raw`` is the markup of the page, ``base`` the directory that the
    image ``src`` paths are relative to. ``log`` is accepted for interface
    compatibility but is not used here.

    Two layouts are recognised, both only when the page contains none of
    the h1-h6, p, span, font or br tags:

    1. the page has exactly one ``<img>`` and its ``alt`` is "cover";
    2. the ``<body>`` has no non-whitespace text and exactly one ``<img>``
       with a ``src`` attribute.

    Returns the image file contents, or None when no layout matches or the
    referenced file does not exist.
    """
    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    def read_image(src):
        # Resolve a "/"-separated src against base; the file is closed
        # deterministically instead of being left to the garbage collector.
        path = os.path.join(base, *src.split("/"))
        if not os.path.exists(path):
            return None
        with open(path, "rb") as f:
            return f.read()

    soup = BeautifulSoup(raw)
    matches = soup.find(name=["h1", "h2", "h3", "h4", "h5", "h6", "p", "span", "font", "br"])
    images = soup.findAll("img")
    if matches is None and len(images) == 1 and images[0].get("alt", "") == "cover":
        data = read_image(images[0]["src"])
        if data is not None:
            return data

    # Look for a simple cover, i.e. a body with no text and only one <img> tag
    if matches is None:
        body = soup.find("body")
        if body is not None:
            text = u"".join(map(unicode, body.findAll(text=True)))
            if text.strip():
                # Body has text, abort
                return
            images = body.findAll("img", src=True)
            if len(images) == 1:
                data = read_image(images[0]["src"])
                if data is not None:
                    return data
开发者ID:naisonsouza,项目名称:calibre,代码行数:25,代码来源:__init__.py

示例7: extract_calibre_cover

def extract_calibre_cover(raw, base, log):
    '''
    Return the bytes of the cover image referenced by a cover page, or None.

    raw is the page markup and base the directory image paths are resolved
    against; log is accepted but not used by this function.

    A cover is only looked for when the page contains none of the h1-h6, p,
    span, font or br tags. It is then taken from either the single <img>
    whose alt is 'cover', or from the single <img src=...> of a body that
    has no non-whitespace text. The image file must exist on disk.
    '''
    from calibre.ebooks.BeautifulSoup import BeautifulSoup
    soup = BeautifulSoup(raw)
    matches = soup.find(name=['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span',
        'font', 'br'])
    images = soup.findAll('img')
    if matches is None and len(images) == 1 and \
            images[0].get('alt', '')=='cover':
        img = os.path.join(base, *images[0]['src'].split('/'))
        if os.path.exists(img):
            # Context manager closes the handle promptly (was leaked before)
            with open(img, 'rb') as f:
                return f.read()

    # Look for a simple cover, i.e. a body with no text and only one <img> tag
    if matches is None:
        body = soup.find('body')
        if body is not None:
            text = u''.join(map(unicode, body.findAll(text=True)))
            if text.strip():
                # Body has text, abort
                return
            images = body.findAll('img', src=True)
            if len(images) == 1:
                img = os.path.join(base, *images[0]['src'].split('/'))
                if os.path.exists(img):
                    with open(img, 'rb') as f:
                        return f.read()
开发者ID:yeyanchao,项目名称:calibre,代码行数:26,代码来源:__init__.py

示例8: get_series

def get_series(title, authors, timeout=60):
    """Look up series information for a book through the search at ``URL``.

    :param title: book title; a leading character listed in
        ``_ignore_starts`` and a leading article (A/The/An) are dropped
        before searching.
    :param authors: sequence of author names; only the first is used. For
        "Last, First" the part before the comma is searched, otherwise the
        last whitespace-separated word.
    :param timeout: timeout in seconds passed to the browser request.
    :return: a ``Metadata`` object for (title, authors). ``series`` and
        ``series_index`` are set only when the result page provides them;
        otherwise the object is returned unchanged.
    :raises Exception: with a "server busy" message when the request times
        out; other ``URLError`` instances are re-raised as is.
    """
    mi = Metadata(title, authors)
    if title and title[0] in _ignore_starts:
        title = title[1:]
    title = re.sub(r'^(A|The|An)\s+', '', title or '').strip()
    if not title:
        return mi
    if isinstance(title, unicode):
        title = title.encode('utf-8')

    title = urllib.quote_plus(title)

    # No author to search by: nothing can be looked up
    if not authors:
        return mi
    author = authors[0].strip()
    if not author:
        return mi
    if ',' in author:
        author = author.split(',')[0]
    else:
        author = author.split()[-1]

    url = URL.format(author, title)
    br = browser()
    try:
        raw = br.open_novisit(url, timeout=timeout).read()
    except URLError as e:
        if isinstance(e.reason, socket.timeout):
            raise Exception('KDL Server busy, try again later')
        raise
    if 'see the full results' not in raw:
        return mi
    raw = xml_to_unicode(raw)[0]
    soup = BeautifulSoup(raw)
    searcharea = soup.find('div', attrs={'class':'searcharea'})
    if searcharea is None:
        return mi
    ss = searcharea.find('div', attrs={'class':'seriessearch'})
    if ss is None:
        return mi
    a = ss.find('a', href=True)
    if a is None:
        return mi
    # The series name is carried in the query string of the link
    href = a['href'].partition('?')[-1]
    data = urlparse.parse_qs(href)
    series = data.get('SeriesName', [])
    if not series:
        return mi
    series = series[0]
    series = re.sub(r' series$', '', series).strip()
    if series:
        mi.series = series
    # The sibling may be missing or be a bare text node without .contents
    contents = getattr(ss.nextSibling, 'contents', None)
    if contents:
        raw = unicode(contents[0])
        # Text before the first "." is taken as the index within the series
        raw = raw.partition('.')[0].strip()
        try:
            mi.series_index = int(raw)
        except ValueError:
            # Not a number: leave series_index at its default
            pass
    return mi
开发者ID:MarioJC,项目名称:calibre,代码行数:59,代码来源:kdl.py

示例9: find_all_annotated_books

 def find_all_annotated_books(self):
     '''
     Append to self.annotation_map the id of every book matched by the
     'formats:EPUB' search whose self.field value contains a
     <div class="user_annotations"> element. Books whose field value is
     None are skipped.
     '''
     self._log_location("field: {0}".format(self.field))
     for book_id in self.cdb.search_getting_ids('formats:EPUB', ''):
         metadata = self.cdb.get_metadata(book_id, index_is_id=True)
         value = metadata.get_user_metadata(self.field, False)['#value#']
         if value is None:
             continue
         if BeautifulSoup(value).find('div', 'user_annotations') is not None:
             self.annotation_map.append(metadata.id)
开发者ID:DuskyRose,项目名称:calibre-marvin-manager,代码行数:13,代码来源:config.py

示例10: get_soup

    def get_soup(self, src, url=None):
        # Parse ``src`` into a BeautifulSoup tree and prune it according to
        # the keep_only_tags / remove_tags_after / remove_tags_before /
        # remove_tags settings on this object. ``url`` is only forwarded to
        # preprocess_raw_html. Returns whatever preprocess_html_ext(soup)
        # returns.

        # (pattern, replacement) pairs applied to the markup before parsing
        nmassage = []
        nmassage.extend(self.preprocess_regexps)
        # Remove comments as they can leave detritus when extracting tags leaves
        # multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        # Element 0 of the xml_to_unicode result is used as the markup
        # (presumably the decoded text; confirm against xml_to_unicode)
        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url)
        for pat, repl in nmassage:
            usrc = pat.sub(repl, usrc)
        soup = BeautifulSoup(usrc)

        # The hook may hand back replacement markup; if it does, that markup
        # gets the same regex treatment and is parsed in place of the first soup
        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
            for pat, repl in nmassage:
                replace = pat.sub(repl, replace)
            soup = BeautifulSoup(replace)

        if self.keep_only_tags:
            # Build a fresh <body> holding only the tags matched by the specs
            body = soup.new_tag('body')
            try:
                # A single dict spec is normalised to a one-element list; note
                # that this rewrites the attribute on self
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        # Append at the end of the new body
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
            # ``next`` is the name of a sibling attribute ('nextSibling' or
            # 'previousSibling' in the calls below). Starting at ``tag`` and
            # then moving up through its parents, keep extracting whatever
            # getattr(tag, next) yields until it is None. Stops at a tag
            # named 'body' or when there is no tag (None) left.
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    # Re-read the sibling from ``tag`` before extracting
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            # Accept either one dict spec or an iterable of specs
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                # find() may return None; remove_beyond then does nothing
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previousSibling')

        # Finally drop every tag matched by any remove_tags spec
        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
        return self.preprocess_html_ext(soup)
开发者ID:JimmXinu,项目名称:calibre,代码行数:56,代码来源:simple.py

示例11: get_asin

    def get_asin(self, connection):
        '''
        Search Amazon for "<title> - <author>" and record the book's ASIN.

        On success the ASIN is stored in self._asin, written to the book's
        'mobi-asin' identifier in the database and to the 'asin' pref of
        self._book_settings.

        connection: an open HTTP connection to Amazon. If the first request
            fails it is closed and a new connection (tunnelled through the
            proxy when self._proxy is set) is tried once.
        Returns the connection that was ultimately used, so the caller can
        keep using it.
        Raises Exception (after setting self._status to self.FAIL and
        self._status_message) when Amazon cannot be reached, when the search
        has no results, or when no ASIN can be found in the results.
        '''
        query = urlencode({'keywords': '%s - %s' % (self._title, self._author)})
        # query[9:] skips the first nine characters ("keywords=") so only the
        # quoted search terms are appended to the rh filter.
        search_path = '/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A' + query[9:] + '&' + query
        try:
            connection.request('GET', search_path, headers=self.HEADERS)
            response = connection.getresponse().read()
        except Exception:
            # Catch Exception, not everything, so that KeyboardInterrupt and
            # SystemExit are not turned into a retry or a connection error.
            try:
                connection.close()
                if self._proxy:
                    connection = HTTPConnection(self._http_address, self._http_port)
                    connection.set_tunnel('www.amazon.com', 80)
                else:
                    connection = HTTPConnection('www.amazon.com')

                connection.request('GET', search_path, headers=self.HEADERS)
                response = connection.getresponse().read()
            except Exception:
                self._status = self.FAIL
                self._status_message = self.FAILED_COULD_NOT_CONNECT_TO_AMAZON
                raise Exception(self._status_message)

        # check to make sure there are results
        if ('did not match any products' in response
                and 'Did you mean:' not in response
                and 'so we searched in All Departments' not in response):
            self._status = self.FAIL
            self._status_message = self.FAILED_COULD_NOT_FIND_AMAZON_PAGE
            raise Exception(self._status_message)

        soup = BeautifulSoup(response)
        results = soup.findAll('div', {'id': 'resultsCol'})

        if not results:
            self._status = self.FAIL
            self._status_message = self.FAILED_COULD_NOT_FIND_AMAZON_PAGE
            raise Exception(self._status_message)

        for r in results:
            markup = str(r)
            if 'Buy now with 1-Click' not in markup:
                continue
            asin_search = self.AMAZON_ASIN_PAT.search(markup)
            if asin_search:
                self._asin = asin_search.group(1)
                # Persist the ASIN both in the library and in the settings
                mi = self._db.get_metadata(self._book_id)
                identifiers = mi.get_identifiers()
                identifiers['mobi-asin'] = self._asin
                mi.set_identifiers(identifiers)
                self._db.set_metadata(self._book_id, mi)
                self._book_settings.prefs['asin'] = self._asin
                return connection

        self._status = self.FAIL
        self._status_message = self.FAILED_COULD_NOT_FIND_AMAZON_ASIN
        raise Exception(self._status_message)
开发者ID:gitter-badger,项目名称:X-Ray_Calibre_Plugin,代码行数:51,代码来源:book.py

示例12: _remove_old_style

 def _remove_old_style(self, html):
     '''
     Parse html, re-point one randomly chosen vocabulary link and drop the
     first <style> element found in <head>.

     The chosen <td> (one containing a link inside the 'vocabulary' div) is
     kept in self.td, its original href in self.oh, and the href is then
     replaced with the value of self._finalize(). Returns the parsed soup.
     '''
     soup = BeautifulSoup(html)
     head = soup.find("head")
     vocabulary = soup.body.find('div', {'class': 'vocabulary'})
     linked_cells = vocabulary.findAll(lambda tag: tag.name == 'td' and tag.a)
     self.td = linked_cells[random.randrange(len(linked_cells))]
     self.oh = self.td.a['href']
     self.td.a['href'] = self._finalize()
     stale_style = head.find('style')
     if stale_style:
         stale_style.extract()
     return soup
开发者ID:Philantrop,项目名称:calibre-marvin-manager,代码行数:16,代码来源:css_editor.py

示例13: _inject_css

 def _inject_css(self, html):
     '''
     stick a <style> element into html
     '''
     css = self.prefs.get('injected_css', None)
     if css:
         try:
             styled_soup = BeautifulSoup(html)
             head = styled_soup.find("head")
             style_tag = Tag(styled_soup, 'style')
             style_tag['type'] = "text/css"
             style_tag.insert(0, css)
             head.insert(0, style_tag)
             html = styled_soup.renderContents()
         except:
             return html
     return(html)
开发者ID:DuskyRose,项目名称:calibre-marvin-manager,代码行数:17,代码来源:dropbox.py

示例14: get_soup

    def get_soup(self, src, url=None):
        # Parse ``src`` into a BeautifulSoup tree and prune it according to
        # the keep_only_tags / remove_tags_after / remove_tags_before /
        # remove_tags settings on this object. ``url`` is only forwarded to
        # preprocess_raw_html. Returns whatever preprocess_html_ext(soup)
        # returns.

        # Work on a copy so the list stored on the BeautifulSoup class is not
        # extended by the additions below
        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
        nmassage.extend(self.preprocess_regexps)
        # Some websites have buggy doctype declarations that mess up beautifulsoup
        nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL|re.IGNORECASE), lambda m: '')]
        # Remove comments as they can leave detritus when extracting tags leaves
        # multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        # Element 0 of the xml_to_unicode result is used as the markup
        # (presumably the decoded text; confirm against xml_to_unicode)
        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url)
        # The (pattern, replacement) pairs are handed to the parser via
        # markupMassage rather than applied here
        soup = BeautifulSoup(usrc, markupMassage=nmassage)

        # The hook may hand back replacement markup; if it does, that markup
        # is parsed with the same massage list in place of the first soup
        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            soup = BeautifulSoup(xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)

        if self.keep_only_tags:
            # Build a fresh <body> holding only the tags matched by the specs
            body = Tag(soup, 'body')
            try:
                # A single dict spec is normalised to a one-element list; note
                # that this rewrites the attribute on self
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        # Append at the end of the new body
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
            # ``next`` is the name of a sibling attribute ('nextSibling' or
            # 'previousSibling' in the calls below). Starting at ``tag`` and
            # then moving up through its parents, keep extracting whatever
            # getattr(tag, next) yields until it is None. Stops at a tag
            # named 'body' or when there is no tag (None) left.
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    # Re-read the sibling from ``tag`` before extracting
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            # Accept either one dict spec or an iterable of specs
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                # find() may return None; remove_beyond then does nothing
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previousSibling')

        # Finally drop every tag matched by any remove_tags spec
        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
        return self.preprocess_html_ext(soup)
开发者ID:rlugojr,项目名称:calibre,代码行数:53,代码来源:simple.py

示例15: generate_html

    def generate_html(comments):
        # Render the jacket template with the enclosing scope's metadata and
        # the given comments, then strip header items that have no value.
        # Returns the markup as text with encoding declarations removed.
        args = {
            'xmlns': XHTML_NS,
            'title_str': title_str,
            'css': css,
            'title': title,
            'author': author,
            'publisher': publisher,
            'pubdate_label': _('Published'),
            'pubdate': pubdate,
            'series_label': _('Series'),
            'series': series,
            'rating_label': _('Rating'),
            'rating': rating,
            'tags_label': _('Tags'),
            'tags': tags,
            'comments': comments,
            'footer': '',
        }
        # Expose custom columns to the template as _name / _name_label;
        # a column that cannot be formatted is silently left out.
        for field_key in mi.custom_field_keys():
            try:
                label, value = mi.format_field_extended(field_key)[:2]
                slot = field_key.replace('#', '_')
                args[slot] = escape(value)
                args[slot+'_label'] = escape(label)
            except:
                pass

        # Used in the comment describing use of custom columns in templates
        args.setdefault('_genre_label', '{_genre_label}')
        args.setdefault('_genre', '{_genre}')

        template = P('jacket/template.xhtml', data=True).decode('utf-8')
        generated_html = template.format(**args)

        # Post-process the generated html to strip out empty header items

        soup = BeautifulSoup(generated_html)
        optional_items = (
            (series, 'cbj_series'),
            (rating, 'cbj_rating'),
            (tags, 'cbj_tags'),
            (pubdate, 'cbj_pubdata'),
        )
        for value, css_class in optional_items:
            if value:
                continue
            node = soup.find(attrs={'class':css_class})
            if node is not None:
                node.extract()
        if output_profile.short_name != 'kindle':
            banner_rule = soup.find('hr', attrs={'class':'cbj_kindle_banner_hr'})
            if banner_rule is not None:
                banner_rule.extract()

        rendered = soup.renderContents('utf-8').decode('utf-8')
        return strip_encoding_declarations(rendered)
开发者ID:Eksmo,项目名称:calibre,代码行数:56,代码来源:jacket.py


注:本文中的calibre.ebooks.BeautifulSoup.BeautifulSoup类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。