Python BeautifulSoup.Tag Class Code Examples

This article collects typical usage examples of the Tag class from calibre.ebooks.BeautifulSoup in Python. If you are wondering what exactly the Tag class does, how to use it, or what working examples look like, the hand-picked class examples below may help.


Fifteen code examples of the Tag class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
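
Before diving into the examples, the following minimal sketch shows the pattern most of them share when working with calibre's bundled BeautifulSoup 3-style API: build a Tag bound to an existing soup, set its attributes, insert children by position, and splice it into the tree. The markup and class names in this sketch are invented for illustration; only the API calls mirror what the examples below actually use.

    # Minimal sketch of the common Tag pattern (illustrative markup only)
    from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString

    soup = BeautifulSoup('<div class="caption">A photo caption</div>')
    caption = soup.find('div', {'class': 'caption'})

    # Build a replacement element bound to the same soup
    p_tag = Tag(soup, 'p', [('class', 'caption')])    # attributes as (name, value) pairs
    p_tag['style'] = 'font-style:italic'              # or set afterwards, dict-style
    p_tag.insert(0, NavigableString(caption.string))  # children are inserted by index

    caption.replaceWith(p_tag)                        # splice the new tag into the tree
    print(soup.renderContents())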

Example 1: postprocess_html

    def postprocess_html(self, soup, first_fetch):
        author_general = soup.find('span', { 'class': 'author_general' })
        author_general.em.extract()
    
        # the complete content
        full_div = None
    
        transcript_div = soup.find('div', { 'id': 'transcript' })
        if transcript_div: # that's an interview
            # get all <div class="qa" />
            qa_div_list = list(find_by_class(transcript_div, 'div', 'qa'))
            for qa_div in qa_div_list:
                qa_div.extract()
                
                # replace all <a class="question_link">...</a> with <strong>...</strong>
                question_link = qa_div.find('a', { 'class': 'question_link' })
                question_strong = Tag(soup, 'strong')
                question_strong.append(question_link.string)
                question_link.replaceWith(question_strong)
            
            full_div = find_by_class(soup.find('div', { 'id': 'content' }), 'div', 'presentation_full').next()
            
            # clean the <h1 />
            full_div.h1.span.extract()
            title_div = full_div.h1.div
            title_div.replaceWith(title_div.string)
            
            # clear the presentation area
            for div in full_div.findAll('div'):
                div.extract()
            
            # add qa list back to presentation area
            for qa_div in qa_div_list:
                full_div.append(qa_div)
        else:
            # text only without title
            text_div = find_by_class(soup, 'div', 'text_info').next()
            text_div.extract()
            
            for other in text_div.findAll('div'):
                other.extract()
            
            # full_div contains title
            full_div = soup.find('div', { 'id': 'content' })
            for other in full_div.findAll('div'):
                other.extract()
            
            full_div.append(text_div)

        # keep full_div in <body /> only
        full_div.extract()
        
        for other in soup.body:
            other.extract()
            
        soup.body.append(full_div)

        return soup
Developer: SkyRaker, Project: calibre-recipes, Lines: 58, Source: InfoQ.py

Example 2: get_soup

    def get_soup(self, src, url=None):
        nmassage = []
        nmassage.extend(self.preprocess_regexps)
        # Remove comments as they can leave detritus when extracting tags leaves
        # multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url)
        for pat, repl in nmassage:
            usrc = pat.sub(repl, usrc)
        set_soup_module(sys.modules[BeautifulSoup.__module__])
        soup = parse(usrc, return_root=False)

        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
            for pat, repl in nmassage:
                replace = pat.sub(repl, replace)
            soup = parse(replace, return_root=False)

        if self.keep_only_tags:
            body = Tag(soup, 'body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
        return self.preprocess_html_ext(soup)
Developer: artbycrunk, Project: calibre, Lines: 57, Source: simple.py

Example 3: get_soup

    def get_soup(self, src, url=None):
        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
        nmassage.extend(self.preprocess_regexps)
        # Some websites have buggy doctype declarations that mess up beautifulsoup
        nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL|re.IGNORECASE), lambda m: '')]
        # Remove comments as they can leave detritus when extracting tags leaves
        # multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url)
        soup = BeautifulSoup(usrc, markupMassage=nmassage)

        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            soup = BeautifulSoup(xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)

        if self.keep_only_tags:
            body = Tag(soup, 'body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
        return self.preprocess_html_ext(soup)
Developer: rlugojr, Project: calibre, Lines: 53, Source: simple.py

Example 4: _inject_css

 def _inject_css(self, html):
     '''
     stick a <style> element into html
     '''
     css = self.prefs.get('injected_css', None)
     if css:
         try:
             styled_soup = BeautifulSoup(html)
             head = styled_soup.find("head")
             style_tag = Tag(styled_soup, 'style')
             style_tag['type'] = "text/css"
             style_tag.insert(0, css)
             head.insert(0, style_tag)
             html = styled_soup.renderContents()
         except:
             return html
     return(html)
Developer: DuskyRose, Project: calibre-marvin-manager, Lines: 17, Source: dropbox.py

Example 5: inject_css

 def inject_css(self, html):
     '''
     stick a <style> element into html
     Deep View content structured differently
     <html style=""><body style="">
     '''
     css = str(self.css_pte.toPlainText())
     if css:
         raw_soup = self._remove_old_style(html)
         style_tag = Tag(raw_soup, 'style')
         style_tag['type'] = "text/css"
         style_tag.insert(0, css)
         head = raw_soup.find("head")
         head.insert(0, style_tag)
         self.styled_soup = raw_soup
         html = self.styled_soup.renderContents()
     return html
Developer: Philantrop, Project: calibre-marvin-manager, Lines: 17, Source: css_editor.py

Example 6: preview_css

    def preview_css(self):
        '''
        Construct a dummy set of notes and annotation for preview purposes
        Modeled after book_status:_get_formatted_annotations()
        '''
        from calibre_plugins.marvin_manager.annotations import (
            ANNOTATIONS_HTML_TEMPLATE, Annotation, Annotations, BookNotes, BookmarkNotes)

        # Assemble the preview soup
        soup = BeautifulSoup(ANNOTATIONS_HTML_TEMPLATE)

        # Load the CSS from MXD resources
        path = os.path.join(self.parent.opts.resources_path, 'css', 'annotations.css')
        with open(path, 'rb') as f:
            css = f.read().decode('utf-8')
        style_tag = Tag(soup, 'style')
        style_tag.insert(0, css)
        soup.head.style.replaceWith(style_tag)

        # Assemble the sample Book notes
        book_notes_soup = BookNotes().construct(self.sample_book_notes)
        soup.body.append(book_notes_soup)
        cd_tag = Tag(soup, 'div', [('class', "divider")])
        soup.body.append(cd_tag)

        # Assemble the sample Bookmark notes
        bookmark_notes_soup = BookmarkNotes().construct(self.sample_bookmark_notes)
        soup.body.append(bookmark_notes_soup)
        cd_tag = Tag(soup, 'div', [('class', "divider")])
        soup.body.append(cd_tag)

        # Assemble the sample annotations
        pas = Annotations(None, title="Preview")
        pas.annotations.append(Annotation(self.sample_ann_1))
        pas.annotations.append(Annotation(self.sample_ann_2))
        pas.annotations.append(Annotation(self.sample_ann_3))
        annotations_soup = pas.to_HTML(pas.create_soup())
        soup.body.append(annotations_soup)

        self.parent.wv.setHtml(unicode(soup.renderContents()))
Developer: DuskyRose, Project: calibre-marvin-manager, Lines: 40, Source: appearance.py

Example 7: construct

 def construct(self, book_notes):
     '''
     Given a list of notes, render HTML
     '''
     soup = None
     if book_notes:
         soup = BeautifulSoup('''<div class="{0}"></div>'''.format('book_notes'))
         for note in book_notes:
             div_tag = Tag(soup, 'div', [('class', "book_note")])
             p_tag = Tag(soup, 'p', [('class', "book_note"),
                                     ('style', "{0}".format(self._get_note_style()))])
             p_tag.append(note)
             div_tag.append(p_tag)
             soup.div.append(div_tag)
     return soup
Developer: DuskyRose, Project: calibre-marvin-manager, Lines: 15, Source: annotations.py
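
For orientation, here is a hedged sketch of how a construct() helper like this might be invoked; the note strings are invented, and the import simply mirrors the one shown in Example 6.

    # Hypothetical call to a BookNotes-style construct() helper (sample notes invented)
    from calibre_plugins.marvin_manager.annotations import BookNotes

    sample_book_notes = ['Finished chapter 3 on the train.', 'Re-read the epilogue before bed.']
    notes_soup = BookNotes().construct(sample_book_notes)
    if notes_soup is not None:
        # Yields nested markup along the lines of:
        # <div class="book_notes"><div class="book_note"><p class="book_note" style="...">...</p></div>...</div>
        print(notes_soup.renderContents())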

Example 8: comments_to_html

def comments_to_html(comments):
    '''
    Convert random comment text to normalized, xml-legal block of <p>s
    'plain text' returns as
    <p>plain text</p>

    'plain text with <i>minimal</i> <b>markup</b>' returns as
    <p>plain text with <i>minimal</i> <b>markup</b></p>

    '<p>pre-formatted text</p> returns untouched

    'A line of text\n\nFollowed by a line of text' returns as
    <p>A line of text</p>
    <p>Followed by a line of text</p>

    'A line of text.\nA second line of text.\rA third line of text' returns as
    <p>A line of text.<br />A second line of text.<br />A third line of text.</p>

    '...end of a paragraph.Somehow the break was lost...' returns as
    <p>...end of a paragraph.</p>
    <p>Somehow the break was lost...</p>

    Deprecated HTML returns as HTML via BeautifulSoup()

    '''
    if not comments:
        return u'<p></p>'
    if not isinstance(comments, unicode):
        comments = comments.decode(preferred_encoding, 'replace')

    if comments.lstrip().startswith('<'):
        # Comment is already HTML do not mess with it
        return comments

    if '<' not in comments:
        comments = prepare_string_for_xml(comments)
        parts = [u'<p class="description">%s</p>'%x.replace(u'\n', u'<br />')
                for x in comments.split('\n\n')]
        return '\n'.join(parts)

    if sanitize_pat.search(comments) is not None:
        try:
            return sanitize_comments_html(comments)
        except:
            import traceback
            traceback.print_exc()
            return u'<p></p>'

    # Explode lost CRs to \n\n
    comments = lost_cr_exception_pat.sub(lambda m: m.group().replace('.',
        '.\r'), comments)
    for lost_cr in lost_cr_pat.finditer(comments):
        comments = comments.replace(lost_cr.group(),
                                    '%s%s\n\n%s' % (lost_cr.group(1),
                                                    lost_cr.group(2),
                                                    lost_cr.group(3)))

    comments = comments.replace(u'\r', u'')
    # Convert \n\n to <p>s
    comments = comments.replace(u'\n\n', u'<p>')
    # Convert solo returns to <br />
    comments = comments.replace(u'\n', '<br />')
    # Convert two hyphens to emdash
    comments = comments.replace('--', '&mdash;')

    soup = BeautifulSoup(comments)
    result = BeautifulSoup()
    rtc = 0
    open_pTag = False

    all_tokens = list(soup.contents)
    for token in all_tokens:
        if type(token) is NavigableString:
            if not open_pTag:
                pTag = Tag(result,'p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc,prepare_string_for_xml(token))
            ptc += 1
        elif type(token) in (CData, Comment, Declaration,
                ProcessingInstruction):
            continue
        elif token.name in ['br', 'b', 'i', 'em', 'strong', 'span', 'font', 'a',
                'hr']:
            if not open_pTag:
                pTag = Tag(result,'p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        else:
            if open_pTag:
                result.insert(rtc, pTag)
                rtc += 1
                open_pTag = False
                ptc = 0
            result.insert(rtc, token)
            rtc += 1

    if open_pTag:
#.........(remaining code omitted).........
Developer: artbycrunk, Project: calibre, Lines: 101, Source: comments.py
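
The docstring above already spells out the intended conversions; the short sketch below simply exercises them. The import path is an assumption, and the commented output follows the docstring plus the p-tag class added in the function body.

    # Hedged usage sketch for comments_to_html (import path assumed)
    from calibre.library.comments import comments_to_html

    print(comments_to_html(u'plain text'))
    # <p class="description">plain text</p>

    print(comments_to_html(u'A line of text\n\nFollowed by a line of text'))
    # <p class="description">A line of text</p>
    # <p class="description">Followed by a line of text</p>

    print(comments_to_html(u'<p>pre-formatted text</p>'))  # already HTML: returned untouched
    # <p>pre-formatted text</p>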

Example 9: generate_annotation_html

    def generate_annotation_html(self, bookmark):
        from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString
        # Returns <div class="user_annotations"> ... </div>
        last_read_location = bookmark.last_read_location
        timestamp = datetime.datetime.utcfromtimestamp(bookmark.timestamp)
        percent_read = bookmark.percent_read

        ka_soup = BeautifulSoup()
        dtc = 0
        divTag = Tag(ka_soup,'div')
        divTag['class'] = 'user_annotations'

        # Add the last-read location
        spanTag = Tag(ka_soup, 'span')
        spanTag['style'] = 'font-weight:bold'
        if bookmark.book_format == 'pdf':
            spanTag.insert(0,NavigableString(
                _("%(time)s<br />Last Page Read: %(loc)d (%(pr)d%%)") % dict(
                    time=strftime(u'%x', timestamp.timetuple()),
                    loc=last_read_location,
                    pr=percent_read)))
        else:
            spanTag.insert(0,NavigableString(
                _("%(time)s<br />Last Page Read: Location %(loc)d (%(pr)d%%)") % dict(
                    time=strftime(u'%x', timestamp.timetuple()),
                    loc=last_read_location,
                    pr=percent_read)))

        divTag.insert(dtc, spanTag)
        dtc += 1
        divTag.insert(dtc, Tag(ka_soup,'br'))
        dtc += 1

        if bookmark.user_notes:
            user_notes = bookmark.user_notes
            annotations = []

            # Add the annotations sorted by location
            # Italicize highlighted text
            for location in sorted(user_notes):
                if user_notes[location]['text']:
                    annotations.append(
                            _('<b>Location %(dl)d &bull; %(typ)s</b><br />%(text)s<br />') % dict(
                                dl=user_notes[location]['displayed_location'],
                                typ=user_notes[location]['type'],
                                text=(user_notes[location]['text'] if
                                      user_notes[location]['type'] == 'Note' else
                                      '<i>%s</i>' % user_notes[location]['text'])))
                else:
                    if bookmark.book_format == 'pdf':
                        annotations.append(
                                _('<b>Page %(dl)d &bull; %(typ)s</b><br />') % dict(
                                    dl=user_notes[location]['displayed_location'],
                                    typ=user_notes[location]['type']))
                    else:
                        annotations.append(
                                _('<b>Location %(dl)d &bull; %(typ)s</b><br />') % dict(
                                    dl=user_notes[location]['displayed_location'],
                                    typ=user_notes[location]['type']))

            for annotation in annotations:
                divTag.insert(dtc, annotation)
                dtc += 1

        ka_soup.insert(0,divTag)
        return ka_soup
Developer: GaryMMugford, Project: calibre, Lines: 66, Source: driver.py

Example 10: generate_annotation_html

    def generate_annotation_html(self, bookmark):
        from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString

        # Returns <div class="user_annotations"> ... </div>
        last_read_location = bookmark.last_read_location
        timestamp = datetime.datetime.utcfromtimestamp(bookmark.timestamp)
        percent_read = bookmark.percent_read

        ka_soup = BeautifulSoup()
        dtc = 0
        divTag = Tag(ka_soup, "div")
        divTag["class"] = "user_annotations"

        # Add the last-read location
        spanTag = Tag(ka_soup, "span")
        spanTag["style"] = "font-weight:bold"
        if bookmark.book_format == "pdf":
            spanTag.insert(
                0,
                NavigableString(
                    _("%(time)s<br />Last Page Read: %(loc)d (%(pr)d%%)")
                    % dict(time=strftime(u"%x", timestamp.timetuple()), loc=last_read_location, pr=percent_read)
                ),
            )
        else:
            spanTag.insert(
                0,
                NavigableString(
                    _("%(time)s<br />Last Page Read: Location %(loc)d (%(pr)d%%)")
                    % dict(time=strftime(u"%x", timestamp.timetuple()), loc=last_read_location, pr=percent_read)
                ),
            )

        divTag.insert(dtc, spanTag)
        dtc += 1
        divTag.insert(dtc, Tag(ka_soup, "br"))
        dtc += 1

        if bookmark.user_notes:
            user_notes = bookmark.user_notes
            annotations = []

            # Add the annotations sorted by location
            # Italicize highlighted text
            for location in sorted(user_notes):
                if user_notes[location]["text"]:
                    annotations.append(
                        _("<b>Location %(dl)d &bull; %(typ)s</b><br />%(text)s<br />")
                        % dict(
                            dl=user_notes[location]["displayed_location"],
                            typ=user_notes[location]["type"],
                            text=(
                                user_notes[location]["text"]
                                if user_notes[location]["type"] == "Note"
                                else "<i>%s</i>" % user_notes[location]["text"]
                            ),
                        )
                    )
                else:
                    if bookmark.book_format == "pdf":
                        annotations.append(
                            _("<b>Page %(dl)d &bull; %(typ)s</b><br />")
                            % dict(dl=user_notes[location]["displayed_location"], typ=user_notes[location]["type"])
                        )
                    else:
                        annotations.append(
                            _("<b>Location %(dl)d &bull; %(typ)s</b><br />")
                            % dict(dl=user_notes[location]["displayed_location"], typ=user_notes[location]["type"])
                        )

            for annotation in annotations:
                divTag.insert(dtc, annotation)
                dtc += 1

        ka_soup.insert(0, divTag)
        return ka_soup
Developer: john-peterson, Project: calibre, Lines: 76, Source: driver.py

Example 11: rebuild_collections

    def rebuild_collections(self, booklist, oncard):
        '''
        For each book in the booklist for the card oncard, remove it from all
        its current collections, then add it to the collections specified in
        device_collections.

        oncard is None for the main memory, carda for card A, cardb for card B,
        etc.

        booklist is the object created by the :method:`books` call above.

        This is called after the user edits the 'Collections' field in the Device view
        when Metadata management is set to 'Manual'.

        '''
        self._log_location()

        command_name = "rebuild_collections"
        command_element = "rebuildcollections"
        command_soup = BeautifulStoneSoup(self.parent.COMMAND_XML.format(
            command_element, time.mktime(time.localtime())))

        LOCAL_DEBUG = False
        if booklist:
            changed = 0
            for book in booklist:
                if LOCAL_DEBUG:
                    self._log("{0:7} {1}".format(book.in_library, book.title))

                filename = self.parent.path_template.format(book.uuid)
                if filename not in self.parent.cached_books:
                    for fn in self.parent.cached_books:
                        if book.uuid and book.uuid == self.parent.cached_books[fn]['uuid']:
                            if LOCAL_DEBUG:
                                self._log("'%s' matched on uuid %s" % (book.title, book.uuid))
                            filename = fn
                            break
                        elif (book.title == self.parent.cached_books[fn]['title'] and
                              book.authors == self.parent.cached_books[fn]['authors']):
                            if LOCAL_DEBUG:
                                self._log("'%s' matched on title/author" % book.title)
                            filename = fn
                            break
                    else:
                        self._log("ERROR: file %s not found in cached_books" % repr(filename))
                        continue

                cached_collections = self.parent.cached_books[filename]['device_collections']
                if cached_collections != book.device_collections:
                    # Append the changed book info to the command file
                    book_tag = Tag(command_soup, 'book')
                    book_tag['filename'] = filename
                    book_tag['title'] = book.title
                    book_tag['author'] = ', '.join(book.authors)
                    book_tag['uuid'] = book.uuid

                    collections_tag = Tag(command_soup, 'collections')
                    for tag in book.device_collections:
                        c_tag = Tag(command_soup, 'collection')
                        c_tag.insert(0, tag)
                        collections_tag.insert(0, c_tag)
                    book_tag.insert(0, collections_tag)

                    command_soup.manifest.insert(0, book_tag)

                    # Update cache
                    self.parent.cached_books[filename]['device_collections'] = book.device_collections

                    changed += 1

            if changed:
                # Stage the command file
                self.parent._stage_command_file(command_name, command_soup,
                                                show_command=self.parent.prefs.get('development_mode', False))

                # Wait for completion
                self.parent._wait_for_command_completion(command_name)
            else:
                self._log("no collection changes detected cached_books <=> device books")
Developer: kmshi, Project: calibre-ios-reader-applications, Lines: 79, Source: __init__.py
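
The nested Tag calls above assemble, for each changed book, an XML fragment inside the command soup that is then staged for the reader application. Below is a hedged, stand-alone sketch of that assembly step; the manifest wrapper, title, and collection names are invented, and BeautifulStoneSoup is assumed to be importable from the same calibre.ebooks.BeautifulSoup module.

    # Hedged sketch of building one <book> fragment for the command file
    from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag

    command_soup = BeautifulStoneSoup('<manifest></manifest>')

    book_tag = Tag(command_soup, 'book')
    book_tag['filename'] = 'some-uuid.epub'   # invented value
    book_tag['title'] = 'A Sample Title'      # invented value

    collections_tag = Tag(command_soup, 'collections')
    for name in ['Fiction', 'To Read']:       # invented collection names
        c_tag = Tag(command_soup, 'collection')
        c_tag.insert(0, name)
        collections_tag.insert(0, c_tag)
    book_tag.insert(0, collections_tag)

    command_soup.manifest.insert(0, book_tag)
    print(command_soup.prettify())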

Example 12: postprocess_html

    def postprocess_html(self, soup, first_fetch):
        try:
                if self.one_picture_per_article:
                        # Remove all images after first
                        largeImg = soup.find(True, {'class':'articleSpanImage'})
                        inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
                        if largeImg:
                                for inlineImg in inlineImgs:
                                        inlineImg.extract()
                        else:
                                if inlineImgs:
                                        firstImg = inlineImgs[0]
                                        for inlineImg in inlineImgs[1:]:
                                                inlineImg.extract()
                                        # Move firstImg before article body
                                        cgFirst = soup.find(True, {'class':re.compile('columnGroup  *first')})
                                        if cgFirst:
                                                # Strip all sibling NavigableStrings: noise
                                                navstrings = cgFirst.findAll(text=True, recursive=False)
                                                [ns.extract() for ns in navstrings]
                                                headline_found = False
                                                tag = cgFirst.find(True)
                                                insertLoc = 0
                                                while True:
                                                        insertLoc += 1
                                                        if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
                                                                        headline_found = True
                                                                        break
                                                        tag = tag.nextSibling
                                                        if not tag:
                                                                headline_found = False
                                                                break
                                                if headline_found:
                                                        cgFirst.insert(insertLoc,firstImg)
                                        else:
                                                self.log(">>> No class:'columnGroup first' found <<<")
        except:
                self.log("ERROR: One picture per article in postprocess_html")

        try:
                # Change captions to italic
                for caption in soup.findAll(True, {'class':'caption'}) :
                        if caption and len(caption) > 0:
                                cTag = Tag(soup, "p", [("class", "caption")])
                                c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
                                mp_off = c.find("More Photos")
                                if mp_off >= 0:
                                        c = c[:mp_off]
                                cTag.insert(0, c)
                                caption.replaceWith(cTag)
        except:
                self.log("ERROR:  Problem in change captions to italic")

        try:
                # Change <nyt_headline> to <h2>
                h1 = soup.find('h1')
                blogheadline = str(h1) #added for dealbook
                if h1:
                        headline = h1.find("nyt_headline")
                        if headline:
                                tag = Tag(soup, "h2")
                                tag['class'] = "headline"
                                tag.insert(0, self.fixChars(headline.contents[0]))
                                h1.replaceWith(tag)
                        elif blogheadline.find('entry-title'):#added for dealbook
                                tag = Tag(soup, "h2")#added for dealbook
                                tag['class'] = "headline"#added for dealbook
                                tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook
                                h1.replaceWith(tag)#added for dealbook

                else:
                        # Blog entry - replace headline, remove <hr> tags  - BCC I think this is no longer functional 1-18-2011
                        headline = soup.find('title')
                        if headline:
                                tag = Tag(soup, "h2")
                                tag['class'] = "headline"
                                tag.insert(0, self.fixChars(headline.renderContents()))
                                soup.insert(0, tag)
                                hrs = soup.findAll('hr')
                                for hr in hrs:
                                        hr.extract()
        except:
                self.log("ERROR:  Problem in Change <nyt_headline> to <h2>")

        try:
                #if this is from a blog (dealbook, fix the byline format
                bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
                if bylineauthor:
                    tag = Tag(soup, "h6")
                    tag['class'] = "byline"
                    tag.insert(0, self.fixChars(bylineauthor.renderContents()))
                    bylineauthor.replaceWith(tag)
        except:
            self.log("ERROR:  fixing byline author format")

        try:
                #if this is a blog (dealbook) fix the credit style for the pictures
                blogcredit = soup.find('div',attrs={'class':'credit'})
                if blogcredit:
                    tag = Tag(soup, "h6")
#.........(remaining code omitted).........
Developer: seekfindcn, Project: python, Lines: 101, Source: cnnytimes.py

Example 13: postprocess_html

    def postprocess_html(self, soup, first_fetch):

        if self.one_picture_per_article:
            # Remove all images after first
            largeImg = soup.find(True, {'class':'articleSpanImage'})
            inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
            if largeImg:
                for inlineImg in inlineImgs:
                    inlineImg.extract()
            else:
                if inlineImgs:
                    firstImg = inlineImgs[0]
                    for inlineImg in inlineImgs[1:]:
                        inlineImg.extract()
                    # Move firstImg before article body
                    article_body = soup.find(True, {'id':'articleBody'})
                    cgFirst = soup.find(True, {'class':re.compile('columnGroup  *first')})
                    if cgFirst:
                        # Strip all sibling NavigableStrings: noise
                        navstrings = cgFirst.findAll(text=True, recursive=False)
                        [ns.extract() for ns in navstrings]
                        headline_found = False
                        tag = cgFirst.find(True)
                        insertLoc = 0
                        while True:
                            insertLoc += 1
                            if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
                                    headline_found = True
                                    break
                            tag = tag.nextSibling
                            if not tag:
                                headline_found = False
                                break
                        if headline_found:
                            cgFirst.insert(insertLoc,firstImg)
                    else:
                        self.log(">>> No class:'columnGroup first' found <<<")

        # Change captions to italic 
        for caption in soup.findAll(True, {'class':'caption'}) :
            if caption and caption.contents[0]:
                cTag = Tag(soup, "p", [("class", "caption")])
                c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
                mp_off = c.find("More Photos")
                if mp_off >= 0:
                    c = c[:mp_off]
                cTag.insert(0, c)
                caption.replaceWith(cTag)

        # Change <nyt_headline> to <h2>
        h1 = soup.find('h1')
        if h1:
            headline = h1.find("nyt_headline")
            if headline:
                tag = Tag(soup, "h2")
                tag['class'] = "headline"
                tag.insert(0, self.fixChars(headline.contents[0]))
                h1.replaceWith(tag)
        else:
            # Blog entry - replace headline, remove <hr> tags
            headline = soup.find('title')
            if headline:
                tag = Tag(soup, "h2")
                tag['class'] = "headline"
                tag.insert(0, self.fixChars(headline.contents[0]))
                soup.insert(0, tag)
                hrs = soup.findAll('hr')
                for hr in hrs:
                    hr.extract()

        # Change <h1> to <h3> - used in editorial blogs
        masthead = soup.find("h1")
        if masthead:
            # Nuke the href
            if masthead.a:
                del(masthead.a['href'])
            tag = Tag(soup, "h3")
            tag.insert(0, self.fixChars(masthead.contents[0]))
            masthead.replaceWith(tag)

        # Change <span class="bold"> to <b>
        for subhead in soup.findAll(True, {'class':'bold'}) :
            if subhead.contents:
                bTag = Tag(soup, "b")
                bTag.insert(0, subhead.contents[0])
                subhead.replaceWith(bTag)

        divTag = soup.find('div',attrs={'id':'articleBody'})
        if divTag:
            divTag['class'] = divTag['id']

        # Add class="authorId" to <div> so we can format with CSS
        divTag = soup.find('div',attrs={'id':'authorId'})
        if divTag and divTag.contents[0]:
            tag = Tag(soup, "p")
            tag['class'] = "authorId"
            tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
                             use_alt=False)))
            divTag.replaceWith(tag)

#.........(remaining code omitted).........
Developer: seekfindcn, Project: python, Lines: 101, Source: nytimes_today.py

Example 14: to_HTML

    def to_HTML(self, header=''):
        '''
        Generate HTML with user-specified CSS, element order
        '''
        # Retrieve CSS prefs
        from calibre_plugins.annotations.appearance import default_elements
        stored_css = plugin_prefs.get('appearance_css', default_elements)

        elements = []
        for element in stored_css:
            elements.append(element['name'])
            if element['name'] == 'Note':
                note_style = re.sub('\n', '', element['css'])
            elif element['name'] == 'Text':
                text_style = re.sub('\n', '', element['css'])
            elif element['name'] == 'Timestamp':
                ts_style = re.sub('\n', '', element['css'])

        # Additional CSS for timestamp color and bg to be formatted
        datetime_style = ("background-color:{0};color:{1};" + ts_style)

        # Order the elements according to stored preferences
        comments_body = ''
        for element in elements:
            if element == 'Text':
                comments_body += '{text}'
            elif element == 'Note':
                comments_body += '{note}'
            elif element == 'Timestamp':
                ts_css = '''<table cellpadding="0" width="100%" style="{ts_style}" color="{color}">
                                <tr>
                                    <td class="location" style="text-align:left">{location}</td>
                                    <td class="timestamp" uts="{unix_timestamp}" style="text-align:right">{friendly_timestamp}</td>
                                </tr>
                            </table>'''
                comments_body += re.sub(r'>\s+<', r'><', ts_css)

        if self.annotations:
            soup = BeautifulSoup(ANNOTATIONS_HEADER)
            dtc = 0

            # Add the annotations
            for i, agroup in enumerate(sorted(self.annotations, key=self._annotation_sorter)):
                location = agroup.location
                if location is None:
                    location = ''

                friendly_timestamp = self._timestamp_to_datestr(agroup.timestamp)

                text = ''
                if agroup.text:
                    for agt in agroup.text:
                        text += '<p class="highlight" style="{0}">{1}</p>'.format(text_style, agt)

                note = ''
                if agroup.note:
                    for agn in agroup.note:
                        note += '<p class="note" style="{0}">{1}</p>'.format(note_style, agn)

                try:
                    dt_bgcolor = COLOR_MAP[agroup.highlightcolor]['bg']
                    dt_fgcolor = COLOR_MAP[agroup.highlightcolor]['fg']
                except:
                    if agroup.highlightcolor is None:
                        msg = "No highlight color specified, using Default"
                    else:
                        msg = "Unknown color '%s' specified" % agroup.highlightcolor
                    self._log_location(msg)
                    dt_bgcolor = COLOR_MAP['Default']['bg']
                    dt_fgcolor = COLOR_MAP['Default']['fg']

                if agroup.hash is not None:
                    # Use existing hash when re-rendering
                    hash = agroup.hash
                else:
                    m = hashlib.md5()
                    m.update(text)
                    m.update(note)
                    hash = m.hexdigest()

                divTag = Tag(BeautifulSoup(), 'div')
                content_args = {
                            'color': agroup.highlightcolor,
                            'friendly_timestamp': friendly_timestamp,
                            'location': location,
                            'note': note,
                            'text': text,
                            'ts_style': datetime_style.format(dt_bgcolor, dt_fgcolor),
                            'unix_timestamp': agroup.timestamp,
                            }
                divTag.insert(0, comments_body.format(**content_args))
                divTag['class'] = "annotation"
                divTag['genre'] = ''
                if agroup.genre:
                    divTag['genre'] = escape(agroup.genre)
                divTag['hash'] = hash
                divTag['location_sort'] = agroup.location_sort
                divTag['reader'] = agroup.reader_app
                divTag['style'] = ANNOTATION_DIV_STYLE
                soup.div.insert(dtc, divTag)
#.........(remaining code omitted).........
Developer: mkarpiarz, Project: calibre-annotations, Lines: 101, Source: annotations.py

Example 15: comments_to_html

def comments_to_html(comments):
    """
    Convert random comment text to normalized, xml-legal block of <p>s
    'plain text' returns as
    <p>plain text</p>

    'plain text with <i>minimal</i> <b>markup</b>' returns as
    <p>plain text with <i>minimal</i> <b>markup</b></p>

    '<p>pre-formatted text</p> returns untouched

    'A line of text\n\nFollowed by a line of text' returns as
    <p>A line of text</p>
    <p>Followed by a line of text</p>

    'A line of text.\nA second line of text.\rA third line of text' returns as
    <p>A line of text.<br />A second line of text.<br />A third line of text.</p>

    '...end of a paragraph.Somehow the break was lost...' returns as
    <p>...end of a paragraph.</p>
    <p>Somehow the break was lost...</p>

    Deprecated HTML returns as HTML via BeautifulSoup()

    """
    if not comments:
        return u"<p></p>"
    if not isinstance(comments, unicode):
        comments = comments.decode(preferred_encoding, "replace")

    if comments.lstrip().startswith("<"):
        # Comment is already HTML do not mess with it
        return comments

    if "<" not in comments:
        comments = prepare_string_for_xml(comments)
        parts = [u'<p class="description">%s</p>' % x.replace(u"\n", u"<br />") for x in comments.split("\n\n")]
        return "\n".join(parts)

    if sanitize_pat.search(comments) is not None:
        try:
            return sanitize_comments_html(comments)
        except:
            import traceback

            traceback.print_exc()
            return u"<p></p>"

    # Explode lost CRs to \n\n
    comments = lost_cr_exception_pat.sub(lambda m: m.group().replace(".", ".\r"), comments)
    for lost_cr in lost_cr_pat.finditer(comments):
        comments = comments.replace(
            lost_cr.group(), "%s%s\n\n%s" % (lost_cr.group(1), lost_cr.group(2), lost_cr.group(3))
        )

    comments = comments.replace(u"\r", u"")
    # Convert \n\n to <p>s
    comments = comments.replace(u"\n\n", u"<p>")
    # Convert solo returns to <br />
    comments = comments.replace(u"\n", "<br />")
    # Convert two hyphens to emdash
    comments = comments.replace("--", "&mdash;")

    soup = BeautifulSoup(comments)
    result = BeautifulSoup()
    rtc = 0
    open_pTag = False

    all_tokens = list(soup.contents)
    for token in all_tokens:
        if type(token) is NavigableString:
            if not open_pTag:
                pTag = Tag(result, "p")
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, prepare_string_for_xml(token))
            ptc += 1
        elif type(token) in (CData, Comment, Declaration, ProcessingInstruction):
            continue
        elif token.name in ["br", "b", "i", "em", "strong", "span", "font", "a", "hr"]:
            if not open_pTag:
                pTag = Tag(result, "p")
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        else:
            if open_pTag:
                result.insert(rtc, pTag)
                rtc += 1
                open_pTag = False
                ptc = 0
            result.insert(rtc, token)
            rtc += 1

    if open_pTag:
        result.insert(rtc, pTag)

    for p in result.findAll("p"):
        p["class"] = "description"
#.........(remaining code omitted).........
Developer: GRiker, Project: calibre, Lines: 101, Source: comments.py


Note: The calibre.ebooks.BeautifulSoup.Tag class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers, and copyright in the source code remains with the original authors. Refer to the corresponding project's License before distributing or using the code; do not reproduce this article without permission.