当前位置: 首页>>代码示例>>Python>>正文

Python bs4.Comment方法代码示例

本文整理汇总了Python中bs4.Comment方法的典型用法代码示例。如果您正苦于以下问题:Python bs4.Comment方法的具体用法?Python bs4.Comment怎么用?Python bs4.Comment使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在bs4的用法示例。


示例1: clean_contents

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def clean_contents(self, div):
    """Strip unwanted markup from ``div`` in place and return it.

    Removes HTML comments, the first of two directly adjacent ``<br>``
    tags, tags whose name is in ``self.bad_tags``, tags without visible
    text and tags whose text ``self.is_blacklisted`` accepts. The
    attributes of ``div`` and of every tag that is kept are cleared.

    Args:
        div: A BeautifulSoup tag, or a falsy value.

    Returns:
        The same ``div`` object; a falsy ``div`` is returned untouched.
    """
    if not div:
        return div
    # end if
    div.attrs = {}

    # findAll(True) yields Tag objects only, so comments can never show up
    # in the loop below; they need their own text search.
    for comment in div.findAll(text=lambda node: isinstance(node, Comment)):
        comment.extract()   # Remove comments
    # end for

    for tag in div.findAll(True):
        if tag.name == 'br':
            next_tag = getattr(tag, 'next_sibling', None)
            if next_tag and getattr(next_tag, 'name', None) == 'br':
                tag.extract()   # Collapse consecutive line breaks
            # end if
        elif tag.name in self.bad_tags:
            tag.extract()   # Remove bad tags
        elif not tag.text.strip():
            tag.extract()   # Remove empty tags
        elif self.is_blacklisted(tag.text):
            tag.extract()   # Remove blacklisted contents
        elif hasattr(tag, 'attrs'):
            tag.attrs = {}    # Remove attributes
        # end if
    # end for
    return div
# end def

示例2: wordpressFuncXml

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def wordpressFuncXml(data):
    """Detect WordPress from the first HTML comment of a response.

    The body in ``data.text`` is parsed with the lxml parser. If the
    document contains at least one comment it is considered a WordPress
    page, and the first dotted version number found in the first comment
    is compared with ``WORDPRESS_LAST_CMS_VERSION``.

    Args:
        data: Object exposing the response body as ``data.text``.

    Returns:
        Tuple ``(cms, version_match)``: ``cms`` is True when a comment was
        found; ``version_match`` is the version string, or None when no
        version was found or any error occurred.
    """
    cms = False
    version_match = None
    try:
        soup = BeautifulSoup(data.text, 'lxml')
        comments = soup.findAll(text=lambda text: isinstance(text, Comment))

        if comments:
            cms = True
            versions = re.findall(r'(?:(\d+\.[.\d]*\d+))', comments[0])
            # Keep None (not an empty list) when no version is present.
            version_match = versions[0] if versions else None
            if version_match != WORDPRESS_LAST_CMS_VERSION:
                print("The version wordpress is outdated or not identified")
            else:
                print("The version wordpress is updated")
    except Exception as e:
        # Best-effort detection: report the problem and fall through.
        print(e)
        version_match = None

    return cms, version_match

示例3: normalize_text_sections

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def normalize_text_sections(div):
    """Flatten the direct children of ``div`` into one line of text.

    Comments contribute no text, strings are used as-is and tags
    contribute their ``.text``. Each piece is stripped and followed by a
    single space; afterwards carriage returns are dropped and newlines are
    turned into ``', '``.

    Args:
        div: Object exposing its children as ``div.contents``.

    Returns:
        The joined, stripped paragraph string.
    """
    pieces = []
    for content in div.contents:
        # Comment subclasses NavigableString, so it must be tested first.
        if isinstance(content, Comment):
            text = ''
        elif isinstance(content, NavigableString):
            text = content
        else:
            text = content.text
        pieces.append(text.strip() + ' ')
    paragraph = ''.join(pieces).strip()
    paragraph = paragraph.replace('\r', '')
    paragraph = paragraph.replace('\n', ', ')
    return paragraph.strip()

示例4: normalize_text_sections

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def normalize_text_sections(div):
    """Flatten the direct children of ``div`` into one line of text.

    Comments contribute no text, strings are used as-is and tags
    contribute their ``.text``. Each piece is stripped and followed by a
    single space; afterwards carriage returns are dropped, newlines are
    turned into ``', '`` and double spaces are replaced by single ones
    (one pass only).

    Args:
        div: Object exposing its children as ``div.contents``.

    Returns:
        The joined, stripped paragraph string.
    """
    pieces = []
    for content in div.contents:
        # Comment subclasses NavigableString, so it must be tested first.
        if isinstance(content, Comment):
            text = ''
        elif isinstance(content, NavigableString):
            text = content
        else:
            text = content.text
        pieces.append(text.strip() + ' ')
    paragraph = ''.join(pieces).strip()
    paragraph = paragraph.replace('\r', '')
    paragraph = paragraph.replace('\n', ', ')
    paragraph = paragraph.replace('  ', ' ')
    return paragraph.strip()

示例5: standings

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def standings(season=None):
    """Return the standings tables for ``season`` as DataFrames.

    Args:
        season: Year to look up; defaults to the current year.

    Returns:
        A list of ``pandas.DataFrame`` objects, one per table returned by
        ``get_tables``. The first row of each table becomes the column
        labels and is removed from the data.

    Raises:
        ValueError: If ``season`` is earlier than 1871.
    """
    # get most recent standings if date not specified
    if season is None:
        season = datetime.datetime.today().year
    if season < 1871:
        raise ValueError("This query currently only returns standings until the 1871 season. Try looking at years from 1871 to present.")
    # retrieve html from baseball reference
    soup = get_soup(season)
    if season >= 1969:
        tables = get_tables(soup, season)
    else:
        # For earlier seasons the markup holding the tables is taken from
        # one of the page's HTML comments and parsed on its own.
        t = soup.find_all(string=lambda text: isinstance(text, Comment))
        # list of seasons whose table placement breaks the site's usual pattern
        exceptions = [1884, 1885, 1886, 1887, 1888, 1889, 1890, 1892, 1903]
        if season > 1904 or season in exceptions:
            code = BeautifulSoup(t[16], "lxml")
        else:
            code = BeautifulSoup(t[15], "lxml")
        tables = get_tables(code, season)

    frames = []
    for table in tables:
        frame = pd.DataFrame(table)
        # Promote the first row to the header, then drop it from the body.
        frame = frame.rename(columns=frame.iloc[0])
        frame = frame.reindex(frame.index.drop(0))
        frames.append(frame)
    return frames

示例6: get_overrides

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def get_overrides(self, soup):
    """
    Look for overrides in the text to make exceptions for specific style
    rules. Returns a set of rule strings to ignore for this block.

    Every HTML comment in ``soup`` is matched against
    ``OVERRIDE_COMMENT_REGEX``; group 1 of a match is split on commas and
    each stripped item is added to the result.
    """
    overrides = set()
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    for comment in comments:
        m = re.match(OVERRIDE_COMMENT_REGEX, comment)
        if m:
            new_overrides = {o.strip() for o in m.group(1).split(",")}
            # Lazy %-args: the message is only built if INFO is enabled.
            logger.info("Overrides found: %s", new_overrides)
            overrides |= new_overrides

    return overrides

示例7: _fetch

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def _fetch(url: str) -> BeautifulSoup:
    """Download ``url`` and return a cleaned-up parse tree.

    The page is requested with a ``Mozilla/5.0`` User-Agent and parsed
    with ``html.parser``. Script/media/form/style tags are removed, a
    fixed list of attributes plus every ``data-*`` attribute is stripped
    from the remaining tags, and HTML comments are removed.

    Args:
        url: Address of the page to download.

    Returns:
        The cleaned ``BeautifulSoup`` document.
    """
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})

    with urllib.request.urlopen(req) as response:
        html = response.read()

    page = BeautifulSoup(html, "html.parser")

    # NOTE(review): the loop bodies were missing in the source; decompose()
    # / extract() are assumed from the "ignored_tag" naming - confirm.
    for ignored_tag in ["script", "img", "input", "button", "style", "font", "iframe", "object", "embed"]:
        for tag in page.find_all(ignored_tag):
            tag.decompose()

    for tag in page.find_all(recursive=True):
        for attribute in ["class", "id", "name", "style", "role", "lang", "dir", "href", "src"]:
            # pop() tolerates tags that do not carry the attribute.
            tag.attrs.pop(attribute, None)
        # Iterate over a copy because entries are deleted on the way.
        for attribute in list(tag.attrs):
            if attribute.startswith("data-"):
                del tag.attrs[attribute]

    for node in page.find_all(text=lambda s: isinstance(s, Comment)):
        node.extract()

    return page

示例8: _fetchWebpage

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def _fetchWebpage(self, url):
    """Download ``url`` and return it parsed, without bad tags or comments.

    When ``isMac`` is set the page is fetched with ``urlopen`` and an
    unverified SSL context; otherwise ``get`` is used with the User-Agent
    from ``self.settings['userAgent']``. Tags named in
    ``self.settings['badTags']`` and all HTML comments are removed.

    Args:
        url: Address of the page to download.

    Returns:
        The cleaned ``BeautifulSoup`` document.
    """
    if isMac:
        context = _create_unverified_context()
        html = urlopen(url, context=context).read()
    else:
        headers = {'User-Agent': self.settings['userAgent']}
        html = get(url, headers=headers).content

    webpage = BeautifulSoup(html, 'html.parser')

    # NOTE(review): the loop bodies were missing in the source; decompose()
    # / extract() are assumed from the "badTags" naming - confirm.
    for tagName in self.settings['badTags']:
        for tag in webpage.find_all(tagName):
            tag.decompose()

    for c in webpage.find_all(text=lambda s: isinstance(s, Comment)):
        c.extract()

    return webpage

示例9: find_comments_in_html_by_urls

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def find_comments_in_html_by_urls(self, urls):
    """Collect the HTML comments found on each of ``urls``.

    Every URL is requested through ``self.zap.send_request``; if that
    fails for any reason the page is fetched with ``requests.get``
    instead. Pages with an empty body are skipped.

    Args:
        urls: Iterable of URL strings.

    Returns:
        A list with one dict per non-empty page, holding the keys
        ``method``, ``url``, ``resp`` (response body), ``request`` and
        ``data`` (list of comment strings).
    """
    res = []
    for url in urls:
        parts = urlparse(url)
        req = "GET {0} {1}/1.1\r\nhost: {2}\r\n\r\n".format(
            parts.path, parts.scheme, parts.hostname)
        try:
            r = self.zap.send_request(req)
            html = str(r['responseBody'])
        except Exception:
            # Deliberate best-effort fallback to a direct request.
            r = requests.get(url)
            html = r.text
        if html:
            soup = BeautifulSoup(html, 'html.parser')
            comments = soup.findAll(text=lambda text: isinstance(text, Comment))
            # NOTE(review): collecting the comments and appending the
            # record to ``res`` was missing in the source and is
            # reconstructed here - confirm against the upstream project.
            comment_list = [str(comment) for comment in comments]
            # ``html`` is used for "resp": the ZAP result is indexed like a
            # mapping above, so it cannot be relied on to have ``.text``.
            c = {"method": "GET", "url": url, "resp": html, "request": "GET " + url, "data": comment_list}
            res.append(c)
    return res

示例10: duosuo

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def duosuo(self):
    """Return the Duoshuo comment-box embed HTML for this site.

    Returns:
        An empty string when ``self.duoshuo_shortname`` is falsy,
        otherwise the embed snippet with the short name substituted into
        ``duoshuoQuery``.
    """
    # NOTE(review): the source snippet was truncated. The empty-string
    # result and the head/body lookup plus closing lines of the script are
    # reconstructed from the usual Duoshuo embed code - confirm.
    if not self.duoshuo_shortname:
        return ""
    return """
            <!-- Duoshuo Comment BEGIN -->
            <div class="ds-thread"></div>
            <script type="text/javascript">
            var duoshuoQuery = {short_name:"%s"};
            (function() {
            var ds = document.createElement('script');
            ds.type = 'text/javascript';ds.async = true;
            ds.src = 'http://static.duoshuo.com/embed.js';
            ds.charset = 'UTF-8';
            (document.getElementsByTagName('head')[0]
            || document.getElementsByTagName('body')[0]).appendChild(ds);
            })();
            </script>
            <!-- Duoshuo Comment END -->
            """ % self.duoshuo_shortname

示例11: soup_strings

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def soup_strings(soup):
  """Yield the visible text of ``soup`` as whitespace-normalized strings.

  A paragraph-like tag that contains no other paragraph-like tag is
  emitted once as a single string and its descendants are skipped. Any
  other plain string (not a comment) is emitted on its own. Empty results
  are never yielded.

  Args:
    soup: Object exposing a document-order iterator as ``soup.descendants``.

  Yields:
    Non-empty strings with runs of whitespace collapsed to one space.
  """
  paragraph_tags = {"caption", "details", "h1", "h2", "h3", "h4", "h5",
                    "h6", "li", "p", "td", "div", "span"}

  skip_children = None
  for descendant in soup.descendants:
    # If we've treated a tag as a contiguous paragraph, don't re-emit the
    # children (see below).
    if skip_children is not None:
      try:
        in_skip = descendant in skip_children
      except RecursionError:
        # Possible for this check to hit a nasty infinite recursion because of
        # BeautifulSoup __eq__ checks.
        in_skip = True
      if in_skip:
        continue
      # First node past the skipped subtree: resume normal processing.
      skip_children = None

    # Treat some tags as contiguous paragraphs, regardless of other tags nested
    # inside (like <a> or <b>).
    if isinstance(descendant, bs4.Tag):
      if descendant.name in paragraph_tags:
        if descendant.find_all(paragraph_tags):
          # If there are nested paragraph tags, don't treat it as a single
          # contiguous tag.
          continue
        skip_children = list(descendant.descendants)
        text = " ".join(descendant.get_text(" ", strip=True).split())
        if text:
          yield text
        continue

    # Only plain strings are emitted below; comments and tags are not.
    if (isinstance(descendant, bs4.Comment) or
        not isinstance(descendant, bs4.NavigableString)):
      continue

    text = " ".join(descendant.strip().split())
    if text:
      yield text

示例12: _parse_file

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def _parse_file(test_name):
    """Parse the given HTML file.

    The file is looked up under ``data/hints/html`` next to this module.
    Its first HTML comment must contain a YAML dict with a ``target`` key
    and, optionally, a ``qtwebengine_todo`` key.

    Args:
        test_name: File name inside the ``data/hints/html`` directory.

    Returns:
        A ``ParsedFile`` with ``target`` and ``qtwebengine_todo`` (None
        when the key is absent).

    Raises:
        InvalidFile: If there is no comment, the YAML is not a dict, it
            contains unexpected keys, or ``target`` is missing.
    """
    file_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                             'data', 'hints', 'html', test_name)
    with open(file_path, 'r', encoding='utf-8') as html:
        soup = bs4.BeautifulSoup(html, 'html.parser')

    comment = soup.find(text=lambda text: isinstance(text, bs4.Comment))

    # Check before str(): str(None) is the string 'None', never None.
    if comment is None:
        raise InvalidFile(test_name, "no comment found")

    data = utils.yaml_load(str(comment))

    if not isinstance(data, dict):
        raise InvalidFile(test_name, "expected yaml dict but got {}".format(
            type(data).__name__))

    allowed_keys = {'target', 'qtwebengine_todo'}
    if not set(data.keys()).issubset(allowed_keys):
        # sorted() keeps the message stable between runs.
        raise InvalidFile(test_name, "expected keys {} but found {}".format(
            ', '.join(sorted(allowed_keys)),
            ', '.join(sorted(set(data.keys())))))

    if 'target' not in data:
        raise InvalidFile(test_name, "'target' key not found")

    qtwebengine_todo = data.get('qtwebengine_todo', None)

    return ParsedFile(target=data['target'], qtwebengine_todo=qtwebengine_todo)

示例13: text

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def text(self, target=None, ignore_pureascii_words=False):
    """
    Get all text in HTML, skip script and comment
    :param target: the BeautifulSoup object, default self.b
    :param ignore_pureascii_words: if set True, only return words that contain non-ASCII characters such as Chinese (may be useful for English version website)
    :return: list of str
    """
    if target is None:
        target = self.b
    from bs4 import Comment
    from bs4.element import NavigableString, Doctype
    result = []
    for descendant in target.descendants:
        # Keep plain strings only; drop doctype, comments, script/style
        # content and text whose parent style contains "none" or a zero
        # font size.
        if not isinstance(descendant, NavigableString) \
                or isinstance(descendant, Doctype) \
                or descendant.parent.name in ["script", "style"] \
                or isinstance(descendant, Comment) \
                or "none" in descendant.parent.get("style", "") \
                or "font-size:0px" in descendant.parent.get("style", ""):
            continue
        data = descendant.strip()
        if len(data) > 0:
            if not ignore_pureascii_words or any(ord(i) > 127 for i in data):
                if PY2:
                    # NOTE(review): this branch was empty in the source;
                    # UTF-8 encoding is an assumption - confirm.
                    result.append(data.encode("utf-8"))
                else:
                    result.append(data)
    return result

示例14: sync_file

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def sync_file(path_prefix, course_id):
    """Download the files of a course that are not yet in ``path_prefix``.

    The course download page is fetched with ``get_page``. Each HTML
    comment on it is parsed for a link whose ``onclick`` attribute carries
    the file name; the download URI is read from the markup following the
    comment. Files that already exist locally are skipped.

    Args:
        path_prefix: Target directory; created when it does not exist.
        course_id: Course identifier appended to the download page URL.
    """
    if not os.path.exists(path_prefix):
        os.makedirs(path_prefix)
    soup = bs(get_page('MultiLanguage/lesson/student/download.jsp?course_id=' + str(course_id)), 'html.parser')
    for comment in soup(text=lambda text: isinstance(text, bs4.Comment)):
        link = bs(comment, 'html.parser').a
        name = link.text
        uri = comment.next.next.a.get('href')
        filename = link.get('onclick').split('getfilelink=')[-1].split('&id')[0]
        file_path = os.path.join(path_prefix, filename)
        if not os.path.exists(file_path):
            print('Download ', name)
            # Context manager so the handle is closed even on a failed read.
            with open(file_path, 'wb') as out:
                out.write(open_page(uri).read())

示例15: decomposeAdditional

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def decomposeAdditional(self, soup):
    """Remove embedded, scripting and styling content from ``soup``.

    Drops ``text/css`` style tags, a fixed list of tag types (plus
    ``svg`` when ``self.decompose_svg`` is set) and every HTML comment.

    Args:
        soup: Parsed document; it is modified in place.

    Returns:
        The same ``soup`` object.
    """
    # NOTE(review): the loop bodies and the entries of ``decompose`` were
    # missing in the source. They are reconstructed from the surviving
    # comments (iframes, scripts, link, meta, stylesheets) - confirm.

    # Clean out any local stylesheets
    for instance in soup.find_all('style', attrs={"type" : "text/css"}):
        instance.decompose()

    decompose = [
        # Clear out all the iframes
        'iframe',
        # And all remote scripts
        'script',
        # Link tags
        'link',
        # Meta tags
        'meta',

        # Stylesheets (needs further checking)
        'style',
    ]

    if self.decompose_svg:
        decompose.append('svg')

    for instance in soup.find_all(decompose):

        # If it's a style tag, make sure the type is text/css before removing
        if instance.name == 'style':
            if instance.get("type", None) == "text/css":
                instance.decompose()
        else:
            instance.decompose()

    # Comments
    for item in soup.findAll(text=lambda text: isinstance(text, bs4.Comment)):
        item.extract()

    return soup
