当前位置: 首页>>代码示例>>Python>>正文


Python bs4.Comment方法代码示例

本文整理汇总了Python中bs4.Comment方法的典型用法代码示例。如果您正苦于以下问题:Python bs4.Comment方法的具体用法?Python bs4.Comment怎么用?Python bs4.Comment使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在bs4的用法示例。


在下文中一共展示了bs4.Comment方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: clean_contents

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def clean_contents(self, div):
    """Sanitize a soup element in place: strip HTML comments, collapse
    consecutive <br> tags, remove bad/empty/blacklisted tags, and clear
    all attributes. Returns the same element (or the falsy input as-is).
    """
    if not div:
        return div
    div.attrs = {}
    # BUG FIX: findAll(True) yields only Tag objects, so the original
    # `isinstance(tag, Comment)` branch could never match. Comments are
    # NavigableString subclasses and must be located via string=...
    for comment in div.findAll(string=lambda s: isinstance(s, Comment)):
        comment.extract()
    for tag in div.findAll(True):
        if tag.name == 'br':
            # Collapse runs of <br><br> into a single <br>
            next_tag = getattr(tag, 'next_sibling')
            if next_tag and getattr(next_tag, 'name') == 'br':
                tag.extract()
        elif tag.name in self.bad_tags:
            tag.extract()   # Remove bad tags
        elif not tag.text.strip():
            tag.extract()   # Remove empty tags
        elif self.is_blacklisted(tag.text):
            tag.extract()   # Remove blacklisted contents
        elif hasattr(tag, 'attrs'):
            tag.attrs = {}    # Remove attributes
    return div
开发者ID:dipu-bd,项目名称:lightnovel-crawler,代码行数:27,代码来源:crawler.py

示例2: wordpressFuncXml

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def wordpressFuncXml(data):
	"""Detect WordPress from an HTTP response body and extract its version.

	Scans the parsed HTML for comment nodes (WordPress emits its generator
	version inside one) and compares against WORDPRESS_LAST_CMS_VERSION.

	:param data: a requests-like response object with a ``.text`` attribute
	:return: tuple ``(cms, version_match)`` — whether WordPress was detected,
	         and the extracted version string (or None)
	"""
	cms = False
	version_match = None
	try:
		soup = BeautifulSoup(data.text, 'lxml')
		comments = soup.findAll(text=lambda text: isinstance(text, Comment))

		if len(comments) > 0:
			cms = True
			# The first comment usually embeds the generator version, e.g. "4.9.8"
			version_match = re.findall(r'(?:(\d+\.[.\d]*\d+))', comments[0])
			if len(version_match) > 0:
				version_match = version_match[0]
			# FIX: parenthesized print works on both Python 2 and Python 3
			# (the original bare `print "..."` statements are Python-2-only).
			if version_match != WORDPRESS_LAST_CMS_VERSION:
				print("The version wordpress is outdated or not identified")
			else:
				print("The version wordpress is updated")

	except Exception as e:
		print(e)
		version_match = None

	finally:
		# Deliberate: always return the (cms, version) pair, even after errors.
		return cms, version_match
开发者ID:n4xh4ck5,项目名称:CMSsc4n,代码行数:27,代码来源:wordpress.py

示例3: normalize_text_sections

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def normalize_text_sections(div):
    """Flatten the direct children of *div* into one normalized text line.

    Comments contribute an empty fragment (the separator space is kept, as
    before); plain strings and tags contribute their stripped text. CR is
    dropped and newlines become ", ".
    """
    paragraph = ''
    for content in div.contents:
        # Exact-type checks on purpose: Comment subclasses NavigableString,
        # so the two cases are mutually exclusive here.
        if type(content) == Comment:
            text = ''
        elif type(content) == NavigableString:
            text = content
        else:
            # FIX: the original had separate 'li' and else branches doing the
            # identical `content.text` — merged the dead duplicate branch.
            text = content.text
        paragraph += text.strip() + ' '
    paragraph = paragraph.strip()
    paragraph = paragraph.replace('\r', '')
    paragraph = paragraph.replace('\n', ', ')
    return paragraph.strip()
开发者ID:cobalt-uoft,项目名称:uoft-scrapers,代码行数:21,代码来源:__init__.py

示例4: normalize_text_sections

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def normalize_text_sections(div):
    """Collapse the direct children of *div* into a single cleaned-up line.

    Strings and tags contribute their stripped text, comments contribute an
    empty fragment; CR is removed, newlines become ", ", and one pass of
    double-space collapsing is applied.
    """
    fragments = []
    for node in div.contents:
        if type(node) == Comment:
            fragment = ''
        elif type(node) == NavigableString:
            fragment = node
        elif node.name == 'li':
            fragment = node.text
        else:
            fragment = node.text
        fragments.append(fragment.strip())
    merged = ' '.join(fragments).strip()
    merged = merged.replace('\r', '')
    merged = merged.replace('\n', ', ')
    merged = merged.replace('  ', ' ')
    return merged.strip()
开发者ID:cobalt-uoft,项目名称:uoft-scrapers,代码行数:22,代码来源:__init__.py

示例5: standings

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def standings(season=None):
    """Return a list of DataFrames with MLB standings for *season*.

    Defaults to the current year when *season* is None. Raises ValueError
    for seasons before 1871 (no data on baseball-reference).
    """
    # Default to the most recent season when none was requested.
    if season is None:
        season = int(datetime.datetime.today().strftime("%Y"))
    if season < 1871:
        raise ValueError("This query currently only returns standings until the 1871 season. Try looking at years from 1871 to present.")
    # Retrieve and parse the baseball-reference page.
    soup = get_soup(season)
    if season >= 1969:
        tables = get_tables(soup, season)
    else:
        comment_nodes = soup.find_all(string=lambda text: isinstance(text, Comment))
        # Seasons whose table placement breaks the site's usual pattern.
        irregular_seasons = [1884, 1885, 1886, 1887, 1888, 1889, 1890, 1892, 1903]
        if season > 1904 or season in irregular_seasons:
            parsed = BeautifulSoup(comment_nodes[16], "lxml")
        else:
            parsed = BeautifulSoup(comment_nodes[15], "lxml")
        tables = get_tables(parsed, season)
    frames = [pd.DataFrame(table) for table in tables]
    # Promote the first row of each table to be the header row.
    for idx, frame in enumerate(frames):
        relabeled = frame.rename(columns=frame.iloc[0])
        frames[idx] = relabeled.reindex(relabeled.index.drop(0))
    return frames
开发者ID:jldbc,项目名称:pybaseball,代码行数:24,代码来源:standings.py

示例6: get_overrides

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def get_overrides(self, soup):
        """
        Look for overrides in the text to make exceptions for specific style
        rules. Returns a set of rule strings to ignore for this block.
        """

        overrides = set()
        for node in soup.find_all(string=lambda s: isinstance(s, Comment)):
            match = re.match(OVERRIDE_COMMENT_REGEX, node)
            if not match:
                continue
            # Comma-separated rule names; whitespace around each is ignored.
            found = {entry.strip() for entry in match.group(1).split(",")}
            logger.info("Overrides found: %s" % found)
            overrides |= found

        return overrides
开发者ID:ripple,项目名称:dactyl,代码行数:19,代码来源:dactyl_style_checker.py

示例7: _fetch

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def _fetch(url: str) -> BeautifulSoup:
    """Download *url* and return a stripped-down BeautifulSoup tree:
    scripts/images/frames removed, identifying attributes cleared,
    and HTML comments deleted."""
    request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(request) as response:
        raw = response.read()

    page = BeautifulSoup(raw, "html.parser")

    # Drop whole subtrees we never want to keep.
    for unwanted in ["script", "img", "input", "button", "style", "font", "iframe", "object", "embed"]:
        for element in page.find_all(unwanted):
            element.decompose()

    # Strip identifying / styling attributes from every remaining tag.
    for element in page.find_all(recursive=True):
        for attr in ["class", "id", "name", "style", "role", "lang", "dir", "href", "src"]:
            del element[attr]
        for attr in list(element.attrs):
            if attr.startswith("data-"):
                del element.attrs[attr]

    # Finally remove HTML comments.
    for comment in page.find_all(text=lambda s: isinstance(s, Comment)):
        comment.extract()

    return page
开发者ID:fonol,项目名称:anki-search-inside-add-card,代码行数:26,代码来源:web_import.py

示例8: _fetchWebpage

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def _fetchWebpage(self, url):
        """Fetch *url* and return its soup with the configured bad tags
        and all HTML comments removed."""
        if isMac:
            # macOS path: skip certificate verification (original workaround).
            html = urlopen(url, context=_create_unverified_context()).read()
        else:
            html = get(url, headers={'User-Agent': self.settings['userAgent']}).content

        webpage = BeautifulSoup(html, 'html.parser')

        # Remove every tag type the user has flagged as unwanted.
        for tagName in self.settings['badTags']:
            for element in webpage.find_all(tagName):
                element.decompose()

        # Strip HTML comments.
        for comment in webpage.find_all(text=lambda s: isinstance(s, Comment)):
            comment.extract()

        return webpage
开发者ID:luoliyan,项目名称:incremental-reading,代码行数:20,代码来源:importer.py

示例9: find_comments_in_html_by_urls

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def find_comments_in_html_by_urls(self, urls):
		"""Collect HTML comments from the responses of *urls*.

		For each URL, tries a raw request through the ZAP proxy first and
		falls back to a plain requests.get on any failure. Returns a list
		of result dicts with method/url/resp/request/data keys.
		"""
		res = []
		for url in urls:
			path = urlparse(url).path
			host = urlparse(url).hostname
			scheme = urlparse(url).scheme
			# NOTE(review): this builds e.g. "GET /path https/1.1" — the URL
			# scheme is substituted where "HTTP" belongs in the request line;
			# confirm zap.send_request really expects this format.
			req = "GET {0} {1}/1.1\r\nhost: {2}\r\n\r\n".format(path, scheme, host)
			try:
				r = self.zap.send_request(req)
				html = str(r['responseBody'])
			except Exception as e:
				# Fallback: fetch directly when the ZAP proxy call fails.
				r = requests.get(url)
				html = r.text
			if (html):
				soup = BeautifulSoup(html,'html.parser')
				comments = soup.findAll(text=lambda text:isinstance(text, Comment))
				comment_list = []
				for comment in comments:
					str1 = str(comment)
					comment_list.append(str1)
					# NOTE(review): appended once per comment, so each result
					# repeats the URL with a progressively longer "data" list —
					# looks like it was meant to run after this loop; confirm.
					# Also: when the ZAP branch succeeded, `r` is the ZAP
					# response object and `r.text` here may not exist — verify.
					c = { "method":"GET", "url":url, "resp":r.text, "request":"GET "+url, "data":comment_list }
					res.append(c)
		return res
开发者ID:secdec,项目名称:adapt,代码行数:25,代码来源:owasp_suite.py

示例10: duosuo

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def duosuo(self):
    """Render the Duoshuo comment-widget markup.

    Returns a blank placeholder when no shortname is configured, otherwise
    the embed snippet with the shortname substituted in.
    """
    shortname = self.duoshuo_shortname
    if not shortname:
        return """
            """
    template = """
            <!-- Duoshuo Comment BEGIN -->
            <div class="ds-thread"></div>
            <script type="text/javascript">
            var duoshuoQuery = {short_name:"%s"};
            (function() {
            var ds = document.createElement('script');
            ds.type = 'text/javascript';ds.async = true;
            ds.src = 'http://static.duoshuo.com/embed.js';
            ds.charset = 'UTF-8';
            (document.getElementsByTagName('head')[0]
            || document.getElementsByTagName('body')[0]).appendChild(ds);
            })();
            </script>
            <!-- Duoshuo Comment END -->
            """
    return template % shortname
开发者ID:LeslieZhu,项目名称:OrgNote,代码行数:23,代码来源:parser.py

示例11: soup_strings

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def soup_strings(soup):
  """Yield whitespace-normalized text fragments from a BeautifulSoup tree.

  Tags whose name is in ``paragraph_tags`` (and that contain no nested
  paragraph tags) are emitted as one contiguous string, children included;
  other plain strings are yielded individually. Comments are skipped.
  """
  paragraph_tags = set(["caption", "details", "h1", "h2", "h3", "h4", "h5",
                        "h6", "li", "p", "td", "div", "span"])

  # Descendants of a tag already emitted as a paragraph; used to avoid
  # yielding the same text twice.
  skip_children = None
  for descendant in soup.descendants:
    # If we've treated a tag as a contiguous paragraph, don't re-emit the
    # children (see below).
    if skip_children is not None:
      try:
        in_skip = descendant in skip_children
      except RecursionError:
        # Possible for this check to hit a nasty infinite recursion because of
        # BeautifulSoup __eq__ checks.
        in_skip = True
      if in_skip:
        continue
      else:
        skip_children = None

    # Treat some tags as contiguous paragraphs, regardless of other tags nested
    # inside (like <a> or <b>).
    if isinstance(descendant, bs4.Tag):
      if descendant.name in paragraph_tags:
        if descendant.find_all(paragraph_tags):
          # If there are nested paragraph tags, don't treat it as a single
          # contiguous tag.
          continue
        skip_children = list(descendant.descendants)
        text = " ".join(descendant.get_text(" ", strip=True).split())
        if text:
          yield text
        continue

    # Skip comments and anything that isn't a plain string (e.g. Tags not
    # handled by the paragraph branch above).
    if (isinstance(descendant, bs4.Comment) or
        not isinstance(descendant, bs4.NavigableString)):
      continue

    # Collapse internal whitespace runs to single spaces before yielding.
    text = " ".join(descendant.strip().split())
    if text:
      yield text
开发者ID:akzaidi,项目名称:fine-lm,代码行数:43,代码来源:get_references_web_single_group.py

示例12: _parse_file

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def _parse_file(test_name):
    """Parse the given HTML hint-test file and return a ParsedFile.

    The file must contain an HTML comment holding a YAML dict with a
    'target' key and an optional 'qtwebengine_todo' key.

    Raises InvalidFile if the comment is missing, isn't a YAML dict, or
    contains missing/unexpected keys.
    """
    file_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                             'data', 'hints', 'html', test_name)
    with open(file_path, 'r', encoding='utf-8') as html:
        soup = bs4.BeautifulSoup(html, 'html.parser')

    # BUG FIX: the original wrapped soup.find(...) in str() immediately, so a
    # missing comment (None) became the string "None" and the `is None` check
    # could never fire. Check for None before stringifying.
    node = soup.find(text=lambda text: isinstance(text, bs4.Comment))
    if node is None:
        raise InvalidFile(test_name, "no comment found")
    comment = str(node)

    data = utils.yaml_load(comment)

    if not isinstance(data, dict):
        raise InvalidFile(test_name, "expected yaml dict but got {}".format(
            type(data).__name__))

    allowed_keys = {'target', 'qtwebengine_todo'}
    if not set(data.keys()).issubset(allowed_keys):
        raise InvalidFile(test_name, "expected keys {} but found {}".format(
            ', '.join(allowed_keys),
            ', '.join(set(data.keys()))))

    if 'target' not in data:
        raise InvalidFile(test_name, "'target' key not found")

    qtwebengine_todo = data.get('qtwebengine_todo', None)

    return ParsedFile(target=data['target'], qtwebengine_todo=qtwebengine_todo)
开发者ID:qutebrowser,项目名称:qutebrowser,代码行数:32,代码来源:test_hints_html.py

示例13: text

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def text(self, target=None, ignore_pureascii_words=False):
        """
        Get all visible text in HTML, skipping scripts, styles and comments.
        :param target: the BeautifulSoup object, default self.b
        :param ignore_pureascii_words: if True, only return strings containing
            non-ASCII characters (useful for Chinese-content sites)
        :return: list of str
        """
        if target is None:
            target = self.b
        from bs4 import Comment
        from bs4.element import NavigableString, Doctype
        collected = []
        for node in target.descendants:
            # Keep only plain visible strings: skip non-strings, doctypes,
            # script/style contents, comments, and hidden/zero-size text.
            hidden = (not isinstance(node, NavigableString)
                      or isinstance(node, Doctype)
                      or node.parent.name in ["script", "style"]
                      or isinstance(node, Comment)
                      or "none" in node.parent.get("style", "")
                      or "font-size:0px" in node.parent.get("style", ""))
            if hidden:
                continue
            stripped = node.strip()
            if not stripped:
                continue
            if ignore_pureascii_words and not any(ord(ch) > 127 for ch in stripped):
                continue
            collected.append(stripped.encode() if PY2 else stripped)
        return collected
开发者ID:zjuchenyuan,项目名称:cc98,代码行数:30,代码来源:EasyLogin.py

示例14: sync_file

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def sync_file(path_prefix, course_id):
	"""Download every not-yet-present course file into *path_prefix*.

	Parses the course download page, whose file links are embedded inside
	HTML comments, and saves each file under its original filename.
	"""
	if not os.path.exists(path_prefix):
		os.makedirs(path_prefix)
	soup = bs(get_page('MultiLanguage/lesson/student/download.jsp?course_id=' + str(course_id)), 'html.parser')
	for comment in soup(text=lambda text: isinstance(text, bs4.Comment)):
		link = bs(comment, 'html.parser').a
		name = link.text
		uri = comment.next.next.a.get('href')
		# The real filename is buried in the link's onclick handler.
		filename = link.get('onclick').split('getfilelink=')[-1].split('&id')[0]
		file_path = os.path.join(path_prefix, filename)
		if not os.path.exists(file_path):
			print('Download ', name)
			# BUG FIX: the original `open(...).write(...)` never closed the
			# file handle; a context manager guarantees it is released even
			# if the transfer fails mid-write.
			with open(file_path, 'wb') as out:
				out.write(open_page(uri).read())
开发者ID:Trinkle23897,项目名称:learn2018-autodown,代码行数:15,代码来源:learn-old.py

示例15: decomposeAdditional

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def decomposeAdditional(self, soup):
	"""Strip non-content markup from *soup*: CSS stylesheets, iframes,
	scripts, link/meta tags, optionally SVGs, and all HTML comments.
	Returns the mutated soup.
	"""

	# Clean out any local stylesheets explicitly typed as CSS.
	for instance in soup.find_all('style', attrs={"type": "text/css"}):
		instance.decompose()

	# FIX: 'style' appeared twice in this list in the original; find_all()
	# matches by membership so the duplicate entry was pure redundancy.
	decompose = [
		# Clear out all the iframes
		'iframe',
		# Stylesheets, even if not explicitly tagged as css
		'style',
		# And all remote scripts
		"script",
		# Link tags
		"link",
		# Meta tags
		"meta",
	]

	if self.decompose_svg:
		decompose.append("svg")

	for instance in soup.find_all(decompose):
		# If it's a style tag, make sure the type is text/css before removing.
		# NOTE(review): this guard keeps untyped <style> tags, which seems to
		# contradict the "even if not explicitly tagged as css" intent above —
		# confirm whether all <style> tags should be dropped unconditionally.
		if instance.name == 'style':
			if instance.get("type", None) == "text/css":
				instance.decompose()
		else:
			instance.decompose()

	# Strip HTML comments.
	for item in soup.findAll(text=lambda text: isinstance(text, bs4.Comment)):
		item.extract()

	return soup
开发者ID:fake-name,项目名称:ReadableWebProxy,代码行数:42,代码来源:HtmlProcessor.py


注:本文中的bs4.Comment方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。