当前位置: 首页>>代码示例>>Python>>正文


Python bs4.Comment方法代码示例

本文整理汇总了Python中bs4.Comment方法的典型用法代码示例。如果您正苦于以下问题:Python bs4.Comment方法的具体用法?Python bs4.Comment怎么用?Python bs4.Comment使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在bs4的用法示例。


在下文中一共展示了bs4.Comment方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: clean_contents

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def clean_contents(self, div):
    """Sanitize a soup element in place: strip HTML comments, collapse
    consecutive <br> tags, remove bad/empty/blacklisted tags, and clear
    all attributes. Returns the same element (or the falsy input as-is).
    """
    if not div:
        return div
    div.attrs = {}
    # BUG FIX: findAll(True) yields only Tag objects, so the original
    # `isinstance(tag, Comment)` branch could never match. Comments are
    # NavigableString subclasses and must be located via string=...
    for comment in div.findAll(string=lambda s: isinstance(s, Comment)):
        comment.extract()
    for tag in div.findAll(True):
        if tag.name == 'br':
            # Collapse runs of <br><br> into a single <br>
            next_tag = getattr(tag, 'next_sibling')
            if next_tag and getattr(next_tag, 'name') == 'br':
                tag.extract()
        elif tag.name in self.bad_tags:
            tag.extract()   # Remove bad tags
        elif not tag.text.strip():
            tag.extract()   # Remove empty tags
        elif self.is_blacklisted(tag.text):
            tag.extract()   # Remove blacklisted contents
        elif hasattr(tag, 'attrs'):
            tag.attrs = {}    # Remove attributes
    return div
开发者ID:dipu-bd,项目名称:lightnovel-crawler,代码行数:27,代码来源:crawler.py

示例2: wordpressFuncXml

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def wordpressFuncXml(data):
	"""Detect WordPress from an HTTP response body and extract its version.

	Scans the parsed HTML for comment nodes (WordPress emits its generator
	version inside one) and compares against WORDPRESS_LAST_CMS_VERSION.

	:param data: a requests-like response object with a ``.text`` attribute
	:return: tuple ``(cms, version_match)`` — whether WordPress was detected,
	         and the extracted version string (or None)
	"""
	cms = False
	version_match = None
	try:
		soup = BeautifulSoup(data.text, 'lxml')
		comments = soup.findAll(text=lambda text: isinstance(text, Comment))

		if len(comments) > 0:
			cms = True
			# The first comment usually embeds the generator version, e.g. "4.9.8"
			version_match = re.findall(r'(?:(\d+\.[.\d]*\d+))', comments[0])
			if len(version_match) > 0:
				version_match = version_match[0]
			# FIX: parenthesized print works on both Python 2 and Python 3
			# (the original bare `print "..."` statements are Python-2-only).
			if version_match != WORDPRESS_LAST_CMS_VERSION:
				print("The version wordpress is outdated or not identified")
			else:
				print("The version wordpress is updated")

	except Exception as e:
		print(e)
		version_match = None

	finally:
		# Deliberate: always return the (cms, version) pair, even after errors.
		return cms, version_match
开发者ID:n4xh4ck5,项目名称:CMSsc4n,代码行数:27,代码来源:wordpress.py

示例3: normalize_text_sections

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def normalize_text_sections(div):
    """Flatten the direct children of *div* into one normalized text line.

    Comments contribute an empty fragment (the separator space is kept, as
    before); plain strings and tags contribute their stripped text. CR is
    dropped and newlines become ", ".
    """
    paragraph = ''
    for content in div.contents:
        # Exact-type checks on purpose: Comment subclasses NavigableString,
        # so the two cases are mutually exclusive here.
        if type(content) == Comment:
            text = ''
        elif type(content) == NavigableString:
            text = content
        else:
            # FIX: the original had separate 'li' and else branches doing the
            # identical `content.text` — merged the dead duplicate branch.
            text = content.text
        paragraph += text.strip() + ' '
    paragraph = paragraph.strip()
    paragraph = paragraph.replace('\r', '')
    paragraph = paragraph.replace('\n', ', ')
    return paragraph.strip()
开发者ID:cobalt-uoft,项目名称:uoft-scrapers,代码行数:21,代码来源:__init__.py

示例4: normalize_text_sections

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def normalize_text_sections(div):
    """Collapse the direct children of *div* into a single cleaned-up line.

    Strings and tags contribute their stripped text, comments contribute an
    empty fragment; CR is removed, newlines become ", ", and one pass of
    double-space collapsing is applied.
    """
    fragments = []
    for node in div.contents:
        if type(node) == Comment:
            fragment = ''
        elif type(node) == NavigableString:
            fragment = node
        elif node.name == 'li':
            fragment = node.text
        else:
            fragment = node.text
        fragments.append(fragment.strip())
    merged = ' '.join(fragments).strip()
    merged = merged.replace('\r', '')
    merged = merged.replace('\n', ', ')
    merged = merged.replace('  ', ' ')
    return merged.strip()
开发者ID:cobalt-uoft,项目名称:uoft-scrapers,代码行数:22,代码来源:__init__.py

示例5: standings

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def standings(season=None):
    """Return a list of DataFrames with MLB standings for *season*.

    Defaults to the current year when *season* is None. Raises ValueError
    for seasons before 1871 (no data on baseball-reference).
    """
    # Default to the most recent season when none was requested.
    if season is None:
        season = int(datetime.datetime.today().strftime("%Y"))
    if season < 1871:
        raise ValueError("This query currently only returns standings until the 1871 season. Try looking at years from 1871 to present.")
    # Retrieve and parse the baseball-reference page.
    soup = get_soup(season)
    if season >= 1969:
        tables = get_tables(soup, season)
    else:
        comment_nodes = soup.find_all(string=lambda text: isinstance(text, Comment))
        # Seasons whose table placement breaks the site's usual pattern.
        irregular_seasons = [1884, 1885, 1886, 1887, 1888, 1889, 1890, 1892, 1903]
        if season > 1904 or season in irregular_seasons:
            parsed = BeautifulSoup(comment_nodes[16], "lxml")
        else:
            parsed = BeautifulSoup(comment_nodes[15], "lxml")
        tables = get_tables(parsed, season)
    frames = [pd.DataFrame(table) for table in tables]
    # Promote the first row of each table to be the header row.
    for idx, frame in enumerate(frames):
        relabeled = frame.rename(columns=frame.iloc[0])
        frames[idx] = relabeled.reindex(relabeled.index.drop(0))
    return frames
开发者ID:jldbc,项目名称:pybaseball,代码行数:24,代码来源:standings.py

示例6: get_overrides

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def get_overrides(self, soup):
        """
        Look for overrides in the text to make exceptions for specific style
        rules. Returns a set of rule strings to ignore for this block.
        """

        overrides = set()
        for node in soup.find_all(string=lambda s: isinstance(s, Comment)):
            match = re.match(OVERRIDE_COMMENT_REGEX, node)
            if not match:
                continue
            # Comma-separated rule names; whitespace around each is ignored.
            found = {entry.strip() for entry in match.group(1).split(",")}
            logger.info("Overrides found: %s" % found)
            overrides |= found

        return overrides
开发者ID:ripple,项目名称:dactyl,代码行数:19,代码来源:dactyl_style_checker.py

示例7: _fetch

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def _fetch(url: str) -> BeautifulSoup:
    """Download *url* and return a stripped-down BeautifulSoup tree:
    scripts/images/frames removed, identifying attributes cleared,
    and HTML comments deleted."""
    request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(request) as response:
        raw = response.read()

    page = BeautifulSoup(raw, "html.parser")

    # Drop whole subtrees we never want to keep.
    for unwanted in ["script", "img", "input", "button", "style", "font", "iframe", "object", "embed"]:
        for element in page.find_all(unwanted):
            element.decompose()

    # Strip identifying / styling attributes from every remaining tag.
    for element in page.find_all(recursive=True):
        for attr in ["class", "id", "name", "style", "role", "lang", "dir", "href", "src"]:
            del element[attr]
        for attr in list(element.attrs):
            if attr.startswith("data-"):
                del element.attrs[attr]

    # Finally remove HTML comments.
    for comment in page.find_all(text=lambda s: isinstance(s, Comment)):
        comment.extract()

    return page
开发者ID:fonol,项目名称:anki-search-inside-add-card,代码行数:26,代码来源:web_import.py

示例8: _fetchWebpage

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def _fetchWebpage(self, url):
        """Fetch *url* and return its soup with the configured bad tags
        and all HTML comments removed."""
        if isMac:
            # macOS path: skip certificate verification (original workaround).
            html = urlopen(url, context=_create_unverified_context()).read()
        else:
            html = get(url, headers={'User-Agent': self.settings['userAgent']}).content

        webpage = BeautifulSoup(html, 'html.parser')

        # Remove every tag type the user has flagged as unwanted.
        for tagName in self.settings['badTags']:
            for element in webpage.find_all(tagName):
                element.decompose()

        # Strip HTML comments.
        for comment in webpage.find_all(text=lambda s: isinstance(s, Comment)):
            comment.extract()

        return webpage
开发者ID:luoliyan,项目名称:incremental-reading,代码行数:20,代码来源:importer.py

示例9: find_comments_in_html_by_urls

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def find_comments_in_html_by_urls(self, urls):
		"""Collect HTML comments from the responses of *urls*.

		For each URL, tries a raw request through the ZAP proxy first and
		falls back to a plain requests.get on any failure. Returns a list
		of result dicts with method/url/resp/request/data keys.
		"""
		res = []
		for url in urls:
			path = urlparse(url).path
			host = urlparse(url).hostname
			scheme = urlparse(url).scheme
			# NOTE(review): this builds e.g. "GET /path https/1.1" — the URL
			# scheme is substituted where "HTTP" belongs in the request line;
			# confirm zap.send_request really expects this format.
			req = "GET {0} {1}/1.1\r\nhost: {2}\r\n\r\n".format(path, scheme, host)
			try:
				r = self.zap.send_request(req)
				html = str(r['responseBody'])
			except Exception as e:
				# Fallback: fetch directly when the ZAP proxy call fails.
				r = requests.get(url)
				html = r.text
			if (html):
				soup = BeautifulSoup(html,'html.parser')
				comments = soup.findAll(text=lambda text:isinstance(text, Comment))
				comment_list = []
				for comment in comments:
					str1 = str(comment)
					comment_list.append(str1)
					# NOTE(review): appended once per comment, so each result
					# repeats the URL with a progressively longer "data" list —
					# looks like it was meant to run after this loop; confirm.
					# Also: when the ZAP branch succeeded, `r` is the ZAP
					# response object and `r.text` here may not exist — verify.
					c = { "method":"GET", "url":url, "resp":r.text, "request":"GET "+url, "data":comment_list }
					res.append(c)
		return res
开发者ID:secdec,项目名称:adapt,代码行数:25,代码来源:owasp_suite.py

示例10: duosuo

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def duosuo(self):
    """Render the Duoshuo comment-widget markup.

    Returns a blank placeholder when no shortname is configured, otherwise
    the embed snippet with the shortname substituted in.
    """
    shortname = self.duoshuo_shortname
    if not shortname:
        return """
            """
    template = """
            <!-- Duoshuo Comment BEGIN -->
            <div class="ds-thread"></div>
            <script type="text/javascript">
            var duoshuoQuery = {short_name:"%s"};
            (function() {
            var ds = document.createElement('script');
            ds.type = 'text/javascript';ds.async = true;
            ds.src = 'http://static.duoshuo.com/embed.js';
            ds.charset = 'UTF-8';
            (document.getElementsByTagName('head')[0]
            || document.getElementsByTagName('body')[0]).appendChild(ds);
            })();
            </script>
            <!-- Duoshuo Comment END -->
            """
    return template % shortname
开发者ID:LeslieZhu,项目名称:OrgNote,代码行数:23,代码来源:parser.py

示例11: soup_strings

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def soup_strings(soup):
  """Yield whitespace-normalized text fragments from a BeautifulSoup tree.

  Tags whose name is in ``paragraph_tags`` (and that contain no nested
  paragraph tags) are emitted as one contiguous string, children included;
  other plain strings are yielded individually. Comments are skipped.
  """
  paragraph_tags = set(["caption", "details", "h1", "h2", "h3", "h4", "h5",
                        "h6", "li", "p", "td", "div", "span"])

  # Descendants of a tag already emitted as a paragraph; used to avoid
  # yielding the same text twice.
  skip_children = None
  for descendant in soup.descendants:
    # If we've treated a tag as a contiguous paragraph, don't re-emit the
    # children (see below).
    if skip_children is not None:
      try:
        in_skip = descendant in skip_children
      except RecursionError:
        # Possible for this check to hit a nasty infinite recursion because of
        # BeautifulSoup __eq__ checks.
        in_skip = True
      if in_skip:
        continue
      else:
        skip_children = None

    # Treat some tags as contiguous paragraphs, regardless of other tags nested
    # inside (like <a> or <b>).
    if isinstance(descendant, bs4.Tag):
      if descendant.name in paragraph_tags:
        if descendant.find_all(paragraph_tags):
          # If there are nested paragraph tags, don't treat it as a single
          # contiguous tag.
          continue
        skip_children = list(descendant.descendants)
        text = " ".join(descendant.get_text(" ", strip=True).split())
        if text:
          yield text
        continue

    # Skip comments and anything that isn't a plain string (e.g. Tags not
    # handled by the paragraph branch above).
    if (isinstance(descendant, bs4.Comment) or
        not isinstance(descendant, bs4.NavigableString)):
      continue

    # Collapse internal whitespace runs to single spaces before yielding.
    text = " ".join(descendant.strip().split())
    if text:
      yield text
开发者ID:akzaidi,项目名称:fine-lm,代码行数:43,代码来源:get_references_web_single_group.py

示例12: _parse_file

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def _parse_file(test_name):
    """Parse the given HTML hint-test file and return a ParsedFile.

    The file must contain an HTML comment holding a YAML dict with a
    'target' key and an optional 'qtwebengine_todo' key.

    Raises InvalidFile if the comment is missing, isn't a YAML dict, or
    contains missing/unexpected keys.
    """
    file_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                             'data', 'hints', 'html', test_name)
    with open(file_path, 'r', encoding='utf-8') as html:
        soup = bs4.BeautifulSoup(html, 'html.parser')

    # BUG FIX: the original wrapped soup.find(...) in str() immediately, so a
    # missing comment (None) became the string "None" and the `is None` check
    # could never fire. Check for None before stringifying.
    node = soup.find(text=lambda text: isinstance(text, bs4.Comment))
    if node is None:
        raise InvalidFile(test_name, "no comment found")
    comment = str(node)

    data = utils.yaml_load(comment)

    if not isinstance(data, dict):
        raise InvalidFile(test_name, "expected yaml dict but got {}".format(
            type(data).__name__))

    allowed_keys = {'target', 'qtwebengine_todo'}
    if not set(data.keys()).issubset(allowed_keys):
        raise InvalidFile(test_name, "expected keys {} but found {}".format(
            ', '.join(allowed_keys),
            ', '.join(set(data.keys()))))

    if 'target' not in data:
        raise InvalidFile(test_name, "'target' key not found")

    qtwebengine_todo = data.get('qtwebengine_todo', None)

    return ParsedFile(target=data['target'], qtwebengine_todo=qtwebengine_todo)
开发者ID:qutebrowser,项目名称:qutebrowser,代码行数:32,代码来源:test_hints_html.py

示例13: text

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def text(self, target=None, ignore_pureascii_words=False):
        """
        Get all visible text in HTML, skipping scripts, styles and comments.
        :param target: the BeautifulSoup object, default self.b
        :param ignore_pureascii_words: if True, only return strings containing
            non-ASCII characters (useful for Chinese-content sites)
        :return: list of str
        """
        if target is None:
            target = self.b
        from bs4 import Comment
        from bs4.element import NavigableString, Doctype
        collected = []
        for node in target.descendants:
            # Keep only plain visible strings: skip non-strings, doctypes,
            # script/style contents, comments, and hidden/zero-size text.
            hidden = (not isinstance(node, NavigableString)
                      or isinstance(node, Doctype)
                      or node.parent.name in ["script", "style"]
                      or isinstance(node, Comment)
                      or "none" in node.parent.get("style", "")
                      or "font-size:0px" in node.parent.get("style", ""))
            if hidden:
                continue
            stripped = node.strip()
            if not stripped:
                continue
            if ignore_pureascii_words and not any(ord(ch) > 127 for ch in stripped):
                continue
            collected.append(stripped.encode() if PY2 else stripped)
        return collected
开发者ID:zjuchenyuan,项目名称:cc98,代码行数:30,代码来源:EasyLogin.py

示例14: sync_file

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def sync_file(path_prefix, course_id):
	"""Download every not-yet-present course file into *path_prefix*.

	Parses the course download page, whose file links are embedded inside
	HTML comments, and saves each file under its original filename.
	"""
	if not os.path.exists(path_prefix):
		os.makedirs(path_prefix)
	soup = bs(get_page('MultiLanguage/lesson/student/download.jsp?course_id=' + str(course_id)), 'html.parser')
	for comment in soup(text=lambda text: isinstance(text, bs4.Comment)):
		link = bs(comment, 'html.parser').a
		name = link.text
		uri = comment.next.next.a.get('href')
		# The real filename is buried in the link's onclick handler.
		filename = link.get('onclick').split('getfilelink=')[-1].split('&id')[0]
		file_path = os.path.join(path_prefix, filename)
		if not os.path.exists(file_path):
			print('Download ', name)
			# BUG FIX: the original `open(...).write(...)` never closed the
			# file handle; a context manager guarantees it is released even
			# if the transfer fails mid-write.
			with open(file_path, 'wb') as out:
				out.write(open_page(uri).read())
开发者ID:Trinkle23897,项目名称:learn2018-autodown,代码行数:15,代码来源:learn-old.py

示例15: decomposeAdditional

# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Comment [as 别名]
def decomposeAdditional(self, soup):
	"""Strip non-content markup from *soup*: CSS stylesheets, iframes,
	scripts, link/meta tags, optionally SVGs, and all HTML comments.
	Returns the mutated soup.
	"""

	# Clean out any local stylesheets explicitly typed as CSS.
	for instance in soup.find_all('style', attrs={"type": "text/css"}):
		instance.decompose()

	# FIX: 'style' appeared twice in this list in the original; find_all()
	# matches by membership so the duplicate entry was pure redundancy.
	decompose = [
		# Clear out all the iframes
		'iframe',
		# Stylesheets, even if not explicitly tagged as css
		'style',
		# And all remote scripts
		"script",
		# Link tags
		"link",
		# Meta tags
		"meta",
	]

	if self.decompose_svg:
		decompose.append("svg")

	for instance in soup.find_all(decompose):
		# If it's a style tag, make sure the type is text/css before removing.
		# NOTE(review): this guard keeps untyped <style> tags, which seems to
		# contradict the "even if not explicitly tagged as css" intent above —
		# confirm whether all <style> tags should be dropped unconditionally.
		if instance.name == 'style':
			if instance.get("type", None) == "text/css":
				instance.decompose()
		else:
			instance.decompose()

	# Strip HTML comments.
	for item in soup.findAll(text=lambda text: isinstance(text, bs4.Comment)):
		item.extract()

	return soup
开发者ID:fake-name,项目名称:ReadableWebProxy,代码行数:42,代码来源:HtmlProcessor.py


注:本文中的bs4.Comment方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。