This article collects typical usage examples of Python's bs4.Comment. If you have been wondering how to use bs4.Comment, what it is good for, or what real-world code that uses it looks like, the curated examples below may help. You can also explore the wider bs4 module to see where this class fits.
The 15 code examples of bs4.Comment below are listed roughly by popularity.
Example 1: clean_contents
# Required import: import bs4 [as alias]
# Or: from bs4 import Comment [as alias]
def clean_contents(self, div):
    if not div:
        return div
    # end if
    div.attrs = {}
    # findAll(True) only yields Tag objects, so comments need their own pass
    for comment in div.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()  # Remove comments
    for tag in div.findAll(True):
        if tag.name == 'br':
            next_tag = getattr(tag, 'next_sibling', None)
            if next_tag and getattr(next_tag, 'name', None) == 'br':
                tag.extract()  # Collapse consecutive <br> tags
            # end if
        elif tag.name in self.bad_tags:
            tag.extract()  # Remove bad tags
        elif not tag.text.strip():
            tag.extract()  # Remove empty tags
        elif self.is_blacklisted(tag.text):
            tag.extract()  # Remove blacklisted contents
        elif hasattr(tag, 'attrs'):
            tag.attrs = {}  # Remove attributes
        # end if
    # end for
    return div
# end def
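A minimal sketch of how clean_contents might be exercised, assuming a host class that supplies the bad_tags list and is_blacklisted helper the snippet expects (the class name and their contents here are made up):

from bs4 import BeautifulSoup, Comment

class Cleaner:
    bad_tags = ['script', 'style']  # assumed contents

    def is_blacklisted(self, text):
        return 'advertisement' in text.lower()  # assumed rule

    clean_contents = clean_contents  # reuse the function above as a method

html = '<div id="x"><!-- ad --><script>x()</script><p>Keep me</p><p> </p></div>'
div = BeautifulSoup(html, 'html.parser').div
print(Cleaner().clean_contents(div))  # -> <div><p>Keep me</p></div>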
Example 2: wordpressFuncXml
# Required import: import bs4 [as alias]
# Or: from bs4 import Comment [as alias]
def wordpressFuncXml(data):
    cms = False
    version_match = None
    try:
        soup = BeautifulSoup(data.text, 'lxml')
        comments = soup.findAll(text=lambda text: isinstance(text, Comment))
        if len(comments) > 0:
            cms = True
            version_match = re.findall(r'(?:(\d+\.[.\d]*\d+))', comments[0])
            if len(version_match) > 0:
                version_match = version_match[0]
            if version_match != WORDPRESS_LAST_CMS_VERSION:
                print("The WordPress version is outdated or could not be identified")
            else:
                print("The WordPress version is up to date")
    except Exception as e:
        print(e)
        version_match = None
    finally:
        return cms, version_match
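A hypothetical invocation. WORDPRESS_LAST_CMS_VERSION is a module-level constant outside the snippet, so a placeholder value is assumed here:

import re
import requests
from bs4 import BeautifulSoup, Comment

WORDPRESS_LAST_CMS_VERSION = "6.5"  # placeholder; the real constant lives elsewhere

resp = requests.get("https://example.com/feed/")  # any page carrying a version comment
cms, version = wordpressFuncXml(resp)
print(cms, version)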
Example 3: normalize_text_sections
# Required import: import bs4 [as alias]
# Or: from bs4 import Comment [as alias]
def normalize_text_sections(div):
    paragraph = ''
    for content in div.contents:
        text = ''
        if type(content) == NavigableString:
            text = content
        elif type(content) == Comment:
            pass  # skip comments entirely
        else:
            text = content.text
        paragraph += text.strip() + ' '
    paragraph = paragraph.strip()
    paragraph = paragraph.replace('\r', '')
    paragraph = paragraph.replace('\n', ', ')
    return paragraph
Example 4: normalize_text_sections
# Required import: import bs4 [as alias]
# Or: from bs4 import Comment [as alias]
def normalize_text_sections(div):
    paragraph = ''
    for content in div.contents:
        text = ''
        if type(content) == NavigableString:
            text = content
        elif type(content) == Comment:
            pass  # skip comments entirely
        else:
            text = content.text
        paragraph += text.strip() + ' '
    paragraph = paragraph.strip()
    paragraph = paragraph.replace('\r', '')
    paragraph = paragraph.replace('\n', ', ')
    paragraph = paragraph.replace('  ', ' ')  # collapse double spaces
    return paragraph
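Both variants can be checked against a small fragment; the second also collapses the double space left behind where the comment was skipped:

from bs4 import BeautifulSoup, Comment, NavigableString

div = BeautifulSoup(
    '<div>Intro <!-- hidden --><li>point one</li><p>line\ntwo</p></div>',
    'html.parser').div
print(normalize_text_sections(div))  # roughly: 'Intro point one line, two'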
Example 5: standings
# Required import: import bs4 [as alias]
# Or: from bs4 import Comment [as alias]
def standings(season=None):
    # get most recent standings if date not specified
    if season is None:
        season = int(datetime.datetime.today().strftime("%Y"))
    if season < 1871:
        raise ValueError(
            "This query only returns standings back to the 1871 season. "
            "Try looking at years from 1871 to present."
        )
    # retrieve html from baseball reference
    soup = get_soup(season)
    if season >= 1969:
        tables = get_tables(soup, season)
    else:
        t = soup.find_all(string=lambda text: isinstance(text, Comment))
        # list of seasons whose table placement breaks the site's usual pattern
        exceptions = [1884, 1885, 1886, 1887, 1888, 1889, 1890, 1892, 1903]
        if season > 1904 or season in exceptions:
            code = BeautifulSoup(t[16], "lxml")
        else:  # season <= 1904
            code = BeautifulSoup(t[15], "lxml")
        tables = get_tables(code, season)
    tables = [pd.DataFrame(table) for table in tables]
    for idx in range(len(tables)):
        # promote the first row to be the header, then drop it from the data
        tables[idx] = tables[idx].rename(columns=tables[idx].iloc[0])
        tables[idx] = tables[idx].reindex(tables[idx].index.drop(0))
    return tables
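The function depends on get_soup and get_tables from its own module, so it is not runnable here, but the header-promotion idiom at the end can be isolated as a self-contained pandas example:

import pandas as pd

raw = pd.DataFrame([["Tm", "W", "L"], ["BOS", 92, 70], ["NYY", 90, 72]])
raw = raw.rename(columns=raw.iloc[0]).reindex(raw.index.drop(0))
print(raw)  # columns Tm/W/L with the two data rows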
Example 6: get_overrides
# Required import: import bs4 [as alias]
# Or: from bs4 import Comment [as alias]
def get_overrides(self, soup):
    """
    Look for overrides in the text to make exceptions for specific style
    rules. Returns a set of rule strings to ignore for this block.
    """
    overrides = set()
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    for comment in comments:
        m = re.match(OVERRIDE_COMMENT_REGEX, comment)
        if m:
            new_overrides = m.group(1).split(",")
            new_overrides = {o.strip() for o in new_overrides}
            logger.info("Overrides found: %s" % new_overrides)
            overrides |= new_overrides
    return overrides
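OVERRIDE_COMMENT_REGEX and logger are defined outside the snippet; a guessed pattern is enough to demonstrate the flow (the real pattern may differ):

import logging
import re
from bs4 import BeautifulSoup, Comment

OVERRIDE_COMMENT_REGEX = r"\s*override:\s*(.+)"  # assumed pattern
logger = logging.getLogger(__name__)

soup = BeautifulSoup("<p>text</p><!-- override: rule-a, rule-b -->", "html.parser")
print(get_overrides(None, soup))  # -> {'rule-a', 'rule-b'} (self is unused here)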
Example 7: _fetch
# Required import: import bs4 [as alias]
# Or: from bs4 import Comment [as alias]
def _fetch(url: str) -> BeautifulSoup:
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(req) as response:
        html = response.read()
    page = BeautifulSoup(html, "html.parser")
    # drop tags that never carry readable text
    for ignored_tag in ["script", "img", "input", "button", "style", "font",
                        "iframe", "object", "embed"]:
        for tag in page.find_all(ignored_tag):
            tag.decompose()
    # strip presentational attributes and data-* attributes
    for tag in page.find_all(recursive=True):
        for attribute in ["class", "id", "name", "style", "role", "lang",
                          "dir", "href", "src"]:
            del tag[attribute]
        for attribute in list(tag.attrs):
            if attribute.startswith("data-"):
                del tag.attrs[attribute]
    for node in page.find_all(text=lambda s: isinstance(s, Comment)):
        node.extract()
    return page
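Example call (requires network access):

import urllib.request
from bs4 import BeautifulSoup, Comment

page = _fetch("https://example.com")
print(page.get_text(" ", strip=True)[:200])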
Example 8: _fetchWebpage
# Required import: import bs4 [as alias]
# Or: from bs4 import Comment [as alias]
def _fetchWebpage(self, url):
    if isMac:
        context = _create_unverified_context()
        html = urlopen(url, context=context).read()
    else:
        headers = {'User-Agent': self.settings['userAgent']}
        html = get(url, headers=headers).content
    webpage = BeautifulSoup(html, 'html.parser')
    for tagName in self.settings['badTags']:
        for tag in webpage.find_all(tagName):
            tag.decompose()
    for c in webpage.find_all(text=lambda s: isinstance(s, Comment)):
        c.extract()
    return webpage
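The method expects isMac, urlopen, _create_unverified_context, requests' get, and a settings dict from its surrounding module; a rough harness under those assumptions:

from requests import get
from bs4 import BeautifulSoup, Comment

isMac = False  # assumed platform flag from the original module

class PageFetcher:
    settings = {'userAgent': 'Mozilla/5.0', 'badTags': ['script', 'style']}  # assumed keys
    _fetchWebpage = _fetchWebpage  # reuse the method above

# PageFetcher()._fetchWebpage('https://example.com')  # requires network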
Example 9: find_comments_in_html_by_urls
# Required import: import bs4 [as alias]
# Or: from bs4 import Comment [as alias]
def find_comments_in_html_by_urls(self, urls):
    res = []
    for url in urls:
        path = urlparse(url).path
        host = urlparse(url).hostname
        # raw HTTP/1.1 request line (the original interpolated the URL scheme
        # here, which produced an invalid protocol token such as "http/1.1")
        req = "GET {0} HTTP/1.1\r\nHost: {1}\r\n\r\n".format(path, host)
        try:
            r = self.zap.send_request(req)
            html = str(r['responseBody'])
        except Exception:
            r = requests.get(url)
            html = r.text
        if html:
            soup = BeautifulSoup(html, 'html.parser')
            comments = soup.findAll(text=lambda text: isinstance(text, Comment))
            comment_list = [str(comment) for comment in comments]
            # use the fetched body for "resp" so both branches work
            c = {"method": "GET", "url": url, "resp": html,
                 "request": "GET " + url, "data": comment_list}
            res.append(c)
    return res
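Without a running ZAP proxy, a stub whose send_request always fails drives everything through the requests fallback; the stub and class names are invented for illustration:

import requests
from urllib.parse import urlparse
from bs4 import BeautifulSoup, Comment

class _NoZap:
    def send_request(self, req):
        raise RuntimeError("no ZAP proxy available")

class CommentFinder:
    zap = _NoZap()
    find_comments_in_html_by_urls = find_comments_in_html_by_urls

# CommentFinder().find_comments_in_html_by_urls(["https://example.com"])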
Example 10: duosuo
# Required import: import bs4 [as alias]
# Or: from bs4 import Comment [as alias]
def duosuo(self):
    if not self.duoshuo_shortname:
        return """
        """
    else:
        return """
        <!-- Duoshuo Comment BEGIN -->
        <div class="ds-thread"></div>
        <script type="text/javascript">
        var duoshuoQuery = {short_name:"%s"};
        (function() {
            var ds = document.createElement('script');
            ds.type = 'text/javascript';ds.async = true;
            ds.src = 'http://static.duoshuo.com/embed.js';
            ds.charset = 'UTF-8';
            (document.getElementsByTagName('head')[0]
             || document.getElementsByTagName('body')[0]).appendChild(ds);
        })();
        </script>
        <!-- Duoshuo Comment END -->
        """ % self.duoshuo_shortname
Example 11: soup_strings
# Required import: import bs4 [as alias]
# Or: from bs4 import Comment [as alias]
def soup_strings(soup):
    paragraph_tags = set(["caption", "details", "h1", "h2", "h3", "h4", "h5",
                          "h6", "li", "p", "td", "div", "span"])
    skip_children = None
    for descendant in soup.descendants:
        # If we've treated a tag as a contiguous paragraph, don't re-emit the
        # children (see below).
        if skip_children is not None:
            try:
                in_skip = descendant in skip_children
            except RecursionError:
                # Possible for this check to hit a nasty infinite recursion
                # because of BeautifulSoup __eq__ checks.
                in_skip = True
            if in_skip:
                continue
            else:
                skip_children = None
        # Treat some tags as contiguous paragraphs, regardless of other tags
        # nested inside (like <a> or <b>).
        if isinstance(descendant, bs4.Tag):
            if descendant.name in paragraph_tags:
                if descendant.find_all(paragraph_tags):
                    # If there are nested paragraph tags, don't treat it as a
                    # single contiguous tag.
                    continue
                skip_children = list(descendant.descendants)
                text = " ".join(descendant.get_text(" ", strip=True).split())
                if text:
                    yield text
                continue
        if (isinstance(descendant, bs4.Comment) or
                not isinstance(descendant, bs4.NavigableString)):
            continue
        text = " ".join(descendant.strip().split())
        if text:
            yield text
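Feeding it a small document yields one string per contiguous paragraph-like tag, with comments skipped:

import bs4

soup = bs4.BeautifulSoup(
    "<h1>Title</h1><div><p>One <b>bold</b> line.</p><!-- skip --></div>",
    "html.parser")
print(list(soup_strings(soup)))  # -> ['Title', 'One bold line.']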
Example 12: _parse_file
# Required import: import bs4 [as alias]
# Or: from bs4 import Comment [as alias]
def _parse_file(test_name):
    """Parse the given HTML file."""
    file_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                             'data', 'hints', 'html', test_name)
    with open(file_path, 'r', encoding='utf-8') as html:
        soup = bs4.BeautifulSoup(html, 'html.parser')
    # convert to str only after the None check; wrapping find() in str()
    # would turn a missing comment into the string 'None'
    comment = soup.find(text=lambda text: isinstance(text, bs4.Comment))
    if comment is None:
        raise InvalidFile(test_name, "no comment found")
    data = utils.yaml_load(str(comment))
    if not isinstance(data, dict):
        raise InvalidFile(test_name, "expected yaml dict but got {}".format(
            type(data).__name__))
    allowed_keys = {'target', 'qtwebengine_todo'}
    if not set(data.keys()).issubset(allowed_keys):
        raise InvalidFile(test_name, "expected keys {} but found {}".format(
            ', '.join(allowed_keys),
            ', '.join(set(data.keys()))))
    if 'target' not in data:
        raise InvalidFile(test_name, "'target' key not found")
    qtwebengine_todo = data.get('qtwebengine_todo', None)
    return ParsedFile(target=data['target'], qtwebengine_todo=qtwebengine_todo)
Example 13: text
# Required import: import bs4 [as alias]
# Or: from bs4 import Comment [as alias]
def text(self, target=None, ignore_pureascii_words=False):
    """
    Get all text in the HTML, skipping script and comment nodes.

    :param target: the BeautifulSoup object; defaults to self.b
    :param ignore_pureascii_words: if True, only return strings that contain
        Chinese characters (may be useful for English-version websites)
    :return: list of str
    """
    if target is None:
        target = self.b
    from bs4 import Comment
    from bs4.element import NavigableString, Doctype
    result = []
    for descendant in target.descendants:
        if not isinstance(descendant, NavigableString) \
                or isinstance(descendant, Doctype) \
                or descendant.parent.name in ["script", "style"] \
                or isinstance(descendant, Comment) \
                or "none" in descendant.parent.get("style", "") \
                or "font-size:0px" in descendant.parent.get("style", ""):
            continue
        data = descendant.strip()
        if len(data) > 0:
            if not ignore_pureascii_words or any(ord(i) > 127 for i in data):
                if PY2:
                    result.append(data.encode())
                else:
                    result.append(data)
    return result
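A minimal harness, assuming a host class whose self.b holds the parsed page and a module-level PY2 flag (both are assumptions here):

from bs4 import BeautifulSoup

PY2 = False  # assumed module-level flag

class Page:
    text = text  # the method defined above

    def __init__(self, html):
        self.b = BeautifulSoup(html, "html.parser")

print(Page("<p>你好 <!-- note --> world</p>").text())  # -> ['你好', 'world']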
Example 14: sync_file
# Required import: import bs4 [as alias]
# Or: from bs4 import Comment [as alias]
def sync_file(path_prefix, course_id):
    if not os.path.exists(path_prefix):
        os.makedirs(path_prefix)
    soup = bs(get_page('MultiLanguage/lesson/student/download.jsp?course_id='
                       + str(course_id)), 'html.parser')
    for comment in soup(text=lambda text: isinstance(text, bs4.Comment)):
        link = bs(comment, 'html.parser').a
        name = link.text
        uri = comment.next.next.a.get('href')
        filename = link.get('onclick').split('getfilelink=')[-1].split('&id')[0]
        file_path = os.path.join(path_prefix, filename)
        if not os.path.exists(file_path):
            print('Download ', name)
            with open(file_path, 'wb') as f:
                f.write(open_page(uri).read())
Example 15: decomposeAdditional
# Required import: import bs4 [as alias]
# Or: from bs4 import Comment [as alias]
def decomposeAdditional(self, soup):
    # Clean out any local stylesheets
    for instance in soup.find_all('style', attrs={"type": "text/css"}):
        instance.decompose()
    decompose = [
        # Clear out all the iframes
        'iframe',
        # Style tags, even if not explicitly tagged as text/css
        'style',
        # And all remote scripts
        "script",
        # Link tags
        "link",
        # Meta tags
        "meta",
    ]
    if self.decompose_svg:
        decompose.append("svg")
    for instance in soup.find_all(decompose):
        # If it's a style tag, make sure the type is text/css before removing
        if instance.name == 'style':
            if instance.get("type", None) == "text/css":
                instance.decompose()
        else:
            instance.decompose()
    # Comments
    for item in soup.findAll(text=lambda text: isinstance(text, bs4.Comment)):
        item.extract()
    return soup
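A quick sketch of the method in action, with an invented host class supplying the decompose_svg flag:

import bs4
from bs4 import BeautifulSoup

class Cleanup:
    decompose_svg = True  # assumed setting
    decomposeAdditional = decomposeAdditional  # reuse the method above

soup = BeautifulSoup(
    '<div><style type="text/css">p{}</style><script>x()</script>'
    '<!-- note --><p>Body</p></div>',
    'html.parser')
print(Cleanup().decomposeAdditional(soup))  # -> <div><p>Body</p></div>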