本文整理汇总了Python中bs4.Tag类的典型用法代码示例。如果您正苦于以下问题:Python bs4.Tag的具体用法?Python bs4.Tag怎么用?Python bs4.Tag使用的例子?那么,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块bs4的用法示例。
在下文中一共展示了bs4.Tag的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Tag [as 别名]
def __init__(self, input, id_=None, **kwargs):
    """Locate the target table element and initialise the instance state.

    :param input: An HTML string or an already-parsed ``bs4`` ``Tag``. The
        name shadows the builtin, but it is part of the public signature
        and is therefore kept.
    :param id_: ``id`` attribute used to look the table up when the root
        element is not itself a ``<table>``.
    :param kwargs: May contain ``transformer``, which is stored on the
        instance; ``str`` is used when it is absent.
    :raises TypeError: If ``input`` is neither a ``str`` nor a ``Tag``.
    :raises ValueError: If ``input`` is a string in which no element is found.
    """
    # TODO: should divide this class into two subclasses
    # to deal with string and bs4.Tag separately
    if isinstance(input, str):
        soup = BeautifulSoup(input, 'html.parser').find()
        if soup is None:
            # Without this guard the ``soup.name`` access below would fail
            # with an opaque AttributeError on None.
            raise ValueError('No HTML element found in the input string')
    elif isinstance(input, Tag):
        soup = input
    else:
        # TypeError is still an Exception, so existing handlers keep working.
        raise TypeError('Unrecognized type. Valid input: str, bs4.element.Tag')

    # locate the target table; the id lookup may leave ``_table`` as None
    # when no element carries that id.
    if soup.name == 'table':
        self._table = soup
    else:
        self._table = soup.find(id=id_)

    self._transformer = kwargs.get('transformer', str)
    self._output = []
示例2: append_to
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Tag [as 别名]
def append_to(parent, tag, **kwargs):
    """
    Create a new element and append it to the given parent.
    :param parent: Element that receives the new child.
    :param tag: Name of the tag to create.
    :param kwargs: Attributes of the new tag.
    :return: The newly created element.
    """
    owner = parent.soup if hasattr(parent, "soup") else parent.find_parent("html")
    # Create Tag explicitly instead of using new_tag, otherwise an attribute
    # called "name" would clash with the tag-name argument in bs4.
    element = bs4.Tag(builder=owner.builder, name=tag, attrs=kwargs)
    # Remember the owner on the new element so that later calls using it as
    # a parent take the ``hasattr(parent, "soup")`` shortcut above.
    element.soup = owner
    parent.append(element)
    return element
示例3: construct_element
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Tag [as 别名]
def construct_element(container=None, content=None, tag=None, element_type=None):
    """
    Build an element and, when a container is given, append it to that container.
    :param container: Container to add the element to; when None the element
        is created through ``root`` instead.
    :param content: String representation of content (e.g. JS or CSS)
    :param tag: Tag name, e.g. "script" or "style"
    :param element_type: E.g. "text/javascript" or "text/css"
    :return: New element.
    """
    element = (
        root(tag, type=element_type)
        if container is None
        else append_to(container, tag, type=element_type)
    )
    if content is None:
        return element
    element.string = content
    return element
示例4: html
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Tag [as 别名]
def html(self):
    """Render ``self.markdown`` to an HTML fragment.

    The Markdown is converted to HTML5, every ``<img src=...>`` has its
    ``src`` passed through ``self._get_static_url``, and the children of the
    ``<body>`` element are serialised without the wrapping ``<html>`` /
    ``<body>`` tags. When the article is a directory that contains
    ``ARTICLE_STYLESHEET_FILENAME``, a ``<link>`` to that stylesheet is
    prepended.

    :return: The HTML fragment as a string.
    """
    rendered = markdown.markdown(self.markdown,
                                 extensions=MARKDOWN_EXTENSIONS,
                                 output_format='html5')

    # fix image links; ``src=True`` skips <img> tags that have no src
    # attribute, which previously raised KeyError.
    soup = BeautifulSoup(rendered, 'lxml')
    for img in soup.find_all('img', src=True):
        img.attrs['src'] = self._get_static_url(img.attrs['src'])

    # strip html and body tags
    body = soup.find('body') or ''
    if isinstance(body, SoupTag):
        body = ''.join(map(str, body.contents))

    # prefix stylesheet if necessary
    if not self.is_dir or not os.path.exists(
            os.path.join(self.dir_path, ARTICLE_STYLESHEET_FILENAME)):
        return body

    href = self._get_static_url(ARTICLE_STYLESHEET_FILENAME)
    return f'<link rel="stylesheet" type="text/css" href="{href}">' + body
示例5: retrieve_erly_iframe_src
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Tag [as 别名]
def retrieve_erly_iframe_src(
    self, video_source_response: CachedResponse
) -> Union[str, dict]:
    """Extract the ``src`` of the ``div.full_screen > iframe`` element.

    :param video_source_response: Response whose ``content`` is parsed as HTML.
    :return: The iframe's ``src`` value on success. On failure a dict of the
        form ``{"fatal_error": <message>}`` is returned instead, which is why
        the return annotation is a union rather than plain ``str``.
    """
    # NOTE(review): ``pipe`` is assumed to compose its callables left to
    # right (parse first, then select) - confirm against its definition.
    erly_iframe: Union[Tag, None] = pipe(
        lambda r_content: BeautifulSoup(
            r_content,
            "html.parser"
        ),
        lambda soup: soup.select_one(
            "div.full_screen > iframe"
        )
    )(video_source_response.content)

    if not erly_iframe:
        return {"fatal_error": ".full_screen > iframe wasn't found"}

    erly_iframe_src: Union[str, None] = erly_iframe.get("src")
    if not erly_iframe_src:
        # Covers both a missing attribute and an empty one.
        return {"fatal_error": ".full_screen > iframe doesn't have src attribute"}

    return erly_iframe_src
示例6: _find_base_element
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Tag [as 别名]
def _find_base_element(element: Tag) -> Optional[Tag]:
    """
    Return the 'base element' of a symbol: the first '<mi>' element met in a
    pre-order walk that starts at 'element' itself. For the symbols 'x^2' or
    'x_i' this is the '<mi>' element for 'x', since the base is the first
    child of '<msub>' or '<msup>' elements. None is returned when neither the
    element nor any of its tag descendants is an '<mi>'.
    """
    base_tag_name = "mi"

    # Explicit-stack depth-first search. Children are pushed in reverse so
    # that they are popped, and therefore visited, in document order.
    pending = [element]
    while pending:
        node = pending.pop()
        if node.name == base_tag_name:
            return node
        tag_children = [child for child in node.children if isinstance(child, Tag)]
        pending.extend(reversed(tag_children))

    return None
示例7: _extract_tokens
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Tag [as 别名]
def _extract_tokens(element: Tag) -> List[Token]:
    """
    Return the tokens defined directly by this element: a one-item list when
    the element's tag name is in TOKEN_TAGS and it carries the s2 token
    annotations, otherwise an empty list. Low-level elements such as "<mi>"
    and "<mn>" hold tokens; grouping nodes such as "<mrow>" and "<msub>"
    yield none.
    """
    if element.name not in TOKEN_TAGS or not _has_s2_token_annotations(element):
        return []

    token = Token(
        text=element.string,
        token_index=int(element["s2:index"]),
        start=int(element["s2:start"]),
        end=int(element["s2:end"]),
    )
    return [token]
示例8: define
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Tag [as 别名]
def define(word, num=1):
    """Fetch the ``num``-th definition of ``word`` from the WordNet web search.

    :param word: Term to look up; it is URL-encoded before being sent.
    :param num: 1-based index of the wanted definition. Values below 1 are
        treated as 1.
    :return: A ``(text, count)`` tuple on every path. ``text`` is the
        formatted definition or an error message; ``count`` is the number of
        definitions found, or 0 on failure.
    """
    # Imported locally so the block does not depend on a new file-level import.
    from urllib.parse import quote_plus

    if num < 1:
        num = 1

    try:
        url = ("http://wordnetweb.princeton.edu/perl/webwn?s=" + quote_plus(word)
               + "&sub=Search+WordNet&o2=&o0=&o8=1&o1=1&o7=&o5=&o9=&o6=&o3=&o4=&h=0000000000")
    except TypeError as e:
        # ``word`` was not a string. Return the same (text, count) shape as
        # the other paths; this branch used to return a bare string.
        print(e)
        return "Couldn't download definition.", 0

    try:
        soup = BeautifulSoup(request.urlopen(url))
    except Exception:
        # Deliberately best-effort: any download or parse failure is reported
        # to the caller, but KeyboardInterrupt/SystemExit are no longer
        # swallowed as they were with the bare ``except:``.
        return "Network Error: Couldn't download definition.", 0

    if soup.ul is not None:
        definitions = [x.text for x in soup.ul
                       if isinstance(x, Tag) and x.text not in ('\n', '')]
        if len(definitions) >= num:
            entry = "{}[{} of {}]".format(definitions[num - 1], num, len(definitions))
            # NOTE(review): the first three characters are dropped,
            # presumably a fixed prefix in the page markup - confirm.
            return entry[3:].capitalize(), len(definitions)

    return "Couldn't find definition.", 0
示例9: get_pkd
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Tag [as 别名]
def get_pkd(self, *args, **kwargs):
    """Return the de-duplicated PKD entries of the full report.

    The details returned by ``self._get_details(*args, **kwargs)`` select the
    report type ('P' when the ``typ`` text contains "P", otherwise 'F') and
    provide the REGON passed to the 'DanePobierzPelnyRaport' service call.

    :return: A list of ``{'code', 'name', 'main'}`` dicts in first-seen
        order, without duplicates. The list is empty when either the details
        or the report is unavailable.
    """
    pkd = []
    details = self._get_details(*args, **kwargs)
    if details is None:
        return pkd

    summary = BeautifulSoup(details, 'lxml')
    report_type = self.pkd_report_type.get('F')
    if 'P' in summary.typ.get_text():
        report_type = self.pkd_report_type.get('P')

    report = self._service(
        'DanePobierzPelnyRaport', summary.regon.get_text(), report_type)
    if report is None:
        return pkd

    # De-duplicate while appending. The previous round-trip through ``set``
    # removed duplicates too, but left the result in arbitrary order.
    seen = set()
    for item in BeautifulSoup(report, 'lxml').find_all('dane'):
        # Field key: the child's tag name after its first '_', with the
        # remaining underscores removed and lower-cased.
        fields = {child.name.split('_', 1)[1].replace('_', '').lower(): child.get_text()
                  for child in item.children if isinstance(child, Tag)}
        entry = (fields['pkdkod'],
                 fields['pkdnazwa'],
                 fields['pkdprzewazajace'] == '1')
        if entry in seen:
            continue
        seen.add(entry)
        pkd.append({'code': entry[0], 'name': entry[1], 'main': entry[2]})

    return pkd
示例10: soup_strings
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Tag [as 别名]
def soup_strings(soup):
    """Yield whitespace-normalised text fragments from a parsed document.

    Tags named in ``paragraph_tags`` that contain no further paragraph tag
    are emitted as one contiguous string, regardless of inline tags nested
    inside (like <a> or <b>). Remaining strings that are not comments are
    emitted on their own. Empty fragments are never yielded.

    :param soup: Object whose ``descendants`` are iterated in order.
    :return: Generator of non-empty strings.
    """
    paragraph_tags = {"caption", "details", "h1", "h2", "h3", "h4", "h5",
                      "h6", "li", "p", "td", "div", "span"}
    # ids of the nodes below the paragraph emitted most recently. Identity is
    # used on purpose: a list membership test compares with ``==``, which
    # (a) also drops a later node that merely *equals* a skipped child,
    # (b) is quadratic, and (c) can recurse without bound in bs4's __eq__.
    # The nodes stay alive in the tree, so their ids are stable.
    skip_ids = None
    for descendant in soup.descendants:
        # If we've treated a tag as a contiguous paragraph, don't re-emit the
        # children (see below).
        if skip_ids is not None:
            if id(descendant) in skip_ids:
                continue
            skip_ids = None

        # Treat some tags as contiguous paragraphs, regardless of other tags
        # nested inside (like <a> or <b>).
        if isinstance(descendant, bs4.Tag):
            if descendant.name in paragraph_tags:
                if descendant.find_all(paragraph_tags):
                    # If there are nested paragraph tags, don't treat it as a
                    # single contiguous tag.
                    continue
                skip_ids = {id(child) for child in descendant.descendants}
                text = " ".join(descendant.get_text(" ", strip=True).split())
                if text:
                    yield text
            # A tag never yields through the string branch below.
            continue

        if (isinstance(descendant, bs4.Comment) or
                not isinstance(descendant, bs4.NavigableString)):
            continue

        text = " ".join(descendant.strip().split())
        if text:
            yield text
示例11: get_last_descendant
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Tag [as 别名]
def get_last_descendant(self, node):
    """Return the node's next sibling or, lacking one, the element after its deepest last child."""
    sibling = node.next_sibling
    if sibling is not None:
        return sibling

    # No sibling: walk down the chain of last children, then step to
    # whatever element follows the deepest one.
    deepest = node
    while isinstance(deepest, bs4.Tag) and deepest.contents:
        deepest = deepest.contents[-1]
    return deepest.next_element
示例12: __init__
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Tag [as 别名]
def __init__(self, table):
    """Store the table(s) to process and reset the output buffer.

    :param table: A ``Tag`` (stored as is), a ``soup`` instance (all of its
        ``<table>`` elements are stored), or an HTML string (parsed with
        'html.parser', then all ``<table>`` elements are stored).
    :raises TypeError: If ``table`` is none of the supported types.
    """
    # NOTE(review): if ``soup`` is bs4's BeautifulSoup class, which is a Tag
    # subclass, the first branch also matches soup objects and the second
    # branch is never reached - confirm which behaviour is wanted.
    if isinstance(table, Tag):
        self.table = table
    elif isinstance(table, soup):
        self.table = table.find_all("table")
    elif isinstance(table, str):
        # Parse the markup that was passed in; the builtin ``str`` type was
        # handed to the parser here before.
        self.table = soup(table, 'html.parser').find_all("table")
    else:
        # TypeError is still an Exception, so existing handlers keep working.
        raise TypeError('unrecognized type')
    self.output = []
示例13: clean_pullquote_tags
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Tag [as 别名]
def clean_pullquote_tags(item: BS4_Tag) -> BS4_Tag:
    """
    Remove the "[pullquote]" and "[/pullquote]" markers from the tag's string.
    The markers are deleted; no replacement markup is inserted.
    https://stackoverflow.com/a/44593228/1191545

    :param item: Tag whose ``string`` is cleaned in place.
    :return: The same tag. It is returned untouched when ``item.string`` is
        None, which previously raised AttributeError.
    """
    text = item.string
    if text is None:
        return item

    for marker in ("[pullquote]", "[/pullquote]"):
        text = text.replace(marker, "")
    item.string = text

    return item
示例14: makeTag
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Tag [as 别名]
def makeTag(name, string=None, **kwargs):
    """Create a detached ``Tag`` with the given name, attributes and text.

    :param name: Tag name.
    :param string: Optional text content; a falsy value leaves the tag empty.
    :param kwargs: Attributes of the tag. ``None`` values become "".
    :return: The new tag.
    """
    # Normalise before constructing the tag. Doing it afterwards only worked
    # if Tag kept a reference to the very same dict instead of copying it.
    attrs = {key: "" if value is None else value
             for key, value in kwargs.items()}
    tag = Tag(name=name, attrs=attrs)
    if string:
        tag.string = string
    return tag
示例15: parse_business_hours
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import Tag [as 别名]
def parse_business_hours(business_hours_html):
    """Extract the opening-hours text from an HTML snippet.

    The element with class 'glyphicon-time' is located; the tag siblings that
    follow its grandparent are scanned, skipping any that contain the
    'glyphicon-transfer' or 'glyphicon-education' element. The stripped
    strings of every ``div.col-xs-10`` inside the remaining siblings are
    collected.

    :param business_hours_html: HTML markup to parse.
    :return: '*Öffnungszeiten*' followed by one line per collected string, or
        '' when there is no 'glyphicon-time' element.
    """
    soup = bs4.BeautifulSoup(business_hours_html, 'html.parser')
    time_icon = soup.find(class_='glyphicon-time')
    transfer_icon = soup.find(class_='glyphicon-transfer')
    education_icon = soup.find(class_='glyphicon-education')

    if not time_icon:
        return ''

    # Collect the lines and join once instead of repeated ``+=``.
    lines = ['*Öffnungszeiten*']
    for sib in time_icon.parent.parent.next_siblings:
        if not isinstance(sib, bs4.Tag):
            continue
        if transfer_icon in sib.descendants or education_icon in sib.descendants:
            continue
        for item in sib.find_all('div', class_='col-xs-10'):
            lines.extend(item.stripped_strings)

    return '\n'.join(lines).strip()