This article collects typical usage examples of the Python bs4.SoupStrainer class. If you are unsure what bs4.SoupStrainer does, how to use it, or what working code looks like, the curated code examples below may help. You can also explore further usage examples from the bs4 module it belongs to.
The following presents 15 code examples of bs4.SoupStrainer, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
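Before the examples, here is a minimal sketch of the pattern they all share: passing a SoupStrainer to BeautifulSoup via parse_only so that only the matching tags are parsed into the tree. The markup and variable names below are illustrative placeholders rather than code taken from any of the examples.

from bs4 import BeautifulSoup, SoupStrainer

html = '<div id="grid-frame"><a href="/a">A</a></div><p>ignored</p>'  # placeholder markup

# Parse only <a> tags; everything else in the document is skipped entirely.
only_links = SoupStrainer('a')
link_soup = BeautifulSoup(html, 'html.parser', parse_only = only_links)
print([a['href'] for a in link_soup.find_all('a')])  # ['/a']

# A strainer can also filter on attributes, e.g. a specific id or class.
only_grid = SoupStrainer('div', id = 'grid-frame')
grid_soup = BeautifulSoup(html, 'html.parser', parse_only = only_grid)
print(grid_soup.div is not None)  # True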
Example 1: masterlist
# Required import: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def masterlist(SITE, SHOWS, SPECIALS = None):
    master_db = []
    root_dict = {}
    root_url = SHOWS
    root_data = connection.getURL(root_url)
    root_tree = BeautifulSoup(root_data, 'html.parser', parse_only = SoupStrainer('div', id = 'grid-frame'))
    root_menu = root_tree.find_all('div', class_ = 'media-module')
    for root_item in root_menu:
        root_name = root_item.find('div', class_ = 'title').text
        season_url = BASE + root_item.a['href']
        if '-1' not in season_url:
            tvdb_name = common.get_show_data(root_name, SITE, 'seasons')[-1]
            root_name = root_name + '#' + season_url
            if tvdb_name not in root_dict.keys():
                root_dict[tvdb_name] = root_name
            else:
                root_dict[tvdb_name] = root_dict[tvdb_name] + '|' + root_name
    for root_name in root_dict:
        season_url = root_dict[root_name]
        master_db.append((root_name, SITE, 'seasons', season_url))
    more = root_tree.find('a', class_ = 'load-more')
    if more:
        master_db.extend(masterlist(SITE, BASE + more['href']))
    return master_db
Example 2: episodes
# Required import: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def episodes(SITE, episode_url = common.args.url):
    episodes = []
    if '#' in episode_url:
        episode_url = episode_url.split('#')[1]
    episode_data = connection.getURL(episode_url)
    episode_tree = BeautifulSoup(episode_data, 'html.parser', parse_only = SoupStrainer('div', class_ = 'show'))
    try:
        episodes = add_videos(episode_tree, SITE)
    except:
        print "Can't add video"
    more = episode_tree.find('a', class_ = 'load-more')
    if more:
        episode_data = connection.getURL(BASE + more['href'])
        episode_tree = BeautifulSoup(episode_data, 'html.parser')
        episodes = add_videos(episode_tree, SITE)
    return episodes
Example 3: convert_subtitles
# Required import: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def convert_subtitles(closedcaption):
    str_output = ''
    subtitle_data = connection.getURL(closedcaption, connectiontype = 0)
    subtitle_data = BeautifulSoup(subtitle_data, 'html.parser', parse_only = SoupStrainer('div'))
    lines = subtitle_data.find_all('p')
    for i, line in enumerate(lines):
        if line is not None:
            sub = clean_subs(common.smart_utf8(line))
            start_time_hours, start_time_rest = line['begin'].split(':', 1)
            start_time_hours = '%02d' % (int(start_time_hours) - 1)
            start_time = common.smart_utf8(start_time_hours + ':' + start_time_rest.replace('.', ','))
            end_time_hours, end_time_rest = line['end'].split(':', 1)
            end_time_hours = '%02d' % (int(end_time_hours) - 1)
            end_time = common.smart_utf8(end_time_hours + ':' + end_time_rest.replace('.', ','))
            str_output += str(i + 1) + '\n' + start_time + ' --> ' + end_time + '\n' + sub + '\n\n'
    file = open(ustvpaths.SUBTITLE, 'w')
    file.write(str_output)
    file.close()
    return True
Example 4: convert_subtitles
# Required import: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def convert_subtitles(closedcaption):
    str_output = ''
    subtitle_data = connection.getURL(closedcaption, connectiontype = 0)
    subtitle_data = BeautifulSoup(subtitle_data, 'html.parser', parse_only = SoupStrainer('div'))
    lines = subtitle_data.find_all('p')
    for i, line in enumerate(lines):
        if line is not None:
            sub = clean_subs(common.smart_utf8(line))
            start_time_rest, start_time_msec = line['begin'].rsplit(':', 1)
            start_time = common.smart_utf8(start_time_rest + ',' + start_time_msec)
            try:
                end_time_rest, end_time_msec = line['end'].rsplit(':', 1)
                end_time = common.smart_utf8(end_time_rest + ',' + end_time_msec)
            except:
                continue
            str_output += str(i + 1) + '\n' + start_time + ' --> ' + end_time + '\n' + sub + '\n\n'
    file = open(ustvpaths.SUBTITLE, 'w')
    file.write(str_output)
    file.close()
Example 5: convert_subtitles
# Required import: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def convert_subtitles(closedcaption):
    str_output = ''
    subtitle_data = connection.getURL(closedcaption, connectiontype = 0)
    subtitle_data = BeautifulSoup(subtitle_data, 'html.parser', parse_only = SoupStrainer('div'))
    lines = subtitle_data.find_all('p')
    for i, line in enumerate(lines):
        if line is not None:
            sub = clean_subs(common.smart_utf8(line))
            start_time = common.smart_utf8(line['begin'].replace('.', ','))
            if ',' not in start_time:
                start_time = start_time + ',00'
            end_time = common.smart_utf8(line['end'].replace('.', ','))
            if ',' not in end_time:
                end_time = end_time + ',00'
            str_output += str(i + 1) + '\n' + start_time[:11] + ' --> ' + end_time[:11] + '\n' + sub + '\n\n'
    file = open(ustvpaths.SUBTITLE, 'w')
    file.write(str_output)
    file.close()
Example 6: convert_subtitles
# Required import: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def convert_subtitles(closedcaption):
    str_output = ''
    last_start_time = ''
    subtitle_data = connection.getURL(closedcaption, connectiontype = 0)
    subtitle_data = BeautifulSoup(subtitle_data, 'html.parser', parse_only = SoupStrainer('div'))
    lines = subtitle_data.find_all('p')
    for i, line in enumerate(lines):
        if line is not None:
            sub = clean_subs(common.smart_utf8(line))
            start_time = common.smart_utf8(line['begin'].replace('.', ','))
            try:
                end_time = common.smart_utf8(line['end'].replace('.', ','))
            except:
                continue
            if last_start_time != start_time:
                if i != 0:
                    str_output += '\n\n'
                str_output += str(i + 1) + '\n' + start_time + ' --> ' + end_time + '\n' + sub
            else:
                str_output += '\n' + sub
            last_start_time = start_time
    file = open(ustvpaths.SUBTITLE, 'w')
    file.write(str_output)
    file.close()
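Examples 3 through 6 are all variations on the same conversion: strain a TTML/DFXP caption document down to its <div>/<p> elements, then rewrite each paragraph's begin and end attributes into SubRip (SRT) cues. Below is a minimal, self-contained sketch of that idea; the ttml_to_srt helper name and the sample markup are made up for illustration, and the site-specific connection, common, and ustvpaths helpers are left out.

from bs4 import BeautifulSoup, SoupStrainer

TTML_SAMPLE = (
    '<div>'
    '<p begin="00:00:01.000" end="00:00:03.500">Hello there.</p>'
    '<p begin="00:00:04.000" end="00:00:06.000">Second line.</p>'
    '</div>'
)  # placeholder caption markup

def ttml_to_srt(ttml_text):
    # Only the <div> subtree (and the <p> cues inside it) is parsed.
    soup = BeautifulSoup(ttml_text, 'html.parser', parse_only = SoupStrainer('div'))
    cues = []
    for i, p in enumerate(soup.find_all('p'), start = 1):
        # SRT uses a comma, not a dot, as the millisecond separator.
        start = p['begin'].replace('.', ',')
        end = p['end'].replace('.', ',')
        cues.append('%d\n%s --> %s\n%s\n' % (i, start, end, p.get_text()))
    return '\n'.join(cues)

print(ttml_to_srt(TTML_SAMPLE))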
Example 7: __get_version
# Required import: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def __get_version(self):
    '''
    get jenkins version
    :return:
    '''
    try:
        html = urllib2.urlopen(self.url + '/login?from=%2F').read()
        links = SoupStrainer('a', href = re.compile(VERSION_TAG))
        version_text = BeautifulSoup(html, "html.parser", parse_only = links)
        if version_text.text != "":
            color_output("[+]....jenkins version is %s" % version_text.text)
            version_re = re.findall(u"ver.\s(.*)", version_text.text)
            if len(version_re) != 0:
                if version_re[0][0:4] >= self.check_version:
                    self.user_link = ASYNCH_PEOPEL_PERFIX
                else:
                    self.user_link = PEOPLE_PERFIX
        else:
            color_output("[-]....can't get jenkins version!")
            sys.exit()
    except urllib2.URLError, e:
        color_output("[-]....can't get jenkins version!")
        sys.exit()
Example 8: parse
# Required import: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def parse(response):
    page = response.text
    ss = SoupStrainer('table', width='650')
    bs = GlobalFeaturedSoup(page, parse_only=ss)
    title = bs.find('tr', bgcolor='#FB9E04')
    trs = bs.find_all('tr', bgcolor=re.compile(r'#D6D3CE|#B4B9B9'))
    if title:
        courses = []
        keys = tuple(title.stripped_strings)
        value_list = parse_tr_strs(trs)
        for values in value_list:
            course = dict(safe_zip(keys, values))
            course.pop('序号')
            course['课程代码'] = course['课程代码'].upper()
            course['班级容量'] = int(course['班级容量'])
            courses.append(course)
        return courses
    else:
        log_result_not_found(response)
        return []
Example 9: get_child_urls
# Required import: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def get_child_urls(main_page, max_child=20):
    """retrieve urls from a given html page.
    args:
        main_page(str): html file.
        max_child(int): max number of returned urls.
    return:
        list of url strings.
    """
    from bs4 import BeautifulSoup, SoupStrainer
    children = []
    for link in BeautifulSoup(main_page,
                              "html.parser",
                              parse_only=SoupStrainer('a')):
        if link.has_attr('href') and link['href'].startswith("http"):
            children.append(link['href'])
    if len(children) > max_child:
        children = children[:max_child]
    return children
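A quick usage sketch of the function above, with a made-up HTML snippet; relative links are skipped because only hrefs starting with "http" are collected:

page = '<a href="https://example.com/a">A</a> <a href="/relative">B</a>'
print(get_child_urls(page))  # ['https://example.com/a']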
Example 10: getLinks
# Required import: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def getLinks(text, url=""):
    if url and url[-1] == "/":
        url = url[:-1]
    links = []
    if text:
        for link in BeautifulSoup(text, "html.parser", parse_only=SoupStrainer("a", href=True)):
            if link.has_attr('href'):
                if (link['href']):
                    href = link['href'].strip()
                    if not href.startswith("http://") and not href.startswith("https://") and not href.startswith("mailto:") and not href.startswith("tel:"):
                        if not href.startswith('/'):
                            href = "/" + href
                        href = url + href
                    links.append(href)
    return links
Example 11: get_contents
# Required import: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def get_contents(game_html):
    """
    Uses Beautiful Soup to parse the html document.
    Some parsers work for some pages but don't work for others... I'm not sure why, so I just try them all here in order.
    :param game_html: html doc
    :return: "soupified" html
    """
    parsers = ["lxml", "html.parser", "html5lib"]
    strainer = SoupStrainer('td', attrs={'class': re.compile(r'bborder')})
    for parser in parsers:
        # parse_only only works with lxml for some reason
        if parser == "lxml":
            soup = BeautifulSoup(game_html, parser, parse_only=strainer)
        else:
            soup = BeautifulSoup(game_html, parser)
        tds = soup.find_all("td", {"class": re.compile('.*bborder.*')})
        if len(tds) > 0:
            break
    return tds
Example 12: __pre_process
# Required import: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def __pre_process(self, page):
    # As BeautifulSoup will cause memory I/O error when the page is too large
    if page.find('<dl') > 0:
        data = page.split('<dl')
        tag_dd = SoupStrainer('dd')
        for idx in xrange(1, len(data)):
            count = data[idx].count('<dd')
            if count > 5:
                parts = data[idx].split('</dl>')
                dds = parts[0].split('</dd>')
                data[idx] = ''.join([dds[0], '</dd> <dx>%d</dx>' % idx,
                                     dds[-1], '</dl>', ''.join(parts[1:])])
                self.__dd_ext[str(idx)] = []
                for item in dds[1:-1]:
                    dd = BeautifulSoup(item, parse_only=tag_dd).dd
                    assert dd
                    self.__dd_ext[str(idx)].append(dd)
        return '<dl'.join(data)
    else:
        return page
Example 13: __initdef
# Required import: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def __initdef(self, word, data):
    data = self.__pre_process(data)
    wpg = SoupStrainer('div', class_=re.compile('[^<>]*?wordPage[^<>]*?'))
    soup = BeautifulSoup(data, parse_only=wpg)
    div = soup.find('div', class_=re.compile('[^<>]*?wordPage[^<>]*?'))
    assert div
    self.__getwordtitle(div.h1)
    if word != self.__title:
        self.__title = None
        return False
    div = soup.find('div', {'class': 'section blurb'})
    if div:
        self.__hasblurb = True
        self.__getblurb(div)
    tags = soup.find_all(re.compile(r'div|h2'), class_='sectionHeader')
    tag = find_fulldefine(tags, re.compile(r'DEFINITIONS OF', re.I))
    if tag:
        self.__getfulldef(tag.parent)
    else:
        print("WARNING: %s HAS NO FULLDEFINITION" % self.__title)
        assert tag  # to raise error and break
    div = soup.find('div', {'class': 'section family'})
    if div:
        self.__getwordfamily(div)
    return True
Example 14: get_witnesses_for_bill
# Required import: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def get_witnesses_for_bill(bill_id, session):
    # ex: "HB 864" -> "https://capitol.texas.gov/tlodocs/86R/witlistbill/html/HB00864H.htm"
    parsed_bill_id = re.search(r"(\w+)\s+(\d+)", bill_id)
    bill_type = parsed_bill_id.group(1)
    bill_number = parsed_bill_id.group(2).zfill(5)
    url_prefix = f"https://capitol.texas.gov/tlodocs/{session}R/witlistbill/html/{bill_type}{bill_number}"
    house_url = f"{url_prefix}H.htm"
    senate_url = f"{url_prefix}S.htm"
    res = requests.get(house_url)
    # ##### Basic Test
    # # parsing all <p/> blocks up front may not be efficient
    # filter = SoupStrainer('p')  # only <p/> tags contain text that we care about
    # text_blocks = BeautifulSoup(res.content, "html.parser", parse_only=filter)
    # selecting = None;
    # for block in text_blocks:
    #     text = block.get_text(strip=True)
    #     print(f"[{text}]")
    return parse_witness_list_html(res.content)
Example 15: find_form_request
# Required import: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def find_form_request(html: str):
    soup = bs4.BeautifulSoup(html, "html.parser", parse_only=bs4.SoupStrainer("form"))
    form = soup.form
    if not form:
        raise _exception.ParseError("Could not find form to submit", data=html)
    url = form.get("action")
    if not url:
        raise _exception.ParseError("Could not find url to submit to", data=form)
    # From what I've seen, it'll always do this!
    if url.startswith("/"):
        url = "https://www.facebook.com" + url
    # It's okay to set missing values to something crap, the values are localized, and
    # hence are not available in the raw HTML
    data = {
        x["name"]: x.get("value", "[missing]")
        for x in form.find_all(["input", "button"])
    }
    return url, data