

Python bs4.SoupStrainer Code Examples

This article collects typical usage examples of bs4.SoupStrainer in Python. Note that SoupStrainer is a class exported by the bs4 package, not a method. If you are unsure what bs4.SoupStrainer does or how to use it, the curated code examples below should help; you can also explore further usage examples from the bs4 package it belongs to.


The following presents 15 code examples of bs4.SoupStrainer, drawn from open-source projects and sorted by popularity by default.
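Before the examples, here is a minimal, self-contained sketch of the pattern almost all of them share: passing a SoupStrainer to BeautifulSoup via parse_only, so that only matching tags are parsed at all. The HTML string is invented for illustration.

from bs4 import BeautifulSoup, SoupStrainer

# Made-up document, just for illustration.
html = '<div id="menu"><a href="/a">A</a></div><p>ignored</p><a href="/b">B</a>'

# Parse only <a> tags; everything else is skipped at parse time,
# which saves time and memory on large documents.
soup = BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer('a'))
print([a['href'] for a in soup.find_all('a')])  # ['/a', '/b']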

Example 1: masterlist

# Required module: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def masterlist(SITE, SHOWS, SPECIALS = None):
	master_db = []
	root_dict = {}
	root_url = SHOWS
	root_data = connection.getURL(root_url)
	root_tree = BeautifulSoup(root_data, 'html.parser', parse_only = SoupStrainer('div', id = 'grid-frame'))
	root_menu = root_tree.find_all('div', class_ = 'media-module')
	for root_item in root_menu:
		root_name = root_item.find('div', class_ = 'title').text
		season_url = BASE + root_item.a['href']
		if '-1' not in season_url:
			tvdb_name = common.get_show_data(root_name, SITE, 'seasons')[-1]
			root_name = root_name + '#' + season_url 
			if tvdb_name not in root_dict:
				root_dict[tvdb_name] = root_name
			else:
				root_dict[tvdb_name] = root_dict[tvdb_name] + '|' + root_name
	for root_name in root_dict:
		season_url = root_dict[root_name]
		master_db.append((root_name, SITE, 'seasons', season_url))
	more = root_tree.find('a', class_ = 'load-more')
	if more:
		master_db.extend(masterlist(SITE, BASE + more['href']))
	return master_db 
Author: moneymaker365, Project: plugin.video.ustvvod, Lines: 26, Source: main_natgeo.py

Example 2: episodes

# Required module: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def episodes(SITE, episode_url = common.args.url):
	episodes = []
	if '#' in episode_url:
		episode_url = episode_url.split('#')[1]
	episode_data = connection.getURL(episode_url)
	episode_tree = BeautifulSoup(episode_data, 'html.parser', parse_only = SoupStrainer('div', class_ = 'show'))
	try:
		episodes = add_videos(episode_tree, SITE)
	except Exception:
		print("Can't add video")
	more = episode_tree.find('a', class_ = 'load-more')
	if more:
		episode_data = connection.getURL(BASE + more['href'])
		episode_tree = BeautifulSoup(episode_data, 'html.parser')
		episodes = add_videos(episode_tree, SITE)
	return episodes 
Author: moneymaker365, Project: plugin.video.ustvvod, Lines: 18, Source: main_natgeo.py

Example 3: convert_subtitles

# Required module: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def convert_subtitles(closedcaption):
	str_output = ''
	subtitle_data = connection.getURL(closedcaption, connectiontype = 0)
	subtitle_data = BeautifulSoup(subtitle_data, 'html.parser', parse_only = SoupStrainer('div'))
	lines = subtitle_data.find_all('p')
	for i, line in enumerate(lines):
		if line is not None:
			sub = clean_subs(common.smart_utf8(line))
			start_time_hours, start_time_rest = line['begin'].split(':', 1)
			start_time_hours = '%02d' % (int(start_time_hours) - 1)
			start_time = common.smart_utf8(start_time_hours + ':' + start_time_rest.replace('.', ','))
			end_time_hours, end_time_rest = line['end'].split(':', 1)
			end_time_hours = '%02d' % (int(end_time_hours) - 1)
			end_time = common.smart_utf8(end_time_hours + ':' + end_time_rest.replace('.', ','))
			str_output += str(i + 1) + '\n' + start_time + ' --> ' + end_time + '\n' + sub + '\n\n'
	file = open(ustvpaths.SUBTITLE, 'w')
	file.write(str_output)
	file.close()
	return True 
Author: moneymaker365, Project: plugin.video.ustvvod, Lines: 21, Source: main_abcdisney.py
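For context, this snippet (and the three variants that follow) converts TTML-style <p begin="..." end="..."> cues into SubRip (SRT) text. Below is a minimal standalone sketch of the same idea with an invented cue; it deliberately skips the plugin's helpers (connection, common, ustvpaths) and the hour-shifting logic.

from bs4 import BeautifulSoup, SoupStrainer

# Invented TTML-like fragment for illustration.
ttml = '<div><p begin="00:00:01.500" end="00:00:03.250">Hello</p></div>'
soup = BeautifulSoup(ttml, 'html.parser', parse_only=SoupStrainer('div'))
for i, cue in enumerate(soup.find_all('p'), start=1):
    # SRT timestamps use a comma before the milliseconds: HH:MM:SS,mmm
    start = cue['begin'].replace('.', ',')
    end = cue['end'].replace('.', ',')
    print('%d\n%s --> %s\n%s\n' % (i, start, end, cue.get_text()))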

Example 4: convert_subtitles

# Required module: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def convert_subtitles(closedcaption):
	str_output = ''
	subtitle_data = connection.getURL(closedcaption, connectiontype = 0)
	subtitle_data = BeautifulSoup(subtitle_data, 'html.parser', parse_only = SoupStrainer('div'))
	lines = subtitle_data.find_all('p')
	for i, line in enumerate(lines):
		if line is not None:
			sub = clean_subs(common.smart_utf8(line))
			start_time_rest, start_time_msec = line['begin'].rsplit(':',1)
			start_time = common.smart_utf8(start_time_rest + ',' + start_time_msec)
			try:
				end_time_rest, end_time_msec = line['end'].rsplit(':', 1)
				end_time = common.smart_utf8(end_time_rest + ',' + end_time_msec)
			except KeyError:
				# skip cues that have no 'end' attribute
				continue
			str_output += str(i + 1) + '\n' + start_time + ' --> ' + end_time + '\n' + sub + '\n\n'
	file = open(ustvpaths.SUBTITLE, 'w')
	file.write(str_output)
	file.close() 
Author: moneymaker365, Project: plugin.video.ustvvod, Lines: 21, Source: main_nbcu.py

Example 5: convert_subtitles

# Required module: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def convert_subtitles(closedcaption):
	str_output = ''
	subtitle_data = connection.getURL(closedcaption, connectiontype = 0)
	subtitle_data = BeautifulSoup(subtitle_data, 'html.parser', parse_only = SoupStrainer('div'))
	lines = subtitle_data.find_all('p')
	for i, line in enumerate(lines):
		if line is not None:
			sub = clean_subs(common.smart_utf8(line))
			start_time = common.smart_utf8(line['begin'].replace('.', ','))
			if ',' not in start_time:
				start_time = start_time + ',00'
			end_time = common.smart_utf8(line['end'].replace('.', ','))
			if ',' not in end_time:
				end_time = end_time + ',00'
			str_output += str(i + 1) + '\n' + start_time[:11] + ' --> ' + end_time[:11] + '\n' + sub + '\n\n'
	file = open(ustvpaths.SUBTITLE, 'w')
	file.write(str_output)
	file.close() 
Author: moneymaker365, Project: plugin.video.ustvvod, Lines: 20, Source: pbs.py

Example 6: convert_subtitles

# Required module: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def convert_subtitles(closedcaption):
	str_output = ''
	last_start_time = ''
	subtitle_data = connection.getURL(closedcaption, connectiontype = 0)
	subtitle_data = BeautifulSoup(subtitle_data, 'html.parser', parse_only = SoupStrainer('div'))
	lines = subtitle_data.find_all('p')
	for i, line in enumerate(lines):
		if line is not None:
			sub = clean_subs(common.smart_utf8(line))
			start_time = common.smart_utf8(line['begin'].replace('.', ','))
			try:
				end_time = common.smart_utf8(line['end'].replace('.', ','))
			except KeyError:
				# skip cues that have no 'end' attribute
				continue
			if last_start_time != start_time:
				if i != 0:
					str_output += '\n\n'
				str_output += str(i + 1) + '\n' + start_time + ' --> ' + end_time + '\n' + sub
			else:
				str_output += '\n' + sub 
			last_start_time = start_time
	file = open(ustvpaths.SUBTITLE, 'w')
	file.write(str_output)
	file.close() 
Author: moneymaker365, Project: plugin.video.ustvvod, Lines: 26, Source: fox.py

Example 7: __get_version

# Required module: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def __get_version(self):
        '''
        get jenkins version
        :return:
        '''
        try:
            html = urllib2.urlopen(self.url + '/login?from=%2F').read()
            links = SoupStrainer('a', href=re.compile(VERSION_TAG))
            version_text = BeautifulSoup(html, "html.parser", parse_only=links)
            if version_text.text != "":
                color_output("[+]....jenkins version is %s" % version_text.text)
                version_re = re.findall(u"ver.\s(.*)", version_text.text)
                if len(version_re) != 0:
                    if version_re[0][0:4] >= self.check_version:
                        self.user_link = ASYNCH_PEOPEL_PERFIX
                    else:
                        self.user_link = PEOPLE_PERFIX
            else:
                color_output("[-]....can't get jenkins version!")
                sys.exit()
        except urllib2.URLError,e:
            color_output("[-]....can't get jenkins version!")
            sys.exit() 
Author: restran, Project: hacker-scripts, Lines: 25, Source: jenkins.py

Example 8: parse

# Required module: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def parse(response):
        page = response.text
        ss = SoupStrainer('table', width='650')
        bs = GlobalFeaturedSoup(page, parse_only=ss)
        title = bs.find('tr', bgcolor='#FB9E04')
        trs = bs.find_all('tr', bgcolor=re.compile(r'#D6D3CE|#B4B9B9'))
        if title:
            courses = []
            keys = tuple(title.stripped_strings)
            value_list = parse_tr_strs(trs)
            for values in value_list:
                course = dict(safe_zip(keys, values))
                course.pop('序号')
                course['课程代码'] = course['课程代码'].upper()
                course['班级容量'] = int(course['班级容量'])
                courses.append(course)
            return courses
        else:
            log_result_not_found(response)
            return [] 
Author: elonzh, Project: hfut, Lines: 22, Source: interface.py

Example 9: get_child_urls

# Required module: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def get_child_urls(main_page, max_child=20):
    """retrieve urls from giving html page.
    args:
        main_page(str): html file.
        max_child(int): max number of return urls.
    return:
        list of url string.
    """
    from bs4 import BeautifulSoup, SoupStrainer
    children = []
    for link in BeautifulSoup(main_page,
                              "html.parser",
                              parse_only=SoupStrainer('a')):
        if link.has_attr('href') and link['href'].startswith("http"):
            children.append(link['href'])
    if len(children) > max_child:
        children = children[:max_child]
    return children 
Author: yuhui-lin, Project: web_page_classification, Lines: 20, Source: collect.py
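A hypothetical call to the helper above, with a made-up page string (only links whose href starts with "http" are kept):

page = '<a href="http://example.com/x">x</a> <a href="/relative">skipped</a>'
print(get_child_urls(page, max_child=5))  # ['http://example.com/x']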

Example 10: getLinks

# Required module: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def getLinks(text, url=""):
    if url and url[-1] == "/":
        url = url[:-1]

    links = []
    if text:
        for link in BeautifulSoup(text, "html.parser", parse_only=SoupStrainer("a", href=True)):
            if link.has_attr('href'):
                if link['href']:
                    href = link['href'].strip()
                    if not href.startswith(("http://", "https://", "mailto:", "tel:")):
                        if not href.startswith('/'):
                            href = "/" + href
                        href = url + href
                    links.append(href)
    return links

#-----------------------------------------------------------------------------
# MAIN
#----------------------------------------------------------------------------- 
Author: tatanus, Project: Python4Pentesters, Lines: 22, Source: webscraper_beautifulsoup.py

Example 11: get_contents

# Required module: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def get_contents(game_html):
    """
    Uses Beautiful Soup to parse the html document.
    Some parsers work for some pages but don't work for others....I'm not sure why so I just try them all here in order
    
    :param game_html: html doc
    
    :return: "soupified" html 
    """
    parsers = ["lxml", "html.parser", "html5lib"]
    strainer = SoupStrainer('td', attrs={'class': re.compile(r'bborder')})

    for parser in parsers:
        # parse_only only works with lxml for some reason
        if parser == "lxml":
            soup = BeautifulSoup(game_html, parser, parse_only=strainer)
        else:
            soup = BeautifulSoup(game_html, parser)

        tds = soup.find_all("td", {"class": re.compile('.*bborder.*')})

        if len(tds) > 0:
            break

    return tds 
Author: HarryShomer, Project: Hockey-Scraper, Lines: 27, Source: html_pbp.py

Example 12: __pre_process

# Required module: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def __pre_process(self, page):
        # As BeautifulSoup will cause memory I/O errors when the page is too large
        if page.find('<dl') > 0:
            data = page.split('<dl')
            tag_dd = SoupStrainer('dd')
            for idx in xrange(1, len(data)):
                count = data[idx].count('<dd')
                if count > 5:
                    parts = data[idx].split('</dl>')
                    dds = parts[0].split('</dd>')
                    data[idx] = ''.join([dds[0], '</dd> <dx>%d</dx>'%idx,
                        dds[-1], '</dl>', ''.join(parts[1:])])
                    self.__dd_ext[str(idx)] =[]
                    for item in dds[1:-1]:
                        dd = BeautifulSoup(item, parse_only=tag_dd).dd
                        assert dd
                        self.__dd_ext[str(idx)].append(dd)
            return '<dl'.join(data)
        else:
            return page 
Author: OZv, Project: VOC, Lines: 22, Source: voc_fetcher1.0.py

Example 13: __initdef

# Required module: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def __initdef(self, word, data):
        data = self.__pre_process(data)
        wpg = SoupStrainer('div', class_=re.compile('[^<>]*?wordPage[^<>]*?'))
        soup = BeautifulSoup(data, parse_only=wpg)
        div = soup.find('div', class_=re.compile('[^<>]*?wordPage[^<>]*?'))
        assert div
        self.__getwordtitle(div.h1)
        if word != self.__title:
            self.__title = None
            return False
        div = soup.find('div', {'class': 'section blurb'})
        if div:
            self.__hasblurb = True
            self.__getblurb(div)
        tags = soup.find_all(re.compile(r'div|h2'), class_='sectionHeader')
        tag = find_fulldefine(tags, re.compile(r'DEFINITIONS OF', re.I))
        if tag:
            self.__getfulldef(tag.parent)
        else:
            print("WARNING: %s HAS NO FULLDEFINITION" % self.__title)
            assert tag # to raise error and break
        div = soup.find('div', {'class': 'section family'})
        if div:
            self.__getwordfamily(div)
        return True 
Author: OZv, Project: VOC, Lines: 27, Source: voc_fetcher1.0.py

Example 14: get_witnesses_for_bill

# Required module: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def get_witnesses_for_bill(bill_id, session):
    # ex: "HB 864" -> "https://capitol.texas.gov/tlodocs/86R/witlistbill/html/HB00864H.htm"
    parsed_bill_id = re.search(r"(\w+)\s+(\d+)", bill_id)
    bill_type = parsed_bill_id.group(1)
    bill_number = parsed_bill_id.group(2).zfill(5)
    url_prefix = f"https://capitol.texas.gov/tlodocs/{session}R/witlistbill/html/{bill_type}{bill_number}"
    house_url = f"{url_prefix}H.htm"
    senate_url = f"{url_prefix}S.htm"

    res = requests.get(house_url)

    # ##### Basic Test
    # # parsing all <p/> blocks up front may not be efficient
    # filter = SoupStrainer('p') # only <p/> tags contain text that we care about
    # text_blocks = BeautifulSoup(res.content, "html.parser", parse_only=filter)
    # selecting = None;
    # for block in text_blocks:
    #     text = block.get_text(strip=True)
    #     print(f"[{text}]")

    return parse_witness_list_html(res.content) 
Author: open-austin, Project: influence-texas, Lines: 23, Source: get_witnesses_for_bill.py

Example 15: find_form_request

# Required module: import bs4 [as alias]
# Or: from bs4 import SoupStrainer [as alias]
def find_form_request(html: str):
    soup = bs4.BeautifulSoup(html, "html.parser", parse_only=bs4.SoupStrainer("form"))

    form = soup.form
    if not form:
        raise _exception.ParseError("Could not find form to submit", data=html)

    url = form.get("action")
    if not url:
        raise _exception.ParseError("Could not find url to submit to", data=form)

    # From what I've seen, it'll always do this!
    if url.startswith("/"):
        url = "https://www.facebook.com" + url

    # It's okay to set missing values to something crap, the values are localized, and
    # hence are not available in the raw HTML
    data = {
        x["name"]: x.get("value", "[missing]")
        for x in form.find_all(["input", "button"])
    }
    return url, data 
Author: carpedm20, Project: fbchat, Lines: 24, Source: _session.py
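A hedged usage sketch for the function above; the HTML fragment is invented, and the error paths (which raise the project's ParseError) are not exercised.

html = '<form action="/login"><input name="user" value="alice"/></form>'
url, data = find_form_request(html)
print(url)   # https://www.facebook.com/login
print(data)  # {'user': 'alice'}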


Note: The bs4.SoupStrainer examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by their authors; copyright remains with the original authors, and distribution or use should follow each project's license. Do not reproduce without permission.