

Python bs4.BeautifulSoup Method Code Examples

This article collects typical code examples of the bs4.BeautifulSoup method in Python. If you are wondering how exactly to use bs4.BeautifulSoup, how it works in practice, or what real-world usage looks like, the curated code examples below may help. You can also explore further usage examples from the bs4 package that this method belongs to.


The following presents 15 code examples of the bs4.BeautifulSoup method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
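Before the examples, here is a minimal, self-contained sketch of the pattern nearly all of them share: obtain some HTML (usually via requests), hand it to bs4.BeautifulSoup together with a parser name, then query the resulting tree with find / find_all / CSS selectors. The HTML string and tag names below are illustrative only and are not taken from any of the projects listed.

from bs4 import BeautifulSoup

# Illustrative HTML; the examples below fetch real pages with requests instead.
html = '<html><body><h1 class="title">Hello</h1><a href="/a">link</a></body></html>'

# 'html.parser' is the standard-library parser; several examples below pass
# 'lxml' or 'html5lib', which are third-party parsers installed separately.
soup = BeautifulSoup(html, 'html.parser')

print(soup.h1.string)            # -> Hello
print(soup.find('a')['href'])    # -> /a
for a in soup.find_all('a'):     # iterate over every <a> tag in the document
    print(a.get('href'))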

Example 1: get_pixiv_user_name

# Required import: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def get_pixiv_user_name():
    global login_status
    tag = 'Get_Pixiv_User_Name'
    # Check if cookies works.
    pixiv_www_url = 'https://www.pixiv.net/'
    check_soup = BeautifulSoup(get_text_from_url(pixiv_www_url), 'html.parser')
    try:
        pixiv_user_nick_name = check_soup.find(name='a', attrs={'class': 'user-name js-click-trackable-later'}).string
        print_with_tag(tag, ['Login as', pixiv_user_nick_name])
    except Exception as e:
        print_with_tag(tag,['Error:',e])
        login_status = False
        print_with_tag(tag,'Failed to check the user name.')
        print_with_tag(tag,'Might be the cookies is out of the date?')
    else:
        login_status = True
        print_with_tag(tag,'Login success!')

Author: SuzukiHonoka, Project: Starx_Pixiv_Collector, Lines: 21, Source file: start.py

Example 2: check_query

# Required import: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def check_query(count, url, query):
	if url[-1] == '/':
		url = url[:-1]

	url = f'{url}/search?q={query}&start={count}&num=100'
	headers = {
		'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0',
	}
	results = requests.get(url, headers=headers)

	soup = BeautifulSoup(results.text, 'lxml')

	with add_lock:
		idx = 1
		for g in soup.find_all('div', class_='r'):
			link = g.find_all('a')[0]['href']
			title = g.find_all('h3')[0]
			item = f'{title.text} ({link})'
			search_results.add(item)
			idx+=1 
Author: ustayready, Project: fireprox, Lines: 22, Source file: google.py

Example 3: check_query

# Required import: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def check_query(count, url, query):
	if url[-1] == '/':
		url = url[:-1]

	url = f'{url}/search?q={query}&first={count}'
	headers = {
		'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0',
	}
	results = requests.get(url, headers=headers)

	soup = BeautifulSoup(results.text, 'lxml')

	with add_lock:
		idx = 1
		for g in soup.find_all('li', class_='b_algo'):
			result = g.find('h2')
			link = result.find('a')['href']
			title = result.text
			item = f'{title} ({link})'
			search_results.add(item)
			idx+=1 
Author: ustayready, Project: fireprox, Lines: 23, Source file: bing.py
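The two check_query variants in Examples 2 and 3 (the Google and Bing scrapers from fireprox) both update module-level shared state not shown in the excerpts: a lock named add_lock and a set named search_results. Below is a minimal sketch of how such a function might be driven; the globals, the thread fan-out, and the 100-results-per-page offset are assumptions for illustration, not code from the fireprox repository.

import threading

add_lock = threading.Lock()   # assumed module-level lock used inside check_query
search_results = set()        # assumed module-level set that check_query adds items to

def run_queries(url, query, pages=3):
    # Spawn one worker per results page; check_query(count, url, query) is the
    # function shown above, with count used as the search offset.
    threads = [
        threading.Thread(target=check_query, args=(page * 100, url, query))
        for page in range(pages)
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return sorted(search_results)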

Example 4: ExtractionAlgo

# Required import: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def ExtractionAlgo(self,text):
        soup=BeautifulSoup(text,'html.parser')
        title=soup.title.string
        Result=[]
        #print soup
        maincontent=soup.find_all("div", class_="content__article-body from-content-api js-article__body")
        #print maincontent
        for content in maincontent:
            scripttags=content.find_all(["script","br","figure","image"])
            for scripttag in scripttags:
                scripttag.extract()
            #print content.text
            for foundcontent in content.find_all("p"):
                Result.append(foundcontent.text)
        Result=''.join(Result)
        return (title,Result) 
Author: Griffintaur, Project: News-At-Command-Line, Lines: 18, Source file: Extractor.py

Example 5: filterConnections

# Required import: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def filterConnections(proxiesList):
	workingProxies = []
	count = 0
	for proxy in proxiesList:
		count += 1
		cprint("Loading proxy # {}".format(count), "green")
		proxies = {
		  'http': proxy,
		  'https': proxy
		}
		try:
			r = requests.get("http://www.supremenewyork.com/shop/all", proxies=proxies, timeout=1)
			data = r.text
			soup = BeautifulSoup(data,"html.parser")
			headerCheck = str(soup.find("span",{"id":"time-zone-name"}).text)
			if headerCheck == "NYC":
				cprint(headerCheck, "blue")
				workingProxies.append(proxy)
				cprint("Added {}!".format(proxy),"green")
			else:
				cprint("Banned!", "red")
				raise
		except:
			cprint("Bad Proxy: {}".format(proxy), "red")
	return workingProxies 
Author: supthunder, Project: premeStock, Lines: 27, Source file: proxyLoader.py

Example 6: site2

# Required import: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def site2(proxiesList):
	url = "https://www.us-proxy.org/"
	user = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36"}
	r = requests.get(url,headers=user)

	data = r.text
	soup = BeautifulSoup(data,"html.parser")

	table = soup.find("tbody")
	for ips in table.find_all("tr"):
		count = 0
		proxy = ""
		for ip in ips.find_all("td"):
			if count == 0:
				proxy = str(ip.text)
				proxy += ":"
			if count == 1:
				proxy += str(ip.text)
				proxiesList.append(proxy)
				break
			count += 1
	cprint("Succesfully added {} proxies!".format(len(proxiesList)), 'green') 
Author: supthunder, Project: premeStock, Lines: 24, Source file: proxyLoader.py

Example 7: site4

# Required import: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def site4(proxiesList):
	url = "https://www.proxynova.com/proxy-server-list/country-us/"
	user = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36"}
	
	r = requests.get(url,headers=user)
	data = r.text
	soup = BeautifulSoup(data,"html.parser")

	proxy = ""
	# for ips in soup.find_all("tr",{"class":"spy1xx"}):
	for ips in soup.find_all("tr"):
		count = 0
		for ip in ips.find_all("td",{"align":"left"}):
			if count == 0:
				proxy = str(ip.get_text(strip=True).replace("document.write('","").replace("'","").replace("+","").replace(");","").replace(" ",""))
			if count == 1:
				proxy += ":"+str(ip.text).strip()
				proxiesList.append(proxy)
				break
			count += 1 
Author: supthunder, Project: premeStock, Lines: 22, Source file: proxyLoader.py

Example 8: yt

# Required import: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def yt(query):
    with requests.session() as s:
         isi = []
         if query == "":
             query = "S1B tanysyz"   
         s.headers['user-agent'] = 'Mozilla/5.0'
         url    = 'http://www.youtube.com/results'
         params = {'search_query': query}
         r    = s.get(url, params=params)
         soup = BeautifulSoup(r.content, 'html5lib')
         for a in soup.select('.yt-lockup-title > a[title]'):
            if '&list=' not in a['href']:
                if 'watch?v' in a['href']:
                    b = a['href'].replace('watch?v=', '')
                    isi += ['youtu.be' + b]
         return isi 
Author: CyberTKR, Project: CyberTK-Self, Lines: 18, Source file: Self.py

Example 9: get_ri_status

# Required import: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def get_ri_status(suburl=None):
    if not suburl:
        suburl = "m=cb&a=cb_all"  # 可转债

    url = "http://www.richvest.com/index.php?"
    url += suburl
    r = rget(url, headers={"user-agent": "Mozilla/5.0"})
    b = BeautifulSoup(r.text, "lxml")
    cl = []
    for c in b.findAll("th"):
        cl.append(c.text)
    nocl = len(cl)
    rl = []
    for i, c in enumerate(b.findAll("td")):
        if i % nocl == 0:
            r = []
        r.append(c.text)
        if i % nocl == nocl - 1:
            rl.append(r)
    return pd.DataFrame(rl, columns=cl) 
Author: refraction-ray, Project: xalpha, Lines: 22, Source file: misc.py

Example 10: get_tdx_holidays

# Required import: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def get_tdx_holidays(holidays=None, format="%Y-%m-%d"):
    r = rget("https://www.tdx.com.cn/url/holiday/")
    r.encoding = "gbk"
    b = BeautifulSoup(r.text, "lxml")
    l = b.find("textarea").string.split("\n")
    if not holidays:
        holidays = {}
    for item in l:
        if item.strip():
            c = item.split("|")
            if c[2] in region_trans:
                rg = region_trans[c[2]]
                tobj = dt.datetime.strptime(c[0], "%Y%m%d")
                tstr = tobj.strftime(format)
                if rg not in holidays:
                    holidays[rg] = [tstr]
                else:
                    holidays[rg].append(tstr)
    return holidays 
Author: refraction-ray, Project: xalpha, Lines: 21, Source file: misc.py

Example 11: get_portfolio_fromttjj

# Required import: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def get_portfolio_fromttjj(code, start=None, end=None):
    startobj = dt.datetime.strptime(start, "%Y%m%d")
    endobj = dt.datetime.strptime(end, "%Y%m%d")
    if (endobj - startobj).days < 90:
        return None  # note start is always 1.1 4.1 7.1 10.1 in incremental updates
    if code.startswith("F"):
        code = code[1:]
    r = rget("http://fundf10.eastmoney.com/zcpz_{code}.html".format(code=code))
    s = BeautifulSoup(r.text, "lxml")
    table = s.find("table", class_="tzxq")
    df = pd.read_html(str(table))[0]
    df["date"] = pd.to_datetime(df["报告期"])
    df["stock_ratio"] = df["股票占净比"].replace("---", "0%").apply(lambda s: _float(s[:-1]))
    df["bond_ratio"] = df["债券占净比"].replace("---", "0%").apply(lambda s: _float(s[:-1]))
    df["cash_ratio"] = df["现金占净比"].replace("---", "0%").apply(lambda s: _float(s[:-1]))
    #     df["dr_ratio"] = df["存托凭证占净比"].replace("---", "0%").apply(lambda s: xa.cons._float(s[:-1]))
    df["assets"] = df["净资产(亿元)"]
    df = df[::-1]
    return df[["date", "stock_ratio", "bond_ratio", "cash_ratio", "assets"]]


# this is the most elegant approach to dispatch get_daily; the definition can be this simple
# you don't actually need to worry about start/end handling, everything is taken care of by ``cachedio`` 
Author: refraction-ray, Project: xalpha, Lines: 25, Source file: universal.py

Example 12: get_rt_from_ft

# Required import: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def get_rt_from_ft(code, _type="indices"):
    url = make_ft_url(code, _type=_type)
    r = rget(url)
    b = BeautifulSoup(r.text, "lxml")
    d = {}
    d["name"] = b.find("h1").string
    d["current"] = _float(b.find("span", class_="mod-ui-data-list__value").string)
    d["percent"] = _float(
        b.select("span[class^='mod-format--']")[0].text.split("/")[-1].strip()[:-1]
    )
    d["current_ext"] = None
    d["market"] = None
    d["currency"] = b.find("span", class_="mod-ui-data-list__label").string.split("(")[
        1
    ][:-1]
    d["time"] = b.find("div", class_="mod-disclaimer").string
    return d 
Author: refraction-ray, Project: xalpha, Lines: 19, Source file: universal.py

Example 13: get_newest_netvalue

# Required import: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def get_newest_netvalue(code):
    """
    Fetch the fund's latest published net value and the corresponding date, as a guard against the Tiantian Fund (天天基金) aggregate API lagging behind. Deprecated; use get_rt("F501018") instead.

    :param code: six digits string for fund.
    :return: netvalue, %Y-%m-%d
    """
    code = code[1:]
    r = rget("http://fund.eastmoney.com/{code}.html".format(code=code))
    s = BeautifulSoup(r.text, "lxml")
    return (
        float(
            s.findAll("dd", class_="dataNums")[1]
            .find("span", class_="ui-font-large")
            .string
        ),
        str(s.findAll("dt")[1]).split("(")[1].split(")")[0][7:],
    ) 
Author: refraction-ray, Project: xalpha, Lines: 20, Source file: universal.py

Example 14: test_chained_exception_handler

# Required import: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def test_chained_exception_handler():
    request, response = exception_handler_app.test_client.get(
        "/6/0", debug=True
    )
    assert response.status == 500

    soup = BeautifulSoup(response.body, "html.parser")
    html = str(soup)

    assert "response = handler(request, *args, **kwargs)" in html
    assert "handler_6" in html
    assert "foo = 1 / arg" in html
    assert "ValueError" in html
    assert "The above exception was the direct cause" in html

    summary_text = " ".join(soup.select(".summary")[0].text.split())
    assert (
        "ZeroDivisionError: division by zero while handling path /6/0"
    ) == summary_text 
Author: huge-success, Project: sanic, Lines: 21, Source file: test_exceptions_handler.py

Example 15: gist_fetch

# Required import: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def gist_fetch(query, page_idx, total_items=1000):
    gist_url = "https://gist.github.com/search?utf8=%E2%9C%93&q={}&p={}"
    query = urllib.parse.quote(query)
    gists = []

    try:
        resp = requests.get(gist_url.format(query, page_idx))
        soup = bs4.BeautifulSoup(resp.text, 'html.parser')
        total_items = min(total_items, int(
            [x.text.split()[0] for x in soup.find_all('h3')
                if "gist results" in x.text][0].replace(',', '')))
        gists = [x.get("href") for x in soup.findAll(
                            "a", class_="link-overlay")]
    except IndexError:
        return {"data": None, "total_items": 0}

    return {"data": gists, "total_items": total_items} 
Author: BishopFox, Project: GitGot, Lines: 19, Source file: gitgot.py


Note: The bs4.BeautifulSoup method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by various developers; copyright of the source code remains with the original authors. For distribution and use, please refer to the license of the corresponding project; do not republish without permission.