This article collects typical usage examples of the bs4.BeautifulSoup method in Python. If you are wondering how exactly bs4.BeautifulSoup is used, what it is for, or what real code that uses it looks like, the curated examples below may help. You can also explore further usage examples from the bs4 module that the method belongs to.
A total of 15 code examples of bs4.BeautifulSoup are shown below, sorted by popularity by default.
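Before the numbered examples, here is a minimal, self-contained sketch of the basic workflow they all share: parse markup with a chosen parser, then locate elements with find/find_all/select. The sample HTML and class name here are illustrative assumptions, not taken from any example below.

from bs4 import BeautifulSoup

# Hypothetical markup used only for illustration.
html = '<div class="entry"><h3>Title</h3><a href="https://example.com">link</a></div>'
soup = BeautifulSoup(html, 'html.parser')  # the examples below also use the 'lxml' and 'html5lib' parsers
entry = soup.find('div', class_='entry')   # first matching tag, or None if nothing matches
print(entry.h3.text)                       # -> Title
print(entry.find('a')['href'])             # -> https://example.com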
Example 1: get_pixiv_user_name
# Required imports: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def get_pixiv_user_name():
    global login_status
    tag = 'Get_Pixiv_User_Name'
    # Check whether the cookies still work.
    pixiv_www_url = 'https://www.pixiv.net/'
    check_soup = BeautifulSoup(get_text_from_url(pixiv_www_url), 'html.parser')
    try:
        pixiv_user_nick_name = check_soup.find(name='a', attrs={'class': 'user-name js-click-trackable-later'}).string
        print_with_tag(tag, ['Login as', pixiv_user_nick_name])
    except Exception as e:
        print_with_tag(tag, ['Error:', e])
        login_status = False
        print_with_tag(tag, 'Failed to check the user name.')
        print_with_tag(tag, 'Maybe the cookies are out of date?')
    else:
        login_status = True
        print_with_tag(tag, 'Login success!')
Example 2: check_query
# Required imports: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def check_query(count, url, query):
    # Google search scraper; assumes requests plus module-level search_results (set) and add_lock (threading.Lock).
    if url[-1] == '/':
        url = url[:-1]
    url = f'{url}/search?q={query}&start={count}&num=100'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0',
    }
    results = requests.get(url, headers=headers)
    soup = BeautifulSoup(results.text, 'lxml')
    with add_lock:
        idx = 1
        for g in soup.find_all('div', class_='r'):
            link = g.find_all('a')[0]['href']
            title = g.find_all('h3')[0]
            item = f'{title.text} ({link})'
            search_results.add(item)
            idx += 1
Example 3: check_query
# Required imports: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def check_query(count, url, query):
    # Bing variant of the same scraper; relies on the same search_results / add_lock globals.
    if url[-1] == '/':
        url = url[:-1]
    url = f'{url}/search?q={query}&first={count}'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0',
    }
    results = requests.get(url, headers=headers)
    soup = BeautifulSoup(results.text, 'lxml')
    with add_lock:
        idx = 1
        for g in soup.find_all('li', class_='b_algo'):
            result = g.find('h2')
            link = result.find('a')['href']
            title = result.text
            item = f'{title} ({link})'
            search_results.add(item)
            idx += 1
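Examples 2 and 3 write into module-level state (search_results and add_lock) that the snippets themselves never define. A rough sketch of what that scaffolding and a threaded caller could look like, under the assumption that several worker threads call check_query concurrently and the shared set is guarded by a lock; the target URL, query, and page offsets are purely illustrative:

import threading

import requests  # used inside check_query

add_lock = threading.Lock()   # serialises writes to the shared result set
search_results = set()        # deduplicated "title (link)" strings

# Hypothetical driver: one worker per result-page offset.
threads = [
    threading.Thread(target=check_query, args=(offset, 'https://www.google.com', 'beautifulsoup'))
    for offset in (0, 100, 200)
]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(len(search_results), 'unique results collected')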
Example 4: ExtractionAlgo
# Required imports: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def ExtractionAlgo(self, text):
    soup = BeautifulSoup(text, 'html.parser')
    title = soup.title.string
    Result = []
    maincontent = soup.find_all("div", class_="content__article-body from-content-api js-article__body")
    for content in maincontent:
        # Drop script and markup tags before collecting the paragraph text.
        scripttags = content.find_all(["script", "br", "figure", "image"])
        for scripttag in scripttags:
            scripttag.extract()
        for foundcontent in content.find_all("p"):
            Result.append(foundcontent.text)
    Result = ''.join(Result)
    return (title, Result)
Example 5: filterConnections
# Required imports: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def filterConnections(proxiesList):
    workingProxies = []
    count = 0
    for proxy in proxiesList:
        count += 1
        cprint("Loading proxy # {}".format(count), "green")
        proxies = {
            'http': proxy,
            'https': proxy
        }
        try:
            r = requests.get("http://www.supremenewyork.com/shop/all", proxies=proxies, timeout=1)
            data = r.text
            soup = BeautifulSoup(data, "html.parser")
            headerCheck = str(soup.find("span", {"id": "time-zone-name"}).text)
            if headerCheck == "NYC":
                cprint(headerCheck, "blue")
                workingProxies.append(proxy)
                cprint("Added {}!".format(proxy), "green")
            else:
                cprint("Banned!", "red")
                raise Exception("banned proxy")  # fall through to the except branch below
        except Exception:
            cprint("Bad Proxy: {}".format(proxy), "red")
    return workingProxies
Example 6: site2
# Required imports: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def site2(proxiesList):
    url = "https://www.us-proxy.org/"
    user = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36"}
    r = requests.get(url, headers=user)
    data = r.text
    soup = BeautifulSoup(data, "html.parser")
    table = soup.find("tbody")
    for ips in table.find_all("tr"):
        count = 0
        proxy = ""
        for ip in ips.find_all("td"):
            if count == 0:
                proxy = str(ip.text)
                proxy += ":"
            if count == 1:
                proxy += str(ip.text)
                proxiesList.append(proxy)
                break
            count += 1
    cprint("Successfully added {} proxies!".format(len(proxiesList)), 'green')
Example 7: site4
# Required imports: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def site4(proxiesList):
    url = "https://www.proxynova.com/proxy-server-list/country-us/"
    user = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36"}
    r = requests.get(url, headers=user)
    data = r.text
    soup = BeautifulSoup(data, "html.parser")
    proxy = ""
    # for ips in soup.find_all("tr", {"class": "spy1xx"}):
    for ips in soup.find_all("tr"):
        count = 0
        for ip in ips.find_all("td", {"align": "left"}):
            if count == 0:
                # The IP is wrapped in a document.write(...) snippet; strip the JavaScript around it.
                proxy = str(ip.get_text(strip=True).replace("document.write('", "").replace("'", "").replace("+", "").replace(");", "").replace(" ", ""))
            if count == 1:
                proxy += ":" + str(ip.text).strip()
                proxiesList.append(proxy)
                break
            count += 1
Example 8: yt
# Required imports: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def yt(query):
    with requests.session() as s:
        isi = []
        if query == "":
            query = "S1B tanysyz"
        s.headers['user-agent'] = 'Mozilla/5.0'
        url = 'http://www.youtube.com/results'
        params = {'search_query': query}
        r = s.get(url, params=params)
        soup = BeautifulSoup(r.content, 'html5lib')
        for a in soup.select('.yt-lockup-title > a[title]'):
            if '&list=' not in a['href'] and 'watch?v' in a['href']:
                b = a['href'].replace('watch?v=', '')
                isi += ['youtu.be' + b]
        return isi
Example 9: get_ri_status
# Required imports: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def get_ri_status(suburl=None):
    if not suburl:
        suburl = "m=cb&a=cb_all"  # convertible bonds
    url = "http://www.richvest.com/index.php?"
    url += suburl
    r = rget(url, headers={"user-agent": "Mozilla/5.0"})
    b = BeautifulSoup(r.text, "lxml")
    cl = []
    for c in b.findAll("th"):
        cl.append(c.text)
    nocl = len(cl)
    rl = []
    # Rebuild rows from the flat list of <td> cells, nocl cells per row.
    for i, c in enumerate(b.findAll("td")):
        if i % nocl == 0:
            row = []
        row.append(c.text)
        if i % nocl == nocl - 1:
            rl.append(row)
    return pd.DataFrame(rl, columns=cl)
Example 10: get_tdx_holidays
# Required imports: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def get_tdx_holidays(holidays=None, format="%Y-%m-%d"):
    r = rget("https://www.tdx.com.cn/url/holiday/")
    r.encoding = "gbk"
    b = BeautifulSoup(r.text, "lxml")
    l = b.find("textarea").string.split("\n")
    if not holidays:
        holidays = {}
    for item in l:
        if item.strip():
            c = item.split("|")
            if c[2] in region_trans:
                rg = region_trans[c[2]]
                tobj = dt.datetime.strptime(c[0], "%Y%m%d")
                tstr = tobj.strftime(format)
                if rg not in holidays:
                    holidays[rg] = [tstr]
                else:
                    holidays[rg].append(tstr)
    return holidays
Example 11: get_portfolio_fromttjj
# Required imports: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def get_portfolio_fromttjj(code, start=None, end=None):
    startobj = dt.datetime.strptime(start, "%Y%m%d")
    endobj = dt.datetime.strptime(end, "%Y%m%d")
    if (endobj - startobj).days < 90:
        return None  # note: start is always 1.1, 4.1, 7.1 or 10.1 in incremental updates
    if code.startswith("F"):
        code = code[1:]
    r = rget("http://fundf10.eastmoney.com/zcpz_{code}.html".format(code=code))
    s = BeautifulSoup(r.text, "lxml")
    table = s.find("table", class_="tzxq")
    df = pd.read_html(str(table))[0]
    df["date"] = pd.to_datetime(df["报告期"])
    df["stock_ratio"] = df["股票占净比"].replace("---", "0%").apply(lambda s: _float(s[:-1]))
    df["bond_ratio"] = df["债券占净比"].replace("---", "0%").apply(lambda s: _float(s[:-1]))
    df["cash_ratio"] = df["现金占净比"].replace("---", "0%").apply(lambda s: _float(s[:-1]))
    # df["dr_ratio"] = df["存托凭证占净比"].replace("---", "0%").apply(lambda s: xa.cons._float(s[:-1]))
    df["assets"] = df["净资产(亿元)"]
    df = df[::-1]
    return df[["date", "stock_ratio", "bond_ratio", "cash_ratio", "assets"]]

# This is the most elegant way to dispatch get_daily; the definition can be this simple.
# You actually don't need to bother with start/end and the like: everything is taken care of by ``cachedio``.
Example 12: get_rt_from_ft
# Required imports: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def get_rt_from_ft(code, _type="indices"):
    url = make_ft_url(code, _type=_type)
    r = rget(url)
    b = BeautifulSoup(r.text, "lxml")
    d = {}
    d["name"] = b.find("h1").string
    d["current"] = _float(b.find("span", class_="mod-ui-data-list__value").string)
    d["percent"] = _float(
        b.select("span[class^='mod-format--']")[0].text.split("/")[-1].strip()[:-1]
    )
    d["current_ext"] = None
    d["market"] = None
    d["currency"] = b.find("span", class_="mod-ui-data-list__label").string.split("(")[1][:-1]
    d["time"] = b.find("div", class_="mod-disclaimer").string
    return d
Example 13: get_newest_netvalue
# Required imports: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def get_newest_netvalue(code):
    """
    In case the aggregate Tiantian Fund API is slow to publish the latest net value, fetch the fund's
    most recently published net value and its date. Deprecated; use get_rt("F501018") instead.

    :param code: six-digit string for the fund.
    :return: (netvalue, date string in %Y-%m-%d format)
    """
    code = code[1:]
    r = rget("http://fund.eastmoney.com/{code}.html".format(code=code))
    s = BeautifulSoup(r.text, "lxml")
    return (
        float(
            s.findAll("dd", class_="dataNums")[1]
            .find("span", class_="ui-font-large")
            .string
        ),
        str(s.findAll("dt")[1]).split("(")[1].split(")")[0][7:],
    )
Example 14: test_chained_exception_handler
# Required imports: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def test_chained_exception_handler():
    request, response = exception_handler_app.test_client.get(
        "/6/0", debug=True
    )
    assert response.status == 500

    soup = BeautifulSoup(response.body, "html.parser")
    html = str(soup)

    assert "response = handler(request, *args, **kwargs)" in html
    assert "handler_6" in html
    assert "foo = 1 / arg" in html
    assert "ValueError" in html
    assert "The above exception was the direct cause" in html

    summary_text = " ".join(soup.select(".summary")[0].text.split())
    assert (
        "ZeroDivisionError: division by zero while handling path /6/0"
    ) == summary_text
Example 15: gist_fetch
# Required imports: import bs4 [as alias]
# Or: from bs4 import BeautifulSoup [as alias]
def gist_fetch(query, page_idx, total_items=1000):
    gist_url = "https://gist.github.com/search?utf8=%E2%9C%93&q={}&p={}"
    query = urllib.parse.quote(query)
    gists = []
    try:
        resp = requests.get(gist_url.format(query, page_idx))
        soup = bs4.BeautifulSoup(resp.text, 'html.parser')
        # Read the total result count from the "<n> gist results" heading.
        total_items = min(total_items, int(
            [x.text.split()[0] for x in soup.find_all('h3')
             if "gist results" in x.text][0].replace(',', '')))
        gists = [x.get("href") for x in soup.findAll(
            "a", class_="link-overlay")]
    except IndexError:
        return {"data": None, "total_items": 0}
    return {"data": gists, "total_items": total_items}
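A possible way to call gist_fetch; the query string and page index are purely illustrative, and the snippet assumes that requests, bs4, and urllib have already been imported as the hint comments above suggest:

page = gist_fetch("beautifulsoup", page_idx=1)
if page["data"]:
    print(page["total_items"], "matching gists; first links:", page["data"][:3])
else:
    print("no results (or the search page markup has changed)")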