This article collects typical usage examples of Python's pandas.read_html method. If you have been wondering what pandas.read_html does, how to call it, or how it is used in practice, the curated examples below should help. You can also explore further usage examples from the pandas module.

Below are 15 code examples of pandas.read_html, ordered by popularity by default.
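Before the project examples, a minimal sketch of the basic call pattern. It assumes pandas plus an HTML parser backend (lxml, or bs4 with html5lib) is installed; the markup is made up for illustration. Note that read_html always returns a list of DataFrames, one per <table> found in the input:

import pandas as pd
from io import StringIO

# A tiny HTML document containing a single table. Wrapping the markup in
# StringIO keeps this compatible with newer pandas, which deprecates passing
# a raw HTML string directly.
html = """
<table>
  <tr><th>code</th><th>price</th></tr>
  <tr><td>600000</td><td>10.50</td></tr>
</table>
"""

dfs = pd.read_html(StringIO(html))  # -> list of DataFrames, one per <table>
df = dfs[0]                         # the first (and only) table
print(df)                           # columns: code, price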
Example 1: get_portfolio_fromttjj

# Required import: import pandas [as alias]
# or: from pandas import read_html [as alias]
def get_portfolio_fromttjj(code, start=None, end=None):
    startobj = dt.datetime.strptime(start, "%Y%m%d")
    endobj = dt.datetime.strptime(end, "%Y%m%d")
    if (endobj - startobj).days < 90:
        return None  # note: start is always Jan 1, Apr 1, Jul 1 or Oct 1 in incremental updates
    if code.startswith("F"):
        code = code[1:]
    r = rget("http://fundf10.eastmoney.com/zcpz_{code}.html".format(code=code))
    s = BeautifulSoup(r.text, "lxml")
    table = s.find("table", class_="tzxq")
    df = pd.read_html(str(table))[0]
    df["date"] = pd.to_datetime(df["报告期"])
    df["stock_ratio"] = df["股票占净比"].replace("---", "0%").apply(lambda s: _float(s[:-1]))
    df["bond_ratio"] = df["债券占净比"].replace("---", "0%").apply(lambda s: _float(s[:-1]))
    df["cash_ratio"] = df["现金占净比"].replace("---", "0%").apply(lambda s: _float(s[:-1]))
    # df["dr_ratio"] = df["存托凭证占净比"].replace("---", "0%").apply(lambda s: xa.cons._float(s[:-1]))
    df["assets"] = df["净资产(亿元)"]
    df = df[::-1]
    return df[["date", "stock_ratio", "bond_ratio", "cash_ratio", "assets"]]

# This is the most elegant way to dispatch get_daily; the definition can stay this simple.
# You don't need to worry about start/end handling at all: everything is taken care of by ``cachedio``.
Example 2: addpositionstodict

# Required import: import pandas [as alias]
# or: from pandas import read_html [as alias]
def addpositionstodict(gendict):
    print("Downloading position information from web...")
    for accidwithloc in tqdm(gendict):
        if 'Start' in gendict[accidwithloc]:
            continue
        accid = '_'.join(accidwithloc.split('_')[:-1])
        url = ('http://crispr.i2bc.paris-saclay.fr/crispr/crispr_db.php?'
               'checked%5B%5D={}'.format(accid))
        page = requests.get(url)
        htmltable = html.fromstring(page.content).xpath(
            "//table[normalize-space(@class)='primary_table']")[1]
        strtable = etree.tostring(htmltable)
        # convert to a pandas DataFrame, then to a NumPy array, and drop the
        # title rows (.to_numpy() replaces .as_matrix(), removed in pandas 1.0)
        arrtable = pandas.read_html(strtable)[0].to_numpy()[2:]
        for row in arrtable:
            if row[0] in gendict:
                gendict[row[0]]['Start'] = row[2]
                gendict[row[0]]['Stop'] = row[3]
            else:
                if row[1] != 'questionable':
                    print("Can't find %s in local files" % row[0])
    return gendict
Example 3: __query_new_stocks

# Required import: import pandas [as alias]
# or: from pandas import read_html [as alias]
def __query_new_stocks(self):
    DATA_URL = 'http://vip.stock.finance.sina.com.cn/corp/view/vRPD_NewStockIssue.php?page=1&cngem=0&orderBy=NetDate&orderType=desc'
    html = lxml.html.parse(DATA_URL)
    res = html.xpath('//table[@id=\"NewStockTable\"]/tr')
    if six.PY2:
        sarr = [etree.tostring(node) for node in res]
    else:
        sarr = [etree.tostring(node).decode('utf-8') for node in res]
    sarr = ''.join(sarr)
    sarr = sarr.replace('<font color="red">*</font>', '')
    sarr = '<table>%s</table>' % sarr
    df = pd.read_html(StringIO(sarr), skiprows=[0, 1])[0]
    # DataFrame.select was removed in pandas 1.0; pick the same columns by label
    df = df.loc[:, [0, 1, 2, 3, 7]]
    df.columns = ['code', 'xcode', 'name', 'ipo_date', 'price']
    df['code'] = df['code'].map(lambda x: str(x).zfill(6))
    df['xcode'] = df['xcode'].map(lambda x: str(x).zfill(6))
    return df
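A pattern worth noting here (it recurs in Examples 6, 7 and 9 below): the XPath query returns bare <tr> nodes, so the serialized rows are stitched together and re-wrapped in a '<table>%s</table>' shell before being passed to pd.read_html, which only recognizes content inside <table> elements.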
Example 4: _fetch_documentation

# Required import: import pandas [as alias]
# or: from pandas import read_html [as alias]
def _fetch_documentation(version, base_url="https://spark.apache.org/docs"):
    doc_urls = [
        "{base_url}/{version}/configuration.html",
        "{base_url}/{version}/sql-programming-guide.html",
        "{base_url}/{version}/monitoring.html",
        "{base_url}/{version}/spark-standalone.html",
        "{base_url}/{version}/running-on-mesos.html",
        "{base_url}/{version}/running-on-yarn.html",
    ]
    for url in doc_urls:
        doc_url = url.format(version=version, base_url=base_url)
        print("Loading spark properties from %s" % doc_url)
        dfs = pd.read_html(doc_url, header=0)
        desired_cols = ["Property Name", "Default", "Meaning"]
        for df in dfs:
            if ("Property Name" in df) and ('Default' in df):
                for pn, default, desc in df[desired_cols].itertuples(index=False):
                    if type(default) == numpy.bool_:
                        default = bool(default)
                    yield pn, default, desc
Example 5: get_forex_buy_quote

# Required import: import pandas [as alias]
# or: from pandas import read_html [as alias]
def get_forex_buy_quote(currency_code: str = 'EUR', source: str = 'FNB', order_type: str = 'buy'):
    """Get the latest forex quote from the FNB website.
    """
    if source == 'FNB':
        tables = pd.read_html(
            'https://www.fnb.co.za/Controller?nav=rates.forex.list.ForexRatesList',
            index_col=1, header=0, match=currency_code)
        df = tables[0]
        types = {
            'buy': 'Bank Selling Rate',
            'sell': 'Bank Buying Rate',
        }
        exchange_rate = df.loc[currency_code, types[order_type]]
        return Decimal("%.4f" % float(exchange_rate))
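Three read_html keyword arguments do the heavy lifting in this example: match keeps only tables whose text matches the given string or regex, header=0 promotes the first row to column names, and index_col=1 makes the second column the row index, which is what enables the df.loc[currency_code, ...] lookup afterwards.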
Example 6: _profit_divis

# Required import: import pandas [as alias]
# or: from pandas import read_html [as alias]
def _profit_divis(pageNo, dataArr, nextPage):
    ct._write_console()
    html = lxml.html.parse('%sdata.cfi.cn/%s' % (ct.P_TYPE['http'], nextPage))
    res = html.xpath("//table[@class=\"table_data\"]/tr")
    if ct.PY3:
        sarr = [etree.tostring(node).decode('utf-8') for node in res]
    else:
        sarr = [etree.tostring(node) for node in res]
    sarr = ''.join(sarr)
    sarr = sarr.replace('--', '0')
    sarr = '<table>%s</table>' % sarr
    df = pd.read_html(sarr, skiprows=[0])[0]
    dataArr = dataArr.append(df, ignore_index=True)
    nextPage = html.xpath('//div[@id=\"content\"]/div[2]/a[last()]/@href')[0]
    np = nextPage.split('&')[2].split('=')[1]
    if pageNo < int(np):
        return _profit_divis(int(np), dataArr, nextPage)
    else:
        return dataArr
Example 7: _today_ticks

# Required import: import pandas [as alias]
# or: from pandas import read_html [as alias]
def _today_ticks(symbol, tdate, pageNo, retry_count, pause):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            html = lxml.html.parse(ct.TODAY_TICKS_URL % (ct.P_TYPE['http'],
                                                         ct.DOMAINS['vsf'], ct.PAGES['t_ticks'],
                                                         symbol, tdate, pageNo))
            res = html.xpath('//table[@id=\"datatbl\"]/tbody/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>' % sarr
            sarr = sarr.replace('--', '0')
            df = pd.read_html(StringIO(sarr), parse_dates=False)[0]
            df.columns = ct.TODAY_TICK_COLUMNS
            df['pchange'] = df['pchange'].map(lambda x: x.replace('%', ''))
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
Example 8: fix_span_tables

# Required import: import pandas [as alias]
# or: from pandas import read_html [as alias]
def fix_span_tables(soup):
    classes = OrderedDict([("ltx_tabular", "table"), ("ltx_tr", "tr"), ("ltx_th", "th"),
                           ("ltx_tbody", "tbody"), ("ltx_thead", "thead"), ("ltx_td", "td"),
                           ("ltx_tfoot", "tfoot")])
    query = ','.join(["span." + c for c in classes.keys()])
    for elem in soup.select(query):
        for k, v in classes.items():
            if k in elem.attrs["class"]:
                elem.name = v
                break

# pandas.read_html treats <th> differently, trying in a few places to pick up
# column names; for now <th>s are changed to <td>s, but the classes (ltx_th)
# are kept so they can still be distinguished.
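The remark above is easy to verify: given identical rows, read_html promotes a <th> row to column names but leaves a <td>-only table headerless. A minimal sketch (the markup is illustrative):

import pandas as pd
from io import StringIO

with_th = "<table><tr><th>a</th><th>b</th></tr><tr><td>1</td><td>2</td></tr></table>"
no_th = "<table><tr><td>a</td><td>b</td></tr><tr><td>1</td><td>2</td></tr></table>"

print(pd.read_html(StringIO(with_th))[0].columns.tolist())  # ['a', 'b'] -- <th> row promoted to header
print(pd.read_html(StringIO(no_th))[0].columns.tolist())    # [0, 1] -- no header inferred, 'a'/'b' stay data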
Example 9: _sz_hz

# Required import: import pandas [as alias]
# or: from pandas import read_html [as alias]
def _sz_hz(date='', retry_count=3, pause=0.001):
    for _ in range(retry_count):
        time.sleep(pause)
        ct._write_console()
        try:
            request = Request(rv.MAR_SZ_HZ_URL % (ct.P_TYPE['http'], ct.DOMAINS['szse'],
                                                  ct.PAGES['szsefc'], date))
            lines = urlopen(request, timeout=10).read()
            if len(lines) <= 200:
                return pd.DataFrame()
            df = pd.read_html(lines, skiprows=[0])[0]
            df.columns = rv.MAR_SZ_HZ_COLS
            df['opDate'] = date
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
Example 10: getdatafromViewDNS

# Required import: import pandas [as alias]
# or: from pandas import read_html [as alias]
def getdatafromViewDNS(searchQuery):
    searchQuery = searchQuery.replace(" ", "+")
    url = "https://viewdns.info/reversewhois/?q=" + searchQuery
    print("[*] Extracting from: " + url)
    try:
        result = pd.read_html(requests.get(url, headers={"User-Agent": "Mozilla/5.0"}).text)
        response = result[3][0]
        iter_url = iter(response)
        return iter_url
        # next(iter_url)
        # for url in iter_url:
        #     print(url)
    except Exception as e:
        print(f"[!] Couldn't send query, error: {e}, exiting...\n")
        exit()

# Will return the org name for any domain name.
Example 11: stock_info_change_name

# Required import: import pandas [as alias]
# or: from pandas import read_html [as alias]
def stock_info_change_name(stock="688588"):
    """
    Sina Finance - former names of a stock
    http://vip.stock.finance.sina.com.cn/corp/go.php/vCI_CorpInfo/stockid/300378.phtml
    :param stock: stock code
    :type stock: str
    :return: list of the stock's former names
    :rtype: list
    """
    url = f"http://vip.stock.finance.sina.com.cn/corp/go.php/vCI_CorpInfo/stockid/{stock}.phtml"
    r = requests.get(url)
    temp_df = pd.read_html(r.text)[3].iloc[:, :2]
    temp_df.dropna(inplace=True)
    temp_df.columns = ["item", "value"]
    temp_df["item"] = temp_df["item"].str.split(":", expand=True)[0]
    try:
        # "证券简称更名历史" is the row holding the renaming history
        name_list = temp_df[temp_df["item"] == "证券简称更名历史"].value.tolist()[0].split(" ")
        return name_list
    except:
        return None
Example 12: stock_sse_summary

# Required import: import pandas [as alias]
# or: from pandas import read_html [as alias]
def stock_sse_summary():
    """
    Shanghai Stock Exchange - market summary
    http://www.sse.com.cn/market/stockdata/statistic/
    :return: SSE market summary
    :rtype: pandas.DataFrame
    """
    url = "http://www.sse.com.cn/market/stockdata/statistic/"
    r = requests.get(url)
    r.encoding = "utf-8"
    big_df = pd.DataFrame()
    temp_list = ["总貌", "主板", "科创板"]  # overview, main board, STAR Market
    for i in range(len(pd.read_html(r.text))):
        for j in range(0, 2):
            inner_df = pd.read_html(r.text)[i].iloc[:, j].str.split(" ", expand=True)
            inner_df["item"] = temp_list[i]
            big_df = big_df.append(inner_df)
    big_df.dropna(how="any", inplace=True)
    big_df.columns = ["item", "number", "type"]
    big_df = big_df[["type", "item", "number"]]
    return big_df
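As a side note on this example, pd.read_html(r.text) is re-invoked on every loop iteration; parsing once up front (e.g. dfs = pd.read_html(r.text)) and indexing into dfs would avoid re-parsing the same page repeatedly.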
Example 13: sunrise_city_list

# Required import: import pandas [as alias]
# or: from pandas import read_html [as alias]
def sunrise_city_list() -> list:
    """
    List of cities for which sunrise and sunset data can be queried
    :return: list of all cities with available data
    :rtype: list
    """
    url = "https://www.timeanddate.com/sun/china"
    res = requests.get(url)
    city_list = []
    china_city_one_df = pd.read_html(res.text)[0]
    china_city_two_df = pd.read_html(res.text)[1]
    city_list.extend([item.lower() for item in china_city_one_df.iloc[:, 0].tolist()])
    city_list.extend([item.lower() for item in china_city_one_df.iloc[:, 1].tolist()])
    city_list.extend([item.lower() for item in china_city_two_df.iloc[:, 0].tolist()])
    city_list.extend([item.lower() for item in china_city_two_df.iloc[:, 1].tolist()])
    city_list.extend([item.lower() for item in china_city_two_df.iloc[:, 2].tolist()])
    city_list.extend([item.lower() for item in china_city_two_df.iloc[:, 3].tolist()])
    city_list.extend([item.lower() for item in china_city_two_df.iloc[:, 4][:-2].tolist()])
    return city_list
Example 14: sunrise_daily

# Required import: import pandas [as alias]
# or: from pandas import read_html [as alias]
def sunrise_daily(date: str = "20200428", city: str = "北京") -> pd.DataFrame:
    """
    Daily sunrise and sunset data
    https://www.timeanddate.com/sun/china/shaoxing
    :param date: the date to query, e.g. "20200428"
    :type date: str
    :param city: the city to query; mind the input format, e.g. "北京", "上海"
    :type city: str
    :return: sunrise and sunset data for the given date and city
    :rtype: pandas.DataFrame
    """
    if pypinyin.slug(city, separator='') in sunrise_city_list():
        year = date[:4]
        month = date[4:6]
        url = f"https://www.timeanddate.com/sun/china/{pypinyin.slug(city, separator='')}?month={month}&year={year}"
        res = requests.get(url)
        table = pd.read_html(res.text, header=2)[0]
        month_df = table.iloc[:-1, ]
        day_df = month_df[month_df.iloc[:, 0].astype(str).str.zfill(2) == date[6:]]
        day_df.index = pd.to_datetime([date] * len(day_df), format="%Y%m%d")
        return day_df
    else:
        return "请输入正确的城市名称"  # "Please enter a valid city name"
Example 15: scrap

# Required import: import pandas [as alias]
# or: from pandas import read_html [as alias]
def scrap(url, browser, retryCount=2):
    """Scrape a table from a web page.
    :param url: the URL to fetch
    :param browser: the browser instance
    :return: dataframe
    """
    try:
        while retryCount > 0:
            try:
                browser.get(url)
                time.sleep(random.random() / 4)
                if 'thead' in browser.page_source:
                    break
            except Exception as e:
                print(retryCount, e.args)
                retryCount -= 1
                if retryCount == 1:
                    mProxy.deleteProxy(myProxy)
        for x in ['lxml', 'xml', 'html5lib']:
            # with lxml versions above 4.1.1 the table may not be found,
            # hence the fallback through several parsers
            try:
                soup = BeautifulSoup(browser.page_source, x)
                table = soup.find_all(id='tb_cgtj')[0]
                if table:
                    break
            except:
                time.sleep(0.1)
                print('using BeautifulSoup {}'.format(x))
        df = pd.read_html(str(table), header=1)[0]
        df.columns = ['tradedate', 'related', 'close', 'zd', 'hvol', 'hamount', 'hpercent',
                      'oneday', 'fiveday', 'tenday']
    except Exception as e:
        print(e.args)
        return pd.DataFrame()
    return df