This page collects typical usage examples of the Python method src.tools.http.Http.get_content, drawn from open source projects. If you are unsure what Http.get_content does or how to call it, the examples below should help; you can also explore the containing class src.tools.http.Http for more context.
The 15 code examples of Http.get_content shown below are sorted by popularity by default.
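None of the examples show the wrapper itself, so for orientation here is a minimal sketch of the interface they all rely on: a GET when only url is given, a POST when data is supplied, extra_header merged over the defaults, and a falsy return on failure. The body below is an illustrative assumption, not the project's actual implementation (the real src.tools.http.Http also shares a cookie jar across requests and decodes gzip responses):

# Hypothetical sketch of the Http.get_content interface used throughout this page
import urllib
import urllib2


class Http(object):
    @staticmethod
    def get_content(url, data=None, extra_header=None, timeout=30):
        u"""GET when data is None, otherwise POST; return '' on failure."""
        header = {'User-Agent': 'Mozilla/5.0'}
        if extra_header:
            header.update(extra_header)
        body = urllib.urlencode(data) if data else None
        request = urllib2.Request(url, body, header)
        try:
            return urllib2.urlopen(request, timeout=timeout).read()
        except urllib2.URLError:
            return ''  # callers treat any falsy return as a failed request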
Example 1: login
# Required import: from src.tools.http import Http [as alias]
# Or alternatively: from src.tools.http.Http import get_content [as alias]
def login(self, account, password, captcha=''):
    content = Http.get_content('https://www.zhihu.com/')
    xsrf = Match.xsrf(content)
    if not xsrf:
        Debug.logger.info(u'Login failed')
        Debug.logger.info(u'Press Enter to resend the login request')
        return False
    xsrf = xsrf.split('=')[1]
    # add the xsrf token as a cookie to the cookieJar
    cookie = Http.make_cookie(name='_xsrf', value=xsrf, domain='www.zhihu.com')
    self.cookieJar.set_cookie(cookie)
    if captcha:
        post_data = {'_xsrf': xsrf, 'email': account, 'password': password, 'remember_me': True,
                     'captcha': captcha}
    else:
        post_data = {'_xsrf': xsrf, 'email': account, 'password': password, 'remember_me': True}
    header = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate',  # the key header: with it present, Zhihu does not treat the request as scripted
        'Accept-Language': 'zh,zh-CN;q=0.8,en-GB;q=0.6,en;q=0.4',
        'Host': 'www.zhihu.com',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36',
        'Connection': 'keep-alive',
        'X-Requested-With': 'XMLHttpRequest',
        'Origin': 'https://www.zhihu.com',
        'Referer': 'https://www.zhihu.com/',
    }
    result = Http.get_content(url=r'https://www.zhihu.com/login/email', data=post_data, extra_header=header)
    if not result:
        Debug.logger.info(u'Login failed; press Enter to log in again')
        return False
    response = json.loads(result)
    if response['r'] == 0:
        print u'Login succeeded!'
        print u'Logged-in account:', account
        print u'Remember this account and password? Type yes to save, anything else to skip, then press Enter'
        if raw_input() == 'yes':
            Config.account, Config.password, Config.remember_account = account, password, True
            print u'Account and password saved; edit config.json to change them later'
        else:
            Config.account, Config.password, Config.remember_account = '', '', False
            print u'Saving skipped; moving on to the next step'
        Config._save()
        cookie = self.get_cookie()
        DB.execute('delete from LoginRecord')  # clear old login records on success so stale ones are not reused next time
        data = {}
        data['account'] = account
        data['password'] = password
        data['recordDate'] = ExtraTools.get_today()
        data['cookieStr'] = cookie
        DB.save(data, 'LoginRecord')
        DB.commit()
        return True
    else:
        print u'Login failed'
        Debug.print_dict(response)
        return False
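Example 1 relies on Match.xsrf to pull the CSRF token out of the homepage HTML, then splits its return value on '='. Below is a plausible regex-based sketch of that helper, assuming the token sits in a hidden form field; the actual pattern in src.tools.match may differ:

import re


class Match(object):
    @staticmethod
    def xsrf(content):
        u"""Return a '_xsrf=<token>' fragment, or '' when no token is found."""
        result = re.search(r'name="_xsrf" value="(?P<token>[^"]+)"', content or '')
        if not result:
            return ''
        return '_xsrf=' + result.group('token')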
Example 2: get_sinablog_question_list
# Required import: from src.tools.http import Http [as alias]
# Or alternatively: from src.tools.http.Http import get_content [as alias]
def get_sinablog_question_list(self, author_id):
    u"""
    Fetch sinablog_info and the article count for an author.
    :param author_id: numeric id of the Sina blog author
    :return: article_num
    """
    href_article_list = 'http://blog.sina.com.cn/s/articlelist_{}_0_1.html'.format(author_id)
    href_profile = 'http://blog.sina.com.cn/s/profile_{}.html'.format(author_id)
    content_profile = Http.get_content(href_profile)
    parser = SinaBlogParser(content_profile)
    self.question_list += parser.get_extra_info()
    content_article_list = Http.get_content(href_article_list)
    article_num = int(self.parse_article_num(content_article_list))
    return article_num
Example 3: create_work_set
# Required import: from src.tools.http import Http [as alias]
# Or alternatively: from src.tools.http.Http import get_content [as alias]
def create_work_set(self, target_url):
    u"""
    Given the blog's home page url, first extract the blog id with a regex, then use the
    "About me" page to collect the data written into SinaBlog_Info (which arguably does not
    belong in this function and could be refactored out), and finally walk the blog's listing
    pages and add every post's url to work_set.
    :param target_url: url of the blog's home page
    :return:
    """
    Debug.logger.debug(u"target_url is: " + str(target_url))
    if target_url in self.task_complete_set:
        return
    result = Match.SinaBlog(target_url)
    SinaBlog_author_id = int(result.group('SinaBlog_people_id'))
    href_article_list = 'http://blog.sina.com.cn/s/articlelist_{}_0_1.html'.format(SinaBlog_author_id)
    href_profile = 'http://blog.sina.com.cn/s/profile_{}.html'.format(SinaBlog_author_id)
    # ########## The part below really belongs in SinaBlogAuthorWorker (it writes SinaBlog_Info);
    # it lives here for now and should be refactored later
    content_profile = Http.get_content(href_profile)
    parser = SinaBlogParser(content_profile)
    self.question_list += parser.get_SinaBlog_info_list()
    # Debug.logger.debug(u"question_list inside create_work_set: " + str(self.question_list))
    # ########## End of the part that belongs in SinaBlogAuthorWorker
    # content_index = Http.get_content(href_index)
    content_article_list = Http.get_content(href_article_list)
    article_num = int(self.parse_article_num(content_article_list))
    Debug.logger.debug(u"article_num:" + str(article_num))
    if article_num % 50 != 0:
        page_num = article_num / 50 + 1  # each listing page holds 50 post links
    else:
        page_num = article_num / 50
    self.question_list[0]['article_num'] = article_num  # this assumes one Sina blog address per input line!
    # The line above has to stay this way for now: the "About me" page does not expose the article count
    self.task_complete_set.add(target_url)
    for page in range(page_num):
        url = 'http://blog.sina.com.cn/s/articlelist_{}_0_{}.html'.format(SinaBlog_author_id, page + 1)
        content_article_list = Http.get_content(url)
        article_list = self.parse_get_article_list(content_article_list)
        for item in article_list:
            self.work_set.add(item)
        # self.work_set.add(article_list[0])
    return
Example 4: create_work_set
# Required import: from src.tools.http import Http [as alias]
# Or alternatively: from src.tools.http.Http import get_content [as alias]
def create_work_set(self, target_url):
    u"""
    Given the blog's home page url, first extract the blog id with a regex, then use the
    "About me" page to collect the data written into sinablog_info (which arguably does not
    belong in this function and could be refactored out), and finally walk the blog's listing
    pages and add every post's url to work_set.
    :param target_url: url of the blog's home page
    :return:
    """
    if target_url in self.task_complete_set:
        return
    result = Match.sinablog_author(target_url)
    sinablog_author_id = int(result.group('sinablog_people_id'))
    article_num = self.get_sinablog_question_list(sinablog_author_id)
    if article_num % 50 != 0:
        page_num = article_num / 50 + 1  # 50 links per listing page
    else:
        page_num = article_num / 50
    self.question_list[0]['article_num'] = article_num
    # The line above has to stay this way for now: the "About me" page does not expose the article count
    self.task_complete_set.add(target_url)
    for page in range(page_num):
        url = 'http://blog.sina.com.cn/s/articlelist_{}_0_{}.html'.format(sinablog_author_id, page + 1)
        content_article_list = Http.get_content(url)
        article_list = self.parse_get_article_list(content_article_list)
        for item in article_list:
            self.work_set.add(item)
    return
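Examples 3 and 4 both turn the article count into a listing-page count (50 links per page) with an if/else around Python 2 integer division. As a side note, the same ceiling division fits in one expression; this one-liner is an equivalent alternative, not the project's code:

page_num = (article_num + 49) // 50  # rounds up; matches the if/else for any article_num >= 0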
Example 5: worker
# Required import: from src.tools.http import Http [as alias]
# Or alternatively: from src.tools.http.Http import get_content [as alias]
def worker(self, target_url):
    content = Http.get_content(target_url)
    if not content:
        return
    self.work_set.discard(target_url)
    self.parse_content(content)
    return
Example 6: check_update
# Required import: from src.tools.http import Http [as alias]
# Or alternatively: from src.tools.http.Http import get_content [as alias]
def check_update():  # forced update check
    u"""
    * Purpose
        * Check for updates.
        * When a new version is detected on the server, open the download page in the browser.
        * Skipped silently when the request times out or the version is already current.
    * Input
        * None
    * Returns
        * None
    """
    print u"Checking for updates..."
    try:
        content = Http.get_content(u"http://zhihuhelpbyyzy-zhihu.stor.sinaapp.com/ZhihuHelpUpdateTime.txt")
        if not content:
            raise Exception("HttpError")
    except:
        return
    time, url = [x.strip() for x in content.split("\n")]
    if time == Config.update_time:
        return
    else:
        print u"New version found.\nRelease date: {}; press Enter to open the download page".format(time)
        print u"Download url for the new version: " + url
        raw_input()
        import webbrowser
        webbrowser.open_new_tab(url)
    return
Example 7: create_work_set
# Required import: from src.tools.http import Http [as alias]
# Or alternatively: from src.tools.http.Http import get_content [as alias]
def create_work_set(self, target_url):
    u"""
    From the content of target_url (e.g. http://www.jianshu.com/users/b1dd2b2c87a8/latest_articles),
    first extract creator_id, then derive the page count from the number of articles, open each
    listing page in turn, and add every article url to work_set.
    :param target_url:
    :return:
    """
    if target_url in self.task_complete_set:
        return
    id_result = Match.jianshu_author(target_url)
    jianshu_id = id_result.group('jianshu_id')
    article_num, article_list = self.get_jianshu_question_list(target_url)
    self.task_complete_set.add(target_url)
    if article_num % 9 != 0:
        page_num = article_num / 9 + 1  # 9 links per listing page
    else:
        page_num = article_num / 9
    for item in article_list:
        self.work_set.add(item)
    for page in range(page_num - 1):  # starts at page+2: the first page is already parsed
        url = 'http://www.jianshu.com/users/{}/latest_articles?page={}'.format(jianshu_id, page + 2)
        content_article_list = Http.get_content(url)
        article_list = self.parse_get_article_list(content_article_list)
        for item in article_list:
            self.work_set.add(item)
    return
Example 8: create_work_set
# Required import: from src.tools.http import Http [as alias]
# Or alternatively: from src.tools.http.Http import get_content [as alias]
def create_work_set(self, target_url):
    if target_url in self.task_complete_set:
        return
    result = Match.column(target_url)
    self.column_id = result.group("column_id")
    content = Http.get_content("https://zhuanlan.zhihu.com/api/columns/" + self.column_id)
    if not content:
        return
    raw_info = json.loads(content)
    info = {}
    info["creator_id"] = raw_info["creator"]["slug"]
    info["creator_hash"] = raw_info["creator"]["hash"]
    info["creator_sign"] = raw_info["creator"]["bio"]
    info["creator_name"] = raw_info["creator"]["name"]
    info["creator_logo"] = (
        raw_info["creator"]["avatar"]["template"]
        .replace("{id}", raw_info["creator"]["avatar"]["id"])
        .replace("_{size}", "")
    )
    info["column_id"] = raw_info["slug"]
    info["name"] = raw_info["name"]
    # column logo: built from the creator's avatar template and the column's own avatar id
    info["logo"] = (
        raw_info["creator"]["avatar"]["template"].replace("{id}", raw_info["avatar"]["id"]).replace("_{size}", "")
    )
    info["article"] = raw_info["postsCount"]
    info["follower"] = raw_info["followersCount"]
    info["description"] = raw_info["description"]
    self.info_list.append(info)
    self.task_complete_set.add(target_url)
    detect_url = "https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=10&offset=".format(self.column_id)
    for i in range(info["article"] / 10 + 1):
        self.work_set.add(detect_url + str(i * 10))
    return
Example 9: create_work_set
# Required import: from src.tools.http import Http [as alias]
# Or alternatively: from src.tools.http.Http import get_content [as alias]
def create_work_set(self, target_url):
    if target_url in self.task_complete_set:
        return
    result = Match.column(target_url)
    self.column_id = result.group('column_id')
    content = Http.get_content('https://zhuanlan.zhihu.com/api/columns/' + self.column_id)
    if not content:
        return
    raw_info = json.loads(content)
    info = {}
    info['creator_id'] = raw_info['creator']['slug']
    info['creator_hash'] = raw_info['creator']['hash']
    info['creator_sign'] = raw_info['creator']['bio']
    info['creator_name'] = raw_info['creator']['name']
    info['creator_logo'] = raw_info['creator']['avatar']['template'].replace('{id}', raw_info['creator']['avatar'][
        'id']).replace('_{size}', '')
    info['column_id'] = raw_info['slug']
    info['name'] = raw_info['name']
    # column logo: built from the creator's avatar template and the column's own avatar id
    info['logo'] = raw_info['creator']['avatar']['template'].replace('{id}', raw_info['avatar']['id']).replace(
        '_{size}', '')
    info['article'] = raw_info['postsCount']
    info['follower'] = raw_info['followersCount']
    info['description'] = raw_info['description']
    self.info_list.append(info)
    self.task_complete_set.add(target_url)
    detect_url = 'https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=10&offset='.format(self.column_id)
    for i in range(info['article'] / 10 + 1):
        self.work_set.add(detect_url + str(i * 10))
    return
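Examples 8 and 9 enqueue one API url per ten posts by precomputing every offset from the column's post count; note that info['article'] / 10 relies on Python 2 integer division and would need // under Python 3. A small illustration with a hypothetical post count:

posts_count = 42  # hypothetical value of raw_info['postsCount']
offsets = [i * 10 for i in range(posts_count // 10 + 1)]
# -> [0, 10, 20, 30, 40]: five requests with limit=10 cover all 42 posts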
Example 10: check_update
# Required import: from src.tools.http import Http [as alias]
# Or alternatively: from src.tools.http.Http import get_content [as alias]
def check_update():  # forced update check
    u"""
    * Purpose
        * Check for updates.
        * When a new version is detected on the server, open the download page in the browser.
        * Skipped silently when the request times out or the version is already current.
    * Input
        * None
    * Returns
        * None
    """
    print u"Checking for updates..."
    if Config.debug:
        # skip the update check in debug mode
        return
    try:
        content = Http.get_content(u"https://www.yaozeyuan.online/zhihuhelp/upgrade.txt")
        if not content:
            raise Exception(u'HttpError')
        time, url = [x.strip() for x in content.strip('\n').split('\n')]
        if time == Config.update_time:
            return
        else:
            print u"New version found.\nRelease date: {}; press Enter to open the download page".format(time)
            print u'Download url for the new version: ' + url
            raw_input()
            import webbrowser
            webbrowser.open_new_tab(url)
    except Exception:
        # swallow every exception and simply return
        return
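Both check_update variants unpack the fetched text into exactly two stripped lines, a release date and a download url (Example 10 guards the unpacking inside its try block; Example 6 does not). A hypothetical upgrade.txt payload, with values invented for illustration:

20160101
http://example.com/zhihuhelp/latest.zip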
Example 11: catch_info
# Required import: from src.tools.http import Http [as alias]
# Or alternatively: from src.tools.http.Http import get_content [as alias]
def catch_info(self, target_url):
    content = Http.get_content(target_url + '/about')
    if not content:
        return
    self.info_url_set.discard(target_url)
    parser = AuthorParser(content)
    self.info_list.append(parser.get_extra_info())
    return
Example 12: worker
# Required import: from src.tools.http import Http [as alias]
# Or alternatively: from src.tools.http.Http import get_content [as alias]
def worker(self, target_url):
    Debug.logger.info(u'Start crawling the content of {}'.format(target_url))
    content = Http.get_content(target_url)
    if not content:
        return
    self.work_set.discard(target_url)
    self.parse_content(content)
    return
Example 13: create_work_set
# Required import: from src.tools.http import Http [as alias]
# Or alternatively: from src.tools.http.Http import get_content [as alias]
def create_work_set(self, target_url):
    if target_url in self.task_complete_set:
        return
    self.task_complete_set.add(target_url)
    url = target_url + '?page=2'  # this page exposes the max page number
    content = Http.get_content(url)
    page_num = self.parse_max_page(content)
    for page in range(int(page_num)):
        url = target_url + '?page={}'.format(page + 1)
        content = Http.get_content(url)
        parser = CnblogsAuthorParser(content)
        article_url_list = parser.get_article_list()
        for item in article_url_list:
            self.work_set.add(item)
    return
Example 14: create_work_set
# Required import: from src.tools.http import Http [as alias]
# Or alternatively: from src.tools.http.Http import get_content [as alias]
def create_work_set(self, target_url):
    content = Http.get_content(target_url + '?nr=1&sort=created')
    if not content:
        return
    self.task_set.discard(target_url)
    max_page = self.parse_max_page(content)
    for page in range(max_page):
        url = '{}?nr=1&sort=created&page={}'.format(target_url, page + 1)
        self.work_set.add(url)
    return
Example 15: catch_info
# Required import: from src.tools.http import Http [as alias]
# Or alternatively: from src.tools.http.Http import get_content [as alias]
def catch_info(self, target_url):
    if target_url in self.info_url_complete_set:
        return
    content = Http.get_content(target_url)
    if not content:
        return
    self.info_url_complete_set.add(target_url)
    parser = YiibaiParser(content)
    self.info_list.append(parser.get_extra_info())
    return