This article collects typical usage examples of the scrapy.http.HtmlResponse class in Python. If you are wondering what http.HtmlResponse is for, how to call it, or what it looks like in real code, the curated examples below may help. You can also explore further usage examples for the containing module, scrapy.http.
The following presents 15 code examples of http.HtmlResponse, sorted by popularity by default.
Example 1: process_request
# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
# Also requires: from selenium import webdriver / import time
def process_request(self, request, spider):
    """Use Selenium to simulate clicks and run JavaScript, so process_request is overridden."""
    # Get the URL handed over by the scheduler
    url = request.url
    if 'month=' in url:
        # Open Chrome manually, send the request, and let the JavaScript execute
        driver = webdriver.Chrome()
        driver.get(url=url)
        # Wait a moment so the page can finish loading
        time.sleep(4)
        data = driver.page_source.encode()
        driver.close()
        # Hand the response back to the engine
        resp = HtmlResponse(
            url=url,
            body=data,
            request=request,
            encoding='utf8'
        )
        return resp
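For a downloader middleware like this to take effect, it has to be enabled in the project settings. A minimal sketch follows; the module path, class name, and priority are assumptions for illustration, not taken from the example above.

# settings.py (hypothetical project): register the middleware so requests are
# routed through it before reaching the default downloader.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.SeleniumMiddleware': 543,  # path and priority are illustrative
}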
Example 2: evaluate
# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def evaluate(self, meta_object,
             text, expected_raw, expected_requests):
    request = Request(url='http://www.drudgereport.com',
                      meta=meta_object)
    response = HtmlResponse('drudge.url', body=text, request=request,
                            encoding='utf8')
    raw_item_count = 0
    request_count = 0
    for x in self.spider.parse(response):
        if isinstance(x, RawResponseItem):
            raw_item_count = raw_item_count + 1
        elif isinstance(x, Request):
            request_count = request_count + 1
    self.assertEqual(raw_item_count, expected_raw)
    self.assertEqual(request_count, expected_requests)
Example 3: test_parse_drug_details_or_overview_generates_new_request_if_redirected_to_search_page
# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def test_parse_drug_details_or_overview_generates_new_request_if_redirected_to_search_page(self):
    url = 'http://www.accessdata.fda.gov/scripts/cder/drugsatfda/index.cfm?fuseaction=Search.Search_Drug_Name'
    meta = {
        'original_url': 'http://www.accessdata.fda.gov/somewhere.cfm',
        'original_cookies': {
            'foo': 'bar',
        },
    }
    mock_response = HtmlResponse(url=url)
    mock_response.request = Request(url, meta=meta)
    with mock.patch('random.random', return_value='random_cookiejar'):
        spider = Spider()
        request = spider.parse_drug_details_or_overview(mock_response)
    assert request.url == meta['original_url']
    assert request.cookies == meta['original_cookies']
    assert request.dont_filter
    assert request.callback == spider.parse_drug_details_or_overview
    assert request.meta['cookiejar'] == 'random_cookiejar'
Example 4: process_request
# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def process_request(self, request, spider):
    try:
        self.webdriver.get(url=request.url)
        time.sleep(2)
        # Optional explicit-wait code that makes the browser handling more efficient:
        # from selenium.webdriver.common.by import By
        # from selenium.webdriver.support import expected_conditions as EC
        # from selenium.webdriver.support.wait import WebDriverWait as wbw
        # locator = (By.XPATH, '//img[@class="focus-item-img"]')
        # # wbw(self.webdriver, 10).until(EC.presence_of_element_located(locator))  # wait until the element has been added to the DOM tree
        # wbw(self.webdriver, 10).until(EC.visibility_of_element_located(locator))  # wait until the element is in the DOM and visible (width and height > 0)
        current_url = self.webdriver.current_url
        page_source = self.webdriver.page_source
    except Exception as e:
        return self._parse_selenium_temp_exceptions(request, spider, e)
    # If the request hits an abnormal state (captcha, forced re-login, etc.),
    # inspect page_source here and handle re-login or other recovery at this point.
    h = HtmlResponse(
        url=current_url,
        headers={'Selenium': 'Selenium cannot retrieve the real response headers; this header was added automatically by the middleware.'},
        body=page_source,
        encoding='utf-8',
        request=request
    )
    return h
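The example assumes self.webdriver already exists. A hedged sketch of one way the middleware could create and dispose of the driver via Scrapy's signals follows; the class name and the choice of Chrome are assumptions, not part of the original example.

# Hypothetical lifecycle management for the webdriver used above.
from scrapy import signals
from selenium import webdriver

class SeleniumDownloaderMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        mw = cls()
        mw.webdriver = webdriver.Chrome()  # assumes chromedriver is on PATH
        crawler.signals.connect(mw.spider_closed, signal=signals.spider_closed)
        return mw

    def spider_closed(self, spider):
        # Shut down the browser when the crawl finishes.
        self.webdriver.quit()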
Example 5: parse_ershicimi_page
# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def parse_ershicimi_page(rsp):
    """
    Parse a page such as https://www.ershicimi.com/p/3e250905e46b0827af501c19c1c3f2ed
    :param rsp:
    :return:
    """
    response = HtmlResponse(url=rsp.url, body=rsp.text, encoding='utf8')
    title = response.selector.xpath('//h1[@class="article-title"]/text()').extract_first().strip()
    author = response.selector.xpath('//div[@class="article-sub"]//a/text()').extract_first().strip()
    try:
        content = response.selector.xpath('//div[@id="js_content"]').extract_first().strip()
    except:
        content = response.selector.xpath('//div[@class="abstract"]').extract_first().strip()
    return title, author, content
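The helper only assumes that rsp exposes .url and .text, so a plain requests response works. A minimal sketch of a call site, with an illustrative URL and timeout:

import requests

# Hypothetical call site: any object with .url and .text attributes will do.
rsp = requests.get('https://www.ershicimi.com/p/3e250905e46b0827af501c19c1c3f2ed', timeout=10)
title, author, content = parse_ershicimi_page(rsp)
print(title, author)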
Example 6: process_request
# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
# Also requires: from scrapy.utils.python import to_bytes
def process_request(self, request, spider):
    # Called for each request that goes through the downloader
    # middleware.
    # Must either:
    # - return None: continue processing this request
    # - or return a Response object
    # - or return a Request object
    # - or raise IgnoreRequest: process_exception() methods of
    #   installed downloader middleware will be called
    if spider.browser:
        request.meta['browser'] = self.browser  # to access the driver from the response
        self.browser.get(request.url)
        # wait for JS evaluation
        time.sleep(15)
        body = to_bytes(self.browser.page_source)  # body must be of type bytes
        return HtmlResponse(self.browser.current_url, body=body, encoding='utf-8', request=request)
    else:
        return None
Example 7: get_url
# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def get_url(betamax_session):
    def _get_url(url, request_kwargs={}):
        '''Returns a scrapy.http.HtmlResponse with the contents of the received
        url.
        Note that the session is kept intact among multiple calls to this
        method (i.e. cookies are passed over).
        We also don't verify SSL certificates, because Takeda's certificate is
        invalid. If they become valid, we can resume verifying the
        certificates.
        '''
        response = betamax_session.get(url, verify=False)
        scrapy_response = HtmlResponse(
            url=str(response.url),
            body=response.content,
        )
        scrapy_response.request = Request(url, **request_kwargs)
        return scrapy_response
    return _get_url
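This reads like a pytest fixture built on top of a Betamax-recorded session. A hedged sketch of how a test might consume it, assuming get_url is registered as a pytest fixture; the test URL and XPath are illustrative:

# Hypothetical test using the fixture above.
def test_search_page_contains_a_form(get_url):
    response = get_url('http://www.accessdata.fda.gov/scripts/cder/drugsatfda/')
    # HtmlResponse gives us the usual Scrapy selector API on the recorded page.
    assert response.xpath('//form').extract_first() is not None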
Example 8: open_in_browser
# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def open_in_browser(response, _openfunc=webbrowser.open):
    """Open the given response in a local web browser, populating the <base>
    tag for external links to work
    """
    from scrapy.http import HtmlResponse, TextResponse
    # XXX: this implementation is a bit dirty and could be improved
    body = response.body
    if isinstance(response, HtmlResponse):
        if b'<base' not in body:
            repl = '<head><base href="%s">' % response.url
            body = body.replace(b'<head>', to_bytes(repl))
        ext = '.html'
    elif isinstance(response, TextResponse):
        ext = '.txt'
    else:
        raise TypeError("Unsupported response type: %s" %
                        response.__class__.__name__)
    fd, fname = tempfile.mkstemp(ext)
    os.write(fd, body)
    os.close(fd)
    return _openfunc("file://%s" % fname)
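This helper matches the one Scrapy ships for debugging, importable as scrapy.utils.response.open_in_browser. A short sketch of calling it from a spider callback; the spider method and the CSS selector are illustrative:

# Hypothetical debugging use inside a spider callback.
from scrapy.utils.response import open_in_browser

def parse(self, response):
    if not response.css('div.listing'):
        # Inspect what the downloader actually received when a selector finds nothing.
        open_in_browser(response)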
Example 9: process_response
# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def process_response(self, request, response, spider):
    if not isinstance(response, HtmlResponse) or response.status != 200:
        return response
    if request.method != 'GET':
        # other HTTP methods are either not safe or don't have a body
        return response
    if 'ajax_crawlable' in request.meta:  # prevent loops
        return response
    if not self._has_ajax_crawlable_variant(response):
        return response
    # scrapy already handles #! links properly
    ajax_crawl_request = request.replace(url=request.url + '#!')
    logger.debug("Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
                 {'ajax_crawl_request': ajax_crawl_request, 'request': request},
                 extra={'spider': spider})
    ajax_crawl_request.meta['ajax_crawlable'] = True
    return ajax_crawl_request
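The helper _has_ajax_crawlable_variant is not shown above. A hedged sketch of what such a check could look like, as an assumption rather than the library's actual implementation: it looks for the Google AJAX-crawling meta tag that advertises a #! variant of the page.

# Hypothetical helper, for illustration only.
def _has_ajax_crawlable_variant(self, response):
    # An "AJAX crawlable" page declares <meta name="fragment" content="!"> in its <head>.
    return bool(response.xpath('//meta[@name="fragment"][@content="!"]'))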
Example 10: _requests_to_follow
# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def _requests_to_follow(self, response):
    if not isinstance(response, HtmlResponse):
        return
    seen = set()
    self.headers['Referer'] = response.url
    for n, rule in enumerate(self._rules):
        links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
        if links and rule.process_links:
            links = rule.process_links(links)
        for link in links:
            seen.add(link)
            r = Request(
                url=link.url,
                callback=self._response_downloaded,
                headers=self.headers,
                dont_filter=True
            )
            # keep the cookies across requests
            r.meta.update(
                rule=n,
                link_text=link.text,
                cookiejar=response.meta['cookiejar']
            )
            yield rule.process_request(r)
Example 11: parse
# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def parse(self, response):
    feed_url = response.css('a.feed-icon::attr(href)').extract()[0]
    feed = feedparser.parse(feed_url)
    for entry in feed['entries']:
        detail = HtmlResponse(url='string', body=entry['summary'], encoding='utf-8')
        description = detail.css('.body.text-secondary p::text').extract()
        address = detail.css('[itemprop="streetAddress"]::text').extract()
        yield {
            'address': address[0] if len(address) > 0 else '',
            'url': entry.link,
            'title': entry.title,
            'event_time': {
                'date': detail.css('span.date-display-single::attr("content")').extract()[0].split('T')[0],
                'time_range': detail.css('span.date-display-single::attr("content")').extract()[0].split('T')[1]
            },
            'description': description[0] if len(description) > 0 else ''
        }
Example 12: parse
# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def parse(self, response):
    # Wiener Linien returns HTML with an XML content type which creates an
    # XmlResponse.
    response = HtmlResponse(url=response.url, body=response.body)
    for item in response.css(".block-news-item"):
        il = FeedEntryItemLoader(
            response=response,
            timezone="Europe/Vienna",
            ignoretz=True,
            base_url="https://www.{}".format(self.name),
        )
        link = response.urljoin(item.css("a::attr(href)").extract_first())
        il.add_value("link", link)
        il.add_value("title", item.css("h3::text").extract_first())
        il.add_value("updated", item.css(".date::text").extract_first())
        yield scrapy.Request(link, self.parse_item, meta={"il": il})
Example 13: do_test
# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def do_test(self, meta_object,
            text, expected_raw, expected_requests):
    request = Request(url='http://www.drudgereport.com',
                      meta=meta_object)
    response = HtmlResponse('drudge.url', body=text, request=request)
    raw_item_count = 0
    request_count = 0
    for x in self.spider.parse(response):
        if isinstance(x, RawResponseItem):
            raw_item_count = raw_item_count + 1
        elif isinstance(x, Request):
            request_count = request_count + 1
    self.assertEqual(raw_item_count, expected_raw)
    self.assertEqual(request_count, expected_requests)
Example 14: parse_multi_items
# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def parse_multi_items(self, hxs, node, item, response, index, count):
    if node.restrict_xpaths:
        for child in node.children:
            if child.xpaths:
                restrict_xpath = '|'.join([restrict_xpath.replace("<<", "").replace(">>", "") for restrict_xpath in node.restrict_xpaths])
                try:
                    m = re.search(r'<<(.+)&(.*)>>', restrict_xpath)
                    restrict_xpath = m.group(1)
                except:
                    pass
                restrict_selectors = hxs.select(restrict_xpath)
                # fetch multiple items from one page
                if index is not None and len(restrict_selectors) > index and len(restrict_selectors) == count:
                    try:
                        XmlXPathSelector = Selector  # compatibility shim: newer Scrapy replaced XmlXPathSelector with Selector
                    except:
                        pass
                    restrict_hxs = XmlXPathSelector(HtmlResponse(response.url, body=re.sub('[\n\r\t]+', '', restrict_selectors[index].extract()), encoding='utf8'))
                    # restrict_hxs = restrict_selectors[index]
                    self.parse_item_xpaths(restrict_hxs, child.xpaths, item, response.url, child.name, True, False)
Example 15: process_request
# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def process_request(self, request, spider):
    if spider.name == 'seleniumSpider':
        self.driver.get(request.url)
        time.sleep(2)
        body = self.driver.page_source
        return HtmlResponse(self.driver.current_url,
                            body=body,
                            encoding='utf-8',
                            request=request)
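A common thread in these examples is building an HtmlResponse by hand from markup obtained outside Scrapy's downloader, then using the normal selector API on it. A minimal standalone sketch, with an illustrative URL and markup:

from scrapy.http import HtmlResponse

html = b'<html><body><h1 class="title">Hello</h1></body></html>'
response = HtmlResponse(url='http://example.com/', body=html, encoding='utf-8')
print(response.css('h1.title::text').get())  # -> 'Hello'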