

Python http.HtmlResponse Code Examples

This article collects typical usage examples of the scrapy.http.HtmlResponse class in Python. If you are struggling with questions such as what http.HtmlResponse is for, how to use it, or what real-world HtmlResponse code looks like, the curated examples below may help. You can also explore further usage examples from the scrapy.http module it belongs to.


The following presents 15 code examples of http.HtmlResponse, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help surface better Python examples.
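Before diving in, here is a minimal, self-contained sketch (not taken from any of the projects below; the URL and markup are placeholders) of the pattern most of the examples share: wrapping raw HTML in an HtmlResponse so it can be queried with Scrapy's CSS/XPath selectors.

from scrapy.http import HtmlResponse

html = b'<html><body><h1>Hello</h1><a href="/next">next</a></body></html>'
response = HtmlResponse(url='http://example.com/', body=html, encoding='utf-8')

print(response.css('h1::text').extract_first())                       # 'Hello'
print(response.xpath('//a/@href').extract_first())                    # '/next'
print(response.urljoin(response.xpath('//a/@href').extract_first()))  # 'http://example.com/next'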

Example 1: process_request

# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def process_request(self, request, spider):
        """Use Selenium to simulate clicks and run the page's JavaScript, which is why process_request is overridden."""

        # Get the URL handed down by the scheduler
        url = request.url
        if 'month=' in url:
            # Launch Chrome, send the request and let the page's JS run
            driver = webdriver.Chrome()
            driver.get(url=url)

            # Wait briefly so the page can finish loading
            time.sleep(4)
            data = driver.page_source.encode()
            driver.close()

            # Return the rendered page to the engine as an HtmlResponse
            resp = HtmlResponse(
                url=url,
                body=data,
                request=request,
                encoding='utf8'
            )

            return resp 
Author: HaoZhang95 · Project: Python24 · Lines: 26 · Source file: middlewares.py

Example 2: evaluate

# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def evaluate(self, meta_object,
                text, expected_raw, expected_requests):
        request = Request(url='http://www.drudgereport.com',
                          meta=meta_object)
        response = HtmlResponse('drudge.url', body=text, request=request,
                                encoding='utf8')

        raw_item_count = 0
        request_count = 0

        for x in self.spider.parse(response):
            if isinstance(x, RawResponseItem):
                raw_item_count = raw_item_count + 1
            elif isinstance(x, Request):
                request_count = request_count + 1

        self.assertEqual(raw_item_count, expected_raw)
        self.assertEqual(request_count, expected_requests) 
Author: istresearch · Project: scrapy-cluster · Lines: 20 · Source file: test_link_spider.py

Example 3: test_parse_drug_details_or_overview_generates_new_request_if_redirected_to_search_page

# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def test_parse_drug_details_or_overview_generates_new_request_if_redirected_to_search_page(self):
        url = 'http://www.accessdata.fda.gov/scripts/cder/drugsatfda/index.cfm?fuseaction=Search.Search_Drug_Name'
        meta = {
            'original_url': 'http://www.accessdata.fda.gov/somewhere.cfm',
            'original_cookies': {
                'foo': 'bar',
            },
        }
        mock_response = HtmlResponse(url=url)
        mock_response.request = Request(url, meta=meta)

        with mock.patch('random.random', return_value='random_cookiejar'):
            spider = Spider()
            request = spider.parse_drug_details_or_overview(mock_response)

        assert request.url == meta['original_url']
        assert request.cookies == meta['original_cookies']
        assert request.dont_filter
        assert request.callback == spider.parse_drug_details_or_overview
        assert request.meta['cookiejar'] == 'random_cookiejar' 
Author: opentrials · Project: collectors · Lines: 22 · Source file: test_fda_dap.py

Example 4: process_request

# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def process_request(self, request, spider):
        try:
            self.webdriver.get(url=request.url)
            time.sleep(2)
            # Optional explicit-wait snippet that makes the browser handling more efficient
            # from selenium.webdriver.common.by import By
            # from selenium.webdriver.support import expected_conditions as EC
            # from selenium.webdriver.support.wait import WebDriverWait as wbw
            # locator = (By.XPATH, '//img[@class="focus-item-img"]')
            # # wbw(self.webdriver,10).until(EC.presence_of_element_located(locator)) # wait until the element has been attached to the DOM tree
            # wbw(self.webdriver,10).until(EC.visibility_of_element_located(locator)) # wait until the element is in the DOM and visible, i.e. both width and height are greater than 0
            current_url = self.webdriver.current_url
            page_source = self.webdriver.page_source
        except Exception as e:
            return self._parse_selenium_temp_exceptions(request, spider, e)
        # If the request hit an abnormal case (a captcha, a forced re-login, etc.), check page_source here and handle the re-login or other recovery; see the short sketch after this example
        h = HtmlResponse(
            url      = current_url,
            headers  = {'Selenium':'Selenium cannot get a certain headers, This is the information created automatically by middleware.'},
            body     = page_source,
            encoding = 'utf-8',
            request  = request
        )
        return h 
Author: cilame · Project: vrequest · Lines: 26 · Source file: middlewares.py
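For completeness, here is a hedged sketch of the abnormal-case check that the comment in this example hints at; the 'captcha' marker string and the retry strategy are assumptions for illustration, not part of the original project. It would sit just before the HtmlResponse is built:

        # Hypothetical check: did the browser land on a captcha/login page?
        # 'captcha' is a placeholder marker; use whatever the target site actually
        # returns on its verification or login pages.
        if 'captcha' in page_source.lower():
            spider.logger.warning('Blocked at %s, rescheduling the request', current_url)
            # Returning a Request from process_request sends it back to the scheduler
            # instead of continuing with the download.
            return request.replace(dont_filter=True)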

Example 5: parse_ershicimi_page

# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def parse_ershicimi_page(rsp):
    """
    Parse https://www.ershicimi.com/p/3e250905e46b0827af501c19c1c3f2ed
    :param rsp:
    :return:
    """
    response = HtmlResponse(url=rsp.url, body=rsp.text, encoding='utf8')

    title = response.selector.xpath('//h1[@class="article-title"]/text()').extract_first().strip()
    author = response.selector.xpath('//div[@class="article-sub"]//a/text()').extract_first().strip()

    try:
        content = response.selector.xpath('//div[@id="js_content"]').extract_first().strip()
    except:
        content = response.selector.xpath('//div[@class="abstract"]').extract_first().strip()

    return title, author, content 
Author: richshaw2015 · Project: oh-my-rss · Lines: 19 · Source file: wemp.py

Example 6: process_request

# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        if spider.browser:
            request.meta['browser'] = self.browser  # to access driver from response
            self.browser.get(request.url)
            # wait js eval
            time.sleep(15)
            body = to_bytes(self.browser.page_source)  # body must be of type bytes
            return HtmlResponse(self.browser.current_url, body=body, encoding='utf-8', request=request)
        else:
            return None 
Author: richshaw2015 · Project: oh-my-rss · Lines: 21 · Source file: middlewares.py

Example 7: get_url

# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def get_url(betamax_session):
    def _get_url(url, request_kwargs={}):
        '''Returns a scrapy.http.HtmlResponse with the contents of the received
        url.

        Note that the session is kept intact among multiple calls to this
        method (i.e. cookies are passed over).

        We also don't verify SSL certificates, because Takeda's certificate is
        invalid. If they become valid, we can resume verifying the
        certificates.
        '''
        response = betamax_session.get(url, verify=False)
        scrapy_response = HtmlResponse(
            url=str(response.url),
            body=response.content,
        )
        scrapy_response.request = Request(url, **request_kwargs)

        return scrapy_response
    return _get_url 
Author: opentrials · Project: collectors · Lines: 23 · Source file: conftest.py

Example 8: open_in_browser

# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def open_in_browser(response, _openfunc=webbrowser.open):
    """Open the given response in a local web browser, populating the <base>
    tag for external links to work
    """
    from scrapy.http import HtmlResponse, TextResponse
    # XXX: this implementation is a bit dirty and could be improved
    body = response.body
    if isinstance(response, HtmlResponse):
        if b'<base' not in body:
            repl = '<head><base href="%s">' % response.url
            body = body.replace(b'<head>', to_bytes(repl))
        ext = '.html'
    elif isinstance(response, TextResponse):
        ext = '.txt'
    else:
        raise TypeError("Unsupported response type: %s" %
                        response.__class__.__name__)
    fd, fname = tempfile.mkstemp(ext)
    os.write(fd, body)
    os.close(fd)
    return _openfunc("file://%s" % fname) 
Author: wistbean · Project: learn_python3_spider · Lines: 23 · Source file: response.py

Example 9: process_response

# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def process_response(self, request, response, spider):

        if not isinstance(response, HtmlResponse) or response.status != 200:
            return response

        if request.method != 'GET':
            # other HTTP methods are either not safe or don't have a body
            return response

        if 'ajax_crawlable' in request.meta:  # prevent loops
            return response

        if not self._has_ajax_crawlable_variant(response):
            return response

        # scrapy already handles #! links properly
        ajax_crawl_request = request.replace(url=request.url+'#!')
        logger.debug("Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
                     {'ajax_crawl_request': ajax_crawl_request, 'request': request},
                     extra={'spider': spider})

        ajax_crawl_request.meta['ajax_crawlable'] = True
        return ajax_crawl_request 
Author: wistbean · Project: learn_python3_spider · Lines: 25 · Source file: ajaxcrawl.py

Example 10: _requests_to_follow

# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def _requests_to_follow(self, response):
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        self.headers['Referer'] = response.url
        for n, rule in enumerate(self._rules):
            links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = Request(
                    url=link.url,
                    callback=self._response_downloaded,
                    headers=self.headers,
                    dont_filter=True
                )
                # keep cookie
                r.meta.update(
                    rule=n,
                    link_text=link.text,
                    cookiejar=response.meta['cookiejar']
                )
                yield rule.process_request(r) 
Author: SylvanasSun · Project: scrapy-picture-spider · Lines: 26 · Source file: pixiv_spider.py

Example 11: parse

# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def parse(self, response):
        feed_url = response.css('a.feed-icon::attr(href)').extract()[0]
        feed = feedparser.parse(feed_url)

        for entry in feed['entries']:
            detail = HtmlResponse(url='string', body=entry['summary'], encoding='utf-8')
            description = detail.css('.body.text-secondary p::text').extract()
            address = detail.css('[itemprop="streetAddress"]::text').extract()
            yield {
                'address': address[0] if len(address) > 0 else '',
                'url': entry.link,
                'title': entry.title,
                'event_time': {
                    'date': detail.css('span.date-display-single::attr("content")').extract()[0].split('T')[0],
                    'time_range': detail.css('span.date-display-single::attr("content")').extract()[0].split('T')[1]
                },
                'description': description[0] if len(description) > 0 else ''
            } 
Author: In2ItChicago · Project: In2ItChicago · Lines: 20 · Source file: lwv_chicago.py

Example 12: parse

# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def parse(self, response):
        # Wiener Linien returns HTML with an XML content type which creates an
        # XmlResponse.
        response = HtmlResponse(url=response.url, body=response.body)
        for item in response.css(".block-news-item"):
            il = FeedEntryItemLoader(
                response=response,
                timezone="Europe/Vienna",
                ignoretz=True,
                base_url="https://www.{}".format(self.name),
            )
            link = response.urljoin(item.css("a::attr(href)").extract_first())
            il.add_value("link", link)
            il.add_value("title", item.css("h3::text").extract_first())
            il.add_value("updated", item.css(".date::text").extract_first())
            yield scrapy.Request(link, self.parse_item, meta={"il": il}) 
Author: PyFeeds · Project: PyFeeds · Lines: 18 · Source file: wienerlinien_at.py

Example 13: do_test

# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def do_test(self, meta_object,
                text, expected_raw, expected_requests):
        request = Request(url='http://www.drudgereport.com',
                          meta=meta_object)
        response = HtmlResponse('drudge.url', body=text, request=request)

        raw_item_count = 0
        request_count = 0

        for x in self.spider.parse(response):
            if isinstance(x, RawResponseItem):
                raw_item_count = raw_item_count + 1
            elif isinstance(x, Request):
                request_count = request_count + 1

        self.assertEqual(raw_item_count, expected_raw)
        self.assertEqual(request_count, expected_requests) 
Author: openslack · Project: openslack-crawler · Lines: 19 · Source file: offline.py

Example 14: parse_multi_items

# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def parse_multi_items(self, hxs, node, item, response, index, count):
        if node.restrict_xpaths:
            for child in node.children:
                if child.xpaths:
                    restrict_xpath = '|'.join([restrict_xpath.replace("<<", "").replace(">>", "") for restrict_xpath in node.restrict_xpaths])
                    try:
                        m = re.search(r'<<(.+)&(.*)>>', restrict_xpath)  # the original referenced an undefined name 'xpath'; restrict_xpath appears to be the intended variable
                        restrict_xpath = m.group(1)
                    except:
                        pass
                    restrict_selectors = hxs.select(restrict_xpath)
                    #fetch multi items from one page
                    if index != None and len(restrict_selectors) > index and len(restrict_selectors)==count:
                        try:
                            XmlXPathSelector = Selector
                        except:
                            pass
                        restrict_hxs = XmlXPathSelector(HtmlResponse(response.url, body=re.sub('[\n\r\t]+', '', restrict_selectors[index].extract()), encoding='utf8'))
                        #restrict_hxs = restrict_selectors[index]
                        self.parse_item_xpaths(restrict_hxs, child.xpaths, item, response.url, child.name, True, False) 
Author: heartfly · Project: ajax_crawler · Lines: 22 · Source file: common_spider.py

Example 15: process_request

# Required import: from scrapy import http [as alias]
# Or: from scrapy.http import HtmlResponse [as alias]
def process_request(self, request, spider):
        if spider.name == 'seleniumSpider':
            self.driver.get(request.url)
            time.sleep(2)
            body = self.driver.page_source

            return HtmlResponse(self.driver.current_url,
                                body=body,
                                encoding='utf-8',
                                request=request) 
Author: kingname · Project: SourceCodeOfBook · Lines: 12 · Source file: middlewares.py


Note: the scrapy.http.HtmlResponse examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors; please consult the corresponding project's license before distributing or reusing the code, and do not repost without permission.