

Python scrapy.http Code Examples

This article collects typical usage examples of the scrapy.http module in Python. If you are wondering what scrapy.http provides, how it is used, or what real code built on it looks like, the curated examples below may help. You can also explore further usage examples from scrapy, the package the module belongs to.


The 15 scrapy.http code examples below are ordered by popularity.
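To orient the reader before the collected examples, here is a minimal self-contained sketch of the two scrapy.http primitives that nearly every example below relies on: Request, which schedules a fetch, and Response, which Scrapy passes to the request's callback. The spider name, URLs, and the parse_next callback are illustrative assumptions, not taken from any of the projects cited below.

import scrapy
from scrapy.http import Request, Response


class DemoSpider(scrapy.Spider):
    # Hypothetical spider used only to show the Request/Response round trip.
    name = 'demo'
    start_urls = ['http://example.com']

    def parse(self, response: Response):
        # Each downloaded page arrives at the callback as a Response.
        self.log('fetched %s (%d bytes)' % (response.url, len(response.body)))
        # Yielding a Request schedules another fetch; its Response is
        # delivered to the callback named here.
        yield Request('http://example.com/next', callback=self.parse_next)

    def parse_next(self, response: Response):
        yield {'url': response.url}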

Example 1: parse

# Required module: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def parse(self, response: Response) -> Iterator[Request]:
        self.log(response)

        if getattr(self, 'validate_html', False):
            yield Request(
                'http://127.0.0.1:9988/?out=json',
                method='POST',
                headers={'Content-Type': response.headers['Content-Type']},
                body=response.body,
                callback=self._vnu_callback(response.url),
                errback=self.error_callback,
            )

        for link in LxmlLinkExtractor(deny_domains=self.deny_domains, deny_extensions=['doc'],
                                      tags=self.tags, attrs=self.attrs, deny=self.deny,
                                      canonicalize=False).extract_links(response):
            yield from self._make_requests(link.url) 
Developer: zulip, Project: zulip, Lines: 19, Source: spiders.py

Example 2: parse

# Required module: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def parse(self, response):
        content_list = json.loads(response.body)
        for entry in content_list:
            data = entry['object']['data']
            url = 'http://www.jianshu.com/p/{}'.format(data['slug'])
            day = data['first_shared_at'].replace('T', ' ')[:19]
            info_time = parser.parse(day)

            # Create a fresh item per article instead of reusing one instance,
            # so items already yielded downstream are not mutated later.
            item = JianShuUserArticleItem()
            item['pictureUrls'] = data['list_image_url']
            item['title'] = data['title']
            item['url'] = url
            item['publishTime'] = info_time
            yield item
Developer: huangtao1208, Project: scrapy_spider, Lines: 19, Source: jianshu_user_article_spider.py

Example 3: __proxy

# Required module: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def __proxy(self, line):
        # Validate a proxy by fetching a test page through it.
        # Ported from Python 2: urllib2 -> urllib.request, print statements -> print().
        if '//' in line:
            p = "%s" % line
            htt = line.split(':')[0]
        else:
            p = "http://%s" % line
            htt = "http"
        h = urllib.request.ProxyHandler({htt: p})
        o = urllib.request.build_opener(h, urllib.request.HTTPHandler)
        try:
            r = o.open("http://www.baidu.com/", timeout=3)
            if len(r.read()) > 10:
                return p
            print("[!] {%s} NONO !" % p)
        except Exception:
            print("[!] {%s} NONO !" % p)
Developer: openslack, Project: openslack-crawler, Lines: 20, Source: pipelines.py

Example 4: parse

# Required module: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def parse(self, response):

        if 'kuaidaili.com' in response.url:
            for d in range(1, 11):
                new_url = "http://www.kuaidaili.com/proxylist/%d/" % d
                yield Request(url=new_url, callback=self.fllow_parse)

        if 'cn-proxy.com' in response.url:
            yield Request(url='http://www.cn-proxy.com/', callback=self.fllow_parse)

        if 'proxy-list.org' in response.url:
            for i in range(1, 11):
                pa_url = "http://proxy-list.org/english/index.php?p=%d" % i
                yield Request(url=pa_url, callback=self.fllow_parse)

        if "vipiu.net" in response.url:
            for n in range(1, 5):  # was range(2, 5), which made the n == 1 branch unreachable
                if n == 1:
                    url_v = "http://vipiu.net/free/mianfeidaili/2014/04/27/42417.html"
                else:
                    url_v = "http://vipiu.net/free/mianfeidaili/2014/04/27/42417_%d.html" % n
                yield Request(url=url_v, callback=self.fllow_parse) 
Developer: openslack, Project: openslack-crawler, Lines: 24, Source: proxy.py

Example 5: __init__

# Required module: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def __init__(self, url=None, *args, **kwargs):
        super(RedSpider, self).__init__(*args, **kwargs)
        # Accept a comma-separated domain list via the 'urls' keyword argument.
        urls = kwargs.get('urls', '').split(',')  # renamed from 'input', which shadows the builtin
        self.allowed_domains = urls
        self.start_urls = ["http://" + urls[0]]
Developer: mertsarica, Project: hack4career, Lines: 9, Source: RedScanner.py

Example 6: parse_start_url

# Required module: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def parse_start_url(self, response):
        js = response.xpath('//script[@type="commment"]').extract()[0]
        data = re.search(r'\{[\s\S]*\}', js).group(0)
        data = json.loads(data)
        region_list = data['BizAreaList']
        category_list = data['CateList'][0]['subCategories']

        for category in category_list:
            if category['name'] == u'全部':
                continue
            for region in region_list:
                if region['name'] == u'全城':
                    continue
                for area in region['subareas']:
                    if area['name'] == u'全部':
                        continue
                    item = MeituanItem()
                    item['province'], item['city'] = [s.split('=')[1] for s in response.xpath('//meta[@name="location"]/@content').extract()[0].split(';')]
                    item['region'] = region['name'].strip()
                    item['area'] = area['name'].strip()
                    item['category'] = category['name'].strip()
                    url = 'http://i.meituan.com/%s?cid=%d&bid=%d&cateType=poi&stid=_b1'%(pinyin.get(item['city']), category['id'], area['id'])

                    yield Request(url,
                            method='GET',
                            meta={'item': item, 'url': url},
                            headers=headers,
                            cookies=None,
                            body=None,
                            priority=0,
                            errback=None,
                            encoding=response.encoding,
                            callback=self.parse_category_area) 
Developer: piaotiejun, Project: restaurant, Lines: 35, Source: meituan.py

Example 7: _is_external_url

# Required module: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def _is_external_url(self, url: str) -> bool:
        return url.startswith('http') or self._has_extension(url) 
Developer: zulip, Project: zulip, Lines: 4, Source: spiders.py

Example 8: __init__

# Required module: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def __init__(self, response: scrapy.http.Response):
        self.response = response 
Developer: scrapinghub, Project: scrapy-poet, Lines: 4, Source: test_middleware.py

Example 9: __call__

# Required module: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def __call__(self):
        data = {
            'product': {
                'url': 'http://example.com/sample',
                'name': 'Sample',
            },
        }
        return DummyProductResponse(data=data) 
Developer: scrapinghub, Project: scrapy-poet, Lines: 10, Source: test_utils.py

Example 10: test_get_callback

# Required module: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def test_get_callback():
    spider = MySpider()

    req = scrapy.Request("http://example.com")
    assert get_callback(req, spider) == spider.parse

    req = scrapy.Request("http://example.com", spider.parse2)
    assert get_callback(req, spider) == spider.parse2

    def cb(response):
        pass

    req = scrapy.Request("http://example.com", cb)
    assert get_callback(req, spider) == cb 
Developer: scrapinghub, Project: scrapy-poet, Lines: 16, Source: test_utils.py

Example 11: test_is_response_going_to_be_used

# Required module: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def test_is_response_going_to_be_used():
    spider = MySpider()

    request = scrapy.Request("http://example.com")
    assert is_response_going_to_be_used(request, spider) is True

    request = scrapy.Request("http://example.com", callback=spider.parse2)
    assert is_response_going_to_be_used(request, spider) is True

    request = scrapy.Request("http://example.com", callback=spider.parse3)
    assert is_response_going_to_be_used(request, spider) is False

    request = scrapy.Request("http://example.com", callback=spider.parse4)
    assert is_response_going_to_be_used(request, spider) is False

    request = scrapy.Request("http://example.com", callback=spider.parse5)
    assert is_response_going_to_be_used(request, spider) is True

    request = scrapy.Request("http://example.com", callback=spider.parse6)
    assert is_response_going_to_be_used(request, spider) is True

    request = scrapy.Request("http://example.com", callback=spider.parse7)
    assert is_response_going_to_be_used(request, spider) is True

    request = scrapy.Request("http://example.com", callback=spider.parse8)
    assert is_response_going_to_be_used(request, spider) is False

    request = scrapy.Request("http://example.com", callback=spider.parse9)
    assert is_response_going_to_be_used(request, spider) is True

    request = scrapy.Request("http://example.com", callback=spider.parse10)
    assert is_response_going_to_be_used(request, spider) is False

    request = scrapy.Request("http://example.com", callback=spider.parse11)
    assert is_response_going_to_be_used(request, spider) is True

    request = scrapy.Request("http://example.com", callback=spider.parse12)
    assert is_response_going_to_be_used(request, spider) is True 
Developer: scrapinghub, Project: scrapy-poet, Lines: 40, Source: test_utils.py

Example 12: start_requests

# Required module: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def start_requests(self):

        for i in range(1, 11):
            url = self.base_url + str(i) + '_1' + self.end_Url
            yield Request(url, self.parse)  # links to each novel category

        yield Request('http://www.23us.com/quanben/1', self.parse)  # link to the completed-novels listing
Developer: yyyy777, Project: crawler, Lines: 9, Source: spider_dingdian.py

Example 13: parse

# Required module: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def parse(self, response):

        max_num = BeautifulSoup(response.text, 'lxml').find(
            'div', class_='pagelink').find_all('a')[-1].get_text()
        baseurl = str(response.url)[:27]
        for num in range(1, int(max_num) + 1):
            if baseurl == 'http://www.23us.com/quanben':
                url = baseurl + '/' + str(num)
            else:
                url = baseurl + '_' + str(num) + self.end_Url
            yield Request(url, callback=self.get_name) 
Developer: yyyy777, Project: crawler, Lines: 13, Source: spider_dingdian.py

Example 14: parse_js_links

# Required module: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def parse_js_links(self, response):
        requests = []
        base_url = AbsUrl(response.url)

        jscall_regex = re.compile(r"\S+\((.*?)\)", re.M | re.S)
        for a in response.xpath('//a'):
            href = a.xpath('@href').extract()
            if not href:
                continue
            href = href[0]
            if 'javascript' not in href:
                continue
            onclick = a.xpath('@onclick').extract()
            if not onclick:
                continue
            matches = jscall_regex.findall(onclick[0])
            if not matches:
                continue
            # The first argument of the js call is taken as the target url.
            jscall_args = matches[0].split(',')
            url = jscall_args[0].strip('\'"')
            if url == '':
                continue
            if not url.startswith('http'):
                url = base_url.extend(url)
            requests.append(Request(url, meta={'js_link': True}, headers={'Referer': response.url}))
            log.debug('adding js url: %s' % url)

        return requests
Developer: amol9, Project: imagebot, Lines: 33, Source: bot.py

Example 15: __init__

# Required module: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def __init__(self, book_url=None, **kw):
        super(FollowAllSpider, self).__init__(**kw)

        url = book_url
        if not url.startswith('http://') and not url.startswith('https://'):
            url = 'http://%s/' % url
        self.url = url
        self.allowed_domains = [re.sub(r'^www\.', '', urlparse(url).hostname)]
        self.link_extractor = LinkExtractor()
        self.cookies_seen = set()
        self.previtem = 0
        self.items = 0
        self.timesec = datetime.datetime.utcnow() 
Developer: scrapy, Project: scrapy-bench, Lines: 15, Source: followall.py


Note: the scrapy.http examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by many developers; copyright of the source code remains with the original authors, and distribution or use must follow each project's license. Do not reproduce without permission.