This article collects typical code examples of the scrapy.http module in Python. If you are wondering how scrapy.http is used in practice, or what real code that relies on it looks like, the curated examples below may help. You can also explore further usage examples of its parent package, scrapy.
The 15 code examples of scrapy.http shown below are sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
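Before diving into the examples, here is a minimal, self-contained sketch of the scrapy.http building blocks they all rely on (Request, FormRequest, Response). The spider name and URLs below are placeholders rather than code taken from any of the examples.

import scrapy
from scrapy.http import FormRequest, Request


class DemoSpider(scrapy.Spider):
    # Hypothetical spider used only to illustrate the scrapy.http basics.
    name = 'demo'
    start_urls = ['http://example.com']

    def parse(self, response):
        # parse() receives a scrapy.http.Response with status, headers, body, url, ...
        self.logger.info('status=%s url=%s', response.status, response.url)

        # Schedule a follow-up GET request handled by another callback.
        yield Request('http://example.com/page2', callback=self.parse_page)

        # Schedule a POST request, e.g. to submit a form or hit a JSON endpoint.
        yield FormRequest('http://example.com/search',
                          formdata={'q': 'scrapy'},
                          callback=self.parse_page)

    def parse_page(self, response):
        # Extract data from the Response with CSS/XPath selectors.
        yield {'title': response.css('title::text').get()}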
Example 1: parse

# Required module import: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def parse(self, response: Response) -> Iterator[Request]:
    self.log(response)
    if getattr(self, 'validate_html', False):
        yield Request(
            'http://127.0.0.1:9988/?out=json',
            method='POST',
            headers={'Content-Type': response.headers['Content-Type']},
            body=response.body,
            callback=self._vnu_callback(response.url),
            errback=self.error_callback,
        )
    for link in LxmlLinkExtractor(deny_domains=self.deny_domains, deny_extensions=['doc'],
                                  tags=self.tags, attrs=self.attrs, deny=self.deny,
                                  canonicalize=False).extract_links(response):
        yield from self._make_requests(link.url)
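_vnu_callback is not shown in this example; it is presumably a factory that builds a per-URL callback for the v.Nu validator's JSON output. A rough, hypothetical sketch of what such a helper could look like (the message filtering and logging are assumptions, not the original code):

import json

def _vnu_callback(self, source_url):
    # Hypothetical: return a closure that logs validator messages for source_url.
    def callback(response):
        for message in json.loads(response.text).get('messages', []):
            self.logger.warning('vnu [%s] %s: %s',
                                source_url, message.get('type'), message.get('message'))
    return callback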
Example 2: parse

# Required module import: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def parse(self, response):
    content_list = json.loads(response.body)
    for i in range(len(content_list)):
        # Create a fresh item for each article instead of reusing one instance.
        jianshu_user_article_item = JianShuUserArticleItem()
        url = content_list[i]['object']['data']['slug']
        url = 'http://www.jianshu.com/p/{}'.format(url)
        title = content_list[i]['object']['data']['title']
        picture_url = content_list[i]['object']['data']['list_image_url']
        day = content_list[i]['object']['data']['first_shared_at']
        day = day.replace('T', ' ')[:19]
        info_time = parser.parse(day)
        jianshu_user_article_item['pictureUrls'] = picture_url
        jianshu_user_article_item['title'] = title
        jianshu_user_article_item['url'] = url
        jianshu_user_article_item['publishTime'] = info_time
        yield jianshu_user_article_item
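The JianShuUserArticleItem class is defined elsewhere in that project. Based on the fields assigned above, a minimal sketch of an equivalent scrapy.Item could look like this (the field names follow the example; the class body itself is an assumption):

import scrapy

class JianShuUserArticleItem(scrapy.Item):
    # Fields inferred from the assignments in Example 2.
    pictureUrls = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
    publishTime = scrapy.Field()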
Example 3: __proxy

# Required module import: import scrapy [as alias]
# Or: from scrapy import http [as alias]
# Note: this example is Python 2 code (urllib2 and print statements).
def __proxy(self, line):
    # print line
    if '//' in line:
        p = "%s" % line
        htt = line.split(':')[0]
    else:
        p = "http://%s" % line
        htt = "http"
    h = urllib2.ProxyHandler({htt: p})
    o = urllib2.build_opener(h, urllib2.HTTPHandler)
    try:
        r = o.open("http://www.baidu.com/", timeout=3)
        if len(r.read()) > 10:
            return p
        else:
            print "[!] {%s} NONO !" % p
    except:
        print "[!] {%s} NONO !" % p
Example 4: parse

# Required module import: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def parse(self, response):
    if 'kuaidaili.com' in response.url:
        for d in range(1, 11):
            new_url = "http://www.kuaidaili.com/proxylist/%d/" % d
            yield Request(url=new_url, callback=self.fllow_parse)
    if 'cn-proxy.com' in response.url:
        yield Request(url='http://www.cn-proxy.com/', callback=self.fllow_parse)
    if 'proxy-list.org' in response.url:
        for i in range(1, 11):
            pa_url = "http://proxy-list.org/english/index.php?p=%d" % i
            yield Request(url=pa_url, callback=self.fllow_parse)
    if "vipiu.net" in response.url:
        for n in range(2, 5):
            if n == 1:  # unreachable: n starts at 2
                url_v = "http://vipiu.net/free/mianfeidaili/2014/04/27/42417.html"
            else:
                url_v = "http://vipiu.net/free/mianfeidaili/2014/04/27/42417_%d.html" % n
            yield Request(url=url_v, callback=self.fllow_parse)
Example 5: __init__

# Required module import: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def __init__(self, url=None, *args, **kwargs):
    super(RedSpider, self).__init__(*args, **kwargs)
    # self.allowed_domains = [url]
    # self.start_urls = ["http://" + url]
    input = kwargs.get('urls', '').split(',') or []
    self.allowed_domains = input
    self.start_urls = ["http://" + input[0]]
Example 6: parse_start_url

# Required module import: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def parse_start_url(self, response):
    js = response.xpath('//script[@type="commment"]').extract()[0]
    data = re.search(r'\{[\s\S]*\}', js).group(0)
    data = json.loads(data)
    region_list = data['BizAreaList']
    category_list = data['CateList'][0]['subCategories']
    for category in category_list:
        if category['name'] == u'全部':  # skip the "All" pseudo-category
            continue
        for region in region_list:
            if region['name'] == u'全城':  # skip the "whole city" pseudo-region
                continue
            for area in region['subareas']:
                if area['name'] == u'全部':  # skip the "All" pseudo-area
                    continue
                item = MeituanItem()
                item['province'], item['city'] = [s.split('=')[1] for s in response.xpath('//meta[@name="location"]/@content').extract()[0].split(';')]
                item['region'] = region['name'].strip()
                item['area'] = area['name'].strip()
                item['category'] = category['name'].strip()
                url = 'http://i.meituan.com/%s?cid=%d&bid=%d&cateType=poi&stid=_b1' % (pinyin.get(item['city']), category['id'], area['id'])
                yield Request(url,
                              method='GET',
                              meta={'item': item, 'url': url},
                              headers=headers,
                              cookies=None,
                              body=None,
                              priority=0,
                              errback=None,
                              encoding=response.encoding,
                              callback=self.parse_category_area)
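The partially filled item is handed to the next callback through the request's meta dict. A minimal sketch of how parse_category_area presumably picks it back up (everything beyond retrieving the item is an assumption):

def parse_category_area(self, response):
    # Retrieve the partially filled item passed along in Request.meta.
    item = response.meta['item']
    # ... fill in the remaining fields from this category/area listing page ...
    yield item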
Example 7: _is_external_url

# Required module import: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def _is_external_url(self, url: str) -> bool:
    return url.startswith('http') or self._has_extension(url)
Example 8: __init__

# Required module import: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def __init__(self, response: scrapy.http.Response):
    self.response = response
Example 9: __call__

# Required module import: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def __call__(self):
    data = {
        'product': {
            'url': 'http://example.com/sample',
            'name': 'Sample',
        },
    }
    return DummyProductResponse(data=data)
Example 10: test_get_callback

# Required module import: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def test_get_callback():
    spider = MySpider()
    req = scrapy.Request("http://example.com")
    assert get_callback(req, spider) == spider.parse
    req = scrapy.Request("http://example.com", spider.parse2)
    assert get_callback(req, spider) == spider.parse2

    def cb(response):
        pass

    req = scrapy.Request("http://example.com", cb)
    assert get_callback(req, spider) == cb
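get_callback itself is not shown here; judging by the assertions, it resolves a request's effective callback, falling back to the spider's parse method when none was set. A plausible sketch, not necessarily the original implementation:

def get_callback(request, spider):
    # A Request created without an explicit callback is handled by spider.parse.
    return request.callback or spider.parse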
Example 11: test_is_response_going_to_be_used

# Required module import: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def test_is_response_going_to_be_used():
    spider = MySpider()
    request = scrapy.Request("http://example.com")
    assert is_response_going_to_be_used(request, spider) is True
    request = scrapy.Request("http://example.com", callback=spider.parse2)
    assert is_response_going_to_be_used(request, spider) is True
    request = scrapy.Request("http://example.com", callback=spider.parse3)
    assert is_response_going_to_be_used(request, spider) is False
    request = scrapy.Request("http://example.com", callback=spider.parse4)
    assert is_response_going_to_be_used(request, spider) is False
    request = scrapy.Request("http://example.com", callback=spider.parse5)
    assert is_response_going_to_be_used(request, spider) is True
    request = scrapy.Request("http://example.com", callback=spider.parse6)
    assert is_response_going_to_be_used(request, spider) is True
    request = scrapy.Request("http://example.com", callback=spider.parse7)
    assert is_response_going_to_be_used(request, spider) is True
    request = scrapy.Request("http://example.com", callback=spider.parse8)
    assert is_response_going_to_be_used(request, spider) is False
    request = scrapy.Request("http://example.com", callback=spider.parse9)
    assert is_response_going_to_be_used(request, spider) is True
    request = scrapy.Request("http://example.com", callback=spider.parse10)
    assert is_response_going_to_be_used(request, spider) is False
    request = scrapy.Request("http://example.com", callback=spider.parse11)
    assert is_response_going_to_be_used(request, spider) is True
    request = scrapy.Request("http://example.com", callback=spider.parse12)
    assert is_response_going_to_be_used(request, spider) is True
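is_response_going_to_be_used is also defined elsewhere; the pattern of assertions suggests it inspects each callback to decide whether the downloaded response will actually be read. As a loose illustration only, and not the project's implementation, such a check could be approximated by looking for references to the response argument in the callback's source:

import ast
import inspect
import textwrap


def uses_response_arg(callback) -> bool:
    # Rough heuristic: does the callback's body ever reference a name called 'response'?
    source = textwrap.dedent(inspect.getsource(callback))
    func_def = ast.parse(source).body[0]
    return any(isinstance(node, ast.Name) and node.id == 'response'
               for node in ast.walk(func_def))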
Example 12: start_requests

# Required module import: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def start_requests(self):
    for i in range(1, 11):
        url = self.base_url + str(i) + '_1' + self.end_Url
        yield Request(url, self.parse)  # links for each novel category
    yield Request('http://www.23us.com/quanben/1', self.parse)  # link for the completed-novels list
Example 13: parse

# Required module import: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def parse(self, response):
    max_num = BeautifulSoup(response.text, 'lxml').find(
        'div', class_='pagelink').find_all('a')[-1].get_text()
    baseurl = str(response.url)[:27]
    for num in range(1, int(max_num) + 1):
        if baseurl == 'http://www.23us.com/quanben':
            url = baseurl + '/' + str(num)
        else:
            url = baseurl + '_' + str(num) + self.end_Url
        yield Request(url, callback=self.get_name)
Example 14: parse_js_links

# Required module import: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def parse_js_links(self, response):
    requests = []
    base_url = AbsUrl(response.url)
    jscall_regex = re.compile(r"\S+\((.*?)\)", re.M | re.S)
    for a in response.xpath('//a'):
        href = a.xpath('@href').extract()
        if href is not None and len(href) > 0:
            href = href[0]
        else:
            continue
        if href.find('javascript') != -1:
            onclick = a.xpath('@onclick').extract()
            if onclick is not None and len(onclick) > 0:
                onclick = onclick[0]
            else:
                continue
            matches = jscall_regex.findall(onclick)
            if matches:
                jscall = matches[0]
                jscall_args = jscall.split(',')
                url = jscall_args[0].strip('\'').strip('\"')
                if url == '':
                    continue
                if not url.startswith('http'):
                    url = base_url.extend(url)
                requests.append(Request(url, meta={'js_link': True}, headers={'Referer': response.url}))
                log.debug('adding js url: %s' % url)
    return requests
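AbsUrl is a helper class from that project and is not shown; its extend method apparently resolves a relative path against the page URL. A hypothetical stand-in built on urllib.parse.urljoin would behave roughly like this:

from urllib.parse import urljoin


class AbsUrl:
    # Hypothetical replacement for the project's AbsUrl helper.
    def __init__(self, base):
        self.base = base

    def extend(self, path):
        # Resolve a relative path against the base page URL.
        return urljoin(self.base, path)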
Example 15: __init__

# Required module import: import scrapy [as alias]
# Or: from scrapy import http [as alias]
def __init__(self, book_url=None, **kw):
    super(FollowAllSpider, self).__init__(**kw)
    url = book_url
    if not url.startswith('http://') and not url.startswith('https://'):
        url = 'http://%s/' % url
    self.url = url
    self.allowed_domains = [re.sub(r'^www\.', '', urlparse(url).hostname)]
    self.link_extractor = LinkExtractor()
    self.cookies_seen = set()
    self.previtem = 0
    self.items = 0
    self.timesec = datetime.datetime.utcnow()