This page collects typical usage examples of the Python method scrapy.linkextractors.LinkExtractor.extract_links. If you have been wondering what exactly LinkExtractor.extract_links does and how to use it in your own code, the hand-picked examples below should help. You can also read more about its containing class, scrapy.linkextractors.LinkExtractor.
The following 15 code examples of LinkExtractor.extract_links are sorted by popularity by default. You can vote for the examples you like or find useful; your ratings help the system recommend better Python examples.
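Before the individual examples, here is a minimal, self-contained sketch of the pattern they all share: build a LinkExtractor, pass the current Response to extract_links(), and iterate over the returned Link objects. The spider name, start URL, and allow pattern are placeholders for illustration, not taken from the examples below.

import scrapy
from scrapy.linkextractors import LinkExtractor

class MinimalSpider(scrapy.Spider):
    # Hypothetical spider used only to illustrate the API; name and start_urls are placeholders.
    name = "minimal"
    start_urls = ["http://quotes.toscrape.com/"]

    def parse(self, response):
        # extract_links() takes a Response and returns a list of scrapy.link.Link objects,
        # each exposing .url and .text attributes.
        le = LinkExtractor(allow=r"/page/\d+/")
        for link in le.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse)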
Example 1: parse_sesja
# Required import: from scrapy.linkextractors import LinkExtractor
# Method used: LinkExtractor.extract_links
def parse_sesja(self, response):
    # resolutions ("uchwaly")
    uchwaly_le = LinkExtractor(allow=FindReportsSpider.UCHWALA_RE, restrict_xpaths="//table")
    links = uchwaly_le.extract_links(response)
    self.print_links("uchwaly", links)
    cnt = 0
    for link in links:
        yield scrapy.Request(link.url, callback=self.parse_uchwala)
        k = items.PageItem()
        k["text"] = link.text.encode("utf8")
        k["url"] = link.url
        k["ref"] = response.url
        k["order"] = cnt
        yield k
        if cnt >= DEBUG_CNT and DEBUG:
            break
        cnt += 1
    # files: voting records and attendance ("glosowania", "obecnosc")
    le = LinkExtractor(allow=FindReportsSpider.PLIK_RE)
    links = le.extract_links(response)
    self.print_links("glosowania", links)
    cnt = 0
    for link in links:
        fi = items.FiledownloadItem()
        fi["file_urls"] = [link.url]
        fi["text"] = link.text.encode("utf8")
        fi["url"] = link.url
        fi["ref"] = response.url
        fi["order"] = cnt
        yield fi
        if cnt >= DEBUG_CNT and DEBUG:
            break
        cnt += 1
Example 2: parse
# Required import: from scrapy.linkextractors import LinkExtractor
# Method used: LinkExtractor.extract_links
def parse(self, response):
    # Extract the link to each book on the listing page
    le = LinkExtractor(restrict_css='article.product_pod h3')
    for link in le.extract_links(response):
        yield scrapy.Request(link.url, callback=self.parse_book)
    # Extract the link to the next page
    le = LinkExtractor(restrict_css='ul.pager li.next')
    links = le.extract_links(response)
    if links:
        next_url = links[0].url
        yield scrapy.Request(next_url, callback=self.parse)
Example 3: parse_state
# Required import: from scrapy.linkextractors import LinkExtractor
# Method used: LinkExtractor.extract_links
def parse_state(self, response):
    """Yields a scrapy.Request object for each city with a store in the state."""
    state_url = 'stores.joann.com/{}*'.format(response.meta['state'])
    extractor = LinkExtractor(allow=state_url)
    for link in extractor.extract_links(response):
        yield scrapy.Request(link.url, callback=self.parse_city, headers=HEADERS)
Example 4: MySpider
# Required import: from scrapy.linkextractors import LinkExtractor
# Method used: LinkExtractor.extract_links
class MySpider(scrapy.Spider):
    # Your spider definition
    name = "fetch_data"

    def __init__(self, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        self.start_urls = [kwargs.get('start_url')]
        self.link_extractor = LinkExtractor()
        urls = self.start_urls

    def parse(self, response):
        item = WebpageScraperItem()
        item['key'] = self.start_urls
        item['title'] = response.xpath('//title/text()').extract()
        item['paragraphs'] = response.xpath('//p/text()').extract()
        item['headings'] = response.xpath('//h1/text()').extract()
        links = self.link_extractor.extract_links(response)
        item['links'] = [x.url for x in links]
        img_urls = []
        img_url = response.xpath('//img/@src').extract()
        for img in img_url:
            # Assumes the Python 2 urlparse module (in Python 3: from urllib import parse as urlparse)
            parse_url = urlparse.urlparse(img)
            parsed_url = parse_url._replace(**{"scheme": "http"})
            img_urls.append(parsed_url.geturl())
        item['image_urls'] = img_urls
        return item
Example 5: BCSpider
# Required import: from scrapy.linkextractors import LinkExtractor
# Method used: LinkExtractor.extract_links
class BCSpider(Spider):
    name = 'bc'

    def __init__(self, *args, **kwargs):
        super(BCSpider, self).__init__(*args, **kwargs)
        self.le = LinkExtractor()

    def parse(self, response):
        if not isinstance(response, HtmlResponse):
            return
        for link in self.le.extract_links(response):
            r = Request(url=link.url)
            r.meta.update(link_text=link.text)
            yield r

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(BCSpider, cls).from_crawler(crawler, *args, **kwargs)
        spider._set_crawler(crawler)
        spider.crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        return spider

    def spider_idle(self):
        self.log("Spider idle signal caught.")
        raise DontCloseSpider
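Example 5 connects to the spider_idle signal and raises DontCloseSpider to keep the spider alive even when its request queue is empty. The snippet omits its imports; assuming standard Scrapy, they would look roughly like the following (a sketch, not taken from the original project):

from scrapy import Request, Spider, signals
from scrapy.exceptions import DontCloseSpider
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor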
Example 6: parse_link
# Required import: from scrapy.linkextractors import LinkExtractor
# Method used: LinkExtractor.extract_links
def parse_link(self, response):
    # log
    self.logger.info('Hi, this is an item page! %s', response.url)
    # parse links
    linkExtractor = LinkExtractor(allow=r".+\.shtml", restrict_css='div.list > ul', unique=True)
    links = linkExtractor.extract_links(response)
    for link in links:
        yield scrapy.Request(link.url, callback=self.parse_content)
Example 7: parse
# Required import: from scrapy.linkextractors import LinkExtractor
# Method used: LinkExtractor.extract_links
def parse(self, response):
    extractor = LinkExtractor(allow="/article/*")
    links = extractor.extract_links(response)
    for link in links:
        item = XiubaiItem()
        req = Request(link.url, self.parse_detail_page)
        req.meta['item'] = item
        yield req
Example 8: parse
# Required import: from scrapy.linkextractors import LinkExtractor
# Method used: LinkExtractor.extract_links
def parse(self, response):
    le = LinkExtractor()
    for link in le.extract_links(response):
        yield scrapy.Request(link.url, self.parse_link, meta={
            'splash': {
                'args': {'har': 1, 'html': 0},
            }
        })
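Example 8 drives Splash through the raw meta['splash'] API of scrapy-splash. Assuming scrapy-splash is installed and configured, a roughly equivalent request can be written with the SplashRequest helper that Example 15 uses (a sketch, not part of the original code):

from scrapy_splash import SplashRequest

# render.json matches the endpoint the raw meta API uses by default
yield SplashRequest(link.url, self.parse_link, endpoint='render.json', args={'har': 1, 'html': 0})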
Example 9: parse
# Required import: from scrapy.linkextractors import LinkExtractor
# Method used: LinkExtractor.extract_links
def parse(self, response):
    if response.status != 200 or response.body == "":
        return
    ads_links = response.xpath("//a[img]")
    for ads_link in ads_links:
        link_href = ads_link.xpath("@href").extract_first()
        if self._from_same_site(response.url, link_href):
            continue
        ads_profile = AdsProfileItem()
        ads_profile["ads_host"] = response.url
        ads_profile["ads_present_mode"] = "normal_1"
        ads_profile["ads_target_url"] = link_href
        img_src = response.urljoin(ads_link.xpath("img/@src").extract_first())
        ads_profile["ads_content_url"] = img_src
        ads_profile["ads_content_frame"] = ""
        ads_profile["ads_host_domain"] = urlparse(response.url).netloc
        ads_profile["ads_target_domain"] = urlparse(link_href).netloc
        yield ads_profile
    if isinstance(response, SplashJsonResponse):
        if "childFrames" in response.data:
            frames = self._get_all_child_frames(response)
            print("Get %s childFrames in %s" % (len(frames), response.url))
            for frame_response in frames:
                if not self._is_valid_frame(frame_response.url):
                    continue
                ads_links = frame_response.xpath("//a[img]")
                for ads_link in ads_links:
                    link_href = ads_link.xpath("@href").extract_first()
                    if self._from_same_site(response.url, link_href):
                        continue
                    ads_profile = AdsProfileItem()
                    ads_profile["ads_host"] = response.url
                    ads_profile["ads_present_mode"] = "normal_1"
                    ads_profile["ads_target_url"] = link_href
                    img_src = frame_response.urljoin(ads_link.xpath("img/@src").extract_first())
                    ads_profile["ads_content_url"] = img_src
                    ads_profile["ads_content_frame"] = frame_response.url
                    ads_profile["ads_host_domain"] = urlparse(response.url).netloc
                    ads_profile["ads_target_domain"] = urlparse(link_href).netloc
                    yield ads_profile
    link_extractor = LinkExtractor()
    all_links = link_extractor.extract_links(response)
    for link in all_links:
        request = SplashRequest(
            response.urljoin(link.url),
            self.parse,
            endpoint="render.json",
            slot_policy=SlotPolicy.PER_DOMAIN,
            args={"html": 1, "iframes": 1},
        )
        request.headers.setdefault("User-Agent", self.ua_generater.get_user_agent())
        yield request
Example 10: parse
# Required import: from scrapy.linkextractors import LinkExtractor
# Method used: LinkExtractor.extract_links
def parse(self, response):
    e = LinkExtractor()
    urls = [link.url for link in e.extract_links(response)]
    for url in urls:
        parsed = urlparse.urlsplit(url)
        qs = urlparse.parse_qs(parsed.query)
        if qs and 'Url' in qs:
            event_url = qs['Url'][0]
            yield self.add_url(event_url)
Example 11: parse
# Required import: from scrapy.linkextractors import LinkExtractor
# Method used: LinkExtractor.extract_links
def parse(self, response):
    le = LinkExtractor()
    user_profiles = []
    for link in le.extract_links(response):
        result = re.search(r'.*(http://www.last.fm/user/.*)', link.url)
        if result:
            user_profiles.append(result.group(1))
    for user_profile in user_profiles:
        print(user_profile)
Example 12: parse_code
# Required import: from scrapy.linkextractors import LinkExtractor
# Method used: LinkExtractor.extract_links
def parse_code(self, response):
    # Extract the URL of the source-code file
    # le = LinkExtractor(restrict_css='div.bodywrapper p', allow='matplotlib.org/examples')
    # link = le.extract_links(response)
    le = LinkExtractor(restrict_css='a.reference.external')
    link = le.extract_links(response)
    file = FilesItem()
    file['file_urls'] = [link[0].url]
    return file
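Like Example 1, Example 12 fills a file_urls field. That is the field Scrapy's built-in FilesPipeline reads, so downloading the extracted files only requires enabling the pipeline in the project settings. A minimal settings sketch, assuming the default field names and a local storage directory (the path is a placeholder):

# settings.py (sketch)
ITEM_PIPELINES = {'scrapy.pipelines.files.FilesPipeline': 1}
FILES_STORE = 'downloaded_files'  # placeholder local directory for downloaded files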
Example 13: parse
# Required import: from scrapy.linkextractors import LinkExtractor
# Method used: LinkExtractor.extract_links
def parse(self, response):
    name = 'example'
    lx = LinkExtractor()
    lst = lx.extract_links(response)  # list of extracted job links
    # Compare lst against MongoDB; returns a boolean
    flag = compare(name, lst)
    # If True, send a notification email to users
    if flag:
        notify(name)
    else:
        print("No Update")
Example 14: parse
# Required import: from scrapy.linkextractors import LinkExtractor
# Method used: LinkExtractor.extract_links
def parse(self, response):
    link_extractor = LinkExtractor()
    links = link_extractor.extract_links(response)
    for link in links:
        item = DomainItem()
        item['link'] = link.url
        item['domain'] = self.getHost(link.url)
        yield item
    for link in links:
        if not db.scrapy_items.find_one({'link': link.url}):
            yield scrapy.Request(link.url, callback=self.parse)
Example 15: parse
# Required import: from scrapy.linkextractors import LinkExtractor
# Method used: LinkExtractor.extract_links
def parse(self, response):
    le = LinkExtractor()
    for link in le.extract_links(response):
        yield SplashRequest(
            link.url,
            self.parse_link,
            endpoint='render.json',
            args={
                'har': 1,
                'html': 1,
            }
        )