This article collects typical usage examples of the Python class scrapy.linkextractors.LinkExtractor. If you are unsure what LinkExtractor is for or how to use it, the selected examples below should help.
The following 15 code examples show how LinkExtractor is used in real spiders, roughly ordered by popularity.
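Before the examples, here is a minimal sketch of the pattern they all share: build a LinkExtractor (optionally constrained with allow, deny, restrict_css or restrict_xpaths), call extract_links(response) to get a list of Link objects, and read the url and text attribute of each. The spider name, start URL and allow pattern below are placeholders, not taken from the examples.

from scrapy import Spider, Request
from scrapy.linkextractors import LinkExtractor

class QuotesLinksSpider(Spider):
    # hypothetical spider: name, start_urls and the allow pattern are placeholders
    name = "quotes_links"
    start_urls = ["http://quotes.toscrape.com/"]

    def parse(self, response):
        # extract_links() returns a list of scrapy.link.Link objects,
        # each exposing .url and .text
        le = LinkExtractor(allow=r"/page/\d+/")
        for link in le.extract_links(response):
            yield Request(link.url, callback=self.parse)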
Example 1: parse_sesja
def parse_sesja(self, response):
    # uchwaly (council resolutions)
    uchwaly_le = LinkExtractor(allow=FindReportsSpider.UCHWALA_RE, restrict_xpaths="//table")
    links = uchwaly_le.extract_links(response)
    self.print_links("uchwaly", links)
    cnt = 0
    for link in links:
        yield scrapy.Request(link.url, callback=self.parse_uchwala)
        k = items.PageItem()
        k["text"] = link.text.encode("utf8")
        k["url"] = link.url
        k["ref"] = response.url
        k["order"] = cnt
        yield k
        if cnt >= DEBUG_CNT and DEBUG:
            break
        cnt += 1
    # files: glosowania (votes), obecnosc (attendance)
    le = LinkExtractor(allow=FindReportsSpider.PLIK_RE)
    links = le.extract_links(response)
    self.print_links("glosowania", links)
    cnt = 0
    for link in links:
        fi = items.FiledownloadItem()
        fi["file_urls"] = [link.url]
        fi["text"] = link.text.encode("utf8")
        fi["url"] = link.url
        fi["ref"] = response.url
        fi["order"] = cnt
        yield fi
        if cnt >= DEBUG_CNT and DEBUG:
            break
        cnt += 1
Example 2: parse_state
def parse_state(self, response):
    """ Yields a scrapy.Request object for each city with a store in the state """
    state_url = 'stores.joann.com/{}*'.format(response.meta['state'])
    extractor = LinkExtractor(allow=state_url)
    for link in extractor.extract_links(response):
        yield scrapy.Request(link.url, callback=self.parse_city, headers=HEADERS)
Example 3: parse
def parse(self, response):
    le = LinkExtractor()
    for link in le.extract_links(response):
        yield scrapy.Request(link.url, self.parse_link, meta={
            'splash': {
                'args': {'har': 1, 'html': 0},
            }
        })
Example 4: parse_link
def parse_link(self, response):
    # log
    self.logger.info('Hi, this is an item page! %s', response.url)
    # parse links
    linkExtractor = LinkExtractor(allow=r".+\.shtml", restrict_css='div.list > ul', unique=True)
    links = linkExtractor.extract_links(response)
    for link in links:
        yield scrapy.Request(link.url, callback=self.parse_content)
Example 5: parse
def parse(self, response):
    extractor = LinkExtractor(allow="/article/*")
    links = extractor.extract_links(response)
    for link in links:
        item = XiubaiItem()
        req = Request(link.url, self.parse_detail_page)
        req.meta['item'] = item
        yield req
Example 6: parse
def parse(self, response):
    e = LinkExtractor()
    urls = [link.url for link in e.extract_links(response)]
    for url in urls:
        parsed = urlparse.urlsplit(url)
        qs = urlparse.parse_qs(parsed.query)
        if qs and 'Url' in qs:
            event_url = qs['Url'][0]
            yield self.add_url(event_url)
Example 7: parse
def parse(self, response):
    if response.status != 200 or response.body == "":
        return
    ads_links = response.xpath("//a[img]")
    for ads_link in ads_links:
        link_href = ads_link.xpath("@href").extract_first()
        if self._from_same_site(response.url, link_href):
            continue
        ads_profile = AdsProfileItem()
        ads_profile["ads_host"] = response.url
        ads_profile["ads_present_mode"] = "normal_1"
        ads_profile["ads_target_url"] = link_href
        img_src = response.urljoin(ads_link.xpath("img/@src").extract_first())
        ads_profile["ads_content_url"] = img_src
        ads_profile["ads_content_frame"] = ""
        ads_profile["ads_host_domain"] = urlparse(response.url).netloc
        ads_profile["ads_target_domain"] = urlparse(link_href).netloc
        yield ads_profile
    if isinstance(response, SplashJsonResponse):
        if "childFrames" in response.data:
            frames = self._get_all_child_frames(response)
            print "Get %s childFrames in %s" % (len(frames), response.url)
            for frame_response in frames:
                if not self._is_valid_frame(frame_response.url):
                    continue
                ads_links = frame_response.xpath("//a[img]")
                for ads_link in ads_links:
                    link_href = ads_link.xpath("@href").extract_first()
                    if self._from_same_site(response.url, link_href):
                        continue
                    ads_profile = AdsProfileItem()
                    ads_profile["ads_host"] = response.url
                    ads_profile["ads_present_mode"] = "normal_1"
                    ads_profile["ads_target_url"] = link_href
                    img_src = frame_response.urljoin(ads_link.xpath("img/@src").extract_first())
                    ads_profile["ads_content_url"] = img_src
                    ads_profile["ads_content_frame"] = frame_response.url
                    ads_profile["ads_host_domain"] = urlparse(response.url).netloc
                    ads_profile["ads_target_domain"] = urlparse(link_href).netloc
                    yield ads_profile
    link_extractor = LinkExtractor()
    all_links = link_extractor.extract_links(response)
    for link in all_links:
        request = SplashRequest(
            response.urljoin(link.url),
            self.parse,
            endpoint="render.json",
            slot_policy=SlotPolicy.PER_DOMAIN,
            args={"html": 1, "iframes": 1},
        )
        request.headers.setdefault("User-Agent", self.ua_generater.get_user_agent())
        yield request
Example 8: parse
def parse(self, response):
    le = LinkExtractor()
    user_profiles = []
    for link in le.extract_links(response):
        result = re.search(r'.*(http://www.last.fm/user/.*)', link.url)
        if result:
            user_profiles.append(result.group(1))
    for user_profile in user_profiles:
        print user_profile
Example 9: parse_code
def parse_code(self, response):
    # extract the URL of the source code file
    # le = LinkExtractor(restrict_css='div.bodywrapper p', allow='matplotlib.org/examples')
    # link = le.extract_links(response)
    le = LinkExtractor(restrict_css='a.reference.external')
    link = le.extract_links(response)
    file = FilesItem()
    file['file_urls'] = [link[0].url]
    return file
Example 10: parse
def parse(self, response):
    link_extractor = LinkExtractor()
    links = link_extractor.extract_links(response)
    for link in links:
        item = DomainItem()
        item['link'] = link.url
        item['domain'] = self.getHost(link.url)
        yield item
    for link in links:
        if not db.scrapy_items.find_one({'link': link.url}):
            yield scrapy.Request(link.url, callback=self.parse)
Example 11: parse
def parse(self, response):
    name = 'example'
    lx = LinkExtractor()
    lst = lx.extract_links(response)  # list of extracted job links
    # Compare lst against MongoDB; compare() returns a Boolean value
    flag = compare(name, lst)
    # if True, call the function which sends an email to users
    if flag:
        notify(name)
    else:
        print("No Update")
Example 12: parse
def parse(self, response):
    le = LinkExtractor()
    for link in le.extract_links(response):
        yield SplashRequest(
            link.url,
            self.parse_link,
            endpoint='render.json',
            args={
                'har': 1,
                'html': 1,
            }
        )
Example 13: parse
def parse(self, response):
    e = LinkExtractor()
    urls = [link.url for link in e.extract_links(response)]
    for url in urls:
        if response.url != url:
            yield self.add_url(url)
    if urls:
        qs = urlparse.parse_qs(urlparse.urlparse(response.url).query)
        qs = dict((k, v[0]) for (k, v) in qs.iteritems())
        qs['p'] = int(qs['p']) + 1
        url = 'http://comeon5678.com/event/list'
        yield scrapy.Request('%s?%s' % (url, urllib.urlencode(qs)))
Example 14: parse
def parse(self, response):
    # extract the link to each book on the listing page
    le = LinkExtractor(restrict_css='article.product_pod h3')
    for link in le.extract_links(response):
        yield scrapy.Request(link.url, callback=self.parse_book)
    # extract the link to the next page
    le = LinkExtractor(restrict_css='ul.pager li.next')
    links = le.extract_links(response)
    if links:
        next_url = links[0].url
        yield scrapy.Request(next_url, callback=self.parse)
Example 15: __init__
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
             tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
             deny_extensions=None):
    # note the trailing comma in ('href',): without it, ('href') is just the string 'href'
    LinkExtractor.__init__(self, allow=allow,
                           deny=deny,
                           allow_domains=allow_domains,
                           deny_domains=deny_domains,
                           restrict_xpaths=restrict_xpaths,
                           tags=tags,
                           attrs=attrs,
                           canonicalize=canonicalize,
                           unique=unique,
                           process_value=self.process_value,
                           deny_extensions=deny_extensions)
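The subclass above only customizes the constructor defaults and delegates process_value to its own method. In practice a LinkExtractor, or a subclass like this one, is usually handed to a CrawlSpider rule rather than called manually. Below is a hedged sketch of that wiring; the spider name, domains and URL pattern are placeholders, not taken from the examples above.

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class CatalogSpider(CrawlSpider):
    # hypothetical spider: name, domains, start URL and allow pattern are placeholders
    name = "catalog"
    allowed_domains = ["example.com"]
    start_urls = ["http://example.com/"]

    rules = (
        # CrawlSpider calls extract_links() for you and schedules the results;
        # follow=True keeps crawling links found on the followed pages
        Rule(LinkExtractor(allow=r"/catalog/"), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        yield {"url": response.url}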