

Python LinkExtractor.extract_links Method Code Examples

This article collects typical usage examples of the Python method scrapy.linkextractors.LinkExtractor.extract_links. If you are wondering what exactly LinkExtractor.extract_links does or how to use it, the curated examples below should help. You can also explore further usage examples of scrapy.linkextractors.LinkExtractor, the class this method belongs to.


The following presents 15 code examples of LinkExtractor.extract_links, ordered by popularity by default.
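Before turning to the examples, here is a minimal sketch of the pattern they all share: construct a LinkExtractor, call its extract_links() method on a response object, and iterate over the Link objects it returns. This sketch is illustrative only and is not taken from any of the projects below; the spider name and start URL are placeholders.

import scrapy
from scrapy.linkextractors import LinkExtractor

class DemoSpider(scrapy.Spider):
    # Hypothetical spider; name and start_urls are placeholders
    name = "demo"
    start_urls = ["http://books.toscrape.com/"]

    def parse(self, response):
        le = LinkExtractor()  # with no arguments, every link on the page is extracted
        for link in le.extract_links(response):
            # extract_links() returns Link objects carrying the absolute URL
            # and the anchor text of each matched <a> element
            self.logger.info("%s -> %s", link.text, link.url)

Most of the examples below refine this pattern through LinkExtractor's filtering arguments, such as allow (a regular expression the URL must match), restrict_css, and restrict_xpaths (regions of the page to extract links from).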

Example 1: parse_sesja

# Required import: from scrapy.linkextractors import LinkExtractor
    def parse_sesja(self, response):
        # uchwały (resolutions)
        uchwaly_le = LinkExtractor(allow=FindReportsSpider.UCHWALA_RE, restrict_xpaths="//table")
        links = uchwaly_le.extract_links(response)
        self.print_links("uchwaly", links)
        cnt = 0
        for link in links:
            yield scrapy.Request(link.url, callback=self.parse_uchwala)
            k = items.PageItem()
            k["text"] = link.text.encode("utf8")
            k["url"] = link.url
            k["ref"] = response.url
            k["order"] = cnt
            yield k
            if cnt >= DEBUG_CNT and DEBUG:
                break
            cnt += 1

        # files: głosowania (voting records), obecność (attendance)
        le = LinkExtractor(allow=FindReportsSpider.PLIK_RE)
        links = le.extract_links(response)
        self.print_links("glosowania", links)
        cnt = 0
        for link in links:
            fi = items.FiledownloadItem()
            fi["file_urls"] = [link.url]
            fi["text"] = link.text.encode("utf8")
            fi["url"] = link.url
            fi["ref"] = response.url
            fi["order"] = cnt
            yield fi
            if cnt >= DEBUG_CNT and DEBUG:
                break
            cnt += 1
Author: orian, Project: umo, Lines: 36, Source: find_reports.py

Example 2: parse

# Required import: from scrapy.linkextractors import LinkExtractor
    def parse(self, response):
        # Extract the link to each book on the listing page
        le = LinkExtractor(restrict_css='article.product_pod h3')
        for link in le.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse_book)

        # Extract the link to the next page
        le = LinkExtractor(restrict_css='ul.pager li.next')
        links = le.extract_links(response)
        if links:
            next_url = links[0].url
            yield scrapy.Request(next_url, callback=self.parse)
Author: daguanqiao, Project: gitt1, Lines: 14, Source: book.py

Example 3: parse_state

# Required import: from scrapy.linkextractors import LinkExtractor
    def parse_state(self, response):
        """ Yields a scrapy.Request object for each city with a store in the state """
        state_url = 'stores.joann.com/{}*'.format(response.meta['state'])
        extractor = LinkExtractor(allow=state_url)

        for link in extractor.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse_city, headers=HEADERS)
Author: iandees, Project: all-the-places, Lines: 9, Source: joann_fabrics.py

Example 4: MySpider

# Required import: from scrapy.linkextractors import LinkExtractor
class MySpider(scrapy.Spider):
    # Your spider definition
    name = "fetch_data"

    def __init__(self, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        self.start_urls = [kwargs.get('start_url')]
        self.link_extractor = LinkExtractor()

    def parse(self, response):
        item = WebpageScraperItem()

        item['key'] = self.start_urls
        item['title'] = response.xpath('//title/text()').extract()
        item['paragraphs'] = response.xpath('//p/text()').extract()
        item['headings'] = response.xpath('//h1/text()').extract()

        links = self.link_extractor.extract_links(response)
        item['links'] = [x.url for x in links]

        # Rewrite every image URL to use the http scheme
        # (urlparse here is the Python 2 urlparse module)
        img_urls = []
        for img in response.xpath('//img/@src').extract():
            parsed_url = urlparse.urlparse(img)._replace(scheme="http")
            img_urls.append(parsed_url.geturl())

        item['image_urls'] = img_urls
        return item
Author: TanvirMahmudEmon, Project: scrapage, Lines: 32, Source: __init__.py

Example 5: BCSpider

# Required import: from scrapy.linkextractors import LinkExtractor
class BCSpider(Spider):
    name = 'bc'

    def __init__(self, *args, **kwargs):
        super(BCSpider, self).__init__(*args, **kwargs)
        self.le = LinkExtractor()

    def parse(self, response):
        if not isinstance(response, HtmlResponse):
            return

        for link in self.le.extract_links(response):
            r = Request(url=link.url)
            r.meta.update(link_text=link.text)
            yield r

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(BCSpider, cls).from_crawler(crawler, *args, **kwargs)
        spider._set_crawler(crawler)
        spider.crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        return spider

    def spider_idle(self):
        self.log("Spider idle signal caught.")
        raise DontCloseSpider
Author: scrapinghub, Project: frontera, Lines: 28, Source: bc.py

Example 6: parse_link

# Required import: from scrapy.linkextractors import LinkExtractor
    def parse_link(self, response):
        # log
        self.logger.info('Hi, this is an item page! %s', response.url)
        # parse links
        linkExtractor = LinkExtractor(allow=r".+\.shtml", restrict_css='div.list > ul', unique=True)
        links = linkExtractor.extract_links(response)
        for link in links:
            yield scrapy.Request(link.url, callback=self.parse_content)
Author: bdtgzj, Project: learning-git, Lines: 10, Source: sipf_investor_fund.py

Example 7: parse

# Required import: from scrapy.linkextractors import LinkExtractor
    def parse(self, response):
        # allow is a regex: follow any URL under /article/
        extractor = LinkExtractor(allow=r"/article/.*")
        links = extractor.extract_links(response)
        for link in links:
            item = XiubaiItem()
            req = Request(link.url, self.parse_detail_page)
            req.meta['item'] = item
            yield req
Author: lijunchao16, Project: scrapy, Lines: 10, Source: indexpage.py

Example 8: parse

# Required import: from scrapy.linkextractors import LinkExtractor
    def parse(self, response):
        le = LinkExtractor()
        for link in le.extract_links(response):
            yield scrapy.Request(link.url, self.parse_link, meta={
                'splash': {
                    'args': {'har': 1, 'html': 0},
                }
            })
Author: Sunil-Cube, Project: scrapy-splash-1, Lines: 10, Source: dmoz.py

Example 9: parse

# Required import: from scrapy.linkextractors import LinkExtractor
    def parse(self, response):
        if response.status != 200 or response.body == "":
            return

        ads_links = response.xpath("//a[img]")
        for ads_link in ads_links:
            link_href = ads_link.xpath("@href").extract_first()
            if self._from_same_site(response.url, link_href):
                continue

            ads_profile = AdsProfileItem()
            ads_profile["ads_host"] = response.url
            ads_profile["ads_present_mode"] = "normal_1"
            ads_profile["ads_target_url"] = link_href
            img_src = response.urljoin(ads_link.xpath("img/@src").extract_first())
            ads_profile["ads_content_url"] = img_src
            ads_profile["ads_content_frame"] = ""
            ads_profile["ads_host_domain"] = urlparse(response.url).netloc
            ads_profile["ads_target_domain"] = urlparse(link_href).netloc
            yield ads_profile

        if isinstance(response, SplashJsonResponse):
            if "childFrames" in response.data:
                frames = self._get_all_child_frames(response)
                print "Get %s childFrames in %s" % (len(frames), response.url)
                for frame_response in frames:
                    if not self._is_valid_frame(frame_response.url):
                        continue
                    ads_links = frame_response.xpath("//a[img]")
                    for ads_link in ads_links:
                        link_href = ads_link.xpath("@href").extract_first()
                        if self._from_same_site(response.url, link_href):
                            continue

                        ads_profile = AdsProfileItem()
                        ads_profile["ads_host"] = response.url
                        ads_profile["ads_present_mode"] = "normal_1"
                        ads_profile["ads_target_url"] = link_href
                        img_src = frame_response.urljoin(ads_link.xpath("img/@src").extract_first())
                        ads_profile["ads_content_url"] = img_src
                        ads_profile["ads_content_frame"] = frame_response.url
                        ads_profile["ads_host_domain"] = urlparse(response.url).netloc
                        ads_profile["ads_target_domain"] = urlparse(link_href).netloc
                        yield ads_profile

        link_extractor = LinkExtractor()
        all_links = link_extractor.extract_links(response)
        for link in all_links:
            request = SplashRequest(
                response.urljoin(link.url),
                self.parse,
                endpoint="render.json",
                slot_policy=SlotPolicy.PER_DOMAIN,
                args={"html": 1, "iframes": 1},
            )
            request.headers.setdefault("User-Agent", self.ua_generater.get_user_agent())
            yield request
Author: yuanbei, Project: adspider, Lines: 59, Source: ads_profile_spider.py

Example 10: parse

# Required import: from scrapy.linkextractors import LinkExtractor
    def parse(self, response):
        # urlparse here is the Python 2 urlparse module
        e = LinkExtractor()
        urls = [link.url for link in e.extract_links(response)]
        for url in urls:
            parsed = urlparse.urlsplit(url)
            qs = urlparse.parse_qs(parsed.query)
            if qs and 'Url' in qs:
                event_url = qs['Url'][0]
                yield self.add_url(event_url)
Author: DanceDeets, Project: dancedeets-server, Lines: 11, Source: bboybattles.py

Example 11: parse

# Required import: from scrapy.linkextractors import LinkExtractor
    def parse(self, response):
        le = LinkExtractor()
        user_profiles = []
        for link in le.extract_links(response):
            result = re.search(r'.*(http://www.last.fm/user/.*)', link.url)
            if result:
                user_profiles.append(result.group(1))

        for user_profile in user_profiles:
            print(user_profile)
Author: denholms, Project: track-classifier, Lines: 12, Source: spider.py

Example 12: parse_code

# Required import: from scrapy.linkextractors import LinkExtractor
    def parse_code(self, response):
        # Extract the URL of the source code file
#        le = LinkExtractor(restrict_css='div.bodywrapper p', allow='matplotlib.org/examples')
#        link = le.extract_links(response)
        le = LinkExtractor(restrict_css='a.reference.external')
        link = le.extract_links(response)
        
        file = FilesItem()
        file['file_urls'] = [link[0].url]
        return file
Author: daguanqiao, Project: gitt1, Lines: 12, Source: filesDown.py

Example 13: parse

# Required import: from scrapy.linkextractors import LinkExtractor
    def parse(self, response):
        name = 'example'
        lx = LinkExtractor()
        lst = lx.extract_links(response)  # list of extracted job links
        # Compare lst against the jobs stored in MongoDB; returns a boolean
        flag = compare(name, lst)
        # If there is an update, send an email notification to users
        if flag:
            notify(name)
        else:
            print("No Update")
Author: WHYjun, Project: job-search-bot, Lines: 13, Source: example.py

Example 14: parse

# Required import: from scrapy.linkextractors import LinkExtractor
    def parse(self, response):
        link_extractor = LinkExtractor()
        links = link_extractor.extract_links(response)
        for link in links:
            item = DomainItem()
            item['link'] = link.url
            item['domain'] = self.getHost(link.url)
            yield item
        for link in links:
            if not db.scrapy_items.find_one({'link': link.url}):
                yield scrapy.Request(link.url, callback=self.parse)
Author: freskyme, Project: python-study-demo, Lines: 13, Source: host_spider.py

Example 15: parse

# Required import: from scrapy.linkextractors import LinkExtractor
    def parse(self, response):
        le = LinkExtractor()
        for link in le.extract_links(response):
            yield SplashRequest(
                link.url,
                self.parse_link,
                endpoint='render.json',
                args={
                    'har': 1,
                    'html': 1,
                }
            )
Author: AllenCHM, Project: scrapy-splash, Lines: 14, Source: dmoz.py


Note: The scrapy.linkextractors.LinkExtractor.extract_links examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective developers, and copyright in the source code remains with the original authors. For distribution and use, refer to the corresponding project's license; do not reproduce without permission.