

Python linkextractors.LinkExtractor Class Code Examples

This article collects typical usage examples of the scrapy.linkextractors.LinkExtractor class in Python. If you have been wondering what exactly the LinkExtractor class does, how to use it, or what it looks like in real code, the hand-picked class examples below should help.


A total of 15 code examples of the LinkExtractor class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
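Before turning to the examples, the following minimal sketch shows the pattern that almost all of them share: build a LinkExtractor (optionally narrowed with allow or restrict_css), call extract_links(response) inside a spider callback, and yield a new Request for each extracted Link object. The spider name, start URL, and selectors here are placeholders chosen for illustration; they are not taken from any of the projects below.

import scrapy
from scrapy.linkextractors import LinkExtractor


class ExampleSpider(scrapy.Spider):
    # Placeholder name and start URL; replace with your own target site.
    name = "linkextractor_demo"
    start_urls = ["http://quotes.toscrape.com/"]

    def parse(self, response):
        # Only follow links inside the main content column whose URL
        # contains "/page/"; both restrictions are purely illustrative.
        le = LinkExtractor(allow=r"/page/", restrict_css="div.col-md-8")
        for link in le.extract_links(response):
            # Each link is a scrapy.link.Link with .url and .text attributes.
            yield scrapy.Request(link.url, callback=self.parse)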

Example 1: parse_sesja

    def parse_sesja(self, response):
        # uchwaly (resolutions)
        uchwaly_le = LinkExtractor(allow=FindReportsSpider.UCHWALA_RE, restrict_xpaths="//table")
        links = uchwaly_le.extract_links(response)
        self.print_links("uchwaly", links)
        cnt = 0
        for link in links:
            yield scrapy.Request(link.url, callback=self.parse_uchwala)
            k = items.PageItem()
            k["text"] = link.text.encode("utf8")
            k["url"] = link.url
            k["ref"] = response.url
            k["order"] = cnt
            yield k
            if cnt >= DEBUG_CNT and DEBUG:
                break
            cnt += 1

        # files: glosowania (votes), obecnosc (attendance)
        le = LinkExtractor(allow=FindReportsSpider.PLIK_RE)
        links = le.extract_links(response)
        self.print_links("glosowania", links)
        cnt = 0
        for link in links:
            fi = items.FiledownloadItem()
            fi["file_urls"] = [link.url]
            fi["text"] = link.text.encode("utf8")
            fi["url"] = link.url
            fi["ref"] = response.url
            fi["order"] = cnt
            yield fi
            if cnt >= DEBUG_CNT and DEBUG:
                break
            cnt += 1
Developer: orian, Project: umo, Lines: 34, Source: find_reports.py

Example 2: parse_state

    def parse_state(self, response):
        """ Yields a scrapy.Request object for each city with a store in the state """
        state_url = 'stores.joann.com/{}*'.format(response.meta['state'])
        extractor = LinkExtractor(allow=state_url)

        for link in extractor.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse_city, headers=HEADERS)
Developer: iandees, Project: all-the-places, Lines: 7, Source: joann_fabrics.py

Example 3: parse

 def parse(self, response):
     le = LinkExtractor()
     for link in le.extract_links(response):
         yield scrapy.Request(link.url, self.parse_link, meta={
             'splash': {
                 'args': {'har': 1, 'html': 0},
             }
         })
Developer: Sunil-Cube, Project: scrapy-splash-1, Lines: 8, Source: dmoz.py

Example 4: parse_link

 def parse_link(self, response):
     # log
     self.logger.info('Hi, this is an item page! %s', response.url)
     # parse link
     linkExtractor = LinkExtractor(allow=r".+\.shtml", restrict_css='div.list > ul', unique=True)
     links = linkExtractor.extract_links(response)
     for link in links:
         yield scrapy.Request(link.url, callback=self.parse_content)
Developer: bdtgzj, Project: learning-git, Lines: 8, Source: sipf_investor_fund.py

Example 5: parse

 def parse(self,response):
     extractor = LinkExtractor(allow="/article/*")
     links = extractor.extract_links(response)
     for link in links:
         item = XiubaiItem()
         req = Request(link.url, self.parse_detail_page)
         req.meta['item'] = item
         yield req
Developer: lijunchao16, Project: scrapy, Lines: 8, Source: indexpage.py

Example 6: parse

 def parse(self, response):
     e = LinkExtractor()
     urls = [link.url for link in e.extract_links(response)]
     for url in urls:
         parsed = urlparse.urlsplit(url)
         qs = urlparse.parse_qs(parsed.query)
         if qs and 'Url' in qs:
             event_url = qs['Url'][0]
             yield self.add_url(event_url)
Developer: DanceDeets, Project: dancedeets-server, Lines: 9, Source: bboybattles.py

Example 7: parse

    def parse(self, response):
        if response.status != 200 or response.body == "":
            return

        ads_links = response.xpath("//a[img]")
        for ads_link in ads_links:
            link_href = ads_link.xpath("@href").extract_first()
            if self._from_same_site(response.url, link_href):
                continue

            ads_profile = AdsProfileItem()
            ads_profile["ads_host"] = response.url
            ads_profile["ads_present_mode"] = "normal_1"
            ads_profile["ads_target_url"] = link_href
            img_src = response.urljoin(ads_link.xpath("img/@src").extract_first())
            ads_profile["ads_content_url"] = img_src
            ads_profile["ads_content_frame"] = ""
            ads_profile["ads_host_domain"] = urlparse(response.url).netloc
            ads_profile["ads_target_domain"] = urlparse(link_href).netloc
            yield ads_profile

        if isinstance(response, SplashJsonResponse):
            if "childFrames" in response.data:
                frames = self._get_all_child_frames(response)
                print "Get %s childFrames in %s" % (len(frames), response.url)
                for frame_response in frames:
                    if not self._is_valid_frame(frame_response.url):
                        continue
                    ads_links = frame_response.xpath("//a[img]")
                    for ads_link in ads_links:
                        link_href = ads_link.xpath("@href").extract_first()
                        if self._from_same_site(response.url, link_href):
                            continue

                        ads_profile = AdsProfileItem()
                        ads_profile["ads_host"] = response.url
                        ads_profile["ads_present_mode"] = "normal_1"
                        ads_profile["ads_target_url"] = link_href
                        img_src = frame_response.urljoin(ads_link.xpath("img/@src").extract_first())
                        ads_profile["ads_content_url"] = img_src
                        ads_profile["ads_content_frame"] = frame_response.url
                        ads_profile["ads_host_domain"] = urlparse(response.url).netloc
                        ads_profile["ads_target_domain"] = urlparse(link_href).netloc
                        yield ads_profile

        link_extractor = LinkExtractor()
        all_links = link_extractor.extract_links(response)
        for link in all_links:
            request = SplashRequest(
                response.urljoin(link.url),
                self.parse,
                endpoint="render.json",
                slot_policy=SlotPolicy.PER_DOMAIN,
                args={"html": 1, "iframes": 1},
            )
            request.headers.setdefault("User-Agent", self.ua_generater.get_user_agent())
            yield request
Developer: yuanbei, Project: adspider, Lines: 57, Source: ads_profile_spider.py

Example 8: parse

    def parse(self, response):
        le = LinkExtractor()
        user_profiles = []
        for link in le.extract_links(response):
            result = re.search(r'.*(http://www.last.fm/user/.*)', link.url)
            if result:
                user_profiles.append(result.group(1))

        for user_profile in user_profiles:
            print(user_profile)
Developer: denholms, Project: track-classifier, Lines: 10, Source: spider.py

Example 9: parse_code

    def parse_code(self, response):
        # extract the URL of the source code file
#        le = LinkExtractor(restrict_css='div.bodywrapper p', allow='matplotlib.org/examples')
#        link = le.extract_links(response)
        le = LinkExtractor(restrict_css='a.reference.external')
        link = le.extract_links(response)

        file = FilesItem()
        file['file_urls'] = [link[0].url]
        return file
Developer: daguanqiao, Project: gitt1, Lines: 10, Source: filesDown.py

Example 10: parse

 def parse(self, response):
     link_extractor = LinkExtractor()
     links = link_extractor.extract_links(response)
     for link in links:
         item = DomainItem()
         item['link'] = link.url
         item['domain'] = self.getHost(link.url)
         yield item
     for link in links:
         if (not db.scrapy_items.find_one({'link': link.url})):
             yield scrapy.Request(link.url, callback=self.parse)
Developer: freskyme, Project: python-study-demo, Lines: 11, Source: host_spider.py

Example 11: parse

 def parse(self, response):
     name = 'example'
     lx = LinkExtractor()
     lst = lx.extract_links(response)  # List contains the list of jobs
     # Call the function which compares between lst and MongoDB. Return Boolean Value
     flag = compare(name, lst)
     # if True, call the function which send an email to users
     if flag:
         notify(name)
     else:
         print("No Update")
Developer: WHYjun, Project: job-search-bot, Lines: 11, Source: example.py

Example 12: parse

 def parse(self, response):
     le = LinkExtractor()
     for link in le.extract_links(response):
         yield SplashRequest(
             link.url,
             self.parse_link,
             endpoint='render.json',
             args={
                 'har': 1,
                 'html': 1,
             }
         )
Developer: AllenCHM, Project: scrapy-splash, Lines: 12, Source: dmoz.py

Example 13: parse

 def parse(self, response):
     e = LinkExtractor()
     urls = [link.url for link in e.extract_links(response)]
     for url in urls:
         if response.url != url:
             yield self.add_url(url)
     if urls:
         qs = urlparse.parse_qs(urlparse.urlparse(response.url).query)
         qs = dict((k, v[0]) for (k, v) in qs.iteritems())
         qs['p'] = int(qs['p']) + 1
         url = 'http://comeon5678.com/event/list'
         yield scrapy.Request('%s?%s' % (url, urllib.urlencode(qs)))
Developer: mikelambert, Project: dancedeets-monorepo, Lines: 12, Source: comeon5678.py

Example 14: parse

 def parse(self, response):
     #提取书籍页面中每本书的链接
     le = LinkExtractor(restrict_css='article.product_pod h3')
     for link in le.extract_links(response):
         yield scrapy.Request(link.url, callback=self.parse_book)
         
     #提取下一页的链接
     le =  LinkExtractor(restrict_css='ul.pager li.next')
     links = le.extract_links(response)
     if links:
         next_url = links[0].url
         yield scrapy.Request (next_url, callback=self.parse)
Developer: daguanqiao, Project: gitt1, Lines: 12, Source: book.py

Example 15: __init__

 def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
              tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
              deny_extensions=None):
     # attrs must be a tuple; a bare ('href') is just the string 'href',
     # which would be iterated character by character.
     LinkExtractor.__init__(self, allow=allow,
                            deny=deny,
                            allow_domains=allow_domains,
                            deny_domains=deny_domains,
                            restrict_xpaths=restrict_xpaths,
                            tags=tags,
                            attrs=attrs,
                            canonicalize=canonicalize,
                            unique=unique,
                            process_value=self.process_value,
                            deny_extensions=deny_extensions)
Developer: cfhb, Project: crawl_youtube, Lines: 15, Source: le_sgml.py


Note: The scrapy.linkextractors.LinkExtractor class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are taken from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors. Please follow the corresponding project's License when distributing or using the code; do not reproduce without permission.