當前位置: 首頁>>代碼示例>>Python>>正文


Python linkextractors.LinkExtractor類代碼示例

本文整理匯總了Python中scrapy.linkextractors.LinkExtractor的典型用法代碼示例。如果您正苦於以下問題:Python LinkExtractor類的具體用法?Python LinkExtractor怎麼用?Python LinkExtractor使用的例子?那麼,這裡精選的類代碼示例或許可以為您提供幫助。


在下文中一共展示了LinkExtractor類的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: parse_sesja

    def parse_sesja(self, response):
        """Process a session page: follow resolution links and emit file items.

        Two passes are made over ``response``:

        1. Links inside tables that match ``FindReportsSpider.UCHWALA_RE``
           ("uchwaly").  For each one a request handled by ``parse_uchwala``
           is yielded, followed by a ``PageItem`` describing the link.
        2. Links matching ``FindReportsSpider.PLIK_RE`` (the original comment
           labels them "glosowania, obecnosc").  Each is yielded as a
           ``FiledownloadItem`` whose ``file_urls`` holds the link URL.

        When ``DEBUG`` is truthy each pass stops right after the entry whose
        zero-based position reaches ``DEBUG_CNT``.
        """
        # Pass 1: resolutions, restricted to anchors found inside <table>.
        resolution_links = LinkExtractor(
            allow=FindReportsSpider.UCHWALA_RE,
            restrict_xpaths="//table",
        ).extract_links(response)
        self.print_links("uchwaly", resolution_links)
        for position, entry in enumerate(resolution_links):
            yield scrapy.Request(entry.url, callback=self.parse_uchwala)
            page = items.PageItem()
            page["text"] = entry.text.encode("utf8")
            page["url"] = entry.url
            page["ref"] = response.url
            page["order"] = position
            yield page
            if position >= DEBUG_CNT and DEBUG:
                break

        # Pass 2: downloadable files, searched across the whole page.
        file_links = LinkExtractor(
            allow=FindReportsSpider.PLIK_RE
        ).extract_links(response)
        self.print_links("glosowania", file_links)
        for position, entry in enumerate(file_links):
            download = items.FiledownloadItem()
            download["file_urls"] = [entry.url]
            download["text"] = entry.text.encode("utf8")
            download["url"] = entry.url
            download["ref"] = response.url
            download["order"] = position
            yield download
            if position >= DEBUG_CNT and DEBUG:
                break
開發者ID:orian,項目名稱:umo,代碼行數:34,代碼來源:find_reports.py

示例2: parse_state

    def parse_state(self, response):
        """Yield a scrapy.Request for each city with a store in the state.

        The state is read from ``response.meta['state']``.  Every link on the
        page whose URL contains ``stores.joann.com/<state>`` is requested with
        ``parse_city`` as the callback and ``HEADERS`` as the request headers.
        """
        import re

        # ``allow`` is treated as a regular expression, not a shell glob.  The
        # previous pattern 'stores.joann.com/{}*' left the dots unescaped and
        # made the state's last character optional/repeatable, so e.g. state
        # "ca" also matched ".../co" and ".../ct".  Escape the whole literal
        # prefix instead; a regex search already matches anything after it.
        state_prefix = re.escape(
            'stores.joann.com/{}'.format(response.meta['state'])
        )
        extractor = LinkExtractor(allow=state_prefix)

        for link in extractor.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse_city, headers=HEADERS)
開發者ID:iandees,項目名稱:all-the-places,代碼行數:7,代碼來源:joann_fabrics.py

示例3: parse

 def parse(self, response):
     """Request every link found on the page, handled by ``parse_link``.

     Each request carries ``meta['splash']`` with the args ``har=1`` and
     ``html=0``.
     """
     for found in LinkExtractor().extract_links(response):
         # Build the options inside the loop so each request gets its own dict.
         splash_options = {
             'args': {'har': 1, 'html': 0},
         }
         yield scrapy.Request(
             found.url,
             self.parse_link,
             meta={'splash': splash_options},
         )
開發者ID:Sunil-Cube,項目名稱:scrapy-splash-1,代碼行數:8,代碼來源:dmoz.py

示例4: parse_link

 def parse_link(self, response):
     """Log the visited URL, then follow the page's ``.shtml`` links.

     Only links inside ``div.list > ul`` whose URL matches ``.+\\.shtml`` are
     considered; each one is requested with ``parse_content`` as callback.
     """
     self.logger.info('Hi, this is an item page! %s', response.url)

     shtml_extractor = LinkExtractor(
         allow=r".+\.shtml",
         restrict_css='div.list > ul',
         unique=True,
     )
     for target in shtml_extractor.extract_links(response):
         yield scrapy.Request(target.url, callback=self.parse_content)
開發者ID:bdtgzj,項目名稱:learning-git,代碼行數:8,代碼來源:sipf_investor_fund.py

示例5: parse

 def parse(self, response):
     """Follow article links, passing a fresh ``XiubaiItem`` to each request.

     Every link matching the ``"/article/*"`` pattern is requested with
     ``parse_detail_page`` as callback; a new, empty item travels along in
     ``request.meta['item']``.
     """
     # NOTE(review): ``allow`` is a regular expression, so the trailing "/*"
     # means "zero or more slashes", not a glob wildcard -- confirm intent.
     article_links = LinkExtractor(allow="/article/*").extract_links(response)
     for article in article_links:
         detail_request = Request(article.url, self.parse_detail_page)
         detail_request.meta['item'] = XiubaiItem()
         yield detail_request
開發者ID:lijunchao16,項目名稱:scrapy,代碼行數:8,代碼來源:indexpage.py

示例6: parse

 def parse(self, response):
     """Yield ``self.add_url(...)`` for links carrying a ``Url`` query parameter.

     Each link's query string is parsed; when it contains a ``Url`` key, the
     first value of that key is passed to ``self.add_url`` and the result is
     yielded.  Links without that key are ignored.
     """
     for found in LinkExtractor().extract_links(response):
         query = urlparse.parse_qs(urlparse.urlsplit(found.url).query)
         if 'Url' not in query:
             continue
         yield self.add_url(query['Url'][0])
開發者ID:DanceDeets,項目名稱:dancedeets-server,代碼行數:9,代碼來源:bboybattles.py

示例7: parse

    def parse(self, response):
        """Emit ad profiles for image links, then re-crawl every link via Splash.

        Steps, in order:

        1. Stop immediately when the status is not 200 or the body is empty.
        2. Yield an ``AdsProfileItem`` for each ``<a>`` wrapping an ``<img>``
           in the page itself (``ads_content_frame`` is ``""``).
        3. For a ``SplashJsonResponse`` carrying ``childFrames``, do the same
           for every child frame accepted by ``_is_valid_frame``
           (``ads_content_frame`` is the frame URL).
        4. Yield a ``SplashRequest`` back into ``parse`` for every link on the
           page, with a User-Agent taken from ``self.ua_generater``.
        """
        # ``not body`` also covers an empty *bytes* body, which ``== ""`` does
        # not match on Python 3.
        if response.status != 200 or not response.body:
            return

        for ads_profile in self._collect_ads_profiles(response, response, ""):
            yield ads_profile

        if isinstance(response, SplashJsonResponse) and "childFrames" in response.data:
            frames = self._get_all_child_frames(response)
            # Parenthesised single argument: valid on both Python 2 and 3.
            print("Get %s childFrames in %s" % (len(frames), response.url))
            for frame_response in frames:
                if not self._is_valid_frame(frame_response.url):
                    continue
                frame_profiles = self._collect_ads_profiles(
                    response, frame_response, frame_response.url
                )
                for ads_profile in frame_profiles:
                    yield ads_profile

        link_extractor = LinkExtractor()
        for link in link_extractor.extract_links(response):
            request = SplashRequest(
                response.urljoin(link.url),
                self.parse,
                endpoint="render.json",
                slot_policy=SlotPolicy.PER_DOMAIN,
                args={"html": 1, "iframes": 1},
            )
            request.headers.setdefault("User-Agent", self.ua_generater.get_user_agent())
            yield request

    def _collect_ads_profiles(self, host_response, source, content_frame):
        """Yield an ``AdsProfileItem`` per image link in ``source``.

        ``host_response`` is the top-level page; its URL is used for the
        same-site check and the host fields.  ``source`` is the response whose
        ``<a><img></a>`` elements are scanned and against which image URLs are
        resolved.  ``content_frame`` is stored verbatim in
        ``ads_content_frame``.  Links for which ``_from_same_site`` returns a
        truthy value are skipped.
        """
        for ads_link in source.xpath("//a[img]"):
            link_href = ads_link.xpath("@href").extract_first()
            # NOTE(review): link_href is None for an <a> without href; how
            # _from_same_site treats that is not visible here -- confirm.
            if self._from_same_site(host_response.url, link_href):
                continue

            ads_profile = AdsProfileItem()
            ads_profile["ads_host"] = host_response.url
            ads_profile["ads_present_mode"] = "normal_1"
            ads_profile["ads_target_url"] = link_href
            img_src = source.urljoin(ads_link.xpath("img/@src").extract_first())
            ads_profile["ads_content_url"] = img_src
            ads_profile["ads_content_frame"] = content_frame
            ads_profile["ads_host_domain"] = urlparse(host_response.url).netloc
            ads_profile["ads_target_domain"] = urlparse(link_href).netloc
            yield ads_profile
開發者ID:yuanbei,項目名稱:adspider,代碼行數:57,代碼來源:ads_profile_spider.py

示例8: parse

    def parse(self, response):
        """Print every last.fm user-profile URL linked from the page.

        For each extracted link, the part of the URL starting at the last
        occurrence of ``http://www.last.fm/user/`` is collected; once all
        links have been scanned the collected profile URLs are printed, one
        per line, in link order.
        """
        # Dots are escaped so they match a literal '.', not any character.
        # The greedy leading '.*' keeps the original "last occurrence" rule.
        profile_re = re.compile(r'.*(http://www\.last\.fm/user/.*)')

        matches = (
            profile_re.search(link.url)
            for link in LinkExtractor().extract_links(response)
        )
        user_profiles = [match.group(1) for match in matches if match]

        for user_profile in user_profiles:
            # Parenthesised single argument: valid on both Python 2 and 3.
            print(user_profile)
開發者ID:denholms,項目名稱:track-classifier,代碼行數:10,代碼來源:spider.py

示例9: parse_code

    def parse_code(self, response):
        """Return a ``FilesItem`` pointing at the page's source-code download.

        The URL is taken from the first link matching the CSS selector
        ``a.reference.external``.  When the page has no such link a warning is
        logged and ``None`` is returned instead of raising ``IndexError``.
        """
        # Extract the URL of the source code.
        source_links = LinkExtractor(
            restrict_css='a.reference.external'
        ).extract_links(response)
        if not source_links:
            # Assumes ``self`` is a scrapy Spider exposing ``logger`` -- TODO confirm.
            self.logger.warning('No source-code link found on %s', response.url)
            return None

        # Named ``item`` rather than ``file`` to avoid shadowing the builtin.
        item = FilesItem()
        item['file_urls'] = [source_links[0].url]
        return item
開發者ID:daguanqiao,項目名稱:gitt1,代碼行數:10,代碼來源:filesDown.py

示例10: parse

 def parse(self, response):
     """Emit a ``DomainItem`` per link, then follow links not yet stored.

     All items are yielded first.  Afterwards each link is looked up in
     ``db.scrapy_items`` by its URL, and only links with no matching document
     are requested again with ``parse`` as the callback.
     """
     found = LinkExtractor().extract_links(response)

     for entry in found:
         record = DomainItem()
         record['link'] = entry.url
         record['domain'] = self.getHost(entry.url)
         yield record

     for entry in found:
         already_stored = db.scrapy_items.find_one({'link': entry.url})
         if already_stored:
             continue
         yield scrapy.Request(entry.url, callback=self.parse)
開發者ID:freskyme,項目名稱:python-study-demo,代碼行數:11,代碼來源:host_spider.py

示例11: parse

 def parse(self, response):
     """Compare the page's links with stored data and notify on change.

     The extracted links (per the original author's notes, the list of jobs)
     are passed to ``compare`` together with the name ``'example'``.  A truthy
     result triggers ``notify``; otherwise "No Update" is printed.
     """
     name = 'example'
     jobs = LinkExtractor().extract_links(response)

     # Per the original notes, compare() checks the list against MongoDB and
     # notify() e-mails users; neither helper is visible in this block.
     if not compare(name, jobs):
         print("No Update")
         return
     notify(name)
開發者ID:WHYjun,項目名稱:job-search-bot,代碼行數:11,代碼來源:example.py

示例12: parse

 def parse(self, response):
     """Yield a ``SplashRequest`` for every link on the page.

     Each request targets the ``render.json`` endpoint with the args
     ``har=1`` and ``html=1`` and is handled by ``parse_link``.
     """
     for found in LinkExtractor().extract_links(response):
         # Built per request so no two requests share the same args dict.
         render_args = {'har': 1, 'html': 1}
         yield SplashRequest(
             found.url,
             self.parse_link,
             endpoint='render.json',
             args=render_args,
         )
開發者ID:AllenCHM,項目名稱:scrapy-splash,代碼行數:12,代碼來源:dmoz.py

示例13: parse

 def parse(self, response):
     """Register every linked URL, then request the next listing page.

     Each extracted URL other than the current page's own URL is passed to
     ``self.add_url`` and the result is yielded.  If the page had any links
     at all, the current query string is copied (first value of each key),
     its ``p`` parameter is incremented by one, and a request for
     ``http://comeon5678.com/event/list`` with that query is yielded.
     """
     urls = [found.url for found in LinkExtractor().extract_links(response)]
     for candidate in urls:
         if candidate == response.url:
             continue
         yield self.add_url(candidate)

     # A page without links yields no follow-up request.
     if not urls:
         return

     current_query = urlparse.urlparse(response.url).query
     params = {}
     for key, values in urlparse.parse_qs(current_query).iteritems():
         params[key] = values[0]
     params['p'] = int(params['p']) + 1

     url = 'http://comeon5678.com/event/list'
     next_page = '%s?%s' % (url, urllib.urlencode(params))
     yield scrapy.Request(next_page)
開發者ID:mikelambert,項目名稱:dancedeets-monorepo,代碼行數:12,代碼來源:comeon5678.py

示例14: parse

 def parse(self, response):
     """Follow every book link on a listing page, then the next-page link.

     Links inside ``article.product_pod h3`` are requested with
     ``parse_book``.  If ``ul.pager li.next`` contains a link, its first
     match is requested with ``parse`` so the following page is processed
     the same way.
     """
     # Extract the link of each book on the listing page.
     book_links = LinkExtractor(
         restrict_css='article.product_pod h3'
     ).extract_links(response)
     for book in book_links:
         yield scrapy.Request(book.url, callback=self.parse_book)

     # Extract the link to the next page, if there is one.
     pager_links = LinkExtractor(
         restrict_css='ul.pager li.next'
     ).extract_links(response)
     if not pager_links:
         return
     yield scrapy.Request(pager_links[0].url, callback=self.parse)
開發者ID:daguanqiao,項目名稱:gitt1,代碼行數:12,代碼來源:book.py

示例15: __init__

 def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
              tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
              deny_extensions=None):
     """Initialise the extractor by forwarding every option to ``LinkExtractor``.

     All parameters are passed through unchanged, except ``process_value``:
     when the caller leaves it as ``None``, this instance's own
     ``process_value`` attribute is used instead.

     Two defects of the previous version are fixed here:

     * ``attrs`` defaulted to ``('href')``, which is the plain string
       ``'href'`` rather than a one-element tuple; it is now ``('href',)``.
     * An explicitly supplied ``process_value`` was silently discarded in
       favour of ``self.process_value``; it is now honoured.
     """
     if process_value is None:
         # Fall back to the hook on this instance (defined outside this block).
         process_value = self.process_value

     LinkExtractor.__init__(
         self,
         allow=allow,
         deny=deny,
         allow_domains=allow_domains,
         deny_domains=deny_domains,
         restrict_xpaths=restrict_xpaths,
         tags=tags,
         attrs=attrs,
         canonicalize=canonicalize,
         unique=unique,
         process_value=process_value,
         deny_extensions=deny_extensions,
     )
開發者ID:cfhb,項目名稱:crawl_youtube,代碼行數:15,代碼來源:le_sgml.py


注:本文中的scrapy.linkextractors.LinkExtractor類示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。