

Python url.urljoin_rfc Function Code Examples

This article collects typical usage examples of the w3lib.url.urljoin_rfc function in Python. If you are unsure how urljoin_rfc is used in practice, the curated examples below should help.


The following shows 15 code examples of the urljoin_rfc function, sorted by popularity by default.
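
Before the project code, a minimal sketch of the basic call pattern may help. The input and output values are taken from the w3lib test case in Example 8 below, and the snippet assumes a w3lib version old enough to still ship urljoin_rfc:

    from w3lib.url import urljoin_rfc

    # Resolve relative references against a base URL, RFC-style.
    # Values mirror the assertions in Example 8 (test_url.py).
    base = 'http://example.com/some/path'
    print(urljoin_rfc(base, 'newpath/test'))
    # -> http://example.com/some/newpath/test
    print(urljoin_rfc('http://example.com/some/path/a.jpg', '../key/other'))
    # -> http://example.com/some/key/other

In the Scrapy spiders below, the same call is typically paired with get_base_url(response): relative hrefs extracted via XPath are joined against the page's base URL to produce absolute URLs before new requests are scheduled.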

Example 1: parse

    def parse(self,response):
        self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            return
        base_url  = get_base_url(response)
        for href in response.xpath('//table/tr/td/strong/a/@href').extract():
            relative_url = href
            abs_url = urljoin_rfc(base_url, relative_url)
            yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url)
            #self.log("Parse %s %s"%(abs_url,response.url),level=scrapy.log.INFO)
            #yield scrapy.Request(url=abs_url,callback=self.parse)

        # parse PDF links
        for href in response.xpath('//table[@class="object_table"]/tr/td[4]/a/@href').extract():
            relative_url = href
            abs_url = urljoin_rfc(base_url, relative_url)
            yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url)
            #self.log("Parse %s %s"%(abs_url,response.url),level=scrapy.log.INFO)
            #yield scrapy.Request(url=abs_url,callback=self.parse)

        # parse pagination links
        for href in response.xpath('//table/tr/td/table/tr/td/a/@href').extract():
            if ("page=" not in href  and "browse-date?top=" not in href ) or "itemsPerPage=" in href:
                continue

            relative_url = href
            abs_url = urljoin_rfc(base_url, relative_url)
            yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url)
            #self.log("Parse %s %s"%(abs_url,response.url),level=scrapy.log.INFO)
            yield scrapy.Request(url=abs_url,callback=self.parse)
Developer: muzichenglong, Project: scrapyc, Lines: 30, Source: handle.py

Example 2: parse_index

    def parse_index(self,response):
        self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            yield scrapy.Request(url=response.url,callback=self.parse_index)
            return
        base_url  = get_base_url(response)
        # parse the journal home page
        count = 0
        for href in response.xpath("//div[@id='divperilist']/ul/li/a/@href").extract():
            if href.startswith("Rss.ashx?"):
                continue
            relative_url = href
            abs_url = urljoin_rfc(base_url, relative_url)
            #self.log("Parse %s %s"%(response.url,abs_url),level=scrapy.log.INFO)
            yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url)
            yield scrapy.Request(url=abs_url,callback=self.parse_content)
            count += 1
        self.log("Fuck %s %d"%(response.url,count),level=scrapy.log.INFO)

        # parse pagination of the index page
        for href in response.xpath("//div[@id='divperilist']/table//a/@href").extract():
            if "PageNo" not in href:
                continue
            relative_url = href
            abs_url = urljoin_rfc(base_url, relative_url)
            self.log("Parse %s %s"%(response.url,abs_url),level=scrapy.log.INFO)
            yield scrapy.Request(url=abs_url,callback=self.parse_index)
Developer: muzichenglong, Project: scrapyc, Lines: 27, Source: wanfangdata.py

Example 3: extract_links

    def extract_links(self, response):
        xs = HtmlXPathSelector(response)
        base_url = xs.select('//base/@href').extract()
        base_url = urljoin_rfc(response.url, base_url[0]) if base_url else response.url

        links = []
        for location in self.locations:
            if isinstance(location, basestring):
                selectors = xs.select(location)
            elif isinstance(location, (XPathSelectorList, HtmlXPathSelector)):
                selectors = [location] if isinstance(location, HtmlXPathSelector) else location
            else:
                continue

            for selector in selectors:
                links.extend(self.extract_from_selector(selector, response.encoding))

        seen, ret = set(), []
        for link in links:
            link.url = urljoin_rfc(base_url, link.url, response.encoding)
            if self.unique:
                if link.url in seen:
                    continue
                else:
                    seen.add(link.url)
            if self.canonicalize:
                link.url = canonicalize_url(link.url)
            ret.append(link)

        return ret
Developer: bihicheng, Project: scrapy, Lines: 30, Source: image.py

Example 4: process_response

    def process_response(self, request, response, spider):
        if "dont_redirect" in request.meta:
            return response
        if request.method.upper() == "HEAD":
            if response.status in [301, 302, 303, 307] and "Location" in response.headers:
                redirected_url = urljoin_rfc(request.url, response.headers["location"])
                redirected = request.replace(url=redirected_url)
                return self._redirect(redirected, request, spider, response.status)
            else:
                return response

        if response.status in [302, 303] and "Location" in response.headers:
            redirected_url = urljoin_rfc(request.url, response.headers["location"])
            redirected = self._redirect_request_using_get(request, redirected_url)
            return self._redirect(redirected, request, spider, response.status)

        if response.status in [301, 307] and "Location" in response.headers:
            redirected_url = urljoin_rfc(request.url, response.headers["location"])
            redirected = request.replace(url=redirected_url)
            return self._redirect(redirected, request, spider, response.status)

        if isinstance(response, HtmlResponse):
            interval, url = get_meta_refresh(response)
            if url and interval < self.max_metarefresh_delay:
                redirected = self._redirect_request_using_get(request, url)
                return self._redirect(redirected, request, spider, "meta refresh")

        return response
Developer: saidimu, Project: scrapy, Lines: 28, Source: redirect.py

Example 5: _extract_links

    def _extract_links(self, response_text, response_url, response_encoding):
        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin_rfc(base_url, remove_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        urlstext = set([(clean_url(url), clean_text(text)) for url, _, text in links_text])

        return [Link(url, text) for url, text in urlstext]
Developer: bihicheng, Project: scrapy, Lines: 10, Source: regex.py

Example 6: parse

    def parse(self, response):
        self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
        # self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            return
        count = 0
        for a in response.xpath('//a'):
            text = a.xpath("string(.)").extract()
            text = "".join(text).strip()
            if len(text) > 5 or "PDF" not in text:
                continue
            href = a.xpath("@href").extract()
            if len(href) != 1:
                continue
            href = href[0]
            if (href == "#" or href.startswith("javascript")) and len(a.xpath("@onclick").extract()) == 1:
                onclick = a.xpath("@onclick").extract()[0]
                onclick = onclick.split(",")
                if len(onclick) < 2:
                    continue
                if onclick[0].startswith("showArticleFile"):
                    id = onclick[-1].split(")", 1)[0].replace("'", "")
                else:
                    id = onclick[1].split(")", 1)[0].replace("'", "")
                if "/CN/" in response.url:
                    pdf = response.url.split("/CN/", 1)[
                              0] + "/CN/article/downloadArticleFile.do?attachType=PDF&id=" + id
                elif "/EN/" in response.url:
                    pdf = response.url.split("/EN/", 1)[
                              0] + "/EN/article/downloadArticleFile.do?attachType=PDF&id=" + id
                else:
                    continue
            elif "attachType=PDF&id=" in href:

                abs_url = urljoin_rfc(response.url, href)
                pdf = abs_url
            else:
                continue
            # url = "http://www.zjnyxb.cn/CN/article/downloadArticleFile.do?attachType=PDF&id="+id
            # print pdf
            self.log("PDF_URL %s" % (pdf), level=scrapy.log.INFO)
            yield self.baidu_rpc_request({"url": pdf, "src_id": 22})
            count += 1

        base_url = get_base_url(response)
        for sel in response.xpath('//a/@href'):
            relative_url = sel.extract().encode(response.encoding)
            if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url == "#":
                continue
            abs_url = urljoin_rfc(base_url, relative_url)
            abs_url = safe_url_string(abs_url, encoding=response.encoding)
            yield self.baidu_rpc_request({"url": abs_url, "src_id": 22})
        self.log("PDF_TOTAL %s %d" % (response.url, count), level=scrapy.log.INFO)
Developer: wjianwei126, Project: scrapyc, Lines: 53, Source: pdf.py

Example 7: _extract_links

    def _extract_links(self, response_text, response_url, response_encoding):
        self.base_url, self.links = etree.HTML(response_text, self.parser)

        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

        ret = []
        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
        for link in links:
            link.url = urljoin_rfc(base_url, link.url, response_encoding)
            link.url = safe_url_string(link.url, response_encoding)
            link.text = str_to_unicode(link.text, response_encoding, errors="replace")
            ret.append(link)

        return ret
Developer: netconstructor, Project: scrapy, Lines: 14, Source: lxmlparser.py

Example 8: test_urljoin_rfc

    def test_urljoin_rfc(self):
        self.assertEqual(urljoin_rfc('http://example.com/some/path', 'newpath/test'),
                         'http://example.com/some/newpath/test')
        self.assertEqual(urljoin_rfc('http://example.com/some/path/a.jpg', '../key/other'),
                         'http://example.com/some/key/other')
        u = urljoin_rfc(u'http://example.com/lolo/\xa3/lele', u'lala/\xa3')
        self.assertEqual(u, 'http://example.com/lolo/\xc2\xa3/lala/\xc2\xa3')
        assert isinstance(u, str)
        u = urljoin_rfc(u'http://example.com/lolo/\xa3/lele', 'lala/\xa3', encoding='latin-1')
        self.assertEqual(u, 'http://example.com/lolo/\xa3/lala/\xa3')
        assert isinstance(u, str)
        u = urljoin_rfc('http://example.com/lolo/\xa3/lele', 'lala/\xa3')
        self.assertEqual(u, 'http://example.com/lolo/\xa3/lala/\xa3')
        assert isinstance(u, str)
Developer: LucianU, Project: w3lib, Lines: 14, Source: test_url.py

Example 9: parse

    def parse(self, response):
        self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
        #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            return
        
        site = get_url_site(response.url)

        if site in self.parses:
            parser = self.parses[site]
            #self.log("Parser %s %s"%(response.url,parser.name),level=scrapy.log.INFO)
            for item in parser.parse(response) :
                yield item
            return

        base_url  = get_base_url(response)
        for sel in response.xpath('//a/@href'):
            relative_url = sel.extract()

            abs_url = urljoin_rfc(base_url, relative_url)
            #print abs_url
            schema = get_url_scheme(abs_url)
            if schema not in ["http","https"]:
                continue            
            site = get_url_site(abs_url)
            yield NimeiItem(url=abs_url,furl=response.url)
            yield self.baidu_rpc_request({"url":abs_url,"src_id":4})
Developer: muzichenglong, Project: scrapyc, Lines: 27, Source: base.py

Example 10: parse

    def parse(self, response):
        self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
        # self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            # self.log(response.headers,level=scrapy.log.INFO)
            yield scrapy.Request(response.url)
            return
        if response.__class__ != scrapy.http.HtmlResponse:
            return

        base_site = get_url_site(response.url)
        # print response.url,response.status
        base_url = response.url
        for sel in response.xpath('//a/@href'):
            relative_url = sel.extract()
            if not self.is_valid_url(relative_url):
                continue
            abs_url = urljoin_rfc(base_url, relative_url)
            # print abs_url
            schema = get_url_scheme(abs_url)
            if schema not in ["http", "https"]:
                continue
            site = get_url_site(abs_url)

            # yield NimeiItem(url=abs_url,furl=response.url)
            yield self.baidu_rpc_request({"url": abs_url, "src_id": 22}, furl=response.url)
            if site != base_site and site not in self.settings.get("ALLOW_SITES", []):
                continue
            self.log("SendCrawl %s" % (abs_url), level=scrapy.log.INFO)
            yield scrapy.Request(abs_url)
Developer: wjianwei126, Project: scrapyc, Lines: 30, Source: base.py

Example 11: parse_all

    def parse_all(self, response):
        self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
        #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            return
        base_url  = get_base_url(response)
        base_site = get_url_site(base_url)

        for sel in response.xpath('//a/@href'):
            relative_url = sel.extract().encode(response.encoding)
            if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url=="#":
                continue              
            abs_url = urljoin_rfc(base_url,relative_url)
            abs_url = safe_url_string(abs_url,encoding=response.encoding)

            filename = abs_url.split("?")[0].split("/")[-1]
            if filename :
                ctype  = filename.split(".")[-1].lower() 
            else:
                ctype = None
            if ctype in ["jpeg","jpg","swf","rar","zip","gz","gif","mov","png","bmp","exe","pps","db","txt","pptx",'xls',"ppt","xlsx"]:
                continue

            yield self.baidu_rpc_request({"url":abs_url,"src_id":22})

            site = get_url_site(abs_url)
            if site != base_site:
                continue
            if ctype in ["pdf","doc","docx","rtf",]:
                continue
            yield scrapy.Request(url=abs_url,callback=self.parse_all)
Developer: muzichenglong, Project: scrapyc, Lines: 31, Source: pdf.py

Example 12: parse

    def parse(self,response):
        self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
        #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            return

        base_url  = get_base_url(response)
        for sel in response.xpath('//a/@href'):
            relative_url = sel.extract()
            if relative_url.startswith("javascript:"):
                continue
            if "mod=redirect" in relative_url or "redirect.php" in relative_url:
                continue
                
            abs_url = urljoin_rfc(base_url, relative_url)
            schema = get_url_scheme(abs_url)
            if schema not in ["http","https"]:
                continue  

            #yield NimeiItem(url=abs_url,furl=response.url)
            abs_url = self.remove_param(abs_url,["extra","orderby","typeid","filter","sortid","searchsort","vk_payway_13","sid","recommend","digest"])


            if self.PATTERN1.match(abs_url):
                abs_url = re.sub("\-\d+\-\d+\.html.*","-1-1.html",abs_url,1)
            yield self.baidu_rpc_request({"url":abs_url,"src_id":4})
            if relative_url.startswith("forum_") or relative_url.startswith("forum-") or relative_url.startswith("/archives/") or relative_url.startswith("forumdisplay.php?fid=") or relative_url.startswith("forum.php?mod=forumdisplay&fid="):
                
                yield scrapy.Request(abs_url)
Developer: muzichenglong, Project: scrapyc, Lines: 29, Source: bbs.py

Example 13: _extract_links

    def _extract_links(self, response_text, response_url, response_encoding):
        self.reset()
        self.feed(response_text)
        self.close()

        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

        ret = []
        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
        for link in links:
            link.url = urljoin_rfc(base_url, link.url, response_encoding)
            link.url = safe_url_string(link.url, response_encoding)
            link.text = link.text.decode(response_encoding)
            ret.append(link)

        return ret
Developer: bihicheng, Project: scrapy, Lines: 16, Source: htmlparser.py

Example 14: _extract_links

    def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
        """ Do the real extraction work """
        self.reset()
        self.feed(response_text)
        self.close()

        ret = []
        if base_url is None:
            base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
        for link in self.links:
            link.url = urljoin_rfc(base_url, link.url, response_encoding)
            link.url = safe_url_string(link.url, response_encoding)
            link.text = str_to_unicode(link.text, response_encoding, errors='replace')
            ret.append(link)

        return ret
Developer: bihicheng, Project: scrapy, Lines: 16, Source: sgml.py

Example 15: parse_zgyszz

    def parse_zgyszz(self,response):
        self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
        #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            return

        #base_site = get_url_site(base_url)
        if  "qklist/show-" in response.url:
            base_url  = get_base_url(response)

            downLink = response.xpath("//div[@id='down']//a/@onclick").extract()[0]
            relative_url = downLink.split("'")[1]

            abs_url = urljoin_rfc(base_url,relative_url)
            yield scrapy.Request(abs_url,callback=self.parse_zgyszz)

            yield self.baidu_rpc_request({"url":abs_url,"src_id":22})
            
            return
        if '/upload/qklist/' in response.url:
            yield self.baidu_rpc_request({"url":response.url,"src_id":22})
            return

        base_url  = response.url
        for sel in response.xpath("//div[@class='main_box']//table/tr[1]/td/a/@href"):
            relative_url = sel.extract().encode(response.encoding)
            if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url=="#":
                continue              
            abs_url = urljoin_rfc(base_url,relative_url)
            abs_url = safe_url_string(abs_url,encoding=response.encoding)
            request = scrapy.Request(abs_url,callback=self.parse_zgyszz)
            #request.meta["dont_redirect"] = True
            yield request
            yield self.baidu_rpc_request({"url":abs_url,"src_id":22})
        
        for sel in response.xpath("//div[@class='flickr']/a/@href"):
            relative_url = sel.extract().encode(response.encoding)
            if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url=="#":
                continue         
            abs_url = urljoin_rfc(base_url,relative_url)
            abs_url = safe_url_string(abs_url,encoding=response.encoding)
            request = scrapy.Request(abs_url,callback=self.parse_zgyszz)
            yield request
            yield self.baidu_rpc_request({"url":abs_url,"src_id":22})
Developer: muzichenglong, Project: scrapyc, Lines: 44, Source: pdf.py


Note: The w3lib.url.urljoin_rfc examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors. Please consult each project's license before redistributing or using the code; do not reproduce this article without permission.