This page collects typical usage examples of the urljoin_rfc function from Python's w3lib.url module. If you are unsure how urljoin_rfc is used in practice or what it is good for, the curated examples below may help.
It shows 15 code examples of urljoin_rfc, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
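Before working through the examples, here is a minimal sketch of calling urljoin_rfc directly; the URLs and file names are made up for illustration. Its behaviour matches the test case in Example 8 below: a relative reference is resolved against a base URL. Note that urljoin_rfc has since been deprecated in w3lib, so new code should generally prefer urllib.parse.urljoin from the standard library.

from w3lib.url import urljoin_rfc

base = 'http://example.com/some/path/index.html'
# Relative references are resolved against the base URL (RFC 1808/3986 rules).
print(urljoin_rfc(base, 'detail.html'))      # http://example.com/some/path/detail.html
print(urljoin_rfc(base, '../other/a.pdf'))   # http://example.com/some/other/a.pdf
print(urljoin_rfc(base, '/download?id=1'))   # http://example.com/download?id=1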
Example 1: parse
def parse(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        return
    base_url = get_base_url(response)
    for href in response.xpath('//table/tr/td/strong/a/@href').extract():
        relative_url = href
        abs_url = urljoin_rfc(base_url, relative_url)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22}, furl=response.url)
        #self.log("Parse %s %s" % (abs_url, response.url), level=scrapy.log.INFO)
        #yield scrapy.Request(url=abs_url, callback=self.parse)
    # parse PDF links
    for href in response.xpath('//table[@class="object_table"]/tr/td[4]/a/@href').extract():
        relative_url = href
        abs_url = urljoin_rfc(base_url, relative_url)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22}, furl=response.url)
        #self.log("Parse %s %s" % (abs_url, response.url), level=scrapy.log.INFO)
        #yield scrapy.Request(url=abs_url, callback=self.parse)
    # parse pagination links
    for href in response.xpath('//table/tr/td/table/tr/td/a/@href').extract():
        if ("page=" not in href and "browse-date?top=" not in href) or "itemsPerPage=" in href:
            continue
        relative_url = href
        abs_url = urljoin_rfc(base_url, relative_url)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22}, furl=response.url)
        #self.log("Parse %s %s" % (abs_url, response.url), level=scrapy.log.INFO)
        yield scrapy.Request(url=abs_url, callback=self.parse)
Example 2: parse_index
def parse_index(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        yield scrapy.Request(url=response.url, callback=self.parse_index)
        return
    base_url = get_base_url(response)
    # parse the journal index page
    count = 0
    for href in response.xpath("//div[@id='divperilist']/ul/li/a/@href").extract():
        if href.startswith("Rss.ashx?"):
            continue
        relative_url = href
        abs_url = urljoin_rfc(base_url, relative_url)
        #self.log("Parse %s %s" % (response.url, abs_url), level=scrapy.log.INFO)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22}, furl=response.url)
        yield scrapy.Request(url=abs_url, callback=self.parse_content)
        count += 1
    self.log("Fuck %s %d" % (response.url, count), level=scrapy.log.INFO)
    # parse pagination of the index page
    for href in response.xpath("//div[@id='divperilist']/table//a/@href").extract():
        if "PageNo" not in href:
            continue
        relative_url = href
        abs_url = urljoin_rfc(base_url, relative_url)
        self.log("Parse %s %s" % (response.url, abs_url), level=scrapy.log.INFO)
        yield scrapy.Request(url=abs_url, callback=self.parse_index)
Example 3: extract_links
def extract_links(self, response):
    xs = HtmlXPathSelector(response)
    base_url = xs.select('//base/@href').extract()
    base_url = urljoin_rfc(response.url, base_url[0]) if base_url else response.url
    links = []
    for location in self.locations:
        if isinstance(location, basestring):
            selectors = xs.select(location)
        elif isinstance(location, (XPathSelectorList, HtmlXPathSelector)):
            selectors = [location] if isinstance(location, HtmlXPathSelector) else location
        else:
            continue
        for selector in selectors:
            links.extend(self.extract_from_selector(selector, response.encoding))
    seen, ret = set(), []
    for link in links:
        link.url = urljoin_rfc(base_url, link.url, response.encoding)
        if self.unique:
            if link.url in seen:
                continue
            else:
                seen.add(link.url)
        if self.canonicalize:
            link.url = canonicalize_url(link.url)
        ret.append(link)
    return ret
Example 4: process_response
def process_response(self, request, response, spider):
    if "dont_redirect" in request.meta:
        return response
    if request.method.upper() == "HEAD":
        if response.status in [301, 302, 303, 307] and "Location" in response.headers:
            redirected_url = urljoin_rfc(request.url, response.headers["location"])
            redirected = request.replace(url=redirected_url)
            return self._redirect(redirected, request, spider, response.status)
        else:
            return response
    if response.status in [302, 303] and "Location" in response.headers:
        redirected_url = urljoin_rfc(request.url, response.headers["location"])
        redirected = self._redirect_request_using_get(request, redirected_url)
        return self._redirect(redirected, request, spider, response.status)
    if response.status in [301, 307] and "Location" in response.headers:
        redirected_url = urljoin_rfc(request.url, response.headers["location"])
        redirected = request.replace(url=redirected_url)
        return self._redirect(redirected, request, spider, response.status)
    if isinstance(response, HtmlResponse):
        interval, url = get_meta_refresh(response)
        if url and interval < self.max_metarefresh_delay:
            redirected = self._redirect_request_using_get(request, url)
            return self._redirect(redirected, request, spider, "meta refresh")
    return response
Example 5: _extract_links
def _extract_links(self, response_text, response_url, response_encoding):
    base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
    clean_url = lambda u: urljoin_rfc(base_url, remove_entities(clean_link(u.decode(response_encoding))))
    clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()
    links_text = linkre.findall(response_text)
    urlstext = set([(clean_url(url), clean_text(text)) for url, _, text in links_text])
    return [Link(url, text) for url, text in urlstext]
Example 6: parse
def parse(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    # self.log("Crawled (%d) <GET %s>" % (response.status, response.url), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        return
    count = 0
    for a in response.xpath('//a'):
        text = a.xpath("string(.)").extract()
        text = "".join(text).strip()
        if len(text) > 5 or "PDF" not in text:
            continue
        href = a.xpath("@href").extract()
        if len(href) != 1:
            continue
        href = href[0]
        if (href == "#" or href.startswith("javascript")) and len(a.xpath("@onclick").extract()) == 1:
            onclick = a.xpath("@onclick").extract()[0]
            onclick = onclick.split(",")
            if len(onclick) < 2:
                continue
            if onclick[0].startswith("showArticleFile"):
                id = onclick[-1].split(")", 1)[0].replace("'", "")
            else:
                id = onclick[1].split(")", 1)[0].replace("'", "")
            if "/CN/" in response.url:
                pdf = response.url.split("/CN/", 1)[0] + "/CN/article/downloadArticleFile.do?attachType=PDF&id=" + id
            elif "/EN/" in response.url:
                pdf = response.url.split("/EN/", 1)[0] + "/EN/article/downloadArticleFile.do?attachType=PDF&id=" + id
            else:
                continue
        elif "attachType=PDF&id=" in href:
            abs_url = urljoin_rfc(response.url, href)
            pdf = abs_url
        else:
            continue
        # url = "http://www.zjnyxb.cn/CN/article/downloadArticleFile.do?attachType=PDF&id=" + id
        # print pdf
        self.log("PDF_URL %s" % (pdf), level=scrapy.log.INFO)
        yield self.baidu_rpc_request({"url": pdf, "src_id": 22})
        count += 1
    base_url = get_base_url(response)
    for sel in response.xpath('//a/@href'):
        relative_url = sel.extract().encode(response.encoding)
        if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url == "#":
            continue
        abs_url = urljoin_rfc(base_url, relative_url)
        abs_url = safe_url_string(abs_url, encoding=response.encoding)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22})
    self.log("PDF_TOTAL %s %d" % (response.url, count), level=scrapy.log.INFO)
Example 7: _extract_links
def _extract_links(self, response_text, response_url, response_encoding):
    self.base_url, self.links = etree.HTML(response_text, self.parser)
    links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
    ret = []
    base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
    for link in links:
        link.url = urljoin_rfc(base_url, link.url, response_encoding)
        link.url = safe_url_string(link.url, response_encoding)
        link.text = str_to_unicode(link.text, response_encoding, errors="replace")
        ret.append(link)
    return ret
Example 8: test_urljoin_rfc
def test_urljoin_rfc(self):
    self.assertEqual(urljoin_rfc('http://example.com/some/path', 'newpath/test'),
                     'http://example.com/some/newpath/test')
    self.assertEqual(urljoin_rfc('http://example.com/some/path/a.jpg', '../key/other'),
                     'http://example.com/some/key/other')
    u = urljoin_rfc(u'http://example.com/lolo/\xa3/lele', u'lala/\xa3')
    self.assertEqual(u, 'http://example.com/lolo/\xc2\xa3/lala/\xc2\xa3')
    assert isinstance(u, str)
    u = urljoin_rfc(u'http://example.com/lolo/\xa3/lele', 'lala/\xa3', encoding='latin-1')
    self.assertEqual(u, 'http://example.com/lolo/\xa3/lala/\xa3')
    assert isinstance(u, str)
    u = urljoin_rfc('http://example.com/lolo/\xa3/lele', 'lala/\xa3')
    self.assertEqual(u, 'http://example.com/lolo/\xa3/lala/\xa3')
    assert isinstance(u, str)
Example 9: parse
def parse(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    #self.log("Crawled (%d) <GET %s>" % (response.status, response.url), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        return
    site = get_url_site(response.url)
    if site in self.parses:
        parser = self.parses[site]
        #self.log("Parser %s %s" % (response.url, parser.name), level=scrapy.log.INFO)
        for item in parser.parse(response):
            yield item
        return
    base_url = get_base_url(response)
    for sel in response.xpath('//a/@href'):
        relative_url = sel.extract()
        abs_url = urljoin_rfc(base_url, relative_url)
        #print abs_url
        schema = get_url_scheme(abs_url)
        if schema not in ["http", "https"]:
            continue
        site = get_url_site(abs_url)
        yield NimeiItem(url=abs_url, furl=response.url)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 4})
Example 10: parse
def parse(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    # self.log("Crawled (%d) <GET %s>" % (response.status, response.url), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        # self.log(response.headers, level=scrapy.log.INFO)
        yield scrapy.Request(response.url)
        return
    if response.__class__ != scrapy.http.HtmlResponse:
        return
    base_site = get_url_site(response.url)
    # print response.url, response.status
    base_url = response.url
    for sel in response.xpath('//a/@href'):
        relative_url = sel.extract()
        if not self.is_valid_url(relative_url):
            continue
        abs_url = urljoin_rfc(base_url, relative_url)
        # print abs_url
        schema = get_url_scheme(abs_url)
        if schema not in ["http", "https"]:
            continue
        site = get_url_site(abs_url)
        # yield NimeiItem(url=abs_url, furl=response.url)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22}, furl=response.url)
        if site != base_site and site not in self.settings.get("ALLOW_SITES", []):
            continue
        self.log("SendCrawl %s" % (abs_url), level=scrapy.log.INFO)
        yield scrapy.Request(abs_url)
Example 11: parse_all
def parse_all(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    #self.log("Crawled (%d) <GET %s>" % (response.status, response.url), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        return
    base_url = get_base_url(response)
    base_site = get_url_site(base_url)
    for sel in response.xpath('//a/@href'):
        relative_url = sel.extract().encode(response.encoding)
        if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url == "#":
            continue
        abs_url = urljoin_rfc(base_url, relative_url)
        abs_url = safe_url_string(abs_url, encoding=response.encoding)
        filename = abs_url.split("?")[0].split("/")[-1]
        if filename:
            ctype = filename.split(".")[-1].lower()
        else:
            ctype = None
        if ctype in ["jpeg", "jpg", "swf", "rar", "zip", "gz", "gif", "mov", "png", "bmp", "exe", "pps", "db", "txt", "pptx", "xls", "ppt", "xlsx"]:
            continue
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22})
        site = get_url_site(abs_url)
        if site != base_site:
            continue
        if ctype in ["pdf", "doc", "docx", "rtf"]:
            continue
        yield scrapy.Request(url=abs_url, callback=self.parse_all)
Example 12: parse
def parse(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    #self.log("Crawled (%d) <GET %s>" % (response.status, response.url), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        return
    base_url = get_base_url(response)
    for sel in response.xpath('//a/@href'):
        relative_url = sel.extract()
        if relative_url.startswith("javascript:"):
            continue
        if "mod=redirect" in relative_url or "redirect.php" in relative_url:
            continue
        abs_url = urljoin_rfc(base_url, relative_url)
        schema = get_url_scheme(abs_url)
        if schema not in ["http", "https"]:
            continue
        #yield NimeiItem(url=abs_url, furl=response.url)
        abs_url = self.remove_param(abs_url, ["extra", "orderby", "typeid", "filter", "sortid", "searchsort", "vk_payway_13", "sid", "recommend", "digest"])
        if self.PATTERN1.match(abs_url):
            abs_url = re.sub("\-\d+\-\d+\.html.*", "-1-1.html", abs_url, 1)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 4})
        if relative_url.startswith("forum_") or relative_url.startswith("forum-") or relative_url.startswith("/archives/") or relative_url.startswith("forumdisplay.php?fid=") or relative_url.startswith("forum.php?mod=forumdisplay&fid="):
            yield scrapy.Request(abs_url)
Example 13: _extract_links
def _extract_links(self, response_text, response_url, response_encoding):
    self.reset()
    self.feed(response_text)
    self.close()
    links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
    ret = []
    base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
    for link in links:
        link.url = urljoin_rfc(base_url, link.url, response_encoding)
        link.url = safe_url_string(link.url, response_encoding)
        link.text = link.text.decode(response_encoding)
        ret.append(link)
    return ret
Example 14: _extract_links
def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
    """Do the real extraction work."""
    self.reset()
    self.feed(response_text)
    self.close()
    ret = []
    if base_url is None:
        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
    for link in self.links:
        link.url = urljoin_rfc(base_url, link.url, response_encoding)
        link.url = safe_url_string(link.url, response_encoding)
        link.text = str_to_unicode(link.text, response_encoding, errors='replace')
        ret.append(link)
    return ret
Example 15: parse_zgyszz
def parse_zgyszz(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    #self.log("Crawled (%d) <GET %s>" % (response.status, response.url), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        return
    #base_site = get_url_site(base_url)
    if "qklist/show-" in response.url:
        base_url = get_base_url(response)
        downLink = response.xpath("//div[@id='down']//a/@onclick").extract()[0]
        relative_url = downLink.split("'")[1]
        abs_url = urljoin_rfc(base_url, relative_url)
        yield scrapy.Request(abs_url, callback=self.parse_zgyszz)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22})
        return
    if '/upload/qklist/' in response.url:
        yield self.baidu_rpc_request({"url": response.url, "src_id": 22})
        return
    base_url = response.url
    for sel in response.xpath("//div[@class='main_box']//table/tr[1]/td/a/@href"):
        relative_url = sel.extract().encode(response.encoding)
        if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url == "#":
            continue
        abs_url = urljoin_rfc(base_url, relative_url)
        abs_url = safe_url_string(abs_url, encoding=response.encoding)
        request = scrapy.Request(abs_url, callback=self.parse_zgyszz)
        #request.meta["dont_redirect"] = True
        yield request
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22})
    for sel in response.xpath("//div[@class='flickr']/a/@href"):
        relative_url = sel.extract().encode(response.encoding)
        if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url == "#":
            continue
        abs_url = urljoin_rfc(base_url, relative_url)
        abs_url = safe_url_string(abs_url, encoding=response.encoding)
        request = scrapy.Request(abs_url, callback=self.parse_zgyszz)
        yield request
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22})