

Python parse.urlsplit Function Code Examples

This article collects typical usage examples of the urllib.parse.urlsplit function in Python. If you have been wondering what urlsplit does, how to call it, or what real-world code that uses it looks like, the hand-picked function examples below should help.


A total of 15 code examples of the urlsplit function are shown below, sorted by popularity by default.
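Before turning to the project code, here is a minimal standalone sketch (not taken from any of the projects below) of what urlsplit returns and how urlunsplit reassembles its result:

from urllib.parse import urlsplit, urlunsplit

parts = urlsplit("https://example.com/path/page?q=1#top")
# parts is a SplitResult named tuple:
# SplitResult(scheme='https', netloc='example.com', path='/path/page', query='q=1', fragment='top')
print(parts.scheme, parts.netloc, parts.path, parts.query, parts.fragment)

# The five fields round-trip back into the original URL.
assert urlunsplit(parts) == "https://example.com/path/page?q=1#top"

Unlike urlparse, urlsplit does not split a separate params component out of the path, so the five-field SplitResult is usually the more convenient choice for simple scheme/host/path handling, as in the examples that follow.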

Example 1: _main

def _main():
    base_url = sys.argv[1]
    soup = bs4.BeautifulSoup(urlopen(base_url), from_encoding="windows-1252")
    index_urls = [urljoin(base_url, h3("a")[0]["href"]) for h3 in soup("h3")]
    for index_url in index_urls:
        try:
            resp = urlopen(index_url)
        except HTTPError as err:
            print(err, err.url, file=sys.stderr)
            print("Skipping..", file=sys.stderr)
            continue
        index_soup = bs4.BeautifulSoup(resp, from_encoding="iso-8859-1")
        index_path = urlsplit(index_url).path
        index_filepath = os.path.normpath("." + index_path)
        try:
            os.makedirs(os.path.dirname(index_filepath))
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise e
        for issue_url in iter_issue_urls(index_soup):
            issue_url = urljoin(index_url, issue_url)
            try:
                resp = urlopen(issue_url)
            except HTTPError as err:
                print(err, err.url, file=sys.stderr)
                print("Skipping..", file=sys.stderr)
                continue
            issue_soup = bs4.BeautifulSoup(resp, from_encoding="windows-1252")
            issue_path = urlsplit(issue_url).path
            issue_filepath = os.path.normpath("." + issue_path)
            with open(issue_filepath, "w") as f:
                print(klupu.clean_soup(issue_soup), file=f)
        with open(index_filepath, "w") as f:
            print(klupu.clean_soup(index_soup), file=f)
Developer ID: imclab, Project: klupu, Lines of code: 34, Source file: fetch.py

Example 2: oauth

 def oauth(self, req, credentials = None, params = {}):
     #NOTE: While flickr supports HTTPS in its oauth endpoints, flickr
     #thinks that the HTTPS endpoints are being accessed via HTTP, and thus
     #constructs the signature base string accordingly, which
     #will hence not match the signature base string generated by
     #pyoauth1client. We solve this by replacing HTTPS with HTTP
     #when generating the signature base string, and then revert the change
     #after the base string is generated. This way the signature
     #base string will match the one generated by flickr even though
     #we are accessing the endpoints via HTTPS for ADDED SECURITY!!!111one
     x = urlsplit(req.url)
     if x.scheme == "https":
         #Remove the HTTPS Scheme
         https = True
         x = x._replace(scheme = "http")
         req = req._replace(url = urlunsplit(x))
     else:
         https = False
     y = super().oauth(req, credentials, params)
     if https:
         #Add back the HTTPS scheme
         x = urlsplit(y.url)
         x = x._replace(scheme = "https")
         y = y._replace(url = urlunsplit(x))
     return y
Developer ID: pyokagan, Project: pyoauth1client, Lines of code: 25, Source file: __init__.py

Example 3: main

def main(GET):
    global mail, error, error_list
    parser = argparse.ArgumentParser(description='Scrape a simple site.')
    parser.add_argument('url', help='the URL at which to begin')
    start_url = parser.parse_args().url
    starting_netloc = urlsplit(start_url).netloc
    url_filter = (lambda url: urlsplit(url).netloc == starting_netloc)
    scrape((GET, start_url), url_filter)
    print("\n\nresult--------------------------------\nerror:%d" % (error))
    count = 1
    for url in error_list:
        print(url)
    print("\n")
    for url in mail:
        print("[%d]url:%s" % (count, url))
        data = mail[url][0]
        if data:
            tmp = []
            for val in data:
                if val not in tmp:
                    print(val)
                tmp.append(val)
        else:
            print("None")
        print("")
        count += 1
Developer ID: cheersa, Project: python, Lines of code: 28, Source file: hw3.py

Example 4: __form_data

 def __form_data(text, formid, params, soup=None, form_url=None):
     if type(params) is not dict:
         raise TypeError('Params must be a dict')
     if soup is None:
         soup = BeautifulSoup(text, 'html.parser')
     form = soup.find('form', attrs={'id': formid})
     action = form.attrs.get('action')
     if not urlsplit(action).netloc:
         if form_url is None or not urlsplit(form_url).netloc:
             raise ValueError('kwarg form_url must be specified if form '
                              'action lacks a host')
         action = urljoin(form_url, action)
     inputs = form.find_all('input') + form.find_all('textarea')
     for i in inputs:
         try:
             name = i.attrs['name']
             type_ = i.attrs['type']
             value = params.get(name)
             if type_ == 'submit':
                 continue
             elif type_ == 'hidden':
                 value = i.attrs['value'] if value is None else value
             elif value is None:
                 raise ValueError('kwarg params dictionary is missing a '
                                  'value for a non-hidden field')
         except KeyError:
             pass
         else:
             params[name] = value
     return Session.FormInfo(params=params, post_url=action)
Developer ID: lachm, Project: fbbot, Lines of code: 30, Source file: infra.py

Example 5: clean_url

def clean_url(value):
    """
    Taken from Django' URLField, this helps to normalize URLs. Raises a
    ValueError if an invalid url is passed.

    Example:

    >>> clean_url("www.google.com")
    'http://www.google.com'

    >>> clean_url("_.com")
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
    ValueError: Enter a valid URL.
    """
    if value:
        value = value.strip()
        value = value.encode('ascii', 'ignore').decode("utf-8")
        url_fields = list(urlsplit((value)))
        if not url_fields[0]:
            # If no URL scheme given, assume http://
            url_fields[0] = 'http'
        if not url_fields[1]:
            # Assume that if no domain is provided, that the path segment
            # contains the domain.
            url_fields[1] = url_fields[2]
            url_fields[2] = ''
            # Rebuild the url_fields list, since the domain segment may now
            # contain the path too.
            url_fields = list(urlsplit((urlunsplit(url_fields))))
        if not url_fields[2]:
            # the path portion may need to be added before query params
            url_fields[2] = '/'
        value = urlunsplit(url_fields)
    return value
Developer ID: TrackMaven, Project: trackmaven-common, Lines of code: 35, Source file: urls.py

Example 6: assertRedirects

    def assertRedirects(self, response, expected_url, status_code=302,
                        target_status_code=200, host=None):
        """Asserts that a response redirected to a specific URL, and that the
        redirect URL can be loaded.

        Note that assertRedirects won't work for external links since it uses
        TestClient to do a request.
        """
        self.assertEqual(response.status_code, status_code,
            ("Response didn't redirect as expected: Response code was %d"
             " (expected %d)" % (response.status_code, status_code)))
        url = response['Location']
        scheme, netloc, path, query, fragment = urlsplit(url)
        e_scheme, e_netloc, e_path, e_query, e_fragment = urlsplit(expected_url)
        if not (e_scheme or e_netloc):
            expected_url = urlunsplit(('http', host or 'testserver', e_path,
                    e_query, e_fragment))
        self.assertEqual(url, expected_url,
            "Response redirected to '%s', expected '%s'" % (url, expected_url))

        # Get the redirection page, using the same client that was used
        # to obtain the original response.
        redirect_response = response.client.get(path, QueryDict(query))
        self.assertEqual(redirect_response.status_code, target_status_code,
            ("Couldn't retrieve redirection page '%s': response code was %d"
             " (expected %d)") %
                 (path, redirect_response.status_code, target_status_code))
Developer ID: gitdlam, Project: geraldo, Lines of code: 27, Source file: testcases.py

Example 7: parse_url

def parse_url(link):
    """Say Website Title information in channel"""
    baseurl = '{uri.scheme}://{uri.netloc}'.format(uri=urlsplit(link))
    path = urlsplit(link).path
    query = '?{uri.query}'.format(uri=urlsplit(link))
    try:
        headers = {'Accept-Encoding': 'utf-8',
                   'User-Agent': 'Mozilla/5.0'}
        response = get(baseurl + path + query, headers=headers)
    except:
        return
    if response.headers["Content-Type"] and "text/html" in response.headers["Content-Type"]:
        try:
            URL = BeautifulSoup(response.text, "html.parser")
        except:
            return
        if not URL.title:
            return
        if URL.title.string is None:
            return
        if len(URL.title.string) > 250:
            title=URL.title.string[0:250] + '…'
        else:
            title=URL.title.string
        return title.replace('\n', ' ').strip() + " (" + urlsplit(link).netloc + ")"
    else:
        return
Developer ID: meskarune, Project: autobot, Lines of code: 27, Source file: url_announce.py

Example 8: find_pingback_urls

    def find_pingback_urls(self, urls):
        """Find the pingback urls of each urls"""
        pingback_urls = {}

        for url in urls:
            try:
                page = urlopen(url)
                headers = page.info()

                if 'text/' not in headers.get('Content-Type', '').lower():
                    continue

                server_url = headers.get('X-Pingback')
                if not server_url:
                    server_url = self.find_pingback_href(page.read())

                if server_url:
                    server_url_splitted = urlsplit(server_url)
                    if not server_url_splitted.netloc:
                        url_splitted = urlsplit(url)
                        server_url = '%s://%s%s' % (url_splitted.scheme,
                                                    url_splitted.netloc,
                                                    server_url)
                    pingback_urls[url] = server_url
            except IOError:
                pass
        return pingback_urls
Developer ID: sergeny, Project: django-blog-zinnia, Lines of code: 27, Source file: ping.py

Example 9: run

    def run(self):
        while True:
            # grabs url from queue
            level, u = self.input_q.get()

            main = '{0.scheme}://{0.netloc}/'.format(urlsplit(u))

            # fetching urls
            if level < MAX_URL_LEVEL:
                html = _get_content(u)
                if not isinstance(html, list):
                    soup = bs(html)
                    for link in soup.find_all('a'):
                        href = link.get('href')
                        
                        if not href or len(href) < 2:
                            continue

                        # Check if URL is relative
                        elif not urlsplit(href)[0] and not urlsplit(href)[1]:
                            self.output_q.put((level+1, _url_discard(urljoin(u, href))))
                        
                        elif href.startswith(main):
                            self.output_q.put((level+1, _url_discard(href)))
                else:
                    # Place for possible error logs (:
                    pass

            # signals to queue job is done
            self.input_q.task_done()
Developer ID: komarovf, Project: uwc2015, Lines of code: 30, Source file: parser.py

Example 10: test_flow

    def test_flow(self):
        url = self.sp.make_auth_req()
        status, headers, _ = self.getPage(url)
        assert status == '303 See Other'

        url = self.get_redirect_location(headers)
        req = parse_qs(urlsplit(url).query)
        assert 'SAMLRequest' in req
        assert 'RelayState' in req

        action, body = self.idp.handle_auth_req(req['SAMLRequest'][0],
                                                req['RelayState'][0],
                                                BINDING_HTTP_REDIRECT,
                                                'test1')
        status, headers, body = self.getPage(action, method='POST',
                                             body=urlencode(body))
        assert status == '302 Found'

        url = self.get_redirect_location(headers)
        req = parse_qs(urlsplit(url).query)
        assert 'SAMLResponse' in req
        assert 'RelayState' in req
        resp = self.sp.parse_authn_request_response(req['SAMLResponse'][0],
                                                    BINDING_HTTP_REDIRECT)
        identity = resp.ava
        assert identity["displayName"][0] == "Test1"
        assert identity["sn"][0] == "[email protected]"
        assert identity['o'][0] == "Small university"
Developer ID: ibrsp, Project: s2sproxy, Lines of code: 28, Source file: test_proxy_server.py

Example 11: _url

    def _url(self, hashed_name_func, name, force=False, hashed_files=None):
        """
        Return the non-hashed URL in DEBUG mode.
        """
        if settings.DEBUG and not force:
            hashed_name, fragment = name, ''
        else:
            clean_name, fragment = urldefrag(name)
            if urlsplit(clean_name).path.endswith('/'):  # don't hash paths
                hashed_name = name
            else:
                args = (clean_name,)
                if hashed_files is not None:
                    args += (hashed_files,)
                hashed_name = hashed_name_func(*args)

        final_url = super().url(hashed_name)

        # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
        # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
        query_fragment = '?#' in name  # [sic!]
        if fragment or query_fragment:
            urlparts = list(urlsplit(final_url))
            if fragment and not urlparts[4]:
                urlparts[4] = fragment
            if query_fragment and not urlparts[3]:
                urlparts[2] += '?'
            final_url = urlunsplit(urlparts)

        return unquote(final_url)
Developer ID: Damgaard, Project: django, Lines of code: 30, Source file: storage.py

Example 12: https_open

    def https_open(self, request):
        """
        Send an HTTP request, which can be either GET or POST,
        depending on req.has_data()

        Args:
            request - instance of urllib.request.Request
        """
        full_url = request.get_full_url()
        url_parts = parse.urlsplit(full_url)
        robo = None
        if url_parts.netloc in self.robots:
            robo = self.robots[url_parts.netloc]
        else:
            # Getting request url, for checking robots.txt
            host = parse.urlsplit(full_url)[1]
            rurl = parse.urlunparse(("http", host, "/robots.txt", "", ""))
            robo = reppy.cache.RobotsCache()
            robo.fetch(rurl, self.agent_name)
            self.robots[url_parts.netloc] = robo

        # Is url allow for crawler in robots.txt
        if robo.allowed(full_url, self.agent_name):
            # Return result of request
            return request.HTTPHandler.https_open(self, request)
        else:
            raise RuntimeError('Forbidden by robots.txt')
Developer ID: Armoken, Project: Learning, Lines of code: 27, Source file: crawler.py

Example 13: get_fetcher

def get_fetcher(url=None, *, item=dict()):
	RTMP_PROTOCOLS = {'rtmp', 'rtmpt', 'rtmpe', 'rtmpte'}
	
	url = item.get("url", url)
	if urlsplit(url).scheme in RTMP_PROTOCOLS:
		return RtmpFetcher(url, live=True)
	
	auth = comm.get_auth()
	protocol = urlsplit(auth['server']).scheme
	if protocol in RTMP_PROTOCOLS:
		(url, ext) = url.rsplit('.', 1) # strip the extension (.flv or .mp4)
		url = auth['playpath_prefix'] + url

		if ext == 'mp4':
			url = 'mp4:' + url

		rtmp_url = auth['rtmp_url']
		token = auth.get('token')
		if token:
		    # Cannot use urljoin() because
		    # the RTMP scheme would have to be added to its whitelist
		    rtmp_url += '?auth=' + token
		
		return RtmpFetcher(rtmp_url, playpath=url)
	else:
		return HdsFetcher(url, auth)
Developer ID: timwhite, Project: python-iview, Lines of code: 26, Source file: fetch.py

Example 14: zoom_article

 def zoom_article(self, ticket_id, article_id):
     art_descr = self.__db.article_description(article_id)
     if art_descr[4] & ART_TEXT:
         return eval(self.__db.article_message(article_id))
     self.echo("Zoom article:", ticket_id, article_id)
     url_beg = urlsplit(self.runtime.get("site"))[:3]
     params = (
         ("Action", "AgentTicketZoom"), ("Subaction", "ArticleUpdate"),
         ("TicketID", ticket_id), ("ArticleID", article_id),
         ("OTRSAgentInterface", self.runtime["OTRSAgentInterface"]))
     url = urlunsplit(url_beg + (urlencode(params), ""))
     pg = TicketsPage(self.core)
     page = pg.load(url)
     if page is None:
         return
     mail_header = page.get("mail_header", [])
     if "mail_src" in page:
         url = urlunsplit(url_beg[:2] + urlsplit(page["mail_src"])[2:])
         self.echo("Get message:", url)
         pg = MessagePage(self.core)
         try:
             mail_text = pg.load(url)
         except LoginError:
             mail_text = pg.login()
     else:
         mail_text = page["message_text"]
     if mail_header:
         mail_text.insert(0, ("\n",))
     for i in reversed(mail_header):
         mail_text.insert(0, ("%s\t%s\n" % i,))
     shrink_tupled_text(mail_text)
     self.__db.article_message(article_id, repr(mail_text))
     return mail_text
Developer ID: Lysovenko, Project: OTRS_US, Lines of code: 33, Source file: msg_ldr.py

Example 15: main

def main(GET):
    parser = argparse.ArgumentParser(description='Scrape a simple site.')
    parser.add_argument('url', help='the URL at which to begin')
    start_url = parser.parse_args().url
    starting_netloc = urlsplit(start_url).netloc
    url_filter = (lambda url: urlsplit(url).netloc == starting_netloc)
    scrape((GET, start_url), url_filter)
Developer ID: Kasfen, Project: networkprogramming, Lines of code: 7, Source file: rscrape1.py


Note: The urllib.parse.urlsplit examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by their respective developers, and copyright remains with the original authors; consult each project's license before distributing or using the code. Do not reproduce without permission.