

Python urlparse.urlparse Function Code Examples

This article collects typical usage examples of the urlparse.urlparse function from Python's standard library. If you have been wondering what exactly the Python urlparse function does, how to call it, or what real-world usage looks like, the curated examples below should help.


The following presents 15 code examples of the urlparse function, sorted by popularity by default.
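Before the examples, here is a minimal sketch of what urlparse returns (Python 2 throughout, matching the examples below; in Python 3 the same function lives at urllib.parse.urlparse):

# Python 2: urlparse lives in the top-level urlparse module.
from urlparse import urlparse

parts = urlparse('http://user@example.com:8080/path/page;params?q=1#frag')
print parts.scheme    # 'http'
print parts.netloc    # 'user@example.com:8080'
print parts.path      # '/path/page'
print parts.params    # 'params'
print parts.query     # 'q=1'
print parts.fragment  # 'frag'
print parts.port      # 8080 (an int, or None if absent)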

Example 1: sitemap_parse

# Assumed imports for this snippet (Python 2): requests, BeautifulSoup
# (imported as Soup) and urlparse.
import requests
from bs4 import BeautifulSoup as Soup
from urlparse import urlparse

def sitemap_parse(sitemap_option, astring, google_results, website_url):
    not_indexed = []
    not_sitemap = []
    error = ''
    sitemap_results = []
    # Normalise website_url down to "scheme://host/".
    website_scheme = urlparse(website_url).scheme
    if website_scheme != '':
        website_url = website_scheme + "://" + urlparse(website_url).netloc
    if website_url[-1] != '/':
        website_url += '/'
    if astring != '':
        if sitemap_option == 'sitemap':

            resp = requests.get(astring)
            soup = Soup(resp.content)

        elif sitemap_option == 'upload_sitemap':

            soup = Soup(astring)
        urls = soup.findAll('url')
        for u in urls:
            loc = u.find('loc').string
            sitemap_results.append(loc)
            if loc not in google_results:
                not_indexed.append(loc)
        for loc in google_results:
            if loc not in sitemap_results:
                not_sitemap.append(loc)
    return not_indexed, not_sitemap, error
Author: tanyaho, Project: web, Source: sitemap.py

Example 2: ConfigureHostnames

# Assumes GRR's module-level imports (Python 2): socket, urlparse, flags,
# config_lib and the RetryQuestion helper.
def ConfigureHostnames(config):
  """This configures the hostnames stored in the config."""
  if flags.FLAGS.external_hostname:
    hostname = flags.FLAGS.external_hostname
  else:
    try:
      hostname = socket.gethostname()
    except (OSError, IOError):
      print "Sorry, we couldn't guess your hostname.\n"
      hostname = ""  # avoid a NameError below; prompt with an empty default

    hostname = RetryQuestion("Please enter your hostname e.g. "
                             "grr.example.com", "^[\\.A-Za-z0-9-]+$", hostname)

  print """\n\n-=Server URL=-
The Server URL specifies the URL that the clients will connect to
communicate with the server. For best results this should be publicly
accessible. By default this will be port 8080 with the URL ending in /control.
"""
  frontend_url = RetryQuestion("Frontend URL", "^http://.*/$",
                               "http://%s:8080/" % hostname)
  config.Set("Client.server_urls", [frontend_url])

  frontend_port = urlparse.urlparse(frontend_url).port or config_lib.CONFIG.Get(
      "Frontend.bind_port")
  config.Set("Frontend.bind_port", frontend_port)

  print """\n\n-=AdminUI URL=-:
The UI URL specifies where the Administrative Web Interface can be found.
"""
  ui_url = RetryQuestion("AdminUI URL", "^http[s]*://.*$",
                         "http://%s:8000" % hostname)
  config.Set("AdminUI.url", ui_url)
  ui_port = urlparse.urlparse(ui_url).port or config_lib.CONFIG.Get(
      "AdminUI.port")
  config.Set("AdminUI.port", ui_port)
Author: JackWangCUMT, Project: grr, Source: config_updater.py
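Example 2 leans on the .port attribute, which is None when the URL carries no explicit port; that is why the `or config_lib.CONFIG.Get(...)` fallback works. A quick sketch:

import urlparse  # Python 2

print urlparse.urlparse('http://grr.example.com:8080/').port  # 8080
print urlparse.urlparse('http://grr.example.com/').port       # None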

Example 3: __call__

    # Assumed module-level context from the source project (Python 2):
    # urllib2, BeautifulSoup, urlparse, plus the shared lists visited_url
    # and total_collected_url and the crawler instance itself.
    def __call__(self, url, count_of_crawler):
        """
        Fetch the content at the given URL, collect every URL found in the
        content, then recurse into the first unvisited one.
        """
        try:
            page = urllib2.urlopen(url)
            soup = BeautifulSoup(page.read())

            links_on_page = map(lambda anchor: anchor.get('href'),
                                soup.find_all('a'))

            # Keep links that already carry a scheme; root-relative links get
            # the page's scheme and host prepended, anything else is joined
            # naively onto the page URL.
            cleaned_url = map(
                lambda link: link if urlparse(link).scheme
                and urlparse(url).netloc else (urlparse(url)
                .scheme + "://" + urlparse(url).netloc + link if
                link[0] == "/" else url + link), links_on_page)
            visited_url.append(url)
            total_collected_url.append(cleaned_url)
            next_url_to_visit = [next_url for next_url in cleaned_url
                                 if next_url not in visited_url
                                 and "#" not in next_url][0]

            if count_of_crawler and next_url_to_visit:
                count_of_crawler = crawler(next_url_to_visit,
                                           count_of_crawler - 1)

        except:
            print "It seems there is some issue in URL " + url

        return count_of_crawler
Author: singhalvibhor05, Project: Simple-crawler, Source: crawler.py

Example 4: absolute_url

from urlparse import urlparse  # Python 2; urllib.parse.urlparse in Python 3

def absolute_url(url, base_href):
    """
    >>> absolute_url('foo', 'http://base/whatever/ooo/fdsh')
    'http://base/whatever/ooo/foo'

    >>> absolute_url('foo/bar/', 'http://base')
    'http://base/foo/bar/'

    >>> absolute_url('/foo/bar', 'http://base/whatever/fdskf')
    'http://base/foo/bar'

    >>> absolute_url('\\n/foo/bar', 'http://base/whatever/fdskf')
    'http://base/foo/bar'

    >>> absolute_url('http://localhost/foo', 'http://base/whatever/fdskf')
    'http://localhost/foo'
    """
    url = url.strip()
    proto = urlparse(url)[0]
    if proto:
        return url

    base_url_parts = urlparse(base_href)
    base_server = '://'.join(base_url_parts[:2])
    if url.startswith('/'):
        return base_server + url
    else:
        path = base_url_parts[2]
        if '/' in path:
            path = path.rsplit('/', 1)[0] + '/'
        else:
            path = '/'
        return base_server + path + url
Author: BUAA-DreamTeam, Project: cola, Source: utils.py
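Example 4 indexes the parse result positionally: field 0 is the scheme, and joining the first two fields with '://' rebuilds the server part. A quick sketch:

from urlparse import urlparse

parts = urlparse('http://base/whatever/ooo/fdsh')
print parts[0]               # 'http'
print '://'.join(parts[:2])  # 'http://base'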

Example 5: checkRedir

    # Assumed module-level imports from the source project (Python 2):
    # urlparse, httplib, string, base64.
    def checkRedir(self, orig_path):
        # old_url = portal_url+item['_orig_path']
        # XXX: refers to target and not portal
        old_url = self.target + orig_path

        # this downloads file. We need a way to do this without the download
        _, host, targetpath, _, _, _ = urlparse.urlparse(self.target)
        if "@" in host:
            auth, host = host.split("@")
        else:
            auth = None

        conn = httplib.HTTPConnection(host)
        headers = {}
        if auth:
            auth = "Basic " + string.strip(base64.encodestring(auth))
            headers["Authorization"] = auth
        # /view is a hack as zope seems to send all content on head request
        conn.request("HEAD", targetpath + orig_path, headers=headers)
        res = conn.getresponse()
        redir = res.status == 301
        if redir and res.getheader("location"):
            _, _, oldpath, _, _, _ = urlparse.urlparse(res.getheader("location"))
            parts = oldpath.split("/")
            if parts[-1] == "view":
                parts = parts[:-1]
            return "/".join(parts)
        if res.status == 200:
            return orig_path
        return None
Author: ju55i, Project: transmogrify.ploneremote, Source: remoteconstructor.py

Example 6: __init__

    # Assumed module-level imports (Python 2): urljoin and urlparse from
    # urlparse, glob from glob, and the mapnik bindings.
    def __init__(self, layer, mapfile, fonts=None):
        """ Initialize Mapnik provider with layer and mapfile.

            XML mapfile keyword arg comes from TileStache config,
            and is an absolute path by the time it gets here.
        """
        maphref = urljoin(layer.config.dirpath, mapfile)
        scheme, h, path, q, p, f = urlparse(maphref)

        if scheme in ('file', ''):
            self.mapfile = path
        else:
            self.mapfile = maphref

        self.layer = layer
        self.mapnik = None

        engine = mapnik.FontEngine.instance()

        if fonts:
            fontshref = urljoin(layer.config.dirpath, fonts)
            scheme, h, path, q, p, f = urlparse(fontshref)

            if scheme not in ('file', ''):
                raise Exception('Fonts from "%s" can\'t be used by Mapnik' % fontshref)

            for font in glob(path.rstrip('/') + '/*.ttf'):
                engine.register_font(str(font))
Author: Outdooractive, Project: TileStache, Source: Mapnik.py
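Example 6 relies on the fact that a bare filesystem path parses with an empty scheme, while file:// and http:// URLs carry theirs; that is what the `scheme in ('file', '')` test distinguishes. With illustrative paths:

from urlparse import urlparse

print urlparse('/srv/tiles/style.xml').scheme         # ''
print urlparse('file:///srv/tiles/style.xml').scheme  # 'file'
print urlparse('http://host/style.xml').scheme        # 'http'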

Example 7: getRepStr

    # Assumes Python 2's: from urlparse import urlparse.
    def getRepStr(self):
        urlList1 = [i for i in self.url1.replace('http://', '').split('/') if i]
        urlList2 = [i for i in self.url2.replace('http://', '').split('/') if i]

        # Count the common leading path segments (capped at 10).
        n = 0
        while True:
            if urlList1[:n] == urlList2[:n]:
                n += 1
                if n > 10:
                    break
                continue
            break

        urlPart = 'http://' + '/'.join(urlList1[:n - 1])
        if urlparse(urlPart).netloc and ('.' not in urlparse(urlPart).path):
            urlPart += '/'

        urlListLen = len(urlList1[n - 1:])

        if urlListLen < 1:
            return (urlPart, './')

        if urlListLen >= 1:
            return (urlPart, urlListLen * '../', self.url1, self.url2)
Author: wowngasb, Project: kl_tool, Source: get_web.py

Example 8: test_that_checks_redirect_using_incorrect_query_values

    # Assumed module-level imports: requests, urlparse and urlencode.
    # Note: the original wrapped each assert in parentheses, i.e.
    # `assert (expr, msg)`, which asserts a non-empty tuple and therefore
    # always passes; the parentheses are dropped here so the checks run.
    def test_that_checks_redirect_using_incorrect_query_values(self, base_url):
        param = {
            'product': 'firefox-31.0',
            'lang': 'kitty_language',
            'os': 'stella'
        }
        response = self._head_request(base_url, params=param)

        assert requests.codes.not_found == response.status_code, \
            self.response_info_failure_message(base_url, param, response)

        parsed_url = urlparse(response.url)

        assert 'http' == parsed_url.scheme, \
            'Failed to redirect to the correct scheme. %s' % \
            self.response_info_failure_message(base_url, param, response)

        assert urlparse(base_url).netloc == parsed_url.netloc, \
            self.response_info_failure_message(base_url, param, response)

        assert urlencode(param) == parsed_url.query, \
            self.response_info_failure_message(base_url, param, response)

        assert 'Unknown' != self.get_x_backend_server(response), \
            'Failed, x-backend-server was not in the response object. %s' % \
            self.response_info_failure_message(base_url, param, response)
Author: ithompson4, Project: bouncer-tests, Source: test_redirects.py

Example 9: generateUrls

# Assumed context from the source project (Python 2): urlparse, plus the
# project's BASEDIR, getCrawlerPaths, GetFirstLevelDomain and RuleFile helpers.
def generateUrls(url):
    baseulp = urlparse(url)
    host = baseulp.netloc

    paths = getCrawlerPaths(url)

    urls = []
    rulefile = BASEDIR + '/lib/db/compresed_file.rule'
    for eachpath in paths:
        eachulp = urlparse(eachpath)
        if eachulp.path == '':
            host = eachulp.netloc
            domain = GetFirstLevelDomain(host)
            args = {'host': host, 'com': domain}
        else:
            # Use the last path segment as the candidate filename stem.
            pos = eachulp.path.rfind('/')
            tmp = eachulp.path[pos + 1:]
            args = {'com': tmp}

        rf = RuleFile(rulefile, args)
        rf._getRules()
        for i in rf.ret:
            urls.append(eachpath + '/' + i)

    ret = list(set(urls))
    ret.sort()
    return ret
Author: YHHK, Project: Hammer, Source: compressedfile.py

Example 10: get_show

    # Assumed module-level imports from the source add-on (Python 2):
    # urllib, urlparse, and the project's client and cleantitle modules.
    def get_show(self, imdb, tvdb, show, show_alt, year):
        try:
            query = self.search_link
            post = urllib.urlencode({'searchquery': show, 'searchin': '2'})

            result = ''
            links = [self.link_1, self.link_3]
            for base_link in links:
                result = client.source(urlparse.urljoin(base_link, query), post=post, headers=self.headers)
                if 'widget search-page' in str(result): break

            result = client.parseDOM(result, "div", attrs = { "class": "widget search-page" })[0]
            result = client.parseDOM(result, "td")

            shows = [cleantitle.tv(show), cleantitle.tv(show_alt)]
            years = ['(%s)' % str(year), '(%s)' % str(int(year)+1), '(%s)' % str(int(year)-1)]
            result = [(client.parseDOM(i, "a", ret="href")[-1], client.parseDOM(i, "a")[-1]) for i in result]
            result = [i for i in result if any(x == cleantitle.tv(i[1]) for x in shows)]
            result = [i[0] for i in result if any(x in i[1] for x in years)][0]

            url = client.replaceHTMLCodes(result)
            try: url = urlparse.parse_qs(urlparse.urlparse(url).query)['u'][0]
            except: pass
            url = urlparse.urlparse(url).path
            url = url.encode('utf-8')
            return url
        except:
            return
Author: Mendim, Project: tdbaddon, Source: iwatchonline_mv_tv.py

Example 11: sources

    # Assumed module-level imports from the source add-on (Python 2):
    # re, urlparse, and the project's client and proxy modules.
    def sources(self, url, hostDict, hostprDict):
        try:
            sources = []

            if url is None: return sources

            url = urlparse.urljoin(self.base_link, url)

            r = proxy.request(url, 'tv shows')

            links = client.parseDOM(r, 'a', ret='href', attrs = {'target': '.+?'})
            links = [x for y,x in enumerate(links) if x not in links[:y]]

            for i in links:
                try:
                    url = i
                    url = proxy.parse(url)
                    url = urlparse.parse_qs(urlparse.urlparse(url).query)['r'][0]
                    url = url.decode('base64')
                    url = client.replaceHTMLCodes(url)
                    url = url.encode('utf-8')

                    host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(url.strip().lower()).netloc)[0]
                    if not host in hostDict: raise Exception()
                    host = host.encode('utf-8')

                    sources.append({'source': host, 'quality': 'SD', 'language': 'en', 'url': url, 'direct': False, 'debridonly': False})
                except:
                    pass

            return sources
        except:
            return sources
Author: varunrai, Project: repository.magicality, Source: mywatchseries.py
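Examples 10 and 11 share a pattern: the target link is hidden in a query parameter of a wrapper URL, and parse_qs applied to the parsed query digs it out. A minimal sketch with a hypothetical wrapper URL:

import urlparse  # Python 2; parse_qs lives here from 2.6 onward

url = 'http://example.com/out?u=http%3A%2F%2Ftarget.example%2Fpage'
inner = urlparse.parse_qs(urlparse.urlparse(url).query)['u'][0]
print inner  # 'http://target.example/page'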

Example 12: mangle_url

    # Assumed module-level imports (Python 2): urlparse and the script's
    # script_unknown helper; self.nova_client comes from python-novaclient.
    def mangle_url(self, url):
        self.check_connection()

        try:
            endpoint_url = urlparse.urlparse(url)
        except Exception as e:
            script_unknown("you must provide an endpoint_url in the form "
                           + "<scheme>://<url>/ (%s)\n" % e)
        scheme = endpoint_url.scheme
        # urlparse returns '' (not None) for a missing scheme, so test
        # truthiness; the original compared against None and referenced an
        # out-of-scope exception variable in the message.
        if not scheme:
            script_unknown("you must provide an endpoint_url in the form "
                           + "<scheme>://<url>/\n")
        catalog_url = None
        try:
            catalog_url = urlparse.urlparse(
                self.nova_client.client.management_url)
        except Exception as e:
            script_unknown("unknown error parsing the catalog url : %s\n" % e)

        port = endpoint_url.port
        if port is None:
            if catalog_url.port is None:
                port = 8774
            else:
                port = catalog_url.port

        netloc = "%s:%i" % (endpoint_url.hostname, port)
        url = urlparse.urlunparse([scheme,
                                   netloc,
                                   catalog_url.path,
                                   catalog_url.params,
                                   catalog_url.query,
                                   catalog_url.fragment])
        self.nova_client.client.set_management_url(url)
Author: ShooterTT, Project: openstack-monitoring, Source: check_nova-instance.py
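Example 12 rebuilds the URL with urlparse.urlunparse, the inverse of urlparse: it takes the same six fields and reassembles them, which makes it easy to swap out a single component such as the netloc. For instance:

import urlparse  # Python 2

parts = urlparse.urlparse('http://example.com:8774/v2/servers?limit=5')
rebuilt = urlparse.urlunparse([parts.scheme, 'other-host:9292', parts.path,
                               parts.params, parts.query, parts.fragment])
print rebuilt  # 'http://other-host:9292/v2/servers?limit=5'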

Example 13: get_download_url_ssl

    # Assumed module-level imports: parse_qsl and urlparse, plus bedrock's
    # firefox_details helper.
    def get_download_url_ssl(self):
        """
        SSL-enabled links should be used for specific versions, except the
        Windows stub installers.
        """

        # SSL-enabled links won't be used for 26.0
        url = firefox_details.get_download_url("OS X", "pt-BR", "26.0")
        self.assertListEqual(
            parse_qsl(urlparse(url).query), [("product", "firefox-26.0"), ("os", "osx"), ("lang", "pt-BR")]
        )

        # SSL-enabled links won't be used for 27.0 Windows builds (but SSL
        # download is enabled by default for stub installers)
        url = firefox_details.get_download_url("Windows", "pt-BR", "27.0")
        self.assertListEqual(
            parse_qsl(urlparse(url).query), [("product", "firefox-27.0"), ("os", "win"), ("lang", "pt-BR")]
        )

        # SSL-enabled links will be used for 27.0 OS X builds
        url = firefox_details.get_download_url("OS X", "pt-BR", "27.0")
        self.assertListEqual(
            parse_qsl(urlparse(url).query), [("product", "firefox-27.0-SSL"), ("os", "osx"), ("lang", "pt-BR")]
        )

        # SSL-enabled links will be used for 27.0 Linux builds
        url = firefox_details.get_download_url("Linux", "pt-BR", "27.0")
        self.assertListEqual(
            parse_qsl(urlparse(url).query), [("product", "firefox-27.0-SSL"), ("os", "linux"), ("lang", "pt-BR")]
        )
Author: RiteshBhat, Project: bedrock, Source: tests.py

Example 14: get_all_href_list

# Assumed context from the source project (Python 2): re, urlparse and the
# project's DownloadUrl class.
def get_all_href_list(root_my_url, soup, file_encode):

    root_parse = urlparse.urlparse(root_my_url.get_abs_url())
    href_list = []

    if not root_parse.hostname:
        return href_list

    # get tags' href
    tag_list = soup.find_all(['a', 'img', 'link'])
    href_filter = r'#|\n|(mailto:)'

    for tag in tag_list:
        add_my_url = DownloadUrl(None, None, root_my_url.get_abs_path())

        if tag.get('href') and not re.search(href_filter, tag.get('href')):
            add_my_url.url = tag.get('href')
        elif tag.get('src'):
            add_my_url.url = tag.get('src')

        if add_my_url.url:
            temp_parse = urlparse.urlparse(add_my_url.url)
            if temp_parse.hostname:
                add_my_url.host = temp_parse.hostname
            else:
                add_my_url.host = root_parse.hostname
            href_list.append(add_my_url)

    return href_list
Author: voiddog, Project: python_web_spider, Source: spider.py
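Example 14 reads .hostname rather than .netloc; .hostname lower-cases the host and strips any port or credentials, which is usually what you want for host comparisons:

import urlparse  # Python 2

p = urlparse.urlparse('http://User@Example.COM:8080/index.html')
print p.netloc    # 'User@Example.COM:8080'
print p.hostname  # 'example.com'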

Example 15: __init__

    # Assumed module-level imports (Python 2): asyncore, socket, sys, time
    # and urlparse; self.proxies is a class attribute in the source file.
    def __init__(self, uri, consumer, extra_headers=None):
        asyncore.dispatcher_with_send.__init__(self)

        # turn the uri into a valid request
        scheme, host, path, params, query, fragment = urlparse.urlparse(uri)

        # use origin host
        self.host = host

        # get proxy settings, if any
        proxy = self.proxies.get(scheme)
        if proxy:
            scheme, host, x, x, x, x = urlparse.urlparse(proxy)

        assert scheme == "http", "only supports HTTP requests (%s)" % scheme

        if not path:
            path = "/"
        if params:
            path = path + ";" + params
        if query:
            path = path + "?" + query
        if proxy:
            path = scheme + "://" + self.host + path

        self.path = path

        # get port number
        try:
            host, port = host.split(":", 1)
            port = int(port)
        except (TypeError, ValueError):
            port = 80 # default port

        self.consumer = consumer

        self.status = None
        self.header = None

        self.bytes_in = 0
        self.bytes_out = 0

        self.content_type = None
        self.content_length = None
        self.content_encoding = None
        self.transfer_encoding = None

        self.data = ""

        self.chunk_size = None

        self.timestamp = time.time()

        self.extra_headers = extra_headers

        self.create_socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            self.connect((host, port))
        except socket.error:
            self.consumer.http(0, self, sys.exc_info())
Author: c0ns0le, Project: cygwin, Source: http_client.py
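Example 15 (like examples 5 and 6) uses the older positional style: the result of urlparse is a 6-tuple of (scheme, netloc, path, params, query, fragment) and can be unpacked directly:

import urlparse  # Python 2

scheme, host, path, params, query, fragment = urlparse.urlparse(
    'http://example.com/a/b;v=1?x=2#top')
print scheme, host, path, params, query, fragment
# http example.com /a/b v=1 x=2 top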


Note: the urlparse.urlparse examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from projects contributed by open-source developers; copyright in the source code remains with the original authors, and distribution and use are subject to each project's license. Do not reproduce without permission.