This article collects typical usage examples of the urldefrag function from Python's urllib.parse module. If you are unsure what urldefrag does or how to use it, the curated code examples below may help.
The following shows 15 code examples of the urldefrag function, sorted by popularity by default.
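Before the examples, a minimal sketch of what urldefrag itself returns (Python 3): it splits a URL into a fragment-free URL and the fragment, as a tuple-like DefragResult.

from urllib.parse import urldefrag

url, fragment = urldefrag("https://example.com/page?x=1#section-2")
print(url)       # https://example.com/page?x=1
print(fragment)  # section-2

# URLs without a fragment yield an empty fragment string.
print(urldefrag("https://example.com/page"))
# DefragResult(url='https://example.com/page', fragment='')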
Example 1: startElementNS
def startElementNS(self, name, qname, attrs):
    stack = self.stack
    stack.append(ElementHandler())
    current = self.current
    parent = self.parent
    base = attrs.get(BASE, None)
    if base is not None:
        base, frag = urldefrag(base)
        if parent and parent.base:
            base = urljoin(parent.base, base)
        else:
            systemId = self.locator.getPublicId() \
                or self.locator.getSystemId()
            if systemId:
                base = urljoin(systemId, base)
    else:
        if parent:
            base = parent.base
        if base is None:
            systemId = self.locator.getPublicId() \
                or self.locator.getSystemId()
            if systemId:
                base, frag = urldefrag(systemId)
    current.base = base
    language = attrs.get(LANG, None)
    if language is None:
        if parent:
            language = parent.language
    current.language = language
    current.start(name, qname, attrs)
Example 2: __init__
def __init__(self, url, previous=None, **info):
    # Apply the simple idempotent optimizations to all urls (no need to
    # ever deal with "HTTP://.."). This means case-sensitivity, and a
    # whole lot of other things that the urlnorm library will do for us.
    # We call this the original url, even though it is a bit of a lie.
    try:
        self.original_url = urlnorm.norm(url)
    except urlnorm.InvalidUrl as e:
        raise urlnorm.InvalidUrl('{}: {}'.format(e, url))

    # For the normalized url that we'll be exposing, remove the
    # fragment, and treat https and http the same.
    url, fragment = urldefrag(self.original_url)
    self.lossy_url_data = {'fragment': fragment}
    if url.startswith('https:'):
        url = 'http' + url[5:]
        self.lossy_url_data.update({'protocol': 'https'})
    self.url = url

    self.set_previous(previous)
    self.info = info
    self.post = None

    # Runtime data
    self.response = None
    self.exception = None
    self.retries = 0
Example 3: getlinks
def getlinks(pageurl, pageresponse, domain):
    """Returns a list of links from this page to be crawled.
    pageurl = URL of this page
    pageresponse = page content; response object from requests module
    domain = domain being crawled (None to return links to *any* domain)
    """
    soup = bs4.BeautifulSoup(pageresponse.text, "html.parser")
    # get target URLs for all links on the page
    links = [a.attrs.get('href') for a in soup.select('a[href]')]
    # remove fragment identifiers
    links = [urldefrag(link)[0] for link in links]
    # remove any empty strings
    links = [link for link in links if link]
    # if it's a relative link, change to absolute
    links = [link if bool(urlparse(link).netloc) else urljoin(pageurl, link)
             for link in links]
    # if only crawling a single domain, remove links to other domains
    if domain:
        links = [link for link in links if urlparse(link).netloc == domain]
    return links
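A hypothetical driver for the getlinks() function above, assuming the requests and bs4 packages used by the original module are installed; the start URL and domain are illustrative only.

import requests

start = "https://example.com/"
response = requests.get(start)
for link in getlinks(start, response, "example.com"):
    print(link)  # absolute, fragment-free URLs limited to example.com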
Example 4: get_div_link
def get_div_link(self, tip):
    tag_a = tip.parent.find('a', class_='qlink')
    if tag_a:
        url = tag_a.get('href')
        return urldefrag(url)[0]
    else:
        return ''
Example 5: url
def url(self, name, force=False):
    """
    Returns the real URL in DEBUG mode.
    """
    if settings.DEBUG and not force:
        hashed_name, fragment = name, ''
    else:
        clean_name, fragment = urldefrag(name)
        if urlsplit(clean_name).path.endswith('/'):  # don't hash paths
            hashed_name = name
        else:
            cache_key = self.cache_key(name)
            hashed_name = self.cache.get(cache_key)
            if hashed_name is None:
                hashed_name = self.hashed_name(clean_name).replace('\\', '/')
                # set the cache if there was a miss
                # (e.g. if cache server goes down)
                self.cache.set(cache_key, hashed_name)

    final_url = super(CachedFilesMixin, self).url(hashed_name)

    # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
    # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
    query_fragment = '?#' in name  # [sic!]
    if fragment or query_fragment:
        urlparts = list(urlsplit(final_url))
        if fragment and not urlparts[4]:
            urlparts[4] = fragment
        if query_fragment and not urlparts[3]:
            urlparts[2] += '?'
        final_url = urlunsplit(urlparts)

    return unquote(final_url)
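The fragment handling at the end is easy to miss: urldefrag strips #iefix-style fragments before hashing, and they are re-attached to the hashed URL afterwards. A standalone sketch of that mechanism with plain urllib calls (not Django itself; the hashed URL below is made up):

from urllib.parse import urldefrag, urlsplit, urlunsplit

name = "fonts/myfont.eot#iefix"
clean_name, fragment = urldefrag(name)         # ('fonts/myfont.eot', 'iefix')
final_url = "/static/fonts/myfont.abc123.eot"  # hypothetical hashed URL
urlparts = list(urlsplit(final_url))
if fragment and not urlparts[4]:               # index 4 is the fragment slot
    urlparts[4] = fragment
print(urlunsplit(urlparts))                    # /static/fonts/myfont.abc123.eot#iefix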
Example 6: oauth_callback
def oauth_callback():
    if not settings.OAUTH:
        abort(404)
    resp = oauth.provider.authorized_response()
    if resp is None or isinstance(resp, OAuthException):
        log.warning("Failed OAuth: %r", resp)
        return Unauthorized("Authentication has failed.")

    response = signals.handle_oauth_session.send(provider=oauth.provider,
                                                 oauth=resp)
    for (_, role) in response:
        if role is None:
            continue
        db.session.commit()
        update_role(role)
        log.info("Logged in: %r", role)
        request.authz = Authz.from_role(role)
        record_audit(Audit.ACT_LOGIN)
        token = request.authz.to_token(role=role)
        token = token.decode('utf-8')
        state = request.args.get('state')
        next_url = get_best_next_url(state, request.referrer)
        next_url, _ = urldefrag(next_url)
        next_url = '%s#token=%s' % (next_url, token)
        return redirect(next_url)

    log.error("No OAuth handler for %r was installed.", oauth.provider.name)
    return Unauthorized("Authentication has failed.")
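The urldefrag call near the end guards against next_url already carrying a fragment, which would otherwise collide with the #token=... fragment being appended. A small sketch under assumed values (the URL and token string are made up):

from urllib.parse import urldefrag

next_url = "https://app.example.com/search#old-state"
token = "abc123"  # hypothetical token string
next_url, _ = urldefrag(next_url)
print('%s#token=%s' % (next_url, token))
# https://app.example.com/search#token=abc123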
Example 7: replace_refs
def replace_refs(cls, obj, _recursive=False, **kwargs):
    """
    Returns a deep copy of `obj` with all contained JSON reference objects
    replaced with :class:`JsonRef` instances.

    :param obj: If this is a JSON reference object, a :class:`JsonRef`
        instance will be created. If `obj` is not a JSON reference object,
        a deep copy of it will be created with all contained JSON
        reference objects replaced by :class:`JsonRef` instances
    :param base_uri: URI to resolve relative references against
    :param loader: Callable that takes a URI and returns the parsed JSON
        (defaults to global ``jsonloader``, a :class:`JsonLoader` instance)
    :param jsonschema: Flag to turn on `JSON Schema mode
        <http://json-schema.org/latest/json-schema-core.html#anchor25>`_.
        'id' keyword changes the `base_uri` for references contained within
        the object
    :param load_on_repr: If set to ``False``, :func:`repr` call on a
        :class:`JsonRef` object will not cause the reference to be loaded
        if it hasn't already. (defaults to ``True``)
    """
    store = kwargs.setdefault("_store", _URIDict())
    base_uri, frag = urlparse.urldefrag(kwargs.get("base_uri", ""))
    store_uri = None  # If this does not get set, we won't store the result
    if not frag and not _recursive:
        store_uri = base_uri
    try:
        if kwargs.get("jsonschema") and isinstance(obj["id"], basestring):
            kwargs["base_uri"] = urlparse.urljoin(
                kwargs.get("base_uri", ""), obj["id"]
            )
            store_uri = kwargs["base_uri"]
    except (TypeError, LookupError):
        pass

    try:
        if not isinstance(obj["$ref"], basestring):
            raise TypeError
    except (TypeError, LookupError):
        pass
    else:
        return cls(obj, **kwargs)

    # If our obj was not a json reference object, iterate through it,
    # replacing children with JsonRefs
    kwargs["_recursive"] = True
    path = list(kwargs.pop("_path", ()))
    if isinstance(obj, Mapping):
        obj = type(obj)(
            (k, cls.replace_refs(v, _path=path+[k], **kwargs))
            for k, v in iteritems(obj)
        )
    elif isinstance(obj, Sequence) and not isinstance(obj, basestring):
        obj = type(obj)(
            cls.replace_refs(v, _path=path+[i], **kwargs) for i, v in enumerate(obj)
        )
    if store_uri is not None:
        store[store_uri] = obj
    return obj
Example 8: getlinks
def getlinks(pageurl, domain, soup):
    """Returns a list of links from this page to be crawled.
    pageurl = URL of this page
    domain = domain being crawled (None to return links to *any* domain)
    soup = BeautifulSoup object for this page
    """
    # get target URLs for all links on the page
    links = [a.attrs.get('href') for a in soup.select('a[href]')]
    # remove fragment identifiers
    links = [urldefrag(link)[0] for link in links]
    # remove any empty strings
    links = [link for link in links if link]
    # if it's a relative link, change to absolute
    links = [link if bool(urlparse(link).netloc) else urljoin(pageurl, link)
             for link in links]
    # if only crawling a single domain, remove links to other domains
    if domain:
        links = [link for link in links if samedomain(urlparse(link).netloc, domain)]
    return links
Example 9: validate_url
def validate_url(url, parent_url='http:'):
    """
    Validate a URL to be a string having an explicit recognized scheme.

    Arguments:
        url: string URL
        parent_url: optional string URL from which to inherit an implicit
            scheme.

    Returns: dict having:
        valid: boolean truth value.
        url: string modified URL.
    """
    if bytes == type(url):
        url = url.decode()
    parsed_url = urlparse(url)
    if 0 < len(parsed_url.path) and '/' == parsed_url.path[0]:
        url = urldefrag(urljoin(parent_url, url))[0]
    elif not parsed_url.scheme:
        parent_scheme = urlparse(parent_url).scheme or 'http'
        url = parent_scheme + ':' + url
        parsed_url = urlparse(url)
    valid = parsed_url.scheme in ('http', 'https', '') and \
        bool(parsed_url.netloc)
    return {'valid': valid, 'url': url}
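Illustrative calls to validate_url(); the expected results are inferred by tracing the logic above, not taken from the original project's tests.

print(validate_url('https://example.com/a#top'))
# {'valid': True, 'url': 'https://example.com/a'}
print(validate_url('//example.com/page'))
# {'valid': True, 'url': 'http://example.com/page'}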
Example 10: validate
def validate(url):
    if url in visitedUrls: return
    visitedUrls.append(url)
    try:
        content = urlopen(url).read().decode("utf8")
    except:
        # Assume the content is binary.
        return

    wikiUrls = []
    invalidUrls = []
    # This may seem redundant, but without the `.find_all('a')`, soup will also
    # contain the `DocType` element which does not have an `href` attribute.
    # See <http://stackoverflow.com/questions/17943992/beautifulsoup-and-soupstrainer-for-getting-links-dont-work-with-hasattr-returni>.
    soup = BeautifulSoup(content, parse_only=SoupStrainer('a', href=True)).find_all('a')
    for externalUrl in soup:
        fullExternalUrl = urljoin(url, urldefrag(externalUrl['href']).url)
        if baseUrl in fullExternalUrl and \
                not fullExternalUrl.endswith('/_history'):
            if externalUrl.has_attr('class') and 'absent' in externalUrl['class']:
                invalidUrls.append(fullExternalUrl)
            else:
                wikiUrls.append(fullExternalUrl)
    if len(invalidUrls) > 0:
        invalidWikiPages.append((url, invalidUrls))
    for wikiUrl in wikiUrls:
        if wikiUrl not in visitedUrls:
            validate(wikiUrl)
Example 11: _parse
def _parse(self, page: BeautifulSoup, url):
    seasons = OrderedDict()
    eqg = OrderedSet()
    child = page.select_one("#WikiaArticle h2")
    season = child.text
    while child.next_sibling:
        child = child.next_sibling
        if child.name == "table":
            for a in child.find_all("a", string="Transcript"):
                if not a.has_attr("class") or "new" not in a["class"]:
                    episode_url, fragment = urldefrag(a["href"])
                    episode_url = urljoin(url, episode_url)
                    if "Equestria Girls" not in season:
                        if season not in seasons:
                            seasons[season] = OrderedSet()
                        seasons[season].append(episode_url)
                    else:
                        eqg.append(episode_url)
            continue
        if child.name == "h2":
            season = child.text
            continue
    seasons["Equestria Girls"] = eqg
    return seasons
Example 12: resolving
def resolving(self, ref):
    """
    Context manager which resolves a JSON ``ref`` and enters the
    resolution scope of this ref.

    :argument str ref: reference to resolve
    """
    full_uri = urlparse.urljoin(self.resolution_scope, ref)
    uri, fragment = urlparse.urldefrag(full_uri)

    if uri in self.store:
        document = self.store[uri]
    elif not uri or uri == self.base_uri:
        document = self.referrer
    else:
        document = self.resolve_remote(uri)

    old_base_uri, old_referrer = self.base_uri, self.referrer
    self.base_uri, self.referrer = uri, document
    try:
        with self.in_scope(uri):
            yield self.resolve_fragment(document, fragment)
    finally:
        self.base_uri, self.referrer = old_base_uri, old_referrer
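A standalone sketch of the scope arithmetic used above, with plain Python 3 urllib calls rather than the surrounding resolver class: the reference is joined against the current resolution scope, then the JSON-pointer fragment is split off.

from urllib.parse import urljoin, urldefrag

resolution_scope = "http://example.com/schemas/person.json"
ref = "address.json#/definitions/street"
full_uri = urljoin(resolution_scope, ref)
uri, fragment = urldefrag(full_uri)
print(uri)       # http://example.com/schemas/address.json
print(fragment)  # /definitions/street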
Example 13: _url
def _url(self, hashed_name_func, name, force=False, hashed_files=None):
    """
    Return the non-hashed URL in DEBUG mode.
    """
    if settings.DEBUG and not force:
        hashed_name, fragment = name, ''
    else:
        clean_name, fragment = urldefrag(name)
        if urlsplit(clean_name).path.endswith('/'):  # don't hash paths
            hashed_name = name
        else:
            args = (clean_name,)
            if hashed_files is not None:
                args += (hashed_files,)
            hashed_name = hashed_name_func(*args)

    final_url = super().url(hashed_name)

    # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
    # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
    query_fragment = '?#' in name  # [sic!]
    if fragment or query_fragment:
        urlparts = list(urlsplit(final_url))
        if fragment and not urlparts[4]:
            urlparts[4] = fragment
        if query_fragment and not urlparts[3]:
            urlparts[2] += '?'
        final_url = urlunsplit(urlparts)

    return unquote(final_url)
Example 14: extract_domains
def extract_domains(site_text):
    domains = set()
    only_a_tags = SoupStrainer("a")
    for link in BeautifulSoup(site_text, "html.parser", parse_only=only_a_tags):
        if link.has_attr('href') and urlparse(link["href"]).scheme not in ["", "mailto"]:
            domains.add(urldefrag(link["href"])[0])
    return list(domains)
Example 15: splitDecodeFragment
def splitDecodeFragment(url):
    if url is None:  # urldefrag returns byte strings for none, instead of unicode strings
        return _STR_UNICODE(""), _STR_UNICODE("")
    urlPart, fragPart = urldefrag(url)
    if isPy3:
        return (urlPart, unquote(fragPart, "utf-8", errors=None))
    else:
        return _STR_UNICODE(urlPart), unquote(_STR_UNICODE(fragPart), "utf-8", errors=None)
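A small illustration of the Python 3 branch: urldefrag itself does not percent-decode the fragment, which is why the function unquotes it separately.

from urllib.parse import urldefrag, unquote

urlPart, fragPart = urldefrag("doc.xml#element(%2F1%2F2)")
print(fragPart)           # element(%2F1%2F2)
print(unquote(fragPart))  # element(/1/2)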