This article collects and summarizes typical usage examples of the urlparse.urldefrag function in Python. If you have been wondering what urldefrag does, how to call it, or what real-world uses of it look like, the curated examples below should help.
The following presents 15 code examples of the urldefrag function, ordered roughly by popularity.
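As a quick orientation (this snippet is not taken from any of the projects below), urldefrag splits a URL into the URL without its fragment plus the fragment itself; the examples target Python 2's urlparse module (in Python 3 the same function lives in urllib.parse):
import urlparse

url, fragment = urlparse.urldefrag('http://site.ru/1.html#4')
print url       # 'http://site.ru/1.html'
print fragment  # '4'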
Example 1: make_correct_link
import re
import urlparse

def make_correct_link(base, link):
    """
    Normalize a link found on the page `base`:
      http://...  -- passed through unchanged
      /...        -- the site root is prepended
      www. ...    -- http:// is prepended
      smth.html   -- the full path of the base page is prepended
    Intended as a thin wrapper over urlparse's functions that handles
    both absolute and relative URLs and strips any fragment
    (e.g. http://site.ru/1.html#4 -> http://site.ru/1.html).
    """
defrag_link, _ = urlparse.urldefrag(link)
defrag_base, _ = urlparse.urldefrag(base)
    # Case: 'g.html' on 'http://ya.ru/a' ==> 'http://ya.ru/a/g.html'
    # (a slash is appended to 'a' when it looks like an unslashed folder).
    # An unslashed folder is detected as: an empty query (unlike 'a.php?set=1'),
    # no dots in the path and no trailing slash.
scheme, netloc, url, params, query, fragment = urlparse.urlparse(defrag_base)
    if url and not query and not re.search(r"/$", url) and not re.search(r"\.", url):
url += '/'
defrag_base = urlparse.urlunparse( (scheme, netloc, url, params, query, fragment) )
#just rejoining all parts
return_link = urlparse.urljoin(defrag_base, defrag_link)
return return_link
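A brief usage sketch of the function above, with invented URLs that exercise the cases listed in its docstring:
print make_correct_link('http://ya.ru/a', 'g.html')
# -> 'http://ya.ru/a/g.html' (a slash is appended to the unslashed folder 'a')
print make_correct_link('http://site.ru/1.html', 'http://site.ru/2.html#4')
# -> 'http://site.ru/2.html' (an absolute link is passed through, minus its fragment)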
Example 2: crawlWeb
def crawlWeb(UrlafterConnect,keyword):
if not UrlafterConnect:
print("Url is empty")
return list()
#Get all the links
soup = BeautifulSoup(UrlafterConnect)
urllist = []
#check for the existence of keyword IR and crawl on those urls
if re.search(keyword, str(soup), re.IGNORECASE) != None:
for link in soup.find_all('a', href=True):
crawl = link.get('href')
crawl_url = crawl.encode('utf-8')
if not crawl_url:
continue
            # Links relative to /wiki: convert them to absolute http URLs
if crawl_url.startswith('/wiki'):
if (crawl_url.find(':') == -1) and (crawl_url != "/wiki/Main_Page"):
crawl_url = urlparse.urljoin("http://en.wikipedia.org",crawl_url)
crawl_url, frag = urlparse.urldefrag(crawl_url)
urllist.append(crawl_url)
else:
                # Keep only wiki links without colons in them that do not redirect to the main page
if crawl_url.startswith('http://en.wikipedia.org'):
if crawl_url != "http://en.wikipedia.org/wiki/Main_Page":
s = "http://en"
crawl = crawl_url.lstrip("http://en")
if crawl.find(':') == -1:
crawl_url, frag = urlparse.urldefrag(crawl_url)
urllist.append(crawl_url)
#Remove duplicate entries from the list while returning
return list(set(urllist))
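A small illustration of the /wiki branch above, using a made-up link:
import urlparse

link = '/wiki/Python_(programming_language)#History'
absolute = urlparse.urljoin("http://en.wikipedia.org", link)
print urlparse.urldefrag(absolute)[0]
# -> 'http://en.wikipedia.org/wiki/Python_(programming_language)'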
Example 3: _toc_from_html
def _toc_from_html(self, opf):
if 'toc' not in self.oeb.guide:
return False
self.log.debug('Reading TOC from HTML...')
itempath, frag = urldefrag(self.oeb.guide['toc'].href)
item = self.oeb.manifest.hrefs[itempath]
html = item.data
if frag:
elems = xpath(html, './/*[@id="%s"]' % frag)
if not elems:
elems = xpath(html, './/*[@name="%s"]' % frag)
elem = elems[0] if elems else html
while elem != html and not xpath(elem, './/h:a[@href]'):
elem = elem.getparent()
html = elem
titles = defaultdict(list)
order = []
for anchor in xpath(html, './/h:a[@href]'):
href = anchor.attrib['href']
href = item.abshref(urlnormalize(href))
path, frag = urldefrag(href)
if path not in self.oeb.manifest.hrefs:
continue
title = xml2text(anchor)
title = COLLAPSE_RE.sub(' ', title.strip())
if href not in titles:
order.append(href)
titles[href].append(title)
toc = self.oeb.toc
for href in order:
toc.add(' '.join(titles[href]), href)
return True
Example 4: get_links
def get_links(url, depth, atmost_count):
urldfg = urlparse.urldefrag(url)
url = urldfg[0]
urls_list = []
myopener = MyOpener()
try:
page = myopener.open(url)
except:
return []
text = page.read()
page.close()
url_parsed = urlparse.urlparse(url)
domain_name_url_arr = url_parsed.netloc.split(".")
soup = BeautifulSoup(text, "html.parser")
    for tag in soup.findAll('a', href=True):
        if atmost_count == 0:
            break
        tag['href'] = urlparse.urljoin(url, tag['href'])
        new_url = urlparse.urldefrag(tag['href'])[0]
        new_url_parsed = urlparse.urlparse(new_url)
        domain_name_new_url_arr = new_url_parsed.netloc.split('.')
        if len(domain_name_url_arr) >= 2 and len(domain_name_new_url_arr) >= 2:
            if domain_name_url_arr[-1] != domain_name_new_url_arr[-1] or domain_name_url_arr[-2] != domain_name_new_url_arr[-2]:
                continue
        else:
            continue
        if new_url[-4:] == '.pdf':
            continue
        if new_url not in urls_list:
            urls_list.append([new_url, depth + 1])
            atmost_count -= 1
return urls_list
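The same-domain filter above keeps only links whose netloc ends in the same two components as the start URL; a hypothetical illustration of that check:
import urlparse

a = urlparse.urlparse('http://docs.example.com/page').netloc.split('.')
b = urlparse.urlparse('http://www.example.com/other').netloc.split('.')
print a[-2:] == b[-2:]  # True: both end in 'example.com', so the link would be kept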
Example 5: __init__
def __init__(self, toc, j, renderlist, redirects):
self.typedoc = StringIO.StringIO()
self.toc = toc
self.subs = {} # type: Dict
self.docParent = {} # type: Dict
self.docAfter = {} # type: Dict
self.rendered = set() # type: Set
self.redirects = redirects
self.title = None # type: str
for t in j:
if "extends" in t:
for e in aslist(t["extends"]):
add_dictlist(self.subs, e, t["name"])
#if "docParent" not in t and "docAfter" not in t:
# add_dictlist(self.docParent, e, t["name"])
if t.get("docParent"):
add_dictlist(self.docParent, t["docParent"], t["name"])
if t.get("docChild"):
for c in aslist(t["docChild"]):
add_dictlist(self.docParent, t["name"], c)
if t.get("docAfter"):
add_dictlist(self.docAfter, t["docAfter"], t["name"])
_, _, metaschema_loader = schema.get_metaschema()
alltypes = schema.extend_and_specialize(j, metaschema_loader)
self.typemap = {} # type: Dict
self.uses = {} # type: Dict
self.record_refs = {} # type: Dict
for t in alltypes:
self.typemap[t["name"]] = t
try:
if t["type"] == "record":
self.record_refs[t["name"]] = []
for f in t.get("fields", []):
p = has_types(f)
for tp in p:
if tp not in self.uses:
self.uses[tp] = []
if (t["name"], f["name"]) not in self.uses[tp]:
_, frg1 = urlparse.urldefrag(t["name"])
_, frg2 = urlparse.urldefrag(f["name"])
self.uses[tp].append((frg1, frg2))
if tp not in basicTypes and tp not in self.record_refs[t["name"]]:
self.record_refs[t["name"]].append(tp)
except KeyError as e:
_logger.error("Did not find 'type' in %s", t)
raise
for f in alltypes:
if (f["name"] in renderlist or
((not renderlist) and
("extends" not in f) and
("docParent" not in f) and
("docAfter" not in f))):
self.render_type(f, 1)
Example 6: startElementNS
def startElementNS(self, name, qname, attrs):
stack = self.stack
stack.append(ElementHandler())
current = self.current
parent = self.parent
base = attrs.get(BASE, None)
if base is not None:
base, frag = urldefrag(base)
if parent and parent.base:
base = urljoin(parent.base, base)
else:
systemId = self.locator.getPublicId() or self.locator.getSystemId()
if systemId:
base = urljoin(systemId, base)
else:
if parent:
base = parent.base
if base is None:
systemId = self.locator.getPublicId() or self.locator.getSystemId()
if systemId:
base, frag = urldefrag(systemId)
current.base = base
language = attrs.get(LANG, None)
if language is None:
if parent:
language = parent.language
current.language = language
current.start(name, qname, attrs)
Example 7: _urljoin
def _urljoin(base, url):
"""
Construct a full ("absolute") URL by combining a "base URL" with another
URL. Informally, this uses components of the base URL, in particular the
addressing scheme, the network location and (part of) the path, to provide
missing components in the relative URL.
Additionally, the fragment identifier is preserved according to the HTTP
1.1 bis draft.
@type base: C{bytes}
@param base: Base URL.
@type url: C{bytes}
@param url: URL to combine with C{base}.
@return: An absolute URL resulting from the combination of C{base} and
C{url}.
@see: L{urlparse.urljoin}
@see: U{https://tools.ietf.org/html/draft-ietf-httpbis-p2-semantics-22#section-7.1.2}
"""
base, baseFrag = urldefrag(base)
url, urlFrag = urldefrag(urljoin(base, url))
return urljoin(url, b'#' + (urlFrag or baseFrag))
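A usage sketch of the fragment-preservation rule (the URLs are invented; under Python 2 the byte strings are ordinary str values):
print _urljoin(b'http://example.com/base#frag', b'/path')
# -> 'http://example.com/path#frag' (the base fragment is kept because the relative URL has none)
print _urljoin(b'http://example.com/base#frag', b'/path#other')
# -> 'http://example.com/path#other' (the relative URL's own fragment takes precedence)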
Example 8: parse_showings_table
def parse_showings_table(self, response):
movie_title = response.meta['movieTitle']
movie_url = response.meta['movieUrl']
showings_table_value = response.meta['showingsTableValue']
theater_url = response.meta['theaterUrl']
version = response.meta['version']
showings_table = response.xpath('//div[@class="cinema-movie clearfix"]/div[@value="' + showings_table_value + '"]')
at_least_one_showing_found = False
jump_links = showings_table.css('.jump-to-show').xpath('a')
if len(jump_links) >= 1:
jump_link = jump_links[-1]
if jump_link.xpath('text()').extract_first().endswith(u'>'):
jump_url = urldefrag(response.urljoin(jump_link.xpath('@href').extract_first()))[0]
request = scrapy.Request(jump_url, callback=self.parse_showings_table)
request.meta['movieTitle'] = movie_title
request.meta['movieUrl'] = movie_url
request.meta['showingsTableValue'] = showings_table_value
request.meta['theaterUrl'] = theater_url
request.meta['version'] = version
yield request
else:
for showings_column in showings_table.css('.cinema-movie-dates').xpath('li'):
for showing_cell in showings_column.xpath('ul/li/a'):
at_least_one_showing_found = True
dayAndMonth = showings_column.xpath('div[2]/text()').extract_first().split('/')
day = int(dayAndMonth[0])
month = int(dayAndMonth[1])
hourAndMinute = showing_cell.xpath('text()').extract_first().split(':')
hour = int(hourAndMinute[0])
minute = int(hourAndMinute[1])
#seating_info = showing_cell.xpath('@title').extract_first()[len('<div>'):len('</div>')]
seating_info = showing_cell.xpath('@title').extract_first()[len('<div>'):-len('</div>')].split('</div><div>')
date_obj = datetime(datetime.now().year, month, day, hour, minute)
if date_obj < datetime.now():
date_obj = datetime(datetime.now().year + 1, month, day, hour, minute)
showing = ShowingItem()
showing['movieTitle'] = movie_title
showing['movieUrl'] = movie_url
showing['theaterUrl'] = theater_url
showing['seatingInfo'] = seating_info
showing['showingUrl'] = response.urljoin(showing_cell.xpath('@href').extract_first())
showing['start'] = date_obj.strftime('%Y-%m-%dT%H:%M:00')
showing['version'] = version
yield showing
if at_least_one_showing_found:
next_page = showings_table.css('.showtimes-extra').xpath('a[last()]')
if next_page:
next_page_url = urldefrag(response.urljoin(next_page.xpath('@href')[0].extract()))[0]
request = scrapy.Request(next_page_url, callback=self.parse_showings_table)
request.meta['movieTitle'] = movie_title
request.meta['movieUrl'] = movie_url
request.meta['showingsTableValue'] = showings_table_value
request.meta['theaterUrl'] = theater_url
request.meta['version'] = version
yield request
Example 9: get_links
def get_links(response):
if 300 <= response.status_code < 400 and response.headers['location']:
# redirect
yield urlparse.urldefrag(urlparse.urljoin(response.url, response.headers['location'], False))[0]
try:
html = beautify(response)
for i in html.findAll('a', href=True):
yield urlparse.urldefrag(urlparse.urljoin(response.url, i['href'], False))[0]
except NotHtmlException:
pass
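A minimal sketch of the normalization applied to a redirect above, with a hypothetical Location value:
import urlparse

location = '/login#form'
print urlparse.urldefrag(urlparse.urljoin('http://example.com/a', location, False))[0]
# -> 'http://example.com/login'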
Example 10: job
def job(self, joborder, basedir, output_callback, **kwargs):
# Validate job order
validate.validate_ex(self.names.get_name("input_record_schema", ""), joborder)
requirements = kwargs.get("requirements", []) + self.tool.get("requirements", [])
hints = kwargs.get("hints", []) + self.tool.get("hints", [])
steps = [makeTool(step, basedir) for step in self.tool.get("steps", [])]
random.shuffle(steps)
self.state = {}
self.processStatus = "success"
for i in self.tool["inputs"]:
(_, iid) = urlparse.urldefrag(i["id"])
if iid in joborder:
self.state[i["id"]] = WorkflowStateItem(i, copy.deepcopy(joborder[iid]))
elif "default" in i:
self.state[i["id"]] = WorkflowStateItem(i, copy.deepcopy(i["default"]))
else:
raise WorkflowException("Input '%s' not in input object and does not have a default value." % (i["id"]))
for s in steps:
for out in s.tool["outputs"]:
self.state[out["id"]] = None
s.completed = False
completed = 0
while completed < len(steps):
made_progress = False
completed = 0
for step in steps:
if step.completed:
completed += 1
else:
for newjob in self.try_make_job(step, basedir, requirements=requirements, hints=hints, **kwargs):
if newjob:
made_progress = True
yield newjob
if not made_progress and completed < len(steps):
yield None
wo = {}
for i in self.tool["outputs"]:
if "connect" in i:
(_, src) = urlparse.urldefrag(i['id'])
if i["connect"]["source"] not in self.state:
raise WorkflowException("Connect source '%s' on parameter '%s' does not exist" % (i["connect"]["source"], inp["id"]))
wo[src] = self.state[i["connect"]["source"]].value
output_callback(wo, self.processStatus)
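In the loops above, urldefrag recovers the short parameter name from a fully qualified id; for a hypothetical CWL-style id:
import urlparse

print urlparse.urldefrag("file:///tools/echo.cwl#message")
# -> ('file:///tools/echo.cwl', 'message'); 'message' is the key looked up in joborder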
Example 11: crawl_web
def crawl_web( scope, tocrawl, index, graph, url_info, limits = [-1, 0, 0.0, 1.0]): # returns index, graph of inlinks
tocrawl_next = [] # used for depth control
depth = 0
pages = 0
max_pages, max_depth, max_time, time_delay = limits
if max_time > 0.0: start_time = time()
while tocrawl or tocrawl_next:
if not tocrawl:
#
# Descent one more level (depth)
#
tocrawl = tocrawl_next
tocrawl_next = []
depth += 1
if max_depth >= 0 and depth > max_depth:
print 'Reached maximum depth. Interrupting crawler.'
break
page = tocrawl.pop(0)
# Remove fragment portion from the url
page = urlparse.urldefrag(page)[0]
if not page in graph:
pages += 1
print 'Crawling page:', page
if max_time != 0.0: print 'time = ', time()-start_time, ' max_time = ', max_time
if max_pages > 0:
print 'Pages crawled:', pages, 'max_pages = ', max_pages
# [ToDo:]Transform meta_data into a dictionary
text, outlinks, meta_data = get_page( page)
add_page_to_index( index, page, text)
# Need to filter outlinks only to current scope
outlinks = [ [urlparse.urldefrag(l[0])[0],l[1]] for l in outlinks if is_inscope( scope, l[0]) and (l[0].endswith('.html') or l[0].endswith('.htm')) ]
newlinks = [ urlparse.urldefrag(l[0])[0] for l in outlinks]
graph[page] = outlinks
url_info[page] = meta_data
tocrawl_next = list( set(tocrawl_next + newlinks))
if pages >= max_pages:
print 'Reached number of pages limit. Interrupting crawler.'
break
            if max_time > 0.0 and time() - start_time > max_time:
print 'Reached time limit. Interrupting crawler.'
break
tocrawl = list( set(tocrawl + tocrawl_next))
return tocrawl, index, graph, url_info
Example 12: __init__
def __init__(self, request, timeout=180):
self.url = urldefrag(request.url)[0]
self.method = request.method
self.body = request.body or None
self.headers = Headers(request.headers)
self.response_headers = None
self.timeout = request.meta.get('download_timeout') or timeout
self.start_time = time()
self.deferred = defer.Deferred().addCallback(self._build_response, request)
# Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
# to have _disconnectedDeferred. See Twisted r32329.
    # As Scrapy implements its own redirect handling, there is no need
    # to add the _waitForDisconnect callback.
# Specifically this avoids the AttributeError exception when
# clientConnectionFailed method is called.
self._disconnectedDeferred = defer.Deferred()
self._set_connection_attributes(request)
# set Host header based on url
self.headers.setdefault('Host', self.netloc)
    # set Content-Length based on the length of the body
if self.body is not None:
self.headers['Content-Length'] = len(self.body)
# just in case a broken http/1.1 decides to keep connection alive
self.headers.setdefault("Connection", "close")
Example 13: grab_links
def grab_links(self):
if self.document is not None:
for item in self.document.xpath('//a/@href'):
item = urldefrag(item)[0]
url = urlparse(item)
if url.geturl() and item not in self.crawler.visited_urls and url.hostname in self.processor.allowed_urls:
self.crawler.urls.put(item)
Example 14: write_opf
def write_opf(self, guide, toc, spine, resource_map):
mi = self.header.exth.mi
if (self.cover_offset is not None and self.cover_offset <
len(resource_map)):
mi.cover = resource_map[self.cover_offset]
if len(list(toc)) < 2:
self.log.warn('KF8 has no metadata Table of Contents')
for ref in guide:
if ref.type == 'toc':
href = ref.href()
href, frag = urldefrag(href)
if os.path.exists(href.replace('/', os.sep)):
try:
toc = self.read_inline_toc(href, frag)
except:
self.log.exception('Failed to read inline ToC')
opf = OPFCreator(os.getcwdu(), mi)
opf.guide = guide
def exclude(path):
return os.path.basename(path) == 'debug-raw.html'
opf.create_manifest_from_files_in([os.getcwdu()], exclude=exclude)
opf.create_spine(spine)
opf.set_toc(toc)
with open('metadata.opf', 'wb') as of, open('toc.ncx', 'wb') as ncx:
opf.render(of, ncx, 'toc.ncx')
return 'metadata.opf'
Example 15: serialize_href
def serialize_href(self, href, base=None):
'''
Serialize the href attribute of an <a> or <reference> tag. It is
serialized as filepos="000000000" and a pointer to its location is
stored in self.href_offsets so that the correct value can be filled in
at the end.
'''
hrefs = self.oeb.manifest.hrefs
try:
path, frag = urldefrag(urlnormalize(href))
except ValueError:
# Unparseable URL
return False
if path and base:
path = base.abshref(path)
if path and path not in hrefs:
return False
buf = self.buf
item = hrefs[path] if path else None
if item and item.spine_position is None:
return False
path = item.href if item else base.href
href = '#'.join((path, frag)) if frag else path
buf.write(b'filepos=')
self.href_offsets[href].append(buf.tell())
buf.write(b'0000000000')
return True