本文整理汇总了Python中urllib.URLopener.info方法的典型用法代码示例。如果您正苦于以下问题:Python URLopener.info方法的具体用法?Python URLopener.info怎么用?Python URLopener.info使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类urllib.URLopener的用法示例。
在下文中一共展示了URLopener.info方法的4个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: read_component_sitemap
# 需要导入模块: from urllib import URLopener [as 别名]
# 或者: from urllib.URLopener import info [as 别名]
def read_component_sitemap(self, sitemapindex_uri, sitemap_uri, sitemap, sitemapindex_is_file):
    """Read one component sitemap listed in a sitemapindex and merge it in.

    Every resource yielded by parsing the component sitemap is added to
    self.resources.

    Arguments:
      sitemapindex_uri -- URI of the sitemapindex that lists the component;
          used for the authority check and in error messages.
      sitemap_uri -- URI of the component sitemap to read.
      sitemap -- object whose parse_xml(fh=..., sitemapindex=False) method
          is used to parse the component.
      sitemapindex_is_file -- true if the sitemapindex itself was read from
          a local file. In that case a non-file sitemap_uri is mapped to a
          local path with self.mapper.src_to_dst(); otherwise the authority
          check is applied (when self.check_url_authority is set).

    Raises ListBaseIndexError if the sitemapindex has no authority over
    sitemap_uri or if the component sitemap cannot be opened.
    """
    if sitemapindex_is_file:
        if not self.is_file_uri(sitemap_uri):
            # Attempt to map URI to local file
            remote_uri = sitemap_uri
            sitemap_uri = self.mapper.src_to_dst(remote_uri)
            self.logger.info("Mapped %s to local file %s" % (remote_uri, sitemap_uri))
    else:
        # The individual sitemaps should be at a URL (scheme/server/path)
        # that the sitemapindex URL can speak authoritatively about
        if (self.check_url_authority and
                not UrlAuthority(sitemapindex_uri).has_authority_over(sitemap_uri)):
            raise ListBaseIndexError("The sitemapindex (%s) refers to sitemap at a location it does not have authority over (%s)" % (sitemapindex_uri, sitemap_uri))
    try:
        fh = URLopener().open(sitemap_uri)
    except IOError as e:
        raise ListBaseIndexError("Failed to load sitemap from %s listed in sitemap index %s (%s)" % (sitemap_uri, sitemapindex_uri, str(e)))
    self.num_files += 1
    try:
        # Get the Content-Length if we can (works fine for local files).
        # A missing header raises KeyError, or yields None (TypeError from
        # int) on message classes that return None; a malformed value
        # raises ValueError. None of these should abort the read.
        try:
            length = int(fh.info()['Content-Length'])
        except (KeyError, TypeError, ValueError):
            # If we don't get a length then c'est la vie. Do not report
            # self.content_length here: it still holds the value from
            # whatever was read previously.
            size_text = "unknown size"
        else:
            self.content_length = length
            self.bytes_read += length
            size_text = "%d bytes" % length
        self.logger.info("Reading sitemap from %s (%s)" % (sitemap_uri, size_text))
        component = sitemap.parse_xml(fh=fh, sitemapindex=False)
        # Copy resources into self, check any metadata
        for r in component:
            self.resources.add(r)
    finally:
        # Close only after the component has been fully consumed
        fh.close()
示例2: read
# 需要导入模块: from urllib import URLopener [as 别名]
# 或者: from urllib.URLopener import info [as 别名]
def read(self, uri=None, resources=None, index_only=False):
    """Read sitemap from a URI including handling sitemapindexes

    The document at uri is parsed into self. If it turns out to be a
    sitemapindex then, unless index_only is set, each listed sitemap is
    read with self.read_component_sitemap() and its resources are
    collected in a fresh self.resources.

    If index_only is True then individual sitemaps referenced in a
    sitemapindex will not be read. This will result in no resources being
    returned and is useful only to read the metadata and links listed in
    the sitemapindex; self.sitemapindex is set to True in that case.

    Includes the subtlety that if the input URI is a local file and is a
    sitemapindex which contains URIs for the individual sitemaps, then these
    are mapped to the filesystem also.

    Arguments:
      uri -- URI of the sitemap or sitemapindex to read.
      resources -- accepted for interface compatibility; not used here.
      index_only -- do not read the component sitemaps of a sitemapindex.

    Raises IOError if uri cannot be opened and ListBaseIndexError if a
    sitemapindex is read while self.allow_multifile is false.
    """
    try:
        fh = URLopener().open(uri)
    except IOError as e:
        raise IOError("Failed to load sitemap/sitemapindex from %s (%s)" % (uri, str(e)))
    self.num_files += 1
    try:
        # Get the Content-Length if we can (works fine for local files).
        # Missing (KeyError, or None -> TypeError) and malformed
        # (ValueError) headers are all treated as "length unknown".
        try:
            length = int(fh.info()['Content-Length'])
        except (KeyError, TypeError, ValueError):
            # If we don't get a length then c'est la vie
            self.logger.debug("Read ????? bytes from %s" % (uri))
        else:
            self.content_length = length
            self.bytes_read += length
            self.logger.debug("Read %d bytes from %s" % (length, uri))
        self.logger.info("Read sitemap/sitemapindex from %s" % (uri))
        s = self.new_sitemap()
        s.parse_xml(fh=fh, resources=self, capability=self.capability_name)
    finally:
        # The parse result is inspected immediately below, so the handle
        # is no longer needed once parse_xml returns (or fails).
        fh.close()
    # what did we read? sitemap or sitemapindex?
    if not s.parsed_index:
        # sitemap
        self.logger.info("Parsed as sitemap, %d resources" % (len(self.resources)))
        return
    # sitemapindex
    if not self.allow_multifile:
        raise ListBaseIndexError("Got sitemapindex from %s but support for sitemapindex disabled" % (uri))
    self.logger.info("Parsed as sitemapindex, %d sitemaps" % (len(self.resources)))
    sitemapindex_is_file = self.is_file_uri(uri)
    if index_only:
        # don't read the component sitemaps
        self.sitemapindex = True
        return
    # now loop over all entries to read each sitemap and add to resources;
    # what was parsed so far is the list of sitemaps, so set it aside and
    # start a fresh container for the actual resources
    sitemaps = self.resources
    self.resources = self.resources_class()
    sitemap_uris = sorted(sitemaps.uris())
    self.logger.info("Now reading %d sitemaps" % len(sitemap_uris))
    for sitemap_uri in sitemap_uris:
        self.read_component_sitemap(uri, sitemap_uri, s, sitemapindex_is_file)
示例3: read
# 需要导入模块: from urllib import URLopener [as 别名]
# 或者: from urllib.URLopener import info [as 别名]
def read(self, uri=None, resources=None):
    """Read sitemap from a URI including handling sitemapindexes

    Returns the inventory or changeset. If resources is not specified then
    a new object is created: an inventory by default, or a changeset when
    the root element carries rs:type="changeset". Pass in a ChangeSet
    object to read a changeset into an existing object.

    Includes the subtlety that if the input URI is a local file and it is
    a sitemapindex, then component sitemap URIs that are not file URIs are
    mapped to local files with self.mapper.src_to_dst(). For a non-file
    sitemapindex each component URI must instead pass the UrlAuthority
    check.

    Arguments:
      uri -- URI of the sitemap or sitemapindex to read.
      resources -- object to add the resources to, or None to create one.

    Side effects: sets self.sitemaps_created and self.changeset_read, and
    updates self.content_length / self.bytes_read whenever a usable
    Content-Length is available.

    Raises Exception if a document cannot be opened, if a sitemapindex is
    read while self.allow_multifile is false, or if the authority check
    fails; raises ValueError if the root element is neither urlset nor
    sitemapindex.
    """
    def record_length(handle):
        # Update the byte counters from Content-Length if we can (works
        # fine for local files). Returns the length, or None when the
        # header is missing (KeyError, or None -> TypeError) or malformed
        # (ValueError) -- if we don't get a length then c'est la vie.
        try:
            length = int(handle.info()['Content-Length'])
        except (KeyError, TypeError, ValueError):
            return None
        self.content_length = length
        self.bytes_read += length
        return length

    try:
        fh = URLopener().open(uri)
    except IOError as e:
        raise Exception("Failed to load sitemap/sitemapindex from %s (%s)" % (uri, str(e)))
    try:
        record_length(fh)
        self.logger.info("Read sitemap/sitemapindex from %s" % (uri))
        etree = parse(fh)
    finally:
        # Everything below works on the parsed tree, not on the handle
        fh.close()
    # check root element: urlset (for sitemap), sitemapindex or bad
    self.sitemaps_created = 0
    root = etree.getroot()
    # assume inventory but look to see whether this is a changeset
    # as indicated with rs:type="changeset" on the root
    resources_class = self.inventory_class
    sitemap_xml_parser = self.inventory_parse_xml
    self.changeset_read = False
    root_type = root.attrib.get('{' + RS_NS + '}type', None)
    if root_type is not None:
        if root_type == 'changeset':
            resources_class = self.changeset_class
            sitemap_xml_parser = self.changeset_parse_xml
            self.changeset_read = True
        else:
            self.logger.info("Bad value of rs:type on root element (%s), ignoring" % (root_type))
    # now make sure we have a place to put the data we read
    if resources is None:
        resources = resources_class()
    # sitemap or sitemapindex?
    if root.tag == '{' + SITEMAP_NS + "}urlset":
        self.logger.info("Parsing as sitemap")
        sitemap_xml_parser(etree=etree, resources=resources)
        self.sitemaps_created += 1
    elif root.tag == '{' + SITEMAP_NS + "}sitemapindex":
        if not self.allow_multifile:
            raise Exception("Got sitemapindex from %s but support for sitemapindex disabled" % (uri))
        self.logger.info("Parsing as sitemapindex")
        sitemaps = self.sitemapindex_parse_xml(etree=etree)
        sitemapindex_is_file = self.is_file_uri(uri)
        # now loop over all entries to read each sitemap and add to resources
        self.logger.info("Now reading %d sitemaps" % len(sitemaps))
        for sitemap_uri in sorted(sitemaps.resources.keys()):
            if sitemapindex_is_file:
                if not self.is_file_uri(sitemap_uri):
                    # Attempt to map URI to local file
                    remote_uri = sitemap_uri
                    sitemap_uri = self.mapper.src_to_dst(remote_uri)
            else:
                # The individual sitemaps should be at a URL (scheme/server/path)
                # that the sitemapindex URL can speak authoritatively about
                if not UrlAuthority(uri).has_authority_over(sitemap_uri):
                    raise Exception("The sitemapindex (%s) refers to sitemap at a location it does not have authority over (%s)" % (uri, sitemap_uri))
            try:
                component_fh = URLopener().open(sitemap_uri)
            except IOError as e:
                raise Exception("Failed to load sitemap from %s listed in sitemap index %s (%s)" % (sitemap_uri, uri, str(e)))
            try:
                length = record_length(component_fh)
                # Never report self.content_length when the header was
                # unusable: it would be the previous document's length.
                if length is None:
                    size_text = "unknown length"
                else:
                    size_text = "%d" % length
                self.logger.info("Read sitemap from %s (%s)" % (sitemap_uri, size_text))
                sitemap_xml_parser(fh=component_fh, resources=resources)
            finally:
                component_fh.close()
            self.sitemaps_created += 1
    else:
        # Report the URI that was actually read; no component sitemap URI
        # exists on this path.
        raise ValueError("XML read from %s is not a sitemap or sitemapindex" % (uri))
    return resources
示例4: read
# 需要导入模块: from urllib import URLopener [as 别名]
# 或者: from urllib.URLopener import info [as 别名]
def read(self, uri=None, resources=None, changelist=None, index_only=False):
"""Read sitemap from a URI including handling sitemapindexes
Returns the resourcelist or changelist. If changelist is not specified (None)
then it is assumed that a ResourceList is to be read, unless the XML
indicates a Changelist.
If changelist is True then a Changelist is expected; if changelist is False
then a ResourceList is expected.
If index_only is True then individual sitemaps references in a sitemapindex
will not be read. This will result in no resources being returned and is
useful only to read the capabilities and metadata listed in the sitemapindex.
Will set self.read_type to a string value sitemap/sitemapindex/changelist/changelistindex
depending on the type of the file expected/read.
Includes the subtlety that if the input URI is a local file and is a
sitemapindex which contains URIs for the individual sitemaps, then these
are mapped to the filesystem also.
"""
try:
fh = URLopener().open(uri)
except IOError as e:
raise Exception("Failed to load sitemap/sitemapindex from %s (%s)" % (uri,str(e)))
# Get the Content-Length if we can (works fine for local files)
try:
self.content_length = int(fh.info()['Content-Length'])
self.bytes_read += self.content_length
self.logger.debug( "Read %d bytes from %s" % (self.content_length,uri) )
except KeyError:
# If we don't get a length then c'est la vie
self.logger.debug( "Read ????? bytes from %s" % (uri) )
pass
self.logger.info( "Read sitemap/sitemapindex from %s" % (uri) )
etree = parse(fh)
# check root element: urlset (for sitemap), sitemapindex or bad
self.sitemaps_created=0
root = etree.getroot()
# assume resourcelist but look to see whether this is a changelist
# as indicated with rs:type="changelist" on the root
resources_class = self.resourcelist_class
sitemap_xml_parser = self.resourcelist_parse_xml
self.changelist_read = False
self.read_type = 'sitemap'
root_type = root.attrib.get('{'+RS_NS+'}type',None)
if (root_type is not None):
if (root_type == 'changelist'):
self.changelist_read = True
else:
self.logger.info("Bad value of rs:type on root element (%s), ignoring" % (root_type))
elif (changelist is True):
self.changelist_read = True
if (self.changelist_read):
self.read_type = 'changelist'
resources_class = self.changelist_class
sitemap_xml_parser = self.changelist_parse_xml
# now have make sure we have a place to put the data we read
if (resources is None):
resources=resources_class()
# sitemap or sitemapindex?
if (root.tag == '{'+SITEMAP_NS+"}urlset"):
self.logger.info( "Parsing as sitemap" )
sitemap_xml_parser(etree=etree, resources=resources)
self.sitemaps_created+=1
elif (root.tag == '{'+SITEMAP_NS+"}sitemapindex"):
self.read_type += 'index'
if (not self.allow_multifile):
raise Exception("Got sitemapindex from %s but support for sitemapindex disabled" % (uri))
self.logger.info( "Parsing as sitemapindex" )
sitemaps=self.sitemapindex_parse_xml(etree=etree)
sitemapindex_is_file = self.is_file_uri(uri)
if (index_only):
return(resources)
# now loop over all entries to read each sitemap and add to resources
self.logger.info( "Now reading %d sitemaps" % len(sitemaps) )
for sitemap_uri in sorted(sitemaps.resources.keys()):
if (sitemapindex_is_file):
if (not self.is_file_uri(sitemap_uri)):
# Attempt to map URI to local file
remote_uri = sitemap_uri
sitemap_uri = self.mapper.src_to_dst(remote_uri)
else:
# The individual sitemaps should be at a URL (scheme/server/path)
# that the sitemapindex URL can speak authoritatively about
if (not UrlAuthority(uri).has_authority_over(sitemap_uri)):
raise Exception("The sitemapindex (%s) refers to sitemap at a location it does not have authority over (%s)" % (uri,sitemap_uri))
try:
fh = URLopener().open(sitemap_uri)
except IOError as e:
raise Exception("Failed to load sitemap from %s listed in sitemap index %s (%s)" % (sitemap_uri,uri,str(e)))
# Get the Content-Length if we can (works fine for local files)
try:
self.content_length = int(fh.info()['Content-Length'])
self.bytes_read += self.content_length
except KeyError:
# If we don't get a length then c'est la vie
pass
self.logger.info( "Read sitemap from %s (%d)" % (sitemap_uri,self.content_length) )
sitemap_xml_parser( fh=fh, resources=resources )
#.........这里部分代码省略.........