This article collects typical usage examples of the Python class sigil_bs4.BeautifulSoup. If you are wondering what BeautifulSoup is for or how to use it in practice, the curated class examples below may help.
Eight code examples of the BeautifulSoup class are shown, sorted by popularity by default.
Example 1: performPageMapUpdates
def performPageMapUpdates(data, currentdir, keylist, valuelist):
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    # rebuild serialized lookup dictionary of xml_updates properly adjusted
    updates = {}
    for i in range(0, len(keylist)):
        updates[keylist[i]] = "../" + valuelist[i]
    xml_empty_tags = ["page"]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    for tag in soup.find_all(["page"]):
        for att in ["href"]:
            if att in tag.attrs:
                ref = tag[att]
                if ref.find(":") == -1:
                    parts = ref.split('#')
                    url = parts[0]
                    fragment = ""
                    if len(parts) > 1:
                        fragment = parts[1]
                    bookrelpath = os.path.join(currentdir, unquoteurl(url))
                    bookrelpath = os.path.normpath(bookrelpath)
                    bookrelpath = bookrelpath.replace(os.sep, "/")
                    if bookrelpath in updates:
                        attribute_value = updates[bookrelpath]
                        if fragment != "":
                            attribute_value = attribute_value + "#" + fragment
                        attribute_value = quoteurl(attribute_value)
                        tag[att] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars=" ")
    return newdata
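A hedged usage sketch: keylist holds the old book-relative paths and valuelist the new ones, as parallel lists. The markup, currentdir, and both lists below are hypothetical, and the sketch assumes the surrounding Sigil module's helpers (_remove_xml_header, quoteurl, unquoteurl, LXMLTreeBuilderForXML) are in scope.

# Hypothetical inputs: chapter01.xhtml was renamed to part01.xhtml.
pagemap = ('<page-map xmlns="http://www.idpf.org/2007/opf">'
           '<page name="1" href="chapter01.xhtml"/></page-map>')
result = performPageMapUpdates(pagemap, "Text",
                               ["Text/chapter01.xhtml"],
                               ["Text/part01.xhtml"])
# The href is rewritten to "../Text/part01.xhtml".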
Example 2: performNCXSourceUpdates
def performNCXSourceUpdates(data, currentdir, keylist, valuelist):
    # rebuild serialized lookup dictionary
    updates = {}
    for i in range(0, len(keylist)):
        updates[keylist[i]] = valuelist[i]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, builder=xmlbuilder)
    for tag in soup.find_all("content"):
        if "src" in tag.attrs:
            src = tag["src"]
            if src.find(":") == -1:
                parts = src.split('#')
                url = parts[0]
                fragment = ""
                if len(parts) > 1:
                    fragment = parts[1]
                bookrelpath = os.path.join(currentdir, unquoteurl(url))
                bookrelpath = os.path.normpath(bookrelpath)
                bookrelpath = bookrelpath.replace(os.sep, "/")
                if bookrelpath in updates:
                    attribute_value = updates[bookrelpath]
                    if fragment != "":
                        attribute_value = attribute_value + "#" + fragment
                    attribute_value = quoteurl(attribute_value)
                    tag["src"] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars=" ")
    return newdata
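A similar sketch for performNCXSourceUpdates; here values in valuelist are used verbatim (no "../" prefix) and URL fragments survive the rewrite. All inputs are hypothetical:

ncx = ('<ncx><navMap><navPoint><content src="Text/chapter01.xhtml#sec1"/>'
       '</navPoint></navMap></ncx>')
result = performNCXSourceUpdates(ncx, ".", ["Text/chapter01.xhtml"],
                                 ["Text/part01.xhtml"])
# src is rewritten to "Text/part01.xhtml#sec1", keeping the #sec1 fragment.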
Example 3: diagnose
def diagnose(data):
    """Diagnostic suite for isolating common problems."""
    print("Diagnostic running on Beautiful Soup %s" % __version__)
    print("Python version %s" % sys.version)
    basic_parsers = ["html.parser", "html5lib", "lxml"]
    # Iterate over a copy so that removing a missing parser does not
    # skip the entry that follows it in the list.
    for name in list(basic_parsers):
        for builder in builder_registry.builders:
            if name in builder.features:
                break
        else:
            basic_parsers.remove(name)
            print("I noticed that %s is not installed. Installing it may help." % name)
    if 'lxml' in basic_parsers:
        basic_parsers.append(["lxml", "xml"])
        try:
            from lxml import etree
            print("Found lxml version %s" % ".".join(map(str, etree.LXML_VERSION)))
        except ImportError:
            print("lxml is not installed or couldn't be imported.")
    if 'html5lib' in basic_parsers:
        try:
            import html5lib
            print("Found html5lib version %s" % html5lib.__version__)
        except ImportError:
            print("html5lib is not installed or couldn't be imported.")
    if hasattr(data, 'read'):
        data = data.read()
    elif os.path.exists(data):
        print('"%s" looks like a filename. Reading data from the file.' % data)
        data = open(data).read()
    elif data.startswith("http:") or data.startswith("https:"):
        print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
        print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
        return
    print()
    for parser in basic_parsers:
        print("Trying to parse your markup with %s" % parser)
        success = False
        try:
            soup = BeautifulSoup(data, parser)
            success = True
        except Exception:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("Here's what %s did with the markup:" % parser)
            print(soup.prettify())
        print("-" * 80)
Example 4: repairXML
def repairXML(data, self_closing_tags=ebook_xml_empty_tags, indent_chars=" "):
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=self_closing_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars=indent_chars)
    return newdata
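A sketch of how this variant might be called; the exact repairs depend on lxml's recovery mode, and the input below is hypothetical:

broken = '<?xml version="1.0" encoding="utf-8"?>\n<spine><itemref idref="ch1"></spine>'
fixed = repairXML(broken, self_closing_tags=["itemref"])
# The unclosed <itemref> is recovered and emitted as a self-closing tag,
# and the document is re-serialized with the requested indentation.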
Example 5: repairXML
def repairXML(data, mtype="", indent_chars=" "):
    data = _remove_xml_header(data)
    data = _make_it_sane(data)
    voidtags = get_void_tags(mtype)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=voidtags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars=indent_chars)
    return newdata
Example 6: repairXML
def repairXML(data, mtype="", indent_chars=" "):
    newdata = _remove_xml_header(data)
    # if well-formed - don't mess with it
    if _well_formed(newdata):
        return data
    newdata = _make_it_sane(newdata)
    if not _well_formed(newdata):
        newdata = _reformat(newdata)
        if mtype == "application/oebps-package+xml":
            newdata = newdata.decode('utf-8')
            newdata = Opf_Parser(newdata).rebuild_opfxml()
    # lxml requires utf-8 on Mac, won't work with unicode
    if isinstance(newdata, str):
        newdata = newdata.encode('utf-8')
    voidtags = get_void_tags(mtype)
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=voidtags)
    soup = BeautifulSoup(newdata, features=None, from_encoding="utf-8", builder=xmlbuilder)
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars=indent_chars)
    return newdata
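The _well_formed, _make_it_sane, and _reformat helpers come from the surrounding Sigil module and are not shown here. As a rough illustration only, a stand-in well-formedness check (an assumption, not Sigil's actual implementation) could be:

from lxml import etree

def _well_formed(data):
    # Stand-in check: parse strictly (no recovery); any syntax error
    # means the document is not well-formed.
    try:
        etree.fromstring(data.encode('utf-8') if isinstance(data, str) else data)
        return True
    except etree.XMLSyntaxError:
        return False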
Example 7: anchorNCXUpdates
def anchorNCXUpdates(data, originating_filename, keylist, valuelist):
    # rebuild serialized lookup dictionary
    id_dict = {}
    for i in range(0, len(keylist)):
        id_dict[keylist[i]] = valuelist[i]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, builder=xmlbuilder)
    original_filename_with_relative_path = TEXT_FOLDER_NAME + "/" + originating_filename
    for tag in soup.find_all("content"):
        if "src" in tag.attrs:
            src = tag["src"]
            if src.find(":") == -1:
                parts = src.split('#')
                if (parts is not None) and (len(parts) > 1) and (parts[0] == original_filename_with_relative_path) and (parts[1] != ""):
                    fragment_id = parts[1]
                    if fragment_id in id_dict:
                        attribute_value = TEXT_FOLDER_NAME + "/" + quoteurl(id_dict[fragment_id]) + "#" + fragment_id
                        tag["src"] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars=" ")
    return newdata
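anchorNCXUpdates rewrites only fragments that point into the originating file: keylist holds fragment ids and valuelist the files those ids moved to. A hypothetical call, assuming TEXT_FOLDER_NAME is "Text":

ncx = ('<ncx><navMap><navPoint><content src="Text/chapter01.xhtml#sec1"/>'
       '</navPoint></navMap></ncx>')
result = anchorNCXUpdates(ncx, "chapter01.xhtml", ["sec1"], ["part01.xhtml"])
# src becomes "Text/part01.xhtml#sec1" because the id "sec1" now lives
# in part01.xhtml.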
Example 8: repairXML
def repairXML(data, self_closing_tags=ebook_xml_empty_tags, indent_chars=" "):
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=self_closing_tags)
    soup = BeautifulSoup(data, features=None, builder=xmlbuilder)
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars=indent_chars)
    return newdata