本文整理汇总了Python中lxml.etree.ElementTree.xpath方法的典型用法代码示例。如果您正苦于以下问题:Python ElementTree.xpath方法的具体用法是什么?Python ElementTree.xpath怎么用?有哪些Python ElementTree.xpath的使用例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类lxml.etree.ElementTree的用法示例。
在下文中一共展示了ElementTree.xpath方法的11个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: scrape_pre_58_legislators
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import xpath [as 别名]
def scrape_pre_58_legislators(self, chamber, term, suffix):
    """Scrape and save the legislators listed on a pre-58th-term roster page.

    Downloads ``legname.asp`` for ``term``/``suffix``, locates the table
    whose ``name`` attribute identifies the roster for ``chamber`` and, for
    every non-empty row after the ``Name (Party)`` header row, builds a
    ``Legislator`` and passes it to ``self.save_legislator``.

    :param chamber: ``'upper'`` selects the Senate roster; any other value
        selects the House roster.
    :param term: term identifier; only ``'56'`` and ``'57'`` are supported.
    :param suffix: text appended to ``term`` when building the roster URL.
    :raises ValueError: if ``term`` has no known roster table (previously
        this surfaced as an unbound-variable error on the first table).
    """
    # term -> (Senate table name, House table name).  The strings are kept
    # verbatim, including the "Legislatore" spelling -- presumably it mirrors
    # the markup of the page; verify before "correcting" it.
    roster_tables = {
        '57': ('57th Legislatore Roster Senate (2001-2002)',
               '57th Legislator Roster (House)(2001-2002)'),
        '56': ('Members of the Senate', 'Members of the House'),
    }
    if term not in roster_tables:
        raise ValueError('no roster table known for term %r' % (term,))
    senate_table, house_table = roster_tables[term]
    table_name = senate_table if chamber == 'upper' else house_table

    url = 'http://leg.mt.gov/css/Sessions/%s%s/legname.asp' % (term, suffix)
    legislator_page = ElementTree(lxml.html.fromstring(self.urlopen(url)))

    for table in legislator_page.xpath("//table"):
        if table.get('name') != table_name:
            continue
        parse_names = False
        for row in table:
            if row.tag != 'tr':
                continue
            celldata = row[0].text_content().strip()
            if parse_names and celldata:
                # First cell looks like "Last, First (R)".
                name, party_text = celldata.rsplit(' (', 1)
                party_letter = party_text[0]
                name_parts = [part.strip() for part in name.split(',')]
                assert len(name_parts) < 4
                if len(name_parts) == 3:
                    # "Last, Suffix, First": the first two parts form the
                    # last name.
                    last_name = ' '.join(name_parts[0:2])
                    first_name = name_parts[2]
                elif len(name_parts) == 2:
                    last_name, first_name = name_parts
                else:
                    # Name without a comma: treat it as a bare last name
                    # rather than reusing the previous row's values.
                    last_name, first_name = name_parts[0], ''
                full_name = ' '.join(
                    part for part in (first_name, last_name) if part)

                district = row[2].text_content().strip()

                if party_letter == 'R':
                    party = 'Republican'
                elif party_letter == 'D':
                    party = 'Democrat'
                else:
                    party = party_letter

                legislator = Legislator(term, chamber, district, full_name,
                                        first_name, last_name, '', party)
                legislator.add_source(url)
                self.save_legislator(legislator)

            if celldata == "Name (Party)":
                # The table headers seem to vary in size, but the last row
                # always seems to start with 'Name (Party)' -- once we find
                # that, start parsing legislator names
                parse_names = True
示例2: parse_bill
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import xpath [as 别名]
def parse_bill(self, bill_url, session, chamber):
    """Build a bill from its HTML page and the status page it links to.

    The bill page is searched for a "status" link; the linked page is parsed
    with ``self.parse_bill_status_page``.  When no such link yields a bill,
    a status URL is built from ``self.search_url_template`` instead.  The
    HTML page and, when present, the PDF link are added as versions and the
    status URL is added as a source.

    :param bill_url: URL of the bill's HTML text.
    :param session: key into ``self.metadata["session_details"]``.
    :param chamber: passed through to ``parse_bill_status_page``.
    :returns: the bill, or ``None`` when the page is skipped, cannot be
        parsed, or no bill could be built from a status page.
    """
    # Temporarily skip the differently-formatted house budget bill.
    if "/2011/billhtml/hb0002.htm" in bill_url.lower():
        return None

    try:
        doc = lxml.html.fromstring(self.urlopen(bill_url))
    except XMLSyntaxError as e:
        self.logger.warning("Got %r while parsing %r" % (e, bill_url))
        return None
    bill_page = ElementTree(doc)

    bill = None
    status_link_prefixes = ("status of", "Detailed Information (status)")
    # ".//a" is the explicit spelling of what lxml already does for an
    # absolute "//a" passed to ElementTree.findall (which emits a
    # FutureWarning).
    for anchor in bill_page.findall(".//a"):
        if anchor.text_content().startswith(status_link_prefixes):
            status_url = anchor.attrib["href"].replace("\r", "").replace("\n", "")
            bill = self.parse_bill_status_page(status_url, bill_url, session, chamber)

    if bill is None:
        # No bill was found.  Maybe something like HB0790 in the 2005 session?
        # We can search for the bill metadata.
        page_name = bill_url.split("/")[-1].split(".")[0]
        bill_type = page_name[0:2]
        bill_number = page_name[2:]
        laws_year = self.metadata["session_details"][session]["years"][0] % 100
        status_url = self.search_url_template % (laws_year, bill_type, bill_number)
        bill = self.parse_bill_status_page(status_url, bill_url, session, chamber)

    if bill is None:
        # Nothing to attach versions to; report it instead of failing on
        # the subscript below.
        self.logger.warning("No bill could be parsed for %r" % (bill_url,))
        return None

    # Get versions on the detail page.
    version_actions = [
        action["action"] for action in bill["actions"]
        if "Version Available" in action["action"]
    ]
    if not version_actions:
        version_name = "Introduced"
    else:
        latest = version_actions[-1]
        if "New Version" in latest:
            version_name = "Amended"
        elif "Enrolled" in latest:
            version_name = "Enrolled"
        else:
            # Unrecognised wording: name the version after the action text
            # rather than leaving version_name unbound.
            version_name = latest

    self.add_other_versions(bill)

    # Add html.
    bill.add_version(version_name, bill_url, mimetype="text/html")

    # Add pdf.  Use the first matching link in document order, and only if
    # the page has one.
    pdf_urls = bill_page.xpath('//a/@href[contains(., "BillPdf")]')
    if pdf_urls:
        bill.add_version(version_name, pdf_urls[0], mimetype="application/pdf")
    else:
        self.logger.warning("No PDF version link found on %r" % (bill_url,))

    # Add status url as a source.
    bill.add_source(status_url)
    return bill
示例3: parse_bill_status_page
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import xpath [as 别名]
def parse_bill_status_page(self, status_url, bill_url, session, chamber):
    """Parse a bill status page into a ``Bill`` with sponsors and actions.

    The bill id and title are each read from a primary XPath location, with
    a second location tried when the first matches nothing.

    :param status_url: URL of the status page to download and parse.
    :param bill_url: URL recorded as the bill's source.
    :param session: passed to the ``Bill`` constructor.
    :param chamber: passed to the ``Bill`` constructor.
    :raises IndexError: if neither location matches for the id or the title.
    """
    status_page = ElementTree(lxml.html.fromstring(self.urlopen(status_url)))

    def cell_text(primary_path, fallback_path):
        # see 2007 HB 2... weird.
        matches = status_page.xpath(primary_path)
        if not matches:
            matches = status_page.xpath(fallback_path)
        return matches[0].text_content()

    bill_id = cell_text("/div/form[1]/table[2]/tr[2]/td[2]",
                        "/html/html[2]/tr[1]/td[2]")
    title = cell_text("/div/form[1]/table[2]/tr[3]/td[2]",
                      "/html/html[3]/tr[1]/td[2]")

    bill = Bill(session, chamber, bill_id, title)
    bill.add_source(bill_url)
    self.add_sponsors(bill, status_page)
    self.add_actions(bill, status_page)
    return bill
示例4: parse_bill_status_page
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import xpath [as 别名]
def parse_bill_status_page(self, status_url, bill_url, term, session, chamber):
    """Parse a bill status page, handling standard and special-session layouts.

    The bill id is looked up at two known XPath locations.  When found, the
    page is handed to ``self.parse_standard_bill_status_page``.  Otherwise
    the table returned by ``self.get_bill_table`` supplies the id and the
    page goes to ``self.parse_special_session_bill_status_page``.

    :param status_url: URL of the status page to download and parse.
    :param bill_url: URL of the bill text; recorded as a source.
    :param term: accepted for interface compatibility; not used here.
    :param session: passed through to the layout-specific parser.
    :param chamber: passed through to the layout-specific parser.
    :returns: the parsed bill, or ``None`` (after logging an error) when no
        layout could be parsed.
    """
    sources = [bill_url, status_url]
    status_page = ElementTree(lxml.html.fromstring(self.urlopen(status_url)))

    # see 2007 HB 2... weird.
    bill_id = None
    for id_path in ("/div/form[1]/table[2]/tr[2]/td[2]",
                    "/html/html[2]/tr[1]/td[2]"):
        cells = status_page.xpath(id_path)
        if cells:
            bill_id = cells[0].text_content()
            break

    bill = None
    if bill_id is not None:
        bill = self.parse_standard_bill_status_page(bill_id,
                                                    status_page,
                                                    session,
                                                    chamber,
                                                    sources)
    else:
        # Best effort: an IndexError anywhere in the special-session path
        # means the expected cells are missing, so the bill stays unparsed.
        try:
            bill_table = self.get_bill_table(status_page)
            # get_bill_table yields None when no table matches.
            if bill_table is not None:
                bill_id = bill_table.xpath('//tr[2]/td[2]')[0].text_content()
                bill = self.parse_special_session_bill_status_page(bill_id,
                                                                   status_page,
                                                                   bill_table,
                                                                   session,
                                                                   chamber,
                                                                   sources)
        except IndexError:
            pass

    if bill is None:
        self.logger.error("No bill parsed for %s" % bill_url)
    return bill
示例5: parse_bill
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import xpath [as 别名]
def parse_bill(self, bill_url, session, chamber):
    """Build a bill from its HTML page and the status page it links to.

    The bill page is searched for a "status" link; the linked page is parsed
    with ``self.parse_bill_status_page``.  When no such link yields a bill,
    a status URL is built from ``self.search_url_template`` instead.  The
    HTML page and, when present, the PDF link are added as versions and the
    status URL is added as a source.

    :param bill_url: URL of the bill's HTML text.
    :param session: key into ``self.metadata['session_details']``.
    :param chamber: passed through to ``parse_bill_status_page``.
    :returns: the bill, or ``None`` when the page is skipped or no bill
        could be built from a status page.
    """
    # Temporarily skip the differently-formatted house budget bill.
    if '/2011/billhtml/hb0002.htm' in bill_url.lower():
        return None

    bill_page = ElementTree(lxml.html.fromstring(self.urlopen(bill_url)))

    bill = None
    status_link_prefixes = ('status of', 'Detailed Information (status)')
    # './/a' is the explicit spelling of what lxml already does for an
    # absolute '//a' passed to ElementTree.findall (which emits a
    # FutureWarning).
    for anchor in bill_page.findall('.//a'):
        if anchor.text_content().startswith(status_link_prefixes):
            status_url = anchor.attrib['href'].replace("\r", "").replace("\n", "")
            bill = self.parse_bill_status_page(status_url, bill_url, session, chamber)

    if bill is None:
        # No bill was found.  Maybe something like HB0790 in the 2005 session?
        # We can search for the bill metadata.
        page_name = bill_url.split("/")[-1].split(".")[0]
        bill_type = page_name[0:2]
        bill_number = page_name[2:]
        laws_year = self.metadata['session_details'][session]['years'][0] % 100
        status_url = self.search_url_template % (laws_year, bill_type, bill_number)
        bill = self.parse_bill_status_page(status_url, bill_url, session, chamber)

    if bill is None:
        # Nothing to attach versions to; return instead of failing on the
        # subscript below.
        return None

    # Get versions on the detail page.
    version_actions = [
        action['action'] for action in bill['actions']
        if 'Version Available' in action['action']
    ]
    if not version_actions:
        version_name = 'Introduced'
    else:
        latest = version_actions[-1]
        if 'New Version' in latest:
            version_name = 'Amended'
        elif 'Enrolled' in latest:
            version_name = 'Enrolled'
        else:
            # Unrecognised wording: name the version after the action text
            # rather than leaving version_name unbound.
            version_name = latest

    self.add_other_versions(bill)

    # Add html.
    bill.add_version(version_name, bill_url, mimetype='text/html')

    # Add pdf.  Use the first matching link in document order, and only if
    # the page has one.
    pdf_urls = bill_page.xpath('//a/@href[contains(., "BillPdf")]')
    if pdf_urls:
        bill.add_version(version_name, pdf_urls[0], mimetype='application/pdf')

    # Add status url as a source.
    bill.add_source(status_url)
    return bill
示例6: __init__
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import xpath [as 别名]
class CDRLog:
    """A parsed CDR log that can be rendered as XML and queried with XPath.

    Public attributes:
      str -- XML text built by the last ``asXML()`` call ('' before that)
      ast -- result of ``CDR.parseString`` / ``CDR.parseFile`` (None before
             parsing)
      xml -- ElementTree built from ``str`` by ``asXML()`` (None before that)
    """

    def __init__(self):
        self.str = ''
        self.ast = None
        self.xml = None

    def parseString(self, string):
        """Parse CDR log text with the ``CDR`` grammar and keep the result."""
        self.ast = CDR.parseString(string)

    def parseFile(self, file):
        """Parse a CDR log file with the ``CDR`` grammar and keep the result."""
        self.ast = CDR.parseFile(file)

    def printList(self):
        '''for debug purpose only'''
        import pprint
        pprint.pprint(self.ast.asList())

    def dump(self):
        '''for debug purpose only'''
        # Function-call form works as a statement on Python 2 and 3 alike.
        print(self.ast.dump())

    def asXML(self):
        '''generate the XML string by the AST (list/list)
        then create the XML ElementTree representation
        '''
        records = self.ast.asList()
        head, cdrs = records[0], records[1:]
        chunks = ['<CDR>\n', self.__asHeadXML(head)]
        chunks.extend(self.__asRecordXML(cdr) for cdr in cdrs)
        chunks.append('</CDR>')
        # Join once instead of growing self.str piece by piece.
        self.str = ''.join(chunks)
        self.xml = ElementTree(fromstring(self.str))

    def __asHeadXML(self, head):
        ''' return the XML text for the head part of CDR log
        '''
        body = ''.join(self.__asItemXML(item, ' ') for item in head)
        return ' <head>\n' + body + ' </head>\n'

    def __asRecordXML(self, cdr):
        ''' return the XML text for one record part of CDR log

        ``cdr`` is a pair: the record number (a string) and its items.
        '''
        from xml.sax.saxutils import escape
        num, items = cdr[0], cdr[1]
        # Escape the quote too, because the number is also used inside a
        # double-quoted attribute value.
        safe_num = escape(num, {'"': '&quot;'})
        body = ''.join(self.__asItemXML(item, ' ') for item in items)
        return (' <Record n="' + safe_num + '">\n'
                + ' <n>' + safe_num + '</n>\n'
                + body
                + ' </Record>\n')

    def __asItemXML(self, item, indent=' '):
        ''' return the XML text for one item (n/v pair) of CDR log

        A list value is rendered as nested items, one indent level deeper.
        '''
        from xml.sax.saxutils import escape
        name, value = item
        # some name start with 3GPP which is not a valid xml tag; only the
        # leading 3s are dropped so names that end in 3 stay intact
        name = name.lstrip('3')
        if isinstance(value, list):
            inner = ''.join(self.__asItemXML(child, indent + ' ')
                            for child in value)
            return (indent + '<' + name + '>\n'
                    + inner
                    + indent + '</' + name + '>\n')
        # Escape markup characters so a value containing & or < still
        # produces well-formed XML.
        return (indent + '<' + name + '>' + escape(value.strip())
                + '</' + name + '>\n')

    def xpath(self, path):
        ''' API to locate the node(s) in the self.xml (ElementTree) by XPATH

        self.xml is only set by asXML(), so call that first.
        '''
        return self.xml.xpath(path)

    def abstract(self):
        """Return the XML text built by the last ``asXML()`` call."""
        return self.str

    def __str__(self):
        '''string representation, should be the XML rep
        '''
        return self.str
示例7: ODTTemplate
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import xpath [as 别名]
class ODTTemplate(object):
def __init__(self, filename):
    """Open a working copy of the template and parse its ``content.xml``.

    :param filename: path of the ODT template; it is passed to
        ``create_temporary_copy`` and only the copy is opened.
    """
    # first create a temporary copy of the template
    self.tempCopy = create_temporary_copy(filename)
    self.zipfile = zipfile.ZipFile(self.tempCopy, "a")
    # extract the content and parse xml tree; the member handle is closed as
    # soon as parsing is done instead of being left open
    self.tree = ElementTree()
    with self.zipfile.open("content.xml", "r") as content:
        self.tree.parse(content)
    self.usedVariables = set()
    self.unknownVariables = set()
    self.rendered = False
def render(self, context):
    """Fill the template fields from ``context`` and write the result back.

    Every element matched by ``FIELD_XPATH`` is passed to
    ``self._field_render``.  Warnings about unknown and unused variables are
    collected in ``self.render_warnings``, the tree is serialised into the
    archive as ``content.xml`` and the archive is closed.

    :param context: mapping of variable name to replacement text.
    :raises RuntimeError: if the template has already been rendered.
    """
    from io import BytesIO

    if self.rendered:
        raise RuntimeError("You may only render the template once")

    # create matcher expressions for simple and multiline keywords
    for elem in self.tree.xpath(FIELD_XPATH, namespaces=OD_NSMAP):
        self._field_render(elem, context)

    # Names are sorted so the warning text is the same on every run.
    self.render_warnings = []
    if self.unknownVariables:
        self.render_warnings.append("The following variables occurred in the template but were not provided in the context: {}".format(",".join(sorted(self.unknownVariables))))
    unused = set(context) - self.usedVariables
    if unused:
        self.render_warnings.append("The following variables where not used: {}".format(",".join(sorted(unused))))

    # write out result to temporary archive; serialising with an explicit
    # encoding produces bytes, so a byte buffer is required
    outbuf = BytesIO()
    self.tree.write(outbuf, encoding="utf8", xml_declaration=True)
    self.zipfile.writestr("content.xml", outbuf.getvalue())
    self.zipfile.close()
    self.rendered = True
def debugSave(self, target):
    """Write the current XML tree, pretty-printed, to the file ``target``."""
    # Binary mode: serialising with an explicit encoding yields bytes.
    with open(target, "wb") as f:
        self.tree.write(f, pretty_print=True, encoding="utf8", xml_declaration=True)
def saveODT(self, target):
    """Copy the rendered working archive to ``target``.

    :raises RuntimeError: if ``render`` has not completed yet.
    """
    if self.rendered:
        shutil.copy2(self.tempCopy, target)
        return
    raise RuntimeError("You cannot save an unrendered template")
def getTemporaryODT(self):
    """Return the path of the rendered working archive.

    :raises RuntimeError: if ``render`` has not completed yet.
    """
    if self.rendered:
        return self.tempCopy
    raise RuntimeError("You cannot save an unrendered template")
def getTemporaryPDF(self):
    """Convert to PDF into a new named temporary file and return that file.

    The returned object is the open ``NamedTemporaryFile``; its ``name`` is
    the path that was handed to ``savePDF``.
    """
    pdf_handle = tempfile.NamedTemporaryFile()
    self.savePDF(pdf_handle.name)
    return pdf_handle
def savePDF(self, target):
    """Convert the working archive to PDF and copy the result to ``target``.

    ``ODT_BINARY`` is run headless with the working archive's directory as
    output directory; the PDF it leaves there is copied to ``target`` and
    then deleted.  The converter's exit status is not checked.
    """
    source_odt = self.tempCopy
    out_dir = os.path.dirname(source_odt)
    with open(os.devnull, "w") as sink:
        # Converter output is discarded.
        subprocess.call(
            [ODT_BINARY, "--headless", "--convert-to", "pdf",
             "--outdir", out_dir, source_odt],
            stdout=sink,
            stderr=sink,
        )
    converted = os.path.splitext(source_odt)[0] + ".pdf"
    shutil.copy2(converted, target)
    os.unlink(converted)
def _field_render(self, elem, context):
# retrieve the variable name of this field
name = elem.attrib["{{{}}}name".format(OD_TEXT_NS)]
# check if this is in our dictionary, if not, we leave it as it is
# but create a warning
if not name in context.keys():
self.unknownVariables.add(name)
return
# otherwise mark it as used
self.usedVariables.add(name)
replacement = context[name]
parent = elem.getparent()
idx = parent.index(elem)
# now distinguish whether its a multi-line or a single-line content
if not "\n" in replacement:
# in single-line replacements we just replace this element by its replacement text
self._replace_var_with_content(elem, parent, idx, replacement)
else:
replacementLines = replacement.split("\n")
# check if we are in a listing to use listing formatter
li = parent
isList = False
while li is not None:
if li.tag == "{{{}}}list-item".format(OD_TEXT_NS):
isList = True
break
li = li.getparent()
if isList and self._test_listitem_empty(li):
# we do list-style replacement
nodes = []
for line in replacementLines:
cli = deepcopy(li)
var = cli.xpath(FIELD_XPATH, namespaces=OD_NSMAP)
if len(var) > 1:
#.........这里部分代码省略.........
示例8: XMLRegistery
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import xpath [as 别名]
class XMLRegistery(object):
"""Represent a LifecycleManager in XML.
This is a singleton.
"""
_instance = None
_xml_root_tree = None
def __new__(cls, *args, **kwargs):
    """Return the shared instance, creating it on first use.

    Constructor arguments are accepted but not forwarded:
    ``object.__new__`` rejects extra arguments on Python 3 when ``__new__``
    is overridden.
    """
    if cls._instance is None:
        cls._instance = super(XMLRegistery, cls).__new__(cls)
    return cls._instance
def _xml_register(self, ressource, parent=None):
    """Create the XML node for ``ressource`` and register it with its children.

    Without ``parent`` the node becomes the root of a fresh tree stored in
    ``self._xml_root_tree``; otherwise it is appended under ``parent``.  The
    node's path is recorded on the ressource as ``_xpath`` and
    ``_xpath_relative``, its properties are added under a ``properties``
    child, its children are registered recursively, and finally
    ``ressource._xml_on_registration()`` is called.

    :type ressource: XMLRessource
    :type parent: lxml.Element
    """
    node_attrs = {RESSOURCE_ATTR: ressource._xml_ressource_name()}
    node_attrs.update(ressource._xml_attributes())

    if parent is not None:
        node = SubElement(parent,
                          ressource._xml_tag(),
                          attrib=node_attrs)
    else:
        node = Element(ressource._xml_tag(), attrib=node_attrs)
        self._xml_root_tree = ElementTree(node)

    ressource._xpath = self._xml_root_tree.getpath(node)
    # The relative path is everything after the second "/"; paths with
    # fewer segments are kept whole.
    segments = ressource._xpath.split("/", 2)
    if len(segments) > 2:
        ressource._xpath_relative = segments[2]
    else:
        ressource._xpath_relative = ressource._xpath

    if (ressource._xml_add_properties()
            or ressource._xml_add_properties_tuple()):
        properties_node = SubElement(node, "properties")
        for (prop_name, prop_value) in ressource._xml_add_properties_tuple():
            logger.trace("Add property '%s:%s' on node with tag %s" % (
                prop_name, prop_value, ressource._xml_tag()))
            prop_node = SubElement(properties_node, prop_name)
            prop_node.text = prop_value
        for prop_elt in ressource._xml_add_properties():
            logger.trace("Add property '%s' on node with tag %s" % (
                prop_elt.tag, ressource._xml_tag()))
            properties_node.append(prop_elt)

    self._xml_register_children(node, ressource)
    logger.trace("Registered %s in XML registery" % ressource.__repr__())
    ressource._xml_on_registration()
def _xml_register_children(self, xml_elt, ressource):
    """Be careful, this removes children before adding them.

    Every child of ``xml_elt`` except ``properties`` is removed, then each
    ressource from ``ressource._xml_children()`` is registered under it.
    """
    # Children are removed to avoid multiple adding if the
    # lifecycle is created several times.
    # Iterate over a snapshot: removing nodes while walking the live child
    # iterator depends on how that iterator looks ahead.
    for child in list(xml_elt):
        if child.tag != "properties":
            xml_elt.remove(child)
    for sub_ressource in ressource._xml_children():
        self._xml_register(sub_ressource, parent=xml_elt)
def to_string(self, xpath):
    """Serialise, pretty-printed, whatever ``self._find_one(xpath)`` returns."""
    found = self._find_one(xpath)
    return tostring(found, pretty_print=True)
def xpath(self, xpath):
    """Evaluate ``xpath`` on the registry tree and return the results as text.

    Elements, including instances of element subclasses, are serialised
    with ``tostring(..., pretty_print=True)``; every other result item goes
    through ``str``.  A non-list result (number, boolean, string) is
    returned as a one-item list.

    :rtype: [str]
    :raises XpathInvalidExpression: if the expression cannot be evaluated.
    """
    # Only the evaluation itself can raise XPathEvalError.
    try:
        result = self._xml_root_tree.xpath(xpath)
    except XPathEvalError:
        raise XpathInvalidExpression("xpath '%s' is not valid!" % xpath)

    if not isinstance(result, list):
        return [str(result)]
    return [
        tostring(item, pretty_print=True) if isinstance(item, _Element)
        else str(item)
        for item in result
    ]
def find_all_elts(self, xpath):
    """Return the tree path of every node matched by ``xpath``.

    :raises XpathInvalidExpression: if the expression cannot be evaluated.
    """
    try:
        paths = []
        for match in self._xml_root_tree.xpath(xpath):
            paths.append(self._xml_root_tree.getpath(match))
        return paths
    except XPathEvalError:
        raise XpathInvalidExpression("xpath '%s' is not valid!" % xpath)
def _find_one(self, xpath):
"""Return the ressource uri. Raise exception if multiple match
#.........这里部分代码省略.........
示例9: get_sponsor_table
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import xpath [as 别名]
def get_sponsor_table(self, status_page):
    """Return the first table that looks like the sponsor table.

    A table qualifies when it has exactly four ``th`` cells and the first
    one's text starts with ``'Sponsor,'``.  The match is returned wrapped in
    its own ``ElementTree``; ``None`` is returned when nothing qualifies.
    """
    for candidate in status_page.xpath('//table'):
        wrapped = ElementTree(candidate)
        headers = wrapped.xpath('//th')
        if len(headers) != 4:
            continue
        if headers[0].text_content().startswith('Sponsor,'):
            return wrapped
    return None
示例10: get_action_table
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import xpath [as 别名]
def get_action_table(self, status_page):
    """Return the first table that looks like the action table.

    A table qualifies when it has exactly five ``th`` cells and the first
    one's text starts with ``'Action'``.  The match is returned wrapped in
    its own ``ElementTree``; ``None`` is returned when nothing qualifies.
    """
    for candidate in status_page.xpath('//table'):
        wrapped = ElementTree(candidate)
        headers = wrapped.xpath('//th')
        if len(headers) != 5:
            continue
        if headers[0].text_content().startswith('Action'):
            return wrapped
    return None
示例11: get_bill_table
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import xpath [as 别名]
def get_bill_table(self, status_page):
    """Return the first table that looks like the bill summary table.

    A table qualifies when it has exactly four ``tr`` rows and the text of
    its first ``//tr[1]/td[1]`` cell, stripped, starts with
    ``'Bill Draft Number:'``.  The match is returned wrapped in its own
    ``ElementTree``; ``None`` is returned when nothing qualifies.
    """
    for candidate in status_page.xpath('//table'):
        table = ElementTree(candidate)
        if len(table.xpath('//tr')) != 4:
            continue
        first_cells = table.xpath('//tr[1]/td[1]')
        if not first_cells:
            # A four-row table with no td in a first row is not the bill
            # table; keep looking instead of raising IndexError.
            continue
        if first_cells[0].text_content().strip().startswith('Bill Draft Number:'):
            return table
    return None