本文整理汇总了Python中lxml.etree.ElementTree.findall方法的典型用法代码示例。如果您正苦于以下问题:Python ElementTree.findall方法的具体用法?Python ElementTree.findall怎么用?Python ElementTree.findall使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类lxml.etree.ElementTree
的用法示例。
在下文中一共展示了ElementTree.findall方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: parse
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import findall [as 别名]
def parse(self, data):
    """Parse a webdav reply, building one object per <response> element.

    :param data: The webdav reply to parse
    :type data: String
    :return: self.response_objects
    """
    doc = ElementTree(HTML(data))
    for response_elt in doc.findall("//response"):
        resp = Response()
        resp_doc = ElementTree(response_elt)
        resp.href = resp_doc.find('//href').text
        # A <collection> element marks a collection; otherwise a plain resource.
        if resp_doc.find('//collection') is not None:
            resp.resourcetype = 'collection'
        else:
            resp.resourcetype = 'resource'
        # Simple optional properties: a missing element yields None via getattr.
        for prop in ('executable', 'creationdate', 'getcontentlength',
                     'getlastmodified', 'getetag', 'getcontenttype', 'status'):
            setattr(resp, prop, getattr(resp_doc.find('//' + prop), 'text', None))
        # Now we have the properties that are easy to get,
        # lets get the lock information
        for entry in resp_doc.findall('//lockentry'):
            entry_doc = ElementTree(entry)
            lock_obj = Lock()
            lock_obj.locktype = entry_doc.find(
                '//locktype').getchildren()[-1].tag
            lock_obj.lockscope = entry_doc.find(
                '//lockscope').getchildren()[-1].tag
            resp.locks.append(lock_obj)
        self.response_objects.append(resp)
    return self.response_objects
示例2: getdescendants
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import findall [as 别名]
def getdescendants(request, code):
    """Return the DeCS descendants of *code*, in every language, as JSON."""
    params = {}
    results = {}
    language = request.LANGUAGE_CODE.lower()
    if language == 'pt-br':
        language = 'pt'
    for lang in DECS_LANGS:
        params[lang] = urllib.urlencode({
            'tree_id': code or '',
            'lang': lang,
        })
        tree = ElementTree()
        tree.parse(urllib.urlopen(settings.DECS_SERVICE, params[lang]))
        query = 'decsws_response/tree/descendants/term_list[@lang="%s"]/term' % lang
        # Accumulate one '"lang":"Description"' pair per language, keyed on tree_id.
        for term in tree.findall(query):
            pair = '"%s":"%s"' % (lang, term.text.capitalize())
            tree_id = term.attrib['tree_id']
            if tree_id in results:
                results[tree_id] += ',' + pair
            else:
                results[tree_id] = pair
    json = '[%s]' % ','.join(JSON_MULTILINGUAL_TERM % (desc, tree_id)
                             for tree_id, desc in results.items())
    json_response = json_loads(json)
    # Sort entries by their description in the requester's language.
    json_response.sort(key=lambda entry: entry['fields']['description'][language])
    return HttpResponse(json_dumps(json_response), mimetype='application/json')
示例3: scrape
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import findall [as 别名]
def scrape(self, chamber, session):
    """Collect every bill URL for *session*/*chamber* and save the parsed bills."""
    # Resolve the session to its term's start year.
    for term in self.metadata['terms']:
        if session in term['sessions']:
            year = term['start_year']
            break
    self.versions_dict = self._versions_dict(year)
    base_bill_url = 'http://leg.mt.gov/bills/%d/BillHtml/' % year
    index_page = ElementTree(lxml.html.fromstring(self.get(base_bill_url).text))
    bill_urls = []
    for anchor in index_page.findall('//a'):
        text = anchor.text
        # See 2009 HB 645
        if text.find("govlineveto") != -1:
            continue
        # House bills start with H, Senate bills start with S
        if ((chamber == 'lower' and text.startswith('H')) or
                (chamber == 'upper' and text.startswith('S'))):
            bill_urls.append("%s%s" % (base_bill_url, text))
    for bill_url in bill_urls:
        bill = self.parse_bill(bill_url, session, chamber)
        if bill:
            self.save_bill(bill)
示例4: scrape
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import findall [as 别名]
def scrape(self, chamber, year):
    """Collect and save all bills for *year* and *chamber*."""
    year = int(year)
    session = self.getSession(year)
    # 2 year terms starting on odd year, so if even number, use the previous odd year
    if year < 1999:
        raise NoDataForPeriod(year)
    if year % 2 == 0:
        year -= 1
    # The 1999 session is published at an un-versioned URL.
    if year == 1999:
        base_bill_url = "http://data.opi.mt.gov/bills/BillHtml/"
    else:
        base_bill_url = "http://data.opi.mt.gov/bills/%d/BillHtml/" % year
    index_page = ElementTree(lxml.html.fromstring(self.urlopen(base_bill_url)))
    bill_urls = []
    for anchor in index_page.findall("//a"):
        text = anchor.text
        # See 2009 HB 645
        if text.find("govlineveto") != -1:
            continue
        # House bills start with H, Senate bills start with S
        if ((chamber == "lower" and text.startswith("H")) or
                (chamber == "upper" and text.startswith("S"))):
            bill_urls.append("%s%s" % (base_bill_url, text))
    for bill_url in bill_urls:
        bill = self.parse_bill(bill_url, session, chamber)
        self.save_bill(bill)
示例5: print_predictions
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import findall [as 别名]
def print_predictions(agency, stops, label=""):
    """Fetch NextBus predictions for *stops* and print them as HTML fragments.

    Python 2 code: relies on ``print`` statements and on ``filter``
    returning a list (so ``.sort`` can be called on the result).
    """
    title_index = build_title_index(stops)
    url = build_url(agency, stops)
    debug("NextBus predictions for %s: %s" % (agency, url))
    f = urllib.urlopen(url)
    e = ElementTree(file=f)
    predictions = e.findall("//predictions")
    # Drop <predictions> groups with no upcoming <prediction>, then order by
    # the epochTime attribute (NOTE(review): this is a string comparison).
    predictions = filter(lambda el: el.find(".//prediction") is not None, predictions)
    predictions.sort(key=lambda el: el.find(".//prediction").get("epochTime"))
    for n, p in enumerate(predictions):
        routeTag = p.get("routeTag")
        stopTag = p.get("stopTag")
        # Prefer a locally-configured title; fall back to the feed's routeTitle.
        title = title_index.get((routeTag, stopTag), False)
        if title:
            title = "<em>%s</em>" % (title.replace("\n", "<br>"), )
        else:
            title = p.get("routeTitle")
            title = re.sub(r'^Saferide ', '', title)
        title = label + title
        print "<h2>"+title+"</h2>"
        times = p.findall(".//prediction")
        print "<ol class='predictions'>"
        # First arrival, then at most the next two.
        print '<li>%s</li>' % minutes(times.pop(0).get("minutes"))
        for t in times[0:2]:
            print '<li>%s</li>' % minutes(t.get("minutes"))
        print "</ol>"
示例6: parse_bill
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import findall [as 别名]
def parse_bill(self, bill_url, session, chamber):
    """Parse one bill page; fall back to a metadata search when needed.

    :param bill_url: URL of the bill's html index page
    :param session: legislative session identifier
    :param chamber: 'upper' or 'lower'
    :return: the parsed bill object, or None if nothing could be parsed
    """
    bill = None
    bill_page = ElementTree(lxml.html.fromstring(self.urlopen(bill_url)))
    for anchor in bill_page.findall("//a"):
        # The status-link text varies between sessions.
        if anchor.text_content().startswith("status of") or anchor.text_content().startswith(
            "Detailed Information (status)"
        ):
            status_url = anchor.attrib["href"].replace("\r", "").replace("\n", "")
            bill = self.parse_bill_status_page(status_url, bill_url, session, chamber)
        elif anchor.text_content().startswith("This bill in WP"):
            index_url = anchor.attrib["href"]
            index_url = index_url[0 : index_url.rindex("/")]
            # this looks weird. See http://data.opi.mt.gov/bills/BillHtml/SB0002.htm for why
            index_url = index_url[index_url.rindex("http://") :]
            self.add_bill_versions(bill, index_url)
    if bill is None:
        # No bill was found. Maybe something like HB0790 in the 2005 session?
        # We can search for the bill metadata.
        page_name = bill_url.split("/")[-1].split(".")[0]
        bill_type = page_name[0:2]
        bill_number = page_name[2:]
        # FIX: was the bare name `metadata`; the sibling parse_bill
        # implementations access this through self.metadata.
        laws_year = self.metadata["session_details"][session]["years"][0] % 100
        status_url = self.search_url_template % (laws_year, bill_type, bill_number)
        bill = self.parse_bill_status_page(status_url, bill_url, session, chamber)
    return bill
示例7: get_chapters
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import findall [as 别名]
def get_chapters(request):
    """Return the ICD10 chapter list as a JSON HttpResponse.

    Each entry carries a per-language description plus the chapter label.
    """
    language = "pt"
    # language = request.LANGUAGE_CODE.lower()
    # if language == 'pt-br':
    # language = 'pt'
    query = urllib.urlencode({"LI": "CAPITULO"})
    tree = ElementTree()
    tree.parse(urllib.urlopen(settings.ICD10_SERVICE, query))
    data = []
    for response in tree.findall("cid10ws_response"):
        chapter = response.findall("tree/self/term_list/term")[0]
        description = {}
        for lang in ICD10_LANGS:
            xpath = 'record_list/record/descriptor_list/descriptor[@lang="%s"]' % lang
            translated = response.findall(xpath)[0]
            if translated.text:
                description[lang] = "%s - %s" % (
                    chapter.attrib["chapter"],
                    translated.text.strip().capitalize(),
                )
        data.append({"fields": {"description": description, "label": chapter.attrib["tree_id"]}})
    return HttpResponse(json.dumps(data), mimetype="application/json")
示例8: scrape
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import findall [as 别名]
def scrape(self, chamber, session):
    """Collect every bill URL for *session*/*chamber* and save the parsed bills."""
    # Resolve the session to its term's start year.
    for term in self.metadata["terms"]:
        if session in term["sessions"]:
            year = term["start_year"]
            break
    self.versions_dict = self._versions_dict(year)
    base_bill_url = "http://data.opi.mt.gov/bills/%d/BillHtml/" % year
    index_page = ElementTree(lxml.html.fromstring(self.urlopen(base_bill_url)))
    bill_urls = []
    for anchor in index_page.findall("//a"):
        text = anchor.text
        # See 2009 HB 645
        if text.find("govlineveto") != -1:
            continue
        # House bills start with H, Senate bills start with S
        if ((chamber == "lower" and text.startswith("H")) or
                (chamber == "upper" and text.startswith("S"))):
            bill_urls.append("%s%s" % (base_bill_url, text))
    for bill_url in bill_urls:
        bill = self.parse_bill(bill_url, session, chamber)
        if bill:
            self.save_bill(bill)
示例9: parse_bill
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import findall [as 别名]
def parse_bill(self, bill_url, session, chamber):
    """Parse a bill page, falling back to a metadata search, then attach
    the html/pdf versions and the status page as a source.

    :return: the parsed bill object, or None when the page is skipped
        or cannot be parsed.
    """
    # Temporarily skip the differently-formatted house budget bill.
    if "/2011/billhtml/hb0002.htm" in bill_url.lower():
        return
    bill = None
    try:
        doc = lxml.html.fromstring(self.urlopen(bill_url))
    except XMLSyntaxError as e:
        self.logger.warning("Got %r while parsing %r" % (e, bill_url))
        return
    bill_page = ElementTree(doc)
    for anchor in bill_page.findall("//a"):
        # The status-link text varies between sessions.
        if anchor.text_content().startswith("status of") or anchor.text_content().startswith(
            "Detailed Information (status)"
        ):
            status_url = anchor.attrib["href"].replace("\r", "").replace("\n", "")
            bill = self.parse_bill_status_page(status_url, bill_url, session, chamber)
    if bill is None:
        # No bill was found. Maybe something like HB0790 in the 2005 session?
        # We can search for the bill metadata.
        page_name = bill_url.split("/")[-1].split(".")[0]
        bill_type = page_name[0:2]
        bill_number = page_name[2:]
        laws_year = self.metadata["session_details"][session]["years"][0] % 100
        status_url = self.search_url_template % (laws_year, bill_type, bill_number)
        bill = self.parse_bill_status_page(status_url, bill_url, session, chamber)
    # Get versions on the detail page.
    versions = [a["action"] for a in bill["actions"]]
    versions = [a for a in versions if "Version Available" in a]
    if not versions:
        version_name = "Introduced"
    else:
        version = versions.pop()
        if "New Version" in version:
            version_name = "Amended"
        elif "Enrolled" in version:
            version_name = "Enrolled"
        # NOTE(review): version_name is left unbound here when the last
        # "Version Available" action contains neither "New Version" nor
        # "Enrolled" -- confirm those are the only forms the feed emits.
        self.add_other_versions(bill)
    # Add html.
    bill.add_version(version_name, bill_url, mimetype="text/html")
    # Add pdf.
    url = set(bill_page.xpath('//a/@href[contains(., "BillPdf")]')).pop()
    bill.add_version(version_name, url, mimetype="application/pdf")
    # Add status url as a source.
    bill.add_source(status_url)
    return bill
示例10: getterm
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import findall [as 别名]
def getterm(request, lang, code):
    """Look up a DeCS term by tree id and return it as a JSON HttpResponse."""
    query = urllib.urlencode({
        'tree_id': code or '',
        'lang': lang,
    })
    tree = ElementTree()
    tree.parse(urllib.urlopen(settings.DECS_SERVICE, query))
    term = tree.find("decsws_response/tree/self/term_list/term")
    if term is not None:
        # Exact term found: emit one multilingual entry from all descriptors.
        descriptors = tree.findall('decsws_response/record_list/record/descriptor_list/descriptor')
        description = ','.join('"%s":"%s"' % (d.attrib['lang'], d.text) for d in descriptors)
        json = '[%s]' % (JSON_MULTILINGUAL_TERM % (description, term.attrib['tree_id']))
    else:
        # No exact match: list the terms available for the requested language.
        matches = tree.findall('decsws_response/tree/term_list[@lang="%s"]/term' % lang)
        json = '[%s]' % ','.join(JSON_TERM % (m.text.capitalize(), m.attrib['tree_id'])
                                 for m in matches)
    return HttpResponse(json, mimetype='application/json')
示例11: parse_bill
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import findall [as 别名]
def parse_bill(self, bill_url, session, chamber):
    """Parse a bill page, falling back to a metadata search, then attach
    the html/pdf versions and the status page as a source.

    :return: the parsed bill object, or None when the page is skipped.
    """
    # Temporarily skip the differently-formatted house budget bill.
    if '/2011/billhtml/hb0002.htm' in bill_url.lower():
        return
    bill = None
    bill_page = ElementTree(lxml.html.fromstring(self.urlopen(bill_url)))
    for anchor in bill_page.findall('//a'):
        # The status-link text varies between sessions.
        if (anchor.text_content().startswith('status of') or
                anchor.text_content().startswith('Detailed Information (status)')):
            status_url = anchor.attrib['href'].replace("\r", "").replace("\n", "")
            bill = self.parse_bill_status_page(status_url, bill_url, session, chamber)
    if bill is None:
        # No bill was found. Maybe something like HB0790 in the 2005 session?
        # We can search for the bill metadata.
        page_name = bill_url.split("/")[-1].split(".")[0]
        bill_type = page_name[0:2]
        bill_number = page_name[2:]
        laws_year = self.metadata['session_details'][session]['years'][0] % 100
        status_url = self.search_url_template % (laws_year, bill_type, bill_number)
        bill = self.parse_bill_status_page(status_url, bill_url, session, chamber)
    # Get versions on the detail page.
    versions = [a['action'] for a in bill['actions']]
    versions = [a for a in versions if 'Version Available' in a]
    if not versions:
        version_name = 'Introduced'
    else:
        version = versions.pop()
        if 'New Version' in version:
            version_name = 'Amended'
        elif 'Enrolled' in version:
            version_name = 'Enrolled'
        # NOTE(review): version_name is left unbound here when the last
        # "Version Available" action contains neither "New Version" nor
        # "Enrolled" -- confirm those are the only forms the feed emits.
        self.add_other_versions(bill)
    # Add html.
    bill.add_version(version_name, bill_url, mimetype='text/html')
    # Add pdf.
    url = set(bill_page.xpath('//a/@href[contains(., "BillPdf")]')).pop()
    bill.add_version(version_name, url, mimetype='application/pdf')
    # Add status url as a source.
    bill.add_source(status_url)
    return bill
示例12: fetch_text_from_url
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import findall [as 别名]
def fetch_text_from_url(url):
    """Simple helper to scrape the text content of a webpage."""
    opener = urllib2.build_opener()
    request = urllib2.Request(url)
    # change the User Agent to avoid being blocked by Wikipedia
    # downloading a couple of articles ones should not be abusive
    request.add_header('User-Agent', 'pignlproc categorizer')
    html_content = opener.open(request).read()
    tree = ElementTree(lxml.html.document_fromstring(html_content))
    # Collect heading and paragraph text, in tag order then document order.
    parts = []
    for tag in ('h1', 'h2', 'h3', 'h4', 'p'):
        for node in tree.findall('//' + tag):
            parts.append(node.text_content())
    return "\n\n".join(parts)
示例13: __init__
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import findall [as 别名]
def __init__(self, name):
    """Load the XML report *name* and index its current validation statuses."""
    self.name = name
    tree = ElementTree(file = name)
    # (tag, text) pairs from <labels> drive both the ordering and the mapping.
    pairs = tuple((node.tag.strip(), node.text.strip()) for node in tree.find("labels"))
    self.labels = tuple(tag for tag, _ in pairs)
    self.descrs = dict(pairs)
    self.date = tree.getroot().get("date")
    for node in tree.findall("validation_status"):
        status = node.get("status")
        uri = node.text.strip()
        # Skip rsync-transfer bookkeeping and non-current generations.
        if status.startswith("rsync_transfer_") or node.get("generation") != "current":
            continue
        if uri not in self:
            self[uri] = Object(self, uri)
        self[uri].add(status)
示例14: get_bill_urls
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import findall [as 别名]
def get_bill_urls(self, base_bill_url, chamber):
    """Return the URLs of *chamber*'s bills listed on the index page."""
    urls = []
    if base_bill_url is None:
        return urls
    index_page = ElementTree(lxml.html.fromstring(self.urlopen(base_bill_url)))
    for anchor in index_page.findall('//a'):
        text = anchor.text
        # See 2009 HB 645
        if text.find("govlineveto") != -1:
            continue
        # House bills start with H, Senate bills start with S
        if ((chamber == 'lower' and text.startswith('H')) or
                (chamber == 'upper' and text.startswith('S'))):
            urls.append("%s%s" % (base_bill_url, text))
    return urls
示例15: add_bill_versions
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import findall [as 别名]
def add_bill_versions(self, bill, index_url):
    """Attach every html version of *bill* found on the version index page.

    This method won't pick up bill versions where the bill is published
    exclusively in PDF. See 2009 HB 645 for a sample.
    """
    index_page = ElementTree(lxml.html.fromstring(self.urlopen(index_url)))
    tokens = bill["bill_id"].split(" ")
    # FIX: raw string -- the old "\_" was an invalid escape sequence
    # (SyntaxWarning on modern CPython); "_" needs no escaping and the
    # pattern matches identically.
    bill_regex = re.compile(r"%s0*%s_" % (tokens[0], tokens[1]))
    for anchor in index_page.findall("//a"):
        if bill_regex.match(anchor.text_content()) is not None:
            file_name = anchor.text_content()
            # Version code sits between the underscore and the extension;
            # "x" denotes the final version.
            version = file_name[file_name.find("_") + 1 : file_name.find(".")]
            version_title = "Final Version"
            if version != "x":
                version_title = "Version %s" % version
            version_url = index_url[0 : index_url.find("bills") - 1] + anchor.attrib["href"]
            bill.add_version(version_title, version_url)