本文整理汇总了Python中lxml.html方法的典型用法代码示例。如果您正苦于以下问题:Python lxml.html方法的具体用法?Python lxml.html怎么用?Python lxml.html使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类lxml的用法示例。
在下文中一共展示了lxml.html方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: get_tags
# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def get_tags(doc):
    """
    Collect one label per node of a parsed document, in iteration order.

    :param doc: lxml parsed object; ``doc.getroot().iter()`` is walked
    :return: list holding the tag of every ``HtmlElement`` and the literal
        ``'comment'`` for every ``HtmlComment``
    :raises ValueError: when a node is neither of those two kinds
    """
    def label_for(node):
        # Guard clauses: the two supported node kinds return early,
        # anything else is rejected.
        if isinstance(node, lxml.html.HtmlElement):
            return node.tag
        if isinstance(node, lxml.html.HtmlComment):
            return 'comment'
        raise ValueError('Don\'t know what to do with element: {}'.format(node))

    return [label_for(node) for node in doc.getroot().iter()]
示例2: structural_similarity
# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def structural_similarity(document_1, document_2):
    """
    Score how alike the tag sequences of two HTML documents are.

    :param document_1: html string
    :param document_2: html string
    :return: the ``difflib.SequenceMatcher`` ratio of the two tag
        sequences, or ``0`` when parsing either document raises
    """
    try:
        tree_1 = lxml.html.parse(StringIO(document_1))
        tree_2 = lxml.html.parse(StringIO(document_2))
    except Exception as e:
        # A parse failure is printed and scored as "no similarity".
        print(e)
        return 0

    # Tag extraction happens outside the try block, so errors raised by
    # get_tags propagate to the caller.
    matcher = difflib.SequenceMatcher(None, get_tags(tree_1), get_tags(tree_2))
    return matcher.ratio()
示例3: testParseCurrencyListAll
# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def testParseCurrencyListAll(self):
    """Test parseCurrencyListAll.

    Reads the ``example/currencylist.html`` fixture that sits next to this
    file, parses it, and checks the number of entries as well as the first
    and the last parsed record.
    """
    fixture_path = "{0}/example/currencylist.html".format(
        os.path.dirname(os.path.abspath(__file__)))
    # The context manager closes the fixture even when reading or decoding
    # raises, which the previous open/read/close sequence did not.
    with codecs.open(fixture_path, 'r', 'utf-8') as f:
        page_source = f.read()

    data = parseCurrencyListAll(page_source)
    self.assertEqual(len(data), 452)

    expectedFirst = {
        'name': 'Bitcoin',
        'slug': 'bitcoin',
        'symbol': 'BTC',
        'explorer_link': 'http://blockchain.info'
    }
    self.assertEqual(data[0], expectedFirst)

    expectedLast = {
        'name': 'Marscoin',
        'slug': 'marscoin',
        'symbol': 'MRS',
        'explorer_link': 'http://explore.marscoin.org/chain/Marscoin/'
    }
    self.assertEqual(data[-1], expectedLast)
示例4: remove_html_encode_errors
# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def remove_html_encode_errors(self, headers, error):
    """
    Delete an unwanted substring from the text of every header cell, in place.

    Meant for leftover html special characters (Eg. &nbsp;), encoding errors
    or other stray unicode text.

    :param headers: rows list of headers; the 'th' entry of each cell is rewritten
    :param error: unicode string you want to delete from header cells
    :return: nothing
    """
    # Flatten rows -> cells lazily; the visiting order matches a nested loop.
    every_cell = (cell for row in headers for cell in row)
    for cell in every_cell:
        cell['th'] = cell['th'].replace(error, u'')
示例5: url_composer
# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def url_composer(self, query, service):
    """
    This function is used to compose a url to call some web services, such as sparql endpoints.

    :param query: is the string used in some rest calls; it is url-encoded before use
    :param service: type of service you request ('dbpedia' or 'html')
    :return url: the url composed, or the string "ERROR" for any other service
    """
    # urllib.quote_plus only exists on Python 2; Python 3 moved it to
    # urllib.parse, so resolve whichever one this interpreter provides.
    try:
        from urllib.parse import quote_plus
    except ImportError:
        from urllib import quote_plus

    # Encode special characters so the query can be embedded in a url.
    query = quote_plus(query)

    # The branches differ by requested service, but in every case the url is
    # built from pre-formatted strings on self plus the encoded query.
    if service == 'dbpedia':
        url = self.dbpedia_sparql_url + query + self.call_format_sparql
    elif service == 'html':
        url = self.html_format + query
    else:
        url = "ERROR"
    return url
示例6: rss_item_to_relevant_data
# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def rss_item_to_relevant_data(self, item):
    """
    Abstract hook: pull the relevant data out of one RSS item.

    Args:
        `item`:
            One item of the RSS feed, i.e. an element of the list
            produced by a `<lxml etree/html document>.xpath(...)`
            call (see the source code of the _process_rss() method).

    Returns:
        Some hashable object, for example a tuple or a string --
        the exact type is up to the implementation provided by a
        particular subclass of BaseRSSCollector.

    Raises:
        NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError()
示例7: test_terms_of_service
# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def test_terms_of_service(self) -> None:
    """The response for '/' contains the ToS prompt for each stored tos_version tried."""
    hamlet = self.example_user('hamlet')
    self.login_user(hamlet)

    for stored_version in (None, '1.1', '2.0.3.4'):
        hamlet.tos_version = stored_version
        hamlet.save()

        # Both settings overrides are active while the page is fetched.
        with self.settings(TERMS_OF_SERVICE='whatever'):
            with self.settings(TOS_VERSION='99.99'):
                response = self.client_get('/', {'stream': 'Denmark'})

        page = response.content.decode('utf-8')
        self.assertIn('Accept the new Terms of Service', page)
示例8: test_invites_by_admins_only
# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def test_invites_by_admins_only(self) -> None:
    """With invite_by_admins_only set, the invite text appears on the home page only after the user gets the admin role."""
    hamlet = self.example_user('hamlet')
    realm = hamlet.realm
    realm.invite_by_admins_only = True
    realm.save()

    self.login_user(hamlet)
    self.assertFalse(hamlet.is_realm_admin)

    # Before the role change: the invite text must be absent.
    page = self._get_home_page().content.decode('utf-8')
    self.assertNotIn('Invite more users', page)

    hamlet.role = UserProfile.ROLE_REALM_ADMINISTRATOR
    hamlet.save()

    # After the role change: the invite text must be present.
    page = self._get_home_page().content.decode('utf-8')
    self.assertIn('Invite more users', page)
示例9: test_quotation_splitter_inside_blockquote
# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def test_quotation_splitter_inside_blockquote():
    """A blockquote holding the "On ... wrote:" line is removed, leaving only the reply."""
    msg_body = """Reply
<blockquote>
<div>
On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:
</div>
<div>
Test
</div>
</blockquote>"""
    expected = "<html><head></head><body>Reply</body></html>"
    extracted = quotations.extract_from_html(msg_body)
    # Whitespace is stripped before comparing, so only the markup matters.
    eq_(expected, RE_WHITESPACE.sub('', extracted))
示例10: test_regular_blockquote
# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def test_regular_blockquote():
    """A blockquote before the "On ... wrote:" line is kept; the one after it is removed."""
    msg_body = """Reply
<blockquote>Regular</blockquote>
<div>
On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:
</div>
<blockquote>
<div>
<blockquote>Nested</blockquote>
</div>
</blockquote>
"""
    expected = "<html><head></head><body>Reply<blockquote>Regular</blockquote></body></html>"
    extracted = quotations.extract_from_html(msg_body)
    # Whitespace is stripped before comparing, so only the markup matters.
    eq_(expected, RE_WHITESPACE.sub('', extracted))
示例11: test_validate_output_html
# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def test_validate_output_html():
    """The extracted output is wrapped in <html> tags and contains no self-closing <div/>."""
    msg_body = """Reply
<div>
On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:
<blockquote>
<div>
Test
</div>
</blockquote>
</div>
<div/>
"""
    rendered = quotations.extract_from_html(msg_body)

    has_html_wrapper = '<html>' in rendered and '</html>' in rendered
    ok_(has_html_wrapper,
        'Invalid HTML - <html>/</html> tag not present')

    has_self_closing_div = '<div/>' in rendered
    ok_(not has_self_closing_div,
        'Invalid HTML output - <div/> element is not valid')
示例12: test_date_block
# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def test_date_block():
    """The inner div holding the Date/To/From/Subject lines is removed; the message before it stays."""
    msg_body = """
<div>
message<br>
<div>
<hr>
Date: Fri, 23 Mar 2012 12:35:31 -0600<br>
To: <a href="mailto:bob@example.com">bob@example.com</a><br>
From: <a href="mailto:rob@example.com">rob@example.com</a><br>
Subject: You Have New Mail From Mary!<br><br>
text
</div>
</div>
"""
    expected = '<html><head></head><body><div>message<br></div></body></html>'
    extracted = quotations.extract_from_html(msg_body)
    # Whitespace is stripped before comparing, so only the markup matters.
    eq_(expected, RE_WHITESPACE.sub('', extracted))
示例13: test_from_block_and_quotations_in_separate_divs
# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def test_from_block_and_quotations_in_separate_divs():
    """The From/Date div and the quoted-message div that follows it are both removed."""
    msg_body = '''
Reply
<div>
<hr/>
<div>
<font>
<b>From: bob@example.com</b>
<b>Date: Thu, 24 Mar 2016 08:07:12 -0700</b>
</font>
</div>
<div>
Quoted message
</div>
</div>
'''
    expected = '<html><head></head><body>Reply<div><hr></div></body></html>'
    extracted = quotations.extract_from_html(msg_body)
    # Whitespace is stripped before comparing, so only the markup matters.
    eq_(expected, RE_WHITESPACE.sub('', extracted))
示例14: test_remove_namespaces
# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def test_remove_namespaces():
    """The namespaced <o:p> tags come out as plain <p>, while "xmlns" still appears in the output."""
    msg_body = """
<html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns="http://www.w3.org/TR/REC-html40">
<body>
<o:p>Dear Sir,</o:p>
<o:p>Thank you for the email.</o:p>
<blockquote>thing</blockquote>
</body>
</html>
"""
    rendered = quotations.extract_from_html(msg_body)

    # Fragments that must appear in the output.
    for fragment in ("<p>", "xmlns"):
        assert_true(fragment in rendered)

    # Fragments that must not appear in the output.
    for fragment in ("<o:p>", "<xmlns:o>"):
        assert_true(fragment not in rendered)
示例15: scrape
# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def scrape():
    """
    Fetch the page at URL and return the rows of its first table as a DataFrame.

    Only the 1st, 2nd and 4th cell of each table row are kept. The first row
    is skipped, as are empty rows and rows whose first cell is "",
    "States" or "Territories". Columns listed in INT_COLS are converted
    to int after commas and surrounding "*" characters are removed.

    :return: pandas DataFrame with columns COLS
    """
    # NOTE(review): the random "_" parameter looks like a cache-buster -- confirm.
    response = requests.get(URL, params={"_": random.random()})
    dom = lxml.html.fromstring(response.content)
    first_table = dom.cssselect("table")[0]

    wanted_cells = "td:nth-child(1), td:nth-child(2), td:nth-child(4)"
    rows = []
    for tr in first_table.cssselect("tr"):
        rows.append([parse_cell(td.text_content())
                     for td in tr.cssselect(wanted_cells)])

    skipped_labels = ["", "States", "Territories"]
    data = []
    for row in rows[1:]:
        if row and row[0] not in skipped_labels:
            data.append(row)

    df = pd.DataFrame(data, columns=COLS)
    for col in INT_COLS:
        cleaned = df[col].str.replace(",", "").str.strip("*")
        df[col] = cleaned.astype(int)
    return df