This article collects typical usage examples of the Python method lxml.html.document_fromstring. If you have been wondering what html.document_fromstring does, how to call it, or what real-world usage looks like, the curated examples below may help. You can also explore the enclosing module, lxml.html, for further details.
The following presents 15 code examples of html.document_fromstring, drawn from open-source projects and sorted by popularity by default.
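Before the examples, a minimal sketch of the method's core behavior may help: document_fromstring always builds a complete document tree, adding missing <html> and <body> elements, and returns the root element.

from lxml import html

# Minimal sketch: even a bare fragment is wrapped in a full document tree.
doc = html.document_fromstring("<p>Hello <b>world</b></p>")
print(doc.tag)                  # 'html'
print(doc.body.text_content())  # 'Hello world'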
Example 1: __init__
# Required import: from lxml import html [as alias]
# Or: from lxml.html import document_fromstring [as alias]
def __init__(self, file_name, user_id):
    with open(file_name, 'r') as self.opened_file:
        # Instapaper doesn't close <li> tags, which caused infinite recursion
        # when the file was fed to BeautifulSoup directly. Round-tripping
        # through lxml first ensures the <li> tags get closed.
        self.html = html.document_fromstring(self.opened_file.read())
        self.html = html.tostring(self.html)
    self.soup = BeautifulSoup4(self.html)
    self.user = user_id
    self.urls = dict()
    self.check_duplicates = dict()
    self.check_duplicates_query = Bookmark.query.filter(Bookmark.user == self.user,
                                                        Bookmark.deleted == False).all()
    for bmark in self.check_duplicates_query:
        self.check_duplicates[bmark.main_url] = bmark
    self.tags_dict = dict()
    self.tags_set = set()
    self.valid_url = re.compile(
        r'^(?:[a-z0-9\.\-]*)://'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
        r'(?::\d+)?'
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
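The round-trip in the constructor above can be shown in isolation. A minimal sketch (the markup is a made-up stand-in for an Instapaper export):

from lxml import html

# lxml's HTML parser closes the unclosed <li> tags, so the re-serialized
# markup is safe to hand to BeautifulSoup afterwards.
broken = "<ol><li>first<li>second</ol>"
fixed = html.tostring(html.document_fromstring(broken))
# fixed now contains b"...<li>first</li><li>second</li>..."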
Example 2: download_page
# Required import: from lxml import html [as alias]
# Or: from lxml.html import document_fromstring [as alias]
def download_page(url, cookie_jar):
    """
    Request a page using authenticated cookies (a cookiejar).
    Download the HTML source and save it in the browser directory,
    to be used by show_in_browser().
    """
    browser_dir = os.path.join(server_path, 'static/browser')
    delete_directory_files(browser_dir)
    filename = '{}.html'.format(uuid.uuid4())
    filepath = os.path.join(browser_dir, filename)
    try:
        response = cookie_request(url, cookie_jar)
    except requests.RequestException as e:
        return e, None
    doc = html.document_fromstring(response.text)
    with open(filepath, 'wb') as f:
        f.write(html.tostring(doc))
    return None, filename
Example 3: __init__
# Required import: from lxml import html [as alias]
# Or: from lxml.html import document_fromstring [as alias]
def __init__(self, html_content, encoding="utf-8"):
    if isinstance(html_content, bytes):
        html_content = html_content.decode(encoding)
    self.document = document_fromstring(html_content)
    strip_elements(self.document, "style")
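strip_elements here is lxml.etree.strip_elements, which removes every matching element (including its text) from the tree in place. A minimal sketch:

from lxml.etree import strip_elements
from lxml.html import document_fromstring

doc = document_fromstring("<style>p {color: red}</style><p>kept</p>")
strip_elements(doc, "style")
print(doc.text_content())  # 'kept'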
Example 4: parse_html_string
# Required import: from lxml import html [as alias]
# Or: from lxml.html import document_fromstring [as alias]
def parse_html_string(s):
    from lxml import html
    utf8_parser = html.HTMLParser(encoding='utf-8')
    html_tree = html.document_fromstring(s, parser=utf8_parser)
    return html_tree
Example 5: reverse_image_search
# Required import: from lxml import html [as alias]
# Or: from lxml.html import document_fromstring [as alias]
def reverse_image_search(url):
    value = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
    headers = {'User-Agent': value}  # 'User-Bot' in the original is not a real header
    search_url = 'https://www.google.com/searchbyimage?image_url=%s' % url
    resp = requests.get(search_url, headers=headers)
    root = document_fromstring(resp.content)
    href = root.cssselect(".bia")[0].attrib['href']
    print(search_url)
    new_url = "https://www.google.com" + href
    resp = requests.get(new_url, headers=headers)
    # Search resp.text (str) rather than resp.content (bytes) so the
    # pattern and subject are the same type under Python 3.
    return re.search("imgurl=([^&]*)", resp.text).group(1)
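A hypothetical call, for illustration only (the image URL is a placeholder, and Google's markup behind the .bia selector changes without notice, so this scraping approach is brittle):

print(reverse_image_search("https://example.com/photo.jpg"))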
Example 6: parse_items
# Required import: from lxml import html [as alias]
# Or: from lxml.html import document_fromstring [as alias]
def parse_items(self, response, root_name=None):
    """Recursively get data/make requests for all parser hierarchical levels"""
    data = response.request.meta.get("data", {})
    root_name = root_name or response.request.meta["root_name"]
    metadata = self.parsers[root_name]
    xpath_id = metadata["xpath_id"]
    xpath_description = metadata["xpath_description"]
    xpath_url = metadata["xpath_url"]
    item_name = metadata["next"]
    for item in response.xpath(metadata["xpath_items"]):
        tree = document_fromstring(item.extract())
        url = urljoin("https://cnae.ibge.gov.br/", tree.xpath(xpath_url)[0])
        item_id = get_text(tree.xpath(xpath_id))
        item_description = get_text(tree.xpath(xpath_description))
        item_data = {}
        if item_name == "subclasse" or len(item_id) == self.parsers[item_name]["id_length"]:
            next_root_name = item_name
        else:
            descricao = response.xpath("//span[@class = 'destaque']//text()").extract()[0]
            item_data[f"id_{item_name}"] = descricao.split()[0]
            item_data[f"descricao_{item_name}"] = descricao
            next_root_name = self.parsers[item_name]["next"]
        item_data.update({
            f"id_{next_root_name}": item_id.strip(),
            f"descricao_{next_root_name}": item_description.strip(),
        })
        item_data.update(data)
        callback = self.parse_items if next_root_name != "subclasse" else self.parse_subclasse
        yield scrapy.Request(
            url=url,
            meta={"data": item_data, "root_name": next_root_name},
            callback=callback,
        )
Example 7: parse_subclasse
# Required import: from lxml import html [as alias]
# Or: from lxml.html import document_fromstring [as alias]
def parse_subclasse(self, response):
    """Yield the subclass item (last mile of the recursive strategy)"""
    data = response.request.meta["data"]
    tree = document_fromstring(response.body)
    data["notas_explicativas"] = "\n".join(
        [
            line.strip()
            for line in tree.xpath('//div[@id = "notas-explicativas"]//text()')
            if line.strip()
        ]
    )
    data["url"] = response.request.url
    data["id"] = int(data["id_subclasse"].replace("/", "").replace("-", ""))
    data["versao"] = self.versao
    yield data
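The list comprehension that builds notas_explicativas simply collapses the div's text nodes into clean lines; a standalone sketch with made-up markup:

from lxml.html import document_fromstring

tree = document_fromstring('<div id="notas-explicativas"> Nota 1 <p> Nota 2 </p></div>')
lines = [t.strip() for t in tree.xpath('//div[@id = "notas-explicativas"]//text()') if t.strip()]
print("\n".join(lines))  # "Nota 1\nNota 2"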
Example 8: initial_output
# Required import: from lxml import html [as alias]
# Or: from lxml.html import document_fromstring [as alias]
def initial_output(html_partial=False):
    """
    Creates the initial HTML output element according to the given flag.
    :param html_partial: if True, build only a fragment instead of a full HTML page
    :return: html output element
    """
    return fragment_fromstring('<div/>') if html_partial else document_fromstring('<div/>')
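The difference between the two branches, as a minimal sketch: fragment_fromstring returns the element itself, while document_fromstring wraps it in a full document tree.

from lxml.html import document_fromstring, fragment_fromstring

print(fragment_fromstring('<div/>').tag)  # 'div'
print(document_fromstring('<div/>').tag)  # 'html' (the <div> sits under <body>)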
Example 9: replace_img_url
# Required import: from lxml import html [as alias]
# Or: from lxml.html import document_fromstring [as alias]
def replace_img_url(self, content):
    utf8_parser = html.HTMLParser(encoding='utf-8')
    tree = html.document_fromstring(str(content), parser=utf8_parser)
    for _pic_link in tree.xpath("//img"):
        href = str(_pic_link.get('src'))
        # rsplit guards against extra dots earlier in the src value
        pic_id, pic_type = href.rsplit('.', 1)
        _pic_link.set('src', "https://pic4.zhimg.com/" + pic_id + "_b." + pic_type)
    replaced_content = etree.tostring(tree, encoding=str)
    return replaced_content
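A hypothetical call (exporter and the image markup are placeholders; the method assumes src values of the form '<pic_id>.<extension>'):

# src="abc123.jpg" becomes src="https://pic4.zhimg.com/abc123_b.jpg"
fixed = exporter.replace_img_url('<img src="abc123.jpg">')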
Example 10: get_drive_file
# Required import: from lxml import html [as alias]
# Or: from lxml.html import document_fromstring [as alias]
def get_drive_file(file):
    req = urllib.request.Request("https://docs.google.com/feeds/download/documents/export/Export?id=%s&exportFormat=html" % (file,))
    content = urllib.request.urlopen(req).read().decode('UTF-8')  # avoid shadowing the builtin str
    doc = html.document_fromstring(content)
    head = doc.xpath('//head')[0]
    head.tag = 'div'
    body = doc.xpath('//body')[0]
    body.tag = 'div'
    # html.tostring returns bytes by default, so the result is bytes
    return html.tostring(head) + html.tostring(body)
Example 11: corpo_body
# Required import: from lxml import html [as alias]
# Or: from lxml.html import document_fromstring [as alias]
def corpo_body(self):
    """
    Try to extract the body of the page.
    :return:
    """
    if not self.corpo:
        return ""
    doc = html.document_fromstring(self.corpo)
    body = doc.xpath('//body')[0]
    body.tag = 'div'
    return html.tostring(body)
Example 12: processa_link
# Required import: from lxml import html [as alias]
# Or: from lxml.html import document_fromstring [as alias]
def processa_link(self):
    """
    Finds relative links in the e-mail and makes them absolute.
    """
    doc = html.document_fromstring(self.corpo)
    links = doc.xpath('//a')
    for el in links:
        try:
            url = el.attrib['href']
            if '://' not in url:
                el.attrib['href'] = "https://gaia.cri.it%s" % (url,)
        except KeyError:
            continue
    self.corpo = html.tostring(doc, pretty_print=True).decode('UTF-8')
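For reference, lxml.html ships a built-in make_links_absolute that covers the same need and also rewrites src, action, and other link-bearing attributes; a minimal sketch:

from lxml import html

doc = html.document_fromstring('<a href="/utente/">profilo</a>')
doc.make_links_absolute("https://gaia.cri.it")
# the href is now "https://gaia.cri.it/utente/"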
Example 13: setUp
# Required import: from lxml import html [as alias]
# Or: from lxml.html import document_fromstring [as alias]
def setUp(self):
    query_user = User.query.filter_by(email='instapaper@example.com').first()
    if query_user:
        query_bookmarks = Bookmark.query.filter_by(user=query_user.id)
        for bmark in query_bookmarks:
            db.session.delete(bmark)
            db.session.commit()
        db.session.delete(query_user)
        db.session.commit()
    create_user = User()
    create_user.first_name = 'Instapaper'
    create_user.last_name = 'Test'
    create_user.email = 'instapaper@example.com'
    create_user.password = 'instapaper_pass'
    create_user.active = True
    create_user.confirmed_at = datetime.datetime.utcnow()
    db.session.add(create_user)
    db.session.commit()
    self.user = create_user
    with open('Instapaper.html') as json_file:
        create_file = open(os.path.join(app.config['CRESTIFY_UPLOAD_DIRECTORY'], 'test_instapaper.html'), 'w+')
        self.data = html.document_fromstring(json_file.read())
        self.data = html.tostring(self.data)
    self.html_data = BeautifulSoup4(self.data)
    self.bookmarks = {}
    for tag in self.html_data.find_all('h1'):
        parent_elem = tag.find_next_sibling('ol')
        links = parent_elem.find_all('a')
        for link in links:
            title = link.text
            url = link['href']
            tags = [tag.text]
            tags.append('Imported')
            # Thanks Instapaper for not adding timestamps
            self.bookmarks[url] = {
                'href': url,
                'title': title,
                'tags': tags
            }
    create_file.write(self.data)
    self.file_path = create_file.name
    create_file.close()
    init_parser = InstapaperParser(self.file_path, self.user.id)
    init_parser.process()
    init_parser.add_to_database()
    self.query = Bookmark.query.filter_by(user=self.user.id).all()
    self.html_parser = HTMLParser()
Example 14: __init__
# Required import: from lxml import html [as alias]
# Or: from lxml.html import document_fromstring [as alias]
def __init__(self, story, corpus):
    self.story = story
    self.corpus = corpus
    self.parser = html.HTMLParser(encoding=chardet.detect(self.story.html)['encoding'])
    self.tree = html.document_fromstring(self.story.html, parser=self.parser)
    # Elements to delete.
    self.delete_selectors = {
        'bbc': [
            '//blockquote[contains(@class, "twitter-tweet")]',
            '//blockquote[contains(@class, "instagram-media")]'
        ]
    }
    # Title selector
    self.title_selectors = {
        'bbc': [
            '//h1[contains(@class, "story-headline")]',
            '//h1[contains(@class, "story-body__h1")]'
        ]
    }
    # Introduction selector
    self.introduction_selectors = {
        'bbc': [
            '//p[contains(@class, "story-body__introduction")]'
        ]
    }
    # Rest-of-content exclusions: ads, links, bylines, comments, headline and story introduction
    self.bbc_exclude = (
        'not(contains(@class, "story-headline"))'
        ' and not(contains(@class, "story-body__h1"))'
        ' and not(contains(@class, "story-body__introduction"))'
        ' and not(contains(@class, "with-extracted-share-icons"))'
    )
    # Rest-of-content selector
    self.restcontent_selectors = {
        'bbc': [
            '//div[contains(@class, "story-body")]//p[%s]' % self.bbc_exclude  # story-body__inner
        ]
    }
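A minimal sketch of how the selector tables above would typically be applied, assuming it runs as a method of the same class (drop_tree is lxml.html's element removal, which preserves surrounding tail text):

# Hypothetical application: strip embedded tweets and Instagram cards
# before extracting article text.
for selector in self.delete_selectors['bbc']:
    for element in self.tree.xpath(selector):
        element.drop_tree()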
Example 15: test_us_attivazione_credenziali
# Required import: from lxml import html [as alias]
# Or: from lxml.html import document_fromstring [as alias]
def test_us_attivazione_credenziali(self):
    EMAIL_UTENZA = email_fittizzia()
    presidente = crea_persona()
    persona, sede, appartenenza = crea_persona_sede_appartenenza(presidente=presidente)
    sessione_presidente = self.sessione_utente(persona=presidente)
    sessione_presidente.visit("%s%s" % (self.live_server_url, persona.url_profilo_credenziali))
    sessione_presidente.fill('email', EMAIL_UTENZA)
    sessione_presidente.find_by_xpath("//button[@type='submit']").first.click()
    self.assertTrue(
        Utenza.objects.filter(persona=persona).exists(),
        msg="The user account was created correctly"
    )
    self.assertTrue(
        Utenza.objects.get(persona=persona).email == EMAIL_UTENZA,
        msg="The e-mail address was set correctly"
    )
    # Fetch the e-mail that was sent
    msg = Messaggio.objects.filter(oggetto__icontains="credenziali",
                                   oggetti_destinatario__persona=persona)
    self.assertTrue(
        msg.exists(),
        msg="The credentials e-mail was sent"
    )
    corpo_msg = msg.first().corpo
    self.assertTrue(
        EMAIL_UTENZA in corpo_msg,
        msg="The e-mail contains the new address"
    )
    doc = html.document_fromstring(corpo_msg)
    nuova_pwd = doc.xpath("//*[@id='nuova-password']")[0].text.strip()
    utenza = persona.utenza
    utenza.password_testing = nuova_pwd  # Password used to log in
    # Try logging in with the new account.
    sessione_persona = self.sessione_utente(utente=utenza)