本文整理汇总了Python中html2text.HTML2Text类的典型用法代码示例。如果您正苦于以下问题：Python html2text.HTML2Text的具体用法？Python html2text.HTML2Text怎么用？Python html2text.HTML2Text使用的例子？那么，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块html2text的用法示例。
在下文中一共展示了html2text.HTML2Text类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: send_email
# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def send_email(to_address, to_name, subject, body_html):
    """Send an HTML email, plus a plain-text alternative, from the master address.

    The plain-text body is produced from ``body_html`` with html2text.
    When ``config.ALLOWED_EMAIL_TO_ADDRESSES`` is set and ``to_address`` is
    not in it, the attempt is logged and nothing is sent. The address is
    not validated in any other way.
    """
    allowed = config.ALLOWED_EMAIL_TO_ADDRESSES
    if allowed is not None and to_address not in allowed:
        # Recipient is outside the allow-list: log and bail out.
        logging.info('send_email: not allowed to send to: %s' % to_address)
        return

    converter = html2text.HTML2Text()
    converter.body_width = 0  # 0 turns off html2text line wrapping
    plain_body = converter.handle(body_html)

    mail.EmailMessage(
        sender=config.MASTER_EMAIL_SEND_ADDRESS,
        subject=subject,
        to='%s <%s>' % (to_name, to_address),
        body=plain_body,
        html=body_html,
    ).send()
示例2: get_article
# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def get_article(article_id, links=False, url=URL):
    # type: (str, bool, str) -> str
    """Return one feed article as text, preceded by a ``# <title>`` heading.

    :param article_id: zero-based position of the article in the feed,
        given as a string (or anything ``int()`` accepts).
    :param links: keep hyperlinks in the converted text when true.
    :param url: feed URL handed to ``_feed``.
    :raises SystemExit: when ``article_id`` is not an integer or is outside
        ``0 .. len(entries) - 1``.
    """
    articles = _feed(url).entries
    try:
        index = int(article_id)
        if index < 0:
            # A negative index would silently count from the end of the
            # list, contradicting the "0 to N" contract in the message.
            raise IndexError(index)
        article = articles[index]
    except (IndexError, ValueError):
        max_id = len(articles) - 1
        msg = "Unknown article ID, use ID from 0 to {}".format(max_id)
        raise SystemExit("Error: {}".format(msg))

    # Prefer the full content; fall back to the summary when the entry has
    # no ``content`` attribute or the content list is empty.
    try:
        html = article.content[0].value
    except (AttributeError, IndexError):
        html = article.summary

    # Convert HTML to plain text
    to_text = html2text.HTML2Text()
    to_text.ignore_links = not links
    text = to_text.handle(html)
    return u"# {}\n\n{}".format(article.title, text)
示例3: get_url_markdown
# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def get_url_markdown(baseurl, start, increment):
    """Download ``baseurl`` and return its content as ASCII text.

    The page is converted with html2text (links and images dropped) and
    transliterated with ``unidecode`` both before and after conversion.

    :param baseurl: URL to fetch.
    :param start: unused here; kept so existing callers keep working.
    :param increment: unused here; kept so existing callers keep working.
    :return: the converted text.
    """
    url_handler = urllib2.urlopen(baseurl)
    try:
        data = url_handler.read()
    finally:
        # Release the connection even when the read fails.
        url_handler.close()

    converter = html2text.HTML2Text()
    converter.ignore_links = True
    converter.ignore_images = True
    converter.body_width = 10000  # very wide wrap column
    # Undecodable bytes are dropped before transliteration.
    text = converter.handle(unidecode(unicode(data, errors='ignore')))
    return unidecode(text)
示例4: get_url_markdown
# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def get_url_markdown(baseurl, start, increment):
    """Download ``baseurl`` and return its content as ASCII text, or None.

    The page is converted with html2text (links and images dropped) and
    transliterated with ``unidecode`` both before and after conversion.

    :param baseurl: URL to fetch.
    :param start: unused here; kept so existing callers keep working.
    :param increment: unused here; kept so existing callers keep working.
    :return: the converted text, or ``None`` when fetching or converting
        fails.
    """
    try:
        url_handler = urllib2.urlopen(baseurl)
        try:
            data = url_handler.read()
        finally:
            # Release the connection even when the read fails.
            url_handler.close()

        converter = html2text.HTML2Text()
        converter.ignore_links = True
        converter.ignore_images = True
        converter.body_width = 10000  # very wide wrap column
        # Undecodable bytes are dropped before transliteration.
        text = converter.handle(unidecode(unicode(data, errors='ignore')))
        return unidecode(text)
    except Exception:
        # Best-effort fetch: report failure as None, but let
        # KeyboardInterrupt and SystemExit propagate.
        return None
示例5: doelse
# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def doelse(url):
    """Download ``url`` and save its ``<body>`` as Markdown under ``./Else/``.

    The HTML-to-Markdown conversion is done by ``write2md``; the body is
    passed to it as HTML so that it is converted exactly once.

    :param url: address of the page to download.
    """
    headers = {
        'User-Agent': random.choice(useragents)
    }
    # Fetch the whole HTML page; the timeout keeps a stalled server from
    # blocking forever.
    res = requests.get(url=url, headers=headers, timeout=30)
    soup = BeautifulSoup(res.text, 'html5lib')
    # Page title, used as the output file name; pages without a <title>
    # get a placeholder instead of raising AttributeError.
    title = soup.title.text if soup.title is not None else 'untitled'
    html = str(soup.body)

    # Output directory: ./Else/ under the current working directory.
    dirpath = os.getcwd() + '/Else/'
    os.makedirs(dirpath, exist_ok=True)
    # Write the file
    write2md(dirpath, title, html)
示例6: write2md
# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def write2md(dirpath, title, article):
    """Convert an HTML fragment to Markdown and write ``<dirpath><title>.md``.

    :param dirpath: output directory, created if missing. It is concatenated
        with the file name as-is, so it must end with a path separator.
    :param title: file name without extension; also echoed when done.
    :param article: HTML to convert.
    """
    # Create the converter
    h2md = html2text.HTML2Text()
    h2md.ignore_links = False
    # Convert the document
    markdown = h2md.handle(article)
    # Create the directory if needed; exist_ok avoids the
    # check-then-create race.
    os.makedirs(dirpath, exist_ok=True)
    # Create the .md file
    with open(dirpath + title + '.md', 'w', encoding="utf8") as f:
        # A line ending in '-' is written without a newline, joining it to
        # the following line; every other line gets its newline back.
        f.writelines(
            line if line.endswith('-') else line + "\n"
            for line in markdown.splitlines()
        )
    print(title+"下载完成....")
示例7: main
# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def main(result, body_width):
    """Add Markdown and plain-text renderings to a Mercury parse result.

    result: a mercury-parser result (as a Python dict); its ``content``
        entry is replaced in place by a dict with ``html``, ``markdown``
        and ``text`` keys, and the same dict object is returned.
    body_width: wrap width handed to both html2text converters.
    """
    source_html = result['content']

    # Markdown rendering keeps links, images and emphasis.
    markdown_maker = HTML2Text()
    markdown_maker.body_width = body_width
    markdown_maker.convert_charrefs = True

    # Plain-text rendering drops emphasis, images and links.
    text_maker = HTML2Text()
    text_maker.body_width = body_width
    text_maker.ignore_emphasis = True
    text_maker.ignore_images = True
    text_maker.ignore_links = True
    text_maker.convert_charrefs = True

    result['content'] = {
        'html': source_html,
        'markdown': unescape(markdown_maker.handle(source_html)),
        'text': unescape(text_maker.handle(source_html)),
    }
    return result
示例8: export_html_to_text_html2text
# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def export_html_to_text_html2text(input_buffer, encoding="utf-8"):
    """
    Convert HTML to text with html2text.
    :param input_buffer: HTML as ``str``, or as ``bytes`` to be decoded first
    :param encoding: codec used when ``input_buffer`` is ``bytes``
    :return: converted text with emphasis, links and images ignored and
        HTML entities unescaped
    """
    # Work on a decoded string from here on.
    if isinstance(input_buffer, bytes):
        markup = input_buffer.decode(encoding)
    else:
        markup = input_buffer

    converter = html2text.HTML2Text()
    for option in ("ignore_emphasis", "ignore_links", "ignore_images"):
        setattr(converter, option, True)

    return html.unescape(converter.handle(markup))
示例9: remove_html_links
# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def remove_html_links(text):
    """Replace every ``<a href=...>...</a>`` element in ``text`` by its link text.

    Each anchor is converted with html2text (links ignored) and the
    stripped result is substituted in place. An opening tag with no
    matching ``</a>`` after it is left untouched.

    :param text: string that may contain HTML anchors.
    :return: ``text`` with anchors replaced, stripped of surrounding
        whitespace.
    """
    import html2text
    h = html2text.HTML2Text()
    h.ignore_links = True
    start = "<a href="
    end = "</a>"

    cursor = 0
    while True:
        s_index = text.find(start, cursor)
        if s_index == -1:
            break
        # Look for the closing tag only AFTER the opening one. Searching
        # from index 0 could hit an earlier "</a>", produce an empty slice
        # and loop forever.
        close_at = text.find(end, s_index)
        if close_at == -1:
            break
        e_index = close_at + len(end)
        title = h.handle(text[s_index:e_index]).strip()
        text = text[:s_index] + title + text[e_index:]
        # Resume after the inserted title so the scan always advances.
        cursor = s_index + len(title)
    return text.strip()
示例10: sayurl
# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
async def sayurl(self, ctx: commands.Context, url):
    """
    Converts a URL to something readable

    Works better on smaller websites
    """
    # Must be a coroutine: the body uses ``async with`` and ``await``,
    # which are a SyntaxError inside a plain ``def``.
    h = html2text.HTML2Text()
    h.ignore_links = True
    # h.ignore_images = True
    h.images_to_alt = True
    h.escape_snob = True
    h.skip_internal_links = True
    h.ignore_tables = True
    h.single_line_break = True
    h.mark_code = True
    h.wrap_links = True
    h.ul_item_mark = "-"

    async with aiohttp.ClientSession() as session:
        site = await fetch_url(session, url)

    # Send the converted page in chunks produced by ``pagify``.
    for page in pagify(h.handle(site)):
        await ctx.send(page)
示例11: get_xs_detail
# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def get_xs_detail(href, title, path):
    """Fetch one chapter and export it according to the module-level flags.

    - ``markdown`` truthy: write ``<path><title>.md``.
    - otherwise, ``xs_pdf`` falsy: write ``<path><title>.pdf``.
    - otherwise: return the chapter HTML to the caller.

    :param href: chapter path, appended to the ``xzl`` base URL.
    :param title: chapter title, used as the output file name.
    :param path: output directory prefix; concatenated as-is.
    :return: the chapter HTML in the third case above, else ``None``.
    """
    url = xzl+href
    print('开始采集' + title + '的详情, 章节地址为: ' + url + '\n')
    text_maker = ht.HTML2Text()
    response = close_session().get(url=url, headers=headers)
    selector = Selector(text=response.text)
    html = selector.css(u'.cata-book-content').extract_first()
    file_name = title
    if markdown:
        md = text_maker.handle(html)
        # Write UTF-8 explicitly: the platform default encoding may not be
        # able to represent the text.
        with open(path + file_name + '.md', 'w', encoding='utf-8') as f:
            f.write(md)
    else:
        if not xs_pdf:
            # Declare the charset in the HTML, otherwise Chinese text is
            # garbled in the PDF.
            html = "<html><head><meta charset='utf-8'></head> " + html + "</html>"
            pdfkit.from_string(html, path + file_name + '.pdf')
        else:
            return html
# 采集专栏列表
示例12: get_zl_detail
# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def get_zl_detail(url, path, name):
    """Fetch one column article and save it as Markdown or PDF.

    The module-level ``markdown`` flag selects the output format. When
    ``hasTime`` is set and the page exposes a creation time, that time is
    prefixed to the file name.

    :param url: article URL.
    :param path: output directory prefix; concatenated as-is.
    :param name: base file name without extension.
    """
    response = close_session().get(url=url, headers=headers)
    selector = Selector(text=response.text)
    text_maker = ht.HTML2Text()
    create_time = selector.css(u'.time abbr::attr(title)').extract_first()
    html = selector.css(u'.xzl-topic-body-content').extract_first()
    file_name = name
    # extract_first() yields None when the selector matches nothing; skip
    # the prefix in that case rather than failing on None + str.
    if hasTime and create_time:
        file_name = create_time+' '+name
    if markdown:
        md = text_maker.handle(html)
        # Write UTF-8 explicitly: the platform default encoding may not be
        # able to represent the text.
        with open(path + file_name + '.md', 'w', encoding='utf-8') as f:
            f.write(md)
    else:
        # Declare the charset in the HTML, otherwise Chinese text is
        # garbled in the PDF.
        html = "<html><head><meta charset='utf-8'></head> " + html + "</html>"
        pdfkit.from_string(html, path + file_name + '.pdf')
# 关闭多余连接
示例13: body
# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def body(self):
    """Return the message body, deriving a plain-text part from HTML if needed.

    The value is kept in ``self._body``. When the display format is
    ``"plain"`` and only an HTML part exists, a plain-text part is generated
    with html2text, stored in ``self.contents["plain"]``, and the body is
    rebuilt from it.

    NOTE(review): the parent's ``body`` is read as an attribute below, so
    this is presumably declared as a property in the full file -- confirm.
    """
    if self._body is None:
        # Evaluated only for its side effect: presumably the parent's
        # ``body`` fills in ``self._body`` -- TODO confirm in the base class.
        super(SQLemail, self).body
        self._body = fix_utf8_encoding(self._body)
    # if there's no plain text version available attempt to make one by
    # sanitising the html version. The output isn't always pretty but it
    # is readable, better than a blank screen and helps the user decide
    # if the message is spam or ham.
    if self.dformat == "plain" and not self.contents["plain"] \
            and self.contents["html"]:
        h = HTML2Text()
        h.ignore_tables = True
        h.images_to_alt = True
        mail_text = h.handle(self.contents["html"])
        self.contents["plain"] = self._post_process_plain(
            smart_text(mail_text))
        # Rebuild the body from the newly generated plain-text part.
        self._body = self.viewmail_plain()
        self._body = fix_utf8_encoding(self._body)
    return self._body
示例14: extract_text
# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def extract_text(self):
    """Decode ``self.data`` as HTML and return its text with header prefixes removed.

    The HTML is converted with html2text, and the result is passed through
    ``self.remove_header_prefixes``.
    """
    decoded = self.data.decode(DEFAULT_TEXT_ENCODING)

    converter = html2text.HTML2Text()
    # See https://github.com/Alir3z4/html2text/blob/master/html2text/cli.py
    for flag in (
        "ignore_links",
        "ignore_emphasis",
        "ignore_images",
        "unicode_snob",
        "ignore_tables",
    ):
        setattr(converter, flag, True)
    converter.decode_errors = "ignore"
    converter.body_width = 0

    return self.remove_header_prefixes(converter.handle(decoded))
示例15: html_to_text
# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def html_to_text(s, ignore_images=False):
    """Convert the HTML string ``s`` to text with html2text.

    :param s: HTML input.
    :param ignore_images: value assigned to the converter's
        ``ignore_images`` option.
    :return: the converter's output for ``s``.
    """
    converter = html2text.HTML2Text()
    converter.ignore_images = ignore_images
    converted = converter.handle(s)
    return converted