当前位置: 首页>>代码示例>>Python>>正文


Python html2text.HTML2Text类代码示例

本文整理汇总了Python中html2text.HTML2Text类的典型用法代码示例。如果您正苦于以下问题:Python html2text.HTML2Text类的具体用法?Python html2text.HTML2Text怎么用?Python html2text.HTML2Text使用的例子?那么,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块html2text的用法示例。


在下文中一共展示了html2text.HTML2Text类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: send_email

# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def send_email(to_address, to_name, subject, body_html):
    """Send an HTML email, with a generated plain-text alternative.

    The message is sent from ``config.MASTER_EMAIL_SEND_ADDRESS``. When
    ``config.ALLOWED_EMAIL_TO_ADDRESSES`` is not ``None`` it acts as an
    allow-list: a recipient that is not in it is logged and skipped.
    The address itself is not validated.

    Args:
        to_address: Recipient email address.
        to_name: Recipient display name, placed before the address.
        subject: Subject line of the message.
        body_html: HTML body; the plain-text body is derived from it.
    """

    allow_list = config.ALLOWED_EMAIL_TO_ADDRESSES
    recipient_blocked = allow_list is not None and to_address not in allow_list
    if recipient_blocked:
        # Not allowed to send to this address
        logging.info('send_email: not allowed to send to: %s' % to_address)
        return

    # Derive the plain-text alternative; body_width = 0 turns off
    # html2text's line wrapping.
    converter = html2text.HTML2Text()
    converter.body_width = 0
    plain_body = converter.handle(body_html)

    recipient = '%s <%s>' % (to_name, to_address)
    outgoing = mail.EmailMessage(
        sender=config.MASTER_EMAIL_SEND_ADDRESS,
        subject=subject,
        to=recipient,
        body=plain_body,
        html=body_html,
    )
    outgoing.send()
开发者ID:adam-p,项目名称:danforth-east,代码行数:26,代码来源:gapps.py

示例2: get_article

# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def get_article(article_id, links=False, url=URL):
    # type: (str, bool, str) -> str
    """Return one article of the feed at ``url`` as text with a title heading.

    ``article_id`` is converted with ``int()`` and used as an index into the
    feed's entries. The article HTML is taken from ``content[0].value`` when
    the entry has it, otherwise from ``summary``. Links are kept in the
    output only when ``links`` is true.

    Raises:
        SystemExit: ``article_id`` is not an integer or is out of range.
    """
    entries = _feed(url).entries
    try:
        entry = entries[int(article_id)]
    except (IndexError, ValueError):
        highest_id = len(entries) - 1
        hint = "Unknown article ID, use ID from 0 to {}".format(highest_id)
        raise SystemExit("Error: {}".format(hint))

    # Prefer the full content; fall back to the summary when the entry
    # has no ``content`` attribute.
    try:
        markup = entry.content[0].value
    except AttributeError:
        markup = entry.summary

    # Render the HTML as plain text, dropping links unless requested
    converter = html2text.HTML2Text()
    converter.ignore_links = not links
    rendered = converter.handle(markup)

    return u"# {}\n\n{}".format(entry.title, rendered)
开发者ID:realpython,项目名称:reader,代码行数:25,代码来源:feed.py

示例3: get_url_markdown

# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def get_url_markdown(baseurl,start,increment):
  """Download ``baseurl`` and return the page converted to plain text.

  The response body is decoded with ``unicode(..., errors='ignore')``,
  passed through ``unidecode``, converted by html2text with links and
  images ignored and a body width of 10000, and the result is passed
  through ``unidecode`` once more before being returned.

  ``start`` and ``increment`` are accepted for interface compatibility
  but are not used by the download.

  Errors from ``urllib2.urlopen`` or from reading the response propagate
  to the caller.
  """
  urlHandler = urllib2.urlopen(baseurl)
  try:
    data = urlHandler.read()
  finally:
    # Release the connection even when reading the response fails.
    urlHandler.close()
  h = html2text.HTML2Text()
  h.ignore_links = True
  h.ignore_images = True
  # Very large width so that lines are effectively never wrapped.
  h.body_width = 10000
  data = h.handle(unidecode(unicode(data,errors='ignore')))
  return unidecode(data)
开发者ID:schollz,项目名称:extract_recipe,代码行数:24,代码来源:downloadRecipes.py

示例4: get_url_markdown

# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def get_url_markdown(baseurl,start,increment):
  """Download ``baseurl`` and return the page converted to plain text.

  Best-effort variant: returns ``None`` when the download or the
  conversion raises an ordinary exception, instead of propagating it.

  The response body is decoded with ``unicode(..., errors='ignore')``,
  passed through ``unidecode``, converted by html2text with links and
  images ignored and a body width of 10000, and the result is passed
  through ``unidecode`` once more before being returned.

  ``start`` and ``increment`` are accepted for interface compatibility
  but are not used by the download.
  """
  try:
    urlHandler = urllib2.urlopen(baseurl)
    try:
      data = urlHandler.read()
    finally:
      # Release the connection even when reading the response fails.
      urlHandler.close()
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True
    # Very large width so that lines are effectively never wrapped.
    h.body_width = 10000
    data = h.handle(unidecode(unicode(data,errors='ignore')))
    return unidecode(data)
  except Exception:
    # Deliberately best-effort, but KeyboardInterrupt and SystemExit are
    # no longer swallowed, so the caller's loop can still be interrupted.
    return None
开发者ID:schollz,项目名称:extract_recipe,代码行数:27,代码来源:MdownloadRecipes.py

示例5: doelse

# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def doelse(url):
    """Fetch ``url``, convert its ``<body>`` with html2text and save it.

    The request is sent with a User-Agent picked at random from
    ``useragents``. The page title and the converted body are handed to
    ``write2md`` together with the ``Else/`` directory under the current
    working directory, which is created first if it does not exist.
    """
    request_headers = {'User-Agent': random.choice(useragents)}
    # Fetch the whole HTML page
    response = requests.get(url=url, headers=request_headers)

    converter = html2text.HTML2Text()
    converter.ignore_links = False
    parsed_page = BeautifulSoup(response.text, 'html5lib')
    # Page title
    page_title = parsed_page.title.text
    body_markup = str(parsed_page.body)
    converted = converter.handle(body_markup)

    # Output directory lives under the current working directory
    target_dir = os.getcwd() + '/Else/'
    # Create the directory when it does not exist yet
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    # Write the result to a file
    write2md(target_dir, page_title, converted)
开发者ID:ylfeng250,项目名称:FengTools,代码行数:21,代码来源:html2md.py

示例6: write2md

# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def write2md(dirpath,title,article):
    """Convert ``article`` with html2text and write it to ``<title>.md``.

    Args:
        dirpath: Output directory, concatenated directly with the file
            name; it is created when it does not exist.
        title: File name without extension; also used in the final message.
        article: Text passed to ``HTML2Text.handle`` before writing.
    """
    # Build the converter and convert the document
    converter = html2text.HTML2Text()
    converter.ignore_links = False
    converted = converter.handle(article)

    # Create the output directory when it does not exist yet
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

    # Write the .md file
    target = dirpath+title+'.md'
    with open(target,'w',encoding="utf8") as md_file:
        for row in converted.splitlines():
            # A line ending in '-' is written without a newline, so the
            # next line continues directly after it.
            terminator = "" if row.endswith('-') else "\n"
            md_file.write(row + terminator)
    print(title+"下载完成....")
开发者ID:ylfeng250,项目名称:FengTools,代码行数:20,代码来源:html2md.py

示例7: main

# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def main(result, body_width):
    """Add Markdown and plain-text renderings to a Mercury parse result.

    ``result['content']`` (HTML) is replaced in place by a dict with the
    keys ``'html'`` (the original value), ``'markdown'`` and ``'text'``.
    Both renderings are passed through ``unescape``.

    result: a mercury-parser result (as a Python dict)
    body_width: value assigned to ``body_width`` on both converters
    """
    source_html = result['content']

    # Markdown rendering: converter defaults apart from width and charrefs.
    to_markdown = HTML2Text()
    to_markdown.body_width = body_width
    to_markdown.convert_charrefs = True

    # Plain-text rendering: additionally ignore emphasis, images and links.
    to_plain = HTML2Text()
    to_plain.body_width = body_width
    to_plain.convert_charrefs = True
    for option in ('ignore_emphasis', 'ignore_images', 'ignore_links'):
        setattr(to_plain, option, True)

    rendered_markdown = unescape(to_markdown.handle(source_html))
    rendered_text = unescape(to_plain.handle(source_html))
    result['content'] = {
        'html': source_html,
        'markdown': rendered_markdown,
        'text': rendered_text,
    }
    return result
开发者ID:zyocum,项目名称:reader,代码行数:22,代码来源:reader.py

示例8: export_html_to_text_html2text

# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def export_html_to_text_html2text(input_buffer, encoding="utf-8"):
    """
    Convert HTML to text via html2text, ignoring emphasis, links and images.
    :param input_buffer: input HTML, as ``str`` or ``bytes``
    :param encoding: encoding used to decode ``input_buffer`` when it is bytes
    :return: converted text with HTML entities unescaped
    """
    # Work on a decoded string; bytes are decoded with the given encoding
    if isinstance(input_buffer, bytes):
        markup = input_buffer.decode(encoding)
    else:
        markup = input_buffer

    # Configure the converter to ignore emphasis, links and images
    converter = html2text.HTML2Text()
    for option in ("ignore_emphasis", "ignore_links", "ignore_images"):
        setattr(converter, option, True)

    converted = converter.handle(markup)
    return html.unescape(converted)
开发者ID:LexPredict,项目名称:lexpredict-contraxsuite,代码行数:20,代码来源:html2text.py

示例9: remove_html_links

# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def remove_html_links(text):
    """Replace every ``<a href=...>...</a>`` in ``text`` by its link text.

    Each complete anchor (from ``<a href=`` up to the next ``</a>`` that
    follows it) is converted with html2text with links ignored, and the
    stripped result is spliced in place of the anchor. Scanning resumes
    after the inserted text, so the loop always terminates.

    A ``</a>`` that appears before the first ``<a href=``, or an opening
    tag with no closing tag after it, is left untouched. Previously the
    closing tag was searched from the start of the string, which produced
    an empty slice in that situation and made the loop spin forever.

    Args:
        text: String that may contain HTML anchors.

    Returns:
        ``text`` with complete anchors replaced, stripped of surrounding
        whitespace.
    """
    start = "<a href="
    end = "</a>"

    converter = None
    search_from = 0
    while True:
        s_index = text.find(start, search_from)
        if s_index == -1:
            break

        # The closing tag must come after the opening tag.
        close_at = text.find(end, s_index)
        if close_at == -1:
            break
        e_index = close_at + len(end)

        if converter is None:
            # Built on first use, so text without a complete anchor does
            # not need html2text at all.
            import html2text

            converter = html2text.HTML2Text()
            converter.ignore_links = True

        html_link = text[s_index:e_index]
        title = converter.handle(html_link).strip()

        # Splice by position rather than str.replace, so exactly this
        # anchor is substituted and progress is guaranteed.
        text = text[:s_index] + title + text[e_index:]
        search_from = s_index + len(title)

    return text.strip()
开发者ID:Endogen,项目名称:OpenCryptoBot,代码行数:20,代码来源:utils.py

示例10: sayurl

# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
async def sayurl(self, ctx: commands.Context, url):
        """
        Converts a URL to something readable

        Works better on smaller websites
        """

        # NOTE: this must be a coroutine function ("async def"); the body
        # uses "async with" and "await", which are a SyntaxError in a
        # plain "def".

        h = html2text.HTML2Text()
        h.ignore_links = True
        # Images are reduced to their alt text (images_to_alt) instead of
        # being dropped with ignore_images.
        h.images_to_alt = True

        h.escape_snob = True
        h.skip_internal_links = True
        h.ignore_tables = True
        h.single_line_break = True
        h.mark_code = True
        h.wrap_links = True
        h.ul_item_mark = "-"

        # The session is only needed for the download and is closed
        # before any message is sent.
        async with aiohttp.ClientSession() as session:
            site = await fetch_url(session, url)

        # Send the converted text in the chunks produced by pagify.
        for page in pagify(h.handle(site)):
            await ctx.send(page)
开发者ID:bobloy,项目名称:Fox-V3,代码行数:27,代码来源:sayurl.py

示例11: get_xs_detail

# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def get_xs_detail(href, title, path):
    """Fetch one chapter page and export its content.

    The page at ``xzl + href`` is requested and the first element
    matching ``.cata-book-content`` is extracted. What happens next
    depends on names defined outside this function:

    * ``markdown`` truthy: the HTML is converted with html2text and
      written to ``path + title + '.md'``.
    * otherwise, ``xs_pdf`` truthy: the extracted HTML is returned.
    * otherwise: the HTML is rendered to ``path + title + '.pdf'``.

    Returns the extracted HTML only in the ``xs_pdf`` case, else ``None``.
    """
    chapter_url = xzl+href
    print('开始采集' + title + '的详情, 章节地址为: ' + chapter_url + '\n')
    converter = ht.HTML2Text()
    page = close_session().get(url=chapter_url, headers=headers)
    html = Selector(text=page.text).css(u'.cata-book-content').extract_first()
    target = path + title

    if markdown:
        converted = converter.handle(html)
        with open(target + '.md', 'w') as md_file:
            md_file.write(converted)
    elif xs_pdf:
        return html
    else:
        # Declare the charset inside the HTML, otherwise Chinese text is garbled
        html = "<html><head><meta charset='utf-8'></head> " + html + "</html>"
        pdfkit.from_string(html, target + '.pdf')


# 采集专栏列表 
开发者ID:iizvv,项目名称:xzl,代码行数:24,代码来源:xzl.py

示例12: get_zl_detail

# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def get_zl_detail(url, path, name):
    """Fetch one column article and export it as Markdown or PDF.

    The first element matching ``.xzl-topic-body-content`` is extracted
    from the page at ``url``. The output file is named ``name``, prefixed
    with the value of ``.time abbr``'s ``title`` attribute and a space
    when the outside name ``hasTime`` is truthy. When the outside name
    ``markdown`` is truthy the content is converted with html2text and
    written to a ``.md`` file; otherwise it is rendered to a ``.pdf``.
    """
    page = close_session().get(url=url, headers=headers)
    selector = Selector(text=page.text)
    converter = ht.HTML2Text()
    created_at = selector.css(u'.time abbr::attr(title)').extract_first()
    html = selector.css(u'.xzl-topic-body-content').extract_first()

    base_name = create_name = name
    if hasTime:
        create_name = created_at+' '+base_name
    target = path + create_name

    if not markdown:
        # Declare the charset inside the HTML, otherwise Chinese text is garbled
        html = "<html><head><meta charset='utf-8'></head> " + html + "</html>"
        pdfkit.from_string(html, target + '.pdf')
    else:
        converted = converter.handle(html)
        with open(target + '.md', 'w') as md_file:
            md_file.write(converted)


# 关闭多余连接 
开发者ID:iizvv,项目名称:xzl,代码行数:22,代码来源:xzl.py

示例13: body

# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def body(self):
        """Return the message body, building a plain-text one from HTML if needed.

        On first access (``self._body is None``) the parent class's ``body``
        is evaluated and ``self._body`` is passed through
        ``fix_utf8_encoding``.
        NOTE(review): presumably the parent's ``body`` populates
        ``self._body`` as a side effect; confirm against the base class.
        """
        if self._body is None:
            super(SQLemail, self).body
            self._body = fix_utf8_encoding(self._body)

        # When the plain format is requested but only an HTML part exists,
        # derive a plain version from the HTML. The result is not always
        # pretty, but it is readable, beats a blank screen and helps the
        # user decide whether the message is spam or ham.
        plain_requested = self.dformat == "plain"
        if plain_requested and not self.contents["plain"] \
                and self.contents["html"]:
            converter = HTML2Text()
            converter.images_to_alt = True
            converter.ignore_tables = True
            converted = converter.handle(self.contents["html"])
            cleaned = self._post_process_plain(smart_text(converted))
            self.contents["plain"] = cleaned
            self._body = self.viewmail_plain()
            self._body = fix_utf8_encoding(self._body)

        return self._body
开发者ID:modoboa,项目名称:modoboa-amavis,代码行数:23,代码来源:sql_email.py

示例14: extract_text

# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def extract_text(self):
        """Return the document's HTML converted to text.

        ``self.data`` is decoded with ``DEFAULT_TEXT_ENCODING``, converted
        with html2text (links, emphasis, images and tables ignored, no
        wrapping) and passed through ``self.remove_header_prefixes``.
        """
        markup = self.data.decode(DEFAULT_TEXT_ENCODING)

        converter = html2text.HTML2Text()
        # See https://github.com/Alir3z4/html2text/blob/master/html2text/cli.py
        enabled_flags = (
            "ignore_links",
            "ignore_emphasis",
            "ignore_images",
            "unicode_snob",
            "ignore_tables",
        )
        for flag in enabled_flags:
            setattr(converter, flag, True)
        converter.decode_errors = "ignore"
        # body_width = 0 turns off line wrapping
        converter.body_width = 0

        converted = converter.handle(markup)
        return self.remove_header_prefixes(converted)
开发者ID:mideind,项目名称:Greynir,代码行数:18,代码来源:doc.py

示例15: html_to_text

# 需要导入模块: import html2text [as 别名]
# 或者: from html2text import HTML2Text [as 别名]
def html_to_text(s, ignore_images=False):
    """Convert the HTML string ``s`` to text with html2text.

    Args:
        s: HTML to convert.
        ignore_images: Value assigned to the converter's ``ignore_images``
            option.

    Returns:
        The output of ``HTML2Text.handle`` for ``s``.
    """
    converter = html2text.HTML2Text()
    converter.ignore_images = ignore_images
    converted = converter.handle(s)
    return converted
开发者ID:fpsw,项目名称:Servo,代码行数:6,代码来源:utils.py


注:本文中的html2text.HTML2Text方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。