本文整理汇总了Python中readability.Document类的典型用法代码示例。如果您正苦于以下问题：Python readability.Document的具体用法？Python readability.Document怎么用？Python readability.Document使用的例子？那么，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块readability的用法示例。
下文共展示了readability.Document类的9个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: extract
# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def extract(self, item):
    """Build an ArticleCandidate from a crawled item using readability.

    :param item: A NewscrawlerItem to parse; the page source is read from
        ``item['spider_response'].body``.
    :return: ArticleCandidate containing the recovered article data.
    """
    # Document receives a deep copy of the body, not the item's own object.
    raw_body = deepcopy(item['spider_response'].body)
    readable = Document(raw_body)
    summary_html = readable.summary()

    candidate = ArticleCandidate()
    candidate.extractor = self._name
    candidate.title = readable.short_title()
    candidate.description = summary_html

    # The remaining fields come from this extractor's own helper methods.
    candidate.text = self._text(item)
    candidate.topimage = self._topimage(item)
    candidate.author = self._author(item)
    candidate.publish_date = self._publish_date(item)
    candidate.language = self._language(item)
    return candidate
示例2: story_readability
# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def story_readability(content):
    """
    Return the readability summary of ``content`` as a partial HTML fragment.

    Falsy or whitespace-only input yields ``""``; a falsy summary is also
    normalised to ``""``.

    >>> content = '<p>hello <b>world</b><br/>你好<i>世界</i></p>'
    >>> print(story_readability(content))
    <body id="readabilityBody"><p>hello <b>world</b><br/>你好<i>世界</i></p></body>
    """
    if content and content.strip():
        fragment = ReadabilityDocument(content).summary(html_partial=True)
        return fragment if fragment else ""
    return ""
示例3: get_main_html
# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def get_main_html(html):
    """Return the readability summary of ``html`` (the result of ``Document.summary()``)."""
    return Document(html).summary()
示例4: run_readability
# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def run_readability(htmlstring):
    """Try with the Python3 port of readability.js.

    Returns the summary produced by ``Document(htmlstring).summary()``.
    On any exception the error is printed and ``''`` is returned instead.
    """
    extracted = ''
    try:
        # NOTE: the summary is returned as-is; no sanitize() pass is applied.
        extracted = Document(htmlstring).summary()
    except Exception as exc:
        print('Exception:', exc)
    return extracted
示例5: clean
# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def clean(self):
    """Download the article and strip it of HTML formatting.

    Stores the HTTP response on ``self.res`` and the html2text rendering of
    the readability summary on ``self.plaintext``.
    """
    request_headers = {'User-Agent': ua}
    self.res = requests.get(self.url, headers=request_headers, timeout=30)
    readable = Document(self.res.text)

    converter = html2text.HTML2Text()
    # Drop links, emphasis markers and images from the rendered text.
    for option in ('ignore_links', 'ignore_emphasis', 'ignore_images'):
        setattr(converter, option, True)
    converter.body_width = 0

    self.plaintext = converter.handle(readable.summary())
示例6: get_filename
# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def get_filename(self, abs_url):
    """Derive a ``.pdf`` filename from the title of the page at ``abs_url``.

    The page text is fetched with ``get_page_with_retry`` and its title is
    read through ``readability.Document``. The title is cleaned, title-cased
    and has spaces replaced by underscores. The ``.pdf`` suffix is then
    appended and the result is passed through ``unidecode.unidecode``.

    :param abs_url: URL of the page whose title names the file.
    :return: The generated filename, ending in ``.pdf``.
    """
    request_text = get_page_with_retry(abs_url, return_text=True)
    doc = readability.Document(request_text)
    title = doc.title()

    # Clean the title and make it titlecase.
    title = clean_string(title)
    title = titlecase.titlecase(title)
    title = title.replace(" ", "_")
    # Second clean_string pass runs on the underscore-joined form.
    title = clean_string(title)

    name = title.strip("_") + ".pdf"
    name = unidecode.unidecode(name)
    # Pass the value as a logging argument so formatting is deferred to the
    # logging framework and only happens when the record is actually emitted.
    logger.info("Created filename: %s", name)
    return name
示例7: retrieve_pdf
# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def retrieve_pdf(self, pdf_url, filename):
    """Turn the HTML article into a clean PDF file.

    :param pdf_url: URL of the article page; also handed to ``ImgProcessor``
        when the markdown is converted back to HTML.
    :param filename: Destination passed to ``write_pdf``.
    """
    # Steps
    # 1. Pull the HTML page using requests
    # 2. Extract the article part of the page using readability
    # 3. Convert the article HTML to markdown using html2text
    # 4. Convert the markdown back to HTML (this is done to sanitize HTML)
    # 5. Convert the HTML to PDF, pulling in images where needed
    # 6. Save the PDF to the specified filename.
    request_text = get_page_with_retry(pdf_url, return_text=True)
    doc = readability.Document(request_text)
    title = doc.title()
    raw_html = doc.summary(html_partial=True)

    h2t = html2text.HTML2Text()
    h2t.wrap_links = False
    text = h2t.handle(raw_html)

    # Add the title back to the document
    article = "# {title}\n\n{text}".format(title=title, text=text)

    # Convert to html, fixing relative image urls.
    md = markdown.Markdown()
    md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10)
    html_article = md.convert(article)

    if self.debug:
        # Explicit UTF-8: articles routinely contain non-ASCII text, and the
        # platform default encoding may be unable to represent it.
        with open("./paper.html", "w", encoding="utf-8") as fp:
            fp.write(html_article)

    font_config = weasyprint.fonts.FontConfiguration()
    html = weasyprint.HTML(string=html_article, url_fetcher=url_fetcher)
    css = weasyprint.CSS(string=CSS, font_config=font_config)
    html.write_pdf(filename, stylesheets=[css], font_config=font_config)
示例8: get_clean_text
# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def get_clean_text(html):
    """
    Generate clean text for the given html.

    The input is run through ``readability.Document`` to obtain cleaned HTML.
    If that step raises, the exception is printed and the raw ``html`` is
    used instead. The chosen markup is then reduced to text with
    BeautifulSoup's ``get_text()``.

    :param html: HTML source to clean.
    :return: Text content of the cleaned (or, on failure, raw) HTML.
    """
    doc = readability.Document(html)
    try:
        # NOTE(review): _html() is a private readability method; presumably it
        # forces parsing before get_clean_html() is called. Confirm against
        # the installed readability version.
        doc._html()
        clean = doc.get_clean_html()
    except Exception as e:
        print(e)
        clean = html
    # Name the parser explicitly: without it BeautifulSoup warns and picks
    # whichever parser is installed. lxml is required by readability, so it
    # is the parser the implicit choice would select.
    bsObj = bs(clean, "lxml")
    return bsObj.get_text()
示例9: readability
# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def readability():
    """Extract the main article from a URL (GET) or from posted HTML (POST).

    Presumably a Flask view: it relies on ``request``, ``jsonify`` and
    ``default_data`` from the enclosing module.

    GET reads the ``url`` query parameter and downloads the page.
    POST reads the ``html`` form field.
    On success ``data['readability']`` holds ``title``, ``short_title``,
    ``article_html`` and ``text``. On a missing parameter or an unsupported
    method, ``data['error']`` is set and the response is returned early.

    :return: The result of ``jsonify(data)``.
    """
    import requests
    from readability import Document
    from bs4 import BeautifulSoup

    data = dict(default_data)
    data['message'] = "Article Extraction by Readability"
    data['params'] = {}
    data['error'] = ''
    data['readability'] = {}

    if request.method == 'GET':
        data['params']['url'] = request.args.get('url')
        if not data['params']['url']:
            data['error'] = '[url] parameter not found'
            return jsonify(data)
        # NOTE(review): this fetches a caller-supplied URL with no allow-list
        # and no timeout (SSRF / indefinite-hang risk). Left unchanged here.
        response = requests.get(data['params']['url'])
        doc = Document(response.text)
    elif request.method == 'POST':
        params = request.form  # postdata
        if not params:
            data['error'] = 'Missing parameters'
            return jsonify(data)
        # .get() so that an absent 'html' key reaches the error branch below
        # instead of raising on the subscript lookup.
        html = params.get('html')
        if not html:
            data['error'] = 'html parameter not found'
            return jsonify(data)
        doc = Document(html)
    else:
        # Without this branch `doc` would be unbound below for any other
        # method (e.g. HEAD), failing with UnboundLocalError.
        data['error'] = 'Unsupported method'
        return jsonify(data)

    data['readability']['title'] = doc.title()
    data['readability']['short_title'] = doc.short_title()
    data['readability']['article_html'] = doc.summary(html_partial=True)
    # Explicit parser: avoids BeautifulSoup's "no parser specified" warning.
    # lxml is required by readability, so it is what the default would pick.
    soup = BeautifulSoup(data['readability']['article_html'], "lxml")
    data['readability']['text'] = soup.get_text()
    return jsonify(data)