本文整理匯總了Python中readability.Document.summary方法的典型用法代碼示例。如果您正苦於以下問題:Python Document.summary方法的具體用法?Python Document.summary怎麼用?Python Document.summary使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類readability.Document的用法示例。
在下文中一共展示了Document.summary方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: test_best_elem_is_root_and_passing
# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
def test_best_elem_is_root_and_passing(self):
    """Smoke-test ``summary()`` on a page whose ``<html>`` root carries the
    article-like ``class``/``id`` attributes.

    There is no assertion: the test passes as long as ``summary()`` returns
    without raising.
    """
    markup = (
        '<html class="article" id="body">'
        ' <body>'
        ' <p>1234567890123456789012345</p>'
        ' </body>'
        '</html>'
    )
    Document(markup).summary()
示例2: get
# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
def get(self):
    """Serve a Markdown rendering of the page given by the ``url`` argument.

    A cached ``Webcache`` entry for the URL is returned directly when one
    exists. Otherwise the page is fetched, reduced to its main content with
    ``Document.summary()``, converted to Markdown with ``html2text``, and
    written out via ``self.write_json()``. The result is stored in
    ``Webcache`` only when both a title and Markdown text were produced.

    Any exception raised while fetching or converting is printed and
    swallowed (best-effort behaviour kept from the original).
    """
    url = self.get_argument("url", None)
    # Example input: https://www.ifanr.com/1080409
    cached = Webcache.find_one({'url': url}, {'_id': 0})
    if cached:
        self.res = dict(cached)
        return self.write_json()
    try:
        # The context manager closes the session's pooled connections; the
        # original created a session and never released it.
        with requests.session() as session:
            session.headers[
                'User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'
            response = session.get(url)
            # response.encoding = 'utf-8'  # TODO
            response.encoding = get_charset(response)
            html = response.text
        doc = Document(html)
        title = doc.title()
        markdown = html2text.html2text(doc.summary())
        markdown = markdown.replace('-\n', '-')
        markdown = markdown.strip()
        res = {
            'url': url,
            'title': title,
            'markdown': markdown,
        }
        # NOTE(review): the original snippet lost its indentation, so it is
        # unclear whether the two statements after this ``if`` were nested
        # inside it. This reconstruction guards only the cache write and
        # always sends a response -- confirm against the upstream project.
        if title and markdown:
            Webcache.new(res)
        self.res = res
        self.write_json()
    except Exception as e:
        print(e)
示例3: test_si_sample_html_partial
# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
def test_si_sample_html_partial(self):
    """Parse the si sample with ``html_partial=True`` and check that the
    summary begins with a bare ``<div>`` wrapper."""
    document = Document(load_sample('si-game.sample.html'))
    document.parse(["summary"], html_partial=True)
    summary_prefix = document.summary()[0:17]
    self.assertEqual('<div><h1>Tigers-R', summary_prefix)
示例4: test_si_sample
# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
def test_si_sample(self):
    """Parse the si sample with default options and check that the summary
    begins with the ``<html><body>`` wrapper."""
    document = Document(load_sample('si-game.sample.html'))
    document.parse(["summary"])
    summary_prefix = document.summary()[0:27]
    self.assertEqual('<html><body><h1>Tigers-Roya', summary_prefix)
示例5: convert
# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
def convert(link):
    """Transcode the web page at *link* with readability.

    Returns a ``(short_title, content, images)`` tuple on success, where
    ``images`` and ``content`` come from ``_collect_images``. Returns
    ``(None, None, None)`` when *link* is falsy, when no data could be
    prepared for it, or when any exception is raised along the way (the
    exception is logged, not propagated).
    """
    if not link:
        logger.error('Cannot transcode nothing!')
        return None, None, None

    try:
        data = transcoder.prepare_link(link)
        if not data:
            logger.info('Cannot parse %s correctly' % link)
            return None, None, None

        article = Document(data)
        if not article:
            logger.info('Burify cannot recognize the data')
            return None, None, None

        full_html = article.summary(html_partial=False)
        images, content = _collect_images(full_html, link)
        return article.short_title(), content, images
    except Exception as k:
        logger.error('%s for %s' % (str(k), str(link)))
        return None, None, None
示例6: test_lxml_obj_result
# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
def test_lxml_obj_result(self):
    """Pass ``Document`` an already-parsed lxml tree instead of an HTML
    string and check that ``summary()`` does not return a string."""
    parser = lxml.html.HTMLParser(encoding='utf-8')
    tree = lxml.html.document_fromstring(
        load_sample('nyt-article-video.sample.html'),
        parser=parser,
    )
    summary = Document(tree, url='http://nytimes.com/').summary()
    # ``basestring`` only exists on Python 2; kept as in the original.
    self.assertFalse(isinstance(summary, basestring))
示例7: test_correct_cleanup
# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
def test_correct_cleanup(self):
    """Check that the summary keeps the ``<article>`` text and drops the
    surrounding comment/aside blocks.

    The sample contains one article plus several ``comment*`` elements, one
    of which is longer than the article itself.
    """
    sample = """
<html>
<body>
<section>test section</section>
<article class="">
<p>Lot of text here.</p>
<div id="advertisement"><a href="link">Ad</a></div>
<p>More text is written here, and contains punctuation and dots.</p>
</article>
<aside id="comment1"/>
<div id="comment2">
<a href="asd">spam</a>
<a href="asd">spam</a>
<a href="asd">spam</a>
</div>
<div id="comment3"/>
<aside id="comment4">A small comment.</aside>
<div id="comment5"><p>The comment is also helpful, but it's
still not the correct item to be extracted.</p>
<p>It's even longer than the article itself!"</p></div>
</body>
</html>
"""
    summary = Document(sample).summary()
    # Text from the article body must survive extraction ...
    assert 'punctuation' in summary
    # ... while nothing from the comment/aside elements may leak through.
    assert 'comment' not in summary
    assert 'aside' not in summary
示例8: process_item
# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
def process_item(self, article, spider):
    """Reduce the article to its main text and attach a hash of its URL.

    ``article['text']`` is replaced by the tag-stripped output of
    ``Document.summary()``, and ``article['hash']`` is set to the SHA-256
    hex digest of ``article['url']``. The mutated *article* is returned.
    *spider* is not used.
    """
    doc = Document(article['text'])
    article['text'] = strip_tags(doc.summary())
    url = article['url']
    # hashlib only accepts bytes: a text URL raises TypeError on Python 3
    # (and fails on non-ASCII unicode on Python 2), so encode it first.
    if not isinstance(url, bytes):
        url = url.encode('utf-8')
    article['hash'] = hashlib.sha256(url).hexdigest()
    return article
示例9: test_many_repeated_spaces
# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
def test_many_repeated_spaces(self):
    """Check that a paragraph followed by one million spaces is still
    summarised and keeps its text."""
    padding = ' ' * 1000000
    markup = '<html><body><p>foo' + padding + '</p></body></html>'
    summary = Document(markup).summary()
    assert 'foo' in summary
示例10: test_si_sample
# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
def test_si_sample(self):
    """Summarise the si sample (with its source URL supplied) and check
    that the output begins with the ``<html><body>`` wrapper."""
    page_url = 'http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html'
    document = Document(load_sample('si-game.sample.html'), url=page_url)
    summary_prefix = document.summary()[0:27]
    self.assertEqual('<html><body><div><div class', summary_prefix)
示例11: test_si_sample_html_partial
# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
def test_si_sample_html_partial(self):
    """Summarise the si sample with ``enclose_with_html_tag=True`` and
    check the first 17 characters of the output."""
    page_url = "http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html"
    document = Document(load_sample("si-game.sample.html"), url=page_url)
    summary_prefix = document.summary(enclose_with_html_tag=True)[0:17]
    self.assertEqual('<div><div class="', summary_prefix)
示例12: get
# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
def get(self):
    """Write the readable version of the page named by a single ``url``
    query argument, followed by ``STYLE``.

    When the argument is missing or given more than once, a usage hint is
    written instead.
    """
    urls = self.get_query_arguments('url')
    if not urls or len(urls) != 1:
        self.write("Please provide ?url=[your-url]")
        return

    page_html = requests.get(urls[0]).text
    readable = Document(page_html).summary()
    self.write(smartypants(readable))
    self.write(STYLE)
示例13: transform
# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
def transform(self, row, chan):
    """Resolve the row's pending response and add ``title`` and ``text``.

    ``row['response']`` is replaced by the result of ``resolve_future``.
    ``row['title']`` is the document title and ``row['text']`` is the
    summary converted by ``html2text`` (body width 160) with every
    ``'****'`` removed and surrounding whitespace stripped. The mutated row
    is yielded once. *chan* is not used.
    """
    row['response'] = resolve_future(row['response'])
    document = Document(row['response'].content)
    row['title'] = document.title()
    plain_text = html2text(document.summary(), bodywidth=160)
    row['text'] = plain_text.replace('****', '').strip()
    yield row
示例14: extract_article
# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
def extract_article(url, ip):
    """Fetch *url* through ``get_url`` and extract the article with readability.

    Returns a ``(title, summary)`` tuple of ``unicode`` strings when the
    response status is 200, and ``None`` otherwise.
    """
    # NOTE(review): the failure path returns a bare ``None`` rather than a
    # ``(None, None)`` pair, so callers that unpack the result will raise.
    # Kept as-is to preserve behaviour; confirm what callers expect.
    response = get_url(url, ip)
    if response.status_code != 200:
        return None

    doc = Document(response.content)
    # ``unicode`` is the Python 2 text type; kept as in the original.
    summary = unicode(doc.summary())
    title = unicode(doc.title())
    return title, summary
示例15: extract_article
# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
def _extract_clean_tree(html, **document_kwargs):
    """Run readability on *html* and return ``(short_title, cleaned_tree)``.

    Extra keyword arguments are forwarded to ``Document`` on top of the
    project-wide negative keywords.
    """
    doc = Document(html,
                   negative_keywords=settings.ARTEX_NEGATIVE_KEYWORDS,
                   **document_kwargs)
    doc_title = doc.short_title()
    # summary() is invoked for its side effect; its return value is
    # discarded and the article is read from ``doc.html`` afterwards.
    doc.summary(html_partial=True)
    return doc_title, cleanup(doc.html, doc_title)


def extract_article(html, title=None):
    """Wrap ``readability.Document`` and return the article's title and content.

    Args:
        html: The page markup handed to ``Document``.
        title: Optional externally known title. When given, it replaces the
            extracted title if the extracted one is empty or is not
            contained in it.

    Returns:
        A ``(title, content)`` tuple, where ``content`` is the cleaned
        article tree serialised by ``elem_content_to_string``.
    """
    doc_title, clean_html = _extract_clean_tree(html)

    # Check whether the outer element itself looks like boilerplate, either
    # by its attributes or by its tag name.
    bad_attr = bool(
        elem_attr_contain(clean_html, settings.ARTEX_NEGATIVE_KEYWORDS))
    if clean_html.tag in settings.ARTEX_NEGATIVE_KEYWORDS or bad_attr:
        # If so, redo the extraction with min_text_length set to 0.
        doc_title, clean_html = _extract_clean_tree(html, min_text_length=0)

    content = elem_content_to_string(clean_html)

    # Prefer the given title unless the extracted one is a non-empty
    # substring of it (the given title is assumed to be more accurate,
    # though possibly with some unnecessary boilerplate).
    if title and (doc_title not in title or doc_title == ''):
        doc_title = title
    return doc_title, content