This article collects typical usage examples of the Python method readability.Document.short_title. If you have been wondering what Document.short_title does, how to call it, or what real-world uses look like, the curated examples below may help. You can also explore other uses of the readability.Document class, where this method is defined.
Below are 8 code examples of Document.short_title, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Python code samples.
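Before diving into the collected examples, here is a minimal self-contained sketch of the method itself (the URL is illustrative; readability refers to the readability-lxml package):

import requests
from readability import Document

html = requests.get('https://example.com/article').text
doc = Document(html)
print(doc.short_title())  # page title with trailing site-name boilerplate stripped
print(doc.title())        # the full <title> text, for comparison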
Example 1: convert
# Module to import: from readability import Document [as alias]
# Or: from readability.Document import short_title [as alias]
def convert(link):
"""
    use buriy's readability implementation to transcode a web page
    and return the transcoded page and images found in it
"""
if not link:
logger.error('Cannot transcode nothing!')
return None, None, None
try:
data = transcoder.prepare_link(link)
if data:
article = Document(data)
if article:
images, content = _collect_images(
article.summary(html_partial=False), link)
return article.short_title(), content, images
else:
logger.info('Burify cannot recognize the data')
return None, None, None
else:
logger.info('Cannot parse %s correctly' % link)
return None, None, None
except Exception as k:
logger.error('%s for %s' % (str(k), str(link)))
return None, None, None
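A hypothetical call site for convert() above (transcoder, logger, and _collect_images are module-level helpers the example assumes):

title, content, images = convert('https://example.com/article')
if title is not None:
    print(title, len(images))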
Example 2: parse_item
# Module to import: from readability import Document [as alias]
# Or: from readability.Document import short_title [as alias]
def parse_item(self, response):
filename = hashlib.sha1(response.url.encode()).hexdigest()
readability_document = Document(response.body, url=response.url)
item = BeerReviewPage()
item['url'] = response.url
item['filename'] = filename
item['depth'] = response.meta['depth']
item['link_text'] = response.meta['link_text']
item['title'] = readability_document.short_title()
    with open('data/' + filename + '.html', 'w', encoding='utf-8') as html_file:
        html_file.write(readability_document.content())
    print('(' + filename + ') ' + item['title'] + ' : ' + item['url'])
return item
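For context, such a callback is typically wired into a Scrapy crawl spider along the following lines (the spider name and start_urls are illustrative, not from the original project; Scrapy's DepthMiddleware fills response.meta['depth'] and CrawlSpider fills response.meta['link_text'] used above):

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class BeerReviewSpider(CrawlSpider):
    name = 'beer_reviews'
    start_urls = ['https://example.com/']
    # follow every link and hand each response to parse_item
    rules = (Rule(LinkExtractor(), callback='parse_item', follow=True),)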
Example 3: extract_article
# Module to import: from readability import Document [as alias]
# Or: from readability.Document import short_title [as alias]
def extract_article(html, title=None):
"""
    Wraps around readability.Document and returns the article's
    title and content.
"""
doc = Document(html, negative_keywords=settings.ARTEX_NEGATIVE_KEYWORDS)
doc_title = doc.short_title()
# invoke the summary method to invoke readability's magic
doc.summary(html_partial=True)
# obtain the article as HtmlElement tree:
html_tree = doc.html
# clean up the article html:
clean_html = cleanup(html_tree, doc_title)
# check if the outer element is a tag from negative_keywords
if elem_attr_contain(clean_html, settings.ARTEX_NEGATIVE_KEYWORDS):
bad_attr = True
else:
bad_attr = False
if clean_html.tag in settings.ARTEX_NEGATIVE_KEYWORDS or bad_attr:
# if so, redo extraction with min_text_length set to 0
doc = Document(html,
negative_keywords=settings.ARTEX_NEGATIVE_KEYWORDS,
min_text_length=0)
doc_title = doc.short_title()
# invoke the summary method to invoke readability's magic
doc.summary(html_partial=True)
# obtain the article as HtmlElement tree:
html_tree = doc.html
# clean up the article html:
clean_html = cleanup(html_tree, doc_title)
content = elem_content_to_string(clean_html)
    if title:
        # If the extracted title is not a substring of the given title, use
        # the given title instead (we assume it is more accurate, though it
        # may include some unnecessary boilerplate).
        if doc_title not in title or doc_title == '':
            doc_title = title
return doc_title, content
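The keyword arguments forwarded above are part of readability-lxml's Document signature; a quick standalone sketch with illustrative values:

from readability import Document

html = '<html><body><p>Some article text, long enough to survive extraction.</p></body></html>'
doc = Document(
    html,
    negative_keywords=['comment', 'sidebar'],  # penalize nodes whose id/class match
    min_text_length=0,                         # keep even very short text blocks
)
print(doc.short_title())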
Example 4: preliminary_parse
# Module to import: from readability import Document [as alias]
# Or: from readability.Document import short_title [as alias]
def preliminary_parse(self):
    if not self.is_downloaded:
        raise Exception("not downloaded")
try:
d = Document(self.html)
self._readability_title = d.short_title()
self._readability_text = d.summary()
logging.debug(u"readability title: {0}".format(repr(self._readability_title)))
logging.debug(u"readability text: {0}".format(repr(self._readability_text)))
        if self._readability_title and self._readability_text:
            return
except Exception as e:
logging.warning("error while doing readability parse: {0}".format(str(e)))
logging.debug("falling back to newspaper parse")
self.newspaper_article.parse()
logging.debug(u"newspaper title: {0}".format(repr(self._newspaper_title)))
logging.debug(u"newspaper text: {0}".format(repr(self._newspaper_text)))
Example 5: extract
# Module to import: from readability import Document [as alias]
# Or: from readability.Document import short_title [as alias]
def extract(self, item):
"""Creates an readability document and returns an ArticleCandidate containing article title and text.
:param item: A NewscrawlerItem to parse.
:return: ArticleCandidate containing the recovered article data.
"""
doc = Document(deepcopy(item['spider_response'].body))
description = doc.summary()
article_candidate = ArticleCandidate()
article_candidate.extractor = self._name
article_candidate.title = doc.short_title()
article_candidate.description = description
article_candidate.text = self._text(item)
article_candidate.topimage = self._topimage(item)
article_candidate.author = self._author(item)
article_candidate.publish_date = self._publish_date(item)
article_candidate.language = self._language(item)
return article_candidate
Example 6: complement
# Module to import: from readability import Document [as alias]
# Or: from readability.Document import short_title [as alias]
def complement(self):
for entry in self.entries:
try:
response = requests.get(entry.url, timeout=10)
        except requests.RequestException as excp:
            logger.warning('Exception requesting article %s: %s',
                           entry.url, excp)
            continue
document = Document(response.content, url=response.url)
# Image extraction first
        document._html()  # force parsing (private readability API)
images = document.html.xpath(
'//meta[@property="og:image"]/@content')
images += document.html.xpath(
'//meta[@name="twitter:image:src"]/@content')
# Content extraction second
entry.url = response.url
entry.image = (images or [''])[0]
entry.title = document.short_title()
entry.content = document.summary()
yield entry
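The document._html() call above relies on a private readability API purely to force parsing before the XPath queries; the same metadata lookup works with plain lxml (page_html here is any fetched page string, shown inline for self-containment):

import lxml.html

page_html = '<html><head><meta property="og:image" content="https://example.com/a.png"/></head><body/></html>'
tree = lxml.html.fromstring(page_html)
images = tree.xpath('//meta[@property="og:image"]/@content')
images += tree.xpath('//meta[@name="twitter:image:src"]/@content')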
Example 7: parse_web_page
# Module to import: from readability import Document [as alias]
# Or: from readability.Document import short_title [as alias]
def parse_web_page(text):
"""
    Generic web page parser built on readability.
Used as a fallback.
:param text: unicode text
:return: title, article
:raise ParserException:
"""
try:
from readability import Document
from readability.readability import Unparseable
except ImportError:
raise ParserException('readability is not installed')
if not text:
raise ParserException('No decoded text available, aborting!')
try:
doc = Document(text)
    except Unparseable as e:
        raise ParserException(str(e))
else:
        return doc.short_title(), doc.summary(html_partial=True)
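A hypothetical call, assuming ParserException is defined in the same module and page_text is a decoded page string:

try:
    title, article_html = parse_web_page(page_text)
except ParserException as exc:
    print('extraction failed:', exc)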
Example 8: __init__
# Module to import: from readability import Document [as alias]
# Or: from readability.Document import short_title [as alias]
def __init__(self, url, full_content=None, timeout=10):
logger.info("HtmlContentExtractor.__init__: url=%s, full_content is None=%s", url, (full_content == None))
# validate
if not isinstance(url, str):
raise RuntimeError("url not str.")
    if len(url) == 0:
        raise RuntimeError("len(url) == 0")
if full_content is not None:
if not isinstance(full_content, str):
raise RuntimeError("full_content not str.")
if len(full_content) == 0:
raise ContentNoDataException(url)
# Initialize instance variable
self.url = url
self.title = ""
self.full_content = full_content
self.content = ""
self.simplified_content = ""
self.summary_list = ""
# Get html document
if self.full_content is None:
logger.debug("requests.get: start. url=%s", url)
try:
r = requests.get(url, timeout=timeout)
except requests.exceptions.RequestException as ex:
            logger.warning("requests.get: fail. exception=%s", repr(ex))
raise ContentRequestFailException(url)
logger.debug("requests.get: end. status_code=%s, content_type=%s, len(full_content)=%s", r.status_code, r.headers["content-type"], len(r.text))
logger.debug("request result check: start.")
if r.status_code == 404:
raise ContentNotFoundException(url)
if len(r.text) == 0:
raise ContentNoDataException(url)
logger.debug("request result check: end.")
logger.debug("get full_content: start.")
self.full_content = r.text
logger.debug("get full_content: end. len(full_content)=%s", len(self.full_content))
else:
logger.debug("full_content not None")
# Analyze html document
## Get extracted content
logger.debug("extract content: start.")
doc = Document(self.full_content)
self.content = doc.summary()
logger.debug("extract content: end. len(content)=%s", len(self.content))
## Get title
logger.debug("get title: start.")
self.title = doc.short_title()
logger.debug("get title: end. title=%s", self.title)
## Get simplified content
logger.debug("content simplify: start.")
markdown_content = pypandoc.convert_text(self.content, "markdown_github", format="html", extra_args=["--normalize", "--no-wrap"])
self.simplified_content = pypandoc.convert_text(markdown_content, "html", format="markdown_github", extra_args=["--email-obfuscation=none"])
logger.debug("content simplify: end. len(simplified_content)=%s", len(self.simplified_content))
# Get summary
logger.debug("summarize: start.")
auto_abstractor = AutoAbstractor()
abstractable_doc = AbstractableTopNRank()
abstractable_doc.set_top_n(3)
summary_list = auto_abstractor.summarize(self.simplified_content, abstractable_doc)["summarize_result"]
self.summary_list = [pypandoc.convert_text(summary.strip(), "plain", format="html").strip() for summary in summary_list]
logger.debug("summarize: end. len(summary_list)=%s", len(self.summary_list))