当前位置: 首页>>代码示例>>Python>>正文


Python Document.short_title方法代码示例

本文整理汇总了Python中readability.Document.short_title方法的典型用法代码示例。如果您正苦于以下问题:Python Document.short_title方法的具体用法?Python Document.short_title怎么用?Python Document.short_title使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在readability.Document的用法示例。


在下文中一共展示了Document.short_title方法的8个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: convert

# 需要导入模块: from readability import Document [as 别名]
# 或者: from readability.Document import short_title [as 别名]
def convert(link):
    """Transcode a web page with burify's readability implementation.

    Returns a ``(title, content, images)`` tuple on success, or
    ``(None, None, None)`` when the link is empty, unparseable, or any
    error occurs while transcoding.
    """
    failure = (None, None, None)

    if not link:
        logger.error('Cannot transcode nothing!')
        return failure

    try:
        data = transcoder.prepare_link(link)
        if not data:
            logger.info('Cannot parse %s correctly' % link)
            return failure

        article = Document(data)
        if not article:
            logger.info('Burify cannot recognize the data')
            return failure

        # Extract the readable body first, then harvest its images.
        images, content = _collect_images(
            article.summary(html_partial=False), link)
        return article.short_title(), content, images
    except Exception as k:
        logger.error('%s for %s' % (str(k), str(link)))
        return failure
开发者ID:chengdujin,项目名称:newsman,代码行数:28,代码来源:burify.py

示例2: parse_item

# 需要导入模块: from readability import Document [as 别名]
# 或者: from readability.Document import short_title [as 别名]
 def parse_item(self, response):
     """Turn a crawled review page into a BeerReviewPage item.

     Saves the readability-extracted HTML under data/<sha1(url)>.html,
     echoes a short progress line, and returns the populated item.
     """
     digest = hashlib.sha1(response.url.encode()).hexdigest()
     doc = Document(response.body, url=response.url)

     item = BeerReviewPage()
     item['url'] = response.url
     item['filename'] = digest
     item['depth'] = response.meta['depth']
     item['link_text'] = response.meta['link_text']
     item['title'] = doc.short_title()

     with open('data/' + digest + '.html','wb') as html_file:
         html_file.write(doc.content())

     # Single parenthesized argument: behaves the same on py2 and py3.
     print('(' + digest + ') ' + item['title'] + " : " + item['url'])
     return item
开发者ID:anoras,项目名称:BeerGeek,代码行数:15,代码来源:BeerGeekSpider.py

示例3: extract_article

# 需要导入模块: from readability import Document [as 别名]
# 或者: from readability.Document import short_title [as 别名]
def extract_article(html, title=None):
    """
    Wraps around readability.Document and returns the articles
    title and content.

    :param html: raw HTML of the page to extract from.
    :param title: optional known-good title; used when the extracted
        title is empty or is not contained in it.
    :return: ``(title, content)`` tuple of strings.
    """
    def _readability_pass(**kwargs):
        # One readability extraction + cleanup round; extra kwargs are
        # forwarded to Document (e.g. min_text_length for the retry).
        doc = Document(html,
                       negative_keywords=settings.ARTEX_NEGATIVE_KEYWORDS,
                       **kwargs)
        doc_title = doc.short_title()
        # invoke the summary method to invoke readability's magic
        doc.summary(html_partial=True)
        # obtain the article as HtmlElement tree and clean it up:
        return doc_title, cleanup(doc.html, doc_title)

    doc_title, clean_html = _readability_pass()
    # check if the outer element is a tag from negative_keywords
    bad_attr = elem_attr_contain(clean_html, settings.ARTEX_NEGATIVE_KEYWORDS)
    if clean_html.tag in settings.ARTEX_NEGATIVE_KEYWORDS or bad_attr:
        # if so, redo extraction with min_text_length set to 0
        doc_title, clean_html = _readability_pass(min_text_length=0)
    content = elem_content_to_string(clean_html)
    if title:
        # if the extracted title is not a subset of given title, use
        # the given title (b/c we assume this is more accurate, but
        # maybe with some unneccessary boilerplate).
        if doc_title not in title or doc_title == '':
            doc_title = title
    return doc_title, content
开发者ID:bericht,项目名称:bericht,代码行数:40,代码来源:__init__.py

示例4: preliminary_parse

# 需要导入模块: from readability import Document [as 别名]
# 或者: from readability.Document import short_title [as 别名]
    def preliminary_parse(self):
        """Try a readability parse first; fall back to newspaper.

        Populates ``self._readability_title`` / ``self._readability_text``
        and returns early when both are non-empty; otherwise runs the
        newspaper parser on ``self.newspaper_article``.
        """
        if not self.is_downloaded:
            raise Exception("not downloaded")
        try:
            doc = Document(self.html)
            self._readability_title = doc.short_title()
            self._readability_text = doc.summary()
            logging.debug(u"readability title: {0}".format(repr(self._readability_title)))
            logging.debug(u"readability text: {0}".format(repr(self._readability_text)))
            if self._readability_title and self._readability_text:
                # Both pieces recovered — no fallback needed.
                return
        except Exception as e:
            logging.warning("error while doing readability parse: {0}".format(str(e)))

        logging.debug("falling back to newspaper parse")
        self.newspaper_article.parse()
        logging.debug(u"newspaper title: {0}".format(repr(self._newspaper_title)))
        logging.debug(u"newspaper text: {0}".format(repr(self._newspaper_text)))
开发者ID:wangx173,项目名称:Voyage,代码行数:20,代码来源:ExplorerArticle.py

示例5: extract

# 需要导入模块: from readability import Document [as 别名]
# 或者: from readability.Document import short_title [as 别名]
    def extract(self, item):
        """Creates an readability document and returns an ArticleCandidate containing article title and text.

        :param item: A NewscrawlerItem to parse.
        :return: ArticleCandidate containing the recovered article data.
        """
        # Work on a copy so readability cannot mutate the spider response.
        document = Document(deepcopy(item['spider_response'].body))
        description = document.summary()

        candidate = ArticleCandidate()
        candidate.extractor = self._name
        candidate.title = document.short_title()
        candidate.description = description
        candidate.text = self._text(item)
        candidate.topimage = self._topimage(item)
        candidate.author = self._author(item)
        candidate.publish_date = self._publish_date(item)
        candidate.language = self._language(item)
        return candidate
开发者ID:Sayeedsalam,项目名称:spec-event-data-server,代码行数:23,代码来源:readability_extractor.py

示例6: complement

# 需要导入模块: from readability import Document [as 别名]
# 或者: from readability.Document import short_title [as 别名]
 def complement(self):
     """Enrich each feed entry with image, title and readable content.

     Fetches every entry URL, pulls an og:image / twitter:image:src
     candidate, extracts title/content with readability, and yields the
     updated entry. Entries whose request fails are logged and skipped.
     """
     for entry in self.entries:
         try:
             response = requests.get(entry.url, timeout=10)
         except requests.RequestException as excp:
             # BUGFIX: `excp.message` was removed in Python 3 and
             # `logger.warn` is a deprecated alias of `logger.warning`.
             logger.warning('Exception requesting article %s: %s',
                            entry.url, str(excp))
             continue
         document = Document(response.content, url=response.url)
         # Image extraction first
         document._html()  # Trigger parsing
         images = document.html.xpath(
             '//meta[@property="og:image"]/@content')
         images += document.html.xpath(
             '//meta[@name="twitter:image:src"]/@content')
         # Content extraction second; follow redirects via response.url.
         entry.url = response.url
         entry.image = (images or [''])[0]
         entry.title = document.short_title()
         entry.content = document.summary()
         yield entry
开发者ID:Telofy,项目名称:precise-altruism,代码行数:23,代码来源:daemon.py

示例7: parse_web_page

# 需要导入模块: from readability import Document [as 别名]
# 或者: from readability.Document import short_title [as 别名]
def parse_web_page(text):
    """
    Generic web page parser with readability.
    Used as a fallback.

    :param text: unicode text
    :return: title, article
    :raise ParserException:
    """
    try:
        from readability import Document
        from readability.readability import Unparseable
    except ImportError:
        raise ParserException('readability is not installed')

    if not text:
        raise ParserException('No decoded text available, aborting!')
    try:
        doc = Document(text)
    except Unparseable as e:
        # BUGFIX: BaseException.message was removed in Python 3;
        # str(e) is portable across versions.
        raise ParserException(str(e))
    else:
        # html_partial=True: return the summary without a full-page wrapper.
        return doc.short_title(), doc.summary(html_partial=True)
开发者ID:DarkGreising,项目名称:pypo,代码行数:25,代码来源:scrapers.py

示例8: __init__

# 需要导入模块: from readability import Document [as 别名]
# 或者: from readability.Document import short_title [as 别名]
    def __init__(self, url, full_content=None, timeout=10):
        """Fetch (unless pre-supplied) and analyze an HTML document.

        Extracts the readability content and title, builds a simplified
        HTML body by round-tripping through GitHub-flavoured markdown,
        and produces a top-3 sentence summary list.

        :param url: page URL; must be a non-empty ``str``.
        :param full_content: optional pre-fetched HTML; when given, the
            HTTP request is skipped.
        :param timeout: seconds before the HTTP request is aborted.
        :raises RuntimeError: wrong argument types or empty ``url``.
        :raises ContentRequestFailException: the HTTP request failed.
        :raises ContentNotFoundException: the server returned 404.
        :raises ContentNoDataException: the document body is empty.
        """
        logger.info("HtmlContentExtractor.__init__: url=%s, full_content is None=%s", url, (full_content is None))

        # validate
        if not isinstance(url, str):
            raise RuntimeError("url not str.")
        if len(url) == 0:
            # BUGFIX: was `RuntimeException`, an undefined name that would
            # have produced a NameError instead of the intended error.
            raise RuntimeError("len(url) == 0")

        if full_content is not None:
            if not isinstance(full_content, str):
                raise RuntimeError("full_content not str.")
            if len(full_content) == 0:
                raise ContentNoDataException(url)

        # Initialize instance variable
        self.url = url
        self.title = ""
        self.full_content = full_content
        self.content = ""
        self.simplified_content = ""
        self.summary_list = ""

        # Get html document (only when the caller did not supply one)
        if self.full_content is None:
            logger.debug("requests.get: start. url=%s", url)
            try:
                r = requests.get(url, timeout=timeout)
            except requests.exceptions.RequestException as ex:
                # logger.warn is a deprecated alias of logger.warning
                logger.warning("requests.get: fail. exception=%s", repr(ex))
                raise ContentRequestFailException(url)
            logger.debug("requests.get: end. status_code=%s, content_type=%s, len(full_content)=%s", r.status_code, r.headers["content-type"], len(r.text))

            logger.debug("request result check: start.")
            if r.status_code == 404:
                raise ContentNotFoundException(url)
            if len(r.text) == 0:
                raise ContentNoDataException(url)
            logger.debug("request result check: end.")

            logger.debug("get full_content: start.")
            self.full_content = r.text
            logger.debug("get full_content: end. len(full_content)=%s", len(self.full_content))
        else:
            logger.debug("full_content not None")

        # Analyze html document

        ## Get extracted content via readability
        logger.debug("extract content: start.")
        doc = Document(self.full_content)
        self.content = doc.summary()
        logger.debug("extract content: end. len(content)=%s", len(self.content))

        ## Get title
        logger.debug("get title: start.")
        self.title = doc.short_title()
        logger.debug("get title: end. title=%s", self.title)

        ## Get simplified content: html -> markdown -> html normalizes markup
        logger.debug("content simplify: start.")
        markdown_content = pypandoc.convert_text(self.content, "markdown_github", format="html", extra_args=["--normalize", "--no-wrap"])
        self.simplified_content = pypandoc.convert_text(markdown_content, "html", format="markdown_github", extra_args=["--email-obfuscation=none"])
        logger.debug("content simplify: end. len(simplified_content)=%s", len(self.simplified_content))

        # Get summary: top-3 ranked sentences converted to plain text
        logger.debug("summarize: start.")
        auto_abstractor = AutoAbstractor()
        abstractable_doc = AbstractableTopNRank()
        abstractable_doc.set_top_n(3)
        summary_list = auto_abstractor.summarize(self.simplified_content, abstractable_doc)["summarize_result"]
        self.summary_list = [pypandoc.convert_text(summary.strip(), "plain", format="html").strip() for summary in summary_list]
        logger.debug("summarize: end. len(summary_list)=%s", len(self.summary_list))
开发者ID:u6k,项目名称:extract-content,代码行数:75,代码来源:extractor.py


注:本文中的readability.Document.short_title方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。