當前位置: 首頁>>代碼示例>>Python>>正文


Python Document.summary方法代碼示例

本文整理匯總了Python中readability.Document.summary方法的典型用法代碼示例。如果您正苦於以下問題:Python Document.summary方法的具體用法?Python Document.summary怎麽用?Python Document.summary使用的例子?那麽, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在readability.Document的用法示例。


在下文中一共展示了Document.summary方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: test_best_elem_is_root_and_passing

# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
 def test_best_elem_is_root_and_passing(self):
     """Run ``summary()`` on a minimal page holding one short paragraph.

     Nothing is asserted about the output; the test succeeds as long as
     the call returns without raising.
     """
     markup = ''.join([
         '<html class="article" id="body">',
         '   <body>',
         '       <p>1234567890123456789012345</p>',
         '   </body>',
         '</html>',
     ])
     Document(markup).summary()
開發者ID:buriy,項目名稱:python-readability,代碼行數:12,代碼來源:test_article_only.py

示例2: get

# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
 def get(self):
     """Write a Markdown rendering of the page named by the ``url`` argument.

     If ``Webcache`` already holds a record for the URL, that record is
     written back immediately. Otherwise the page is downloaded, passed
     through ``Document`` to obtain its title and summary, converted to
     Markdown with ``html2text`` and, when both title and Markdown are
     non-empty, stored in ``Webcache`` and exposed as ``self.res``.

     Any exception raised while fetching or converting is printed and
     swallowed; in that case nothing is written (best-effort behaviour
     kept from the original implementation).
     """
     url = self.get_argument("url", None)
     # https://www.ifanr.com/1080409
     cached = Webcache.find_one({'url': url}, {'_id': 0})
     if cached:
         self.res = dict(cached)
         return self.write_json()
     try:
         # Use the session as a context manager so that its pooled
         # connections are released even when the request fails.
         with requests.session() as session:
             session.headers['User-Agent'] = (
                 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) '
                 'AppleWebKit/537.36 (KHTML, like Gecko) '
                 'Chrome/34.0.1847.131 Safari/537.36'
             )
             response = session.get(url)
             # The encoding is taken from get_charset() (defined
             # elsewhere) instead of being forced to UTF-8.
             response.encoding = get_charset(response)
             page_text = response.text
         doc = Document(page_text)
         title = doc.title()
         summary = doc.summary()
         markdown = html2text.html2text(summary)
         # Re-join words that html2text split across a hyphenated line break.
         markdown = markdown.replace('-\n', '-').strip()
         res = {'url': url, 'title': title, 'markdown': markdown}
         if title and markdown:
             Webcache.new(res)
             self.res = res
         self.write_json()
     except Exception as e:
         print(e)
開發者ID:anwen,項目名稱:anwen,代碼行數:33,代碼來源:api_share.py

示例3: test_si_sample_html_partial

# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
 def test_si_sample_html_partial(self):
     """Parse the si sample with ``html_partial=True`` and check the summary prefix."""
     expected_prefix = '<div><h1>Tigers-R'
     document = Document(load_sample('si-game.sample.html'))
     document.parse(["summary"], html_partial=True)
     article_html = document.summary()
     self.assertEqual(expected_prefix, article_html[:17])

示例4: test_si_sample

# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
 def test_si_sample(self):
     """Parse the si sample and check that the summary starts with a full html/body wrapper."""
     expected_prefix = '<html><body><h1>Tigers-Roya'
     document = Document(load_sample('si-game.sample.html'))
     document.parse(["summary"])
     article_html = document.summary()
     self.assertEqual(expected_prefix, article_html[:27])

示例5: convert

# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
def convert(link):
    """
    Transcode a web page with the readability ``Document`` class.

    Returns a ``(short_title, content, images)`` tuple on success and
    ``(None, None, None)`` when ``link`` is empty, when no data could be
    prepared for it, or when any exception is raised along the way.
    """
    failure = (None, None, None)

    if not link:
        logger.error('Cannot transcode nothing!')
        return failure

    try:
        data = transcoder.prepare_link(link)
        if not data:
            logger.info('Cannot parse %s correctly' % link)
            return failure

        article = Document(data)
        if not article:
            logger.info('Burify cannot recognize the data')
            return failure

        # Ask for the full-document summary, then let the helper pull
        # out the images it references.
        summary_html = article.summary(html_partial=False)
        images, content = _collect_images(summary_html, link)
        return article.short_title(), content, images
    except Exception as k:
        logger.error('%s for %s' % (str(k), str(link)))
        return failure
開發者ID:chengdujin,項目名稱:newsman,代碼行數:28,代碼來源:burify.py

示例6: test_lxml_obj_result

# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
 def test_lxml_obj_result(self):
     """Feed Document with an lxml obj instead of an html string. Expect a non-string response."""
     utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
     sample = lxml.html.document_fromstring(
         load_sample('nyt-article-video.sample.html'), parser=utf8_parser)
     doc = Document(sample, url='http://nytimes.com/')
     res = doc.summary()
     # ``basestring`` only exists on Python 2. ``(bytes, type(u''))`` is
     # (str, unicode) there and (bytes, str) on Python 3, so the check
     # keeps its meaning on both without raising NameError.
     string_types = (bytes, type(u''))
     self.assertNotIsInstance(res, string_types)
開發者ID:RebelMouseTeam,項目名稱:python-readability,代碼行數:9,代碼來源:test_article_only.py

示例7: test_correct_cleanup

# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
    def test_correct_cleanup(self):
        sample = """
        <html>
            <body>
                <section>test section</section>
                <article class="">
<p>Lot of text here.</p>
                <div id="advertisement"><a href="link">Ad</a></div>
<p>More text is written here, and contains punctuation and dots.</p>
</article>
                <aside id="comment1"/>
                <div id="comment2">
                    <a href="asd">spam</a>
                    <a href="asd">spam</a>
                    <a href="asd">spam</a>
                </div>
                <div id="comment3"/>
                <aside id="comment4">A small comment.</aside>
                <div id="comment5"><p>The comment is also helpful, but it's
                    still not the correct item to be extracted.</p>
                    <p>It's even longer than the article itself!"</p></div>
            </body>
        </html>
        """
        doc = Document(sample)
        s = doc.summary()
        #print(s)
        assert('punctuation' in s)
        assert(not 'comment' in s)
        assert(not 'aside' in s)
開發者ID:buriy,項目名稱:python-readability,代碼行數:32,代碼來源:test_article_only.py

示例8: process_item

# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
 def process_item(self, article, spider):
     """Replace the item's HTML with plain article text and add a URL hash.

     ``article['text']`` is run through ``Document.summary()`` and
     ``strip_tags``; ``article['hash']`` is set to the SHA-256 hex digest
     of ``article['url']``. The mutated ``article`` is returned.
     ``spider`` is not used.
     """
     doc = Document(article['text'])
     article['text'] = strip_tags(doc.summary())

     url = article['url']
     # hashlib accepts bytes only: a text URL raises TypeError on
     # Python 3 (and fails for non-ASCII unicode on Python 2), so encode
     # it first. Byte strings are hashed unchanged.
     if not isinstance(url, bytes):
         url = url.encode('utf-8')
     article['hash'] = hashlib.sha256(url).hexdigest()

     return article
開發者ID:omidmt,項目名稱:crawler,代碼行數:9,代碼來源:pipelines.py

示例9: test_many_repeated_spaces

# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
    def test_many_repeated_spaces(self):
        """Summarise a paragraph followed by one million spaces and check 'foo' is kept."""
        padding = ' ' * 1000000
        html = ''.join(['<html><body><p>foo', padding, '</p></body></html>'])

        summary = Document(html).summary()

        assert 'foo' in summary
開發者ID:buriy,項目名稱:python-readability,代碼行數:10,代碼來源:test_article_only.py

示例10: test_si_sample

# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
 def test_si_sample(self):
     """Summarise the si sample with its page URL and check the html/body prefix."""
     page_url = 'http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html'
     expected_prefix = '<html><body><div><div class'
     document = Document(load_sample('si-game.sample.html'), url=page_url)
     article_html = document.summary()
     self.assertEqual(expected_prefix, article_html[:27])
開發者ID:buriy,項目名稱:python-readability,代碼行數:10,代碼來源:test_article_only.py

示例11: test_si_sample_html_partial

# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
 def test_si_sample_html_partial(self):
     """Summarise the si sample with ``enclose_with_html_tag=True`` and check the div prefix."""
     page_url = "http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html"
     expected_prefix = '<div><div class="'
     document = Document(load_sample("si-game.sample.html"), url=page_url)
     article_html = document.summary(enclose_with_html_tag=True)
     self.assertEqual(expected_prefix, article_html[:17])
開發者ID:DannyGoodall,項目名稱:python-readability,代碼行數:10,代碼來源:test_article_only.py

示例12: get

# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
 def get(self):
   """Write the readability summary of the single ``url`` query argument.

   When zero or several ``url`` arguments are supplied, a usage hint is
   written instead. On success the summary is passed through
   ``smartypants`` and followed by the ``STYLE`` block.
   """
   urls = self.get_query_arguments('url')
   if not urls or len(urls) != 1:
     self.write("Please provide ?url=[your-url]")
     return
   page = requests.get(urls[0])
   article = Document(page.text)
   self.write(smartypants(article.summary()))
   self.write(STYLE)
開發者ID:guidoism,項目名稱:prettyweb,代碼行數:11,代碼來源:unshitify.py

示例13: transform

# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
    def transform(self, row, chan):
        """Resolve the row's response and add ``title`` and ``text`` fields.

        ``row['response']`` is replaced by the result of
        ``resolve_future``; its ``content`` is fed to ``Document``. The
        summary is converted with ``html2text`` (``bodywidth=160``),
        every ``'****'`` sequence is removed and the result is stripped.
        Yields the same ``row`` object once. ``chan`` is not used.
        """
        row['response'] = resolve_future(row['response'])

        document = Document(row['response'].content)
        row['title'] = document.title()

        markdown = html2text(document.summary(), bodywidth=160)
        row['text'] = markdown.replace('****', '').strip()

        yield row
開發者ID:hartym,項目名稱:readtheweb,代碼行數:12,代碼來源:transformers.py

示例14: extract_article

# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
def extract_article(url, ip):
    """Fetch ``url`` through ``get_url`` and extract the article with readability.

    Returns a ``(title, summary)`` tuple of ``unicode`` values when the
    response status code is 200. For any other status the function
    returns a bare ``None`` (not a tuple), so callers must check the
    result before unpacking it.
    """
    response = get_url(url, ip)
    if response.status_code != 200:
        return None

    document = Document(response.content)
    article_html = unicode(document.summary())
    heading = unicode(document.title())
    return heading, article_html
開發者ID:apg,項目名稱:text-please,代碼行數:13,代碼來源:textplease.py

示例15: extract_article

# 需要導入模塊: from readability import Document [as 別名]
# 或者: from readability.Document import summary [as 別名]
def extract_article(html, title=None):
    """
    Wrap readability.Document and return ``(title, content)`` for ``html``.

    The extraction runs once with the configured negative keywords. If
    the outermost element of the cleaned result is itself matched by
    those keywords (by tag name or via ``elem_attr_contain``), the
    extraction is repeated with ``min_text_length=0``.

    When ``title`` is given, it replaces the extracted title unless the
    extracted title is a non-empty substring of it.
    """
    negative_keywords = settings.ARTEX_NEGATIVE_KEYWORDS

    def run_extraction(**extra_options):
        """Run one Document pass; return (short title, cleaned HtmlElement tree)."""
        doc = Document(html, negative_keywords=negative_keywords,
                       **extra_options)
        extracted_title = doc.short_title()
        # summary() is called only for its side effect: it makes
        # readability process the document so that doc.html holds the
        # article tree.
        doc.summary(html_partial=True)
        return extracted_title, cleanup(doc.html, extracted_title)

    doc_title, clean_html = run_extraction()

    # Check whether the outer element is one the negative keywords target.
    bad_attr = bool(elem_attr_contain(clean_html, negative_keywords))
    if clean_html.tag in negative_keywords or bad_attr:
        # If so, redo the extraction with min_text_length set to 0.
        doc_title, clean_html = run_extraction(min_text_length=0)

    content = elem_content_to_string(clean_html)

    # Prefer the caller's title when the extracted one is empty or is
    # not contained in it (the given title is assumed to be more
    # accurate, though it may carry some boilerplate).
    if title and (doc_title not in title or doc_title == ''):
        doc_title = title
    return doc_title, content
開發者ID:bericht,項目名稱:bericht,代碼行數:40,代碼來源:__init__.py


注:本文中的readability.Document.summary方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。