當前位置: 首頁>>代碼示例>>Python>>正文


Python readability.Document方法代碼示例

本文整理匯總了Python中readability.Document方法的典型用法代碼示例。如果您正苦於以下問題：Python readability.Document方法的具體用法？Python readability.Document怎麼用？Python readability.Document使用的例子？那麼，這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在模塊readability的用法示例。


在下文中一共展示了readability.Document方法的9個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: extract

# 需要導入模塊: import readability [as 別名]
# 或者: from readability import Document [as 別名]
def extract(self, item):
        """Build an ArticleCandidate for ``item`` with the help of readability.

        A deep copy of the response body found under
        ``item['spider_response']`` is handed to ``Document``. The document's
        summary becomes the candidate's description and its short title the
        candidate's title; every other field is delegated to this extractor's
        ``_text``, ``_topimage``, ``_author``, ``_publish_date`` and
        ``_language`` helpers.

        :param item: A NewscrawlerItem to parse.
        :return: ArticleCandidate containing the recovered article data.
        """
        # Parse a copy so the body stored on the item is never handed out.
        body_copy = deepcopy(item['spider_response'].body)
        readable = Document(body_copy)
        summary_html = readable.summary()

        candidate = ArticleCandidate()
        candidate.extractor = self._name

        # Fields taken straight from the readability document.
        candidate.title = readable.short_title()
        candidate.description = summary_html

        # Fields computed by the extractor's own helper methods.
        candidate.text = self._text(item)
        candidate.topimage = self._topimage(item)
        candidate.author = self._author(item)
        candidate.publish_date = self._publish_date(item)
        candidate.language = self._language(item)

        return candidate
開發者ID:fhamborg,項目名稱:news-please,代碼行數:23,代碼來源:readability_extractor.py

示例2: story_readability

# 需要導入模塊: import readability [as 別名]
# 或者: from readability import Document [as 別名]
def story_readability(content):
    """
    Return the readable part of ``content`` as an HTML fragment.

    Falsy or whitespace-only input yields ``""`` without being parsed.
    Otherwise the result of ``ReadabilityDocument(content).summary`` with
    ``html_partial=True`` is returned, and a falsy summary is normalised
    to ``""``.

    >>> content = '<p>hello <b>world</b><br/>你好<i>世界</i></p>'
    >>> print(story_readability(content))
    <body id="readabilityBody"><p>hello <b>world</b><br/>你好<i>世界</i></p></body>
    """
    has_text = bool(content) and bool(content.strip())
    if not has_text:
        return ""
    summary = ReadabilityDocument(content).summary(html_partial=True)
    return summary if summary else ""
開發者ID:anyant,項目名稱:rssant,代碼行數:12,代碼來源:processor.py

示例3: get_main_html

# 需要導入模塊: import readability [as 別名]
# 或者: from readability import Document [as 別名]
def get_main_html(html):
    """Return the summary that readability's ``Document`` produces for ``html``."""
    return Document(html).summary()
開發者ID:chaijunit,項目名稱:beibq,代碼行數:5,代碼來源:html.py

示例4: run_readability

# 需要導入模塊: import readability [as 別名]
# 或者: from readability import Document [as 別名]
def run_readability(htmlstring):
    """Run readability (Python 3 port of readability.js) on ``htmlstring``.

    Returns the document summary as produced by ``Document.summary()``;
    no sanitizing step is applied to it. Any exception raised while
    building or summarising the document is printed and an empty string
    is returned instead.
    """
    try:
        summary = Document(htmlstring).summary()
    except Exception as err:
        # Best-effort extraction: report the problem and fall back to ''.
        print('Exception:', err)
        summary = ''
    return summary
開發者ID:adbar,項目名稱:trafilatura,代碼行數:10,代碼來源:comparison.py

示例5: clean

# 需要導入模塊: import readability [as 別名]
# 或者: from readability import Document [as 別名]
def clean(self):
        """Download the article at ``self.url`` and store it as plain text.

        The HTTP response is kept on ``self.res``. The readability summary of
        the response text is converted with html2text, configured to ignore
        links, emphasis and images and with ``body_width`` set to 0, and the
        result is stored on ``self.plaintext``.
        """
        request_headers = {'User-Agent': ua}
        self.res = requests.get(self.url, headers=request_headers, timeout=30)
        article = Document(self.res.text)

        # Configure the HTML-to-text converter before feeding it the summary.
        converter = html2text.HTML2Text()
        converter.body_width = 0
        converter.ignore_images = True
        converter.ignore_emphasis = True
        converter.ignore_links = True

        summary_html = article.summary()
        self.plaintext = converter.handle(summary_html)
開發者ID:freedomofpress,項目名稱:trackthenews,代碼行數:14,代碼來源:core.py

示例6: get_filename

# 需要導入模塊: import readability [as 別名]
# 或者: from readability import Document [as 別名]
def get_filename(self, abs_url):
        """Derive a PDF filename from the title of the page at ``abs_url``.

        The page text is fetched with ``get_page_with_retry`` and its title is
        read through ``readability.Document``. The title is cleaned,
        title-cased, has spaces replaced by underscores, is cleaned again,
        stripped of leading/trailing underscores, suffixed with ``.pdf`` and
        finally passed through ``unidecode``.

        :param abs_url: URL of the page whose title names the file.
        :return: The generated filename, ending in ``.pdf``.
        """
        request_text = get_page_with_retry(abs_url, return_text=True)
        doc = readability.Document(request_text)
        title = doc.title()

        # Clean the title and make it titlecase
        title = clean_string(title)
        title = titlecase.titlecase(title)
        title = title.replace(" ", "_")
        # Second cleaning pass, now on the title-cased, underscored form.
        title = clean_string(title)
        name = title.strip("_") + ".pdf"
        name = unidecode.unidecode(name)
        # Hand the argument to the logger instead of pre-formatting with %,
        # so the message is only rendered when the record is actually emitted.
        logger.info("Created filename: %s", name)
        return name
開發者ID:GjjvdBurg,項目名稱:paper2remarkable,代碼行數:16,代碼來源:html.py

示例7: retrieve_pdf

# 需要導入模塊: import readability [as 別名]
# 或者: from readability import Document [as 別名]
def retrieve_pdf(self, pdf_url, filename):
        """Turn the HTML article at ``pdf_url`` into a clean PDF file.

        :param pdf_url: URL of the HTML page to convert; also handed to
            ``ImgProcessor`` when the Markdown is rendered back to HTML.
        :param filename: Destination passed to ``write_pdf``.
        """
        # Steps
        # 1. Pull the HTML page using requests
        # 2. Extract the article part of the page using readability
        # 3. Convert the article HTML to markdown using html2text
        # 4. Convert the markdown back to HTML (this is done to sanitize HTML)
        # 5. Convert the HTML to PDF, pulling in images where needed
        # 6. Save the PDF to the specified filename.
        request_text = get_page_with_retry(pdf_url, return_text=True)
        doc = readability.Document(request_text)
        title = doc.title()
        raw_html = doc.summary(html_partial=True)

        h2t = html2text.HTML2Text()
        h2t.wrap_links = False
        text = h2t.handle(raw_html)

        # Add the title back to the document
        article = "# {title}\n\n{text}".format(title=title, text=text)

        # Convert to html, fixing relative image urls.
        md = markdown.Markdown()
        md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10)
        html_article = md.convert(article)

        if self.debug:
            # Write the intermediate HTML with an explicit encoding: the
            # platform default may be unable to encode non-ASCII article text.
            with open("./paper.html", "w", encoding="utf-8") as fp:
                fp.write(html_article)

        # The same font configuration is shared by the stylesheet and the
        # PDF rendering call.
        font_config = weasyprint.fonts.FontConfiguration()
        html = weasyprint.HTML(string=html_article, url_fetcher=url_fetcher)
        css = weasyprint.CSS(string=CSS, font_config=font_config)

        html.write_pdf(filename, stylesheets=[css], font_config=font_config)
開發者ID:GjjvdBurg,項目名稱:paper2remarkable,代碼行數:37,代碼來源:html.py

示例8: get_clean_text

# 需要導入模塊: import readability [as 別名]
# 或者: from readability import Document [as 別名]
def get_clean_text(html):
    """
    Generate clean text for the given html.

    The html is loaded into a ``readability.Document`` and its cleaned HTML
    is requested. If that step raises, the exception is printed and the raw
    ``html`` is used instead. The chosen markup is then parsed and its text
    content returned.

    NOTE(review): assumes ``bs`` is bs4's ``BeautifulSoup`` -- confirm
    against the module's imports.
    """
    doc = readability.Document(html)
    try:
        # ``_html()`` is invoked before ``get_clean_html()``; presumably it
        # performs the parse that the latter relies on -- confirm against the
        # installed readability version.
        doc._html()
        clean = doc.get_clean_html()
    except Exception as e:
        # Best effort: report the failure and fall back to the raw input.
        print(e)
        clean = html
    # Name the parser explicitly so the result does not depend on which
    # parsers happen to be installed (and bs4 does not warn about guessing).
    # lxml is already needed by readability itself.
    bsObj = bs(clean, "lxml")
    return bsObj.get_text()
開發者ID:geekinglcq,項目名稱:aca,代碼行數:15,代碼來源:utility.py

示例9: readability

# 需要導入模塊: import readability [as 別名]
# 或者: from readability import Document [as 別名]
def readability():
	"""Extract the main article from a URL (GET) or from posted HTML (POST).

	GET reads the ``url`` query argument, downloads that page and feeds the
	response text to readability. POST reads the ``html`` form field and
	feeds it to readability directly. On success the response's
	``readability`` entry holds ``title``, ``short_title``, ``article_html``
	and ``text``; on a missing parameter or an unsupported method the
	``error`` entry is set and nothing is extracted.

	:return: ``jsonify(data)``, where ``data`` starts as a copy of
		``default_data``.
	"""
	import requests
	from readability import Document
	from bs4 import BeautifulSoup

	data = dict(default_data)
	data['message'] = "Article Extraction by Readability"
	data['params'] = {}
	data['error'] = ''
	data['readability'] = {}

	if request.method == 'GET':
		data['params']['url'] = request.args.get('url')
		if not data['params']['url']:
			data['error'] = '[url] parameter not found'
			return jsonify(data)

		# NOTE(review): this downloads a caller-supplied URL from the server;
		# consider restricting the allowed targets if the endpoint is exposed.
		# The timeout keeps an unresponsive host from blocking the request
		# indefinitely.
		response = requests.get(data['params']['url'], timeout=30)
		doc = Document(response.text)

	elif request.method == 'POST':
		params = request.form # postdata

		if not params:
			data['error'] = 'Missing parameters'
			return jsonify(data)

		# Use .get() so a form without an 'html' field reaches the error
		# message below instead of raising on the missing key.
		html = params.get('html')
		if not html:
			data['error'] = 'html parameter not found'
			return jsonify(data)

		doc = Document(html)

	else:
		# Any other method would leave ``doc`` unbound further down.
		data['error'] = 'Unsupported request method'
		return jsonify(data)

	data['readability']['title'] = doc.title()
	data['readability']['short_title'] = doc.short_title()
	data['readability']['article_html'] = doc.summary( html_partial=True )

	# Name the parser explicitly so the extracted text does not depend on
	# which parsers are installed; lxml is already needed by readability.
	soup = BeautifulSoup( data['readability']['article_html'], 'lxml' )
	data['readability']['text'] = soup.get_text()

	return jsonify(data)
開發者ID:web64,項目名稱:nlpserver,代碼行數:44,代碼來源:nlpserver.py


注：本文中的readability.Document方法示例由純淨天空整理自GitHub/MSDocs等開源代碼及文檔管理平台，相關代碼片段篩選自各路編程大神貢獻的開源項目，源碼版權歸原作者所有，傳播和使用請參考對應項目的License；未經允許，請勿轉載。