Python readability.Document方法代码示例

本文整理汇总了Python中readability.Document方法的典型用法代码示例。如果您正苦于以下问题：Python readability.Document方法的具体用法？Python readability.Document怎么用？Python readability.Document使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块readability的用法示例。

在下文中一共展示了readability.Document方法的9个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: extract

# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def extract(self, item):
        """Build an ArticleCandidate for *item* with the help of readability.

        A deep copy of the spider response body is wrapped in a readability
        ``Document``; the candidate's title and description are taken from
        that document, while every other field is delegated to this
        extractor's own helper methods.

        :param item: A NewscrawlerItem to parse; its ``'spider_response'``
            entry must expose a ``body`` attribute.
        :return: ArticleCandidate containing the recovered article data.
        """
        body_copy = deepcopy(item['spider_response'].body)
        readable = Document(body_copy)
        summary_html = readable.summary()

        candidate = ArticleCandidate()
        candidate.extractor = self._name

        # Fields obtained from the readability document itself.
        candidate.title = readable.short_title()
        candidate.description = summary_html

        # Fields computed by the extractor's helper methods.
        candidate.text = self._text(item)
        candidate.topimage = self._topimage(item)
        candidate.author = self._author(item)
        candidate.publish_date = self._publish_date(item)
        candidate.language = self._language(item)

        return candidate

开发者ID:fhamborg，项目名称:news-please，代码行数:23，代码来源:readability_extractor.py

示例2: story_readability

# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def story_readability(content):
    """Return the readable portion of *content* as partial HTML.

    Falsy or whitespace-only input short-circuits to an empty string, as
    does a falsy summary from the document.

    >>> content = '<p>hello <b>world</b><br/>你好<i>世界</i></p>'
    >>> print(story_readability(content))
    <body id="readabilityBody"><p>hello <b>world</b><br/>你好<i>世界</i></p></body>
    """
    has_text = content and content.strip()
    if not has_text:
        return ""
    summary = ReadabilityDocument(content).summary(html_partial=True)
    if summary:
        return summary
    return ""

开发者ID:anyant，项目名称:rssant，代码行数:12，代码来源:processor.py

示例3: get_main_html

# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def get_main_html(html):
    """Return the readability summary of the given HTML string."""
    return Document(html).summary()

开发者ID:chaijunit，项目名称:beibq，代码行数:5，代码来源:html.py

示例4: run_readability

# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def run_readability(htmlstring):
    """Extract a summary from *htmlstring* with readability.

    Any exception raised while building or summarising the document is
    printed and an empty string is returned instead.
    """
    # NOTE: the summary is returned as-is; no sanitizing pass is applied.
    try:
        summary = Document(htmlstring).summary()
    except Exception as err:
        print('Exception:', err)
        summary = ''
    return summary

开发者ID:adbar，项目名称:trafilatura，代码行数:10，代码来源:comparison.py

示例5: clean

# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def clean(self):
        """Fetch ``self.url`` and store its article content as plain text.

        The HTTP response is kept on ``self.res``. The readability summary
        of the response text is converted with html2text and the result is
        stored on ``self.plaintext``. Nothing is returned.
        """
        request_headers = {'User-Agent':ua}
        self.res = requests.get(self.url, headers=request_headers, timeout=30)
        readable = Document(self.res.text)

        converter = html2text.HTML2Text()
        # Drop links, emphasis markers and images from the converted text.
        for flag in ("ignore_links", "ignore_emphasis", "ignore_images"):
            setattr(converter, flag, True)
        # A body width of 0 is html2text's setting for "do not wrap lines".
        converter.body_width = 0

        self.plaintext = converter.handle(readable.summary())

开发者ID:freedomofpress，项目名称:trackthenews，代码行数:14，代码来源:core.py

示例6: get_filename

# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def get_filename(self, abs_url):
        """Derive a ``.pdf`` filename from the title of the page at *abs_url*.

        The page text is fetched with ``get_page_with_retry`` and its title
        is read through a readability ``Document``. The created name is
        logged and returned.
        """
        page_text = get_page_with_retry(abs_url, return_text=True)
        page_title = readability.Document(page_text).title()

        # Clean, titlecase, join the words with underscores, then clean once
        # more.
        stem = titlecase.titlecase(clean_string(page_title))
        stem = clean_string(stem.replace(" ", "_"))

        # Trim stray underscores before appending the extension, then pass
        # the whole name through unidecode.
        name = unidecode.unidecode(stem.strip("_") + ".pdf")
        logger.info("Created filename: %s" % name)
        return name

开发者ID:GjjvdBurg，项目名称:paper2remarkable，代码行数:16，代码来源:html.py

示例7: retrieve_pdf

# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def retrieve_pdf(self, pdf_url, filename):
        """Turn the HTML article at *pdf_url* into a clean PDF file.

        :param pdf_url: URL of the HTML page to convert; also handed to
            ``ImgProcessor`` when the markdown is rendered back to HTML.
        :param filename: Path the resulting PDF is written to.

        When ``self.debug`` is truthy, the intermediate HTML is additionally
        written to ``./paper.html``.
        """
        # Steps
        # 1. Pull the HTML page
        # 2. Extract the article part of the page using readability
        # 3. Convert the article HTML to markdown using html2text
        # 4. Convert the markdown back to HTML (this is done to sanitize HTML)
        # 5. Convert the HTML to PDF, pulling in images where needed
        # 6. Save the PDF to the specified filename.
        request_text = get_page_with_retry(pdf_url, return_text=True)
        doc = readability.Document(request_text)
        title = doc.title()
        raw_html = doc.summary(html_partial=True)

        h2t = html2text.HTML2Text()
        h2t.wrap_links = False
        text = h2t.handle(raw_html)

        # Add the title back to the document
        article = "# {title}\n\n{text}".format(title=title, text=text)

        # Convert to html, fixing relative image urls.
        md = markdown.Markdown()
        md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10)
        html_article = md.convert(article)

        if self.debug:
            # Explicit UTF-8: the article may contain characters that the
            # platform's default encoding cannot represent.
            with open("./paper.html", "w", encoding="utf-8") as fp:
                fp.write(html_article)

        font_config = weasyprint.fonts.FontConfiguration()
        html = weasyprint.HTML(string=html_article, url_fetcher=url_fetcher)
        css = weasyprint.CSS(string=CSS, font_config=font_config)

        html.write_pdf(filename, stylesheets=[css], font_config=font_config)

开发者ID:GjjvdBurg，项目名称:paper2remarkable，代码行数:37，代码来源:html.py

示例8: get_clean_text

# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def get_clean_text(html):
    """Return the text content of *html* after a readability clean-up.

    If parsing or cleaning raises, the exception is printed and the text
    is taken from the original, uncleaned *html* instead.
    """
    document = readability.Document(html)
    try:
        # Parse first, then ask for the cleaned markup.
        document._html()
        cleaned = document.get_clean_html()
    except Exception as e:
        print(e)
        cleaned = html
    # NOTE(review): no parser is named here, so the choice is left to
    # ``bs`` - confirm whether one should be specified.
    return bs(cleaned).get_text()

开发者ID:geekinglcq，项目名称:aca，代码行数:15，代码来源:utility.py

示例9: readability

# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def readability():
    """Extract the main article from a URL (GET) or from raw HTML (POST).

    Presumably a Flask view, judging by the ``request`` / ``jsonify`` names
    it relies on.

    * GET: reads the ``url`` query parameter, downloads the page and feeds
      the response text to readability.
    * POST: reads the ``html`` form field and feeds it to readability.

    The JSON response is a copy of ``default_data`` extended with
    ``message``, ``params``, ``error`` and ``readability``. On success the
    latter holds ``title``, ``short_title``, ``article_html`` and ``text``.
    When a required parameter is missing, or the request method is neither
    GET nor POST, ``error`` is filled in and ``readability`` stays empty.
    """
    import requests
    from readability import Document
    from bs4 import BeautifulSoup

    data = dict(default_data)
    data['message'] = "Article Extraction by Readability"
    data['params'] = {}
    data['error'] = ''
    data['readability'] = {}

    if request.method == 'GET':
        url = request.args.get('url')
        data['params']['url'] = url
        if not url:
            data['error'] = '[url] parameter not found'
            return jsonify(data)

        # NOTE(review): no timeout is set on this request; consider one.
        response = requests.get(url)
        doc = Document(response.text)

    elif request.method == 'POST':
        params = request.form  # postdata

        if not params:
            data['error'] = 'Missing parameters'
            return jsonify(data)

        # Use .get() so that a form lacking the 'html' field reaches the
        # error response below instead of raising on the missing key.
        html = params.get('html')
        if not html:
            data['error'] = 'html parameter not found'
            return jsonify(data)

        doc = Document(html)

    else:
        # Without this branch `doc` would be unbound further down.
        data['error'] = 'Unsupported request method'
        return jsonify(data)

    article_html = doc.summary(html_partial=True)
    data['readability']['title'] = doc.title()
    data['readability']['short_title'] = doc.short_title()
    data['readability']['article_html'] = article_html

    # Name the parser explicitly: otherwise bs4 emits a warning and picks
    # whichever parser happens to be installed. lxml is bs4's preferred
    # default and is already required by readability.
    soup = BeautifulSoup(article_html, 'lxml')
    data['readability']['text'] = soup.get_text()

    return jsonify(data)

开发者ID:web64，项目名称:nlpserver，代码行数:44，代码来源:nlpserver.py

注：本文中的readability.Document方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。