当前位置: 首页>>代码示例>>Python>>正文


Python readability.Document方法代码示例

本文整理汇总了Python中readability.Document方法的典型用法代码示例。如果您正苦于以下问题:Python readability.Document方法的具体用法是什么?Python readability.Document怎么用?Python readability.Document有哪些使用例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块readability的用法示例。


在下文中一共展示了readability.Document方法的9个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: extract

# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def extract(self, item):
        """Build an ArticleCandidate from the readability view of a crawled page.

        A deep copy of ``item['spider_response'].body`` is handed to
        ``Document``; its summary becomes the candidate's description and its
        short title becomes the candidate's title. The remaining fields are
        filled by this extractor's ``_text``, ``_topimage``, ``_author``,
        ``_publish_date`` and ``_language`` helpers.

        :param item: A NewscrawlerItem to parse.
        :return: ArticleCandidate containing the recovered article data.
        """
        readable = Document(deepcopy(item['spider_response'].body))
        summary_html = readable.summary()

        candidate = ArticleCandidate()
        candidate.extractor = self._name
        candidate.title = readable.short_title()
        candidate.description = summary_html

        # Each remaining field ``x`` is produced by the matching ``self._x(item)``
        # helper; the tuple order keeps the original call/assignment order.
        for field in ('text', 'topimage', 'author', 'publish_date', 'language'):
            setattr(candidate, field, getattr(self, '_' + field)(item))

        return candidate
开发者ID:fhamborg,项目名称:news-please,代码行数:23,代码来源:readability_extractor.py

示例2: story_readability

# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def story_readability(content):
    """
    Return the readability summary of ``content`` as partial HTML.

    Falsy or whitespace-only input short-circuits to an empty string, and a
    falsy summary is likewise normalised to an empty string.

    >>> content = '<p>hello <b>world</b><br/>你好<i>世界</i></p>'
    >>> print(story_readability(content))
    <body id="readabilityBody"><p>hello <b>world</b><br/>你好<i>世界</i></p></body>
    """
    # Guard: nothing to summarise when there is no non-blank text at all.
    if not (content and content.strip()):
        return ""
    summary = ReadabilityDocument(content).summary(html_partial=True)
    return summary or ""
开发者ID:anyant,项目名称:rssant,代码行数:12,代码来源:processor.py

示例3: get_main_html

# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def get_main_html(html):
    """Return the result of ``Document(html).summary()`` for the given markup."""
    return Document(html).summary()
开发者ID:chaijunit,项目名称:beibq,代码行数:5,代码来源:html.py

示例4: run_readability

# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def run_readability(htmlstring):
    '''Summarise ``htmlstring`` with readability's ``Document``.

    Returns the value of ``Document.summary()``. If anything raises, the
    exception is printed and an empty string is returned instead, so callers
    always get a string-like result rather than an error.
    '''
    try:
        # NOTE: the summary is returned as-is; no sanitising pass is applied.
        summary = Document(htmlstring).summary()
    except Exception as err:
        print('Exception:', err)
        return ''
    return summary
开发者ID:adbar,项目名称:trafilatura,代码行数:10,代码来源:comparison.py

示例5: clean

# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def clean(self):
        """Fetch ``self.url`` and store a plain-text rendering of the article.

        The HTTP response is kept on ``self.res``; the readability summary of
        its text is converted with html2text and saved on ``self.plaintext``.
        """
        self.res = requests.get(self.url, headers={'User-Agent':ua}, timeout=30)
        article = Document(self.res.text)

        # Configure the converter to drop links, emphasis markers and images.
        converter = html2text.HTML2Text()
        converter.ignore_links = True
        converter.ignore_emphasis = True
        converter.ignore_images = True
        # presumably 0 turns off html2text's line wrapping — confirm in its docs
        converter.body_width = 0

        summary_html = article.summary()
        self.plaintext = converter.handle(summary_html)
开发者ID:freedomofpress,项目名称:trackthenews,代码行数:14,代码来源:core.py

示例6: get_filename

# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def get_filename(self, abs_url):
        """Derive a ``.pdf`` filename from the title of the page at ``abs_url``.

        The page text is fetched with ``get_page_with_retry``, its title is
        taken from ``readability.Document`` and then normalised step by step
        (see inline comments). The resulting name is logged and returned.
        """
        page_text = get_page_with_retry(abs_url, return_text=True)
        page_title = readability.Document(page_text).title()

        # Clean the title and titlecase it.
        stem = titlecase.titlecase(clean_string(page_title))
        # Spaces become underscores, then the result is cleaned once more.
        stem = clean_string(stem.replace(" ", "_"))
        # Trim stray underscores at the ends, add the extension, then run the
        # whole name through unidecode.
        filename = unidecode.unidecode(stem.strip("_") + ".pdf")
        logger.info("Created filename: %s" % filename)
        return filename
开发者ID:GjjvdBurg,项目名称:paper2remarkable,代码行数:16,代码来源:html.py

示例7: retrieve_pdf

# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def retrieve_pdf(self, pdf_url, filename):
        """Turn the HTML article in a clean pdf file.

        Steps:
          1. Pull the HTML page (via ``get_page_with_retry``).
          2. Extract the article part of the page using readability.
          3. Convert the article HTML to markdown using html2text.
          4. Convert the markdown back to HTML (this is done to sanitize HTML).
          5. Convert the HTML to PDF, pulling in images where needed.
          6. Save the PDF to the specified filename.

        :param pdf_url: URL of the HTML article; also passed to
            ``ImgProcessor`` when the markdown is converted back to HTML.
        :param filename: Target passed to ``write_pdf``.
        """
        request_text = get_page_with_retry(pdf_url, return_text=True)
        doc = readability.Document(request_text)
        title = doc.title()
        raw_html = doc.summary(html_partial=True)

        h2t = html2text.HTML2Text()
        h2t.wrap_links = False
        text = h2t.handle(raw_html)

        # Add the title back to the document
        article = "# {title}\n\n{text}".format(title=title, text=text)

        # Convert to html, fixing relative image urls.
        md = markdown.Markdown()
        md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10)
        html_article = md.convert(article)

        if self.debug:
            # Explicit UTF-8: without it the platform's locale encoding is
            # used and articles with non-ASCII text can fail to be written.
            with open("./paper.html", "w", encoding="utf-8") as fp:
                fp.write(html_article)

        font_config = weasyprint.fonts.FontConfiguration()
        html = weasyprint.HTML(string=html_article, url_fetcher=url_fetcher)
        css = weasyprint.CSS(string=CSS, font_config=font_config)

        html.write_pdf(filename, stylesheets=[css], font_config=font_config)
开发者ID:GjjvdBurg,项目名称:paper2remarkable,代码行数:37,代码来源:html.py

示例8: get_clean_text

# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def get_clean_text(html):
    """
    Generate clean text for the given html.

    The markup is first passed through ``readability.Document`` to obtain
    cleaned HTML. If that fails for any reason the error is printed and the
    original ``html`` is used instead (best effort). The text content of the
    resulting markup is then returned via BeautifulSoup's ``get_text()``.
    """
    doc = readability.Document(html)
    try:
        # presumably forces readability to parse the markup up front so that
        # parse failures land in this handler — confirm against readability
        doc._html()
        clean = doc.get_clean_html()
    except Exception as e:
        print(e)
        clean = html
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed and emits a warning. lxml is already required by
    # readability, and it is the parser BeautifulSoup prefers when present.
    bsObj = bs(clean, "lxml")
    return bsObj.get_text()
开发者ID:geekinglcq,项目名称:aca,代码行数:15,代码来源:utility.py

示例9: readability

# 需要导入模块: import readability [as 别名]
# 或者: from readability import Document [as 别名]
def readability():
	"""Extract an article with readability and return it as JSON.

	GET:  reads the ``url`` query parameter, downloads the page and parses it.
	POST: reads the ``html`` form field and parses it directly.

	The response is ``default_data`` extended with ``message``, ``params``,
	``error`` and a ``readability`` dict holding ``title``, ``short_title``,
	``article_html`` and ``text``. When input is missing or the request
	method is not handled, ``error`` is set and ``readability`` stays empty.
	"""
	import requests
	from readability import Document
	from bs4 import BeautifulSoup

	data = dict(default_data)
	data['message'] = "Article Extraction by Readability"
	data['params'] = {}
	data['error'] = ''
	data['readability'] = {}

	if request.method == 'GET':
		url = request.args.get('url')
		data['params']['url'] = url
		if not url:
			data['error'] = '[url] parameter not found'
			return jsonify(data)

		# NOTE(security): this fetches a caller-supplied URL from the server
		# (SSRF risk); restrict allowed hosts if exposed to untrusted clients.
		# A timeout keeps a stalled remote host from hanging the request.
		response = requests.get(url, timeout=30)
		doc = Document(response.text)

	elif request.method == 'POST':
		params = request.form # postdata

		if not params:
			data['error'] = 'Missing parameters'
			return jsonify(data)

		# .get() so a missing 'html' field reaches the error message below
		# instead of raising on the key lookup.
		html = params.get('html')
		if not html:
			data['error'] = 'html parameter not found'
			return jsonify(data)

		doc = Document(html)

	else:
		# Any other method (e.g. HEAD) would otherwise leave ``doc`` unbound
		# and crash on first use below.
		data['error'] = 'Unsupported request method'
		return jsonify(data)

	data['readability']['title'] = doc.title()
	data['readability']['short_title'] = doc.short_title()
	data['readability']['article_html'] = doc.summary( html_partial=True )

	# Explicit parser: avoids BeautifulSoup's guessed-parser warning; lxml is
	# already required by readability.
	soup = BeautifulSoup( data['readability']['article_html'], 'lxml' )
	data['readability']['text'] = soup.get_text()

	return jsonify(data)
开发者ID:web64,项目名称:nlpserver,代码行数:44,代码来源:nlpserver.py


注:本文中的readability.Document方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。