

Python html.remove_tags Method Code Examples

This article collects typical usage examples of the Python method w3lib.html.remove_tags. If you have been wondering what html.remove_tags does, how to call it, or what real-world usage looks like, the curated examples below should help. You can also explore further usage examples from the w3lib.html module it belongs to.


The following presents 15 code examples of the html.remove_tags method, sorted by popularity by default.
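Before diving into the examples, here is a minimal standalone sketch of the API itself. remove_tags(text, which_ones=(), keep=(), encoding=None) strips HTML tags from a string while keeping their text content; which_ones removes only the listed tags, while keep removes everything except the listed tags. The sample HTML below is made up for illustration:

from w3lib.html import remove_tags

html = '<div><p><b>Hello</b> world</p></div>'
remove_tags(html)                     # 'Hello world'
remove_tags(html, which_ones=('b',))  # '<div><p>Hello world</p></div>'
remove_tags(html, keep=('p',))        # '<p>Hello world</p>'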

Example 1: _extract_links

# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
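# Note: this excerpt comes from Scrapy's regex link extractor; it also assumes
# names defined elsewhere in that module, e.g. linkre (a compiled regex matching
# <a ...>...</a>), clean_link, Link, get_base_url, replace_escape_chars,
# replace_entities and urljoin.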
def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
        def clean_text(text):
            return replace_escape_chars(remove_tags(text.decode(response_encoding))).strip()

        def clean_url(url):
            clean_url = ''
            try:
                clean_url = urljoin(base_url, replace_entities(clean_link(url.decode(response_encoding))))
            except ValueError:
                pass
            return clean_url

        if base_url is None:
            base_url = get_base_url(response_text, response_url, response_encoding)

        links_text = linkre.findall(response_text)
        return [Link(clean_url(url).encode(response_encoding),
                     clean_text(text))
                for url, _, text in links_text] 
Author: wistbean; Project: learn_python3_spider; Lines: 21; Source: regex.py

Example 2: scrape_product

# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def scrape_product(self, response):
        item_loader = ItemLoader(item=MyItem(), response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)
        item_loader.default_output_processor = TakeFirst()

        item_loader.add_css("my_field", "selector")

        return item_loader.load_item() 
Author: zseta; Project: scrapy-templates; Lines: 10; Source: sitemap_spider.py
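Examples 2 through 9 all share the same pattern: MapCompose(remove_tags) is set as the loader's default input processor, so every value extracted by add_css or add_xpath has its HTML tags stripped before it reaches the item field. A minimal sketch of the processor on its own (import paths vary by Scrapy version; newer releases provide the processors via the itemloaders package, older ones via scrapy.loader.processors):

from itemloaders.processors import MapCompose
from w3lib.html import remove_tags

clean = MapCompose(remove_tags, str.strip)
clean(['<p> Hello </p>', '<b>world</b>'])  # -> ['Hello', 'world']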

Example 3: populate_item

# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def populate_item(self, response):
        item_loader = ItemLoader(item=MySpiderItem(), response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)

        # item_loader.add_css("", "")
        yield item_loader.load_item()

    # 3. PAGINATION LEVEL 1 
Author: zseta; Project: scrapy-templates; Lines: 10; Source: 1fol_pag2scr.py

Example 4: populate_item

# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def populate_item(self, response):
        item_loader = ItemLoader(item=MySpiderItem(), response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)

        # item_loader.add_css("")
        # item_loader.add_value("raw", raw)

        # yield the populated item first
        yield item_loader.load_item()
        # then yield the function which paginates to another page that contains data
        yield self.paginate(response)

    # 3. PAGINATION LEVEL 2 
Author: zseta; Project: scrapy-templates; Lines: 15; Source: 1fol2scr_pag.py

Example 5: populate_item

# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def populate_item(self, response):
        item_loader = ItemLoader(item=MySpiderItem(), response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)

        #item_loader.add_css("", "")
        #item_loader.add_css("", "")

        yield item_loader.load_item() 
Author: zseta; Project: scrapy-templates; Lines: 10; Source: 1fol2scr.py

Example 6: parse

# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def parse(self, response):
        item_loader = ItemLoader(item=MyItem(), response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)
        item_loader.default_output_processor = TakeFirst()
        #
        #item_loader.add_css("my_field", "my_css")
        #item_loader.add_xpath("my_field", "my_xpath")
        #
        return item_loader.load_item() 
Author: zseta; Project: scrapy-templates; Lines: 11; Source: 1scr.py

Example 7: populate_item

# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def populate_item(self, response):
        item_loader = ItemLoader(item=MySpiderItem(), response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)

        #item_loader.add_css("field", "")
        yield item_loader.load_item()

    # 3. PAGINATION LEVEL 2 
Author: zseta; Project: scrapy-templates; Lines: 10; Source: 1fol2fol_pag3scr.py

Example 8: parse

# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def parse(self, response):
        item_loader = ItemLoader(item=MyItem(), response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)
        #item_loader.add_css("", "")
        #item_loader.add_css("", "")
        #item_loader.add_css("", "")
        yield FormRequest("POST_URL", formdata={'parameter': 'p'},
                                        meta={'item': item_loader.load_item()}, callback=self.populate_field) 
Author: zseta; Project: scrapy-templates; Lines: 10; Source: post_pass_item.py

Example 9: populate_field

# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def populate_field(self, response):
        item_loader = ItemLoader(item=response.meta["item"], response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)
        #item_loader.add_css("field", "")
        return item_loader.load_item() 
Author: zseta; Project: scrapy-templates; Lines: 7; Source: post_pass_item.py

Example 10: clean_data

# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def clean_data(self):
        try:
            self["praise_num"] = extract_num("".join(self["praise_num"]))
        except Exception:
            self["praise_num"] = 0
        self["comments_num"] = extract_num("".join(self["comments_num"]))

        self["create_time"] = datetime.datetime.fromtimestamp(
            self["create_time"]).strftime(SQL_DATETIME_FORMAT)
        try:
            self["update_time"] = datetime.datetime.fromtimestamp(
                self["update_time"]).strftime(SQL_DATETIME_FORMAT)
        except Exception:
            self["update_time"] = self["create_time"]

        self["crawl_time"] = self["crawl_time"].strftime(SQL_DATETIME_FORMAT)
        self["content"] = remove_tags(self["content"]) 
Author: mtianyan; Project: FunpySpiderSearchEngine; Lines: 19; Source: zhihu_item.py
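extract_num in the example above is a project-specific helper, not part of w3lib. Based on how it is called, a plausible (hypothetical) implementation would pull the first integer out of a string such as '3 条评论':

import re

def extract_num(text):
    # hypothetical helper: return the first integer found in the text, else 0
    match = re.search(r'(\d+)', text)
    return int(match.group(1)) if match else 0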

Example 11: save_to_es

# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def save_to_es(self):
        """保存伯乐在线文章到es中"""
        self.clean_data()
        blog = JobboleBlogIndex()
        blog.title = self['title']
        blog.create_date = self["create_date"]
        blog.content = remove_tags(self["content"])
        blog.front_image_url = self["front_image_url"]
        blog.praise_nums = self["praise_nums"]
        blog.fav_nums = self["fav_nums"]
        blog.comment_nums = self["comment_nums"]
        blog.url = self["url"]
        blog.tags = self["tags"]
        blog.meta.id = self["url_object_id"]
        # suggest must be populated before saving
        blog.suggest = generate_suggests(es_jobbole_blog,
                                         ((blog.title, 10), (blog.tags, 6), (blog.content, 4)))
        real_time_count('jobbole_blog_count', JOBBOLE_COUNT_INIT)
        blog.save() 
Author: mtianyan; Project: FunpySpiderSearchEngine; Lines: 21; Source: jobbole_Item.py

Example 12: parse

# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def parse(self, response):
        containers = response.selector.xpath('//div[contains(@class, "c-container")]')
        for container in containers:
            href = container.xpath('h3/a/@href').extract()[0]
            title = remove_tags(container.xpath('h3/a').extract()[0])
            c_abstract = container.xpath('div/div/div[contains(@class, "c-abstract")]').extract()
            abstract = ""
            if len(c_abstract) > 0:
                abstract = remove_tags(c_abstract[0])
            request = scrapy.Request(href, callback=self.parse_url)
            request.meta['title'] = title
            request.meta['abstract'] = abstract
            yield request 
Author: warmheartli; Project: ChatBotCourse; Lines: 16; Source: baidu_search.py

Example 13: parse_url

# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def parse_url(self, response):
        print "url:", response.url
        print "title:", response.meta['title']
        print "abstract:", response.meta['abstract']
        content = remove_tags(response.selector.xpath('//body').extract()[0])
        print "content_len:", len(content) 
Author: warmheartli; Project: ChatBotCourse; Lines: 8; Source: baidu_search.py

Example 14: parse

# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def parse(self, response):
        canonical_url = response.xpath('//link[@rel="canonical"]/@href').extract_first()

        ## Skip excluded sections
        section = response.css('a.section').xpath('text()').extract_first()
        if section and section.lower() in IGNORE_SECTIONS:
            self.logger.info("Skipping %s because section is %s", canonical_url, section)
            return

        ## Skip syndicated content
        body_html = "".join(response.css("#body_content p").extract())
        body_text = remove_tags(body_html)

        for string in SKIP_STRINGS:
            suffix = body_text[-20:]
            if string in suffix:
                self.logger.info("Skipping %s because suffix %r contains %r",
                                 canonical_url,
                                 suffix,
                                 string)
                return

        publication_date_str = response.xpath('//meta[@name="publicationdate"]/@content').extract_first()
        publication_date = datetime.strptime(publication_date_str, '%d/%m/%Y')
        publication_date = SAST.localize(publication_date)


        item = ScrapenewsItem()
        item['body_html'] = response.css("#body_content").extract_first()
        item['title'] = response.xpath('//meta[@name="title"]/@content').extract_first()
        item['byline'] = response.xpath('//meta[@name="author"]/@content').extract_first()
        item['published_at'] = publication_date.isoformat()
        item['retrieved_at'] = datetime.utcnow().isoformat()
        item['url'] = canonical_url
        item['file_name'] = response.url.split('/')[-1]
        item['spider_name'] = self.name
        item['publication_name'] = self.publication_name

        yield item 
Author: public-people; Project: scrape-news; Lines: 41; Source: mg.py
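The detail worth noting in example 14 is stripping tags before string matching: the spider joins all paragraph HTML into one string, flattens it to plain text with remove_tags, and only then tests the last 20 characters against SKIP_STRINGS. A reduced sketch (the sample strings are made up):

from w3lib.html import remove_tags

body_html = "".join(['<p>Story text.</p>', '<p>Syndicated content.</p>'])
body_text = remove_tags(body_html)  # 'Story text.Syndicated content.'
'Syndicated' in body_text[-20:]     # True -> the spider would skip this page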

Example 15: save_to_es

# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def save_to_es(self):
        self.clean_data()
        job = LagouJobIndex()
        job.title = self["title"]
        job.url = self["url"]
        job.meta.id = self["url_object_id"]
        job.salary_min = self["salary_min"]
        job.salary_max = self["salary_max"]
        job.job_city = self["job_city"]
        job.work_years_min = self["work_years_min"]
        job.work_years_max = self["work_years_max"]
        job.degree_need = self["degree_need"]
        job.job_desc = remove_tags(self["job_desc"]).strip().replace("\r\n", "").replace("\t", "")
        job.job_advantage = self["job_advantage"]
        job.tags = self["tags"]
        job.job_type = self["job_type"]
        job.publish_time = self["publish_time"]
        job.job_addr = self["job_addr"]
        job.company_name = self["company_name"]
        job.company_url = self["company_url"]
        job.crawl_time = self['crawl_time']

        job.suggest = generate_suggests(es_lagou_job,
                                        ((job.title, 10), (job.tags, 7), (job.job_advantage, 6), (job.job_desc, 3),
                                         (job.job_addr, 5), (job.company_name, 8), (job.degree_need, 4),
                                         (job.job_city, 9)))
        real_time_count('lagou_job_count', JOB_COUNT_INIT)
        job.save() 
Author: mtianyan; Project: FunpySpiderSearchEngine; Lines: 30; Source: lagou_Item.py


Note: the w3lib.html.remove_tags examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and similar open-source code and documentation platforms. The code snippets were selected from open-source projects, and copyright remains with the original authors; consult each project's license before using or redistributing the code. Do not reproduce this article without permission.