This article collects typical usage examples of the Python method w3lib.html.remove_tags. If you have been wondering what exactly html.remove_tags does and how to use it, the curated code samples below may help. You can also explore further usage examples from the w3lib.html module.
The following 15 code examples of html.remove_tags are sorted by popularity by default. You can upvote the examples you find useful; your ratings help the system recommend better Python code samples.
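Before diving into the examples, here is a minimal sketch of the function itself. remove_tags(text, which_ones=(), keep=(), encoding=None) strips markup and returns the remaining text; which_ones removes only the listed tags, while keep removes everything except the listed tags (the two arguments cannot be combined). Expected outputs are shown in comments, assuming current w3lib behavior:

from w3lib.html import remove_tags

html = '<div><p>Hello <b>world</b>!</p></div>'
remove_tags(html)                     # 'Hello world!'
remove_tags(html, which_ones=('b',))  # '<div><p>Hello world!</p></div>' -- only <b> removed
remove_tags(html, keep=('p',))        # '<p>Hello world!</p>' -- everything but <p> removed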
Example 1: _extract_links
# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
    # Strip tags and escape characters from the anchor text.
    def clean_text(text):
        return replace_escape_chars(remove_tags(text.decode(response_encoding))).strip()

    # Resolve the link against the base URL; fall back to '' on malformed URLs.
    def clean_url(url):
        clean_url = ''
        try:
            clean_url = urljoin(base_url, replace_entities(clean_link(url.decode(response_encoding))))
        except ValueError:
            pass
        return clean_url

    if base_url is None:
        base_url = get_base_url(response_text, response_url, response_encoding)
    links_text = linkre.findall(response_text)
    return [Link(clean_url(url).encode(response_encoding),
                 clean_text(text))
            for url, _, text in links_text]
Example 2: scrape_product
# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def scrape_product(self, response):
    item_loader = ItemLoader(item=MyItem(), response=response)
    # Strip HTML tags from every extracted value, then keep only the first one.
    item_loader.default_input_processor = MapCompose(remove_tags)
    item_loader.default_output_processor = TakeFirst()
    item_loader.add_css("my_field", "selector")
    return item_loader.load_item()
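Examples 2 through 9 all register remove_tags as the loader's default input processor. As a quick illustration of what that wiring does, a MapCompose instance is itself callable and pipes each extracted value through its functions in order (a standalone sketch; the import path is itemloaders.processors in recent Scrapy versions, scrapy.loader.processors in older ones):

from itemloaders.processors import MapCompose
from w3lib.html import remove_tags

process = MapCompose(remove_tags, str.strip)
process(['<p> Red shirt </p>', '<span>19.99</span>'])  # ['Red shirt', '19.99']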
Example 3: populate_item
# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def populate_item(self, response):
    item_loader = ItemLoader(item=MySpiderItem(), response=response)
    item_loader.default_input_processor = MapCompose(remove_tags)
    # item_loader.add_css("", "")
    yield item_loader.load_item()

# 3. PAGINATION LEVEL 1
Example 4: populate_item
# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def populate_item(self, response):
    item_loader = ItemLoader(item=MySpiderItem(), response=response)
    item_loader.default_input_processor = MapCompose(remove_tags)
    # item_loader.add_css("")
    # item_loader.add_value("raw", raw)
    # Yield the populated item first...
    yield item_loader.load_item()
    # ...then yield the request that paginates to the next page of data.
    yield self.paginate(response)

# 3. PAGINATION LEVEL 2
Example 5: populate_item
# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def populate_item(self, response):
    item_loader = ItemLoader(item=MySpiderItem(), response=response)
    item_loader.default_input_processor = MapCompose(remove_tags)
    # item_loader.add_css("", "")
    # item_loader.add_css("", "")
    yield item_loader.load_item()
Example 6: parse
# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def parse(self, response):
    item_loader = ItemLoader(item=MyItem(), response=response)
    item_loader.default_input_processor = MapCompose(remove_tags)
    item_loader.default_output_processor = TakeFirst()
    #
    # item_loader.add_css("my_field", "my_css")
    # item_loader.add_xpath("my_field", "my_xpath")
    #
    return item_loader.load_item()
Example 7: populate_item
# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def populate_item(self, response):
    item_loader = ItemLoader(item=MySpiderItem(), response=response)
    item_loader.default_input_processor = MapCompose(remove_tags)
    # item_loader.add_css("field", "")
    yield item_loader.load_item()

# 3. PAGINATION LEVEL 2
Example 8: parse
# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def parse(self, response):
    item_loader = ItemLoader(item=MyItem(), response=response)
    item_loader.default_input_processor = MapCompose(remove_tags)
    # item_loader.add_css("", "")
    # item_loader.add_css("", "")
    # item_loader.add_css("", "")
    # Carry the partially populated item to the POST callback via request meta.
    yield FormRequest("POST_URL", formdata={'parameter': 'p'},
                      meta={'item': item_loader.load_item()}, callback=self.populate_field)
Example 9: populate_field
# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def populate_field(self, response):
    # Continue loading the item passed along in the request meta (see Example 8).
    item_loader = ItemLoader(item=response.meta["item"], response=response)
    item_loader.default_input_processor = MapCompose(remove_tags)
    # item_loader.add_css("field", "")
    return item_loader.load_item()
Example 10: clean_data
# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def clean_data(self):
    try:
        self["praise_num"] = extract_num("".join(self["praise_num"]))
    except BaseException:
        self["praise_num"] = 0
    self["comments_num"] = extract_num("".join(self["comments_num"]))
    self["create_time"] = datetime.datetime.fromtimestamp(
        self["create_time"]).strftime(SQL_DATETIME_FORMAT)
    try:
        self["update_time"] = datetime.datetime.fromtimestamp(
            self["update_time"]).strftime(SQL_DATETIME_FORMAT)
    except BaseException:
        # Fall back to the creation time if the update timestamp is missing.
        self["update_time"] = self["create_time"]
    self["crawl_time"] = self["crawl_time"].strftime(SQL_DATETIME_FORMAT)
    self["content"] = remove_tags(self["content"])
Example 11: save_to_es
# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def save_to_es(self):
    """Save a Jobbole blog article to Elasticsearch."""
    self.clean_data()
    blog = JobboleBlogIndex()
    blog.title = self['title']
    blog.create_date = self["create_date"]
    blog.content = remove_tags(self["content"])
    blog.front_image_url = self["front_image_url"]
    blog.praise_nums = self["praise_nums"]
    blog.fav_nums = self["fav_nums"]
    blog.comment_nums = self["comment_nums"]
    blog.url = self["url"]
    blog.tags = self["tags"]
    blog.meta.id = self["url_object_id"]
    # The suggest field must be supplied when saving the document.
    blog.suggest = generate_suggests(es_jobbole_blog,
                                     ((blog.title, 10), (blog.tags, 6), (blog.content, 4)))
    real_time_count('jobbole_blog_count', JOBBOLE_COUNT_INIT)
    blog.save()
Example 12: parse
# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def parse(self, response):
    containers = response.selector.xpath('//div[contains(@class, "c-container")]')
    for container in containers:
        href = container.xpath('h3/a/@href').extract()[0]
        # remove_tags turns the result <a> markup into plain link text.
        title = remove_tags(container.xpath('h3/a').extract()[0])
        c_abstract = container.xpath('div/div/div[contains(@class, "c-abstract")]').extract()
        abstract = ""
        if len(c_abstract) > 0:
            abstract = remove_tags(c_abstract[0])
        request = scrapy.Request(href, callback=self.parse_url)
        request.meta['title'] = title
        request.meta['abstract'] = abstract
        yield request
Example 13: parse_url
# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def parse_url(self, response):
    print("url:", response.url)
    print("title:", response.meta['title'])
    print("abstract:", response.meta['abstract'])
    content = remove_tags(response.selector.xpath('//body').extract()[0])
    print("content_len:", len(content))
Example 14: parse
# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def parse(self, response):
    canonical_url = response.xpath('//link[@rel="canonical"]/@href').extract_first()

    ## Skip excluded sections
    section = response.css('a.section').xpath('text()').extract_first()
    if section and section.lower() in IGNORE_SECTIONS:
        self.logger.info("Skipping %s because section is %s", canonical_url, section)
        return

    ## Skip syndicated content
    body_html = "".join(response.css("#body_content p").extract())
    body_text = remove_tags(body_html)
    for string in SKIP_STRINGS:
        suffix = body_text[-20:]
        if string in suffix:
            self.logger.info("Skipping %s because suffix %r contains %r",
                             canonical_url,
                             suffix,
                             string)
            return

    publication_date_str = response.xpath('//meta[@name="publicationdate"]/@content').extract_first()
    publication_date = datetime.strptime(publication_date_str, '%d/%m/%Y')
    publication_date = SAST.localize(publication_date)

    item = ScrapenewsItem()
    item['body_html'] = response.css("#body_content").extract_first()
    item['title'] = response.xpath('//meta[@name="title"]/@content').extract_first()
    item['byline'] = response.xpath('//meta[@name="author"]/@content').extract_first()
    item['published_at'] = publication_date.isoformat()
    item['retrieved_at'] = datetime.utcnow().isoformat()
    item['url'] = canonical_url
    item['file_name'] = response.url.split('/')[-1]
    item['spider_name'] = self.name
    item['publication_name'] = self.publication_name
    yield item
Example 15: save_to_es
# Required import: from w3lib import html [as alias]
# Or: from w3lib.html import remove_tags [as alias]
def save_to_es(self):
    self.clean_data()
    job = LagouJobIndex()
    job.title = self["title"]
    job.url = self["url"]
    job.meta.id = self["url_object_id"]
    job.salary_min = self["salary_min"]
    job.salary_max = self["salary_max"]
    job.job_city = self["job_city"]
    job.work_years_min = self["work_years_min"]
    job.work_years_max = self["work_years_max"]
    job.degree_need = self["degree_need"]
    # Strip tags from the job description, then drop line breaks and tabs.
    job.job_desc = remove_tags(self["job_desc"]).strip().replace("\r\n", "").replace("\t", "")
    job.job_advantage = self["job_advantage"]
    job.tags = self["tags"]
    job.job_type = self["job_type"]
    job.publish_time = self["publish_time"]
    job.job_addr = self["job_addr"]
    job.company_name = self["company_name"]
    job.company_url = self["company_url"]
    job.crawl_time = self['crawl_time']
    job.suggest = generate_suggests(es_lagou_job,
                                    ((job.title, 10), (job.tags, 7), (job.job_advantage, 6), (job.job_desc, 3),
                                     (job.job_addr, 5), (job.company_name, 8), (job.degree_need, 4),
                                     (job.job_city, 9)))
    real_time_count('lagou_job_count', JOB_COUNT_INIT)
    job.save()
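As a side note on Example 15: w3lib.html also ships replace_escape_chars, which can express the chained .replace() calls more compactly. A sketch, not part of the original example; by default it removes '\n', '\t' and '\r', and expected outputs (shown in comments) assume current w3lib behavior:

from w3lib.html import remove_tags, replace_escape_chars

raw = '<div>Senior\tPython Developer</div>'
# Default behaviour deletes the escape characters, mirroring the .replace() chain above:
replace_escape_chars(remove_tags(raw)).strip()                  # 'SeniorPython Developer'
# Or substitute a space instead of deleting the characters outright:
replace_escape_chars(remove_tags(raw), replace_by=' ').strip()  # 'Senior Python Developer'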