本文整理汇总了 Python 中 src.tools.match.Match.fix_html 方法的典型用法代码示例。如果您正苦于以下问题:Python Match.fix_html 方法的具体用法?Python Match.fix_html 怎么用?Python Match.fix_html 使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 src.tools.match.Match 的用法示例。
在下文中一共展示了Match.fix_html方法的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: fix_image
# 需要导入模块: from src.tools.match import Match [as 别名]
# 或者: from src.tools.match.Match import fix_html [as 别名]
def fix_image(self, content):
    """Point every ``<img>`` tag in *content* at a locally stored image.

    The HTML is first passed through ``Match.fix_html``.  Each complete
    ``<img ...>`` tag is then normalised (a trailing self-closing ``/`` is
    dropped) and given an explicit ``</img>``.  When the tag carries a
    non-blank source, that source is handed to
    ``HtmlCreator.fix_image_src`` / ``self.image_container.add``, the tag is
    rewritten to reference ``../images/<filename>`` and the result is
    wrapped in ``<div class="duokan-image-single">``.

    Every tag is rewritten exactly once, in place, via ``re.sub``.  Looking
    the normalised tag text up in the document again (``str.replace``)
    misses tags written as ``<img ... />`` -- the ``/`` is no longer part of
    the search key -- and appends one extra ``</img>`` per repetition of an
    identical src-less tag.  (Assumes ``Match.fix_html`` does not already
    normalise self-closing tags -- not visible here.)

    :param content: HTML fragment to process.
    :return: the HTML with its image tags rewritten.
    """
    def rewrite(match):
        # Drop the closing '>' and an optional self-closing '/', then close
        # the tag again so every tag has the plain '<img ...>' form.
        img = match.group(0)[:-1]
        if img[-1] == '/':
            img = img[:-1]
        img += '>'

        # The lookbehind accepts any attribute whose name ends in "src"
        # (e.g. data-src); the first such value in the tag wins.
        found = re.search(r'(?<=src=").*?(?=")', img)
        src = found.group(0) if found else ''
        if src.replace(' ', '') == '':
            # No usable source: only make the tag well-formed.
            return img + '</img>'

        src_download = HtmlCreator.fix_image_src(src)
        filename = self.image_container.add(src_download) if src_download else ''
        local_path = '../images/{}'.format(filename)

        new_image = img.replace('"{}"'.format(src), '"{}"'.format(local_path))
        # Any remaining reference to the whitedot image (presumably a
        # lazy-loading placeholder -- TODO confirm) gets the local path too.
        new_image = new_image.replace('//zhstatic.zhihu.com/assets/zhihu/ztext/whitedot.jpg',
                                      local_path)
        new_image += '</img>'
        return '<div class="duokan-image-single">{}</div>'.format(new_image)

    # A callable replacement is inserted verbatim (no backslash processing).
    return re.sub(r'<img[^>]*>', rewrite, Match.fix_html(content))
示例2: worker
# 需要导入模块: from src.tools.match import Match [as 别名]
# 或者: from src.tools.match.Match import fix_html [as 别名]
def worker(self, target_url):
    """Fetch *target_url* once and store its cleaned-up HTML.

    URLs already present in ``self.work_complete_set`` are skipped, and
    nothing is recorded when ``Http.get_content`` returns an empty result.
    Otherwise the page goes through ``Match.fix_html`` (with
    ``recipe_kind='sinablog_author'`` when *self* is a
    ``sinablogAuthorWorker``), is appended to ``self.content_list`` and the
    URL is added to ``self.work_complete_set``.

    :param target_url: address of the page to fetch.
    :return: always ``None``.
    """
    if target_url in self.work_complete_set:
        # Already fetched successfully; skip it.
        return

    Debug.logger.info(u'开始抓取{}的内容'.format(target_url))
    page = Http.get_content(target_url)
    if not page:
        return

    # Function-level import (presumably to avoid a circular import --
    # TODO confirm).
    from src.worker.sinablog_worker import sinablogAuthorWorker
    fix_kwargs = {'content': page}
    if isinstance(self, sinablogAuthorWorker):
        fix_kwargs['recipe_kind'] = 'sinablog_author'

    # Original author's note: the <br> tags must be fixed up here to avoid
    # a stack overflow later on.
    cleaned = Match.fix_html(**fix_kwargs)
    self.content_list.append(cleaned)
    Debug.logger.debug(u'{}的内容抓取完成'.format(target_url))
    self.work_complete_set.add(target_url)
示例3: set_dom
# 需要导入模块: from src.tools.match import Match [as 别名]
# 或者: from src.tools.match.Match import fix_html [as 别名]
def set_dom(self, dom):
    """Bind this parser to *dom* and pull out the parts it works on.

    ``self.info`` is always reset to an empty dict.  When *dom* is truthy
    and contains no ``div.answer-status`` element, the header, body and
    footer nodes are looked up, the body text is cleaned with
    ``Match.fix_html`` and re-parsed into ``self.content``, and *dom* is
    forwarded to ``self.author_parser``.

    :param dom: parsed document node offering ``select`` and ``find``
        (looks like a BeautifulSoup object -- TODO confirm).
    :return: always ``None``.
    """
    self.info = {}
    if not dom or dom.select('div.answer-status'):
        return

    self.header = dom.find('div', class_='zm-item-vote-info')
    self.body = dom.find('textarea', class_='content')
    self.footer = dom.find('div', class_='zm-meta-panel')
    if self.body:
        body_html = Match.fix_html(self.get_tag_content(self.body))
        self.content = BeautifulSoup(body_html, 'html.parser')
    # NOTE(review): the nesting of this call was reconstructed; it is
    # assumed to run only for a usable dom -- confirm against upstream.
    self.author_parser.set_dom(dom)
示例4: fix_image
# 需要导入模块: from src.tools.match import Match [as 别名]
# 或者: from src.tools.match.Match import fix_html [as 别名]
def fix_image(self, content, recipe):
    """Point every ``<img>`` tag in *content* at a locally stored image.

    The HTML is first passed through ``Match.fix_html`` for *recipe*.  Each
    complete ``<img ...>`` tag is then processed once:

    * unless *recipe* is ``Type.sinablog_author`` or
      ``Type.cnblogs_author``, a trailing self-closing ``/`` is dropped;
    * a tag without a non-blank source only gets ``</img>`` appended;
    * otherwise the source is resolved through
      ``HtmlCreator.fix_image_src`` / ``self.image_container.add``, the tag
      is rewritten to reference ``../images/<filename>``, recipe-specific
      fix-ups are applied and the result is wrapped in
      ``<div class="duokan-image-single">``.

    Tags are rewritten in place via ``re.sub``.  Looking the normalised tag
    text up in the document again (``str.replace``) misses tags written as
    ``<img ... />`` once the ``/`` has been stripped from the search key,
    and appends one extra ``</img>`` per repetition of an identical
    src-less tag.  (Assumes ``Match.fix_html`` does not already normalise
    self-closing tags -- not visible here.)

    :param content: HTML fragment to process.
    :param recipe: recipe kind, compared against the ``Type`` constants.
    :return: the HTML with its image tags rewritten.
    """
    # Loop-invariant: these recipes keep the tag ending exactly as written.
    keep_tag_ending = recipe in [Type.sinablog_author, Type.cnblogs_author]

    def rewrite(match):
        # NOTE(review): the '>' is re-added for every recipe; only the
        # removal of a self-closing '/' depends on the recipe.
        img = match.group(0)[:-1]
        if not keep_tag_ending and img[-1] == '/':
            img = img[:-1]
        img += '>'

        # The lookbehind accepts any attribute whose name ends in "src"
        # (e.g. data-original-src); the first such value in the tag wins.
        found = re.search(r'(?<=src=").*?(?=")', img)
        src = found.group(0) if found else ''
        if src.replace(' ', '') == '':
            # No usable source: only make the tag well-formed.
            return img + '</img>'

        src_download = HtmlCreator.fix_image_src(src)
        if not src_download:
            filename = ''
        elif recipe in Type.zhihu and not src_download.startswith('http'):
            # Zhuanlan image reference: keep the part before the first '.'
            # and build the full pic2.zhimg.com address from it.
            src_download = src_download.split('.')[0]
            filename = self.image_container.add('https://pic2.zhimg.com/' + src_download + '_b.jpg')
        elif recipe in Type.generic:
            filename = ''  # TODO
        else:
            filename = self.image_container.add(src_download)

        new_image = img.replace('"{}"'.format(src), '"../images/{}"'.format(filename))
        if recipe in Type.jianshu:
            # Promote data-original-src to src and demote the former src to
            # falsesrc.  'temppicsr' is a temporary name that contains no
            # 'src', so the middle step leaves it alone.
            new_image = new_image.replace('data-original-src', 'temppicsr')
            new_image = new_image.replace('src', 'falsesrc')
            new_image = new_image.replace('temppicsr', 'src')
            new_image += '</img>'
        elif recipe in Type.sinablog:
            # Hard-coded image URL; the original author suggests moving
            # this into fix_html.
            new_image = new_image.replace('http://simg.sinajs.cn/blog7style/images/common/sg_trans.gif',
                                          '../images/{}'.format(filename))
        elif recipe in Type.zhihu:
            new_image = new_image.replace('//zhstatic.zhihu.com/assets/zhihu/ztext/whitedot.jpg',
                                          '../images/{}'.format(filename))
            new_image += '</img>'
        elif recipe in Type.cnblogs:
            pass  # nothing recipe-specific to do
        return '<div class="duokan-image-single">{}</div>'.format(new_image)

    # A callable replacement is inserted verbatim (no backslash processing).
    return re.sub(r'<img[^>]*>', rewrite, content_fixed) if False else re.sub(
        r'<img[^>]*>', rewrite, Match.fix_html(content=content, recipe_kind=recipe))
示例5: worker
# 需要导入模块: from src.tools.match import Match [as 别名]
# 或者: from src.tools.match.Match import fix_html [as 别名]
def worker(self, target_url):
    """Fetch *target_url* once and store its cleaned-up HTML.

    Nothing happens for a URL already present in
    ``self.work_complete_set`` or when ``Http.get_content`` returns an
    empty result.  Otherwise the page is passed through ``Match.fix_html``,
    appended to ``self.content_list`` and the URL is added to
    ``self.work_complete_set``.

    :param target_url: address of the page to fetch.
    :return: always ``None``.
    """
    # URLs that were already fetched successfully are skipped.
    if target_url not in self.work_complete_set:
        Debug.logger.info(u'开始抓取{}的内容'.format(target_url))
        fetched = Http.get_content(target_url)
        if fetched:
            # Original author's note: the <br> tags must be fixed up here
            # to avoid a stack overflow later on.
            self.content_list.append(Match.fix_html(fetched))
            Debug.logger.debug(u'{}的内容抓取完成'.format(target_url))
            self.work_complete_set.add(target_url)
示例6: fix_image
# 需要导入模块: from src.tools.match import Match [as 别名]
# 或者: from src.tools.match.Match import fix_html [as 别名]
def fix_image(self, content):
    """Point every ``<img>`` tag in *content* at a locally stored image.

    The HTML is first passed through ``Match.fix_html``.  Each complete
    ``<img ...>`` tag is then normalised (a trailing self-closing ``/`` is
    dropped) and given an explicit ``</img>``.  When the tag carries a
    non-blank source, that source is handed to
    ``HtmlCreator.fix_image_src`` / ``self.image_container.add``, the tag is
    rewritten to reference ``../images/<filename>``, its
    ``data-original-src`` attribute is promoted to ``src`` and the result is
    wrapped in ``<div class="duokan-image-single">``.

    Every tag is rewritten exactly once, in place, via ``re.sub``.  Looking
    the normalised tag text up in the document again (``str.replace``)
    misses tags written as ``<img ... />`` -- the ``/`` is no longer part of
    the search key -- and appends one extra ``</img>`` per repetition of an
    identical src-less tag.  (Assumes ``Match.fix_html`` does not already
    normalise self-closing tags -- not visible here.)

    :param content: HTML fragment to process.
    :return: the HTML with its image tags rewritten.
    """
    def rewrite(match):
        # Drop the closing '>' and an optional self-closing '/', then close
        # the tag again so every tag has the plain '<img ...>' form.
        img = match.group(0)[:-1]
        if img[-1] == '/':
            img = img[:-1]
        img += '>'

        # The lookbehind accepts any attribute whose name ends in "src"
        # (including data-original-src); the first such value wins.
        found = re.search(r'(?<=src=").*?(?=")', img)
        src = found.group(0) if found else ''
        if src.replace(' ', '') == '':
            # No usable source: only make the tag well-formed.
            return img + '</img>'

        src_download = HtmlCreator.fix_image_src(src)
        filename = self.image_container.add(src_download) if src_download else ''

        new_image = img.replace('"{}"'.format(src), '"../images/{}"'.format(filename))
        # Promote data-original-src to src and demote the former src to
        # falsesrc.  'temppicsr' is a temporary name that contains no 'src',
        # so the middle step leaves it alone.
        # NOTE(review): the middle step rewrites *every* 'src' substring,
        # attribute values included; the original author already flagged
        # this as a stop-gap.
        new_image = new_image.replace('data-original-src', 'temppicsr')
        new_image = new_image.replace('src', 'falsesrc')
        new_image = new_image.replace('temppicsr', 'src')
        new_image += '</img>'
        return '<div class="duokan-image-single">{}</div>'.format(new_image)

    # A callable replacement is inserted verbatim (no backslash processing).
    return re.sub(r'<img[^>]*>', rewrite, Match.fix_html(content))