本文整理汇总了Python中src.tools.match.Match类的典型用法代码示例。如果您正苦于以下问题:Python Match类的具体用法?Python Match怎么用?Python Match使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Match类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: download_img
def download_img(self):
    """Queue, localize and download every image this article refers to.

    Three sources are handled in order: the image markup matched inside
    ``self.content``, the cover image (``self.image_url``) and the author
    avatar (``self.author_avatar_url``). Each is registered with a fresh
    ``ImageContainer``; the attribute that referenced it is rewritten to the
    local form produced by ``Match``. After the container has downloaded
    everything, the size of each queued file is added to
    ``self.total_img_size_kb``.
    """
    from src.container.image_container import ImageContainer

    container = ImageContainer()
    markup_to_src = Match.match_img_with_src_dict(self.content)
    self.img_filename_list = []

    # Inline images: replace each matched piece of markup with a local element.
    for markup, remote_src in markup_to_src.items():
        local_name = container.add(remote_src)
        self.img_filename_list.append(local_name)
        local_element = Match.create_img_element_with_file_name(local_name)
        self.content = self.content.replace(markup, local_element)

    # Article cover image.
    cover_name = container.add(self.image_url)
    self.img_filename_list.append(cover_name)
    self.image_url = Match.create_local_img_src(cover_name)

    # Author avatar.
    avatar_name = container.add(self.author_avatar_url)
    self.img_filename_list.append(avatar_name)
    self.author_avatar_url = Match.create_local_img_src(avatar_name)

    container.start_download()

    # Sizes are only read once the download call has returned.
    for local_name in self.img_filename_list:
        self.total_img_size_kb += Path.get_img_size_by_filename_kb(local_name)
示例2: main
def main():
    """Parse the command line and act on each recognised option.

    Options handled here: ``-V/--version``, ``-d/--debug``, ``-h/--help``,
    ``-g/--gui``, ``-l/--login <url>`` and ``-u/--url <url>``.

    Arguments that getopt cannot parse are logged (the parser's own message
    first, then the usage hint) and the process exits with status 2. The
    version, help, gui and login options exit once they have run; the url
    option exits only when the url's kind is not supported.
    """
    # Set by -d/--debug; not read anywhere in this part of the function.
    debug = False

    def version():
        log.info_log('version %s' % __version__)

    try:
        opts, args = getopt.getopt(sys.argv[1:], short_options, long_options)
    except getopt.GetoptError as err:
        # Say what was wrong with the arguments before pointing at --help.
        log.error_log(str(err))
        log.error_log(u"Try ee-book --help for more options")
        sys.exit(2)

    # `value` is the option's argument; a separate name keeps it from
    # shadowing the positional `args` returned by getopt above.
    for option, value in opts:
        if option in ('-V', '--version'):
            version()
            sys.exit()
        elif option in ('-d', '--debug'):
            print(u"Debug mode...")
            debug = True
        elif option in ('-h', '--help'):
            version()
            print(help_info)
            sys.exit()
        elif option in ('-g', '--gui'):
            print(u"Under developing...")
            sys.exit()
            # Disabled GUI bootstrap, kept for reference:
            # graphviz = GraphvizOutput(output_file='filter_gui.png')
            # with PyCallGraph(output=graphviz, config=config):
            # from PyQt4.QtGui import QApplication
            # from PyQt4.QtGui import QIcon
            # from src.gui.ui import MainWindow
            # from src.resources import qrc_resources
            # app = QApplication(sys.argv)
            # app.setWindowIcon(QIcon(":/icon.png"))
            # app.setApplicationName('EE-Book')
            # window = MainWindow()
            # window.show()
            # sys.exit(app.exec_())
        elif option in ('-l', '--login'):
            url = value
            try:
                recipe_kind = Match.get_url_kind(url)
            except UnsupportTypeException as e:
                print(e)
                print(u"Please try again.")
                sys.exit()
            # Init path, e.g. config, only zhihu are supported now
            EEBook(recipe_kind=recipe_kind)
            Login(recipe_kind=recipe_kind).start()
            sys.exit()
        elif option in ('-u', '--url'):
            url = value
            try:
                recipe_kind = Match.get_website_kind(url)
            except UnsupportTypeException as e:
                print(e)
                print(u"Please check url.")
                sys.exit()
示例3: fix_image
def fix_image(self, content):
    """Point every ``<img`` tag in *content* at a local image file.

    *content* is first passed through ``Match.fix_html``. Each tag found is
    closed with ``</img>``. Tags that carry a non-blank ``src="..."`` are
    additionally registered with ``self.image_container``, rewritten to
    ``../images/<filename>`` and wrapped in a ``duokan-image-single`` div.

    :param content: HTML text to process
    :return: the rewritten HTML text
    """
    content = Match.fix_html(content)
    for raw_tag in re.findall(r'<img[^>]*', content):
        # The pattern stops before '>', so the tag is closed by hand; a
        # trailing '/' from self-closing markup is dropped first.
        # NOTE(review): the replace() calls below look for this normalized
        # text in `content`; for markup that originally ended in '/>' that
        # text may not be present unless Match.fix_html already normalizes
        # such tags -- confirm.
        tag = (raw_tag[:-1] if raw_tag[-1] == '/' else raw_tag) + '>'

        found = re.search(r'(?<=src=").*?(?=")', tag)
        src = found.group(0) if found else ''
        if src.replace(' ', '') == '':
            # Missing or blank src: only append the closing tag.
            content = content.replace(tag, tag + '</img>')
            continue

        src_download = HtmlCreator.fix_image_src(src)
        filename = self.image_container.add(src_download) if src_download else ''

        new_image = tag.replace('"{}"'.format(src), '"../images/{}"'.format(filename))
        # Any reference to the whitedot placeholder is redirected as well.
        new_image = new_image.replace('//zhstatic.zhihu.com/assets/zhihu/ztext/whitedot.jpg',
                                      '../images/{}'.format(filename))
        new_image += '</img>'
        content = content.replace(tag, '<div class="duokan-image-single">{}</div>'.format(new_image))
    return content
示例4: login
def login(self, account, password, captcha=''):
    """Log in to zhihu.com with an e-mail account and record the session.

    Fetches the home page to obtain the ``_xsrf`` token, stores it as a
    cookie, posts the credentials to ``/login/email`` and, on success, asks
    on the console whether to remember the credentials, then replaces the
    ``LoginRecord`` table content with the new login.

    :param account: e-mail address used as login name
    :param password: account password
    :param captcha: captcha text; only sent when non-empty
    :return: True when the server reports success (``r == 0``), False when
        the token is missing, the response is empty or not JSON, or the
        server reports failure
    """
    content = Http.get_content('https://www.zhihu.com/')
    xsrf = Match.xsrf(content)
    if not xsrf:
        Debug.logger.info(u'登陆失败')
        Debug.logger.info(u'敲击回车重新发送登陆请求')
        return False
    xsrf = xsrf.split('=')[1]
    # add xsrf as cookie into cookieJar,
    cookie = Http.make_cookie(name='_xsrf', value=xsrf, domain='www.zhihu.com')
    self.cookieJar.set_cookie(cookie)

    post_data = {'_xsrf': xsrf, 'email': account, 'password': password, 'remember_me': True}
    if captcha:
        post_data['captcha'] = captcha
    header = {
        'Accept': '*/*',
        # Key header according to the original author: with it present,
        # Zhihu treats the request as not coming from a script.
        'Accept-Encoding': 'gzip,deflate',
        'Accept-Language': 'zh,zh-CN;q=0.8,en-GB;q=0.6,en;q=0.4',
        'Host': 'www.zhihu.com',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36(KHTML, like Gecko)Chrome/34.0.1847.116 Safari/537.36',
        'Connection': 'keep-alive',
        'X-Requested-With': 'XMLHttpRequest',
        'Origin': 'https://www.zhihu.com',
        'Referer': 'https://www.zhihu.com/',
    }
    result = Http.get_content(url=r'https://www.zhihu.com/login/email', data=post_data, extra_header=header)
    if not result:
        Debug.logger.info(u'登陆失败,请敲击回车重新登陆')
        return False
    try:
        response = json.loads(result)
    except ValueError:
        # A body that is not JSON is treated like any other failed attempt
        # instead of aborting with a traceback.
        Debug.logger.info(u'登陆失败,请敲击回车重新登陆')
        return False

    if response['r'] != 0:
        print(u'登陆失败')
        Debug.print_dict(response)
        return False

    print(u'登陆成功!')
    print(u'登陆账号: {}'.format(account))
    print(u'请问是否需要记住帐号密码?输入yes记住,输入其它任意字符跳过,回车确认')
    # raw_input is the Python 2 builtin this code base relies on.
    if raw_input() == 'yes':
        # NOTE(review): the password is kept in plain text, both in Config
        # here and in the LoginRecord row below.
        Config.account, Config.password, Config.remember_account = account, password, True
        print(u'帐号密码已保存,可通过修改config.json修改设置')
    else:
        Config.account, Config.password, Config.remember_account = '', '', False
        print(u'跳过保存环节,进入下一流程')
    Config._save()

    cookie = self.get_cookie()
    # Clear earlier login records so the next login cannot pick up a stale one.
    DB.execute('delete from LoginRecord')
    data = {
        'account': account,
        'password': password,
        'recordDate': ExtraTools.get_today(),
        'cookieStr': cookie,
    }
    DB.save(data, 'LoginRecord')
    DB.commit()
    return True
示例5: set_info
def set_info(self, info):
    """Merge *info* into ``self.info`` and derive the epub title and id.

    The title template and the key supplying the id depend on ``self.kind``.
    When no rule matches, title and id are left as they were. In every case
    the resulting title is passed through ``Match.fix_filename``.

    :param info: mapping with the keys required for the current kind
    """
    self.info.update(info)

    # (Type attribute, title template, info keys fed to the template, id key)
    title_rules = (
        ('question', u'知乎问题集锦({})', ('title',), 'id'),
        ('answer', u'知乎回答集锦({})', ('title',), 'id'),
        ('article', u'知乎专栏文章集锦({})', ('title',), 'id'),
        ('topic', u'话题_{}({})', ('title', 'topic_id'), 'topic_id'),
        ('collection', u'收藏夹_{}({})', ('title', 'collection_id'), 'collection_id'),
        ('author', u'作者_{}({})', ('name', 'author_id'), 'author_id'),
        ('column', u'专栏_{}({})', ('name', 'column_id'), 'column_id'),
    )
    for kind_name, template, fields, id_key in title_rules:
        if self.kind == getattr(Type, kind_name):
            self.epub.title = template.format(*[info[key] for key in fields])
            self.epub.id = info[id_key]
            break

    self.epub.title = Match.fix_filename(self.epub.title)
示例6: create_work_set
def create_work_set(self, target_url):
    """Collect the article URLs of a jianshu author into ``self.work_set``.

    Starting from *target_url* (for example
    http://www.jianshu.com/users/b1dd2b2c87a8/latest_articles) the author id
    is extracted, the number of list pages is derived from the article
    count, and every list page is opened so that its article addresses can
    be added to ``self.work_set``. Does nothing when *target_url* is already
    in ``self.task_complete_set``.

    :param target_url: URL of the author's article list
    :return: None
    """
    if target_url in self.task_complete_set:
        return
    id_result = Match.jianshu_author(target_url)
    jianshu_id = id_result.group('jianshu_id')
    article_num, article_list = self.get_jianshu_question_list(target_url)
    self.task_complete_set.add(target_url)

    # 9 href on one page. Ceiling division with `//` keeps the page count an
    # int even under true division, where `/` would hand range() a float.
    per_page = 9
    page_num = (article_num + per_page - 1) // per_page

    for item in article_list:
        self.work_set.add(item)
    # The first page arrived with the call above, so paging starts at 2.
    for page in range(2, page_num + 1):
        url = 'http://www.jianshu.com/users/{}/latest_articles?page={}'.format(jianshu_id, page)
        content_article_list = Http.get_content(url)
        for item in self.parse_get_article_list(content_article_list):
            self.work_set.add(item)
    return
示例7: create_work_set
def create_work_set(self, target_url):
    """Register a zhihu column and queue the URLs of its post-list pages.

    Reads the column id from *target_url*, fetches the column description
    from the zhuanlan API, appends a flattened record to ``self.info_list``
    and adds one ``posts?limit=10&offset=N`` URL per block of ten posts to
    ``self.work_set``. Does nothing when *target_url* is already in
    ``self.task_complete_set`` or when the API returns no content.

    :param target_url: URL of the column
    :return: None
    """
    if target_url in self.task_complete_set:
        return
    result = Match.column(target_url)
    self.column_id = result.group('column_id')
    content = Http.get_content('https://zhuanlan.zhihu.com/api/columns/' + self.column_id)
    if not content:
        return
    raw_info = json.loads(content)

    def avatar_url(template, avatar_id):
        # Fill in the id and drop the size suffix of an avatar URL template.
        return template.replace('{id}', avatar_id).replace('_{size}', '')

    creator = raw_info['creator']
    creator_avatar = creator['avatar']
    info = {
        'creator_id': creator['slug'],
        'creator_hash': creator['hash'],
        'creator_sign': creator['bio'],
        'creator_name': creator['name'],
        'creator_logo': avatar_url(creator_avatar['template'], creator_avatar['id']),
        'column_id': raw_info['slug'],
        'name': raw_info['name'],
        # NOTE(review): the column logo combines the creator's template with
        # the column's avatar id, as the original did -- confirm whether the
        # column's own template was intended.
        'logo': avatar_url(creator_avatar['template'], raw_info['avatar']['id']),
        'article': raw_info['postsCount'],
        'follower': raw_info['followersCount'],
        'description': raw_info['description'],
    }
    self.info_list.append(info)
    self.task_complete_set.add(target_url)

    detect_url = 'https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=10&offset='.format(self.column_id)
    # Floor division keeps the page count an int even under true division,
    # where `/` would hand range() a float.
    for i in range(info['article'] // 10 + 1):
        self.work_set.add(detect_url + str(i * 10))
    return
示例8: create_single_html_book
def create_single_html_book(self, book_package):
    """Write *book_package* as one self-contained HTML page with its assets.

    A directory named after the package title is recreated under
    ``Path.result_path``. The bodies of all pages of all books are joined,
    inserted into the template at ``TemplateConfig.content_base_uri`` and
    written to ``<title>.html``; the images and three style sheets are
    copied next to it.

    :param book_package: object providing ``get_title()`` and ``book_list``
    :return: None
    """
    title = book_package.get_title()
    if not title:
        # Skip packages without a title: with an empty title the rmdir
        # below would target './' itself (the original author's
        # "rm -rf /" warning).
        return

    book_dir = u'./' + title
    Path.reset_path()
    Path.chdir(Path.result_path)
    Path.rmdir(book_dir)
    Path.mkdir(book_dir)
    Path.chdir(book_dir)

    pages = [page for book in book_package.book_list for page in book.page_list]
    bodies = [Match.html_body(x.content) for x in pages]
    # Images sit beside the page here, not one level up.
    content = u' \r\n '.join(bodies).replace(u'../images/', u'./images/')

    with open(TemplateConfig.content_base_uri) as template_file:
        template = template_file.read()
        content = template.format(title=title, body=content).replace(u'../style/', u'./')
    with open(title + u'.html', 'w') as output_file:
        output_file.write(content)

    Path.copy(Path.html_pool_path + u'/../{}/OEBPS/images'.format(title), u'./images')
    for css_name in (u'/customer.css', u'/markdown.css', u'/normalize.css'):
        Path.copy(Path.www_css + css_name, u'.' + css_name)
    Path.reset_path()
示例9: create_work_set
def create_work_set(self, target_url):
    """Register a zhihu column and queue the URLs of its post-list pages.

    The column id is taken from *target_url* and the column description is
    fetched from the zhuanlan API. A flattened record is appended to
    ``self.info_list`` and one ``posts?limit=10&offset=N`` URL per block of
    ten posts is added to ``self.work_set``. Returns early when *target_url*
    is already in ``self.task_complete_set`` or the API returns no content.

    :param target_url: URL of the column
    :return: None
    """
    if target_url in self.task_complete_set:
        return
    result = Match.column(target_url)
    self.column_id = result.group("column_id")
    content = Http.get_content("https://zhuanlan.zhihu.com/api/columns/" + self.column_id)
    if not content:
        return
    raw_info = json.loads(content)
    info = {}
    info["creator_id"] = raw_info["creator"]["slug"]
    info["creator_hash"] = raw_info["creator"]["hash"]
    info["creator_sign"] = raw_info["creator"]["bio"]
    info["creator_name"] = raw_info["creator"]["name"]
    # Avatar URLs come as templates: fill in the id, drop the size suffix.
    info["creator_logo"] = (
        raw_info["creator"]["avatar"]["template"]
        .replace("{id}", raw_info["creator"]["avatar"]["id"])
        .replace("_{size}", "")
    )
    info["column_id"] = raw_info["slug"]
    info["name"] = raw_info["name"]
    # NOTE(review): built from the creator's template and the column's
    # avatar id, as in the original -- confirm that mix is intended.
    info["logo"] = (
        raw_info["creator"]["avatar"]["template"]
        .replace("{id}", raw_info["avatar"]["id"])
        .replace("_{size}", "")
    )
    info["article"] = raw_info["postsCount"]
    info["follower"] = raw_info["followersCount"]
    info["description"] = raw_info["description"]
    self.info_list.append(info)
    self.task_complete_set.add(target_url)
    detect_url = "https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=10&offset=".format(self.column_id)
    # `//` rather than `/`: range() needs an int, and `/` yields a float
    # under true division.
    page_count = info["article"] // 10 + 1
    for i in range(page_count):
        self.work_set.add(detect_url + str(i * 10))
    return
示例10: create_work_set
def create_work_set(self, target_url):
    """Collect the article URLs of a sina blog into ``self.work_set``.

    The blog id is extracted from the blog's home page URL with a regular
    expression. ``get_sinablog_question_list`` supplies the article count
    (the original author notes that it also fills the blog info from the
    "about me" page and ideally would not live in this call chain). Every
    article-list page is then opened and the address of each post is added
    to ``self.work_set``. Does nothing when *target_url* is already in
    ``self.task_complete_set``.

    :param target_url: URL of the blog's home page
    :return: None
    """
    if target_url in self.task_complete_set:
        return
    result = Match.sinablog_author(target_url)
    sinablog_author_id = int(result.group('sinablog_people_id'))
    article_num = self.get_sinablog_question_list(sinablog_author_id)

    # 50 href on 1 page. Ceiling division with `//` keeps the page count an
    # int even under true division, where `/` would hand range() a float.
    per_page = 50
    page_num = (article_num + per_page - 1) // per_page

    # For now the count can only be recorded here, because the "about me"
    # page does not carry the number of articles.
    self.question_list[0]['article_num'] = article_num
    self.task_complete_set.add(target_url)

    for page in range(1, page_num + 1):
        url = 'http://blog.sina.com.cn/s/articlelist_{}_0_{}.html'.format(sinablog_author_id, page)
        content_article_list = Http.get_content(url)
        for item in self.parse_get_article_list(content_article_list):
            self.work_set.add(item)
    return
示例11: worker
def worker(self, target_url):
    """Fetch *target_url* once and keep its cleaned-up HTML.

    URLs already present in ``self.work_complete_set`` are skipped, as are
    fetches that return nothing. The fetched text is passed through
    ``Match.fix_html`` (with ``recipe_kind='sinablog_author'`` when this
    worker is a ``sinablogAuthorWorker``), appended to
    ``self.content_list``, and the URL is marked as complete.

    :param target_url: address of the page to fetch
    :return: None
    """
    already_done = target_url in self.work_complete_set
    if already_done:
        # URLs fetched successfully before are not requested again.
        return

    Debug.logger.info(u'开始抓取{}的内容'.format(target_url))
    content = Http.get_content(target_url)
    if not content:
        return

    from src.worker.sinablog_worker import sinablogAuthorWorker

    fix_arguments = {'content': content}
    if isinstance(self, sinablogAuthorWorker):
        fix_arguments['recipe_kind'] = 'sinablog_author'
    # Per the original author, the <br> tags must be repaired here to avoid
    # a stack overflow later on.
    content = Match.fix_html(**fix_arguments)

    self.content_list.append(content)
    Debug.logger.debug(u'{}的内容抓取完成'.format(target_url))
    self.work_complete_set.add(target_url)
示例12: parse_article_id
def parse_article_id(self):
    """Store the jianshu article id found in the ``share-group`` div.

    Looks up ``<div class="share-group">`` in ``self.dom``, lets
    ``Match.jianshu_article_id`` extract the id from its markup and writes
    it to ``self.info['article_id']``. When the div or the id cannot be
    found, this is logged and ``self.info`` is left untouched.

    :return: None
    """
    share_group = self.dom.find("div", class_="share-group")
    # Test the lookup result itself: str(None) is the non-empty text
    # 'None', so checking the converted string can never detect a miss.
    share_group_html = '' if share_group is None else str(share_group)
    if not share_group_html:
        Debug.logger.info(u"没有找到文章id")
        return
    result = Match.jianshu_article_id(share_group_html)
    if not result:
        # Presumably a regex match object; a falsy result means no id was
        # found in the markup -- verify against Match.jianshu_article_id.
        Debug.logger.info(u"没有找到文章id")
        return
    self.info['article_id'] = result.group('jianshu_article_id')
示例13: set_info
def set_info(self, info):
    """Merge *info* into ``self.info`` and derive the epub title and id.

    The title template and the key supplying the id depend on ``self.kind``;
    the first matching rule wins. When no rule matches, title and id are
    left as they were. Finally the title is passed through
    ``Match.replace_words`` together with ``entities_reverse``.

    :param info: mapping with the keys required for the current kind
    """
    self.info.update(info)

    by_creator = ('creator_name', 'creator_id')
    # (Type attribute, title template, info keys fed to the template, id key)
    title_rules = (
        ('csdnblog_author', u'csdn博客作者_{}({})文章集锦', by_creator, 'creator_id'),
        ('cnblogs_author', u'cnblogs作者_{}({})文章集锦', by_creator, 'creator_id'),
        # all posts of the blog
        ('jianshu_author', u'简书作者_{}({})文章集锦', by_creator, 'creator_id'),
        ('jianshu_collection', u'简书专题_{}({})', ('title', 'collection_fake_id'), 'collection_fake_id'),
        ('jianshu_notebooks', u'简书文集_{}({})', ('title', 'notebooks_id'), 'notebooks_id'),
        # single post; title and id marked TODO by the original author
        ('jianshu_article', u'简书博文集锦({})', ('title',), 'id'),
        # all posts of the blog
        ('sinablog_author', u'新浪博客_{}({})', by_creator, 'creator_id'),
        # single sina post; title and id marked TODO by the original author
        ('sinablog_article', u'新浪博客博文集锦({})', ('title',), 'id'),
        ('question', u'知乎问题集锦({})', ('title',), 'id'),
        ('answer', u'知乎回答集锦({})', ('title',), 'id'),
        ('article', u'知乎专栏文章集锦({})', ('title',), 'id'),
        ('topic', u'知乎话题_{}({})', ('title', 'topic_id'), 'topic_id'),
        ('collection', u'知乎收藏夹_{}({})', ('title', 'collection_id'), 'collection_id'),
        ('author', u'知乎作者_{}({})', ('name', 'author_id'), 'author_id'),
        ('column', u'知乎专栏_{}({})', ('name', 'column_id'), 'column_id'),
        ('yiibai', u'易百教程_{}', ('title',), 'creator_id'),
        # fixed title: the template has no placeholders
        ('talkpython', u'TalkPythonToMe', (), 'creator_id'),
    )
    for kind_name, template, fields, id_key in title_rules:
        if self.kind == getattr(Type, kind_name):
            self.epub.title = template.format(*[info[key] for key in fields])
            self.epub.id = info[id_key]
            break

    from src.html5lib.constants import entities_reverse
    self.epub.title = Match.replace_words(self.epub.title, entities_reverse)
示例14: download_img_in_question_content
def download_img_in_question_content(self):
    """Download the images of the question detail and localize their markup.

    Every piece of image markup matched in ``self.question_info.detail`` is
    registered with a fresh ``ImageContainer`` and replaced by a local image
    element. After the container has downloaded everything, the size of
    each file is added to ``self.question_content_img_size``.
    """
    from src.container.image_container import ImageContainer

    container = ImageContainer()
    markup_to_src = Match.match_img_with_src_dict(self.question_info.detail)
    self.question_content_img_filename_list = []

    for markup, remote_src in markup_to_src.items():
        local_name = container.add(remote_src)
        self.question_content_img_filename_list.append(local_name)
        local_element = Match.create_img_element_with_file_name(local_name)
        self.question_info.detail = self.question_info.detail.replace(markup, local_element)

    container.start_download()

    # Sizes are only read once the download call has returned.
    for local_name in self.question_content_img_filename_list:
        self.question_content_img_size += Path.get_img_size_by_filename_kb(local_name)
示例15: create_book
def create_book(self):
    """Assemble the collected task results into one epub file.

    One chapter is created per entry of ``self.task_result_list``: an info
    page chosen by the task type, followed by the pages of its questions
    and column articles together with their images. The book is written to
    ``Path.result_path`` with the bundled style sheets.

    The file title is the sanitized ``self.book_title``; when
    ``self.is_split`` is set, the volume suffix is appended to that
    sanitized title.

    :return: None
    """
    # Determine the file title. The volume suffix goes onto the sanitized
    # title; appending it to the raw self.book_title would throw away the
    # result of fix_filename for split books.
    title = Match.fix_filename(self.book_title)
    if self.is_split:
        title = title + u'_卷{}'.format(self.chapter_no)

    # Switch to the temporary resource directory of the e-book first.
    Path.chdir(Path.book_pool_path)
    epub = Epub(title)
    for task_result in self.task_result_list:
        # info_page
        task_type = task_result.task.task_type
        chapter_src = ''
        if task_type == Type.question or task_type == Type.answer:
            # Answers share the question info page.
            chapter_src = self.generate_question_info_page(task_result.info_page)
        elif task_type == Type.collection:
            chapter_src = self.generate_collection_info_page(task_result.info_page)
        elif task_type == Type.topic:
            chapter_src = self.generate_topic_info_page(task_result.info_page)
        elif task_type == Type.author:
            chapter_src = self.generate_author_info_page(task_result.info_page)
        elif task_type == Type.column:
            chapter_src = self.generate_column_info_page(task_result.info_page)
        elif task_type == Type.article:
            chapter_src = self.generate_article_info_page(task_result.info_page)
        epub.create_chapter(chapter_src, task_result.get_title())

        for question in task_result.question_list:
            # Add the image files.
            for filename in question.img_filename_list:
                epub.add_image(Path.image_pool_path + '/' + filename)
            question_src = self.generate_question_page(question)
            epub.add_html(question_src, question.question_info.title)

        for column in task_result.column_list:
            # Add the image files.
            for filename in column.img_filename_list:
                epub.add_image(Path.image_pool_path + '/' + filename)
            for article in column.article_list:
                article_src = self.generate_article_page(article)
                epub.add_html(article_src, article.title)
        epub.finish_chapter()

    epub.set_creator(u'ZhihuHelp1.8.0')
    epub.set_language(u'zh-cn')
    epub.set_book_id()
    epub.set_output_path(Path.result_path)
    epub.add_css(Path.base_path + u'/www/css/markdown.css')
    epub.add_css(Path.base_path + u'/www/css/customer.css')
    epub.add_css(Path.base_path + u'/www/css/normalize.css')
    epub.add_css(Path.base_path + u'/www/css/bootstrap.css')
    epub.create()
    Path.reset_path()
    return