本文整理汇总了 Python 中 executor.Executor.doLog 方法的典型用法代码示例。如果您正苦于以下问题：Python Executor.doLog 方法的具体用法？Python Executor.doLog 怎么用？Python Executor.doLog 使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 executor.Executor 的用法示例。
在下文中一共展示了Executor.doLog方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: from executor import Executor [as 别名]
# 或者: from executor.Executor import doLog [as 别名]
#.........这里部分代码省略.........
def doWork(self, root, fileName):
    """Process one ``.xml`` file in place and bump the processed counter.

    Files whose name does not end in ``.xml`` are ignored. Otherwise the
    document is loaded (either through ``integrateParentWithData`` when
    ``dataFileExists`` reports true, or by reading and parsing the file
    directly), passed through ``semantify`` and then ``Divider.doWork``, and
    the serialized result is written back over the source file.

    Args:
        root: Directory containing the file.
        fileName: Name of the file inside ``root``.

    Returns:
        None.
    """
    if not fileName.endswith(".xml"):
        return

    srcFile = root + "/" + fileName
    # The result overwrites the source file.
    resultFilePath = srcFile

    if self.dataFileExists(fileName, srcFile):
        soup = self.integrateParentWithData(fileName, srcFile)
    else:
        # `with` guarantees the handle is released even if reading fails.
        with codecs.open(srcFile, "r", "utf-8") as xmlDataFile:
            xmlData = xmlDataFile.read()
        # NOTE(review): `html.unescape_string` looks like a project helper
        # that decodes HTML entities -- confirm against its definition.
        xmlData = html.unescape_string(xmlData)
        soup = BeautifulSoup(xmlData, "lxml")

    soup = self.semantify(soup, resultFilePath)

    # Sentence segmentation is done last.
    divider = Divider(soup, self.config_file_path)
    soup = divider.doWork()

    # Serialize BEFORE opening the file for writing: opening with "w"
    # truncates it, so a serialization failure would otherwise destroy the
    # source document.
    serialized = self.beautiful_soup_tag_to_unicode(soup)
    with codecs.open(resultFilePath, "w", "utf-8") as resultFile:
        resultFile.write(serialized)

    self.count += 1
    # Parenthesized single-argument form prints identically on Python 2 and 3.
    print("Processed: %d" % self.count)
# Work around the maximum-recursion problem when serializing deep trees.
def beautiful_soup_tag_to_unicode(self, tag):
    """Return ``tag`` serialized as text, tolerating very deep nesting.

    The normal path converts ``tag`` with the text type (``unicode`` on
    Python 2, ``str`` on Python 3). If that conversion fails with a
    "maximum recursion" ``RuntimeError``, the markup is abandoned and only
    the non-blank text nodes are kept, one per line, wrapped in ``<pre>``.

    Args:
        tag: Object to serialize; the fallback path requires it to provide
            ``findAll(text=True)``.

    Returns:
        The serialized text.

    Raises:
        RuntimeError: Any ``RuntimeError`` whose message does not start with
            "maximum recursion" is re-raised unchanged.
    """
    # `unicode` only exists on Python 2; fall back to `str` on Python 3.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    try:
        return text_type(tag)
    except RuntimeError as e:
        if not str(e).startswith("maximum recursion"):
            raise

    # Per the original note: more than ~480 levels of nested tags can hit the
    # interpreter's recursion limit. Degrade to a text-only dump.
    stripped = (text.strip() for text in tag.findAll(text=True))
    out = [text for text in stripped if text]
    return u"<pre>%s</pre>" % u"\n".join(out)
# Semantic processing
def semantify(self, soup, resultFilePath):
    """Apply URL rewriting and the configured rules to ``soup`` in place.

    Steps, in order:

    1. Build a map ``original URL -> [hash, absolute URL]`` from every
       ``<hashnode>`` element (children ``<hash>``, ``<absoluteurl>``,
       ``<originalurl>``) found in the serialized document.
    2. Remove all comment nodes.
    3. For each ``<img>`` whose ``src`` is in the map, replace ``src`` with
       the absolute URL and add a ``hash`` attribute.
    4. For each rule in ``self.rule_list``, visit every element whose tag
       name is the first space-separated token of ``rule.target``. When all
       of ``rule.condition`` hold, run ``rule.action`` and, if both
       ``rule.logLevel`` and ``rule.logMsg`` are non-empty, log through
       ``self.executor.doLog``.

    Conditions and actions are sequences whose first item names a method of
    ``Executor``; the remaining items are passed as arguments.

    NOTE(review): the indentation of this code was lost in the source this
    was recovered from. The logging step is placed once per matched element,
    after its actions -- confirm against the upstream file.

    Args:
        soup: Parsed document to modify.
        resultFilePath: Path reported in log entries.

    Returns:
        The same ``soup`` object, modified.
    """

    def element_text(parent, tag_name):
        # Text of the first <tag_name> descendant, read from its text/CDATA
        # children. Slicing toprettyxml() output at fixed offsets depends on
        # the pretty-printer's layout (which differs between Python
        # versions) and yields entity-escaped text.
        element = parent.getElementsByTagName(tag_name)[0]
        return u"".join(
            child.data
            for child in element.childNodes
            if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE)
        ).strip()

    # Map: original URL -> [hash, absolute URL]
    hashNodeRecords = {}
    try:
        markup = self.beautiful_soup_tag_to_unicode(soup).encode("utf-8")
        dom = parseString(markup)
        for hashNode in dom.getElementsByTagName("hashnode"):
            hashValue = element_text(hashNode, "hash")
            absolute = element_text(hashNode, "absoluteurl")
            origin = element_text(hashNode, "originalurl")
            hashNodeRecords[origin] = [hashValue, absolute]
    except Exception:
        # Deliberately best-effort: the document may not be well-formed XML
        # or may lack hash nodes. Records collected before the failure are
        # kept and processing continues.
        pass

    # Strip comments. find_all returns a list, so extracting while looping
    # over it is safe.
    for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Replace relative URLs with absolute ones and attach the hash attribute.
    for img_element in soup.find_all("img"):
        if not img_element.has_attr("src"):
            continue
        record = hashNodeRecords.get(img_element["src"])
        if record:
            img_element["src"] = record[1]
            img_element["hash"] = record[0]

    # Rule methods are looked up by name on the Executor class and invoked
    # with self.executor as the receiver.
    for rule in self.rule_list:
        tag_name = rule.target.split(" ")[0]
        for script_code in soup.find_all(tag_name):
            # Every element is processed unless one of the conditions fails.
            needToProcess = True
            for condition in rule.condition:
                conMethod = getattr(Executor, condition[0])
                if not conMethod(self.executor, script_code, condition[1:]):
                    needToProcess = False
                    break
            if not needToProcess:
                continue

            for act in rule.action:
                actMethod = getattr(Executor, act[0])
                if len(act) == 1:
                    actMethod(self.executor, script_code)
                else:
                    # Actions with parameters receive the full target
                    # selector followed by their own arguments.
                    args = [rule.target] + act[1:]
                    actMethod(self.executor, script_code, args)

            if rule.logLevel.strip() and rule.logMsg:
                self.executor.doLog(rule.logLevel, resultFilePath, rule.logMsg)
    return soup