本文整理汇总了Python中utils.Utils.clearHtmlTag方法的典型用法代码示例。如果您正苦于以下问题:Python Utils.clearHtmlTag方法的具体用法?Python Utils.clearHtmlTag怎么用?Python Utils.clearHtmlTag使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类utils.Utils的用法示例。
在下文中一共展示了Utils.clearHtmlTag方法的7个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: processComputerScienceData
# 需要导入模块: from utils import Utils [as 别名]
# 或者: from utils.Utils import clearHtmlTag [as 别名]
def processComputerScienceData(self, url):
# Download the page at `url`, extract entries of the textual form
# "<citations> <person> (<place>) <info>" from its lines, and write one record
# per entry to "<file_name>.tmp" via self.write_db.
# Afterwards the temp file is either handed to self.do_upgrade_db or dropped
# with self.cancel_upgrade, depending on whether the record count changed.
# NOTE(review): indentation was lost in this copy of the code; the nesting of
# the `if last_line != ''` / `else` section below cannot be recovered from
# here — confirm against the upstream file before re-indenting.
r = requests.get(url)
utils = Utils()
# NOTE(review): user_url is assigned but never read in this function.
user_url = ''
# last_line buffers a short / link-only line so it can be joined with the next one.
last_line = ''
# last_citations is the citation count of the previously written entry.
last_citations = ''
remark = 'description:'
file_name = self.get_file_name("eecs/people/computer-science-citations", self.school)
file_lines = self.countFileLineNum(file_name)
f = self.open_db(file_name + ".tmp")
self.count = 0
good_line = False
for line in r.text.split('\n'):
# Per-line state is reset on every iteration.
good_line = False
remark = 'description:'
if line.strip() == '':
continue
# Only lines containing both '<' and '>' are passed through the tag stripper.
if line.find('<') != -1 and line.find('>') != -1:
line = utils.clearHtmlTag(line).strip()
else:
line = line.strip()
# Lines shorter than 5 characters, or still containing '<a href', are not
# entries on their own: remember them and move on.
if len(line) < 5 or line.find('<a href') != -1:
last_line = line
continue
# A buffered line that starts with a digit is glued in front of the current
# line; otherwise the current line itself must start with a digit and
# contain '(' after position 0 to count as an entry.
if last_line != '':
if last_line[0 : 1].isdigit():
good_line = True
line = utils.clearHtmlTag(last_line + ' ' + line)
last_line = ''
else:
if line[0 : 1].isdigit() and line.find('(') > 0:
good_line = True
line = utils.removeDoubleSpace(line.replace('\n', ''))
if good_line == False:
continue
# Positional parse: text before the first space is the citation count, text
# up to '(' is the person, text inside '(...)' is the place, the rest is info.
citations = line[0 : line.find(" ")]
person = line[line.find(" ") + 1 : line.find("(")]
place = line[line.find('(') + 1 : line.find(')')]
info = line[line.find(')') + 2 :].strip()
#print citations
title = person
#print info
remark += citations + ' citations, ' + info
# self.count only advances when the citation count differs from the previous
# entry, so consecutive entries with equal citations get the same 'csc-<n>' id.
if citations != last_citations:
self.count += 1
last_citations = citations
# Drop anything up to and including a leftover '>' in the title.
if title.find('>') != -1:
title = title[title.find('>') + 1 :].strip()
self.write_db(f, 'csc-' + str(self.count), title, '', 'university:' + place + ' ' + remark)
self.close_db(f)
# The temp file is upgraded only when at least one id was produced and the
# count differs from the line count of the existing file.
if file_lines != self.count and self.count > 0:
self.do_upgrade_db(file_name)
print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
else:
self.cancel_upgrade(file_name)
print "no need upgrade\n"
示例2: BerkeleySpider
# 需要导入模块: from utils import Utils [as 别名]
# 或者: from utils.Utils import clearHtmlTag [as 别名]
class BerkeleySpider(Spider):
    """Spider that scrapes course listings from guide.berkeley.edu.

    doWork() walks the subject index page and calls processSubject() for
    every subject that needs refreshing; processSubject() writes one record
    per course block into the subject's ".tmp" db file.
    """

    def __init__(self):
        Spider.__init__(self)
        self.school = "berkeley"
        self.utils = Utils()

    @staticmethod
    def _firstQuoted(line):
        """Return the text between the first two double quotes in *line*."""
        start = line.find('"') + 1
        return line[start : line.find('"', start)]

    def processSubject(self, subject, url):
        """Scrape one subject page and write its courses to the db.

        Args:
            subject: subject name, used to derive the db file name.
            url: address of the subject's course listing page.

        The page is scanned line by line; title, code and link are remembered
        as they appear and written out when the course description paragraph
        is reached. The description text itself is not stored.
        """
        r = requests.get(url)
        code = ''
        title = ''
        link = ''
        file_name = self.get_file_name(subject, self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0
        for line in r.text.split('\n'):
            line = line.strip()
            if line.startswith('<span class="title">'):
                title = self.utils.clearHtmlTag(line)
            elif line.startswith('<span class="code">'):
                # NOTE(review): both replace() arguments look identical in
                # this copy; one may originally have been a non-breaking
                # space — confirm against the upstream file.
                code = self.utils.clearHtmlTag(line).replace(' ', '').replace(' ', '')
            elif line.startswith('<a href="/search/?P='):
                link = 'http://guide.berkeley.edu' + self._firstQuoted(line)
            elif line.startswith('<p class="courseblockdesc">'):
                # The description paragraph closes a course block: flush it.
                print(code + ' ' + title)
                self.count += 1
                self.write_db(f, code, title, link)
        self.close_db(f)
        # Upgrade only when something was scraped and the count changed.
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print("before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n")
        else:
            self.cancel_upgrade(file_name)
            print("no need upgrade\n")

    def doWork(self):
        """Walk the course index and process every subject that needs an update.

        'Computer Science' and 'Electrical Engineering' are always skipped.
        """
        r = requests.get('http://guide.berkeley.edu/courses/')
        for line in r.text.split('\n'):
            if not line.startswith('<li><a'):
                continue
            subject = self.utils.clearHtmlTag(line)
            # Keep only the text in front of the first '('.
            subject = subject[0 : subject.find('(')].strip()
            if subject in ('Computer Science', 'Electrical Engineering'):
                continue
            if self.need_update_subject(subject):
                print(subject)
                link = 'http://guide.berkeley.edu' + self._firstQuoted(line)
                self.processSubject(subject, link)
示例3: Milestone
# 需要导入模块: from utils import Utils [as 别名]
# 或者: from utils.Utils import clearHtmlTag [as 别名]
class Milestone(BaseExtension):
    """Extension that renders the milestone records of an entry as an HTML list.

    Records are read from 'extensions/milestone/data/<file>-milestone' and
    cached per record id in record_milestone.
    """

    # Class-level cache (record id -> list of Record), shared by all instances.
    record_milestone = {}
    html = ''

    def __init__(self):
        BaseExtension.__init__(self)
        self.utils = Utils()

    def loadMilestone(self, filename, rID):
        """Load the milestone records whose id equals *rID* into the cache.

        Lines starting with '#' are ignored. Matching records are stored in
        reverse file order. Nothing is changed when the data file is missing
        or contains no matching record.
        """
        print('loadMilestone ' + filename)
        name = 'extensions/milestone/data/' + filename + '-milestone'
        if not os.path.exists(name):
            return
        records = []
        # 'with' guarantees the file is closed; the original never closed it.
        with open(name, 'rU') as f:
            for line in f:
                if line.startswith('#'):
                    continue
                record = Record(line)
                if record.get_id().strip() == rID:
                    records.append(record)
        if records:
            # Store a real list: a bare reversed() iterator would be empty
            # after the first rendering pass.
            records.reverse()
            self.record_milestone[rID] = records

    def excute(self, form_dict):
        """Return the milestone HTML for form_dict['rID'], or 'not found'."""
        fileName = form_dict['fileName'].encode('utf8')
        rID = form_dict['rID'].encode('utf8')
        self.loadMilestone(self.formatFileName(fileName), rID)
        if rID in self.record_milestone:
            return self.genMilestoneHtml(rID, form_dict['divID'].encode('utf8'))
        return 'not found'

    def check(self, form_dict):
        """Return True when milestone records exist for form_dict['rID']."""
        fileName = form_dict['fileName'].encode('utf8')
        rID = form_dict['rID'].encode('utf8')
        self.loadMilestone(self.formatFileName(fileName), rID)
        return rID in self.record_milestone

    def genMilestoneHtml(self, rID, ref_divID):
        """Alias of genMetadataHtml."""
        return self.genMetadataHtml(rID, ref_divID)

    def genMetadataHtml(self, key, ref_divID):
        """Render the cached records of *key* as an ordered HTML list.

        Each record title is split at its first space: the leading token is
        shown as the item label, the remainder as the item text. Returns ''
        when nothing is cached for *key*.
        """
        if key not in self.record_milestone:
            return ''
        self.html = '<div class="ref"><ol>'
        for count, r in enumerate(self.record_milestone[key], 1):
            # The div id grows with every item ('-1', '-1-2', ...), exactly
            # as in the original loop.
            ref_divID += '-' + str(count)
            linkID = 'a-' + ref_divID[ref_divID.find('-') + 1 :]
            title = r.get_title().strip()
            script = self.utils.genMoreEnginScript(linkID, ref_divID, "loop-" + key.replace(' ', '-') + '-' + str(count), self.utils.clearHtmlTag(title), r.get_url().strip(), '-')
            space = title.find(' ')
            label = title[0 : space].strip()
            text = title[space + 1 :].strip()
            self.html += '<li><span>' + label + '.</span>'
            if len(label) > 4:
                self.html += '<br>'
            # Only short texts get the smart link plus the "more" widget.
            if script != "" and len(text) < 20:
                self.html += '<p>' + self.utils.toSmartLink(text)
                self.html += self.utils.genMoreEnginHtml(linkID, script.replace("'", '"'), '...', ref_divID, '', False)
            else:
                self.html += '<p>' + text
            self.html += '</p></li>'
        return self.html + "</ol></div>"
示例4: Reference
# 需要导入模块: from utils import Utils [as 别名]
# 或者: from utils.Utils import clearHtmlTag [as 别名]
#.........这里部分代码省略.........
obj = sp.find('span', class_='stat view-count')
views = ''
font_size = 0
if obj != None:
views = sp.find('span', class_='stat view-count').text.strip().strip()
views = views[0 : views.find(' ')]
font_size = len(views.replace(',', ''))
if font_size - 2 > 0:
font_size -= 2
title += '<font size="' + str(font_size) + '" color="rgb(212, 51, 51)">' + views + '</font> views'
return self.utils.removeDoubleSpace(title)
def check(self, form_dict):
    """Tell whether the reference extension applies to this request.

    Args:
        form_dict: request fields; 'fileName', 'rID' and 'url' are read.

    Returns:
        False when the url is empty. Otherwise True when references are
        loaded for the record id, the file name contains 'papers', or the
        url starts with 'http'; False in every other case.
    """
    fileName = form_dict['fileName'].encode('utf8')
    rID = form_dict['rID'].encode('utf8')
    url = form_dict['url'].encode('utf8')
    if url == '':
        return False
    self.loadReference(self.formatFileName(fileName), rID)
    # The url is known to be non-empty here, so the original's extra
    # "form_dict['url'] != ''" guard was redundant and has been dropped.
    return (rID in self.record_reference
            or 'papers' in fileName
            or form_dict['url'].startswith('http'))
def genReferenceHtml2(self, alist, divid, defaultLinks, rID):
    """Render *alist* as reference HTML; thin alias of genMetadataHtml2."""
    rendered = self.genMetadataHtml2(alist, divid, defaultLinks, rID)
    return rendered
def genMetadataHtml2(self, alist, ref_divID, defaultLinks, rID):
# Render the (title, url) pairs in `alist` as an ordered HTML list and return
# the markup; the partial result is also kept in self.html.
# Items for which self.passItem(title, url) is true are left out and do not
# consume a number. `defaultLinks` is not read by the code below.
# NOTE(review): indentation was lost in this copy; in particular it is unclear
# whether `appendID = appendID.replace('.','R')` belongs to the
# `len(appendID) >= 5` test or to the enclosing `startswith('loop')` branch —
# confirm against the upstream file before re-indenting.
self.html = '<div class="ref"><ol>'
count = 0
for r in alist:
if self.passItem(r[0], r[1]):
continue
count += 1
# The div id is extended on every item ('-1', '-1-2', ...).
ref_divID += '-' + str(count)
linkID = 'a-' + ref_divID[ref_divID.find('-') + 1 :]
appendID = str(count)
# For nested ("loop...") ids the label is built from the part of rID after
# its last '-', with 'R' shown as '.', followed by the running count.
if rID.startswith('loop'):
appendID = rID[rID.rfind('-') + 1 :].replace('R', '.') + '.' + str(count)
self.html += '<li><span>' + appendID + '.</span>'
if len(appendID) >= 5:
self.html += '<br/>'
# Dots are turned back into 'R' before appendID is embedded in the script id.
appendID = appendID.replace('.','R')
else:
self.html += '<li><span>' + str(count) + '.</span>'
script = self.utils.genMoreEnginScript(linkID, ref_divID, "loop-" + rID.replace(' ', '-') + '-' + str(appendID), r[0], r[1], '-', hidenEnginSection=Config.reference_hiden_engin_section)
# Items with a url become an enhanced link; items without one a smart link.
if r[1] != '':
self.html += '<p>' + self.utils.enhancedLink(r[1], self.utils.formatTitle(r[0], Config.reference_smart_link_br_len), module='reference', rid=rID, library=self.form_dict['originFileName'])
else:
self.html += '<p>' + self.utils.toSmartLink(r[0], Config.reference_smart_link_br_len, module='reference', rid=rID, library=self.form_dict['originFileName'])
#self.html += self.utils.getDefaultEnginHtml(r[0], defaultLinks)
if script != "":
self.html += self.utils.genMoreEnginHtml(linkID, script.replace("'", '"'), '...', ref_divID, '', False);
#title = a.text.strip()
self.html += '</p></li>'
return self.html + "</ol></div>"
def genReferenceHtml(self, rID, ref_divID):
    """Render the references stored for *rID*; thin alias of genMetadataHtml."""
    rendered = self.genMetadataHtml(rID, ref_divID)
    return rendered
def genMetadataHtml(self, key, ref_divID):
    """Render the reference records stored under *key* as an ordered HTML list.

    Args:
        key: record id to look up in self.record_reference.
        ref_divID: base id for the per-item div; it is extended with
            '-<n>' for every item.

    Returns:
        The HTML markup, or '' when nothing is stored for *key*. The
        partial markup is also kept in self.html.
    """
    if key not in self.record_reference:
        return ''
    # Single-column layouts omit the leading <br>.
    if self.form_dict['column'] == '1':
        self.html = '<div class="ref"><ol>'
    else:
        self.html = '<div class="ref"><br><ol>'
    for count, r in enumerate(self.record_reference[key], 1):
        ref_divID += '-' + str(count)
        linkID = 'a-' + ref_divID[ref_divID.find('-') + 1 :]
        script = self.utils.genMoreEnginScript(linkID, ref_divID, "loop-" + key.replace(' ', '-') + '-' + str(count), self.utils.clearHtmlTag(r.get_title().strip()), r.get_url().strip(), '-', hidenEnginSection=Config.reference_hiden_engin_section)
        self.html += '<li><span>' + str(count) + '.</span>'
        self.html += '<p>' + self.genMetadataLink(r.get_title().strip(), r.get_url().strip(), rID=key)
        if script != "":
            self.html += self.utils.genMoreEnginHtml(linkID, script.replace("'", '"'), '...', ref_divID, '', False)
        self.html += '</p></li>'
    return self.html + "</ol></div>"
def genMetadataLink(self, title, url, rID=''):
    """Build the link markup for one reference entry.

    A url containing '[' is taken as a bracketed record filter: the record
    is looked up and the url is replaced by a local query url built from
    its path. The result is then passed to genMetadataLinkEx.
    """
    if '[' in url:
        ft = url.replace('[', '').replace(']', '').strip()
        record = self.utils.getRecord(ft, '', '', False, False)
        # The key is the part of the path that follows "<default_subject>/".
        key_start = record.get_path().find(default_subject) + len(default_subject) + 1
        key = record.get_path()[key_start:]
        url = ('http://' + Config.ip_adress
               + '?db=' + default_subject
               + '/&key=' + key
               + '&filter=' + ft
               + '&desc=true')
    return self.genMetadataLinkEx(title, url, rID=rID)
def genMetadataLinkEx(self, title, url, rID=''):
    """Return *title* turned into a link to *url*.

    A title that already carries a bare '<a>' placeholder gets target and
    href filled in; any other title is shortened with formatTitle and
    wrapped by utils.enhancedLink.
    """
    if '<a>' in title:
        return title.replace('<a>', '<a target="_blank" href="' + url + '">')
    shortened = self.utils.formatTitle(title, Config.reference_smart_link_br_len)
    return self.utils.enhancedLink(url, shortened, module='reference', rid=rID, library=self.form_dict['originFileName'])
示例5: processStudentCatalog
# 需要导入模块: from utils import Utils [as 别名]
# 或者: from utils.Utils import clearHtmlTag [as 别名]
def processStudentCatalog(self, html, f, course_code):
# Parse one catalog page (`html`) line by line and write a record per course
# to the open db handle `f` via self.write_db. A course header is a line whose
# text before the first '.' equals `course_code`; other matching lines are
# collected as the description of the current course.
# NOTE(review): indentation was lost in this copy and the if/else pairing of
# the header/description section cannot be recovered from here — confirm
# against the upstream file before re-indenting.
# NOTE(review): `remark` is never initialised before the loop; if the
# `remark.find('instructors:')` test or the final write_db runs before any
# assignment to it, that would raise UnboundLocalError — confirm.
#print html
soup = BeautifulSoup(html);
links_all = soup.find_all("a")
course_links = []
utils = Utils()
# Keep only anchors whose href does not start with one of the listed prefixes;
# they are later handed to self.getMitCourseLink.
for link in links_all:
if link.attrs.has_key("href") and False == link["href"].startswith("editcookie.cgi") \
and False == link["href"].startswith("/ent/cgi-bin") and False == link["href"].startswith("javascript:") \
and False == link["href"].startswith("m"):
course_links.append(link)
# State of the course currently being assembled. Note that `link` is reused
# here after having served as the loop variable above.
course_num = ""
title = ""
link = ""
textbook = ''
prereq = ''
instructors = ''
for line in html.split("\n"):
# Lines carrying prerequisites, or '<br>' lines whose stripped text has '.'
# as its second character (taken as an instructor line).
if (line.strip().startswith('<br>') and utils.clearHtmlTag(line.strip())[1 : 2] == '.') or \
line.find('Prereq:') != -1:
if line.find('Prereq:') != -1:
# De-duplicate the course numbers found by self.course_num_regex.
all_prereq = self.course_num_regex.findall(line.lower())
all_prereq = list(set(all_prereq))
for p in all_prereq:
prereq += p + ' '
if len(all_prereq) > 0:
prereq = 'prereq:' + prereq
#print course_num + '---->' + prereq
if line.strip().startswith('<') and utils.clearHtmlTag(line.strip())[1 : 2] == '.':
instructors = 'instructors:' + utils.clearHtmlTag(line.strip()[0 : line.strip().find('</')]) + ' '
# Header lines ('<h3>') or '<br>' lines ending in '.' or 'limited'.
if line.strip().find('<h3>') != -1 or \
(line.strip().startswith('<br>') and (line.strip()[len(line.strip()) - 1 : ] == '.' or line.strip()[len(line.strip()) - 7 : ] == 'limited')):
# Cut everything up to the first '>' found at or after index 3.
line = line[line.find('>', 3) + 1 : ]
if line.find('</h3>') == -1:
#print line
if line[0 : line.find('.')] == course_code:
# A new course header: flush the previous course first, then reset state.
if course_num != '':
print course_num + " " + title + " " + link
if instructors != '' and remark.find('instructors:') == -1:
remark = instructors + ' ' + remark
self.count += 1
self.write_db(f, course_num, title, link, remark)
remark = ''
course_num = ""
title = ""
link = ""
textbook = ''
prereq = ''
instructors = ''
# Text before the first space is the course number, the rest the title.
course_num = line.strip()[0 : line.strip().find(" ")]
textbook = ''
# Textbook lookups only happen when self.deep_mind is set.
if self.deep_mind:
textbook = self.getTextBook(course_num)
if textbook == '' and self.deep_mind and self.ocw_links.get(course_num, '') != '':
textbook = self.ocw_spider.getTextBook(self.ocw_links[course_num], course_num)
title = line.strip()[line.strip().find(" ") + 1 : ]
# A ',' in the number means it spans two tokens: split at the second space.
if course_num.find(',') != -1:
course_num = line.strip()[0 : line.strip().find(" ", line.strip().find(" ") + 1)]
title = line.strip()[line.strip().find(" ", line.strip().find(" ") + 1) + 1 : ]
link = self.getMitCourseLink(course_links, course_num.strip())
else:
# Description line: rebuild remark from the optional OCW text, textbook,
# prerequisites and the line itself.
remark = ''
if self.deep_mind and self.ocw_links.get(course_num, '') != '':
remark = self.ocw_spider.getDescription(self.ocw_spider.getDescriptionApiUrl(self.ocw_links[course_num]))
if remark.find('description:') != -1:
remark = remark[0 : remark.find('description:')]
if textbook != '':
remark += textbook
if prereq != '':
remark += prereq
remark += 'description:' + line.strip() + ' '
# Flush the last course once the input is exhausted.
if course_num != '':
self.count = self.count + 1
self.write_db(f, course_num, title, link, remark)
示例6: Content
# 需要导入模块: from utils import Utils [as 别名]
# 或者: from utils.Utils import clearHtmlTag [as 别名]
#.........这里部分代码省略.........
if contentref != None:
return contentref.strip()
return ''
def write(self, html):
    """Dump *html* to 'temp/test.html', overwriting any previous content.

    Args:
        html: a string, or any iterable of strings, written in order.
    """
    # 'with' closes the file; the original only referenced f.close without
    # calling it, so the handle was never explicitly closed.
    with open('temp/test.html', 'w') as f:
        f.writelines(html)
def genContentHtml(self, key, content_divID, defaultLinks, library):
    """Render the content list of *key*; thin alias of genMetadataHtml."""
    rendered = self.genMetadataHtml(key, content_divID, defaultLinks, library)
    return rendered
def genMetadataHtml(self, key, content_divID, defaultLinks, library):
# Render the content records stored under `key` as an ordered HTML list and
# return the markup, or '' when self.record_content has no entry for `key`.
# `defaultLinks` is not read by the code below.
# NOTE(review): indentation was lost in this copy; the nesting of the
# format_index section and of the link-rendering section is not recoverable
# from here — confirm against the upstream file before re-indenting.
html = '<div class="ref"><ol>'
# The three-column layout with more than 10 extensions gets a leading <br>.
if self.form_dict['column'] == '3' and int(self.form_dict['extension_count']) > 10:
html = '<div class="ref"><br><ol>'
count = 0
#print 'key:' + key
#print self.datafile_content
# Choose the table that knows `key`; datafile_content wins over optional_content.
# NOTE(review): when neither table has `key`, self.record_content keeps
# whatever value it had before this call — confirm that is intended.
if self.datafile_content.has_key(key):
self.record_content = self.datafile_content
elif self.optional_content.has_key(key):
self.record_content = self.optional_content
if self.record_content.has_key(key):
#print key
for r in self.record_content[key]:
count += 1
# format_index is the label shown in front of the item.
format_index = ''
pRecord = None
pid = r.get_parentid().strip()
# Child of a known parent (and `key` contains '-'): plain count when
# self.data_type occurs only once in the div id, otherwise "<parent suffix>.<count>".
if self.record_content.has_key(pid) and key.find('-') != -1:
pRecord = self.record_content[pid]
if content_divID.find(self.data_type) == content_divID.rfind(self.data_type):
format_index = str(count)
else:
format_index = pid[pid.rfind('-') + 1 :] + '.' + str(count)
# Otherwise use the part of the record id after its last '-'.
elif r.get_id().find('-') != -1:
format_index = r.get_id()[r.get_id().rfind('-') + 1 : ].strip()
while format_index.find('-') != -1:
format_index = format_index[format_index.find('-') + 1 :]
html += '<li><span>' + format_index + '</span>'
# Labels longer than 5 characters get a list item of their own.
if len(format_index) > 5:
html += '</li><br/><li>'
# The div id is extended on every item ('-1', '-1-2', ...).
content_divID += '-' + str(count)
linkID = 'a-' + content_divID[content_divID.find('-') + 1 :]
# Spaces are percent-encoded in the title that is embedded in the script.
title = r.get_title().strip().replace(' ', '%20')
desc = r.get_describe().strip()
script = self.utils.genMoreEnginScript(linkID, content_divID, r.get_id().strip(), self.utils.clearHtmlTag(title), r.get_url().strip(), '-', hidenEnginSection=Config.content_hiden_engin_section)
descHtml = ''
if desc != '':
descHtml = self.utils.genDescHtml(desc, Config.course_name_len, self.tag.tag_list, iconKeyword=True, fontScala=1, module='history')
moreHtml = self.utils.genMoreEnginHtml(linkID, script.replace("'", '"'), '...', content_divID, '', False, descHtml=descHtml);
# Records that have an entry of their own in record_content, or no url, take
# the first branch; all remaining records with a url take the second.
if self.record_content.has_key(r.get_id().strip()) or r.get_url().strip() == '':
if r.get_url().strip() != '':
html += '<p>' + self.genMetadataLink(r.get_title().strip(), r.get_url().strip())
else:
html += '<p>' + self.utils.toSmartLink(r.get_title().strip(), 45, module='content', rid=self.form_dict['rID'], library=library)
#html += self.utils.getDefaultEnginHtml(title, defaultLinks)
if moreHtml != "":
html += moreHtml
html += '</p>'
elif r.get_url().strip() != '':
html += '<p>' + self.genMetadataLink(r.get_title().strip(), r.get_url().strip()) + moreHtml + '</p>'
html += '</li>'
else:
return ''
html += "</ol></div>"
return html
def genMetadataLink(self, title, url):
    """Build the link markup for one content entry.

    A url containing '[' is taken as a bracketed record filter: the record
    is looked up and the url is replaced by a local query url built from
    its path. The result is then passed to genMetadataLinkEx.
    """
    if '[' in url:
        ft = url.replace('[', '').replace(']', '').strip()
        record = self.utils.getRecord(ft, '', '', False, False)
        # The key is the part of the path that follows "<default_subject>/".
        key_start = record.get_path().find(default_subject) + len(default_subject) + 1
        key = record.get_path()[key_start:]
        url = ('http://' + Config.ip_adress
               + '?db=' + default_subject
               + '/&key=' + key
               + '&filter=' + ft
               + '&desc=true')
    return self.genMetadataLinkEx(title, url)
def genMetadataLinkEx(self, title, url):
    """Return *title* turned into a link to *url*.

    A title that already carries a bare '<a>' placeholder gets target and
    href filled in; any other title is shortened with formatTitle and
    wrapped by utils.enhancedLink.
    """
    if '<a>' in title:
        return title.replace('<a>', '<a target="_blank" href="' + url + '">')
    shortened = self.utils.formatTitle(title, 45)
    return self.utils.enhancedLink(url, shortened, module='content', rid=self.form_dict['rID'], library=self.form_dict['originFileName'])
示例7: HarvardSpider
# 需要导入模块: from utils import Utils [as 别名]
# 或者: from utils.Utils import clearHtmlTag [as 别名]
#.........这里部分代码省略.........
if self.count == 0:
print subject + " can not get the data, check the html and python code"
self.close_db(f)
if file_lines != self.count and self.count > 0:
self.do_upgrade_db(file_name)
print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
else:
self.cancel_upgrade(file_name)
print "no need upgrade\n"
def doWork(self):
# Fetch the registrar's courses-of-instruction index page and call
# self.getHarvardCourse(<link text>, self.url + <href>) for the anchor inside
# every <span class="field-content">.
# NOTE(review): a lone ''' follows this method and a second doWork is defined
# further down in the same excerpt; this version may be disabled legacy code —
# confirm in the full file.
print "downloading harvard course info"
r = requests.get("http://www.registrar.fas.harvard.edu/courses-exams/courses-instruction")
soup = BeautifulSoup(r.text)
for span in soup.find_all("span", class_="field-content"):
#print span.a.string
self.getHarvardCourse(span.a.string, self.url + str(span.a["href"]))
'''
def getUrlByYear(self, year):
    """Return the url-encoded tail of the course-search request for *year*.

    The returned text still contains a literal '%s' placeholder for the
    department code. Years outside 2016-2019 yield ''.
    """
    head = '%2C%22PageSize%22%3A%22%22%2C%22SortOrder%22%3A%5B%22IS_SCL_SUBJ_CAT%22%2C%22IS_SCL_SUBJ_CAT%22%5D%2C%22Facets%22%3A%5B%5D%2C%22Category%22%3A%22HU_SCL_SCHEDULED_BRACKETED_COURSES%22%2C%22SearchPropertiesInResults%22%3Atrue%2C%22FacetsInResults%22%3Atrue%2C%22SaveRecent%22%3Afalse%2C%22TopN%22%3A%22%22%2C%22SearchText%22%3A%22(STRM%3A'
    joiner = '%20%7C%20STRM%3A'
    tail = ')%20(ACAD_ORG%3A%5C%22%s%5C%22)%22%2C%22DeepLink%22%3Afalse%7D'
    # (year, first STRM code, second STRM code)
    # NOTE(review): 2019 repeats the 2016 codes; this looks like a copy/paste
    # slip but is reproduced unchanged — confirm the intended codes.
    term_codes = (
        (2016, '2166', '2168'),
        (2017, '2172', '2178'),
        (2018, '2182', '2188'),
        (2019, '2166', '2168'),
    )
    for known_year, first, second in term_codes:
        if year == known_year:
            return head + first + joiner + second + tail
    return ''
def doWork(self):
    """Download the current year's Harvard courses, department by department.

    For every (code, name) pair in self.dept_dict that needs an update, the
    paged search endpoint is queried until the last page is reached, and
    one record per distinct course id is written to the department's
    ".tmp" db file. The temp file is then upgraded or discarded depending
    on whether the record count changed.
    """
    url_part1 = "https://courses.my.harvard.edu/psc/courses/EMPLOYEE/EMPL/s/WEBLIB_IS_SCL.ISCRIPT1.FieldFormula.IScript_Search?SearchReqJSON={%22PageNumber%22%3A"
    year = int(time.strftime('%Y', time.localtime(time.time())))
    url_part2 = self.getUrlByYear(year)
    if url_part2 == '':
        # No search template for this year: stop here instead of sending a
        # malformed request for every department.
        print('no search url for year ' + str(year))
        return
    for k, v in self.dept_dict.items():
        if self.need_update_subject(v) == False:
            continue
        file_name = self.get_file_name(v, self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0
        print('processing ' + v)
        page = 0
        seen_course_ids = set()
        while True:
            page += 1
            url = url_part1 + str(page) + url_part2.replace('%s', k)
            print(url)
            r = requests.get(url)
            jobj = json.loads(r.text)
            search_ok = False
            for obj in jobj[0]['ResultsCollection']:
                search_ok = True
                dept = obj['IS_SCL_DESCR_IS_SCL_DESCRH' if False else 'IS_SCL_DESCR_IS_SCL_DESCRD'].strip().lower()
                if dept != v.strip().lower():
                    print(dept + ' not match ' + v.strip().lower())
                    continue
                # This field is optional in the response (the original
                # guarded it once and then indexed it unguarded).
                offered = obj.get('IS_SCL_DESCR_IS_SCL_DESCRH', '')
                if offered == 'Not Offered':
                    continue
                # The same course can show up on several pages/sections.
                if obj['CRSE_ID'] in seen_course_ids:
                    continue
                seen_course_ids.add(obj['CRSE_ID'])
                title = obj['Title']
                term = 'term:' + offered if offered != '' else ''
                print(title + ' ' + offered)
                instructors = "instructors:"
                if 'DESCRLONG_DETAILS' in obj:
                    # join() leaves the bare label for an empty list, so the
                    # emptiness check below catches it; the original slice
                    # turned it into 'instructor'.
                    instructors += ', '.join(author['Name'] for author in obj['DESCRLONG_DETAILS'])
                elif 'IS_SCL_DESCR_IS_SCL_DESCRL' in obj:
                    instructors += obj['IS_SCL_DESCR_IS_SCL_DESCRL']
                if instructors.endswith(':'):
                    instructors = ''
                desc = self.utils.clearHtmlTag(obj['Description'].strip()).strip()
                description = 'description:' + desc if desc != '' else ''
                self.count += 1
                # NOTE(review): 'harford-' looks like a typo for 'harvard-',
                # but it is part of stored record ids and is kept unchanged.
                self.write_db(f, 'harford-' + k + '-' + obj['CRSE_ID'], title, 'https://courses.my.harvard.edu/psp/courses/EMPLOYEE/EMPL/h/?tab=HU_CLASS_SEARCH&SearchReqJSON=%7B%22SearchText%22%3A%22%s%22%7D'.replace('%s', obj['CRSE_ID']), term + ' ' + instructors + ' ' + description)
            # Stop on the last page, or when a page came back empty.
            if jobj[2]['PageNumber'] == jobj[2]['TotalPages'] or not search_ok:
                break
        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print("before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n")
        else:
            self.cancel_upgrade(file_name)
            print("no need upgrade\n")