当前位置: 首页>>代码示例>>Python>>正文


Python Utils.clearHtmlTag方法代码示例

本文整理汇总了 Python 中 utils.Utils.clearHtmlTag 方法的典型用法代码示例。如果您正苦于以下问题:Python Utils.clearHtmlTag 方法的具体用法是什么?Python Utils.clearHtmlTag 怎么用?有哪些 Python Utils.clearHtmlTag 的使用例子?那么这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 utils.Utils 的用法示例。


在下文中一共展示了 Utils.clearHtmlTag 方法的 7 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Python 代码示例。

示例1: processComputerScienceData

# 需要导入模块: from utils import Utils [as 别名]
# 或者: from utils.Utils import clearHtmlTag [as 别名]
    def processComputerScienceData(self, url):
        """Download a citation listing and rebuild the citations db file.

        The page at ``url`` is fetched and scanned line by line. A usable
        entry looks like ``<citations> <person> (<place>) <info>``; it may
        arrive on one line, or be split so that a short fragment starting
        with a digit is followed by the rest on the next line.

        Every usable entry is written to ``<file_name>.tmp`` through
        ``self.write_db`` with the id ``csc-<n>``. ``n`` (``self.count``)
        only advances when the citation figure differs from the previous
        entry, so consecutive entries with equal citations share an id.

        After the scan the temp file replaces the current one
        (``self.do_upgrade_db``) when at least one entry was counted and
        the count differs from the old file's line count; otherwise the
        temp file is discarded (``self.cancel_upgrade``).

        Args:
            url: address of the page to scrape.
        """
        r = requests.get(url)
        utils = Utils()
        last_line = ''
        last_citations = ''
        file_name = self.get_file_name("eecs/people/computer-science-citations", self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0

        for line in r.text.split('\n'):
            good_line = False
            remark = 'description:'
            if line.strip() == '':
                continue
            if line.find('<') != -1 and line.find('>') != -1:
                line = utils.clearHtmlTag(line).strip()
            else:
                line = line.strip()
            # Very short lines and raw anchors are remembered, not parsed:
            # they may be the first half of an entry split over two lines.
            if len(line) < 5 or line.find('<a href') != -1:
                last_line = line
                continue
            if last_line != '':
                # Join the remembered fragment with this line when the
                # fragment starts with a digit (the citation figure).
                if last_line[0 : 1].isdigit():
                    good_line = True
                    line = utils.clearHtmlTag(last_line + ' ' + line)
                last_line = ''
            else:
                if line[0 : 1].isdigit() and line.find('(') > 0:
                    good_line = True
                    line = utils.removeDoubleSpace(line.replace('\n', ''))

            if not good_line:
                continue

            first_space = line.find(" ")
            open_paren = line.find('(')
            close_paren = line.find(')')
            citations = line[0 : first_space]
            title = line[first_space + 1 : open_paren]
            place = line[open_paren + 1 : close_paren]
            # "+ 2" skips the closing parenthesis and the character after it.
            info = line[close_paren + 2 :].strip()
            remark += citations + ' citations, ' + info
            if citations != last_citations:
                self.count += 1
            last_citations = citations
            # Drop anything left of a stray '>' that survived tag removal.
            if title.find('>') != -1:
                title = title[title.find('>') + 1 :].strip()
            self.write_db(f, 'csc-' + str(self.count), title, '', 'university:' + place + ' ' + remark)

        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print("before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n")
        else:
            self.cancel_upgrade(file_name)
            print("no need upgrade\n")
开发者ID:wowdd1,项目名称:xlinkBook,代码行数:61,代码来源:update_rank.py

示例2: BerkeleySpider

# 需要导入模块: from utils import Utils [as 别名]
# 或者: from utils.Utils import clearHtmlTag [as 别名]
class BerkeleySpider(Spider):
    """Spider that collects course listings from guide.berkeley.edu."""

    def __init__(self):
        Spider.__init__(self)
        self.school = "berkeley"
        self.utils = Utils()

    def processSubject(self, subject , url):
        """Scrape one subject page and rebuild that subject's db file.

        The page is read line by line. Title, code and link are remembered
        as they appear; a record is written each time a course description
        paragraph is reached, using whatever values were seen last.

        Args:
            subject: subject name, used to derive the db file name.
            url: address of the subject's course listing.
        """
        response = requests.get(url)
        code = ''
        title = ''
        link = ''
        file_name = self.get_file_name(subject, self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0

        for raw_line in response.text.split('\n'):
            line = raw_line.strip()
            if line.startswith('<span class="title">'):
                title = self.utils.clearHtmlTag(line)
                continue
            if line.startswith('<span class="code">'):
                cleaned = self.utils.clearHtmlTag(line)
                code = cleaned.replace('&#160;', '').replace(' ', '')
                continue
            if line.startswith('<a href="/search/?P='):
                # The href value is the text between the first two quotes.
                first_quote = line.find('"')
                second_quote = line.find('"', first_quote + 1)
                link = 'http://guide.berkeley.edu' + line[first_quote + 1 : second_quote]
                continue
            if line.startswith('<p class="courseblockdesc">'):
                print(code + ' ' + title)
                self.count += 1
                self.write_db(f, code, title, link)

        self.close_db(f)
        changed = file_lines != self.count and self.count > 0
        if changed:
            self.do_upgrade_db(file_name)
            print("before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n")
        else:
            self.cancel_upgrade(file_name)
            print("no need upgrade\n")

    def doWork(self):
        """Walk the course index and process every subject that needs it.

        'Computer Science' and 'Electrical Engineering' are always skipped;
        other subjects are processed when ``self.need_update_subject``
        says so.
        """
        index_page = requests.get('http://guide.berkeley.edu/courses/')
        skipped_subjects = ('Computer Science', 'Electrical Engineering')
        for line in index_page.text.split('\n'):
            if not line.startswith('<li><a'):
                continue
            subject = self.utils.clearHtmlTag(line)
            # Keep only the text in front of the first '('.
            subject = subject[0 : subject.find('(')].strip()
            if subject in skipped_subjects:
                continue
            if not self.need_update_subject(subject):
                continue
            print(subject)
            first_quote = line.find('"')
            second_quote = line.find('"', first_quote + 1)
            link = 'http://guide.berkeley.edu' + line[first_quote + 1 : second_quote]
            self.processSubject(subject, link)
开发者ID:wowdd1,项目名称:xlinkBook,代码行数:54,代码来源:update_berkeley.py

示例3: Milestone

# 需要导入模块: from utils import Utils [as 别名]
# 或者: from utils.Utils import clearHtmlTag [as 别名]
class Milestone(BaseExtension):
    """Extension that renders the milestone list stored for a record id."""

    # Class-level cache shared by all instances: record id -> list of
    # Record objects, in reverse file order.
    record_milestone = {}
    html = ''

    def __init__(self):
        BaseExtension.__init__(self)
        self.utils = Utils()

    def loadMilestone(self, filename, rID):
        """Load the milestone records of ``rID`` into ``record_milestone``.

        Reads ``extensions/milestone/data/<filename>-milestone`` if it
        exists, skips lines starting with '#', and keeps the records whose
        stripped id equals ``rID``. When at least one record matches, the
        cache entry for ``rID`` is replaced by the matches in reverse file
        order; otherwise the cache is left untouched.

        Args:
            filename: base name of the data file (without the suffix).
            rID: record id to collect milestones for.
        """
        print('loadMilestone ' + filename)
        name = 'extensions/milestone/data/' + filename + '-milestone'
        matches = []
        if os.path.exists(name):
            # The context manager closes the handle; it used to be leaked.
            with open(name, 'rU') as f:
                for line in f:
                    if line.startswith('#'):
                        continue
                    record = Record(line)
                    if record.get_id().strip() != rID:
                        continue
                    matches.append(record)

        if len(matches) > 0:
            # Store a real list: a bare reversed() iterator would be
            # exhausted after the first render of this key.
            self.record_milestone[rID] = matches[::-1]

    def excute(self, form_dict):
        """Return the milestone HTML for the request, or 'not found'.

        Args:
            form_dict: request fields; reads 'fileName', 'rID' and 'divID'.
        """
        fileName = form_dict['fileName'].encode('utf8')
        rID = form_dict['rID'].encode('utf8')

        self.loadMilestone(self.formatFileName(fileName), rID)
        if rID in self.record_milestone:
            return self.genMilestoneHtml(rID, form_dict['divID'].encode('utf8'))

        return 'not found'

    def check(self, form_dict):
        """Return True when milestones are cached for the requested id.

        Args:
            form_dict: request fields; reads 'fileName' and 'rID'.
        """
        fileName = form_dict['fileName'].encode('utf8')
        rID = form_dict['rID'].encode('utf8')
        self.loadMilestone(self.formatFileName(fileName), rID)
        return rID in self.record_milestone

    def genMilestoneHtml(self, rID, ref_divID):
        """Render the milestone list for ``rID`` (see genMetadataHtml)."""
        return self.genMetadataHtml(rID, ref_divID)

    def genMetadataHtml(self, key, ref_divID):
        """Build the ordered-list HTML for the milestones cached under ``key``.

        Each record title is split at its first space: the part before it
        is shown as the item label, the rest as the item text. Short texts
        (under 20 characters) with a non-empty engine script are rendered
        through ``utils.toSmartLink`` plus a "more" widget.

        Args:
            key: record id to render.
            ref_divID: base id used to derive per-item element ids.

        Returns:
            The HTML string, or '' when nothing is cached for ``key``.
        """
        if key not in self.record_milestone:
            return ''

        self.html = '<div class="ref"><ol>'
        count = 0
        for r in self.record_milestone[key]:
            count += 1
            # NOTE(review): the suffix accumulates across items
            # ("x-1", "x-1-2", ...); kept as-is since ids stay unique.
            ref_divID += '-' + str(count)
            linkID = 'a-' + ref_divID[ref_divID.find('-') + 1 :]
            appendID = str(count)
            script = self.utils.genMoreEnginScript(linkID, ref_divID, "loop-" + key.replace(' ', '-') + '-' + str(appendID), self.utils.clearHtmlTag(r.get_title().strip()), r.get_url().strip(), '-')

            title = r.get_title().strip()
            first_space = title.find(' ')
            item_label = title[0 : first_space].strip()
            item_text = title[first_space + 1 :].strip()
            self.html += '<li><span>' + item_label + '.</span>'
            if len(item_label) > 4:
                self.html += '<br>'

            if script != "" and len(item_text) < 20:
                self.html += '<p>' + self.utils.toSmartLink(item_text)
                self.html += self.utils.genMoreEnginHtml(linkID, script.replace("'", '"'), '...', ref_divID, '', False)
            else:
                self.html += '<p>' + item_text
            self.html += '</p></li>'
        return self.html + "</ol></div>"
开发者ID:wowdd1,项目名称:xlinkBook,代码行数:93,代码来源:milestone.py

示例4: Reference

# 需要导入模块: from utils import Utils [as 别名]
# 或者: from utils.Utils import clearHtmlTag [as 别名]

#.........这里部分代码省略.........
        obj = sp.find('span', class_='stat view-count')
        views = ''
        font_size = 0
        if obj != None:
            views = sp.find('span', class_='stat view-count').text.strip().strip()
            views = views[0 : views.find(' ')]
            font_size = len(views.replace(',', ''))
        if font_size - 2 > 0:
	        font_size -= 2
        title += '<font size="' + str(font_size) + '" color="rgb(212, 51, 51)">' + views + '</font> views'
        return self.utils.removeDoubleSpace(title)

    def check(self, form_dict):
        """Tell whether the reference extension applies to this request.

        An empty 'url' field rejects the request outright. Otherwise the
        references are loaded and the request is accepted when references
        exist for the id, when the file name contains 'papers', or when
        the url starts with 'http'.

        Args:
            form_dict: request fields; reads 'fileName', 'rID' and 'url'.
        """
        fileName = form_dict['fileName'].encode('utf8')
        rID = form_dict['rID'].encode('utf8')
        url = form_dict['url'].encode('utf8')
        if url == '':
            return False
        self.loadReference(self.formatFileName(fileName), rID)
        if self.record_reference.has_key(rID):
            return True
        if fileName.find('papers') != -1:
            return True
        if form_dict['url'] != '' and form_dict['url'].startswith('http'):
            return True
        return False
                

    def genReferenceHtml2(self, alist, divid, defaultLinks, rID):
        """Render ``alist`` as reference HTML; delegates to genMetadataHtml2."""
        rendered = self.genMetadataHtml2(alist, divid, defaultLinks, rID)
        return rendered
    
    def genMetadataHtml2(self, alist, ref_divID, defaultLinks, rID):
        """Build ordered-list HTML for a sequence of (title, url) items.

        Items rejected by ``self.passItem`` are left out and do not consume
        a number. When ``rID`` starts with 'loop' the item label is the
        dotted form of the last '-'-separated part of ``rID`` followed by
        the item number; otherwise it is just the item number.

        Args:
            alist: items; element 0 is the title, element 1 the url.
            ref_divID: base id used to derive per-item element ids.
            defaultLinks: not used by this method.
            rID: id of the record the references belong to.

        Returns:
            The HTML string (``self.html`` plus the closing tags).
        """
        self.html = '<div class="ref"><ol>'
        shown = 0
        for r in alist:
            item_title = r[0]
            item_url = r[1]
            if self.passItem(item_title, item_url):
                continue
            shown += 1
            ref_divID += '-' + str(shown)
            linkID = 'a-' + ref_divID[ref_divID.find('-') + 1 :]
            if rID.startswith('loop'):
                dotted = rID[rID.rfind('-') + 1 :].replace('R', '.') + '.' + str(shown)
                self.html += '<li><span>' + dotted + '.</span>'
                if len(dotted) >= 5:
                    self.html += '<br/>'
                # Dots are swapped back to 'R' for use inside the loop id.
                appendID = dotted.replace('.','R')
            else:
                appendID = str(shown)
                self.html += '<li><span>' + str(shown) + '.</span>'
            script = self.utils.genMoreEnginScript(linkID, ref_divID, "loop-" + rID.replace(' ', '-') + '-' + str(appendID), item_title, item_url, '-', hidenEnginSection=Config.reference_hiden_engin_section)
            if item_url != '':
                self.html += '<p>' + self.utils.enhancedLink(item_url, self.utils.formatTitle(item_title, Config.reference_smart_link_br_len), module='reference', rid=rID, library=self.form_dict['originFileName'])
            else:
                self.html += '<p>' + self.utils.toSmartLink(item_title, Config.reference_smart_link_br_len, module='reference', rid=rID, library=self.form_dict['originFileName'])
            if script != "":
                self.html += self.utils.genMoreEnginHtml(linkID, script.replace("'", '"'), '...', ref_divID, '', False)
            self.html += '</p></li>'
        return self.html + "</ol></div>"


    def genReferenceHtml(self, rID, ref_divID):
        """Render the references stored for ``rID``; delegates to genMetadataHtml."""
        rendered = self.genMetadataHtml(rID, ref_divID)
        return rendered

    def genMetadataHtml(self, key, ref_divID):
        """Build ordered-list HTML for the references stored under ``key``.

        Args:
            key: record id whose references are rendered.
            ref_divID: base id used to derive per-item element ids.

        Returns:
            The HTML string, or '' when ``self.record_reference`` has no
            entry for ``key``.
        """
        if not self.record_reference.has_key(key):
            return ''

        self.html = '<div class="ref"><br><ol>'
        # Single-column layout drops the leading line break.
        if self.form_dict['column'] == '1':
            self.html = '<div class="ref"><ol>'
        position = 0
        for r in self.record_reference[key]:
            position += 1
            ref_divID += '-' + str(position)
            linkID = 'a-' + ref_divID[ref_divID.find('-') + 1 :]
            appendID = str(position)
            script = self.utils.genMoreEnginScript(linkID, ref_divID, "loop-" + key.replace(' ', '-') + '-' + str(appendID), self.utils.clearHtmlTag(r.get_title().strip()), r.get_url().strip(), '-', hidenEnginSection=Config.reference_hiden_engin_section)

            self.html += '<li><span>' + str(position) + '.</span>'
            self.html += '<p>' + self.genMetadataLink(r.get_title().strip(), r.get_url().strip(), rID=key)
            if script != "":
                self.html += self.utils.genMoreEnginHtml(linkID, script.replace("'", '"'), '...', ref_divID, '', False)
            self.html += '</p></li>'
        return self.html + "</ol></div>"


    def genMetadataLink(self, title, url, rID=''):
        """Return link HTML for a reference, resolving bracketed urls first.

        A url containing '[' is treated as a filter term rather than an
        address: the brackets are removed, the matching record is fetched
        through ``utils.getRecord`` and the url is replaced by a query
        against the local server built from that record's path.

        Args:
            title: link text (may itself contain an ``<a>`` placeholder).
            url: target address, or a bracketed filter term.
            rID: id of the owning record, passed on to genMetadataLinkEx.
        """
        if '[' in url:
            ft = url.replace('[', '').replace(']', '').strip()
            record = self.utils.getRecord(ft, '','', False, False)
            path = record.get_path()
            # Everything after "<default_subject>/" in the record path.
            key = path[path.find(default_subject) + len(default_subject) + 1 :]
            url = 'http://' + Config.ip_adress + '?db=' + default_subject + '/&key=' + key + '&filter=' + ft  + '&desc=true'

        return self.genMetadataLinkEx(title, url, rID=rID)


    def genMetadataLinkEx(self, title, url, rID=''):
        """Turn ``title`` into link HTML pointing at ``url``.

        When the title carries a bare ``<a>`` placeholder, every placeholder
        is expanded into an anchor that opens ``url`` in a new tab.
        Otherwise the title is formatted and wrapped by
        ``utils.enhancedLink``.

        Args:
            title: link text, optionally containing ``<a>`` placeholders.
            url: target address.
            rID: id of the owning record, forwarded to ``enhancedLink``.
        """
        if '<a>' in title:
            return title.replace('<a>', '<a target="_blank" href="' + url + '">')
        formatted = self.utils.formatTitle(title, Config.reference_smart_link_br_len)
        return self.utils.enhancedLink(url, formatted, module='reference', rid=rID, library=self.form_dict['originFileName'])
开发者ID:wowdd1,项目名称:xlinkBook,代码行数:104,代码来源:reference.py

示例5: processStudentCatalog

# 需要导入模块: from utils import Utils [as 别名]
# 或者: from utils.Utils import clearHtmlTag [as 别名]
    def processStudentCatalog(self, html, f, course_code):
        """Parse one catalog page and write a db record per course.

        The page is scanned line by line, keeping state for the course
        currently being assembled (number, title, link, textbook,
        prerequisites, instructors, remark). A record is written when the
        next course header is met and once more after the last line.

        Args:
            html: page source as a single string.
            f: open db handle passed through to ``self.write_db``.
            course_code: text expected in front of the first '.' of a
                course header line.
        """
        soup = BeautifulSoup(html)
        utils = Utils()
        # Anchors whose href starts with one of these are not course links.
        ignored_prefixes = ("editcookie.cgi", "/ent/cgi-bin", "javascript:", "m")
        course_links = []
        for anchor in soup.find_all("a"):
            if "href" in anchor.attrs and not anchor["href"].startswith(ignored_prefixes):
                course_links.append(anchor)

        course_num = ""
        title = ""
        link = ""
        textbook = ''
        prereq = ''
        instructors = ''
        # Initialised up front: it used to be first assigned only in the
        # description branch, so an earlier read raised UnboundLocalError.
        remark = ''
        for line in html.split("\n"):
            stripped = line.strip()

            if (stripped.startswith('<br>') and utils.clearHtmlTag(stripped)[1 : 2] == '.') or \
                line.find('Prereq:') != -1:
                if line.find('Prereq:') != -1:
                    all_prereq = list(set(self.course_num_regex.findall(line.lower())))
                    for p in all_prereq:
                        prereq += p + ' '
                    if len(all_prereq) > 0:
                        prereq = 'prereq:' + prereq

                # A cleaned line whose second character is '.' is taken
                # as an instructor line.
                if stripped.startswith('<') and utils.clearHtmlTag(stripped)[1 : 2] == '.':
                    instructors = 'instructors:' + utils.clearHtmlTag(stripped[0 : stripped.find('</')]) + ' '

            if stripped.find('<h3>') != -1 or \
                (stripped.startswith('<br>') and stripped.endswith(('.', 'limited'))):
                # Cut everything up to the first '>' found from index 3 on.
                line = line[line.find('>', 3) + 1 : ]
                if line.find('</h3>') == -1:
                    body = line.strip()
                    if line[0 : line.find('.')] == course_code:
                        # New course header: flush the previous course.
                        if course_num != '':
                            print(course_num + " " + title + " " + link)

                            if instructors != '' and remark.find('instructors:') == -1:
                                remark = instructors + ' ' + remark

                            self.count += 1
                            self.write_db(f, course_num, title, link, remark)
                            remark = ''
                            course_num = ""
                            title = ""
                            link = ""
                            textbook = ''
                            prereq = ''
                            instructors = ''

                        first_space = body.find(" ")
                        course_num = body[0 : first_space]
                        textbook = ''
                        if self.deep_mind:
                            textbook = self.getTextBook(course_num)

                        if textbook == '' and self.deep_mind and self.ocw_links.get(course_num, '') != '':
                            textbook = self.ocw_spider.getTextBook(self.ocw_links[course_num], course_num)

                        title = body[first_space + 1 : ]
                        # A comma in the number means the header lists two
                        # numbers; take the text up to the second space.
                        if course_num.find(',') != -1:
                            second_space = body.find(" ", first_space + 1)
                            course_num = body[0 : second_space]
                            title = body[second_space + 1 : ]
                        link = self.getMitCourseLink(course_links, course_num.strip())
                    else:
                        # Description line: rebuild the remark from scratch.
                        remark = ''
                        if self.deep_mind and self.ocw_links.get(course_num, '') != '':
                            remark = self.ocw_spider.getDescription(self.ocw_spider.getDescriptionApiUrl(self.ocw_links[course_num]))
                            if remark.find('description:') != -1:
                                remark = remark[0 : remark.find('description:')]

                        if textbook != '':
                            remark += textbook
                        if prereq != '':
                            remark += prereq

                        remark += 'description:' + body + ' '

        # Flush the last course with the same instructor handling as the
        # in-loop flush (it used to be written without instructors).
        if course_num != '':
            if instructors != '' and remark.find('instructors:') == -1:
                remark = instructors + ' ' + remark
            self.count = self.count + 1
            self.write_db(f, course_num, title, link, remark)
开发者ID:amitahire,项目名称:numberWave,代码行数:86,代码来源:update_mit.py

示例6: Content

# 需要导入模块: from utils import Utils [as 别名]
# 或者: from utils.Utils import clearHtmlTag [as 别名]

#.........这里部分代码省略.........
            if contentref != None:

                return contentref.strip()
        return ''


    def write(self, html):
        """Dump ``html`` to ``temp/test.html``, replacing any previous content.

        Args:
            html: a string or any iterable of strings; items are written
                in order without separators.
        """
        # The context manager closes the file; the previous ``f.close``
        # lacked the call parentheses and never closed it.
        with open('temp/test.html', 'w') as f:
            for line in html:
                f.write(line)

    def genContentHtml(self, key, content_divID, defaultLinks, library):
        """Render the content list for ``key``; delegates to genMetadataHtml."""
        rendered = self.genMetadataHtml(key, content_divID, defaultLinks, library)
        return rendered

    def genMetadataHtml(self, key, content_divID, defaultLinks, library):
        """Build ordered-list HTML for the content records stored under ``key``.

        ``self.record_content`` is pointed at ``self.datafile_content`` or,
        failing that, ``self.optional_content`` when one of them has
        ``key``; otherwise it keeps its previous value.

        Args:
            key: id of the record whose children are rendered.
            content_divID: base id used to derive per-item element ids.
            defaultLinks: not used by this method.
            library: passed to ``utils.toSmartLink`` for url-less items.

        Returns:
            The HTML string, or '' when ``self.record_content`` has no
            entry for ``key``.
        """
        html = '<div class="ref"><ol>'
        if self.form_dict['column'] == '3' and int(self.form_dict['extension_count']) > 10:
            html = '<div class="ref"><br><ol>'

        count = 0
        if self.datafile_content.has_key(key):
            self.record_content = self.datafile_content
        elif self.optional_content.has_key(key):
            self.record_content = self.optional_content

        if not self.record_content.has_key(key):
            return ''

        for r in self.record_content[key]:
            count += 1
            format_index = ''
            pid = r.get_parentid().strip()
            if self.record_content.has_key(pid) and key.find('-') != -1:
                # data_type occurring at most once in the div id: plain
                # counter; otherwise prefix with the parent's last id part.
                if content_divID.find(self.data_type) == content_divID.rfind(self.data_type):
                    format_index = str(count)
                else:
                    format_index = pid[pid.rfind('-') + 1 :] + '.' + str(count)
            elif r.get_id().find('-') != -1:
                format_index = r.get_id()[r.get_id().rfind('-') + 1 : ].strip()
            # Keep only what follows the last remaining '-'.
            while format_index.find('-') != -1:
                format_index = format_index[format_index.find('-') + 1 :]
            html += '<li><span>' + format_index + '</span>'
            if len(format_index) > 5:
                html += '</li><br/><li>'

            # NOTE(review): the suffix accumulates across items
            # ("x-1", "x-1-2", ...); kept as-is since ids stay unique.
            content_divID += '-' + str(count)
            linkID = 'a-' + content_divID[content_divID.find('-') + 1 :]
            title = r.get_title().strip().replace(' ', '%20')
            desc = r.get_describe().strip()
            script = self.utils.genMoreEnginScript(linkID, content_divID, r.get_id().strip(), self.utils.clearHtmlTag(title), r.get_url().strip(), '-', hidenEnginSection=Config.content_hiden_engin_section)

            descHtml = ''
            if desc != '':
                descHtml = self.utils.genDescHtml(desc, Config.course_name_len, self.tag.tag_list, iconKeyword=True, fontScala=1, module='history')

            moreHtml = self.utils.genMoreEnginHtml(linkID, script.replace("'", '"'), '...', content_divID, '', False, descHtml=descHtml)
            if self.record_content.has_key(r.get_id().strip()) or r.get_url().strip() == '':
                if r.get_url().strip() != '':
                    html += '<p>' + self.genMetadataLink(r.get_title().strip(), r.get_url().strip())
                else:
                    html += '<p>' + self.utils.toSmartLink(r.get_title().strip(), 45, module='content', rid=self.form_dict['rID'], library=library)
                if moreHtml != "":
                    html += moreHtml
                html += '</p>'
            else:
                # Reaching here implies the url is non-empty (the first
                # condition covers the empty-url case).
                html += '<p>' + self.genMetadataLink(r.get_title().strip(), r.get_url().strip())  + moreHtml + '</p>'
            html += '</li>'

        html += "</ol></div>"
        return html


    def genMetadataLink(self, title, url):
        """Return link HTML for a content item, resolving bracketed urls first.

        A url containing '[' is treated as a filter term rather than an
        address: the brackets are removed, the matching record is fetched
        through ``utils.getRecord`` and the url is replaced by a query
        against the local server built from that record's path.

        Args:
            title: link text (may itself contain an ``<a>`` placeholder).
            url: target address, or a bracketed filter term.
        """
        if '[' in url:
            ft = url.replace('[', '').replace(']', '').strip()
            record = self.utils.getRecord(ft, '','', False, False)
            path = record.get_path()
            # Everything after "<default_subject>/" in the record path.
            key = path[path.find(default_subject) + len(default_subject) + 1 :]
            url = 'http://' + Config.ip_adress + '?db=' + default_subject + '/&key=' + key + '&filter=' + ft  + '&desc=true'

        return self.genMetadataLinkEx(title, url)


    def genMetadataLinkEx(self, title, url):
        """Turn ``title`` into link HTML pointing at ``url``.

        When the title carries a bare ``<a>`` placeholder, every placeholder
        is expanded into an anchor that opens ``url`` in a new tab.
        Otherwise the title is formatted (limit 45) and wrapped by
        ``utils.enhancedLink``.

        Args:
            title: link text, optionally containing ``<a>`` placeholders.
            url: target address.
        """
        if '<a>' in title:
            return title.replace('<a>', '<a target="_blank" href="' + url + '">')
        formatted = self.utils.formatTitle(title, 45)
        return self.utils.enhancedLink(url, formatted, module='content', rid=self.form_dict['rID'], library=self.form_dict['originFileName'])
开发者ID:wowdd1,项目名称:xlinkBook,代码行数:104,代码来源:content.py

示例7: HarvardSpider

# 需要导入模块: from utils import Utils [as 别名]
# 或者: from utils.Utils import clearHtmlTag [as 别名]

#.........这里部分代码省略.........
        if self.count == 0:
            print subject + " can not get the data, check the html and python code"
        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"
        
    def doWork(self): 
        print "downloading harvard course info"
        r = requests.get("http://www.registrar.fas.harvard.edu/courses-exams/courses-instruction")
        soup = BeautifulSoup(r.text)
    
        for span in soup.find_all("span", class_="field-content"):
            #print span.a.string
            self.getHarvardCourse(span.a.string, self.url + str(span.a["href"]))
    '''

    def getUrlByYear(self, year):
        """Return the tail of the course-search url for ``year``.

        The result is the url-encoded remainder of the search request that
        follows the page number. It still contains a literal ``%s`` where
        the caller substitutes the department code.

        Args:
            year: calendar year as an int.

        Returns:
            The url fragment, or '' for a year with no known term codes.
        """
        # Pair of STRM term codes searched for each supported year.
        # NOTE(review): 2019 used to repeat 2016's codes (2166/2168), an
        # apparent copy-paste slip; it now follows the 2017/2018 pattern.
        # Confirm against the live search service.
        term_codes = {
            2016: ('2166', '2168'),
            2017: ('2172', '2178'),
            2018: ('2182', '2188'),
            2019: ('2192', '2198'),
        }
        if year not in term_codes:
            return ''

        first_term, second_term = term_codes[year]
        head = '%2C%22PageSize%22%3A%22%22%2C%22SortOrder%22%3A%5B%22IS_SCL_SUBJ_CAT%22%2C%22IS_SCL_SUBJ_CAT%22%5D%2C%22Facets%22%3A%5B%5D%2C%22Category%22%3A%22HU_SCL_SCHEDULED_BRACKETED_COURSES%22%2C%22SearchPropertiesInResults%22%3Atrue%2C%22FacetsInResults%22%3Atrue%2C%22SaveRecent%22%3Afalse%2C%22TopN%22%3A%22%22%2C%22SearchText%22%3A%22(STRM%3A'
        separator = '%20%7C%20STRM%3A'
        tail = ')%20(ACAD_ORG%3A%5C%22%s%5C%22)%22%2C%22DeepLink%22%3Afalse%7D'
        # Plain concatenation on purpose: the text is full of '%' escapes
        # and a literal '%s', so %-formatting cannot be used here.
        return head + first_term + separator + second_term + tail

    def doWork(self):
        """Download the current year's courses for every department.

        For each ``code -> name`` pair in ``self.dept_dict`` that
        ``self.need_update_subject`` does not reject, the paged search
        service is queried until the last page (or an empty page) is
        reached and one db record is written per distinct course id.
        The temp db replaces the current one when at least one course was
        counted and the count differs from the old file's line count.

        Nothing is done when ``self.getUrlByYear`` has no template for the
        current year.
        """
        url_part1 = "https://courses.my.harvard.edu/psc/courses/EMPLOYEE/EMPL/s/WEBLIB_IS_SCL.ISCRIPT1.FieldFormula.IScript_Search?SearchReqJSON={%22PageNumber%22%3A"
        year = int(time.strftime('%Y',time.localtime(time.time())))
        url_part2 = self.getUrlByYear(year)
        if url_part2 == '':
            # Without the template the request url would be truncated.
            print('no search url for year ' + str(year))
            return

        for k, v in self.dept_dict.items():
            if self.need_update_subject(v) == False:
                continue
            file_name = self.get_file_name(v, self.school)
            file_lines = self.countFileLineNum(file_name)
            f = self.open_db(file_name + ".tmp")
            self.count = 0

            print('processing ' + v)
            page = 0
            seen_course_ids = set()

            while True:
                page += 1
                url = url_part1 + str(page) +url_part2.replace('%s', k)
                print(url)
                r = requests.get(url)
                jobj = json.loads(r.text)
                search_ok = False
                for obj in jobj[0]['ResultsCollection']:
                    search_ok = True
                    dept = obj['IS_SCL_DESCR_IS_SCL_DESCRD'].strip().lower()
                    if dept != v.strip().lower():
                        print(dept + ' not match ' + v.strip().lower())
                        continue
                    # The field is optional in the response, so read it
                    # once with a default instead of indexing it later.
                    term_text = obj.get('IS_SCL_DESCR_IS_SCL_DESCRH', '')
                    if term_text == 'Not Offered':
                        continue
                    if obj['CRSE_ID'] in seen_course_ids:
                        continue
                    seen_course_ids.add(obj['CRSE_ID'])

                    title = obj['Title']
                    term = 'term:' + term_text

                    print(obj['Title'] + ' ' + term_text)
                    names = ''
                    if 'DESCRLONG_DETAILS' in obj:
                        # join() copes with an empty list; trimming a
                        # trailing ', ' by slicing used to eat into the
                        # 'instructors:' label in that case.
                        names = ', '.join([author['Name'] for author in obj['DESCRLONG_DETAILS']])
                    elif 'IS_SCL_DESCR_IS_SCL_DESCRL' in obj:
                        names = obj['IS_SCL_DESCR_IS_SCL_DESCRL']
                    instructors = ''
                    if names != '':
                        instructors = 'instructors:' + names

                    desc = self.utils.clearHtmlTag(obj['Description'].strip()).strip()
                    description = ''
                    if desc != '':
                        description = 'description:' + desc
                    self.count += 1
                    # NOTE(review): 'harford' looks like a typo, but it is
                    # part of the stored record ids, so it stays unchanged.
                    self.write_db(f, 'harford-' + k + '-' + obj['CRSE_ID'], title, 'https://courses.my.harvard.edu/psp/courses/EMPLOYEE/EMPL/h/?tab=HU_CLASS_SEARCH&SearchReqJSON=%7B%22SearchText%22%3A%22%s%22%7D'.replace('%s', obj['CRSE_ID']), term + ' ' + instructors + ' ' + description)

                if jobj[2]['PageNumber'] == jobj[2]['TotalPages'] or not search_ok:
                    break
            self.close_db(f)
            if file_lines != self.count and self.count > 0:
                self.do_upgrade_db(file_name)
                print("before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n")
            else:
                self.cancel_upgrade(file_name)
                print("no need upgrade\n")
开发者ID:wowdd1,项目名称:xlinkBook,代码行数:104,代码来源:update_harvard.py


注:本文中的utils.Utils.clearHtmlTag方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。