

Python Utils.removeDoubleSpace Method Code Examples

This article collects typical usage examples of the removeDoubleSpace method of the Python class utils.Utils. If you are wondering what Utils.removeDoubleSpace does, how to call it, or what real-world uses look like, the curated code examples below should help. You can also explore further usage examples of the containing class, utils.Utils.


The following shows 15 code examples of the Utils.removeDoubleSpace method, sorted by popularity by default.
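The implementation of removeDoubleSpace itself does not appear on this page. Judging from the call sites in the examples below, it most likely collapses runs of consecutive spaces into a single space; a minimal sketch consistent with that usage might look like the following (the method body is an assumption inferred from the call sites, not the repositories' actual code):

import re

class Utils(object):
    def removeDoubleSpace(self, text):
        # Assumed behavior, inferred from how the spiders below call it:
        # collapse every run of two or more spaces into a single space.
        return re.sub(' {2,}', ' ', text)

# Example call, mirroring how the spiders below clean scraped text:
utils = Utils()
print(utils.removeDoubleSpace('Machine   Learning    Course'))  # -> 'Machine Learning Course'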

Example 1: EthzSpider

# Required import: from utils import Utils [as alias]
# Or: from utils.Utils import removeDoubleSpace [as alias]
class EthzSpider(Spider):
    def __init__(self):
        Spider.__init__(self)
        self.school = "ethz"
        self.semkezDict = {}
        self.deptDict = {}
        self.utils = Utils()

    def processData(self, semkez, deptId, subject):
        print "processing " + semkez + " " + deptId + " " + subject

        r = requests.get('http://www.vvz.ethz.ch/Vorlesungsverzeichnis/sucheLehrangebot.do?wahlinfo=&seite=0&katalogdaten=&lerneinheitstitel=&studiengangTyp=&strukturAus=on&rufname=&bereichAbschnittId=0&lang=en&ansicht=3&lehrsprache=&studiengangAbschnittId=0&semkez=' + semkez + '&famname=&deptId=' + deptId + '&unterbereichAbschnittId=0&lerneinheitscode=')
        soup = BeautifulSoup(r.text)

        file_name = self.get_file_name(subject.lower(), self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0


        for a in soup.find_all('a'):
            if a.attrs.has_key('href') and a['href'].find('lerneinheitPre.do') != -1:
                title = self.utils.removeDoubleSpace(a.text.strip().replace('\n','').replace('\t', ''))
                if len(title) > 2:
                    print title
                    self.count += 1
                    self.write_db(f, self.school + "-" + str(deptId) + "-" + str(self.count), title, 'http://www.vvz.ethz.ch' + a['href'])

        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"


    def doWork(self):
        r = requests.get('http://www.vvz.ethz.ch/Vorlesungsverzeichnis/sucheLehrangebotPre.do?lang=en')
        soup = BeautifulSoup(r.text)
        for select in soup.find_all('select', class_='w50'):
            if select['name'] == "semkez":
                soup1 = BeautifulSoup(select.prettify())
                for option in soup1.find_all('option'):
                    if option.text.strip() != '':
                        self.semkezDict[option['value']] = option.text.strip()
       
            if select['name'] == "deptId":
                soup1 = BeautifulSoup(select.prettify())
                for option in soup1.find_all('option'):
                    if option.text.strip() != '':
                        self.deptDict[option['value']] = option.text.strip()

        for k, v in [(k,self.deptDict[k]) for k in self.deptDict.keys()]:
            if self.need_update_subject(v) == False:
                continue
            year = time.strftime("%Y") 
            for semkez in self.semkezDict.keys():
                if semkez[0 : 4] == year:
                    self.processData(semkez, k, v)
Developer ID: fahimkhan, Project: slma.link, Lines of code: 62, Source file: update_ethz.py

Example 2: doWork

# Required import: from utils import Utils [as alias]
# Or: from utils.Utils import removeDoubleSpace [as alias]
    def doWork(self):
        url = "https://projects.apache.org/json/foundation/projects.json"
        r = requests.get(url)
        print r.text
        jobj = json.loads(r.text)
        utils = Utils()
        #print jobj
        file_name = self.get_file_name("eecs/apache/" + "apache-projects", self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0

        for obj in jobj:
             desc = "description:"
             title = jobj[obj]['name']
             link = jobj[obj]['homepage']
             if jobj[obj].get("description") != None:
                 desc += utils.removeDoubleSpace(jobj[obj]['description'].strip().replace('\n', ''))
             print title
             self.count += 1
             self.write_db(f, "apache-project-" + str(self.count), title[title.find(" ") :].strip(), link, desc)

        self.close_db(f)
        if self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"
Developer ID: amitahire, Project: numberWave, Lines of code: 31, Source file: update_apache.py

Example 3: doWork

# Required import: from utils import Utils [as alias]
# Or: from utils.Utils import removeDoubleSpace [as alias]
    def doWork(self):
        utils = Utils()
        r = requests.get('http://www.cdf.toronto.edu/cs_courses/current_course_web_pages.html')
        soup = BeautifulSoup(r.text)

        file_name = self.get_file_name("eecs/" + self.subject, self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0

        for li in soup.find_all('li'):
            if li.a != None:
                line = utils.removeDoubleSpace(li.text.replace('\n', ''))
                line = line.replace('CSC ', 'CSC')
                course_id = line[0 : line.find(' ')]
                if course_id.startswith('CSC') == False:
                    continue
                title = line[line.find(' ') : ].strip()
                print course_id + ' ' + title
                self.count += 1
                self.write_db(f, course_id, title, li.a['href'])

        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"
Developer ID: wowdd1, Project: xlinkBook, Lines of code: 31, Source file: update_toronto.py

Example 4: processTR35

# Required import: from utils import Utils [as alias]
# Or: from utils.Utils import removeDoubleSpace [as alias]
    def processTR35(self):
        utils = Utils()
        for i in range(0, 3):
            year = str(2013 + i)
            r = requests.get('http://www.technologyreview.com/lists/innovators-under-35/' + year)
            soup = BeautifulSoup(r.text)
            ul = soup.find('ul', class_='people')
            soup = BeautifulSoup(ul.prettify())

            file_name = self.get_file_name(self.subject + "/mit-tr35/tr35-" + year + "#", '')
            file_name = file_name[0 : file_name.find('#')]
            file_lines = self.countFileLineNum(file_name)
            f = self.open_db(file_name + ".tmp")
            self.count = 0

            for li in soup.find_all('li'):
                data = utils.removeDoubleSpace(li.text.strip().replace('\t', '').replace('\n', ''))
                title = data[0 : data.find(',')].strip()
                desc = 'description:' + data[data.find(',') + 1 :].strip() 
                print title
                print desc
                self.count += 1
                self.write_db(f, 'tr35-' + year + '-' + str(self.count), title, 'http://www.technologyreview.com/' + li.a['href'], desc)
            self.close_db(f)
            if file_lines != self.count and self.count > 0:
                self.do_upgrade_db(file_name)
                print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
            else:
                self.cancel_upgrade(file_name)
                print "no need upgrade\n"
Developer ID: amitahire, Project: numberWave, Lines of code: 32, Source file: update_rank.py

Example 5: processMacArthur

# Required import: from utils import Utils [as alias]
# Or: from utils.Utils import removeDoubleSpace [as alias]
    def processMacArthur(self, url):
        utils = Utils()
        r = requests.get(url)
        soup = BeautifulSoup(r.text)

        file_name = self.get_file_name(self.subject + "/macArthur-all-fellows", '')
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0

        for table in soup.find_all('table', class_='multicol'):
            sp = BeautifulSoup(table.prettify())
            for li in sp.find_all('li'):
                url = ''
                if li.a != None:
                    url = 'https://en.wikipedia.org' + li.a['href']
                data = utils.removeDoubleSpace(li.text.strip().replace('\n', ''))
                title = data[0 : data.find(',')].strip()
                desc = "description:" + data[data.find(',') + 1 :].strip()
                print title
                self.count += 1
                self.write_db(f, 'macArthur-fellow-' + str(self.count), title, url, desc)

        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"
Developer ID: amitahire, Project: numberWave, Lines of code: 32, Source file: update_rank.py

Example 6: processComputerScienceData

# Required import: from utils import Utils [as alias]
# Or: from utils.Utils import removeDoubleSpace [as alias]
    def processComputerScienceData(self, url):
        r = requests.get(url)
        utils = Utils()
        user_url = ''
        last_line = ''
        last_citations = ''
        remark = 'description:'
        file_name = self.get_file_name("eecs/people/computer-science-citations", self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0
        good_line = False 

        for line in r.text.split('\n'):
            good_line = False 
            remark = 'description:'
            if line.strip() == '':
                continue
            if line.find('<') != -1 and line.find('>') != -1:
                line = utils.clearHtmlTag(line).strip()
            else:
                line = line.strip()
            if len(line) < 5 or line.find('<a href') != -1:
                last_line = line
                continue
            if last_line != '':
                if last_line[0 : 1].isdigit():
                    good_line = True
                    line = utils.clearHtmlTag(last_line + ' ' + line)
                last_line = ''
            else:
                if line[0 : 1].isdigit() and line.find('(') > 0:
                    good_line = True
                    line = utils.removeDoubleSpace(line.replace('\n', ''))
            
            if good_line == False:
                continue 
            citations = line[0 : line.find(" ")]
            person = line[line.find(" ") + 1 : line.find("(")]
            place = line[line.find('(') + 1 : line.find(')')]
            info = line[line.find(')') + 2 :].strip()
            #print citations
            title = person
            #print info
            remark +=  citations + ' citations, ' + info
            if citations != last_citations:
                self.count += 1
            last_citations = citations
            if title.find('>') != -1:
                title = title[title.find('>') + 1 :].strip()
            self.write_db(f, 'csc-' + str(self.count), title, '', 'university:' + place + ' ' + remark)

        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"
Developer ID: wowdd1, Project: xlinkBook, Lines of code: 61, Source file: update_rank.py

Example 7: doWork

# Required import: from utils import Utils [as alias]
# Or: from utils.Utils import removeDoubleSpace [as alias]
    def doWork(self):
        utils = Utils()
        r = requests.get('http://jeffhuang.com/best_paper_awards.html')
        soup = BeautifulSoup(r.text)
        file_name = self.get_file_name("eecs/best_cs_paper", "")
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0

        for table in soup.find_all('table'):
            sp = BeautifulSoup(table.prettify())
            #for thead in sp.find_all('thead'):
            #    print thead.text.strip()
            
            year = ''
            line = ''
            conf = ''
            for td in sp.find_all('td'):
                if (td.text.strip() == 'Microsoft Research'):
                    break
                if td.a != None and td.a.attrs.has_key('name') and False == td.a.attrs.has_key('href'):
                    conference = td.a.text.strip()
                    conference = conference[0 : conference.find('(')].strip().lower()
                    continue
                #utils.removeDoubleSpace(tbody.text).strip()
                text = utils.removeDoubleSpace(td.text.replace('; et al.', '')).strip()
                if (len(text) == 4):
                    if len(line.strip()) > 4:
                        if conf == '':
                            conf = conference
                        self.writeLines(f, line, conf + '-' + year)
                        conf = conference
                        line = ''
                        line += text + '|'
                    else:
                        line += text + '|'
                    year = text
                else:
                    line += text + '|'
            if len(line.strip()) > 4:
                self.writeLines(f, line, conference + '-' + year)
        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"
Developer ID: amitahire, Project: numberWave, Lines of code: 50, Source file: update_bp_cs.py

Example 8: CSAILSpider

# Required import: from utils import Utils [as alias]
# Or: from utils.Utils import removeDoubleSpace [as alias]
class CSAILSpider(Spider):

    def __init__(self):
        Spider.__init__(self)
        self.school = 'csail'
        self.utils = Utils()


    def doWork(self):
        r = requests.get('http://courses.csail.mit.edu/')
        soup = BeautifulSoup(r.text)

        file_name = self.get_file_name("eecs/ai", self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0

        for li in soup.find_all('li'):
            title = self.utils.removeDoubleSpace(li.a.text.strip())
            title = title.replace('\t', '')
            course_num = ''

            self.count += 1
            if title.find('.') != -1:
                course_num = title[0 : title.find(' ')]
                title = title[title.find(' ') + 1 :]
            else:
                course_num += self.school + '-' + str(self.count)   
                print title
            self.write_db(f, course_num, title, 'http://courses.csail.mit.edu/' + li.a['href'])
             

        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"
Developer ID: wowdd1, Project: xlinkBook, Lines of code: 41, Source file: update_csail.py

Example 9: AiPapersSpider

# Required import: from utils import Utils [as alias]
# Or: from utils.Utils import removeDoubleSpace [as alias]
class AiPapersSpider(Spider):
    def __init__(self):
        Spider.__init__(self)
        self.school = 'aipapers'
        self.util = Utils()

    def doWork(self, jobj=None):
        if jobj == None:
            r = requests.get('http://cs.stanford.edu/people/karpathy/scholaroctopus/out.json')
            jobj = json.loads(r.text)

        paper_dict = {}
        for paper in jobj:
            key = paper['conference'] + '-' + str(paper['year'])
            if paper_dict.get(key, '')  == '':
                paper_dict[key] = []
            paper_dict[key].append(paper)

        for key, v in sorted([(k,paper_dict[k]) for k in sorted(paper_dict.keys())]):
            #print key + ' paper:' + str(len(paper_dict[key]))
            print 'processing ' + key
            file_name = self.get_file_name("eecs/" + self.school.lower() + '/' + key, "scholaroctopus")
            file_lines = self.countFileLineNum(file_name)
            if file_lines == len(v):
                continue
            f = self.open_db(file_name + ".tmp")
            self.count = 0

            for paper in sorted(v, key=lambda item : item['title']):
                self.count += 1
                paper_id = key.lower() + '-' + str(self.count)
                self.write_db(f, paper_id, paper['title'].strip(), paper['pdf'], 'author:' + ', '.join(paper['authors']))
                print paper_id + ' ' + paper['title'].strip()

            self.close_db(f)
            if file_lines != self.count and self.count > 0:
                self.do_upgrade_db(file_name)
                print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
            else:
                self.cancel_upgrade(file_name)
                print "no need upgrade\n"

    def initPaperObj(self, conference, year):
        paper_obj = {}
        paper_obj['conference'] = conference
        paper_obj['x'] = 0.34087790145304875
        paper_obj['y'] = 0.25455838359147459
        paper_obj['year'] = year
        paper_obj['pdf'] = ''
        paper_obj['authors'] = ''
        paper_obj['title'] = ''
        return paper_obj

    def getCvPaper(self):
        conference = ''
        year = ''
        r = requests.get('http://cvpapers.com')
        sp = BeautifulSoup(r.text)
        ul = sp.find('ul')
        sp = BeautifulSoup(ul.prettify())
        for li in sp.find_all('li'):
            conference = li.text[0 : li.text.find(':')].strip().lower()
            if conference.find('(') != -1:
                conference = conference[0 : conference.find('(')].strip().lower()
            sp1 = BeautifulSoup(li.prettify())
            for a in sp1.find_all('a'):
                year = a.text.strip()
                if year.startswith('BMVC'):
                    continue

                r = requests.get('http://cvpapers.com/' + a['href'])
                print conference + ' ' + year
                soup = BeautifulSoup(r.text)
                paper_list = []

                i = 0
                for dl in soup.find_all('dl'):
                    if dl.parent.name == 'dl':
                        continue
                    soup1 = BeautifulSoup(dl.prettify())
                    paper_obj = {}
                    for dt in soup1.find_all('dt'):
                        paper_obj = self.initPaperObj(conference, year)
                        #print dt.text[0 : dt.text.find('(')].strip()
                        if dt.text.find('(') != -1 :
                            paper_obj['title'] = self.util.removeDoubleSpace(dt.text[0 : dt.text.find('(')].replace('\n','')).strip()
                        else:
                            paper_obj['title'] = self.util.removeDoubleSpace(dt.text.replace('\n',''))
                        soup2 = BeautifulSoup(dt.prettify())
                        for a in soup2.find_all('a'):
                            paper_obj[a.text.strip().lower()] = a['href']
                        if paper_obj.get('pdf','') == '':
                            paper_obj['pdf'] = 'https://scholar.google.com/scholar?hl=en&q=' + paper_obj['title']
                        paper_list.append(paper_obj)

                    if len(soup1.find_all('dd')) != len(soup1.find_all('dt')):
                        i += len(soup1.find_all('dt'))
                        continue
                    for dd in soup1.find_all('dd'):
                        author_list = []
#.........part of the code omitted.........
Developer ID: dtbinh, Project: random_walk, Lines of code: 103, Source file: update_ai_papers.py

Example 10: ProjectsSpider

# Required import: from utils import Utils [as alias]
# Or: from utils.Utils import removeDoubleSpace [as alias]
class ProjectsSpider(Spider):
    def __init__(self):
        Spider.__init__(self)
        self.school = "projects"
        self.utils = Utils()

    def doWork(self):
        """
       self.getDARPAProjects() 
       self.getDARPAOpenProjects()
       self.getAIProjects()
       self.getDotNetFoundationProjects()

       self.getMicrosoftResearch()
       self.getAICProjects()
       self.getDARPAWikiProjects()
       """
        self.getSRIProject()

    def getSRIProject(self):
        r = requests.get("http://www.sri.com/work/projects")
        soup = BeautifulSoup(r.text)
        for a in soup.find_all("a"):
            if a["href"].startswith("/work/projects/"):
                if a.parent != None and a.parent.prettify().startswith("<span"):
                    subject = a.text.replace(" ", "-")
                    file_name = self.get_file_name(self.school + "/SRI/" + subject, self.school)
                    file_lines = self.countFileLineNum(file_name)
                    f = self.open_db(file_name + ".tmp")
                    self.count = 0
                    r2 = requests.get("http://www.sri.com" + a["href"])
                    soup2 = BeautifulSoup(r2.text)
                    for div in soup2.find_all("div", class_="events_inner"):
                        soup3 = BeautifulSoup(div.prettify())
                        title = soup3.find("div", class_="events_inner_title").text.strip()
                        link = "http://www.sri.com" + soup3.find("div", class_="events_inner_title").a["href"]
                        desc = "description:" + soup3.find("div", class_="events_inner_teaser").text.strip()
                        print title
                        self.count += 1
                        self.write_db(f, "sri-project-" + str(self.count), title, link, desc)
                    self.close_db(f)
                    if file_lines != self.count and self.count > 0:
                        self.do_upgrade_db(file_name)
                        print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
                    else:
                        self.cancel_upgrade(file_name)
                        print "no need upgrade\n"

    def getAICProjects(self):
        r = requests.get("http://www.ai.sri.com/project_list/mode=All&sort=titleAsc")
        soup = BeautifulSoup(r.text)
        pages = 0

        file_name = self.get_file_name("eecs/" + self.school + "/" + "AIC", self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0

        for a in soup.find_all("a"):
            if a.text.strip() == "End":
                pages = int(a["href"][len(a["href"]) - 1 :])

        for page in range(0, pages + 1):
            r2 = requests.get("http://www.ai.sri.com/project_list/mode=All&sort=titleAsc&page=" + str(page))
            soup2 = BeautifulSoup(r2.text)
            for td in soup2.find_all("td", class_="project"):
                if td.h2 != None:
                    title = td.h2.a.text
                    desc = "description:" + self.utils.removeDoubleSpace(td.p.text.replace("\n", ""))
                    print title
                    self.count += 1
                    self.write_db(
                        f, "aic-project-" + str(self.count), title, "http://www.ai.sri.com" + td.h2.a["href"], desc
                    )
        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"

    def getMicrosoftResearch(self):
        file_name = self.get_file_name("eecs/" + self.school + "/" + "microsoft-research", self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0
        for page in range(1, 13):
            r = requests.get(
                "http://research.microsoft.com/apps/catalog/default.aspx?p="
                + str(page)
                + "&sb=no&ps=100&t=projects&sf=&s=&r=&vr=&ra="
            )
            soup = BeautifulSoup(r.text)
            for div in soup.find_all("div", class_="l"):
                sp = BeautifulSoup(div.prettify())
                name_div = sp.find("div", class_="name")
                desc_div = sp.find("div", class_="desc")
                title = name_div.a.text.strip()
                desc = "description:" + self.utils.removeDoubleSpace(desc_div.text.strip().replace("\n", ""))
#.........part of the code omitted.........
Developer ID: Django-27, Project: deep_em, Lines of code: 103, Source file: update_projects.py

Example 11: LabsSpider

# Required import: from utils import Utils [as alias]
# Or: from utils.Utils import removeDoubleSpace [as alias]

#.........part of the code omitted.........
            r = requests.get(url)
            soup = BeautifulSoup(r.text)
            for p in soup.find_all('p', class_='TrigListings'):
                self.count += 1
                self.write_db(f, "mit-" + subject.replace(' ','-').lower() + "-" + str(self.count), p.a.text, p.a['href'])
                print p.a.text

        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"

    def getBerkeleyEECSLabs(self):
        r = requests.get('http://www.eecs.berkeley.edu/Research/Areas/Centers/')
        soup = BeautifulSoup(r.text)
        start = False

        file_name = self.get_file_name(self.school + "/" + "berkeley-eecs-labs", self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0

        for li in soup.find_all('li'):
            if li.a != None and li.a['href'].startswith('http'):
                
                title = li.a.text.strip()
                if title.startswith('Algorithms'):
                    start = True
                if start:
                    self.count += 1
                    self.write_db(f, 'berkeley-eecs-lab-' + str(self.count), title, li.a['href'], 'description:' + self.utils.removeDoubleSpace(li.p.text.strip().replace('\n', '')))
                    print title
        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"

    def getSocialRobotLabs(self):
        r = requests.get('https://en.wikipedia.org/wiki/Social_robot')
        soup = BeautifulSoup(r.text)

        file_name = self.get_file_name(self.school + "/" + "Social-robot-labs", self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0

        for a in soup.find_all('a', class_='external text'):
            print a.text    
            self.count += 1
            self.write_db(f, 'social-robot-lab-' + str(self.count), a.text, a['href'])
            if a.text.startswith('Department'):
                break

        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"
Developer ID: amitahire, Project: numberWave, Lines of code: 70, Source file: update_labs.py

Example 12: ProjectPaperSpider

# Required import: from utils import Utils [as alias]
# Or: from utils.Utils import removeDoubleSpace [as alias]
class ProjectPaperSpider(Spider):
    def __init__(self):
        Spider.__init__(self)
        self.utils = Utils()
        self.school = "project-papers"

    def doWork(self):
        self.getWastonPapers()
        self.getSTAIRPapers()
        self.getSTARTPapers()
        self.getSparkPapers()
        self.getHadoopPapers()
        #self.getRoboBrainPapers()
        self.getMobileEyePapers()
        self.getAutonomousDrivingPapers()

        self.getCALOPapers()
    def getCALOPapers(self):
        r = requests.get('http://www.ai.sri.com/pubs/search.php?project=179')
        soup = BeautifulSoup(r.text)
        file_name = self.get_file_name("eecs/" + self.school + "/" + "CALO", self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0

        for li in soup.find_all('li'):
            title = li.p.strong.text.strip()
            link = 'http://www.ai.sri.com' + li.p.a['href']
            print title
            self.count += 1
            self.write_db(f, 'calo-' + str(self.count), title, link)

        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"

    def getAutonomousDrivingPapers(self):
        r = requests.get("http://driving.stanford.edu/papers.html")
        soup = BeautifulSoup(r.text)
        title = ""
        author = ""
        journal = ""
        desc = ""
        url = ""
        file_name = self.get_file_name("eecs/" + self.school + "/" + "AutonomousDriving", self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0
        for p in soup.find_all("p"):
            if p.span != None:
                sp = BeautifulSoup(p.prettify())
                title = sp.find('span', class_='papertitle').text.strip()
                author = "author:" + self.utils.removeDoubleSpace(sp.find('span', class_='authors').text.strip().replace('\n', '')) + " "
                journal = "journal:" + sp.find('span', class_='meeting').text.strip()
                journal += " " + sp.find('span', class_='year').text.strip() + " "
                desc = "description:" +  self.utils.removeDoubleSpace(sp.find('span', class_='summary').text.strip().replace('\n', ''))
            if p.a != None and p.a['href'].find(".pdf") != -1:
                if p.a['href'].startswith('http'):
                    url = p.a['href']
                else:
                    url = 'http://driving.stanford.edu/' + p.a['href']
                self.count += 1
                self.write_db(f, "autonomousdriving-paper-" + str(self.count), title, url, author + journal + desc)
                title = ""
                author = ""
                journal = ""
                desc = ""
                url = "" 
        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"

    def getMobileEyePapers(self):
        file_name = self.get_file_name("eecs/" + self.school + "/" + "MobileEye", self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0

        for i in range(1, 3):
            r = requests.get("http://www.mobileye.com/technology/mobileye-research/page/" + str(i))
            soup = BeautifulSoup(r.text)

            for div in soup.find_all("div", class_="ContentItemText"):
                title = div.h2.text.strip()
                link = div.h2.a['href']
                author = "author:" + div.p.text.strip()
                print title
                self.count += 1
                self.write_db(f, "mobileeye-paper-" + str(self.count), title, link, author)
        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
#.........part of the code omitted.........
Developer ID: amitahire, Project: numberWave, Lines of code: 103, Source file: update_project_papers.py

Example 13: HarvardOnlineSpider

# Required import: from utils import Utils [as alias]
# Or: from utils.Utils import removeDoubleSpace [as alias]

#.........part of the code omitted.........
                link = "http://www.extension.harvard.edu" + str(li.a["href"]).strip()
                if self.deep_mind:
                    link, description = self.getMoreInfo(link)
                count = count + 1
                print course_num + " " + title + " " + link
                self.write_db(f, course_num, title, link, description)
                if course_dict != None:
                    if course_num.startswith("CSCI"):
                        course_num = course_num.replace("CSCI", "CS")
                    course_dict[course_num] = CourseRecord(
                        self.get_storage_format(course_num, title, link, description)
                    )
        else:
            for li in soup.find_all("li"):
                if li.attrs.has_key("class"):
                    if li.prettify().find("E-") != -1 and str(li.a["href"]).startswith("/courses"):
                        for item in li.prettify().split("\n"):
                            if item.find("E-") != -1:
                                course_num = item.replace(" E-", "").strip()
                        count = count + 1
                        title = li.a.string.strip()
                        link = "http://www.extension.harvard.edu" + str(li.a["href"]).strip()
                        if self.deep_mind:
                            link, description = self.getMoreInfo(link)
                        print course_num + " " + title + " " + link
                        self.write_db(f, course_num, title, link, description)
                        if course_dict != None:
                            if course_num.startswith("E-"):
                                course_num = course_num.replace("CSCI", "CS")
                            course_dict[course_num] = CourseRecord(
                                self.get_storage_format(course_num, title, link, description)
                            )
        self.close_db(f)
        if file_lines != count and count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"

    def getCourseDict(self, subject):
        course_dict = {}
        r = requests.get(self.url)
        soup = BeautifulSoup(r.text)

        for li in soup.find_all("li", class_="is-more-items"):
            if li.a.string.lower() == subject.lower():
                self.getHarvardOnlineCourse(
                    li.a.string, "http://www.extension.harvard.edu" + str(li.a["href"]), course_dict
                )
        return course_dict

    def getHarvardCourse(self, subject, url):
        r = requests.get(url)
        soup = BeautifulSoup(r.text)
        code = ""

        file_name = self.get_file_name(subject.lower(), self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0

        for div in soup.find_all("div", class_="view-group"):
            sp = BeautifulSoup(div.prettify())
            for div2 in sp.find_all("div"):
                code = div2.span.text.strip()
                link = "http://www.extension.harvard.edu" + div2.a["href"]
                title = self.utils.removeDoubleSpace(div2.text.replace("\n", "")).replace(" E- ", "-E")
                for t in title[title.find(code.upper()) :].split(code.upper()):
                    if t != "":
                        title = code.upper() + t
                        course_num = title[0 : title.find(" ")].strip()
                        if title.find("(") != -1:
                            title = title[title.find(" ") : title.find("(")].strip()
                        else:
                            title = title[title.find(" ") :].strip()
                        print course_num + " " + title
                        self.count += 1
                        self.write_db(f, course_num, title, link)
        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"

    def doWork(self):
        print "downloading harvard online course info"
        # r = requests.get("http://www.extension.harvard.edu/courses/subject/computer-science")
        r = requests.get(self.url)
        soup = BeautifulSoup(r.text)

        # for li in soup.find_all("li", class_ = "is-more-items"):
        #    self.getHarvardOnlineCourse(li.a.string, "http://www.extension.harvard.edu" + str(li.a["href"]))
        for div in soup.find_all("div", class_="view-content"):
            print div.a.text
            if self.need_update_subject(div.a.text) == False:
                continue
            self.getHarvardCourse(div.a.text, "http://www.extension.harvard.edu" + div.a["href"])
Developer ID: fahimkhan, Project: slma.link, Lines of code: 104, Source file: update_harvard_online.py

Example 14: GraphPapersSpider

# Required import: from utils import Utils [as alias]
# Or: from utils.Utils import removeDoubleSpace [as alias]
class GraphPapersSpider(Spider):
    def __init__(self):
        Spider.__init__(self)
        self.school = 'GraphPapers'
        self.util = Utils()


    def getPapers(self, conference, year, link):
        print link
        r = requests.get(link)
        soup = BeautifulSoup(r.text) 
        paper_list = []
        i = 0

        file_name = self.get_file_name("eecs/" + self.school.lower() + '/' + conference + year, '')
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0

        for dl in soup.find_all('dl'):
            soup1 = BeautifulSoup(dl.prettify())
            for dt in soup1.find_all('dt'):
                title = dt.text.strip().replace('\n', '')
                #if title.startswith('('):
                #    continue
                if title.find('(') != -1:
                    title = title[0 : title.find('(')].strip()
                paper_list.append(title)
            if len(soup1.find_all('dd')) != len(soup1.find_all('dt')):
                i += len(soup1.find_all('dt'))
                continue
            for dd in soup1.find_all('dd'):
                author_list = []
                if dd.text.find('),') != -1:
                    for author in self.util.removeDoubleSpace(dd.text.strip().replace('\n', '')).split('),'):
                        author_list.append(author + ')')
                elif dd.text.find(',') != -1:
                    for author in self.util.removeDoubleSpace(dd.text.strip().replace('\n', '')).split(','):
                        author_list.append(author)
                else:
                    author_list.append(self.util.removeDoubleSpace(dd.text.strip().replace('\n', '')))
                if paper_list[i] != '':
                    self.count += 1
                    paper_id = conference + year + '-' + str(self.count)
                    self.write_db(f, paper_id, paper_list[i], '', 'author:' + ' '.join(author_list))

                    print paper_list[i] + '  ' + ','.join(author_list)
                i += 1
        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n" 

    def doWork(self):
        r = requests.get('http://kesen.realtimerendering.com/')
        for line in r.text.split('\n'):
            if line.startswith('<li><a') and line.lower().find('.htm') != -1 and line.find('http://') == -1:
                data = line[line.find('"') + 1 : line.find('"', line.find('"') + 1)]
                if data.find('20') == -1:
                    continue
                conference = data[0 : data.find('20')]
                year = ''
                if data.lower().find('paper') != -1:
                    year = data[data.find('20') : data.lower().find('paper')]
                else:
                    year = data[data.find('20') : data.find('.htm')]
                print conference + ' -  - - - ' + year
                self.getPapers(conference, year, 'http://kesen.realtimerendering.com/' + data)
Developer ID: Django-27, Project: deep_em, Lines of code: 73, Source file: update_graph_papers.py

Example 15: ProjectsSpider

# Required import: from utils import Utils [as alias]
# Or: from utils.Utils import removeDoubleSpace [as alias]

#.........part of the code omitted.........
        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"

    def getMitMediaProjects(self):
        r = requests.get('https://www.media.mit.edu/research/groups-projects')
        soup = BeautifulSoup(r.text)
        for span in soup.find_all('span', class_='field-content'):
            if span.a != None and span.a.text.startswith('more') == False:
                subject = span.a.text.strip().lower().replace(' ', '-')
                print subject
                file_name = self.get_file_name("projects/MIT-MEDIA-LAB/" + subject, self.school)
                file_lines = self.countFileLineNum(file_name)
                f = self.open_db(file_name + ".tmp")
                self.count = 0
 
                r = requests.get('https://www.media.mit.edu' + span.a['href'])
                sp = BeautifulSoup(r.text)
                for li in sp.find_all('li'):
                    if li.div != None and li.div.h2 != None:
                        link = ''
                        title = ''
                        desc = ''
                        sp1 = BeautifulSoup(li.div.prettify())
                        for a in sp1.find_all('a'):
                            if a.text.strip() == 'view site':
                                link = a['href']
                        title = li.div.h2.text.strip()
                        print title
                        desc = 'description:' + self.utils.removeDoubleSpace(li.div.div.text.strip().replace('\n', ''))
                        self.count += 1
                        self.write_db(f, "mit-media-" + subject + '-' + str(self.count), title, link, desc)
                self.close_db(f)
                if file_lines != self.count and self.count > 0:
                    self.do_upgrade_db(file_name)
                    print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
                else:
                    self.cancel_upgrade(file_name)
                    print "no need upgrade\n"

    def getOpenSourceRobotProjects(self):
        r = requests.get('https://en.wikipedia.org/wiki/Open-source_robotics')
        soup = BeautifulSoup(r.text)
        soup = BeautifulSoup(soup.find('table', class_='wikitable').prettify())

        file_name = self.get_file_name("eecs/projects/" + "open-source-robot", self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0

        for a in soup.find_all('a'):
            title = self.utils.removeDoubleSpace(a.text.strip().replace('\n', ''))
            if title.startswith('[') or title == 'Arduino' or title == 'Self-balancing robot' or title == 'Modular design':
                continue
            print title
            self.count += 1
            self.write_db(f, "open-source-robot-project-" + str(self.count), title, 'https://en.wikipedia.org' + a['href'])

        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
Developer ID: amitahire, Project: numberWave, Lines of code: 70, Source file: update_projects.py


Note: The utils.Utils.removeDoubleSpace examples in this article were collected by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The code snippets are drawn from open-source projects contributed by their developers, and copyright remains with the original authors. Please consult each project's License before distributing or using the code; do not reproduce this article without permission.