This page collects typical usage examples of the Python method utils.Utils.removeDoubleSpace. If you have been asking yourself how exactly Utils.removeDoubleSpace is used in Python, or what it looks like in real code, the hand-picked examples below should help. You can also explore further usage examples of its containing class, utils.Utils.
The following presents 15 code examples of Utils.removeDoubleSpace, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
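Note: the utils.Utils class itself is not reproduced on this page, and the excerpts below also omit the third-party modules they rely on (requests, bs4.BeautifulSoup, json, time). As a rough, hypothetical sketch of what the examples appear to assume — removeDoubleSpace collapsing runs of spaces in scraped text into a single space — the following is an illustration, not the project's actual implementation:

import re

class Utils(object):
    # Hypothetical sketch: collapse any run of two or more spaces into one,
    # matching how the examples below clean whitespace out of scraped text.
    def removeDoubleSpace(self, text):
        return re.sub(r' {2,}', ' ', text)

    # Hypothetical sketch of the companion helper used in Example 6:
    # crudely strip HTML tags from a line of markup.
    def clearHtmlTag(self, text):
        return re.sub(r'<[^>]*>', '', text)

For instance, Utils().removeDoubleSpace('CSC  108   Intro') would return 'CSC 108 Intro'.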
Example 1: EthzSpider
# Required module import: from utils import Utils [as alias]
# Or alternatively: from utils.Utils import removeDoubleSpace [as alias]
class EthzSpider(Spider):
    def __init__(self):
        Spider.__init__(self)
        self.school = "ethz"
        self.semkezDict = {}
        self.deptDict = {}
        self.utils = Utils()

    def processData(self, semkez, deptId, subject):
        print "processing " + semkez + " " + deptId + " " + subject
        r = requests.get('http://www.vvz.ethz.ch/Vorlesungsverzeichnis/sucheLehrangebot.do?wahlinfo=&seite=0&katalogdaten=&lerneinheitstitel=&studiengangTyp=&strukturAus=on&rufname=&bereichAbschnittId=0&lang=en&ansicht=3&lehrsprache=&studiengangAbschnittId=0&semkez=' + semkez + '&famname=&deptId=' + deptId + '&unterbereichAbschnittId=0&lerneinheitscode=')
        soup = BeautifulSoup(r.text)
        file_name = self.get_file_name(subject.lower(), self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0
        for a in soup.find_all('a'):
            if a.attrs.has_key('href') and a['href'].find('lerneinheitPre.do') != -1:
                title = self.utils.removeDoubleSpace(a.text.strip().replace('\n', '').replace('\t', ''))
                if len(title) > 2:
                    print title
                    self.count += 1
                    self.write_db(f, self.school + "-" + str(deptId) + "-" + str(self.count), title, 'http://www.vvz.ethz.ch' + a['href'])
        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"

    def doWork(self):
        r = requests.get('http://www.vvz.ethz.ch/Vorlesungsverzeichnis/sucheLehrangebotPre.do?lang=en')
        soup = BeautifulSoup(r.text)
        for select in soup.find_all('select', class_='w50'):
            if select['name'] == "semkez":
                soup1 = BeautifulSoup(select.prettify())
                for option in soup1.find_all('option'):
                    if option.text.strip() != '':
                        self.semkezDict[option['value']] = option.text.strip()
            if select['name'] == "deptId":
                soup1 = BeautifulSoup(select.prettify())
                for option in soup1.find_all('option'):
                    if option.text.strip() != '':
                        self.deptDict[option['value']] = option.text.strip()
        for k, v in [(k, self.deptDict[k]) for k in self.deptDict.keys()]:
            if self.need_update_subject(v) == False:
                continue
            year = time.strftime("%Y")
            for semkez in self.semkezDict.keys():
                if semkez[0 : 4] == year:
                    self.processData(semkez, k, v)
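The spider classes in these examples are typically driven through their doWork entry point. A minimal, hypothetical invocation (assuming the project's Spider base class and this module are importable) might look like:

if __name__ == '__main__':
    spider = EthzSpider()
    spider.doWork()  # enumerates departments and semesters, then calls processData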
Example 2: doWork
# Required module import: from utils import Utils [as alias]
# Or alternatively: from utils.Utils import removeDoubleSpace [as alias]
def doWork(self):
    url = "https://projects.apache.org/json/foundation/projects.json"
    r = requests.get(url)
    print r.text
    jobj = json.loads(r.text)
    utils = Utils()
    #print jobj
    file_name = self.get_file_name("eecs/apache/" + "apache-projects", self.school)
    file_lines = self.countFileLineNum(file_name)
    f = self.open_db(file_name + ".tmp")
    self.count = 0
    for obj in jobj:
        desc = "description:"
        title = jobj[obj]['name']
        link = jobj[obj]['homepage']
        if jobj[obj].get("description") != None:
            desc += utils.removeDoubleSpace(jobj[obj]['description'].strip().replace('\n', ''))
        print title
        self.count += 1
        self.write_db(f, "apache-project-" + str(self.count), title[title.find(" ") :].strip(), link, desc)
    self.close_db(f)
    if self.count > 0:
        self.do_upgrade_db(file_name)
        print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
    else:
        self.cancel_upgrade(file_name)
        print "no need upgrade\n"
Example 3: doWork
# Required module import: from utils import Utils [as alias]
# Or alternatively: from utils.Utils import removeDoubleSpace [as alias]
def doWork(self):
    utils = Utils()
    r = requests.get('http://www.cdf.toronto.edu/cs_courses/current_course_web_pages.html')
    soup = BeautifulSoup(r.text)
    file_name = self.get_file_name("eecs/" + self.subject, self.school)
    file_lines = self.countFileLineNum(file_name)
    f = self.open_db(file_name + ".tmp")
    self.count = 0
    for li in soup.find_all('li'):
        if li.a != None:
            line = utils.removeDoubleSpace(li.text.replace('\n', ''))
            line = line.replace('CSC ', 'CSC')
            course_id = line[0 : line.find(' ')]
            if course_id.startswith('CSC') == False:
                continue
            title = line[line.find(' ') : ].strip()
            print course_id + ' ' + title
            self.count += 1
            self.write_db(f, course_id, title, li.a['href'])
    self.close_db(f)
    if file_lines != self.count and self.count > 0:
        self.do_upgrade_db(file_name)
        print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
    else:
        self.cancel_upgrade(file_name)
        print "no need upgrade\n"
Example 4: processTR35
# Required module import: from utils import Utils [as alias]
# Or alternatively: from utils.Utils import removeDoubleSpace [as alias]
def processTR35(self):
    utils = Utils()
    for i in range(0, 3):
        year = str(2013 + i)
        r = requests.get('http://www.technologyreview.com/lists/innovators-under-35/' + year)
        soup = BeautifulSoup(r.text)
        ul = soup.find('ul', class_='people')
        soup = BeautifulSoup(ul.prettify())
        file_name = self.get_file_name(self.subject + "/mit-tr35/tr35-" + year + "#", '')
        file_name = file_name[0 : file_name.find('#')]
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0
        for li in soup.find_all('li'):
            data = utils.removeDoubleSpace(li.text.strip().replace('\t', '').replace('\n', ''))
            title = data[0 : data.find(',')].strip()
            desc = 'description:' + data[data.find(',') + 1 :].strip()
            print title
            print desc
            self.count += 1
            self.write_db(f, 'tr35-' + year + '-' + str(self.count), title, 'http://www.technologyreview.com/' + li.a['href'], desc)
        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"
Example 5: processMacArthur
# Required module import: from utils import Utils [as alias]
# Or alternatively: from utils.Utils import removeDoubleSpace [as alias]
def processMacArthur(self, url):
    utils = Utils()
    r = requests.get(url)
    soup = BeautifulSoup(r.text)
    file_name = self.get_file_name(self.subject + "/macArthur-all-fellows", '')
    file_lines = self.countFileLineNum(file_name)
    f = self.open_db(file_name + ".tmp")
    self.count = 0
    for table in soup.find_all('table', class_='multicol'):
        sp = BeautifulSoup(table.prettify())
        for li in sp.find_all('li'):
            url = ''
            if li.a != None:
                url = 'https://en.wikipedia.org' + li.a['href']
            data = utils.removeDoubleSpace(li.text.strip().replace('\n', ''))
            title = data[0 : data.find(',')].strip()
            desc = "description:" + data[data.find(',') + 1 :].strip()
            print title
            self.count += 1
            self.write_db(f, 'macArthur-fellow-' + str(self.count), title, url, desc)
    self.close_db(f)
    if file_lines != self.count and self.count > 0:
        self.do_upgrade_db(file_name)
        print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
    else:
        self.cancel_upgrade(file_name)
        print "no need upgrade\n"
Example 6: processComputerScienceData
# Required module import: from utils import Utils [as alias]
# Or alternatively: from utils.Utils import removeDoubleSpace [as alias]
def processComputerScienceData(self, url):
    r = requests.get(url)
    utils = Utils()
    user_url = ''
    last_line = ''
    last_citations = ''
    remark = 'description:'
    file_name = self.get_file_name("eecs/people/computer-science-citations", self.school)
    file_lines = self.countFileLineNum(file_name)
    f = self.open_db(file_name + ".tmp")
    self.count = 0
    good_line = False
    for line in r.text.split('\n'):
        good_line = False
        remark = 'description:'
        if line.strip() == '':
            continue
        if line.find('<') != -1 and line.find('>') != -1:
            line = utils.clearHtmlTag(line).strip()
        else:
            line = line.strip()
        if len(line) < 5 or line.find('<a href') != -1:
            last_line = line
            continue
        if last_line != '':
            if last_line[0 : 1].isdigit():
                good_line = True
                line = utils.clearHtmlTag(last_line + ' ' + line)
            last_line = ''
        else:
            if line[0 : 1].isdigit() and line.find('(') > 0:
                good_line = True
        line = utils.removeDoubleSpace(line.replace('\n', ''))
        if good_line == False:
            continue
        citations = line[0 : line.find(" ")]
        person = line[line.find(" ") + 1 : line.find("(")]
        place = line[line.find('(') + 1 : line.find(')')]
        info = line[line.find(')') + 2 :].strip()
        #print citations
        title = person
        #print info
        remark += citations + ' citations, ' + info
        if citations != last_citations:
            self.count += 1
            last_citations = citations
        if title.find('>') != -1:
            title = title[title.find('>') + 1 :].strip()
        self.write_db(f, 'csc-' + str(self.count), title, '', 'university:' + place + ' ' + remark)
    self.close_db(f)
    if file_lines != self.count and self.count > 0:
        self.do_upgrade_db(file_name)
        print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
    else:
        self.cancel_upgrade(file_name)
        print "no need upgrade\n"
Example 7: doWork
# Required module import: from utils import Utils [as alias]
# Or alternatively: from utils.Utils import removeDoubleSpace [as alias]
def doWork(self):
    utils = Utils()
    r = requests.get('http://jeffhuang.com/best_paper_awards.html')
    soup = BeautifulSoup(r.text)
    file_name = self.get_file_name("eecs/best_cs_paper", "")
    file_lines = self.countFileLineNum(file_name)
    f = self.open_db(file_name + ".tmp")
    self.count = 0
    for table in soup.find_all('table'):
        sp = BeautifulSoup(table.prettify())
        #for thead in sp.find_all('thead'):
        #    print thead.text.strip()
        year = ''
        line = ''
        conf = ''
        for td in sp.find_all('td'):
            if (td.text.strip() == 'Microsoft Research'):
                break
            if td.a != None and td.a.attrs.has_key('name') and False == td.a.attrs.has_key('href'):
                conference = td.a.text.strip()
                conference = conference[0 : conference.find('(')].strip().lower()
                continue
            #utils.removeDoubleSpace(tbody.text).strip()
            text = utils.removeDoubleSpace(td.text.replace('; et al.', '')).strip()
            if (len(text) == 4):
                if len(line.strip()) > 4:
                    if conf == '':
                        conf = conference
                    self.writeLines(f, line, conf + '-' + year)
                    conf = conference
                    line = ''
                    line += text + '|'
                else:
                    line += text + '|'
                year = text
            else:
                line += text + '|'
        if len(line.strip()) > 4:
            self.writeLines(f, line, conference + '-' + year)
    self.close_db(f)
    if file_lines != self.count and self.count > 0:
        self.do_upgrade_db(file_name)
        print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
    else:
        self.cancel_upgrade(file_name)
        print "no need upgrade\n"
Example 8: CSAILSpider
# Required module import: from utils import Utils [as alias]
# Or alternatively: from utils.Utils import removeDoubleSpace [as alias]
class CSAILSpider(Spider):
    def __init__(self):
        Spider.__init__(self)
        self.school = 'csail'
        self.utils = Utils()

    def doWork(self):
        r = requests.get('http://courses.csail.mit.edu/')
        soup = BeautifulSoup(r.text)
        file_name = self.get_file_name("eecs/ai", self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0
        for li in soup.find_all('li'):
            title = self.utils.removeDoubleSpace(li.a.text.strip())
            title = title.replace('\t', '')
            course_num = ''
            self.count += 1
            if title.find('.') != -1:
                course_num = title[0 : title.find(' ')]
                title = title[title.find(' ') + 1 :]
            else:
                course_num += self.school + '-' + str(self.count)
            print title
            self.write_db(f, course_num, title, 'http://courses.csail.mit.edu/' + li.a['href'])
        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"
Example 9: AiPapersSpider
# Required module import: from utils import Utils [as alias]
# Or alternatively: from utils.Utils import removeDoubleSpace [as alias]
class AiPapersSpider(Spider):
    def __init__(self):
        Spider.__init__(self)
        self.school = 'aipapers'
        self.util = Utils()

    def doWork(self, jobj=None):
        if jobj == None:
            r = requests.get('http://cs.stanford.edu/people/karpathy/scholaroctopus/out.json')
            jobj = json.loads(r.text)
        paper_dict = {}
        for paper in jobj:
            key = paper['conference'] + '-' + str(paper['year'])
            if paper_dict.get(key, '') == '':
                paper_dict[key] = []
            paper_dict[key].append(paper)
        for key, v in sorted([(k, paper_dict[k]) for k in sorted(paper_dict.keys())]):
            #print key + ' paper:' + str(len(paper_dict[key]))
            print 'processing ' + key
            file_name = self.get_file_name("eecs/" + self.school.lower() + '/' + key, "scholaroctopus")
            file_lines = self.countFileLineNum(file_name)
            if file_lines == len(v):
                continue
            f = self.open_db(file_name + ".tmp")
            self.count = 0
            for paper in sorted(v, key=lambda item: item['title']):
                self.count += 1
                paper_id = key.lower() + '-' + str(self.count)
                self.write_db(f, paper_id, paper['title'].strip(), paper['pdf'], 'author:' + ', '.join(paper['authors']))
                print paper_id + ' ' + paper['title'].strip()
            self.close_db(f)
            if file_lines != self.count and self.count > 0:
                self.do_upgrade_db(file_name)
                print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
            else:
                self.cancel_upgrade(file_name)
                print "no need upgrade\n"

    def initPaperObj(self, conference, year):
        paper_obj = {}
        paper_obj['conference'] = conference
        paper_obj['x'] = 0.34087790145304875
        paper_obj['y'] = 0.25455838359147459
        paper_obj['year'] = year
        paper_obj['pdf'] = ''
        paper_obj['authors'] = ''
        paper_obj['title'] = ''
        return paper_obj

    def getCvPaper(self):
        conference = ''
        year = ''
        r = requests.get('http://cvpapers.com')
        sp = BeautifulSoup(r.text)
        ul = sp.find('ul')
        sp = BeautifulSoup(ul.prettify())
        for li in sp.find_all('li'):
            conference = li.text[0 : li.text.find(':')].strip().lower()
            if conference.find('(') != -1:
                conference = conference[0 : conference.find('(')].strip().lower()
            sp1 = BeautifulSoup(li.prettify())
            for a in sp1.find_all('a'):
                year = a.text.strip()
                if year.startswith('BMVC'):
                    continue
                r = requests.get('http://cvpapers.com/' + a['href'])
                print conference + ' ' + year
                soup = BeautifulSoup(r.text)
                paper_list = []
                i = 0
                for dl in soup.find_all('dl'):
                    if dl.parent.name == 'dl':
                        continue
                    soup1 = BeautifulSoup(dl.prettify())
                    paper_obj = {}
                    for dt in soup1.find_all('dt'):
                        paper_obj = self.initPaperObj(conference, year)
                        #print dt.text[0 : dt.text.find('(')].strip()
                        if dt.text.find('(') != -1:
                            paper_obj['title'] = self.util.removeDoubleSpace(dt.text[0 : dt.text.find('(')].replace('\n', '')).strip()
                        else:
                            paper_obj['title'] = self.util.removeDoubleSpace(dt.text.replace('\n', ''))
                        soup2 = BeautifulSoup(dt.prettify())
                        for a in soup2.find_all('a'):
                            paper_obj[a.text.strip().lower()] = a['href']
                        if paper_obj.get('pdf', '') == '':
                            paper_obj['pdf'] = 'https://scholar.google.com/scholar?hl=en&q=' + paper_obj['title']
                        paper_list.append(paper_obj)
                    if len(soup1.find_all('dd')) != len(soup1.find_all('dt')):
                        i += len(soup1.find_all('dt'))
                        continue
                    for dd in soup1.find_all('dd'):
                        author_list = []
                        #......... some code omitted here .........
Example 10: ProjectsSpider
# Required module import: from utils import Utils [as alias]
# Or alternatively: from utils.Utils import removeDoubleSpace [as alias]
class ProjectsSpider(Spider):
    def __init__(self):
        Spider.__init__(self)
        self.school = "projects"
        self.utils = Utils()

    def doWork(self):
        """
        self.getDARPAProjects()
        self.getDARPAOpenProjects()
        self.getAIProjects()
        self.getDotNetFoundationProjects()
        self.getMicrosoftResearch()
        self.getAICProjects()
        self.getDARPAWikiProjects()
        """
        self.getSRIProject()

    def getSRIProject(self):
        r = requests.get("http://www.sri.com/work/projects")
        soup = BeautifulSoup(r.text)
        for a in soup.find_all("a"):
            if a["href"].startswith("/work/projects/"):
                if a.parent != None and a.parent.prettify().startswith("<span"):
                    subject = a.text.replace(" ", "-")
                    file_name = self.get_file_name(self.school + "/SRI/" + subject, self.school)
                    file_lines = self.countFileLineNum(file_name)
                    f = self.open_db(file_name + ".tmp")
                    self.count = 0
                    r2 = requests.get("http://www.sri.com" + a["href"])
                    soup2 = BeautifulSoup(r2.text)
                    for div in soup2.find_all("div", class_="events_inner"):
                        soup3 = BeautifulSoup(div.prettify())
                        title = soup3.find("div", class_="events_inner_title").text.strip()
                        link = "http://www.sri.com" + soup3.find("div", class_="events_inner_title").a["href"]
                        desc = "description:" + soup3.find("div", class_="events_inner_teaser").text.strip()
                        print title
                        self.count += 1
                        self.write_db(f, "sri-project-" + str(self.count), title, link, desc)
                    self.close_db(f)
                    if file_lines != self.count and self.count > 0:
                        self.do_upgrade_db(file_name)
                        print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
                    else:
                        self.cancel_upgrade(file_name)
                        print "no need upgrade\n"

    def getAICProjects(self):
        r = requests.get("http://www.ai.sri.com/project_list/mode=All&sort=titleAsc")
        soup = BeautifulSoup(r.text)
        pages = 0
        file_name = self.get_file_name("eecs/" + self.school + "/" + "AIC", self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0
        for a in soup.find_all("a"):
            if a.text.strip() == "End":
                pages = int(a["href"][len(a["href"]) - 1 :])
        for page in range(0, pages + 1):
            r2 = requests.get("http://www.ai.sri.com/project_list/mode=All&sort=titleAsc&page=" + str(page))
            soup2 = BeautifulSoup(r2.text)
            for td in soup2.find_all("td", class_="project"):
                if td.h2 != None:
                    title = td.h2.a.text
                    desc = "description:" + self.utils.removeDoubleSpace(td.p.text.replace("\n", ""))
                    print title
                    self.count += 1
                    self.write_db(
                        f, "aic-project-" + str(self.count), title, "http://www.ai.sri.com" + td.h2.a["href"], desc
                    )
        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"

    def getMicrosoftResearch(self):
        file_name = self.get_file_name("eecs/" + self.school + "/" + "microsoft-research", self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0
        for page in range(1, 13):
            r = requests.get(
                "http://research.microsoft.com/apps/catalog/default.aspx?p="
                + str(page)
                + "&sb=no&ps=100&t=projects&sf=&s=&r=&vr=&ra="
            )
            soup = BeautifulSoup(r.text)
            for div in soup.find_all("div", class_="l"):
                sp = BeautifulSoup(div.prettify())
                name_div = sp.find("div", class_="name")
                desc_div = sp.find("div", class_="desc")
                title = name_div.a.text.strip()
                desc = "description:" + self.utils.removeDoubleSpace(desc_div.text.strip().replace("\n", ""))
                #......... some code omitted here .........
Example 11: LabsSpider
# Required module import: from utils import Utils [as alias]
# Or alternatively: from utils.Utils import removeDoubleSpace [as alias]
    #......... some code omitted here .........
        r = requests.get(url)
        soup = BeautifulSoup(r.text)
        for p in soup.find_all('p', class_='TrigListings'):
            self.count += 1
            self.write_db(f, "mit-" + subject.replace(' ', '-').lower() + "-" + str(self.count), p.a.text, p.a['href'])
            print p.a.text
        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"

    def getBerkeleyEECSLabs(self):
        r = requests.get('http://www.eecs.berkeley.edu/Research/Areas/Centers/')
        soup = BeautifulSoup(r.text)
        start = False
        file_name = self.get_file_name(self.school + "/" + "berkeley-eecs-labs", self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0
        for li in soup.find_all('li'):
            if li.a != None and li.a['href'].startswith('http'):
                title = li.a.text.strip()
                if title.startswith('Algorithms'):
                    start = True
                if start:
                    self.count += 1
                    self.write_db(f, 'berkeley-eecs-lab-' + str(self.count), title, li.a['href'], 'description:' + self.utils.removeDoubleSpace(li.p.text.strip().replace('\n', '')))
                    print title
        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"

    def getSocialRobotLabs(self):
        r = requests.get('https://en.wikipedia.org/wiki/Social_robot')
        soup = BeautifulSoup(r.text)
        file_name = self.get_file_name(self.school + "/" + "Social-robot-labs", self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0
        for a in soup.find_all('a', class_='external text'):
            print a.text
            self.count += 1
            self.write_db(f, 'social-robot-lab-' + str(self.count), a.text, a['href'])
            if a.text.startswith('Department'):
                break
        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"
Example 12: ProjectPaperSpider
# Required module import: from utils import Utils [as alias]
# Or alternatively: from utils.Utils import removeDoubleSpace [as alias]
class ProjectPaperSpider(Spider):
    def __init__(self):
        Spider.__init__(self)
        self.utils = Utils()
        self.school = "project-papers"

    def doWork(self):
        self.getWastonPapers()
        self.getSTAIRPapers()
        self.getSTARTPapers()
        self.getSparkPapers()
        self.getHadoopPapers()
        #self.getRoboBrainPapers()
        self.getMobileEyePapers()
        self.getAutonomousDrivingPapers()
        self.getCALOPapers()

    def getCALOPapers(self):
        r = requests.get('http://www.ai.sri.com/pubs/search.php?project=179')
        soup = BeautifulSoup(r.text)
        file_name = self.get_file_name("eecs/" + self.school + "/" + "CALO", self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0
        for li in soup.find_all('li'):
            title = li.p.strong.text.strip()
            link = 'http://www.ai.sri.com' + li.p.a['href']
            print title
            self.count += 1
            self.write_db(f, 'calo-' + str(self.count), title, link)
        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"

    def getAutonomousDrivingPapers(self):
        r = requests.get("http://driving.stanford.edu/papers.html")
        soup = BeautifulSoup(r.text)
        title = ""
        author = ""
        journal = ""
        desc = ""
        url = ""
        file_name = self.get_file_name("eecs/" + self.school + "/" + "AutonomousDriving", self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0
        for p in soup.find_all("p"):
            if p.span != None:
                sp = BeautifulSoup(p.prettify())
                title = sp.find('span', class_='papertitle').text.strip()
                author = "author:" + self.utils.removeDoubleSpace(sp.find('span', class_='authors').text.strip().replace('\n', '')) + " "
                journal = "journal:" + sp.find('span', class_='meeting').text.strip()
                journal += " " + sp.find('span', class_='year').text.strip() + " "
                desc = "description:" + self.utils.removeDoubleSpace(sp.find('span', class_='summary').text.strip().replace('\n', ''))
            if p.a != None and p.a['href'].find(".pdf") != -1:
                if p.a['href'].startswith('http'):
                    url = p.a['href']
                else:
                    url = 'http://driving.stanford.edu/' + p.a['href']
                self.count += 1
                self.write_db(f, "autonomousdriving-paper-" + str(self.count), title, url, author + journal + desc)
                title = ""
                author = ""
                journal = ""
                desc = ""
                url = ""
        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"

    def getMobileEyePapers(self):
        file_name = self.get_file_name("eecs/" + self.school + "/" + "MobileEye", self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0
        for i in range(1, 3):
            r = requests.get("http://www.mobileye.com/technology/mobileye-research/page/" + str(i))
            soup = BeautifulSoup(r.text)
            for div in soup.find_all("div", class_="ContentItemText"):
                title = div.h2.text.strip()
                link = div.h2.a['href']
                author = "author:" + div.p.text.strip()
                print title
                self.count += 1
                self.write_db(f, "mobileeye-paper-" + str(self.count), title, link, author)
        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            #......... some code omitted here .........
Example 13: HarvardOnlineSpider
# Required module import: from utils import Utils [as alias]
# Or alternatively: from utils.Utils import removeDoubleSpace [as alias]
    #......... some code omitted here .........
                        link = "http://www.extension.harvard.edu" + str(li.a["href"]).strip()
                        if self.deep_mind:
                            link, description = self.getMoreInfo(link)
                        count = count + 1
                        print course_num + " " + title + " " + link
                        self.write_db(f, course_num, title, link, description)
                        if course_dict != None:
                            if course_num.startswith("CSCI"):
                                course_num = course_num.replace("CSCI", "CS")
                            course_dict[course_num] = CourseRecord(
                                self.get_storage_format(course_num, title, link, description)
                            )
        else:
            for li in soup.find_all("li"):
                if li.attrs.has_key("class"):
                    if li.prettify().find("E-") != -1 and str(li.a["href"]).startswith("/courses"):
                        for item in li.prettify().split("\n"):
                            if item.find("E-") != -1:
                                course_num = item.replace(" E-", "").strip()
                        count = count + 1
                        title = li.a.string.strip()
                        link = "http://www.extension.harvard.edu" + str(li.a["href"]).strip()
                        if self.deep_mind:
                            link, description = self.getMoreInfo(link)
                        print course_num + " " + title + " " + link
                        self.write_db(f, course_num, title, link, description)
                        if course_dict != None:
                            if course_num.startswith("E-"):
                                course_num = course_num.replace("CSCI", "CS")
                            course_dict[course_num] = CourseRecord(
                                self.get_storage_format(course_num, title, link, description)
                            )
        self.close_db(f)
        if file_lines != count and count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"

    def getCourseDict(self, subject):
        course_dict = {}
        r = requests.get(self.url)
        soup = BeautifulSoup(r.text)
        for li in soup.find_all("li", class_="is-more-items"):
            if li.a.string.lower() == subject.lower():
                self.getHarvardOnlineCourse(
                    li.a.string, "http://www.extension.harvard.edu" + str(li.a["href"]), course_dict
                )
        return course_dict

    def getHarvardCourse(self, subject, url):
        r = requests.get(url)
        soup = BeautifulSoup(r.text)
        code = ""
        file_name = self.get_file_name(subject.lower(), self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0
        for div in soup.find_all("div", class_="view-group"):
            sp = BeautifulSoup(div.prettify())
            for div2 in sp.find_all("div"):
                code = div2.span.text.strip()
                link = "http://www.extension.harvard.edu" + div2.a["href"]
                title = self.utils.removeDoubleSpace(div2.text.replace("\n", "")).replace(" E- ", "-E")
                for t in title[title.find(code.upper()) :].split(code.upper()):
                    if t != "":
                        title = code.upper() + t
                course_num = title[0 : title.find(" ")].strip()
                if title.find("(") != -1:
                    title = title[title.find(" ") : title.find("(")].strip()
                else:
                    title = title[title.find(" ") :].strip()
                print course_num + " " + title
                self.count += 1
                self.write_db(f, course_num, title, link)
        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"

    def doWork(self):
        print "downloading harvard online course info"
        # r = requests.get("http://www.extension.harvard.edu/courses/subject/computer-science")
        r = requests.get(self.url)
        soup = BeautifulSoup(r.text)
        # for li in soup.find_all("li", class_ = "is-more-items"):
        #     self.getHarvardOnlineCourse(li.a.string, "http://www.extension.harvard.edu" + str(li.a["href"]))
        for div in soup.find_all("div", class_="view-content"):
            print div.a.text
            if self.need_update_subject(div.a.text) == False:
                continue
            self.getHarvardCourse(div.a.text, "http://www.extension.harvard.edu" + div.a["href"])
Example 14: GraphPapersSpider
# Required module import: from utils import Utils [as alias]
# Or alternatively: from utils.Utils import removeDoubleSpace [as alias]
class GraphPapersSpider(Spider):
    def __init__(self):
        Spider.__init__(self)
        self.school = 'GraphPapers'
        self.util = Utils()

    def getPapers(self, conference, year, link):
        print link
        r = requests.get(link)
        soup = BeautifulSoup(r.text)
        paper_list = []
        i = 0
        file_name = self.get_file_name("eecs/" + self.school.lower() + '/' + conference + year, '')
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0
        for dl in soup.find_all('dl'):
            soup1 = BeautifulSoup(dl.prettify())
            for dt in soup1.find_all('dt'):
                title = dt.text.strip().replace('\n', '')
                #if title.startswith('('):
                #    continue
                if title.find('(') != -1:
                    title = title[0 : title.find('(')].strip()
                paper_list.append(title)
            if len(soup1.find_all('dd')) != len(soup1.find_all('dt')):
                i += len(soup1.find_all('dt'))
                continue
            for dd in soup1.find_all('dd'):
                author_list = []
                if dd.text.find('),') != -1:
                    for author in self.util.removeDoubleSpace(dd.text.strip().replace('\n', '')).split('),'):
                        author_list.append(author + ')')
                elif dd.text.find(',') != -1:
                    for author in self.util.removeDoubleSpace(dd.text.strip().replace('\n', '')).split(','):
                        author_list.append(author)
                else:
                    author_list.append(self.util.removeDoubleSpace(dd.text.strip().replace('\n', '')))
                if paper_list[i] != '':
                    self.count += 1
                    paper_id = conference + year + '-' + str(self.count)
                    self.write_db(f, paper_id, paper_list[i], '', 'author:' + ' '.join(author_list))
                    print paper_list[i] + ' ' + ','.join(author_list)
                i += 1
        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"

    def doWork(self):
        r = requests.get('http://kesen.realtimerendering.com/')
        for line in r.text.split('\n'):
            if line.startswith('<li><a') and line.lower().find('.htm') != -1 and line.find('http://') == -1:
                data = line[line.find('"') + 1 : line.find('"', line.find('"') + 1)]
                if data.find('20') == -1:
                    continue
                conference = data[0 : data.find('20')]
                year = ''
                if data.lower().find('paper') != -1:
                    year = data[data.find('20') : data.lower().find('paper')]
                else:
                    year = data[data.find('20') : data.find('.htm')]
                print conference + ' - - - - ' + year
                self.getPapers(conference, year, 'http://kesen.realtimerendering.com/' + data)
Example 15: ProjectsSpider
# Required module import: from utils import Utils [as alias]
# Or alternatively: from utils.Utils import removeDoubleSpace [as alias]
    #......... some code omitted here .........
        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"

    def getMitMediaProjects(self):
        r = requests.get('https://www.media.mit.edu/research/groups-projects')
        soup = BeautifulSoup(r.text)
        for span in soup.find_all('span', class_='field-content'):
            if span.a != None and span.a.text.startswith('more') == False:
                subject = span.a.text.strip().lower().replace(' ', '-')
                print subject
                file_name = self.get_file_name("projects/MIT-MEDIA-LAB/" + subject, self.school)
                file_lines = self.countFileLineNum(file_name)
                f = self.open_db(file_name + ".tmp")
                self.count = 0
                r = requests.get('https://www.media.mit.edu' + span.a['href'])
                sp = BeautifulSoup(r.text)
                for li in sp.find_all('li'):
                    if li.div != None and li.div.h2 != None:
                        link = ''
                        title = ''
                        desc = ''
                        sp1 = BeautifulSoup(li.div.prettify())
                        for a in sp1.find_all('a'):
                            if a.text.strip() == 'view site':
                                link = a['href']
                        title = li.div.h2.text.strip()
                        print title
                        desc = 'description:' + self.utils.removeDoubleSpace(li.div.div.text.strip().replace('\n', ''))
                        self.count += 1
                        self.write_db(f, "mit-media-" + subject + '-' + str(self.count), title, link, desc)
                self.close_db(f)
                if file_lines != self.count and self.count > 0:
                    self.do_upgrade_db(file_name)
                    print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
                else:
                    self.cancel_upgrade(file_name)
                    print "no need upgrade\n"

    def getOpenSourceRobotProjects(self):
        r = requests.get('https://en.wikipedia.org/wiki/Open-source_robotics')
        soup = BeautifulSoup(r.text)
        soup = BeautifulSoup(soup.find('table', class_='wikitable').prettify())
        file_name = self.get_file_name("eecs/projects/" + "open-source-robot", self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0
        for a in soup.find_all('a'):
            title = self.utils.removeDoubleSpace(a.text.strip().replace('\n', ''))
            if title.startswith('[') or title == 'Arduino' or title == 'Self-balancing robot' or title == 'Modular design':
                continue
            print title
            self.count += 1
            self.write_db(f, "open-source-robot-project-" + str(self.count), title, 'https://en.wikipedia.org' + a['href'])
        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"