This article collects typical usage examples of the mparser.ProfileParser class in Python. If you are wondering what ProfileParser is for or how to use it, the curated examples below should help.
It presents 15 code examples of the ProfileParser class, sorted by popularity by default.
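All 15 examples share the same shape: parse a fetched faculty-profile page with BeautifulSoup, pick out the block that holds the CV, and feed its text lines to ProfileParser together with an Employee record. The Employee class itself is not shown on this page; the sketch below is reconstructed purely from the attributes the examples assign, so treat the field list as an assumption rather than the project's actual definition.

# Hypothetical sketch of the Employee record used throughout the examples,
# reconstructed from attribute usage below; NOT the project's real class.
class Employee(object):
    def __init__(self, name=None, url=None):
        self.name = name        # person's name
        self.url = url          # homepage URL
        self.profile = None     # link to the profile page (see Example 14)
        self.title = None       # job title
        self.email = None
        self.tel = None
        self.research = None    # research interests
        self.departments = None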
Example 1: profile_handler
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    # Only record the name and homepage; the raw CV page is saved
    # separately as an HTML file.
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"class": "xinwen-txt_3"}, limit=1)
    if not divs:
        return employee
    div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)
    # Process the profile as plain text, line by line.
    lines = div.stripped_strings
    parser = ProfileParser(lines=lines, employee=employee,
                           set_attr_hook=set_attr_hook, ignore=set(['fax']))
    return parser.parse()
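Most of the examples pass a set_attr_hook callback whose definition is not included on this page. A minimal sketch, assuming the parser calls it just before storing a parsed field (both the signature and the return contract are guesses):

# Hypothetical hook; assumed signature set_attr_hook(employee, attr, value).
def set_attr_hook(employee, attr, value):
    value = ''.join(value.split())       # normalize whitespace
    if attr == 'email':
        value = value.replace('#', '@')  # undo a common anti-spam obfuscation
    setattr(employee, attr, value)
    return True  # assumed: True tells the parser the field was handled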
Example 2: profile_handler
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    # Only record the name and homepage; the raw CV page is saved
    # separately as an HTML file.
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", class_="content-wrapper", limit=1)
    if not divs:
        div = soup
    else:
        div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)
    infos_div = div.find_all('div', attrs={"id": "column-1"})
    if infos_div:
        div = infos_div[0]
    # Process the profile as plain text, line by line.
    lines = div.stripped_strings
    parser = ProfileParser(lines=lines, employee=employee,
                           set_attr_hook=set_attr_hook, max_line=256)
    return parser.parse()
Example 3: profile_handler
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    # Only record the name and homepage; the raw CV page is saved
    # separately as an HTML file.
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"class": "NewsArticles"}, limit=1)
    if not divs:
        div = soup
    else:
        div = divs[0]
    if not os.path.exists(filename):
        with open(filename, "wb") as fp:
            content = div.prettify()
            fp.write(content)
    # Process the profile as plain text, line by line.
    lines = div.stripped_strings
    parser = ProfileParser(
        lines=lines, employee=employee, set_attr_hook=set_attr_hook,
        max_line=999, force_email=True, force_tel=False
    )
    return parser.parse()
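This example also shows the optional constructor arguments that recur below: max_line, force_email, force_tel, and ignore. Their semantics are not documented on this page; from usage they appear to cap how many text lines are scanned (max_line), force or suppress extraction of a particular field (force_email, force_tel), and skip fields entirely (ignore=set(['fax']) in Example 1). Treat these readings as inference from usage, not as the library's documented API.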
Example 4: profile_handler
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    # Only record the name and homepage; the raw CV page is saved
    # separately as an HTML file.
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", class_="box_rt01 list", limit=1)
    if not divs:
        div = soup
    else:
        div = divs[0]
    with open(filename, 'wb') as fp:
        content = div.prettify()
        fp.write(content)
    h3s = div.find_all('h3')
    if h3s:
        title = ''.join(h3s[0].get_text().split())
        print(title)
        for t in PROFILE_TITLES:
            if t in title:
                employee.title = title
                print("got => " + title)
                break
    else:
        print("not found h3")
    # Process the profile as plain text, line by line.
    lines = div.stripped_strings
    parser = ProfileParser(lines=lines, employee=employee, force_email=True)
    return parser.parse()
Example 5: profile_handler
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    # Only record the name and homepage; the raw CV page is saved
    # separately as an HTML file.
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    div = soup  # fall back to the whole document
    lis = soup.find_all(name="li")
    if lis and len(lis) == 5:
        # The fifth list item links to the actual CV page; fetch and parse it.
        ass = lis[4].find_all('a')
        if ass:
            li_url = ass[0]['href']
            newUrl = urljoin(url, li_url)
            newDoc = get_doc_byUrllib2(newUrl)
            soup = BeautifulSoup(newDoc, Config.SOUP_PARSER)
            mainDiv = soup.find_all('div', attrs={"id": "main"})
            if not mainDiv:
                print("not found main div")
            else:
                div = mainDiv[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)
    # Process the profile as plain text, line by line.
    lines = div.stripped_strings
    parser = ProfileParser(lines=lines, employee=employee,
                           set_attr_hook=set_attr_hook, force_email=True)
    return parser.parse()
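Example 5 reaches the CV through a link fetched with get_doc_byUrllib2, a helper that is not defined on this page. A minimal sketch, assuming it simply downloads the URL and returns the response body (the name suggests Python 2's urllib2):

# Hypothetical fetch helper; the project's real version may add headers,
# encoding handling, or retries.
import urllib2

def get_doc_byUrllib2(url, timeout=30):
    resp = urllib2.urlopen(url, timeout=timeout)
    try:
        return resp.read()
    finally:
        resp.close()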
Example 6: profile_handler
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    # Only record the name and homepage; the raw CV page is saved
    # separately as an HTML file.
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"class": "right-nr"})
    if not divs:
        print("div class=right-nr not found")
        return employee
    div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)
    # Process the profile as plain text, line by line.
    lines = div.stripped_strings
    parser = ProfileParser(lines=lines, employee=employee)
    return parser.parse()
Example 7: profile_handler
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    # Only record the name and homepage; the raw CV page is saved
    # separately as an HTML file.
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", class_="line20 dataName", limit=1)
    if not divs:
        divs = soup.find_all(name="div", class_="rightArea clearfix ", limit=1)
    if not divs:
        div = soup
    else:
        div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)
    # Process the profile as plain text, line by line.
    lines = div.stripped_strings
    parser = ProfileParser(lines=lines, employee=employee,
                           set_attr_hook=set_attr_hook, max_line=999)
    return parser.parse()
Example 8: profile_handler
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="td", attrs={"valign": "center"}, limit=1)
    if not divs:
        div = soup
    else:
        div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)
    # Process the profile as plain text, one table cell per line.
    lines = []
    tds = div.find_all('td')
    if len(tds) == 0:
        lines = div.stripped_strings
        print("TDS none!")
    else:
        for td in tds:
            string = td.get_text().strip()
            if len(string) < 128:  # skip overly long cells
                string = ''.join(string.split())
                print(string)
                lines.append(string)
    parser = ProfileParser(lines=lines, employee=employee,
                           set_attr_hook=profile_set_attr_hook, max_line=256)
    return parser.parse()
Example 9: profile_handler
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    # The markup is too messy: only record the name and homepage; the raw
    # CV page is saved separately as an HTML file.
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="td", class_="bd-content", limit=1)
    if not divs:
        divs = soup.find_all(name="td", attrs={"width": "79%"}, limit=1)
    if not divs:
        with open(filename, "wb") as fp:
            fp.write(doc)
        return employee
    div = divs[0]
    with open(filename, "wb") as fp:
        content = div.prettify()
        fp.write(content)
    # Process the profile as plain text, line by line.
    lines = div.stripped_strings
    parser = ProfileParser(lines=lines, employee=employee)
    return parser.parse()
Example 10: profile_handler
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    # Only record the name and homepage; the raw CV page is saved
    # separately as an HTML file.
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"id": "right_2"}, limit=1)
    if not divs:
        return employee
    div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)
    # The third and fourth matching cells hold the research interests.
    tds = div.find_all(name="td", attrs={"bgcolor": "#FFFFFF", "class": "ft12", "valign": "top"}, limit=4)
    if len(tds) == 4:
        researches = [tds[2].get_text().strip(), tds[3].get_text().strip()]
        employee.research = researches[0] + ";" + researches[1]
        print("research:" + employee.research)
    # Process the profile as plain text, line by line; research was filled
    # above, so tell the parser to skip that field.
    lines = div.stripped_strings
    parser = ProfileParser(lines=lines, employee=employee, ignore=set(['research']))
    return parser.parse()
Example 11: profile_handler
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    # Only record the name and homepage; the raw CV page is saved
    # separately as an HTML file.
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"class": "newsContent"}, limit=1)
    if not divs:
        return employee
    div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)
    details = soup.find_all(name="span", attrs={"id": "ctl00_ContentPlaceHolder1_NewsView1_lbl_NewsContent"}, limit=1)
    if not details:
        return employee
    # Process the profile as plain text, line by line.
    lines = details[0].stripped_strings
    parser = ProfileParser(lines=lines, employee=employee, set_attr_hook=set_attr_hook)
    return parser.parse()
Example 12: handler
def handler(tag):
    employee = Employee()
    name_divs = tag.find_all("div", class_="teacher-title")
    if name_divs:
        employee.name = name_divs[0].get_text()
        employee.name = ''.join(employee.name.split())
    # Process the tag's content as plain text, line by line.
    lines = tag.stripped_strings
    parser = ProfileParser(lines=lines, employee=employee)
    return parser.parse()
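Unlike the page-level profile_handler functions, this handler (and Example 14) receives a single tag, which suggests it is applied to each entry of a staff listing page. A hypothetical driver loop under that assumption ("teacher-item" is an illustrative class name, and the fetch helper is the one sketched under Example 5):

# Hypothetical usage: run the tag-level handler over every teacher block.
doc = get_doc_byUrllib2("http://www.example.edu/faculty/index.html")
soup = BeautifulSoup(doc, Config.SOUP_PARSER)
employees = [handler(tag) for tag in soup.find_all("div", class_="teacher-item")]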
Example 13: profile_handler
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    # Only record the name and homepage; the raw CV page is saved
    # separately as an HTML file.
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"class": "page_right addpage_right"}, limit=1)
    if not divs:
        div = soup
    else:
        div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)
    tds = div.find_all('td')
    if tds and len(tds) == 11:
        # Fixed table layout: department, title, email, and research
        # interests sit in known cells.
        department = ''.join(tds[2].get_text().split())
        if department:
            employee.departments = department
        title = ''.join(tds[4].get_text().split())
        if title:
            employee.title = title
        email = ''.join(tds[8].get_text().split())
        if email:
            employee.email = email
        research = ''.join(tds[10].get_text().split())
        if research:
            employee.research = research
    divs = soup.find_all(name="div", attrs={"class": "text_more"}, limit=1)
    if divs:
        div = divs[0]
    # Process the profile as plain text, line by line.
    lines = div.stripped_strings
    parser = ProfileParser(lines=lines, employee=employee, set_attr_hook=set_attr_hook)
    return parser.parse()
Example 14: handler
def handler(tag):
    employee = Employee()
    ass = tag.find_all('a', class_="orangea")
    if ass:
        employee.name = ''.join(ass[0].get_text().split())
        employee.profile = ass[0]['href']
    ass = tag.find_all('a', class_="black01")
    if ass:
        lines = ass[0].stripped_strings
        parser = ProfileParser(lines=lines, employee=employee)
        employee = parser.parse()
    return employee
Example 15: profile_handler
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    # Only record the name and homepage; the raw CV page is saved
    # separately as an HTML file.
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"id": "maincontent"}, limit=1)
    if not divs:
        div = soup
    else:
        div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)
    divs = div.find_all(class_="other")
    if not divs:
        div = soup
    else:
        div = divs[0]
    # Collect the text of each <span>, with all whitespace stripped.
    lines = []
    for child in div.find_all('span'):
        line = ''.join(child.get_text().split())
        if line:
            lines.append(line)
    if len(lines) == 0:
        return employee
    parser = ProfileParser(lines=lines, employee=employee, set_attr_hook=set_attr_hook)
    return parser.parse()
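Taken together, a typical call site for any of the profile_handler variants would download a profile page and pass in the document, the person's name, the page URL, and an output directory. A hedged end-to-end example, reusing the assumed fetch helper sketched under Example 5 (the URL and names are placeholders):

# Hypothetical end-to-end usage of one of the handlers above.
page_url = "http://www.example.edu/faculty/zhang-san.html"
doc = get_doc_byUrllib2(page_url)
employee = profile_handler(doc, name="ZhangSan", url=page_url, path="./profiles")
print(employee.email)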