This article collects typical usage examples of the Python method scraper.Scraper.setup_session. If you are wondering what Scraper.setup_session does, how to call it, or where to find examples of it, the curated code examples below may be just the help you need. You can also explore further usage examples of the class the method belongs to, scraper.Scraper.
Two code examples of Scraper.setup_session are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
Example 1: start_threads_for_letter
# Required import: from scraper import Scraper [as alias]
# Or alternatively: from scraper.Scraper import setup_session [as alias]
import conf  # project configuration module (assumed; provides URLs, headers, and search parameters)
from scraper import Scraper

def start_threads_for_letter(startLetter):
    # Write every record for this starting letter to its own output file.
    outFn = "voters_" + str(startLetter) + ".txt"
    outFile = open(outFn, 'w')
    print("Getting records starting with " + startLetter)
    # Prime the scraping session by visiting the base and roll-search pages first.
    scp = Scraper(conf.sessionHeaders, conf.searchHeaders)
    scp.setup_session([conf.baseUrl, conf.rollSearchUrl])
    url = conf.searchUrl
    params = conf.searchParams
    params['electorName'] = startLetter
    scp.get_and_write_records(url, 0, params, outFile)
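The calling code is not part of this example. The sketch below is a minimal, hypothetical driver that spawns one worker thread per starting letter; the letter set and the one-thread-per-letter layout are assumptions, not taken from the original project.

import string
import threading

# Hypothetical driver: one worker thread per starting letter (assumed layout).
threads = []
for letter in string.ascii_uppercase:
    t = threading.Thread(target=start_threads_for_letter, args=(letter,))
    t.start()
    threads.append(t)
for t in threads:
    t.join()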
Example 2: __init__
# Required import: from scraper import Scraper [as alias]
# Or alternatively: from scraper.Scraper import setup_session [as alias]
import random
import re
import time

from bs4 import BeautifulSoup as BS

import conf  # project configuration module (assumed; provides URLs, headers, and request parameters)
from scraper import Scraper

class MyParser:
    def __init__(self):
        pass

    def init_web(self):
        # Build the scraper and establish the session before any details requests are made.
        self.detailsParams = conf.detailsParams
        self.detailsUrl = conf.detailsUrl
        self.scp = Scraper(conf.sessionHeaders, conf.searchHeaders, [conf.baseUrl, conf.rollSearchUrl])
        self.scp.setup_session()

    def parse_details_tag(self, rec):
        # The seventh field of the record holds an HTML link containing the paramValue token.
        urlTag = re.search(r"paramValue=(\w+)\"", rec[6])
        if not urlTag:
            raise Exception('Cannot find a details url in the record: \n' + str(rec))
        return urlTag.group(1)

    def request_and_parse(self, rec):
        # startTime = datetime.now()
        # print(str(rec[0]).encode('utf-8'))
        detailsTag = self.parse_details_tag(rec)
        # print("After paramvalue RE search: ", (datetime.now() - startTime).total_seconds())
        self.detailsParams['paramValue'] = detailsTag
        resp = self.scp.get_response(self.detailsUrl, self.detailsParams, 3)
        # print("After GET: ", (datetime.now() - startTime).total_seconds())
        checkInvalid = re.search("Invalid access to the page", resp.text)
        if checkInvalid:
            print("Looks like the record for %s %s is not in the database anymore. Continuing on..." % (rec[0], rec[1]))
            return None
        # print(resp.request.url)
        soup = BS(resp.content, 'html.parser')
        parsedDetails = []
        tables = soup.find_all('table')
        # Go through all the info in the details page and extract it row by row.
        for table in tables:
            rows = table.find_all('tr')
            for row in rows:
                cols = row.find_all('td')
                txtCols = []
                # txtCols = [ele.text.strip() for ele in cols]
                # print([ele for ele in cols if ele.has_attr('a')])
                for ele in cols:
                    txtCols.append(ele.text.strip())
                    if ele.a:
                        # Also keep the token that follows "Value=" in any link inside the cell.
                        txtCols.append(ele.a.get('href').split("Value=")[1].strip())
                # Drop empty cells and keep the rest as one parsed row.
                parsedRow = [ele for ele in txtCols if ele]
                parsedDetails.append(parsedRow)
        # print("After all processing: ", (datetime.now() - startTime).total_seconds())
        time.sleep(random.uniform(0.5, 1))
        return parsedDetails
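The example does not show how MyParser is driven. Below is a minimal, hypothetical usage sketch; the shape of the sample record (a sequence whose first two fields are names and whose seventh field, rec[6], contains the details link HTML) is inferred from parse_details_tag and request_and_parse and is an assumption, not part of the original code.

# Hypothetical usage: parse the details page for one previously scraped record.
parser = MyParser()
parser.init_web()

# Assumed record shape: fields 0 and 1 are names, field 6 holds the details link HTML.
record = ["LASTNAME", "FIRSTNAME", "", "", "", "", '<a href="details?paramValue=abc123">Details</a>']
details = parser.request_and_parse(record)
if details is not None:
    for row in details:
        print(row)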