This page collects typical usage examples of the ProcessingPool.imap method from the Python package pathos.multiprocessing. If you are wondering what ProcessingPool.imap does, how to call it, or what it looks like in real code, the curated examples below may help. You can also explore further how its containing class, pathos.multiprocessing.ProcessingPool, is used.
The 8 code examples of ProcessingPool.imap shown below are sorted by popularity by default.
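Before the examples, a minimal sketch of the call pattern they all share may be helpful. The worker function and inputs below are illustrative placeholders rather than code from any example: imap hands each item to the pool and returns an iterator right away, so results are consumed lazily, in input order, while the worker processes keep running.
# Minimal sketch; `square` and its inputs are placeholders, not taken from the examples below
from pathos.multiprocessing import ProcessingPool

def square(x):
    # stand-in for the real per-item work (fetching a URL, analyzing a file, ...)
    return x * x

if __name__ == '__main__':
    pool = ProcessingPool(nodes=4)              # pool of 4 worker processes
    result_iter = pool.imap(square, range(10))  # returns immediately with an iterator
    for value in result_iter:                   # results arrive lazily, in input order
        print(value)
    pool.close()
    pool.join()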
Example 1: sleep
# Required import: from pathos.multiprocessing import ProcessingPool [as alias]
# Or: from pathos.multiprocessing.ProcessingPool import imap [as alias]
try:
    get_states = requests.get(nation_url, timeout=(1,60)).text
    break
except:
    sleep(1.5**wait)
    wait += 1
parsed = BeautifulSoup(get_states, 'html.parser')
state_urls = [a['href'] for a in parsed.find('div', class_='newLocUSListArea').find_all('a')]

################
#Get town links#
################
print "Getting town URLs..."
pool = Pool(10)
result_iter = pool.imap(get_town_urls, state_urls)
town_urls = []
for result in result_iter:
    town_urls += result

#Clean up town URLs
town_urls = [re.sub("st\.-","st-",url) for url in town_urls]

#################
#Get paper links#
#################
print "Getting paper URLs..."
result_iter = pool.imap(get_paper_urls, town_urls)
paper_urls = []
Example 2: newspapers_com_scraper
# Required import: from pathos.multiprocessing import ProcessingPool [as alias]
# Or: from pathos.multiprocessing.ProcessingPool import imap [as alias]
#......... part of the code omitted here .........
            sleep(1.5**wait)
            wait += 1
    articles = articles + json.loads(content)['records']
    break
except ValueError:
    groups += 1
return articles
#Get article attributes
def get_from_object(obj, *keys):
    try:
        value = obj
        for k in keys:
            if isinstance(value, dict):
                value = value.get(k)
            elif isinstance(value, list) and len(value)>1:
                value = (item for item in value if item['name'] == k).next()['value']
            elif isinstance(value, list) and len(value)==1:
                value = value[k]
        return value
    except:
        return ''

#Extract article data
def get_article_data(record, search_date):
    line = {}
    line['archive'] = 'newspapers_com'
    line['publication_id'] = get_from_object(record, 'rec', 'cover', 'publicationId')
    line['publication_title'] = get_from_object(record, 'rec', 'pubMetadata', 'publication-title')
    line['search_date'] = search_date
    line['page'] = get_from_object(record, 'rec', 'cover', 'title')
    line['href'] = "http://www.newspapers.com/image/" + str(record['rec']['cover']['id']) + "/?terms=" + record['terms']
    line['search_terms'] = search_terms
    return line

#Scrape function
def scrape(search_terms, day):
    sleep(1)
    print day
    #Create search query
    query_form = make_search_query(search_terms, day, 1000)
    #POST search query
    wait = 0
    while True:
        try:
            matches = session.post(search_url, data = query_form, cookies=session.cookies, allow_redirects=True, headers={'referer' : 'http://www.newspapers.com/search/'}, timeout=(1,60)).text
            break
        except:
            print "... trying again ..."
            sleep(1.5**wait)
            wait += 1
    #Create search content query
    results = json.loads(matches)
    if results['recCount'] > 0:
        #records = make_record_dict(results['records'])
        #print "Made "
        #Get articles
        articles = get_content(results['records'])
        lines = []
        for article in articles:
            lines.append(get_article_data(article, day))
        return lines
    else:
        return None

#Complete Scraper
date_list = [str(date) for date in perdelta(start_date, end_date, timedelta(days=1))]

#Start session
session = requests.session()

#Log in
signin = session.get(signin_url)
doc = lxml.html.fromstring(signin.text)
signin_form = doc.forms[0]
signin_form.fields['username'] = "email"
signin_form.fields['password'] = "password"
session.post(signin_url, data=signin_form.form_values(), allow_redirects=True)

#Create CSV
#Create file name
timeperiod = str(start_date) + "to" + str(end_date - timedelta(days=1))
filename = "newspapers_com-" + timeperiod + ".csv"
fields = ["archive", "publication_title", "publication_id", "search_date", "page", "href", "search_terms"]

pool = Pool(10)
results_iter = pool.imap(scrape, [search_terms]*len(date_list), date_list)

with open("/".join((filepath,filename)), "w") as w:
    writer = csv.DictWriter(w, fieldnames=fields)
    writer.writeheader()
    #Loop over days
    for results in results_iter:
        if results != None:
            writer.writerows(results)
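A brief editorial note on Example 2: like Python 2's built-in map, pathos's imap accepts several argument sequences of equal length and pairs them positionally, which is why the constant search terms are repeated with [search_terms]*len(date_list) so that each call receives the same terms alongside one date. Example 5 below shows the same multi-sequence form with pow.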
Example 3: analyze
# Required import: from pathos.multiprocessing import ProcessingPool [as alias]
# Or: from pathos.multiprocessing.ProcessingPool import imap [as alias]
class analyze(setup.setup):

    def __init__(self,args,logging_level=logging.INFO):
        super(analyze, self).__init__(args,logging_level)

    # set up processing pool and run all analyses specified in args
    def run(self):

        if self.args.jumpdists:
            n_bins=100.
            bin_width = 1/n_bins
            bins = np.arange(0,1+bin_width,1/n_bins)

            if self.args.file:
                user,vals = self.artist_jump_distributions(self.args.file,bins=bins,self_jumps=False)
                with open(self.args.resultdir+user,'w') as fout:
                    fout.write(','.join(vals.astype(str))+'\n')

            else:
                raise NotImplementedError('not implemented!')
                self.pool = Pool(self.args.n)
                self.rootLogger.info("Pool started")
                self.rootLogger.info("Starting jump distance analysis")

                func_partial = partial(self.artist_jump_distributions,bins=bins,self_jumps=False)
                with open(self.args.resultdir+'jumpdists','w') as fout:
                    for user,vals in self.pool.imap(func_partial,self.listen_files):
                        fout.write(user+'\t'+','.join(vals.astype(str))+'\n')
                self.pool.close()
                self.rootLogger.info("Pool closed")

        if self.args.blockdists:
            #self.rootLogger.info("Starting block distance analysis")
            self.mean_block_distances(self.args.file)

        if self.args.diversity_dists:
            bins = np.arange(0,1.01,.01)
            self.diversity_distributions(self.args.file,bins=bins)

        if self.args.clustering:
            self.clustering(self.args.file)

        if self.args.values:
            self.patch_values(self.args.file)

        if self.args.exp:
            self.explore_exploit(self.args.file)

        if self.args.patch_len_dists:
            self.patch_len_dists(self.args.file)

    # calculate distribution (using histogram with specified bins)
    # of sequential artist-to-artist distances
    def artist_jump_distributions(self,fi,bins,self_jumps=False):
        user = fi.split('/')[-1][:-4]
        df = pd.read_pickle(fi)
        if self_jumps:
            vals = np.histogram(df['dist'].dropna(),bins=bins)[0]
        else:
            vals = np.histogram(df['dist'][df['dist']>0],bins=bins)[0]
        self.rootLogger.info('artist jump distances done for user {} ({})'.format(user,fi))
        return user,vals

    # calculate distribution (using histogram with specified bins)
    # of patch diversity for each user
    # awk 'FNR==1' * > diversity_dists_zeros
    # awk 'FNR==2' * > diversity_dists_nozeros
    def diversity_distributions(self,fi,bins):
        if 'patches' not in fi:
            raise ValueError('WRONG DATATYPE')
        user = fi.split('/')[-1].split('_')[0]
        df = pd.read_pickle(fi).dropna(subset=['diversity'])
        zeros = np.histogram(df[df['n']>=5]['diversity'],bins=bins)[0]
        nozeros = np.histogram(df[(df['n']>=5)&(df['diversity']>0)]['diversity'],bins=bins)[0]
        zeros = zeros/float(zeros.sum())
        nozeros = nozeros/float(nozeros.sum())
        with open(self.args.resultdir+user,'w') as fout:
            fout.write(user+'\t'+'zeros'+'\t'+','.join(zeros.astype(str))+'\n')
            fout.write(user+'\t'+'nozeros'+'\t'+','.join(nozeros.astype(str))+'\n')
        self.rootLogger.info('diversity distributions done for user {} ({})'.format(user,fi))

    def mean_block_distances(self,fi,n=100):

        def cos_nan(arr1,arr2):
            if np.any(np.isnan(arr1)) or np.any(np.isnan(arr2)):
                return np.nan
            else:
                return cosine(arr1,arr2)
#......... part of the code omitted here .........
Example 4: sleep
# Required import: from pathos.multiprocessing import ProcessingPool [as alias]
# Or: from pathos.multiprocessing.ProcessingPool import imap [as alias]
            print "... trying again ..."
            sleep(1.5**wait)
            wait += 1
    if count > 0:
        t = r['titleData']
        return {day : Set([x['value'] for x in t])}
    else:
        return {day : Set()}

start_date = date(1880,1,1)
end_date = date(1941,1,1)
date_list = [str(date) for date in perdelta(start_date, end_date, timedelta(days=1))]

pool = Pool(10)
result_iter = pool.imap(scrape_day, date_list)

title_sets = {}
for result in result_iter:
    title_sets.update(result)

###################################
#Make dictionary of daily matches #
###################################
def scrape_paper(title_id):
    title_url = "_".join((title_stub_url, title_id))
    wait=0
    while True:
        try:
            title_get = session.get(title_url, cookies=session.cookies, allow_redirects=True).text
            break
        except:
Example 5: ProcessingPool
# Required import: from pathos.multiprocessing import ProcessingPool [as alias]
# Or: from pathos.multiprocessing.ProcessingPool import imap [as alias]
# instantiate and configure the worker pool
from pathos.multiprocessing import ProcessingPool
pool = ProcessingPool(nodes=4)
_result = map(pow, [1,2,3,4], [5,6,7,8])
# do a blocking map on the chosen function
result = pool.map(pow, [1,2,3,4], [5,6,7,8])
assert result == _result
# do a non-blocking map, then extract the result from the iterator
result_iter = pool.imap(pow, [1,2,3,4], [5,6,7,8])
result = list(result_iter)
assert result == _result
# do an asynchronous map, then get the results
result_queue = pool.amap(pow, [1,2,3,4], [5,6,7,8])
result = result_queue.get()
assert result == _result
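As the comments in Example 5 indicate, the three calls differ only in how results are delivered: map blocks until every result is ready, imap returns at once with a lazy iterator, and amap returns a handle whose get() blocks only when it is called.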
Example 6: str
# Required import: from pathos.multiprocessing import ProcessingPool [as alias]
# Or: from pathos.multiprocessing.ProcessingPool import imap [as alias]
    state = re.search("^[^(--)]+(?=--)", str(location_raw)).group(0)
    line['location'] = ", ".join((city, state))
    line['lastUpdated'] = search_datetime
    #Get paper publication dates
    paper_date_set = Set([x['date_issued'] for x in paper_data['issues']])
    date_match = {k : int(k in paper_date_set) for k in date_list}
    line.update(date_match)
    return line

#Scrape publication data
print "Getting publication data..."
pool = Pool(10)
result_iter = pool.imap(scrape_paper, paper_stubs, [date_list]*len(paper_stubs))
lines = []
for result in result_iter:
    lines.append(result)

#Prepare for write
filename = "chronicling_america-allpubs.csv"
filepath = directory
fields = ['archive', 'publication_title', 'publication_id', 'location', 'lastUpdated'] + date_list
print "Creating data rows..."
out = []
for line in lines:
    line['publication_title'] = line['publication_title'].encode('utf8')
    line['location'] = line['location'].encode('utf8')
    out.append([line[k] for k in fields])
Example 7: newspaperarchive_scraper
# Required import: from pathos.multiprocessing import ProcessingPool [as alias]
# Or: from pathos.multiprocessing.ProcessingPool import imap [as alias]
#......... part of the code omitted here .........
def extract_articles(page):
    #Grab articles
    articles = page.find_all('div', class_="searchResultBlock searchResultBlockWithThumb")
    return articles

def extract_data(article, day):
    line = {}
    line['archive'] = "newspaperarchive"
    try:
        line["publication_title"] = article.h4.a.get_text().strip().encode('utf8')
    except:
        line["publication_title"] = ""
    line["href"] = article.a['href']
    try:
        line['publication_id'] = re.search("(?<=http://access\.newspaperarchive\.com/)([^/]+/[^/]+/[^/]+/[^/]+)", line['href']).group(0)
    except:
        line['publication_id'] = ""
    line["search_date"] = day
    try:
        line['page'] = re.search("(?<=/page-)(\d\d?)", line['href']).group(0)
    except:
        line['page'] = ""
    line['search_terms'] = search_terms
    return line

def scrape(search_terms, day):
    sleep(1)
    print day
    #Visit URL and parse
    url = newspaperarchive_url(search_terms, day)
    wait = 0
    while True:
        try:
            start = requests.get(url, timeout=(1,180)).text
            break
        except:
            print "... trying again ..." + str(day)
            sleep(1.5**wait)
            wait += 1
    #Are there any hits?
    if test_matches(start) == None:
        lines = []
        nextLink = []
        page = start
        page_number = 2
        while nextLink != None:
            soup = BeautifulSoup(page, 'html.parser')
            articles = extract_articles(soup)
            #extract article data
            for article in articles:
                lines.append(extract_data(article, day))
            #Get nextLink
            try:
                nextLink = soup.find('a', text=page_number)['href']
                wait = 0
                while True:
                    try:
                        page = requests.get(nextLink, timeout=(1,180)).text
                        break
                    except:
                        print "... trying again ..." + str(day)
                        sleep(1.5**wait)
                        wait += 1
                page_number += 1
            except TypeError:
                nextLink = None
        return lines
    else:
        return None

#Complete scraper
#Parallel processing
if __name__ == "__main__":
    #Create file name
    timeperiod = str(start_date) + "to" + str(end_date - timedelta(days=1))
    filename = "newspaperarchive-" + timeperiod + ".csv"
    pool = Pool(10)
    date_list = []
    for date in perdelta(start_date, end_date, timedelta(days=1)):
        date_list.append(date)
    search_terms_list = [search_terms] * len(date_list)
    result_iter = pool.imap(scrape, search_terms_list, date_list)
    #Create CSV
    fields = ["archive", "publication_title", "publication_id", "search_date", "page", "href", "search_terms"]
    with open("/".join((filepath,filename)), "w") as w:
        writer = csv.DictWriter(w, fieldnames=fields)
        writer.writeheader()
        for result in result_iter:
            if result != None:
                writer.writerows(result)
Example 8: Pool
# Required import: from pathos.multiprocessing import ProcessingPool [as alias]
# Or: from pathos.multiprocessing.ProcessingPool import imap [as alias]
    line['archive'] = "americas_historical_newspapers"
    line['publication_title'] = title.input.text
    line['publication_id'] = title.input['value']
    city = title.find('td', class_="ci").text
    state = title.find('td', class_="st").text
    line['location'] = ", ".join((city,state))
    line['lastUpdated'] = search_datetime
    papers_data.append(line)

#Scrape publication dates in parallel
pub_ids = [paper['publication_id'] for paper in papers_data]
print "Scraping papers..."
pool = Pool(10)
result_iter = pool.imap(scrape_paper, pub_ids, [date_list_str]*len(pub_ids))
title_sets = {}
for result in result_iter:
    title_sets.update(result)

#Create file#
filename = "americas_historical_newspapers-allpubs.csv"
filepath = directory
fields = ['archive', 'publication_title', 'publication_id', 'location', 'lastUpdated'] + date_list_str

#Create output
print "Creating data rows..."
out = []
for paper in papers_data:
    title_id = paper['publication_id']
    date_match = {k : int(k in title_sets[title_id]) for k in date_list_str}