

Python ProcessingPool.imap Method Code Examples

This article collects typical usage examples of the pathos.multiprocessing.ProcessingPool.imap method in Python. If you have been wondering how exactly ProcessingPool.imap is used, what it is for, or what real code that calls it looks like, the hand-picked examples below may help. You can also browse further usage examples for pathos.multiprocessing.ProcessingPool, the class this method belongs to.


Eight code examples of the ProcessingPool.imap method are shown below, ordered by popularity by default.
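
As a quick orientation before the examples, here is a minimal sketch of the call pattern they all share (written for this article, not taken from any of the projects below):

from pathos.multiprocessing import ProcessingPool

pool = ProcessingPool(nodes=4)

# imap returns a lazy iterator: work is farmed out to the worker processes,
# and each result is yielded in argument order as the iterator is consumed.
# pathos serializes with dill, so even a lambda can be shipped to the workers.
squares = pool.imap(lambda x: x * x, range(10))
assert list(squares) == [x * x for x in range(10)]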

Example 1: sleep

# Required import: from pathos.multiprocessing import ProcessingPool [as alias]
# Or: from pathos.multiprocessing.ProcessingPool import imap [as alias]
	try:
		get_states = requests.get(nation_url, timeout=(1,60)).text
		break
	except:
		sleep(1.5**wait)
		wait += 1

parsed = BeautifulSoup(get_states, 'html.parser')
state_urls = [a['href'] for a in parsed.find('div', class_='newLocUSListArea').find_all('a')]

################
#Get town links#
################
print "Getting town URLs..."
pool = Pool(10)
result_iter = pool.imap(get_town_urls, state_urls)

town_urls = []
for result in result_iter:
	town_urls += result

#Clean up town URLs
town_urls = [re.sub("st\.-","st-",url) for url in town_urls]

#################
#Get paper links#
#################
print "Getting paper URLs..."
result_iter = pool.imap(get_paper_urls, town_urls)

paper_urls = []
Developer: mdweaver, Project: newspaperScrapers, Lines of code: 33, Source file: newspaperarchive-gettitles.py
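
The excerpt above (and several of the excerpts below) wraps every HTTP request in the same retry loop with exponential backoff. Pulled out into a standalone helper it would look roughly like this; this is a sketch for illustration, not code from the repository, and the cap of eight retries is an arbitrary choice:

import requests
from time import sleep

def get_with_backoff(url, timeout=(1, 60), max_tries=8):
    # Retry the GET with exponentially growing pauses (1.5**n seconds) on any failure.
    wait = 0
    while True:
        try:
            return requests.get(url, timeout=timeout).text
        except Exception:
            if wait >= max_tries:
                raise
            sleep(1.5 ** wait)
            wait += 1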

Example 2: newspapers_com_scraper

# Required import: from pathos.multiprocessing import ProcessingPool [as alias]
# Or: from pathos.multiprocessing.ProcessingPool import imap [as alias]

#......... part of the code omitted here .........
                            sleep(1.5**wait)
                            wait += 1
                    articles = articles + json.loads(content)['records']
                break
            except ValueError:
                groups += 1
        return articles

    #Get article attributes
    def get_from_object(obj, *keys):
        try:
            value = obj
            for k in keys:
                if isinstance(value, dict):
                    value = value.get(k)
                elif isinstance(value, list) and len(value)>1:
                    value = (item for item in value if item['name'] == k).next()['value']
                elif isinstance(value, list) and len(value)==1:
                    value = value[k]
            return value
        except:
            return ''

    #Extract article data
    def get_article_data(record, search_date):
        line = {}
        line['archive'] = 'newspapers_com'
        line['publication_id'] = get_from_object(record, 'rec', 'cover', 'publicationId')
        line['publication_title'] = get_from_object(record, 'rec', 'pubMetadata', 'publication-title')
        line['search_date'] = search_date
        line['page'] = get_from_object(record, 'rec', 'cover', 'title')
        line['href'] = "http://www.newspapers.com/image/" + str(record['rec']['cover']['id']) + "/?terms=" + record['terms']
        line['search_terms'] = search_terms
        return line

    #Scrape function
    def scrape(search_terms, day):
        sleep(1)
        print day

        #Create search query
        query_form = make_search_query(search_terms, day, 1000)

        #POST search query
        wait = 0
        while True:
            try:
                matches = session.post(search_url, data = query_form, cookies=session.cookies, allow_redirects=True, headers={'referer' : 'http://www.newspapers.com/search/'}, timeout=(1,60)).text
                break
            except:
                print "... trying again ..."
                sleep(1.5**wait)
                wait += 1

        #Create search content query
        results = json.loads(matches)
        if results['recCount'] > 0:
            #records = make_record_dict(results['records'])
            #print "Made "
            
            #Get articles
            articles = get_content(results['records'])

            lines = []
            for article in articles:
                lines.append(get_article_data(article, day))

            return lines
        else:
            return None

    #Complete Scraper
    date_list = [str(date) for date in perdelta(start_date, end_date, timedelta(days=1))]
    #Start session
    session = requests.session()

    #Log in
    signin = session.get(signin_url)
    doc = lxml.html.fromstring(signin.text)
    signin_form = doc.forms[0]
    signin_form.fields['username'] = "email"
    signin_form.fields['password'] = "password"
    session.post(signin_url, data=signin_form.form_values(), allow_redirects=True)

    #Create CSV
    #Create file name
    timeperiod = str(start_date) + "to" + str(end_date - timedelta(days=1))
    filename = "newspapers_com-" + timeperiod + ".csv"
    fields = ["archive", "publication_title", "publication_id", "search_date", "page", "href", "search_terms"]
    
    pool = Pool(10)
    results_iter = pool.imap(scrape, [search_terms]*len(date_list), date_list)

    with open("/".join((filepath,filename)), "w") as w:
        writer = csv.DictWriter(w, fieldnames=fields)
        writer.writeheader()
        #Loop over days
        for results in results_iter:
            if results != None:
                writer.writerows(results)
Developer: mdweaver, Project: newspaperScrapers, Lines of code: 104, Source file: newspapers_com-scraper.py
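
This scraper (like Examples 4 and 7) builds its work list with a perdelta helper that the excerpt does not show. A plausible implementation, assumed here because the original is defined elsewhere in the repository, is a simple date-range generator:

from datetime import date, timedelta

def perdelta(start, end, delta):
    # Yield values from start (inclusive) up to end (exclusive) in steps of delta.
    current = start
    while current < end:
        yield current
        current += delta

# every day in January 1900, as 'YYYY-MM-DD' strings
days = [str(d) for d in perdelta(date(1900, 1, 1), date(1900, 2, 1), timedelta(days=1))]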

Example 3: analyze

# Required import: from pathos.multiprocessing import ProcessingPool [as alias]
# Or: from pathos.multiprocessing.ProcessingPool import imap [as alias]
class analyze(setup.setup):

    def __init__(self,args,logging_level=logging.INFO):

         super(analyze, self ).__init__(args,logging_level)


    # set up processing pool and run all analyses specified in args
    def run(self):


        if self.args.jumpdists:
            n_bins=100.
            bin_width = 1/n_bins
            bins = np.arange(0,1+bin_width,1/n_bins)

            if self.args.file:
                user,vals = self.artist_jump_distributions(self.args.file,bins=bins,self_jumps=False)
                with open(self.args.resultdir+user,'w') as fout:
                    fout.write(','.join(vals.astype(str))+'\n')



            else:
                raise NotImplementedError('not implemented!')
                self.pool = Pool(self.args.n)
                self.rootLogger.info("Pool started")

                self.rootLogger.info("Starting jump distance analysis")

                func_partial = partial(self.artist_jump_distributions,bins=bins,self_jumps=False)
                with open(self.args.resultdir+'jumpdists','w') as fout:
                    for user,vals in self.pool.imap(func_partial,self.listen_files):
                        fout.write(user+'\t'+','.join(vals.astype(str))+'\n')

                self.pool.close()
                self.rootLogger.info("Pool closed")

        if self.args.blockdists:
            #self.rootLogger.info("Starting block distance analysis")
            self.mean_block_distances(self.args.file)

        if self.args.diversity_dists:
            bins = np.arange(0,1.01,.01)
            self.diversity_distributions(self.args.file,bins=bins)

        if self.args.clustering:
            self.clustering(self.args.file)

        if self.args.values:
            self.patch_values(self.args.file)

        if self.args.exp:
            self.explore_exploit(self.args.file)

        if self.args.patch_len_dists:
            self.patch_len_dists(self.args.file)


    # calculate distribution (using histogram with specified bins)
    # of sequential artist-to-artist distances
    def artist_jump_distributions(self,fi,bins,self_jumps=False):
        user = fi.split('/')[-1][:-4]
        df = pd.read_pickle(fi)
        if self_jumps:
            vals = np.histogram(df['dist'].dropna(),bins=bins)[0]
        else:
            vals = np.histogram(df['dist'][df['dist']>0],bins=bins)[0]
        self.rootLogger.info('artist jump distances done for user {} ({})'.format(user,fi))
        return user,vals

    # calculate distribution (using histogram with specified bins)
    # of patch diversity for each user

    # awk 'FNR==1' * > diversity_dists_zeros
    # awk 'FNR==2' * > diversity_dists_nozeros
    def diversity_distributions(self,fi,bins):
        if 'patches' not in fi:
            raise ValueError('WRONG DATATYPE')
        user = fi.split('/')[-1].split('_')[0]
        df = pd.read_pickle(fi).dropna(subset=['diversity'])
        zeros = np.histogram(df[df['n']>=5]['diversity'],bins=bins)[0]
        nozeros = np.histogram(df[(df['n']>=5)&(df['diversity']>0)]['diversity'],bins=bins)[0]

        zeros = zeros/float(zeros.sum())
        nozeros = nozeros/float(nozeros.sum())

        with open(self.args.resultdir+user,'w') as fout:
            fout.write(user+'\t'+'zeros'+'\t'+','.join(zeros.astype(str))+'\n')
            fout.write(user+'\t'+'nozeros'+'\t'+','.join(nozeros.astype(str))+'\n')
        self.rootLogger.info('diversity distributions done for user {} ({})'.format(user,fi))


    def mean_block_distances(self,fi,n=100):

        def cos_nan(arr1,arr2):
            if np.any(np.isnan(arr1)) or np.any(np.isnan(arr2)):
                return np.nan
            else:
                return cosine(arr1,arr2)
#......... part of the code omitted here .........
Developer: jlorince, Project: MusicForaging, Lines of code: 103, Source file: patchAnalyses.py
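
The parallel branch of run() shows the usual way to hand a function with fixed keyword arguments to imap: wrap it in functools.partial so each worker receives only the one varying argument (the file path). The same idiom in isolation, with a stand-in worker function since the real one needs the class context:

from functools import partial
from pathos.multiprocessing import ProcessingPool as Pool

def artist_jump_distributions(fi, bins, self_jumps=False):
    # stand-in for the method above: load the file and histogram its jump distances
    return fi, [0] * (len(bins) - 1)

pool = Pool(4)
func_partial = partial(artist_jump_distributions, bins=list(range(101)), self_jumps=False)
for user, vals in pool.imap(func_partial, ['user1.pkl', 'user2.pkl']):
    print(user)  # results stream back one file at a time
pool.close()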

Example 4: sleep

# Required import: from pathos.multiprocessing import ProcessingPool [as alias]
# Or: from pathos.multiprocessing.ProcessingPool import imap [as alias]
            print "... trying again ..."
            sleep(1.5**wait)
            wait += 1    
    if count > 0:
        t = r['titleData']
        return {day : Set([x['value'] for x in t])}
    else:
        return {day : Set()}

start_date = date(1880,1,1)
end_date = date(1941,1,1)
date_list = [str(date) for date in perdelta(start_date, end_date, timedelta(days=1))]


pool = Pool(10)
result_iter = pool.imap(scrape_day, date_list)
title_sets = {}
for result in result_iter:
    title_sets.update(result)

###################################
#Make dictionary of daily matches #
###################################
def scrape_paper(title_id):
    title_url = "_".join((title_stub_url, title_id))
    wait=0
    while True:
        try:
            title_get = session.get(title_url,  cookies=session.cookies, allow_redirects=True).text
            break
        except:
Developer: mdweaver, Project: newspaperScrapers, Lines of code: 33, Source file: newspapers_com-gettitles.py
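
Set in this excerpt presumably comes from the long-deprecated sets module (its import lies outside the excerpt, so that is an assumption). On a modern Python the builtin set, written as a set comprehension, is a drop-in replacement for the return statements above:

# equivalent of the count > 0 branch using the builtin set type
if count > 0:
    return {day: {x['value'] for x in r['titleData']}}
else:
    return {day: set()}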

Example 5: ProcessingPool

# Required import: from pathos.multiprocessing import ProcessingPool [as alias]
# Or: from pathos.multiprocessing.ProcessingPool import imap [as alias]
# instantiate and configure the worker pool
from pathos.multiprocessing import ProcessingPool
pool = ProcessingPool(nodes=4)

# reference result computed with the builtin (serial) map
_result = list(map(pow, [1,2,3,4], [5,6,7,8]))

# do a blocking map on the chosen function
result = pool.map(pow, [1,2,3,4], [5,6,7,8])
assert result == _result

# do a non-blocking map, then extract the result from the iterator
result_iter = pool.imap(pow, [1,2,3,4], [5,6,7,8])
result = list(result_iter)
assert result == _result

# do an asynchronous map, then get the results
result_queue = pool.amap(pow, [1,2,3,4], [5,6,7,8])
result = result_queue.get()
assert result == _result

Developer: WarrenWeckesser, Project: pathos, Lines of code: 21, Source file: test_mp.py
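
For completeness: the same pool also provides uimap, the unordered counterpart of imap, which yields results as soon as any worker finishes rather than in argument order. A short sketch extending the test above (uimap sits alongside map, imap, and amap in the pathos pool API):

# unordered non-blocking map: arrival order is not guaranteed
result_iter = pool.uimap(pow, [1,2,3,4], [5,6,7,8])
assert sorted(result_iter) == sorted(_result)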

Example 6: str

# Required import: from pathos.multiprocessing import ProcessingPool [as alias]
# Or: from pathos.multiprocessing.ProcessingPool import imap [as alias]
    state = re.search("^[^(--)]+(?=--)", str(location_raw)).group(0)
    line['location'] = ", ".join((city, state))
    line['lastUpdated'] = search_datetime
    #Get paper publication dates
    paper_date_set = Set([x['date_issued'] for x in paper_data['issues']])
    date_match = {k : int(k in paper_date_set) for k in date_list}
    line.update(date_match)
    return line




#Scrape publication data
print "Getting publication data..."
pool = Pool(10)
result_iter = pool.imap(scrape_paper, paper_stubs, [date_list]*len(paper_stubs))
lines = []
for result in result_iter:
    lines.append(result)

#Prepare for write
filename = "chronicling_america-allpubs.csv"
filepath = directory
fields = ['archive', 'publication_title', 'publication_id', 'location', 'lastUpdated'] + date_list

print "Creating data rows..."
out = []
for line in lines:
    line['publication_title'] = line['publication_title'].encode('utf8')
    line['location'] = line['location'].encode('utf8')
    out.append([line[k] for k in fields])
Developer: mdweaver, Project: newspaperScrapers, Lines of code: 33, Source file: chronicling_america-gettitles.py
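
Because ProcessingPool.imap takes one iterable per positional parameter, a constant second argument such as date_list is broadcast by repeating it once per task, exactly as in the pool.imap call above. The same idiom in isolation, with a stand-in worker and made-up inputs:

from pathos.multiprocessing import ProcessingPool as Pool

def scrape_paper(stub, dates):
    # stand-in: the real worker fetches one paper's metadata and issue dates
    return {'stub': stub, 'n_dates': len(dates)}

paper_stubs = ['paper-a', 'paper-b', 'paper-c']  # hypothetical identifiers
date_list = ['1880-01-01', '1880-01-02']

pool = Pool(10)
rows = list(pool.imap(scrape_paper, paper_stubs, [date_list] * len(paper_stubs)))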

Example 7: newspaperarchive_scraper

# Required import: from pathos.multiprocessing import ProcessingPool [as alias]
# Or: from pathos.multiprocessing.ProcessingPool import imap [as alias]

#......... part of the code omitted here .........
    def extract_articles(page):    
        #Grab articles
        articles = page.find_all('div', class_="searchResultBlock searchResultBlockWithThumb")
        return articles

    def extract_data(article, day):
        line = {}
        line['archive'] = "newspaperarchive"
        try:
            line["publication_title"] = article.h4.a.get_text().strip().encode('utf8')
        except:
            line["publication_title"] = ""
        line["href"] = article.a['href']    
        try:
            line['publication_id'] = re.search("(?<=http://access\.newspaperarchive\.com/)([^/]+/[^/]+/[^/]+/[^/]+)", line['href']).group(0)
        except:
            line['publication_id'] = ""
        line["search_date"] = day
        try:
            line['page'] = re.search("(?<=/page-)(\d\d?)", line['href']).group(0)
        except:
            line['page'] = ""
        line['search_terms'] = search_terms
        return line

    def scrape(search_terms, day):
        sleep(1)
        print day
        #Visit URL and parse
        url = newspaperarchive_url(search_terms, day)
        wait = 0
        while True:
            try:
                start = requests.get(url, timeout=(1,180)).text
                break
            except:
                print "... trying again ..." + str(day)
                sleep(1.5**wait)
                wait += 1

        #Are there any hits?
        if test_matches(start) == None:
            lines = []
            nextLink = []
            page = start
            page_number = 2

            while nextLink != None:
                soup = BeautifulSoup(page, 'html.parser')
                articles = extract_articles(soup)
                #extract article data
                for article in articles:
                    lines.append(extract_data(article, day))

                #Get nextLink
                try:
                    nextLink = soup.find('a', text=page_number)['href']
                    wait = 0
                    while True:
                        try:
                            page = requests.get(nextLink, timeout=(1,180)).text
                            break
                        except:
                            print "... trying again ..." + str(day)
                            sleep(1.5**wait)
                            wait += 1
                    page_number += 1
                except TypeError:
                    nextLink = None           

            return lines

        else:
            return None

    #Complete scraper
    #Parallel processing
    if __name__ == "__main__":
        #Create file name
        timeperiod = str(start_date) + "to" + str(end_date - timedelta(days=1))
        filename = "newspaperarchive-" + timeperiod + ".csv"

        pool = Pool(10)

        date_list = []
        for date in perdelta(start_date, end_date, timedelta(days=1)):
            date_list.append(date)

        search_terms_list = [search_terms] * len(date_list)

        result_iter = pool.imap(scrape, search_terms_list, date_list)

        #Create CSV
        fields = ["archive", "publication_title", "publication_id", "search_date", "page", "href", "search_terms"]
        with open("/".join((filepath,filename)), "w") as w:
            writer = csv.DictWriter(w, fieldnames=fields)
            writer.writeheader()
            for result in result_iter:
                if result != None:
                    writer.writerows(result)
Developer: mdweaver, Project: newspaperScrapers, Lines of code: 104, Source file: newspaperarchive-scraper.py
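
The choice of imap over map in the final loop matters mainly for when results become available: map would block until every date had been scraped and return all rows at once, whereas imap lets each day's rows be written to the CSV as soon as that task (in submission order) completes. Schematically:

# map would block here and hold every day's rows in memory at once:
#   for result in pool.map(scrape, search_terms_list, date_list): ...
# imap streams results back in submission order, so rows are written
# (and progress could be reported) as each day finishes:
for i, result in enumerate(pool.imap(scrape, search_terms_list, date_list)):
    if result is not None:
        writer.writerows(result)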

Example 8: Pool

# Required import: from pathos.multiprocessing import ProcessingPool [as alias]
# Or: from pathos.multiprocessing.ProcessingPool import imap [as alias]
    line['archive'] = "americas_historical_newspapers"
    line['publication_title'] = title.input.text
    line['publication_id'] = title.input['value']
    city = title.find('td', class_="ci").text
    state = title.find('td', class_="st").text
    line['location'] = ", ".join((city,state))
    line['lastUpdated'] = search_datetime
    papers_data.append(line)


#Scrape publication dates in parallel
pub_ids = [paper['publication_id'] for paper in papers_data]

print "Scraping papers..."
pool = Pool(10)
result_iter = pool.imap(scrape_paper, pub_ids, [date_list_str]*len(pub_ids))
title_sets = {}
for result in result_iter:
    title_sets.update(result)

#Create file#
filename = "americas_historical_newspapers-allpubs.csv"
filepath = directory
fields = ['archive', 'publication_title', 'publication_id', 'location', 'lastUpdated'] + date_list_str

#Create output
print "Creating data rows..."
out = []
for paper in papers_data:
    title_id = paper['publication_id']
    date_match = {k : int(k in title_sets[title_id]) for k in date_list_str}
Developer: mdweaver, Project: newspaperScrapers, Lines of code: 33, Source file: americas_historical_newspapers-gettitles.py


Note: the pathos.multiprocessing.ProcessingPool.imap examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as Github and MSDocs. The code snippets are drawn from open-source projects contributed by their respective authors, who retain copyright over the source code; please consult each project's license before redistributing or reusing it. Do not reproduce this article without permission.