本文整理汇总了 Python 中 holmes.models.Domain.get_active_domains 方法的典型用法代码示例。如果您正苦于以下问题:Python Domain.get_active_domains 方法的具体用法?Python Domain.get_active_domains 怎么用?Python Domain.get_active_domains 的使用例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 holmes.models.Domain 的用法示例。
在下文中一共展示了Domain.get_active_domains方法的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_can_get_active_domains
# 需要导入模块: from holmes.models import Domain [as 别名]
# 或者: from holmes.models.Domain import get_active_domains [as 别名]
def test_can_get_active_domains(self):
    # Start from an empty domain table so only the factories below count.
    self.db.query(Domain).delete()

    # One active domain (expected in the result) and one inactive (expected
    # to be filtered out).
    active_domain = DomainFactory(is_active=True)
    DomainFactory(is_active=False)

    found = Domain.get_active_domains(self.db)

    expect(found).to_length(1)
    expect(found[0].id).to_equal(active_domain.id)
示例2: get_next_jobs_count
# 需要导入模块: from holmes.models import Domain [as 别名]
# 或者: from holmes.models.Domain import get_active_domains [as 别名]
def get_next_jobs_count(cls, db, config):
    """Count the pages that belong to the currently active domains.

    Args:
        db: Database session; used through ``db.query(...)``.
        config: Not read by this function; kept so the signature stays
            compatible with existing callers.

    Returns:
        The number of ``Page`` rows whose ``domain_id`` is one of the ids
        returned by ``Domain.get_active_domains(db)``; ``0`` when there are
        no active domains.
    """
    from holmes.models import Domain

    active_domains_ids = [item.id for item in Domain.get_active_domains(db)]
    if not active_domains_ids:
        # An empty IN () predicate can never match a row, so skip the
        # query (and the empty-IN handling of the ORM) altogether.
        return 0

    return db \
        .query(
            sa.func.count(Page.id)
        ) \
        .filter(Page.domain_id.in_(active_domains_ids)) \
        .scalar()
示例3: get_next_job_list
# 需要导入模块: from holmes.models import Domain [as 别名]
# 或者: from holmes.models.Domain import get_active_domains [as 别名]
def get_next_job_list(cls, db, expiration, current_page=1, page_size=200):
    """Return one page of candidate jobs from active domains, best score first.

    Args:
        db: Database session; used through ``db.query(...)``.
        expiration: Not read by this function; kept so the signature stays
            compatible with existing callers.
        current_page: 1-based page number of the result window.
        page_size: Number of rows per result window.

    Returns:
        The slice ``[(current_page - 1) * page_size : current_page * page_size]``
        of the rows ``(uuid, url, score, last_review_date)`` for pages of
        active domains, ordered by descending score. An empty list when
        there are no active domains.
    """
    from holmes.models import Domain

    lower_bound = (current_page - 1) * page_size
    upper_bound = lower_bound + page_size

    active_domains_ids = [item.id for item in Domain.get_active_domains(db)]
    if not active_domains_ids:
        # An empty IN () predicate can never match a row, so there is
        # nothing to page through; avoid issuing the query at all.
        return []

    pages_query = db \
        .query(
            Page.uuid,
            Page.url,
            Page.score,
            Page.last_review_date
        ) \
        .filter(Page.domain_id.in_(active_domains_ids)) \
        .order_by(Page.score.desc())

    return pages_query[lower_bound:upper_bound]
示例4: _verify_workers_limits
# 需要导入模块: from holmes.models import Domain [as 别名]
# 或者: from holmes.models.Domain import get_active_domains [as 别名]
def _verify_workers_limits(self, url, avg_links_per_page=10):
    """Ask the limiter model whether ``url`` may still be worked on.

    Fetches the active domains from ``self.db`` and forwards them, together
    with ``url`` and ``avg_links_per_page``, to
    ``LimiterModel.has_limit_to_work``; its result is returned unchanged.
    """
    domains_in_use = Domain.get_active_domains(self.db)

    return LimiterModel.has_limit_to_work(
        self.db,
        domains_in_use,
        url,
        avg_links_per_page
    )
示例5: get_next_job
# 需要导入模块: from holmes.models import Domain [as 别名]
# 或者: from holmes.models.Domain import get_active_domains [as 别名]
def get_next_job(cls, db, expiration, cache, lock_expiration, avg_links_per_page=10):
    """Pick the next page to review and acquire its job lock.

    For every active domain the top-scored pages are fetched (at most one
    per registered worker), the per-domain lists are interleaved domain by
    domain, and the first candidate that both passes the limiter check and
    can be locked through ``cache`` is returned.

    Args:
        db: Database session; used through ``db.query(...)``.
        expiration: Not read by this function; kept so the signature stays
            compatible with existing callers.
        cache: Object providing ``has_next_job_lock(url, lock_expiration)``.
        lock_expiration: Forwarded to ``cache.has_next_job_lock``.
        avg_links_per_page: Forwarded to ``Limiter.has_limit_to_work``.

    Returns:
        ``None`` when there is no candidate page or none could be locked;
        otherwise a dict with the keys ``'page'`` (uuid as string),
        ``'url'``, ``'score'`` and ``'lock'``.
    """
    from holmes.models import Settings, Worker, Domain, Limiter  # Avoid circular dependency

    settings = Settings.instance(db)

    workers = db.query(Worker).all()
    number_of_workers = len(workers)

    active_domains = Domain.get_active_domains(db)
    active_domains_ids = [item.id for item in active_domains]

    # domain id -> list of that domain's best-scored pages.
    all_domains_pages_in_need_of_review = {}
    for domain_id in active_domains_ids:
        pages = db \
            .query(
                Page.uuid,
                Page.url,
                Page.score,
                Page.last_review_date
            ) \
            .filter(Page.domain_id == domain_id) \
            .order_by(Page.score.desc())[:number_of_workers]

        if pages:
            all_domains_pages_in_need_of_review[domain_id] = pages

    # Interleave the per-domain lists, taking one page from a domain per step.
    pages_in_need_of_review = []
    current_domain = 0
    while all_domains_pages_in_need_of_review:
        # Materialise the keys: dict views cannot be indexed on Python 3.
        domains = list(all_domains_pages_in_need_of_review)
        if current_domain >= len(domains):
            current_domain = 0

        domain_id = domains[current_domain]

        item = all_domains_pages_in_need_of_review[domain_id].pop(0)
        pages_in_need_of_review.append(item)

        # Drop a domain as soon as it has no pages left.
        if not all_domains_pages_in_need_of_review[domain_id]:
            del all_domains_pages_in_need_of_review[domain_id]

        current_domain += 1

    if not pages_in_need_of_review:
        return None

    if settings.lambda_score > 0 and settings.lambda_score > pages_in_need_of_review[0].score:
        cls.update_pages_score_by(settings, settings.lambda_score, db)

    page = None
    lock = None
    for candidate in pages_in_need_of_review:
        if not Limiter.has_limit_to_work(db, active_domains, candidate.url, avg_links_per_page):
            continue

        lock = cache.has_next_job_lock(
            candidate.url,
            lock_expiration
        )

        # A non-None lock means this candidate is ours; stop searching.
        if lock is not None:
            page = candidate
            break

    if page is None:
        return None

    return {
        'page': str(page.uuid),
        'url': page.url,
        'score': page.score,
        'lock': lock
    }
示例6: fill_job_bucket
# 需要导入模块: from holmes.models import Domain [as 别名]
# 或者: from holmes.models.Domain import get_active_domains [as 别名]
def fill_job_bucket(self, expiration, look_ahead_pages=1000, avg_links_per_page=10.0):
    """Refill the redis ``next-job-bucket`` with pages that need a review.

    Runs under the ``next-job-fill-bucket-lock`` lock. For every active
    domain it selects up to ``look_ahead_pages`` pages that were never
    reviewed or whose last review is older than ``expiration`` seconds,
    then adds them to the bucket domain by domain until the bucket holds
    ``look_ahead_pages`` items or no candidates remain. A page is skipped
    when a matching limiter bucket has no capacity left.

    Args:
        expiration: Age in seconds after which a review counts as expired.
        look_ahead_pages: Per-domain query limit and target bucket size.
        avg_links_per_page: Forwarded to ``self.get_limiter_buckets``.

    Returns:
        None. If the lock cannot be acquired (``LockTimeout``) the method
        logs the fact and returns without doing anything.
    """
    try:
        with Lock('next-job-fill-bucket-lock', redis=self.redis):
            logging.info('Refilling job bucket. Lock acquired...')
            expired_time = datetime.utcnow() - timedelta(seconds=expiration)

            active_domains = Domain.get_active_domains(self.db)
            if not active_domains:
                return

            active_domains_ids = [item.id for item in active_domains]

            limiter_buckets = self.get_limiter_buckets(active_domains, avg_links_per_page)

            # One list of candidate pages per domain that has any.
            all_domains_pages_in_need_of_review = []
            for domain_id in active_domains_ids:
                pages = self.db \
                    .query(
                        Page.uuid,
                        Page.url,
                        Page.score,
                        Page.last_review_date
                    ) \
                    .filter(Page.domain_id == domain_id) \
                    .filter(or_(
                        Page.last_review_date.is_(None),
                        Page.last_review_date <= expired_time
                    ))[:look_ahead_pages]

                if pages:
                    all_domains_pages_in_need_of_review.append(pages)

            logging.debug(
                'Total of %d pages found to add to redis.',
                sum(len(item) for item in all_domains_pages_in_need_of_review)
            )

            # item_count tracks the bucket size (it starts at what redis
            # already holds); added_count tracks only what this run added.
            item_count = int(self.redis.zcard('next-job-bucket'))
            added_count = 0
            current_domain = 0
            while item_count < look_ahead_pages and all_domains_pages_in_need_of_review:
                if current_domain >= len(all_domains_pages_in_need_of_review):
                    current_domain = 0

                item = all_domains_pages_in_need_of_review[current_domain].pop(0)

                has_limit = True
                logging.debug('Available Limit Buckets: %s', limiter_buckets)
                for index, (limit, available) in enumerate(limiter_buckets):
                    if limit.matches(item.url):
                        if available <= 0:
                            has_limit = False
                            break
                        # Consume one slot of this matching limiter.
                        limiter_buckets[index] = (limit, available - 1)

                if has_limit:
                    self.add_next_job_bucket(item.uuid, item.url)
                    item_count += 1
                    added_count += 1

                # if there are not any more pages in this domain remove it from the list
                # NOTE(review): the index still advances after a removal, so
                # the domain that shifts into this slot waits one extra turn
                # - kept as-is to preserve the existing ordering.
                if not all_domains_pages_in_need_of_review[current_domain]:
                    del all_domains_pages_in_need_of_review[current_domain]

                current_domain += 1

            logging.debug('ADDED A TOTAL of %d ITEMS TO REDIS...', added_count)

    except LockTimeout:
        logging.info("Can't acquire lock. Moving on...")