This article collects typical usage examples of the Python method billy.scrape.validator.DatetimeValidator.validate. If you are unsure what DatetimeValidator.validate does or how to call it, the curated examples below should help. You can also look further into usage examples for its containing class, billy.scrape.validator.DatetimeValidator.
The following shows 9 code examples of the DatetimeValidator.validate method, sorted by popularity by default.
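For orientation, here is a minimal sketch of calling DatetimeValidator.validate directly, in the same way the examples below do. The schema and metadata dictionaries are invented for illustration, and the "datetime" property type is an assumption about how billy's validator extends ordinary JSON-schema validation; only the validate(data, schema) call and the ValueError raised on failure are taken from the examples themselves.

# Minimal sketch (illustrative schema/metadata, not from the billy source).
import datetime

from billy.scrape.validator import DatetimeValidator

schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        # "datetime" assumes billy's datetime-aware extension of the base validator
        "latest_update": {"type": "datetime"},
    },
}

metadata = {
    "name": "Example State",
    "latest_update": datetime.datetime.utcnow(),
}

validator = DatetimeValidator()
try:
    validator.validate(metadata, schema)   # raises ValueError if validation fails
except ValueError as e:
    print("metadata validation error: %s" % e)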
Example 1: main
# Required import: from billy.scrape.validator import DatetimeValidator [as alias]
# Or: from billy.scrape.validator.DatetimeValidator import validate [as alias]
def main():
    parser = argparse.ArgumentParser(
        description='Scrape data for state, saving data to disk.',
        parents=[base_arg_parser],
    )

    parser.add_argument('state', type=str,
                        help='state scraper module (eg. nc)')
    parser.add_argument('-s', '--session', action='append', dest='sessions',
                        help='session(s) to scrape')
    parser.add_argument('-t', '--term', action='append', dest='terms',
                        help='term(s) to scrape')
    parser.add_argument('--upper', action='store_true', dest='upper',
                        default=False, help='scrape upper chamber')
    parser.add_argument('--lower', action='store_true', dest='lower',
                        default=False, help='scrape lower chamber')
    parser.add_argument('--bills', action='store_true', dest='bills',
                        default=False, help="scrape bill data")
    parser.add_argument('--legislators', action='store_true',
                        dest='legislators', default=False,
                        help="scrape legislator data")
    parser.add_argument('--committees', action='store_true', dest='committees',
                        default=False, help="scrape committee data")
    parser.add_argument('--votes', action='store_true', dest='votes',
                        default=False, help="scrape vote data")
    parser.add_argument('--events', action='store_true', dest='events',
                        default=False, help='scrape event data')
    parser.add_argument('--alldata', action='store_true', dest='alldata',
                        default=False,
                        help="scrape all available types of data")
    parser.add_argument('--strict', action='store_true', dest='strict',
                        default=False, help="fail immediately when "
                        "encountering validation warning")
    parser.add_argument('-n', '--no_cache', action='store_true',
                        dest='no_cache', help="don't use web page cache")
    parser.add_argument('--fastmode', help="scrape in fast mode",
                        action="store_true", default=False)
    parser.add_argument('-r', '--rpm', action='store', type=int, dest='rpm',
                        default=60)
    parser.add_argument('--timeout', action='store', type=int, dest='timeout',
                        default=10)

    args = parser.parse_args()

    settings.update(args)

    # set up search path
    sys.path.insert(0, os.path.join(os.path.dirname(__file__),
                                    '../../openstates'))

    # get metadata
    metadata = __import__(args.state, fromlist=['metadata']).metadata
    state = metadata['abbreviation']

    configure_logging(args.verbose, args.state)

    # make output dir
    args.output_dir = os.path.join(settings.BILLY_DATA_DIR, args.state)
    try:
        os.makedirs(args.output_dir)
    except OSError as e:
        # errno 17 (EEXIST): an already-existing output dir is fine
        if e.errno != 17:
            raise e

    # write metadata
    try:
        schema_path = os.path.join(os.path.split(__file__)[0],
                                   '../schemas/metadata.json')
        schema = json.load(open(schema_path))

        validator = DatetimeValidator()
        validator.validate(metadata, schema)
    except ValueError as e:
        logging.getLogger('billy').warning('metadata validation error: '
                                           + str(e))

    with open(os.path.join(args.output_dir, 'state_metadata.json'), 'w') as f:
        json.dump(metadata, f, cls=JSONDateEncoder)

    # determine time period to run for
    if args.terms:
        for term in metadata['terms']:
            if term in args.terms:
                args.sessions.extend(term['sessions'])
    args.sessions = set(args.sessions or [])

    # determine chambers
    args.chambers = []
    if args.upper:
        args.chambers.append('upper')
    if args.lower:
        args.chambers.append('lower')
    if not args.chambers:
        args.chambers = ['upper', 'lower']

    if not (args.bills or args.legislators or args.votes or
            args.committees or args.events or args.alldata):
        raise ScrapeError("Must specify at least one of --bills, "
                          "--legislators, --committees, --votes, --events, "
#.........some code omitted here.........
Example 2: Scraper
# Required import: from billy.scrape.validator import DatetimeValidator [as alias]
# Or: from billy.scrape.validator.DatetimeValidator import validate [as alias]
class Scraper(scrapelib.Scraper):
    """ Base class for all Scrapers

    Provides several useful methods for retrieving URLs and checking
    arguments against metadata.
    """

    latest_only = False

    def __init__(self, metadata, output_dir=None, strict_validation=None,
                 fastmode=False):
        """
        Create a new Scraper instance.

        :param metadata: metadata for this scraper
        :param output_dir: the data directory to use
        :param strict_validation: exit immediately if validation fails
        """
        super(Scraper, self).__init__()

        # scrapelib overrides
        self.timeout = settings.SCRAPELIB_TIMEOUT
        self.cache_storage = scrapelib.FileCache(settings.BILLY_CACHE_DIR)
        self.requests_per_minute = settings.SCRAPELIB_RPM
        self.retry_attempts = settings.SCRAPELIB_RETRY_ATTEMPTS
        self.retry_wait_seconds = settings.SCRAPELIB_RETRY_WAIT_SECONDS

        if fastmode:
            self.requests_per_minute = 0
            self.cache_write_only = False

        self.metadata = metadata
        self.output_dir = output_dir
        self.output_names = set()

        # make output_dir
        os.path.isdir(self.output_dir) or os.makedirs(self.output_dir)

        # validation
        self.strict_validation = strict_validation
        self.validator = DatetimeValidator()
        self._schema = {}
        self._load_schemas()

        self.follow_robots = False

        # logging convenience methods
        self.logger = logging.getLogger("billy")
        self.log = self.logger.info
        self.info = self.logger.info
        self.debug = self.logger.debug
        self.warning = self.logger.warning
        self.error = self.logger.error
        self.critical = self.logger.critical

    def _load_schemas(self):
        """ load all schemas into schema dict """
        types = ('bill', 'committee', 'person', 'vote', 'event', 'speech')

        for type in types:
            schema_path = os.path.join(os.path.split(__file__)[0],
                                       '../schemas/%s.json' % type)
            self._schema[type] = json.load(open(schema_path))
            self._schema[type]['properties'][settings.LEVEL_FIELD] = {
                'minLength': 2, 'type': 'string'}

        # bills & votes
        self._schema['bill']['properties']['session']['enum'] = \
            self.all_sessions()
        self._schema['vote']['properties']['session']['enum'] = \
            self.all_sessions()

        # legislators
        terms = [t['name'] for t in self.metadata['terms']]
        # ugly break here b/c this line is nearly impossible to split
        self._schema['person']['properties']['roles'][
            'items']['properties']['term']['enum'] = terms

    @property
    def object_count(self):
        # number of distinct output filenames
        return len(self.output_names)

    def validate_json(self, obj):
        try:
            self.validator.validate(obj, self._schema[obj['_type']])
        except ValueError as ve:
            self.warning(str(ve))
            if self.strict_validation:
                raise ve

    def all_sessions(self):
        sessions = []
        for t in self.metadata['terms']:
            sessions.extend(t['sessions'])
        return sessions

    def validate_session(self, session, latest_only=False):
        """ Check that a session is present in the metadata dictionary.
#.........some code omitted here.........
Example 3: main
# Required import: from billy.scrape.validator import DatetimeValidator [as alias]
# Or: from billy.scrape.validator.DatetimeValidator import validate [as alias]
#.........some code omitted here.........
    if 'events' in metadata['feature_flags']:
        args.types.append('events')
    if 'speeches' in metadata['feature_flags']:
        args.types.append('speeches')

    plan = """billy-update abbr=%s
    actions=%s
    types=%s
    sessions=%s
    terms=%s""" % (args.module, ','.join(args.actions), ','.join(args.types),
                   ','.join(args.sessions), ','.join(args.terms))
    logging.getLogger('billy').info(plan)

    scrape_data = {}

    if 'scrape' in args.actions:
        _clear_scraped_data(args.output_dir)

        # validate then write metadata
        if hasattr(module, 'session_list'):
            session_list = module.session_list()
        else:
            session_list = []

        check_sessions(metadata, session_list)

        try:
            schema_path = os.path.join(os.path.split(__file__)[0],
                                       '../schemas/metadata.json')
            schema = json.load(open(schema_path))

            validator = DatetimeValidator()
            validator.validate(metadata, schema)
        except ValueError as e:
            logging.getLogger('billy').warning(
                'metadata validation error: ' + str(e))

        run_record = []
        exec_record = {
            "run_record": run_record,
            "args": sys.argv,
        }

        lex = None
        exc_traceback = None

        # start to run scrapers
        exec_start = dt.datetime.utcnow()

        # scraper order matters
        order = ('legislators', 'committees', 'votes', 'bills',
                 'events', 'speeches')
        _traceback = None
        try:
            for stype in order:
                if stype in args.types:
                    run_record += _run_scraper(stype, args, metadata)
        except Exception as e:
            _traceback = _, _, exc_traceback = sys.exc_info()
            run_record += [{"exception": e, "type": stype}]
            lex = e

        exec_end = dt.datetime.utcnow()
        exec_record['started'] = exec_start
        exec_record['ended'] = exec_end
Example 4: DatetimeValidator
# Required import: from billy.scrape.validator import DatetimeValidator [as alias]
# Or: from billy.scrape.validator.DatetimeValidator import validate [as alias]
    # make output dir
    args.output_dir = os.path.join(settings.BILLY_DATA_DIR, state)
    try:
        os.makedirs(args.output_dir)
    except OSError as e:
        if e.errno != 17:
            raise e

    # write metadata
    try:
        schema_path = os.path.join(os.path.split(__file__)[0],
                                   '../schemas/metadata.json')
        schema = json.load(open(schema_path))

        validator = DatetimeValidator()
        validator.validate(metadata, schema)
    except ValueError as e:
        logging.getLogger('billy').warning('metadata validation error: '
                                           + str(e))

    with open(os.path.join(args.output_dir, 'state_metadata.json'), 'w') as f:
        json.dump(metadata, f, cls=JSONDateEncoder)

    # determine time period to run for
    if args.terms:
        for term in metadata['terms']:
            if term in args.terms:
                args.sessions.extend(term['sessions'])
    args.sessions = set(args.sessions or [])

    # determine chambers
Example 5: Scraper
# Required import: from billy.scrape.validator import DatetimeValidator [as alias]
# Or: from billy.scrape.validator.DatetimeValidator import validate [as alias]
class Scraper(scrapelib.Scraper):
    """ Base class for all Scrapers

    Provides several useful methods for retrieving URLs and checking
    arguments against metadata.
    """

    __metaclass__ = ScraperMeta
    latest_only = False

    def __init__(self, metadata, output_dir=None, strict_validation=None, fastmode=False, **kwargs):
        """
        Create a new Scraper instance.

        :param metadata: metadata for this scraper
        :param output_dir: the data directory to use
        :param strict_validation: exit immediately if validation fails
        """
        # configure underlying scrapelib object
        kwargs["cache_obj"] = scrapelib.FileCache(settings.BILLY_CACHE_DIR)
        kwargs["requests_per_minute"] = settings.SCRAPELIB_RPM
        kwargs["timeout"] = settings.SCRAPELIB_TIMEOUT
        kwargs["retry_attempts"] = settings.SCRAPELIB_RETRY_ATTEMPTS
        kwargs["retry_wait_seconds"] = settings.SCRAPELIB_RETRY_WAIT_SECONDS

        if fastmode:
            kwargs["requests_per_minute"] = 0
            kwargs["cache_write_only"] = False

        super(Scraper, self).__init__(**kwargs)

        self.metadata = metadata
        self.output_dir = output_dir
        self.output_names = set()

        # make output_dir
        os.path.isdir(self.output_dir) or os.makedirs(self.output_dir)

        # validation
        self.strict_validation = strict_validation
        self.validator = DatetimeValidator()
        self._schema = {}
        self._load_schemas()

        self.follow_robots = False

        # logging convenience methods
        self.logger = logging.getLogger("billy")
        self.log = self.logger.info
        self.info = self.logger.info
        self.debug = self.logger.debug
        self.warning = self.logger.warning
        self.error = self.logger.error
        self.critical = self.logger.critical

    def _load_schemas(self):
        """ load all schemas into schema dict """
        types = ("bill", "committee", "person", "vote", "event", "speech")

        for type in types:
            schema_path = os.path.join(os.path.split(__file__)[0], "../schemas/%s.json" % type)
            self._schema[type] = json.load(open(schema_path))
            self._schema[type]["properties"][settings.LEVEL_FIELD] = {"maxLength": 2, "minLength": 2, "type": "string"}

        # bills & votes
        self._schema["bill"]["properties"]["session"]["enum"] = self.all_sessions()
        self._schema["vote"]["properties"]["session"]["enum"] = self.all_sessions()

        # legislators
        terms = [t["name"] for t in self.metadata["terms"]]
        self._schema["person"]["properties"]["roles"]["items"]["properties"]["term"]["enum"] = terms

    @property
    def object_count(self):
        # number of distinct output filenames
        return len(self.output_names)

    def validate_json(self, obj):
        try:
            self.validator.validate(obj, self._schema[obj["_type"]])
        except ValueError as ve:
            self.warning(str(ve))
            if self.strict_validation:
                raise ve

    def all_sessions(self):
        sessions = []
        for t in self.metadata["terms"]:
            sessions.extend(t["sessions"])
        return sessions

    def validate_session(self, session, latest_only=False):
        """ Check that a session is present in the metadata dictionary.

        raises :exc:`~billy.scrape.NoDataForPeriod` if session is invalid

        :param session: string representing session to check
#.........some code omitted here.........
Example 6: Scraper
# Required import: from billy.scrape.validator import DatetimeValidator [as alias]
# Or: from billy.scrape.validator.DatetimeValidator import validate [as alias]
class Scraper(scrapelib.Scraper):
    """ Base class for all Scrapers

    Provides several useful methods for retrieving URLs and checking
    arguments against metadata.
    """

    __metaclass__ = ScraperMeta
    latest_only = False

    def __init__(self, metadata, output_dir=None, strict_validation=None,
                 fastmode=False, **kwargs):
        """
        Create a new Scraper instance.

        :param metadata: metadata for this scraper
        :param output_dir: the data directory to use
        :param strict_validation: exit immediately if validation fails
        """
        # configure underlying scrapelib object
        kwargs['cache_obj'] = scrapelib.FileCache(settings.BILLY_CACHE_DIR)
        kwargs['requests_per_minute'] = settings.SCRAPELIB_RPM
        kwargs['timeout'] = settings.SCRAPELIB_TIMEOUT
        kwargs['retry_attempts'] = settings.SCRAPELIB_RETRY_ATTEMPTS
        kwargs['retry_wait_seconds'] = settings.SCRAPELIB_RETRY_WAIT_SECONDS

        if fastmode:
            kwargs['requests_per_minute'] = 0
            kwargs['cache_write_only'] = False

        super(Scraper, self).__init__(**kwargs)

        for f in settings.BILLY_LEVEL_FIELDS[self.level]:
            if not hasattr(self, f):
                raise Exception('%s scrapers must have a %s attribute' % (
                    self.level, f))

        self.metadata = metadata
        self.output_dir = output_dir

        # make output_dir
        os.path.isdir(self.output_dir) or os.makedirs(self.output_dir)

        # validation
        self.strict_validation = strict_validation
        self.validator = DatetimeValidator()

        self.follow_robots = False

        # logging convenience methods
        self.logger = logging.getLogger("billy")
        self.log = self.logger.info
        self.debug = self.logger.debug
        self.warning = self.logger.warning

    def validate_json(self, obj):
        if not hasattr(self, '_schema'):
            self._schema = self._get_schema()
        try:
            self.validator.validate(obj, self._schema)
        except ValueError as ve:
            self.warning(str(ve))
            if self.strict_validation:
                raise ve

    def all_sessions(self):
        sessions = []
        for t in self.metadata['terms']:
            sessions.extend(t['sessions'])
        return sessions

    def validate_session(self, session):
        """ Check that a session is present in the metadata dictionary.

        raises :exc:`~billy.scrape.NoDataForPeriod` if session is invalid

        :param session: string representing session to check
        """
        for t in self.metadata['terms']:
            if session in t['sessions']:
                return True
        raise NoDataForPeriod(session)

    def validate_term(self, term, latest_only=False):
        """ Check that a term is present in the metadata dictionary.

        raises :exc:`~billy.scrape.NoDataForPeriod` if term is invalid

        :param term: string representing term to check
        :param latest_only: if True, will raise exception if term is not
                            the current term (default: False)
        """
        if latest_only:
            if term == self.metadata['terms'][-1]['name']:
                return True
            else:
                raise NoDataForPeriod(term)
#.........some code omitted here.........
Example 7: Scraper
# Required import: from billy.scrape.validator import DatetimeValidator [as alias]
# Or: from billy.scrape.validator.DatetimeValidator import validate [as alias]
class Scraper(scrapelib.Scraper):
    """ Base class for all Scrapers

    Provides several useful methods for retrieving URLs and checking
    arguments against metadata.
    """

    __metaclass__ = ScraperMeta

    def __init__(self, metadata, no_cache=False, output_dir=None,
                 strict_validation=None, **kwargs):
        """
        Create a new Scraper instance.

        :param metadata: metadata for this state
        :param no_cache: if True, will ignore any cached downloads
        :param output_dir: the data directory to use
        :param strict_validation: exit immediately if validation fails
        """
        # configure underlying scrapelib object
        if no_cache:
            kwargs['cache_dir'] = None
        elif 'cache_dir' not in kwargs:
            kwargs['cache_dir'] = settings.BILLY_CACHE_DIR

        if 'error_dir' not in kwargs:
            kwargs['error_dir'] = settings.BILLY_ERROR_DIR

        if 'timeout' not in kwargs:
            kwargs['timeout'] = settings.SCRAPELIB_TIMEOUT

        if 'requests_per_minute' not in kwargs:
            kwargs['requests_per_minute'] = None

        if 'retry_attempts' not in kwargs:
            kwargs['retry_attempts'] = settings.SCRAPELIB_RETRY_ATTEMPTS

        if 'retry_wait_seconds' not in kwargs:
            kwargs['retry_wait_seconds'] = settings.SCRAPELIB_RETRY_WAIT_SECONDS

        super(Scraper, self).__init__(**kwargs)

        if not hasattr(self, 'state'):
            raise Exception('Scrapers must have a state attribute')

        self.metadata = metadata
        self.output_dir = output_dir

        # validation
        self.strict_validation = strict_validation
        self.validator = DatetimeValidator()

        self.follow_robots = False

        # logging convenience methods
        self.logger = logging.getLogger("billy")
        self.log = self.logger.info
        self.debug = self.logger.debug
        self.warning = self.logger.warning

    def validate_json(self, obj):
        if not hasattr(self, '_schema'):
            self._schema = self._get_schema()
        try:
            self.validator.validate(obj, self._schema)
        except ValueError as ve:
            self.warning(str(ve))
            if self.strict_validation:
                raise ve
Example 8: Scraper
# Required import: from billy.scrape.validator import DatetimeValidator [as alias]
# Or: from billy.scrape.validator.DatetimeValidator import validate [as alias]
class Scraper(scrapelib.Scraper):
    """ Base class for all Scrapers

    Provides several useful methods for retrieving URLs and checking
    arguments against metadata.
    """

    __metaclass__ = ScraperMeta

    def __init__(self, metadata, no_cache=False, output_dir=None,
                 strict_validation=None, **kwargs):
        """
        Create a new Scraper instance.

        :param metadata: metadata for this state
        :param no_cache: if True, will ignore any cached downloads
        :param output_dir: the data directory to use
        :param strict_validation: exit immediately if validation fails
        """
        # configure underlying scrapelib object
        if no_cache:
            kwargs['cache_dir'] = None
        elif 'cache_dir' not in kwargs:
            kwargs['cache_dir'] = settings.BILLY_CACHE_DIR

        if 'error_dir' not in kwargs:
            kwargs['error_dir'] = settings.BILLY_ERROR_DIR

        if 'timeout' not in kwargs:
            kwargs['timeout'] = settings.SCRAPELIB_TIMEOUT

        if 'requests_per_minute' not in kwargs:
            kwargs['requests_per_minute'] = None

        if 'retry_attempts' not in kwargs:
            kwargs['retry_attempts'] = settings.SCRAPELIB_RETRY_ATTEMPTS

        if 'retry_wait_seconds' not in kwargs:
            kwargs['retry_wait_seconds'] = settings.SCRAPELIB_RETRY_WAIT_SECONDS

        super(Scraper, self).__init__(**kwargs)

        if not hasattr(self, 'state'):
            raise Exception('Scrapers must have a state attribute')

        self.metadata = metadata
        self.output_dir = output_dir

        # validation
        self.strict_validation = strict_validation
        self.validator = DatetimeValidator()

        self.follow_robots = False

        # logging convenience methods
        self.logger = logging.getLogger("billy")
        self.log = self.logger.info
        self.debug = self.logger.debug
        self.warning = self.logger.warning

    def validate_json(self, obj):
        if not hasattr(self, '_schema'):
            self._schema = self._get_schema()
        try:
            self.validator.validate(obj, self._schema)
        except ValueError as ve:
            self.warning(str(ve))
            if self.strict_validation:
                raise ve

    def all_sessions(self):
        sessions = []
        for t in self.metadata['terms']:
            sessions.extend(t['sessions'])
        return sessions

    def validate_session(self, session):
        """ Check that a session is present in the metadata dictionary.

        raises :exc:`~billy.scrape.NoDataForPeriod` if session is invalid

        :param session: string representing session to check
        """
        for t in self.metadata['terms']:
            if session in t['sessions']:
                return True
        raise NoDataForPeriod(session)

    def validate_term(self, term, latest_only=False):
        """ Check that a term is present in the metadata dictionary.

        raises :exc:`~billy.scrape.NoDataForPeriod` if term is invalid

        :param term: string representing term to check
        :param latest_only: if True, will raise exception if term is not
                            the current term (default: False)
        """
        if latest_only:
#.........some code omitted here.........
Example 9: main
# Required import: from billy.scrape.validator import DatetimeValidator [as alias]
# Or: from billy.scrape.validator.DatetimeValidator import validate [as alias]
#.........some code omitted here.........
        args.types.append('speeches')

    plan = """billy-update abbr=%s
    actions=%s
    types=%s
    sessions=%s
    terms=%s""" % (args.module, ','.join(args.actions), ','.join(args.types),
                   ','.join(args.sessions), ','.join(args.terms))
    _log.info(plan)

    scrape_data = {}

    if args.billid is False:
        _log.debug("No billid filter.")
    else:
        _log.debug("Search for billid: %s" % args.billid)

    if 'scrape' in args.actions:
        _clear_scraped_data(args.output_dir)

        # validate then write metadata
        if hasattr(module, 'session_list'):
            session_list = module.session_list()
        else:
            session_list = []

        check_sessions(metadata, session_list)

        _log.debug("Session List %s" % session_list)

        try:
            schema_path = os.path.join(
                os.path.split(__file__)[0],
                '../schemas/metadata.json')
            schema = json.load(open(schema_path))

            validator = DatetimeValidator()
            validator.validate(metadata, schema)
        except ValueError as e:
            _log.warning(
                'metadata validation error: ' + str(e))

        with open(os.path.join(args.output_dir, 'metadata.json'),
                  'w') as f:
            json.dump(metadata, f, cls=JSONDateEncoder)

        run_record = []
        exec_record = {
            "run_record": run_record,
            "args": sys.argv,
            "state": abbrev
        }

        lex = None
        exc_traceback = None

        # start to run scrapers
        exec_start = dt.datetime.utcnow()

        # scraper order matters
        if args.billid is False:
            order = (
                'legislators',
                'committees',
                'votes',
                'bills',
                'events',
                'speeches')
        else:
            _log.debug("going to process bills")