本文整理汇总了Python中oaipmh.metadata.MetadataRegistry类的典型用法代码示例。如果您正苦于以下问题：Python MetadataRegistry类的具体用法？Python MetadataRegistry怎么用？Python MetadataRegistry使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了MetadataRegistry类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: gather_stage
def gather_stage(self, harvest_job):
    """Probe the source, choose a harvester, and delegate the gather stage.

    The source URL is probed with ``client.identify()``. If the reply cannot
    be parsed as XML the DDI harvester is selected; a URL error is recorded
    as a gather error and aborts the stage. When no harvester has been
    selected, the OAI-PMH harvester is used. Every harvest object returned
    by the delegate gets the jsonpickle-encoded harvester stored under the
    ``'harv'`` key of its JSON content.

    :param harvest_job: job whose ``source.url`` is probed and gathered
    :returns: list of harvest object ids, or ``None`` when the source could
        not be identified
    """
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    probe = oaipmh.client.Client(harvest_job.source.url, registry)

    # Decide whether the endpoint speaks OAI-PMH or DDI.
    try:
        probe.identify()
    except XMLSyntaxError:
        self.harvester = DDIHarvester()
    except urllib2.URLError:
        self._save_gather_error('Could not identify source!', harvest_job)
        return None
    if not self.harvester:
        self.harvester = OAIPMHHarvester()

    harvested_ids = []
    for object_id in self.harvester.gather_stage(harvest_job):
        harvest_object = HarvestObject.get(object_id)
        payload = json.loads(harvest_object.content)
        # Remember which harvester produced the object.
        payload['harv'] = jsonpickle.encode(self.harvester)
        harvest_object.content = json.dumps(payload)
        harvest_object.save()
        harvested_ids.append(harvest_object.id)
    return harvested_ids
示例2: scrape
def scrape(self):
    """Pickle the records of the configured OAI-PMH endpoint to a file.

    The method is currently disabled: its first statement raises
    ``Exception("not finished")``, so nothing below it runs.

    The remaining code reads the ``pmh-endpoint``, ``set``, ``from``,
    ``until`` and ``data-file`` settings, prints the available sets, lists
    the ``oai_dc`` records matching the filters, and pickles them into the
    data file inside ``self.work_dir()``.
    """
    raise Exception("not finished")

    def parse_date(text):
        # Dash-separated integer fields are passed positionally to datetime.
        return datetime.datetime(*[int(part) for part in text.split("-")])

    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    endpoint = self.setting('pmh-endpoint')
    client = Client(endpoint, registry)

    print(" OAI Repository %s" % (endpoint,))
    print(" Available sets:")
    for set_info in client.listSets():
        print("  %s" % (set_info,))

    oai_set = self.setting('set')
    oai_from = self.setting('from')
    oai_until = self.setting('until')

    filters = {}
    if oai_set:
        filters['set'] = oai_set
    if oai_from is not None:
        filters['from_'] = parse_date(oai_from)
    if oai_until is not None:
        filters['until'] = parse_date(oai_until)

    records = list(client.listRecords(metadataPrefix='oai_dc', **filters))
    target = os.path.join(self.work_dir(), self.setting('data-file'))
    with open(target, 'wb') as handle:
        print(" picking %s records" % len(records))
        pickle.dump(records, handle)
示例3: insertAll
def insertAll(time, time2):
    """Harvest arXivRaw records in a time window and insert each of them.

    Records are listed from ``URL`` with ``from_=time`` and ``until=time2``.
    For every record the title, categories, abstract, URL, creation date and
    authors are printed and then passed to ``insert``. A record that fails
    to convert or insert is printed and counted; the error count is printed
    once the listing is exhausted.

    :param time: lower bound handed to ``listRecords`` as ``from_``
    :param time2: upper bound handed to ``listRecords`` as ``until``
    """
    registry = MetadataRegistry()
    registry.registerReader('arXivRaw', arXivRaw_reader)
    client = Client(URL, registry)
    client.updateGranularity()
    records = client.listRecords(metadataPrefix='arXivRaw', from_=time, until=time2)
    errors = 0
    for record in records:
        try:
            fields = record[1]
            title = '\n'.join(fields['title'])
            # Category names use '-' upstream; they are stored with '_'.
            sr2 = str(' '.join(fields['categories']).replace('-', '_')).split(' ')
            abstract = '\n'.join(fields['abstract'])
            url = 'http://arxiv.org/abs/' + fields['id'][0]
            date = datetime.strptime(fields['created'][0], '%a, %d %b %Y %H:%M:%S %Z')
            authors = fields['authors'][0]
            abstract = abstract + '\nBy: ' + authors + '\nIn: ' + ', '.join(sr2)
            print(title)
            print(sr2)
            print(abstract)
            print(url)
            print(date)
            print(authors)
            insert(title + ' (' + authors + ')', str("fullarxiv"), url, abstract, date=date, cross_srs=sr2)
        except Exception:
            # Best effort per record, but let KeyboardInterrupt/SystemExit
            # propagate instead of counting them as record errors.
            print('ERROR')
            print(record)
            errors += 1
    print('Completed with %s errors' % errors)
示例4: arxiv_oai_scraper
def arxiv_oai_scraper(subject, start, end, sleep_time=0):
    """List ``oai_dc`` records of one arXiv set and return them as dicts.

    :param subject: value formatted into the ``set`` argument of the listing
    :param start: passed to ``listRecords`` as ``from_``
    :param end: passed to ``listRecords`` as ``until``
    :param sleep_time: seconds passed to ``time.sleep`` after each record
    :returns: list of dicts with the keys ``title``, ``abstract``, ``date``,
        ``subject``, ``url`` and ``authors``
    """
    # Output key -> metadata field it is read from, in lookup order.
    field_map = (
        ("title", "title"),
        ("abstract", "description"),
        ("date", "date"),
        ("subject", "subject"),
        ("url", "identifier"),
        ("authors", "creator"),
    )

    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client("http://export.arxiv.org/oai2", registry)
    client.updateGranularity()

    collected = []
    listing = client.listRecords(
        metadataPrefix='oai_dc',
        set="{}".format(subject),
        from_=start,
        until=end,
    )
    for _, md, _ in listing:
        # Records without metadata are skipped.
        if md is not None:
            collected.append({key: md[source] for key, source in field_map})
        # NOTE(review): sleeps once per record, including skipped ones;
        # confirm this nesting against the upstream layout.
        time.sleep(sleep_time)
    return collected
示例5: __init__
def __init__(self, url):
    """Initialize the client with a registry that reads the ``oaf`` prefix.

    :param url: endpoint URL forwarded to the parent client constructor
    """
    registry = MetadataRegistry()
    registry.registerReader('oaf', self.oaf_reader)
    # __init__ must not return a value, so the parent call is not returned.
    super(OpenAireClient, self).__init__(url, metadata_registry=registry)
示例6: list_oai_collections
def list_oai_collections(self, community):
    """Append the collection sets found in *community* to ``self.collections``.

    The record headers of the community set are listed from the community's
    repository. Every set spec starting with ``'col'`` is collected, the
    number of distinct specs is printed, and one
    ``[identifier, 'Collection: <identifier>']`` entry per spec is appended
    to ``self.collections``.

    :param community: object providing ``repository.base_url`` and
        ``identifier``
    :returns: ``None``; returns early without appending anything when the
        listing request cannot be set up
    """
    try:
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        client = Client(community.repository.base_url, registry)
        headers = client.listIdentifiers(
            metadataPrefix='oai_dc', set=community.identifier)
    except Exception:
        # Best effort: an unreachable or misbehaving repository yields no
        # collections. KeyboardInterrupt/SystemExit are left to propagate.
        return

    # Filter the headers down to the collection specs of the community set.
    community_collections = set()
    for header in headers:
        for spec in header.setSpec():
            if spec[:3] == 'col':
                community_collections.add(spec)
    print(len(community_collections))

    # Store [identifier, human readable name] for every collection.
    for spec in community_collections:
        self.collections.append([spec, 'Collection: %s' % spec])
示例7: _get_client_identifier
def _get_client_identifier(self, url, harvest_job=None):
    """Build an OAI-PMH client for *url* and try to identify the repository.

    :param url: endpoint URL handed to the client
    :param harvest_job: optional job; when given, URL and socket failures
        are recorded on it through ``self._save_gather_error``
    :returns: ``(client, identifier)`` on success, ``(client, None)`` when
        ``identify()`` raised
    """
    registry = MetadataRegistry()
    registry.registerReader(self.metadata_prefix_value, oai_dc_reader)
    client = oaipmh.client.Client(url, registry)
    try:
        identifier = client.identify()
    except (urllib2.URLError, urllib2.HTTPError,):
        if harvest_job:
            self._save_gather_error(
                'Could not gather from %s!' % harvest_job.source.url,
                harvest_job)
        return client, None
    except socket.error:
        if harvest_job:
            # exc_info() yields (exception class, exception instance, ...).
            exc_type, exc_value = sys.exc_info()[:2]
            self._save_gather_error(
                'Socket error OAI-PMH %s, details:\n%s' % (exc_type, exc_value),
                harvest_job)
        return client, None
    except ValueError:
        # We have no source URL when importing via UI.
        return client, None
    except Exception:
        # Guard against miscellaneous stuff. Probably plain bugs.
        # format_exc() formats the exception being handled; its only
        # positional argument is a frame limit, not an exception object.
        log.debug(traceback.format_exc())
        return client, None
    return client, identifier
示例8: gather_stage
def gather_stage(self, harvest_job):
    '''
    Create one HarvestObject per OAI-PMH set of the harvest source.

    The source is identified first; a URL error is stored as a gather error
    and ends the stage. The sets of the repository are listed and, when the
    configuration has a ``query`` entry, only sets whose name contains the
    query are kept. If the repository has no set hierarchy, a single
    ``('1', 'Default')`` set is used and a gather error is stored. Each
    selected set becomes a HarvestObject whose JSON content holds the set
    id, the set name and the repository name.

    :param harvest_job: HarvestObject job providing ``source.url`` and
        ``source.config``
    :returns: A list of HarvestObject ids, or None if the source could not
        be identified
    '''
    self._set_config(harvest_job.source.config)

    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = oaipmh.client.Client(harvest_job.source.url, registry)
    try:
        repository = client.identify()
    except urllib2.URLError:
        self._save_gather_error('Could not gather anything from %s!' %
                                harvest_job.source.url, harvest_job)
        return None

    domain = repository.repositoryName()
    group = Group.by_name(domain)
    if not group:
        group = Group(name=domain, description=domain)

    has_query = 'query' in self.config
    query = self.config['query'] if has_query else ''

    selected_sets = []
    try:
        for set_spec, set_name, _ in client.listSets():
            # Without a configured query every set is kept.
            if not has_query or query in set_name:
                selected_sets.append((set_spec, set_name))
    except NoSetHierarchyError:
        selected_sets.append(('1', 'Default'))
        self._save_gather_error('Could not fetch sets!', harvest_job)

    harvest_object_ids = []
    for set_id, set_name in selected_sets:
        harvest_object = HarvestObject(job=harvest_job)
        content = {
            'set': set_id,
            'set_name': set_name,
            'domain': domain,
        }
        harvest_object.content = json.dumps(content)
        harvest_object.save()
        harvest_object_ids.append(harvest_object.id)
    model.repo.commit()
    return harvest_object_ids
示例9: __init__
def __init__(self, configuration_file):
    """Read the configuration file and build the OAI-PMH client from it.

    :param configuration_file: path handed to the config parser's ``read``
    """
    parser = ConfigParser.SafeConfigParser()
    parser.read(configuration_file)
    self.oai_config = parser
    self.current_config = 'ToulouseBis'

    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    # The URL lookup runs only after the parser and section name are set.
    endpoint = self._get_config_value('url')
    self.client = Client(endpoint, registry)
示例10: test
def test(request):
    """Identify the kulturarv.dk OAI handler and respond with its name.

    :param request: incoming request; not read by this view
    :returns: ``HttpResponse`` containing the repository name
    """
    endpoint = 'http://www.kulturarv.dk/ffrepox/OAIHandler'
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    identity = Client(endpoint, registry).identify()
    # Debug aid: show which attributes the identify response offers.
    print(dir(identity))
    return HttpResponse(identity.repositoryName())
示例11: test_get_record
def test_get_record(self):
    """GetRecord for the first listed identifier validates and has 'abraham'."""
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(config.get('ckan.site_url') + self.base_url, registry)

    listing = self._oai_get_method_and_validate(
        '?verb=ListIdentifiers&metadataPrefix=oai_dc&set=roger')
    # The client fetches through urllib2, so the listing is served from a mock.
    urllib2.urlopen = mock.Mock(return_value=StringIO(listing))
    headers = client.listIdentifiers(metadataPrefix='oai_dc')

    first_identifier = headers.next().identifier()
    record_url = self.base_url + (
        '?verb=GetRecord&identifier=%s&metadataPrefix=oai_dc' % first_identifier)
    response = self.app.get(record_url)
    self.assert_(oaischema.validate(etree.fromstring(response.body)))
    self.assert_("abraham" in response.body)
示例12: harvest_oai_collection_records
def harvest_oai_collection_records(self, collection):
    """List the ``oai_dc`` records of *collection* from its repository.

    :param collection: object providing ``community.repository.base_url``
        and ``identifier``
    :returns: the result of ``client.listRecords`` for the collection set,
        or ``None`` when the request could not be set up
    """
    try:
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        client = Client(collection.community.repository.base_url, registry)
        return client.listRecords(
            metadataPrefix='oai_dc', set=collection.identifier)
    except Exception:
        # Best effort: failures yield None. KeyboardInterrupt/SystemExit
        # are left to propagate instead of being swallowed.
        return None
示例13: clean
def clean(self):
    """Validate the base URL by identifying the repository behind it.

    On success the repository name reported by the server is stored under
    ``cleaned_data['name']``.

    :returns: the cleaned data dictionary
    :raises ValidationError: when the repository cannot be identified
    """
    cleaned_data = super(CreateRepositoryForm, self).clean()
    try:
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        client = Client(cleaned_data.get('base_url'), registry)
        server = client.identify()
        # set the repository name apply to model instance when saved.
        cleaned_data['name'] = server.repositoryName()
    except Exception:
        # Only ordinary errors become validation failures;
        # KeyboardInterrupt/SystemExit keep propagating.
        raise ValidationError('Repository base url is invalid.')
    return cleaned_data
示例14: test_resumption_identifiers
def test_resumption_identifiers(self):
    """Every header listed through the batching server is truthy."""
    client_registry = MetadataRegistry()
    client_registry.registerReader('oai_dc', oai_dc_reader)
    # Put the real urlopen back in place before talking to the server.
    urllib2.urlopen = realopen

    backend = CKANServer()
    server_registry = metadata.MetadataRegistry()
    server_registry.registerReader('oai_dc', oai_dc_reader)
    server_registry.registerWriter('oai_dc', oai_dc_writer)
    batching = BatchingServer(backend, metadata_registry=server_registry)

    client = ServerClient(batching, client_registry)
    for header in client.listIdentifiers(metadataPrefix='oai_dc'):
        self.assert_(header)
示例15: get_client
def get_client(url, transforms):
    """Build a client whose reader is derived from *transforms*.

    The first metadata format reported by the server is overwritten with
    the hard-coded ``fbb`` entry. A reader mapping each transform's
    ``field`` to a ``('textList', path)`` pair is registered under the
    prefix of that first entry.

    :param url: endpoint URL handed to the client
    :param transforms: passed through ``fix_transforms``; each resulting
        item is read for its ``'field'`` and ``'path'`` keys
    :returns: ``(client, prefix)`` where prefix is the registered prefix
    """
    transforms = fix_transforms(transforms)
    registry = MetadataRegistry()
    client = Client(url, registry)

    formats = client.listMetadataFormats()
    formats[0] = [
        'fbb',
        'http://www.kulturarv.dk/fbb/fbb.xsd',
        'http://www.kulturarv.dk/fbb',
    ]
    namespaces = {entry[0]: entry[2] for entry in formats}
    fields = {
        transform['field']: ('textList', transform['path'])
        for transform in transforms
    }
    prefix = formats[0][0]
    print("%s %s" % (namespaces, fields))

    reader = MetadataReader(fields=fields, namespaces=namespaces)
    registry.registerReader(prefix, reader)
    return client, prefix