本文整理汇总了Python中oaipmh.client.Client.listRecords方法的典型用法代码示例。如果您正苦于以下问题:Python Client.listRecords方法的具体用法?Python Client.listRecords怎么用?Python Client.listRecords使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类oaipmh.client.Client的用法示例。
在下文中一共展示了Client.listRecords方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: list_records
# 需要导入模块: from oaipmh.client import Client [as 别名]
# 或者: from oaipmh.client.Client import listRecords [as 别名]
def list_records(target, date_from, date_until, setspec):
    """Harvest records from an OAI-PMH target and convert each one.

    Args:
        target: Mapping providing the 'url', 'metadata_prefix' and 'title'
            keys, or None.
        date_from: Lower datestamp bound; left out of the request when None.
        date_until: Upper datestamp bound; left out of the request when None.
        setspec: OAI set to restrict to; left out of the request when None.

    Returns:
        A list with one ``convert_record`` result per harvested record (empty
        when the client hands back None), or None when ``target`` is None.
        NOTE(review): the None return mirrors the presumed original nesting,
        where the whole body sat under ``if target is not None`` -- confirm.
    """
    if target is None:
        return None

    client = Client(target['url'], registry)
    prefix = target['metadata_prefix']

    # Forward only the selectors that were actually supplied; this replaces
    # the former eight-branch if/elif chain with the same effective requests.
    optional = (('from_', date_from), ('until', date_until), ('set', setspec))
    selectors = {name: value for name, value in optional if value is not None}
    records = client.listRecords(metadataPrefix=prefix, **selectors)

    if records is None:
        return []
    return [convert_record(record, prefix, target['title'])
            for record in records]
示例2: list_records
# 需要导入模块: from oaipmh.client import Client [as 别名]
# 或者: from oaipmh.client.Client import listRecords [as 别名]
def list_records(target, date_from, date_until, setspec):
    """Lazily harvest and convert records from an OAI-PMH target.

    Generator variant: yields one ``convert_record`` result per harvested
    record instead of building a list.

    Args:
        target: Mapping providing the 'url', 'metadata_prefix' and 'title'
            keys, or None (in which case nothing is yielded).
        date_from: Lower datestamp bound; left out of the request when None.
        date_until: Upper datestamp bound; left out of the request when None.
        setspec: OAI set to restrict to; left out of the request when None.
    """
    logging.debug("list_records")
    if target is None:
        # Nothing to harvest; also avoids touching an unbound ``records``.
        return

    client = Client(target['url'], registry)
    prefix = target['metadata_prefix']

    # Forward only the selectors that were actually supplied; this replaces
    # the former eight-branch if/elif chain with the same effective requests.
    optional = (('from_', date_from), ('until', date_until), ('set', setspec))
    selectors = {name: value for name, value in optional if value is not None}
    records = client.listRecords(metadataPrefix=prefix, **selectors)

    if records is not None:
        for record in records:
            yield convert_record(record, prefix, target['title'])
示例3: scrape
# 需要导入模块: from oaipmh.client import Client [as 别名]
# 或者: from oaipmh.client.Client import listRecords [as 别名]
def scrape(self):
    """Harvest oai_dc records from the configured endpoint and pickle them.

    Currently disabled: the first statement raises ``Exception("not
    finished")``, so the harvesting code below it is unreachable until that
    guard is removed.

    When enabled, the code reads the 'pmh-endpoint', 'set', 'from', 'until'
    and 'data-file' settings, prints the sets the repository advertises,
    harvests all matching records and pickles them into the work directory.

    Raises:
        Exception: always, with the message "not finished".
    """
    # Deliberate guard left by the original author; kept as-is.
    raise Exception("not finished")

    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    url = self.setting('pmh-endpoint')
    client = Client(url, registry)

    # Single-argument print calls produce the same output on Python 2 and 3.
    print(" OAI Repository %s" % (url,))
    print(" Available sets:")
    for s in client.listSets():
        print("  %s" % (s,))

    oai_set = self.setting('set')
    oai_from = self.setting('from')
    oai_until = self.setting('until')

    kwargs = {}
    if oai_set:
        kwargs['set'] = oai_set
    # Both bounds are dash-separated integer fields (e.g. "2014-01-31") that
    # are splatted into datetime.datetime().
    for key, raw in (('from_', oai_from), ('until', oai_until)):
        if raw is not None:
            date_args = [int(arg) for arg in raw.split("-")]
            kwargs[key] = datetime.datetime(*date_args)

    records = list(client.listRecords(metadataPrefix='oai_dc', **kwargs))

    data_filepath = os.path.join(self.work_dir(), self.setting('data-file'))
    with open(data_filepath, 'wb') as f:
        print(" picking %d records" % len(records))
        pickle.dump(records, f)
示例4: insertAll
# 需要导入模块: from oaipmh.client import Client [as 别名]
# 或者: from oaipmh.client.Client import listRecords [as 别名]
def insertAll(time, time2):
    """Harvest arXivRaw records in a datestamp window and insert each one.

    Every record's metadata is turned into a title, category list, abstract,
    URL, creation date and author string, echoed to stdout and passed to
    ``insert``. A record that fails for any reason is printed and counted;
    it does not abort the run.

    Args:
        time: Lower datestamp bound, passed to ``listRecords`` as ``from_``.
        time2: Upper datestamp bound, passed to ``listRecords`` as ``until``.
    """
    registry = MetadataRegistry()
    registry.registerReader('arXivRaw', arXivRaw_reader)
    client = Client(URL, registry)
    client.updateGranularity()
    records = client.listRecords(metadataPrefix='arXivRaw', from_=time, until=time2)

    errors = 0
    for record in records:
        try:
            # Index 1 of each harvested item holds the metadata fields.
            fields = record[1]
            title = '\n'.join(fields['title'])
            sr2 = str(' '.join(fields['categories']).replace('-', '_')).split(' ')
            abstract = '\n'.join(fields['abstract'])
            url = 'http://arxiv.org/abs/' + fields['id'][0]
            date = datetime.strptime(fields['created'][0], '%a, %d %b %Y %H:%M:%S %Z')
            authors = fields['authors'][0]
            abstract = abstract + '\nBy: ' + authors + '\nIn: ' + ', '.join(sr2)
            print(title)
            print(sr2)
            print(abstract)
            print(url)
            print(date)
            print(authors)
            insert(title + ' (' + authors + ')', str("fullarxiv"), url, abstract, date=date, cross_srs=sr2)
        except Exception:
            # Best-effort import: report and count the bad record, but let
            # KeyboardInterrupt/SystemExit propagate (a bare except did not).
            print('ERROR')
            print(record)
            errors += 1
    print('Completed with %s errors' % errors)
示例5: arxiv_oai_scraper
# 需要导入模块: from oaipmh.client import Client [as 别名]
# 或者: from oaipmh.client.Client import listRecords [as 别名]
def arxiv_oai_scraper(subject, start, end, sleep_time=0):
    """Collect Dublin Core fields for arXiv records in one set and date window.

    Args:
        subject: Set name; formatted into a string and sent as ``set``.
        start: Lower datestamp bound (``from_``).
        end: Upper datestamp bound (``until``).
        sleep_time: Seconds passed to ``time.sleep`` after each record.
            NOTE(review): the original indentation was lost; the sleep is
            assumed to sit inside the record loop -- confirm.

    Returns:
        A list of dicts with the keys 'title', 'abstract', 'date', 'subject',
        'url' and 'authors', one per record that carried metadata.
    """
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client("http://export.arxiv.org/oai2", registry)
    client.updateGranularity()
    harvested = client.listRecords(
        metadataPrefix='oai_dc',
        set="{}".format(subject),
        from_=start,
        until=end,
    )

    collected = []
    for _, md, _ in harvested:
        # Some records (the original comment cites a case from 2010) come
        # back without metadata; those are skipped.
        if md is not None:
            collected.append({
                "title": md["title"],
                "abstract": md["description"],
                "date": md["date"],
                "subject": md["subject"],
                "url": md["identifier"],
                "authors": md['creator'],
            })
        time.sleep(sleep_time)
    return collected
示例6: _listRecords
# 需要导入模块: from oaipmh.client import Client [as 别名]
# 或者: from oaipmh.client.Client import listRecords [as 别名]
def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
    """Yield every record served by ``baseUrl`` in the given metadata format.

    Args:
        baseUrl: Endpoint handed to ``Client``.
        metadataPrefix: Metadata format to request; defaults to "oai_dc".
        **kwargs: Extra arguments forwarded unchanged to ``listRecords``.
    """
    # Merge the prefix into the request arguments.
    request_args = dict(kwargs, metadataPrefix=metadataPrefix)
    harvester = Client(baseUrl, metadata_registry)
    # Ask the server which timestamp granularity it supports before listing.
    harvester.updateGranularity()
    for item in harvester.listRecords(**request_args):
        yield item
示例7: pull_data
# 需要导入模块: from oaipmh.client import Client [as 别名]
# 或者: from oaipmh.client.Client import listRecords [as 别名]
def pull_data(source):
    """Harvest public experiments from ``source`` and build index documents.

    For every harvested oai_dc record the creator's profile is fetched with
    ``_get_user`` and merged with the experiment's id, title, description,
    URL and datestamp; the result is serialised with ``json.dumps``.

    Args:
        source: Base URL of the remote site (no trailing OAI path).

    Returns:
        A list of JSON strings, one per experiment, or None when the source
        reports that no records match.

    Raises:
        OAIPMHError: if an AttributeError occurs while reading the records.
    """
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)

    # Get list of public experiments at the source.
    client = Client(source
                    + "/apps/oaipmh/?verb=ListRecords&metadataPrefix=oai_dc", registry)
    experiments = []  # (datestamp string, metadata) pairs, kept in step
    try:
        for (header, meta, extra) in client.listRecords(metadataPrefix='oai_dc'):
            experiments.append((str(header._datestamp), meta))
            logger.debug('Date=%s', header._datestamp)
    except AttributeError as e:
        msg = "Error reading experiment %s" % e
        logger.error(msg)
        raise OAIPMHError(msg)
    except error.NoRecordsMatchError as e:
        msg = "no public records found on source %s" % e
        logger.warning(msg)
        return

    # os.path.join discards ``source`` if the second part starts with '/',
    # so drop one leading slash. This is done on a local copy rather than by
    # rewriting settings.EXPERIMENT_PATH, so the global setting is untouched.
    experiment_path = settings.EXPERIMENT_PATH
    if experiment_path.startswith('/'):
        experiment_path = experiment_path[1:]

    list_of_records = []
    for exp_date, exp_metadata in experiments:
        user_id = exp_metadata.getField('creator')[0]
        user_profile = json.loads(_get_user(source, user_id))
        data_tobe_indexed = dict(user_profile)
        data_tobe_indexed['user_id'] = user_id

        exp_id = exp_metadata.getField('identifier')[0]
        description = exp_metadata.getField('description')[0]
        title = exp_metadata.getField('title')[0]
        experiment_url = os.path.join(source, experiment_path % exp_id)

        data_tobe_indexed['experiment_id'] = exp_id
        data_tobe_indexed['experiment_title'] = title
        data_tobe_indexed['experiment_description'] = description
        data_tobe_indexed['experiment_url'] = experiment_url
        data_tobe_indexed['id'] = experiment_url
        data_tobe_indexed['experiment_date'] = exp_date

        for k, v in data_tobe_indexed.items():
            logger.debug('%s = %s', k, v)
        logger.debug('')
        list_of_records.append(json.dumps(data_tobe_indexed))
    return list_of_records
示例8: harvest_oai_collection_records
# 需要导入模块: from oaipmh.client import Client [as 别名]
# 或者: from oaipmh.client.Client import listRecords [as 别名]
def harvest_oai_collection_records(self, collection):
    """Return the oai_dc records of one collection, or None on failure.

    Args:
        collection: Object exposing ``community.repository.base_url`` (the
            endpoint) and ``identifier`` (sent as the OAI ``set``).

    Returns:
        Whatever ``Client.listRecords`` returns for the collection's set, or
        None if setting up the client or issuing the call raises.
    """
    try:
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        client = Client(collection.community.repository.base_url, registry)
        return client.listRecords(
            metadataPrefix='oai_dc', set=collection.identifier)
    except Exception:
        # Best-effort harvest: ordinary errors mean "no records", but
        # KeyboardInterrupt/SystemExit now propagate (a bare except did not).
        return None
示例9: read_base_records
# 需要导入模块: from oaipmh.client import Client [as 别名]
# 或者: from oaipmh.client.Client import listRecords [as 别名]
def read_base_records(self):
    """Yield ``filter_url`` results for selected records of the doai.io feed.

    Only records whose 'oa' field contains '2' are processed (per the
    original comment, those for which BASE was unsure). For each such
    record, every entry of its 'identifier' field is treated as a candidate
    splash URL and handed to ``self.filter_url``.
    """
    registry = MetadataRegistry()
    registry.registerReader('base_dc', base_dc_reader)
    harvester = Client('http://doai.io/oai', registry)

    for header, record, _ in harvester.listRecords(metadataPrefix='base_dc'):
        if '2' in record['oa']:
            # Each identifier is a potential splash URL.
            for link in record['identifier']:
                details = dict(
                    base_oa=''.join(record['oa']),
                    splash_url=link,
                    from_identifier=header.identifier(),
                )
                yield self.filter_url(link, details, looking_for='any')
示例10: index_documents
# 需要导入模块: from oaipmh.client import Client [as 别名]
# 或者: from oaipmh.client.Client import listRecords [as 别名]
def index_documents(main_url, database_name, url, reader, prefix, format,
                    batch_size=10000):
    """Harvest all records from ``url`` and sync them in batches.

    Args:
        main_url: Passed through to ``sync_files``.
        database_name: Passed through to ``sync_files``.
        url: OAI-PMH endpoint to harvest.
        reader: Metadata reader registered for ``prefix``.
        prefix: Metadata prefix to request.
        format: Callable ``format(metadata, identifier)`` that returns the
            document to store, or None to skip the record.
        batch_size: Number of documents accumulated before each
            ``sync_files`` call; defaults to the previously hard-coded 10000.
    """
    registry = MetadataRegistry()
    registry.registerReader(prefix, reader)
    client = Client(url, registry)

    batch = []
    for record in client.listRecords(metadataPrefix=prefix):
        header, metadata = record[0], record[1]
        value = format(metadata, header.identifier())
        if value is None:
            continue
        batch.append(value)
        if len(batch) >= batch_size:
            sync_files(main_url, database_name, batch)
            batch = []
    # Flush the remainder. As in the original, this call is made even when
    # the remainder is empty.
    sync_files(main_url, database_name, batch)
示例11: scrape
# 需要导入模块: from oaipmh.client import Client [as 别名]
# 或者: from oaipmh.client.Client import listRecords [as 别名]
def scrape(start=START, end=END, set=SET_THESIS, type='Thesis'):
    """
    Create an OAI-PMH client, gather metadata and output it.

    Harvests 'qdc' records between ``start`` and ``end`` for the given set,
    appends one dict per usable record to the module-level ``RECORDS`` list,
    reports progress on stderr and, when ``options.store`` is set, pickles
    ``RECORDS`` to that path.

    Args:
        start: Lower datestamp bound (``from_``).
        end: Upper datestamp bound (``until``).
        set: OAI set to harvest.
        type: Not used by this function; kept for interface compatibility.
    """
    total = num = 0
    msg = "Fetching records between " + str(start) + " and " + str(end)
    sys.stderr.write(msg + "\n")

    #
    # Set up metadata readers
    #
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    registry.registerReader('qdc', qdc_reader)
    # registry.registerReader('rdf', rdf_reader) # no reader yet
    # registry.registerReader('ore', ore_reader) # no reader yet
    # registry.registerReader('mets', mets_reader) # no reader yet

    client = Client(URL, registry)
    records = client.listRecords(metadataPrefix='qdc',
                                 from_=start, until=end, set=set)
    for (h, m, a) in records:
        # Single-argument print: same output on Python 2 and 3.
        print("%s %s %s" % (h, m, a))
        if not m:
            # No metadata on this record; mark it and move on.
            sys.stderr.write("o")
            continue
        total += 1
        handle = m.getField('identifier')
        if not handle:
            sys.stderr.write("Record without a handle.\n")
            continue
        r = {'handle': handle[0]}
        for key in qdc_reader._fields:
            r[key] = m.getField(key)
        RECORDS.append(r)
        sys.stderr.write('.')
        sys.stderr.flush()
        num += 1

    msg = "\nCollected " + str(num) + " records, out of " + str(total)
    sys.stderr.write('\n' + msg + '\n')

    if options.store:
        # Context manager so the pickle file is flushed and closed promptly.
        with open(options.store, "wb") as store_file:
            pickle.dump(RECORDS, store_file)
示例12: update
# 需要导入模块: from oaipmh.client import Client [as 别名]
# 或者: from oaipmh.client.Client import listRecords [as 别名]
def update(self, from_date=None):
    """Harvest the configured OAI server and yield ids of added records.

    Each harvested record is passed to ``self._process_record``; when that
    returns a truthy value, the result of ``self._get_id(header)`` is
    yielded. A ``NoRecordsMatchError`` from the server is ignored. After
    harvesting, the parent class's ``update()`` is called.

    Args:
        from_date: Lower datestamp bound passed as ``from_``; may be None.
    """
    self._log.info('Harvesting oai server: %s' % self._url)

    registry = MetadataRegistry()
    # The registered reader returns the parsed element unchanged.
    registry.registerReader(self._prefix, lambda el: el)
    harvester = Client(self._url, registry)

    try:
        harvested = harvester.listRecords(metadataPrefix=self._prefix,
                                          from_=from_date)
        for header, element, about in harvested:
            if self._process_record(header, element):
                yield self._get_id(header)
    except NoRecordsMatchError:
        # An empty harvest is not an error.
        pass

    super(OAIBasedContentProvider, self).update()
示例13: processItems
# 需要导入模块: from oaipmh.client import Client [as 别名]
# 或者: from oaipmh.client.Client import listRecords [as 别名]
def processItems():
    """Harvest the openbeelden.nl OAI feed and process every record.

    Builds a ``MetadataReader`` for the 'oai_oi' format, registers it,
    lists all records from the feed and passes each one to ``processItem``.
    """
    field_paths = {
        'title': ('textList', 'oai_oi:oi/oi:title/text()'),
        'alternative': ('textList', 'oai_oi:oi/oi:alternative/text()'),
        'creator': ('textList', 'oai_oi:oi/oi:creator/text()'),
        'subject': ('textList', 'oai_oi:oi/oi:subject/text()'),
        'description': ('textList', 'oai_oi:oi/oi:description/text()'),
        'abstract': ('textList', 'oai_oi:oi/oi:abstract/text()'),
        'publisher': ('textList', 'oai_oi:oi/oi:publisher/text()'),
        'contributor': ('textList', 'oai_oi:oi/oi:contributor/text()'),
        'date': ('textList', 'oai_oi:oi/oi:date/text()'),
        'type': ('textList', 'oai_oi:oi/oi:type/text()'),
        # NOTE(review): key is 'extent' but the path reads 'oi:extend' --
        # confirm against the feed's schema before changing either.
        'extent': ('textList', 'oai_oi:oi/oi:extend/text()'),
        'medium': ('textList', 'oai_oi:oi/oi:medium/text()'),
        'identifier': ('textList', 'oai_oi:oi/oi:identifier/text()'),
        'source': ('textList', 'oai_oi:oi/oi:source/text()'),
        'language': ('textList', 'oai_oi:oi/oi:language/text()'),
        'references': ('textList', 'oai_oi:oi/oi:references/text()'),
        'spatial': ('textList', 'oai_oi:oi/oi:spatial/text()'),
        'attributionName': ('textList', 'oai_oi:oi/oi:attributionName/text()'),
        'attributionURL': ('textList', 'oai_oi:oi/oi:attributionURL/text()'),
        'license': ('textList', 'oai_oi:oi/oi:license/text()'),
        # Not present in the feed:
        # 'rights': ('textList', 'oai_oi:oi/oi:rights/text()'),
        # 'relation': ('textList', 'oai_oi:oi/oi:relation/text()'),
        # 'coverage': ('textList', 'oai_oi:oi/oi:coverage/text()'),
        # 'format': ('textList', 'oai_oi:oi/oi:format/text()'),
    }
    xml_namespaces = {
        'oi': 'http://www.openbeelden.nl/oai/',
        'oai_oi': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
        'dc': 'http://purl.org/dc/elements/1.1/',
        'dcterms': 'http://purl.org/dc/terms',
    }
    oai_oi_reader = MetadataReader(fields=field_paths, namespaces=xml_namespaces)

    registry = MetadataRegistry()
    registry.registerReader('oai_oi', oai_oi_reader)
    harvester = Client(u'http://www.openbeelden.nl/feeds/oai/', registry)

    for item in harvester.listRecords(metadataPrefix='oai_oi'):
        processItem(item)
示例14: iter_items
# 需要导入模块: from oaipmh.client import Client [as 别名]
# 或者: from oaipmh.client.Client import listRecords [as 别名]
def iter_items(self, partition):
    """Yield one synthetic HTML item per live record of an OAI-PMH endpoint.

    ``partition`` is the endpoint URL. Deleted records and records without
    an 'identifier' value are skipped. Each yielded item is the tuple
    ``(url, {}, "html", 2, html)`` where ``html`` is a minimal page built
    from the record's first title and first description.
    """
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    harvester = Client(partition, registry)

    for header, metadata, _ in harvester.listRecords(metadataPrefix='oai_dc'):
        if header.isDeleted():
            continue

        fields = metadata.getMap()

        # TODO: there are much validation and heuristics to be done here!
        # (e.g. filtering on the record's "format", such as application/pdf)
        url0 = (fields.get("identifier") or [None])[0]
        if not url0:
            continue

        # Missing title/description fall back to the empty string.
        title0 = (fields.get("title") or [""])[0].encode("utf-8")
        desc0 = (fields.get("description") or [""])[0].encode("utf-8")

        # TODO: validate that the url0 is not on another domain?!
        page = """
<html><head><title>%s</title></head><body>%s</body></html>
""" % (title0, desc0)
        yield url0, {}, "html", 2, page
示例15: _listRecords
# 需要导入模块: from oaipmh.client import Client [as 别名]
# 或者: from oaipmh.client.Client import listRecords [as 别名]
def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
    """Yield ``(header, metadata, about)`` for every record at ``baseUrl``.

    Args:
        baseUrl: Endpoint handed to ``Client``.
        metadataPrefix: Metadata format to request; defaults to "oai_dc".
        **kwargs: Extra arguments forwarded unchanged to ``listRecords``.

    Raises:
        NotOAIPMHBaseURLException: if ``client.identify()`` raises
            IndexError, taken to mean the URL is not an OAI-PMH target.
    """
    # Add metadataPrefix to the request arguments.
    kwargs['metadataPrefix'] = metadataPrefix
    client = Client(baseUrl, self._mdRegistry)
    # Check that baseUrl actually represents an OAI-PMH target
    try:
        client.identify()
    except IndexError:
        raise NotOAIPMHBaseURLException(
            "{0} does not appear to be an OAI-PMH compatible base URL"
            "".format(baseUrl)
        )
    # Check server timestamp granularity support
    client.updateGranularity()
    for header, metadata, about in client.listRecords(**kwargs):
        # Fix pyoai returning the repr of a bytes object as a str on py3k.
        # repr() uses b"..." instead of b'...' when the payload contains a
        # single quote, so both prefixes are recognised.
        if isinstance(metadata, str) and metadata.startswith(("b'", 'b"')):
            metadata = ast.literal_eval(metadata).decode("utf-8")
        yield (header, metadata, about)