本文整理汇总了 Python 中 oaipmh.client.Client.updateGranularity 方法的典型用法代码示例。如果您正苦于以下问题：Python Client.updateGranularity 方法的具体用法？Python Client.updateGranularity 怎么用？Python Client.updateGranularity 使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 oaipmh.client.Client 的用法示例。
在下文中一共展示了Client.updateGranularity方法的8个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: insertAll
# 需要导入模块: from oaipmh.client import Client [as 别名]
# 或者: from oaipmh.client.Client import updateGranularity [as 别名]
def insertAll(time, time2):
    """Harvest arXivRaw records between two datestamps and insert each one.

    A pyoai ``Client`` is built for the module-level ``URL`` with
    ``arXivRaw_reader`` registered under the ``'arXivRaw'`` prefix. Every
    record returned by ``listRecords`` is turned into a title, category list,
    abstract, URL, creation date and author string, echoed to stdout, and
    handed to ``insert``.

    Parameters:
        time:  passed to ``listRecords`` as ``from_`` (lower bound).
               NOTE: this name shadows the ``time`` module inside the function.
        time2: passed to ``listRecords`` as ``until`` (upper bound).

    Records that fail to convert or insert are reported and counted; the
    harvest continues with the next record. The total error count is printed
    at the end. Nothing is returned.
    """
    registry = MetadataRegistry()
    registry.registerReader('arXivRaw', arXivRaw_reader)
    client = Client(URL, registry)
    # Align the datestamp precision with what the repository advertises.
    client.updateGranularity()
    records = client.listRecords(metadataPrefix='arXivRaw', from_=time, until=time2)
    errors = 0
    for a in records:
        try:
            # a[1] is the metadata part of the record; its fields are
            # sequences, hence the joins and [0] look-ups below.
            title = '\n'.join(a[1]['title'])
            sr2 = str(' '.join(a[1]['categories']).replace('-', '_')).split(' ')
            abstract = '\n'.join(a[1]['abstract'])
            url = 'http://arxiv.org/abs/' + a[1]['id'][0]
            date = datetime.strptime(a[1]['created'][0], '%a, %d %b %Y %H:%M:%S %Z')
            authors = a[1]['authors'][0]
            abstract = abstract + '\nBy: ' + authors + '\nIn: ' + ', '.join(sr2)
            print(title)
            print(sr2)
            print(abstract)
            print(url)
            print(date)
            print(authors)
            insert(title + ' (' + authors + ')', str("fullarxiv"), url, abstract, date=date, cross_srs=sr2)
        except Exception:
            # Deliberately best-effort per record, but only for ordinary
            # errors: KeyboardInterrupt / SystemExit must still stop the run.
            print('ERROR')
            print(a)
            errors = errors + 1
    print('Completed with %s errors' % errors)
示例2: arxiv_oai_scraper
# 需要导入模块: from oaipmh.client import Client [as 别名]
# 或者: from oaipmh.client.Client import updateGranularity [as 别名]
def arxiv_oai_scraper(subject, start, end, sleep_time=0):
    """Collect Dublin Core metadata for one arXiv set over OAI-PMH.

    Parameters:
        subject:    set name; formatted with ``"{}".format`` and sent as ``set``.
        start:      passed to ``listRecords`` as ``from_``.
        end:        passed to ``listRecords`` as ``until``.
        sleep_time: value handed to ``time.sleep`` for each record iterated.

    Returns a list with one dict per record that carries metadata, holding the
    keys ``title``, ``abstract``, ``date``, ``subject``, ``url`` and
    ``authors`` (taken from the ``title``, ``description``, ``date``,
    ``subject``, ``identifier`` and ``creator`` fields respectively).
    """
    endpoint = "http://export.arxiv.org/oai2"

    readers = MetadataRegistry()
    readers.registerReader('oai_dc', oai_dc_reader)

    harvester = Client(endpoint, readers)
    # Match the datestamp precision the repository supports before querying.
    harvester.updateGranularity()

    harvested = []
    for _, md, _ in harvester.listRecords(metadataPrefix='oai_dc',
                                          set="{}".format(subject),
                                          from_=start,
                                          until=end):
        # Metadata can be missing; the original author saw this for a 2010
        # record that had no title.
        if md is not None:
            harvested.append({
                "title": md["title"],
                "abstract": md["description"],
                "date": md["date"],
                "subject": md["subject"],
                "url": md["identifier"],
                "authors": md['creator'],
            })
        # NOTE(review): the flattened source does not show whether the sleep
        # sat inside the `if`; it is kept at loop level here - confirm.
        time.sleep(sleep_time)
    return harvested
示例3: _listRecords
# 需要导入模块: from oaipmh.client import Client [as 别名]
# 或者: from oaipmh.client.Client import updateGranularity [as 别名]
def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
    """Generate the records served by ``baseUrl`` in the given metadata format.

    Parameters:
        baseUrl:        OAI-PMH endpoint handed to ``Client``.
        metadataPrefix: metadata format to request (default ``"oai_dc"``).
        **kwargs:       any further arguments forwarded to ``listRecords``.

    Yields each item produced by ``Client.listRecords`` unchanged.
    """
    # Merge the metadata prefix into the arguments sent with the request.
    request_args = dict(kwargs, metadataPrefix=metadataPrefix)
    harvester = Client(baseUrl, metadata_registry)
    # Ask the server which timestamp granularity it supports.
    harvester.updateGranularity()
    for item in harvester.listRecords(**request_args):
        yield item
示例4: _listRecords
# 需要导入模块: from oaipmh.client import Client [as 别名]
# 或者: from oaipmh.client.Client import updateGranularity [as 别名]
def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
    """Generate ``(header, metadata, about)`` tuples harvested from ``baseUrl``.

    Parameters:
        baseUrl:        OAI-PMH endpoint handed to ``Client``.
        metadataPrefix: metadata format to request (default ``"oai_dc"``).
        **kwargs:       any further arguments forwarded to ``listRecords``.

    Raises:
        NotOAIPMHBaseURLException: when ``identify()`` fails with IndexError,
            which is taken to mean the URL is not an OAI-PMH endpoint.
    """
    # Merge the metadata prefix into the arguments sent with the request.
    request_args = dict(kwargs, metadataPrefix=metadataPrefix)
    harvester = Client(baseUrl, self._mdRegistry)

    # Probe the endpoint first so a non-OAI-PMH URL fails with a clear error.
    try:
        harvester.identify()
    except IndexError:
        message = "{0} does not appear to be an OAI-PMH compatible base URL".format(baseUrl)
        raise NotOAIPMHBaseURLException(message)

    # Ask the server which timestamp granularity it supports.
    harvester.updateGranularity()

    for header, metadata, about in harvester.listRecords(**request_args):
        # Under Python 3 pyoai may hand back the repr of a bytes object
        # ("b'...'"); turn it back into real text.
        looks_like_bytes_repr = isinstance(metadata, str) and metadata.startswith("b'")
        if looks_like_bytes_repr:
            metadata = ast.literal_eval(metadata).decode("utf-8")
        yield (header, metadata, about)
示例5: now
# 需要导入模块: from oaipmh.client import Client [as 别名]
# 或者: from oaipmh.client.Client import updateGranularity [as 别名]
def now():
    """Return the current local time as a ``datetime.ctime()`` string."""
    current = datetime.now()
    return current.ctime()
# Announce start-up on stderr.
sys.stderr.write("beginning @ %s\n" % now())

# Build the OAI-PMH client for the CiteSeerX endpoint with the Dublin Core reader.
URL = "http://citeseerx.ist.psu.edu/oai2"
registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = Client(URL, registry)
# Align the datestamp precision with what the repository supports.
client.updateGranularity()

store = Store()

# Starting point, in priority order: a YYYY-MM-DD command-line argument
# (e.g. 2011-10-27), else whatever the store reports as last, else the
# repository's earliest datestamp.
if sys.argv[1:]:
    start = datetime.strptime(sys.argv[1], '%Y-%m-%d')
elif store.last():
    start = store.last()
else:
    start = client.identify().earliestDatestamp()

# One-day chunk size; the original author's note says this should be fine
# provided the server handles resumption tokens correctly.
chunk = oneday = timedelta(days=1)
示例6: run
# 需要导入模块: from oaipmh.client import Client [as 别名]
# 或者: from oaipmh.client.Client import updateGranularity [as 别名]
def run(self):
    """Synchronise records from the configured OAI-PMH repository.

    Steps:
      1. ``check_index()``; if ``options['rebuild']`` is set, ``rebuild_index()``.
      2. Connect to ``settings['uri']`` with the configured metadata reader
         and align the client's datestamp granularity with the server's.
      3. If ``options['identifier']`` is given, synchronise that one record.
         Otherwise walk the date range in windows of ``settings['delta_days']``
         days, calling ``synchronise_period`` for each and recording the
         window with ``put_synchronisation_config``.
      4. Print timing statistics.

    The date range starts at ``options['from_date']`` if given, else the day
    after the ``to_date`` of the stored synchronisation config, else
    ``settings['default_from_date']``. It ends at ``options['to_date']`` if
    given, else yesterday.

    Returns:
        The total number of records synchronised.
    """
    # Check that ElasticSearch is alive
    self.check_index()

    # If the user specified the --REBUILD flag, recreate the index
    if self.options['rebuild']:
        self.rebuild_index()

    # Connect to the repository
    registry = MetadataRegistry()
    registry.registerReader(self.settings["metadata_format"], self.settings["metadata_reader"])
    client = Client(self.settings["uri"], registry)
    identity = client.identify()
    print("Connected to repository: %s" % identity.repositoryName())

    # The granularity must be updated, otherwise a day-granularity server fails with:
    # oaipmh.error.BadArgumentError: Max granularity is YYYY-MM-DD:2003-04-10T00:00:00Z
    client.updateGranularity()

    # Initialise some variables
    batcher = Batch.Batch()
    total_records = 0
    start = time.time()

    if self.options['identifier'] is not None:
        # The user asked for one specific record
        total_records += self.synchronise_record(client, batcher, self.options['identifier'])
    else:
        # Synchronise using the date range provided by the user, or failing
        # that, the date range based on the last sync
        synchronisation_config = self.get_synchronisation_config()

        if self.options["from_date"] is not None:
            # already a date (not a datetime)
            from_date = self.options["from_date"]
        elif synchronisation_config is not None and "to_date" in synchronisation_config:
            # Resume the day after the last synchronised to_date
            from_date = dateutil.parser.parse(synchronisation_config["to_date"]).date() + timedelta(days=1)
        else:
            from_date = dateutil.parser.parse(self.settings['default_from_date']).date()

        if self.options["to_date"] is not None:
            # already a date (not a datetime)
            to_date = self.options["to_date"]
        else:
            to_date = (date.today() - timedelta(days=1))

        start_of_day = _time(hour=0, minute=0, second=0, microsecond=0)
        end_of_day = _time(hour=23, minute=59, second=59, microsecond=0)

        # Force the from_date to 00:00:00 and the to_date to 23:59:59
        from_date = datetime.combine(from_date, start_of_day)
        to_date = datetime.combine(to_date, end_of_day)

        print("Synchronising from %s - %s" % (from_date, to_date))

        delta_days = self.settings['delta_days']
        while from_date < to_date:
            next_date = datetime.combine(from_date.date() + timedelta(days=(delta_days - 1)), end_of_day)
            # Never harvest (or record as synchronised) past the requested
            # end: an overshooting final window would otherwise be stored as
            # the last-synced date and later runs would skip that span.
            if next_date > to_date:
                next_date = to_date

            number_of_records = self.synchronise_period(client, batcher, from_date, next_date)
            batcher.clear()  # Store the records in elasticsearch (per the original author's note)
            self.put_synchronisation_config(from_date, next_date, number_of_records)
            from_date += timedelta(days=delta_days)
            total_records += number_of_records

            # Pause so as not to get banned.
            pause_seconds = 20
            print("Sleeping for %i seconds so as not to get banned." % pause_seconds)
            time.sleep(pause_seconds)

    # Store the records in the index
    batcher.clear()

    # Print out some statistics
    time_spent = time.time() - start
    print('Total time spent: %d seconds' % (time_spent))

    if time_spent > 0.001:  # guard the division: time_spent is a float and may be ~0
        print('Total records synchronised: %i records (%d records/second)' % (total_records, (total_records / time_spent)))
    else:
        print('Total records synchronised: %i records' % (total_records))
    return total_records
sys.exit()
示例7: oaiSpider
# 需要导入模块: from oaipmh.client import Client [as 别名]
# 或者: from oaipmh.client.Client import updateGranularity [as 别名]
def oaiSpider(subject="hep-ex", section="physics", start=None, end=None, sleep_time=0):
    '''
    Pull articles using the Open Archives Initiative protocol

    subject    - String defining the subset of the main section
    section    - String defining the main section (typically physics or nothing);
                 None is treated as the empty string
    start      - A datetime.datetime object restricting the starting date of returned articles
    end        - A datetime.datetime object restricting the ending date of the returned articles
    sleep_time - A number specifying how many seconds to wait after each record
                 (the value is passed straight to time.sleep)

    Examples

       oaiSpider("hep-ex", "physics")
       ==> returns all HEP experiment articles

       oaiSpider("cs", "", datetime(2011, 6, 24))
       ==> returns all computer science articles submitted after June 24th, 2011

       oaiSpider("hep-ph", "physics", None, datetime(2011, 6, 24))
       ==> returns all HEP phenomenology articles submitted before June 24th, 2011

    Returns a list of dictionaries containing the article metadata
    (keys: title, abstract, date, subject, url, authors). Records that carry
    no metadata are skipped.
    '''
    from oaipmh.client import Client
    from oaipmh.metadata import MetadataRegistry, oai_dc_reader

    base_url = "http://export.arxiv.org/oai2"
    output = []

    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(base_url, registry)
    # Align the datestamp precision with what the repository supports.
    client.updateGranularity()

    # The set name is "<section>:<subject>", or just "<subject>" when no
    # section is given.
    if section is None:
        section = ""
    if len(section) > 0 and section[-1] != ":":
        section += ":"

    # NOTE: the original author reported errors from the OAI-PMH client when
    # the from_/until arguments were used.
    records = client.listRecords(metadataPrefix='oai_dc',
                                 set='%s%s' % (section, subject),
                                 from_=start,
                                 until=end)

    counter = 0
    for (header, metadata, aux) in records:
        # A record without metadata cannot be indexed below; skip it instead
        # of failing with a TypeError.
        if metadata is not None:
            print(counter)
            output.append({"title": cleanText(metadata["title"][0]),
                           "abstract": cleanText(metadata["description"][0]),
                           "date": convertDate(max(metadata["date"])),
                           "subject": subject,
                           "url": metadata["identifier"][0],
                           "authors": "; ".join(metadata['creator']),
                           })
            print(output[-1])
            counter += 1
        time.sleep(sleep_time)
    return output
示例8: OpenBeeldenDataLoader
# 需要导入模块: from oaipmh.client import Client [as 别名]
# 或者: from oaipmh.client.Client import updateGranularity [as 别名]
#.........这里部分代码省略.........
'subject': ('textList', 'oai_oi:oi/oi:subject/text()'),
'description': ('textList', 'oai_oi:oi/oi:description/text()'),
'abstract': ('textList', 'oai_oi:oi/oi:abstract/text()'),
'publisher': ('textList', 'oai_oi:oi/oi:publisher/text()'),
'contributor': ('textList', 'oai_oi:oi/oi:contributor/text()'),
'date': ('textList', 'oai_oi:oi/oi:date/text()'),
'type': ('textList', 'oai_oi:oi/oi:type/text()'),
'extent': ('textList', 'oai_oi:oi/oi:extent/text()'),
'medium': ('textList', 'oai_oi:oi/oi:medium/text()'),
'identifier': ('textList', 'oai_oi:oi/oi:identifier/text()'),
'source': ('textList', 'oai_oi:oi/oi:source/text()'),
'language': ('textList', 'oai_oi:oi/oi:language/text()'),
'references': ('textList', 'oai_oi:oi/oi:references/text()'),
'spatial': ('textList', 'oai_oi:oi/oi:spatial/text()'),
'attributionName': ('textList', 'oai_oi:oi/oi:attributionName/text()'),
'attributionURL': ('textList', 'oai_oi:oi/oi:attributionURL/text()'),
'license': ('textList', 'oai_oi:oi/oi:license/text()')
},
namespaces={
'oai_oi': 'http://www.openbeelden.nl/feeds/oai/', #'http://www.openarchives.org/OAI/2.0/oai_oi/',
'oi': 'http://www.openbeelden.nl/oai/'
}
)
URL = 'http://www.openbeelden.nl/feeds/oai/'
#Initialize the OAI client
self.registry = MetadataRegistry()
self.registry.registerReader('oai_oi', oai_oi_reader)
self.client = Client(URL, self.registry)
#Test if the connection to the OAI-PMH provider works
x = self.client.updateGranularity()
x = self.client.identify()
print 'identity %s' % x.repositoryName()
print 'identity %s' % x.protocolVersion()
print 'identity %s' % x.baseURL()
"""
for s in client.listSets():
print s
"""
#initialize the OpenSKOSHandler
self.openSKOSHandler = OpenSKOSHandler()
def reindex(self, provider = None):
setupOAIPMHConnection()
i = 0
extent = None
item = None
identifier = None
for rec in self.client.listRecords(metadataPrefix=u'oai_oi', set=u'beeldengeluid'):#stichting_natuurbeelden, beeldengeluid
header, metadata, about = rec
extent = metadata.getField('extent')[0]
item = {
'id' : header.identifier(),
'identifier' : self.getFieldData(metadata, 'identifier'),
'title' : self.getFieldData(metadata, 'title'),
'alternative' : self.getFieldData(metadata, 'alternative'),
'creator' : self.getFieldData(metadata, 'creator'),
'subject' : self.getFieldData(metadata, 'subject'),
'description' : self.getFieldData(metadata, 'description'),
'abstract' : self.getFieldData(metadata, 'abstract'),