This article collects typical usage examples of the Dataset.add method from datalad.api in Python. If you are wondering how to use Dataset.add, what it does, or what calling it looks like in practice, the curated code examples below may help. You can also explore further usage examples of the containing class, datalad.api.Dataset.
A total of 11 code examples of the Dataset.add method are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
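Before the collected test code, here is a minimal sketch of the basic Dataset.add workflow: create a dataset, place a file in its working tree, and record it with add. The dataset path, file name, and commit message are hypothetical, and the snippet assumes an older DataLad release (0.x API) in which Dataset.add is still available; in current releases the equivalent operation is Dataset.save.

# Minimal sketch (assumptions: hypothetical paths and message; older DataLad
# API where Dataset.add exists, newer releases use Dataset.save instead).
import os.path as op
from datalad.api import Dataset

ds = Dataset('/tmp/demo_ds').create()        # initialize a fresh dataset
with open(op.join(ds.path, 'notes.txt'), 'w') as f:
    f.write('some content')                  # put a new file into the dataset tree
ds.add('notes.txt', message="add notes")     # record the file (in annex or git)

The examples that follow come from DataLad's own test suite and benchmarks and show add in combination with create, addurls, metadata aggregation, and subdataset handling.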
Example 1: test_audio
# Required import: from datalad.api import Dataset [as alias]
# Or: from datalad.api.Dataset import add [as alias]
def test_audio(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'audio.mp3'),
        path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('audio.mp3')
    assert_result_count(res, 1)
    # from this extractor
    meta = res[0]['metadata']['audio']
    for k, v in target.items():
        eq_(meta[k], v)
    assert_in('@context', meta)

    uniques = ds.metadata(
        reporton='datasets', return_type='item-or-list')['metadata']['datalad_unique_content_properties']
    # the test file has it, but uniques have it blanked out, because the
    # extractor considers it worthless for discovering whole datasets
    assert_in('bitrate', meta)
    eq_(uniques['audio']['bitrate'], None)

    # the 'date' field carries no value, hence it gets excluded from the unique report
    assert_in('date', meta)
    assert(not meta['date'])
    assert_not_in('date', uniques['audio'])

Example 2: test_ignore_nondatasets
# Required import: from datalad.api import Dataset [as alias]
# Or: from datalad.api.Dataset import add [as alias]
def test_ignore_nondatasets(path):
    # we want to ignore the version/commits for this test
    def _kill_time(meta):
        for m in meta:
            for k in ('version', 'shasum'):
                if k in m:
                    del m[k]
        return meta

    ds = Dataset(path).create()
    meta = _kill_time(ds.metadata(reporton='datasets', on_failure='ignore'))
    n_subm = 0
    # placing another repo in the dataset has no effect on metadata
    for cls, subpath in ((GitRepo, 'subm'), (AnnexRepo, 'annex_subm')):
        subm_path = opj(ds.path, subpath)
        r = cls(subm_path, create=True)
        with open(opj(subm_path, 'test'), 'w') as f:
            f.write('test')
        r.add('test')
        r.commit('some')
        assert_true(Dataset(subm_path).is_installed())
        assert_equal(meta, _kill_time(ds.metadata(reporton='datasets', on_failure='ignore')))
        # making it a submodule has no effect either
        ds.add(subpath)
        assert_equal(len(ds.subdatasets()), n_subm + 1)
        assert_equal(meta, _kill_time(ds.metadata(reporton='datasets', on_failure='ignore')))
        n_subm += 1

Example 3: test_addurls_subdataset
# Required import: from datalad.api import Dataset [as alias]
# Or: from datalad.api.Dataset import add [as alias]
def test_addurls_subdataset(self, path):
    ds = Dataset(path).create(force=True)

    with chpwd(path):
        for save in True, False:
            label = "save" if save else "nosave"
            hexsha_before = ds.repo.get_hexsha()
            ds.addurls(self.json_file, "{url}",
                       "{subdir}-" + label + "//{name}",
                       save=save)
            hexsha_after = ds.repo.get_hexsha()

            for fname in ["foo-{}/a", "bar-{}/b", "foo-{}/c"]:
                ok_exists(fname.format(label))

            assert_true(save ^ (hexsha_before == hexsha_after))
            assert_true(save ^ ds.repo.dirty)

        # Now save the "--nosave" changes and check that we have
        # all the subdatasets.
        ds.add(".")
        eq_(set(subdatasets(ds, recursive=True,
                            result_xfm="relpaths")),
            {"foo-save", "bar-save", "foo-nosave", "bar-nosave"})

        # We don't try to recreate existing subdatasets.
        with swallow_logs(new_level=logging.DEBUG) as cml:
            ds.addurls(self.json_file, "{url}", "{subdir}-nosave//{name}")
            assert_in("Not creating subdataset at existing path", cml.out)

Example 4: test_addurls_dry_run
# Required import: from datalad.api import Dataset [as alias]
# Or: from datalad.api.Dataset import add [as alias]
def test_addurls_dry_run(path):
    ds = Dataset(path).create(force=True)

    with chpwd(path):
        json_file = "links.json"
        with open(json_file, "w") as jfh:
            json.dump([{"url": "URL/a.dat", "name": "a", "subdir": "foo"},
                       {"url": "URL/b.dat", "name": "b", "subdir": "bar"},
                       {"url": "URL/c.dat", "name": "c", "subdir": "foo"}],
                      jfh)

        ds.add(".", message="setup")

        with swallow_logs(new_level=logging.INFO) as cml:
            ds.addurls(json_file,
                       "{url}",
                       "{subdir}//{_url_filename_root}",
                       dry_run=True)

            for dir_ in ["foo", "bar"]:
                assert_in("Would create a subdataset at {}".format(dir_),
                          cml.out)
            assert_in(
                "Would download URL/a.dat to {}".format(
                    os.path.join(path, "foo", "BASE")),
                cml.out)
            assert_in("Metadata: {}".format([u"name=a", u"subdir=foo"]),
                      cml.out)

Example 5: test_addurls
# Required import: from datalad.api import Dataset [as alias]
# Or: from datalad.api.Dataset import add [as alias]
def test_addurls(self, path):
    ds = Dataset(path).create(force=True)

    def get_annex_commit_counts():
        return int(
            ds.repo.repo.git.rev_list("--count", "git-annex").strip())

    n_annex_commits = get_annex_commit_counts()

    with chpwd(path):
        ds.addurls(self.json_file, "{url}", "{name}")

        filenames = ["a", "b", "c"]
        for fname in filenames:
            ok_exists(fname)

        for (fname, meta), subdir in zip(ds.repo.get_metadata(filenames),
                                         ["foo", "bar", "foo"]):
            assert_dict_equal(meta,
                              {"subdir": [subdir], "name": [fname]})

        # Ignore this check if we're faking dates because that disables
        # batch mode.
        if not os.environ.get('DATALAD_FAKE__DATES'):
            # We should have two new commits on the git-annex branch: one for
            # the added urls and one for the added metadata.
            eq_(n_annex_commits + 2, get_annex_commit_counts())

        # Add to already existing links, overwriting.
        with swallow_logs(new_level=logging.DEBUG) as cml:
            ds.addurls(self.json_file, "{url}", "{name}",
                       ifexists="overwrite")
            for fname in filenames:
                assert_in("Removing {}".format(os.path.join(path, fname)),
                          cml.out)

        # Add to already existing links, skipping.
        assert_in_results(
            ds.addurls(self.json_file, "{url}", "{name}", ifexists="skip"),
            action="addurls",
            status="notneeded")

        # Adding to already existing links works, as long as the content is
        # the same.
        ds.addurls(self.json_file, "{url}", "{name}")

        # But it fails if something has changed.
        ds.unlock("a")
        with open("a", "w") as ofh:
            ofh.write("changed")
        ds.add("a")

        assert_raises(IncompleteResultsError,
                      ds.addurls,
                      self.json_file, "{url}", "{name}")

Example 6: test_bf2458
# Required import: from datalad.api import Dataset [as alias]
# Or: from datalad.api.Dataset import add [as alias]
def test_bf2458(src, dst):
    ds = Dataset(src).create(force=True)
    ds.add('.', to_git=False)

    # no-content (empty) clone into new dst
    clone = install(source=ds.path, path=dst)
    # XXX whereis says nothing in direct mode
    # content is not here
    eq_(clone.repo.whereis('dummy'), [ds.config.get('annex.uuid')])
    # check that plain metadata access does not `get` stuff
    clone.metadata('.', on_failure='ignore')
    # XXX whereis says nothing in direct mode
    eq_(clone.repo.whereis('dummy'), [ds.config.get('annex.uuid')])

Example 7: check_api
# Required import: from datalad.api import Dataset [as alias]
# Or: from datalad.api.Dataset import add [as alias]
def check_api(no_annex, path):
    ds = Dataset(path).create(force=True, no_annex=no_annex)
    ds.add('.')
    ok_clean_git(ds.path)

    processed_extractors, skipped_extractors = [], []
    for extractor_ep in iter_entry_points('datalad.metadata.extractors'):
        # we need to be able to query for metadata, even if there is none
        # from any extractor
        try:
            extractor_cls = extractor_ep.load()
        except Exception as exc:
            exc_ = str(exc)
            skipped_extractors += [exc_]
            continue
        extractor = extractor_cls(
            ds, paths=['file.dat'])
        meta = extractor.get_metadata(
            dataset=True,
            content=True)
        # we also get something for the dataset and something for the content,
        # even if either of the two is empty
        assert_equal(len(meta), 2)
        dsmeta, contentmeta = meta

        assert isinstance(dsmeta, dict)
        assert hasattr(contentmeta, '__len__') or isgenerator(contentmeta)
        # verify that the generator does not blow up and has an entry for our
        # precious file
        cm = dict(contentmeta)
        # datalad_core does provide some (not really) information about our
        # precious file
        if extractor_ep.name == 'datalad_core':
            assert 'file.dat' in cm
        elif extractor_ep.name == 'annex':
            if not no_annex:
                # verify correct key, which is the same for all files of 0 size
                assert_equal(
                    cm['file.dat']['key'],
                    'MD5E-s0--d41d8cd98f00b204e9800998ecf8427e.dat'
                )
            else:
                # no metadata on that file
                assert not cm
        processed_extractors.append(extractor_ep.name)
    assert "datalad_core" in processed_extractors, \
        "Should have managed to find at least the core extractor"
    if skipped_extractors:
        raise SkipTest(
            "Not fully tested/succeeded since some extractors failed"
            " to load:\n%s" % ("\n".join(skipped_extractors)))

Example 8: test_exif
# Required import: from datalad.api import Dataset [as alias]
# Or: from datalad.api.Dataset import add [as alias]
def test_exif(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'exif', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'exif.jpg'),
        path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('exif.jpg')
    assert_result_count(res, 1)
    # from this extractor
    meta = res[0]['metadata']['exif']
    for k, v in target.items():
        eq_(meta[k], v)
    assert_in('@context', meta)

Example 9: test_aggregation
# Required import: from datalad.api import Dataset [as alias]
# Or: from datalad.api.Dataset import add [as alias]
def test_aggregation(path):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    # before anything is aggregated we would get nothing and only a log warning
    with swallow_logs(new_level=logging.WARNING) as cml:
        assert_equal(list(query_aggregated_metadata('all', ds, [])), [])
        assert_re_in('.*Found no aggregated metadata.*update', cml.out)
    ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                  where='dataset')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                     where='dataset')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                        where='dataset')
    ds.add('.', recursive=True)
    ok_clean_git(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.aggregate_metadata(recursive=True, update_mode='all')
    # we get success reports for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 6)
    assert_result_count(res, 3, status='ok', action='aggregate_metadata')
    assert_result_count(res, 3, status='ok', action='save')
    # nice and tidy
    ok_clean_git(ds.path)

    # quick test of the aggregate report
    aggs = ds.metadata(get_aggregates=True)
    # one for each dataset
    assert_result_count(aggs, 3)
    # the mother dataset also reports the layout version
    assert_result_count(aggs, 1, path=ds.path, layout_version=1)

    # store clean direct result
    origres = ds.metadata(recursive=True)
    # basic sanity check
    assert_result_count(origres, 6)
    assert_result_count(origres, 3, type='dataset')
    assert_result_count(origres, 3, type='file')  # now that we have annex.key
    # three different IDs
    assert_equal(3, len(set([s['dsid'] for s in origres if s['type'] == 'dataset'])))
    # and we know about all three datasets
    for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['frictionless_datapackage']['name'] ==
                 assure_unicode(name) for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(
        opj(path, 'clone'), source=ds.path,
        result_xfm='datasets', return_type='item-or-list')
    # the ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.metadata()
    # basic sanity check
    assert_result_count(cloneres, 2)
    assert_result_count(cloneres, 1, type='dataset')
    assert_result_count(cloneres, 1, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the exact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone; it should make no difference
    assert_status('ok', clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # test search in the search tests, not all over the place
    # query smoke test
    assert_result_count(clone.search('mother', mode='egrep'), 1)
    assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1)

    child_res = clone.search('child', mode='egrep')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['type'] == 'dataset':
            assert_in(
                r['query_matched']['frictionless_datapackage.name'],
                r['metadata']['frictionless_datapackage']['name'])

Example 10: supers
# Required import: from datalad.api import Dataset [as alias]
# Or: from datalad.api.Dataset import add [as alias]
class supers(SuprocBenchmarks):
    """
    Benchmarks of common operations on collections of datasets using the datalad API
    """

    timeout = 3600
    # need to assure that we are working in a different repository now
    # see https://github.com/datalad/datalad/issues/1512
    # might not be sufficient due to side effects between tests and
    # thus getting into the same situation
    ds_count = 0

    def setup_cache(self):
        # creating in CWD so things get removed when ASV is done
        ds_path = create_test_dataset("testds1", spec='2/-2/-2', seed=0)[0]
        # Will store into a tarfile since otherwise install -r is way too slow
        # to be invoked for every benchmark
        tarfile_path = opj(osp.dirname(ds_path), 'testds1.tar')
        with tarfile.open(tarfile_path, "w") as tar:
            # F.CK -- Python tarfile can't later extract those because key dirs are
            # read-only. For now just a workaround - make it all writeable
            from datalad.utils import rotree
            rotree('testds1', ro=False, chmod_files=False)
            tar.add('testds1', recursive=True)
        rmtree('testds1')

        return tarfile_path

    def setup(self, tarfile_path):
        import tarfile
        tempdir = osp.dirname(tarfile_path)
        with tarfile.open(tarfile_path) as tar:
            tar.extractall(tempdir)

        # TODO -- remove this abomination after
        # https://github.com/datalad/datalad/issues/1512 is fixed
        epath = opj(tempdir, 'testds1')
        epath_unique = epath + str(self.__class__.ds_count)
        os.rename(epath, epath_unique)
        self.__class__.ds_count += 1
        self.ds = Dataset(epath_unique)
        print("Finished setup for %s" % tempdir)

    def teardown(self, tarfile_path):
        for path in [self.ds.path + '_', self.ds.path]:
            print("Cleaning up %s" % path)
            if osp.exists(path):
                rmtree(path)

    def time_installr(self, tarfile_path):
        # somewhat duplicates setup, but too lazy to do a different one for now
        assert install(self.ds.path + '_', source=self.ds.path, recursive=True)

    def time_createadd(self, tarfile_path):
        assert self.ds.create('newsubds')

    def time_createadd_to_dataset(self, tarfile_path):
        subds = create(opj(self.ds.path, 'newsubds'))
        self.ds.add(subds.path)

    def time_ls(self, tarfile_path):
        ls(self.ds.path)

    def time_ls_recursive(self, tarfile_path):
        ls(self.ds.path, recursive=True)

    def time_ls_recursive_long_all(self, tarfile_path):
        ls(self.ds.path, recursive=True, long_=True, all_=True)

    # TODO: since it doesn't really allow uninstalling the top-level ds... bleh ;)
    #def time_uninstall(self, tarfile_path):
    #    uninstall(self.ds.path, recursive=True)

    def time_remove(self, tarfile_path):
        remove(self.ds.path, recursive=True)

Example 11: test_within_ds_file_search
# Required import: from datalad.api import Dataset [as alias]
# Or: from datalad.api.Dataset import add [as alias]
def test_within_ds_file_search(path):
    try:
        import mutagen
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    # override the default and search for datasets and files for this test
    for m in ('egrep', 'textblob', 'autofield'):
        ds.config.add(
            'datalad.search.index-{}-documenttype'.format(m), 'all',
            where='dataset')
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (
            ('audio.mp3', opj('stim', 'stim1.mp3')),):
        copy(
            opj(dirname(dirname(__file__)), 'tests', 'data', src),
            opj(path, dst))
    ds.add('.')
    # yoh: CANNOT FIGURE IT OUT since in direct mode it gets added to git
    # directly, BUT
    # - the output reports a key, so it seems to be added to the annex!
    # - when I do it manually on the command line, it goes to the annex
    ok_file_under_git(path, opj('stim', 'stim1.mp3'), annexed=True)

    # If it is not under annex, the metadata addition below silently does
    # nothing
    list(ds.repo.set_metadata(
        opj('stim', 'stim1.mp3'), init={'importance': 'very'}))
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio',):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')

        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
annex.importance
annex.key
audio.bitrate
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
datalad_core.id
datalad_core.refcommit
id
parentds
path
type
"""

    # check the generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched in (
            ('egrep',
             ':mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above; the leading : is stripped, it indicates "ALL FIELDS"
            ('egrep',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above, but with an AND condition
            # get both matches
            ('egrep',
             ['mp3', 'type:file'],
             opj('stim', 'stim1.mp3'),
             {'type': 'file', 'audio.format': 'mp3'}),
            # case insensitive search
            ('egrep',
             'mp3',
#.........part of the code is omitted here.........