This page collects typical usage examples of the Python method aleph.model.Collection.by_foreign_id. If you are unsure what Collection.by_foreign_id does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples of the containing class, aleph.model.Collection.
The following shows 15 code examples of the Collection.by_foreign_id method, sorted by popularity by default. You can upvote the examples you like or find useful; that feedback helps surface better Python code samples.
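Before the examples, here is a minimal, hypothetical sketch of the two calling patterns that recur below: a plain lookup by foreign_id, which returns None when no matching collection exists (as in Examples 6, 7, 9 and 15), and a call that also passes a data dict, which the crawler examples use to obtain a collection that is created or updated as needed (as in Examples 1-5, 11 and 12). The foreign_id 'demo', the label, the helper name demo_lookup, and the aleph.core import path for the shared db session are assumptions for illustration only; that the two-argument form creates the record is inferred from how the crawlers below use it.

# A minimal sketch, assuming a configured aleph application context.
from aleph.core import db            # assumed import path for the shared SQLAlchemy session
from aleph.model import Collection

def demo_lookup():
    # Plain lookup: returns None when no collection has this foreign_id.
    collection = Collection.by_foreign_id('demo')    # 'demo' is a made-up foreign_id
    if collection is None:
        # Passing a data dict appears to create (or update) the collection,
        # which is how the crawler examples below obtain their collections.
        collection = Collection.by_foreign_id('demo', {'label': 'Demo collection'})
        db.session.commit()
    return collection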
Example 1: find_collection
# Required import: from aleph.model import Collection [as alias]
# Or: from aleph.model.Collection import by_foreign_id [as alias]
def find_collection(self, foreign_id, data):
    collection = Collection.by_foreign_id(foreign_id, data)
    if not hasattr(self, 'entity_cache'):
        self.entity_cache = {}
    self.entity_cache[collection.id] = []
    db.session.flush()
    return collection
Example 2: load_collection
# Required import: from aleph.model import Collection [as alias]
# Or: from aleph.model.Collection import by_foreign_id [as alias]
def load_collection(self):
    if not hasattr(self, '_collection'):
        self._collection = Collection.by_foreign_id('polyglot:ner', {
            'label': 'Automatically Extracted Persons and Companies',
            'public': True
        })
    return self._collection
Example 3: crawl
# Required import: from aleph.model import Collection [as alias]
# Or: from aleph.model.Collection import by_foreign_id [as alias]
def crawl(self):
    url = urljoin(self.host, '/ticket/all_closed/?format=json')
    collection = Collection.by_foreign_id(url, {
        'label': 'Investigative Dashboard Requests'
    })
    Permission.grant_foreign(collection, 'idashboard:occrp_staff',
                             True, False)
    existing_entities = []
    terms = set()
    db.session.flush()
    for endpoint in ['all_closed', 'all_open']:
        url = urljoin(self.host, '/ticket/%s/?format=json' % endpoint)
        data = self.session.get(url).json()
        for req in data.get('paginator', {}).get('object_list'):
            category = REQUEST_TYPES.get(req.get('ticket_type'))
            if category is None:
                continue
            ent = Entity.by_foreign_id(str(req.get('id')), collection, {
                'name': req.get('name'),
                'category': category,
                'data': req,
                'selectors': [req.get('name')]
            })
            terms.update(ent.terms)
            existing_entities.append(ent.id)
            log.info(" # %s (%s)", ent.name, ent.category)
    for entity in collection.entities:
        if entity.id not in existing_entities:
            entity.delete()
    self.emit_collection(collection, terms)
Example 4: crawl
# Required import: from aleph.model import Collection [as alias]
# Or: from aleph.model.Collection import by_foreign_id [as alias]
def crawl(self):
    url = urljoin(self.host, '/ticket/all_closed/?format=json')
    collection = Collection.by_foreign_id(url, {
        'label': 'Investigative Dashboard Requests'
    })
    Permission.grant_foreign(collection, 'idashboard:occrp_staff',
                             True, False)
    existing_entities = []
    terms = set()
    db.session.flush()
    for endpoint in ['all_closed', 'all_open']:
        url = urljoin(self.host, '/ticket/%s/?format=json' % endpoint)
        data = self.session.get(url).json()
        # NOTE: the print/continue pair below skips the rest of the loop body
        # and looks like leftover debugging in the original example.
        print(url)
        continue
        for req in data.get('paginator', {}).get('object_list'):
            ent = self.update_entity(req, collection)
            if ent is not None:
                terms.update(ent.terms)
                existing_entities.append(ent.id)
                log.info(" # %s", ent.name)
    for entity in collection.entities:
        if entity.id not in existing_entities:
            entity.delete()
    self.emit_collection(collection, terms)
Example 5: crawl_collection
# Required import: from aleph.model import Collection [as alias]
# Or: from aleph.model.Collection import by_foreign_id [as alias]
def crawl_collection(self, collection):
    if not len(collection.get('subjects', [])):
        return
    url = urljoin(self.URL, '/api/collections/%s' % collection.get('id'))
    collection = Collection.by_foreign_id(url, {
        'label': collection.get('title')
    })
    res = requests.get('%s/permissions' % url, headers=self.HEADERS)
    for perm in res.json().get('results', []):
        Permission.grant_foreign(collection, perm.get('role'),
                                 perm.get('read'), perm.get('write'))
    log.info(" > Spindle collection: %s", collection.label)
    res = requests.get('%s/entities' % url, headers=self.HEADERS)
    terms = set()
    existing_entities = []
    for entity in res.json().get('results', []):
        if entity.get('name') is None:
            continue
        aliases = [on.get('alias') for on in entity.get('other_names', [])]
        ent = Entity.by_foreign_id(entity.get('id'), collection, {
            'name': entity.get('name'),
            'category': SCHEMATA.get(entity.get('$schema'), OTHER),
            'data': entity,
            'selectors': aliases
        })
        terms.update(ent.terms)
        existing_entities.append(ent.id)
        log.info(" # %s (%s)", ent.name, ent.category)
    for entity in collection.entities:
        if entity.id not in existing_entities:
            entity.delete()
    self.emit_collection(collection, terms)
Example 6: load_collection
# Required import: from aleph.model import Collection [as alias]
# Or: from aleph.model.Collection import by_foreign_id [as alias]
def load_collection(self, data):
    foreign_id = data.get('foreign_id')
    collection = Collection.by_foreign_id(foreign_id)
    if collection is None:
        collection = Collection.create(data)
        db.session.commit()
    update_collection(collection)
    return collection
Example 7: analyze
# Required import: from aleph.model import Collection [as alias]
# Or: from aleph.model.Collection import by_foreign_id [as alias]
def analyze(foreign_id=None):
    """Re-analyze documents in the given collection (or throughout)."""
    if foreign_id:
        collection = Collection.by_foreign_id(foreign_id)
        if collection is None:
            raise ValueError("No such collection: %r" % foreign_id)
        analyze_collection.delay(collection.id)
    else:
        for collection in Collection.all():
            analyze_collection.delay(collection.id)
Example 8: crawl_collection
# Required import: from aleph.model import Collection [as alias]
# Or: from aleph.model.Collection import by_foreign_id [as alias]
def crawl_collection(self, collection):
    if not len(collection.get('subjects', [])):
        return
    url = urljoin(self.URL, '/api/collections/%s' % collection.get('id'))
    collection = Collection.by_foreign_id(url, {
        'label': collection.get('title')
    })
    res = requests.get('%s/permissions' % url, headers=self.HEADERS)
    for perm in res.json().get('results', []):
        Permission.grant_foreign(collection, perm.get('role'),
                                 perm.get('read'), perm.get('write'))
    log.info(" > Spindle collection: %s", collection.label)
    res = requests.get('%s/entities' % url, headers=self.HEADERS)
    terms = set()
    existing_entities = []
    for entity in res.json().get('results', []):
        if entity.get('name') is None:
            continue
        entity['$schema'] = SCHEMATA.get(entity.get('$schema'), OTHER)
        if 'jurisdiction_code' in entity:
            entity['jurisdiction_code'] = \
                entity['jurisdiction_code'].lower()
        entity.pop('members', None)
        entity.pop('memberships', None)
        entity.pop('assets', None)
        entity.pop('owners', None)
        entity.pop('family_first', None)
        entity.pop('family_second', None)
        entity.pop('social_first', None)
        entity.pop('social_second', None)
        for date_field in ['birth_date']:
            if date_field in entity and 'T' in entity[date_field]:
                entity[date_field], _ = entity[date_field].split('T', 1)
        for on in entity.get('other_names', []):
            name = on.pop('alias', None)
            if name is not None:
                on['name'] = name
        entity['identifiers'] = [{
            'scheme': 'spindle',
            'identifier': entity.pop('id', None)
        }]
        ent = Entity.save(entity, collection_id=collection.id, merge=True)
        db.session.flush()
        terms.update(ent.terms)
        existing_entities.append(ent.id)
        log.info(" # %s", ent.name)
    for entity in collection.entities:
        if entity.id not in existing_entities:
            entity.delete()
    self.emit_collection(collection, terms)
Example 9: index
# Required import: from aleph.model import Collection [as alias]
# Or: from aleph.model.Collection import by_foreign_id [as alias]
def index(foreign_id=None):
    """Index documents in the given collection (or throughout)."""
    q = Document.all_ids()
    if foreign_id:
        collection = Collection.by_foreign_id(foreign_id)
        if collection is None:
            raise ValueError("No such collection: %r" % foreign_id)
        clause = Collection.id == collection.id
        q = q.filter(Document.collections.any(clause))
    for doc_id, in q:
        index_document_id.delay(doc_id)
    if foreign_id is None:
        reindex_entities()
Example 10: test_crawler_execute
# Required import: from aleph.model import Collection [as alias]
# Or: from aleph.model.Collection import by_foreign_id [as alias]
def test_crawler_execute(self):
    tdc = TDocumentCrawler()
    ccnt = CrawlerState.all().count()
    assert ccnt == 0, ccnt
    tdc.execute()
    states = CrawlerState.all().all()
    assert len(states) == 2, len(states)
    demo = states[1]
    assert 'kitty' in demo.meta['title'], demo.meta
    assert 'demo.pdf' in demo.meta['source_path'], demo.meta
    coll = Collection.by_foreign_id('test')
    assert coll is not None, coll
    assert len(list(coll.documents)) == 1, list(coll.documents)
Example 11: crawl_source
# Required import: from aleph.model import Collection [as alias]
# Or: from aleph.model.Collection import by_foreign_id [as alias]
def crawl_source(self, source):
    if source.get('source_id') in IGNORE_SOURCES:
        return
    json_file = source.get('data', {}).get('json')
    url = urljoin(JSON_PATH, json_file)
    source_name = source.get('source') or source.get('source_id')
    label = '%s - %s' % (source.get('publisher'), source_name)
    collection = Collection.by_foreign_id(url, {
        'label': label
    })
    Permission.grant_foreign(collection, Role.SYSTEM_GUEST, True, False)
    log.info(" > OpenNames collection: %s", collection.label)
    terms = set()
    existing_entities = []
    db.session.flush()
    entities = requests.get(url).json().get('entities', [])
    for entity in entities:
        data = {
            'identifiers': [{
                'scheme': 'opennames:%s' % source.get('source_id'),
                'identifier': entity.get('uid')
            }],
            'other_names': [],
            'name': entity.get('name'),
            '$schema': SCHEMA.get(entity.get('type'),
                                  '/entity/entity.json#')
        }
        for on in entity.get('other_names', []):
            on['name'] = on.pop('other_name', None)
            data['other_names'].append(on)
        ent = Entity.save(data, collection_id=collection.id, merge=True)
        db.session.flush()
        terms.update(ent.terms)
        existing_entities.append(ent.id)
        log.info(" # %s", ent.name)
    for entity in collection.entities:
        if entity.id not in existing_entities:
            entity.delete()
    self.emit_collection(collection, terms)
Example 12: crawl_source
# Required import: from aleph.model import Collection [as alias]
# Or: from aleph.model.Collection import by_foreign_id [as alias]
def crawl_source(self, source):
    if source.get('source_id') in IGNORE_SOURCES:
        return
    json_file = source.get('data', {}).get('json')
    url = urljoin(JSON_PATH, json_file)
    source_name = source.get('source') or source.get('source_id')
    label = '%s - %s' % (source.get('publisher'), source_name)
    collection = Collection.by_foreign_id(url, {
        'label': label
    })
    Permission.grant_foreign(collection, Role.SYSTEM_GUEST, True, False)
    log.info(" > OpenNames collection: %s", collection.label)
    terms = set()
    existing_entities = []
    db.session.flush()
    entities = requests.get(url).json().get('entities', [])
    for entity in entities:
        if entity.get('name') is None:
            continue
        selectors = []
        for on in entity.get('other_names', []):
            selectors.append(on.get('other_name'))
        for iden in entity.get('identities', []):
            if iden.get('number'):
                selectors.append(iden.get('number'))
        ent = Entity.by_foreign_id(entity.get('uid'), collection, {
            'name': entity.get('name'),
            'category': CATEGORIES.get(entity.get('type'), OTHER),
            'data': entity,
            'selectors': selectors
        })
        terms.update(ent.terms)
        existing_entities.append(ent.id)
        log.info(" # %s (%s)", ent.name, ent.category)
    for entity in collection.entities:
        if entity.id not in existing_entities:
            entity.delete()
    self.emit_collection(collection, terms)
Example 13: test_load_csv
# Required import: from aleph.model import Collection [as alias]
# Or: from aleph.model.Collection import by_foreign_id [as alias]
def test_load_csv(self):
    count = Collection.all().count()
    assert 0 == count, count
    db_uri = 'file://' + self.get_fixture_path('experts.csv')
    os.environ['ALEPH_TEST_BULK_CSV'] = db_uri
    yml_path = self.get_fixture_path('experts.yml')
    config = load_config_file(yml_path)
    bulk_load(config)
    coll = Collection.by_foreign_id('experts')
    assert coll.category == 'scrape', coll.category
    _, headers = self.login(is_admin=True)
    count = Collection.all().count()
    assert 1 == count, count
    url = '/api/2/entities?filter:schemata=Thing&q=Greenfield'
    res = self.client.get(url, headers=headers)
    assert res.status_code == 200, res
    assert res.json['total'] == 1, res.json
Example 14: test_load_sqlite
# Required import: from aleph.model import Collection [as alias]
# Or: from aleph.model.Collection import by_foreign_id [as alias]
def test_load_sqlite(self):
    count = Collection.all().count()
    assert 0 == count, count
    db_uri = 'sqlite:///' + self.get_fixture_path('kek.sqlite')
    os.environ['ALEPH_TEST_BULK_DATABASE_URI'] = db_uri
    yml_path = self.get_fixture_path('kek.yml')
    config = load_config_file(yml_path)
    bulk_load(config)
    count = Collection.all().count()
    assert 1 == count, count
    coll = Collection.by_foreign_id('kek')
    assert coll.category == 'scrape', coll.category
    _, headers = self.login(is_admin=True)
    url = '/api/2/entities?filter:schemata=Thing&q=friede+springer'
    res = self.client.get(url, headers=headers)
    assert res.status_code == 200, res
    assert res.json['total'] == 1, res.json
    res0 = res.json['results'][0]
    key = '9895ccc1b3d6444ccc6371ae239a7d55c748a714'
    assert res0['id'].startswith(key), res0
Example 15: flush
# Required import: from aleph.model import Collection [as alias]
# Or: from aleph.model.Collection import by_foreign_id [as alias]
def flush(foreign_id):
    """Reset the crawler state for a given collection."""
    collection = Collection.by_foreign_id(foreign_id)
    if collection is None:
        raise ValueError("No such collection: %r" % foreign_id)
    delete_collection(collection.id)