This article collects typical usage examples of the Python method dumptruck.DumpTruck.create_index. If you are wondering exactly what DumpTruck.create_index does, how to call it, or where it is used, the hand-picked code examples below should help. You can also read more about its containing class, dumptruck.DumpTruck.
The following 15 code examples of DumpTruck.create_index are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
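Before the examples, here is a minimal sketch of the basic call pattern, inferred from the usages below. The database path, table name, and column names are placeholders, not part of the dumptruck API:
from dumptruck import DumpTruck

# Open (or create) an SQLite database file.
dt = DumpTruck(dbname='/tmp/demo.db')
dt.execute('create table fruit (foo integer, bar integer);')

# Create an index named fruit_foo_bar on the two columns.
# unique=True would make it a unique index; if_not_exists=True makes the
# call a no-op when an index of the same name already exists.
dt.create_index(['foo', 'bar'], 'fruit', unique=False, if_not_exists=True)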
Example 1: test_create_if_exists
# Required import: from dumptruck import DumpTruck [as alias]
# Or: from dumptruck.DumpTruck import create_index [as alias]
def test_create_if_exists(self):
    dt = DumpTruck(dbname='/tmp/test.db')
    dt.execute('create table pineapple (bar integer, baz integer);')
    dt.create_index(['bar', 'baz'], 'pineapple')
    # Re-creating the same index with if_not_exists=False must raise.
    with self.assertRaises(sqlite3.OperationalError):
        dt.create_index(['bar', 'baz'], 'pineapple', if_not_exists=False)
Example 2: test_non_unique
# Required import: from dumptruck import DumpTruck [as alias]
# Or: from dumptruck.DumpTruck import create_index [as alias]
def test_non_unique(self):
    dt = DumpTruck(dbname='/tmp/test.db')
    dt.execute('create table tomato (bar integer, baz integer);')
    dt.create_index(['bar', 'baz'], 'tomato')
    observed = dt.execute('PRAGMA index_info(tomato_bar_baz)')

    # Indexness
    self.assertIsNotNone(observed)

    # Indexed columns
    expected = [
        {u'seqno': 0, u'cid': 0, u'name': u'bar'},
        {u'seqno': 1, u'cid': 1, u'name': u'baz'},
    ]
    self.assertListEqual(observed, expected)

    # Uniqueness
    indices = dt.execute('PRAGMA index_list(tomato)')
    for index in indices:
        if index[u'name'] == u'tomato_bar_baz':
            break
    else:
        index = {}
    self.assertEqual(index[u'unique'], 0)
Example 3: test_create_if_not_exists
# Required import: from dumptruck import DumpTruck [as alias]
# Or: from dumptruck.DumpTruck import create_index [as alias]
def test_create_if_not_exists(self):
    dt = DumpTruck(dbname="/tmp/test.db")
    dt.execute("create table mango (bar integer, baz integer);")
    dt.create_index(["bar", "baz"], "mango")
    # This should not raise an error.
    dt.create_index(["bar", "baz"], "mango", if_not_exists=True)
Example 4: test_create_if_not_exists
# Required import: from dumptruck import DumpTruck [as alias]
# Or: from dumptruck.DumpTruck import create_index [as alias]
def test_create_if_not_exists(self):
    dt = DumpTruck(dbname='/tmp/test.db')
    dt.execute('create table mango (bar integer, baz integer);')
    dt.create_index(['bar', 'baz'], 'mango')
    # This should not raise an error.
    dt.create_index(['bar', 'baz'], 'mango', if_not_exists=True)
Example 5: test_create_if_exists
# Required import: from dumptruck import DumpTruck [as alias]
# Or: from dumptruck.DumpTruck import create_index [as alias]
def test_create_if_exists(self):
    dt = DumpTruck(dbname="/tmp/test.db")
    dt.execute("create table pineapple (bar integer, baz integer);")
    dt.create_index(["bar", "baz"], "pineapple")
    with self.assertRaises(sqlite3.OperationalError):
        dt.create_index(["bar", "baz"], "pineapple", if_not_exists=False)
Example 6: apis
# Required import: from dumptruck import DumpTruck [as alias]
# Or: from dumptruck.DumpTruck import create_index [as alias]
def apis():
    dt = DumpTruck('/tmp/open-data.sqlite', auto_commit=False)
    dt.create_table({'catalog': 'abc.def'}, 'socrata_apis')
    # One row per catalog, keyed by a unique index on the catalog hostname.
    dt.create_index(['catalog'], 'socrata_apis', unique=True, if_not_exists=True)
    socrata_catalogs = filter(lambda x: x[0] == 'socrata', catalogs())
    for _, catalog in socrata_catalogs:
        dt.upsert({
            'catalog': catalog.split('://')[-1],
            'apis': count_apis(catalog),
        }, 'socrata_apis')
Example 7: main
# Required import: from dumptruck import DumpTruck [as alias]
# Or: from dumptruck.DumpTruck import create_index [as alias]
def main():
    edges = build_network()['edges']
    dt = DumpTruck(dbname='/tmp/open-data.sqlite', adapt_and_convert=True)
    datasets_in = dt.execute('SELECT * FROM socrata')
    # A unique index on id lets upsert overwrite duplicate copies of a dataset.
    dt.create_table({'id': 'blah-blah'}, 'socrata_deduplicated')
    dt.create_index(['id'], 'socrata_deduplicated', if_not_exists=True, unique=True)
    for dataset in dedupe(datasets_in, edges):
        dt.upsert(dataset, 'socrata_deduplicated')
Example 8: to_sqlite3
# Required import: from dumptruck import DumpTruck [as alias]
# Or: from dumptruck.DumpTruck import create_index [as alias]
def to_sqlite3():
    dt = DumpTruck('/tmp/open-data.sqlite', auto_commit=False)
    dummyrow = dict(zip(['software', 'catalog', 'identifier'], ['blah'] * 3))
    dt.create_table(dummyrow, 'datasets', if_not_exists=True)
    dt.create_index(['software', 'catalog', 'identifier'], 'datasets', if_not_exists=True, unique=True)
    for table in ['ckan', 'socrata']:
        dt.create_table({'catalog': 'blah', 'identifier': 'blah'}, table, if_not_exists=True)
        dt.create_index(['catalog', 'identifier'], table, if_not_exists=True, unique=True)
    dt.create_table({'view_id': 'abc', 'table_id': 123}, 'socrata_tables')
    dt.create_index(['view_id'], 'socrata_tables', if_not_exists=True, unique=True)
    dt.create_index(['table_id'], 'socrata_tables', if_not_exists=True)
    for dataset in datasets():
        row = {
            'software': dataset['software'],
            'catalog': dataset['catalog'],
            'identifier': dataset[SOFTWARE_MAP['identifier'][dataset['software']]],
        }
        # Skip datasets that have already been recorded.
        sql = 'SELECT * FROM datasets WHERE software = ? AND catalog = ? AND identifier = ?'
        if dt.execute(sql, [row['software'], row['catalog'], row['identifier']]) != []:
            continue
        dt.upsert(row, 'datasets')
        if dataset['software'] == 'socrata':
            socrata_table = {
                'view_id': row['identifier'],
                'table_id': dataset['tableId'],
            }
            dt.upsert(socrata_table, 'socrata_tables')
        dt.upsert(dataset, dataset['software'])
    dt.commit()
Example 9: extract_dataset_table_info
# Required import: from dumptruck import DumpTruck [as alias]
# Or: from dumptruck.DumpTruck import create_index [as alias]
def extract_dataset_table_info():
    dt = DumpTruck(dbname='/tmp/table_info.db')
    dt.create_table({'portal': 'abc', 'id': 'abcd-efgh'}, 'table_info')
    dt.create_index(['portal', 'id'], 'table_info', unique=True)
    dt.create_index(['tableId'], 'table_info', unique=False)
    done = set([tuple(row.keys()) for row in dt.execute('SELECT portal, id FROM table_info')])
    for portal in os.listdir('data'):
        for viewid in os.listdir(os.path.join('data', portal, 'views')):
            if (portal, viewid) in done:
                continue
            d = _dataset_table_info(portal, viewid)
            if d is None:
                continue
            dt.upsert(d, 'table_info')
Example 10: users
# Required import: from dumptruck import DumpTruck [as alias]
# Or: from dumptruck.DumpTruck import create_index [as alias]
def users():
    dt = DumpTruck(dbname='/tmp/socrata.db')
    dt.create_table({'id': 'abcd-efgh'}, 'user')
    dt.create_index(['id'], 'user', unique=True)
    _users = {}
    for portal in os.listdir('data'):
        for viewid in os.listdir(os.path.join('data', portal, 'views')):
            handle = open(os.path.join('data', portal, 'views', viewid), 'r')
            try:
                view = json.load(handle)
            except:
                # *cringe*
                continue
            handle.close()
            if view['owner']['id'] in _users:
                _users[view['owner']['id']]['views'].add(view['id'])
                try:
                    _users[view['owner']['id']]['publicationDates'].add((view['id'], view['publicationDate']))
                except:
                    return view
            else:
                _users[view['owner']['id']] = view['owner']
                _users[view['owner']['id']]['views'] = {view['id']}
                _users[view['owner']['id']]['tables'] = set()
                _users[view['owner']['id']]['publicationDates'] = set()
            if view['tableAuthor']['id'] in _users:
                _users[view['tableAuthor']['id']]['tables'].add(view['tableId'])
            else:
                _users[view['tableAuthor']['id']] = view['tableAuthor']
                _users[view['tableAuthor']['id']]['views'] = set()
                _users[view['tableAuthor']['id']]['tables'] = {view['tableId']}
                _users[view['tableAuthor']['id']]['publicationDates'] = set()
    for uid in _users.keys():
        for key in ['views', 'rights', 'tables']:
            if key in _users[uid]:
                _users[uid]['n_' + key] = len(_users[uid][key])
                del _users[uid][key]
    dt.insert(_users.values(), 'user')
    for uid, user in _users.items():
        for viewid, publicationDate in user['publicationDates']:
            dt.insert({'userid': user['id'], 'viewid': viewid, 'publicationDate': publicationDate}, 'publications', commit=False)
    dt.commit()
    return _users
Example 11: get_links
# Required import: from dumptruck import DumpTruck [as alias]
# Or: from dumptruck.DumpTruck import create_index [as alias]
def get_links(softwares=['ckan', 'socrata']):
    dt = DumpTruck('/tmp/open-data.sqlite')
    dummyrow = dict(zip(
        ['software', 'catalog', 'identifier', 'status_code', 'headers', 'error'],
        (['blah'] * 3) + ([234] * 1) + ([{'a': 'b'}] * 2)))
    dt.create_table(dummyrow, 'links', if_not_exists=True)
    dt.create_index(['software', 'catalog', 'identifier'], 'links', if_not_exists=True, unique=True)
    for software in softwares:
        for catalog in read.catalogs(software):
            if SOCRATA_FIX.get(catalog, 'this is a string, not None') is None:
                continue
            try:
                for row in _check_catalog(software, catalog):
                    dt.upsert(row, 'links')
            except:
                print(os.path.join('downloads', software, catalog))
                raise
Example 12: check_links
# Required import: from dumptruck import DumpTruck [as alias]
# Or: from dumptruck.DumpTruck import create_index [as alias]
def check_links():
    dt = DumpTruck('/tmp/open-data.sqlite', auto_commit=False)
    dt.create_index(['url'], 'links', if_not_exists=True, unique=False)
    dt.create_index(['status_code'], 'links', if_not_exists=True, unique=False)

    # Source
    urls = Queue()
    sql = '''
    SELECT DISTINCT url
    FROM links
    WHERE (status_code = -42 OR status_code IS NULL) AND is_link AND url NOT NULL
    ORDER BY status_code, substr(30, 100);
    '''
    # Order by the substring so that we randomly bounce around catalogs
    url_list = [row['url'] for row in dt.execute(sql)]
    for url in url_list:
        urls.put(url)

    # Sink to the database
    def _db(queue):
        dt = DumpTruck('/tmp/open-data.sqlite')
        while True:
            dt.execute(*queue.get())
    db_updates = Queue()
    db_thread = Thread(None, target=_db, args=(db_updates,))
    db_thread.start()

    # Check links
    def _check_link(url_queue):
        while not urls.empty():
            url = url_queue.get()
            if url is None:
                raise ValueError('url is None')
            status_code, headers, error = links.is_alive(url)
            sql = 'UPDATE links SET status_code = ?, headers = ?, error = ? WHERE is_link = 1 AND url = ?'
            db_updates.put((sql, (status_code, headers, error, url)))
            print(url)
    threads = {}
    for i in range(100):
        threads[i] = Thread(None, target=_check_link, args=(urls,))
    for thread in threads.values():
        thread.start()
Example 13: test_non_unique
# Required import: from dumptruck import DumpTruck [as alias]
# Or: from dumptruck.DumpTruck import create_index [as alias]
def test_non_unique(self):
    dt = DumpTruck(dbname="/tmp/test.db")
    dt.execute("create table tomato (bar integer, baz integer);")
    dt.create_index(["bar", "baz"], "tomato")
    observed = dt.execute("PRAGMA index_info(tomato_bar_baz)")

    # Indexness
    self.assertIsNotNone(observed)

    # Indexed columns
    expected = [{u"seqno": 0, u"cid": 0, u"name": u"bar"}, {u"seqno": 1, u"cid": 1, u"name": u"baz"}]
    self.assertListEqual(observed, expected)

    # Uniqueness
    indices = dt.execute("PRAGMA index_list(tomato)")
    for index in indices:
        if index[u"name"] == u"tomato_bar_baz":
            break
    else:
        index = {}
    self.assertEqual(index[u"unique"], 0)
Example 14: scrape_main_article
# Required import: from dumptruck import DumpTruck [as alias]
# Or: from dumptruck.DumpTruck import create_index [as alias]
def scrape_main_article(url):
    req = requests.get(url)
    doc = lxml.html.fromstring(req.text)
    div = doc.xpath("//*[@class='wrapper']")[0]
    div.remove(div.find("h1"))
    for para in div.findall("p"):
        if para.find("strong") is not None:
            div.remove(para)
    return htmlize(etree.tostring(div))

dt = DumpTruck(dbname="scotland.db")
dt.create_table({"Title": "",
                 "Publication date": "",
                 "Old URL": "",
                 "Summary": "",
                 "Body": "",
                 "Associated organisations": ""}, "news")
dt.create_index(["Title", "Old URL"], "news", unique=True)
for url in URLS:
    for news_item in scrape_list_page(url):
        attachments = json.loads(news_item.pop("Attachments"))
        link = attachments[0]["link"]
        news_item["Old URL"] = link
        news_item["Body"] = scrape_main_article(link)
        dt.upsert(news_item, "news")
dumptruck_to_csv(dt, "news", "/home/http/scotland/news.csv")
Example 15: scrape_now
# Required import: from dumptruck import DumpTruck [as alias]
# Or: from dumptruck.DumpTruck import create_index [as alias]
from dumptruck import DumpTruck
import random
import os, sys
import logging
from sqlite3 import OperationalError
import datetime
import copy, types, time

# logging.basicConfig takes stream=, not file=, for console output.
logging.basicConfig(level=logging.DEBUG, stream=sys.stdout,
                    format='%(levelname)s %(asctime)-15s %(filename)s %(lineno)s %(message)s')
logger = logging.getLogger(os.path.split(__file__)[1])
_here = os.path.split(__file__)[0]

dt = DumpTruck(dbname=os.path.join(_here, "data/events.sqlite"))
try:
    dt.create_index(["id"], "events", unique=True)
    dt.create_index(["url"], "events", unique=True)
except OperationalError:
    pass

def scrape_now():
    nn = 0
    for l in listevents():
        try:
            d = dt.execute("select count(*) as c from events where url='%(url)s'" % l)
        except OperationalError:
            d = [{"c": 0}]
        if d[0]["c"] == 0:
            l.update(eventdata(l["url"]))
            logger.info("found new %s %s" % (l.get("id", l.get("threadid", "")), l.get("title", "")))