本文整理汇总了Python中dumptruck.DumpTruck.upsert方法的典型用法代码示例。如果您正苦于以下问题:Python DumpTruck.upsert方法的具体用法?Python DumpTruck.upsert怎么用?Python DumpTruck.upsert使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类dumptruck.DumpTruck
的用法示例。
在下文中一共展示了DumpTruck.upsert方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: apis
# 需要导入模块: from dumptruck import DumpTruck [as 别名]
# 或者: from dumptruck.DumpTruck import upsert [as 别名]
def apis():
    """Count the Socrata APIs per catalog and store the counts in SQLite.

    Writes one row per Socrata catalog into the ``socrata_apis`` table of
    /tmp/open-data.sqlite, keyed uniquely on ``catalog``.
    """
    dt = DumpTruck('/tmp/open-data.sqlite', auto_commit = False)
    dt.create_table({'catalog':'abc.def'}, 'socrata_apis')
    dt.create_index(['catalog'], 'socrata_apis', unique = True, if_not_exists = True)
    socrata_catalogs = filter(lambda x: x[0] == 'socrata', catalogs())
    for _, catalog in socrata_catalogs:
        dt.upsert({
            # Strip the scheme ("http://"/"https://") so the key is the bare host.
            'catalog': catalog.split('://')[-1],
            'apis': count_apis(catalog),
        }, 'socrata_apis')
    # BUG FIX: auto_commit is off, so without an explicit commit every upsert
    # above was silently discarded (compare to_sqlite3, which does commit).
    dt.commit()
示例2: main
# 需要导入模块: from dumptruck import DumpTruck [as 别名]
# 或者: from dumptruck.DumpTruck import upsert [as 别名]
def main():
    """Deduplicate the Socrata datasets and write them to a fresh table."""
    network_edges = build_network()['edges']
    dt = DumpTruck(dbname = '/tmp/open-data.sqlite', adapt_and_convert = True)
    rows = dt.execute('SELECT * FROM socrata')
    # Target table, keyed uniquely on the dataset id.
    dt.create_table({'id': 'blah-blah'}, 'socrata_deduplicated')
    dt.create_index(['id'], 'socrata_deduplicated', if_not_exists = True, unique = True)
    for deduped_row in dedupe(rows, network_edges):
        dt.upsert(deduped_row, 'socrata_deduplicated')
示例3: extract_dataset_table_info
# 需要导入模块: from dumptruck import DumpTruck [as 别名]
# 或者: from dumptruck.DumpTruck import upsert [as 别名]
def extract_dataset_table_info():
    """Extract table info for every downloaded dataset view into table_info.db.

    Walks data/<portal>/views/<viewid> and upserts one row per view into the
    ``table_info`` table, skipping (portal, id) pairs that are already stored.
    """
    dt = DumpTruck(dbname = '/tmp/table_info.db')
    dt.create_table({'portal': 'abc', 'id': 'abcd-efgh'}, 'table_info')
    dt.create_index(['portal', 'id'], 'table_info', unique = True)
    dt.create_index(['tableId'], 'table_info', unique = False)
    # BUG FIX: the original collected row.keys() -- i.e. the column names
    # ('portal', 'id') for every row -- so `done` never matched a real
    # (portal, viewid) pair and nothing was ever skipped. We need the values.
    done = set(tuple(row.values()) for row in dt.execute('SELECT portal, id FROM table_info'))
    for portal in os.listdir('data'):
        for viewid in os.listdir(os.path.join('data', portal, 'views')):
            if (portal, viewid) in done:
                continue
            d = _dataset_table_info(portal, viewid)
            if d is None:  # `is None`, not `== None`
                continue
            dt.upsert(d, 'table_info')
示例4: get_links
# 需要导入模块: from dumptruck import DumpTruck [as 别名]
# 或者: from dumptruck.DumpTruck import upsert [as 别名]
def get_links(softwares = ('ckan', 'socrata')):
    """Check the links of every catalog for each software and record results.

    :param softwares: iterable of software names to process
                      (tuple default instead of the original mutable list).
    Results go to the ``links`` table of /tmp/open-data.sqlite, keyed
    uniquely on (software, catalog, identifier).
    """
    dt = DumpTruck('/tmp/open-data.sqlite')
    # Dummy row only defines the schema: three text, one integer, two dict columns.
    dummyrow = dict(zip(['software','catalog','identifier', 'status_code', 'headers', 'error'],
                        (['blah'] * 3) + ([234] * 1) + ([{'a':'b'}] * 2)))
    dt.create_table(dummyrow, 'links', if_not_exists = True)
    dt.create_index(['software','catalog','identifier'], 'links', if_not_exists = True, unique = True)
    for software in softwares:
        for catalog in read.catalogs(software):
            # SOCRATA_FIX maps known-broken catalogs to None; skip those.
            # The string default is a sentinel meaning "catalog not in the map".
            if SOCRATA_FIX.get(catalog, 'this is a string, not None') is None:
                continue
            try:
                for row in _check_catalog(software, catalog):
                    dt.upsert(row, 'links')
            except:
                # Report which catalog dump failed, then propagate.
                print(os.path.join('downloads',software,catalog))
                raise
示例5: to_sqlite3
# 需要导入模块: from dumptruck import DumpTruck [as 别名]
# 或者: from dumptruck.DumpTruck import upsert [as 别名]
def to_sqlite3():
    """Load every dataset into /tmp/open-data.sqlite.

    Populates:
      * ``datasets``          -- one row per (software, catalog, identifier)
      * ``ckan`` / ``socrata`` -- the full dataset record, by software
      * ``socrata_tables``    -- Socrata view-id -> table-id mapping
    Commits once at the end (auto_commit is off).
    """
    dt = DumpTruck('/tmp/open-data.sqlite', auto_commit = False)
    dummyrow = dict(zip(['software','catalog','identifier'], ['blah']*3))
    dt.create_table(dummyrow, 'datasets', if_not_exists = True)
    dt.create_index(['software','catalog','identifier'], 'datasets', if_not_exists = True, unique = True)
    for table in ['ckan','socrata']:
        dt.create_table({'catalog':'blah','identifier':'blah'}, table, if_not_exists = True)
        dt.create_index(['catalog','identifier'], table, if_not_exists = True, unique = True)
    dt.create_table({'view_id':'abc','table_id':123}, 'socrata_tables')
    dt.create_index(['view_id'], 'socrata_tables', if_not_exists = True, unique = True)
    dt.create_index(['table_id'], 'socrata_tables', if_not_exists = True)
    for dataset in datasets():
        row = {
            'software': dataset['software'],
            'catalog': dataset['catalog'],
            # Which key holds the identifier differs by software.
            'identifier': dataset[SOFTWARE_MAP['identifier'][dataset['software']]],
        }
        # Skip datasets that are already recorded.
        sql = 'SELECT * FROM datasets WHERE software = ? AND catalog = ? AND identifier = ?'
        if dt.execute(sql, [row['software'],row['catalog'],row['identifier']]):  # truthiness, not != []
            continue
        dt.upsert(row, 'datasets')
        if dataset['software'] == 'socrata':
            socrata_table = {
                'view_id': row['identifier'],
                'table_id': dataset['tableId'],
            }
            dt.upsert(socrata_table, 'socrata_tables')
        # Full record goes into the per-software table ('ckan' or 'socrata').
        dt.upsert(dataset, dataset['software'])
    dt.commit()
示例6: random
# 需要导入模块: from dumptruck import DumpTruck [as 别名]
# 或者: from dumptruck.DumpTruck import upsert [as 别名]
# Re-scrape events whose stored record contains a JSON error, picking one
# random matching row at a time until none remain.
# NOTE(review): Python 2 syntax (print statements). Assumes `dt` (a DumpTruck
# handle to a db with an `events` table) is defined earlier in the notebook.
from collections import defaultdict
from slscraper import eventdata
d=True   # loop flag: set False once no error rows are left
c=1      # count of successful re-scrapes, for periodic progress output
while d:
    # Grab one random event id whose error column mentions JSON.
    r=dt.execute("select id from events where error like '%JSON%' order by random() limit 1")
    if len(r)>0 :
        rr=r[0]["id"]
        try :
            dd=eventdata("%s" % rr)
            # Mark a clean re-scrape so the row no longer matches '%JSON%'.
            if 'error' not in dd :
                dd["error"]="--"
            dt.upsert(dd,"events")
            # defaultdict supplies "-" for any key missing from the record.
            print "%(id)s %(title)s: %(error)s" % defaultdict(lambda : "-", dd)
            c+=1
            # Every 10 successes, report how many error rows remain.
            if (c % 10)==0 :
                r=dt.execute("select count(*) as c from events where error like '%JSON%'")
                if len(r)>0 :
                    print "still %s to go" % r[0]["c"]
        except Exception, e:
            # Best effort: report the failure and continue with the next row.
            print "ERROR %s %s" % (rr,e)
    else :
        d=False  # no error rows left; stop
# <codecell>
示例7: map
# 需要导入模块: from dumptruck import DumpTruck [as 别名]
# 或者: from dumptruck.DumpTruck import upsert [as 别名]
raise
return map(do_row, trs[2:])
# Schema: one row per Final IP record, keyed uniquely on the DA Number.
# NOTE(review): Python 2 syntax (`print path`). Relies on read_finalip and
# READER_ROOT defined elsewhere.
dt = DumpTruck(dbname = '/tmp/finalip.db')
dt.create_table({u'DA Number': u'NAE-2009-01067'}, 'finalip', if_not_exists = True)
# NOTE(review): 'Da Number' differs in case from the column 'DA Number';
# SQLite identifiers are case-insensitive, so this still matches -- confirm.
dt.create_index(['Da Number'], 'finalip', unique = True, if_not_exists = True)
# Skip finished stuff: (Year, Month, Page) triples already in the table.
pages = set([(row['Year'], row['Month'], row['Page']) for row in dt.execute('SELECT Year, Month, Page FROM finalip')])
# Populate: walk ../finalips/<year>/<month>/<page-file>.
for dirname, subdirnames, filenames in os.walk(os.path.join(os.environ['READER_ROOT'], '..', 'finalips')):
    # Only process leaf directories (the month directories holding pages).
    if subdirnames != []:
        continue
    for filename in filenames:
        # Year and month come from the last two path components.
        year, month = map(int, dirname.split('/')[-2:])
        page = (year, month, filename)
        if page in pages:
            continue
        path = os.path.join(dirname, filename)
        try:
            data = read_finalip(path)
        except:
            # Identify the offending file, then propagate.
            print path
            raise
        # Tag every parsed row with its source page before storing.
        for row in data:
            row['Year'], row['Month'], row['Page'] = page
        dt.upsert(data, 'finalip')
示例8: _diff
# 需要导入模块: from dumptruck import DumpTruck [as 别名]
# 或者: from dumptruck.DumpTruck import upsert [as 别名]
def _diff(self, test_name, expected_rowcount):
    """Load a SQL fixture, upsert the matching JSON fixture, check the row count.

    :param test_name: basename of the fixture pair under fixtures/
    :param expected_rowcount: row count the table should hold afterwards
    """
    self._load_fixture_sql('fixtures/' + test_name + '.sql')
    dt = DumpTruck(dbname = self.dbfile)
    # FIX: close the fixture file deterministically instead of leaking the
    # handle opened by the original `json.loads(open(...).read())`.
    with open('fixtures/' + test_name + '.json') as f:
        newdata = json.load(f)
    dt.upsert(newdata)  # uses DumpTruck's default table name
    self._rowcount_should_be(expected_rowcount)
示例9: DumpTruck
# 需要导入模块: from dumptruck import DumpTruck [as 别名]
# 或者: from dumptruck.DumpTruck import upsert [as 别名]
["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16677&mon=jul", "Latest releases"],
["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16676&mon=aug", "Latest releases"],
["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16678&mon=sep", "Latest releases"],
["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16679&mon=oct", "Latest releases"],
["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16680&mon=nov", "Latest releases"],
["http://www.scotlandoffice.gov.uk/scotlandoffice/10804.146.html", "Archive releases"], # 2005
["http://www.scotlandoffice.gov.uk/scotlandoffice/10805.145.html", "Archive releases"], # 2006
["http://www.scotlandoffice.gov.uk/scotlandoffice/10806.144.html", "Archive releases"], # 2007
["http://www.scotlandoffice.gov.uk/scotlandoffice/10807.143.html", "Archive releases"], # 2008
["http://www.scotlandoffice.gov.uk/scotlandoffice/13342.html", "Archive releases"], # 2009
["http://www.scotlandoffice.gov.uk/scotlandoffice/13661.html", "Archive releases"], # 2010
["http://www.scotlandoffice.gov.uk/scotlandoffice/15263.html", "Archive releases"], # 2011
]
dt = DumpTruck(dbname="scotland.db")
# One row per publication, keyed uniquely on (Title, Old URL).
publication_schema = {
    "Title": "",
    "Publication date": "",
    "Old URL": "",
    "Summary": "",
    "Attachments": "",
    "Type": "",
    "Associated organisations": "",
}
dt.create_table(publication_schema, "publications")
dt.create_index(["Title", "Old URL"], "publications", unique=True)
# Scrape every listing page, tagging each publication with its page type.
for list_url, page_type in URLS:
    for publication in scrape_list_page(list_url):
        publication['Type'] = page_type
        dt.upsert(publication, "publications")
dumptruck_to_csv(dt, "publications", "/home/http/scotland/publications.csv")
示例10:
# 需要导入模块: from dumptruck import DumpTruck [as 别名]
# 或者: from dumptruck.DumpTruck import upsert [as 别名]
"Associated Document Series": ""}, "statistics")
dt.create_index(["Title", "Old URL"], "statistics", unique=True)
# NOTE(review): Python 2 syntax (`print series_title`). Relies on doc, URL,
# datetool, and dt (created in the truncated part of this snippet).
# Each link in the wrapper div is a document series; follow it and record
# every attached file in that series.
for link in doc.xpath("//div[@class='wrapper']/ul/li/a"):
    series_title, series_url = link.text, urlparse.urljoin(URL, link.attrib["href"])
    print series_title
    series_req = requests.get(series_url)
    series_doc = lxml.html.fromstring(series_req.text)
    # Skip the header rows, which carry bgcolor #004093.
    for table_line in series_doc.xpath("//tr[not(@bgcolor) or @bgcolor!='#004093']"):
        file_pub_date = table_line.xpath("./td[3]")[0].text
        # A single table row can link several files (td[2] anchors).
        for file_node in table_line.xpath("./td[2]//a"):
            file_title = etree.tostring(file_node, method="text", encoding="utf8")
            file_link = file_node.attrib["href"]
            # Resolve relative attachment links against the site root.
            if not file_link.startswith("http"):
                file_link = urlparse.urljoin(URL, file_link)
            file_data = {"Old URL": series_url,
                         "Title": file_title,
                         "Body": file_title,
                         "Publication date": datetool.parsedate(file_pub_date),
                         "Attachment": file_link,
                         "Attachment title": file_title,
                         "Associated organisations": "Scotland Office",
                         "Associated Document Series": series_title}
            dt.upsert(file_data, "statistics")
dumptruck_to_csv(dt, "statistics", "/home/http/scotland/stats.csv")
示例11: DumpTruck
# 需要导入模块: from dumptruck import DumpTruck [as 别名]
# 或者: from dumptruck.DumpTruck import upsert [as 别名]
'http://www.nwo.usace.army.mil',
'http://www.nws.usace.army.mil',
'http://www.nww.usace.army.mil',
'http://www.pof.usace.army.mil',
'http://www.poj.usace.army.mil',
'http://www.saw.usace.army.mil',
'http://www.spa.usace.army.mil',
'http://www.spk.usace.army.mil',
'http://www.spl.usace.army.mil',
'http://www.swf.usace.army.mil',
'http://www.swg.usace.army.mil',
'http://www.tam.usace.army.mil',
}
if __name__ == '__main__':
    # Scrape USACE public notices from every district site into usace.db.
    dt = DumpTruck(dbname = 'usace.db')
    dt.create_table({'permit_application_number': 'abcd'}, 'notice')
    dt.create_index(['permit_application_number'], 'notice')
    for division in parse.locations(get('http://www.usace.army.mil/Locations.aspx')):
        for district in division['districts']:
            # Reduce the district link to its bare domain.
            domain = re.sub(r'.usace.army.mil.*$', '.usace.army.mil', district['href'])
            path = '/Missions/Regulatory/PublicNotices.aspx'
            if domain in SKIPPED_DISTRICTS:
                continue
            pn_list = None
            # Page through the notice listing until the last page is reached.
            # FIX: `is None` instead of `== None` for the sentinel check.
            while pn_list is None or pn_list['last_page'] > pn_list['current_page']:
                pn_list = parse.public_notice_list(get(domain + path))
                dt.upsert(list(pn_list['notices']), 'notice')
示例12: main
# 需要导入模块: from dumptruck import DumpTruck [as 别名]
# 或者: from dumptruck.DumpTruck import upsert [as 别名]
def main():
    """Append today's per-portal metrics rows to the ``series`` table."""
    dt = DumpTruck(dbname = 'metrics.db')
    dt.create_table({'portal': 'abc', 'date': datetime.date.today()}, 'series')
    dt.create_index(['portal', 'date'], 'series')
    rows = list(table())
    dt.upsert(rows, 'series')
示例13: scrape_main_article
# 需要导入模块: from dumptruck import DumpTruck [as 别名]
# 或者: from dumptruck.DumpTruck import upsert [as 别名]
def scrape_main_article(url):
    """Fetch *url* and return its article body as HTML.

    The body is the first element with class 'wrapper', stripped of its
    <h1> heading and of any <p> that contains a <strong> (lead-in text).
    """
    response = requests.get(url)
    tree = lxml.html.fromstring(response.text)
    wrapper = tree.xpath("//*[@class='wrapper']")[0]
    wrapper.remove(wrapper.find("h1"))
    for paragraph in wrapper.findall("p"):
        if paragraph.find("strong") is not None:
            wrapper.remove(paragraph)
    return htmlize(etree.tostring(wrapper))
dt = DumpTruck(dbname="scotland.db")
# One row per news item, keyed uniquely on (Title, Old URL).
news_schema = {
    "Title": "",
    "Publication date": "",
    "Old URL": "",
    "Summary": "",
    "Body": "",
    "Associated organisations": "",
}
dt.create_table(news_schema, "news")
dt.create_index(["Title", "Old URL"], "news", unique=True)
for list_url in URLS:
    for news_item in scrape_list_page(list_url):
        # The first attachment link is the article page; fetch its body.
        attachments = json.loads(news_item.pop("Attachments"))
        link = attachments[0]["link"]
        news_item["Old URL"] = link
        news_item["Body"] = scrape_main_article(link)
        dt.upsert(news_item, "news")
dumptruck_to_csv(dt, "news", "/home/http/scotland/news.csv")
示例14: count
# 需要导入模块: from dumptruck import DumpTruck [as 别名]
# 或者: from dumptruck.DumpTruck import upsert [as 别名]
# Randomly sample event ids and scrape any not yet in the `events` table,
# retrying transient sqlite OperationalErrors up to 3 times per id.
# NOTE(review): Python 2 syntax; relies on names defined before this
# truncated snippet: dt, done, mid, sc, err, proxy, logger, eventdata,
# scrape_now, OperationalError, random, time.
while True :
    # Pick a random candidate id in [1, mid).
    idt="%s" % random.randrange(1,mid)
    if idt in done :
        sc=sc+1
    else :
        # NOTE(review): the id is interpolated straight into SQL; acceptable
        # here only because idt is a locally generated integer string.
        d=dt.execute("select count(*) as c from events where id='%s'" % idt)
        if d[0]["c"]==1 :
            # logger.debug("%s already scraped" % idt)
            sc=sc+1
            done[idt]=1
        else :
            retry=0
            success=False
            while success==False and retry<3 :
                try :
                    dt.upsert(eventdata(idt,proxy=proxy),"events")
                    success=True
                except OperationalError :
                    # Database busy/locked: back off briefly and retry.
                    retry += 1
                    time.sleep(1)
                    logger.error("Retrying #%s" % retry)
                except KeyboardInterrupt :
                    raise
                except Exception, e:
                    # Any other failure: log it and move on to the next id.
                    logger.exception("ERROR %s " % (idt,))
                    err+=1
    sc=sc+1
    # Periodic progress checkpoint every 100 attempts.
    if (sc % 100)==0 :
        nn=scrape_now()
        try :
            stats={
示例15: DumpTruck
# 需要导入模块: from dumptruck import DumpTruck [as 别名]
# 或者: from dumptruck.DumpTruck import upsert [as 别名]
#!/usr/bin/env python
# Minimal DumpTruck upsert demo: because `title` carries a unique index,
# the second upsert replaces the first row instead of adding a new one,
# so the final SELECT returns a single row with date 'yesterday'.
from dumptruck import DumpTruck
dt = DumpTruck(dbname='scotland.db')
dt.create_table({'title': '', 'date': ''}, 'publications')
dt.create_index(['title'], 'publications', unique=True)
dt.upsert({'title': 'one', 'date': 'today'}, 'publications')
dt.upsert({'title': 'one', 'date': 'yesterday'}, 'publications')
data = dt.execute('SELECT * FROM `publications`')
# FIX: parenthesized call works on Python 2 and 3; the original bare
# `print data` statement is a SyntaxError on Python 3.
print(data)