本文整理汇总了Python中wsd.database.MySQLDatabase.get_work_view方法的典型用法代码示例。如果您正苦于以下问题:Python MySQLDatabase.get_work_view方法的具体用法?Python MySQLDatabase.get_work_view怎么用?Python MySQLDatabase.get_work_view使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类wsd.database.MySQLDatabase
的用法示例。
在下文中一共展示了MySQLDatabase.get_work_view方法的12个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: req
# 需要导入模块: from wsd.database import MySQLDatabase [as 别名]
# 或者: from wsd.database.MySQLDatabase import get_work_view [as 别名]
def req():
    """Crawl every article through the MediaWiki API with a bounded thread pool.

    Reads all articles from the database work view and, for each one, spawns a
    thread running ``worker`` on ``MEDIAWIKI_API_ENDPOINT/<title>/<rev_id>``.
    The ``pool`` semaphore caps the number of concurrently running threads.
    Prints progress every 10000 articles and the total elapsed time at the end.
    """
    from urllib.parse import quote  # Python 3 home of the former urllib.quote

    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    articles = db_worker_view.retrieve_all_articles()
    # measure wall-clock time (time.clock() was removed in Python 3.8)
    start = time.time()
    start_time_iteration = start
    # NOTE(review): starts at 483, presumably resuming an earlier crawl — confirm
    iteration_number = 483
    for i, article in enumerate(articles):
        # print some progress once per 10000-article chunk
        if i % 10000 == 0:
            seconds = time.time() - start_time_iteration
            m, s = divmod(seconds, 60)
            h, m = divmod(m, 60)
            print("Number of crawled articles: %d. Total time for last iteration of 10000 articles: %d:%02d:%02d" % (i, h, m, s))
            start_time_iteration = time.time()
            iteration_number += 1
        # Block until the thread-pool semaphore has a free slot.
        pool.acquire(blocking=True)
        # Hand the per-article URL to the worker function in a new thread.
        t = threading.Thread(target=worker, args=(MEDIAWIKI_API_ENDPOINT + quote(article['title']) + '/' + str(article['rev_id']), article, iteration_number))
        t.start()
    seconds = time.time() - start
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    print("Total time: %d:%02d:%02d" % (h, m, s))
示例2: _evaluate_disambiguations
# 需要导入模块: from wsd.database import MySQLDatabase [as 别名]
# 或者: from wsd.database.MySQLDatabase import get_work_view [as 别名]
def _evaluate_disambiguations(self):
    """Interactively evaluate disambiguation precision/recall on a samples file.

    Prompts for the samples ``.xml`` path and a log file, runs ``Evaluator``
    against the database work view, and prints the elapsed time plus
    precision/recall percentages.
    """
    INPUT_FILE = self.read_path('Please enter the path of the samples file [.xml]', default='./tmp/samples.xml')
    LOGGING_PATH = self.read_path('Please enter the path of the logging file [.log]', default='./tmp/evaluation3.log', must_exist=False)
    CONTINUE = self.read_yes_no('This process might take from several minutes to several hours.\nDo you want to continue?')
    if not CONTINUE:
        print('# Aborting...')
        return
    print('# Starting evaluation...')
    # setup logging
    LOGGING_FORMAT = '%(levelname)s:\t%(asctime)-15s %(message)s'
    logging.basicConfig(filename=LOGGING_PATH, level=logging.DEBUG, format=LOGGING_FORMAT, filemode='w')
    # connecting to db
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    work_view = db.get_work_view()
    # measure wall-clock time (time.clock() was removed in Python 3.8)
    start = time.time()
    evaluator = Evaluator(INPUT_FILE, work_view)
    result = evaluator.evaluate_disambiguations()
    seconds = round(time.time() - start)
    # floor division keeps the minutes field an int under Python 3
    print('Finished after %02d:%02d minutes' % (seconds // 60, seconds % 60))
    print('Evaluation done! - precision: %d%%, recall: %d%%' % (round(result['precision'] * 100), round(result['recall'] * 100)))
示例3: run
# 需要导入模块: from wsd.database import MySQLDatabase [as 别名]
# 或者: from wsd.database.MySQLDatabase import get_work_view [as 别名]
def run(self):
    """Run the interactive annotation pipeline.

    Reads a plain-text input file, detects candidate links, finds and
    disambiguates their possible meanings via the database work view, and
    writes the annotated result to an HTML output file.
    """
    self.print_title('This is the interactive runner program')
    self.create_tmp_if_not_exists()
    INPUT_FILE = self.read_path('Please enter the path of the input file [.txt]', default='./tmp/input.txt')
    OUTPUT_FILE = self.read_path('Please enter the path of the output file [.html]', default='./tmp/output.html', must_exist=False)
    LOGGING_PATH = self.read_path('Please enter the path of the logging file [.log]', default='./tmp/runner.log', must_exist=False)
    print('# Starting runner...')
    # setup logging
    LOGGING_FORMAT = '%(levelname)s:\t%(asctime)-15s %(message)s'
    logging.basicConfig(filename=LOGGING_PATH, level=logging.DEBUG, format=LOGGING_FORMAT, filemode='w')
    # measure wall-clock time (time.clock() was removed in Python 3.8)
    start = time.time()
    # connect to db
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    work_view = db.get_work_view()
    # read input; the context manager guarantees the handle is closed
    with open(INPUT_FILE, 'r') as f:
        text = f.read()
    # NOTE(review): replaces a space-like character with a plain space —
    # presumably U+00A0 non-breaking space; confirm the original literal.
    text = text.replace(' ', ' ')
    # create a dummy article record the pipeline stages operate on in place
    article = {
        'type': 'article',
        'id': None,
        'title': None,
        'text': text,
        'links': [],
    }
    # identify links
    link_detector = LinkDetector(work_view)
    link_detector.detect_links(article)
    # find possible meanings
    meaning_finder = MeaningFinder(work_view)
    meaning_finder.find_meanings(article)
    # calculate relatedness
    relatedness_calculator = RelatednessCalculator(work_view)
    # decide for meaning
    decider = Decider(relatedness_calculator)
    decider.decide(article)
    # output results
    html_outputter = HTMLOutputter()
    html_outputter.output(article, OUTPUT_FILE)
    seconds = round(time.time() - start)
    # floor division keeps the minutes field an int under Python 3
    print('Finished after %02d:%02d minutes' % (seconds // 60, seconds % 60))
示例4: pickle_aggregated_counts_distribution
# 需要导入模块: from wsd.database import MySQLDatabase [as 别名]
# 或者: from wsd.database.MySQLDatabase import get_work_view [as 别名]
def pickle_aggregated_counts_distribution():
    """Fetch the per-source-article aggregated click-count distribution.

    Sums ``counts`` grouped by ``prev_id`` from the derived internal-links
    clickstream table and stores the rows under ``results['source_article']``.

    NOTE(review): despite the name, the visible code only builds the local
    ``results`` dict and never pickles it — the persisting step may have been
    omitted from this excerpt; confirm against the full source.
    """
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    # reaches into a private attribute to run raw SQL
    cursor = db_worker_view._cursor
    results = {}
    try:
        cursor.execute('select sum(counts) from clickstream_derived_internal_links group by prev_id;')
        result = cursor.fetchall()
        results['source_article'] = result
    except MySQLdb.Error as e:
        # best-effort: report the DB error and fall through
        print(e)
示例5: pickle_category_counts_distribution
# 需要导入模块: from wsd.database import MySQLDatabase [as 别名]
# 或者: from wsd.database.MySQLDatabase import get_work_view [as 别名]
def pickle_category_counts_distribution():
    """Fetch link-count distributions per visual page region.

    Queries ``link_features`` once per region (lead, infobox, body,
    left-body, navbox) and stores the rows in ``results`` keyed by region.

    NOTE(review): despite the name, the visible code never pickles
    ``results`` — the persisting step may have been omitted from this
    excerpt; confirm against the full source.
    """
    results = {}
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    # reaches into a private attribute to run raw SQL
    cursor = db_worker_view._cursor
    for category in ['lead', 'infobox', 'body', 'left-body', 'navbox']:
        try:
            # parameterized query: the region name is bound, not interpolated
            cursor.execute('select counts from link_features where counts is not null and visual_region=%s;', (category,))
            result = cursor.fetchall()
            results[category] = result
        except MySQLdb.Error as e:
            # best-effort: report the DB error and continue with other regions
            print(e)
示例6: links_heatmap
# 需要导入模块: from wsd.database import MySQLDatabase [as 别名]
# 或者: from wsd.database.MySQLDatabase import get_work_view [as 别名]
def links_heatmap():
    """Render 2D heatmaps (log-normalized and linear) of link positions.

    Link x coordinates are normalized by a fixed 1920px viewport width and y
    coordinates by the page length of the source article; positions outside
    the unit square are discarded.  Saves both figures as PDFs under output/.

    Heatmap technique:
    http://stackoverflow.com/questions/2369492/generate-a-heatmap-in-matplotlib-using-a-scatter-data-set
    """
    print('loading')
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    coords = db_worker_view.retrieve_all_links_coords()
    print('coord loaded')
    x = []
    y = []
    page_lenghts = db_worker_view.retrieve_all_page_lengths()
    print('lenghts loaded')
    for coord in coords:
        # normalize x by the assumed 1920px screen width, y by page length
        x_normed = float(coord['x']) / float(1920)
        y_normed = float(coord['y']) / float(page_lenghts[coord['source_article_id']])
        if x_normed <= 1.0 and y_normed <= 1.0:
            x.append(x_normed)
            y.append(y_normed)
    heatmap, xedges, yedges = np.histogram2d(x, y, bins=100)
    # flipped y-extent together with origin='upper' puts y=0 (page top) on top
    extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]]
    fig_size = (2.4, 2)
    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(heatmap, extent=extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    plt.show()
    plt.savefig('output/links_heatmap_lognormed_self_loop.pdf')
    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(heatmap, extent=extent, origin='upper', norm=Normalize(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    plt.show()
    plt.savefig('output/links_heatmap_normed_self_loop.pdf')
    print("done")
示例7: pickle_redirects_ids
# 需要导入模块: from wsd.database import MySQLDatabase [as 别名]
# 或者: from wsd.database.MySQLDatabase import get_work_view [as 别名]
def pickle_redirects_ids():
    """Resolve candidate article titles to ids and pickle the id list.

    Reads titles from ``HOME/data/candidate_articles.tsv`` (skipping the
    header row), resolves each title via the work view, and writes the list
    of resolved ids to ``SSD_HOME/pickle/redirects_ids.obj``.
    """
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_work_view = db.get_work_view()
    redirects_list_id = []
    with open(HOME + "data/candidate_articles.tsv") as f:
        next(f)  # skip the header line
        for line in f:
            line = line.strip().split('\t')
            # look up id; file titles use underscores, the DB stores spaces
            tmp = db_work_view.resolve_title(line[0].replace('_', ' '))
            if tmp is not None:
                redirects_list_id.append(tmp['id'])
    # context manager so the output file is flushed and closed reliably
    with open(SSD_HOME + "pickle/redirects_ids.obj", "wb") as out:
        pickle.dump(redirects_list_id, out, protocol=pickle.HIGHEST_PROTOCOL)
示例8: clicks_heatmap_total
# 需要导入模块: from wsd.database import MySQLDatabase [as 别名]
# 或者: from wsd.database.MySQLDatabase import get_work_view [as 别名]
def clicks_heatmap_total():
    """Render 2D heatmaps of click positions weighted by total click counts.

    Each link coordinate is normalized (x by a fixed 1920px width, y by its
    page length) and contributes its ``counts`` value as the histogram
    weight.  Saves log-normalized and linear variants as PDFs under output/.
    """
    print('loading')
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    coords = db_worker_view.retrieve_all_links_coords_clicks()
    print('coord loaded')
    x = []
    y = []
    values = []
    for coord in coords:
        # normalize x by the assumed 1920px screen width, y by page length
        x_normed = float(coord['x']) / float(1920)
        y_normed = float(coord['y']) / float(coord['page_length'])
        if x_normed <= 1.0 and y_normed <= 1.0:
            x.append(x_normed)
            y.append(y_normed)
            values.append(float(coord['counts']))
    heatmap, xedges, yedges = np.histogram2d(x, y, bins=100, weights=values)
    # flipped y-extent together with origin='upper' puts y=0 (page top) on top
    extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]]
    fig_size = (2.4, 2)
    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(heatmap, extent=extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    plt.show()
    plt.savefig('output/clicks_heatmap_lognormed_self_loop_total.pdf')
    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(heatmap, extent=extent, origin='upper', norm=Normalize(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    plt.show()
    plt.savefig('output/clicks_heatmap_normed_self_loop_total.pdf')
    print("done")
示例9: export_data_unresolved
# 需要导入模块: from wsd.database import MySQLDatabase [as 别名]
# 或者: from wsd.database.MySQLDatabase import get_work_view [as 别名]
def export_data_unresolved():
    """Join unresolved-redirect clickstream rows onto redirect candidates and export a TSV.

    Left-joins the candidate sample (from the DB) with the raw clickstream
    file on (source, unresolved target) name pairs, fills missing transition
    counts with 0, and writes the merged frame to a TSV file.
    """
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_work_view = db.get_work_view()
    # private attribute: raw DB connection handed to pandas.read_sql
    connection = db_work_view._db_connection
    df_clickstream = pn.read_csv('/home/ddimitrov/data/enwiki201608_unresolved_redirects/2016_08_clickstream_unresolved.tsv', sep='\t', error_bad_lines=False)
    # clickstream titles use underscores; DB article names use spaces
    df_clickstream['prev'] = df_clickstream['prev'].str.replace('_', ' ')
    df_clickstream['curr'] = df_clickstream['curr'].str.replace('_', ' ')
    df_clickstream['curr_unresolved'] = df_clickstream['curr_unresolved'].str.replace('_', ' ')
    df_redirects_candidates = pn.read_sql('select * from redirects_candidates_sample', connection)
    sample_unresoleved = pn.merge(df_redirects_candidates, df_clickstream, how='left', left_on=['source_article_name', 'target_article_name'], right_on=['prev', 'curr_unresolved'])
    # rows with no clickstream match get a transition count of 0
    sample_unresoleved['n'].fillna(0, inplace=True)
    sample_unresoleved.to_csv('/home/ddimitrov/data/enwiki201608_unresolved_redirects/data_unresolved.tsv', sep='\t', encoding="utf-8")
示例10: links_heatmap_rel_prob
# 需要导入模块: from wsd.database import MySQLDatabase [as 别名]
# 或者: from wsd.database.MySQLDatabase import get_work_view [as 别名]
def links_heatmap_rel_prob():
#http://stackoverflow.com/questions/2369492/generate-a-heatmap-in-matplotlib-using-a-scatter-data-set
# Get URLs from a text file, remove white space.
print 'loading'
db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
db_worker_view = db.get_work_view()
coords = db_worker_view.retrieve_all_links_coords()
x=[]
y=[]
page_lenghts = db_worker_view.retrieve_all_page_lengths()
for coord in coords:
x_normed = float(coord['x'])/float(1920)
y_normed = float(coord['y'])/float(page_lenghts[coord['source_article_id']])
if x_normed <=1.0 and y_normed <=1.0:
x.append(x_normed)
y.append(y_normed)
links_heatmap_hist, xedges, yedges = np.histogram2d(x, y, normed=True, bins=100)
links_extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]]
coords = db_worker_view.retrieve_all_links_coords_clicks()
print 'coord loaded'
links = {}
x = []
y = []
values = []
for coord in coords:
try:
v = links[coord['key']]
links[coord['key']]+=1
except:
links[coord['key']]=0
for coord in coords:
x_normed = float(coord['x'])/float(1920)
y_normed = float(coord['y'])/float(coord['page_length'])
if x_normed <=1.0 and y_normed <=1.0:
x.append(x_normed)
y.append(y_normed)
if links[coord['key']]==0:
#x.append(x_normed)
#y.append(y_normed)
values.append(float(coord['counts']))
else:
values.append(float(coord['counts'])/float(links[coord['key']]))
clicks_heatmap_hist, xedges, yedges = np.histogram2d(x, y, bins=100, normed=True, weights=values)
clicks_extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]]
substraction_hist = np.subtract(clicks_heatmap_hist,links_heatmap_hist)
#rel_prob_hist = np.divide(clicks_heatmap_hist, links_heatmap_hist)
with np.errstate(divide='ignore', invalid='ignore'):
rel_prob_hist = np.divide(clicks_heatmap_hist, links_heatmap_hist)
rel_prob_hist[rel_prob_hist == np.inf] = 0
rel_prob_hist = np.nan_to_num(rel_prob_hist)
fig_size = (2.4, 2)
plt.clf()
plt.figure(figsize=fig_size)
plt.grid(True)
plt.imshow(substraction_hist, extent=clicks_extent, origin='upper',norm=Normalize(), cmap=plt.get_cmap('jet'))
plt.colorbar()
plt.show()
plt.savefig('output/clicks-links_heatmap_normed_self_loop.pdf')
plt.clf()
plt.figure(figsize=fig_size)
plt.grid(True)
plt.imshow(rel_prob_hist , extent=clicks_extent, origin='upper', norm=Normalize(),cmap=plt.get_cmap('jet'))
plt.colorbar()
plt.show()
plt.savefig('output/clicks_over_links_heatmap_normed_self_loop.pdf')
plt.clf()
plt.figure(figsize=fig_size)
plt.grid(True)
plt.imshow(substraction_hist, extent=clicks_extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
plt.colorbar()
plt.show()
plt.savefig('output/clicks-links_heatmap_lognormed_self_loop.pdf')
#.........这里部分代码省略.........
示例11: multiple_links_heatmap
# 需要导入模块: from wsd.database import MySQLDatabase [as 别名]
# 或者: from wsd.database.MySQLDatabase import get_work_view [as 别名]
def multiple_links_heatmap():
print 'loading'
db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
db_worker_view = db.get_work_view()
coords = db_worker_view.retrieve_all_links_multpile_occ()
print 'coord loaded'
page_lenghts = db_worker_view.retrieve_all_page_lengths()
print 'lenghts loaded'
links = {}
x = []
y = []
x_conf = []
y_conf = []
x_not_conf = []
y_not_conf = []
number_of_not_confident_clicks=0
number_of_confident_clicks = 0
number_of_valid_normed_links=0
for coord in coords:
try:
v = links[coord['key']]
links[coord['key']]+=1
except:
links[coord['key']]=0
for coord in coords:
x_normed = float(coord['x'])/float(1920)
y_normed = float(coord['y'])/float(page_lenghts[coord['key'][0]])
if x_normed <=1.0 and y_normed <=1.0:
x.append(x_normed)
y.append(y_normed)
number_of_valid_normed_links+=1
if links[coord['key']]==0:
x_conf.append(x_normed)
y_conf.append(y_normed)
number_of_confident_clicks+=1
else:
x_not_conf.append(x_normed)
y_not_conf.append(y_normed)
number_of_not_confident_clicks+=1
print '###########'
print number_of_confident_clicks
print number_of_not_confident_clicks
print number_of_valid_normed_links
print len(coords)
print '###########'
heatmap, xedges, yedges = np.histogram2d(x_conf, y_conf, bins=100)
extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]]
fig_size = (2.4, 2)
#fig_size = (3.5, 3)
plt.clf()
plt.figure(figsize=fig_size)
plt.grid(True)
plt.imshow(heatmap, extent=extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
plt.colorbar()
#plt.title("Links Heatmap Log Normalized")
plt.show()
plt.savefig('output/links_heatmap_lognormed_self_loop_unique.pdf')
plt.clf()
plt.figure(figsize=fig_size)
plt.grid(True)
plt.imshow(heatmap , extent=extent, origin='upper', norm=Normalize(),cmap=plt.get_cmap('jet'))
plt.colorbar()
#plt.title("Links Heatmap Normalized")
plt.show()
plt.savefig('output/links_heatmap_normed_self_loop_unique.pdf')
print "unique done"
heatmap, xedges, yedges = np.histogram2d(x_not_conf, y_not_conf, bins=100)
extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]]
fig_size = (2.4, 2)
#fig_size = (3.5, 3)
plt.clf()
plt.figure(figsize=fig_size)
plt.grid(True)
plt.imshow(heatmap, extent=extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
plt.colorbar()
#plt.title("Links Heatmap Log Normalized")
plt.show()
plt.savefig('output/links_heatmap_lognormed_self_loop_multiple.pdf')
plt.clf()
plt.figure(figsize=fig_size)
plt.grid(True)
plt.imshow(heatmap , extent=extent, origin='upper', norm=Normalize(),cmap=plt.get_cmap('jet'))
plt.colorbar()
#plt.title("Links Heatmap Normalized")
#.........这里部分代码省略.........
示例12: MySQLDatabase
# 需要导入模块: from wsd.database import MySQLDatabase [as 别名]
# 或者: from wsd.database.MySQLDatabase import get_work_view [as 别名]
from wsd.database import MySQLDatabase
from graph_tool.all import *
from conf import *
__author__ = 'dimitrovdr'
# Build the transitions network from all internal clickstream transitions and
# annotate it with local clustering, PageRank, and eigenvector centrality.
db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
db_work_view = db.get_work_view()
wikipedia = Graph()
for link in db_work_view.retrieve_all_internal_transitions():
    wikipedia.add_edge(link['from'], link['to'])
# filter all nodes that have no edges
transitions_network = GraphView(wikipedia, vfilt=lambda v: v.out_degree() + v.in_degree() > 0)
print("clust")
transitions_network.vertex_properties["local_clust"] = local_clustering(transitions_network)
print("page_rank")
transitions_network.vertex_properties["page_rank"] = pagerank(transitions_network)
print("eigenvector_centr")
eigenvalue, eigenvectorcentr = eigenvector(transitions_network)
transitions_network.vertex_properties["eigenvector_centr"] = eigenvectorcentr