

Python MySQLDatabase._create_connection Method Code Examples

This article collects typical usage examples of the Python method wsd.database.MySQLDatabase._create_connection. If you are wondering how to call MySQLDatabase._create_connection or what it is used for, the curated examples below should help. You can also browse further usage examples of the containing class, wsd.database.MySQLDatabase.


The following shows 15 code examples of the MySQLDatabase._create_connection method, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
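Before the individual examples, the pattern they all share is worth spelling out: construct a MySQLDatabase from the connection settings in the project's conf module, call _create_connection() to obtain a connection, and then either work with a raw cursor or pass the connection to pandas.read_sql. The following is a minimal sketch of that pattern, not one of the collected examples; it assumes that conf defines DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD and DATABASE_NAME, and that _create_connection() returns a standard DB-API connection, as the examples below suggest.

# Minimal usage sketch (assumptions: conf provides the DATABASE_* constants,
# _create_connection() returns a DB-API connection, and the link_features
# table exists as in the examples below; the queries are illustrative only).
import pandas as pd

from wsd.database import MySQLDatabase
from conf import DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME

db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
conn = db._create_connection()
try:
    # Option 1: run ad-hoc SQL through a raw cursor.
    cursor = conn.cursor()
    cursor.execute('SELECT COUNT(*) FROM link_features;')
    print(cursor.fetchone()[0])

    # Option 2: hand the connection to pandas for DataFrame-based analysis.
    df = pd.read_sql('SELECT source_article_id, target_article_id FROM link_features LIMIT 10;', conn)
    print(df.head())
finally:
    conn.close()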

Example 1: pickle_vis_data_pandas

# Required import: from wsd.database import MySQLDatabase [as alias]
# Or: from wsd.database.MySQLDatabase import _create_connection [as alias]
def pickle_vis_data_pandas():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()


    df = pd.read_sql('select source_article_id, target_article_id, target_y_coord_1920_1080, target_x_coord_1920_1080, visual_region from link_features', conn)
    print len(df)

    no_dup = df.sort(['source_article_id','target_y_coord_1920_1080','target_x_coord_1920_1080']).groupby(["source_article_id", "target_article_id"]).first()
    print len(no_dup)

    feature = no_dup.loc[no_dup['visual_region']=='lead']
    print len(feature)
    feature.reset_index(inplace=True)
    # presumably intended: write the lead links as well (mirrors the other regions; lead.tsv is read in Example 13)
    feature[['source_article_id','target_article_id']].to_csv('/home/ddimitrov/tmp/lead.tsv', sep='\t', index=False)


    feature = no_dup.loc[no_dup['visual_region']=='infobox']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id','target_article_id']].to_csv('/home/ddimitrov/tmp/infobox.tsv', sep='\t', index=False)

    feature = no_dup.loc[no_dup['visual_region']=='navbox']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id','target_article_id']].to_csv('/home/ddimitrov/tmp/navbox.tsv', sep='\t', index=False)

    feature = no_dup.loc[no_dup['visual_region']=='left-body']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id','target_article_id']].to_csv('/home/ddimitrov/tmp/left-body.tsv', sep='\t',index=False)

    feature = no_dup.loc[no_dup['visual_region']=='body']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id','target_article_id']].to_csv('/home/ddimitrov/tmp/body.tsv', sep='\t',index=False)
Developer: trovdimi, Project: wikilinks, Lines: 37, Source: pickle_data.py

Example 2: build_links_position_table

# Required import: from wsd.database import MySQLDatabase [as alias]
# Or: from wsd.database.MySQLDatabase import _create_connection [as alias]
def build_links_position_table():
    """creates up the basic database structure
    """
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()

    cursor.execute('CREATE TABLE `redirects_candidates` ('
                      '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,'
                      '`source_article_id` BIGINT UNSIGNED NOT NULL,'
                      '`target_article_id` BIGINT UNSIGNED NULL,'
                      '`target_article_name` VARCHAR(1000) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,'
                      ' target_position_in_text INT UNSIGNED NOT NULL,'
                      ' target_position_in_text_only INT UNSIGNED,'
                      ' target_position_in_section INT UNSIGNED,'
                      ' target_position_in_section_in_text_only INT UNSIGNED,'
                      ' section_name VARCHAR(1000) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,'
                      ' section_number INT UNSIGNED,'
                      ' target_position_in_table INT UNSIGNED,'
                      ' table_number INT UNSIGNED,'
                      ' table_css_class VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin,'
                      ' table_css_style VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin,'
                      ' target_x_coord_1920_1080 INT UNSIGNED DEFAULT NULL,'
                      ' target_y_coord_1920_1080 INT UNSIGNED DEFAULT NULL ,'
                      'INDEX(`target_article_id`),'
                      'INDEX(`source_article_id`)'
                  ') ENGINE=InnoDB;')
    connection.close()
Developer: trovdimi, Project: wikilinks, Lines: 30, Source: startredirectsinserter.py

Example 3: pickle_correlations_zeros

# Required import: from wsd.database import MySQLDatabase [as alias]
# Or: from wsd.database.MySQLDatabase import _create_connection [as alias]
def pickle_correlations_zeros():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()

    print 'read'
    df = pd.read_sql('select source_article_id, target_article_id, IFNULL(counts, 0) as counts from link_features group by source_article_id, target_article_id', conn)
    print 'group'
    article_counts = df.groupby(by=["target_article_id"])['counts'].sum().reset_index()
    print 'write to file'
    article_counts[["target_article_id","counts"]].to_csv(TMP+'article_counts.tsv', sep='\t', index=False)
Developer: trovdimi, Project: wikilinks, Lines: 12, Source: weighted_pagerank.py

Example 4: build_page_length_table

# Required import: from wsd.database import MySQLDatabase [as alias]
# Or: from wsd.database.MySQLDatabase import _create_connection [as alias]
def build_page_length_table():
    """creates up the basic database structure
    """
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()

    cursor.execute('CREATE TABLE `redirects_candidates_page_length` ('
                      '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY,'
                      ' page_length_1920_1080 INT UNSIGNED DEFAULT NULL'
                  ') ENGINE=InnoDB;')
    connection.close()
Developer: trovdimi, Project: wikilinks, Lines: 14, Source: startredirectsinserter.py

Example 5: build_table

# Required import: from wsd.database import MySQLDatabase [as alias]
# Or: from wsd.database.MySQLDatabase import _create_connection [as alias]
def build_table():
    """creates up the basic database structure
    """
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()

    cursor.execute('CREATE TABLE `table_css_class` ('
                      '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,'
                      '`source_article_id` BIGINT UNSIGNED NOT NULL,'
                      ' css_class VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,'
                      'INDEX(`source_article_id`)'
                  ') ENGINE=InnoDB;')
    connection.close()
Developer: trovdimi, Project: wikilinks, Lines: 16, Source: tableclassinserter.py

Example 6: correlations

# Required import: from wsd.database import MySQLDatabase [as alias]
# Or: from wsd.database.MySQLDatabase import _create_connection [as alias]
def correlations(network_name):
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()
    # wikipedia  graph  structural statistics

    results = None
    try:
        results = cursor.execute('select c.curr_id,  sum(c.counts) as counts from clickstream_derived c where c.link_type_derived= %s  group by c.curr_id;', ("internal-link",))
        results = cursor.fetchall()


    except MySQLdb.Error, e:
        print ('error retrieving xy coord for all links %s (%d)' % (e.args[1], e.args[0]))
Developer: trovdimi, Project: wikilinks, Lines: 16, Source: weighted_pagerank.py

Example 7: rbo

# Required import: from wsd.database import MySQLDatabase [as alias]
# Or: from wsd.database.MySQLDatabase import _create_connection [as alias]
def rbo():
    print 'loading'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()
    sm = []
    try:
        cursor.execute('select curr_id, sum(counts) as counts_sum, curr_title from clickstream_derived where link_type_derived=%s group by curr_id order by counts_sum desc limit 10000;', ("entry-sm",))
        result = cursor.fetchall()
        for row in result:
            record = {}
            record['curr_id']= row[0]
            record['counts_sum'] = row[1]
            record['curr_title'] = row[2]
            sm.append(row[0])
    except MySQLdb.Error, e:
        print e
Developer: trovdimi, Project: wikilinks, Lines: 19, Source: rbo.py

Example 8: pickle_correlations_zeros_january

# Required import: from wsd.database import MySQLDatabase [as alias]
# Or: from wsd.database.MySQLDatabase import _create_connection [as alias]
def pickle_correlations_zeros_january():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()

    print 'read'
    df = pd.read_sql('select source_article_id, target_article_id from link_features', conn)
    print 'loaded links'
    df2 = pd.read_sql('select prev_id, curr_id, counts from clickstream_derived_en_201501  where link_type_derived= "internal-link";',  conn)
    print 'loaded counts'
    result = pd.merge(df, df2, how='left', left_on = ['source_article_id', 'target_article_id'], right_on = ['prev_id', 'curr_id'])
    print 'merged counts'
    print result
    article_counts = result.groupby(by=["target_article_id"])['counts'].sum().reset_index()
    article_counts['counts'].fillna(0.0, inplace=True)
    print article_counts
    print 'write to file'
    article_counts[["target_article_id","counts"]].to_csv(TMP+'january_article_counts.tsv', sep='\t', index=False)
Developer: trovdimi, Project: wikilinks, Lines: 19, Source: weighted_pagerank.py

Example 9: plot_degree_filtered_sql

# Required import: from wsd.database import MySQLDatabase [as alias]
# Or: from wsd.database.MySQLDatabase import _create_connection [as alias]
def plot_degree_filtered_sql():
    print 'before select'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()
    cursor.execute('SELECT source_article_id, target_article_id FROM link_occurences where source_article_id in '
                   ' (select distinct prev_id from clickstream_derived_internal_links);')
    result = cursor.fetchall()
    network = Graph()
    print 'after select'
    print 'result len'
    print len(result)

    for i, link in enumerate(result):
        if i % 1000000==0:
            print i, len(result)
        network.add_edge(link[0], link[1])

    # filter all nodes that have no edges
    print 'filter nodes with degree zero graph tool specific code'
    network = GraphView(network, vfilt=lambda v : v.out_degree()+v.in_degree()>0 )
    print 'before save'
    network.save("output/wikipedianetworkfilteredwithtransitions_prev_id.xml.gz")
    print 'done'

    cursor.execute('SELECT source_article_id, target_article_id FROM link_occurences where target_article_id in '
                   ' (select distinct curr_id from clickstream_derived_internal_links);')
    result = cursor.fetchall()
    network = Graph()
    print 'after select'
    print 'result len'
    print len(result)

    for i, link in enumerate(result):
        if i % 1000000==0:
            print i, len(result)
        network.add_edge(link[0], link[1])

    # filter all nodes that have no edges
    print 'filter nodes with degree zero graph tool specific code'
    network = GraphView(network, vfilt=lambda v : v.out_degree()+v.in_degree()>0 )
    print 'before save'
    network.save("output/wikipedianetworkfilteredwithtransitions_curr_id.xml.gz")
    print 'done'
Developer: linksuccess, Project: linksuccess, Lines: 46, Source: structural_statistics.py

Example 10: print_table

# Required import: from wsd.database import MySQLDatabase [as alias]
# Or: from wsd.database.MySQLDatabase import _create_connection [as alias]
def print_table():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()

    df = pd.read_sql('select source_article_id, target_article_id, rel_degree, rel_in_degree, rel_out_degree, '
                     'rel_page_rank, rel_kcore, target_x_coord_1920_1080, target_y_coord_1920_1080, visual_region, '
                     'IFNULL(counts, 0) as counts from link_features order by source_article_id, target_y_coord_1920_1080, target_x_coord_1920_1080', conn)

    print "dup"
    #no_dup = df.sort(['source_article_id','target_y_coord_1920_1080','target_x_coord_1920_1080']).groupby(["source_article_id", "target_article_id"]).first()
    no_dup = df.groupby(["source_article_id", "target_article_id"]).first()

    no_dup = no_dup.reset_index()
    print "no dup"
    del df
    #print no_dup
    df_top = pd.read_sql("select source_article_id, target_article_id, sim as topic_similarity  from topic_similarity", conn)
    print "no up"
    topDF = df_top.groupby("source_article_id", as_index=False)["topic_similarity"].median()
    #print topDF
    print "no up1"
    topDF.columns = ["source_article_id", "topic_similarity_article_median"]
    #print topDF
    print "no up2"
    df_top = df_top.merge(topDF, on="source_article_id")
    #print df_top[(df_top['topic_similarity_article_median'] >0)]
    print "no up3"

    df_sem = pd.read_sql("select source_article_id, target_article_id, sim as sem_similarity from semantic_similarity", conn)
    print "no up4"
    semDF = df_sem.groupby("source_article_id", as_index=False)["sem_similarity"].median()
    #rename
    print "no up5"
    semDF.columns = ["source_article_id", "sem_similarity_article_median"]
    print "no up6"
    #print df_top
    df_sem = df_sem.merge(semDF, on="source_article_id")
    #print len(df_sem)
    print "no up7"
    df1 = no_dup.merge(df_sem[['source_article_id', 'sem_similarity', 'sem_similarity_article_median']], on="source_article_id")
    #print no_dup
    del df_sem, semDF
    df = no_dup.merge(df_top[['source_article_id', 'topic_similarity', 'topic_similarity_article_median']], on="source_article_id")
    print "no up9"
    del no_dup
    del df_top, topDF

    table = ""

    table += resultTableLine (df, "src_degr > target_degr", "df.rel_degree > 0")
    table += resultTableLine (df, "src_degr <= target_degr", "df.rel_degree <= 0")


    table += resultTableLine (df, "src_in_degr > target_in_degr", "df.rel_in_degree > 0")
    table += resultTableLine (df, "src_in_degr <= target_in_degr", "df.rel_in_degree <= 0")


    table += resultTableLine (df, "src_out_degr > target_out_degr", "df.rel_out_degree > 0")
    table += resultTableLine (df, "src_out_degr <= target_out_degr", "df.rel_out_degree <= 0")

    table += resultTableLine (df, "src_kcore > target_kcore", "df.rel_kcore > 0")
    table += resultTableLine (df, "src_kcore <= target_kcore", "df.rel_kcore <= 0")

    table += resultTableLine (df, "src_page_rank > target_page_rank", "df.rel_page_rank > 0")
    table += resultTableLine (df, "src_page_rank <= target_page_rank", "df.rel_page_rank <= 0")


    table += resultTableLine (df1, "text_sim > median(text_sim) of page", "df.sem_similarity > df.sem_similarity_article_median")
    table += resultTableLine (df1, "text_sim <= median(text_sim) of page", "df.sem_similarity <= df.sem_similarity_article_median")

    table += resultTableLine (df, "topic_sim > median(topic_sim) of page", "df.topic_similarity > df.topic_similarity_article_median")
    table += resultTableLine (df, "topic_sim <= median(topic_sim) of page", "df.topic_similarity <= df.topic_similarity_article_median")


    table += resultTableLine (df, "left third of screen", "df.target_x_coord_1920_1080 <= 360")
    table += resultTableLine (df, "middle third of screen", "(df.target_x_coord_1920_1080 > 360) & (df.target_x_coord_1920_1080 <= 720)")
    table += resultTableLine (df, "right third of screen", "df.target_x_coord_1920_1080 > 720")

    table += resultTableLine (df, "position = lead", "df.visual_region == 'lead'")
    table += resultTableLine (df, "position = body", "(df.visual_region == 'body') | (df.visual_region == 'left-body')")
    table += resultTableLine (df, "position = navbox", "df.visual_region == 'navbox'")
    #table += resultTableLine (df, "position = left-body", "df.visual_region == 'left-body'")
    table += resultTableLine (df, "position = infobox", "df.visual_region == 'infobox'")


    print table
Developer: trovdimi, Project: wikilinks, Lines: 88, Source: empirlical_table.py

Example 11: MySQLDatabase

# Required import: from wsd.database import MySQLDatabase [as alias]
# Or: from wsd.database.MySQLDatabase import _create_connection [as alias]
from wsd.database import MySQLDatabase
from graph_tool.all import *
from conf import *


db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
conn = db._create_connection()
cursor = conn.cursor()
cursor.execute('SELECT source_article_id, target_article_id FROM link_occurences;')
result = cursor.fetchall()
wikipedia = Graph()

for link in result:
    wikipedia.add_edge(link[0], link[1])

# filter all nodes that have no edges
wikipedia = GraphView(wikipedia, vfilt=lambda v : v.out_degree()+v.in_degree()>0 )

print "clust"
wikipedia.vertex_properties["local_clust"] = local_clustering(wikipedia)

print "page_rank"
wikipedia.vertex_properties["page_rank"] = pagerank(wikipedia)

print "eigenvector_centr"
eigenvalue, eigenvectorcentr = eigenvector(wikipedia)
wikipedia.vertex_properties["eigenvector_centr"] = eigenvectorcentr

print "kcore"
wikipedia.vertex_properties["kcore"] = kcore_decomposition(wikipedia)
Developer: linksuccess, Project: linksuccess, Lines: 32, Source: createwikipedianetwork.py

Example 12: correlations_zeros

# Required import: from wsd.database import MySQLDatabase [as alias]
# Or: from wsd.database.MySQLDatabase import _create_connection [as alias]
def correlations_zeros(labels, consider_zeros=True, clickstream_data='', struct=False):
    #load network
    print struct
    name = '_'.join(labels)
    wikipedia = load_graph("output/weightedpagerank/wikipedianetwork_hyp_engineering_"+name+".xml.gz")
    #read counts with zeros
    if consider_zeros:
        article_counts  =  pd.read_csv(TMP+clickstream_data+'article_counts.tsv', sep='\t')
        print TMP+clickstream_data+'article_counts.tsv'
        correlations_weighted_pagerank = {}
        for label in labels:
            if struct:
                label = label[7:]
            for damping in [0.8,0.85,0.9]:
                key = label+"_page_rank_weighted_"+str(damping)
                pagerank = wikipedia.vertex_properties[key]
                page_rank_values = list()
                counts = list()
                correlations_values = {}
                for index, row in article_counts.iterrows():
                    counts.append(float(row['counts']))
                    page_rank_values.append(pagerank[wikipedia.vertex(int(row['target_article_id']))])
                print 'pearson'
                p = pearsonr(page_rank_values, counts)
                print p
                correlations_values['pearson']=p
                print 'spearmanr'
                s = spearmanr(page_rank_values, counts)
                print s
                correlations_values['spearmanr']=s
                print 'kendalltau'
                k = kendalltau(page_rank_values, counts)
                print k
                correlations_values['kendalltau']=k
                correlations_weighted_pagerank[key]=correlations_values

        write_pickle(HOME+'output/correlations/'+clickstream_data+'correlations_pagerank_'+name+'.obj', correlations_weighted_pagerank)
    else:
        db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
        conn = db._create_connection()
        cursor = conn.cursor()
        # wikipedia  graph  structural statistics

        results = None
        try:
            if clickstream_data != '':

                results = cursor.execute('select c.curr_id,  sum(c.counts) as counts from clickstream_derived c where c.link_type_derived= %s  group by c.curr_id;', ("internal-link",))
                results = cursor.fetchall()
            else:
                results = cursor.execute('select c.curr_id,  sum(c.counts) as counts from clickstream_derived_en_201501 c where c.link_type_derived= %s  group by c.curr_id;', ("internal-link",))
                results = cursor.fetchall()

        except MySQLdb.Error, e:
            print ('error retrieving xy coord for all links %s (%d)' % (e.args[1], e.args[0]))
        print 'after sql load'


        correlations_weighted_pagerank = {}
        for label in labels:
            if struct:
                label = label[7:]
            for damping in [0.8,0.85,0.9]:
                key = label+"_page_rank_weighted_"+str(damping)
                pagerank = wikipedia.vertex_properties[key]
                correlations={}
                counts=[]
                page_rank_values=[]
                for row in results:
                    counts.append(float(row[1]))
                    page_rank_values.append(pagerank[wikipedia.vertex(int(row[0]))])
                print 'pearson'
                p = pearsonr(page_rank_values, counts)
                print p
                correlations['pearson']=p
                print 'spearmanr'
                s= spearmanr(page_rank_values, counts)
                print s
                correlations['spearmanr']=s
                print 'kendalltau'
                k= kendalltau(page_rank_values, counts)
                print k
                correlations['kendalltau']=k
                correlations_weighted_pagerank[key]=correlations



        write_pickle(HOME+'output/correlations/'+clickstream_data+'correlations_pagerank_without_zeros'+name+'.obj', correlations_weighted_pagerank)
Developer: trovdimi, Project: wikilinks, Lines: 90, Source: weighted_pagerank.py

Example 13: weighted_pagerank_hyp_engineering

# Required import: from wsd.database import MySQLDatabase [as alias]
# Or: from wsd.database.MySQLDatabase import _create_connection [as alias]
def weighted_pagerank_hyp_engineering(labels):

    #read vocab, graph
    graph =  read_pickle(SSD_HOME+"pickle/graph")
    print "loaded graph"
    values =  read_pickle(SSD_HOME+"pickle/values")
    values_kcore = read_pickle(SSD_HOME+"pickle/values_kcore")

    # transform kcore values to model going out of the kcore
    values_kcore = [1./np.sqrt(float(x)) for x in values_kcore]
    print 'kcore values transformation'

    #sem_sim_hyp = read_pickle(SSD_HOME+"pickle/sem_sim_hyp")
    #print "sem_sim_hyp values"

    #lead_hyp = read_pickle(SSD_HOME+"pickle/lead_hyp")
    #infobox_hyp = read_pickle(SSD_HOME+"pickle/infobox_hyp")
    #left_body_hyp = read_pickle(SSD_HOME+"pickle/left-body_hyp")
    #print "gamma values"

    vocab = read_pickle(SSD_HOME+"pickle/vocab")
    print "loaded vocab"

    state_count = len(vocab)
    states = vocab.keys()
    shape = (state_count, state_count)


    hyp_structural = csr_matrix((values, (graph[0], graph[1])),
                                shape=shape, dtype=np.float)


    hyp_kcore = csr_matrix((values_kcore, (graph[0], graph[1])),
                           shape=shape, dtype=np.float)
    print "hyp_kcore"

    del graph
    del values_kcore

    print "after delete"


    #read sem sim form db and create hyp
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()

    print 'read'
    df = pd.read_sql('select source_article_id, target_article_id, sim from semantic_similarity', conn)
    print 'map sem sim'
    sem_sim_hyp_i = map_to_hyp_indicies(vocab, df['source_article_id'])
    sem_sim_hyp_j = map_to_hyp_indicies(vocab, df['target_article_id'])

    hyp_sem_sim = csr_matrix((df['sim'].values, (sem_sim_hyp_i, sem_sim_hyp_j)),
                             shape=shape, dtype=np.float)
    print 'done map sem sim'
    print hyp_sem_sim.shape
    del sem_sim_hyp_i
    del sem_sim_hyp_j
    del df

    #read vis form csv and create hyp
    lead = pd.read_csv(TMP+'lead.tsv',sep='\t')
    lead_i = map_to_hyp_indicies(vocab, lead['source_article_id'])
    lead_j = map_to_hyp_indicies(vocab, lead['target_article_id'])
    lead_v = np.ones(len(lead_i), dtype=np.float)
    
    hyp_lead = csr_matrix((lead_v, (lead_i, lead_j)),
                            shape=shape, dtype=np.float)
    print 'done map lead'
    print hyp_lead.shape
    del lead
    del lead_i
    del lead_j
    del lead_v

    infobox = pd.read_csv(TMP+'infobox.tsv',sep='\t')
    infobox_i = map_to_hyp_indicies(vocab, infobox['source_article_id'])
    infobox_j = map_to_hyp_indicies(vocab, infobox['target_article_id'])
    infobox_v = np.ones(len(infobox_i), dtype=np.float)

    hyp_infobox = csr_matrix((infobox_v, (infobox_i, infobox_j)),
                             shape=shape, dtype=np.float)
    print 'done map infobox'
    print hyp_infobox.shape
    del infobox
    del infobox_i
    del infobox_j
    del infobox_v

    left_body = pd.read_csv(TMP+'left-body.tsv',sep='\t')
    left_body_i = map_to_hyp_indicies(vocab, left_body['source_article_id'])
    left_body_j = map_to_hyp_indicies(vocab, left_body['target_article_id'])
    left_body_v = np.ones(len(left_body_i), dtype=np.float)

    hyp_left_body = csr_matrix((left_body_v, (left_body_i, left_body_j)),
                               shape=shape, dtype=np.float)
    print 'done map left-body'
    print hyp_left_body.shape
    del left_body
    del left_body_i
#......... part of the code omitted here .........
Developer: trovdimi, Project: wikilinks, Lines: 103, Source: weighted_pagerank.py

Example 14: weighted_pagerank

# Required import: from wsd.database import MySQLDatabase [as alias]
# Or: from wsd.database.MySQLDatabase import _create_connection [as alias]
def weighted_pagerank():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()
    cursor.execute('SELECT source_article_id, target_article_id, occ FROM link_occurences;')
    result = cursor.fetchall()
    wikipedia = Graph()
    eprop = wikipedia.new_edge_property("int")

    for link in result:
        e = wikipedia.add_edge(link[0], link[1])
        eprop[e] = link[2]
    # filter all nodes that have no edges
    wikipedia = GraphView(wikipedia, vfilt=lambda v : v.out_degree()+v.in_degree()>0 )


    print "page_rank_weighted"
    for damping in [0.8, 0.85, 0.9 ,0.95]:
        print damping
        key = "page_rank_weighted"+str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, weight=eprop,damping=damping)

    print "page_rank"
    for damping in [0.8, 0.85, 0.9 ,0.95]:
        print damping
        key = "page_rank"+str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, damping=damping)




    wikipedia.save("output/weightedpagerank/wikipedianetwork_link_occ.xml.gz")
    print 'link_occ done'


    cursor.execute('SELECT source_article_id, target_article_id, sim FROM semantic_similarity group by '
                   'source_article_id, target_article_id;')
    result = cursor.fetchall()
    wikipedia = Graph()
    eprop = wikipedia.new_edge_property("double")

    for link in result:
        e = wikipedia.add_edge(link[0], link[1])
        eprop[e] = link[2]
    # filter all nodes that have no edges
    print 'filter nodes graph tool specific code'
    wikipedia = GraphView(wikipedia, vfilt=lambda v : v.out_degree()+v.in_degree()>0 )


    print "page_rank_weighted"
    for damping in [0.8, 0.85, 0.9 ,0.95]:
        print damping
        key = "page_rank_weighted"+str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, weight=eprop,damping=damping)

    print "page_rank"
    for damping in [0.8, 0.85, 0.9 ,0.95]:
        print damping
        key = "page_rank"+str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, damping=damping)


    wikipedia.save("output/weightedpagerank/wikipedianetwork_sem_sim_distinct_links.xml.gz")
    print 'sem sim distinct links done'

    cursor.execute('SELECT source_article_id, target_article_id, sim FROM semantic_similarity;')
    result = cursor.fetchall()
    wikipedia = Graph()
    eprop = wikipedia.new_edge_property("double")

    for link in result:
        e = wikipedia.add_edge(link[0], link[1])
        eprop[e] = link[2]
    # filter all nodes that have no edges
    wikipedia = GraphView(wikipedia, vfilt=lambda v : v.out_degree()+v.in_degree()>0 )


    print "page_rank_weighted"
    for damping in [0.8, 0.85, 0.9 ,0.95]:
        print damping
        key = "page_rank_weighted"+str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, weight=eprop,damping=damping)

    print "page_rank"
    for damping in [0.8, 0.85, 0.9 ,0.95]:
        print damping
        key = "page_rank"+str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, damping=damping)

    wikipedia.save("output/weightedpagerank/wikipedianetwork_sem_sim.xml.gz")
    print 'sem_sim done'
Developer: trovdimi, Project: wikilinks, Lines: 93, Source: weighted_pagerank.py

Example 15: get_redirecsfromXML

# Required import: from wsd.database import MySQLDatabase [as alias]
# Or: from wsd.database.MySQLDatabase import _create_connection [as alias]
def get_redirecsfromXML(self, dump_date):
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    df = pd.read_sql('select * from redirects', conn)
    return df.set_index('source_article_name')['target_article_name'].to_dict()
Developer: trovdimi, Project: wikilinks, Lines: 7, Source: check.py


Note: The wsd.database.MySQLDatabase._create_connection method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub/MSDocs. The code snippets were selected from open-source projects contributed by various developers; copyright of the source code remains with the original authors, and distribution and use are subject to the corresponding project's license. Do not reproduce without permission.