

Python Tokenizer.gen_n_grams Method Code Examples

This article collects typical usage examples of the tokenizer.Tokenizer.gen_n_grams method in Python. If you are wondering what Tokenizer.gen_n_grams does, how to call it, or what real-world uses look like, the curated examples below may help. You can also explore further usage examples of the containing class, tokenizer.Tokenizer.


Two code examples of the Tokenizer.gen_n_grams method are shown below, sorted by popularity by default.
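Both examples are Python 2 code (print statements and the unicode type). As a quick orientation, the minimal sketch below shows the basic call pattern; the Tokenizer constructor signature (raw text passed to it directly) and the tuple form of the values yielded by gen_n_grams(n=2) are inferred from the examples on this page, and the sample text is hypothetical.

from tokenizer import Tokenizer

# Hypothetical sample text; the constructor and output format below are
# assumptions based on the usage shown in the two examples that follow.
text = "the quick brown fox jumps over the lazy dog"

tokenizer = Tokenizer(text)

# gen_n_grams(n=2) appears to yield adjacent-token bi-grams as tuples,
# e.g. ('the', 'quick'), ('quick', 'brown'), ...
for bi_gram in tokenizer.gen_n_grams(n=2):
    print(bi_gram)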

Example 1: extract_grams

# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import gen_n_grams [as alias]
# Additional imports used by this excerpt; helpers such as get_token_dict,
# get_db_context, delete_low_counts and pickle_bi_gram_freqs are defined
# elsewhere in extract_bi_grams.py.
import time
from collections import defaultdict
from pymongo import MongoClient
from tokenizer import Tokenizer

def extract_grams():

    client = MongoClient()
    db = client.news_tfidf

    select_strs = [
        ("se" ,"select ID, story from swift_excel_articles;"),
        ("ne", "select ID, body from nsc_excel_articles;"),
        ("sw", '''select sw.ID, sw.story 
                from swift_articles sw left join swift_excel_articles se on sw.ID = se.swiftID 
                where se.swiftID is null and in_sample=1;'''),
        ("ns", '''select ns.ID, concat(ns.lead_parag, ' ', ns.body) 
                from nsc_articles ns left join nsc_excel_articles ne on ns.ID = ne.nscID 
                where ne.nscID is null and in_sample=1;''')
    ]

    token_ids = get_token_dict(db)
    bi_gram_freqs = defaultdict(int)
    
    t = time.time()
    st = time.time()
    row_count = 0
    
    for source_id, query in select_strs:

        print query

        cnx = get_db_context()
        select_cur = cnx.cursor()
        select_cur.execute(query)
        
        for article_id, article, in select_cur:

            row_count += 1
            if row_count % 10000 == 0:
                print 'processed', row_count, 'rows in', (time.time() - t) / 60, 'minutes'
                t = time.time()
                delete_low_counts(bi_gram_freqs)
                
            if type(article) in (str, unicode) and len(article) > 0:
            
                tokenizer = Tokenizer(article)
                bi_gram_gen = tokenizer.gen_n_grams(n=2)
                counted = set()
                
                for bi_gram in bi_gram_gen:
                    
                    bg_id_tup = (token_ids[bi_gram[0]], token_ids[bi_gram[1]])
                    
                    if bg_id_tup not in counted:
                        bi_gram_freqs[bg_id_tup] += 1
                        counted.add(bg_id_tup)

            if len(bi_gram_freqs) > 1000000:
                pickle_bi_gram_freqs(bi_gram_freqs, source_id)
                bi_gram_freqs = defaultdict(int)                
                        
        pickle_bi_gram_freqs(bi_gram_freqs, source_id)
        bi_gram_freqs = defaultdict(int)
        select_cur.close()
        cnx.close()      

    print 'parsing time: ', (time.time() - st) / 60, 'minutes'
    return bi_gram_freqs
Developer: jperelshteyn, Project: tr_challenge, Lines of code: 68, Source file: extract_bi_grams.py

Example 2: extract_grams

# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import gen_n_grams [as alias]
# Additional imports used by this excerpt; helpers such as gen_ids, get_token_dict,
# get_db_context and insert_bi_gram_freqs are defined elsewhere in process_grams.py.
import time
from pymongo import MongoClient
from tokenizer import Tokenizer

def extract_grams():

    client = MongoClient()
    db = client.news_tfidf

    select_strs = [
        # ("se" ,"select ID, story from swift_excel_articles;"),
        # ("ne", "select ID, body from nsc_excel_articles;"),
        ("sw", "select sw.ID, sw.story from swift_articles sw left join swift_excel_articles se on sw.ID = se.swiftID where se.swiftID is null;"),
        ("ns", "select ns.ID, concat(ns.lead_parag, ' ', ns.body) from nsc_articles ns left join nsc_excel_articles ne on ns.ID = ne.nscID where ne.nscID  is null;")
    ]
    
    id_gen = gen_ids()

    token_ids = get_token_dict(db)

    # token_freqs = dict()
    bi_gram_freqs = dict()
    
    doc_counts = list()
    
    t = time.time()

    row_count = 0
    cnx = get_db_context()
    
    def incr_token_dict(t_dict, token):
        if token in t_dict:
            t_dict[token]['c'] += 1
        else:
            t_dict[token] = {'i': next(id_gen), 't': token, 'c': 1}

    
    def incr_bi_gram_dict(bg_dict, bg, source_id=None):
        if bg in bg_dict:
            bg_dict[bg]['c'] += 1
        else:
            if source_id:
                bg_dict[bg] = {'g1': bg[0], 'g2': bg[1], 'c': 1, 'source_id': source_id}
            else:
                bg_dict[bg] = {'g1': bg[0], 'g2': bg[1], 'c': 1}

    
    for source_id, query in select_strs:

        print query

        cnx = get_db_context()
        select_cur = cnx.cursor()
        select_cur.execute(query)
        
        for article_id, article, in select_cur:

            row_count += 1
            if row_count % 5000 == 0:
                print 'processed', row_count, 'rows in', (time.time() - t) / 60, 'minutes'
                t = time.time()

            if type(article) in (str, unicode) and len(article) > 0:
            
                tokenizer = Tokenizer(article)
                # token_gen = tokenizer.gen_tokens()
                bi_gram_gen = tokenizer.gen_n_grams(n=2)
                
                # token_doc_freqs = dict()
                bi_gram_doc_freqs = dict()
                
                # for token in token_gen:
                    
                #     incr_token_dict(token_freqs, token)
                #     incr_token_dict(token_doc_freqs, token)

                for bi_gram in bi_gram_gen:
                    
                    gram1_id = token_ids[bi_gram[0]]
                    gram2_id = token_ids[bi_gram[1]]

                    incr_bi_gram_dict(bi_gram_doc_freqs, (gram1_id, gram2_id))
                    incr_bi_gram_dict(bi_gram_freqs, (gram1_id, gram2_id), source_id)
                    
                doc_counts.append(
                    {
                        'sql_id': article_id,
                        'sql_tbl_id': source_id,
                        # 't_counts': token_doc_freqs.values(),
                        'bg_counts': bi_gram_doc_freqs.values()
                    }
                )
                
                if len(doc_counts) > 1000:
                    db_t = time.time()
                    print 'updating db...'
                    db.doc_freq.insert_many(doc_counts)
                    doc_counts = list()
                    
#                         update_doc_bg_counts(db, doc_counts)
#                         doc_counts = list()        

                    insert_bi_gram_freqs(db, source_id, bi_gram_freqs)
                    bi_gram_freqs = dict()
#......... part of the code omitted here .........
Developer: jperelshteyn, Project: tr_challenge, Lines of code: 103, Source file: process_grams.py


Note: the tokenizer.Tokenizer.gen_n_grams examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their authors; copyright remains with the original authors, and distribution and use should follow the corresponding project's license. Please do not reproduce without permission.