This page collects typical usage examples of the Python method tokenizer.Tokenizer.gen_n_grams. If you are wondering how Tokenizer.gen_n_grams is used in practice, the curated code examples below may help. You can also look further into the containing class, tokenizer.Tokenizer, for additional context.
Two code examples of Tokenizer.gen_n_grams are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
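Before the full examples, here is a minimal usage sketch (not taken from the project itself): it assumes the project's local tokenizer module is importable and that each item yielded by gen_n_grams(n=2) is an indexable pair of adjacent tokens, as the examples below suggest.

# Minimal sketch, assuming `tokenizer` is the project's local module and that
# gen_n_grams(n=2) yields indexable pairs of adjacent tokens (as in the examples below).
from tokenizer import Tokenizer

text = "the quick brown fox jumps over the lazy dog"
tokenizer = Tokenizer(text)

# Collect the bi-grams as (first_token, second_token) tuples.
bi_grams = [(bg[0], bg[1]) for bg in tokenizer.gen_n_grams(n=2)]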
Example 1: extract_grams
# Required import: from tokenizer import Tokenizer [as alias]
# Or alternatively: from tokenizer.Tokenizer import gen_n_grams [as alias]
def extract_grams():
    # Accumulate document-level bi-gram frequencies across several article
    # tables, pickling and resetting the counts periodically to bound memory.
    client = MongoClient()
    db = client.news_tfidf
    select_strs = [
        ("se", "select ID, story from swift_excel_articles;"),
        ("ne", "select ID, body from nsc_excel_articles;"),
        ("sw", '''select sw.ID, sw.story
                  from swift_articles sw left join swift_excel_articles se on sw.ID = se.swiftID
                  where se.swiftID is null and in_sample=1;'''),
        ("ns", '''select ns.ID, concat(ns.lead_parag, ' ', ns.body)
                  from nsc_articles ns left join nsc_excel_articles ne on ns.ID = ne.nscID
                  where ne.nscID is null and in_sample=1;''')
    ]
    token_ids = get_token_dict(db)
    bi_gram_freqs = defaultdict(int)
    t = time.time()
    st = time.time()
    row_count = 0
    for source_id, query in select_strs:
        print query
        cnx = get_db_context()
        select_cur = cnx.cursor()
        select_cur.execute(query)
        for article_id, article in select_cur:
            row_count += 1
            if row_count % 10000 == 0:
                print 'processed', row_count, 'rows in', (time.time() - t) / 60, 'minutes'
                t = time.time()
            if row_count % 10000 == 0:
                delete_low_counts(bi_gram_freqs)
            if type(article) in (str, unicode) and len(article) > 0:
                tokenizer = Tokenizer(article)
                bi_gram_gen = tokenizer.gen_n_grams(n=2)
                # Count each distinct bi-gram at most once per article
                # (document frequency rather than raw frequency).
                counted = set()
                for bi_gram in bi_gram_gen:
                    bg_id_tup = (token_ids[bi_gram[0]], token_ids[bi_gram[1]])
                    if bg_id_tup not in counted:
                        bi_gram_freqs[bg_id_tup] += 1
                        counted.add(bg_id_tup)
            if len(bi_gram_freqs) > 1000000:
                pickle_bi_gram_freqs(bi_gram_freqs, source_id)
                bi_gram_freqs = defaultdict(int)
        pickle_bi_gram_freqs(bi_gram_freqs, source_id)
        bi_gram_freqs = defaultdict(int)
        select_cur.close()
        cnx.close()
    print 'parsing time: ', (time.time() - st) / 60, 'minutes'
    return bi_gram_freqs
Example 2: extract_grams
# Required import: from tokenizer import Tokenizer [as alias]
# Or alternatively: from tokenizer.Tokenizer import gen_n_grams [as alias]
def extract_grams():
    # Variant that stores per-document bi-gram counts in MongoDB in batches
    # while also accumulating per-source bi-gram frequencies.
    client = MongoClient()
    db = client.news_tfidf
    select_strs = [
        # ("se", "select ID, story from swift_excel_articles;"),
        # ("ne", "select ID, body from nsc_excel_articles;"),
        ("sw", "select sw.ID, sw.story from swift_articles sw left join swift_excel_articles se on sw.ID = se.swiftID where se.swiftID is null;"),
        ("ns", "select ns.ID, concat(ns.lead_parag, ' ', ns.body) from nsc_articles ns left join nsc_excel_articles ne on ns.ID = ne.nscID where ne.nscID is null;")
    ]
    id_gen = gen_ids()
    token_ids = get_token_dict(db)
    # token_freqs = dict()
    bi_gram_freqs = dict()
    doc_counts = list()
    t = time.time()
    row_count = 0
    cnx = get_db_context()

    def incr_token_dict(t_dict, token):
        if token in t_dict:
            t_dict[token]['c'] += 1
        else:
            t_dict[token] = {'i': next(id_gen), 't': token, 'c': 1}

    def incr_bi_gram_dict(bg_dict, bg, source_id=None):
        if bg in bg_dict:
            bg_dict[bg]['c'] += 1
        else:
            if source_id:
                bg_dict[bg] = {'g1': bg[0], 'g2': bg[1], 'c': 1, 'source_id': source_id}
            else:
                bg_dict[bg] = {'g1': bg[0], 'g2': bg[1], 'c': 1}

    for source_id, query in select_strs:
        print query
        cnx = get_db_context()
        select_cur = cnx.cursor()
        select_cur.execute(query)
        for article_id, article in select_cur:
            row_count += 1
            if row_count % 5000 == 0:
                print 'processed', row_count, 'rows in', (time.time() - t) / 60, 'minutes'
                t = time.time()
            if type(article) in (str, unicode) and len(article) > 0:
                tokenizer = Tokenizer(article)
                # token_gen = tokenizer.gen_tokens()
                bi_gram_gen = tokenizer.gen_n_grams(n=2)
                # token_doc_freqs = dict()
                bi_gram_doc_freqs = dict()
                # for token in token_gen:
                #     incr_token_dict(token_freqs, token)
                #     incr_token_dict(token_doc_freqs, token)
                for bi_gram in bi_gram_gen:
                    gram1_id = token_ids[bi_gram[0]]
                    gram2_id = token_ids[bi_gram[1]]
                    incr_bi_gram_dict(bi_gram_doc_freqs, (gram1_id, gram2_id))
                    incr_bi_gram_dict(bi_gram_freqs, (gram1_id, gram2_id), source_id)
                doc_counts.append(
                    {
                        'sql_id': article_id,
                        'sql_tbl_id': source_id,
                        # 't_counts': token_doc_freqs.values(),
                        'bg_counts': bi_gram_doc_freqs.values()
                    }
                )
            if len(doc_counts) > 1000:
                db_t = time.time()
                print 'updating db...'
                db.doc_freq.insert_many(doc_counts)
                doc_counts = list()
                # update_doc_bg_counts(db, doc_counts)
                # doc_counts = list()
        insert_bi_gram_freqs(db, source_id, bi_gram_freqs)
        bi_gram_freqs = dict()
#.........the remainder of this code is omitted here.........