本文整理汇总了 Python 中 corpus.Corpus.remove_ngram 方法的典型用法代码示例。如果您正苦于以下问题:Python Corpus.remove_ngram 方法的具体用法?Python Corpus.remove_ngram 怎么用?Python Corpus.remove_ngram 使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 corpus.Corpus 的用法示例。
在下文中一共展示了 Corpus.remove_ngram 方法的 1 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: __init__
# Required import: from corpus import Corpus [as alias]
# Or: from corpus.Corpus import remove_ngram [as alias]
class BiCorpus:
def __init__(self, backup=False, int_tokens=False):
    """Set up an empty bilingual corpus made of two ``Corpus`` halves.

    Args:
        backup: forwarded to both ``Corpus`` constructors and also kept on
            the instance as ``_backup`` (``write`` hands it to
            ``get_tokens``).
        int_tokens: forwarded to both ``Corpus`` constructors.
    """
    corpus_args = (backup, int_tokens)
    # Source side first, then target side -- same construction order and
    # same positional arguments for both halves.
    self._src = Corpus(*corpus_args)
    self._tgt = Corpus(*corpus_args)
    self._backup = backup
def write(self, out):
    """Write the corpus to ``out``, one tab-separated sentence pair per line.

    For every sentence index the source and target sentences are fetched,
    their tokens are obtained with ``get_tokens(self._backup)``, passed
    through the owning corpus' ``ints_to_tokens`` and joined with single
    spaces. Each output line has the form ``"<source>\\t<target>\\n"``.

    Args:
        out: any object with a ``write(str)`` method, e.g. an open text
            file.
    """
    # ``range`` rather than the Python-2-only ``xrange`` so the method also
    # runs on Python 3. Index-based access is kept because only ``len()``
    # and ``[]`` are known to be supported by the corpora.
    for sen_i in range(len(self._src)):
        src_sen, tgt_sen = self._src[sen_i], self._tgt[sen_i]
        src_str = " ".join(self._src.ints_to_tokens(src_sen.get_tokens(self._backup)))
        tgt_str = " ".join(self._tgt.ints_to_tokens(tgt_sen.get_tokens(self._backup)))
        out.write("{0}\t{1}\n".format(src_str, tgt_str))
def add_sentence_pair(self, pair):
    """Append one aligned sentence pair to the two corpora.

    Args:
        pair: a two-element iterable ``(source_sentence, target_sentence)``.
            It is unpacked before anything is appended, so a wrongly sized
            ``pair`` raises without modifying either side.
    """
    source_sentence, target_sentence = pair
    self._src.append(source_sentence)
    self._tgt.append(target_sentence)
def create_cache(self):
    """Reset both cache attributes to fresh, empty containers.

    After the call ``self._coocc_cache`` is a pair of ``defaultdict(int)``
    and ``self.interesting`` is a pair of ``defaultdict(dict)``. Any
    previous values of the two attributes are deleted first.
    """
    # Drop the old containers before allocating the replacements.
    for stale_attr in ("_coocc_cache", "interesting"):
        if hasattr(self, stale_attr):
            delattr(self, stale_attr)

    self._coocc_cache = (defaultdict(int), defaultdict(int))
    self.interesting = (defaultdict(dict), defaultdict(dict))
def build_cache(self):
    """Rebuild the co-occurrence cache from every sentence pair.

    Steps: reset the cache with ``create_cache``, feed each aligned
    sentence pair to ``add_sentence_pair_to_cache``, then prune the result
    with ``filter_interesting_pairs``. Progress is logged at DEBUG level
    whenever the integer percentage of processed sentences increases.

    The garbage collector is switched off while the cache is built and is
    switched back on afterwards, including when building fails with an
    exception.
    """
    logging.info("Buildind cache...")
    sentence_count = len(self._src)
    gc.disable()
    try:
        self.create_cache()
        # ``range`` (not the Python-2-only ``xrange``) keeps this runnable
        # on Python 3.
        for sen_i in range(sentence_count):
            # Floor division on purpose: with true division (Python 3 ``/``)
            # the comparison below would hold for every sentence and the
            # message would carry a fractional percentage.
            percent = sen_i * 100 // sentence_count
            if percent > (sen_i - 1) * 100 // sentence_count:
                logging.debug("{0}% done".format(percent))
            self.add_sentence_pair_to_cache(self._src[sen_i], self._tgt[sen_i])
        self.filter_interesting_pairs()
    finally:
        # Never leave the process with garbage collection disabled.
        gc.enable()
    logging.info("Buildind cache done")
def add_sentence_pair_to_cache(self, src, tgt):
    """Count every (source token, target token) combination of one pair.

    For each token of ``src`` combined with each token of ``tgt`` the
    counter ``self.interesting[0][src_token][tgt_token]`` and its mirror
    ``self.interesting[1][tgt_token][src_token]`` are increased by one,
    starting from 1 on first sight.

    Args:
        src: iterable of source-side tokens.
        tgt: iterable of target-side tokens; it is iterated once per
            source token.
    """
    for source_token in src:
        for target_token in tgt:
            # The tables are looked up inside the inner loop so that
            # nothing is touched when either side is empty.
            forward = self.interesting[0][source_token]
            forward[target_token] = forward.get(target_token, 0) + 1

            backward = self.interesting[1][target_token]
            backward[source_token] = backward.get(source_token, 0) + 1
def filter_interesting_pairs(self, max_per_word=10):
    """Keep only the most frequent partners of every cached token.

    In both tables of ``self.interesting`` (index 0 and index 1) each
    token's partner-count mapping is replaced by a plain ``dict`` holding
    at most ``max_per_word`` entries, chosen by descending count.

    Args:
        max_per_word: maximum number of partners kept per token.
    """
    logging.info("Filtering interesting pairs...")
    for table in (self.interesting[0], self.interesting[1]):
        # Only values of existing keys are reassigned, so iterating the
        # table while updating it is safe.
        for word in table:
            # ``items()`` instead of the Python-2-only ``iteritems()`` so
            # the method also works on Python 3.
            ranked = sorted(table[word].items(), key=lambda item: item[1], reverse=True)
            table[word] = dict(ranked[:max_per_word])
    logging.info("Filtering interesting pairs done")
def ngram_pair_context(self, pair, max_len=None):
    """Collect the contexts in which a source/target n-gram pair occurs.

    Sentence indices are obtained from ``ngram_index`` of each corpus and
    split into three groups: sentences containing both n-grams, only the
    source n-gram, and only the target n-gram.

    Args:
        pair: ``(src, tgt)`` n-grams. When ``max_len`` is given, both must
            support ``len()``.
        max_len: ``None`` to collect whole sentence pairs; otherwise the
            maximum number of tokens kept on each side of an occurrence.

    Returns:
        A 3-tuple of lists ``(both, src_only, tgt_only)``. With
        ``max_len=None`` every item is ``(src_sentence, tgt_sentence)``.
        Otherwise one item is produced per combination of a source
        occurrence and a target occurrence in the same sentence pair, of
        the form ``((src_left, src_right), (tgt_left, tgt_right))``.
    """
    src, tgt = pair

    def _window(sentence, start, ngram_len):
        # Assumes ``ngram_positions`` yields start indices -- TODO confirm.
        # The right-hand context begins after the whole n-gram; starting it
        # at ``start + 1`` would put the n-gram's own tail into the context
        # for n-grams longer than one token.
        left = max(0, start - max_len)
        right = min(len(sentence), start + ngram_len + max_len)
        return sentence[left:start], sentence[start + ngram_len:right]

    def _collect(sentence_ids, inserter):
        for sen_index in sentence_ids:
            src_sen = self._src[sen_index]
            tgt_sen = self._tgt[sen_index]
            if max_len is None:
                inserter((src_sen, tgt_sen))
                continue
            src_positions = src_sen.ngram_positions(src)
            tgt_positions = tgt_sen.ngram_positions(tgt)
            # NOTE(review): for the src-only / tgt-only groups one of the
            # position lists is presumably empty, so nothing is collected
            # for them when ``max_len`` is set -- confirm this is intended.
            for src_pos in src_positions:
                for tgt_pos in tgt_positions:
                    inserter((
                        _window(src_sen, src_pos, len(src)),
                        _window(tgt_sen, tgt_pos, len(tgt)),
                    ))

    src_occ = self._src.ngram_index(src)
    tgt_occ = self._tgt.ngram_index(tgt)
    coocc = src_occ & tgt_occ
    context = [], [], []
    _collect(coocc, context[0].append)
    _collect(src_occ - coocc, context[1].append)
    _collect(tgt_occ - coocc, context[2].append)
    return context
def remove_ngram_pairs(self, pairs):
"""
this method removes ngram pairs from corpora
input is a list because if it were only a pair,
indices can get corrupted
cause:
- while they are ngrams, ngram occurences have to be removed
from cache, not all per token
#.........这里部分代码省略.........