This article collects typical usage examples of the Python method nltk.util.ngrams. If you have been wondering what util.ngrams does, how to call it, and what real-world uses look like, the curated code examples below should help. You can also explore further examples from the containing module, nltk.util.
The following shows 15 code examples of the util.ngrams method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
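Before the examples, here is a minimal illustration of what nltk.util.ngrams itself returns (a lazy generator of tuples); the sample tokens are made up:

from nltk.util import ngrams

tokens = ["the", "quick", "brown", "fox"]
# ngrams() returns a generator of n-tuples; materialize it with list().
print(list(ngrams(tokens, 2)))
# [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]

# The padding flags add boundary symbols around the sequence.
print(list(ngrams(tokens, 2, pad_left=True, left_pad_symbol="<s>")))
# [('<s>', 'the'), ('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]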
Example 1: from_words
# Required import: from nltk.util import ngrams
# (also uses FreqDist from nltk.probability)
def from_words(cls, words, window_size=2):
    """Construct a BigramCollocationFinder for all bigrams in the given
    sequence. When window_size > 2, count non-contiguous bigrams, in the
    style of Church and Hanks's (1990) association ratio.
    """
    wfd = FreqDist()
    bfd = FreqDist()
    if window_size < 2:
        raise ValueError("Specify window_size at least 2")
    for window in ngrams(words, window_size, pad_right=True):
        w1 = window[0]
        if w1 is None:
            continue
        wfd[w1] += 1
        for w2 in window[1:]:
            if w2 is not None:
                bfd[(w1, w2)] += 1
    return cls(wfd, bfd, window_size=window_size)
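A quick usage sketch for the constructor above, assuming it is NLTK's BigramCollocationFinder; with window_size=3, non-adjacent pairs inside each window are counted too:

from nltk.collocations import BigramCollocationFinder

words = ["a", "b", "c", "a", "b"]
finder = BigramCollocationFinder.from_words(words, window_size=3)
# ("a", "c") is counted although "a" and "c" are never adjacent.
print(finder.ngram_fd[("a", "c")])  # 1
print(finder.ngram_fd[("a", "b")])  # 2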
Example 2: classify_text
# Required import: from nltk.util import ngrams
def classify_text(text, classifier, certainity, g, unicodeFlag):
    # 1. Process the text (Python 2 style: decode bytes if requested).
    if unicodeFlag:
        text = text.decode('utf-8')
    word_list = process_text(text, removePunct=True, removeSW=False, removeNum=False)
    # 2. Generate n-grams.
    mygrams = generate_ngrams(word_list, g)
    # 3. Generate features from the n-grams.
    feats = generate_features(mygrams)
    # 4. Classify.
    probs = classifier.prob_classify(feats)
    label = probs.max()
    if probs.prob(label) >= certainity:
        return label, probs.prob(label)
    return 'none', probs.prob(label)

###################################################################################
# Generates n-grams (g = number of grams).
# For example, if g=3, the function will generate unigrams, bigrams, and
# trigrams from the text.
Example 3: method6
# Required import: from nltk.util import ngrams
def method6(self, p_n, references, hypothesis, hyp_len, *args, **kwargs):
    """
    Smoothing method 6:
    Interpolates the maximum likelihood estimate of the precision *p_n* with
    a prior estimate *pi0*. The prior is estimated by assuming that the ratio
    between p_n and p_{n-1} will be the same as that between p_{n-1} and
    p_{n-2}; from Gao and He (2013) Training MRF-Based Phrase Translation
    Models using Gradient Ascent. In NAACL.
    """
    # This smoothing only works when p_1 and p_2 are non-zero.
    # Raise an error with an appropriate message when the input is too short
    # to use this smoothing technique.
    assert p_n[2], "This smoothing method requires non-zero precision for bigrams."
    for i, p_i in enumerate(p_n):
        if i in [0, 1]:  # Skip the first two orders of n-grams.
            continue
        else:
            pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2]
            # Number of n-grams in the translation that match the reference.
            m = p_i.numerator
            # Number of n-grams in the translation.
            l = sum(1 for _ in ngrams(hypothesis, i + 1))
            # Calculate the interpolated precision.
            p_n[i] = (m + self.alpha * pi0) / (l + self.alpha)
    return p_n
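This is the signature of NLTK's SmoothingFunction.method6, which is normally invoked indirectly through sentence_bleu. A minimal sketch with made-up sentences; note the hypothesis must share at least one trigram with a reference, or the assert above fires:

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

reference = [["the", "cat", "sat", "on", "the", "mat"]]
hypothesis = ["the", "cat", "sat", "on", "a", "mat"]
score = sentence_bleu(reference, hypothesis,
                      smoothing_function=SmoothingFunction().method6)
print(score)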
Example 4: is_needle_in_hay
# Required import: from nltk.util import ngrams
# (also uses SequenceMatcher from difflib)
def is_needle_in_hay(cls, needle, hay):
    needle_length = len(needle.split())
    max_sim_val = 0
    # Slide a window slightly longer than the needle across the hay.
    for ngram in ngrams(hay.split(), needle_length + int(.2 * needle_length)):
        hay_ngram = u" ".join(ngram)
        similarity = SequenceMatcher(None, hay_ngram, needle).ratio()
        if similarity > max_sim_val:
            max_sim_val = similarity
            max_sim_string = hay_ngram
    return max_sim_val  # how confident we are that the needle was found in the hay

# https://stackoverflow.com/a/31505798
# Given a string paragraph, return a list of sentences.
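A standalone variant of the matcher above, for illustration (fuzzy_find is a hypothetical name; the window is the needle length plus 20% slack, as in the original):

from difflib import SequenceMatcher
from nltk.util import ngrams

def fuzzy_find(needle, hay):
    needle_length = len(needle.split())
    window = needle_length + int(.2 * needle_length)
    # Best SequenceMatcher ratio over all windows of the hay.
    return max(
        (SequenceMatcher(None, u" ".join(g), needle).ratio()
         for g in ngrams(hay.split(), window)),
        default=0,
    )

print(fuzzy_find("quick brown fox", "the quick brown fox jumps over the lazy dog"))  # 1.0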
Example 5: __init__
# Required import: from nltk.util import ngrams
# (also uses defaultdict from collections and FreqDist, ConditionalFreqDist from nltk)
def __init__(self, n, vocabulary, unknown="<UNK>"):
    """
    n is the size of the ngram
    """
    if n < 1:
        raise ValueError("ngram size must be greater than or equal to 1")
    self.n = n
    self.unknown = unknown
    self.padding = {
        "pad_left": True,
        "pad_right": True,
        "left_pad_symbol": "<s>",
        "right_pad_symbol": "</s>",
    }
    self.vocabulary = vocabulary
    self.allgrams = defaultdict(ConditionalFreqDist)
    self.ngrams = FreqDist()
    self.unigrams = FreqDist()
Example 6: train_counts
# Required import: from nltk.util import ngrams
def train_counts(self, training_text):
    for sent in training_text:
        checked_sent = (self.check_against_vocab(word) for word in sent)
        sent_start = True
        for ngram in self.to_ngrams(checked_sent):
            self.ngrams[ngram] += 1
            context, word = tuple(ngram[:-1]), ngram[-1]
            if sent_start:
                # Count the left-padding context words once per sentence.
                for context_word in context:
                    self.unigrams[context_word] += 1
                sent_start = False
            # Count the word under its full context and each shorter
            # suffix context (orders n down to 2).
            for window, ngram_order in enumerate(range(self.n, 1, -1)):
                context = context[window:]
                self.allgrams[ngram_order][context][word] += 1
            self.unigrams[word] += 1
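train_counts calls a to_ngrams helper that is not included in this excerpt. A plausible reconstruction, assuming it simply applies nltk.util.ngrams with the padding settings from Example 5's __init__:

from nltk.util import ngrams

def to_ngrams(self, sequence):
    # Hypothetical: pad both ends of the sentence and emit ngrams of order n.
    return ngrams(sequence, self.n, **self.padding)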
Example 7: load_data
# Required import: from nltk.util import ngrams
# (also uses RegexpTokenizer from nltk.tokenize; corpus and Vol come from the
# surrounding project)
def load_data():
    global N, words
    raw = list(word
               for fileid in corpus.fileids()
               for word in corpus.words(fileid))
    words = list(token for token in RegexpTokenizer(r'\w+').tokenize(' '.join(raw)))[100:1000]
    tokens = set(words)
    tokens_l = list(tokens)
    N = len(tokens)
    print('Corpus size: {} words'.format(N))
    step = 4
    data = []
    # Each 4-gram yields an input vector with the three context words set
    # to 1, and the index of the fourth word as the label to predict.
    for gram in ngrams(words, step):
        w1, w2, w3, pred = gram
        V = Vol(1, 1, N, 0.0)
        V.w[tokens_l.index(w1)] = 1
        V.w[tokens_l.index(w2)] = 1
        V.w[tokens_l.index(w3)] = 1
        label = tokens_l.index(pred)
        data.append((V, label))
    return data
Example 8: extract_ngrams
# Required import: from nltk.util import ngrams
def extract_ngrams(self, text, sent_tokenizer, word_tokenizer, max_ngrams,
                   is_stopword, length_limiter):
    ngram_sets = {}
    sents = sent_tokenizer(text)
    tokens = []
    for sent in sents:
        tokens.extend([word.lower() for word in word_tokenizer(sent)])
    # Remove stopwords.
    tokens = [word for word in tokens if is_stopword(word) is False]
    tokens = length_limiter(tokens)
    for i in range(1, max_ngrams + 1):
        ngram_sets[i] = {}
        total = 0
        for ngram in ngrams(tokens, i):
            ngram_sets[i][ngram] = ngram_sets[i].get(ngram, 0) + 1
            total += 1
        ngram_sets[i][u"__TOTAL__"] = total
    return ngram_sets
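A hedged call sketch for extract_ngrams, wiring in NLTK's tokenizers and stopword list (the extractor object and the 100-token limit are assumptions; requires the punkt and stopwords data):

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

stop = set(stopwords.words("english"))
sets = extractor.extract_ngrams(
    "NLTK makes n-gram counting easy. Counting is fast.",
    sent_tokenizer=sent_tokenize,
    word_tokenizer=word_tokenize,
    max_ngrams=2,
    is_stopword=lambda w: w in stop,
    length_limiter=lambda toks: toks[:100],  # keep at most 100 tokens
)
print(sets[1][("counting",)])  # the unigram "counting" occurs twice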
Example 9: _apply_filter
# Required import: from nltk.util import ngrams
def _apply_filter(self, fn=lambda ngram, freq: False):
    """Generic filter: removes ngrams from the frequency distribution
    if the function returns True when passed an ngram tuple.
    """
    tmp_ngram = FreqDist()
    for ngram, freq in iteritems(self.ngram_fd):
        if not fn(ngram, freq):
            tmp_ngram[ngram] = freq
    self.ngram_fd = tmp_ngram
Example 10: apply_freq_filter
# Required import: from nltk.util import ngrams
def apply_freq_filter(self, min_freq):
    """Removes candidate ngrams which have frequency less than min_freq."""
    self._apply_filter(lambda ng, freq: freq < min_freq)
Example 11: apply_ngram_filter
# Required import: from nltk.util import ngrams
def apply_ngram_filter(self, fn):
    """Removes candidate ngrams (w1, w2, ...) where fn(w1, w2, ...)
    evaluates to True.
    """
    self._apply_filter(lambda ng, f: fn(*ng))
Example 12: apply_word_filter
# Required import: from nltk.util import ngrams
def apply_word_filter(self, fn):
    """Removes candidate ngrams (w1, w2, ...) where any of (fn(w1), fn(w2),
    ...) evaluates to True.
    """
    self._apply_filter(lambda ng, f: any(fn(w) for w in ng))
Example 13: nbest
# Required import: from nltk.util import ngrams
def nbest(self, score_fn, n):
    """Returns the top n ngrams when scored by the given function."""
    return [p for p, s in self.score_ngrams(score_fn)[:n]]
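Examples 9 through 13 are the filtering internals of NLTK's collocation finders; a typical end-to-end use of the public API (assumes the genesis corpus and stopword list have been fetched with nltk.download):

import nltk
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.corpus import stopwords

words = nltk.corpus.genesis.words("english-web.txt")
finder = BigramCollocationFinder.from_words(words)
finder.apply_freq_filter(3)  # drop bigrams seen fewer than 3 times
stop = set(stopwords.words("english"))
finder.apply_word_filter(lambda w: w.lower() in stop)
bigram_measures = BigramAssocMeasures()
print(finder.nbest(bigram_measures.pmi, 5))  # top 5 bigrams by PMI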
Example 14: sentence_bleu_4
# Required import: from nltk.util import ngrams
# (also uses Counter from collections and math)
def sentence_bleu_4(hyp, refs, weights=[0.25, 0.25, 0.25, 0.25]):
    # Input: a single hypothesis sentence and multiple references,
    # each given as a list of tokens.
    count = [0, 0, 0, 0]
    clip_count = [0, 0, 0, 0]
    r = 0
    c = 0
    for i in range(4):
        hypcnts = Counter(ngrams(hyp, i + 1))
        cnt = sum(hypcnts.values())
        count[i] += cnt
        # Compute clipped counts.
        max_counts = {}
        for ref in refs:
            refcnts = Counter(ngrams(ref, i + 1))
            for ng in hypcnts:
                max_counts[ng] = max(max_counts.get(ng, 0), refcnts[ng])
        clipcnt = dict((ng, min(count, max_counts[ng]))
                       for ng, count in hypcnts.items())
        clip_count[i] += sum(clipcnt.values())
    # Pick the reference length closest to the hypothesis length
    # for the brevity penalty.
    bestmatch = [1000, 1000]
    for ref in refs:
        if bestmatch[0] == 0:
            break
        diff = abs(len(ref) - len(hyp))
        if diff < bestmatch[0]:
            bestmatch[0] = diff
            bestmatch[1] = len(ref)
    r = bestmatch[1]
    c = len(hyp)
    p0 = 1e-7
    bp = math.exp(-abs(1.0 - float(r) / float(c + p0)))
    p_ns = [float(clip_count[i]) / float(count[i] + p0) + p0 for i in range(4)]
    s = math.fsum(w * math.log(p_n) for w, p_n in zip(weights, p_ns) if p_n)
    bleu_hyp = bp * math.exp(s)
    return bleu_hyp
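A quick smoke test for sentence_bleu_4; the token lists here are made up, and because of the epsilon smoothing above the score will be close to, but not identical to, NLTK's own sentence_bleu:

hyp = "the cat sat on a mat".split()
refs = ["the cat sat on the mat".split(),
        "a cat was sitting on the mat".split()]
print(sentence_bleu_4(hyp, refs))  # a float between 0 and 1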
Example 15: generate_ngrams
# Required import: from nltk.util import ngrams
def generate_ngrams(word_list, g):
    mygrams = []
    unigrams = [word for word in word_list]
    mygrams += unigrams
    # Append bigrams up through g-grams.
    for i in range(2, g + 1):
        mygrams += ngrams(word_list, i)
    return mygrams

###################################################################################
# Generates n-gram features in the form (n-gram, True), i.e., a binary feature
# indicating that the n-gram is present in the text.
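The trailing comment describes a generate_features helper that is not part of this excerpt. A minimal sketch of what it plausibly does, based on that description:

def generate_features(mygrams):
    # Hypothetical reconstruction: mark every n-gram as present; this is the
    # binary-feature dict shape that NLTK classifiers' prob_classify expects.
    return {gram: True for gram in mygrams}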