This article collects typical usage examples of the Python method nltk.util.ngrams. If you have been wondering what util.ngrams does, how to call it, and what real-world uses look like, the curated code examples below should help. You can also explore further examples from the containing module, nltk.util.
The following shows 15 code examples of the util.ngrams method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
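Before the examples, here is a minimal illustration of what nltk.util.ngrams itself returns (a lazy generator of tuples); the sample tokens are made up:

from nltk.util import ngrams

tokens = ["the", "quick", "brown", "fox"]
# ngrams() returns a generator of n-tuples; materialize it with list().
print(list(ngrams(tokens, 2)))
# [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]

# The padding flags add boundary symbols around the sequence.
print(list(ngrams(tokens, 2, pad_left=True, left_pad_symbol="<s>")))
# [('<s>', 'the'), ('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]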
Example 1: from_words
# Required import: from nltk.util import ngrams
# (also uses FreqDist from nltk.probability)
def from_words(cls, words, window_size=2):
    """Construct a BigramCollocationFinder for all bigrams in the given
    sequence. When window_size > 2, count non-contiguous bigrams, in the
    style of Church and Hanks's (1990) association ratio.
    """
    wfd = FreqDist()
    bfd = FreqDist()
    if window_size < 2:
        raise ValueError("Specify window_size at least 2")
    for window in ngrams(words, window_size, pad_right=True):
        w1 = window[0]
        if w1 is None:
            continue
        wfd[w1] += 1
        for w2 in window[1:]:
            if w2 is not None:
                bfd[(w1, w2)] += 1
    return cls(wfd, bfd, window_size=window_size)
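A quick usage sketch for the constructor above, assuming it is NLTK's BigramCollocationFinder; with window_size=3, non-adjacent pairs inside each window are counted too:

from nltk.collocations import BigramCollocationFinder

words = ["a", "b", "c", "a", "b"]
finder = BigramCollocationFinder.from_words(words, window_size=3)
# ("a", "c") is counted although "a" and "c" are never adjacent.
print(finder.ngram_fd[("a", "c")])  # 1
print(finder.ngram_fd[("a", "b")])  # 2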
Example 2: classify_text
# Required import: from nltk.util import ngrams
def classify_text(text, classifier, certainity, g, unicodeFlag):
    # 1. Process the text (Python 2 style: decode bytes if requested).
    if unicodeFlag:
        text = text.decode('utf-8')
    word_list = process_text(text, removePunct=True, removeSW=False, removeNum=False)
    # 2. Generate n-grams.
    mygrams = generate_ngrams(word_list, g)
    # 3. Generate features from the n-grams.
    feats = generate_features(mygrams)
    # 4. Classify.
    probs = classifier.prob_classify(feats)
    label = probs.max()
    if probs.prob(label) >= certainity:
        return label, probs.prob(label)
    return 'none', probs.prob(label)

###################################################################################
# Generates n-grams (g = number of grams).
# For example, if g=3, the function will generate unigrams, bigrams, and
# trigrams from the text.
Example 3: method6
# Required import: from nltk.util import ngrams
def method6(self, p_n, references, hypothesis, hyp_len, *args, **kwargs):
    """
    Smoothing method 6:
    Interpolates the maximum likelihood estimate of the precision *p_n* with
    a prior estimate *pi0*. The prior is estimated by assuming that the ratio
    between p_n and p_{n-1} will be the same as that between p_{n-1} and
    p_{n-2}; from Gao and He (2013) Training MRF-Based Phrase Translation
    Models using Gradient Ascent. In NAACL.
    """
    # This smoothing only works when p_1 and p_2 are non-zero.
    # Raise an error with an appropriate message when the input is too short
    # to use this smoothing technique.
    assert p_n[2], "This smoothing method requires non-zero precision for bigrams."
    for i, p_i in enumerate(p_n):
        if i in [0, 1]:  # Skip the first two orders of n-grams.
            continue
        else:
            pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2]
            # Number of n-grams in the translation that match the reference.
            m = p_i.numerator
            # Number of n-grams in the translation.
            l = sum(1 for _ in ngrams(hypothesis, i + 1))
            # Calculate the interpolated precision.
            p_n[i] = (m + self.alpha * pi0) / (l + self.alpha)
    return p_n
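This is the signature of NLTK's SmoothingFunction.method6, which is normally invoked indirectly through sentence_bleu. A minimal sketch with made-up sentences; note the hypothesis must share at least one trigram with a reference, or the assert above fires:

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

reference = [["the", "cat", "sat", "on", "the", "mat"]]
hypothesis = ["the", "cat", "sat", "on", "a", "mat"]
score = sentence_bleu(reference, hypothesis,
                      smoothing_function=SmoothingFunction().method6)
print(score)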
Example 4: is_needle_in_hay
# Required import: from nltk.util import ngrams
# (also uses SequenceMatcher from difflib)
def is_needle_in_hay(cls, needle, hay):
    needle_length = len(needle.split())
    max_sim_val = 0
    # Slide a window slightly longer than the needle across the hay.
    for ngram in ngrams(hay.split(), needle_length + int(.2 * needle_length)):
        hay_ngram = u" ".join(ngram)
        similarity = SequenceMatcher(None, hay_ngram, needle).ratio()
        if similarity > max_sim_val:
            max_sim_val = similarity
            max_sim_string = hay_ngram
    return max_sim_val  # how confident we are that the needle was found in the hay

# https://stackoverflow.com/a/31505798
# Given a string paragraph, return a list of sentences.
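A standalone variant of the matcher above, for illustration (fuzzy_find is a hypothetical name; the window is the needle length plus 20% slack, as in the original):

from difflib import SequenceMatcher
from nltk.util import ngrams

def fuzzy_find(needle, hay):
    needle_length = len(needle.split())
    window = needle_length + int(.2 * needle_length)
    # Best SequenceMatcher ratio over all windows of the hay.
    return max(
        (SequenceMatcher(None, u" ".join(g), needle).ratio()
         for g in ngrams(hay.split(), window)),
        default=0,
    )

print(fuzzy_find("quick brown fox", "the quick brown fox jumps over the lazy dog"))  # 1.0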
Example 5: __init__
# Required import: from nltk.util import ngrams
# (also uses defaultdict from collections and FreqDist, ConditionalFreqDist from nltk)
def __init__(self, n, vocabulary, unknown="<UNK>"):
    """
    n is the size of the ngram
    """
    if n < 1:
        raise ValueError("ngram size must be greater than or equal to 1")
    self.n = n
    self.unknown = unknown
    self.padding = {
        "pad_left": True,
        "pad_right": True,
        "left_pad_symbol": "<s>",
        "right_pad_symbol": "</s>",
    }
    self.vocabulary = vocabulary
    self.allgrams = defaultdict(ConditionalFreqDist)
    self.ngrams = FreqDist()
    self.unigrams = FreqDist()
Example 6: train_counts
# Required import: from nltk.util import ngrams
def train_counts(self, training_text):
    for sent in training_text:
        checked_sent = (self.check_against_vocab(word) for word in sent)
        sent_start = True
        for ngram in self.to_ngrams(checked_sent):
            self.ngrams[ngram] += 1
            context, word = tuple(ngram[:-1]), ngram[-1]
            if sent_start:
                # Count the left-padding context words once per sentence.
                for context_word in context:
                    self.unigrams[context_word] += 1
                sent_start = False
            # Count the word under its full context and each shorter
            # suffix context (orders n down to 2).
            for window, ngram_order in enumerate(range(self.n, 1, -1)):
                context = context[window:]
                self.allgrams[ngram_order][context][word] += 1
            self.unigrams[word] += 1
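train_counts calls a to_ngrams helper that is not included in this excerpt. A plausible reconstruction, assuming it simply applies nltk.util.ngrams with the padding settings from Example 5's __init__:

from nltk.util import ngrams

def to_ngrams(self, sequence):
    # Hypothetical: pad both ends of the sentence and emit ngrams of order n.
    return ngrams(sequence, self.n, **self.padding)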
Example 7: load_data
# Required import: from nltk.util import ngrams
# (also uses RegexpTokenizer from nltk.tokenize; corpus and Vol come from the
# surrounding project)
def load_data():
    global N, words
    raw = list(word
               for fileid in corpus.fileids()
               for word in corpus.words(fileid))
    words = list(token for token in RegexpTokenizer(r'\w+').tokenize(' '.join(raw)))[100:1000]
    tokens = set(words)
    tokens_l = list(tokens)
    N = len(tokens)
    print('Corpus size: {} words'.format(N))
    step = 4
    data = []
    # Each 4-gram yields an input vector with the three context words set
    # to 1, and the index of the fourth word as the label to predict.
    for gram in ngrams(words, step):
        w1, w2, w3, pred = gram
        V = Vol(1, 1, N, 0.0)
        V.w[tokens_l.index(w1)] = 1
        V.w[tokens_l.index(w2)] = 1
        V.w[tokens_l.index(w3)] = 1
        label = tokens_l.index(pred)
        data.append((V, label))
    return data
Example 8: extract_ngrams
# Required import: from nltk.util import ngrams
def extract_ngrams(self, text, sent_tokenizer, word_tokenizer, max_ngrams,
                   is_stopword, length_limiter):
    ngram_sets = {}
    sents = sent_tokenizer(text)
    tokens = []
    for sent in sents:
        tokens.extend([word.lower() for word in word_tokenizer(sent)])
    # Remove stopwords.
    tokens = [word for word in tokens if is_stopword(word) is False]
    tokens = length_limiter(tokens)
    for i in range(1, max_ngrams + 1):
        ngram_sets[i] = {}
        total = 0
        for ngram in ngrams(tokens, i):
            ngram_sets[i][ngram] = ngram_sets[i].get(ngram, 0) + 1
            total += 1
        ngram_sets[i][u"__TOTAL__"] = total
    return ngram_sets
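A hedged call sketch for extract_ngrams, wiring in NLTK's tokenizers and stopword list (the extractor object and the 100-token limit are assumptions; requires the punkt and stopwords data):

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

stop = set(stopwords.words("english"))
sets = extractor.extract_ngrams(
    "NLTK makes n-gram counting easy. Counting is fast.",
    sent_tokenizer=sent_tokenize,
    word_tokenizer=word_tokenize,
    max_ngrams=2,
    is_stopword=lambda w: w in stop,
    length_limiter=lambda toks: toks[:100],  # keep at most 100 tokens
)
print(sets[1][("counting",)])  # the unigram "counting" occurs twice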
Example 9: _apply_filter
# Required import: from nltk.util import ngrams
def _apply_filter(self, fn=lambda ngram, freq: False):
    """Generic filter: removes ngrams from the frequency distribution
    if the function returns True when passed an ngram tuple.
    """
    tmp_ngram = FreqDist()
    for ngram, freq in iteritems(self.ngram_fd):
        if not fn(ngram, freq):
            tmp_ngram[ngram] = freq
    self.ngram_fd = tmp_ngram
Example 10: apply_freq_filter
# Required import: from nltk.util import ngrams
def apply_freq_filter(self, min_freq):
    """Removes candidate ngrams which have frequency less than min_freq."""
    self._apply_filter(lambda ng, freq: freq < min_freq)
Example 11: apply_ngram_filter
# Required import: from nltk.util import ngrams
def apply_ngram_filter(self, fn):
    """Removes candidate ngrams (w1, w2, ...) where fn(w1, w2, ...)
    evaluates to True.
    """
    self._apply_filter(lambda ng, f: fn(*ng))
Example 12: apply_word_filter
# Required import: from nltk.util import ngrams
def apply_word_filter(self, fn):
    """Removes candidate ngrams (w1, w2, ...) where any of (fn(w1), fn(w2),
    ...) evaluates to True.
    """
    self._apply_filter(lambda ng, f: any(fn(w) for w in ng))
Example 13: nbest
# Required import: from nltk.util import ngrams
def nbest(self, score_fn, n):
    """Returns the top n ngrams when scored by the given function."""
    return [p for p, s in self.score_ngrams(score_fn)[:n]]
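Examples 9 through 13 are the filtering internals of NLTK's collocation finders; a typical end-to-end use of the public API (assumes the genesis corpus and stopword list have been fetched with nltk.download):

import nltk
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.corpus import stopwords

words = nltk.corpus.genesis.words("english-web.txt")
finder = BigramCollocationFinder.from_words(words)
finder.apply_freq_filter(3)  # drop bigrams seen fewer than 3 times
stop = set(stopwords.words("english"))
finder.apply_word_filter(lambda w: w.lower() in stop)
bigram_measures = BigramAssocMeasures()
print(finder.nbest(bigram_measures.pmi, 5))  # top 5 bigrams by PMI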
Example 14: sentence_bleu_4
# Required import: from nltk.util import ngrams
# (also uses Counter from collections and math)
def sentence_bleu_4(hyp, refs, weights=[0.25, 0.25, 0.25, 0.25]):
    # Input: a single hypothesis sentence and multiple references,
    # each given as a list of tokens.
    count = [0, 0, 0, 0]
    clip_count = [0, 0, 0, 0]
    r = 0
    c = 0
    for i in range(4):
        hypcnts = Counter(ngrams(hyp, i + 1))
        cnt = sum(hypcnts.values())
        count[i] += cnt
        # Compute clipped counts.
        max_counts = {}
        for ref in refs:
            refcnts = Counter(ngrams(ref, i + 1))
            for ng in hypcnts:
                max_counts[ng] = max(max_counts.get(ng, 0), refcnts[ng])
        clipcnt = dict((ng, min(count, max_counts[ng]))
                       for ng, count in hypcnts.items())
        clip_count[i] += sum(clipcnt.values())
    # Pick the reference length closest to the hypothesis length
    # for the brevity penalty.
    bestmatch = [1000, 1000]
    for ref in refs:
        if bestmatch[0] == 0:
            break
        diff = abs(len(ref) - len(hyp))
        if diff < bestmatch[0]:
            bestmatch[0] = diff
            bestmatch[1] = len(ref)
    r = bestmatch[1]
    c = len(hyp)
    p0 = 1e-7
    bp = math.exp(-abs(1.0 - float(r) / float(c + p0)))
    p_ns = [float(clip_count[i]) / float(count[i] + p0) + p0 for i in range(4)]
    s = math.fsum(w * math.log(p_n) for w, p_n in zip(weights, p_ns) if p_n)
    bleu_hyp = bp * math.exp(s)
    return bleu_hyp
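A quick smoke test for sentence_bleu_4; the token lists here are made up, and because of the epsilon smoothing above the score will be close to, but not identical to, NLTK's own sentence_bleu:

hyp = "the cat sat on a mat".split()
refs = ["the cat sat on the mat".split(),
        "a cat was sitting on the mat".split()]
print(sentence_bleu_4(hyp, refs))  # a float between 0 and 1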
Example 15: generate_ngrams
# Required import: from nltk.util import ngrams
def generate_ngrams(word_list, g):
    mygrams = []
    unigrams = [word for word in word_list]
    mygrams += unigrams
    # Append bigrams up through g-grams.
    for i in range(2, g + 1):
        mygrams += ngrams(word_list, i)
    return mygrams

###################################################################################
# Generates n-gram features in the form (n-gram, True), i.e., a binary feature
# indicating that the n-gram is present in the text.
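The trailing comment describes a generate_features helper that is not part of this excerpt. A minimal sketch of what it plausibly does, based on that description:

def generate_features(mygrams):
    # Hypothetical reconstruction: mark every n-gram as present; this is the
    # binary-feature dict shape that NLTK classifiers' prob_classify expects.
    return {gram: True for gram in mygrams}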