

Python util.ngrams Method Code Examples

This article collects typical usage examples of the Python method nltk.util.ngrams. If you are wondering exactly how util.ngrams is used in Python, or want concrete examples of it in practice, the curated snippets below should help. You can also explore further usage examples for the containing module, nltk.util.


Below are 15 code examples of util.ngrams, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Python code examples.

Example 1: from_words

# Required import: from nltk import util [as alias]
# Alternatively: from nltk.util import ngrams [as alias]
def from_words(cls, words, window_size=2):
        """Construct a BigramCollocationFinder for all bigrams in the given
        sequence.  When window_size > 2, count non-contiguous bigrams, in the
        style of Church and Hanks's (1990) association ratio.
        """
        wfd = FreqDist()
        bfd = FreqDist()

        if window_size < 2:
            raise ValueError("Specify window_size at least 2")

        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            if w1 is None:
                continue
            wfd[w1] += 1
            for w2 in window[1:]:
                if w2 is not None:
                    bfd[(w1, w2)] += 1
        return cls(wfd, bfd, window_size=window_size) 
Developer: rafasashi | Project: razzy-spinner | Lines: 22 | Source: collocations.py
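For context, this constructor is reached through NLTK's public collocation API. A minimal usage sketch (the sample sentence and the choice of PMI as scorer are illustrative, not from the snippet):

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

words = "the quick brown fox jumps over the lazy dog the quick fox".split()
# window_size=3 also counts bigrams that skip one intervening word.
finder = BigramCollocationFinder.from_words(words, window_size=3)
print(finder.nbest(BigramAssocMeasures.pmi, 5))  # top 5 bigrams by PMI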

Example 2: classify_text

# Required import: from nltk import util [as alias]
# Alternatively: from nltk.util import ngrams [as alias]
def classify_text(text, classifier, certainty, g, unicodeFlag):
    # 1. Process the text.
    if unicodeFlag:
        text = text.decode('utf-8')  # Python 2: decode raw bytes to unicode
    word_list = process_text(text, removePunct=True, removeSW=False, removeNum=False)

    # 2. Generate ngrams.
    mygrams = generate_ngrams(word_list, g)

    # 3. Generate features from the ngrams.
    feats = generate_features(mygrams)

    # 4. Classify, returning the label only if its probability meets the threshold.
    probs = classifier.prob_classify(feats)
    label = probs.max()
    if probs.prob(label) >= certainty:
        return label, probs.prob(label)
    return 'none', probs.prob(label)

###################################################################################
# Generates n-grams (g = number of grams).
# For example, if g=3, the function will generate unigrams, bigrams, and trigrams from the text.
Developer: motazsaad | Project: comparable-text-miner | Lines: 22 | Source: textpro.py
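A hypothetical call to classify_text; `classifier` is assumed to be a trained nltk.NaiveBayesClassifier, the helpers (process_text, generate_features) are assumed to come from the same textpro module, and the 0.7 threshold is invented:

# Sketch only: classify as a bigram model, accept the label at >= 70% confidence.
label, confidence = classify_text("an example document to label", classifier,
                                  0.7, 2, False)
if label != 'none':
    print('predicted {} with probability {:.2f}'.format(label, confidence))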

Example 3: method6

# Required import: from nltk import util [as alias]
# Alternatively: from nltk.util import ngrams [as alias]
def method6(self, p_n, references, hypothesis, hyp_len, *args, **kwargs):
        """
        Smoothing method 6:
        Interpolates the maximum likelihood estimate of the precision *p_n* with
        a prior estimate *pi0*. The prior is estimated by assuming that the ratio
        between pn and pn−1 will be the same as that between pn−1 and pn−2; from
        Gao and He (2013) Training MRF-Based Phrase Translation Models using
        Gradient Ascent. In NAACL.
        """
        # This smoothing only works when p_1 and p_2 is non-zero.
        # Raise an error with an appropriate message when the input is too short
        # to use this smoothing technique.
        assert p_n[2], "This smoothing method requires non-zero precision for bigrams."
        for i, p_i in enumerate(p_n):
            if i in [0,1]: # Skips the first 2 orders of ngrams.
                continue
            else:
                pi0 = 0 if p_n[i-2] == 0 else p_n[i-1]**2 / p_n[i-2]
                # No. of ngrams in translation that matches the reference.
                m = p_i.numerator
                # No. of ngrams in translation.
                l = sum(1 for _ in ngrams(hypothesis, i+1))
                # Calculates the interpolated precision.
                p_n[i] = (m + self.alpha * pi0) / (l + self.alpha)
        return p_n 
Developer: magic282 | Project: NQG | Lines: 27 | Source: nltk_bleu_score.py
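In the NLTK distribution this smoothing lives on nltk.translate.bleu_score.SmoothingFunction; a minimal sketch of plugging method6 into sentence-level BLEU (the token lists are invented):

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

reference = [['the', 'cat', 'is', 'on', 'the', 'mat']]
hypothesis = ['the', 'cat', 'sat', 'on', 'the', 'mat']
# method6 interpolates each higher-order precision with the Gao & He prior.
score = sentence_bleu(reference, hypothesis,
                      smoothing_function=SmoothingFunction().method6)
print(score)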

Example 4: is_needle_in_hay

# Required import: from nltk import util [as alias]
# Alternatively: from nltk.util import ngrams [as alias]
def is_needle_in_hay(cls, needle, hay):

        needle_length = len(needle.split())
        max_sim_val = 0

        for ngram in ngrams(hay.split(), needle_length + int(.2 * needle_length)):
            hay_ngram = u" ".join(ngram)
            similarity = SequenceMatcher(None, hay_ngram, needle).ratio()
            if similarity > max_sim_val:
                max_sim_val = similarity
                max_sim_string = hay_ngram

        return max_sim_val  # how confident are we that needle was found in hay

    # https://stackoverflow.com/a/31505798
    # given a string paragraph, return a list of sentences 
Developer: fterh | Project: sneakpeek | Lines: 18 | Source: __init__.py
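Standalone, the same fuzzy-containment check needs only two imports; a small sketch with an invented needle and haystack:

from difflib import SequenceMatcher
from nltk.util import ngrams

hay = 'the quick brown fox jumps over the lazy dog'
needle = 'quik brown fox'
window = len(needle.split()) + int(.2 * len(needle.split()))
# Highest similarity of any window against the needle, in [0, 1].
best = max(SequenceMatcher(None, ' '.join(gram), needle).ratio()
           for gram in ngrams(hay.split(), window))
print(best)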

Example 5: __init__

# Required import: from nltk import util [as alias]
# Alternatively: from nltk.util import ngrams [as alias]
def __init__(self, n, vocabulary, unknown="<UNK>"):
        """
        n is the size of the ngram
        """
        if n < 1:
            raise ValueError("ngram size must be greater than or equal to 1")

        self.n = n
        self.unknown = unknown
        self.padding = {
            "pad_left": True,
            "pad_right": True,
            "left_pad_symbol": "<s>",
            "right_pad_symbol": "</s>"
        }

        self.vocabulary = vocabulary
        self.allgrams = defaultdict(ConditionalFreqDist)
        self.ngrams = FreqDist()
        self.unigrams = FreqDist() 
Developer: foxbook | Project: atap | Lines: 22 | Source: model.py
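The padding dictionary above is meant to be splatted into nltk.util.ngrams; a quick sketch of what those keyword arguments produce:

from nltk.util import ngrams

padding = {'pad_left': True, 'pad_right': True,
           'left_pad_symbol': '<s>', 'right_pad_symbol': '</s>'}
print(list(ngrams(['a', 'b', 'c'], 2, **padding)))
# [('<s>', 'a'), ('a', 'b'), ('b', 'c'), ('c', '</s>')]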

Example 6: train_counts

# Required import: from nltk import util [as alias]
# Alternatively: from nltk.util import ngrams [as alias]
def train_counts(self, training_text):
        for sent in training_text:
            checked_sent = (self.check_against_vocab(word) for word in sent)
            sent_start = True
            for ngram in self.to_ngrams(checked_sent):
                self.ngrams[ngram] += 1
                context, word = tuple(ngram[:-1]), ngram[-1]
                if sent_start:
                    for context_word in context:
                        self.unigrams[context_word] += 1
                    sent_start = False

                for window, ngram_order in enumerate(range(self.n, 1, -1)):
                    context = context[window:]
                    self.allgrams[ngram_order][context][word] += 1
                self.unigrams[word] += 1 
Developer: foxbook | Project: atap | Lines: 18 | Source: model.py
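The inner loop backs off from the full n-gram order down to bigrams by trimming the context from the left; a standalone sketch of that shrinking-context step (the trigram is invented):

n = 3
ngram = ('<s>', 'the', 'cat')
context, word = tuple(ngram[:-1]), ngram[-1]
for window, ngram_order in enumerate(range(n, 1, -1)):
    context = context[window:]
    print(ngram_order, context, '->', word)
# 3 ('<s>', 'the') -> cat
# 2 ('the',) -> cat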

Example 7: load_data

# Required import: from nltk import util [as alias]
# Alternatively: from nltk.util import ngrams [as alias]
def load_data():
    global N, words

    raw = list(word
            for fileid in corpus.fileids()
            for word in corpus.words(fileid))
    words = list(token for token in RegexpTokenizer(r'\w+').tokenize(' '.join(raw)))[100:1000]
    tokens = set(words)
    tokens_l = list(tokens)
    N = len(tokens)
    print('Corpus size: {} words'.format(N))

    step = 4
    data = []
    for gram in ngrams(words, step):
        w1, w2, w3, pred = gram
        V = Vol(1, 1, N, 0.0)
        V.w[tokens_l.index(w1)] = 1
        V.w[tokens_l.index(w2)] = 1
        V.w[tokens_l.index(w3)] = 1
        label = tokens_l.index(pred)
        data.append((V, label))

    return data 
Developer: benglard | Project: ConvNetPy | Lines: 26 | Source: next_word.py
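Each 4-gram supplies three context words plus the word to predict; a tiny sketch of that slicing:

from nltk.util import ngrams

words = ['we', 'are', 'what', 'we', 'repeatedly', 'do']
for w1, w2, w3, pred in ngrams(words, 4):
    print((w1, w2, w3), '->', pred)
# ('we', 'are', 'what') -> we
# ('are', 'what', 'we') -> repeatedly
# ('what', 'we', 'repeatedly') -> do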

Example 8: extract_ngrams

# Required import: from nltk import util [as alias]
# Alternatively: from nltk.util import ngrams [as alias]
def extract_ngrams(self, text, sent_tokenizer, word_tokenizer, max_ngrams,
            is_stopword, length_limiter):
        ngram_sets = {}
        sents = sent_tokenizer(text)
        
        tokens = []
        for sent in sents:
            tokens.extend([word.lower() for word in word_tokenizer(sent)])
        
        # Remove stopwords.
        tokens = [word for word in tokens if is_stopword(word) is False]
        tokens = length_limiter(tokens)

        for i in range(1, max_ngrams + 1):  # xrange in the original Python 2 source
            ngram_sets[i] = {}
            total = 0
            for ngram in ngrams(tokens, i):
                ngram_sets[i][ngram] = ngram_sets[i].get(ngram, 0) + 1
                total += 1
            ngram_sets[i][u"__TOTAL__"] = total
        return ngram_sets 
Developer: kedz | Project: sumpy | Lines: 23 | Source: eval.py
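The per-order counting above is equivalent to running collections.Counter over each ngram order; a compact sketch on an invented token list:

from collections import Counter
from nltk.util import ngrams

tokens = ['to', 'be', 'or', 'not', 'to', 'be']
for i in range(1, 3):
    counts = Counter(ngrams(tokens, i))
    print(i, counts.most_common(2))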

Example 9: _apply_filter

# Required import: from nltk import util [as alias]
# Alternatively: from nltk.util import ngrams [as alias]
def _apply_filter(self, fn=lambda ngram, freq: False):
        """Generic filter removes ngrams from the frequency distribution
        if the function returns True when passed an ngram tuple.
        """
        tmp_ngram = FreqDist()
        for ngram, freq in iteritems(self.ngram_fd):
            if not fn(ngram, freq):
                tmp_ngram[ngram] = freq
        self.ngram_fd = tmp_ngram 
Developer: rafasashi | Project: razzy-spinner | Lines: 11 | Source: collocations.py

Example 10: apply_freq_filter

# Required import: from nltk import util [as alias]
# Alternatively: from nltk.util import ngrams [as alias]
def apply_freq_filter(self, min_freq):
        """Removes candidate ngrams which have frequency less than min_freq."""
        self._apply_filter(lambda ng, freq: freq < min_freq) 
Developer: rafasashi | Project: razzy-spinner | Lines: 5 | Source: collocations.py

Example 11: apply_ngram_filter

# Required import: from nltk import util [as alias]
# Alternatively: from nltk.util import ngrams [as alias]
def apply_ngram_filter(self, fn):
        """Removes candidate ngrams (w1, w2, ...) where fn(w1, w2, ...)
        evaluates to True.
        """
        self._apply_filter(lambda ng, f: fn(*ng)) 
Developer: rafasashi | Project: razzy-spinner | Lines: 7 | Source: collocations.py

Example 12: apply_word_filter

# Required import: from nltk import util [as alias]
# Alternatively: from nltk.util import ngrams [as alias]
def apply_word_filter(self, fn):
        """Removes candidate ngrams (w1, w2, ...) where any of (fn(w1), fn(w2),
        ...) evaluates to True.
        """
        self._apply_filter(lambda ng, f: any(fn(w) for w in ng)) 
Developer: rafasashi | Project: razzy-spinner | Lines: 7 | Source: collocations.py

Example 13: nbest

# Required import: from nltk import util [as alias]
# Alternatively: from nltk.util import ngrams [as alias]
def nbest(self, score_fn, n):
        """Returns the top n ngrams when scored by the given function."""
        return [p for p, s in self.score_ngrams(score_fn)[:n]] 
Developer: rafasashi | Project: razzy-spinner | Lines: 5 | Source: collocations.py
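Taken together, Examples 9 through 13 form the finder's filtering pipeline; a minimal end-to-end sketch over an invented word list:

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

words = 'new york is a big city and new york never sleeps'.split()
finder = BigramCollocationFinder.from_words(words)
finder.apply_freq_filter(2)                      # Example 10: drop rare bigrams
finder.apply_word_filter(lambda w: len(w) < 3)   # Example 12: drop short words
print(finder.nbest(BigramAssocMeasures.pmi, 3))  # Example 13: top bigrams by PMI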

Example 14: sentence_bleu_4

# Required import: from nltk import util [as alias]
# Alternatively: from nltk.util import ngrams [as alias]
def sentence_bleu_4(hyp, refs, weights=(0.25, 0.25, 0.25, 0.25)):
    # input : single sentence, multiple references
    count = [0, 0, 0, 0]
    clip_count = [0, 0, 0, 0]
    r = 0
    c = 0

    for i in range(4):
        hypcnts = Counter(ngrams(hyp, i + 1))
        cnt = sum(hypcnts.values())
        count[i] += cnt

        # compute clipped counts
        max_counts = {}
        for ref in refs:
            refcnts = Counter(ngrams(ref, i + 1))
            for ng in hypcnts:
                max_counts[ng] = max(max_counts.get(ng, 0), refcnts[ng])
        clipcnt = dict((ng, min(count, max_counts[ng])) \
                       for ng, count in hypcnts.items())
        clip_count[i] += sum(clipcnt.values())

    bestmatch = [1000, 1000]
    for ref in refs:
        if bestmatch[0] == 0:
            break
        diff = abs(len(ref) - len(hyp))
        if diff < bestmatch[0]:
            bestmatch[0] = diff
            bestmatch[1] = len(ref)
    r = bestmatch[1]
    c = len(hyp)

    p0 = 1e-7
    bp = math.exp(-abs(1.0 - float(r) / float(c + p0)))

    p_ns = [float(clip_count[i]) / float(count[i] + p0) + p0 for i in range(4)]
    s = math.fsum(w * math.log(p_n) for w, p_n in zip(weights, p_ns) if p_n)
    bleu_hyp = bp * math.exp(s)

    return bleu_hyp 
Developer: ConvLab | Project: ConvLab | Lines: 43 | Source: nlp.py
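A quick sketch of calling this helper, assuming math, collections.Counter, and nltk.util.ngrams are imported as in the source module; inputs are tokenized sentences, and both references are invented:

hyp = 'the cat sat on the mat'.split()
refs = ['the cat is on the mat'.split(),
        'there is a cat on the mat'.split()]
print(sentence_bleu_4(hyp, refs))  # BLEU-4 with uniform weights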

Example 15: generate_ngrams

# Required import: from nltk import util [as alias]
# Alternatively: from nltk.util import ngrams [as alias]
def generate_ngrams(word_list, g):
    mygrams = []
    unigrams = [word for word in word_list]
    mygrams += unigrams
    for i in range(2, g + 1):
        mygrams += ngrams(word_list, i)
    return mygrams
###################################################################################

# Generate n-gram features in the form (n-gram, True), i.e., a binary feature indicating that the n-gram exists.
Developer: motazsaad | Project: comparable-text-miner | Lines: 11 | Source: textpro.py
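For g=2 the helper returns the unigrams followed by bigram tuples; a tiny sketch:

print(generate_ngrams(['the', 'cat', 'sat'], 2))
# ['the', 'cat', 'sat', ('the', 'cat'), ('cat', 'sat')]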


Note: The nltk.util.ngrams examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their original developers, and copyright remains with the original authors. Refer to each project's license before distributing or reusing the code; do not republish without permission.