

Python nltk.defaultdict Function Code Examples

This article collects typical usage examples of the Python nltk.defaultdict function. If you are wondering what exactly nltk.defaultdict does, how to use it, or want to see worked examples, the hand-picked code samples below should help.


Below are 15 code examples of the defaultdict function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
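A note on the API before the examples: in the NLTK 2.x line, nltk.defaultdict appears to be a plain re-export of the standard library's collections.defaultdict (NLTK 3 dropped the re-export), so every pattern below also works with collections.defaultdict directly. A minimal sketch of the core idiom, written here in Python 3 syntax:

from collections import defaultdict  # equivalent to nltk.defaultdict in NLTK 2.x

# Counting: missing keys start at int() == 0.
counts = defaultdict(int)
for w in ["the", "cat", "the", "mat"]:
    counts[w] += 1
print(counts["the"])  # 2
print(counts["dog"])  # 0 -- the key is created on first access

# Grouping: missing keys start as empty lists.
by_length = defaultdict(list)
for w in ["the", "cat", "mat", "lexicon"]:
    by_length[len(w)].append(w)
print(by_length[3])  # ['the', 'cat', 'mat']

The factory passed to the constructor (int, list, set, float, or a lambda) supplies the value for any missing key, which is what lets the examples below increment counters and append to lists without initializing keys first.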

Example 1: generate_correct_number_pool

import random
from math import log

import nltk


def generate_correct_number_pool(n, corpus, homo, pool, eps=1e-6):
    """Generate words matched for length and p(word) under an ngram model,
    drawing from a pool of pre-generated words."""
    # NgramModel and get_range are project-specific helpers defined
    # elsewhere in the repository.
    x = NgramModel(n, corpus, 1, homo)
    poolxLengths = nltk.defaultdict(list)
    poolxP = nltk.defaultdict(float)

    # Each pool entry is "length,word,probability".
    for item in pool:
        item = item.strip().split(",")
        poolxLengths[int(item[0])].append(item[1])
        poolxP[item[1]] = float(item[2])

    # For each length, map the pool words of that length to their probability.
    same_length = nltk.defaultdict(int)
    for i in range(20):
        same_length[i] = dict([(k, poolxP[k]) for k in poolxLengths[i] if k in poolxP])

    newwords = []
    exist = 0      # generated words that already exist in the corpus
    hom_count = 0  # generated homophones
    tot_p = 0      # running log-probability of the generated lexicon
    for i, w in enumerate(corpus):
        p_match = x.evaluate(w)
        # Widen the probability window until at least one candidate matches.
        sample = get_range(same_length[len(w)], p_match - eps, p_match + eps)
        while len(sample) == 0:
            eps = eps * 2
            sample = get_range(same_length[len(w)], p_match - eps, p_match + eps)
        eps = 1e-6
        nw = random.choice(sample.keys())
        while nw in newwords and homo != 1:
            nw = random.choice(sample.keys())
        if nw in corpus:
            exist += 1
        if nw in newwords:
            hom_count += 1
        del same_length[len(w)][nw]
        tot_p += poolxP[nw] + log(1 / float(len(sample)))
        newwords += [nw]
Developer: mahowak, Project: lexicon, Lines of code: 35, Source file: old_models.py

Example 2: extractNE

def extractNE(tok, pnouns, dic):
    """Extract named entities from tok and build a bag-of-words profile
    for each one (relies on module-level helpers: defaultdict, LinkedList,
    STok, matchNE, weights, articles)."""
    names = LinkedList.LinkedList()
    nameprofs = defaultdict(lambda: defaultdict(lambda: 0))
    # bag-of-words model
    tent = []  # tentative named-entity tokens collected so far
    # Names are usually not preceded by an article; tracking the previous
    # word filters out some other capitalized entities and edge cases.
    prevword = ""
    for i in range(len(tok)):
        if STok.isuc(tok[i][0]) and (tok[i].lower() in pnouns) and (prevword not in articles):
            tent.append(tok[i])
        else:
            if len(tent) > 0:  # a candidate named entity has been collected
                match = matchNE(names, tent)  # match to the most recent matching occurrence

                # Profile the words following the entity, weighted by distance.
                for j in range(0, len(weights)):
                    try:
                        word = tok[i+j].lower()
                        if (word not in pnouns) and len(word) > 3 and ("," not in word) and (word in dic):
                            nameprofs[match][word] += weights[j]
                    except IndexError:
                        break
                # Profile the words preceding the entity, weighted by distance.
                for j in range(0, len(weights)):
                    try:
                        word = tok[i-len(tent)-j-1].lower()
                        if (word not in pnouns) and len(word) > 3 and ("," not in word) and (word in dic):
                            nameprofs[match][word] += weights[j]
                    except IndexError:
                        break

            tent = []
            prevword = tok[i]
    return [names, nameprofs]
Developer: gabrielhuang, Project: parse-novel, Lines of code: 31, Source file: NEProfile.py

Example 3: most_informative_features

    def most_informative_features(self, n=100):
        """
        Return a list of the 'most informative' features used by this
        classifier.  For the purpose of this function, the
        informativeness of a feature C{(fname,fval)} is equal to the
        highest value of P(fname=fval|label), for any label, divided by
        the lowest value of P(fname=fval|label), for any label.

          max[ P(fname=fval|label1) / P(fname=fval|label2) ]
        """
        # The set of (fname, fval) pairs used by this classifier.
        features = set()
        # The max & min probability associated w/ each (fname, fval)
        # pair.  Maps (fname,fval) -> float.
        maxprob = defaultdict(lambda: 0.0)
        minprob = defaultdict(lambda: 1.0)

        for (label, fname), probdist in self._feature_probdist.items():
            for fval in probdist.samples():
                feature = (fname, fval)
                features.add(feature)
                p = probdist.prob(fval)
                maxprob[feature] = max(p, maxprob[feature])
                minprob[feature] = min(p, minprob[feature])
                if minprob[feature] == 0:
                    features.discard(feature)

        # Convert features to a list, & sort it by how informative
        # features are.
        features = sorted(features, 
            key=lambda feature: minprob[feature]/maxprob[feature])
        return features[:n]
Developer: DrDub, Project: icsisumm, Lines of code: 32, Source file: naivebayes.py
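This method is part of NLTK's NaiveBayesClassifier, so the same ranking is reachable through the public API. A small demo on toy data (the feature names and labels are invented for illustration; Python 3 syntax):

import nltk

# Toy training set: the 'ending' feature separates the two labels.
train = [({"ending": "a"}, "female"), ({"ending": "a"}, "female"),
         ({"ending": "k"}, "male"), ({"ending": "o"}, "male")]
classifier = nltk.NaiveBayesClassifier.train(train)

# Returns (fname, fval) pairs, most informative first.
for fname, fval in classifier.most_informative_features(3):
    print(fname, fval)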

Example 4: analysis_using_word_and_prev_pos

def analysis_using_word_and_prev_pos():
    from nltk.corpus import brown

    pos = nltk.defaultdict(lambda: nltk.defaultdict(int))
    brown_news_tagged = brown.tagged_words(categories="news", simplify_tags=True)
    for ((w1, t1), (w2, t2)) in nltk.bigrams(brown_news_tagged):
        pos[(t1, w2)][t2] += 1
    print pos[("DET", "right")]
Developer: prashiyn, Project: nltk-examples, Lines of code: 8, Source file: ch05.py
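The simplify_tags keyword and the module-level nltk.defaultdict both belong to the NLTK 2.x API. Under NLTK 3 the same count table can be built roughly as follows (a sketch, assuming the tagset="universal" replacement for simplify_tags):

import nltk
from collections import defaultdict
from nltk.corpus import brown

# For each (previous tag, current word) pair, count the current word's tags.
pos = defaultdict(lambda: defaultdict(int))
brown_news_tagged = brown.tagged_words(categories="news", tagset="universal")
for (w1, t1), (w2, t2) in nltk.bigrams(brown_news_tagged):
    pos[(t1, w2)][t2] += 1
print(pos[("DET", "right")])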

Example 5: NBCtrain

def NBCtrain(labeled_featuresets, estimator=nltk.ELEProbDist):
    """A copy of the nltk.NaiveBayesClassifier.train(...) method, kept
    here to allow inspection of what the method is actually doing and
    how long it's taking.

    @param labeled_featuresets: A list of classified featuresets,
        i.e., a list of tuples C{(featureset, label)}.
    """
    label_freqdist = nltk.FreqDist()
    feature_freqdist = nltk.defaultdict(nltk.FreqDist)
    feature_values = nltk.defaultdict(set)
    fnames = set()

    print "There are " + str(len(labeled_featuresets)) + " labeled featuresets"
    # Count up how many times each feature value occurred, given
    # the label and feature name.
    print "Counting feature value occurrences"
    i = 0
    for featureset, label in labeled_featuresets:
        label_freqdist.inc(label)
        for fname, fval in featureset.items():
            # Increment freq(fval|label, fname)
            feature_freqdist[label, fname].inc(fval)
            # Record that fname can take the value fval.
            feature_values[fname].add(fval)
            # Keep a list of all feature names.
            fnames.add(fname)
        print "At featureset..." + str(i)
        i += 1

    # If a feature didn't have a value given for an instance, then
    # we assume that it gets the implicit value 'None.'  This loop
    # counts up the number of 'missing' feature values for each
    # (label,fname) pair, and increments the count of the fval
    # 'None' by that amount.
    for label in label_freqdist:
        num_samples = label_freqdist[label]
        for fname in fnames:
            count = feature_freqdist[label, fname].N()
            feature_freqdist[label, fname].inc(None, num_samples - count)
            feature_values[fname].add(None)

    # Create the P(label) distribution
    print "Making the P(label) distribution..."
    label_probdist = estimator(label_freqdist)

    # Create the P(fval|label, fname) distribution
    print "Making the P(fval|label, fname) distribution from " + str(
        len(feature_freqdist.items())
    ) + " feature freqs..."
    feature_probdist = {}
    for ((label, fname), freqdist) in feature_freqdist.items():
        probdist = estimator(freqdist, bins=len(feature_values[fname]))
        feature_probdist[label, fname] = probdist

    return nltk.NaiveBayesClassifier(label_probdist, feature_probdist)
Developer: brynmathias, Project: insult_detect, Lines of code: 56, Source file: classify.py

Example 6: word_valency

def word_valency():
    table = nltk.defaultdict(lambda: nltk.defaultdict(set))
    entries = nltk.corpus.ppattach.attachments("training")
    for entry in entries:
        key = entry.noun1 + "-" + entry.prep + "-" + entry.noun2
        table[key][entry.attachment].add(entry.verb)
    for key in sorted(table):
        if len(table[key]) > 1:
            print key, "N:", sorted(table[key]["N"]), "V:", sorted(table[key]["V"])
Developer: prashiyn, Project: nltk-examples, Lines of code: 10, Source file: ch08.py
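The PP-attachment corpus used above is not bundled with NLTK itself; if it is missing locally, it can be fetched with the standard downloader first (resource name as listed in the NLTK data index):

import nltk
nltk.download("ppattach")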

Example 7: generate_correct_number

def generate_correct_number(corpus, homo, lm):
    """Generate number of words to match length, handle homophones being generated"""
    lengths = nltk.defaultdict(lambda: nltk.defaultdict(int))
    lengths_needed = nltk.defaultdict(int)

    for item in corpus:
        lengths[get_cv(item)][len(item.split("-"))] += 1
        lengths_needed[get_cv(item)] += 1
    newwords = []
    newwords2 = []
    exist = nltk.defaultdict(int)
#    print lengths_needed
#    hom = dict((i,corpus.count(i)) for i in corpus if corpus.count(i) > 1)
#    ratio = sum(hom.values())/float(len(hom)+1)

#    for i in lengths_needed.keys():
#        while lengths_needed[i] > 0:
#            words = lm.generate(i)
#            for w in words:
#                if homo == 1 or (w not in newwords and re.sub("-","",w) not in newwords2):
#                    lengths_needed[i] -= 1
#                    newwords += [w]
#                    newwords2 += [re.sub("-", "", w)]
#                    if re.sub("-","",w) in corpus:
#                        exist[len(re.sub("-","",w))] +=1
#    print exist
#    return newwords

    while True:
        words = lm.generate()
        for w in words:
            if lengths_needed[get_cv(w)] > 0:
                if homo == 1 or (w not in newwords and re.sub("-","",w) not in newwords2):
#                    temp = newwords + [w]
#                    hom_new = Set([(i,temp.count(i)) for i in temp if temp.count(i) > 1])
#                    if len(hom_new)!= 0: 
#                        ratio_temp = sum([h[1] for h in hom_new])/float(len(hom_new))
#                    else:
#                        ratio_temp = 0
#                    if (ratio_temp <= (ratio + 0.1) and len(hom_new) <= len(hom)) or w not in newwords:
                    lengths_needed[get_cv(w)] -= 1
#                    if sum([lengths_needed[j] for j in lengths_needed.keys()]) % 1000 == 0:
#                        print sum([lengths_needed[j] for j in lengths_needed.keys()])
                    newwords += [w]
                    newwords2 += [re.sub("-", "", w)]
                    if w in corpus:
                        exist[len(w)] += 1
            elif sum([lengths_needed[j] for j in lengths_needed.keys()]) == 0: 
                print "nb of real words", sum(exist.values())
                return newwords
Developer: mahowak, Project: lexicon, Lines of code: 50, Source file: generation.py

Example 8: extract_real_lex

def extract_real_lex(path, lemma, language, mono, hom,minlength, maxlength, minsyll, maxsyll, match, celex_list):
    celex_path = get_celex_path(path, lemma, language)
    lengths = nltk.defaultdict(int)
    print celex_path
    corpus = build_celex_corpus(celex_path, language, lemma, mono)
    print ">>>TOTAL NB OF WORDS", len(corpus)
    corpus = [c for c in corpus if c[1] > 0] #freq greater than 0
    corpus = [clean_word(c[0]) for c in corpus] #reduce celex to just pronunciation
    corpus = [celex_diphthong_sub(c) for c in corpus if "c" not in c and "q" not in c and "0" not in c and "~" not in c]
    print ">>>TOTAL NB OF WORDS", len(corpus)
    corpus = [i for i in corpus if (len(i.split("-")) > minsyll and len(i.split("-")) < maxsyll)]
    print ">>>TOTAL NB OF WORDS", len(corpus)
    corpus = [i for i in corpus if (len(re.sub("-", "", i)) > minlength and len(re.sub("-", "", i)) < maxlength)]
    print ">>>TOTAL NB OF WORDS", len(corpus)
    if match == "length":
        corpus = [clean_syll(c) for c in corpus] #reduce celex to just pronunciation
    print ">>>TOTAL NB OF WORDS", len(corpus)
    if hom == 0: corpus = list(set(corpus))
    print ">>>TOTAL NB OF WORDS", len(corpus)
    f = open("kyle_celexes/" + "_".join([str(i) for i in celex_list]) + ".txt", "w")
    for line in corpus:
        lengths[len(re.sub("-", "", line))] += 1
        f.write(line + "\n")
    f.close()
    print ">>> Word-Length frequencies of lexicon to match"
    for k in lengths.keys():
        print k, lengths[k]
    return corpus
Developer: mahowak, Project: lexicon, Lines of code: 28, Source file: permut_kyle.py

Example 9: ch05_34_num_words_with_1to10_distinct_tags

def ch05_34_num_words_with_1to10_distinct_tags():
  from nltk.corpus import brown
  tagged_words = brown.tagged_words(categories="news")
  # map each word to the set of distinct tags it appears with in the corpus
  dd = nltk.defaultdict(set)
  for w,t in tagged_words:
    dd[w].add(t)
  for i in range(1,10):
    print i, len(filter(lambda x: len(dd[x]) == i, dd.keys()))
  # for the word with greatest number of tags, print out concordance
  # one for each tag
  maxtags = 6
  word = None
  tags = None
  for w in dd.keys():
    if len(dd[w]) >= maxtags:
      word = w
      tags = dd[w]
      break
  poss = []
  pos = 0
  for w, t in tagged_words:
    if w == word and t in tags:
      poss.append((t, pos))
      tags.remove(t)
    pos += 1
  for t, pos in poss:
    print t, " ".join(w for w,t in tagged_words[pos-10:pos+10])
Developer: 447327642, Project: nltk-examples, Lines of code: 28, Source file: ch05_ex.py

Example 10: load

def load(path_to_dict):
    # Although pystardict.Dictionary is a subclass of dict, it doesn't
    # implement quite a few basic methods such as keys() and iterkeys(),
    # so we cannot simply iterate over it.
    raw_dict = Dictionary(path_to_dict)
   
    new_dict = defaultdict(tuple)
    size = float(len(raw_dict))
    count = 0
   
    # This is a workaround to iterate through the keys
    # NB Since the idx stores the offset-size pairs, its keys must be sorted in 
    # order to read the dictionary data linearly and gain the best performance
    for tuple_key in sorted(raw_dict.idx._idx):
        key = ''.join(tuple_key)
        value = raw_dict[key]
        # Convert the value to a list of French translations
        value = re.sub(r'\d\. ?', '', value)
        value = re.split(r', | \n ', value)
        new_dict[key] = value
        # Show a nice progress report
        count += 1
        print 'Loading dictionary...       %5.2f%%\r' % ((count / size) * 100),
        sys.stdout.flush() # this must be flushed to see the latest result
        
    print
    return new_dict
Developer: yochananmkp, Project: clir, Lines of code: 27, Source file: load.py
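This loader depends on the third-party pystardict package and a StarDict dictionary on disk. A hedged usage sketch (the install command and dictionary path are illustrative assumptions, not part of the original project):

# pip install pystardict
from pystardict import Dictionary  # the same import the example relies on

bilingual = load("dict/stardict-eng-fra/eng-fra")  # hypothetical path to a StarDict file set
print(bilingual["hello"])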

Example 11: get_similar_groups

def get_similar_groups(word_list, minimum):
    tri_list = get_all_pairs_similarity(word_list)
    tri_filtered = filter_pairs_similarity(tri_list, minimum)
    neighbor = nltk.defaultdict(set)
    for tri in tri_filtered:
        neighbor[tri[0]].add(tri[1])
        neighbor[tri[1]].add(tri[0])

    def bors_kerbosch_v1(R, P, X, G, C):  # basic Bron-Kerbosch clique enumeration (adapted from an online resource)
        if len(P) == 0 and len(X) == 0:
            if len(R) > 2:
                C.append(sorted(R))
            return    
        for v in P.union(set([])):
            bors_kerbosch_v1(R.union(set([v])), P.intersection(G[v]), X.intersection(G[v]), G, C)
            P.remove(v)
            X.add(v)

    def bors_kerbosch_v2(R, P, X, G, C):  # Bron-Kerbosch with pivoting (adapted from an online resource)
        if len(P) == 0 and len(X) == 0:
            if len(R) > 2:
                C.append(sorted(R))
            return
        (d, pivot) = max([(len(G[v]), v) for v in P.union(X)])                  
        for v in P.difference(G[pivot]):
            bors_kerbosch_v2(R.union(set([v])), P.intersection(G[v]), X.intersection(G[v]), G, C)
            P.remove(v)
            X.add(v)
    C = []
    bors_kerbosch_v2(set([]),set(neighbor.keys()),set([]),neighbor,C)
    return C
Developer: gabhi, Project: new-york-times-summarization, Lines of code: 31, Source file: wordnet.py

Example 12: nMostFreq

def nMostFreq(N, words):
    wCounts = nltk.defaultdict(int)
    nCounts = nltk.defaultdict(int)
    for word in words:
        wCounts[word.lower()] += 1
    for key in wCounts.keys():
        nCounts[wCounts[key]] += 1
    tot = 0
    numStop = []
    while tot<N:
        numStop.append(max(nCounts.keys()))
        tot += nCounts.pop(max(nCounts.keys()))
    revWCounts = getReverseDict(wCounts)  # project helper: maps a count to the words with that count
    wordsN = []
    for num in numStop:
        wordsN.extend(revWCounts[num])
    return wordsN
Developer: Jsalim, Project: NLP-Stuff, Lines of code: 16, Source file: NLPwP_ch5_ex01.py

Example 13: invert_dict

def invert_dict(d):
    from nltk import defaultdict
    inverted_dict = defaultdict(list)
    for key in d:
        for term in d[key]:
            inverted_dict[term].append(key)
    return inverted_dict
Developer: DrDub, Project: icsisumm, Lines of code: 7, Source file: utilities.py
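A quick usage sketch on toy data, showing the direction of the inversion (Python 3 syntax):

d = {"doc1": ["cat", "mat"], "doc2": ["cat"]}
print(invert_dict(d))
# -> {'cat': ['doc1', 'doc2'], 'mat': ['doc1']} (wrapped in a defaultdict)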

Example 14: parseLexicon

def parseLexicon(lex_str):
    primitives = []
    families = {}
    entries = defaultdict(list)
    for line in lex_str.splitlines():
        # Strip comments and leading/trailing whitespace.
        line = reComm.match(line).groups()[0].strip()
        if line == "":
            continue

        if line.startswith(':-'):
            # A line of primitive categories.
            # The first primitive listed is the target category,
            # e.g., :- S, N, NP, VP
            primitives = primitives + [
                prim.strip() for prim in line[2:].strip().split(',')
            ]
        else:
            # Either a family definition, or a word definition
            (ident, sep, catstr) = reLex.match(line).groups()
            (cat, var) = augParseCategory(catstr, primitives, families)
            if sep == '::':
                # Family definition
                # ie, Det :: NP/N
                families[ident] = (cat, var)
            else:
                # Word definition
                # ie, which => (N\N)/(S/NP)
                entries[ident].append(cat)
    return CCGLexicon(primitives[0], primitives, families, entries)
Developer: sneilan, Project: EverythingIveDoneOverTheYears, Lines of code: 30, Source file: lexicon.py

Example 15: _w_b

def _w_b(word, overview):
    pos_forms = defaultdict(list)
    words = word.split(',')
    words = [w.strip() for w in words]
    for pos_str in ['noun', 'verb', 'adj', 'adv']:
        for w in words:
            '''
            if overview:
                pos_forms[pos_str].append(w)
            else:
                for form in _morphy(w, pos=pos_str):
                    if form not in pos_forms[pos_str]:
                        pos_forms[pos_str].append(form)
            '''
            for form in _morphy(w, pos=pos_str):
                if form not in pos_forms[pos_str]:
                    pos_forms[pos_str].append(form)
    body = ''
    for pos,pos_str,name in \
        ((N,'noun','Noun'), (V,'verb','Verb'),
         (ADJ,'adj','Adjective'), (ADV,'adv','Adverb')):
        if pos_str in pos_forms:
            if not overview:
                body += _hlev(3, name) + '\n'
            for w in pos_forms[pos_str]:
                # Not all words of exc files are in the database, so:
                try:
                    body += _collect_all(w, pos)
                except KeyError:
                    pass
    if not body:
        word = None
    return word,body
Developer: DrDub, Project: icsisumm, Lines of code: 33, Source file: browseutil.py


Note: The nltk.defaultdict function examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by their respective authors, and copyright in the source code remains with those authors; consult each project's license before redistributing or reusing the code. Do not reproduce this article without permission.