本文整理汇总了Python中nltk.defaultdict函数的典型用法代码示例。如果您正苦于以下问题：Python defaultdict函数的具体用法？Python defaultdict怎么用？Python defaultdict使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了defaultdict函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: generate_correct_number_pool
def generate_correct_number_pool(n, corpus, homo, pool, eps=1e-6):
    """Match every corpus word with a pre-generated pool word.

    For each word of ``corpus`` a pool word of the same length is drawn at
    random among those whose stored value lies within ``eps`` of the value
    ``NgramModel.evaluate`` gives to the corpus word.  The window is doubled
    until at least one candidate is found.

    :param n: order handed to ``NgramModel``.
    :param corpus: container of words to match.
    :param homo: when equal to 1, a word already picked may be picked again.
    :param pool: iterable of ``"length,word,value"`` lines.
    :param eps: initial half-width of the matching window (used afresh for
        every corpus word).
    :raises ValueError: if no pool word of the required length is left.

    NOTE(review): the visible source ends without a ``return``; the
    accumulators below (``newwords``, ``exist``, ``hom_count``, ``tot_p``)
    are kept so the missing tail can be restored.
    """
    x = NgramModel(n, corpus, 1, homo)
    poolxLengths = nltk.defaultdict(list)
    poolxP = nltk.defaultdict(float)
    for item in pool:
        fields = item.strip().split(",")
        poolxLengths[int(fields[0])].append(fields[1])
        poolxP[fields[1]] = float(fields[2])

    # One {word: value} table per length that actually occurs in the pool
    # (the original only covered lengths 0-19).
    same_length = {}
    for length, words in poolxLengths.items():
        same_length[length] = dict((k, poolxP[k]) for k in words)

    newwords = []
    exist = 0
    hom_count = 0
    tot_p = 0
    for w in corpus:
        candidates = same_length.get(len(w))
        if not candidates:
            # Without this guard the widening loop below could never end.
            raise ValueError(
                "no pool word of length %d left to match %r" % (len(w), w))
        p_match = x.evaluate(w)
        # Start from the caller's eps for every word instead of resetting to
        # a hard-coded constant after the first one.
        window = eps
        # presumably get_range returns the sub-dict of candidates whose value
        # lies in the given interval -- verify against its definition
        sample = get_range(candidates, p_match - window, p_match + window)
        while len(sample) == 0:
            window = window * 2
            sample = get_range(candidates, p_match - window, p_match + window)
        choices = list(sample)  # random.choice needs an indexable sequence
        nw = random.choice(choices)
        while nw in newwords and homo != 1:
            nw = random.choice(choices)
        if nw in corpus:
            exist += 1
        if nw in newwords:
            hom_count += 1
        # A pool word is consumed once used.
        del candidates[nw]
        tot_p += poolxP[nw] + log(1 / float(len(sample)))
        newwords += [nw]
示例2: extractNE
def extractNE(tok, pnouns, dic):
    """Collect candidate names from ``tok`` and a weighted context profile.

    Consecutive tokens that start with an upper-case character (per
    ``STok.isuc``), are listed in ``pnouns`` and do not follow an article are
    gathered into a tentative name.  When the run ends, the name is resolved
    through ``matchNE`` and the surrounding words (bag-of-words model) are
    credited to it with the module-level ``weights``.

    :param tok: sequence of string tokens.
    :param pnouns: container of lower-cased proper nouns.
    :param dic: container of known dictionary words.
    :return: ``[names, nameprofs]`` where ``nameprofs[match][word]`` is the
        accumulated weight of ``word`` near ``match``.

    NOTE(review): source indentation was lost; ``prevword`` is assumed to be
    updated on every token.
    """
    names = LinkedList.LinkedList()
    nameprofs = defaultdict(lambda: defaultdict(int))
    tent = []
    # Usually names are not preceded by an article - filters out some other
    # named entities and some other cases.
    prevword = ""

    def credit(match, index, weight):
        # Only longer, comma-free dictionary words that are not themselves
        # proper nouns contribute to a profile.
        word = tok[index].lower()
        if (word not in pnouns) and len(word) > 3 and ("," not in word) and (word in dic):
            nameprofs[match][word] += weight

    for i, token in enumerate(tok):
        if STok.isuc(token[0]) and (token.lower() in pnouns) and (prevword not in articles):
            tent.append(token)
        else:
            if tent:
                # Matches to the most recent matching occurrence.
                match = matchNE(names, tent)
                # Words following the name, starting at the current token.
                for j, weight in enumerate(weights):
                    if i + j >= len(tok):
                        break
                    credit(match, i + j, weight)
                # Words preceding the name.  Stop at the start of the text:
                # a negative index would silently wrap to the end of tok.
                first = i - len(tent)
                for j, weight in enumerate(weights):
                    if first - j - 1 < 0:
                        break
                    credit(match, first - j - 1, weight)
            tent = []
        prevword = token
    return [names, nameprofs]
示例3: most_informative_features
def most_informative_features(self, n=100):
    """
    Return a list of the 'most informative' features used by this
    classifier.  For the purpose of this function, the
    informativeness of a feature C{(fname,fval)} is equal to the
    highest value of P(fname=fval|label), for any label, divided by
    the lowest value of P(fname=fval|label), for any label.

      max[ P(fname=fval|label1) / P(fname=fval|label2) ]

    Features whose lowest probability is zero are left out.

    @param n: maximum number of features to return.
    @return: up to C{n} C{(fname, fval)} pairs, most informative first.
    """
    # The set of (fname, fval) pairs used by this classifier.
    features = set()
    # The max & min probability associated w/ each (fname, fval)
    # pair.  Maps (fname,fval) -> float.
    maxprob = defaultdict(float)
    minprob = defaultdict(lambda: 1.0)

    for (label, fname), probdist in self._feature_probdist.items():
        for fval in probdist.samples():
            feature = (fname, fval)
            features.add(feature)
            p = probdist.prob(fval)
            maxprob[feature] = max(p, maxprob[feature])
            minprob[feature] = min(p, minprob[feature])
            # A zero minimum would make the ratio unbounded; once it is
            # zero it stays zero, so the feature stays excluded.
            if minprob[feature] == 0:
                features.discard(feature)

    # Smallest min/max ratio first, i.e. largest max/min ratio first.
    ranked = sorted(features,
                    key=lambda feature: minprob[feature] / maxprob[feature])
    return ranked[:n]
示例4: analysis_using_word_and_prev_pos
def analysis_using_word_and_prev_pos():
    """Count tags of a word given the previous tag, on Brown "news".

    Builds ``pos[(previous_tag, word)][tag] -> count`` from the bigrams of
    the tagged corpus and prints the counts recorded for the word "right"
    following a "DET" tag.
    """
    from nltk.corpus import brown

    pos = nltk.defaultdict(lambda: nltk.defaultdict(int))
    brown_news_tagged = brown.tagged_words(categories="news", simplify_tags=True)
    for (_, prev_tag), (word, tag) in nltk.bigrams(brown_news_tagged):
        pos[(prev_tag, word)][tag] += 1
    # Single-argument call form prints the same text on Python 2 and 3.
    print(pos[("DET", "right")])
示例5: NBCtrain
def NBCtrain(labeled_featuresets, estimator=nltk.ELEProbDist):
    """A copy of the nltk.NaiveBayesClassifer.train(...)
    method to allow inspection of what the method is actually doing
    and how long it's taking.

    @param labeled_featuresets: A list of classified featuresets,
        i.e., a list of tuples C{(featureset, label)}.
    @param estimator: callable turning a frequency distribution into a
        probability distribution; it is also called with a C{bins} keyword.
    @return: an C{nltk.NaiveBayesClassifier} built from the estimated
        label and feature distributions.

    NOTE(review): source indentation was lost; the per-featureset progress
    line is assumed to sit in the outer loop.
    """
    label_freqdist = nltk.FreqDist()
    feature_freqdist = nltk.defaultdict(nltk.FreqDist)
    feature_values = nltk.defaultdict(set)
    fnames = set()

    print("There are " + str(len(labeled_featuresets)) + " labeled featuresets")

    # Count up how many times each feature value occured, given
    # the label and featurename.
    print("Counting feature value occurence")
    for i, (featureset, label) in enumerate(labeled_featuresets):
        label_freqdist.inc(label)
        for fname, fval in featureset.items():
            # Increment freq(fval|label, fname)
            feature_freqdist[label, fname].inc(fval)
            # Record that fname can take the value fval.
            feature_values[fname].add(fval)
            # Keep a list of all feature names.
            fnames.add(fname)
        print("At featureset..." + str(i))

    # If a feature didn't have a value given for an instance, then
    # we assume that it gets the implicit value 'None.'  This loop
    # counts up the number of 'missing' feature values for each
    # (label,fname) pair, and increments the count of the fval
    # 'None' by that amount.
    for label in label_freqdist:
        num_samples = label_freqdist[label]
        for fname in fnames:
            count = feature_freqdist[label, fname].N()
            feature_freqdist[label, fname].inc(None, num_samples - count)
            feature_values[fname].add(None)

    # Create the P(label) distribution
    print("Making the P(label) distribution...")
    label_probdist = estimator(label_freqdist)

    # Create the P(fval|label, fname) distribution
    print("Making the P(fval|label, fname) distribution from " + str(
        len(feature_freqdist)
    ) + " feature freqs...")
    feature_probdist = {}
    for (label, fname), freqdist in feature_freqdist.items():
        probdist = estimator(freqdist, bins=len(feature_values[fname]))
        feature_probdist[label, fname] = probdist

    return nltk.NaiveBayesClassifier(label_probdist, feature_probdist)
示例6: word_valency
def word_valency():
    """Print noun-prep-noun patterns that occur with both attachment types.

    Groups the verbs of the ``ppattach`` training entries by the key
    ``noun1-prep-noun2`` and by attachment label, then prints, in key order,
    every key recorded under more than one attachment label together with
    its sorted "N" and "V" verb lists.
    """
    table = nltk.defaultdict(lambda: nltk.defaultdict(set))
    entries = nltk.corpus.ppattach.attachments("training")
    for entry in entries:
        key = entry.noun1 + "-" + entry.prep + "-" + entry.noun2
        table[key][entry.attachment].add(entry.verb)

    for key in sorted(table):
        if len(table[key]) > 1:
            # Same text as the former multi-argument print statement.
            print("%s N: %s V: %s" % (
                key, sorted(table[key]["N"]), sorted(table[key]["V"])))
示例7: generate_correct_number
def generate_correct_number(corpus, homo, lm):
    """Generate as many words per ``get_cv`` pattern as ``corpus`` contains.

    Words are drawn from ``lm.generate()`` until, for every pattern returned
    by ``get_cv`` on the corpus words, the same number of generated words
    has been accepted.

    :param corpus: container of hyphen-separated words to mirror.
    :param homo: when equal to 1 duplicates are accepted; otherwise a word is
        rejected if it, or its hyphen-free form, was already accepted.
    :param lm: object whose ``generate()`` returns an iterable of words.
    :return: list of accepted words, in order of acceptance.

    Prints how many accepted words also occur in ``corpus``.  Returns as
    soon as the last quota is filled (the original needed one more generated
    word before noticing).
    """
    # Quota of words still to be produced for every pattern.
    lengths_needed = nltk.defaultdict(int)
    for item in corpus:
        lengths_needed[get_cv(item)] += 1
    remaining = sum(lengths_needed.values())

    real_words = set(corpus)
    newwords = []
    seen = set()        # accepted words, as generated
    seen_flat = set()   # accepted words with hyphens removed
    exist = nltk.defaultdict(int)

    while remaining > 0:
        for w in lm.generate():
            cv = get_cv(w)
            # .get() avoids growing the table with patterns never needed.
            if lengths_needed.get(cv, 0) <= 0:
                continue
            flat = re.sub("-", "", w)
            if homo != 1 and (w in seen or flat in seen_flat):
                continue
            lengths_needed[cv] -= 1
            remaining -= 1
            newwords.append(w)
            seen.add(w)
            seen_flat.add(flat)
            if w in real_words:
                exist[len(w)] += 1
            if remaining == 0:
                break

    print("nb of real words %s" % sum(exist.values()))
    return newwords
示例8: extract_real_lex
def extract_real_lex(path, lemma, language, mono, hom, minlength, maxlength, minsyll, maxsyll, match, celex_list):
    """Build, filter and save the real lexicon read from CELEX.

    The corpus returned by ``build_celex_corpus`` is reduced step by step,
    printing the remaining word count after each step:

    * keep entries whose second field is greater than 0 and reduce them to
      ``clean_word`` of their first field;
    * drop words containing "c", "q", "0" or "~", then apply
      ``celex_diphthong_sub``;
    * keep words with strictly between ``minsyll`` and ``maxsyll``
      hyphen-separated parts;
    * keep words whose hyphen-free length is strictly between ``minlength``
      and ``maxlength``;
    * apply ``clean_syll`` when ``match == "length"``;
    * remove duplicates when ``hom == 0``.

    The result is written, one word per line, to
    ``kyle_celexes/<celex_list joined by "_">.txt`` and a table of
    hyphen-free word lengths is printed.

    :return: the filtered list of words.

    NOTE(review): source indentation was lost; the count printed after the
    ``match == "length"`` step is assumed to be unconditional.
    """
    celex_path = get_celex_path(path, lemma, language)
    lengths = nltk.defaultdict(int)
    print(celex_path)
    corpus = build_celex_corpus(celex_path, language, lemma, mono)
    print(">>>TOTAL NB OF WORDS %s" % len(corpus))

    corpus = [c for c in corpus if c[1] > 0]  # freq greater than 0
    corpus = [clean_word(c[0]) for c in corpus]  # reduce celex to just pronunciation
    corpus = [celex_diphthong_sub(c) for c in corpus
              if "c" not in c and "q" not in c and "0" not in c and "~" not in c]
    print(">>>TOTAL NB OF WORDS %s" % len(corpus))

    corpus = [i for i in corpus if minsyll < len(i.split("-")) < maxsyll]
    print(">>>TOTAL NB OF WORDS %s" % len(corpus))

    corpus = [i for i in corpus if minlength < len(i.replace("-", "")) < maxlength]
    print(">>>TOTAL NB OF WORDS %s" % len(corpus))

    if match == "length":
        corpus = [clean_syll(c) for c in corpus]
    print(">>>TOTAL NB OF WORDS %s" % len(corpus))

    if hom == 0:
        corpus = list(set(corpus))
    print(">>>TOTAL NB OF WORDS %s" % len(corpus))

    out_name = "kyle_celexes/" + "_".join([str(i) for i in celex_list]) + ".txt"
    # The with-block closes the file even if a write fails.
    with open(out_name, "w") as f:
        for line in corpus:
            lengths[len(line.replace("-", ""))] += 1
            f.write(line + "\n")

    print(">>> Word-Length frequencies of lexicon to match")
    for k in lengths.keys():
        print("%s %s" % (k, lengths[k]))
    return corpus
示例9: ch05_34_num_words_with_1to10_distinct_tags
def ch05_34_num_words_with_1to10_distinct_tags():
    """Report how many Brown "news" words carry 1 to 10 distinct tags.

    Prints one ``<number of tags> <number of words>`` line for 1 through 10.
    Then, for the word with the greatest number of distinct tags, prints one
    line per tag: the tag followed by the words around the first position
    where the word occurs with that tag.
    """
    from nltk.corpus import brown

    tagged_words = brown.tagged_words(categories="news")

    # word -> set of tags seen with it
    dd = nltk.defaultdict(set)
    for w, t in tagged_words:
        dd[w].add(t)

    # number of distinct tags -> number of words
    histogram = nltk.defaultdict(int)
    for tag_set in dd.values():
        histogram[len(tag_set)] += 1
    for i in range(1, 11):  # 1 to 10 inclusive
        print("%s %s" % (i, histogram[i]))

    if not dd:
        return

    # For the word with greatest number of tags, print out concordance,
    # one for each tag.
    word = max(dd, key=lambda w: len(dd[w]))
    # Work on a copy so the tag table itself is left untouched.
    pending = set(dd[word])
    poss = []
    for pos, (w, t) in enumerate(tagged_words):
        if w == word and t in pending:
            poss.append((t, pos))
            pending.remove(t)
            if not pending:
                break

    for t, pos in poss:
        # Clamp the start: a negative slice start would count from the end.
        context = tagged_words[max(0, pos - 10):pos + 10]
        print("%s %s" % (t, " ".join(w for w, _ in context)))
示例10: load
def load(path_to_dict):
    """Load a StarDict dictionary into a ``defaultdict`` of word lists.

    Every entry is read through ``Dictionary``; numbering such as ``"1. "``
    is stripped from its text and the remainder is split on ``", "`` and
    ``" \\n "``.  A progress percentage is written to stdout while loading.

    :param path_to_dict: path handed to ``Dictionary``.
    :return: ``defaultdict(tuple)`` mapping each key to its list of words.
    """
    # Although pystardict.Dictionary is a child class of dict, it doesn't
    # implement quite a few important basic method such as keys(), iterkeys()
    # and etc, so we cannot just simply iterate through it.
    raw_dict = Dictionary(path_to_dict)
    new_dict = defaultdict(tuple)
    size = float(len(raw_dict))

    # This is a workaround to iterate through the keys
    # NB Since the idx stores the offset-size pairs, its keys must be sorted in
    # order to read the dictionary data linearly and gain the best performance
    for count, tuple_key in enumerate(sorted(raw_dict.idx._idx), 1):
        key = ''.join(tuple_key)
        value = raw_dict[key]
        # Convert value to a list of words
        value = re.sub(r'\d\. ?', '', value)
        new_dict[key] = re.split(r', | \n ', value)

        # Show a nice progress report; the trailing "\r" rewinds the line so
        # the next report overwrites this one.
        sys.stdout.write('Loading dictionary... %5.2f%%\r' % ((count / size) * 100))
        sys.stdout.flush()  # this must be flushed to see the latest result
    sys.stdout.write('\n')
    return new_dict
示例11: get_similar_groups
def get_similar_groups(word_list, minimum):
    """Return groups of more than two mutually similar words.

    The pairs produced by ``get_all_pairs_similarity`` and kept by
    ``filter_pairs_similarity`` are read as undirected edges (first two
    fields of each item).  Every maximal clique of that graph with more than
    two members is reported as a sorted list.

    :param word_list: words handed to ``get_all_pairs_similarity``.
    :param minimum: threshold handed to ``filter_pairs_similarity``.
    :return: list of sorted word lists.
    """
    tri_list = get_all_pairs_similarity(word_list)
    tri_filtered = filter_pairs_similarity(tri_list, minimum)

    neighbor = nltk.defaultdict(set)
    for tri in tri_filtered:
        neighbor[tri[0]].add(tri[1])
        neighbor[tri[1]].add(tri[0])

    def bron_kerbosch(R, P, X, G, C):
        # Bron-Kerbosch with pivoting: R is the clique built so far, P the
        # vertices that may still extend it, X those already explored.
        if not P and not X:
            if len(R) > 2:
                C.append(sorted(R))
            return
        # Pivot on the vertex with the most neighbours (ties: largest vertex).
        (d, pivot) = max([(len(G[v]), v) for v in P.union(X)])
        # difference() returns a new set, so P can be mutated inside the loop.
        for v in P.difference(G[pivot]):
            bron_kerbosch(R.union([v]), P.intersection(G[v]), X.intersection(G[v]), G, C)
            P.remove(v)
            X.add(v)

    C = []
    bron_kerbosch(set(), set(neighbor.keys()), set(), neighbor, C)
    return C
示例12: nMostFreq
def nMostFreq(N, words):
    """Return the most frequent words (case-insensitive) of ``words``.

    Frequencies are taken from the highest downwards until at least ``N``
    distinct words are covered, or no frequency is left.  All words sharing
    a selected frequency are included, so more than ``N`` words may be
    returned.

    :param N: minimum number of distinct words wanted.
    :param words: iterable of strings.
    :return: list of words, grouped by decreasing frequency.

    NOTE(review): the visible source built ``wordsN`` but had no ``return``;
    returning it is assumed to be the intent.
    """
    # lower-cased word -> occurrences
    wCounts = nltk.defaultdict(int)
    for word in words:
        wCounts[word.lower()] += 1

    # occurrences -> number of distinct words with that many occurrences
    nCounts = nltk.defaultdict(int)
    for count in wCounts.values():
        nCounts[count] += 1

    tot = 0
    numStop = []
    # Also stop when the frequencies run out: max() of nothing would raise.
    while tot < N and nCounts:
        top = max(nCounts)
        numStop.append(top)
        tot += nCounts.pop(top)

    # presumably maps each count to the words having it -- verify against
    # getReverseDict
    revWCounts = getReverseDict(wCounts)
    wordsN = []
    for num in numStop:
        wordsN.extend(revWCounts[num])
    return wordsN
示例13: invert_dict
def invert_dict(d):
    """Invert a mapping from keys to collections of terms.

    :param d: mapping whose values are iterables of hashable terms.
    :return: ``defaultdict(list)`` mapping every term to the keys under
        which it appeared, in iteration order of ``d``.  A key is listed
        once per occurrence of the term in its collection.
    """
    # defaultdict lives in the standard library; no need to reach it
    # through nltk.
    from collections import defaultdict

    inverted_dict = defaultdict(list)
    for key in d:
        for term in d[key]:
            inverted_dict[term].append(key)
    return inverted_dict
示例14: parseLexicon
def parseLexicon(lex_str):
    """Build a ``CCGLexicon`` from its textual description.

    Each line, once comments and surrounding whitespace are stripped, is one
    of:

    * ``:- S, N, NP, VP`` -- primitive categories; the first primitive
      declared overall is passed to ``CCGLexicon`` as its first argument;
    * ``Det :: NP/N`` -- a family definition;
    * ``which => (N\\N)/(S/NP)`` -- a word definition (any separator other
      than ``::``).

    :param lex_str: the whole lexicon as one string.
    :return: ``CCGLexicon(primitives[0], primitives, families, entries)``.
    """
    primitives = []
    families = {}
    entries = defaultdict(list)
    for raw_line in lex_str.splitlines():
        # Strip comments and leading/trailing whitespace.
        line = reComm.match(raw_line).groups()[0].strip()
        if not line:
            continue

        if line.startswith(':-'):
            # A line of primitive categories.
            # The first line is the target category
            # ie, :- S, N, NP, VP
            primitives = primitives + [
                prim.strip() for prim in line[2:].strip().split(',')
            ]
            continue

        # Either a family definition, or a word definition
        (ident, sep, catstr) = reLex.match(line).groups()
        (cat, var) = augParseCategory(catstr, primitives, families)
        if sep == '::':
            # Family definition
            # ie, Det :: NP/N
            families[ident] = (cat, var)
        else:
            # Word definition
            # ie, which => (N\N)/(S/NP)
            entries[ident].append(cat)
    return CCGLexicon(primitives[0], primitives, families, entries)
示例15: _w_b
def _w_b(word, overview):
    """Build the result body for a comma-separated list of words.

    For each part of speech the distinct forms returned by ``_morphy`` for
    every word are collected; the body is the concatenation, per part of
    speech that has at least one form, of an optional heading and of
    ``_collect_all`` for each form.

    :param word: one or more words separated by commas.
    :param overview: when true, the per-part-of-speech headings are omitted.
    :return: ``(word, body)``; ``word`` is replaced by ``None`` when the
        body is empty.
    """
    pos_forms = defaultdict(list)
    words = [w.strip() for w in word.split(',')]
    for pos_str in ['noun', 'verb', 'adj', 'adv']:
        for w in words:
            for form in _morphy(w, pos=pos_str):
                # The key is only created once a form exists, which is what
                # the membership test below relies on.
                if form not in pos_forms[pos_str]:
                    pos_forms[pos_str].append(form)

    parts = []
    for pos, pos_str, name in \
            ((N, 'noun', 'Noun'), (V, 'verb', 'Verb'),
             (ADJ, 'adj', 'Adjective'), (ADV, 'adv', 'Adverb')):
        if pos_str in pos_forms:
            if not overview:
                parts.append(_hlev(3, name) + '\n')
            for w in pos_forms[pos_str]:
                # Not all words of exc files are in the database, so:
                try:
                    parts.append(_collect_all(w, pos))
                except KeyError:
                    pass
    body = ''.join(parts)
    if not body:
        word = None
    return word, body