This article collects typical usage examples of the dict function from Python's nltk.corpus.cmudict module. If you are unsure what the dict function does, how to call it, or what it looks like in real code, the curated examples below should help.
Fifteen code examples of the dict function are shown below, ordered by popularity.
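All of the examples on this page lean on the same structure: cmudict.dict() returns a Python dict mapping lowercase words to lists of possible pronunciations, where each pronunciation is a list of ARPABET phonemes and vowels carry a trailing stress digit (0, 1, or 2). A minimal sketch (assumes the cmudict corpus has been downloaded, e.g. via nltk.download('cmudict')):

from nltk.corpus import cmudict

d = cmudict.dict()
print(d["tomato"])
# [['T', 'AH0', 'M', 'EY1', 'T', 'OW2'], ['T', 'AH0', 'M', 'AA1', 'T', 'OW2']]

Counting the phonemes that end in a digit gives a syllable count, a trick most of the examples below rely on.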
Example 1: __init__
def __init__(self, dataset="picasso2", basedir="parsed_data"):
self.dataset = dataset
self.basedir = basedir
filename = "%s/%s/source/%s" % (basedir,dataset,dataset)
self.debug("poemparser:init:dataset parsing '%s'..." % filename)
with open("pyParser/english_words.txt") as word_file:
self.english_words = set(word.strip().lower() for word in word_file)
# Open and analyze the text data.
self.unknownWords = {}
self.iffyWords = {}
self.allmatch = {}
self.alltokens = self.openTokens(filename)
self.parsedTokens = [token for token in self.alltokens[0] if token != '-']
self.replacedTokens = [token for token in self.alltokens[1] if token != '-']
self.fullTokens = [token for token in self.alltokens[2] if token != '-']
self.tokens = self.parsedTokens
self.loweredTokens = [token.lower() for token in self.replacedTokens]
self.pos_tags = nltk.pos_tag(self.replacedTokens)
self.text = nltk.Text(self.tokens)
self.dict = cmudict.dict()
self.lastspeed = 0
self.midiindex = 0
self.setMIDISettings(12)
self.debug("poemparser:init:words %s" % self.fullTokens)
self.debug("poemparser:init:tokens %s" % self.tokens)
self.debug("poemparser:init:text %s" % self.text)
Example 2: approx_nsyl
def approx_nsyl(word):
"""Credit - Jason Sundram, http://runningwithdata.com/post/3576752158/w
Return the max syllable count in the case of multiple pronunciations"""
    d = cmudict.dict()
    word = word.lower()
    if word not in d:
        return 0
    return max([len([y for y in x if y[-1].isdigit()]) for x in d[word]])
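Example 2 rebuilds the dictionary on every call, which is slow. A minimal cached variant (a sketch, not from the original source) loads the corpus once at module level:

from nltk.corpus import cmudict

_CMU = cmudict.dict()  # load once; building this dict per call is expensive

def approx_nsyl_cached(word):
    """Return the max syllable count across pronunciations, or 0 if unknown."""
    prons = _CMU.get(word.lower())
    if not prons:
        return 0
    return max(sum(1 for ph in pron if ph[-1].isdigit()) for pron in prons)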
Example 3: reset_country_codes_to_emoflags
def reset_country_codes_to_emoflags(cc_path='country_codes.txt',
irange=ET.FLAGS_RANGE, charset='utf-8'):
'''
Using a country code dict, set the name and syllable fields
in a copy of emo_tuples.
'''
cmu_prons = cmudict.dict() # get the CMU Pronouncing Dict
cc_dict = load_country_codes(cc_path)
for tup in ET.EMO_TUPLES[irange.start:irange.stop]:
cc2 = tup[ET.INDEX_ALTERNATIVES][0].strip(':').upper()
# print(cc2, ' ', end='')
monos, polys, names = [], [], [cc2]
names.extend(nm for nm in tup[ET.INDEX_POLYSYLLABLES] if len(nm) > 2)
try:
names.extend(cc_dict[cc2])
# print(names, file=sys.stderr)
except KeyError:
print("{} missing {}\n\tusing: {}".format(
cc2, tup, names), file=sys.stderr)
for name in set(names):
if sylc.syl_count(cmu_prons, name) == 1:
monos.append(name)
else:
polys.append(name)
tupal = list(tup)
tupal[ET.INDEX_WORDSYLLABLES] = monos
tupal[ET.INDEX_POLYSYLLABLES] = polys
ret = tuple(tupal)
print(" {},".format(ret), file=sys.stdout)
print()
Example 4: compile_meter_list
def compile_meter_list(self, new_words, verbose=True):
    # simplifies and compiles CMU corpus info into a nested list
iambic = cmudict.dict() # connect to cmu corpus, called iambic
big_list = [] # list to collect all the different versions of words and their meter
for word in new_words: # get word from list of clean words
syl_num = sylco([word])
word_n_versions_list = [] # list has each word and the different versions
word_n_versions_list.append(word) # add word
versions_list = [] # list of all diff versions
try: # if word is in corpus
for n,x in enumerate(iambic[word.lower()]): # get versions for each word
version = [] # list for each version
version.append(word+str(n)) # add word+version
meter_list = [] # list holds word version's meter
for y in x: # for word in cmu-dict sent
for char in y: # for character in word
                        if char.isdigit():  # if the char is a number
meter_list.append(int(char)) # add number to meter
version.append(meter_list) # add meter to the word version
versions_list.append(version) # add all the versions to one list
word_n_versions_list.append(versions_list) # add list of diff versions to word and versions list
big_list.append(word_n_versions_list)
        except KeyError:  # if word isn't in corpus
version = [] # empty version
version.append(word+str(0)) # add word1
meter_list = [] # empty meter list
if len(syl_num) == 1:
for syl in range(syl_num[0]): # for each syllable...
                    meter_list.append(-1)  # add -1 (unknown stress) to meter_list
version.append(meter_list) # add empty meter list to version
versions_list.append(version) # add version w/ word1 to versions list
word_n_versions_list.append(versions_list) # add list of diff versions to word and versions list
big_list.append(word_n_versions_list) # adds word and versions to big list
return big_list
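To make the nested structure concrete: for the input ['cat'], whose only CMU pronunciation is ['K', 'AE1', 'T'], the method returns roughly

[['cat', [['cat0', [1]]]]]

that is, a list of [word, [[word+version, meter], ...]] entries, where each meter is the sequence of stress digits for that pronunciation.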
Example 5: __compliant_haiku
def __compliant_haiku(self, haiku_source):
"""Ensure that newlines remain and all
other punctuation has been stripped"""
"""Ensure that newlines remain and all
other punctuation has been stripped"""
    pron_dict = cmudict.dict()
haiku_lines = haiku_source.splitlines()
syllables = []
for line in haiku_lines:
if line == "":
continue
        sal = []
        for word in line.split(" "):
            sal.append(len([x for x in pron_dict[word.lower()][0] if x[-1].isdigit()]))
syllables.append(sum(sal))
pattern = [5,7,5]
if len(syllables) % 3 == 0:
while len(syllables) > 0:
if syllables[:3] == pattern:
for x in range(2,-1,-1):
syllables.pop(x)
else:
return False
else:
return False
return True
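Note that Example 5 raises a KeyError for any word missing from CMUdict. A defensive variant of the per-word lookup (a sketch under that assumption, not from the original source):

def syllables_or_none(pron_dict, word):
    # Return None for out-of-vocabulary words instead of raising KeyError.
    prons = pron_dict.get(word.lower())
    if prons is None:
        return None
    return len([ph for ph in prons[0] if ph[-1].isdigit()])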
Example 6: __init__
def __init__(self,text):
# Initialize vars
self.sent_count = 0
self.word_count = 0
self.syll_count = 0
self.cmu = cmudict.dict()
self.processText(text)
Example 7: make_cmu_wordlist
def make_cmu_wordlist():
"""
Strip the CMU Pronunciation Dictionary of accent marks.
Add '$' to the end of strings (for markov chain use).
Pickle and dump to 'cmu.p'.
"""
d = cmudict.dict()
pronunciation_list = d.values()
edited_list = []
for entry in pronunciation_list:
for word in entry:
edited_word = ["#"]
for i in xrange(len(word)):
                # remove stress digits
edited_word.append(word[i].rstrip('0123456789'))
#Use '$' to mark the end of words
edited_word.append('$')
edited_list.append(edited_word)
# with open('wordlists/cmu.p', 'w') as outfile:
# pickle.dump(edited_list, outfile)
return edited_list
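To see the transformation on a single entry, one pronunciation of "hello" would be rewritten like this (illustrative only):

word = ['HH', 'AH0', 'L', 'OW1']  # one cmudict pronunciation of "hello"
edited = ['#'] + [ph.rstrip('0123456789') for ph in word] + ['$']
print(edited)  # ['#', 'HH', 'AH', 'L', 'OW', '$']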
Example 8: group_rhyming_tweets
def group_rhyming_tweets(filtered_tweet_list):
"""groups rhyming tweets into lists, then returns a list containing those lists. lists are sorted so that the list with the most rhyming words
is first in the list."""
copy_filtered_tweet_list = list(filtered_tweet_list)
dictionary = cmudict.dict()
grouped_rhyming_tweets = []
index = 0
while (
index < len(copy_filtered_tweet_list) - 1
): # don't need to check last element for rhymes against other words b/c all pairs of words checked already by that point
rhyme_list = [copy_filtered_tweet_list[index]]
i = index + 1
while i < len(copy_filtered_tweet_list):
if (
do_sentences_rhyme(copy_filtered_tweet_list[index], copy_filtered_tweet_list[i], dictionary)
or sentence_rhyme_score(copy_filtered_tweet_list[index], copy_filtered_tweet_list[i]) > 4
):
rhyme_list.append(copy_filtered_tweet_list[i])
copy_filtered_tweet_list.pop(i)
i = i - 1
i = i + 1
rhyme_list = list(set(rhyme_list)) # remove non-unique entries by converting to a set and back again
grouped_rhyming_tweets.append(rhyme_list)
index = index + 1
# grouped_rhyming_tweets = sorted(grouped_rhyming_tweets, key = len, reverse = True)
grouped_rhyming_tweets = [i for i in grouped_rhyming_tweets if len(i) > 1]
return grouped_rhyming_tweets
Example 9: __init__
def __init__(self):
# generate n2w
self.n2w = gen_n2w()
# syllable dict
self.cmu = cmudict.dict()
Example 10: on_load
def on_load(self):
print "Loading: " + self.__class__.__name__
wd = self.context.getWorkingDir()
nltk.data.path.append(wd + "nltk_data")
self.d = cmudict.dict()
Example 11: parse_sentence
def parse_sentence(sent, syl=partial(syllabify, English),
pron_dict=cmudict.dict()):
sent = sent.strip()
if not len(sent):
return
tokens = list(filter(len, map(preprocess, sent.split())))
phonemes = (map(syl, pron_dict[t]) for t in tokens)
nsyllables = set()
final_sounds = set()
for words in product(*phonemes):
if not len(words):
return
# Count the number of syllables and extract the stress pattern.
stress, syllables = zip(*((s[0], s[1:]) for w in words for s in w))
# Compute the final sound.
final_syllable = syllables[-1]
if len(final_syllable[2]):
final_sound = "_".join(map("_".join, final_syllable[1:]))
elif len(final_syllable[0]):
final_sound = "{0}_{1}".format(final_syllable[0][-1],
"_".join(final_syllable[1]))
else:
final_sound = "_".join(final_syllable[1])
# Update the possible versions for this sentence.
nsyllables.add(len(stress))
final_sounds.add(final_sound + "_{0}".format(int(stress[-1] > 0)))
return nsyllables, final_sounds, [tokens[-1]]
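One detail worth noting in Example 11: the default argument pron_dict=cmudict.dict() is evaluated once, when the function is defined, so the corpus is loaded a single time and shared by every call. The same effect written explicitly (a sketch):

from nltk.corpus import cmudict

PRON_DICT = cmudict.dict()  # loaded once at import time

def lookup(word, pron_dict=PRON_DICT):
    # The default is bound at definition time, not per call.
    return pron_dict.get(word.lower(), [])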
Example 12: stress
def stress(self, bysentence=False):
    """
    Extract the stress pattern of each word in self.text,
    optionally sentence by sentence.
    """
vowels = ['A','E','I','O','U']
possible_stresses = ['1','2','0']
totaldic = cmudict.dict()
def gen_stress(stripped_text):
stress_list = []
for word in stripped_text.lower().split():
try:
stress = str()
phonemized = totaldic[word][0]
for phoneme in phonemized:
for stresser in possible_stresses:
if stresser in phoneme:
stress += stresser
                for index, sound in enumerate(phonemized[-2:]):
for vowel in vowels:
if vowel in sound:
stress_list.append([word,stress,[index, sound],phonemized,len(phonemized)])
except KeyError:
# print("{} couldn't be found".format(word))
pass
return stress_list
if bysentence:
        sentences = PunktSentenceTokenizer().tokenize(self.text)
stress_by_sentence = [sentence.translate(string.maketrans("",""), string.punctuation) for sentence in sentences]
return [gen_stress(sentence) for sentence in stress_by_sentence]
elif not bysentence:
stress_total = self.text.translate(string.maketrans("",""), string.punctuation)
return gen_stress(stress_total)
Example 13: fix_db
def fix_db():
print "* Executing database FIX procedure..."
# connect to db
mongodb_url = os.getenv("OPENSHIFT_MONGODB_DB_URL")
client = pym.MongoClient(mongodb_url)
db = client["shalk"]
coll = db["ngrams"]
base_data_dir = os.getenv("OPENSHIFT_DATA_DIR")
if not base_data_dir:
base_data_dir = "../data/"
# initialize cmu dict
nltk.data.path = ["{0}nltk/".format(base_data_dir)]
cdict = cmudict.dict()
count = 0
upcount = 0
mod = 100
# iterate over all docs that need fixing
orlist = [
{"syllables": {"$exists": False}},
{"rand": {"$exists": False}},
{"type": {"$exists": False}},
{"rhyme": {"$exists": False}},
]
ngrams = coll.find({"$or": orlist})
total = ngrams.count()
for ngram in ngrams:
upngram = False
lastword = get_last_word(ngram)
if "syllables" not in ngram:
upngram = True
ngram["syllables"] = count_syllables(lastword, cdict)
if "rand" not in ngram:
upngram = True
ngram["rand"] = random.random()
if "rhyme" not in ngram:
upngram = True
ngram["rhyme"] = get_rhyme(lastword, cdict)
if not upngram:
count += 1
continue
update_ngram(ngram, db)
upcount += 1
count += 1
if count % mod == 0:
print "- {0} out of {1} analysed! Docs updated: {2}".format(count, total, upcount)
sys.stdout.flush()
print "* Database FIX procedure finished!"
Example 14: load_pronunciations
def load_pronunciations(pronun_dictionary_name='cmudict', stress='unstressed'):
""" note that we only support cmudict from nltk """
if stress not in STRESS_OPTIONS:
raise TypeError
    try:
        cmu = cmudict.dict()
    except (LookupError, AttributeError):
        cmu = load_cmu_pickle()
    return cmu
Example 15: does_rhyme_unit_test
def does_rhyme_unit_test():
dictionary = cmudict.dict()
print does_rhyme('lol','bol',2,dictionary)
print does_rhyme('cat','dog',2,dictionary)
print does_rhyme('cat','bat',2,dictionary)
    print does_rhyme('cat','tot',2,dictionary)
print does_rhyme('hello','yellow',2,dictionary)
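The does_rhyme helper itself is not shown on this page; a common approach, and only a guess at what the original implements, is to compare the final level phonemes of each pronunciation:

def does_rhyme(word1, word2, level, dictionary):
    # Hypothetical reconstruction: two words rhyme if any pair of their
    # pronunciations ends in the same `level` phonemes.
    prons1 = dictionary.get(word1.lower(), [])
    prons2 = dictionary.get(word2.lower(), [])
    return any(p1[-level:] == p2[-level:] for p1 in prons1 for p2 in prons2)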