本文整理汇总了 Python 中 nltk.corpus.names.words 方法的典型用法代码示例。如果您正苦于以下问题:Python names.words 方法的具体用法?Python names.words 怎么用?Python names.words 使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 nltk.corpus.names 的用法示例。
在下文中一共展示了 names.words 方法的 15 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: __init__
# 需要导入模块: from nltk.corpus import names [as 别名]
# 或者: from nltk.corpus.names import words [as 别名]
def __init__(self):
    """Train a Naive Bayes gender classifier on the NLTK names corpus.

    Every name in ``male.txt`` and ``female.txt`` is paired with its gender
    label, the pairs are shuffled, each name is converted with
    ``self.feature`` and the resulting featuresets are used to train the
    classifier stored on ``self._classifier``.
    """
    labelled = []
    # Male names are collected first, then female names, before shuffling.
    for fileid, gender in (('male.txt', 'male'), ('female.txt', 'female')):
        labelled.extend((name, gender) for name in names.words(fileid))
    random.shuffle(labelled)

    featuresets = []
    for name, gender in labelled:
        featuresets.append((self.feature(name), gender))
    self._classifier = nltk.NaiveBayesClassifier.train(featuresets)
示例2: demo
# 需要导入模块: from nltk.corpus import names [as 别名]
# 或者: from nltk.corpus.names import words [as 别名]
def demo():
    """Train an SVM gender classifier on the names corpus and print a report.

    The names corpus is labelled, shuffled with a fixed seed and split into
    a 500-example test set and a training set made of the remainder.  The
    function prints the training size, the classifier's internal mappings,
    one worked example and the overall accuracy on the test set.

    Every ``print`` call takes a single pre-formatted string so the function
    parses and prints the same text on both Python 2 and Python 3.
    """
    import random

    from nltk.classify import accuracy
    from nltk.corpus import names

    def gender_features(word):
        """Describe a name by its last and second-to-last letters."""
        return {'last_letter': word[-1], 'penultimate_letter': word[-2]}

    # Keep the labelled list under its own name instead of rebinding
    # ``names``, which would hide the corpus reader imported above.
    labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                     [(name, 'female') for name in names.words('female.txt')])

    random.seed(60221023)
    random.shuffle(labeled_names)

    featuresets = [(gender_features(n), g) for (n, g) in labeled_names]
    # The first 500 shuffled examples are held out for testing.
    train_set, test_set = featuresets[500:], featuresets[:500]

    print('--- nltk.classify.svm demo ---')
    print('Number of training examples: %s' % (len(train_set),))
    classifier = SvmClassifier.train(train_set)
    print('Total SVM dimensions: %s' % (len(classifier._svmfeatureindex),))
    print('Label mapping: %s' % (classifier._labelmapping,))

    print('--- Processing an example instance ---')
    print('Reference instance: %s' % (labeled_names[0],))
    print('NLTK-format features:\n ' + str(test_set[0]))
    print('SVMlight-format features:\n ' + str(map_instance_to_svm(test_set[0], classifier._labelmapping, classifier._svmfeatureindex)))
    distr = classifier.prob_classify(test_set[0][0])
    print('Instance classification and confidence: %s %s'
          % (distr.max(), distr.prob(distr.max())))

    print('--- Measuring classifier performance ---')
    print('Overall accuracy: %s' % (accuracy(classifier, test_set),))
示例3: maximum_overlap
# 需要导入模块: from nltk.corpus import names [as 别名]
# 或者: from nltk.corpus.names import words [as 别名]
def maximum_overlap(w, i, text):
"""Return the candidate expansion with the highest overlap.

NOTE(review): this listing lost its indentation when it was extracted, so
the block nesting below cannot be recovered with certainty; restore it
from the upstream source before running the code.

Candidates are taken from ``tag_matches(i, text)`` and scored with
``overlap(i, cand, text)``; values looked up in ``brown_common`` are then
used to choose among the top-scoring candidates.  ``w`` is returned
unchanged when no candidate ends up selected.
"""
best = 0
current = []
curr = ''
# Candidates produced by tag_matches for this (i, text) pair.
t_matches = tag_matches(i, text)
if t_matches:
# A lone candidate is returned only when it is present in brown_common;
# otherwise w is returned (assuming the `else` pairs with that test).
if len(t_matches) == 1:
if t_matches[0] in brown_common:
return t_matches[0]
else:
return w
# First pass: shortlist the candidate(s) with the highest overlap score.
# A new best must also be in `words`; equal non-zero scores are appended.
for cand in t_matches:
olap = overlap(i, cand, text)
if olap > best and cand in words:
best = olap
current = [cand]
elif olap == best and best != 0:
current.append(cand)
# `best` is reset and reused for the second pass over the shortlist.
best = 0
for c in current:
# brown_common value for the candidate, 0 when it has no entry.
if c in brown_common:
freq = brown_common[c]
else:
freq = 0
# NOTE(review): `best` restarts at 0 and only a strictly smaller value
# replaces it here - confirm the comparison direction against upstream.
if freq < best:
best = freq
curr = c
# tag_matches is evaluated again for this tie-break condition.
elif freq == best and len(tag_matches(i, text)) == 1:
best = freq
curr = c
# NOTE(review): the nesting level of this return is unknown - TODO confirm.
return curr
# Fall back to the original token when nothing was chosen.
if curr == '':
return w
else:
return curr
示例4: overlap
# 需要导入模块: from nltk.corpus import names [as 别名]
# 或者: from nltk.corpus.names import words [as 别名]
def overlap(i, word, text):
    """Score how well a candidate expansion fits the abbreviation's context.

    The signature of ``word`` (from ``gen_signature``) is compared with the
    context around position ``i`` of ``text`` (from ``gen_context``).  Each
    context token that also appears in the signature adds its
    ``brown_common`` value to the score, or ``log(1161192 / 1)`` when the
    token has no ``brown_common`` entry.

    Returns the accumulated score (0 when nothing is shared).
    """
    signature = gen_signature(word)
    total = 0
    for token in gen_context(i, text):
        if token not in signature:
            continue
        if token in brown_common:
            weight = brown_common[token]
        else:
            # Token absent from brown_common: use the fixed fallback weight.
            weight = log(1161192 / 1)
        total += weight
    return total
示例5: gen_signature
# 需要导入模块: from nltk.corpus import names [as 别名]
# 或者: from nltk.corpus.names import words [as 别名]
def gen_signature(word):
    """Generate a signature for a candidate expansion.

    The signature is a set of words drawn from the contexts in which
    ``word`` occurs in the Brown corpus, plus the words of the first WordNet
    synset's definition and examples when one is available.  Results are
    memoised in ``gen_signature.dict`` (which must already exist as an
    attribute of this function), so each word is processed once.

    :param word: candidate expansion to describe.
    :return: set of signature words.
    """
    if word in gen_signature.dict:
        return gen_signature.dict[word]

    inds = find_matches(word)
    if len(inds) > 50:
        # Down-sample to 50 of the occurrences, stepping through the list
        # at a fixed stride.
        f = len(inds) / 50
        inds = [inds[int(i * f)] for i in range(50)]

    # Count how often each context word appears around the occurrences.
    signature = defaultdict(int)
    for i in inds:
        for w in gen_context(i, brown):
            signature[w] += 1

    # Read the stopword list once and use a set for membership tests instead
    # of re-reading it for every element.
    english_stopwords = set(stopwords.words('english'))
    sig = {w for w in signature
           if signature[w] > 1 and w not in english_stopwords and w != ','}

    if word in wn.words():
        synsets = wn.synsets(word)
        # The quote-count guard is kept from the earlier eval-based version
        # so that the same synsets are considered as before.
        if synsets and str(synsets[0]).count("'") == 2:
            # Call the Synset object directly; the previous code rebuilt a
            # "wn.synset('...')" expression from its repr and passed it to
            # eval(), which is unnecessary and unsafe on unexpected names.
            first = synsets[0]
            define = first.definition()
            examples = first.examples()
            if examples:
                for ex in examples:
                    sig.update(w for w in wt(ex)
                               if w not in english_stopwords)
            if define:
                sig.update(w for w in wt(define)
                           if w not in english_stopwords)

    gen_signature.dict[word] = sig
    return sig
示例6: gen_context
# 需要导入模块: from nltk.corpus import names [as 别名]
# 或者: from nltk.corpus.names import words [as 别名]
def gen_context(i, text):
"""Generate context for the abbreviation - 4 words either side unless
sentence is too short.

NOTE(review): this listing lost its indentation when it was extracted; in
particular it is unclear how many of the statements after the
``isinstance`` test belong to that branch.  Restore the nesting from the
upstream source before running the code.

Works on a copy of ``text``, so the caller's sequence is not modified.
Returns a list of at most 9 tokens taken from the sentence containing
position ``i``, where sentences are delimited by '.', '!' and '?'.
"""
ind = i
context = []
# Copy so the deletions/insertions below do not touch the caller's list.
text = text[:]
# A non-int index is truncated to an int position.
if not isinstance(i, int):
ind = int(i)
# The token at `ind` is removed, handed to split() tagged 'SPLT', and the
# first element of each returned part is inserted back at `ind`.  Keys are
# visited in descending order, so the parts end up in ascending key order.
split_token = text[ind]
del text[ind]
parts = split({ind: (split_token, 'SPLT')}, verbose=False)
for it in sorted(parts, reverse=True):
text.insert(ind, parts[it][0])
start = ind
end = ind + 1
# Walk left until the previous token is a sentence terminator or index 0.
sloop = True
while sloop and start > 0:
if text[start - 1] not in ['.', '!', '?']:
start -= 1
else:
sloop = False
# Walk right until a sentence terminator or the end of the text.
eloop = True
while eloop and end <= len(text) - 1:
if text[end] in ['.', '!', '?']:
eloop = False
else:
end += 1
# Fewer than 4 tokens before `ind`: take up to 9 from the sentence start.
if ind - start < 4:
if end - start >= 9:
context += text[start: start + 9]
else:
context += text[start: end]
# Fewer than 5 tokens from `ind` to the sentence end: take the last 9.
elif end - ind < 5:
if end - start >= 9:
context += text[end - 9: end]
else:
context += text[start: end]
# Otherwise take 4 tokens either side of `ind` (9 in total).
else:
context += text[ind - 4: ind + 5]
return context
示例7: gen_candidates
# 需要导入模块: from nltk.corpus import names [as 别名]
# 或者: from nltk.corpus.names import words [as 别名]
def gen_candidates(word):
    """Generate lists of candidate expansions for an abbreviation.

    Three patterns are built from the alphabetic characters of ``word`` and
    every entry of ``words`` is tested against them in priority order; an
    entry is filed under the first pattern it matches:

    1. the letters with optional vowels after each one, up to end of string;
    2. the leading letters, then anything, then the ending reported by
       ``find_last_letter``;
    3. the letters as a plain prefix.

    Returns the tuple ``(vowel_cands, start_and_end_cands, start_cands)``.
    """
    letters = [ch for ch in word.lower() if ch.isalpha()]

    # Pattern 1: every letter may be followed by a run of vowels.
    regex_cons = re.compile(''.join(ch + '[aeiou]*' for ch in letters) + '$')
    # Pattern 3: the bare letters (match() anchors at the start).
    regex_start = re.compile(''.join(letters))

    last = find_last_letter(word)
    if last == 's':
        # Extend the ending with whatever find_last_letter reports for the
        # text before that final "s".
        last = find_last_letter(word[:word.rfind(last)]) + last
    head = word[:word.rfind(last)].lower()
    head_letters = ''.join(ch for ch in head if ch.isalpha())
    # Pattern 2: leading letters, any filler, then the ending.
    regex_start_and_end = re.compile(head_letters + '.*{}$'.format(last))

    vowel_cands, start_and_end_cands, start_cands = [], [], []
    for candidate in words:
        if regex_cons.match(candidate):
            bucket = vowel_cands
        elif regex_start_and_end.match(candidate):
            bucket = start_and_end_cands
        elif regex_start.match(candidate):
            bucket = start_cands
        else:
            continue
        bucket.append(candidate)
    return vowel_cands, start_and_end_cands, start_cands
示例8: gender_match
# 需要导入模块: from nltk.corpus import names [as 别名]
# 或者: from nltk.corpus.names import words [as 别名]
def gender_match(tree, pos, pro):
    """Check whether a proposed antecedent and a pronoun agree in gender.

    Only mismatches between singular proper-name (or numeral) antecedents
    and singular pronouns are detected; anything else is treated as a match.

    :param tree: parse tree containing the proposed antecedent.
    :param pos: position of the antecedent node within ``tree``.
    :param pro: the pronoun being resolved, compared as-is against the
        lowercase pronoun lists below.
    :return: ``False`` when a mismatch is found, ``True`` otherwise.
    """
    # Build real sets.  These used to be generator expressions, which an
    # ``in`` test consumes: after the first lookup the remaining names were
    # gone and later lookups silently returned False.
    male_names = {name.lower() for name in names.words('male.txt')}
    female_names = {name.lower() for name in names.words('female.txt')}
    male_pronouns = ["he", "him", "himself"]
    female_pronouns = ["she", "her", "herself"]
    neuter_pronouns = ["it", "itself"]

    for c in tree[pos]:
        if not (isinstance(c, nltk.Tree) and c.label() in nominal_labels):
            continue
        first_leaf = c.leaves()[0]
        first_lower = first_leaf.lower()
        if first_lower in male_names:
            # A recognized male name cannot be the antecedent of a female
            # or neuter pronoun.
            if pro in female_pronouns or pro in neuter_pronouns:
                return False
        elif first_lower in female_names:
            # A recognized female name cannot be the antecedent of a male
            # or neuter pronoun.
            if pro in male_pronouns or pro in neuter_pronouns:
                return False
        elif first_leaf.isdigit():
            # A numeral can only be the antecedent of a neuter pronoun.
            if pro in male_pronouns or pro in female_pronouns:
                return False
    return True
示例9: splitter
# 需要导入模块: from nltk.corpus import names [as 别名]
# 或者: from nltk.corpus.names import words [as 别名]
def splitter(data, num_words):
    """Split ``data`` into space-joined chunks of ``num_words`` words each.

    ``data`` is split on single spaces, so consecutive spaces produce empty
    "words".  The final chunk holds whatever is left over and is always
    appended, which means it is the empty string when the words divide
    evenly into chunks.

    Returns the list of chunk strings.
    """
    chunks = []
    pending = []
    for token in data.split(' '):
        pending.append(token)
        # Flush as soon as the pending chunk reaches the requested size.
        if len(pending) == num_words:
            chunks.append(' '.join(pending))
            pending = []
    # Remainder (possibly empty) always becomes the last chunk.
    chunks.append(' '.join(pending))
    return chunks
示例10: names_demo
# 需要导入模块: from nltk.corpus import names [as 别名]
# 或者: from nltk.corpus.names import words [as 别名]
def names_demo(trainer, features=names_demo_features):
    """Train and evaluate a gender classifier on the NLTK names corpus.

    The male and female name lists are labelled, shuffled with a fixed seed
    and split into 5000 training and 500 test examples.  Accuracy is always
    printed; the average log likelihood and five sample distributions are
    printed as well unless the classifier raises ``NotImplementedError``.

    :param trainer: callable taking a list of ``(featureset, label)`` pairs
        and returning a classifier.
    :param features: callable mapping a name to a featureset.
    :return: the trained classifier.
    """
    import random

    from nltk.corpus import names

    # Label the corpus: all male names first, then all female names.
    namelist = []
    for fileid, label in (('male.txt', 'male'), ('female.txt', 'female')):
        namelist.extend((name, label) for name in names.words(fileid))

    # Deterministic shuffle, then a fixed train/test split.
    random.seed(123456)
    random.shuffle(namelist)
    train, test = namelist[:5000], namelist[5000:5500]

    print('Training classifier...')
    train_featuresets = [(features(name), label) for name, label in train]
    classifier = trainer(train_featuresets)

    print('Testing classifier...')
    gold_featuresets = [(features(name), label) for name, label in test]
    acc = accuracy(classifier, gold_featuresets)
    print('Accuracy: %6.4f' % acc)

    # Probability report, skipped for classifiers without probabilities.
    try:
        pdists = classifier.prob_classify_many(
            [features(name) for name, _ in test])
        ll = []
        for (_, gold), pdist in zip(test, pdists):
            ll.append(pdist.logprob(gold))
        print('Avg. log likelihood: %6.4f' % (sum(ll)/len(test)))
        print()
        print('Unseen Names P(Male) P(Female)\n'+'-'*40)
        for (name, gender), pdist in zip(test[:5], pdists):
            # The star marks the column of the gold label.
            fmt = (' %-15s *%6.4f %6.4f' if gender == 'male'
                   else ' %-15s %6.4f *%6.4f')
            print(fmt % (name, pdist.prob('male'), pdist.prob('female')))
    except NotImplementedError:
        pass

    return classifier
示例11: partial_names_demo
# 需要导入模块: from nltk.corpus import names [as 别名]
# 或者: from nltk.corpus.names import words [as 别名]
def partial_names_demo(trainer, features=names_demo_features):
    """Train and evaluate a positive/unlabeled "is male" name classifier.

    After a seeded shuffle of both name lists, the first 2000 male names are
    the positive examples, 500 further male names plus 500 female names are
    the unlabeled examples, and 250 male (``True``) plus 250 female
    (``False``) names form the test set.  Accuracy is always printed; the
    average log likelihood and five sample distributions are printed as well
    unless the classifier raises ``NotImplementedError``.

    :param trainer: callable taking ``(positive, unlabeled)`` featureset
        lists and returning a classifier.
    :param features: callable mapping a name to a featureset.
    :return: the trained classifier.
    """
    from nltk.corpus import names
    import random

    male_names = names.words('male.txt')
    female_names = names.words('female.txt')

    random.seed(654321)
    random.shuffle(male_names)
    random.shuffle(female_names)

    # Real lists rather than map(): on Python 3 map() yields a one-shot
    # iterator, which a trainer could not measure or traverse twice.
    # Male names used as positive-labeled examples for training.
    positive = [features(name) for name in male_names[:2000]]
    # Male and female names used as unlabeled examples.
    unlabeled = [features(name)
                 for name in male_names[2000:2500] + female_names[:500]]

    # Test set with correctly-labeled male and female names.
    test = [(name, True) for name in male_names[2500:2750]] \
        + [(name, False) for name in female_names[500:750]]
    random.shuffle(test)

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer(positive, unlabeled)

    # Run the classifier on the test data.
    print('Testing classifier...')
    acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, m) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold)
              for ((name, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll)/len(test)))
        print()
        print('Unseen Names P(Male) P(Female)\n'+'-'*40)
        # zip() must be materialised before slicing: on Python 3 it returns
        # an iterator and ``zip(...)[:5]`` raises TypeError.
        for ((name, is_male), pdist) in list(zip(test, pdists))[:5]:
            if is_male:
                fmt = ' %-15s *%6.4f %6.4f'
            else:
                fmt = ' %-15s %6.4f *%6.4f'
            print(fmt % (name, pdist.prob(True), pdist.prob(False)))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier
示例12: names_demo
# 需要导入模块: from nltk.corpus import names [as 别名]
# 或者: from nltk.corpus.names import words [as 别名]
def names_demo(trainer, features=names_demo_features):
    """Train and evaluate a gender classifier on the NLTK names corpus.

    The male and female name lists are labelled, shuffled with a fixed seed
    and split into 5000 training and 500 test examples.  Accuracy is always
    printed; the average log likelihood and five sample distributions are
    printed as well unless the classifier raises ``NotImplementedError``.

    Every ``print`` call takes a single string so the function parses and
    prints the same text on both Python 2 and Python 3.

    :param trainer: callable taking a list of ``(featureset, label)`` pairs
        and returning a classifier.
    :param features: callable mapping a name to a featureset.
    :return: the trained classifier.
    """
    from nltk.corpus import names
    import random

    # Construct a list of classified names, using the names corpus.
    namelist = ([(name, 'male') for name in names.words('male.txt')] +
                [(name, 'female') for name in names.words('female.txt')])

    # Randomly split the names into a test & train set.
    random.seed(123456)
    random.shuffle(namelist)
    train = namelist[:5000]
    test = namelist[5000:5500]

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer([(features(n), g) for (n, g) in train])

    # Run the classifier on the test data.
    print('Testing classifier...')
    acc = accuracy(classifier, [(features(n), g) for (n, g) in test])
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, g) in test]
        # NOTE(review): other snippets in this file call
        # prob_classify_many here - confirm which name the installed
        # NLTK version provides.
        pdists = classifier.batch_prob_classify(test_featuresets)
        ll = [pdist.logprob(gold)
              for ((name, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll)/len(test)))
        # An empty string prints a blank line on both Python 2 and 3.
        print('')
        print('Unseen Names P(Male) P(Female)\n'+'-'*40)
        # Materialise zip() before slicing; on Python 3 it is an iterator.
        for ((name, gender), pdist) in list(zip(test, pdists))[:5]:
            if gender == 'male':
                fmt = ' %-15s *%6.4f %6.4f'
            else:
                fmt = ' %-15s %6.4f *%6.4f'
            print(fmt % (name, pdist.prob('male'), pdist.prob('female')))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier
示例13: partial_names_demo
# 需要导入模块: from nltk.corpus import names [as 别名]
# 或者: from nltk.corpus.names import words [as 别名]
def partial_names_demo(trainer, features=names_demo_features):
    """Train and evaluate a positive/unlabeled "is male" name classifier.

    After a seeded shuffle of both name lists, the first 2000 male names are
    the positive examples, 500 further male names plus 500 female names are
    the unlabeled examples, and 250 male (``True``) plus 250 female
    (``False``) names form the test set.  Accuracy is always printed; the
    average log likelihood and five sample distributions are printed as well
    unless the classifier raises ``NotImplementedError``.

    Every ``print`` call takes a single string so the function parses and
    prints the same text on both Python 2 and Python 3.

    :param trainer: callable taking ``(positive, unlabeled)`` featureset
        lists and returning a classifier.
    :param features: callable mapping a name to a featureset.
    :return: the trained classifier.
    """
    from nltk.corpus import names
    import random

    male_names = names.words('male.txt')
    female_names = names.words('female.txt')

    random.seed(654321)
    random.shuffle(male_names)
    random.shuffle(female_names)

    # List comprehensions give the trainer real lists on Python 3 as well,
    # where map() would return a one-shot iterator.
    # Male names used as positive-labeled examples for training.
    positive = [features(name) for name in male_names[:2000]]
    # Male and female names used as unlabeled examples.
    unlabeled = [features(name)
                 for name in male_names[2000:2500] + female_names[:500]]

    # Test set with correctly-labeled male and female names.
    test = [(name, True) for name in male_names[2500:2750]] \
        + [(name, False) for name in female_names[500:750]]
    random.shuffle(test)

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer(positive, unlabeled)

    # Run the classifier on the test data.
    print('Testing classifier...')
    acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, m) in test]
        # NOTE(review): other snippets in this file call
        # prob_classify_many here - confirm which name the installed
        # NLTK version provides.
        pdists = classifier.batch_prob_classify(test_featuresets)
        ll = [pdist.logprob(gold)
              for ((name, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll)/len(test)))
        # An empty string prints a blank line on both Python 2 and 3.
        print('')
        print('Unseen Names P(Male) P(Female)\n'+'-'*40)
        # Materialise zip() before slicing; on Python 3 it is an iterator.
        for ((name, is_male), pdist) in list(zip(test, pdists))[:5]:
            if is_male:
                fmt = ' %-15s *%6.4f %6.4f'
            else:
                fmt = ' %-15s %6.4f *%6.4f'
            print(fmt % (name, pdist.prob(True), pdist.prob(False)))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier
示例14: names_demo
# 需要导入模块: from nltk.corpus import names [as 别名]
# 或者: from nltk.corpus.names import words [as 别名]
def names_demo(trainer, features=names_demo_features):
    """Run the names-corpus gender classification demo for ``trainer``.

    Labels every name in ``male.txt`` and ``female.txt``, shuffles them with
    seed 123456, trains on the first 5000 and tests on the next 500.  Prints
    the accuracy and, when the classifier does not raise
    ``NotImplementedError``, the average log likelihood plus the
    male/female probabilities of the first five test names.

    :param trainer: callable taking a list of ``(featureset, label)`` pairs
        and returning a classifier.
    :param features: callable mapping a name to a featureset.
    :return: the trained classifier.
    """
    import random

    from nltk.corpus import names

    def featurize(pairs):
        """Convert ``(name, label)`` pairs to ``(featureset, label)`` pairs."""
        return [(features(name), label) for name, label in pairs]

    males = [(name, 'male') for name in names.words('male.txt')]
    females = [(name, 'female') for name in names.words('female.txt')]
    namelist = males + females

    # Fixed seed keeps the train/test split reproducible.
    random.seed(123456)
    random.shuffle(namelist)
    train = namelist[:5000]
    test = namelist[5000:5500]

    print('Training classifier...')
    classifier = trainer(featurize(train))

    print('Testing classifier...')
    acc = accuracy(classifier, featurize(test))
    print('Accuracy: %6.4f' % acc)

    # Probability report, skipped for classifiers without probabilities.
    try:
        test_featuresets = [features(name) for name, _ in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = []
        for (_, gold), pdist in zip(test, pdists):
            ll.append(pdist.logprob(gold))
        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
        print()
        print('Unseen Names P(Male) P(Female)\n' + '-' * 40)
        for (name, gender), pdist in zip(test[:5], pdists):
            # The star marks the column of the gold label.
            if gender == 'male':
                fmt = ' %-15s *%6.4f %6.4f'
            else:
                fmt = ' %-15s %6.4f *%6.4f'
            print(fmt % (name, pdist.prob('male'), pdist.prob('female')))
    except NotImplementedError:
        pass

    return classifier
示例15: partial_names_demo
# 需要导入模块: from nltk.corpus import names [as 别名]
# 或者: from nltk.corpus.names import words [as 别名]
def partial_names_demo(trainer, features=names_demo_features):
    """Train and evaluate a positive/unlabeled "is male" name classifier.

    After a seeded shuffle of both name lists, the first 2000 male names are
    the positive examples, 500 further male names plus 500 female names are
    the unlabeled examples, and 250 male (``True``) plus 250 female
    (``False``) names form the test set.  Accuracy is always printed; the
    average log likelihood and five sample distributions are printed as well
    unless the classifier raises ``NotImplementedError``.

    :param trainer: callable taking ``(positive, unlabeled)`` featureset
        lists and returning a classifier.
    :param features: callable mapping a name to a featureset.
    :return: the trained classifier.
    """
    from nltk.corpus import names
    import random

    male_names = names.words('male.txt')
    female_names = names.words('female.txt')

    random.seed(654321)
    random.shuffle(male_names)
    random.shuffle(female_names)

    # Real lists rather than map(): on Python 3 map() yields a one-shot
    # iterator, which a trainer could not measure or traverse twice.
    # Male names used as positive-labeled examples for training.
    positive = [features(name) for name in male_names[:2000]]
    # Male and female names used as unlabeled examples.
    unlabeled = [
        features(name) for name in male_names[2000:2500] + female_names[:500]
    ]

    # Test set with correctly-labeled male and female names.
    test = [(name, True) for name in male_names[2500:2750]] + [
        (name, False) for name in female_names[500:750]
    ]
    random.shuffle(test)

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer(positive, unlabeled)

    # Run the classifier on the test data.
    print('Testing classifier...')
    acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, m) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
        print()
        print('Unseen Names P(Male) P(Female)\n' + '-' * 40)
        # zip() must be materialised before slicing: on Python 3 it returns
        # an iterator and ``zip(...)[:5]`` raises TypeError.
        for ((name, is_male), pdist) in list(zip(test, pdists))[:5]:
            if is_male:
                fmt = ' %-15s *%6.4f %6.4f'
            else:
                fmt = ' %-15s %6.4f *%6.4f'
            print(fmt % (name, pdist.prob(True), pdist.prob(False)))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier