This article collects typical usage examples of the Python function nltk.corpus.brown.tagged_words. If you have been wondering what tagged_words does, how to call it, and what it looks like in real code, the curated examples below should help.
15 code examples of the tagged_words function are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code samples.
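For orientation, brown.tagged_words() returns the corpus as a sequence of (word, tag) pairs, optionally filtered by category. A minimal sketch, assuming the Brown corpus data has been downloaded via nltk.download('brown'):

from nltk.corpus import brown

print(brown.tagged_words()[:3])
# [('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL')]
print(brown.tagged_words(categories='news')[:3])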
Example 1: exercise3

import nltk
from nltk.corpus import brown as bn

def exercise3():
    print
    print "Exercise 3"
    print "Part 1"
    count = 0
    total_brown_tagged_words = bn.tagged_words()
    cfd1 = nltk.ConditionalFreqDist(total_brown_tagged_words)
    set1 = set([a for (a, b) in total_brown_tagged_words])
    for s in set1:
        if len(cfd1[s].keys()) == 5:
            count = count + 1
    print "Number of words which have exactly 5 different tags: %d" % count
    print
    print "Part 2"
    print "Words which have the most distinct tags are: "
    tags = [b for (a, b) in bn.tagged_words()]
    fd = nltk.FreqDist(tags)
    ft = fd.keys()
    cfd2 = nltk.ConditionalFreqDist((tag, word) for (word, tag) in bn.tagged_words())
    # note: despite the message above, this loop lists the tags that occur
    # exactly once in the corpus, together with the words they label
    for a in ft:
        if fd[a] == 1:
            print "For POS: " + a
            print cfd2[a].keys()
    print
    print
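As a usage note on the ConditionalFreqDist idiom in Part 1 above, the same count can be written more directly by mapping each word to its set of tags. A minimal sketch under Python 3 / NLTK 3 (an adaptation, not part of the original example):

from collections import defaultdict
from nltk.corpus import brown

tags_per_word = defaultdict(set)
for word, tag in brown.tagged_words():
    tags_per_word[word].add(tag)
print(sum(1 for tags in tags_per_word.values() if len(tags) == 5))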
Example 2: __init__

import collections
from nltk.model import NgramModel  # available in older NLTK releases; removed in NLTK 3

def __init__(self):
    """Initialize your data structures in the constructor."""
    tag_corpus = []
    # from nltk.corpus import treebank
    # corpus = treebank.tagged_words()
    # for (word, tag) in treebank.tagged_words():
    #     tag_corpus.append(tag)
    from nltk.corpus import brown
    corpus = brown.tagged_words()
    for (word, tag) in brown.tagged_words():
        tag_corpus.append(tag)
    self.wordCounts = collections.defaultdict(int)
    self.tagCounts = collections.defaultdict(int)
    self.wordTagCounts = collections.defaultdict(int)
    self.wordTagList = {}
    self.totalTag = 0
    self.train(corpus)
    # estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    # estimator = lambda fdist, bins: WittenBellProbDist(fdist, 0.2)
    estimator = _estimator  # _estimator is defined elsewhere in the original module
    self.tagLM = NgramModel(2, tag_corpus, estimator)
Example 3: verb_stem

import re
from nltk.corpus import brown

def verb_stem(s):
    """extracts the stem from the 3sg form of a verb, or returns empty string"""
    # goes through rules outlined in handout
    if re.match("has", s):
        toReturn = 'have'
    elif re.match(".*(ays|eys|iys|oys|uys)", s):
        toReturn = s[:-1]
    elif re.match(".*(ies)", s):
        if len(s) == 4:
            toReturn = s[:-1]
        else:
            s1 = s[:-3]
            s2 = s1 + "y"
            toReturn = s2
    elif re.match(".*(oes|xes|ches|shes|sses|zzes)", s):
        toReturn = s[:-2]
    # note: '!' below is a literal character, not regex negation, so the
    # '!...' alternatives can never match an ordinary verb form
    elif re.match(".*(!sses|!zzes|ses|zes)", s):
        toReturn = s[:-1]
    elif re.match(".*(!ies|!oes|!ses|!xes|!ches|!shes|es)", s):
        toReturn = s[:-1]
    elif re.match(".*(!ss|!xs|!ys|!zs|!chs|!shs|s)", s):
        toReturn = s[:-1]
    else:
        toReturn = ''
    # check whether the original 3sg form or the created stem occurs in the Brown corpus
    if (s, 'VBZ') not in brown.tagged_words():
        if (toReturn, 'VB') not in brown.tagged_words():
            return ''
        else:
            return toReturn
    else:
        return toReturn
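A quick usage sketch for the function above. The expected stems follow from the spelling rules, but the final answer also depends on the Brown membership test, and each such test scans the whole tagged corpus, so this is slow (stems shown assume the forms occur in Brown with tags VBZ/VB):

for form in ["plays", "flies", "watches", "runs"]:
    print form, "->", verb_stem(form)
# expected: plays -> play, flies -> fly, watches -> watch, runs -> run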
Example 4: exercise2

import re
import nltk
from nltk.corpus import brown as bn

def exercise2(category):
    print
    print "For Category: " + category
    print "Part 1"
    print "Words with the tag 'JJ':"
    words = bn.tagged_words(categories=category)
    wordlist = bn.words(categories=category)
    words_JJ = set(sorted([(word, tag) for (word, tag) in words if tag == 'JJ']))
    print len(words_JJ)
    print
    print "Part 2"
    print "Words with tags 'VBZ' -> 3rd person singular verbs or ('NNPS' or 'NNS') -> plural nouns:"
    words_VBZ_NNPS_NNS = [(word, tag) for (word, tag) in words if tag == 'VBZ' or tag == 'NNPS' or tag == 'NNS']
    print words_VBZ_NNPS_NNS[:10]
    print
    sent = ""
    print "Part 3"
    print "The 3 most frequent 3-word prepositional phrases are:"
    words = bn.tagged_words(categories=category)
    for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(words):
        if t1.startswith('IN') and t2.startswith('AT') and t3.startswith('NN'):
            sent = sent + w1.lower() + " " + w2.lower() + " " + w3.lower() + "."
    sent_part = sent.split(".")
    fd = nltk.FreqDist(sent_part)
    v = fd.most_common(3)
    print v
    print
    print "Part 4"
    print "Ratio of masculine to feminine pronouns is:"
    male_pattern = r'\bhe\b|\bhis\b|\bhim\b|\bhimself\b'
    female_pattern = r'\bshe\b|\bher\b|\bhers\b|\bherself\b'
    male_pronouns = len([w for w in wordlist if re.search(male_pattern, w.lower())])
    female_pronouns = len([w for w in wordlist if re.search(female_pattern, w.lower())])
    print "Male : Female is -> %d : %d" % (male_pronouns, female_pronouns)
    print
Example 5: exploreTaggedCorpora

import nltk
from nltk.corpus import brown

def exploreTaggedCorpora():
    brown_learned_text = brown.words(categories="learned")
    # words that follow "often" (nltk.ibigrams is the old NLTK 2 iterator form of nltk.bigrams)
    sorted(set(b for (a, b) in nltk.ibigrams(brown_learned_text) if a == "often"))
    brown_lrnd_tagged = brown.tagged_words(categories="learned", simplify_tags=True)
    tags = [b[1] for (a, b) in nltk.ibigrams(brown_lrnd_tagged) if a[0] == "often"]
    fd = nltk.FreqDist(tags)
    fd.tabulate()

    def process(sentence):
        for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
            if t1.startswith("V") and t2 == "TO" and t3.startswith("V"):
                print w1, w2, w3

    for tagged_sent in brown.tagged_sents():
        process(tagged_sent)
    brown_news_tagged = brown.tagged_words(categories="news", simplify_tags=True)
    data = nltk.ConditionalFreqDist((word.lower(), tag) for (word, tag) in brown_news_tagged)
    for word in data.conditions():
        if len(data[word]) > 3:
            tags = data[word].keys()
            print word, " ".join(tags)
Example 6: tagged_token_representation

import nltk

def tagged_token_representation():
    print nltk.tag.str2tuple("fly/NN")
    from nltk.corpus import brown
    print brown.tagged_words()
    # distribution of tags
    brown_news_tagged = brown.tagged_words(categories="news", simplify_tags=True)
    tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
    print tag_fd
    tag_fd.plot(cumulative=True)
    # distribution of tags that immediately precede a noun (simplified tag "N")
    word_tag_pairs = nltk.bigrams(brown_news_tagged)
    print nltk.FreqDist(a[1] for (a, b) in word_tag_pairs if b[1] == "N")
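The simplify_tags=True keyword used in Examples 5 and 6 exists only in NLTK 2.x; NLTK 3 replaced it with the tagset parameter, and the universal noun tag is "NOUN". A sketch of the equivalent under Python 3 / NLTK 3 (an adaptation, not the original author's code):

import nltk
from nltk.corpus import brown

brown_news_tagged = brown.tagged_words(categories="news", tagset="universal")
word_tag_pairs = nltk.bigrams(brown_news_tagged)
print(nltk.FreqDist(a[1] for (a, b) in word_tag_pairs if b[1] == "NOUN"))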
Example 7: automaticTagging

import nltk

def automaticTagging():
    from nltk.corpus import brown
    print "=============== The Default Tagger ==============="
    brown_tagged_sents = brown.tagged_sents(categories='news')
    print brown_tagged_sents[0:3]
    brown_sents = brown.sents(categories='news')
    print brown_sents[0:3]
    tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
    print nltk.FreqDist(tags).max()
    raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
    tokens = nltk.word_tokenize(raw)
    default_tagger = nltk.DefaultTagger('NN')
    print default_tagger.tag(tokens)
    print default_tagger.evaluate(brown_tagged_sents)
    print "=============== The Regular Expression Tagger ==============="
    patterns = [(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*es$', 'VBZ'),
                (r'.*ould$', 'MD'), (r'.*\'s$', 'NN$'), (r'.*s$', 'NNS'),
                (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')]
    regexp_tagger = nltk.RegexpTagger(patterns)
    print regexp_tagger.tag(brown_sents[3])
    print regexp_tagger.evaluate(brown_tagged_sents)
    print "=============== The Lookup Tagger ==============="
    fd = nltk.FreqDist(brown.words(categories='news'))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    # NLTK 2 returned FreqDist keys sorted by decreasing frequency
    most_freq_words = fd.keys()[:100]
    print most_freq_words
    likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
    baseline_tagger = nltk.UnigramTagger(model=likely_tags)
    print baseline_tagger
    print baseline_tagger.evaluate(brown_tagged_sents)

    def performance(cfd, wordlist):
        lt = dict((word, cfd[word].max()) for word in wordlist)
        baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
        return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))

    def display():
        import pylab
        words_by_freq = list(nltk.FreqDist(brown.words(categories='news')))
        cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
        sizes = 2 ** pylab.arange(15)
        perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
        pylab.plot(sizes, perfs, '-bo')
        pylab.title('Lookup Tagger Performance with Varying Model Size')
        pylab.xlabel('Model Size')
        pylab.ylabel('Performance')
        pylab.show()

    display()
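In NLTK 3, FreqDist.keys() is no longer sorted by frequency, so the lookup-tagger idea above is usually written with most_common instead. A minimal sketch of the same technique under Python 3 (an adaptation, not part of the original example):

import nltk
from nltk.corpus import brown

fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
# take the 100 most frequent words and their most likely tags
most_freq_words = [word for word, _ in fd.most_common(100)]
likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
baseline_tagger = nltk.UnigramTagger(model=likely_tags,
                                     backoff=nltk.DefaultTagger('NN'))
print(baseline_tagger.evaluate(brown.tagged_sents(categories='news')))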
Example 8: ch05_11_train_test_affix_tagger

import nltk

def ch05_11_train_test_affix_tagger():
    from nltk.corpus import brown
    fd = nltk.FreqDist(brown.words(categories="news"))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories="news"))
    most_freq_pos = dict((word, cfd[word].max()) for word in fd.keys())
    affix_tagger = nltk.AffixTagger(model=most_freq_pos)
    print affix_tagger.evaluate(brown.tagged_sents(categories="editorial"))
Example 9: ch05_34_num_words_with_1to10_distinct_tags

import nltk

def ch05_34_num_words_with_1to10_distinct_tags():
    from nltk.corpus import brown
    tagged_words = brown.tagged_words(categories="news")
    # map each word to the set of distinct tags it occurs with, then count
    # how many words have exactly 1..9 distinct tags
    dd = nltk.defaultdict(set)  # NLTK 2 re-exported collections.defaultdict
    for w, t in tagged_words:
        dd[w].add(t)
    for i in range(1, 10):
        print i, len(filter(lambda x: len(dd[x]) == i, dd.keys()))
    # for a word with a large number of tags, print one concordance line per tag
    maxtags = 6
    word = None
    tags = None
    for w in dd.keys():
        if len(dd[w]) >= maxtags:
            word = w
            tags = dd[w]
            break
    poss = []
    pos = 0
    for w, t in tagged_words:
        if w == word and t in tags:
            poss.append((t, pos))
            tags.remove(t)
        pos += 1
    for t, pos in poss:
        print t, " ".join(w for w, t in tagged_words[pos-10:pos+10])
Example 10: ch05_21_qualifiers_before_adore_love_like_prefer

import nltk

def ch05_21_qualifiers_before_adore_love_like_prefer():
    from nltk.corpus import brown
    tagged_words = brown.tagged_words(categories="news")
    tagged_word_bigrams = nltk.bigrams(tagged_words)
    allp = set(["adore", "love", "like", "prefer"])
    # qualifiers (tag QL) that immediately precede one of the target verbs;
    # the original printed an undefined name "w" instead of "w1"
    print set([w1 for (w1, t1), (w2, t2) in tagged_word_bigrams
               if t1 == "QL" and w2.lower() in allp])
Example 11: ch05_20_brown_corpus_words_phrases_by_tag

import nltk

def ch05_20_brown_corpus_words_phrases_by_tag():
    from nltk.corpus import brown
    tagged_words = brown.tagged_words(categories="news")
    # produce alphabetically sorted list of distinct words tagged MD
    print sorted(set([w.lower()
                      for (w, t) in filter(lambda (w, t): t == "MD", tagged_words)]))
    # identify words that can be plural (NRS, NPS*, NNS*) or
    # third person singular verbs (BEDZ*, BEZ*, DOZ*, *BEZ)
    # AND that end with "s"
    print set([w for (w, t) in tagged_words
               if w.lower().endswith("s") and
               (t == "NRS" or t.startswith("NPS")
                or t.startswith("NNS")
                or t.startswith("BEDZ") or t.startswith("BEZ")
                or t.startswith("DOZ") or t.endswith("BEZ"))])
    # identify 3-word prepositional phrases IN+DET+NN
    tagged_word_trigrams = nltk.trigrams(tagged_words)
    print tagged_word_trigrams[:10]
    print set([" ".join([w1, w2, w3])
               for (w1, t1), (w2, t2), (w3, t3) in tagged_word_trigrams
               if t1 == "IN" and t2 == "DET" and t3 == "NN"])
    # ratio of masculine to feminine pronouns (float division avoids
    # Python 2 integer truncation)
    num_masc_pn = len([w for (w, t) in tagged_words if w.lower() == "he"])
    num_fem_pn = len([w for (w, t) in tagged_words if w.lower() == "she"])
    print "masc/fem = ", (float(num_masc_pn) / num_fem_pn)
Example 12: category_by_pos

def category_by_pos():
    from nltk.corpus import brown
    from nltk import FreqDist
    from nltk import DecisionTreeClassifier
    from nltk import NaiveBayesClassifier
    from nltk import classify
    # collect suffix frequencies (FreqDist.inc and frequency-sorted keys()
    # are NLTK 2 APIs)
    suffix_fdist = FreqDist()
    for word in brown.words():
        word = word.lower()
        suffix_fdist.inc(word[-1:])
        suffix_fdist.inc(word[-2:])
        suffix_fdist.inc(word[-3:])
    common_suffixes = suffix_fdist.keys()[:100]
    # print common_suffixes

    def pos_features(word):
        features = {}
        for suffix in common_suffixes:
            features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
        return features

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
    # classifier = DecisionTreeClassifier.train(train_set)
    # print 'Decision Tree %f' % classify.accuracy(classifier, test_set)
    classifier = NaiveBayesClassifier.train(train_set)
    print 'NaiveBayes %f' % classify.accuracy(classifier, test_set)
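Under NLTK 3, FreqDist is a collections.Counter subclass, so the suffix counting above would be written roughly like this (a sketch of the same feature-extraction idea, not the original code):

import nltk
from nltk.corpus import brown

suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1
common_suffixes = [s for s, _ in suffix_fdist.most_common(100)]
print(common_suffixes[:10])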
Example 13: lookupTagger

import nltk
from nltk.corpus import brown

def lookupTagger():
    fd = nltk.FreqDist(brown.words(categories='news'))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    most_freq_words = fd.keys()[:100]
    likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
    baseline_tagger = nltk.UnigramTagger(model=likely_tags)
    brown_tagged_sents = brown.tagged_sents(categories='news')  # was undefined in the original
    baseline_tagger.evaluate(brown_tagged_sents)
    sent = brown.sents(categories='news')[3]
    baseline_tagger.tag(sent)
    baseline_tagger = nltk.UnigramTagger(model=likely_tags,
                                         backoff=nltk.DefaultTagger('NN'))

    def performance(cfd, wordlist):
        lt = dict((word, cfd[word].max()) for word in wordlist)
        baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
        return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))

    def display():
        import pylab
        words_by_freq = list(nltk.FreqDist(brown.words(categories='news')))
        cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
        sizes = 2 ** pylab.arange(15)
        perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
        pylab.plot(sizes, perfs, '-bo')
        pylab.title('Lookup Tagger Performance with Varying Model Size')
        pylab.xlabel('Model Size')
        pylab.ylabel('Performance')
        pylab.show()
Example 14: demo

from Tkinter import Tk  # Python 2; the module is "tkinter" in Python 3
from nltk.draw.table import Table

def demo():
    root = Tk()
    root.bind('<Control-q>', lambda e: root.destroy())
    table = Table(root, 'Word Synset Hypernym Hyponym'.split(),
                  column_weights=[0, 1, 1, 1],
                  reprfunc=(lambda i, j, s: ' %s' % s))
    table.pack(expand=True, fill='both')
    from nltk.corpus import wordnet
    from nltk.corpus import brown
    for word, pos in sorted(set(brown.tagged_words()[:500])):
        if pos[0] != 'N': continue
        word = word.lower()
        for synset in wordnet.synsets(word):
            hyper = (synset.hypernyms() + [''])[0]
            hypo = (synset.hyponyms() + [''])[0]
            table.append([word,
                          getattr(synset, 'definition', '*none*'),
                          getattr(hyper, 'definition', '*none*'),
                          getattr(hypo, 'definition', '*none*')])
    table.columnconfig('Word', background='#afa')
    table.columnconfig('Synset', background='#efe')
    table.columnconfig('Hypernym', background='#fee')
    table.columnconfig('Hyponym', background='#ffe')
    for row in range(len(table)):
        for column in ('Hypernym', 'Hyponym'):
            if table[row, column] == '*none*':
                table.itemconfig(row, column, foreground='#666',
                                 selectforeground='#666')
    root.mainloop()
Example 15: question2

import re
from nltk.corpus import brown as bn

def question2(category):
    # print
    # print "For Category: " + category
    # print "Words with the tag 'JJ':"
    # print
    words = bn.tagged_words(categories=category)
    wordlist = bn.words(categories=category)
    words_JJ = set(sorted([(word, tag) for (word, tag) in words if tag == 'JJ']))
    print len(words_JJ)
    print
    print
    print "Words with tags 'VBZ' -> 3rd person singular verbs or ('NNPS' or 'NNS') -> plural nouns:"
    print
    words_VBZ_NNPS_NNS = [(word, tag) for (word, tag) in words if tag == 'VBZ' or tag == 'NNPS' or tag == 'NNS']
    print words_VBZ_NNPS_NNS[:10]
    print
    print
    print "Ratio"
    print
    male_pattern = r'\bhe\b|\bhis\b|\bhim\b|\bhimself\b'
    female_pattern = r'\bshe\b|\bher\b|\bhers\b|\bherself\b'
    male_pronouns = len([w for w in wordlist if re.search(male_pattern, w.lower())])
    female_pronouns = len([w for w in wordlist if re.search(female_pattern, w.lower())])
    print "Male : Female is -> %d : %d" % (male_pronouns, female_pronouns)
    print
    print
    sent = ""
    print "3 word prepositional phrases are:"