This article collects typical usage examples of the nltk.pos_tag function in Python. If you are wondering what exactly pos_tag does, how to call it, or what real-world uses of it look like, the hand-picked code examples below should help.
The following sections present 15 code examples of the pos_tag function, listed roughly by popularity.
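Before the examples, here is a minimal sketch of the basic call pattern (this snippet is not taken from the examples below, and the resource names passed to nltk.download assume a standard NLTK installation): pos_tag expects a list of tokens, typically produced by word_tokenize, and returns a list of (token, Penn Treebank tag) pairs.

import nltk

# One-time resource downloads (names assume a standard NLTK installation).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

tokens = nltk.word_tokenize("NLTK makes part-of-speech tagging straightforward.")
print(nltk.pos_tag(tokens))
# Prints a list of (token, Penn Treebank tag) pairs, e.g. ('NLTK', 'NNP')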
Example 1: create_synonyms

def create_synonyms(orig_word):
    '''
    Function for creating synonyms for a given word.
    '''
    # Assumes `requests` and `nltk` are imported at module level, and that
    # `Word` and `Synonym` are Django models defined elsewhere in the project.
    try:
        headers = {
            "X-Mashape-Key": "aIder4iWr4msh5Scn073WRoddmAEp1qA0I3jsnSR8lfJwtyzpg",
            "Accept": "application/json"}
        response = requests.get("https://wordsapiv1.p.mashape.com/words/{}/synonyms".format(orig_word), headers=headers)
        if response.status_code == 200:
            json = response.json()
            synonyms = json['synonyms']
            # synonyms = nltk.word_tokenize(synonyms)
            synonyms = nltk.pos_tag(synonyms)
            word = nltk.word_tokenize(orig_word)
            word = nltk.pos_tag(word)[0]
            print(synonyms)
            good_syns = []
            for syn in synonyms:
                print(word[1], syn[1])
                if word[1] == syn[1]:
                    print('*')
                    good_syns.append(syn[0])
            # get_or_create returns an (object, created) tuple, so unpack it
            word, _ = Word.objects.get_or_create(word=orig_word)
            for syn in good_syns[:2]:
                try:
                    new_word = Word.objects.create(word=syn.lower(), is_synonym=True)
                except Exception:
                    new_word = Word.objects.get(word=word)
                syn = Synonym.objects.create(word=new_word)
                syn.synonym_to.add(word)
            return good_syns
    except Exception as e:
        print(e)
Example 2: extract_pos_pair

def extract_pos_pair(event_mention_1, event_mention_2):
    # Pull the anchor (trigger) and extent text out of each event-mention XML
    # element, POS-tag the extents, and return the tag of each trigger word.
    # (Assumes a matching token is found; otherwise pos1/pos2 would be unbound.)
    trigger1 = ""
    extent1 = ""
    trigger2 = ""
    extent2 = ""
    for one_anchor in event_mention_1.findall("anchor"):
        trigger1 = one_anchor[0].text
    for one_anchor in event_mention_2.findall("anchor"):
        trigger2 = one_anchor[0].text
    for one_extent in event_mention_1.findall("extent"):
        extent1 = one_extent[0].text
    for one_extent in event_mention_2.findall("extent"):
        extent2 = one_extent[0].text
    text1 = nltk.word_tokenize(extent1)
    dict1 = nltk.pos_tag(text1)
    for one_pair in dict1:
        if one_pair[0] in trigger1 or trigger1 in one_pair[0]:
            pos1 = one_pair[1]
            break
    text2 = nltk.word_tokenize(extent2)
    dict2 = nltk.pos_tag(text2)
    for one_pair in dict2:
        if one_pair[0] in trigger2 or trigger2 in one_pair[0]:
            pos2 = one_pair[1]
            break
    return (pos1, pos2)
Example 3: writeOut

def writeOut(lsummary_out, allwordsphrases=[], outputpath='.', gridset=''):
    # Write data out for the last folder (gridset) encountered - MUST BE A BETTER WAY THAN THIS?
    # uniqueSet, UnicodeWriter and countDuplicatesInList are helpers defined
    # elsewhere in the original project (Python 2 code).
    uWordsPhrases = uniqueSet(allwordsphrases)  # Set of unique words.
    uwords = []
    uphrases = []
    words = []
    phrases = []
    wordtypes = []
    total_wordsphrases = total_uwordsphrases = total_words = total_phrases = 0
    ldata_out = UnicodeWriter(open(outputpath + '/' + gridset + '/language-data.csv', 'wb'), delimiter=',', quotechar='"')
    ldata_out.writerow(["WORD", "NUMBER OF WORDS", "COUNT", "TYPE"])
    # Output metrics to file.
    for item in uWordsPhrases:
        num_words = len(item.split())
        item_count = allwordsphrases.count(item)
        if num_words == 1:  # Single word
            # pos_tag expects a list of tokens, so wrap the single word in a list
            word_type = nltk.pos_tag([item])[-1][-1]
            #word_type_help = nltk.help.upenn_tagset(word_type)
            # MAYBE CONVERT TAGS INTO MORE USEFUL WORDS?!
            ldata_out.writerow([item, str(num_words), str(item_count), word_type])
            uwords.append(item)
            wordtypes.append(word_type)
        elif num_words > 1:  # Phrase
            nltk_words = nltk.word_tokenize(item)
            word_pos = nltk.pos_tag(nltk_words)  ### HOW TO DEAL WITH PHRASES???
            word_types = [x[1] for x in word_pos]
            ldata_out.writerow([item, str(num_words), str(item_count), " ,".join(word_types)])
            # HOW TO OUTPUT EACH POS TO A COLUMN???
            uphrases.append(item)
    for item in allwordsphrases:
        num_words = len(item.split())
        if num_words == 1:
            words.append(item)
        elif num_words > 1:
            phrases.append(item)
    uword_types = countDuplicatesInList(wordtypes)
    total_wordsphrases = len(allwordsphrases)
    total_uwordsphrases = len(uWordsPhrases)
    total_uwords = len(uwords)
    total_uphrases = len(uphrases)
    total_words = len(words)
    total_phrases = len(phrases)
    #["File Name", "Total Words or Phrases", "Total Unique Words or Phrases", "Total Words", "Total Phrases", "Total Unique Words", "Total Unique Phrases", "Types of Word"])
    lsummary_out.writerow([gridset, str(total_wordsphrases), str(total_uwordsphrases), str(total_words), str(total_phrases), str(total_uwords), str(total_uphrases), ', '.join(map(str, uword_types))])
    raw_words_out = open(outputpath + '/' + gridset + '/raw-unique-words.text', 'wb')
    raw_words_out.writelines('\n'.join(uWordsPhrases).encode('utf-8'))
    raw_phrases_out = open(outputpath + '/' + gridset + '/raw-unique-phrases.txt', 'wb')
    raw_phrases_out.writelines('\n'.join(uphrases).encode('utf-8'))
    raw_words_out = open(outputpath + '/' + gridset + '/raw-wordsphrases.text', 'wb')
    raw_words_out.writelines('\n'.join(allwordsphrases).encode('utf-8'))
Example 4: nltk_filter

def nltk_filter(sent):
    # `blockSeparator` (and the commented-out `filterList`/`stemmer`) are
    # module-level names defined elsewhere in the original project.
    b1, b2 = sent.split(blockSeparator)
    b2 = b2.rstrip()

    b1 = b1.lower()
    tokens = word_tokenize(b1)
    pos_tags = pos_tag(tokens)
    filtered_sent = ' '
    for token in tokens:
        filtered_sent += '1' + token + ' '
    # for pos_t in pos_tags:
    #     if pos_t[1] in filterList:
    #         #filtered_sent += stemmer.stem(pos_t[0]) + ' '
    #         filtered_sent += '1' + stemmer.stem(pos_t[0]) + ' '
    # note: 1 concat stemmer(word) == stemmer(1 concat word)

    b2 = b2.lower()
    tokens = word_tokenize(b2)
    pos_tags = pos_tag(tokens)
    # filtered_sent = ' '
    # for pos_t in pos_tags:
    #     if pos_t[1] in filterList:
    #         #filtered_sent += stemmer.stem(pos_t[0]) + ' '
    #         filtered_sent += '2' + stemmer.stem(pos_t[0]) + ' '
    for token in tokens:
        filtered_sent += '2' + token + ' '
    return filtered_sent
Example 5: load_data

def load_data(path):
    sentences_pos = []
    r1 = re.compile(r'\<([^ ]+)\>')
    r2 = re.compile(r'\$US(\d)')
    for l in open(path):
        if not l.strip():
            continue
        l = l.decode('utf-8')
        l = l.replace(u'’', "'")
        l = l.replace(u'``', '"')
        l = l.replace(u"''", '"')
        l = l.replace(u"—", '--')
        l = l.replace(u"–", '--')
        l = l.replace(u"´", "'")
        l = l.replace(u"-", " ")
        l = l.replace(u"/", " ")
        l = r1.sub(r'\1', l)
        l = r2.sub(r'$\1', l)
        s = l.strip().split('\t')
        sa, sb = tuple(nltk.word_tokenize(s)
                       for s in l.strip().split('\t') if s)  # ignore double \t
        sa, sb = ([x.encode('utf-8') for x in sa],
                  [x.encode('utf-8') for x in sb])
        for s in (sa, sb):
            for i in xrange(len(s)):
                if s[i] == "n't":
                    s[i] = "not"
                elif s[i] == "'m":
                    s[i] = "am"
        sa, sb = fix_compounds(sa, sb), fix_compounds(sb, sa)
        sentences_pos.append((nltk.pos_tag(sa), nltk.pos_tag(sb)))
    return sentences_pos
Example 6: replace_proper_nouns

def replace_proper_nouns(self, o_sent, n_sent):
    proper_nouns = []
    p_pnouns = []
    o_tagged = pos_tag(word_tokenize(o_sent))
    n_tagged = pos_tag(word_tokenize(n_sent))
    # print("\nTransforming the output:")
    # print("Input sentence:", o_sent)
    # print("Found sentence:", n_sent)
    # print("Input sentence tagged:", o_tagged)
    # print("Found sentence tagged:", n_tagged)
    for o in o_tagged:
        if o[1] == 'NNP' and o not in proper_nouns:
            proper_nouns.append(o)
    for n in n_tagged:
        if (n[1] == 'PRP' or n[1] == 'PRP$' or n[1] == 'NNP') and n not in p_pnouns:
            p_pnouns.append(n)
    # print("")
    if (len(proper_nouns) == 1) and (len(p_pnouns) > 0):
        n_sent = sub(r"\b%s\b" % p_pnouns[0][0], proper_nouns[0][0], n_sent, 1)
        gender = self.gp.classify(proper_nouns[0][0])
        # print(proper_nouns[0][0], "is classified as", gender)
        for pnoun in p_pnouns:
            n_pnoun = self.change_gender(pnoun[0], gender)
            n_sent = sub(r"\b%s\b" % pnoun[0], n_pnoun, n_sent)
    elif len(proper_nouns) < 1:
        print("No proper nouns to replace")
    else:
        print("Not yet implemented, :P")
    return n_sent
Example 7: normalize_word

def normalize_word(word, lowercase=True, lemmatize=True):
    "Normalize word by stripping plural nouns"
    global NORMWORD_CACHE
    global NORMWORD_POS
    if NORMWORD_WNL is None:
        init_normword_wnl()
    if lowercase:
        word = word.lower()
    if word in NORMWORD_CACHE:
        return NORMWORD_CACHE[word]
    if not lemmatize:
        return word
    treebank_tag = nltk.pos_tag([word])[0][1]
    newword = word
    if (len(newword) > 4) and (treebank_tag == 'NNS'):
        # Only lemmatize plural nouns, leave verbs alone
        wnpos = get_wordnet_pos(treebank_tag)
        if wnpos:
            newword = NORMWORD_WNL.lemmatize(newword, wnpos)
    if newword != word:
        LOGGER.debug('Changing %s to %s' % (word, newword))
        NORMWORD_POS[newword] = nltk.pos_tag([newword])[0][1]
    else:
        NORMWORD_POS[word] = treebank_tag
    NORMWORD_CACHE[word] = newword
    return newword
Example 8: test_nltkNERParsing

def test_nltkNERParsing(self):
    testString = 'Natural Sciences and Engineering Research Council of Canada'
    unigrams = TokenizeOnWhitespacePunctuation(testString, keepCaps=True).getUnigrams()
    posTagged = nltk.pos_tag(unigrams)
    chunked = nltk.ne_chunk(posTagged)
    getGPEs = []
    for treeBranch in chunked:
        if hasattr(treeBranch, 'label') and treeBranch.label() == 'GPE':
            getGPEs.append(str(treeBranch))
    self.assertEqual(1, len(getGPEs))

    testString = 'Milwaukee Foundation'
    unigrams = TokenizeOnWhitespacePunctuation(testString, keepCaps=True).getUnigrams()
    posTagged = nltk.pos_tag(unigrams)
    chunked = nltk.ne_chunk(posTagged)
    # returns (S (PERSON Milwaukee/NNP) (ORGANIZATION Foundation/NNP))

    testString = 'New England Board of Higher Education'
    unigrams = TokenizeOnWhitespacePunctuation(testString, keepCaps=True).getUnigrams()
    posTagged = nltk.pos_tag(unigrams)
    chunked = nltk.ne_chunk(posTagged)
    # returns (S (GPE New/NNP)(ORGANIZATION England/NNP Board/NNP) of/IN (PERSON Higher/NNP Education/NNP))

    testString = 'New England Board of Higher Education'
    unigrams = TokenizeOnWhitespacePunctuation(testString).getUnigrams()
    posTagged = nltk.pos_tag(unigrams)
    chunked = nltk.ne_chunk(posTagged)
Example 9: printer

def printer(sentencescorelist, sentenceList, wordscorelist, wordList):
    outFile = open('./tldr/outFile.txt', 'w')
    for s in range(0, len(sentenceList)):
        if s in sentencescorelist:
            printsentence(sentenceList[s], outFile)
    outFile.write("Topics to research: ")
    topics = []
    numtopics = 3
    poswords = nltk.pos_tag(wordList)
    poskeep = ["NN", "NNS", "NNP", "NNPS"]
    while numtopics > 0:
        temp = max(wordscorelist.iteritems(), key=operator.itemgetter(1))[0]
        templist = [temp]
        templist = nltk.pos_tag(templist)
        if templist[0][1] in poskeep:
            numtopics -= 1
            topics.append(temp)
        del wordscorelist[temp]
    for i in range(0, len(topics)):
        if i != len(topics) - 1:
            outFile.write(topics[i] + ", ")
        else:
            outFile.write(topics[i])
    outFile.close()
Example 10: parse_stock_name

def parse_stock_name(self, stockname):
    p = engine()
    instruction_set = stockname.split(',')
    word_list = instruction_set[0].split(' ')
    index = 1
    categories_ignored = ['RB', 'TO']
    tokens = word_tokenize(instruction_set[0])
    tags = pos_tag(tokens)
    i = 0
    while i < len(tags):
        if tags[i][1] in categories_ignored:
            index += 1
            i += 1
        else:
            break
    quantity = word_list[index-1]
    disallowed = ['g', 'ml', 'x', 'kg', 'cups', 'cup', 'grams', 'can', 'tbsp', 'tsp', 'tbsps', 'tsps',
                  'small', 'bunch', 'piece', 'handful', 'pack', 'chopped', 'large', 'a', 'pinch',
                  'fresh', 'dried', 'heaped', 'thick', 'slices', 'slice', 'of', 'about']
    while index < len(word_list):
        if word_list[index] not in disallowed:
            break
        else:
            index += 1
    sentence = " ".join(word_list[index:])
    tokens = word_tokenize(sentence)
    categories = pos_tag(tokens)
    words = []
    for category in categories:
        if category[1] not in ['NNS', 'VBN', 'VBG']:
            words.append(category[0])
    word = " ".join(words)
    return quantity, word, None
Example 11: test

def test(ws, wf, s, pf, wm, alfa2):
    f1 = open('test_data.data', 'rb')
    f2 = open('test.csv', 'rb')
    val_text = f1.read()
    comt = f2.read().splitlines()
    val_lines = val_text.splitlines()
    acc = 0
    lc = 0
    for line in val_lines:
        token = line.split(' | ')
        token[2] = "<S> " + token[2] + " <E>"
        t_t = token[2].split(' %% ')
        if t_t[0] != "<S> ":
            bff = nltk.pos_tag(t_t[0].split(".")[-1].split(" "))[-1][1]
        else:
            bff = "<S>"
        if t_t[2] != " <E>":
            aff = nltk.pos_tag(t_t[2].split(".")[0].split(" "))[0][1]
        else:
            aff = "<E>"
        val_label = nb(ws, wf, s, token[0], pf, aff, bff, alfa2)
        if val_label == comt[lc].split(",")[1]:
            acc += 1
        lc += 1
    print float(acc)/len(val_lines)
    f1.close()
    f2.close()
Example 12: m_surrounding

def m_surrounding(self):
    D = {}
    sent = self.sentence["form"]
    l = len(sent)
    #print sent
    K = self.index
    '''
    for k in range(l):
        if sent[k] == self.word:
            K = k
            break
    '''
    #print K, l
    tagp = tagn = ""
    if (K+1) < l:
        tagn = nt.word_tokenize(sent[K+1])
        tagn = nt.pos_tag(tagn)
    if (K-1) >= 0:
        tagp = nt.word_tokenize(sent[K-1])
        tagp = nt.pos_tag(tagp)
    if tagp != "":
        D["ptag"] = tagp[0][1]
    else:
        D["ptag"] = ""
    if tagn != "":
        D["ntag"] = tagn[0][1]
    else:
        D["ntag"] = ""
    print D
    return D
Example 13: score_glove_pos

def score_glove_pos(src, dst, numpy_arrays, labels_array, g, normalize=True):
    b1 = []
    b2 = []
    lines = 0
    with open(src) as p:
        for i, line in enumerate(p):
            s = line.split('\t')
            b1.append(s[0])
            b2.append(s[1][:-1])  # remove \n
            lines = i + 1
    b1_pos = [nltk.pos_tag(nltk.word_tokenize(re.sub(r'[^\x00-\x7F]+', ' ', text))) for text in b1]
    b2_pos = [nltk.pos_tag(nltk.word_tokenize(re.sub(r'[^\x00-\x7F]+', ' ', text))) for text in b2]
    res = []
    for i in range(lines):
        tags1 = [tag[0] for tag in b1_pos[i] if tag[1] in NOUN]
        tags2 = [tag[0] for tag in b2_pos[i] if tag[1] in NOUN]
        r = [1 - spatial.distance.cosine(g[tag1], g[tag2]) for tag1 in tags1 for tag2 in tags2 if tag1 in labels_array and tag2 in labels_array]
        if len(r) == 0:
            res.append(0)
        else:
            res.append(round(5*max(r), 2))
    if normalize:
        res = normarlize_score(res)
    with open(dst, 'w') as thefile:
        thefile.write("\n".join(str(i) for i in res))
    print src + ' finished!'
Example 14: test

def test(ws, wf, s, pf):
    f1 = open('validation_data.data', 'rb')
    #f2 = open('test_data.csv', 'w')
    val_text = f1.read()
    val_lines = val_text.splitlines()
    acc = 0
    for line in val_lines:
        token = line.split(' | ')
        t_t = token[2].split(' %% ')
        if t_t[0] != "<S>":
            bff = nltk.pos_tag(t_t[0].split(".")[-1].split(" "))[-1][1]
        else:
            bff = "<S>"
        if t_t[2] != "<\S>":
            aff = nltk.pos_tag(t_t[2].split(".")[0].split(" "))[0][1]
        else:
            aff = "<\S>"
        val_label = nb(ws, wf, s, token[0], pf, aff, bff)
        #f2.write(token[0] + " | " + val_label + " | " + token[2])
        #f1.close()
        #f2.close()
        #print "Done"
        if val_label == token[1]:
            acc += 1
    print float(acc)/len(val_lines)
Example 15: expand_with_wordnet

def expand_with_wordnet(query):
    """
    Expand every contentful word in the query with its WordNet definition.
    The word itself is not removed. Stop words are removed from the word
    definition as well.
    (Contentful means that it is not a stopword or punctuation mark.)
    INPUT:
        query -- user query that is a simple string
    OUTPUT:
        expanded_query -- user query + definitions of contentful words
    """
    stop = stopwords.words("english")
    stop += EXCLUDED
    contentful_tokens = [tok for tok in query.split() if tok not in stop]
    # take the first definition for the current word
    defs = []
    for token in contentful_tokens:
        syn1 = wn.synsets(token, pos=wn.ADJ)[:1]
        syn2 = wn.synsets(token, pos=wn.NOUN)[:1]
        # we take into account only adj defs
        if syn1:
            defs.append(token)
            def_tokenized = word_tokenize(syn1[0].definition())
            [defs.append(t[0]) for t in pos_tag(def_tokenized) if t[1] in ["NN", "JJ"]]
        elif syn2:
            defs.append(token)
            def_tokenized = word_tokenize(syn2[0].definition())
            [defs.append(t[0]) for t in pos_tag(def_tokenized) if t[1] in ["NN", "JJ"]]
    # expansion can add some EXCLUDED words back into the query
    defs = set(defs) - set(EXCLUDED)  # removing again
    expanded = " ".join(defs)
    return expanded