This article collects typical usage examples of the Python function nltk.tag.pos_tag. If you have been wondering what exactly pos_tag does, how to call it, or what real code that uses it looks like, the selected examples below should help.
The sections below show 15 code examples of the pos_tag function, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
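Before the collected examples, here is a minimal, self-contained sketch of calling pos_tag directly. It assumes the standard NLTK data packages ('punkt' and 'averaged_perceptron_tagger') are available; the printed output is indicative only.

from nltk import download, pos_tag, word_tokenize

# One-time model downloads (safe to re-run; skipped if already present).
download('punkt')
download('averaged_perceptron_tagger')

tokens = word_tokenize("NLTK makes part-of-speech tagging easy")
print(pos_tag(tokens))
# e.g. [('NLTK', 'NNP'), ('makes', 'VBZ'), ('part-of-speech', 'JJ'), ('tagging', 'NN'), ('easy', 'JJ')]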
Example 1: make_pos
def make_pos(target_tag, edit_rev):
    tags, srcs, dsts = edit_rev  # (tags, source tokens, destination tokens)
    # target_tag: the tag that appears in the sentence
    # Before assigning POS tags, remove the positions where delete/add tags occur from the sentence
if target_tag == del_tag:
sentence = dsts
elif target_tag == add_tag:
sentence = srcs
if target_tag in tags:
tag_indexes = [i for i, x in enumerate(tags) if x == target_tag]
trimed = sentence
for tag_index in tag_indexes:
trimed = trimed[:tag_index] + trimed[tag_index+1:]
posed = pos_tag(trimed)
pos = [w[1] for w in posed]
for tag_index in tag_indexes:
pos.insert(tag_index, u'')
#debug
None_indexes = [i for i, x in enumerate(pos) if x == u'']
if tag_indexes != None_indexes:
print >>sys.stderr, tag_indexes
print >>sys.stderr, None_indexes
print >>sys.stderr, tags
print >>sys.stderr, pos
else:
posed = pos_tag(u' '.join(sentence).split())
pos = [w[1] for w in posed]
return pos
Example 2: number_of_exact_word_match
def number_of_exact_word_match(a, b, word_tokenizer, lemmatizer, stop_words):
pos_a = map(get_tagged_words, pos_tag(word_tokenizer.tokenize(a)))
pos_b = map(get_tagged_words, pos_tag(word_tokenizer.tokenize(b)))
lemmae_a = [lemmatizer.lemmatize(token.lower().strip(punctuation), pos) for token, pos in pos_a \
if token.lower().strip(punctuation) not in stop_words]
lemmae_b = [lemmatizer.lemmatize(token.lower().strip(punctuation), pos) for token, pos in pos_b \
if token.lower().strip(punctuation) not in stop_words]
matched_words = set(lemmae_a).intersection(lemmae_b)
return [len(matched_words), matched_words, b]
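The snippet above calls a helper, get_tagged_words, that is not part of the extract. The sketch below is a hypothetical way to fill that gap and invoke the function, assuming the helper maps a (token, Penn Treebank tag) pair to a (token, WordNet POS) pair accepted by WordNetLemmatizer:

from string import punctuation
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer

def get_tagged_words(tagged_token):
    # Hypothetical helper: collapse Penn Treebank tags into the four WordNet POS classes.
    token, penn = tagged_token
    wn_pos = {'J': wordnet.ADJ, 'V': wordnet.VERB, 'R': wordnet.ADV}.get(penn[:1], wordnet.NOUN)
    return token, wn_pos

count, matched, sentence = number_of_exact_word_match(
    "The cats were sleeping on the mat",
    "A cat slept on a mat",
    TreebankWordTokenizer(),
    WordNetLemmatizer(),
    set(stopwords.words('english')))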
Example 3: number_of_noun_match
def number_of_noun_match(a, b, word_tokenizer, lemmatizer, stop_words):
pos_a = map(get_tagged_words, pos_tag(word_tokenizer.tokenize(a)))
pos_b = map(get_tagged_words, pos_tag(word_tokenizer.tokenize(b)))
lemmae_a = [lemmatizer.lemmatize(token.lower().strip(punctuation), pos) for token, pos in pos_a \
if pos == NOUN and token.lower().strip(punctuation) not in stop_words]
lemmae_b = [lemmatizer.lemmatize(token.lower().strip(punctuation), pos) for token, pos in pos_b \
if pos == NOUN and token.lower().strip(punctuation) not in stop_words]
# Calculate Jaccard similarity
#ratio = len(set(lemmae_a).intersection(lemmae_b)) / float(len(set(lemmae_a).union(lemmae_b)))
#return (ratio > 0.66)
matched_words = set(lemmae_a).intersection(lemmae_b)
return [len(matched_words), matched_words, b]
Example 4: keep_nouns
def keep_nouns(tf):
n_tf = {}
for k in tf:
        if pos_tag([k])[0][1].find('N') == 0:  # keep the term only if its POS tag starts with 'N' (a noun)
n_tf[k] = tf[k]
return n_tf
Example 5: title_permutations
def title_permutations(title_expanded):
title_tagged = pos_tag(title_expanded.split())
st = PorterStemmer()
title_pos = [st.stem(word) for word, pos in title_tagged if pos != 'IN']
title_perms = list(map("*".join, permutations(title_pos)))
return title_perms
Example 6: extract
def extract(query):
sentence = query
tagged_sent = pos_tag(sentence.split())
    # Note: the 'NN' tag matches singular common nouns; proper nouns would be tagged 'NNP'.
    propernouns = [word for word, pos in tagged_sent if pos == 'NN']
return propernouns
#extract("I want to buy a car and a dog and plane")
Example 7: test_run
def test_run():
results = {}
nouns = []
product_list = {}
for p in Post.query.all():
tagged_sent = pos_tag(p.story.split())
propernouns = [word for word,pos in tagged_sent if pos == 'NNP']
for n in propernouns:
if n == "I’m" or n == "It’s" or n == "Can’t":
continue
results[n.replace('.', '')] = True
for r in results.keys():
nouns.append(r)
for i in range(10):
noun = random.choice(nouns)
# print('Using "%s"', (noun,))
for k in test_keywords:
try:
products = amazon.search(Keywords=noun, SearchIndex=k)
for product in products:
product_list[product.title] = True
except:
continue
for p in product_list.keys():
print(" Found title: %s" % (p,))
Example 8: analiseSentimento
def analiseSentimento(resposta):
texto = resposta['corpo']
frases = sentencesTokenizer.tokenize(texto)
palavras = []
for frase in frases:
palavras.extend(wordsTokenizer.tokenize(frase))
posTags = pos_tag(palavras)
positivo = 0
negativo = 0
for palavra, tag in posTags:
synsets = None
if tag.startswith('J'):
synsets = sentiwordnet.senti_synsets(palavra, wordnet.ADJ)
elif tag.startswith('V'):
synsets = sentiwordnet.senti_synsets(palavra, wordnet.VERB)
elif tag.startswith('N'):
synsets = sentiwordnet.senti_synsets(palavra, wordnet.NOUN)
elif tag.startswith('R'):
synsets = sentiwordnet.senti_synsets(palavra, wordnet.ADV)
else:
synsets = sentiwordnet.senti_synsets(palavra, '')
if synsets != None:
synsets = list(synsets)
if len(synsets) > 0:
synset = synsets[0]
positivo = positivo + synset.pos_score()
negativo = negativo + synset.neg_score()
if positivo > negativo:
return (resposta, 'positivo')
elif negativo > positivo:
return (resposta, 'negativo')
else:
return (resposta, 'neutro')
Author: vbozelli | Project: Sentiment-Analysis | Lines of code: 33 | Source file: analise_sentimento_sentiwordnet_com_stopwords.py
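The function above relies on module-level objects (sentencesTokenizer, wordsTokenizer) and corpus imports that are not included in the extract. An assumed setup that would make it runnable might look like this:

import nltk
from nltk.corpus import sentiwordnet, wordnet
from nltk.tag import pos_tag

# Assumed definitions for the tokenizers used by analiseSentimento (not part of the original project extract).
sentencesTokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
wordsTokenizer = nltk.tokenize.TreebankWordTokenizer()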
Example 9: _process_simpleHash
def _process_simpleHash(self, simpleHash):
# Extract entities from keys resulting from SimpleExtractor process_*
entityHash = {}
for data in simpleHash:
occs = simpleHash[data]['occurences']
proxLoc = simpleHash[data]['proxLoc']
# Tokenize sentences
for sent in tokenize_sentences(data):
# Tokenize words
tokens = tokenize_words(sent)
# Tag words with Parts of Speech
tagged = pos_tag(tokens)
# Identify named entities
entities = ne_chunk(tagged)
for ent in entities:
if isinstance(ent, NLTKParseTree):
# Is it a wanted type?
if ent.node in self.types:
# Should we keep the PoS tag?
if self.keepPos:
txts = ['/'.join(token) for token in ent.leaves()]
else:
txts = [token[0] for token in ent.leaves()]
txt = ' '.join(txts)
new = {txt: {'text': txt,
'occurences': occs,
'proxLoc': proxLoc[:]}}
entityHash = self._mergeHash(entityHash, new)
return entityHash
Example 10: process_raw_text
def process_raw_text(text):
"""
First some code to standardize the formatting, then basic nlp.
"""
# Remove breaks and tabs
for char in ["\t", "\n"]:
text = text.replace(char, " ")
text = text.replace('."', '".')
text = text.replace(".'", "'.")
# Split special characters from words
for char in ["'", '"', ",", ".", "?", "!", ";", ":"]:
text = text.replace(char, " " + char + " ")
# Magic to remove all multi-spaces
text = ' '.join(text.split())
# get the words, sentences, POS tags, and chunks.
chunks = [ tuple([ c.type for c in t.chunks ]) for t in parsetree(text) ]
sentences = sent_tokenize(text)
sentences = [ word_tokenize(s) for s in sentences ]
sentences_tags = [ tuple([ (w, simplify_tag(t)) for w, t in pos_tag(s) ]) for s in sentences ]
sentences = [ tuple([ w for w, _ in s]) for s in sentences_tags ]
tags = [ tuple([ t for _, t in s]) for s in sentences_tags ]
words = flatten(sentences)
return tuple(words), tuple(sentences), tuple(tags), tuple(chunks)
Example 11: GetContractPage
def GetContractPage(x):
url = 'http://www.defense.gov/contracts/contract.aspx?contractid=%d' % x
html = urllib.urlopen(url).read()
if re.search("The Official Home of the Department of Defense", html):
return
soup = BeautifulSoup(html)
p_tags = soup.findAll("p")
p_tags_text_list = [tag.text for tag in p_tags]
tokenized_list = []
for text in p_tags_text_list:
tokenized_list = tokenize.word_tokenize(text)
tokenized_list.append(nltk_tag.pos_tag(tokenized_list))
tagged_list = tokenized_list[-1]
data = {
"url": url}
for token in tagged_list[1:]:
if token[1]=="NNP":
data['entity'] = token[0]
break
for token in tagged_list[1:]:
if token[1]=="CD":
data['Amount'] = token[0]
break
print data
Example 12: processoFeatures
def processoFeatures(resposta):
frases = tokenizerFrases.tokenize(resposta["corpo"])
palavras = []
palavrasTexto = {}
for frase in frases:
palavrasTemp = tokenizerPalavras.tokenize(frase)
        for palavra in palavrasTemp:
            palavras.append(palavra)  # collect tokens so that pos_tag below receives the full word list
            palavrasTexto[palavra] = True
posTags = pos_tag(palavras)
positivo = 0
negativo = 0
for palavra, tag in posTags:
synsets = None
if tag.startswith("J"):
synsets = sentiwordnet.senti_synsets(palavra, wordnet.ADJ)
elif tag.startswith("V"):
synsets = sentiwordnet.senti_synsets(palavra, wordnet.VERB)
elif tag.startswith("N"):
synsets = sentiwordnet.senti_synsets(palavra, wordnet.NOUN)
elif tag.startswith("R"):
synsets = sentiwordnet.senti_synsets(palavra, wordnet.ADV)
else:
synsets = sentiwordnet.senti_synsets(palavra, "")
if synsets != None:
synsets = list(synsets)
if len(synsets) > 0:
synset = synsets[0]
positivo = positivo + synset.pos_score()
negativo = negativo + synset.neg_score()
if positivo > negativo:
return (palavrasTexto, "positivo")
elif negativo > positivo:
return (palavrasTexto, "negativo")
else:
return (palavrasTexto, "neutro")
Example 13: count_words_unigram_pos
def count_words_unigram_pos(input_filename, output_path=''):
txt = get_file_text(input_filename)
word_regex = '[a-zA-Z]+'
word_frequency = {}
total_words = 0.
matches = re.findall(word_regex, txt, re.M + re.S + re.U)
for m in matches:
word_frequency[m] = word_frequency.get(m, 0.) + 1.
total_words+=1.
sorted_words = sorted(word_frequency.iteritems(), key=operator.itemgetter(1))
word_analysis = []
for word in sorted_words:
pos = pos_tag([word[0]])
word_analysis.append([word[0], word[1], pos[0][1]])
o_file = make_output_file(input_filename, output_path=output_path, prefix='', suffix='-words_unigram_pos')
o_file.write('word\tcount\tpos\n')
for w in word_analysis:
o_file.write('%s\t%d\t%s\n' % (w[0], w[1], w[2]))
o_file.close()
Example 14: extract_pos
def extract_pos(tokens, simple=True):
"""
    The simplified parts of speech are:
VERB - verbs (all tenses and modes)
NOUN - nouns (common and proper)
PRON - pronouns
ADJ - adjectives
ADV - adverbs
ADP - adpositions (prepositions and postpositions)
CONJ - conjunctions
DET - determiners
NUM - cardinal numbers
PRT - particles or other function words
X - other: foreign words, typos, abbreviations
. - punctuation
    :param tokens: list of word tokens to tag
    :return: list of POS tags, one per input token
"""
tokens_pos = pos_tag(tokens)
pos = [p for t, p in tokens_pos]
if simple:
# translate larger set of part of speech tags into small, simpler set
pos_dict = nltk.tagset_mapping('en-ptb', 'universal')
pos = [pos_dict[p] for p in pos]
return pos
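A possible call of extract_pos, assuming the NLTK tokenizer and tagger models are installed (the output shown is indicative):

from nltk import word_tokenize

print(extract_pos(word_tokenize("The quick brown fox jumps over the lazy dog")))
# e.g. ['DET', 'ADJ', 'ADJ', 'NOUN', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN']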
Example 15: lda_train
def lda_train(raw):
stop = set(stopwords.words('english'))
p_stemmer = PorterStemmer()
text_array = []
for i in range(len(raw)):
text = raw[i].lower()
text = text.replace('\r\n', ' ')
text = re.sub("[^a-z0-9]", " ", text)
# Tokenization segments a document into its atomic elements.
words = text.split()
# Stop words
        # Certain English function words, such as "for", "or", and "the", carry little meaning for a topic model.
# These terms are called stop words and need to be removed from our token list.
words = [j for j in words if j not in stop]
tokenized = nltk.word_tokenize(text)
tagged_sent = pos_tag(words)
words = [word for word,pos in tagged_sent if pos == 'NN']
        # Stemming is another common NLP technique that reduces topically similar words to a shared root.
        # This matters for topic modeling, which would otherwise treat those variants as separate terms and dilute their importance in the model.
#words = [p_stemmer.stem(s) for s in words]
text_array.append(words)
dictionary = corpora.Dictionary(text_array)
dictionary.save('dictionary.dic')
corpus = [dictionary.doc2bow(text) for text in text_array]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
ldamodel = models.ldamodel.LdaModel(corpus, num_topics=15, id2word=dictionary, passes=20)
filename = 'finalized_model_15.sav'
joblib.dump(ldamodel, filename)
print(ldamodel.print_topics(num_topics=15, num_words=6))
return ldamodel,dictionary