This article collects typical usage examples of the Python method nltk.pos_tag. If you are wondering what nltk.pos_tag does, how to call it, or what real-world usage looks like, the curated code examples below may help; you can also explore further usage examples from the nltk module that provides this method.
The following presents 15 code examples of nltk.pos_tag, ordered by popularity by default.
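Before the examples, here is a minimal, self-contained sketch of the basic call (the tokenizer and tagger models are assumed to have been fetched with nltk.download; exact tags can vary between model versions):

import nltk

# One-time model downloads (run once; resource names per current NLTK releases)
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

tokens = nltk.word_tokenize("The quick brown fox jumps over the lazy dog")
print(nltk.pos_tag(tokens))
# e.g. [('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ...]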
Example 1: make_data_instance
# Required import: import nltk [as alias]
# or: from nltk import pos_tag [as alias]
def make_data_instance(text, index):
"""
Takes a line of text and creates a CoNLL09Example instance from it.
"""
tokenized = nltk.tokenize.word_tokenize(text.lstrip().rstrip())
pos_tagged = [p[1] for p in nltk.pos_tag(tokenized)]
lemmatized = [lemmatizer.lemmatize(tokenized[i])
if not pos_tagged[i].startswith("V") else lemmatizer.lemmatize(tokenized[i], pos='v')
for i in range(len(tokenized))]
conll_lines = ["{}\t{}\t_\t{}\t_\t{}\t{}\t_\t_\t_\t_\t_\t_\t_\tO\n".format(
i+1, tokenized[i], lemmatized[i], pos_tagged[i], index) for i in range(len(tokenized))]
elements = [CoNLL09Element(conll_line) for conll_line in conll_lines]
sentence = Sentence(syn_type=None, elements=elements)
instance = CoNLL09Example(sentence, elements)
return instance
Example 2: text_to_num
# Required import: import nltk [as alias]
# or: from nltk import pos_tag [as alias]
def text_to_num(text):
tokenized = nltk.word_tokenize(text)
tags = nltk.pos_tag(tokenized)
print(tags)
chunkPattern = r""" Chunk0: {((<NN|CD.?|RB>)<CD.?|VBD.?|VBP.?|VBN.?|NN.?|RB.?|JJ>*)<NN|CD.?>} """
chunkParser = nltk.RegexpParser(chunkPattern)
chunkedData = chunkParser.parse(tags)
print(chunkedData)
for subtree in chunkedData.subtrees(filter=lambda t: t.label() == "Chunk0"):
exp = ""
for l in subtree.leaves():
exp += str(l[0]) + " "
exp = exp[:-1]
print(exp)
try:
text = text.replace(exp, str(t2n.text2num(exp)))
except Exception as e:
print("error text2num ->", e.args)
print("text2num -> ", text)
return text
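For context, the chunking step above can be tried in isolation. The sketch below uses a deliberately simplified grammar, so it only illustrates how nltk.RegexpParser groups tagged tokens and is not the grammar from the example:

import nltk

tags = nltk.pos_tag(nltk.word_tokenize("The trip took twenty five days"))
parser = nltk.RegexpParser(r"""Chunk0: {<CD>+<NN.?>?}""")  # simplified grammar, for illustration only
print(parser.parse(tags))
# e.g. (S The/DT trip/NN took/VBD (Chunk0 twenty/CD five/CD days/NNS))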
Example 3: evaluate_sentiment
# Required import: import nltk [as alias]
# or: from nltk import pos_tag [as alias]
def evaluate_sentiment(text):
pos_score = 0
neg_score = 0
tokened = nltk.word_tokenize(text.decode('utf8', 'ignore').replace('<br />',' '))
pos_pairs = nltk.pos_tag(tokened)
for tuple in pos_pairs:
pos = ''
if tuple[1] == "NN":
pos = 'n/'
if tuple[1] == "JJ":
pos = 'a/'
if tuple[1] == "VB":
pos = 'v/'
if tuple[1] == "RB":
pos = 'r/'
try:
pos_score += sentiwordnet[pos+tuple[0].lower()][0]
neg_score += sentiwordnet[pos+tuple[0].lower()][1]
except KeyError:  # skip word/POS pairs missing from the lexicon
    pass
return pos_score, neg_score
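The sentiwordnet mapping used above is built elsewhere in the source project and is not shown. A rough stand-in based on NLTK's bundled SentiWordNet corpus (an assumption, not the original code; requires nltk.download('sentiwordnet') and nltk.download('wordnet')) could look like this:

from nltk.corpus import sentiwordnet as swn

def first_sense_scores(word, pos):
    """Return (pos_score, neg_score) of the first SentiWordNet sense, or (0, 0) if absent."""
    senses = list(swn.senti_synsets(word, pos))  # pos is one of 'n', 'a', 'v', 'r'
    if not senses:
        return 0.0, 0.0
    return senses[0].pos_score(), senses[0].neg_score()

print(first_sense_scores("good", "a"))  # e.g. (0.75, 0.0) for the first adjective sense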
Example 4: evaluate_sentiment
# Required import: import nltk [as alias]
# or: from nltk import pos_tag [as alias]
def evaluate_sentiment(text):
pos_score = 0
neg_score = 0
tokened = nltk.word_tokenize(text)
pos_pairs = nltk.pos_tag(tokened)
for tuple in pos_pairs:
pos = ''
if tuple[1] == "NN":
pos = 'n/'
if tuple[1] == "JJ":
pos = 'a/'
if tuple[1] == "VB":
pos = 'v/'
if tuple[1] == "RB":
pos = 'r/'
try:
pos_score += sentiwordnet[pos+tuple[0].lower()][0]
neg_score += sentiwordnet[pos+tuple[0].lower()][1]
except KeyError:  # skip word/POS pairs missing from the lexicon
    pass
return pos_score, neg_score
Example 5: _nltk_process_sents
# Required import: import nltk [as alias]
# or: from nltk import pos_tag [as alias]
def _nltk_process_sents(self, sents):
for sentence in sents:
if isinstance(sentence, STRING_TYPES):
if self._tokenizer_lang is None:
raise ValueError(
"No word tokenizer available for this language. "
"Please tokenize before calling the parser."
)
sentence = nltk.word_tokenize(sentence, self._tokenizer_lang)
if IS_PY2:
sentence = [
word.decode('utf-8', 'ignore') if isinstance(word, str) else word
for word in sentence
]
if not self._provides_tags:
sentence = nltk.pos_tag(sentence)
yield [word for word, tag in sentence], sentence
else:
yield sentence, sentence
Example 6: words_by_part_of_speech
# Required import: import nltk [as alias]
# or: from nltk import pos_tag [as alias]
def words_by_part_of_speech(self) -> dict:
"""
Compute the parts of speech for each word in the document.
Uses nltk.pos_tag.
Returns:
dict
"""
words = self.words()
tagged = nltk.pos_tag(words)
categories = {}
for _type in {t[1] for t in tagged}:
categories[_type] = [t[0] for t in tagged if t[1] == _type]
return categories
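The method depends on self.words() from its class; the same grouping can be reproduced standalone, as in this small sketch (not part of the original project):

import nltk
from collections import defaultdict

words = nltk.word_tokenize("Dogs bark loudly and cats sleep")
by_pos = defaultdict(list)
for word, tag in nltk.pos_tag(words):
    by_pos[tag].append(word)
print(dict(by_pos))
# e.g. {'NNS': ['Dogs', 'cats'], 'VBP': ['bark', 'sleep'], 'RB': ['loudly'], 'CC': ['and']}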
Example 7: preprocess
# Required import: import nltk [as alias]
# or: from nltk import pos_tag [as alias]
def preprocess(html):
"""
Returns a preprocessed document consisting of a list of paragraphs, which
is a list of sentences, which is a list of tuples, where each tuple is a
(token, part of speech) pair.
"""
try:
return [
[
nltk.pos_tag(nltk.wordpunct_tokenize(sent))
for sent in nltk.sent_tokenize(paragraph)
]
for paragraph in para_tokenize(html)
]
except Exception as e:
raise NLTKError("could not preprocess text: {}".format(str(e)))
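para_tokenize and NLTKError are defined elsewhere in the source project. A plausible stand-in for para_tokenize, hypothetical and based on BeautifulSoup, might be:

from bs4 import BeautifulSoup

def para_tokenize(html):
    """Yield the text of each <p> element in an HTML document (illustrative stand-in only)."""
    soup = BeautifulSoup(html, "lxml")
    for p in soup.find_all("p"):
        text = p.get_text().strip()
        if text:
            yield text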
Example 8: line_prep
# Required import: import nltk [as alias]
# or: from nltk import pos_tag [as alias]
def line_prep(self, line):
""" Tokenizes and POS-tags a line from the SICK corpus to be compatible with WordNet synset lookup. """
# Split line into sentences + score
s1, s2, sim_score = line.split('\t')
# Tokenize
s1_tokens = word_tokenize(s1)
s2_tokens = word_tokenize(s2)
# Assign part of speech tags
s1_penn_pos = nltk.pos_tag(s1_tokens)
s2_penn_pos = nltk.pos_tag(s2_tokens)
# Convert to WordNet POS tags and store word position in sentence for replacement
# Each tuple contains (word, WordNet_POS_tag, position)
s1_wn_pos = list()
s2_wn_pos = list()
for idx, item in enumerate(s1_penn_pos):
if self.get_wordnet_pos(item[1]) != 'OTHER':
s1_wn_pos.append((item[0], self.get_wordnet_pos(item[1]), idx))  # use idx, not .index(item), so repeated words keep their true positions
for idx, item in enumerate(s2_penn_pos):
if self.get_wordnet_pos(item[1]) != 'OTHER':
s2_wn_pos.append((item[0], self.get_wordnet_pos(item[1]), idx))  # use idx, not .index(item), so repeated words keep their true positions
# Each tuple contains (word, WordNet_POS_tag, position); Source sentence provided for use in disambiguation
return [(s1_wn_pos, s1_tokens), (s2_wn_pos, s2_tokens)], sim_score
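The get_wordnet_pos helper called above is a method of the same class and is not shown; a common Penn-Treebank-to-WordNet mapping along these lines (an assumption about the original) is:

from nltk.corpus import wordnet

def get_wordnet_pos(penn_tag):
    """Map a Penn Treebank tag to a WordNet POS constant, or 'OTHER' if there is no counterpart."""
    if penn_tag.startswith('J'):
        return wordnet.ADJ
    if penn_tag.startswith('V'):
        return wordnet.VERB
    if penn_tag.startswith('N'):
        return wordnet.NOUN
    if penn_tag.startswith('R'):
        return wordnet.ADV
    return 'OTHER'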
Example 9: tokenize
# Required import: import nltk [as alias]
# or: from nltk import pos_tag [as alias]
def tokenize(data, language="english", filterStopWords = False, tagging = False):
result = {}
tags = []
filterChars = [",", ".", "?", ";", ":", "'", "!", "@", "#", "$", "%", "&", "*", "(", ")", "+", "{", "}", "[", "]", "\\", "|"]
sent_token = nltk.tokenize.sent_tokenize(data, language)
word_token = nltk.tokenize.word_tokenize(data, language)
word_token = [w for w in word_token if w not in filterChars]
if filterStopWords is True:
stop_words = set(stopwords.words(language))
word_token = [w for w in word_token if w not in stop_words]
if tagging is True:
tags = nltk.pos_tag(word_token)
result = {"sent_token": sent_token, "word_token": word_token, "pos_tag": tags}
return json.loads(jsonpickle.encode(result, unpicklable=False))
Example 10: get_last_noun_and_article
# Required import: import nltk [as alias]
# or: from nltk import pos_tag [as alias]
def get_last_noun_and_article(sentence):
tokens = nltk.word_tokenize(sentence)
tags = nltk.pos_tag(tokens)
noun = None
for tag in reversed(tags):
if "NN" in tag[1]:
if noun:
noun = (tag[0] + " " + noun).strip()
else:
noun = tag[0]
# If we reach a determiner or possessive pronoun while a noun has already been collected
elif noun:
if "DT" in tag[1] or "PRP$" in tag[1]:
return tag[0] + " " + noun
return noun
return None
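A quick usage check (treat the output as indicative, since the exact tags depend on the tagger model):

print(get_last_noun_and_article("She left the keys on the table"))
# -> 'the table'  (consecutive NN* tokens are joined; any non-determiner before the noun ends the search)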
Example 11: nltk_preprocess
# Required import: import nltk [as alias]
# or: from nltk import pos_tag [as alias]
def nltk_preprocess(strings):
if not has_nltk:
return
strings = "\n".join(map(str, list(strings)))
tokens = re.findall(FUNCTION_NAMES_REGEXP, strings)
l = []
for token in tokens:
l.append(token[0])
word_tags = nltk.pos_tag(l)
for word, tag in word_tags:
try:
FOUND_TOKENS[word.lower()].add(tag)
except KeyError:  # first time this token has been seen
    FOUND_TOKENS[word.lower()] = set([tag])
#-------------------------------------------------------------------------------
Example 12: process_text
# Required import: import nltk [as alias]
# or: from nltk import pos_tag [as alias]
def process_text(text):
soup = BeautifulSoup(text, "lxml")
tags_del = soup.get_text()
no_html = re.sub('<[^>]*>', '', tags_del)
tokenized = casual_tokenizer(no_html)
lower = [item.lower() for item in tokenized]
decontract = [expandContractions(item, c_re=c_re) for item in lower]
tagged = nltk.pos_tag(decontract)
lemma = lemma_wordnet(tagged)
#no_num = [re.sub('[0-9]+', '', each) for each in lemma]
no_punc = [w for w in lemma if w not in punc]
no_stop = [w for w in no_punc if w not in stop_words]
return no_stop
################################################################################################################################################################
#### THE ABOVE Process_Text section Re-used with Permission from:
#### R O B S A L G A D O robert.salgado@gmail.com Thank YOU!
################################################################################
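casual_tokenizer, expandContractions, lemma_wordnet, punc and stop_words are defined elsewhere in the source. A plausible lemma_wordnet built on WordNetLemmatizer (an assumption, not the original code) could be:

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()
_PENN_TO_WN = {'J': wordnet.ADJ, 'V': wordnet.VERB, 'N': wordnet.NOUN, 'R': wordnet.ADV}

def lemma_wordnet(tagged_tokens):
    """Lemmatize (word, Penn_tag) pairs, defaulting to the noun lemma when the tag has no WordNet counterpart."""
    return [lemmatizer.lemmatize(word, _PENN_TO_WN.get(tag[0], wordnet.NOUN))
            for word, tag in tagged_tokens]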
Example 13: extract_nnp_phrases
# Required import: import nltk [as alias]
# or: from nltk import pos_tag [as alias]
def extract_nnp_phrases(text):
"""
NNP extractor convenience method.
:param text:
:return:
"""
phrase_list = []
for sentence in nltk.sent_tokenize(text):
# Get POS
tokens = nltk.word_tokenize(sentence)
pos = nltk.pos_tag(tokens)
# Get POS
phrase = []
for t, p in pos:
if p in ["NNP", "NNPS"] or t in [",", "&"]:
phrase.append(t)
else:
if len(phrase) > 1:
phrase_list.append(clean_nnp_phrase(phrase))
phrase = []
return phrase_list
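clean_nnp_phrase comes from the surrounding project and is not shown here; one plausible stand-in, purely an illustrative guess, joins the tokens and strips stray connectors:

def clean_nnp_phrase(phrase_tokens):
    """Illustrative guess: drop leading/trailing connectors and join the remaining tokens."""
    while phrase_tokens and phrase_tokens[0] in {",", "&"}:
        phrase_tokens = phrase_tokens[1:]
    while phrase_tokens and phrase_tokens[-1] in {",", "&"}:
        phrase_tokens = phrase_tokens[:-1]
    return " ".join(phrase_tokens)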
Example 14: annotate_pos_with_term
# Required import: import nltk [as alias]
# or: from nltk import pos_tag [as alias]
def annotate_pos_with_term(sentence, term):
"""POS-tag single sentence while preserving _TERM_ using the original term"""
try:
pos_term = []
# replace term if necessary
if '_term_' not in sentence.lower():
sentence_term = sentence.lower().replace(term.lower(), '_TERM_')
else:
sentence_term = sentence.lower()
tok = word_tokenize(sentence_term)
tags = pos_tag(tok)
for tag in tags:
if '_TERM_' in tag[0].upper():
pos_term.append('_TERM_')
else:
pos_term.append(tag[1])
return ' '.join(pos_term)
except Exception as e:
log.error('POS annotation error: %s', e)
return None
Example 15: annotate_sentence
# Required import: import nltk [as alias]
# or: from nltk import pos_tag [as alias]
def annotate_sentence(sentence_dict, term):
"""
Annotates a sentence object from a message with Penn Treebank POS tags.
Args:
sentence_dict: dict -- Must contain 's' and 's_clean', which is the
sentence with all occurrences of the search term
replaced with '_TERM_'
Returns:
dict -- updated sentence_dict with 'pos_tags' field.
"""
tags = pos_tag(word_tokenize(sentence_dict['s_clean']))
pos_tags = ['/'.join(b) for b in tags]
sentence_dict['pos_tags'] = " ".join(pos_tags)
sentence_dict['features'] = {}
return sentence_dict
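A small usage sketch, assuming pos_tag and word_tokenize are imported at module level as in Example 14 (the tag assigned to _TERM_ will vary by model):

sentence = {'s': 'The sun rises in the east', 's_clean': 'The _TERM_ rises in the east'}
result = annotate_sentence(sentence, 'sun')
print(result['pos_tags'])
# e.g. "The/DT _TERM_/NN rises/VBZ in/IN the/DT east/NN"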