本文整理汇总了Python中nltk.tag方法的典型用法代码示例。如果您正苦于以下问题：Python nltk.tag方法的具体用法？Python nltk.tag怎么用？Python nltk.tag使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类nltk的用法示例。
在下文中一共展示了nltk.tag方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: lookup
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def lookup(self, node, depgraph, counter):
    """
    Return the glue formulas for ``node``.

    Candidate semtype names are obtained from ``self.get_semtypes(node)``;
    the first one that is a key of this dictionary selects the entry to use.

    :param node: mapping read here for its ``'word'``, ``'tag'`` and
        ``'rel'`` entries
    :param depgraph: handed unchanged to the helper methods
    :param counter: handed unchanged to
        ``get_glueformulas_from_semtype_entry``
    :return: ``[]`` when no candidate name has an entry (or that entry is
        ``None``); otherwise whatever
        ``get_glueformulas_from_semtype_entry`` returns
    :raise KeyError: when ``_lookup_semtype_option`` yields an empty result
    """
    # First candidate name that is present in this dictionary wins.
    semtype = next(
        (self[name] for name in self.get_semtypes(node) if name in self),
        None,
    )
    if semtype is None:
        return []

    self.add_missing_dependencies(node, depgraph)
    options = self._lookup_semtype_option(semtype, node, depgraph)

    if len(options) == 0:
        details = (node['word'], node['tag'], node['rel'])
        raise KeyError(
            "There is no GlueDict entry for sem type of '%s' "
            "with tag '%s', and rel '%s'" % details
        )

    return self.get_glueformulas_from_semtype_entry(
        options, node['word'], node, depgraph, counter
    )
示例2: get_semtypes
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def get_semtypes(self, node):
    """
    Return the candidate semtype names for ``node``, most plausible first.

    :param node: mapping with ``'rel'``, ``'word'`` and ``'tag'`` entries;
        ``'rel'`` and ``'word'`` are lower-cased before use
    :return: a one-element list taken from ``SPEC_SEMTYPES`` when the
        relation is ``'spec'``; ``[tag, rel]`` for ``'nmod'``/``'vmod'``
        relations; ``[tag]`` otherwise
    """
    relation = node['rel'].lower()
    lowered_word = node['word'].lower()

    if relation == 'spec':
        # Unknown specifier words fall back to the 'default' entry.
        key = lowered_word if lowered_word in SPEC_SEMTYPES else 'default'
        return [SPEC_SEMTYPES[key]]

    candidates = [node['tag']]
    if relation in ('nmod', 'vmod'):
        # Modifier relations offer the relation name as a second choice.
        candidates.append(relation)
    return candidates
示例3: extract_JK
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def extract_JK(pos_seq):
    """The 'JK' method in Handler et al. 2016.

    Returns token positions of valid ngrams: a list of sets, one set of
    positions per matching unigram/bigram/trigram window, ordered by window
    size and then by start position.
    """
    # Accepted coarse-tag patterns (copied from M and S chp 5).
    patterns = {'AN', 'NN', 'AAN', 'ANN', 'NAN', 'NNN', 'NPN'}

    # Pair each position with its coarse tag; unknown tags map to 'O'.
    indexed = list(enumerate(tag2coarse.get(tag, 'O') for tag in pos_seq))

    matches = []
    for size in range(1, 4):
        # Sliding windows of `size` consecutive (position, coarse_tag) pairs.
        for window in zip(*(indexed[start:] for start in range(size))):
            if "".join(coarse for _, coarse in window) in patterns:
                matches.append({position for position, _ in window})
    return matches
########
示例4: __init__
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def __init__(self):
    """
    Load the tagger and sentence detector from files shipped next to the
    already-imported ``scattertext`` package, and set up the word tokenizer.

    Sets ``self.tagger``, ``self.tokenize`` and ``self.sent_detector``.
    """
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer

    # Resolve <scattertext package dir>/data/ from the loaded module.
    # NOTE(review): both data files are pickles by name; presumably they are
    # trusted because they ship with the package -- confirm.
    package_dir = os.path.dirname(sys.modules['scattertext'].__file__)
    data_dir = package_dir + '/data/'
    tagger_fn = data_dir + 'averaged_perceptron_tagger.pickle'
    tokenizer_fn = data_dir + 'punkt.english.pickle'

    # Build the tagger without its default model, then load the local file.
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)

    # nltk.word_tokenize goes through the NLTK downloader; instantiating
    # TreebankWordTokenizer directly avoids that. See
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    treebank = TreebankWordTokenizer()
    self.tokenize = treebank.tokenize

    self.sent_detector = nltk.data.load(tokenizer_fn)
# http://www.nltk.org/book/ch05.html
示例5: nltk_preprocess
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def nltk_preprocess(strings):
    """
    POS-tag the names found in ``strings`` and record the tags per word.

    Every item of ``strings`` is converted with ``str`` and the items are
    joined with newlines; ``FUNCTION_NAMES_REGEXP`` is then applied to the
    joined text. Element ``[0]`` of each match is tagged with
    ``nltk.pos_tag`` and each tag is added to the set stored in
    ``FOUND_TOKENS`` under the lower-cased word.

    Does nothing when ``has_nltk`` is false.

    :param strings: iterable of objects convertible with ``str``
    :return: ``None``; results are accumulated in ``FOUND_TOKENS``
    """
    if not has_nltk:
        return

    text = "\n".join(map(str, strings))
    # NOTE(review): match[0] is the first group if the pattern has several
    # groups, or the first character if matches are plain strings -- confirm
    # against the definition of FUNCTION_NAMES_REGEXP.
    words = [match[0] for match in re.findall(FUNCTION_NAMES_REGEXP, text)]

    for word, tag in nltk.pos_tag(words):
        key = word.lower()
        try:
            FOUND_TOKENS[key].add(tag)
        except KeyError:
            # First time this word is seen: start its tag set. Only the
            # missing-key case is handled so unrelated errors still surface.
            FOUND_TOKENS[key] = {tag}
#-------------------------------------------------------------------------------
示例6: lookup
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def lookup(self, node, depgraph, counter):
    """
    Return the glue formulas for ``node``.

    Candidate semtype names are obtained from ``self.get_semtypes(node)``;
    the first one that is a key of this dictionary selects the entry to use.

    :param node: mapping read here for its ``'word'``, ``'tag'`` and
        ``'rel'`` entries
    :param depgraph: handed unchanged to the helper methods
    :param counter: handed unchanged to
        ``get_glueformulas_from_semtype_entry``
    :return: ``[]`` when no candidate name has an entry (or that entry is
        ``None``); otherwise whatever
        ``get_glueformulas_from_semtype_entry`` returns
    :raise KeyError: when ``_lookup_semtype_option`` yields an empty result
    """
    semtype_names = self.get_semtypes(node)

    semtype = None
    for name in semtype_names:
        if name in self:
            semtype = self[name]
            break
    if semtype is None:
        # No entry for any candidate semtype: nothing to contribute.
        return []

    self.add_missing_dependencies(node, depgraph)

    lookup = self._lookup_semtype_option(semtype, node, depgraph)

    if not len(lookup):
        # Call-style raise: valid on both Python 2 and Python 3, unlike the
        # old ``raise KeyError, msg`` statement form.
        raise KeyError(
            "There is no GlueDict entry for sem type of '%s'"
            " with tag '%s', and rel '%s'"
            % (node['word'], node['tag'], node['rel'])
        )

    return self.get_glueformulas_from_semtype_entry(
        lookup, node['word'], node, depgraph, counter
    )
示例7: get_semtypes
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def get_semtypes(self, node):
    """
    Based on the node, return a list of plausible semtypes in order of
    plausibility.

    :param node: mapping with ``'rel'``, ``'word'`` and ``'tag'`` entries;
        ``'rel'`` and ``'word'`` are lower-cased before use
    :return: a one-element list taken from ``SPEC_SEMTYPES`` when the
        relation is ``'spec'``; ``[tag, rel]`` for ``'nmod'``/``'vmod'``
        relations; ``[tag]`` otherwise
    """
    rel = node['rel'].lower()
    word = node['word'].lower()

    if rel == 'spec':
        # Unknown specifier words fall back to the 'default' entry.
        if word in SPEC_SEMTYPES:
            return [SPEC_SEMTYPES[word]]
        return [SPEC_SEMTYPES['default']]

    if rel in ('nmod', 'vmod'):
        # Modifier relations offer the relation name as a second choice.
        return [node['tag'], rel]

    return [node['tag']]
示例8: bio_tagger
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def bio_tagger(self, ne_tagged):
    """
    Convert ``(token, tag)`` pairs into BIO-prefixed pairs.

    Tokens tagged ``"O"`` are kept as they are. Any other tag gets the
    prefix ``"I-"`` when the previous token carried the same tag and
    ``"B-"`` otherwise (start of an entity, or a different entity directly
    adjacent to the previous one).

    :param ne_tagged: iterable of ``(token, tag)`` pairs
    :return: list of ``(token, bio_tag)`` pairs in the same order
    """
    result = []
    previous = "O"
    for token, tag in ne_tagged:
        if tag == "O":
            result.append((token, tag))
        else:
            # Same tag as the previous token continues the entity;
            # anything else begins a new one.
            prefix = "I-" if previous == tag else "B-"
            result.append((token, prefix + tag))
        previous = tag
    return result
# Create tree
示例9: extract_JK
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def extract_JK(pos_seq):
    """The 'JK' method in Handler et al. 2016.

    Returns token positions of valid ngrams: a list of sets, one set of
    positions per matching window of one to three tokens, ordered by window
    width and then by start position.
    """
    # Accepted coarse-tag patterns (copied from M and S chp 5).
    accepted = frozenset(['AN', 'NN', 'AAN', 'ANN', 'NAN', 'NNN', 'NPN'])

    # Map each fine-grained tag to its coarse class; unknown tags become 'O'.
    coarse = [tag2coarse.get(tag, 'O') for tag in pos_seq]

    spans = []
    for width in (1, 2, 3):
        # Every start index that leaves room for a full window of `width`.
        for start in range(len(coarse) - width + 1):
            stop = start + width
            if "".join(coarse[start:stop]) in accepted:
                spans.append(set(range(start, stop)))
    return spans
########
示例10: __init__
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def __init__(self):
    """
    Load the tagger and sentence detector from resource files of the
    ``phrasemachine.data`` package, and set up the word tokenizer.

    Sets ``self.tagger``, ``self.tokenize`` and ``self.sent_detector``.
    """
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer

    # Absolute paths of the two bundled resource files.
    # NOTE(review): both are pickles by name; presumably trusted because
    # they ship with the package -- confirm.
    punkt_resource = resource_filename('phrasemachine.data', 'punkt.english.pickle')
    tokenizer_fn = os.path.abspath(punkt_resource)
    tagger_resource = resource_filename('phrasemachine.data', 'averaged_perceptron_tagger.pickle')
    tagger_fn = os.path.abspath(tagger_resource)

    # Build the tagger without its default model, then load the local file.
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)

    # nltk.word_tokenize goes through the NLTK downloader; instantiating
    # TreebankWordTokenizer directly avoids that. See
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    treebank = TreebankWordTokenizer()
    self.tokenize = treebank.tokenize

    self.sent_detector = nltk.data.load(tokenizer_fn)
# http://www.nltk.org/book/ch05.html
示例11: postag_multi
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def postag_multi(self, multi_sentence):
    """Tag several sentences in one call.

    Each sentence is passed to ``self.postag`` on its own; no batch tagger
    call is made here.

    Args:
        multi_sentence: [[token1, token2], ..., [...]]
    Returns:
        A list holding the result of ``self.postag`` for each sentence, in
        input order.
    """
    tagged = []
    for sentence in multi_sentence:
        tagged.append(self.postag(sentence))
    return tagged
示例12: lookup
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def lookup(self, node, depgraph, counter):
    """
    Return the glue formulas for ``node``.

    Candidate semtype names are obtained from ``self.get_semtypes(node)``;
    the first one that is a key of this dictionary selects the entry to use.

    :param node: mapping read here for its ``'word'``, ``'tag'`` and
        ``'rel'`` entries
    :param depgraph: handed unchanged to the helper methods
    :param counter: handed unchanged to
        ``get_glueformulas_from_semtype_entry``
    :return: ``[]`` when no candidate name has an entry (or that entry is
        ``None``); otherwise whatever
        ``get_glueformulas_from_semtype_entry`` returns
    :raise KeyError: when ``_lookup_semtype_option`` yields an empty result
    """
    for candidate in self.get_semtypes(node):
        if candidate in self:
            entry = self[candidate]
            break
    else:
        # Ran out of candidates without finding an entry.
        return []
    if entry is None:
        return []

    self.add_missing_dependencies(node, depgraph)
    matched = self._lookup_semtype_option(entry, node, depgraph)

    if len(matched) == 0:
        message = (
            "There is no GlueDict entry for sem type of '%s' "
            "with tag '%s', and rel '%s'"
        )
        raise KeyError(message % (node['word'], node['tag'], node['rel']))

    word = node['word']
    return self.get_glueformulas_from_semtype_entry(matched, word, node, depgraph, counter)
示例13: _join
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def _join(lst, sep=' ', untag=False):
"""
Join a list into a string, turning tags tuples into tag strings or just words.
:param untag: if ``True``, omit the tag from tagged input strings.
:type lst: list
:rtype: str
"""
try:
return sep.join(lst)
except TypeError:
if untag:
return sep.join(tup[0] for tup in lst)
from nltk.tag import tuple2str
return sep.join(tuple2str(tup) for tup in lst)
示例14: coarse_tag_str
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def coarse_tag_str(pos_seq):
    """Convert POS sequence to our coarse system, formatted as a string.

    Each tag is looked up in ``tag2coarse``; tags without a mapping
    become ``'O'``. The coarse tags are concatenated without a separator.
    """
    return ''.join(tag2coarse.get(fine_tag, 'O') for fine_tag in pos_seq)
# POS extraction assuming list of POS tags as input.
# >>> pyre.extract_finditer(["VB","JJ","NN","NN","QQ","QQ",])
# [(1, 4)]
# >>> pyre.extract_ngram_filter(["VB","JJ","NN","NN","QQ","QQ",])
# [(1, 3), (1, 4), (2, 3), (2, 4), (3, 4)]
示例15: tag_text
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def tag_text(self, text):
    """Take input text and return tokens w/ part of speech tags using NLTK.

    The text is split into sentences with ``self.sent_detector``, each
    sentence is tokenized with ``self.tokenize`` and tagged with
    ``self.tagger.tag``.

    :param text: the text to process
    :return: dict with ``'tokens'`` (all tokens of all sentences, in order)
        and ``'pos'`` (the tag of each ``(word, tag)`` pair, in order)
    """
    # TODO: this will fail on some unicode chars. I think assumes ascii
    sents = self.sent_detector.tokenize(text)

    all_tokens = []
    word_pos_pairs = []
    for sent in sents:
        tokens = self.tokenize(sent)
        # extend() grows the lists in place; rebuilding them with ``+`` for
        # every sentence made the loop quadratic in the number of tokens.
        all_tokens.extend(tokens)
        word_pos_pairs.extend(self.tagger.tag(tokens))

    return {'tokens': all_tokens, 'pos': [tag for (w, tag) in word_pos_pairs]}