当前位置: 首页>>代码示例>>Python>>正文


Python nltk.tag方法代码示例

本文整理汇总了Python中nltk.tag方法的典型用法代码示例。如果您正苦于以下问题:Python nltk.tag方法的具体用法?Python nltk.tag怎么用?Python nltk.tag使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在nltk的用法示例。


在下文中一共展示了nltk.tag方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: lookup

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def lookup(self, node, depgraph, counter):
        """
        Resolve the glue entry for ``node`` and turn it into glue formulas.

        The candidate names from ``self.get_semtypes(node)`` are tried in
        order; the first one that is a key of ``self`` selects the semtype.

        :param node: mapping that provides at least ``'word'``, ``'tag'``
            and ``'rel'``
        :param depgraph: passed through unchanged to the helper methods
        :param counter: passed through unchanged to
            ``get_glueformulas_from_semtype_entry``
        :return: ``[]`` when no candidate name is known, otherwise whatever
            ``get_glueformulas_from_semtype_entry`` returns
        :raise KeyError: if the semtype is known but yields no entry for
            this node
        """
        candidates = self.get_semtypes(node)

        # First candidate that is present in this dictionary wins.
        semtype = next(
            (self[name] for name in candidates if name in self), None
        )
        if semtype is None:
            return []

        self.add_missing_dependencies(node, depgraph)

        entries = self._lookup_semtype_option(semtype, node, depgraph)
        if len(entries) == 0:
            details = (node['word'], node['tag'], node['rel'])
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s' "
                "with tag '%s', and rel '%s'" % details
            )

        word = node['word']
        return self.get_glueformulas_from_semtype_entry(
            entries, word, node, depgraph, counter
        )
开发者ID:rafasashi,项目名称:razzy-spinner,代码行数:26,代码来源:glue.py

示例2: get_semtypes

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def get_semtypes(self, node):
        """
        List the semtype names to try for ``node``, most plausible first.

        ``node['rel']`` and ``node['word']`` are lower-cased before use.
        A ``'spec'`` relation is resolved through ``SPEC_SEMTYPES`` (falling
        back to its ``'default'`` entry); ``'nmod'``/``'vmod'`` yield the
        node's tag followed by the relation; anything else yields just the
        tag.
        """
        relation = node['rel'].lower()
        lowered_word = node['word'].lower()

        if relation == 'spec':
            key = lowered_word if lowered_word in SPEC_SEMTYPES else 'default'
            return [SPEC_SEMTYPES[key]]

        if relation in ('nmod', 'vmod'):
            return [node['tag'], relation]

        return [node['tag']]
开发者ID:rafasashi,项目名称:razzy-spinner,代码行数:19,代码来源:glue.py

示例3: extract_JK

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def extract_JK(pos_seq):
	"""The 'JK' method in Handler et al. 2016.

	Each tag in ``pos_seq`` is mapped through ``tag2coarse`` (unknown tags
	become 'O'); every window of 1 to 3 consecutive tokens whose coarse tags
	concatenate to one of the accepted patterns is reported.

	Returns a list of sets of token positions, shorter windows first and,
	within one length, in left-to-right order.
	"""
	# Patterns copied from M and S chp 5.
	valid_patterns = {'AN', 'NN', 'AAN', 'ANN', 'NAN', 'NNN', 'NPN'}
	coarse = [tag2coarse.get(tag, 'O') for tag in pos_seq]

	spans = []
	for size in range(1, 4):
		# range() is empty when the sequence is shorter than the window.
		for start in range(len(coarse) - size + 1):
			stop = start + size
			if "".join(coarse[start:stop]) in valid_patterns:
				spans.append(set(range(start, stop)))
	return spans


######## 
开发者ID:JasonKessler,项目名称:scattertext,代码行数:27,代码来源:phrasemachine.py

示例4: __init__

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def __init__(self):
	"""Set up the POS tagger, word tokenizer and sentence detector.

	The tagger and sentence-detector pickles are read from the ``data/``
	directory next to the already-imported ``scattertext`` package.
	"""
	import nltk
	from nltk.tag import PerceptronTagger
	from nltk.tokenize import TreebankWordTokenizer

	data_dir = os.path.dirname(sys.modules['scattertext'].__file__) + '/data/'

	# Build the tagger without its default load, then load the bundled model.
	self.tagger = PerceptronTagger(load=False)
	self.tagger.load(data_dir + 'averaged_perceptron_tagger.pickle')

	# nltk.word_tokenize goes through the NLTK downloader; using
	# TreebankWordTokenizer directly avoids that. It appears to be
	# regex-based PTB tokenization, i.e. it needs no downloads:
	# https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
	self.tokenize = TreebankWordTokenizer().tokenize
	self.sent_detector = nltk.data.load(data_dir + 'punkt.english.pickle')

	# http://www.nltk.org/book/ch05.html 
开发者ID:JasonKessler,项目名称:scattertext,代码行数:25,代码来源:phrasemachine.py

示例5: nltk_preprocess

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def nltk_preprocess(strings):
  """Record the POS tags that NLTK assigns to tokens found in *strings*.

  The items of ``strings`` are converted with ``str`` and joined with
  newlines, then scanned with ``FUNCTION_NAMES_REGEXP``. Element ``[0]`` of
  every match is tagged with ``nltk.pos_tag``, and each tag is added to the
  set stored in ``FOUND_TOKENS`` under the lower-cased word.

  Returns ``None``; does nothing at all when ``has_nltk`` is false.
  """
  if not has_nltk:
    return

  text = "\n".join(map(str, strings))
  candidates = [match[0] for match in re.findall(FUNCTION_NAMES_REGEXP, text)]

  for word, tag in nltk.pos_tag(candidates):
    key = word.lower()
    try:
      FOUND_TOKENS[key].add(tag)
    except KeyError:
      # First occurrence of this word: start its tag set. Only the missing
      # key is handled here so that unrelated errors are no longer hidden.
      FOUND_TOKENS[key] = {tag}

#------------------------------------------------------------------------------- 
开发者ID:joxeankoret,项目名称:idamagicstrings,代码行数:19,代码来源:IDAMagicStrings.py

示例6: lookup

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def lookup(self, node, depgraph, counter):
        """
        Find the glue entry for ``node`` and build its glue formulas.

        The names returned by ``self.get_semtypes(node)`` are tried in order
        and the first one that is a key of ``self`` selects the semtype.

        :param node: mapping that provides at least ``'word'``, ``'tag'``
            and ``'rel'``
        :param depgraph: passed through unchanged to the helper methods
        :param counter: passed through unchanged to
            ``get_glueformulas_from_semtype_entry``
        :return: ``[]`` when none of the candidate names is known, otherwise
            the result of ``get_glueformulas_from_semtype_entry``
        :raise KeyError: if the semtype is known but has no entry matching
            this node
        """
        semtype_names = self.get_semtypes(node)

        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
            # Unknown semtype: report "no formulas" rather than raising.
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            # Call-style raise: the old ``raise KeyError, msg`` form is a
            # syntax error on Python 3.
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s'"
                " with tag '%s', and rel '%s'" %
                (node['word'], node['tag'], node['rel'])
            )

        return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter)
开发者ID:blackye,项目名称:luscan-devel,代码行数:24,代码来源:glue.py

示例7: get_semtypes

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.

        ``node['rel']`` and ``node['word']`` are lower-cased before use.

        :param node: mapping that provides ``'rel'``, ``'word'`` and ``'tag'``
        :return: a one-element list taken from ``SPEC_SEMTYPES`` for a
            ``'spec'`` relation (its ``'default'`` entry when the word is not
            listed); ``[tag, rel]`` for ``'nmod'``/``'vmod'``; ``[tag]``
            otherwise
        """
        rel = node['rel'].lower()
        word = node['word'].lower()

        if rel == 'spec':
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            return [SPEC_SEMTYPES['default']]

        if rel in ['nmod', 'vmod']:
            # Modifiers: try the tag first, then fall back to the relation.
            return [node['tag'], rel]

        return [node['tag']]
开发者ID:blackye,项目名称:luscan-devel,代码行数:21,代码来源:glue.py

示例8: bio_tagger

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def bio_tagger(self, ne_tagged):
        """
        Convert ``(token, tag)`` pairs into BIO-labelled pairs.

        Tokens tagged ``"O"`` are kept unchanged. Any other tag gets an
        ``"I-"`` prefix when it equals the previous token's tag and a ``"B-"``
        prefix otherwise (i.e. after an ``"O"`` token or after a different
        entity tag).

        :param ne_tagged: iterable of ``(token, tag)`` pairs
        :return: list of ``(token, label)`` pairs in input order
        """
        labelled = []
        previous = "O"
        for token, tag in ne_tagged:
            if tag == "O":
                label = tag
            elif previous == tag:
                # Same entity type as the preceding token: continuation.
                label = "I-" + tag
            else:
                # Entity starts after "O" or right after a different entity.
                label = "B-" + tag
            labelled.append((token, label))
            previous = tag
        return labelled

    # Create tree 
开发者ID:singnet,项目名称:nlp-services,代码行数:22,代码来源:entity_recognizer_mod.py

示例9: extract_JK

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def extract_JK(pos_seq):
    """The 'JK' method in Handler et al. 2016.

    Tags are first mapped through ``tag2coarse`` ('O' for unknown tags).
    Windows of one, two and three consecutive tokens are then checked, in
    that order, against the accepted coarse-tag patterns.

    Returns a list with one set of token positions per matching window.
    """
    # Patterns copied from M and S chp 5.
    accepted = frozenset(('AN', 'NN', 'AAN', 'ANN', 'NAN', 'NNN', 'NPN'))
    indexed = list(enumerate(tag2coarse.get(tag, 'O') for tag in pos_seq))

    matches = []
    for width in (1, 2, 3):
        # Zipping the sequence against its own shifted copies yields every
        # window of ``width`` consecutive (position, coarse_tag) pairs.
        shifted = (indexed[offset:] for offset in range(width))
        for window in zip(*shifted):
            label = "".join(coarse for _, coarse in window)
            if label in accepted:
                matches.append({position for position, _ in window})
    return matches

######## 
开发者ID:slanglab,项目名称:phrasemachine,代码行数:23,代码来源:phrasemachine.py

示例10: __init__

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def __init__(self):
        """Set up the POS tagger, word tokenizer and sentence detector.

        Both pickles are located through ``resource_filename`` in the
        ``phrasemachine.data`` package and referenced by absolute path.
        """
        import nltk
        from nltk.tag import PerceptronTagger
        from nltk.tokenize import TreebankWordTokenizer

        def bundled(filename):
            # Absolute path of a data file shipped with phrasemachine.
            return os.path.abspath(resource_filename('phrasemachine.data', filename))

        punkt_path = bundled('punkt.english.pickle')
        model_path = bundled('averaged_perceptron_tagger.pickle')

        # Build the tagger without its default load, then load the bundled model.
        self.tagger = PerceptronTagger(load=False)
        self.tagger.load(model_path)

        # nltk.word_tokenize goes through the NLTK downloader; using
        # TreebankWordTokenizer directly avoids that. It appears to be
        # regex-based PTB tokenization, i.e. it needs no downloads:
        # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
        self.tokenize = TreebankWordTokenizer().tokenize
        self.sent_detector = nltk.data.load(punkt_path)


    # http://www.nltk.org/book/ch05.html 
开发者ID:slanglab,项目名称:phrasemachine,代码行数:21,代码来源:phrasemachine.py

示例11: postag_multi

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def postag_multi(self, multi_sentence):
        """Tag several tokenized sentences in one call.

        The original author recommends this entry point because the Stanford
        POS tagger in NLTK is slow. As written, it simply applies
        ``self.postag`` to each sentence in turn.

        Args:
            multi_sentence: [[token1, token2], ..., [...]]
        Returns:
            A list holding the result of ``self.postag`` for every sentence,
            in input order.
        """
        tagged_sentences = []
        for sentence in multi_sentence:
            tagged_sentences.append(self.postag(sentence))
        return tagged_sentences
开发者ID:microsoft,项目名称:NeuronBlocks,代码行数:18,代码来源:EnglishPOSTagger.py

示例12: lookup

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def lookup(self, node, depgraph, counter):
        """
        Look up ``node`` in this dictionary and return its glue formulas.

        Candidate semtype names come from ``self.get_semtypes(node)``; the
        first one that is a key of ``self`` is used.

        :param node: mapping that provides at least ``'word'``, ``'tag'``
            and ``'rel'``
        :param depgraph: passed through unchanged to the helper methods
        :param counter: passed through unchanged to
            ``get_glueformulas_from_semtype_entry``
        :return: ``[]`` if no candidate name is known, otherwise the result
            of ``get_glueformulas_from_semtype_entry``
        :raise KeyError: if the semtype is known but has no entry matching
            this node
        """
        semtype = None
        for candidate in self.get_semtypes(node):
            if candidate in self:
                semtype = self[candidate]
                break

        # No usable semtype: nothing to contribute for this node.
        if semtype is None:
            return []

        self.add_missing_dependencies(node, depgraph)
        options = self._lookup_semtype_option(semtype, node, depgraph)

        if len(options):
            return self.get_glueformulas_from_semtype_entry(
                options, node['word'], node, depgraph, counter
            )

        message = (
            "There is no GlueDict entry for sem type of '%s' "
            "with tag '%s', and rel '%s'" % (node['word'], node['tag'], node['rel'])
        )
        raise KeyError(message)
开发者ID:V1EngineeringInc,项目名称:V1EngineeringInc-Docs,代码行数:27,代码来源:glue.py

示例13: _join

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tags tuples into tag strings or just words.
    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return sep.join(lst)
    except TypeError:
        if untag:
            return sep.join(tup[0] for tup in lst)
        from nltk.tag import tuple2str
        return sep.join(tuple2str(tup) for tup in lst) 
开发者ID:rafasashi,项目名称:razzy-spinner,代码行数:16,代码来源:relextract.py

示例14: coarse_tag_str

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def coarse_tag_str(pos_seq):
	"""Convert POS sequence to our coarse system, formatted as a string.

	Every tag is looked up in ``tag2coarse``; tags without an entry are
	rendered as 'O'. The coarse tags are concatenated without a separator.
	"""
	return ''.join(tag2coarse.get(pos, 'O') for pos in pos_seq)


# POS extraction assuming list of POS tags as input.
# >>> pyre.extract_finditer(["VB","JJ","NN","NN","QQ","QQ",])
# [(1, 4)]
# >>> pyre.extract_ngram_filter(["VB","JJ","NN","NN","QQ","QQ",])
# [(1, 3), (1, 4), (2, 3), (2, 4), (3, 4)] 
开发者ID:JasonKessler,项目名称:scattertext,代码行数:14,代码来源:phrasemachine.py

示例15: tag_text

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tag [as 别名]
def tag_text(self, text):
	'''Take input text and return tokens w/ part of speech tags using NLTK.

	The text is split into sentences with ``self.sent_detector``, each
	sentence is tokenized with ``self.tokenize`` and tagged with
	``self.tagger``.

	Returns a dict with two parallel lists: ``'tokens'`` (all tokens of all
	sentences, in order) and ``'pos'`` (the tag of each tagged token).
	'''
	# TODO: this will fail on some unicode chars. I think assumes ascii
	sents = self.sent_detector.tokenize(text)

	all_tokens = []
	all_tags = []
	for sent in sents:
		tokens = self.tokenize(sent)
		# extend() grows the lists in place; rebuilding them with ``+`` on
		# every sentence made the loop quadratic in the number of tokens.
		all_tokens.extend(tokens)
		all_tags.extend(tag for (_, tag) in self.tagger.tag(tokens))
	return {'tokens': all_tokens, 'pos': all_tags}
开发者ID:JasonKessler,项目名称:scattertext,代码行数:15,代码来源:phrasemachine.py


注:本文中的nltk.tag方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。