This article collects typical usage examples of the POSTagger.batch_tag method from the Python module nltk.tag.stanford. If you are wondering what exactly POSTagger.batch_tag does and how to use it, the hand-picked code examples below may help. You can also read further about the class the method belongs to, nltk.tag.stanford.POSTagger.
The following presents 3 code examples of POSTagger.batch_tag, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
Example 1: stanford_batch_tag
# Required import: from nltk.tag.stanford import POSTagger [as alias]
# Or: from nltk.tag.stanford.POSTagger import batch_tag [as alias]
from nltk.tag.stanford import POSTagger

def stanford_batch_tag(sentences):
    '''Use the Stanford tagger to batch-tag a list of tokenized
    sentences.
    '''
    import src.experiment.path as path
    # You need to replace the model path and tagger path of the Stanford
    # tagger with the paths on your machine (two helper functions are used
    # here; you can hard-code the paths if you prefer).
    tagger = POSTagger(path.stanford_tagger_model_path(),
                       path.stanford_tagger_path())
    return tagger.batch_tag(sentences)
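As a quick illustration of the return shape: batch_tag takes a list of pre-tokenized sentences and returns one list of (token, tag) pairs per sentence. A minimal usage sketch, assuming the two path helpers above resolve to a valid Stanford model file and stanford-postagger.jar on your machine:

sentences = [['This', 'is', 'a', 'test', '.'],
             ['Tag', 'these', 'sentences', 'in', 'one', 'batch', '.']]
tagged = stanford_batch_tag(sentences)
# tagged holds one list of (token, tag) pairs per input sentence, e.g.
# [[('This', 'DT'), ('is', 'VBZ'), ...], [('Tag', 'VB'), ...]]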
Example 2: tokenise_tweet
# Required import: from nltk.tag.stanford import POSTagger [as alias]
# Or: from nltk.tag.stanford.POSTagger import batch_tag [as alias]
__author__ = 'Luke'

import random
import cPickle as pickle

from nltk.tag.stanford import POSTagger


def tokenise_tweet():
    pass


objective_tweets = pickle.load(open('../../Data/Training/objective-tweets.obj', 'rb'))
subjective_tweets = pickle.load(open('../../Data/Training/subjective-tweets.obj', 'rb'))

objective_tweets = [(tweet, u'obj') for tweet in objective_tweets]
subjective_tweets = [(tweet, u'sub') for tweet, sent in subjective_tweets]

total_set = objective_tweets + subjective_tweets
random.shuffle(total_set)
# 85%/15% train/test cut-off; the index is computed but not used below.
cut_off = int(0.85 * len(total_set))

tagger = POSTagger('stanford-model.tagger', 'stanford-postagger.jar', encoding='utf8')
tagged_sentences = tagger.batch_tag([sent.split() for sent, label in total_set])
target_values = [label for sent, label in total_set]

to_disk = zip(tagged_sentences, target_values)
pickle.dump(to_disk, open('../../Data/Training/sentiment_detector_training.obj', 'wb'))
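For context, here is a sketch of how the dumped training file might be read back; the filename matches the dump above, everything else is illustrative:

import cPickle as pickle

# Each record pairs a POS-tagged sentence with its u'obj'/u'sub' label.
training_data = pickle.load(open('../../Data/Training/sentiment_detector_training.obj', 'rb'))
tagged_sentence, label = training_data[0]
# tagged_sentence is a list of (token, tag) pairs produced by batch_tag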
Example 3: POSExtractor
# Required import: from nltk.tag.stanford import POSTagger [as alias]
# Or: from nltk.tag.stanford.POSTagger import batch_tag [as alias]
import itertools
import logging

from nltk.tag.stanford import POSTagger
from sklearn.feature_extraction import DictVectorizer

import ma_util  # project-local helper providing walkTree()

logger = logging.getLogger(__name__)


class POSExtractor(object):

    def __init__(self, gold, toClassify,
                 base="/resources/processors/tagger/stanford-postagger-3.0/"):
        self.posTagger = POSTagger(base + "/models/german.tagger",
                                   base + "/stanford-postagger.jar")
        self.posCache = {}
        self.pos_dv = self._trainPOSDictVectorizer(gold, toClassify)

    def _trainPOSDictVectorizer(self, goldTree, to_classify=None):
        sentences = list(goldTree)
        if to_classify:
            sentences.extend(to_classify)
        pos_tagged = self.get_pos_tags_for_sentences(sentences)
        items = []
        assert len(pos_tagged) == len(sentences)
        for sentence, pos in itertools.izip(sentences, pos_tagged):
            # feels silly, but there is the occasional encoding error
            # when using str(sentence)
            self.posCache[sentence.pprint().encode('utf-8')] = pos
            items.extend(self.extract_POS(sentence, pos))
        dv = DictVectorizer(sparse=False)
        dv.fit(items)
        #logger.debug("DictVectorizer vocab: %s", dv.vocabulary_)
        return dv

    def get_pos_tags_for_sentences(self, sentences):
        tokenizedSentences = []
        for parseTree in sentences:
            tokens = parseTree.leaves()
            # Example parse-tree fragment and the tagger output it maps to:
            #   (PROAV Deshalb)
            #   (@S-:-PROAV-..
            #   (@S-:-PROAV-...-$.
            #   (VVFIN 3 1/2)
            #   (NP-SB (NN Sterne) (PP (APPR von) (PPER mir))))
            #   ($. .)))
            #   [('Deshalb', 'PROAV'), ('3', 'CARD'), ('1/2', 'CARD')
            #
            # Encode as utf-8: the POSTagger object hands the tokens over to
            # a separate object, i.e. at some point str() is called on them.
            tokens = map(lambda x: x.encode('utf-8'), tokens)
            # "3 1/2" is separated by a non-breaking space, which prevented
            # correct tokenization in the parse tree; the POS tagger,
            # however, breaks it up correctly, so replace "3 1/2" with "3-1/2".
            tokens = map(lambda x: x.replace('3\xc2\xa01/2', '3-1/2'), tokens)
            tokenizedSentences.append(tokens)
        pos_tagged = self.posTagger.batch_tag(tokenizedSentences)
        assert len(pos_tagged) == len(tokenizedSentences)
        return pos_tagged

    def transform(self, posTag):
        return self.pos_dv.transform(posTag)

    def extract_POS(self, goldSentence, tagged=None):
        if tagged is None:
            tagged = self.posCache[goldSentence.pprint().encode('utf-8')]
        if tagged is None:
            #tagged = self.get_pos_tags_for_sentences([goldSentence])[0]
            raise ValueError("Should have seen sentence in cache: %s" %
                             goldSentence)
        leaves = goldSentence.leaves()
        if not len(leaves) == len(tagged):
            logger.error("leaves do not correspond to tagged!")
            logger.error("leaves: %s, tagged: %s", leaves, tagged)
        # TODO: there's a chance that similar leaves will have their POS tags
        # overridden, but that is good enough for now.
        leafDict = {}
        for (leaf, pos) in itertools.izip(leaves, tagged):
            pos = pos[1]
            leafDict[leaf] = pos
        items = []
        all_pos_tags = set()  # collected for inspection; not used below
        for goldNode in ma_util.walkTree(goldSentence):
            res = {}
            for subTreeLeaf in goldNode.leaves():
                key = leafDict[subTreeLeaf]  # [0]
                if key not in res:
                    res[key] = 0
                res[key] += 1
                all_pos_tags.add(key)
            items.append(res)
        return items
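The dictionaries returned by extract_POS map each POS tag to its count within a subtree, which is exactly the input format sklearn's DictVectorizer expects; transform() then turns one such dictionary into a dense feature row. Also note that in NLTK 3.x the POSTagger class was renamed StanfordPOSTagger and batch_tag was superseded by tag_sents, so on a recent NLTK the calls in these examples would look roughly like this (the paths are placeholders for your local Stanford tagger installation):

from nltk.tag.stanford import StanfordPOSTagger

tagger = StanfordPOSTagger('models/german.tagger', 'stanford-postagger.jar',
                           encoding='utf8')
# tag_sents is the NLTK 3.x replacement for batch_tag
tagged = tagger.tag_sents([['Das', 'ist', 'ein', 'Test', '.']])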