This article collects typical usage examples of the Python method nltk.tag.stanford.POSTagger.tag. If you have been wondering how POSTagger.tag is used in practice, or what it is good for, the hand-picked code examples below should help. You can also explore further usage examples of the class this method belongs to, nltk.tag.stanford.POSTagger.
Below are 15 code examples of POSTagger.tag, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
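Before the examples, here is a minimal sketch of the call pattern they all share. The model and jar paths are placeholders and must point to a local download of the Stanford POS Tagger; adjust them to your own installation.

# Minimal sketch of the POSTagger.tag call pattern (paths are placeholders).
from nltk.tag.stanford import POSTagger

st = POSTagger('models/english-bidirectional-distsim.tagger',  # trained model file
               'stanford-postagger.jar',                       # Stanford tagger jar
               encoding='utf8')
# tag() expects a pre-tokenized sentence (a list of strings) and returns
# a list of (token, POS) tuples:
print(st.tag('What is the airspeed of an unladen swallow ?'.split()))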
Example 1: stanford_corenlp_filter
# Required import: from nltk.tag.stanford import POSTagger [as alias]
# Or: from nltk.tag.stanford.POSTagger import tag [as alias]
def stanford_corenlp_filter(sent):
    from nltk.tag.stanford import POSTagger
    # blockSeparator, filterList, stemmer, word_tokenize and encoding are
    # assumed to be defined at module level by the surrounding project.
    posTagger = POSTagger('/Users/gt/Downloads/'
                          'stanford-postagger-2013-06-20/models/'
                          'wsj-0-18-bidirectional-nodistsim.tagger',
                          '/Users/gt/Downloads/stanford-postagger-2013-06-20'
                          '/stanford-postagger-3.2.0.jar', encoding=encoding)

    b1, b2 = sent.split(blockSeparator)
    b2 = b2.rstrip()

    b1 = b1.lower()
    tokens = word_tokenize(b1)
    pos_tags = posTagger.tag(tokens)
    filtered_sent = ' '
    for pos_t in pos_tags:
        if pos_t[1] in filterList:
            # filtered_sent += stemmer.stem(pos_t[0]) + ' '
            filtered_sent += '1' + stemmer.stem(pos_t[0]) + ' '
            # note: 1 concat stemmer(word) == stemmer(1 concat word)

    b2 = b2.lower()
    tokens = word_tokenize(b2)
    pos_tags = posTagger.tag(tokens)
    # Do not reset filtered_sent here: the original re-assigned it to ' ',
    # which silently discarded the '1'-prefixed features of the first block.
    for pos_t in pos_tags:
        if pos_t[1] in filterList:
            # filtered_sent += stemmer.stem(pos_t[0]) + ' '
            filtered_sent += '2' + stemmer.stem(pos_t[0]) + ' '

    return filtered_sent
Example 2: vectorizer
# Required import: from nltk.tag.stanford import POSTagger [as alias]
# Or: from nltk.tag.stanford.POSTagger import tag [as alias]
def vectorizer(tokens, w2v_db):
    db_path = w2v_db
    # POS tagging (OrderedDict and SQLCon are assumed to be imported by the
    # surrounding module)
    tagger = POSTagger('tagger/english-left3words-distsim.tagger', 'tagger/stanford-postagger.jar')
    tagged_tokens = tagger.tag(tokens)
    unsorted_kw = OrderedDict()
    for (w, t) in tagged_tokens:
        if t in ['NNP', 'NNPS', 'FW']:
            label = 1.5
        elif t in ['NN', 'NNS']:
            label = 1
        else:
            continue
        w = w.lower()
        try:
            unsorted_kw[w] += label
        except KeyError:
            unsorted_kw[w] = label
    # Get the vectors of the words, maintaining their order in the document.
    token_vecs = OrderedDict()
    conn = SQLCon(db_path)
    words = (word.lower() for word in unsorted_kw)
    for word in words:
        try:
            if token_vecs[word]: continue
        except KeyError:
            v = conn.read(word)
            if v is not None:
                token_vecs[word] = list(v)
    print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw), len(token_vecs)))  # debug output: total vs. unique words
    conn.close()
    return unsorted_kw, token_vecs
Example 3: cleanTokens
# Required import: from nltk.tag.stanford import POSTagger [as alias]
# Or: from nltk.tag.stanford.POSTagger import tag [as alias]
def cleanTokens(tokens):
    st = POSTagger('/models/german-fast.tagger')
    tags = st.tag(tokens)

    # keep only proper names and nouns (STTS tags NE/NN) longer than 3 characters
    def cleanTags(x):
        y = x[1]
        return True if re.match("NE|NN", y) and len(x[0]) > 3 else False

    clean_tags = filter(cleanTags, tags)
    #import pdb;pdb.set_trace();

    def buildSentens(arr):
        list = []
        sen = ""
        for i in arr:
            list.append(i[0])
        return list

    #print len(clean_tags)
    #print clean_tags
    clean = buildSentens(clean_tags)
    return clean
Example 4: postext_st
# Required import: from nltk.tag.stanford import POSTagger [as alias]
# Or: from nltk.tag.stanford.POSTagger import tag [as alias]
def postext_st(filename):
    # Opening the file
    path_to_raw = '/home/cyneo/Work/Scans/Text Version/'
    if type(filename) != str:
        raise IOError('Filename must be a string')

    # Preparing to tokenize
    with open(osp.abspath(path_to_raw + filename + '.txt'),
              'r', encoding='utf8') as raw:
        # Initialize the punkt module
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        sents = []
        for line in raw:
            sents.extend(sent_detector.tokenize(line.strip()))

    # Tokenizing
    tokenedsents = []
    from nltk.tokenize.stanford import StanfordTokenizer
    for line in sents:
        tokenedsents.append(StanfordTokenizer().tokenize(line))

    # Parts-of-speech tagging
    posSents = []
    from nltk.tag.stanford import POSTagger
    st = POSTagger('/mnt/sda2/stanford-packages/stanford-postagger-2014-10-26/models/english-bidirectional-distsim.tagger',
                   encoding='utf8')
    for line in tokenedsents:
        # Each call returns a list of (token, tag) tuples
        posSents.append(st.tag(line))
    return posSents
Example 5: nltk_stanfordpos
# Required import: from nltk.tag.stanford import POSTagger [as alias]
# Or: from nltk.tag.stanford.POSTagger import tag [as alias]
def nltk_stanfordpos(inpath, outfolder):
    """POS-tagging French text with the Stanford POS Tagger via NLTK."""
    print("\nLaunched nltk_stanfordpos.")
    import os
    import glob
    from nltk.tag.stanford import POSTagger

    for file in glob.glob(inpath):
        st = POSTagger('/home/christof/Programs/stanfordpos/models/french.tagger', '/home/christof/Programs/stanfordpos/stanford-postagger.jar', encoding="utf8")
        with open(file, "r", encoding="utf-8") as infile:
            untagged = infile.read()
            tagged = st.tag(untagged.split())
            taggedstring = ""
            for item in tagged:
                item = "\t".join(item)
                taggedstring = taggedstring + str(item) + "\n"
            #print(taggedstring)
            basename = os.path.basename(file)
            cleanfilename = basename
            if not os.path.exists(outfolder):
                os.makedirs(outfolder)
            with open(os.path.join(outfolder, cleanfilename), "w") as output:
                output.write(taggedstring)
    print("Done.")
Example 6: main
# Required import: from nltk.tag.stanford import POSTagger [as alias]
# Or: from nltk.tag.stanford.POSTagger import tag [as alias]
def main():
    st = POSTagger(
        "/home/shaun/stanford-postagger-full-2013-11-12/models/german-dewac.tagger",
        "/home/shaun/stanford-postagger-full-2013-11-12/stanford-postagger.jar",
    )
    #st = POSTagger("/home/shaun/stanford-postagger-full-2013-11-12/models/german-fast.tagger", \
    #               "/home/shaun/stanford-postagger-full-2013-11-12/stanford-postagger.jar")
    #print st.tag("Die Kinder in Bayern haben lange Ferien".split())
    #return
    with open(sys.argv[1], "r") as f:
        content = f.read()
    sentences = re.split("\n|\.|\?", content)
    for s in sentences:
        if len(s) == 0:
            continue
        #print s
        pieces = st.tag(s.split())
        strippedPieces = stripPieces(pieces)
        print " ".join(strippedPieces)
Example 7: createModel
# Required import: from nltk.tag.stanford import POSTagger [as alias]
# Or: from nltk.tag.stanford.POSTagger import tag [as alias]
def createModel():
    global classifierit
    global classifierloose
    global classifieryou
    global classifierto
    global classifiertheir
    trainingitSet = []
    traininglooseSet = []
    trainingyouSet = []
    trainingtoSet = []
    trainingtheirSet = []
    st = POSTagger('/home/siddhartha/Downloads/stanford-postagger-full-2014-01-04/models/english-bidirectional-distsim.tagger', '/home/siddhartha/Downloads/stanford-postagger-full-2014-01-04/stanford-postagger.jar')
    for line in brown.sents():
        print line
        tagSent = st.tag(line)
        print tagSent
        arrayOfitFeature = pos_itfeatures(tagSent)
        arrayOfyouFeature = pos_youfeatures(tagSent)
        arrayOftheirFeature = pos_theirfeatures(tagSent)
        arrayOflooseFeature = pos_loosefeatures(tagSent)
        arrayOftoFeature = pos_tofeatures(tagSent)
        if arrayOfitFeature:
            trainingitSet.extend(arrayOfitFeature)
        if arrayOftheirFeature:
            trainingtheirSet.extend(arrayOftheirFeature)
        if arrayOflooseFeature:
            traininglooseSet.extend(arrayOflooseFeature)
        if arrayOftoFeature:
            trainingtoSet.extend(arrayOftoFeature)
        if arrayOfyouFeature:
            trainingyouSet.extend(arrayOfyouFeature)

    algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[1]
    #encodingit = maxent.TypedMaxentFeatureEncoding.train(trainingitSet, count_cutoff=3, alwayson_features=True)
    classifierit = maxent.MaxentClassifier.train(trainingitSet, algorithm)
    f = open('classifierit.pickle', 'wb')
    pickle.dump(classifierit, f)
    f.close()
    #encodingloose = maxent.TypedMaxentFeatureEncoding.train(traininglooseSet, count_cutoff=3, alwayson_features=True)
    classifierloose = maxent.MaxentClassifier.train(traininglooseSet, algorithm)
    f = open('classifierloose.pickle', 'wb')
    pickle.dump(classifierloose, f)
    f.close()
    #encodingyou = maxent.TypedMaxentFeatureEncoding.train(trainingyouSet, count_cutoff=3, alwayson_features=True)
    classifieryou = maxent.MaxentClassifier.train(trainingyouSet, algorithm)
    f = open('classifieryou.pickle', 'wb')
    pickle.dump(classifieryou, f)
    f.close()
    #encodingto = maxent.TypedMaxentFeatureEncoding.train(trainingtoSet, count_cutoff=3, alwayson_features=True)
    classifierto = maxent.MaxentClassifier.train(trainingtoSet, algorithm)
    f = open('classifierto.pickle', 'wb')
    pickle.dump(classifierto, f)
    f.close()
    #encodingtheir = maxent.TypedMaxentFeatureEncoding.train(trainingtheirSet, count_cutoff=3, alwayson_features=True)
    classifiertheir = maxent.MaxentClassifier.train(trainingtheirSet, algorithm)
    f = open('classifiertheir.pickle', 'wb')
    pickle.dump(classifiertheir, f)
    f.close()
Example 8: stanford_tag
# Required import: from nltk.tag.stanford import POSTagger [as alias]
# Or: from nltk.tag.stanford.POSTagger import tag [as alias]
def stanford_tag(sentence):
    '''Use the Stanford tagger to tag a single tokenized sentence.'''
    import src.experiment.path as path
    tagger = POSTagger(path.stanford_tagger_model_path(),
                       path.stanford_tagger_path(),
                       java_options='-Xmx16g -XX:MaxPermSize=256m')
    return tagger.tag(sentence)
Example 9: tag
# Required import: from nltk.tag.stanford import POSTagger [as alias]
# Or: from nltk.tag.stanford.POSTagger import tag [as alias]
def tag(segments):
    #st = POSTagger('/home/dc65/Documents/tools/stanford-postagger-2014-01-04/models/english-left3words-distsim.tagger', '/home/dc65/Documents/tools/stanford-postagger-2014-01-04/stanford-postagger-3.3.1.jar')
    st = POSTagger(os.path.join(stanford_path, 'models/english-left3words-distsim.tagger'),
                   os.path.join(stanford_path, 'stanford-postagger-3.3.1.jar'))
    tagged = []
    for segment in segments:
        x = ' '.join(nltk.tag.tuple2str(w) for w in st.tag(word_tokenize(segment)))
        tagged.append(x.decode('utf-8'))
    return tagged
Example 10: spanish_pos
# Required import: from nltk.tag.stanford import POSTagger [as alias]
# Or: from nltk.tag.stanford.POSTagger import tag [as alias]
def spanish_pos(text):
    """Parts-of-speech tagger for Spanish."""
    text = text.encode('utf8')
    st = POSTagger('/Users/Lena/src/context/stanford-postagger/models/spanish-distsim.tagger',
                   '/Users/Lena/src/context/stanford-postagger/stanford-postagger.jar', 'utf8')
    pos_tagged = st.tag(text.split())
    return pos_tagged
Example 11: german_pos
# Required import: from nltk.tag.stanford import POSTagger [as alias]
# Or: from nltk.tag.stanford.POSTagger import tag [as alias]
def german_pos(text):
    """Parts-of-speech tagger for German."""
    text = text.encode('utf8')
    st = POSTagger('/Users/Lena/src/context/stanford-postagger/models/german-fast.tagger',
                   '/Users/Lena/src/context/stanford-postagger/stanford-postagger.jar', 'utf8')
    pos_tagged = st.tag(text.split())
    return pos_tagged
Example 12: main
# Required import: from nltk.tag.stanford import POSTagger [as alias]
# Or: from nltk.tag.stanford.POSTagger import tag [as alias]
def main():
    print "Inicio..."
    with open("tweets_a_procesar_v2.csv", 'rb') as csvfile:
        lines = csv.reader(csvfile, delimiter=DELIMITER, quotechar="'")
        # all the tweets are collected in this variable
        tweets = []
        for line in lines:
            tweet = Tweet(line)
            #print tweet.spanish_text.split()
            tweets.append(tweet)

    # output file
    output = open("output_tagged_v2.csv", 'wb')
    filewriter = csv.writer(output, delimiter=DELIMITER, quotechar="'")

    # importing Stanford NLP's Spanish tagger
    from nltk.tag.stanford import POSTagger
    st = POSTagger('/Applications/XAMPP/htdocs/Proyectos/Stanford/stanford-postagger-full-2014-08-27/models/spanish-distsim.tagger', '/Applications/XAMPP/htdocs/Proyectos/Stanford/stanford-postagger-full-2014-08-27/stanford-postagger-3.4.1.jar', encoding='utf-8')
    #st = POSTagger('/Applications/XAMPP/htdocs/Proyectos/Stanford/stanford-postagger-full-2014-08-27/models/spanish.tagger','/Applications/XAMPP/htdocs/Proyectos/Stanford/stanford-postagger-full-2014-08-27/stanford-postagger-3.4.1.jar',encoding='utf-8')
    #st = POSTagger('C:\Data\stanford-postagger-full-2014-08-27\models\spanish.tagger', 'C:\Data\stanford-postagger-full-2014-08-27\stanford-postagger-3.4.1.jar', encoding='utf-8')
    n = 0
    for tweet in tweets:
        n += 1
        print tweet.spanish_text
        # Example: st.tag('What is the airspeed of an unladen swallow ?'.split())
        tweet_tagged = st.tag((tweet.spanish_text).split())
        # Example output: [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
        #print tweet_tagged
        important_words = []
        n_adj = 0
        for tag in tweet_tagged:
            # keep adjectives (a), adverbs (r), nouns (n) and verbs (v)
            inicial = tag[1][:1]
            if 'a' in inicial:
                important_words.append(tag[0])
            if 'r' in inicial:
                important_words.append(tag[0])
            if 'n' in inicial:
                important_words.append(tag[0])
            if 'v' in inicial:
                important_words.append(tag[0])
        #tweet.cant_adj = n_adj
        tweet.tweet_tagged = tweet_tagged
        tweet.important_words = important_words
        filewriter.writerow(tweet.to_CSV())
        if n % 100 == 0: print n
    print "Done"
    output.close()
Example 13: __init__
# Required import: from nltk.tag.stanford import POSTagger [as alias]
# Or: from nltk.tag.stanford.POSTagger import tag [as alias]
class yagoScores:
    def __init__(self):
        self.en_postagger = POSTagger('parser/models/english-bidirectional-distsim.tagger', 'parser/stanford-postagger.jar')

    def parse(self, text):
        return self.en_postagger.tag(text.split())

    def get_underscoreWords(self, text):
        return re.findall("[a-z]+_[a-z]+", text)

    def findNounsSeq(self, tuples):
        self.noun = []
        self.nouns = []
        prev = ""
        for each in tuples:
            if each[1] == "NN":
                self.noun.append(each[0])
            if each[1] == "NNS":
                self.nouns.append(prev + " " + each[0])
                prev = prev + " " + each[0]
            else:
                prev = each[0]

    def searchInWiki(self, guessess):
        #text = " ".join(self.noun)+" ".join(self.nouns)
        text = " ".join(self.nouns)
        print text
        print ("LINKS")
        links = wikipedia.search(text)
        print links
        for link in links:
            page = wikipedia.page(link)
            print page.title
            # check whether a guess appears in that page
            for eachg in guessess:
                print eachg.replace("_", " ").lower()
                if eachg.replace("_", " ").lower() in page.content.lower():
                    print "found"
                    self.freq[eachg] += 1

    # Call getScore(self, text, guessess) from outside; returns a dict of scores of wiki appearances
    def getScore(self, text, guessess):
        self.freq = defaultdict(int)
        tuples = self.parse(text)
        print tuples
        self.findNounsSeq(tuples)
        self.searchInWiki(guessess)
        print self.freq
        return self.freq
Example 14: pos_tag_stanford
# Required import: from nltk.tag.stanford import POSTagger [as alias]
# Or: from nltk.tag.stanford.POSTagger import tag [as alias]
def pos_tag_stanford(toked_sentence):
    """
    INPUT: list of strings
    OUTPUT: list of tuples

    Given a tokenized sentence, return a list of tuples of the form
    (token, POS), where POS is the part of speech of the token.
    """
    from nltk.tag.stanford import POSTagger
    st = POSTagger('/Users/jeff/Zipfian/opinion-mining/references/resources/stanford-pos/stanford-postagger-2014-06-16/models/english-bidirectional-distsim.tagger',
                   '/Users/jeff/Zipfian/opinion-mining/references/resources/stanford-pos/stanford-postagger-2014-06-16/stanford-postagger.jar')
    return st.tag(toked_sentence)
Example 15: StanfordTagger
# Required import: from nltk.tag.stanford import POSTagger [as alias]
# Or: from nltk.tag.stanford.POSTagger import tag [as alias]
class StanfordTagger(WorkflowNativePOSTagger):
    def __init__(self, xml):
        from nltk.tag.stanford import POSTagger
        import os
        super(StanfordTagger, self).__init__(xml)
        self.tagger = POSTagger(os.path.join(os.getcwd(), 'External/english-bidirectional-distsim.tagger'), os.path.join(os.getcwd(), 'External/stanford-postagger.jar'))

    def is_ascii(self, s):
        return all(ord(c) < 128 for c in s)

    def tokenize(self, document):
        # Non-ASCII characters make the Stanford tagger go crazy and run out of heap space.
        # 'document' is passed straight to tag(), so it is presumably already a token sequence.
        if self.is_ascii(document):
            for word, tag in self.tagger.tag(document):
                yield "%s/%s" % (word, tag)
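A closing note on API drift: the POSTagger class used throughout these examples belongs to older NLTK releases. In NLTK 3.x the class was renamed StanfordPOSTagger (the command-line tool bindings were later deprecated altogether in favor of the CoreNLP server interface). A minimal sketch of the renamed import, reusing the same placeholder paths as in the sketch at the top:

# On NLTK 3.x the class is StanfordPOSTagger; the constructor arguments
# (model path, jar path, encoding) are unchanged. Paths are placeholders.
from nltk.tag.stanford import StanfordPOSTagger

st = StanfordPOSTagger('models/english-bidirectional-distsim.tagger',
                       'stanford-postagger.jar', encoding='utf8')
print(st.tag('What is the airspeed of an unladen swallow ?'.split()))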