本文整理匯總了 Python 中 nltk.tag.stanford.POSTagger 類的典型用法代碼示例。如果您正苦於以下問題：Python POSTagger 類的具體用法？Python POSTagger 怎麼用？Python POSTagger 使用的例子？那麼，這裡精選的類代碼示例或許可以為您提供幫助。
在下文中一共展示了 POSTagger 類的 15 個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於系統推薦出更棒的 Python 代碼示例。
示例1: nltk_stanfordpos
def nltk_stanfordpos(inpath, outfolder):
    """POS-tag French text files with the Stanford POS tagger via NLTK.

    Every file matching the glob pattern ``inpath`` is read as UTF-8, split
    on whitespace and tagged.  The result is written to ``outfolder`` under
    the input's base name, one tagged item per line with its elements
    separated by tabs.

    Args:
        inpath: Glob pattern selecting the input text files.
        outfolder: Directory receiving the tagged files; created if missing.
    """
    print("\nLaunched nltk_stanfordpos.")
    import os
    import glob
    from nltk.tag.stanford import POSTagger

    paths = glob.glob(inpath)
    if paths:
        # The tagger and the output folder do not depend on the file being
        # processed, so set them up once instead of once per file.
        st = POSTagger('/home/christof/Programs/stanfordpos/models/french.tagger', '/home/christof/Programs/stanfordpos/stanford-postagger.jar', encoding="utf8")
        if not os.path.exists(outfolder):
            os.makedirs(outfolder)
    for path in paths:
        with open(path, "r", encoding="utf-8") as infile:
            untagged = infile.read()
        tagged = st.tag(untagged.split())
        taggedstring = "".join("\t".join(item) + "\n" for item in tagged)
        # Write with the same encoding the input was read with; the platform
        # default may not be able to represent accented characters.
        with open(os.path.join(outfolder, os.path.basename(path)), "w", encoding="utf-8") as output:
            output.write(taggedstring)
    print("Done.")
示例2: main
def main():
    """Tag the text file named by ``sys.argv[1]`` and print it piece by piece.

    The file content is split on newlines, periods and question marks.  Each
    piece that contains at least one token is whitespace-tokenized, tagged,
    passed through ``stripPieces`` and printed as one space-joined line.
    """
    st = POSTagger(
        "/home/shaun/stanford-postagger-full-2013-11-12/models/german-dewac.tagger",
        "/home/shaun/stanford-postagger-full-2013-11-12/stanford-postagger.jar",
    )
    with open(sys.argv[1], "r") as f:
        content = f.read()
    # Raw string so the backslashes reach the regex engine untouched.
    for s in re.split(r"\n|\.|\?", content):
        tokens = s.split()
        if not tokens:
            # Empty and whitespace-only pieces carry nothing to tag; skip
            # them instead of invoking the tagger on an empty token list.
            continue
        strippedPieces = stripPieces(st.tag(tokens))
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(" ".join(strippedPieces))
示例3: cleanTokens
def cleanTokens(tokens):
    """Return the words from ``tokens`` that are tagged NE*/NN* and are long enough.

    Args:
        tokens: List of token strings to tag.

    Returns:
        A list with the first element of every tagged item whose second
        element starts with ``NE`` or ``NN`` and whose first element is
        longer than three characters, in tagger output order.
    """
    st = POSTagger('/models/german-fast.tagger')
    tags = st.tag(tokens)
    # Index access (not tuple unpacking) keeps working if a tagged item has
    # more than two elements.
    return [
        item[0]
        for item in tags
        if re.match("NE|NN", item[1]) and len(item[0]) > 3
    ]
示例4: stanford_corenlp_filter
def stanford_corenlp_filter(sent):
    """Keep only the stemmed words of ``sent`` whose POS tag is in ``filterList``.

    ``sent`` is split into two blocks at ``blockSeparator``.  Each block is
    lower-cased, tokenized and tagged; words whose tag is in ``filterList``
    are stemmed and prefixed with ``'1'`` (first block) or ``'2'`` (second
    block) so the two blocks stay distinguishable in the result.

    Args:
        sent: A string containing exactly one ``blockSeparator``.

    Returns:
        A string that starts with a space and contains every kept word
        followed by a space, first-block words before second-block words.

    Raises:
        ValueError: If ``sent`` does not split into exactly two blocks.
    """
    from nltk.tag.stanford import POSTagger
    posTagger = POSTagger('/Users/gt/Downloads/'
                          'stanford-postagger-2013-06-20/models/'
                          'wsj-0-18-bidirectional-nodistsim.tagger',
                          '/Users/gt/Downloads/stanford-postagger-2013-06-20'
                          '/stanford-postagger-3.2.0.jar', encoding=encoding)
    b1, b2 = sent.split(blockSeparator)
    b2 = b2.rstrip()

    def _filter_block(block, prefix):
        # Tag one block and return its kept stems, each prefixed and
        # followed by a single space.
        pos_tags = posTagger.tag(word_tokenize(block.lower()))
        return ''.join(
            prefix + stemmer.stem(pos_t[0]) + ' '
            for pos_t in pos_tags
            if pos_t[1] in filterList
        )

    # The earlier version reset the accumulator before the second block,
    # which silently dropped every '1'-prefixed word; both blocks are
    # returned here.
    return ' ' + _filter_block(b1, '1') + _filter_block(b2, '2')
示例5: vectorizer
def vectorizer(tokens, w2v_db):
    """Weight the noun-like tokens and look up a vector for each of them.

    Tokens tagged ``NNP``, ``NNPS`` or ``FW`` count 1.5, tokens tagged ``NN``
    or ``NNS`` count 1, and every other token is ignored.  Counts are summed
    per lower-cased word.

    Args:
        tokens: List of token strings, in document order.
        w2v_db: Value passed to ``SQLCon`` to open the vector store.

    Returns:
        A tuple ``(unsorted_kw, token_vecs)`` of two ``OrderedDict`` objects
        keyed by lower-cased word in first-occurrence order: the summed
        weights, and ``list(v)`` for every word whose lookup did not return
        ``None``.
    """
    # POS tagging
    tagger = POSTagger('tagger/english-left3words-distsim.tagger', 'tagger/stanford-postagger.jar')
    tagged_tokens = tagger.tag(tokens)

    tag_weights = {'NNP': 1.5, 'NNPS': 1.5, 'FW': 1.5, 'NN': 1, 'NNS': 1}
    unsorted_kw = OrderedDict()
    for (w, t) in tagged_tokens:
        label = tag_weights.get(t)
        if label is None:
            continue
        w = w.lower()
        unsorted_kw[w] = unsorted_kw.get(w, 0) + label

    # Get the vectors of words. Maintain order as in document.
    token_vecs = OrderedDict()
    conn = SQLCon(w2v_db)
    try:
        # Keys of unsorted_kw are already lower-cased and unique, so no
        # further normalisation or duplicate check is needed here.
        for word in unsorted_kw:
            v = conn.read(word)
            if v is not None:
                token_vecs[word] = list(v)
        # Output for debugging: keyword count vs. number of words with a vector.
        print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw), len(token_vecs)))
    finally:
        # Release the connection even when a lookup fails.
        conn.close()
    return unsorted_kw, token_vecs
示例6: postext_st
def postext_st(filename):
    """Sentence-split, tokenize and POS-tag one raw text file.

    The file ``<path_to_raw><filename>.txt`` is read as UTF-8 line by line.
    Each stripped line is split into sentences with the punkt English model,
    each sentence is tokenized with ``StanfordTokenizer`` and each token list
    is tagged with the Stanford POS tagger.

    Args:
        filename: Base name of the text file, without the ``.txt`` suffix.

    Returns:
        A list with one tagger result per sentence.

    Raises:
        IOError: If ``filename`` is not a string.
    """
    # Opening of file
    path_to_raw = '/home/cyneo/Work/Scans/Text Version/'
    if not isinstance(filename, str):
        raise IOError('Filename must be a string')

    # Sentence splitting
    with open(osp.abspath(path_to_raw + filename + '.txt'),
              'r', encoding='utf8') as raw:
        # Initialize the punkt module
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        sents = []
        for line in raw:
            sents.extend(sent_detector.tokenize(line.strip()))

    # Tokenizing: build the tokenizer once and reuse it for every sentence.
    from nltk.tokenize.stanford import StanfordTokenizer
    tokenizer = StanfordTokenizer()
    tokenedsents = [tokenizer.tokenize(sent) for sent in sents]

    # Parts of speech tagging
    from nltk.tag.stanford import POSTagger
    st = POSTagger('/mnt/sda2/stanford-packages/stanford-postagger-2014-10-26/models/english-bidirectional-distsim.tagger',
                   encoding='utf8')
    return [st.tag(tokens) for tokens in tokenedsents]
示例7: createModel
def _train_and_pickle(training_set, algorithm, pickle_path):
    """Train a maxent classifier on ``training_set`` and pickle it to ``pickle_path``."""
    classifier = maxent.MaxentClassifier.train(training_set, algorithm)
    # The with-block closes the file even if pickling fails.
    with open(pickle_path, 'wb') as f:
        pickle.dump(classifier, f)
    return classifier


def createModel():
    """Train and pickle the five classifiers from tagged Brown sentences.

    Every sentence of ``brown.sents()`` is tagged with the Stanford tagger
    and fed to the five ``pos_*features`` extractors.  The collected feature
    sets are used to train one maxent classifier each.  Every classifier is
    stored in its module-level global and pickled to
    ``classifier<name>.pickle`` in the current directory.
    """
    global classifierit
    global classifierloose
    global classifieryou
    global classifierto
    global classifiertheir
    trainingitSet = []
    traininglooseSet = []
    trainingyouSet = []
    trainingtoSet = []
    trainingtheirSet = []
    st = POSTagger('/home/siddhartha/Downloads/stanford-postagger-full-2014-01-04/models/english-bidirectional-distsim.tagger', '/home/siddhartha/Downloads/stanford-postagger-full-2014-01-04/stanford-postagger.jar')
    for line in brown.sents():
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(line)
        tagSent = st.tag(line)
        print(tagSent)
        arrayOfitFeature = pos_itfeatures(tagSent)
        arrayOfyouFeature = pos_youfeatures(tagSent)
        arrayOftheirFeature = pos_theirfeatures(tagSent)
        arrayOflooseFeature = pos_loosefeatures(tagSent)
        arrayOftoFeature = pos_tofeatures(tagSent)
        # The truthiness checks skip empty results and guard against a
        # non-iterable falsy return from an extractor.
        if arrayOfitFeature:
            trainingitSet.extend(arrayOfitFeature)
        if arrayOftheirFeature:
            trainingtheirSet.extend(arrayOftheirFeature)
        if arrayOflooseFeature:
            traininglooseSet.extend(arrayOflooseFeature)
        if arrayOftoFeature:
            trainingtoSet.extend(arrayOftoFeature)
        if arrayOfyouFeature:
            trainingyouSet.extend(arrayOfyouFeature)
    algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[1]
    classifierit = _train_and_pickle(trainingitSet, algorithm, 'classifierit.pickle')
    classifierloose = _train_and_pickle(traininglooseSet, algorithm, 'classifierloose.pickle')
    classifieryou = _train_and_pickle(trainingyouSet, algorithm, 'classifieryou.pickle')
    classifierto = _train_and_pickle(trainingtoSet, algorithm, 'classifierto.pickle')
    classifiertheir = _train_and_pickle(trainingtheirSet, algorithm, 'classifiertheir.pickle')
示例8: stanford_tag
def stanford_tag(sentence):
    '''Tag one tokenized sentence with the Stanford tagger.

    The model and jar locations are taken from ``src.experiment.path``, and
    the tagger is created with the Java options given below.  The tagger's
    result for ``sentence`` is returned as is.
    '''
    import src.experiment.path as path
    model_location = path.stanford_tagger_model_path()
    jar_location = path.stanford_tagger_path()
    stanford = POSTagger(
        model_location,
        jar_location,
        java_options='-Xmx16g -XX:MaxPermSize=256m',
    )
    tagged_sentence = stanford.tag(sentence)
    return tagged_sentence
示例9: tag
def tag(segments):
    """Tag every text segment and render it as one decoded string.

    Each segment is tokenized with ``word_tokenize`` and tagged; every tagged
    item is turned into text with ``nltk.tag.tuple2str``, the pieces are
    joined with single spaces and the result is decoded from UTF-8.

    Returns a list with one such string per segment, in input order.
    """
    model_file = os.path.join(stanford_path, 'models/english-left3words-distsim.tagger')
    jar_file = os.path.join(stanford_path, 'stanford-postagger-3.3.1.jar')
    st = POSTagger(model_file, jar_file)

    def render(segment):
        pairs = st.tag(word_tokenize(segment))
        joined = ' '.join(nltk.tag.tuple2str(pair) for pair in pairs)
        return joined.decode('utf-8')

    return [render(segment) for segment in segments]
示例10: spanish_pos
def spanish_pos(text):
    """Tag Spanish text with the Stanford POS tagger.

    ``text`` is encoded to UTF-8 and split on whitespace; the resulting
    tokens are handed to the tagger and its result is returned unchanged.
    """
    model_path = '/Users/Lena/src/context/stanford-postagger/models/spanish-distsim.tagger'
    jar_path = '/Users/Lena/src/context/stanford-postagger/stanford-postagger.jar'
    tokens = text.encode('utf8').split()
    tagger = POSTagger(model_path, jar_path, 'utf8')
    return tagger.tag(tokens)
示例11: german_pos
def german_pos(text):
    """Tag German text with the Stanford POS tagger.

    ``text`` is encoded to UTF-8 and split on whitespace; the resulting
    tokens are handed to the tagger and its result is returned unchanged.
    """
    model_path = '/Users/Lena/src/context/stanford-postagger/models/german-fast.tagger'
    jar_path = '/Users/Lena/src/context/stanford-postagger/stanford-postagger.jar'
    tokens = text.encode('utf8').split()
    tagger = POSTagger(model_path, jar_path, 'utf8')
    return tagger.tag(tokens)
示例12: stanford_batch_tag
def stanford_batch_tag(sentences):
    '''Batch-tag a list of tokenized sentences with the Stanford tagger.

    The model and jar locations are read from ``src.experiment.path``;
    replace those two helper functions (or hard-code the paths) to match the
    local installation.  The result of the tagger's ``batch_tag`` call is
    returned as is.
    '''
    import src.experiment.path as path
    model_location = path.stanford_tagger_model_path()
    jar_location = path.stanford_tagger_path()
    stanford = POSTagger(model_location, jar_location)
    tagged_sentences = stanford.batch_tag(sentences)
    return tagged_sentences
示例13: pos_tag
def pos_tag(texts):
    """Tag a list of tokenized texts with the model for the current ``language``.

    The jar and model locations are built from ``config.mainpath``; the model
    is chosen by the module-level ``language`` value.

    Args:
        texts: Sequence of token lists passed to the tagger's ``tag_sents``.

    Returns:
        Whatever ``tag_sents`` returns for ``texts``.

    Raises:
        ValueError: If ``language`` is neither ``"german"`` nor ``"english"``.
    """
    from nltk.tag.stanford import POSTagger
    models = {
        "german": "analyze/SPOS/models/german-fast.tagger",
        "english": "analyze/SPOS/models/english-bidirectional-distsim.tagger",
    }
    if language not in models:
        # Fail with a clear message instead of an unbound-variable error
        # when no model exists for the configured language.
        raise ValueError("No POS tagger model configured for language %r" % (language,))
    jar = config.mainpath + "analyze/SPOS/stanford-postagger.jar"
    model = config.mainpath + models[language]
    tagger = POSTagger(model, path_to_jar=jar, encoding="UTF-8")
    return tagger.tag_sents(texts)
示例14: main
def main():
    """Tag every tweet of the input CSV and write the enriched rows to the output CSV.

    Rows of ``tweets_a_procesar_v2.csv`` are wrapped in ``Tweet`` objects.
    For each tweet, ``spanish_text`` is whitespace-tokenized and tagged.  The
    words whose tag begins with ``a``, ``r``, ``n`` or ``v`` are collected as
    ``important_words``, and the tweet's ``to_CSV()`` row is written to
    ``output_tagged_v2.csv``.  Progress is printed every 100 tweets.
    """
    print("Inicio...")
    with open("tweets_a_procesar_v2.csv", 'rb') as csvfile:
        lines = csv.reader(csvfile, delimiter=DELIMITER, quotechar="'")
        # All tweets are kept in this list.
        tweets = [Tweet(line) for line in lines]

    # Tag initials that mark a word as important.
    # NOTE(review): presumably adjective / adverb / noun / verb in the
    # Spanish tag set - confirm against the model's documentation.
    important_initials = ('a', 'r', 'n', 'v')

    # Output file; the with-block closes it even if tagging fails midway.
    with open("output_tagged_v2.csv", 'wb') as output:
        filewriter = csv.writer(output, delimiter=DELIMITER, quotechar="'")
        # Load the Stanford NLP Spanish tagger.
        from nltk.tag.stanford import POSTagger
        st = POSTagger('/Applications/XAMPP/htdocs/Proyectos/Stanford/stanford-postagger-full-2014-08-27/models/spanish-distsim.tagger','/Applications/XAMPP/htdocs/Proyectos/Stanford/stanford-postagger-full-2014-08-27/stanford-postagger-3.4.1.jar',encoding='utf-8')
        for n, tweet in enumerate(tweets, 1):
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(tweet.spanish_text)
            # Example: st.tag('What is the airspeed of an unladen swallow ?'.split())
            # gives [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ...]
            tweet_tagged = st.tag((tweet.spanish_text).split())
            # tag[1][:1] is the first character of the tag, or '' for an
            # empty tag, which matches none of the initials.
            important_words = [
                tag[0] for tag in tweet_tagged
                if tag[1][:1] in important_initials
            ]
            tweet.tweet_tagged = tweet_tagged
            tweet.important_words = important_words
            filewriter.writerow(tweet.to_CSV())
            if n % 100 == 0:
                print(n)
        print("Done")
示例15: pos_tag_stanford
def pos_tag_stanford(toked_sentence):
    """
    Tag a tokenized sentence with the Stanford POS tagger.

    INPUT: list of strings (the tokens of one sentence)
    OUTPUT: list of tuples of the form (token, POS), where POS is the
    part of speech the tagger assigned to token
    """
    from nltk.tag.stanford import POSTagger
    model_path = '/Users/jeff/Zipfian/opinion-mining/references/resources/stanford-pos/stanford-postagger-2014-06-16/models/english-bidirectional-distsim.tagger'
    jar_path = '/Users/jeff/Zipfian/opinion-mining/references/resources/stanford-pos/stanford-postagger-2014-06-16/stanford-postagger.jar'
    tagger = POSTagger(model_path, jar_path)
    tagged = tagger.tag(toked_sentence)
    return tagged