This article collects typical usage examples of the POSTagger.tag_sents method from Python's nltk.tag.stanford module. If you have been wondering how to use POSTagger.tag_sents, or what it does, the curated examples below should help. You can also explore further usage examples of the class it belongs to, nltk.tag.stanford.POSTagger.
Below are 4 code examples of POSTagger.tag_sents, sorted by popularity by default.
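Before the examples, a quick sketch of the method's contract may help: tag_sents takes a list of already-tokenized sentences (each a list of token strings) and returns one list of (token, tag) pairs per sentence. A minimal sketch, with placeholder paths to a locally downloaded Stanford POS Tagger; note that newer NLTK releases renamed this class to StanfordPOSTagger:

from nltk.tag.stanford import POSTagger

# Placeholder paths: point these at your own Stanford POS Tagger download.
model = 'models/english-bidirectional-distsim.tagger'
jar = 'stanford-postagger.jar'

tagger = POSTagger(model, path_to_jar=jar, encoding='UTF-8')
sentences = [['This', 'is', 'a', 'test', '.'], ['Another', 'sentence', '.']]
tagged = tagger.tag_sents(sentences)
# e.g. tagged[0] -> [('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('test', 'NN'), ('.', '.')]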
Example 1: pos_tag
# Required module import: from nltk.tag.stanford import POSTagger [as alias]
# Or alternatively: from nltk.tag.stanford.POSTagger import tag_sents [as alias]
def pos_tag(texts):
    # `config.mainpath` and `language` are module-level globals defined elsewhere.
    from nltk.tag.stanford import POSTagger
    jar = config.mainpath + "analyze/SPOS/stanford-postagger.jar"
    if language == "german":
        model = config.mainpath + "analyze/SPOS/models/german-fast.tagger"
    if language == "english":
        model = config.mainpath + "analyze/SPOS/models/english-bidirectional-distsim.tagger"
    tagger = POSTagger(model, path_to_jar=jar, encoding="UTF-8")
    return tagger.tag_sents(texts)
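A hedged usage sketch for the wrapper above, assuming config.mainpath and language are module-level settings defined elsewhere; note that tag_sents expects pre-tokenized input, so each text must already be a list of tokens:

language = "german"  # assumed module-level setting
texts = [['Das', 'ist', 'ein', 'Test', '.'],
         ['Noch', 'ein', 'Satz', '.']]
tagged = pos_tag(texts)  # one list of (token, tag) pairs per sentence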
Example 2: add_POS
# Required module import: from nltk.tag.stanford import POSTagger [as alias]
# Or alternatively: from nltk.tag.stanford.POSTagger import tag_sents [as alias]
def add_POS(self, row_file, target):
    '''
    row_str = ''
    f = open(row_file, 'rb')
    for row in f:
        row_str += row
    soup = BeautifulSoup(row_str)
    self.soup = soup
    sentences = soup.find_all('sentence')
    all_token = list()
    for block in sentences:
        text = block.text.strip()
        text_token = self.tf.stanford_tokenize(text)
        all_token.append(text_token)
    '''
    all_token = self.get_token(target)
    stanford_tagger = \
        POSTagger('../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger',
                  '../stanford-postagger-full-2015-01-30/stanford-postagger.jar')
    tag_list = list()
    for row in all_token:
        temp_list = list()
        for word in row:
            # Lowercase words longer than one character that start with uppercase letters.
            if len(word) > 1 and re.match(r'^[A-Z]+', word):
                temp_list.append(word.lower())
            else:
                temp_list.append(word)
        tag_list.append(temp_list)
    # end for
    tagged_result = stanford_tagger.tag_sents(tag_list)
    '''
    for row in tagged_result:
        index_list = list()
        for num, item in enumerate(row):
            if not re.match(r'.*[\w\d]+', item[0]):
                index_list.append(num)
        for i in index_list:
            row[i] = (row[i][0], row[i][0])
    # end for
    '''
    # Write one "token tag" pair per line, with a blank line between sentences.
    w = open('pos_%s' % target, 'wb')
    for num1, row in enumerate(tagged_result):
        for num2, item in enumerate(row):
            w.write(all_token[num1][num2] + ' ' + item[1] + '\n')
        w.write('\n')
    # print tagged_result
    return
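The write loop above emits a simple two-column file: each line holds the original token and its predicted tag separated by a space, with a blank line between sentences. A hypothetical excerpt of the resulting pos_<target> file might look like:

The DT
screen NN
cracked VBD
. .

It PRP
works VBZ
. .

Batching all sentences into a single tag_sents call, as done here, also matters for speed: in the NLTK versions this code targets, every call to the Stanford wrapper launches a separate Java process, and tag() itself simply delegates to tag_sents with a one-sentence list.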
Example 3: generate_pos_set
# Required module import: from nltk.tag.stanford import POSTagger [as alias]
# Or alternatively: from nltk.tag.stanford.POSTagger import tag_sents [as alias]
def generate_pos_set(self):
    print 'Building the positive-set dictionary....'
    pos_dict = dict()
    pos_set = set()
    sentences = list()
    for row in self.train_label:
        for key in row:
            if ' ' in key:
                # Multi-word labels are tokenized here and tagged below.
                sentences.append(self.tk.word_tokenize(key))
            else:
                pos_dict[key] = pos_dict.setdefault(key, 0) + 1
                # pos_set.add(key)
    # end for
    st = POSTagger('../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger',
                   '../stanford-postagger-full-2015-01-30/stanford-postagger.jar')
    result = st.tag_sents(sentences)
    for row in result:
        for item in row:
            if item[1].startswith('NN'):
                pos_dict[item[0]] = pos_dict.setdefault(item[0], 0) + 1
                # pos_set.add(item[0])
    # end for
    neg_dict = dict()
    for num, row in enumerate(self.tagged_train_data):
        for item in row:
            if item[1].startswith('NN') and item[0] not in self.train_word_label[num]:
                neg_dict[item[0]] = neg_dict.setdefault(item[0], 0) + 1
    for key in pos_dict.keys():
        if pos_dict[key] > 1:
            if key in neg_dict:
                # Note: integer division under Python 2.
                if neg_dict[key] / pos_dict[key] < 2:
                    pos_set.add(key)
            else:
                pos_set.add(key)
    self.pos_set = pos_set
    print 'Done!'
    return
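The final loop implements a precision filter: a noun survives into pos_set only if it occurred more than once on the positive side and, when it also appears in the negative counts, occurs there less than twice as often. A minimal sketch of the same filter with hypothetical counts and explicit float division (the original relies on Python 2's truncating integer division):

# Hypothetical counts for illustration only.
pos_dict = {'battery': 3, 'screen': 2, 'thing': 4}
neg_dict = {'screen': 5, 'thing': 7}

pos_set = set()
for key, pos_count in pos_dict.items():
    if pos_count > 1:
        # float() makes the ratio test explicit; the original Python 2 code
        # truncates the quotient before comparing it against 2.
        if key not in neg_dict or float(neg_dict[key]) / pos_count < 2:
            pos_set.add(key)
# pos_set -> {'battery', 'thing'}  ('screen' is filtered out: 5/2 >= 2)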
Example 4: __init__
# Required module import: from nltk.tag.stanford import POSTagger [as alias]
# Or alternatively: from nltk.tag.stanford.POSTagger import tag_sents [as alias]
class POSTagSelector:

    def __init__(self, pos_model, stanford_tagger, java_path):
        """
        Creates a POSTagSelector instance.
        @param pos_model: Path to a POS tagging model for the Stanford POS Tagger.
        The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
        @param stanford_tagger: Path to the "stanford-postagger.jar" file.
        The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
        @param java_path: Path to the system's "java" executable.
        Commonly found at "/usr/bin/java" on Unix/Linux systems, or at "C:/Program Files/Java/jdk_version/java.exe" on Windows systems.
        """
        os.environ['JAVAHOME'] = java_path
        self.tagger = POSTagger(pos_model, stanford_tagger)
    def selectCandidates(self, substitutions, victor_corpus):
        """
        Selects which candidates can replace the target complex words in each instance of a VICTOR corpus.
        @param substitutions: Candidate substitutions to be filtered.
        It can be in two formats:
        A dictionary produced by a Substitution Generator linking complex words to a set of candidate substitutions.
        Example: substitutions['perched'] = {'sat', 'roosted'}
        A list of candidate substitutions selected for the "victor_corpus" dataset by a Substitution Selector.
        Example: [['sat', 'roosted'], ['easy', 'uncomplicated']]
        @param victor_corpus: Path to a corpus in the VICTOR format.
        For more information about the file's format, refer to the LEXenstein Manual.
        @return: Returns a vector of size N, containing a set of selected substitutions for each instance in the VICTOR corpus.
        """
        selected_substitutions = []

        substitution_candidates = []
        if isinstance(substitutions, list):
            substitution_candidates = substitutions
        elif isinstance(substitutions, dict):
            void = VoidSelector()
            substitution_candidates = void.selectCandidates(substitutions, victor_corpus)
        else:
            print('ERROR: Substitutions are neither a dictionary nor a list!')
            return selected_substitutions

        # Read VICTOR corpus:
        lexf = open(victor_corpus)
        sents = []
        targets = []
        heads = []
        words = set([])
        c = -1
        for line in lexf:
            c += 1
            data = line.strip().split('\t')
            sent = data[0].strip().split(' ')
            target = data[1].strip()
            head = int(data[2].strip())
            sents.append(sent)
            targets.append(target)
            heads.append(head)
            words.update(set(substitution_candidates[c]))
        lexf.close()

        # Tag sentences:
        tagged_sents = self.tagger.tag_sents(sents)

        # Tag words (each candidate word is tagged as a one-token sentence):
        words = list(words)
        words_sents = [[w] for w in words]
        tagged_words = self.tagger.tag_sents(words_sents)
        word_to_tag = {}
        for i in range(0, len(words)):
            word_to_tag[words[i]] = tagged_words[i][0][1]

        for i in range(0, len(sents)):
            target = targets[i]
            head = heads[i]
            target_pos = str(tagged_sents[i][head][1])
            candidates = set(substitution_candidates[i])
            candidates = self.getCandidatesWithSamePOS(candidates, word_to_tag, target_pos)
            selected_substitutions.append(candidates)

        return selected_substitutions
    def getTargetPOS(self, sent, target, head):
        pos_data = []
        try:
            pos_data = nltk.pos_tag(sent)
            return pos_data[head][1]
        except UnicodeDecodeError:
            try:
                pos_data = nltk.pos_tag(target)
                return pos_data[0][1]
            except UnicodeDecodeError:
                return 'None'

    def getCandidatesWithSamePOS(self, candidates, word_to_tag, target_pos):
        result = set([])
#.........rest of this code omitted.........
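To round out the example, a hedged usage sketch of the class above. The file paths are placeholders for a local Stanford POS Tagger download and a VICTOR-format dataset, and getCandidatesWithSamePOS (elided above) is assumed to filter the candidate set down to words whose tag matches target_pos:

# Placeholder paths: adjust to your local installation and data.
selector = POSTagSelector(
    pos_model='stanford-postagger/models/english-bidirectional-distsim.tagger',
    stanford_tagger='stanford-postagger/stanford-postagger.jar',
    java_path='/usr/bin/java')

# Substitutions as produced by a Substitution Generator:
substitutions = {'perched': {'sat', 'roosted'}}
selected = selector.selectCandidates(substitutions, 'dataset.victor')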