本文整理汇总了Python中corpus.Corpus.get_sentences方法的典型用法代码示例。如果您正苦于以下问题:Python Corpus.get_sentences方法的具体用法?Python Corpus.get_sentences怎么用?Python Corpus.get_sentences使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类corpus.Corpus
的用法示例。
在下文中一共展示了Corpus.get_sentences方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: main
# 需要导入模块: from corpus import Corpus [as 别名]
# 或者: from corpus.Corpus import get_sentences [as 别名]
def main():
    """Tag short sentences of a corpus using bigram POS statistics.

    Command-line arguments:
        train:  path to the training corpus (all probabilities come from it).
        corpus: path to the corpus to tag.
        n:      only sentences shorter than this length are tagged.

    For each word, emission P(word|pos) and transition P(pos|prev_pos)
    candidates are collected into per-word columns, a trellis of path
    probabilities is filled column by column, and the best tag per column
    is printed next to the gold annotation.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("train", help="Path to training corpus.")
    parser.add_argument("corpus", help="Path to corpus.")
    parser.add_argument("n", help="Tag sentences shorter than this length.")
    args = parser.parse_args()
    train_corpus = Corpus(args.train)
    corpus = Corpus(args.corpus)
    n = int(args.n)
    # Model statistics, all estimated from the training corpus.
    pos_frequencies = processing.pos_frequencies(train_corpus)
    poses_for_word_from_train, total_pos_count = processing.calculate_poses_for_word(train_corpus)
    pos_bigram_probabilities_train = processing.calculate_pos_bigram_probabilities(train_corpus)
    word_pos_probabilities_train = processing.calculate_word_pos_probabilities(train_corpus)
    # Restrict tagging to sentences shorter than n tokens.
    sentences = [sentence for sentence in corpus.get_sentences() if len(sentence) < n]
    # Keys into the per-(column, pos) probability dicts below.
    WORD_GIVEN_POS = 0       # emission probability P(word|pos)
    POS_GIVEN_PREVPOS = 1    # transition probability P(pos|prev_pos)
    for sentence in sentences:
        prev_pos = "<s>"      # sentence-start marker for the first transition
        columns = {}          # word id -> {candidate pos -> {probability kind -> value}}
        current_sentence = []
        for word in sentence:
            # NOTE(review): each word appears to be a 6-field CoNLL-style row
            # (id, form, lemma, plemma, pos, ppos) — confirm against Corpus.
            id, form, lemma, plemma, pos, ppos = word
            current_sentence.append([id, form, lemma, plemma, pos])
            columns[id] = {}
            if form in poses_for_word_from_train:
                # Known word: one candidate per POS observed with it in training.
                for (pos_for_word, pos_for_word_count) in poses_for_word_from_train[form].items():
                    p_word_given_pos = word_pos_probabilities_train["{0} {1}".format(form, pos_for_word)]
                    pos_bigram = "{0} {1}".format(prev_pos, pos_for_word)
                    if pos_bigram in pos_bigram_probabilities_train:
                        p_pos_given_prevpos = pos_bigram_probabilities_train[pos_bigram]
                    else:
                        p_pos_given_prevpos = 0.00001  # Low chance that this is what we want
                    columns[id][pos_for_word] = {}
                    columns[id][pos_for_word][WORD_GIVEN_POS] = p_word_given_pos
                    columns[id][pos_for_word][POS_GIVEN_PREVPOS] = p_pos_given_prevpos
            else:
                # Unknown word: back off to the overall most frequent POS.
                most_common_pos = max(pos_frequencies.items(), key=lambda x: x[1])
                if form in word_pos_probabilities_train:
                    p_word_given_pos = word_pos_probabilities_train["{0} {1}".format(form, most_common_pos[0])]
                else:
                    p_word_given_pos = 0.00001  # Low chance that this is what we want
                p_pos_given_prevpos = pos_bigram_probabilities_train["{0} {1}".format(prev_pos, most_common_pos[0])]
                columns[id][most_common_pos[0]] = {}
                columns[id][most_common_pos[0]][WORD_GIVEN_POS] = p_word_given_pos
                columns[id][most_common_pos[0]][POS_GIVEN_PREVPOS] = p_pos_given_prevpos
            # NOTE: the *gold* tag of the current word becomes the context for
            # the next word's transition probabilities.
            prev_pos = pos
        path = {}     # column id -> best pos chosen for that column
        trellis = {}  # column id -> {pos -> accumulated path probability}
        for (column_id, poses) in sorted(columns.items(), key=lambda x: int(x[0])):
            column_id = int(column_id)
            trellis[column_id] = {}
            for (current_pos, data) in poses.items():
                current_word_given_pos = data[WORD_GIVEN_POS]
                current_pos_given_prevpos = data[POS_GIVEN_PREVPOS]
                if column_id == 0:
                    # Column 0 is the sentence-start dummy; nothing to score.
                    break
                elif column_id == 1:
                    # First real word: no previous trellis column to combine with.
                    trellis[column_id][current_pos] = current_word_given_pos * current_pos_given_prevpos
                else:
                    # Best extension of any path through the previous column.
                    max_prev_column = max(
                        [(id, data * current_pos_given_prevpos) for id, data in trellis[column_id - 1].items()],
                        key=lambda x: x[1],
                    )
                    p = max_prev_column[1] * current_word_given_pos
                    trellis[column_id][current_pos] = p
            if column_id == 0:
                continue
            else:
                # Greedy per-column choice of the highest-probability tag.
                path[column_id] = max(trellis[column_id].items(), key=lambda x: x[1])[0]
        for (id, predicted) in sorted(path.items(), key=lambda x: x[0]):
            if id == 1:
                print()  # blank line separates sentences in the output
            # NOTE(review): indexing current_sentence by the word id assumes
            # ids are 0-based positions including the start dummy — confirm.
            id, form, lemma, plemma, pos = current_sentence[id]
            print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format(id, form, lemma, plemma, pos, predicted))
示例2: main
# 需要导入模块: from corpus import Corpus [as 别名]
# 或者: from corpus.Corpus import get_sentences [as 别名]
def _best_previous(parent_trellis, candidate_pos, bigram_probabilities):
    """Return (prev_pos, probability) of the most likely predecessor state.

    Combines each parent state's accumulated path probability with the
    POS-bigram transition probability P(T_i|T_i-1) for *candidate_pos*,
    falling back to a small constant for unseen bigrams.
    """
    probability_pos_given_prevpos = {}
    for prev_pos in parent_trellis:
        probability = parent_trellis[prev_pos]['probability']
        bigram = '{0} {1}'.format(prev_pos, candidate_pos)
        if bigram not in bigram_probabilities:
            probability_bigram = 0.0001  # Gives highest value on the development set for unknown bigrams
        else:
            # P(T_i|T_i-1)
            probability_bigram = bigram_probabilities[bigram]
        probability *= probability_bigram
        probability_pos_given_prevpos[prev_pos] = probability
    return max(probability_pos_given_prevpos.items(), key=lambda x: x[1])


def main():
    """Viterbi-style bigram POS tagger.

    Command-line arguments:
        train:  path to the training corpus (emission and transition
                probabilities are estimated here).
        corpus: path to the corpus file to tag.

    For every sentence a trellis is built word by word: each candidate tag
    stores its best path probability and a link to its best predecessor.
    The most probable final state is then backtracked, the predicted tags
    are written into the last (ppos) field, and the sentence is printed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('train', help='Path to training corpus.')
    parser.add_argument('corpus', help='Path to corpus file.')
    args = parser.parse_args()
    train_corpus = Corpus(args.train)
    corpus = Corpus(args.corpus)
    # NOTE(review): pos_frequencies is computed from the corpus being tagged,
    # not the training corpus — confirm this is intended.
    pos_frequencies = processing.pos_frequencies(corpus)
    word_pos_probabilities = processing.calculate_word_pos_probabilities(train_corpus)
    bigram_probabilities = processing.calculate_pos_bigram_probabilities(train_corpus)
    poses_for_words, total_pos_count = processing.calculate_poses_for_word(train_corpus)
    for sentence in corpus.get_sentences():
        # Virtual sentence-start state with path probability 1.
        parent_trellis = {'<s>': {'probability': 1, 'parent': None}}
        for word in sentence:
            id, form, lemma, plemma, current_word_pos, ppos = word
            if word == [0, '<s>', '<s>', '<s>', '<s>', '<s>']:
                continue  # skip the sentence-start dummy token
            trellis = {}
            if form not in poses_for_words:
                # Unknown word: fall back to the single most frequent POS tag.
                most_common_pos = max(pos_frequencies.items(), key=lambda x: x[1])
                # BUG FIX: the original referenced the undefined name
                # `pos_for_word` in this branch (NameError for a sentence
                # whose first unknown word precedes any known word, a stale
                # value otherwise); the fallback tag is most_common_pos[0].
                max_probability = _best_previous(parent_trellis, most_common_pos[0], bigram_probabilities)
                trellis[most_common_pos[0]] = {}
                trellis[most_common_pos[0]][
                    'probability'] = 0.0001  # Gives highest value on the development set for unknown poses
                trellis[most_common_pos[0]]['parent'] = {max_probability[0]: parent_trellis[max_probability[0]]}
            else:
                # Known word: score every tag observed with it in training.
                for (pos_for_word, count) in poses_for_words[form].items():
                    # P(W|T)
                    probability_word_given_pos = word_pos_probabilities['{0} {1}'.format(form, pos_for_word)]
                    max_probability = _best_previous(parent_trellis, pos_for_word, bigram_probabilities)
                    trellis[pos_for_word] = {}
                    trellis[pos_for_word]['probability'] = probability_word_given_pos * max_probability[1]
                    trellis[pos_for_word]['parent'] = {max_probability[0]: parent_trellis[max_probability[0]]}
            parent_trellis = trellis
        # Backtrack from the most probable final state, writing the predicted
        # tag into the last column (ppos) of each word.
        optimal_path = max(trellis.items(), key=lambda x: x[1]['probability'])
        prev_path = {optimal_path[0]: optimal_path[1]}
        current_id = int(sentence[-1][0])
        while prev_path is not None:
            predicted = list(prev_path.keys())[0]
            if current_id == 0:
                break  # reached the start dummy; nothing to annotate
            sentence[current_id][-1] = predicted
            prev_path = prev_path[predicted]['parent']
            current_id -= 1
        for word in sentence:
            id, form, lemma, plemma, pos, ppos = word
            if id == 0:
                print()  # blank line separates sentences in the output
            else:
                print('{0}\t{1}\t{2}\t{3}\t{4}\t{5}'.format(id, form, lemma, plemma, pos, ppos))