This page collects typical usage examples of the Python method article.Article.sentence_stream. If you are wondering how Article.sentence_stream works in Python, or are looking for concrete ways to use it, the selected code examples below may help. You can also look further into the containing class, article.Article.
The following shows 3 code examples of the Article.sentence_stream method, sorted by popularity by default.
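Before the detailed examples, here is a minimal quick-start sketch. It is an illustration only (not one of the examples below) and assumes, as those examples do, that Article.sentence_stream() accepts limit/skip keyword arguments and yields each sentence as a list of token dicts whose "x" key holds the token text:

from article import Article

# Iterate over sentences from parsed articles and rebuild their plain text
for sent in Article.sentence_stream(limit = 10):
    txt = " ".join(t["x"] for t in sent if "x" in t)
    if txt:
        print(txt)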
Example 1: article_test
# Required import: from article import Article  [as alias]
# Or: from article.Article import sentence_stream  [as alias]
def article_test(session):
    # Tag the first TEST_SET sentences from parsed articles;
    # tagger and TEST_SET come from the enclosing scope (cf. Example 3)
    sentence_stream = Article.sentence_stream(limit = TEST_SET)
    for sent in sentence_stream:
        # Reconstruct the sentence text from the token dicts
        txt = " ".join(t["x"] for t in sent if "x" in t)
        if txt:
            toklist = tokenize(txt, enclosing_session = session)
            dlist = tagger.tag(toklist)
            print("Sentence: '{0}'".format(txt))
            print("Tagging result:\n{0}".format("\n".join(str(d) for d in dlist)))
Example 2: train_tagger
# Required import: from article import Article  [as alias]
# Or: from article.Article import sentence_stream  [as alias]
def train_tagger():
    """ Train the TnT tagger and store its model in a pickle file """
    # Number of training and test sentences
    TRAINING_SET = 0  # 25000
    TEST_SET = 400
    BEAM_SIZE = 250  # A higher number does not seem to yield improved results
    tnt_tagger = TnT(N = BEAM_SIZE, C = True)
    if TRAINING_SET:
        with timeit(f"Train TnT tagger on {TRAINING_SET} sentences from articles"):
            # Get a sentence stream from parsed articles, skipping
            # the sentences that are reserved for the test set
            sentence_stream = Article.sentence_stream(limit = TRAINING_SET, skip = TEST_SET)
            word_tag_stream = IFD_Tagset.word_tag_stream(sentence_stream)
            tnt_tagger.train(word_tag_stream)
    with timeit("Train TnT tagger on IFD training set"):
        # Get a word/tag stream from the IFD corpus,
        # holding out every sample_ratio-th sentence
        sample_ratio = 50
        word_tag_stream = IFD_Corpus().word_tag_stream(skip = lambda n: n % sample_ratio == 0)
        tnt_tagger.train(word_tag_stream)
    with timeit(f"Store TnT model trained on {tnt_tagger.count} sentences"):
        tnt_tagger.store(_TNT_MODEL_FILE)
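After train_tagger() has run, the stored model can be read back and used directly. A minimal sketch, assuming the same TnT class and _TNT_MODEL_FILE constant as above, and that TnT.load() returns None on failure and tag() yields (word, tag) pairs, as Example 3 suggests:

# Load the pickled TnT model and tag a pre-tokenized sentence
tnt_tagger = TnT.load(_TNT_MODEL_FILE)
if tnt_tagger is None:
    print(f"Unable to load TnT model from {_TNT_MODEL_FILE}")
else:
    words = "Þau segja að börn hafi gott af því .".split()
    for word, tag in tnt_tagger.tag(words):
        print(f"{word}\t{tag}")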
Example 3: test_tagger
# Required import: from article import Article  [as alias]
# Or: from article.Article import sentence_stream  [as alias]
def test_tagger():
    print("Initializing tagger")
    # Number of training and test sentences
    TRAINING_SET = 500
    IFD_TRAINING_SET = 21000  # There are only about 20,800 sentences in the IFD corpus
    TEST_SET = 400
    BEAM_SIZE = 250  # A higher number does not seem to yield improved results
    # noinspection PyUnreachableCode
    if False:
        tnt_tagger = TnT(N = BEAM_SIZE, C = True)
        tagger = NgramTagger(n = 3, verbose = False)
        # Create a new model and store it
        with timeit("Train NgramTagger"):
            # Get a sentence stream from parsed articles, skipping
            # the sentences that are reserved for the test set
            sentence_stream = Article.sentence_stream(limit = TRAINING_SET, skip = TEST_SET)
            tagger.train(sentence_stream)
        with timeit("Train TnT_Tagger on articles"):
            # Get a sentence stream from parsed articles, skipping
            # the sentences that are reserved for the test set
            sentence_stream = Article.sentence_stream(limit = TRAINING_SET, skip = TEST_SET)
            word_tag_stream = IFD_Tagset.word_tag_stream(sentence_stream)
            tnt_tagger.train(word_tag_stream)
        with timeit("Train TnT_Tagger on IFD"):
            # Get a word/tag stream from the IFD corpus
            word_tag_stream = IFD_Corpus().word_tag_stream(limit = IFD_TRAINING_SET, skip = TEST_SET)
            tnt_tagger.train(word_tag_stream)
        with timeit("Store TnT model"):
            tnt_tagger.store(_TNT_MODEL_FILE)
    else:
        tagger = None
        # Load an existing model
        with timeit("load_model()"):
            tnt_tagger = TnT.load(_TNT_MODEL_FILE)
        if tnt_tagger is None:
            print(f"Unable to load TnT model from {_TNT_MODEL_FILE}, test aborted")
            return
    # tagger.show_model()
    # return
    total_tags = 0
    correct_tag = 0
    partial_tag = 0
    missing_tag = 0
    correct_tag_tnt = 0
    partial_tag_tnt = 0
    missing_tag_tnt = 0

    def simple_test(session):
        txt = "Þau segja að börn hafi gott af því."
        toklist = tokenize(txt, enclosing_session = session)
        dlist = tagger.tag(toklist)
        print("Sentence: '{0}'".format(txt))
        print("Tagging result:\n{0}".format("\n".join(str(d) for d in dlist)))

    def article_test(session):
        # Tag the first TEST_SET sentences from parsed articles
        sentence_stream = Article.sentence_stream(limit = TEST_SET)
        for sent in sentence_stream:
            txt = " ".join(t["x"] for t in sent if "x" in t)
            if txt:
                toklist = tokenize(txt, enclosing_session = session)
                dlist = tagger.tag(toklist)
                print("Sentence: '{0}'".format(txt))
                print("Tagging result:\n{0}".format("\n".join(str(d) for d in dlist)))

    def test_ifd_file(session):
        print("\n\n*** IFD TEST SET ***\n\n")
        gen = IFD_Corpus().raw_sentence_stream(limit = TEST_SET)
        dlist = None
        for sent in gen:
            # Each sentence is a list of (word, tag, lemma) triples
            orðalisti = [ triple[0] for triple in sent ]
            mörk_OTB = [ triple[1] for triple in sent ]
            lemmur_OTB = [ triple[2] for triple in sent ]
            txt = " ".join(orðalisti)
            if tagger is not None:
                toklist = tokenize(txt, enclosing_session = session)
                dlist = tagger.tag(toklist)
            tntlist = tnt_tagger.tag(orðalisti)
            ix = 0
            print("\n{0}\n".format(txt))
            for tag, lemma, word, tnt_wt in zip(mörk_OTB, lemmur_OTB, orðalisti, tntlist):
                tnt_tag = tnt_wt[1]
                j = ix
                if dlist is None:
                    gtag = "?"
                else:
                    # Find the corresponding token in the NgramTagger output
                    while j < len(dlist) and dlist[j].get("x", "") != word:
                        j += 1
                    if j < len(dlist):
                        ix = j
                        gtag = dlist[ix].get("i", "?")
                        if gtag == "?" and dlist[ix].get("k") == TOK.PUNCTUATION:
                            gtag = word
#......... the rest of the code is omitted here .........