本文整理汇总了Python中document.Document.paragraphs方法的典型用法代码示例。如果您正苦于以下问题:Python Document.paragraphs方法的具体用法?Python Document.paragraphs怎么用?Python Document.paragraphs使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类document.Document
的用法示例。
在下文中一共展示了Document.paragraphs方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: TestDocumentBasics
# 需要导入模块: from document import Document [as 别名]
# 或者: from document.Document import paragraphs [as 别名]
class TestDocumentBasics(unittest.TestCase):
    """Unit tests for Document's paragraph / sentence / token accessors.

    Fixture: ``MR_LEE`` is a raw-text sample that the assertions below pin
    down as having 18 paragraphs and 765 tagged words.
    """

    def setUp(self):
        self.doc = Document(MR_LEE)

    def test_initialization(self):
        # The constructor must retain the raw input text verbatim.
        assert self.doc.raw_text == MR_LEE

    def test_paragraphs(self):
        assert len(self.doc.paragraphs()) == 18

    def test_paragraph_stripping(self):
        # First character of the first paragraph — leading whitespace
        # is expected to have been stripped.
        assert self.doc.paragraphs()[0][0] == "I"

    def test_ensure_no_empty_sentences(self):
        # Negative indexing replaces the original pgs[len(pgs)-1].
        assert self.doc.paragraphs()[-1] != ''

    def test_sentences_by_paragraph_are_arrays(self):
        assert len(self.doc.sentences_by_paragraph()) == 18

    def test_sentences_by_paragraph_are_arrays_of_arrays(self):
        for pg in self.doc.sentences_by_paragraph():
            assert isinstance(pg, list)
            assert len(pg) > 0

    def test_paragraphs_are_word_tokenized(self):
        paragraph = self.doc.sentences_by_paragraph()[-1]
        tokenized_paragraph = Document.word_tokenize_paragraph(paragraph)
        assert len(tokenized_paragraph) == 4
        for sentence in tokenized_paragraph:
            assert isinstance(sentence, list)
            assert len(sentence) > 0

    def test_tokenized_sentences_by_paragraph(self):
        tsbp = self.doc.tokenized_sentences_by_paragraph()
        assert len(tsbp) == 18
        # FIX: the original compared a Python-3 `map` iterator against a
        # list, which can never be equal; use list comprehensions instead.
        sentences_per_paragraph = [len(pg) for pg in tsbp]
        assert sentences_per_paragraph == [1, 1, 3, 1, 4, 5, 3, 4, 2, 1, 1, 6, 2, 2, 6, 6, 2, 4]
        known_sentence_lengths = [
            [22],
            [11],
            [5, 9, 21],
            [23],
            [10, 10, 8, 18],
            [6, 6, 15, 7, 6],
            [8, 13, 9],
            [6, 18, 20, 19],
            [11, 19],
            [12],
            [23],
            [12, 7, 15, 7, 6, 5],
            [6, 22],
            [37, 22],
            [10, 36, 8, 18, 8, 13],
            [10, 6, 3, 8, 9, 18],
            [26, 66],
            [26, 17, 5, 4],
        ]
        words_per_sentence = [[len(sentence) for sentence in pg] for pg in tsbp]
        assert words_per_sentence == known_sentence_lengths

    def test_paragraphs_are_tagged(self):
        # Every tagged word is a (token, tag) pair; the whole document
        # contains 765 of them.
        tagged_document = self.doc.tagged_document()
        word_count = 0
        for paragraph in tagged_document:
            for sentence in paragraph:
                for word in sentence:
                    word_count += 1
                    assert len(word) == 2
        assert word_count == 765