本文整理汇总了Python中voikkohtml.parseHtml函数的典型用法代码示例。如果您正苦于以下问题：Python parseHtml函数的具体用法？Python parseHtml怎么用？Python parseHtml使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了parseHtml函数的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: checkPage
def checkPage(url, dictionary, clientIp, requestHeaders, checkForMaybeErrors):
    """Fetch a web page and return an HTML report of its spell/grammar check.

    The page is downloaded with ``getHtmlSafely``, split into typed segments
    with ``parseHtml``, and each segment's text is passed through ``doSpell``
    and wrapped in a ``<p>`` element whose CSS class reflects the segment type.

    Args:
        url: Address of the page to analyse. It is UTF-8 encoded before being
            logged and fetched, and HTML-escaped before being echoed back.
        dictionary: Key into the module-level ``_voikko`` mapping selecting
            the checker instance to use.
        clientIp: Passed through unchanged to ``getHtmlSafely``.
        requestHeaders: Passed through unchanged to ``getHtmlSafely``.
        checkForMaybeErrors: Passed through unchanged to ``doSpell``.

    Returns:
        An HTML fragment. It is the empty string when ``dictionary`` is not
        in ``_voikko``, and an error message when fetching raises
        ``HttpException``.
    """
    encodedUrl = url.encode("UTF-8")
    log("checkPage: " + encodedUrl)
    if dictionary not in _voikko:
        return u""
    v = _voikko[dictionary]
    try:
        html = getHtmlSafely(encodedUrl, clientIp, requestHeaders)
        segments = parseHtml(html)
        # Collect the fragments and join once instead of re-concatenating the
        # growing result string on every iteration.
        parts = [u"Analyysi sivusta " + escape(url) + u"<br />"]
        v.setAcceptUnfinishedParagraphsInGc(True)
        for segment in segments:
            # NOTE: a segment type outside the four handled below leaves
            # segmentClass as None, so the concatenation further down raises
            # TypeError (same as before this rewrite).
            segmentClass = None
            checkGrammar = True
            if segment[0] == SEGMENT_TYPE_HEADING:
                v.setAcceptTitlesInGc(True)
                v.setAcceptBulletedListsInGc(False)
                segmentClass = u"webvoikkoH"
            elif segment[0] == SEGMENT_TYPE_LIST_ITEM:
                v.setAcceptTitlesInGc(False)
                v.setAcceptBulletedListsInGc(True)
                segmentClass = u"webvoikkoLi"
            elif segment[0] == SEGMENT_TYPE_PARAGRAPH:
                v.setAcceptTitlesInGc(False)
                v.setAcceptBulletedListsInGc(False)
                segmentClass = u"webvoikkoP"
            elif segment[0] == SEGMENT_TYPE_OTHER:
                # Text outside headings, list items and paragraphs is only
                # spell-checked; grammar checking is switched off for it.
                checkGrammar = False
                segmentClass = u"webvoikkoO"
            parts.append(
                u"<p class='" + segmentClass + u"'>"
                + doSpell(segment[1], v, checkGrammar, checkForMaybeErrors)
                + u"</p>"
            )
        return u"".join(parts)
    except HttpException as e:
        # NOTE(review): e.parameter is inserted without escaping; confirm it
        # can never carry page- or user-controlled markup.
        return u"Sivua %s ei voitu hakea: %s" % (escape(url), e.parameter)
示例2: testH1WithinPClosesP
def testH1WithinPClosesP(self):
    """A heading opened inside a paragraph ends that paragraph.

    The text that follows the heading is expected to come back as an OTHER
    segment rather than as a continuation of the paragraph.
    """
    result = parseHtml(u"<html><body><p>Kissa<h1>Koira</h1>jotain muuta</p></body></html>")
    expected = [
        (SEGMENT_TYPE_PARAGRAPH, u"Kissa"),
        (SEGMENT_TYPE_HEADING, u"Koira"),
        (SEGMENT_TYPE_OTHER, u"jotain muuta"),
    ]
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected, result)
示例3: testUnclosedP
def testUnclosedP(self):
    """Paragraphs without closing tags still yield one segment each."""
    result = parseHtml(u"<html><body><p>kissa<p>koira<div><p>hevonen</div></body></html>")
    expected = [
        (SEGMENT_TYPE_PARAGRAPH, u"kissa"),
        (SEGMENT_TYPE_PARAGRAPH, u"koira"),
        (SEGMENT_TYPE_PARAGRAPH, u"hevonen"),
    ]
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected, result)
示例4: testNonAscii
def testNonAscii(self):
    """Non-ASCII characters in a heading pass through unchanged."""
    result = parseHtml(u"<html><body><h1>Eläinlääkärissä käynti €</h1></body></html>")
    expected = [(SEGMENT_TYPE_HEADING, u"Eläinlääkärissä käynti €")]
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected, result)
示例5: testUnderlineInducesNoSpace
def testUnderlineInducesNoSpace(self):
    """A ``<u>`` element inside a word does not split it with whitespace."""
    result = parseHtml(u"<html><body><h1>Libre<u>Office</u></h1></body></html>")
    expected = [(SEGMENT_TYPE_HEADING, u"LibreOffice")]
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected, result)
示例6: testParseHeader
def testParseHeader(self):
    """The text of an ``<h1>`` element is returned as a HEADING segment."""
    result = parseHtml(u"<html><body><h1>Kissan ruokkiminen</h1></body></html>")
    expected = [(SEGMENT_TYPE_HEADING, u"Kissan ruokkiminen")]
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected, result)
示例7: testParseNestedLists
def testParseNestedLists(self):
    """Nested lists: outer item text is OTHER, inner item is LIST_ITEM.

    The outer ``<li>`` is interrupted by the nested ``<ul>``, and the expected
    result classifies its text as OTHER while the inner item stays LIST_ITEM.
    """
    result = parseHtml(u"<html><body><ul><li>kissa<ul><li>koira</li></ul></li></ul></body></html>")
    expected = [(SEGMENT_TYPE_OTHER, u"kissa"), (SEGMENT_TYPE_LIST_ITEM, u"koira")]
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected, result)
示例8: testTextWithinBody
def testTextWithinBody(self):
    """Bare text directly inside ``<body>`` is returned as an OTHER segment."""
    result = parseHtml(u"<html><body>kissa</body></html>")
    expected = [(SEGMENT_TYPE_OTHER, u"kissa")]
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected, result)
示例9: testParseParagraph
def testParseParagraph(self):
    """The text of a ``<p>`` element is returned as a PARAGRAPH segment."""
    result = parseHtml(u"<html><body><p>Kissaa on ruokittava huolella.</p></body></html>")
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissaa on ruokittava huolella.")]
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected, result)
示例10: testScriptsAreStripped
def testScriptsAreStripped(self):
    """A ``<script>`` element after a paragraph produces no segment."""
    result = parseHtml(u"<html><body><p>Kissaa on ruokittava.</p><script>lksjdf</script></body></html>")
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissaa on ruokittava.")]
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected, result)
示例11: testScriptsWithinPIsIgnoredAndContentStripped
def testScriptsWithinPIsIgnoredAndContentStripped(self):
    """A ``<script>`` inside a paragraph is dropped without splitting it."""
    result = parseHtml(u"<html><body><p>Kissaa on <script>aksldj</script>ruokittava.</p></body></html>")
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissaa on ruokittava.")]
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected, result)
示例12: testExtraWhitespaceIsRemoved
def testExtraWhitespaceIsRemoved(self):
    """Leading, trailing and control-character whitespace is cleaned up."""
    result = parseHtml(u"<html><body><p>\tKissaa on \rruokittava huolella. </p></body></html>")
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissaa on ruokittava huolella.")]
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected, result)
示例13: testLineFeedIsJustSpace
def testLineFeedIsJustSpace(self):
    """``\\n``, ``\\r\\n`` and ``\\r`` between words each become one space."""
    result = parseHtml(u"<html><body><p>Kissaa\non\r\nruokittava\rhuolella.</p></body></html>")
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissaa on ruokittava huolella.")]
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected, result)
示例14: testBrIsWhitespace
def testBrIsWhitespace(self):
    """A ``<br/>`` between words is treated as a single space."""
    result = parseHtml(u"<html><body><p>Kissaa on ruokittava<br/>huolella.</p></body></html>")
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissaa on ruokittava huolella.")]
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected, result)
示例15: testIgnoreImages
def testIgnoreImages(self):
    """An ``<img>`` inside a paragraph contributes no text to the segment."""
    result = parseHtml(u"<html><body><p>Kissaa <img src='cat.jpg'>on ruokittava.</p></body></html>")
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissaa on ruokittava.")]
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected, result)