本文整理汇总了 Python 中 bs4.dammit.UnicodeDammit.detwingle 方法的典型用法代码示例。如果您正苦于以下问题:Python UnicodeDammit.detwingle 方法的具体用法?Python UnicodeDammit.detwingle 怎么用?Python UnicodeDammit.detwingle 使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 bs4.dammit.UnicodeDammit 的用法示例。
在下文中一共展示了UnicodeDammit.detwingle方法的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_detwingle_ignores_multibyte_characters
# 需要导入模块: from bs4.dammit import UnicodeDammit [as 别名]
# 或者: from bs4.dammit.UnicodeDammit import detwingle [as 别名]
def test_detwingle_ignores_multibyte_characters(self):
    """Check that detwingle leaves multibyte UTF-8 characters untouched.

    Each sample character has a UTF-8 encoding whose final byte is 0x93.
    Read as Windows-1252, a lone 0x93 byte is a smart quote, so a naive
    byte-by-byte conversion would corrupt these characters. The test
    asserts that detwingle returns the encoded bytes unchanged.
    """
    for tricky_unicode_char in (
        "\N{LATIN SMALL LIGATURE OE}",  # 2-byte char b'\xc5\x93'
        "\N{LATIN SUBSCRIPT SMALL LETTER X}",  # 3-byte char b'\xe2\x82\x93'
        # 4-byte char b'\xf0\x90\x90\x93' (U+10413, in the Deseret block).
        # It has to be written as one code point: the str literal
        # "\xf0\x90\x90\x93" is four separate Latin-1 characters that each
        # encode to two bytes, so it never exercised a 4-byte sequence.
        "\U00010413",
    ):
        encoded = tricky_unicode_char.encode("utf8")
        # Guard the premise of the test: the encoding really ends in 0x93.
        self.assertTrue(encoded.endswith(b"\x93"))
        output = UnicodeDammit.detwingle(encoded)
        self.assertEqual(output, encoded)
示例2: exportUrlFeeder
# 需要导入模块: from bs4.dammit import UnicodeDammit [as 别名]
# 或者: from bs4.dammit.UnicodeDammit import detwingle [as 别名]
def exportUrlFeeder(self, filename,urlList):
    """Export URLs to a flat file, sorted, one URL per line.

    Status: in progress - should be moved to a separate package.
    Usage: used within the harvest functions as a URL exporter.

    Args:
        filename: Path of the output file. Any existing content is
            replaced.
        urlList: Iterable of URLs, given as bytes or text. Text is
            encoded as UTF-8 before being handed to detwingle.

    Returns:
        None.

    Raises:
        OSError: If the output file cannot be opened. Failures on
            individual URLs are logged and skipped instead.
    """
    # Sort the URLs so the exported file is easier to read.
    sorted_urls = sorted(urlList)
    # The original mode 'wa' is rejected by Python 3 (ValueError). Binary
    # write mode is used because detwingle returns bytes. The context
    # manager guarantees the handle is flushed and closed on every path.
    with open(filename, "wb") as fobj:
        for url in sorted_urls:
            try:
                # detwingle works on bytes, so encode text input first.
                raw = url if isinstance(url, bytes) else url.encode("utf8")
                encodedUrl = UnicodeDammit.detwingle(raw)
                # Validation only: raises if the result is still not
                # valid UTF-8, so the URL is skipped rather than written.
                encodedUrl.decode("utf8")
                fobj.write(encodedUrl + b"\n")
            except Exception:
                # Best-effort export: log and continue with the next URL.
                # Unlike a bare except, this lets KeyboardInterrupt and
                # SystemExit propagate. The message text is left unchanged
                # in case log consumers match on it.
                exportFeedLogger.logError("Unexpected error while open output file in exportUrlFeeder")
示例3: test_detwingle
# 需要导入模块: from bs4.dammit import UnicodeDammit [as 别名]
# 或者: from bs4.dammit.UnicodeDammit import detwingle [as 别名]
def test_detwingle(self):
    """Check that detwingle repairs Windows-1252 bytes embedded in UTF-8.

    A document is assembled from UTF-8 snowmen surrounding a
    Windows-1252 encoded sentence with smart quotes. The combined bytes
    are not valid UTF-8; after detwingle they must decode as UTF-8 to
    the expected text.
    """
    # UTF-8 portion: three snowmen.
    snowmen = ("\N{SNOWMAN}" * 3).encode("utf8")

    # Windows-1252 portion: a sentence wrapped in smart quotes.
    quoted_text = "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!\N{RIGHT DOUBLE QUOTATION MARK}"
    cp1252_part = quoted_text.encode("windows_1252")

    # Glue the two encodings together into a single byte string.
    mixed = b"".join((snowmen, cp1252_part, snowmen))

    # As it stands, the mixed document is not decodable as UTF-8.
    with self.assertRaises(UnicodeDecodeError):
        mixed.decode("utf8")

    # After detwingle, the whole document decodes cleanly as UTF-8.
    repaired = UnicodeDammit.detwingle(mixed)
    self.assertEqual("☃☃☃“Hi, I like Windows!”☃☃☃", repaired.decode("utf8"))