本文整理汇总了 Python 中 HTMLParser.HTMLParser 方法的典型用法代码示例。如果您正苦于以下问题：Python HTMLParser.HTMLParser 方法的具体用法？Python HTMLParser.HTMLParser 怎么用？Python HTMLParser.HTMLParser 使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 HTMLParser 的用法示例。
在下文中一共展示了HTMLParser.HTMLParser方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_cdata_with_closing_tags
# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def test_cdata_with_closing_tags(self):
    # Regression check for issue #13358: handle_data must be called only
    # once for each CDATA section.
    #
    # The regular event collector normalizes the events in get_events, so
    # a local subclass returns the recorded list untouched instead.
    class Collector(EventCollector):
        def get_events(self):
            return self.events

    # NOTE(review): the "¬" and the bare "&" below look like HTML entities
    # that were decoded when this snippet was extracted -- confirm against
    # the upstream test before relying on the exact text.
    content = """<!-- not a comment --> ¬-an-entity-ref;
<a href="" /> </p><p> & <span></span></style>
'</script' + '>' </html> </head> </scripter>!"""

    # Closing tag names padded with spaces/newlines on either side.
    closing_names = (' script', 'script ', ' script ',
                     '\nscript', 'script\n', '\nscript\n')
    for element in closing_names:
        s = u'<script>{content}</{element}>'.format(element=element,
                                                    content=content)
        expected_events = [("starttag", "script", []),
                           ("data", content),
                           ("endtag", "script")]
        self._run_check(s, expected_events, collector=Collector)
示例2: __init__
# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def __init__(self, file_name, user_id):
    """
    Read ``file_name``, parse its contents as JSON and set up import state.

    ``file_name`` is opened in text mode and its parsed JSON ends up in
    ``self.data``; ``user_id`` is stored as ``self.user`` and used to
    preload that user's non-deleted bookmarks.
    """
    with open(file_name, 'r') as self.opened_file:
        self.data = self.opened_file.read()
    self.user = user_id
    # Replace the raw text with the decoded JSON structure.
    self.data = ujson.loads(self.data)

    self.urls = {}  # Every url seen in the import file, used when adding to db
    self.tags_dict = {}  # Tag objects for imported bookmarks
    self.tags_set = set()  # Every tag seen in the import file

    # Index the user's current bookmarks by main_url.
    self.check_duplicates = {}
    # NOTE(review): ``== False`` is presumably an ORM column expression and
    # must not be rewritten as ``not ...`` -- confirm against the model.
    self.check_duplicates_query = Bookmark.query.filter(
        Bookmark.user == self.user,
        Bookmark.deleted == False).all()
    self.check_duplicates.update(
        (bookmark.main_url, bookmark)
        for bookmark in self.check_duplicates_query)

    self.html_parser = HTMLParser.HTMLParser()

    # We only want valid URLs in the database
    url_pattern = (
        r'^(?:[a-z0-9\.\-]*)://'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
        r'(?::\d+)?'
        r'(?:/?|[/?]\S+)$'
    )
    self.valid_url = re.compile(url_pattern, re.IGNORECASE)
示例3: resolveParseResult
# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def resolveParseResult(self, result, itemName):
    """Return the first value stored under ``itemName``, unescaped and stripped.

    The result set maps item names to lists of values, so the first entry
    of ``result[itemName]`` is taken, passed through ``util.html_unescape``,
    stripped, and finally HTML-unescaped once more via ``HTMLParser``.

    Any failure along the way is logged as a warning and whatever value had
    been produced up to that point ("" if nothing was read) is returned;
    this method never raises for ordinary errors.
    """
    resultValue = ""
    try:
        resultValue = result[itemName][0]
        resultValue = util.html_unescape(resultValue)
        resultValue = resultValue.strip()
        # unescape ugly html encoding from websites
        resultValue = HTMLParser().unescape(resultValue)
    except Exception as e:
        log.warn("Error while resolving item: {0} : {1} {2}".format(itemName, type(e), str(e)))

    try:
        log.debug("Result " + itemName + " = " + resultValue)
    except Exception:
        # Debug logging is best-effort (the concatenation can fail on
        # non-string or mixed-encoding values), but interpreter-level
        # signals such as KeyboardInterrupt must not be swallowed here.
        pass

    return resultValue
示例4: get_url
# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def get_url(domain, port, timeout):
    """Fetch the front page of ``domain`` and list the same-site resources it uses.

    Args:
        domain: host name to request.
        port: 443 selects ``https://``; any other value selects ``http://``.
        timeout: timeout passed to ``urllib2.urlopen``.

    Returns:
        A de-duplicated list (in no particular order) of HTML-unescaped URLs
        taken from the ``src``/``href`` attributes of ``img``, ``link`` and
        ``script`` tags. Absolute URLs are kept only when their host equals
        ``domain``; URLs with neither scheme nor host are resolved against
        the final (post-redirect) page URL. Everything else is ignored.

    Raises:
        Whatever ``urllib2.urlopen`` / ``read`` raise; errors are not caught.
    """
    url_list = []
    if port == 443:
        surl = 'https://' + domain
    else:
        surl = 'http://' + domain

    res = urllib2.urlopen(surl, timeout=timeout)
    try:
        html = res.read()
        root_url = res.geturl()
    finally:
        # Always release the connection, even if reading fails.
        res.close()

    # One parser instance is enough for unescaping every match.
    unescape = HTMLParser.HTMLParser().unescape

    # Each match is a (quote_char, url) tuple; only the url is needed.
    matches = re.findall("<(?:img|link|script)[^>]*?(?:src|href)=('|\")(.*?)\\1", html, re.I)
    for _quote, link in matches:
        parsed = urlparse.urlparse(link)
        if parsed.netloc and parsed.scheme:
            if domain == parsed.hostname:
                url_list.append(unescape(link))
        elif not parsed.netloc and not parsed.scheme:
            url_list.append(unescape(urlparse.urljoin(root_url, link)))
    return list(set(url_list))
示例5: feed
# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def feed(self, markup):
    """Parse ``markup`` with a fresh ``BeautifulSoupHTMLParser``.

    The parser is built from ``self.parser_args`` and attached to
    ``self.soup`` before the markup is fed to it.

    Raises:
        HTMLParseError: re-raised unchanged after a ``RuntimeWarning`` is
            issued that points the user at external parsers.
    """
    args, kwargs = self.parser_args
    parser = BeautifulSoupHTMLParser(*args, **kwargs)
    parser.soup = self.soup
    try:
        parser.feed(markup)
    except HTMLParseError:
        warnings.warn(RuntimeWarning(
            "Python's built-in HTMLParser cannot parse the given document. "
            "This is not a bug in Beautiful Soup. The best solution is to "
            "install an external parser (lxml or html5lib), and use Beautiful "
            "Soup with that parser. See "
            "http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser "
            "for help."))
        # Bare raise keeps the original traceback intact.
        raise
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
示例6: get_attribute_line_column
# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def get_attribute_line_column(tag_definition, line, column, attribute):
    """Returns the line and column of the provided attribute.

    Args:
        tag_definition: str with the definition of the tag.
        line: line where the tag starts.
        column: column where the tag starts (1-based).
        attribute: str representing the attribute to find; compared against
            the lower-cased attribute names found in the tag.

    Returns:
        A (line, column) tuple representing the position of the attribute.

    Raises:
        AssertionError: if the attribute does not occur in ``tag_definition``.
    """
    for match in HTMLParser.attrfind.finditer(tag_definition):
        if match.group(1).lower() == attribute:
            return get_line_column(tag_definition, line, column, match.start(1))
    # Raised explicitly rather than via ``assert`` so the failure is still
    # reported when Python runs with -O (which strips assert statements).
    raise AssertionError('Could not find the requested attribute %s' % attribute)
示例7: __init__
# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def __init__(self, html):
    """Initialise the bookkeeping attributes, then parse ``html`` right away.

    The instance state is set up before the ``HTMLParser`` base initialiser
    runs; afterwards the whole document is fed and the parser is closed.
    """
    self._messages = []

    # Bookkeeping used to work out the indentation.
    self._last_data, self._last_data_position, self._last_indent = '', (0, 1), 0

    # Bookkeeping used to decide whether a charset tag should be required.
    self._first_meta_line_col = self._after_head_line_col = None
    self._has_charset = False

    # Extra state extending the feature set of HTMLParser.
    self._endtag_text = None

    HTMLParser.HTMLParser.__init__(self)

    # A ``strict`` attribute means we are on Python 3: force non-strict mode.
    if hasattr(self, 'strict'):
        self.strict = False

    self.feed(html)
    self.close()
示例8: unescape
# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def unescape(s):
    """Return ``s`` with HTML character references replaced by their characters.

    Uses ``html.unescape`` where available (Python 3.4+), because
    ``HTMLParser.unescape`` was removed in Python 3.9 and the ``HTMLParser``
    module itself only exists on Python 2. On Python 2 the original
    ``HTMLParser.HTMLParser().unescape`` path is used unchanged.
    """
    try:
        from html import unescape as _unescape
    except ImportError:
        # Python 2: there is no ``html`` module.
        import HTMLParser
        _unescape = HTMLParser.HTMLParser().unescape
    return _unescape(s)
示例9: __init__
# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def __init__(self, url=""):
    """Initialise the parser state for a page located at ``url``.

    ``url`` is stored both as the page URL and as the initial form URL.
    """
    HTMLParser.HTMLParser.__init__(self)

    # Containers filled in while parsing.
    self.liens = []
    self.forms = []
    self.form_values = []

    # Counters/flags; both start at zero.
    self.inform = self.inscript = 0

    self.current_form_url = url
    self.uploads = []
    self.current_form_method = "get"
    self.url = url

    # Default value for each input type, keyed by the type name.
    defaults = {
        'checkbox': 'default',
        'color': '%23adeadb',
        'date': '2011-06-08',
        'datetime': '2011-06-09T20:35:34.32',
        'datetime-local': '2011-06-09T22:41',
        'file': ['pix.gif', 'GIF89a'],
        'hidden': 'default',
        'email': 'wapiti%40mailinator.com',
        'month': '2011-06',
        'number': '1337',
        'password': 'letmein',
        'radio': 'beton',
        'range': '37',
        'search': 'default',
        'submit': 'submit',
        'tel': '0606060606',
        'text': 'default',
        'time': '13:37',
        'url': 'http://wapiti.sf.net/',
        'week': '2011-W24'
    }
    self.__defaults = defaults

    # This is ugly but let's keep it while there is not a js parser
    self.common_js_strings = [
        "Msxml2.XMLHTTP", "application/x-www-form-urlencoded", ".php", "text/xml",
        "about:blank", "Microsoft.XMLHTTP", "text/plain", "text/javascript",
        "application/x-shockwave-flash",
    ]
示例10: htmlparser_trace
# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def htmlparser_trace(data):
    """Feed ``data`` to an ``AnnouncingParser`` to show HTMLParser's events.

    Useful for seeing how HTMLParser itself handles a document, with no
    Beautiful Soup code involved.
    """
    AnnouncingParser().feed(data)
示例11: unescape
# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def unescape(s):
    """Return ``s`` with HTML character references replaced by their characters.

    Prefers ``html.unescape`` (Python 3.4+), since ``HTMLParser.unescape``
    was removed in Python 3.9. On Python 2, where the ``html`` module does
    not exist, the original ``HTMLParser.HTMLParser().unescape`` is used.
    """
    try:
        from html import unescape as _unescape
    except ImportError:
        # Python 2 fallback, relying on the file's existing HTMLParser import.
        _unescape = HTMLParser.HTMLParser().unescape
    return _unescape(s)
# Return addrlist sequence at random, it can help create_connection function
示例12: safe_text
# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def safe_text(text):
    """Unescape HTML character references in UTF-8 encoded ``text``.

    ``text`` is decoded from UTF-8, unescaped, and re-encoded, so both the
    argument and the return value are UTF-8 byte strings.

    ``html.unescape`` is used when available (Python 3.4+) because
    ``HTMLParser.unescape`` was removed in Python 3.9; on Python 2 the
    original ``HTMLParser.HTMLParser().unescape`` is used.
    """
    try:
        from html import unescape as _unescape
    except ImportError:
        # Python 2 fallback, relying on the file's existing HTMLParser import.
        _unescape = HTMLParser.HTMLParser().unescape
    return _unescape(text.decode('utf8')).encode('utf8')
示例13: masterlist
# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def masterlist():
    """Build the master show list from the ``menu_html`` fragment at ``SHOWS``.

    Returns:
        A list of ``(name, SITE, 'seasons', season_url)`` tuples, one per
        menu row matched by the pattern, with ``name`` converted via
        ``common.smart_unicode``, stripped and HTML-unescaped.
    """
    master_data = connection.getURL(SHOWS)
    menu_html = simplejson.loads(master_data)['menu_html']

    # Raw string so that ``\s`` reaches the regex engine without relying on
    # Python leaving an unknown string escape untouched; ``\n`` is still
    # interpreted as a newline by ``re``.
    row_pattern = re.compile(
        r'<li class="views-row .*?">.*?<div>\s*<div><a href="(.*?)">.*?<div class="field .*?">\n\s*(.*?)</div>.*?</li>',
        re.DOTALL)

    # One parser instance serves every row.
    unescape = HTMLParser.HTMLParser().unescape

    master_db = []
    for season_url, master_name in row_pattern.findall(menu_html):
        master_name = common.smart_unicode(master_name).strip()
        master_name = unescape(master_name)
        master_db.append((master_name, SITE, 'seasons', season_url))
    return master_db
示例14: get_url
# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def get_url(self, url):
    """Fetch ``url`` with a mobile Safari user agent and return unescaped HTML.

    The response body is decoded as UTF-8 and HTML character references are
    unescaped. This is best-effort: on any ordinary error (request failure,
    decoding error, ...) an empty string is returned instead of raising.
    """
    headers = {'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'}
    try:
        r = requests.get(url, headers=headers)
        html = HTMLParser.HTMLParser().unescape(r.content.decode('utf-8'))
        return html
    except Exception:
        # Deliberately broad to keep the best-effort contract, but no longer
        # a bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
        return ''
示例15: get_url
# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def get_url(url):
    """Fetch ``url`` with a mobile Safari user agent and return unescaped HTML.

    The response body is decoded as UTF-8 and HTML character references are
    unescaped. This is best-effort: on any ordinary error (request failure,
    decoding error, ...) an empty string is returned instead of raising.
    """
    headers = {'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'}
    try:
        r = requests.get(url, headers=headers)
        html = HTMLParser.HTMLParser().unescape(r.content.decode('utf-8'))
        return html
    except Exception:
        # Deliberately broad to keep the best-effort contract, but no longer
        # a bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
        return ''