当前位置: 首页>>代码示例>>Python>>正文


Python HTMLParser.HTMLParser方法代码示例

本文整理汇总了Python中HTMLParser.HTMLParser方法的典型用法代码示例。如果您正苦于以下问题:Python HTMLParser.HTMLParser方法的具体用法?Python HTMLParser.HTMLParser怎么用?Python HTMLParser.HTMLParser使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块HTMLParser的用法示例。


在下文中一共展示了HTMLParser.HTMLParser方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_cdata_with_closing_tags

# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def test_cdata_with_closing_tags(self):
        # Regression test for issue #13358: handle_data must fire exactly
        # once for the whole CDATA content of a <script> element, however
        # the closing tag name is padded with spaces or newlines.
        class RawEvents(EventCollector):
            # The stock collector normalizes events in get_events, which
            # would mask duplicate data events; hand back the raw list.
            def get_events(self):
                return self.events

        content = """<!-- not a comment --> &not-an-entity-ref;
                  <a href="" /> </p><p> &amp; <span></span></style>
                  '</script' + '>' </html> </head> </scripter>!"""
        expected = [("starttag", "script", []),
                    ("data", content),
                    ("endtag", "script")]
        for closer in [' script', 'script ', ' script ',
                       '\nscript', 'script\n', '\nscript\n']:
            markup = u'<script>{content}</{element}>'.format(
                element=closer, content=content)
            self._run_check(markup, expected, collector=RawEvents)
开发者ID:IronLanguages,项目名称:ironpython2,代码行数:22,代码来源:test_htmlparser.py

示例2: __init__

# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def __init__(self, file_name, user_id):
        """
        Load the bookmark-export file as JSON and prime per-user state.
        """
        # Read the whole export file; the handle is kept on self as before.
        with open(file_name, 'r') as self.opened_file:
            raw = self.opened_file.read()
        self.user = user_id
        self.data = ujson.loads(raw)
        # Bookkeeping used while inserting imported bookmarks into the db.
        self.urls = dict()       # every url seen in the import file
        self.tags_dict = dict()  # tag objects for imported bookmarks
        self.tags_set = set()    # every tag name seen in the import file
        # Snapshot the user's live bookmarks so duplicates can be skipped.
        self.check_duplicates = dict()
        self.check_duplicates_query = Bookmark.query.filter(Bookmark.user == self.user,
                                                            Bookmark.deleted == False).all()
        for bookmark in self.check_duplicates_query:
            self.check_duplicates[bookmark.main_url] = bookmark
        self.html_parser = HTMLParser.HTMLParser()
        # Accept scheme://host[:port]/path where host is a domain name,
        # an IPv4 dotted quad, or a (bracketed) IPv6 literal.
        self.valid_url = re.compile(
            r'^(?:[a-z0-9\.\-]*)://'
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
            r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
            r'(?::\d+)?'
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
开发者ID:dhamaniasad,项目名称:crestify,代码行数:26,代码来源:parsers.py

示例3: resolveParseResult

# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def resolveParseResult(self, result, itemName):
        """Extract the first value for *itemName* from a parse-result dict.

        The result set is a list of dicts; any lookup or unescape failure
        is logged and an empty string is returned instead of raising.
        """
        value = ""
        try:
            value = result[itemName][0]
            value = util.html_unescape(value)
            value = value.strip()
            # unescape ugly html encoding from websites
            value = HTMLParser().unescape(value)
        except Exception as e:
            log.warn("Error while resolving item: {0} : {1} {2}".format(itemName, type(e), str(e)))

        try:
            log.debug("Result " + itemName + " = " + value)
        except:
            # Logging may fail on odd encodings; the value itself is fine.
            pass

        return value
开发者ID:maloep,项目名称:romcollectionbrowser,代码行数:24,代码来源:matcher.py

示例4: get_url

# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def get_url(domain,port,timeout):
    """Fetch the site root for *domain* and collect same-host asset URLs.

    Scans img/link/script tags for src/href values. Absolute URLs are kept
    only when their host equals *domain*; relative ones are resolved
    against the final response URL. Returns a de-duplicated list.
    """
    if port == 443:
        start_url = 'https://' + domain
    else:
        start_url = 'http://' + domain
    response = urllib2.urlopen(start_url, timeout=timeout)
    html = response.read()
    root_url = response.geturl()  # follow redirects for urljoin base
    unescape = HTMLParser.HTMLParser().unescape
    found = []
    for quote, link in re.findall("<(?:img|link|script)[^>]*?(?:src|href)=('|\")(.*?)\\1", html, re.I):
        parsed = urlparse.urlparse(link)
        if parsed.netloc and parsed.scheme:
            if domain == parsed.hostname:
                found.append(unescape(link))
        elif not parsed.netloc and not parsed.scheme:
            found.append(unescape(urlparse.urljoin(root_url, link)))
    return list(set(found))
开发者ID:ysrc,项目名称:xunfeng,代码行数:21,代码来源:nginx_CVE_2017_7529.py

示例5: feed

# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def feed(self, markup):
        """Parse *markup* with a fresh BeautifulSoupHTMLParser tied to our soup.

        A new parser is built for every call from the stored constructor
        arguments; on a parse failure a RuntimeWarning suggesting external
        parsers (lxml/html5lib) is emitted before the error is re-raised.
        """
        # Recreate the parser each call so no state leaks between feeds.
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:  # Python 2 except syntax
            # Tell the user the stdlib parser choked, then re-raise.
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3. 
开发者ID:evait-security,项目名称:weeman,代码行数:18,代码来源:_htmlparser.py

示例6: get_attribute_line_column

# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def get_attribute_line_column(tag_definition, line, column, attribute):
    """Locate *attribute* within a tag's definition text.

    Args:
        tag_definition: str with the definition of the tag.
        line: line where the tag starts.
        column: column where the tag starts (1-based).
        attribute: str representing the attribute to find.

    Return:
       A (line, column) tuple representing the position of the attribute.
    """
    # attrfind is HTMLParser's own attribute regex; group(1) is the name.
    for attr_match in HTMLParser.attrfind.finditer(tag_definition):
        if attr_match.group(1).lower() == attribute:
            name_offset = attr_match.start(1)
            return get_line_column(tag_definition, line, column, name_offset)

    assert False, 'Could not find the requested attribute %s' % attribute
开发者ID:deezer,项目名称:html-linter,代码行数:19,代码来源:html_linter.py

示例7: __init__

# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def __init__(self, html):
        """Lint *html* immediately: the document is parsed before __init__
        returns, so collected messages are available right after construction.

        Args:
            html: str with the document to check.
        """
        # Lint findings accumulated during the feed() below.
        self._messages = []

        # Variables used to get the indentation
        self._last_data = ''
        self._last_data_position = (0, 1)
        self._last_indent = 0

        # Variables used to check if a charset tag should be required.
        self._first_meta_line_col = None
        self._after_head_line_col = None
        self._has_charset = False

        # Variables to extend the feature set of HTMLParser.
        self._endtag_text = None

        # Explicit base-class call keeps Python 2 old-style compatibility;
        # it must run before feed().
        HTMLParser.HTMLParser.__init__(self)

        # In case we are dealing with Python 3, set it to non-strict mode.
        if hasattr(self, 'strict'):
            self.strict = False

        # Parsing happens here, as a side effect of construction.
        self.feed(html)
        self.close()
开发者ID:deezer,项目名称:html-linter,代码行数:26,代码来源:html_linter.py

示例8: unescape

# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def unescape(s):
    """Return *s* with HTML character references (e.g. &amp;) decoded."""
    import HTMLParser  # deferred so the module is only loaded when needed
    return HTMLParser.HTMLParser().unescape(s)
开发者ID:fpsw,项目名称:Servo,代码行数:6,代码来源:utils.py

示例9: __init__

# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def __init__(self, url=""):
        HTMLParser.HTMLParser.__init__(self)
        self.liens = []
        self.forms = []
        self.form_values = []
        self.inform = 0
        self.inscript = 0
        self.current_form_url = url
        self.uploads = []
        self.current_form_method = "get"
        self.url = url
        self.__defaults = {'checkbox':       'default',
                           'color':          '%23adeadb',
                           'date':           '2011-06-08',
                           'datetime':       '2011-06-09T20:35:34.32',
                           'datetime-local': '2011-06-09T22:41',
                           'file':           ['pix.gif', 'GIF89a'],
                           'hidden':         'default',
                           'email':           'wapiti%40mailinator.com',
                           'month':          '2011-06',
                           'number':         '1337',
                           'password':       'letmein',
                           'radio':          'beton',
                           'range':          '37',
                           'search':         'default',
                           'submit':         'submit',
                           'tel':            '0606060606',
                           'text':           'default',
                           'time':           '13:37',
                           'url':            'http://wapiti.sf.net/',
                           'week':           '2011-W24'
                           }
        # This is ugly but let's keep it while there is not a js parser
        self.common_js_strings = ["Msxml2.XMLHTTP", "application/x-www-form-urlencoded", ".php", "text/xml",
                                  "about:blank", "Microsoft.XMLHTTP", "text/plain", "text/javascript",
                                  "application/x-shockwave-flash"] 
开发者ID:flipkart-incubator,项目名称:watchdog,代码行数:38,代码来源:lswww.py

示例10: htmlparser_trace

# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def htmlparser_trace(data):
    """Dump the raw HTMLParser event stream produced while parsing *data*.

    Useful for seeing how the stdlib parser handles a document without
    any Beautiful Soup code in the way.
    """
    AnnouncingParser().feed(data)
开发者ID:MarcelloLins,项目名称:ServerlessCrawler-VancouverRealState,代码行数:10,代码来源:diagnose.py

示例11: unescape

# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def unescape(s):
        """Decode HTML character references in *s* via the stdlib parser."""
        return HTMLParser.HTMLParser().unescape(s)

# Return addrlist sequence at random, it can help create_connection function 
开发者ID:ForgQi,项目名称:bilibiliupload,代码行数:7,代码来源:compact.py

示例12: safe_text

# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def safe_text(text):
  """Decode *text* from UTF-8, unescape HTML entities, re-encode as UTF-8."""
  decoded = text.decode('utf8')
  unescaped = HTMLParser.HTMLParser().unescape(decoded)
  return unescaped.encode('utf8')
开发者ID:lipis,项目名称:github-stats,代码行数:4,代码来源:magic.py

示例13: masterlist

# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def masterlist():
	"""Scrape the shows menu page and return (name, SITE, 'seasons', url) tuples."""
	shows = []
	menu_html = simplejson.loads(connection.getURL(SHOWS))['menu_html']
	show_pattern = re.compile('<li class="views-row .*?">.*?<div>\s*<div><a href="(.*?)">.*?<div class="field .*?">\n\s*(.*?)</div>.*?</li>' , re.DOTALL)
	for season_url, raw_name in show_pattern.findall(menu_html):
		# Names arrive HTML-escaped and padded with whitespace.
		show_name = HTMLParser.HTMLParser().unescape(common.smart_unicode(raw_name).strip())
		shows.append((show_name, SITE, 'seasons', season_url))
	return shows
开发者ID:moneymaker365,项目名称:plugin.video.ustvvod,代码行数:12,代码来源:nbc.py

示例14: get_url

# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def get_url(self,url):
        """Fetch *url* posing as mobile Safari; return unescaped HTML, or '' on any failure."""
        ua = {'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'}
        try:
            response = requests.get(url, headers=ua)
            return HTMLParser.HTMLParser().unescape(response.content.decode('utf-8'))
        except:
            # Best-effort fetch: network/decoding errors yield an empty page.
            return ''
开发者ID:primaeval,项目名称:script.tvguide.fullscreen,代码行数:10,代码来源:source.py

示例15: get_url

# 需要导入模块: import HTMLParser [as 别名]
# 或者: from HTMLParser import HTMLParser [as 别名]
def get_url(url):
    """Fetch *url* posing as mobile Safari; return unescaped HTML, or '' on any failure."""
    ua = {'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'}
    try:
        response = requests.get(url, headers=ua)
        return HTMLParser.HTMLParser().unescape(response.content.decode('utf-8'))
    except:
        # Best-effort fetch: network/decoding errors yield an empty page.
        return ''
开发者ID:primaeval,项目名称:script.tvguide.fullscreen,代码行数:11,代码来源:yo.py


注:本文中的HTMLParser.HTMLParser方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。