当前位置: 首页>>代码示例>>Python>>正文


Python BeautifulSoup.BeautifulSoup方法代码示例

本文整理汇总了Python中BeautifulSoup.BeautifulSoup方法的典型用法代码示例。如果您正苦于以下问题：Python BeautifulSoup.BeautifulSoup方法的具体用法？Python BeautifulSoup.BeautifulSoup怎么用？Python BeautifulSoup.BeautifulSoup使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块BeautifulSoup的用法示例。


在下文中一共展示了BeautifulSoup.BeautifulSoup方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: extract_context

# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def extract_context(html, url):
    """Store a summary of a crawled page and return its outgoing links.

    Parses ``html``, creates a ``Content`` row for ``url`` (title, the first
    4000 characters of the tag-stripped text nodes accepted by ``visible``,
    and a UTC crawl timestamp), then collects the ``href`` of every anchor
    whose ``href`` starts with ``http://``.

    Args:
        html: Markup of the fetched page.
        url: Address the page was fetched from.

    Returns:
        A list of ``href`` strings, one per matching anchor.
    """
    soup = BeautifulSoup(html)
    texts = soup.findAll(text=True)

    # A page without a <title> makes ``soup.title`` None; reading ``.string``
    # on it would raise AttributeError before anything is stored.
    title_tag = soup.title
    title = title_tag.string if title_tag is not None else None

    # Insert into Content (under this domain)
    try:
        Content.objects.create(
            url=url,
            title=title,
            summary=helpers.strip_tags(" \n".join(filter(visible, texts)))[:4000],
            last_crawled_at=datetime.datetime.utcnow().replace(tzinfo=pytz.utc)
        )
    except IntegrityError:
        println('%s - already existed in Content' % url)

    # The regex filter only yields anchors with a non-empty href, so no
    # additional emptiness check is needed.
    return [str(anchor['href'])
            for anchor in soup.findAll('a', attrs={'href': re.compile("^http://")})]
开发者ID:pixlie,项目名称:oxidizr,代码行数:18,代码来源:crawl.py

示例2: parse

# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def parse(self, html):
    """
    Parse an HTML page and report every article found in it.

    The parsed page is kept on ``self.soup``.  Page-wide attributes are
    handled by ``_parse_globals``; each node accepted by
    ``ScholarArticleParser._tag_results_checker`` is then parsed and
    cleaned, and ``handle_article`` is called for every article whose
    ``title`` entry is truthy.
    """
    self.soup = BeautifulSoup(html)

    # Global, non-itemized attributes of the page come first.
    self._parse_globals()

    # Then walk the individual result entries.
    result_nodes = self.soup.findAll(ScholarArticleParser._tag_results_checker)
    for result_node in result_nodes:
        self._parse_article(result_node)
        self._clean_article()
        if not self.article['title']:
            continue
        self.handle_article(self.article)
开发者ID:dnlcrl,项目名称:PyScholar,代码行数:19,代码来源:parser.py

示例3: list_of_all_href

# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def list_of_all_href(self,html):
    '''
    Return [href, name] pairs for the 'touch' anchors of a mr-jatt page.

    The name is the anchor's markup with the surrounding tag text and any
    leading "<digits>." prefix removed.  The last matching anchor is not
    included in the result.
    '''
    markup_pattern = r'<a.*/>|<span.*">|</span>|</a>|<a.*html">|<font.*">|</font>'
    numbering_pattern = r'^[0-9]+\.'

    anchors = BeautifulSoup(html).findAll('a','touch')
    pairs = []
    # Every anchor except the final one, matching the original index range.
    for anchor in anchors[:-1]:
        target = anchor.get('href')
        label = re.sub(markup_pattern, '', str(anchor))
        label = re.sub(numbering_pattern, '', label)
        pairs.append([target, label])
    return pairs
开发者ID:ankitmathur3193,项目名称:song-cli,代码行数:19,代码来源:MrJattParser.py

示例4: parse_checkerproxy

# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def parse_checkerproxy(self, html):
    ''' Only get elite proxies from checkerproxy.

    Looks at every <tr> that has exactly 19 child nodes.  Within such a row
    the cells are scanned in order; as soon as a cell starting with an
    "a.b.c.d:port" value has been seen and a cell containing 'Elite' has been
    seen, the ip:port string is recorded and the rest of the row is skipped.

    Returns a list of 'ip:port' strings.
    '''
    # Raw string so the \d escapes reach the regex engine untouched;
    # compiled once instead of once per cell.
    ip_port_pattern = re.compile(r'(\d{1,3}\.){3}\d{1,3}:\d{1,5}')

    ips = []
    soup = BeautifulSoup(html)
    for tr in soup.findAll('tr'):
        # NOTE(review): 19 presumably is the child count of a proxy row in
        # checkerproxy's table - confirm against the current page layout.
        if len(tr) != 19:
            continue

        ip_port = None
        elite = False
        for td in tr.findAll('td'):
            cell_text = td.text
            if ':' in cell_text:
                found = ip_port_pattern.match(cell_text)
                if found:
                    ip_port = found.group()
            if 'Elite' in cell_text:
                elite = True
            # An address has been captured exactly when ip_port is set, so
            # no separate "ip found" flag is needed.
            if ip_port and elite:
                ips.append(str(ip_port))
                break
    return ips
开发者ID:DanMcInerney,项目名称:get_proxy,代码行数:26,代码来源:get_proxy.py

示例5: sanitize_html

# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def sanitize_html(value):
    """Return ``value`` reduced to the allowed tags and attributes.

    Falsy input yields ''.  If the markup contains a ``div`` with class
    ``not-sanitize`` it is re-rendered unchanged.  Otherwise comments are
    removed, tags outside ``ALLOWED_TAGS`` are hidden, only the attributes
    ``href``, ``src``, ``target`` and ``alt`` are kept, every occurrence of
    ``javascript:`` is stripped, and the result is wrapped in ``<p>`` with
    each newline turned into a paragraph break.

    Args:
        value: HTML text to clean.

    Returns:
        The cleaned markup as a unicode string.
    """
    import re  # local import: the file's import block is not visible here

    valid_tags = ALLOWED_TAGS.split()
    valid_attrs = 'href src target alt'.split()

    if not value:
        return ''

    soup = BeautifulSoup(value)

    # NOTE(review): any input that contains <div class="not-sanitize"> skips
    # sanitization completely - confirm untrusted users cannot submit it.
    if soup.find('div', 'not-sanitize'):
        return soup.renderContents().decode('utf8')

    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in valid_tags:
            # Hidden tags lose their own markup when rendered (BeautifulSoup 3
            # behaviour); their children are still visited by this loop.
            tag.hidden = True
        tag.attrs = [(attr, val) for attr, val in tag.attrs
                     if attr in valid_attrs]

    rendered = soup.renderContents().decode('utf8')

    # A single case-sensitive replace let 'JavaScript:' and nested forms such
    # as 'javajavascript:script:' through, so strip case-insensitively until
    # nothing changes.  The loop ends because every pass shortens the text.
    # NOTE(review): entity-encoded or whitespace-split schemes are still not
    # covered; a dedicated sanitizer would be needed for that.
    scheme = re.compile('javascript:', re.IGNORECASE)
    previous = None
    while previous != rendered:
        previous = rendered
        rendered = scheme.sub('', rendered)

    return '<p>' + rendered.replace("\n", '</p><p>') + '</p>'
开发者ID:znick,项目名称:anytask,代码行数:21,代码来源:sanitize_html.py

示例6: get_soup

# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def get_soup(content):
    """Parse ``content`` and return the soup only for a usable Addic7ed page.

    The page counts as usable when its first <title> element contains the
    Addic7ed marker text.

    Returns:
        The parsed soup, or False when the title does not match or the
        content could not be processed (both cases are logged).
    """
    # check if page content can be used
    pattern = "subtitles from the source! - Addic7ed.com"
    try:
        soup = BeautifulSoup(content)
        title = str(soup.findAll("title")[0])
        if pattern in title:
            return soup
        log("bad page, maybe index after 404")
        return False
    except Exception:
        # Covers parse failures and a missing <title> (IndexError).  Catching
        # Exception rather than everything lets KeyboardInterrupt and
        # SystemExit propagate.
        log("badly formatted content")
        if self_notify:
            xbmc.executebuiltin((u'Notification(%s,%s,%s,%s)' % (__addonname__, __language__(30009), 750, __icon__)).encode('utf-8', 'ignore'))
        return False
开发者ID:skylex,项目名称:xbmc-betaseries,代码行数:18,代码来源:service.py

示例7: get_soup

# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def get_soup(content):
    """Parse ``content`` and return the soup only for a usable TVsubtitles page.

    The page counts as usable when its first <title> element contains the
    TVsubtitles.net marker text.

    Returns:
        The parsed soup, or False when the title does not match or the
        content could not be processed (both cases are logged).
    """
    # check if page content can be used
    pattern = "TVsubtitles.net - "
    try:
        soup = BeautifulSoup(content)
        title = str(soup.findAll("title")[0])
        if pattern in title:
            return soup
        log("bad page, maybe index after 404")
        return False
    except Exception:
        # Covers parse failures and a missing <title> (IndexError).  Catching
        # Exception rather than everything lets KeyboardInterrupt and
        # SystemExit propagate.
        log("badly formatted content")
        if self_notify:
            xbmc.executebuiltin((u'Notification(%s,%s,%s,%s)' % (__addonname__, __language__(30009), 750, __icon__)).encode('utf-8', 'ignore'))
        return False
开发者ID:skylex,项目名称:xbmc-betaseries,代码行数:18,代码来源:service.py

示例8: ubuntu_url

# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def ubuntu_url(start_date, end_date):
    """
    Generate the log location of every channel for each day in a date range.

    For each day the index page built from UBUNTU_ENDPOINT is fetched and
    every element whose href contains ".txt" is treated as one channel log.

    Args:
        start_date (date object): Starting date from which logs need to be fetched
        end_date (date object) : Last date for which logs need to be fetched
    Returns:
        Yields channel name, current_date, and url at which log for returned
        channel and current_date is present.
    """
    # The dot is escaped: the previous pattern ".txt" accepted any character
    # in front of "txt", so hrefs without a .txt extension matched as well.
    txt_href = re.compile(r"\.txt")

    for current_date in rrule(freq=DAILY, dtstart=start_date, until=end_date):
        url = UBUNTU_ENDPOINT.format(current_date.year,month=current_date.month, day=current_date.day)

        r = send_request(url)
        soup = BeautifulSoup(r)

        for link in soup.findAll(href=txt_href):
            channel = link.string
            if channel is None:
                # No plain link text to derive a channel name from; slicing
                # None would abort the whole generator with a TypeError.
                continue
            # The channel URL uses the link text without its first character.
            channel_ = channel[1:]

            yield channel, current_date, UBUNTU_CHANNEL_ENDPOINT.format(current_date.year, month=current_date.month, day=current_date.day, channel=channel_)
开发者ID:prasadtalasila,项目名称:IRCLogParser,代码行数:24,代码来源:log_download.py

示例9: run

# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def run(self):
    """Scan the artifact on urlvoid.com and record the sites that flag it.

    Nothing is recorded when the request fails or when the page shows the
    'bs-callout-info' box.  When it shows the 'bs-callout-warning' box,
    ``self.artifact['data']['urlvoid']`` is set to a dict mapping each
    alerting site's name to its link.  Any exception is reported through
    ``warning`` instead of being raised.
    """
    url = 'http://urlvoid.com/scan/%s/' % self.artifact['name']

    try:
        status, response = get(url, headers=self.headers)
        if not status:
            return

        data = BeautifulSoup(response.text)

        # The info callout takes precedence over the warning callout.
        if data.findAll('div', attrs={'class': 'bs-callout bs-callout-info'}):
            return
        if not data.findAll('div', attrs={'class': 'bs-callout bs-callout-warning'}):
            return

        self.artifact['data']['urlvoid'] = {}
        for alert_img in data.findAll('img', alt='Alert'):
            site = alert_img.parent.parent.td.text.lstrip()
            link = alert_img.parent.a['href']
            self.artifact['data']['urlvoid'][site] = link

    except Exception as err:
        warning('Caught exception in module (%s)' % str(err))
开发者ID:InQuest,项目名称:omnibus,代码行数:23,代码来源:urlvoid.py

示例10: ip

# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def ip(self):
    """Fetch the bgp.he.net page for the artifact and read its DNS entries.

    Requests the page for ``self.artifact['name']`` and, on success,
    collects the stripped text of every element with id 'dns' and class
    'tabdata hidden'.  Any exception is reported through ``warning``
    instead of being raised.
    """
    url = 'http://bgp.he.net/ip/%s#_dns' % self.artifact['name']
    headers = {'User-Agent': 'OSINT Omnibus (https://github.com/InQuest/Omnibus)'}

    try:
        status, response = get(url, headers=headers)
        if status:
            data = BeautifulSoup(response.text)
            dns_nodes = data.findAll(attrs={'id': 'dns', 'class': 'tabdata hidden'})
            # NOTE(review): result is neither returned nor stored on the
            # artifact, so the collected entries are currently discarded.
            result = [node.text.strip() for node in dns_nodes]
    except Exception as err:
        warning('Caught exception in module (%s)' % str(err))
开发者ID:InQuest,项目名称:omnibus,代码行数:18,代码来源:he.py

示例11: run

# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def run(self):
    """Scan the artifact on ipvoid.com and record the sites that flag it.

    Nothing is recorded when the request fails or when the page shows a
    'label-success' badge.  When it shows a 'label-danger' badge and at
    least one alert row is found, ``self.artifact['data']['ipvoid']`` is set
    to a dict mapping each alerting site's name to its link.  Any exception
    is reported through ``warning`` instead of being raised.
    """
    url = 'http://www.ipvoid.com/scan/%s/' % self.artifact['name']

    try:
        status, response = get(url, headers=self.headers)

        if status:
            data = BeautifulSoup(response.text)

            if data.findAll('span', attrs={'class': 'label label-success'}):
                pass

            elif data.findAll('span', attrs={'class': 'label label-danger'}):
                # Collect every alert first: assigning a fresh one-entry dict
                # per row kept only the last alerting site.
                alerts = {}
                for each in data.findAll('img', alt='Alert'):
                    site = each.parent.parent.td.text.lstrip()
                    alerts[site] = each.parent.a['href']
                # Leave the artifact untouched when no alert row was found,
                # as before.
                if alerts:
                    self.artifact['data']['ipvoid'] = alerts
    except Exception as err:
        warning('Caught exception in module (%s)' % str(err))
开发者ID:InQuest,项目名称:omnibus,代码行数:21,代码来源:ipvoid.py

示例12: update_planet_fleet

# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def update_planet_fleet(self, planet):
    """Refresh ``planet.ships`` from the planet's fleet page.

    Opens the 'fleet' URL for ``planet`` and, for every entry of
    ``self.SHIPS``, reads the count next to the 'textlabel' span inside the
    element with id ``'button' + <value>``.  A ship whose count cannot be
    read is recorded as 0.

    Args:
        planet: Object whose ``ships`` attribute is replaced by a dict
            mapping each ``self.SHIPS`` key to an int count.
    """
    resp = self.br.open(self._get_url('fleet', planet))
    soup = BeautifulSoup(resp)
    ships = {}
    # items() works on Python 2 and 3 alike, unlike iteritems().
    for k, v in self.SHIPS.items():
        try:
            s = soup.find(id='button' + v)
            # Dots are removed before conversion - presumably thousands
            # separators in the displayed number.
            available = int(s.find('span', 'textlabel').nextSibling.replace('.', ''))
        except Exception:
            # Missing button/label or a non-numeric value: treat as none
            # available, but let KeyboardInterrupt/SystemExit through.
            available = 0
        ships[k] = available

    planet.ships = ships
开发者ID:r4fek,项目名称:ogame-bot,代码行数:18,代码来源:bot.py

示例13: get_player_status

# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def get_player_status(self, destination, origin_planet=None):
    """Look up the player occupying ``destination`` in the galaxy view.

    Args:
        destination: Coordinates as a 'galaxy:system:position' string.
        origin_planet: Planet used to build the request URL; defaults to
            ``self.get_closest_planet(destination)``.

    Returns:
        None when ``destination`` is falsy, otherwise a dict with
        ``'name'`` (text of the span inside the row's 'playername' cell)
        and ``'inactive'`` (whether 'inactive' occurs in that cell's class).
    """
    if not destination:
        return

    status = {}
    origin_planet = origin_planet or self.get_closest_planet(destination)
    galaxy, system, position = destination.split(':')

    url = self._get_url('galaxyCnt', origin_planet)
    data = urlencode({'galaxy': galaxy, 'system': system})
    resp = self.br.open(url, data=data)
    soup = BeautifulSoup(resp)

    # The former lookup of id='galaxytable' discarded its result and has
    # been dropped; rows are searched in the whole document as before.
    planets = soup.findAll('tr', {'class': 'row'})
    # Positions are 1-based, the row list is 0-based.
    target_planet = planets[int(position)-1]
    name_el = target_planet.find('td', 'playername')
    status['name'] = name_el.find('span').text

    status['inactive'] = 'inactive' in name_el.get('class', '')
    return status
开发者ID:r4fek,项目名称:ogame-bot,代码行数:23,代码来源:bot.py

示例14: spider_image

# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def spider_image(url):
    """Fetch a wallpaper page and extract the image address and size.

    Args:
        url: Address of the page to download.

    Returns:
        A dict with ``'url'`` (the ``src`` of the first <img> whose class
        matches "wall") and, when a <div class="l1"> directly inside an <a>
        exists, ``'size'`` (that div's text).

    Raises:
        IndexError: If the page has no matching <img>.
    """
    user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url, headers = headers)
    response = urllib2.urlopen(request)
    # Close the connection as soon as the body is read; it was previously
    # left open until garbage collection.
    try:
        page = response.read()
    finally:
        response.close()
    soup = BeautifulSoup(page)
    result = {}

    img = soup.findAll("img", attrs={"class": re.compile("wall")}, limit=1)
    result['url'] = img[0]['src']

    # Only the first size label that sits directly inside a link is used.
    for size in soup.findAll("div", "l1"):
        if size.parent.name == "a":
            result['size'] = size.text
            break

    return result
开发者ID:zengqiu,项目名称:spider,代码行数:19,代码来源:wallbase.py

示例15: getLinks

# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def getLinks():
  """Download http://www.example.com and append every anchor's href to linkArray.

  The running length of ``linkArray`` is printed after each append.
  """
  req = urllib2.urlopen('http://www.example.com')
  # Close the connection once the body is read instead of leaving it to
  # garbage collection.
  try:
    page = req.read()
  finally:
    req.close()
  soup = BeautifulSoup(page)
  for link in soup.findAll('a'):
    linkArray.append(link.get('href'))
    print(len(linkArray))
开发者ID:PacktPublishing,项目名称:Learning-Concurrency-in-Python,代码行数:8,代码来源:ioBottleneck2.py


注:本文中的BeautifulSoup.BeautifulSoup方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。