

Python cfscrape.create_scraper Function Code Examples

This article collects typical usage examples of the Python function cfscrape.create_scraper. If you are wondering what exactly create_scraper does, how to call it, or what real-world usage looks like, the curated code examples below should help.


Fifteen create_scraper code examples are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
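Before the project examples, here is a minimal sketch of the typical call pattern. create_scraper() returns an object compatible with a requests.Session, so get and post are used exactly as with requests; the URL in this sketch is only a placeholder.

import cfscrape

# create_scraper() returns a requests.Session subclass that transparently
# solves Cloudflare's anti-bot challenge before the real request is sent.
scraper = cfscrape.create_scraper()

# Use it like an ordinary requests session; the URL below is a placeholder.
response = scraper.get("https://example.com/")
print(response.status_code)
print(response.content[:200])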

Example 1: _scrape_photo_info_from_source_3

def _scrape_photo_info_from_source_3(page_url):
    scraper = cfscrape.create_scraper()
    scraped_content = scraper.get(page_url).content
    soup = BeautifulSoup(scraped_content, "lxml")
    photos = soup.find_all("img", class_="main-image")
    photo_url = photos[0]["src"]

    # Scrape the aircraft model and airline
    aircraft_model, airline = None, None
    info_section = soup.find("section", class_="additional-info aircraft")
    p_elems = info_section.select("p")
    for p_elem in p_elems:
        text = p_elem.text.strip()
        if len(text) > 0:
            if "Aircraft: " in text:
                aircraft_model = text.split(":")[1].strip()
            if "Airline: " in text:
                airline = text.split(":")[1].strip()

    # Scrape the photographer's name
    photographer_name = None
    info_section = soup.find("section", class_="additional-info photographer")
    p_elems = info_section.select("p")
    for i, p_elem in enumerate(p_elems):
        text = p_elem.text.strip()
        if len(text) > 0:
            if i == 0:
                photographer_name = text.strip()
    size = ""  # Placeholder - we set it after we download the photo
    return ((airline, page_url, photo_url, aircraft_model, size, "No",
             photographer_name))
Author: nirmalyaghosh, Project: mini-projects, Lines of code: 31, Source file: harvest_photos.py

Example 2: __init__

        def __init__(self, un, pw, session_path=None):
            '''
                Params:
                    un: account username (required)
                    pw: account password (required)
                    session_path: the path to the actual file you want to persist your cookies in
                                If blank, saves to $HOME/.32p_cookies.dat

            '''
            self.module = '[32P-AUTHENTICATION]'
            try:
                self.ses = cfscrape.create_scraper()
            except Exception as e:
                logger.error(self.module + " Can't create session with cfscrape")

            self.session_path = session_path if session_path is not None else os.path.join(mylar.CACHE_DIR, ".32p_cookies.dat")
            self.ses.cookies = LWPCookieJar(self.session_path)
            if not os.path.exists(self.session_path):
                logger.fdebug(self.module + ' Session cookie does not exist. Signing in and Creating.')
                self.ses.cookies.save()
            else:
                logger.fdebug(self.module + ' Session cookie found. Attempting to load...')
                self.ses.cookies.load(ignore_discard=True)
            self.un = un
            self.pw = pw
            self.authkey = None
            self.passkey = None
            self.uid = None
            self.inkdrops = None
Author: rupaschomaker, Project: mylar, Lines of code: 29, Source file: auth32p.py

Example 3: scrape

def scrape():
    try:
        purge()
        
        # Connect to the site
        scrp = cfscrape.create_scraper()
        rqst = scrp.get('http://800notes.com/').content
        soup = BeautifulSoup(rqst, 'lxml')
        
        # Connect to the database
        with sql.connect('complaint-scraper.db') as con:
            with con as cur:
                for div in soup.findAll('div', class_='oos_preview'):
                    cnt = div.find('div', class_='oos_previewSide')
                    wrp = div.find('div', class_='oos_previewMain')
                    num = wrp.find('div', class_='oos_previewHeader')
                    lnk = num.find('a',   class_='oos_previewTitle')
                    txt = wrp.find('div', class_='oos_previewBody')
                    areaCode = lnk.text[:3]
                    fullNmbr = areaCode + lnk.text[4:7] + lnk.text[8:]
                    cmntText = txt.text
                    numCmnts = cnt.text
                    cur.execute('''
                        INSERT INTO Comments(
                            Area_Code, Full_Number, Comment, Num_Comments)
                        VALUES(?,?,?,?)
                        ''', (areaCode, fullNmbr, cmntText, numCmnts))

    except sql.IntegrityError as e:
        print("Error: %s" % e.args[0])
Author: jonlandrum, Project: complaint-scraper, Lines of code: 30, Source file: complaint-scraper.py

Example 4: test_http_link_active

def test_http_link_active(content, link=None):
    "link URL must be active"
    import cfscrape
    from requests.exceptions import RequestException
    from rfc3986 import is_valid_uri, uri_reference
    _verify_valid_link_entry(link)
    key, value = list(link.items())[0]

    if not is_valid_uri(value, require_scheme=True):
        return

    parsed_value = uri_reference(value)
    if parsed_value.scheme not in ("http", "https"):
        return

    # Hooray.
    if parsed_value.host.endswith("linkedin.com"):
        raise SkipTest("linkedin.com won't let us see {} anyway".format(value))

    try:
        r = cfscrape.create_scraper().get(value, timeout=30.0, headers={"User-Agent": USER_AGENT})
    except RequestException as exc:
        assert False, "error while checking {}: {}".format(value, exc)
    else:
        assert 200 <= r.status_code < 300, \
            "expected {} link {} to be active, but got {}".format(key, value, r.status_code)
Author: VetraCommunity, Project: tokenbase, Lines of code: 26, Source file: test.py

Example 5: fetch

    def fetch():
        url = environ.get('URL')
        root_url = environ.get('ROOT_URL')

        scraper = cfscrape.create_scraper()
        html = scraper.get(url).content
        soup = BeautifulSoup(html, 'html.parser')

        posts = list()

        for link in soup.select('#threads a.title'):
            post = dict()

            try:
                post['title'] = link.text
                post['href'] = root_url + link.get('href')
                post['uid'] = post['href'].replace(root_url + 'threads/', '')[:6] #TODO

                posts.append(post)
            except Exception as e:
                print(e)

                pass

        return posts
Author: leandrotoledo, Project: hardmob_promo, Lines of code: 25, Source file: feed.py

Example 6: enter_raffle

def enter_raffle(url):
    """Enters raffle at given URL."""

    headers = {
            "Host" : "csgorage.com",
            "Origin" : "http://csgorage.com",
            "Referer" : service_url + url,
            "Accept" : "application/json, text/javascript, */*; q=0.01",
            "Content-Type" : "application/x-www-form-urlencoded; charset=UTF-8",
            "Accept-Encoding" : "gzip, deflate",
            "Accept-Language" : "en-US,en;q=0.8"
            }

    r = cfscrape.create_scraper()
    s = scrape(url)

    raffleId = url[-5:]
    token_tag = s.find("span", { "class" : "hide tok"}).contents[1]
    token = str(token_tag)[6:-7]
    ticketId = randint(900,1350)
    payload = {
            'rid' : raffleId,
            'slots[]' : ticketId,
            '_token' : token,
            'rnd' : 1
            }

    t = r.post(service_url + "/getslotfree", data=payload, cookies=cookies, headers=headers)
    if t.status_code == 200:
        print("200")
    else:
        print("Not 200")
Author: elock37, Project: raffler, Lines of code: 32, Source file: raffler.py

Example 7: Bookmarks

def Bookmarks(title):

	oc = ObjectContainer(title1 = title)
	post_values = {
		'username' : username,
		'password' : password
		}

	if username and password:
		sess = requests.session()
		s = cfscrape.create_scraper(sess)
		page = s.post("http://kissanime.com/Login", post_values)
		#bookmarks = s.get(BASE_URL + '/BookmarkList')
		#pagehtml = html.fromstring(bookmarks.text)
		return MessageContainer(
			"Success",
			page.text
		)
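		# NOTE: everything below this point is unreachable because of the return above,
		# and pagehtml is never defined since the lines that would build it are commented out.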
		for each in pagehtml.xpath("//a[@class='aAnime']"):
			url = each.xpath("./@href")[0]
			title = each.xpath("./text()")[0]
			thumb = ""
		
			oc.add(DirectoryObject(
				key = Callback(EpisodeDetail, title = title, url = url),
				title = title,
				thumb = Resource.ContentsOfURLWithFallback(url = thumb, fallback='icon-cover.png')
				)
			)
		return oc
	else:
		return MessageContainer(
			"Error",
			"You need to provide a username and password"
		)
Author: romosborne, Project: KissAnime.bundle, Lines of code: 35, Source file: __init__.py

Example 8: boerse_refresh

    def boerse_refresh(self):
        FILE = open(self.boerse_entries, "r")
        filetext = FILE.read()
        FILE.close()

        scraper = cfscrape.create_scraper()
        url = scraper.get(feeds['boerse_url']).content
        boerse = BeautifulSoup(url)
        for entry in boerse.findAll('item'):
            items = entry.find('title')
            title = '{}'.format(items).replace('<title>', '')\
                                      .replace('</title>', '')\
                                      .replace(' ', '.')\
                                      .replace('.-.', '')

            if title not in filetext and\
                    any([x in title for x in whitelist['boerse']]) and\
                    any([x not in title for x in blacklist['boerse']]):
                FILE = open(self.boerse_entries, "a")
                FILE.write("{}\n".format(title))
                FILE.close()
                self.on_rss_entry(
                    '{0}{1}[BOERSE]{2} {3}'.format(
                        self.BOLD, self.RED, self.END, title))

        threading.Timer(feeds['boerse_delay'], self.boerse_refresh).start()
Author: grm34, Project: piratbot, Lines of code: 26, Source file: wookie.py

Example 9: get_url_headers

def get_url_headers(url, configfile, dbfile, headers):
    config = RssConfig('RSScrawler', configfile)
    proxy = config.get('proxy')
    scraper = cfscrape.create_scraper(delay=10)
    agent = fake_user_agent()
    headers.update({'User-Agent': agent})
    if proxy:
        sj = decode_base64("c2VyaWVuanVua2llcy5vcmc=")
        mb = decode_base64("bW92aWUtYmxvZy50bw==")
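        # the base64 strings above decode to the hoster domains
        # ("serienjunkies.org" and "movie-blog.to"), kept encoded in the original source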
        db = RssDb(dbfile, 'proxystatus')
        if sj in url:
            if db.retrieve("SJ") and config.get("fallback"):
                return scraper.get(url, headers=headers, timeout=30)
        elif mb in url:
            if db.retrieve("MB") and config.get("fallback"):
                return scraper.get(url, headers=headers, timeout=30)
        proxies = {'http': proxy, 'https': proxy}
        try:
            response = scraper.get(url, headers=headers, proxies=proxies, timeout=30)
            return response
        except Exception as e:
            print(u"Fehler beim Abruf von: " + url + " " + str(e))
            return ""
    else:
        try:
            response = scraper.get(url, headers=headers, timeout=30)
            return response
        except Exception as e:
            print(u"Fehler beim Abruf von: " + url + " " + str(e))
            return ""
Author: rix1337, Project: RSScrawler, Lines of code: 30, Source file: url.py

Example 10: __init__

 def __init__(self):
     self.anime = sys.argv[1]
     self.anime_url = 'http://kissanime.to/Anime/'
     self.scraper = cfscrape.create_scraper()
     self.s_check = ['{}/Episode'.format(self.anime), '?id=']
     self.audited_links = []
     self.decoded_links = []
Author: ModalSeoul, Project: anitai-download, Lines of code: 7, Source file: kissanime.py

Example 11: __init__

 def __init__(self):
     self.items = 0
     self.pages = 0
     self.time = 0
     self.memory = 0
     self.scraper = cfscrape.create_scraper()
     self.scraper.headers.update(HEADERS)
Author: anaoaktree, Project: findaplumber, Lines of code: 7, Source file: utils.py

Example 12: cms_identifier

    def cms_identifier(self):
        """ Identifies the target's content management system. """
        engine.setup(self)
        
        targets = [target for target in self.args.target if target.strip()]
        error_count = 0
        for url in targets:
            self.sanitize_url(url)
            msg = "Getting source for {}".format(self.url); report.low(msg)
            headers = {'User-Agent': "Mozilla/5.0 (X11; Fedora; Linux i686; " +
                       "rv:40.0) Gecko/20100101 Firefox/40.1"}
            response = None
            try:
                response = requests.get(self.url, headers=headers, verify=False)
                if "Checking your browser before accessing" in response.content:
                    msg ="Site: {} is using cloudflare. "\
                         "Trying to bypass cloudflare protection.".format(self.url);report.medium(msg)
                    #damn cloudflare, lets see if how to circumvert it. 
                    #TODO: Ask for permision since executing JS might be a security issue.
                    # https://github.com/Anorov/cloudflare-scrape
                    cfscraper = cfscrape.create_scraper()
                    response = cfscraper.get(self.url)
            except Exception as e:
                #print e
                error_count += 1
                msg="Something went wrong while getting ({}), moving on...".format(self.url);report.error(msg)
                if error_count > 3:
                    msg = "Too many error. Exiting..."; report.error(msg)
                    sys.exit()
            
            framework, site = engine.pwn(self,response)
            if framework:
                report.info("This is a website based on: {0} from {1}".format(framework, site))
            else:
                report.high("Failed to determine CMS of site.")
Author: kenjoe41, Project: cmspwn, Lines of code: 35, Source file: cmspwn.py

Example 13: scraper

 def scraper():
     try:
         import cfscrape
     except ImportError as e:
         log.debug('Error importing cfscrape: %s', e)
         raise plugin.DependencyError('cfscraper', 'cfscrape', 'cfscrape module required. ImportError: %s' % e)
     else:
         return cfscrape.create_scraper()
Author: AnthonyGuerreiro, Project: Flexget, Lines of code: 8, Source file: horriblesubs.py

Example 14: scrape

def scrape(url):
    """Connects to raffle url and returns a BeautifulSoup object."""

    fullUrl = service_url + url
    r = cfscrape.create_scraper()
    s = r.get(fullUrl, cookies=cookies)
    t = BeautifulSoup(s.text, "html5lib")
    return t
Author: elock37, Project: raffler, Lines of code: 8, Source file: raffler.py

Example 15: __init__

 def __init__(self, params):
     for param in params:
         print(param)
     # create a cfscrape scraper (a requests-compatible session for Cloudflare-protected pages)
     self.scraper = cfscrape.create_scraper()
     self.rootPage = ""
     self.file_extension = ""
     self.download(params)
Author: BDrgon, Project: KissDownloader, Lines of code: 8, Source file: KissDownloader.py


Note: The cfscrape.create_scraper examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by their respective developers; copyright of the source code remains with the original authors, and distribution or reuse must follow the corresponding project's license. Please do not republish without permission.