This article collects typical usage examples of the cfscrape.create_scraper function in Python. If you are unsure what create_scraper does, how to call it, or simply want to see it used in real code, the curated examples here should help.
Fifteen code examples of the create_scraper function are shown below, ordered by popularity.
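Before the examples, a minimal sketch of the basic pattern (the example.com URL is a placeholder): create_scraper() returns an object that behaves like a requests.Session, so get() and post() work as usual while Cloudflare's anti-bot challenge pages are solved transparently on first contact.

import cfscrape

# Drop-in replacement for requests.Session; the URL is illustrative.
scraper = cfscrape.create_scraper()
html = scraper.get("http://example.com").content  # bytes, as with requests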
Example 1: _scrape_photo_info_from_source_3
def _scrape_photo_info_from_source_3(page_url):
    scraper = cfscrape.create_scraper()
    scraped_content = scraper.get(page_url).content
    soup = BeautifulSoup(scraped_content, "lxml")
    photos = soup.find_all("img", class_="main-image")
    photo_url = photos[0]["src"]
    # Scrape the aircraft model and airline
    aircraft_model, airline = None, None
    info_section = soup.find("section", class_="additional-info aircraft")
    p_elems = info_section.select("p")
    for p_elem in p_elems:
        text = p_elem.text.strip()
        if len(text) > 0:
            if "Aircraft: " in text:
                aircraft_model = text.split(":")[1].strip()
            if "Airline: " in text:
                airline = text.split(":")[1].strip()
    # Scrape the photographer's name
    photographer_name = None
    info_section = soup.find("section", class_="additional-info photographer")
    p_elems = info_section.select("p")
    for i, p_elem in enumerate(p_elems):
        text = p_elem.text.strip()
        if len(text) > 0:
            if i == 0:
                photographer_name = text.strip()
    size = ""  # Placeholder - we set it after we download the photo
    return (airline, page_url, photo_url, aircraft_model, size, "No",
            photographer_name)
Example 2: __init__
def __init__(self, un, pw, session_path=None):
    '''
    Params:
        un: account username (required)
        pw: account password (required)
        session_path: the path to the file in which to persist your cookies.
                      If blank, saves to $HOME/.32p_cookies.dat
    '''
    self.module = '[32P-AUTHENTICATION]'
    try:
        self.ses = cfscrape.create_scraper()
    except Exception as e:
        logger.error(self.module + " Can't create session with cfscrape")
    self.session_path = session_path if session_path is not None else os.path.join(mylar.CACHE_DIR, ".32p_cookies.dat")
    self.ses.cookies = LWPCookieJar(self.session_path)
    if not os.path.exists(self.session_path):
        logger.fdebug(self.module + ' Session cookie does not exist. Signing in and creating.')
        self.ses.cookies.save()
    else:
        logger.fdebug(self.module + ' Session cookie found. Attempting to load...')
        self.ses.cookies.load(ignore_discard=True)
    self.un = un
    self.pw = pw
    self.authkey = None
    self.passkey = None
    self.uid = None
    self.inkdrops = None
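As a follow-up to Example 2, here is a minimal sketch of the same cookie-persistence pattern in isolation, assuming Python 3 (the code above uses Python 2's cookielib) and a hypothetical cookies.dat path: because the scraper behaves like a requests.Session, a standard-library cookiejar can be attached, saved, and reloaded across runs.

import os
import cfscrape
from http.cookiejar import LWPCookieJar

COOKIE_PATH = "cookies.dat"  # hypothetical path
ses = cfscrape.create_scraper()
ses.cookies = LWPCookieJar(COOKIE_PATH)
if os.path.exists(COOKIE_PATH):
    ses.cookies.load(ignore_discard=True)  # reuse saved clearance cookies
else:
    ses.cookies.save()  # create the cookie file on first run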
Example 3: scrape
def scrape():
    try:
        purge()
        # Connect to the site
        scrp = cfscrape.create_scraper()
        rqst = scrp.get('http://800notes.com/').content
        soup = BeautifulSoup(rqst, 'lxml')
        # Connect to the database
        with sql.connect('complaint-scraper.db') as con:
            with con as cur:
                for div in soup.findAll('div', class_='oos_preview'):
                    cnt = div.find('div', class_='oos_previewSide')
                    wrp = div.find('div', class_='oos_previewMain')
                    num = wrp.find('div', class_='oos_previewHeader')
                    lnk = num.find('a', class_='oos_previewTitle')
                    txt = wrp.find('div', class_='oos_previewBody')
                    areaCode = lnk.text[:3]
                    fullNmbr = areaCode + lnk.text[4:7] + lnk.text[8:]
                    cmntText = txt.text
                    numCmnts = cnt.text
                    cur.execute('''
                        INSERT INTO Comments(
                            Area_Code, Full_Number, Comment, Num_Comments)
                        VALUES(?,?,?,?)
                    ''', (areaCode, fullNmbr, cmntText, numCmnts))
    except sql.IntegrityError as e:
        print("Error: %s" % e.args[0])
Example 4: test_http_link_active
def test_http_link_active(content, link=None):
    "link URL must be active"
    import cfscrape
    from requests.exceptions import RequestException
    from rfc3986 import is_valid_uri, uri_reference
    _verify_valid_link_entry(link)
    key, value = list(link.items())[0]
    if not is_valid_uri(value, require_scheme=True):
        return
    parsed_value = uri_reference(value)
    if parsed_value.scheme not in ("http", "https"):
        return
    # Hooray.
    if parsed_value.host.endswith("linkedin.com"):
        raise SkipTest("linkedin.com won't let us see {} anyway".format(value))
    try:
        r = cfscrape.create_scraper().get(value, timeout=30.0, headers={"User-Agent": USER_AGENT})
    except RequestException as exc:
        assert False, "error while checking {}: {}".format(value, exc)
    else:
        assert 200 <= r.status_code < 300, \
            "expected {} link {} to be active, but got {}".format(key, value, r.status_code)
Example 5: fetch
def fetch():
    url = environ.get('URL')
    root_url = environ.get('ROOT_URL')
    scraper = cfscrape.create_scraper()
    html = scraper.get(url).content
    soup = BeautifulSoup(html, 'html.parser')
    posts = list()
    for link in soup.select('#threads a.title'):
        post = dict()
        try:
            post['title'] = link.text
            post['href'] = root_url + link.get('href')
            post['uid'] = post['href'].replace(root_url + 'threads/', '')[:6]  # TODO
            posts.append(post)
        except Exception as e:
            print(e)
    return posts
Example 6: enter_raffle
def enter_raffle(url):
    """Enters raffle at given URL."""
    headers = {
        "Host": "csgorage.com",
        "Origin": "http://csgorage.com",
        "Referer": service_url + url,
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-US,en;q=0.8"
    }
    r = cfscrape.create_scraper()
    s = scrape(url)
    raffleId = url[-5:]
    token_tag = s.find("span", {"class": "hide tok"}).contents[1]
    token = str(token_tag)[6:-7]
    ticketId = randint(900, 1350)
    payload = {
        'rid': raffleId,
        'slots[]': ticketId,
        '_token': token,
        'rnd': 1
    }
    t = r.post(service_url + "/getslotfree", data=payload, cookies=cookies, headers=headers)
    if t.status_code == 200:
        print("200")
    else:
        print("Not 200")
Example 7: Bookmarks
def Bookmarks(title):
    oc = ObjectContainer(title1=title)
    post_values = {
        'username': username,
        'password': password
    }
    if username and password:
        sess = requests.session()
        s = cfscrape.create_scraper(sess)
        # Log in, then fetch the bookmark list
        s.post("http://kissanime.com/Login", post_values)
        bookmarks = s.get(BASE_URL + '/BookmarkList')
        pagehtml = html.fromstring(bookmarks.text)
        for each in pagehtml.xpath("//a[@class='aAnime']"):
            url = each.xpath("./@href")[0]
            title = each.xpath("./text()")[0]
            thumb = ""
            oc.add(DirectoryObject(
                key=Callback(EpisodeDetail, title=title, url=url),
                title=title,
                thumb=Resource.ContentsOfURLWithFallback(url=thumb, fallback='icon-cover.png')
            ))
        return oc
    else:
        return MessageContainer(
            "Error",
            "You need to provide a username and password"
        )
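Example 7 hands an existing requests session to create_scraper() positionally; here is a minimal sketch of the equivalent documented keyword form, with a placeholder User-Agent. Headers and cookies already set on the session carry over to the returned scraper.

import requests
import cfscrape

session = requests.session()
session.headers.update({"User-Agent": "A custom UA"})  # placeholder value
scraper = cfscrape.create_scraper(sess=session)  # keeps the session's headers and cookies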
Example 8: boerse_refresh
def boerse_refresh(self):
    FILE = open(self.boerse_entries, "r")
    filetext = FILE.read()
    FILE.close()
    scraper = cfscrape.create_scraper()
    url = scraper.get(feeds['boerse_url']).content
    boerse = BeautifulSoup(url)
    for entry in boerse.findAll('item'):
        items = entry.find('title')
        title = '{}'.format(items).replace('<title>', '')\
                                  .replace('</title>', '')\
                                  .replace(' ', '.')\
                                  .replace('.-.', '')
        if title not in filetext and\
                any([x in title for x in whitelist['boerse']]) and\
                any([x not in title for x in blacklist['boerse']]):
            FILE = open(self.boerse_entries, "a")
            FILE.write("{}\n".format(title))
            FILE.close()
            self.on_rss_entry(
                '{0}{1}[BOERSE]{2} {3}'.format(
                    self.BOLD, self.RED, self.END, title))
    threading.Timer(feeds['boerse_delay'], self.boerse_refresh).start()
Example 9: get_url_headers
def get_url_headers(url, configfile, dbfile, headers):
    config = RssConfig('RSScrawler', configfile)
    proxy = config.get('proxy')
    scraper = cfscrape.create_scraper(delay=10)
    agent = fake_user_agent()
    headers.update({'User-Agent': agent})
    if proxy:
        sj = decode_base64("c2VyaWVuanVua2llcy5vcmc=")
        mb = decode_base64("bW92aWUtYmxvZy50bw==")
        db = RssDb(dbfile, 'proxystatus')
        if sj in url:
            if db.retrieve("SJ") and config.get("fallback"):
                return scraper.get(url, headers=headers, timeout=30)
        elif mb in url:
            if db.retrieve("MB") and config.get("fallback"):
                return scraper.get(url, headers=headers, timeout=30)
        proxies = {'http': proxy, 'https': proxy}
        try:
            response = scraper.get(url, headers=headers, proxies=proxies, timeout=30)
            return response
        except Exception as e:
            print(u"Fehler beim Abruf von: " + url + " " + str(e))
            return ""
    else:
        try:
            response = scraper.get(url, headers=headers, timeout=30)
            return response
        except Exception as e:
            print(u"Fehler beim Abruf von: " + url + " " + str(e))
            return ""
Example 10: __init__
def __init__(self):
    self.anime = sys.argv[1]
    self.anime_url = 'http://kissanime.to/Anime/'
    self.scraper = cfscrape.create_scraper()
    self.s_check = ['{}/Episode'.format(self.anime), '?id=']
    self.audited_links = []
    self.decoded_links = []
Example 11: __init__
def __init__(self):
    self.items = 0
    self.pages = 0
    self.time = 0
    self.memory = 0
    self.scraper = cfscrape.create_scraper()
    self.scraper.headers.update(HEADERS)
Example 12: cms_identifier
def cms_identifier(self):
    """ Identifies the target's content management system. """
    engine.setup(self)
    targets = [target for target in self.args.target if target.strip()]
    error_count = 0
    for url in targets:
        self.sanitize_url(url)
        msg = "Getting source for {}".format(self.url)
        report.low(msg)
        headers = {'User-Agent': "Mozilla/5.0 (X11; Fedora; Linux i686; " +
                                 "rv:40.0) Gecko/20100101 Firefox/40.1"}
        response = None
        try:
            response = requests.get(self.url, headers=headers, verify=False)
            if "Checking your browser before accessing" in response.text:
                msg = "Site: {} is using Cloudflare. " \
                      "Trying to bypass Cloudflare protection.".format(self.url)
                report.medium(msg)
                # Cloudflare is in the way; let's see how to circumvent it.
                # TODO: Ask for permission, since executing JS might be a security issue.
                # https://github.com/Anorov/cloudflare-scrape
                cfscraper = cfscrape.create_scraper()
                response = cfscraper.get(self.url)
        except Exception as e:
            error_count += 1
            msg = "Something went wrong while getting ({}), moving on...".format(self.url)
            report.error(msg)
            if error_count > 3:
                msg = "Too many errors. Exiting..."
                report.error(msg)
                sys.exit()
        framework, site = engine.pwn(self, response)
        if framework:
            report.info("This is a website based on: {0} from {1}".format(framework, site))
        else:
            report.high("Failed to determine the CMS of the site.")
Example 13: scraper
def scraper():
    try:
        import cfscrape
    except ImportError as e:
        log.debug('Error importing cfscrape: %s', e)
        raise plugin.DependencyError('cfscraper', 'cfscrape', 'cfscrape module required. ImportError: %s' % e)
    else:
        return cfscrape.create_scraper()
Example 14: scrape
def scrape(url):
    """Connects to raffle url and returns a BeautifulSoup object."""
    fullUrl = service_url + url
    r = cfscrape.create_scraper()
    s = r.get(fullUrl, cookies=cookies)
    t = BeautifulSoup(s.text, "html5lib")
    return t
Example 15: __init__
def __init__(self, params):
    for param in params:
        print(param)
    # create a cfscrape scraper session
    self.scraper = cfscrape.create_scraper()
    self.rootPage = ""
    self.file_extension = ""
    self.download(params)