This article collects typical usage examples of the Scraper class from the Python module scraper. If you are unsure what the Scraper class does, how to use it, or what it looks like in practice, the curated class code examples below may help.
The following presents 15 code examples of the Scraper class, ordered by popularity by default.
Example 1: execute
def execute(self):
    myScraper = Scraper(self.url, self.matchingDict)
    result = myScraper.scrape()
    if self.target is None:
        return result
    else:
        self.target(result, self.url)
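In this example, self.url, self.matchingDict, and self.target belong to an enclosing task class that the snippet does not show. A minimal sketch of how such a wrapper and its callback might fit together (the ScrapeTask name, its constructor, and the callback are assumptions for illustration, not part of the original project):

from scraper import Scraper

class ScrapeTask(object):
    """Hypothetical container for the state that execute() relies on."""
    def __init__(self, url, matchingDict, target=None):
        self.url = url                    # page to scrape
        self.matchingDict = matchingDict  # matching rules handed to Scraper
        self.target = target              # optional callback(result, url)

    def execute(self):
        myScraper = Scraper(self.url, self.matchingDict)
        result = myScraper.scrape()
        if self.target is None:
            return result
        else:
            self.target(result, self.url)

def store_result(result, url):
    # hypothetical callback invoked with the scrape result and its source URL
    print url, result

ScrapeTask('http://example.com', {'title': 'h1'}, target=store_result).execute()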
Example 2: test_find_docs
def test_find_docs():
    declare_test_start('follow_link')
    url_data = {
        'url_id': 1,
        'target_url': 'http://timduffy.me/',
        'max_link_level': 6,
        'creation_date_time': str(datetime.datetime.now()),
        'doc_type': 'application/pdf',
        'dispatch_datetime': str(datetime.datetime.now()),
        'allowed_domains': [],
    }
    uid = str(uuid.uuid4())
    scraper = Scraper(uid)
    scraper.set_url_data(url_data)
    docs = scraper.find_docs()
    print '[ TEST ] {0}'.format(json.dumps(scraper.status))
    print '[ TEST ] {0}'.format(json.dumps(docs))
    passed = False
    if len(docs) > 0:
        passed = True
    declare_test_end(passed)
Example 3: testExtractTag
def testExtractTag(self):
    pattern = "<a name='$name'></a>"
    _scraper = Scraper(pattern)
    exp = BeautifulSoup(pattern)
    # one attribute
    actual = BeautifulSoup("<a name='abc'></a>")
    self.assertEqual('abc', _scraper.extractTag(exp.contents[0], actual.contents[0])['name'])
    # one attribute in the pattern; extra attributes on the actual tag are ignored
    actual = BeautifulSoup("<a name='abc' age='27'></a>")
    self.assertEqual('abc', _scraper.extractTag(exp.contents[0], actual.contents[0])['name'])
    # two attributes
    pattern = "<a name='$name' age='$age'></a>"
    exp = BeautifulSoup(pattern)
    actual = BeautifulSoup("<a name='abc' age='27'></a>")
    ret = _scraper.extractTag(exp.contents[0], actual.contents[0])
    self.assertEqual(2, len(ret))
    self.assertEqual('abc', ret['name'])
    self.assertEqual('27', ret['age'])
    # get an attribute from a sub tag
    pattern = "<a><b name='$name'></b></a>"
    exp = BeautifulSoup(pattern)
    # one attribute
    actual = BeautifulSoup("<a><b name='abc'></b></a>")
    self.assertEqual('abc', _scraper.extractTag(exp.contents[0], actual.contents[0])['name'])
Example 4: main
def main(request):
    username = request.POST['username']
    password = request.POST['password']
    scraper = Scraper(username, password)
    data = scraper.scrap(True)
    return HttpResponse(obj2json(data), mimetype='application/json')
Example 5: create_job
def create_job():
    worker = Scraper(FREQUENCY, TAB, UNIT, FINAL_YEAR)
    while True:
        item = q.get()
        worker.do_work(item)
        print(item + ' is downloaded | ' + str(q.qsize()) + ' item(s) left')
        q.task_done()
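Here q, FREQUENCY, TAB, UNIT, and FINAL_YEAR are module-level names the example leaves out. A minimal sketch of the producer/worker setup the loop appears to assume (the constants, queue contents, and thread count below are illustrative guesses, not taken from the original project):

import threading
from Queue import Queue   # Python 2; on Python 3 this module is named "queue"

FREQUENCY, TAB, UNIT, FINAL_YEAR = 'monthly', 'prices', 'USD', 2015  # assumed values
q = Queue()

for item in ['series_a', 'series_b', 'series_c']:   # items to download
    q.put(item)

for _ in range(4):                                   # a few workers draining the shared queue
    t = threading.Thread(target=create_job)
    t.daemon = True
    t.start()

q.join()   # block until every queued item has been marked task_done()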
Example 6: scrape
def scrape():
    scraper = Scraper(**get_creds())
    # Fetch usage info re: boosters.
    le = UsageDataPoint(
        time=datetime.datetime.utcnow(),
        **scraper.fetch_booster_usage()
    )
    db_session.add(le)
    yield le
    # Fetch the latest transactions and put them in the DB,
    # but only if we don't already have them.
    for transaction in scraper.fetch_most_recent_transactions():
        existing = KoodoTransaction \
            .query \
            .filter_by(koodo_id=transaction['koodo_id']) \
            .first()
        if not existing:
            kt = KoodoTransaction(**transaction)
            db_session.add(kt)
            yield kt
    db_session.commit()
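Because scrape() is a generator, no database work happens until it is iterated, and the final db_session.commit() only runs once the loop is exhausted. A sketch of how a caller might drive it (the logging is purely illustrative):

# Iterating the generator performs the scrape and stages new rows;
# db_session.commit() fires after the last record has been yielded.
for record in scrape():
    print 'staged %r for commit' % record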
Example 7: scrape
def scrape(request, tvdb_id):
    """
    Takes a scrape request, constructs a Scraper object and performs a scrape of the show if it has not been
    scraped before, or if it was last scraped more than :math:`x` days ago (where :math:`x` is the number of days
    given by RESCRAPE_AFTER). Otherwise, if the show exists and has been scraped within the last :math:`x` days,
    redirects to the appropriate show page.

    :param request: The incoming request object.
    :param tvdb_id: The id of the TV show to be scraped (or shown).
    :return: An HttpResponse object containing the page of the requested show.
    """
    # Determine whether the show already exists in the datastore
    q = TVShow.get_by_key_name(tvdb_id)
    if users.is_current_user_admin() and 'force' in request.GET and request.GET['force'] == '1':
        Scraper(tvdb_id, rescrape=True, options=q.options)
        return HttpResponseRedirect('/show/{0}'.format(q.url_string))
    # Check whether the show has been scraped before and whether that scrape was within the last x days (RESCRAPE_AFTER)
    if q and q.last_scraped > datetime.now() - timedelta(days=RESCRAPE_AFTER):
        url_slug = q.url_string
    else:
        # If scraping is switched on, scrape the show
        if settings.SCRAPING:
            s = Scraper(tvdb_id)
            url_slug = s.get_url_slug()
        else:
            url_slug = tvdb_id
    return HttpResponseRedirect('/show/{0}'.format(url_slug))
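This view presumably sits behind a URL pattern that captures tvdb_id from the path. A sketch of what such a route could look like in an older Django project (the regex, module path, and urls.py layout are assumptions, not from the original source):

# urls.py (illustrative)
from django.conf.urls import url
from shows import views   # hypothetical module that defines scrape()

urlpatterns = [
    url(r'^scrape/(?P<tvdb_id>\d+)/$', views.scrape, name='scrape'),
]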
Example 8: save_info_from
def save_info_from(href, data_dir):
    # initialize the child destination
    scrap = Scraper(href)
    dest = scrap.create_destination()
    dest.children_href = scrap.get_children()
    # check whether we have already crawled this area
    OBJECT_OUTFILE = data_dir + dest.nickname + '.pickle'
    if os.path.exists(OBJECT_OUTFILE):
        print dest.nickname + ' has already been crawled'
    else:
        if not os.path.isdir(os.path.dirname(OBJECT_OUTFILE)):
            os.makedirs(os.path.dirname(OBJECT_OUTFILE))
        # traverse the tree of areas --> routes; returns a Destination object
        all_dest = traverse(dest)
        # write the tree out to JSON (e.g. for visualisation)
        BIG_JSON = data_dir + dest.nickname + '.json'
        with open(BIG_JSON, 'w+') as dump:
            flat = json.dumps(all_dest, default=lambda o: o.__dict__)
            dump.write(flat)
        # save the Destination object as a pickle
        BIG_PICKLE = data_dir + dest.nickname + '.pickle'
        with open(BIG_PICKLE, 'wb') as handle:
            pickle.dump(all_dest, handle)
        flourish = '<<<' + '-' * 25
        print flourish + dest.nickname + flourish[::-1]
        print
Example 9: traverse
def traverse(node):
    """ Pre-order depth-first search of the Mountain Project tree """
    children = []
    for href in node.children_href:
        # initialize a Scraper for this page
        scrap = Scraper(href)
        if scrap.soup is None:
            pass
        else:
            # grab features from the soup
            dest = scrap.create_destination()
            # find children in the soup, if any
            dest.children_href = scrap.get_children()
            # recurse deeper down the tree if this is an area
            if dest.children_href is not None:
                print
                print '**' + dest.nickname + '**'
                traverse(dest)
            # the recursive call has returned with a populated destination object
            print dest.nickname + ' | ' + dest.href
            children.append(dest)
    node.children = children
    return node
Example 10: __init__
def __init__(self, base_url = awards_base_url, search_url = ""):
Scraper.__init__(self, base_url, search_url)
self.file = open('academy_awards.csv', 'wb')
self.writer = csv.writer(self.file, delimiter='\t')
self.writer.writerow(['Year', 'Category', 'Won', 'FilmName', 'ActorDirectorName'])
self.soup = self.connect(base_url)
self.next_record = '1'
Example 11: main
def main():
    uid = str(uuid.uuid4())
    print "Creating Scraper() instance ..."
    scraper = Scraper(uid)
    scraper.run()
    print "Running tests ..."
    # typelink()
    test_typelink(scraper)
    # checkmatch()
    test_checkmatch(scraper)
    # getpagelinks()
    test_getpagelinks(scraper)
    # followlinks()
    test_followlinks(scraper)
    # get scraper status
    text_getstatus(scraper)
    scraper.stop()
    print "Done."
Example 12: scrape_all
def scrape_all(root_href, data_dir):
    """ Scrape Mountain Project and save Destination objects """
    scrap = Scraper(root_href)
    # iterate over the children of the root (e.g. states in the US)
    for href in scrap.get_children():
        save_info_from(href, data_dir)
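Examples 8, 9, and 12 work together: scrape_all() walks the children of a root page, save_info_from() persists each subtree, and traverse() performs the recursive descent. A sketch of the top-level call (the root URL and output directory are illustrative, not from the original examples):

# Illustrative entry point; the real root href and data directory are not shown above.
scrape_all('https://www.mountainproject.com/route-guide', './data/')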
Example 13: testExtractText
def testExtractText(self):
    pattern = "<a>$text</a>"
    _scraper = Scraper(pattern)
    exp = BeautifulSoup(pattern)
    # one text node
    actual = BeautifulSoup("<a>hello world</a>")
    self.assertEqual('hello world', _scraper.extractText(exp.contents[0], actual.contents[0])['text'])
Example 14: run
def run(self):
    try:
        print "y"
        s = Scraper('en')
        # s.getCategory(self.cat)
        s.getGameList(self.cat, endPage=0)
    except:
        print "n"
Example 15: testExtract
def testExtract(self):
    pattern = "<a name='$name'>$text</a>"
    _scraper = Scraper(pattern)
    exp = BeautifulSoup(pattern)
    # text inside the tag
    actual = BeautifulSoup("<a name='abc'>hello world</a>")
    ret = _scraper.extract(actual.contents[0])
    self.assertEqual('hello world', ret['text'])