def test_calendar_tag_rendering(self, timezone_mock):
# Render the `calendar` template tag for January 2015 and verify the
# calendar table attributes, the "today" cell and the per-day event links.
# Pin "now" to 2015-01-10 12:00; the test later expects day 10 in td.today.
timezone_mock.now.return_value = tz_datetime(2015, 1, 10, 12)
page_with_apphook = self.create_base_pages()
# Second config created under a different namespace ('other').
other_config = EventsConfig.objects.create(namespace='other')
# NOTE(review): the calls that receive the keyword arguments below are not
# visible in this excerpt -- presumably event-creation calls; confirm
# against the full file.
start_date=tz_datetime(2015, 1, 13),
publish_at=tz_datetime(2015, 1, 10)
start_date=tz_datetime(2015, 1, 15),
publish_at=tz_datetime(2015, 1, 10)
start_date=tz_datetime(2015, 1, 16),
publish_at=tz_datetime(2015, 1, 10)
start_date=tz_datetime(2015, 1, 18),
publish_at=tz_datetime(2015, 1, 10),
start_date=tz_datetime(2015, 1, 22),
end_date=tz_datetime(2015, 1, 27),
publish_at=tz_datetime(2015, 1, 10)
start_date=tz_datetime(2015, 1, 25),
# make use of default tests self.app_config namespace, instead of
# hard coding it
# `%%` is an escaped literal `%`: the string is filled in with the `%`
# operator below, which substitutes the namespace for `%s`.
template_str = """
{%% load aldryn_events %%}
{%% calendar 2015 1 'en' '%s' %%}
""" % self.app_config.namespace
t = Template(template_str)
# Render with the 'en' language active.
with override('en'):
html = t.render(SekizaiContext({}))
table = PyQuery(html)('table.table-calendar')
page_url_en = page_with_apphook.get_absolute_url()
# Anchors inside day cells that carry single-day or multi-day events.
links = table.find('td.events, td.multiday-events').find('a')
# test if tag rendered important elements
self.assertEqual('1', table.attr('data-month-numeric'), )
self.assertEqual('2015', table.attr('data-year'))
self.assertEqual('10', table.find('td.today').text())
self.assertEqual(8, links.length) # 13, 15, 22, 23, 24, 25, 26, 27
expected_days = (13, 15, 22, 23, 24, 25, 26, 27)
# Links are expected in day order, each pointing at
# <apphook page url>2015/1/<day>/.
for position, day in enumerate(expected_days):
event_url = '{0}2015/1/{1}/'.format(page_url_en, day)
rendered_url = links[position].attrib['href']
self.assertEqual(event_url, rendered_url)
def parseProductPage(product, need_img_urls=False):
"""Visit the product detail page and scrape four new fields:
delivery reviews star total_sales
"""
# NOTE(review): `doc` is used below but its assignment is not visible in
# this excerpt -- presumably a PyQuery document built from `content`; confirm.
if product['product_url']:
content = fetchContent(product['product_url'], False)
#product['delivery'] = doc("div.cost-entries-type > p > em.value").text()  # shipping fee is rendered dynamically by JS; could not be scraped
product['reviews'] = doc('p.satisfaction-number > a > em.value').text()
# The star rating is stored as the raw CSS class of the <i> element.
product['star'] = doc('p.star-level > i').attr("class")
product['total_sales'] = doc('p.bargain-number > a > em.value').text()
# Image URLs are stored as one comma-separated string.
# NOTE(review): the branch separating the two 'img_urls' assignments
# (likely an `else:`) is not visible here.
if need_img_urls:
url_list = get_img_urls(content)
product['img_urls'] = ', '.join(url_list)
product['img_urls'] = ''
product['color'], product['size'] = '', ''
# Walk the spec-table cells; when a 'de-feature' cell holds the colour or
# size caption, take the text of the cell that follows it (index+1).
for index, td in enumerate(doc('div.obj-content > table > tbody > tr > td')):
tdQ = PyQuery(td)
if tdQ.attr('class') =='de-feature' and tdQ.text().strip() == u'颜色':
product['color'] = PyQuery(doc('div.obj-content > table > tbody > tr > td')[index+1]).text()
if tdQ.attr('class') =='de-feature' and tdQ.text().strip() == u'尺寸':
product['size'] = PyQuery(doc('div.obj-content > table > tbody > tr > td')[index+1]).text()
# Minimum order quantity: text of the first price-ladder cell with the
# "≥" sign removed, passed through extractNum (helper defined elsewhere).
product['MOQ'] = extractNum(doc('tr.amount > td.ladder-1-1 > span.value').text().replace(u"≥", ""))
# Fallback when that yields nothing or 0: first remaining <td> of tr.amount
# after the 'td.amount-title' cell has been removed.
if not product['MOQ'] or product['MOQ'] == 0:
product['MOQ'] = extractNum(PyQuery(doc('tr.amount').remove('td.amount-title').children('td').eq(0))('span.value').text())
if product['MOQ'] == 1:
#print product['product_url']
# SKU details are read from the first spec operator / first SKU table row.
product['sku_size'] = PyQuery(doc('div.unit-detail-spec-operator').eq(0))('span.text').text()
product['sku_color'] = PyQuery(doc('table.table-sku > tr').eq(0))('td.name').text()
product['sku_price'] = PyQuery(doc('table.table-sku > tr').eq(0))('td.price').text()
product['sku_amount'] = PyQuery(doc('table.table-sku > tr').eq(0))('td.count > span > em.value').text()
# NOTE(review): product['sku_id'] is not set in the visible lines -- it must
# already be present on `product` for this print to succeed.
print product['sku_id'], '\t', product['sku_size'], "\t", product['sku_color'], "\t", product['sku_price'], "\t", product['sku_amount']
return product
def fixLinks(text, parser):
# Rewrite the href of every <a> and <link> element in `text` and return the
# re-serialised document as UTF-8 encoded bytes. `parser` is forwarded to
# PyQuery and also selects the serialisation used at the end.
# The text is encoded to UTF-8 bytes before being handed to PyQuery.
d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
for element in d('a, link'):
e = PyQuery(element)
href = e.attr('href')
# NOTE(review): the body of this `if` is not visible in this excerpt --
# presumably it skips elements without an href; confirm.
if href is None:
print '// Drop queryString in included src'
print 'from: ', href
result = urlparse(href)
# https URLs are kept untouched; scheme-less URLs are reduced to their
# path plus fragment, which drops any query string.
if result.scheme == 'https':
href = href
elif result.scheme == '':
href = result.path + (('#' + result.fragment) if result.fragment != '' else '')
print 'to: ', href
# Point rss links ("rss/index.html", or a trailing "rss" / "rss/") at
# "rss/index.rss".
new_href = re.sub(r'(rss/index\.html)|(rss/?)$', 'rss/index.rss', href)
# For hrefs abs_url_regex does not match, strip a trailing "/index.html"
# down to "/".
if not abs_url_regex.search(href):
new_href = re.sub(r'/index\.html$', '/', new_href)
if href != new_href:
e.attr('href', new_href)
print "\t", href, "=>", new_href
if parser == 'html':
return d.html(method='html').encode('utf8')
return d.__unicode__().encode('utf8')
def scrape(slug, url, name, title=None):
# Fetch `url`, tidy and parse the HTML, rewrite image sources inside
# td#content via getimage(), and (when `dbteeth` is set) hand the cleaned
# content to create_or_update().
f = urlopen(url)
doc = f.read()
# NOTE(review): the opening of the tidy_document(...) argument list and its
# closing bracket are not visible in this excerpt; only option entries remain.
doc, errs = tidy_document(
"output-html": 1,
"clean": 1,
"drop-font-tags": 1,
# Tidy errors are printed, not raised.
if errs:
# raise Exception, errs
print errs
doc = html5lib.parse(doc, treebuilder="lxml") # this didn't work, but above three lines did: encoding='utf-8',
jQuery = PyQuery([doc])
td = jQuery("td#content")
# Exactly one content cell is expected on the page.
assert len(td) == 1
for img in td("img"):
# print 'img:', PyQuery (img)
img = PyQuery(img)
src = img.attr("src")
# alt = img.attr('alt')
# if src.startswith ('/image'):
# getimage() receives the source and the first path segment of the slug;
# its return value replaces the image's src attribute.
rslt = getimage(src, slug.split("/")[0])
img.attr("src", rslt)
if trace:
print rslt
# td =
# no_fonts (td)
# need to fix links here
content = PyQuery(td[0])
# content = content.html()
content = no_namespaces(content.html())
print slug, content[:60] # .html() # [:60]
# NOTE(review): the create_or_update(...) call below is truncated in this
# excerpt; its remaining arguments and closing bracket are not visible.
if dbteeth:
# q, created = QuickPage.objects.get_or_create (
qp, created = create_or_update(
title=title if title else name,
# defaults = dict (sortorder = sortorder),
def _add_nested(self, k, el):
"""Parse nested element by its children."""
# NOTE(review): several branch bodies (after the invalid-tag check, the
# root-body check and the `if not children:` check) are not visible in this
# excerpt, and `k` is not referenced in the visible lines.
el = Pq(el)
tagname = Pq(el)[0].tag
if tagname in self.invalid_tags:
# Build the selector for this element from its formatted id and classes.
id = self._format_id(el.attr('id'))
classes = self._format_classes(el.attr('class'))
selector = self._format_selector(el, id, classes)
children = Pq(el).children()
if not self._is_root_body_node(el):
# Add for single nodes only
if not children:
# Build nested css by traversing all child nodes and getting
# their attributes.
while children:
for child in children:
# 1. Add current
# 2. Add child
child = Pq(child)
selector += self._add_id_and_classes(child)
# # 3. Move to next children
children = child.children()
def getTweets(tweetCriteria):
# Page through TweetManager.getJsonReponse() results, parsing each page's
# 'items_html' into models.Tweet objects, and return `results`.
# NOTE(review): the `try:` matching the `except` below, the body of the
# empty-page check, and any line appending `tweet` to `results` are not
# visible in this excerpt.
refreshCursor = ''
results = []
while True:
json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor)
# The response's 'min_position' becomes the cursor for the next request.
refreshCursor = json['min_position']
tweets = PyQuery(json['items_html'])('div.js-stream-tweet')
except Exception, e:
print e
# There was either an error in the request or nothing returned
return results
if len(tweets) == 0:
for tweetHTML in tweets:
tweetPQ = PyQuery(tweetHTML)
tweet = models.Tweet()
# print tweetPQ("p.js-tweet-text").text()
usernameTweet = tweetPQ("span.username.js-action-profile-name b").text()
# Drop every non-ASCII character from the tweet text, then remove the
# space following '#', '@', 'www.' and '/'.
txt = re.sub(r"[^\x00-\x7F]", "", tweetPQ("p.js-tweet-text").text()) \
.replace('# ', '#') \
.replace('@ ', '@') \
.replace('www. ', 'www.') \
.replace('/ ', '/')
# Counts come from the data-tweet-stat-count attribute, with thousands
# separators (",") removed before conversion to int.
retweets = int(tweetPQ("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
favorites = int(tweetPQ("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
dateSec = int(tweetPQ("small.time span.js-short-timestamp").attr("data-time"))
id = tweetPQ.attr("data-tweet-id")
permalink = tweetPQ.attr("data-permalink-path")
# geo stays '' unless the tweet carries a span.Tweet-geo element.
geo = ''
geoSpan = tweetPQ('span.Tweet-geo')
if len(geoSpan) > 0:
geo = geoSpan.attr('title')
tweet.id = id
tweet.permalink = 'https://twitter.com' + permalink
tweet.username = usernameTweet
tweet.text = txt
# fromtimestamp() without a tz argument yields a naive local-time datetime.
tweet.date = datetime.datetime.fromtimestamp(dateSec)
tweet.retweets = retweets
tweet.favorites = favorites
# Mentions and hashtags are extracted from the cleaned text and stored as
# space-separated strings.
tweet.mentions = " ".join(re.compile('(@\\w*)').findall(tweet.text))
tweet.hashtags = " ".join(re.compile('(#\\w*)').findall(tweet.text))
tweet.geo = geo
# A positive maxTweets caps the number of collected tweets.
if tweetCriteria.maxTweets > 0 and len(results) >= tweetCriteria.maxTweets:
return results
def _add(self, k, el):
"""Parse element, without considering children."""
# NOTE(review): the bodies of the `if` / `for` statements below are not
# visible in this excerpt, and `k` is not referenced in the visible lines.
el = Pq(el)
id, classes = el.attr('id'), el.attr('class')
if id is not None:
if classes is not None:
# The class attribute is split on single spaces, one entry per class.
for _class in classes.split(' '):
def replace_img(index, node):
    """Make the ``src`` of an image node absolute and return the wrapped node.

    ``node`` is wrapped in a PyQuery object. When it has a non-empty ``src``
    attribute, that value is joined onto ``base_url`` with ``urljoin_rfc`` and
    written back; otherwise the node is left untouched. ``index`` is accepted
    for the callback signature but not used.
    """
    wrapped = PyQuery(node)
    source = wrapped.attr('src')
    if source:
        wrapped.attr('src', urljoin_rfc(base_url, source))
    return wrapped
def _absoluteurl(x):
# Prefix an element's href with '/'.
# NOTE(review): `this` is not defined in the visible lines -- presumably it
# is supplied by the PyQuery callback machinery; confirm. `x` is unused here.
q = PyQuery(this)
href = q.attr('href')
# NOTE(review): the condition below is cut off in this excerpt (it ends on a
# dangling `or`), and its body is not visible; it tests for hrefs starting
# with '#' or 'http'.
if href and (href.startswith('#') or href.startswith('http') or
if href:
q.attr('href','/' + href)
def __processInstagramTag(self, i, e):
# Build an <img> element for the Instagram post linked from element `e`.
# `i` is not referenced in the visible lines.
obj = PyQuery(e)
url = obj('a').attr('href')
# The short code is the path segment after "/p/" in an http:// URL.
# NOTE(review): re.match returns None when the URL does not fit the pattern
# (e.g. an https:// link), in which case .group(1) raises AttributeError.
shortCode = re.match("http://.*/p/(.*)/", url).group(1)
imageUrl = self.getInstagramImageUrl(shortCode)
newObj = PyQuery("<img />")
newObj.attr('src', imageUrl)
# NOTE(review): `newObj` is neither returned nor inserted into the document
# in the visible lines -- the remainder of this method may be missing here.
def fixLinks(text):
d = PyQuery(text, parser='html')
for element in d('a'):
e = PyQuery(element)
href = e.attr('href')
if not abs_url_regex.search(href):
new_href = re.sub(r'/index\.html$', '/', href)
e.attr('href', new_href)
print "\t", href, "=>", new_href
return d.__unicode__().encode('utf8')
def scrape_category (url, title):
# Fetch a category page, rewrite its product links and image sources inside
# td#content, and (when `dbteeth` is set) store the resulting HTML and page
# title on the matching Categories row.
category_slug = slugify (title)
# NOTE(review): the `try:` matching the `except ValueError` below is not
# visible in this excerpt. On ValueError the URL is retried with the
# 'http://eracks.com' prefix and spaces percent-encoded.
f = urlopen (url)
except ValueError:
if trace: print 'Retrying:', url
url = 'http://eracks.com' + url.replace (' ','%20')
if trace: print 'As:', url
f = urlopen (url)
doc = html5lib.parse(f, treebuilder='lxml', namespaceHTMLElements=False) # this didn't work, but above three lines did: encoding='utf-8',
html.xhtml_to_html (doc)
jQuery = PyQuery([doc])
page_title = jQuery ('title').text()
# Strip the site-name prefix from the page title, longest form first.
if page_title.startswith ("eRacks Open Source Systems: "):
page_title = page_title.partition ("eRacks Open Source Systems: ") [-1]
if page_title.startswith ("eRacks "):
page_title = page_title.partition ("eRacks ") [-1]
content = jQuery ('td#content')
links = content ('a')
images = content ('img')
# Links whose href contains a SKU (per find_sku), or that start with
# '/Legacy', are rewritten to /products/<category_slug>/<sku>/.
# NOTE(review): `href` is None for anchors without an href attribute, which
# find_sku.findall would not accept -- confirm the pages never contain such.
for link in links:
a = PyQuery (link)
href = a.attr('href')
skus = find_sku.findall (href)
if skus:
sku = skus [0]
#a.attr ('href', '/%s/%s/' % (category_slug, slugify (sku)))
a.attr ('href', '/products/%s/%s/' % (category_slug, sku))
elif href.startswith ('/Legacy'):
sku = slugify (href.split ('/') [-1])
#a.attr ('href', '/%s/%s/' % (category_slug, slugify (sku)))
a.attr ('href', '/products/%s/%s/' % (category_slug, sku))
print 'link:', a.attr('href')
# Each image src is replaced by the value getimage() returns for it.
for image in images:
img = PyQuery (image)
src = img.attr('src')
newsrc = getimage (src, 'categories/' + category_slug)
img.attr ('src', newsrc)
print 'image:', newsrc
description = content.html()
if trace: print description
if dbteeth:
cat = Categories.objects.get (name=title)
cat.comments = cat.comments + '\n\nScraped from Zope as of ' + str(datetime.date.today())
cat.description = description
cat.title = page_title
# NOTE(review): no save call on `cat` is visible in this excerpt before the
# message below is printed.
print '..saved.'
def make_possible_feed(link_element):
    """Turn one <link rel="alternate" href="http://..." /> element into a feed dict.

    Returns ``{'feed_url': ..., 'feed_title': ...}`` when the element has a
    non-empty ``href``; the title falls back to ``'Unknown'`` when the element
    has no (or an empty) ``title`` attribute. Elements without an ``href`` are
    logged and ``False`` is returned.
    """
    element = PyQuery(link_element)
    feed_title = element.attr('title') or 'Unknown'
    feed_url = element.attr('href')
    if not feed_url:
        log.info("Skipping malformed link element for feed, missing href")
        return False
    return {'feed_url': feed_url, 'feed_title': feed_title}
def _append_contents(struct, par):
# Build a PyQuery node from the dict `struct` (keys read here: 'tag',
# 'attributes', 'text', 'children') and recurse into its children.
# NOTE(review): the body of the `if 'text' in struct:` branch and any line
# attaching `_node` to `par` are not visible in this excerpt.
tag = struct['tag']
_node = PyQuery('<%s />' % tag)
# Copy every entry of struct['attributes'] onto the new node.
if 'attributes' in struct:
for key in struct['attributes'].keys():
_node.attr(key, struct['attributes'][key])
if 'text' in struct:
elif 'children' in struct:
# struct['children'] is a mapping; only its values are used, each being
# appended recursively with the new node as parent.
for (ugh, child) in struct['children'].iteritems():
_append_contents(child, _node)
def replace_link(index, node):
    """Make a matching ``href`` of a link node absolute and return the wrapped node.

    ``node`` is wrapped in a PyQuery object. When it has a non-empty ``href``,
    the value is stripped of surrounding whitespace and, if ``regex`` matches
    it, replaced by ``urljoin_rfc(base_url, <stripped href>)``. Nodes without
    an ``href`` or with a non-matching one are returned unchanged. ``index``
    is accepted for the callback signature but not used.
    """
    anchor = PyQuery(node)
    raw_href = anchor.attr('href')
    if raw_href:
        target = raw_href.strip()
        if regex.match(target):
            anchor.attr('href', urljoin_rfc(base_url, target))
    return anchor