This page collects typical usage examples of the normalize function from the Python module utils.urlnorm. If you have been wondering what normalize actually does, how to call it, and what real-world uses look like, the curated code examples below should help. Fifteen examples of normalize are shown, ordered by popularity by default.
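Two calling shapes appear in the examples below: a single-argument form used by the feed importers (urlnorm.normalize(feed_link)) and a three-argument form used by the crawler connectors to resolve a link against the current page and a base URL (normalize(href, currenturi, base)). As a quick orientation, here is a minimal sketch of the single-argument form; the sample URL and the expected cleanup are illustrative assumptions, since the exact normalization rules depend on the urlnorm implementation in your project.

from utils import urlnorm

raw_url = 'HTTP://Example.COM:80/feeds//../rss.xml'
clean_url = urlnorm.normalize(raw_url)
# Typically this lowercases the scheme and host, drops the default port and
# resolves dot-segments; the precise output depends on the implementation.
print clean_url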
Example 1: _addLinksToCrawler
def _addLinksToCrawler(self):
    """
    Normalize every anchor on the current page and queue it as a clone task.
    """
    try:
        log.info(self.log_msg('levels : %s , %s:%s:%s' % (self.currenturi, self.task.level, self.level, self.max_recursion_level)))
        if self.task.level > self.max_recursion_level and not self.task.instance_data.get('metapage'):
            log.debug('TaskID:%s::Client:%s::recursion level greater than MAX, returning for %s' % (self.task.id, self.task.client_name, self.currenturi))
            return
        #increment = 1
        #if self.task.instance_data['metapage']:
        #    increment = 0
        for anchor in self.soup.findAll('a', href=True):
            try:
                url = normalize(unicode(anchor['href']), self.currenturi, self.base)
                # apply regex filter patterns to the normalized url
                if self.task.instance_data.get('url_filter'):
                    url_pattern = re.compile(self.task.instance_data['url_filter'],
                                             re.IGNORECASE | re.DOTALL)
                    if not url_pattern.search(url):
                        continue
                log.info(self.log_msg("clone uri :: %s" % url))
                temp_task = self.task.clone()
                temp_task.instance_data['uri'] = url
                #temp_task.level = int(self.task.level) + increment
                temp_task.pagedata['title'] = getTitleFromLink(anchor)
                temp_task.priority = self.task.priority
                self.linksOut.append(temp_task)
            except:
                log.exception('TaskID:%s::Client:%s::failed to create one of the clone tasks' % (self.task.id, self.task.client_name))
                continue
        return True
    except:
        log.exception('TaskID:%s::Client:%s::addLinksToCrawler failed' % (self.task.id, self.task.client_name))
Example 2: _process_item
def _process_item(self, item):
    feed_title = item.xpath('./string[@name="title"]') and \
                 item.xpath('./string[@name="title"]')[0].text
    feed_address = item.xpath('./string[@name="id"]') and \
                   item.xpath('./string[@name="id"]')[0].text.replace('feed/', '', 1)
    feed_link = item.xpath('./string[@name="htmlUrl"]') and \
                item.xpath('./string[@name="htmlUrl"]')[0].text
    category = item.xpath('./list[@name="categories"]/object/string[@name="label"]') and \
               item.xpath('./list[@name="categories"]/object/string[@name="label"]')[0].text
    if not feed_address:
        feed_address = feed_link
    try:
        feed_link = urlnorm.normalize(feed_link)
        feed_address = urlnorm.normalize(feed_address)
        feed = {
            'title': feed_title,
            'url': feed_address,
            'link': feed_link,
            'category': category,
        }
        return feed
    except Exception, e:
        print '---->Exception: %s: %s' % (e, item)
Example 3: process_outline
def process_outline(self, outline):
    folders = []
    for item in outline:
        if not hasattr(item, 'xmlUrl') and hasattr(item, 'text'):
            folder = item
            # if hasattr(folder, 'text'):
            #     logging.info(' ---> [%s] ~FRNew Folder: %s' % (self.user, folder.text))
            folders.append({folder.text: self.process_outline(folder)})
        elif hasattr(item, 'xmlUrl'):
            feed = item
            if not hasattr(feed, 'htmlUrl'):
                setattr(feed, 'htmlUrl', None)
            # If the feed title matches what's in the DB, don't override it on subscription.
            feed_title = getattr(feed, 'title', None) or getattr(feed, 'text', None)
            if not feed_title:
                setattr(feed, 'title', feed.htmlUrl or feed.xmlUrl)
                user_feed_title = None
            else:
                setattr(feed, 'title', feed_title)
                user_feed_title = feed.title
            feed_address = urlnorm.normalize(feed.xmlUrl)
            feed_link = urlnorm.normalize(feed.htmlUrl)
            if len(feed_address) > Feed._meta.get_field('feed_address').max_length:
                continue
            if feed_link and len(feed_link) > Feed._meta.get_field('feed_link').max_length:
                continue
            # logging.info(' ---> \t~FR%s - %s - %s' % (feed.title, feed_link, feed_address,))
            feed_data = dict(feed_address=feed_address, feed_link=feed_link, feed_title=feed.title)
            # feeds.append(feed_data)
            # See if it exists as a duplicate first
            duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=feed_address)
            if duplicate_feed:
                feed_db = duplicate_feed[0].feed
            else:
                feed_data['active_subscribers'] = 1
                feed_data['num_subscribers'] = 1
                feed_db, _ = Feed.objects.get_or_create(feed_address=feed_address,
                                                        feed_link=feed_link,
                                                        defaults=dict(**feed_data))
            if user_feed_title == feed_db.feed_title:
                user_feed_title = None
            us, _ = UserSubscription.objects.get_or_create(
                feed=feed_db,
                user=self.user,
                defaults={
                    'needs_unread_recalc': True,
                    'mark_read_date': datetime.datetime.utcnow() - datetime.timedelta(days=1),
                    'active': self.user.profile.is_premium,
                    'user_title': user_feed_title
                }
            )
            if self.user.profile.is_premium and not us.active:
                us.active = True
                us.save()
            folders.append(feed_db.pk)
    return folders
Example 4: get_or_create
def get_or_create(cls, address, title='', link=''):
    address = urlnorm.normalize(address)
    link = link and urlnorm.normalize(link)
    feed = cls.get_by_url(address)
    if feed:
        return feed, True
    feed = Feed(address, title=title, link=link)
    feed.save()
    return feed.update(), False
Example 5: process_item
def process_item(self, item, folders):
    feed_title = item.xpath('./string[@name="title"]') and item.xpath('./string[@name="title"]')[0].text
    feed_address = item.xpath('./string[@name="id"]') and item.xpath('./string[@name="id"]')[0].text.replace(
        "feed/", ""
    )
    feed_link = item.xpath('./string[@name="htmlUrl"]') and item.xpath('./string[@name="htmlUrl"]')[0].text
    category = (
        item.xpath('./list[@name="categories"]/object/string[@name="label"]')
        and item.xpath('./list[@name="categories"]/object/string[@name="label"]')[0].text
    )
    if not feed_address:
        feed_address = feed_link
    try:
        feed_link = urlnorm.normalize(feed_link)
        feed_address = urlnorm.normalize(feed_address)
        if len(feed_address) > Feed._meta.get_field("feed_address").max_length:
            return folders
        # See if it exists as a duplicate first
        duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=feed_address)
        if duplicate_feed:
            feed_db = duplicate_feed[0].feed
        else:
            feed_data = dict(feed_title=feed_title)
            feed_data["active_subscribers"] = 1
            feed_data["num_subscribers"] = 1
            feed_db, _ = Feed.find_or_create(
                feed_address=feed_address, feed_link=feed_link, defaults=dict(**feed_data)
            )
        us, _ = UserSubscription.objects.get_or_create(
            feed=feed_db,
            user=self.user,
            defaults={
                "needs_unread_recalc": True,
                "mark_read_date": datetime.datetime.utcnow() - datetime.timedelta(days=1),
                "active": self.user.profile.is_premium or self.auto_active,
            },
        )
        if not us.needs_unread_recalc:
            us.needs_unread_recalc = True
            us.save()
        if not category:
            category = ""
        if category:
            obj = {category: []}
            folders = add_object_to_folder(obj, "", folders)
        folders = add_object_to_folder(feed_db.pk, category, folders)
        # if feed_db.pk not in folders[category]:
        #     folders[category].append(feed_db.pk)
    except Exception, e:
        logging.info(" *** -> Exception: %s: %s" % (e, item))
Example 6: process_item
def process_item(self, item, folders):
    feed_title = item.xpath('./string[@name="title"]') and item.xpath('./string[@name="title"]')[0].text
    feed_address = item.xpath('./string[@name="id"]') and item.xpath('./string[@name="id"]')[0].text.replace(
        "feed/", ""
    )
    feed_link = item.xpath('./string[@name="htmlUrl"]') and item.xpath('./string[@name="htmlUrl"]')[0].text
    category = (
        item.xpath('./list[@name="categories"]/object/string[@name="label"]')
        and item.xpath('./list[@name="categories"]/object/string[@name="label"]')[0].text
    )
    if not feed_address:
        feed_address = feed_link
    try:
        feed_link = urlnorm.normalize(feed_link)
        feed_address = urlnorm.normalize(feed_address)
        if len(feed_address) > Feed._meta.get_field("feed_address").max_length:
            return folders
        # See if it exists as a duplicate first
        duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=feed_address)
        if duplicate_feed:
            feed_db = duplicate_feed[0].feed
        else:
            feed_data = dict(feed_address=feed_address, feed_link=feed_link, feed_title=feed_title)
            feed_data["active_subscribers"] = 1
            feed_data["num_subscribers"] = 1
            feeds = Feed.objects.filter(feed_address=feed_address, branch_from_feed__isnull=True).order_by(
                "-num_subscribers"
            )
            if feeds:
                feed_db = feeds[0]
            else:
                feed_db = Feed.objects.create(**feed_data)
        us, _ = UserSubscription.objects.get_or_create(
            feed=feed_db,
            user=self.user,
            defaults={
                "needs_unread_recalc": True,
                "mark_read_date": datetime.datetime.utcnow() - datetime.timedelta(days=1),
                "active": self.user.profile.is_premium,
            },
        )
        if not category:
            category = "Root"
        folders[category].append(feed_db.pk)
    except Exception, e:
        logging.info(" *** -> Exception: %s" % e)
Example 7: process_outline
def process_outline(self, outline):
    folders = []
    for item in outline:
        if not hasattr(item, "xmlUrl"):
            folder = item
            # if hasattr(folder, 'text'):
            #     logging.info(' ---> [%s] ~FRNew Folder: %s' % (self.user, folder.text))
            folders.append({folder.text: self.process_outline(folder)})
        elif hasattr(item, "xmlUrl"):
            feed = item
            if not hasattr(feed, "htmlUrl"):
                setattr(feed, "htmlUrl", None)
            if not hasattr(feed, "title") or not feed.title:
                setattr(feed, "title", feed.htmlUrl or feed.xmlUrl)
            feed_address = urlnorm.normalize(feed.xmlUrl)
            feed_link = urlnorm.normalize(feed.htmlUrl)
            if len(feed_address) > Feed._meta.get_field("feed_address").max_length:
                continue
            if feed_link and len(feed_link) > Feed._meta.get_field("feed_link").max_length:
                continue
            if len(feed.title) > Feed._meta.get_field("feed_title").max_length:
                feed.title = feed.title[:255]
            # logging.info(' ---> \t~FR%s - %s - %s' % (feed.title, feed_link, feed_address,))
            feed_data = dict(feed_address=feed_address, feed_link=feed_link, feed_title=feed.title)
            # feeds.append(feed_data)
            # See if it exists as a duplicate first
            duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=feed_address)
            if duplicate_feed:
                feed_db = duplicate_feed[0].feed
            else:
                feed_data["active_subscribers"] = 1
                feed_data["num_subscribers"] = 1
                feed_db, _ = Feed.objects.get_or_create(feed_address=feed_address, defaults=dict(**feed_data))
            us, _ = UserSubscription.objects.get_or_create(
                feed=feed_db,
                user=self.user,
                defaults={
                    "needs_unread_recalc": True,
                    "mark_read_date": datetime.datetime.utcnow() - datetime.timedelta(days=1),
                    "active": self.user.profile.is_premium,
                },
            )
            if self.user.profile.is_premium and not us.active:
                us.active = True
                us.save()
            folders.append(feed_db.pk)
    return folders
Example 8: process_item
def process_item(self, item, folders):
    feed_title = item.xpath('./string[@name="title"]') and \
                 item.xpath('./string[@name="title"]')[0].text
    feed_address = item.xpath('./string[@name="id"]') and \
                   item.xpath('./string[@name="id"]')[0].text.replace('feed/', '')
    feed_link = item.xpath('./string[@name="htmlUrl"]') and \
                item.xpath('./string[@name="htmlUrl"]')[0].text
    category = item.xpath('./list[@name="categories"]/object/string[@name="label"]') and \
               item.xpath('./list[@name="categories"]/object/string[@name="label"]')[0].text
    if not feed_address:
        feed_address = feed_link
    try:
        feed_link = urlnorm.normalize(feed_link)
        feed_address = urlnorm.normalize(feed_address)
        if len(feed_address) > Feed._meta.get_field('feed_address').max_length:
            return folders
        # See if it exists as a duplicate first
        duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=feed_address)
        if duplicate_feed:
            feed_db = duplicate_feed[0].feed
        else:
            feed_data = dict(feed_title=feed_title)
            feed_data['active_subscribers'] = 1
            feed_data['num_subscribers'] = 1
            feed_db, _ = Feed.find_or_create(feed_address=feed_address, feed_link=feed_link,
                                             defaults=dict(**feed_data))
        us, _ = UserSubscription.objects.get_or_create(
            feed=feed_db,
            user=self.user,
            defaults={
                'needs_unread_recalc': True,
                'mark_read_date': datetime.datetime.utcnow() - datetime.timedelta(days=1),
                'active': self.user.profile.is_premium or self.auto_active,
            }
        )
        if not us.needs_unread_recalc:
            us.needs_unread_recalc = True
            us.save()
        if not category:
            category = "Root"
        if feed_db.pk not in folders[category]:
            folders[category].append(feed_db.pk)
    except Exception, e:
        logging.info(' *** -> Exception: %s: %s' % (e, item))
Example 9: fetch
def fetch(self):
    """
    Fetch the review-listing page and queue each review link as a clone
    task, or parse an individual review page.
    """
    try:
        self.parenturi = self.currenturi
        self.genre = "Review"
        if self.currenturi == 'http://www.laptopical.com/laptop-reviews.html':
            if not self._setSoup():
                return False
            hrefs = ['http://www.laptopical.com' + div.find('a')['href']
                     for div in self.soup.find('div', {'id': 'review-listing'})
                                         .find('ul').findAll('li')
                     if div.find('a') is not None]
            for href in hrefs:
                temp_task = self.task.clone()
                temp_task.instance_data['uri'] = normalize(href)
                self.linksOut.append(temp_task)
            log.info('Total uris are %d' % (len(hrefs)))
            return True
        if re.compile('http://www.laptopical.com/.+?\.html').match(self.currenturi):
            if not self._setSoup():
                return False
            self._getParentPage()
            self._addReview()
            return True
    except:
        log.exception('error in fetch ')
        return False
Example 10: __processRSSFeeds
def __processRSSFeeds(self):
    '''This will process the RSS feeds of Facebook.
    '''
    log.debug(self.log_msg("Entry Webpage: " + str(self.currenturi)))
    parser = feedparser.parse(self.currenturi)
    # check the parser object before touching parser.version
    if not parser or len(parser.version) == 0:
        log.info(self.log_msg('parser version not found, returning'))
        return False
    log.info('number of entries %s' % (len(parser.entries)))
    for entity in parser.entries:
        try:
            if checkSessionInfo('Review', self.session_info_out, entity['link'],
                                self.task.instance_data.get('update')):
                log.info(self.log_msg('Session info returns True for uri %s' % entity['link']))
                continue
            result = updateSessionInfo('Review', self.session_info_out, entity['link'], '',
                                       'Post', self.task.instance_data.get('update'))
            if not result['updated']:
                log.info(self.log_msg('Result not updated for uri %s' % entity['link']))
                continue
            temp_task = self.task.clone()
            temp_task.instance_data['uri'] = normalize(entity['link'])
            temp_task.pagedata['title'] = entity['title']
            temp_task.pagedata['source'] = 'facebook.com'
            temp_task.instance_data['connector_name'] = 'HTMLConnector'
            temp_task.pagedata['source_type'] = 'rss'
            self.linksOut.append(temp_task)
        except:
            log.exception(self.log_msg("exception in adding temptask to linksout"))
    return True
Example 11: fetch
def fetch(self):
    self.genre = "Review"
    try:
        self.__base_uri = 'http://answers.yahoo.com/'
        code = None
        parent_uri = self.currenturi
        res = self._getHTML()
        self.rawpage = res['result']
        self._setCurrentPage()
        self.POSTS_ITERATIONS = tg.config.get(path='Connector', key='yahooanswers_numposts')
        self.__max_date_submission_date = tg.config.get(path='Connector', key='yahooanswers_max_date_submission')
        self.curiter = 0
        if '/question/index' not in self.currenturi:
            self.__createSiteUrl()
            next_page = self.soup.find('li', {'class': 'next'})
            while self.addQuestionUrls(parent_uri) and next_page:
                try:
                    self.currenturi = normalize(self.__base_uri + next_page.a['href'])
                    log.debug(self.log_msg("Fetching url %s" % (self.currenturi)))
                    res = self._getHTML()
                    self.rawpage = res['result']
                    self._setCurrentPage()
                    next_page = self.soup.find('li', {'class': 'next'})
                except Exception, e:
                    log.exception(self.log_msg('exception in iterating pages in fetch'))
                    break
        else:
Example 12: __getParentPage
def __getParentPage(self):
    """
    This will get the parent info
    """
    page = {}
    try:
        self.hierarchy = page['et_thread_hierarchy'] = [stripHtml(x.renderContents()) for x in self.soup.find('div', 'CommonBreadCrumbArea').findAll('a')][1:]
        page['title'] = page['et_thread_hierarchy'][-1]
    except:
        log.info(self.log_msg('Thread hierarchy is not found'))
        page['title'] = ''
    try:
        self.thread_id = page['et_thread_id'] = unicode(self.currenturi.split('/')[-1].replace('.aspx', ''))
    except:
        log.info(self.log_msg('Thread id not found'))
    if checkSessionInfo(self.genre, self.session_info_out, self.parent_uri,
                        self.task.instance_data.get('update')):
        log.info(self.log_msg('Session info return True, Already exists'))
        return False
    for each in ['et_thread_last_post_author', 'ei_thread_replies_count', 'edate_last_post_date']:
        try:
            page[each] = self.task.pagedata[each]
        except:
            log.info(self.log_msg('page data cannot be extracted for %s' % each))
    try:
        post_hash = get_hash(page)
        id = None
        if self.session_info_out == {}:
            id = self.task.id
        result = updateSessionInfo(self.genre, self.session_info_out, self.parent_uri,
                                   post_hash, 'Forum', self.task.instance_data.get('update'), Id=id)
        if not result['updated']:
            return False
        page['path'] = [self.parent_uri]
        page['parent_path'] = []
        page['uri'] = normalize(self.currenturi)
        page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
        page['priority'] = self.task.priority
        page['level'] = self.task.level
        page['pickup_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
        page['posted_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
        page['connector_instance_log_id'] = self.task.connector_instance_log_id
        page['connector_instance_id'] = self.task.connector_instance_id
        page['workspace_id'] = self.task.workspace_id
        page['client_id'] = self.task.client_id
        page['client_name'] = self.task.client_name
        page['last_updated_time'] = page['pickup_date']
        page['versioned'] = False
        page['data'] = ''
        page['task_log_id'] = self.task.id
        page['entity'] = 'Post'
        page['category'] = self.task.instance_data.get('category', '')
        self.pages.append(page)
        log.info(page)
        log.info(self.log_msg('Parent Page added'))
        return True
    except:
        log.exception(self.log_msg("parent post couldn't be parsed"))
        return False
Example 13: api_save_new_subscription
def api_save_new_subscription(request):
    user = request.user
    body = request.body_json
    fields = body.get('actionFields')
    url = urlnorm.normalize(fields['url'])
    folder = fields['folder']
    if folder == "Top Level":
        folder = " "
    code, message, us = UserSubscription.add_subscription(
        user=user,
        feed_address=url,
        folder=folder,
        bookmarklet=True
    )
    logging.user(request, "~FRAdding URL from ~FC~SBIFTTT~SN~FR: ~SB%s (in %s)" % (url, folder))
    if us and us.feed:
        url = us.feed.feed_address
    return {"data": [{
        "id": us and us.feed_id,
        "url": url,
    }]}
Example 14: __getParentPage
def __getParentPage(self):
    ''
    if checkSessionInfo(self.genre, self.session_info_out, self.currenturi,
                        self.task.instance_data.get('update')):
        log.info(self.log_msg('Session info return True, Already exists'))
        return False
    page = {}
    try:
        page['et_thread_hierarchy'] = [each.replace('>', '').strip() for each in stripHtml(self.soup.find('span', 'navbar').findParent('table').renderContents()).split('\n') if not each.strip() == '']
        page['title'] = page['et_thread_hierarchy'][-1]
    except:
        log.info(self.log_msg('Thread hierarchy is not found'))
        page['title'] = ''
    for each in ['title', 'et_last_post_author_name', 'ei_thread_replies_count', 'ei_thread_views_count', 'edate_last_post_date', 'ei_thread_votes_count', 'ef_thread_rating']:
        try:
            page[each] = self.task.pagedata[each]
        except:
            log.info(self.log_msg('page data cannot be extracted'))
    try:
        page['et_thread_id'] = self.currenturi.split('&')[-1].split('=')[-1]
    except:
        log.info(self.log_msg('Thread id not found'))
    try:
        post_hash = get_hash(page)
        id = None
        if self.session_info_out == {}:
            id = self.task.id
        result = updateSessionInfo(self.genre, self.session_info_out, self.currenturi,
                                   post_hash, 'Post', self.task.instance_data.get('update'), Id=id)
        if not result['updated']:
            return False
        page['path'] = [self.currenturi]
        page['parent_path'] = []
        page['uri'] = normalize(self.currenturi)
        page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
        page['priority'] = self.task.priority
        page['level'] = self.task.level
        page['pickup_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
        page['posted_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
        page['connector_instance_log_id'] = self.task.connector_instance_log_id
        page['connector_instance_id'] = self.task.connector_instance_id
        page['workspace_id'] = self.task.workspace_id
        page['client_id'] = self.task.client_id
        page['client_name'] = self.task.client_name
        page['last_updated_time'] = page['pickup_date']
        page['versioned'] = False
        #page['first_version_id'] = result['first_version_id']
        page['data'] = ''
        #page['id'] = result['id']
        page['task_log_id'] = self.task.id
        page['entity'] = 'Post'
        page['category'] = self.task.instance_data.get('category', '')
        self.pages.append(page)
        log.info(page)
        log.info(self.log_msg('Parent Page added'))
        return True
    except:
        log.exception(self.log_msg("parent post couldn't be parsed"))
        return False
Example 15: api_share_new_story
def api_share_new_story(request):
    user = request.user
    body = request.body_json
    fields = body.get('actionFields')
    story_url = urlnorm.normalize(fields['story_url'])
    content = fields.get('story_content', "")
    story_title = fields.get('story_title', "[Untitled]")
    story_author = fields.get('story_author', "")
    comments = fields.get('comments', None)
    feed = Feed.get_feed_from_url(story_url, create=True, fetch=True)
    content = lxml.html.fromstring(content)
    content.make_links_absolute(story_url)
    content = lxml.html.tostring(content)
    shared_story = MSharedStory.objects.filter(user_id=user.pk,
                                               story_feed_id=feed and feed.pk or 0,
                                               story_guid=story_url).limit(1).first()
    if not shared_story:
        story_db = {
            "story_guid": story_url,
            "story_permalink": story_url,
            "story_title": story_title,
            "story_feed_id": feed and feed.pk or 0,
            "story_content": content,
            "story_author": story_author,
            "story_date": datetime.datetime.now(),
            "user_id": user.pk,
            "comments": comments,
            "has_comments": bool(comments),
        }
        shared_story = MSharedStory.objects.create(**story_db)
        socialsubs = MSocialSubscription.objects.filter(subscription_user_id=user.pk)
        for socialsub in socialsubs:
            socialsub.needs_unread_recalc = True
            socialsub.save()
        logging.user(request, "~BM~FYSharing story from ~SB~FCIFTTT~FY: ~SB%s: %s" % (story_url, comments))
    else:
        logging.user(request, "~BM~FY~SBAlready~SN shared story from ~SB~FCIFTTT~FY: ~SB%s: %s" % (story_url, comments))
    try:
        socialsub = MSocialSubscription.objects.get(user_id=user.pk,
                                                    subscription_user_id=user.pk)
    except MSocialSubscription.DoesNotExist:
        socialsub = None
    if socialsub:
        socialsub.mark_story_ids_as_read([shared_story.story_hash],
                                         shared_story.story_feed_id,
                                         request=request)
    else:
        RUserStory.mark_read(user.pk, shared_story.story_feed_id, shared_story.story_hash)
    shared_story.publish_update_to_subscribers()
    return {"data": [{
        "id": shared_story and shared_story.story_guid,
        "url": shared_story and shared_story.blurblog_permalink()
    }]}