本文整理汇总了Python中utils.story_functions.pre_process_story函数的典型用法代码示例。如果您正苦于以下问题:Python pre_process_story函数的具体用法?Python pre_process_story怎么用?Python pre_process_story使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了pre_process_story函数的11个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: process
def process(self):
""" Downloads and parses a feed.
"""
start = time.time()
self.refresh_feed()
ret_values = dict(new=0, updated=0, same=0, error=0)
# logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, self.feed.feed_title))
if hasattr(self.fpf, 'status'):
if self.options['verbose']:
if self.fpf.bozo and self.fpf.status != 304:
logging.debug(u' ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)' % (
self.feed.title[:30],
self.fpf.bozo_exception,
len(self.fpf.entries)))
if self.fpf.status == 304:
self.feed = self.feed.save()
self.feed.save_feed_history(304, "Not modified")
return FEED_SAME, ret_values
if self.fpf.status in (302, 301):
if not self.fpf.href.endswith('feedburner.com/atom.xml'):
self.feed.feed_address = self.fpf.href
if not self.feed.known_good:
self.feed.fetched_once = True
logging.debug(" ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..." % (self.feed.title[:30], self.fpf.status))
self.feed = self.feed.schedule_feed_fetch_immediately()
if not self.fpf.entries:
self.feed = self.feed.save()
self.feed.save_feed_history(self.fpf.status, "HTTP Redirect")
return FEED_ERRHTTP, ret_values
if self.fpf.status >= 400:
logging.debug(" ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..." % (self.feed.title[:30], self.fpf.status))
fixed_feed = None
if not self.feed.known_good:
fixed_feed = self.feed.check_feed_link_for_feed_address()
if not fixed_feed:
self.feed.save_feed_history(self.fpf.status, "HTTP Error")
self.feed = self.feed.save()
return FEED_ERRHTTP, ret_values
if not self.fpf.entries:
if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
logging.debug(" ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..." % (self.feed.title[:30], len(self.fpf.entries)))
fixed_feed = None
if not self.feed.known_good:
fixed_feed = self.feed.check_feed_link_for_feed_address()
if not fixed_feed:
self.feed.save_feed_history(552, 'Non-xml feed', self.fpf.bozo_exception)
self.feed = self.feed.save()
return FEED_ERRPARSE, ret_values
elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
logging.debug(" ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..." % (self.feed.title[:30], len(self.fpf.entries)))
fixed_feed = None
if not self.feed.known_good:
fixed_feed = self.feed.check_feed_link_for_feed_address()
if not fixed_feed:
self.feed.save_feed_history(553, 'SAX Exception', self.fpf.bozo_exception)
self.feed = self.feed.save()
return FEED_ERRPARSE, ret_values
# the feed has changed (or it is the first time we parse it)
# saving the etag and last_modified fields
self.feed.etag = self.fpf.get('etag')
if self.feed.etag:
self.feed.etag = self.feed.etag[:255]
# some times this is None (it never should) *sigh*
if self.feed.etag is None:
self.feed.etag = ''
try:
self.feed.last_modified = mtime(self.fpf.modified)
except:
pass
self.fpf.entries = self.fpf.entries[:50]
if self.fpf.feed.get('title'):
self.feed.feed_title = self.fpf.feed.get('title')
tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline)
if tagline:
self.feed.data.feed_tagline = utf8encode(tagline)
self.feed.data.save()
if not self.feed.feed_link_locked:
self.feed.feed_link = self.fpf.feed.get('link') or self.fpf.feed.get('id') or self.feed.feed_link
self.feed = self.feed.save()
# Compare new stories to existing stories, adding and updating
start_date = datetime.datetime.utcnow()
story_guids = []
stories = []
for entry in self.fpf.entries:
story = pre_process_story(entry)
if story.get('published') < start_date:
start_date = story.get('published')
stories.append(story)
#.........这里部分代码省略.........
示例2: process
#.........这里部分代码省略.........
if self.feed.etag is None:
self.feed.etag = ""
try:
self.feed.last_modified = mtime(self.fpf.modified)
except:
pass
self.fpf.entries = self.fpf.entries[:50]
if self.fpf.feed.get("title"):
self.feed.feed_title = self.fpf.feed.get("title")
tagline = self.fpf.feed.get("tagline", self.feed.data.feed_tagline)
if tagline:
self.feed.data.feed_tagline = utf8encode(tagline)
self.feed.data.save()
if not self.feed.feed_link_locked:
self.feed.feed_link = self.fpf.feed.get("link") or self.fpf.feed.get("id") or self.feed.feed_link
guids = []
for entry in self.fpf.entries:
if entry.get("id", ""):
guids.append(entry.get("id", ""))
elif entry.get("link"):
guids.append(entry.link)
elif entry.get("title"):
guids.append(entry.title)
self.feed = self.feed.save()
# Compare new stories to existing stories, adding and updating
start_date = datetime.datetime.utcnow()
# end_date = datetime.datetime.utcnow()
story_guids = []
stories = []
for entry in self.fpf.entries:
story = pre_process_story(entry)
if story.get("published") < start_date:
start_date = story.get("published")
# if story.get('published') > end_date:
# end_date = story.get('published')
stories.append(story)
story_guids.append(story.get("guid") or story.get("link"))
existing_stories = list(
MStory.objects(
# story_guid__in=story_guids,
story_date__gte=start_date,
story_feed_id=self.feed_id,
).limit(len(story_guids))
)
# MStory.objects(
# (Q(story_date__gte=start_date) & Q(story_date__lte=end_date))
# | (Q(story_guid__in=story_guids)),
# story_feed=self.feed
# ).order_by('-story_date')
ret_values = self.feed.add_update_stories(stories, existing_stories, verbose=self.options["verbose"])
if (
(not self.feed.is_push or self.options.get("force"))
and hasattr(self.fpf, "feed")
and hasattr(self.fpf.feed, "links")
and self.fpf.feed.links
):
hub_url = None
self_url = self.feed.feed_address
for link in self.fpf.feed.links:
if link["rel"] == "hub":
hub_url = link["href"]
elif link["rel"] == "self":
self_url = link["href"]
if hub_url and self_url and not settings.DEBUG:
logging.debug(u" ---> [%-30s] ~BB~FWSubscribing to PuSH hub: %s" % (self.feed.title[:30], hub_url))
PushSubscription.objects.subscribe(self_url, feed=self.feed, hub=hub_url)
logging.debug(
u" ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s"
% (
self.feed.title[:30],
"~FG~SB" if ret_values[ENTRY_NEW] else "",
ret_values[ENTRY_NEW],
"~FY~SB" if ret_values[ENTRY_UPDATED] else "",
ret_values[ENTRY_UPDATED],
"~SB" if ret_values[ENTRY_SAME] else "",
ret_values[ENTRY_SAME],
"~FR~SB" if ret_values[ENTRY_ERR] else "",
ret_values[ENTRY_ERR],
len(self.fpf.entries),
)
)
self.feed.update_all_statistics(full=bool(ret_values[ENTRY_NEW]), force=self.options["force"])
self.feed.trim_feed()
self.feed.save_feed_history(200, "OK")
if self.options["verbose"]:
logging.debug(
u" ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss" % (self.feed.title[:30], time.time() - start)
)
return FEED_OK, ret_values
示例3: add_update_stories
def add_update_stories(self, stories, existing_stories, db):
    """Insert new stories and update changed ones for this feed.

    Each story is pre-processed, then matched against ``existing_stories``
    via ``self._exists_story``. New stories are saved as ``MStory``
    documents; changed ones are diffed against their stored content and
    updated in place through ``db.stories.update``.

    Returns a dict of counters keyed by ENTRY_NEW / ENTRY_UPDATED /
    ENTRY_SAME / ENTRY_ERR.
    """
    ret_values = {
        ENTRY_NEW: 0,
        ENTRY_UPDATED: 0,
        ENTRY_SAME: 0,
        ENTRY_ERR: 0
    }
    for story in stories:
        story = pre_process_story(story)
        # Stories without a title are silently dropped.
        if story.get('title'):
            story_contents = story.get('content')
            story_tags = self.get_tags(story)
            if story_contents is not None:
                story_content = story_contents[0]['value']
            else:
                # May be None if the entry has no summary either.
                story_content = story.get('summary')
            existing_story, story_has_changed = self._exists_story(story, story_content, existing_stories)
            if existing_story is None:
                # Brand new story: persist it as an MStory document.
                s = MStory(story_feed_id = self.pk,
                       story_date = story.get('published'),
                       story_title = story.get('title'),
                       story_content = story_content,
                       story_author_name = story.get('author'),
                       story_permalink = story.get('link'),
                       story_guid = story.get('guid') or story.get('id') or story.get('link'),
                       story_tags = story_tags
                )
                try:
                    s.save()
                    ret_values[ENTRY_NEW] += 1
                    cache.set('updated_feed:%s' % self.id, 1)
                except (IntegrityError, OperationError):
                    ret_values[ENTRY_ERR] += 1
            elif existing_story and story_has_changed:
                # Story changed: diff new content against the stored copy.
                original_content = None
                if existing_story.get('story_original_content_z'):
                    original_content = zlib.decompress(existing_story.get('story_original_content_z'))
                elif existing_story.get('story_content_z'):
                    original_content = zlib.decompress(existing_story.get('story_content_z'))
                # BUGFIX: guard against story_content being None (entry with
                # neither content nor summary) — len(None) raised TypeError.
                # This matches the sibling implementation's
                # `story_content and len(story_content) > 10` check.
                if story_content and len(story_content) > 10:
                    diff = HTMLDiff(unicode(original_content), story_content)
                    story_content_diff = diff.getDiff()
                else:
                    # Too little new content to diff; keep the original.
                    story_content_diff = original_content
                if existing_story.get('story_title') != story.get('title'):
                    # Title changed; intentionally no action taken here.
                    pass
                existing_story['story_feed'] = self.pk
                existing_story['story_date'] = story.get('published')
                existing_story['story_title'] = story.get('title')
                existing_story['story_content'] = story_content_diff
                existing_story['story_original_content'] = original_content
                existing_story['story_author_name'] = story.get('author')
                existing_story['story_permalink'] = story.get('link')
                existing_story['story_guid'] = story.get('guid') or story.get('id') or story.get('link')
                existing_story['story_tags'] = story_tags
                try:
                    db.stories.update({'_id': existing_story['_id']}, existing_story)
                    ret_values[ENTRY_UPDATED] += 1
                    cache.set('updated_feed:%s' % self.id, 1)
                except (IntegrityError, OperationError):
                    ret_values[ENTRY_ERR] += 1
            else:
                # Story matched an existing one and has not changed.
                ret_values[ENTRY_SAME] += 1
    return ret_values
示例4: add_update_stories
def add_update_stories(self, stories, existing_stories):
    """Insert new stories and refresh changed ones for this feed.

    Each story is pre-processed and matched against ``existing_stories``
    through ``self._exists_story``; new ones are saved as MStory
    documents and changed ones are updated in place.

    Returns a dict of counters keyed by ENTRY_NEW / ENTRY_UPDATED /
    ENTRY_SAME / ENTRY_ERR.
    """
    counts = {ENTRY_NEW: 0, ENTRY_UPDATED: 0, ENTRY_SAME: 0, ENTRY_ERR: 0}
    for raw_entry in stories:
        story = pre_process_story(raw_entry)
        # Entries without a title are skipped entirely.
        if not story.get("title"):
            continue
        contents = story.get("content")
        tags = self.get_tags(story)
        content = contents[0]["value"] if contents is not None else story.get("summary")
        matched_story, has_changed = self._exists_story(story, content, existing_stories)
        if matched_story is None:
            # Brand new story: build and persist an MStory document.
            fresh = MStory(
                story_feed_id=self.pk,
                story_date=story.get("published"),
                story_title=story.get("title"),
                story_content=content,
                story_author_name=story.get("author"),
                story_permalink=story.get("link"),
                story_guid=story.get("guid") or story.get("id") or story.get("link"),
                story_tags=tags,
            )
            try:
                fresh.save()
                counts[ENTRY_NEW] += 1
                cache.set("updated_feed:%s" % self.id, 1)
            except (IntegrityError, OperationError):
                counts[ENTRY_ERR] += 1
        elif matched_story and has_changed:
            # Changed story: diff the incoming content against the
            # decompressed stored copy, preferring the original content.
            previous_content = None
            if matched_story.story_original_content_z:
                previous_content = zlib.decompress(matched_story.story_original_content_z)
            elif matched_story.story_content_z:
                previous_content = zlib.decompress(matched_story.story_content_z)
            if content and len(content) > 10:
                merged_content = HTMLDiff(unicode(previous_content), content).getDiff()
            else:
                # Too little new content to bother diffing.
                merged_content = previous_content
            if matched_story.story_title != story.get("title"):
                # Title changed; intentionally no action taken here.
                pass
            matched_story.story_feed = self.pk
            matched_story.story_date = story.get("published")
            matched_story.story_title = story.get("title")
            matched_story.story_content = merged_content
            matched_story.story_original_content = previous_content
            matched_story.story_author_name = story.get("author")
            matched_story.story_permalink = story.get("link")
            matched_story.story_guid = story.get("guid") or story.get("id") or story.get("link")
            matched_story.story_tags = tags
            try:
                matched_story.save()
                counts[ENTRY_UPDATED] += 1
                cache.set("updated_feed:%s" % self.id, 1)
            except (IntegrityError, OperationError):
                counts[ENTRY_ERR] += 1
                logging.info(
                    "Saving updated story, IntegrityError: %s - %s" % (self.feed_title, story.get("title"))
                )
        else:
            # Matched an existing story with no detectable change.
            counts[ENTRY_SAME] += 1
    return counts
示例5: process
#.........这里部分代码省略.........
if self.fpf.status >= 400:
logging.debug(" ---> [%-30s] HTTP Status code: %s. Checking address..." % (unicode(self.feed)[:30], self.fpf.status))
fixed_feed = self.feed.check_feed_address_for_feed_link()
if not fixed_feed:
self.feed.save_feed_history(self.fpf.status, "HTTP Error")
else:
self.feed.schedule_feed_fetch_immediately()
self.feed.save()
return FEED_ERRHTTP, ret_values
if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
if not self.fpf.entries:
logging.debug(" ---> [%-30s] Feed is Non-XML. %s entries. Checking address..." % (unicode(self.feed)[:30], len(self.fpf.entries)))
fixed_feed = self.feed.check_feed_address_for_feed_link()
if not fixed_feed:
self.feed.save_feed_history(502, 'Non-xml feed', self.fpf.bozo_exception)
else:
self.feed.schedule_feed_fetch_immediately()
self.feed.save()
return FEED_ERRPARSE, ret_values
elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
logging.debug(" ---> [%-30s] Feed is Bad XML (SAX). %s entries. Checking address..." % (unicode(self.feed)[:30], len(self.fpf.entries)))
if not self.fpf.entries:
fixed_feed = self.feed.check_feed_address_for_feed_link()
if not fixed_feed:
self.feed.save_feed_history(503, 'SAX Exception', self.fpf.bozo_exception)
else:
self.feed.schedule_feed_fetch_immediately()
self.feed.save()
return FEED_ERRPARSE, ret_values
# the feed has changed (or it is the first time we parse it)
# saving the etag and last_modified fields
self.feed.etag = self.fpf.get('etag')
if self.feed.etag:
self.feed.etag = self.feed.etag[:255]
# some times this is None (it never should) *sigh*
if self.feed.etag is None:
self.feed.etag = ''
try:
self.feed.last_modified = mtime(self.fpf.modified)
except:
pass
self.fpf.entries = self.fpf.entries[:50]
self.feed.feed_title = self.fpf.feed.get('title', self.feed.feed_title)
tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline)
if tagline:
self.feed.data.feed_tagline = utf8encode(tagline)
self.feed.data.save()
self.feed.feed_link = self.fpf.feed.get('link') or self.fpf.feed.get('id') or self.feed.feed_link
self.feed.last_update = datetime.datetime.utcnow()
guids = []
for entry in self.fpf.entries:
if entry.get('id', ''):
guids.append(entry.get('id', ''))
elif entry.get('link'):
guids.append(entry.link)
elif entry.get('title'):
guids.append(entry.title)
self.feed.save()
# Compare new stories to existing stories, adding and updating
start_date = datetime.datetime.utcnow()
# end_date = datetime.datetime.utcnow()
story_guids = []
for entry in self.fpf.entries:
story = pre_process_story(entry)
if story.get('published') < start_date:
start_date = story.get('published')
# if story.get('published') > end_date:
# end_date = story.get('published')
story_guids.append(story.get('guid') or story.get('link'))
existing_stories = MStory.objects(
# story_guid__in=story_guids,
story_date__gte=start_date,
story_feed_id=self.feed.pk
).limit(len(story_guids))
# MStory.objects(
# (Q(story_date__gte=start_date) & Q(story_date__lte=end_date))
# | (Q(story_guid__in=story_guids)),
# story_feed=self.feed
# ).order_by('-story_date')
ret_values = self.feed.add_update_stories(self.fpf.entries, existing_stories)
logging.debug(u' ---> [%-30s] Parsed Feed: %s' % (
unicode(self.feed)[:30],
u' '.join(u'%s=%d' % (self.entry_trans[key],
ret_values[key]) for key in self.entry_keys),))
self.feed.update_all_statistics()
self.feed.trim_feed()
self.feed.save_feed_history(200, "OK")
return FEED_OK, ret_values
示例6: process
#.........这里部分代码省略.........
original_tagline = self.feed.data.feed_tagline
self.feed.data.feed_tagline = smart_unicode(tagline)
if self.feed.data.feed_tagline != original_tagline:
self.feed.data.save(update_fields=['feed_tagline'])
if not self.feed.feed_link_locked:
new_feed_link = self.fpf.feed.get('link') or self.fpf.feed.get('id') or self.feed.feed_link
if new_feed_link != self.feed.feed_link:
logging.debug(" ---> [%-30s] ~SB~FRFeed's page is different: %s to %s" % (self.feed.title[:30], self.feed.feed_link, new_feed_link))
redirects, non_redirects = self.feed.count_redirects_in_history('page')
self.feed.save_page_history(301, "HTTP Redirect (%s to go)" % (20-len(redirects)))
if len(redirects) >= 20 or len(non_redirects) == 0:
self.feed.feed_link = new_feed_link
self.feed.save(update_fields=['feed_link'])
# Determine if stories aren't valid and replace broken guids
guids_seen = set()
permalinks_seen = set()
for entry in self.fpf.entries:
guids_seen.add(entry.get('guid'))
permalinks_seen.add(Feed.get_permalink(entry))
guid_difference = len(guids_seen) != len(self.fpf.entries)
single_guid = len(guids_seen) == 1
replace_guids = single_guid and guid_difference
permalink_difference = len(permalinks_seen) != len(self.fpf.entries)
single_permalink = len(permalinks_seen) == 1
replace_permalinks = single_permalink and permalink_difference
# Compare new stories to existing stories, adding and updating
start_date = datetime.datetime.utcnow()
story_hashes = []
stories = []
for entry in self.fpf.entries:
story = pre_process_story(entry)
if story.get('published') < start_date:
start_date = story.get('published')
if replace_guids:
if replace_permalinks:
new_story_guid = unicode(story.get('published'))
if self.options['verbose']:
logging.debug(u' ---> [%-30s] ~FBReplacing guid (%s) with timestamp: %s' % (
self.feed.title[:30],
story.get('guid'), new_story_guid))
story['guid'] = new_story_guid
else:
new_story_guid = Feed.get_permalink(story)
if self.options['verbose']:
logging.debug(u' ---> [%-30s] ~FBReplacing guid (%s) with permalink: %s' % (
self.feed.title[:30],
story.get('guid'), new_story_guid))
story['guid'] = new_story_guid
story['story_hash'] = MStory.feed_guid_hash_unsaved(self.feed.pk, story.get('guid'))
stories.append(story)
story_hashes.append(story.get('story_hash'))
existing_stories = dict((s.story_hash, s) for s in MStory.objects(
story_hash__in=story_hashes,
# story_date__gte=start_date,
# story_feed_id=self.feed.pk
))
ret_values = self.feed.add_update_stories(stories, existing_stories,
verbose=self.options['verbose'],
updates_off=self.options['updates_off'])
if (hasattr(self.fpf, 'feed') and
示例7: process
#.........这里部分代码省略.........
self.feed = self.feed.save()
return FEED_ERRPARSE, ret_values
# the feed has changed (or it is the first time we parse it)
# saving the etag and last_modified fields
self.feed.etag = self.fpf.get('etag')
if self.feed.etag:
self.feed.etag = self.feed.etag[:255]
# some times this is None (it never should) *sigh*
if self.feed.etag is None:
self.feed.etag = ''
try:
self.feed.last_modified = mtime(self.fpf.modified)
except:
self.feed.last_modified = None
pass
self.fpf.entries = self.fpf.entries[:100]
if self.fpf.feed.get('title'):
self.feed.feed_title = strip_tags(self.fpf.feed.get('title'))
self.feed.feed_link = self.fpf.feed.get('link') or self.fpf.feed.get('id') or self.feed.feed_link
self.feed = self.feed.save()
# Determine if stories aren't valid and replace broken guids
# if guid is single among many entries:
# if permalink also is single among many entries:
# replace the guid with published
# else if permalink is not:
# replace the guid with permalink
guids_seen = set()
permalinks_seen = set()
for entry in self.fpf.entries:
guids_seen.add(entry.get('guid'))
permalinks_seen.add(Feed.get_permalink(entry))
guid_difference = len(guids_seen) != len(self.fpf.entries) # means guid is duplicated.
single_guid = len(guids_seen) == 1
replace_guids = single_guid and guid_difference # means guid is single but entries not.
permalink_difference = len(permalinks_seen) != len(self.fpf.entries)
single_permalink = len(permalinks_seen) == 1
replace_permalinks = single_permalink and permalink_difference
# Compare new stories to existing stories, adding and updating
start_date = datetime.datetime.utcnow()
story_hashes = []
stories = []
for entry in self.fpf.entries:
story = pre_process_story(entry)
if story.get('published') < start_date:
start_date = story.get('published')
if replace_guids:
if replace_permalinks:
new_story_guid = unicode(story.get('published'))
if self.options['verbose']:
logging.debug(u' ---> [%-30s] ~FBReplacing guid (%s) with timestamp: %s' % (
self.feed.title[:30],
story.get('guid'), new_story_guid))
story['guid'] = new_story_guid
else:
new_story_guid = Feed.get_permalink(story)
if self.options['verbose']:
logging.debug(u' ---> [%-30s] ~FBReplacing guid (%s) with permalink: %s' % (
self.feed.title[:30],
story.get('guid'), new_story_guid))
story['guid'] = new_story_guid
story['story_hash'] = MStory.feed_guid_hash_unsaved(self.feed.pk, story.get('guid'))
stories.append(story)
story_hashes.append(story.get('story_hash'))
# find the existing_stories with story_hash in story_hashes.
existing_stories = dict((s.story_hash, s) for s in MStory.objects(
story_hash__in=story_hashes,
# story_date__gte=start_date,
# story_feed_id=self.feed.pk
))
ret_values = self.feed.add_update_stories(stories, existing_stories,
verbose=self.options['verbose'],)
logging.debug(u' ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s' % (
self.feed.title[:30],
'~FG~SB' if ret_values['new'] else '', ret_values['new'],
'~FY~SB' if ret_values['updated'] else '', ret_values['updated'],
'~SB' if ret_values['same'] else '', ret_values['same'],
'~FR~SB' if ret_values['error'] else '', ret_values['error'],
len(self.fpf.entries)))
# If there is new story, update all statistics
self.feed.update_all_statistics(full=bool(ret_values['new']))
self.feed.save_feed_history(200, "OK")
if self.options['verbose']:
logging.debug(u' ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss' % (
self.feed.title[:30], time.time() - start))
return FEED_OK, ret_values
示例8: process
#.........这里部分代码省略.........
# the feed has changed (or it is the first time we parse it)
# saving the etag and last_modified fields
self.feed.etag = self.fpf.get("etag")
if self.feed.etag:
self.feed.etag = self.feed.etag[:255]
# some times this is None (it never should) *sigh*
if self.feed.etag is None:
self.feed.etag = ""
try:
self.feed.last_modified = mtime(self.fpf.modified)
except:
self.feed.last_modified = None
pass
self.fpf.entries = self.fpf.entries[:50]
if self.fpf.feed.get("title"):
self.feed.feed_title = self.fpf.feed.get("title")
tagline = self.fpf.feed.get("tagline", self.feed.data.feed_tagline)
if tagline:
self.feed.data.feed_tagline = utf8encode(tagline)
self.feed.data.save()
if not self.feed.feed_link_locked:
self.feed.feed_link = self.fpf.feed.get("link") or self.fpf.feed.get("id") or self.feed.feed_link
self.feed = self.feed.save()
# Compare new stories to existing stories, adding and updating
start_date = datetime.datetime.utcnow()
story_guids = []
stories = []
for entry in self.fpf.entries:
story = pre_process_story(entry)
if story.get("published") < start_date:
start_date = story.get("published")
stories.append(story)
story_guids.append(story.get("guid"))
existing_stories = dict(
(s.story_guid, s)
for s in MStory.objects(
# story_guid__in=story_guids,
story_date__gte=start_date,
story_feed_id=self.feed.pk,
).limit(max(int(len(story_guids) * 1.5), 10))
)
ret_values = self.feed.add_update_stories(stories, existing_stories, verbose=self.options["verbose"])
if hasattr(self.fpf, "feed") and hasattr(self.fpf.feed, "links") and self.fpf.feed.links:
hub_url = None
self_url = self.feed.feed_address
for link in self.fpf.feed.links:
if link["rel"] == "hub" and not hub_url:
hub_url = link["href"]
elif link["rel"] == "self":
self_url = link["href"]
push_expired = self.feed.is_push and self.feed.push.lease_expires < datetime.datetime.now()
if (
hub_url
and self_url
and not settings.DEBUG
and self.feed.active_subscribers > 0
and (push_expired or not self.feed.is_push or self.options.get("force"))
):
示例9: process
def process(self):
""" Downloads and parses a feed.
"""
ret_values = {
ENTRY_NEW:0,
ENTRY_UPDATED:0,
ENTRY_SAME:0,
ENTRY_ERR:0}
# logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, self.feed.feed_title))
if hasattr(self.fpf, 'status'):
if self.options['verbose']:
logging.debug(u' ---> [%-30s] Fetched feed, HTTP status %d: %s (bozo: %s)' % (unicode(self.feed)[:30],
self.fpf.status,
self.feed.feed_address,
self.fpf.bozo))
if self.fpf.bozo and self.fpf.status != 304:
logging.debug(u' ---> [%-30s] BOZO exception: %s' % (
unicode(self.feed)[:30],
self.fpf.bozo_exception,))
if self.fpf.status == 304:
self.feed.save()
self.feed.save_feed_history(304, "Not modified")
return FEED_SAME, ret_values
if self.fpf.status >= 400:
self.feed.save()
self.feed.save_feed_history(self.fpf.status, "HTTP Error")
return FEED_ERRHTTP, ret_values
if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
if not self.fpf.entries:
logging.debug(" ---> [%-30s] Feed is Non-XML. Checking address..." % unicode(self.feed)[:30])
fixed_feed = self.feed.check_feed_address_for_feed_link()
if not fixed_feed:
self.feed.save_feed_history(502, 'Non-xml feed', self.fpf.bozo_exception)
return FEED_ERRPARSE, ret_values
elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
logging.debug(" ---> [%-30s] Feed is Bad XML (SAX). Checking address..." % unicode(self.feed)[:30])
if not self.fpf.entries:
fixed_feed = self.feed.check_feed_address_for_feed_link()
if not fixed_feed:
self.feed.save_feed_history(503, 'SAX Exception', self.fpf.bozo_exception)
return FEED_ERRPARSE, ret_values
# the feed has changed (or it is the first time we parse it)
# saving the etag and last_modified fields
self.feed.etag = self.fpf.get('etag')
if self.feed.etag:
self.feed.etag = self.feed.etag[:255]
# some times this is None (it never should) *sigh*
if self.feed.etag is None:
self.feed.etag = ''
try:
self.feed.last_modified = mtime(self.fpf.modified)
except:
pass
self.feed.feed_title = self.fpf.feed.get('title', self.feed.feed_title)
self.feed.feed_tagline = self.fpf.feed.get('tagline', self.feed.feed_tagline)
self.feed.feed_link = self.fpf.feed.get('link', self.feed.feed_link)
self.feed.last_update = datetime.datetime.now()
guids = []
for entry in self.fpf.entries:
if entry.get('id', ''):
guids.append(entry.get('id', ''))
elif entry.title:
guids.append(entry.title)
elif entry.link:
guids.append(entry.link)
self.lock.acquire()
try:
self.feed.save()
finally:
self.lock.release()
# Compare new stories to existing stories, adding and updating
# start_date = datetime.datetime.now()
# end_date = datetime.datetime.now()
story_guids = []
for entry in self.fpf.entries:
story = pre_process_story(entry)
# if story.get('published') < start_date:
# start_date = story.get('published')
# if story.get('published') > end_date:
# end_date = story.get('published')
story_guids.append(story.get('guid') or story.get('link'))
existing_stories = self.db.stories.find({
'story_feed_id': self.feed.pk,
# 'story_date': {'$gte': start_date},
'story_guid': {'$in': story_guids}
}).limit(len(story_guids))
# MStory.objects(
# (Q(story_date__gte=start_date) & Q(story_date__lte=end_date))
# | (Q(story_guid__in=story_guids)),
#.........这里部分代码省略.........
示例10: pre_process_story
guids.append(entry.title)
<<<<<<< HEAD
self.feed.save()
self.refresh_feed()
>>>>>>> Refreshing feed on fetch.
=======
self.feed = self.feed.save()
>>>>>>> Fixing errors in timeouts to show the correct error. Also fixing microformats parsing issue and allow IPv6 URLs in enclosures to be ignored, fixing a bunch of feeds.
# Compare new stories to existing stories, adding and updating
start_date = datetime.datetime.utcnow()
story_guids = []
stories = []
for entry in self.fpf.entries:
story = pre_process_story(entry)
if story.get('published') < start_date:
start_date = story.get('published')
stories.append(story)
story_guids.append(story.get('guid'))
existing_stories = list(MStory.objects(
# story_guid__in=story_guids,
story_date__gte=start_date,
story_feed_id=self.feed.pk
).limit(max(int(len(story_guids)*1.5), 10)))
ret_values = self.feed.add_update_stories(stories, existing_stories,
verbose=self.options['verbose'])
if ((not self.feed.is_push or self.options.get('force'))
示例11: process
#.........这里部分代码省略.........
if self.fpf.status >= 400:
self.feed.save()
self.feed.save_feed_history(self.fpf.status, "HTTP Error")
return FEED_ERRHTTP, ret_values
if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
if not self.fpf.entries:
logging.debug(
" ---> [%-30s] Feed is Non-XML. %s entries. Checking address..."
% (unicode(self.feed)[:30], len(self.fpf.entries))
)
fixed_feed = self.feed.check_feed_address_for_feed_link()
if not fixed_feed:
self.feed.save_feed_history(502, "Non-xml feed", self.fpf.bozo_exception)
else:
self.feed.schedule_feed_fetch_immediately()
self.feed.save()
return FEED_ERRPARSE, ret_values
elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
logging.debug(
" ---> [%-30s] Feed is Bad XML (SAX). %s entries. Checking address..."
% (unicode(self.feed)[:30], len(self.fpf.entries))
)
if not self.fpf.entries:
fixed_feed = self.feed.check_feed_address_for_feed_link()
if not fixed_feed:
self.feed.save_feed_history(503, "SAX Exception", self.fpf.bozo_exception)
else:
self.feed.schedule_feed_fetch_immediately()
self.feed.save()
return FEED_ERRPARSE, ret_values
# the feed has changed (or it is the first time we parse it)
# saving the etag and last_modified fields
self.feed.etag = self.fpf.get("etag")
if self.feed.etag:
self.feed.etag = self.feed.etag[:255]
# some times this is None (it never should) *sigh*
if self.feed.etag is None:
self.feed.etag = ""
try:
self.feed.last_modified = mtime(self.fpf.modified)
except:
pass
self.feed.feed_title = self.fpf.feed.get("title", self.feed.feed_title)
self.feed.feed_tagline = self.fpf.feed.get("tagline", self.feed.feed_tagline)
self.feed.feed_link = self.fpf.feed.get("link", self.feed.feed_link)
self.feed.last_update = datetime.datetime.utcnow()
guids = []
for entry in self.fpf.entries:
if entry.get("id", ""):
guids.append(entry.get("id", ""))
elif entry.title:
guids.append(entry.title)
elif entry.link:
guids.append(entry.link)
self.feed.save()
# Compare new stories to existing stories, adding and updating
# start_date = datetime.datetime.utcnow()
# end_date = datetime.datetime.utcnow()
story_guids = []
for entry in self.fpf.entries:
story = pre_process_story(entry)
# if story.get('published') < start_date:
# start_date = story.get('published')
# if story.get('published') > end_date:
# end_date = story.get('published')
story_guids.append(story.get("guid") or story.get("link"))
existing_stories = settings.MONGODB.stories.find(
{
"story_feed_id": self.feed.pk,
# 'story_date': {'$gte': start_date},
"story_guid": {"$in": story_guids},
}
).limit(len(story_guids))
# MStory.objects(
# (Q(story_date__gte=start_date) & Q(story_date__lte=end_date))
# | (Q(story_guid__in=story_guids)),
# story_feed=self.feed
# ).order_by('-story_date')
ret_values = self.feed.add_update_stories(self.fpf.entries, existing_stories)
logging.debug(
u" ---> [%-30s] Parsed Feed: %s"
% (
unicode(self.feed)[:30],
u" ".join(u"%s=%d" % (self.entry_trans[key], ret_values[key]) for key in self.entry_keys),
)
)
self.feed.update_all_statistics(lock=self.lock)
self.feed.trim_feed()
self.feed.save_feed_history(200, "OK")
return FEED_OK, ret_values