This article collects typical usage examples of the Python method mediawords.db.DatabaseHandler.require_by_id. If you have been wondering what exactly DatabaseHandler.require_by_id does, how to call it, or what it looks like in real code, the curated method examples below may help. You can also explore further usage examples of the containing class, mediawords.db.DatabaseHandler.
Below are 4 code examples of the DatabaseHandler.require_by_id method, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
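All of the examples share the same contract: require_by_id(table, row_id) looks a single row up by the table's primary key and returns it as a dict, raising an exception instead of returning None when no row matches (find_by_id is the non-raising variant). A minimal usage sketch, assuming a configured MediaWords database and a hypothetical media id:

from mediawords.db import connect_to_db

db = connect_to_db()  # assumes database credentials are configured in mediawords.yml

# Raises (rather than returning None) if no media row has this primary key.
medium = db.require_by_id('media', 12345)  # 12345 is a hypothetical media_id
print(medium['name'])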
Example 1: _get_deduped_medium
# Required import: from mediawords.db import DatabaseHandler [as alias]
# Or: from mediawords.db.DatabaseHandler import require_by_id [as alias]
def _get_deduped_medium(db: DatabaseHandler, media_id: int) -> dict:
    """Get either the referenced medium or the deduped version of the medium by recursively following dup_media_id."""
    medium = db.require_by_id('media', media_id)
    if medium['dup_media_id'] is None:
        return medium
    else:
        return _get_deduped_medium(db, medium['dup_media_id'])
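To make the recursion concrete, consider a hypothetical dup chain, again assuming a DatabaseHandler db as above: medium 1 was deduplicated into medium 2, and medium 2 is the canonical row.

# Hypothetical rows in the media table:
#   media_id=1, dup_media_id=2     (medium 1 was deduplicated into medium 2)
#   media_id=2, dup_media_id=None  (canonical medium)
medium = _get_deduped_medium(db, 1)
assert medium['media_id'] == 2  # the chain is followed to the canonical row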
Example 2: fetch_topic_tweets
# Required import: from mediawords.db import DatabaseHandler [as alias]
# Or: from mediawords.db.DatabaseHandler import require_by_id [as alias]
def fetch_topic_tweets(
        db: DatabaseHandler,
        topics_id: int,
        twitter_class: typing.Type[AbstractTwitter] = Twitter,
        ch_class: typing.Type[AbstractCrimsonHexagon] = CrimsonHexagon) -> None:
    """
    Fetch the list of tweets within a Crimson Hexagon monitor based on the ch_monitor_id of the given topic.

    Crimson Hexagon returns up to 10k randomly sampled tweets per posts fetch, and each posts fetch can be restricted
    down to a single day. This call fetches tweets from CH day by day, up to a total of 1 million tweets for a single
    topic for the whole date range combined. The call normalizes the number of tweets returned for each day so that
    each day has the same percentage of all tweets found on that day. So if there were 20,000 tweets found on the
    busiest day, each day will use at most 50% of the returned tweets for the day.

    One call to this function takes care of both fetching the list of all tweets from CH and fetching each of those
    tweets from twitter (CH does not provide the tweet content, only the url). Each day's worth of tweets will be
    recorded in topic_tweet_days, and subsequent calls to the function will not refetch a given day for a given topic,
    but each call will fetch any days newly included in the date range of the topic if the topic dates change.

    If there is no ch_monitor_id for the topic, do nothing.

    Arguments:
    db - db handle
    topics_id - topic id
    twitter_class - optional implementation of AbstractTwitter class;
        defaults to one that fetches data from twitter with config from mediawords.yml
    ch_class - optional implementation of AbstractCrimsonHexagon class;
        defaults to one that fetches data from crimson hexagon with config from mediawords.yml

    Return:
    None
    """
    topic = db.require_by_id('topics', topics_id)
    ch_monitor_id = topic['ch_monitor_id']

    if ch_monitor_id is None:
        log.debug("returning after noop because topic %s has a null ch_monitor_id" % topics_id)
        return

    _add_topic_tweet_days(db, topic, twitter_class, ch_class)
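The per-day normalization described in the docstring can be made concrete with a small calculation. This sketch uses hypothetical daily counts, not library code: the busiest day fixes a shared sampling fraction (10,000 returned out of 20,000 found is 50%), and that same fraction is then applied to every day.

# Hypothetical tweet counts found by Crimson Hexagon per day.
tweets_found_per_day = {'2019-01-01': 5000, '2019-01-02': 20000, '2019-01-03': 8000}

MAX_TWEETS_PER_FETCH = 10000  # CH returns at most 10k sampled tweets per posts fetch

# The busiest day caps the shared fraction: 10000 / 20000 = 0.5.
sample_fraction = min(1.0, MAX_TWEETS_PER_FETCH / max(tweets_found_per_day.values()))

# Every day keeps the same percentage of the tweets found on that day.
tweets_to_keep = {day: int(found * sample_fraction) for day, found in tweets_found_per_day.items()}
# {'2019-01-01': 2500, '2019-01-02': 10000, '2019-01-03': 4000}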
Example 3: fetch_topic_url
# Required import: from mediawords.db import DatabaseHandler [as alias]
# Or: from mediawords.db.DatabaseHandler import require_by_id [as alias]
def fetch_topic_url(db: DatabaseHandler, topic_fetch_urls_id: int, domain_timeout: typing.Optional[int] = None) -> None:
    """Fetch a url for a topic and create a media cloud story from it if its content matches the topic pattern.

    Update the following fields in the topic_fetch_urls row:

    code - the status code of the http response
    fetch_date - the current time
    state - one of the FETCH_STATE_* constants
    message - message related to the state (eg. HTTP message for FETCH_STATE_REQUEST_FAILED)
    stories_id - the id of the story generated from the fetched content, or null if no story created

    If topic_links_id is present in the topic_fetch_url and if a story was added or matched, assign the resulting
    topic_fetch_urls.stories_id to topic_links.ref_stories_id.

    If the state is anything but FETCH_STATE_PENDING or FETCH_STATE_REQUEUED, return without doing anything.

    If there is content for the corresponding url and topics_id in topic_seed_urls, use that content instead of
    fetching the url.

    This function catches almost all possible exceptions and stashes them in topic_fetch_urls along with a state of
    FETCH_STATE_PYTHON_ERROR.

    Arguments:
    db - db handle
    topic_fetch_urls_id - id of topic_fetch_urls row
    domain_timeout - pass through to fetch_link

    Returns:
    None
    """
    topic_fetch_url = db.require_by_id('topic_fetch_urls', topic_fetch_urls_id)

    try:
        log.info("fetch_link: %s" % topic_fetch_url['url'])
        _try_fetch_topic_url(db=db, topic_fetch_url=topic_fetch_url, domain_timeout=domain_timeout)

        if topic_fetch_url['topic_links_id'] and topic_fetch_url['stories_id']:
            try_update_topic_link_ref_stories_id(db, topic_fetch_url)

        if 'stories_id' in topic_fetch_url and topic_fetch_url['stories_id'] is not None:
            story = db.require_by_id('stories', topic_fetch_url['stories_id'])
            topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
            redirect_url = topic_fetch_url['url']
            assume_match = topic_fetch_url['assume_match']
            if _is_not_topic_story(db, topic_fetch_url):
                if _story_matches_topic(db, story, topic, redirect_url=redirect_url, assume_match=assume_match):
                    mediawords.tm.stories.add_to_topic_stories(db, story, topic)

            if topic_fetch_url['topic_links_id'] and topic_fetch_url['stories_id']:
                try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    except McThrottledDomainException as ex:
        # re-raise throttling errors so that the caller can requeue the fetch
        raise ex

    except Exception as ex:
        log.error("Error while fetching URL {}: {}".format(topic_fetch_url, ex))
        topic_fetch_url['state'] = FETCH_STATE_PYTHON_ERROR
        topic_fetch_url['message'] = traceback.format_exc()
        log.warning('topic_fetch_url %s failed: %s' % (topic_fetch_url['url'], topic_fetch_url['message']))

    db.update_by_id('topic_fetch_urls', topic_fetch_url['topic_fetch_urls_id'], topic_fetch_url)
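The docstring's note that topic_fetch_urls.stories_id gets assigned to topic_links.ref_stories_id hints at what try_update_topic_link_ref_stories_id does. Here is a minimal sketch of that assignment under that assumption; the real helper is not shown in this listing and likely adds extra error handling.

# Assumed behavior of try_update_topic_link_ref_stories_id, sketched from the
# docstring above; not the library's actual implementation.
def _sketch_try_update_topic_link_ref_stories_id(db: DatabaseHandler, topic_fetch_url: dict) -> None:
    # copy the matched or generated story id onto the topic link that referenced this url
    db.update_by_id(
        'topic_links',
        topic_fetch_url['topic_links_id'],
        {'ref_stories_id': topic_fetch_url['stories_id']})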
Example 4: _try_fetch_topic_url
# Required import: from mediawords.db import DatabaseHandler [as alias]
# Or: from mediawords.db.DatabaseHandler import require_by_id [as alias]
def _try_fetch_topic_url(
        db: DatabaseHandler,
        topic_fetch_url: dict,
        domain_timeout: typing.Optional[int] = None) -> None:
    """Implement the logic of fetch_topic_url without the try: or the topic_fetch_url update."""
    log.warning("_try_fetch_topic_url: %s" % topic_fetch_url['url'])

    # don't reprocess already processed urls
    if topic_fetch_url['state'] not in (FETCH_STATE_PENDING, FETCH_STATE_REQUEUED):
        return

    _update_tfu_message(db, topic_fetch_url, "checking ignore links")
    if _ignore_link_pattern(topic_fetch_url['url']):
        topic_fetch_url['state'] = FETCH_STATE_IGNORED
        topic_fetch_url['code'] = 403
        return

    _update_tfu_message(db, topic_fetch_url, "checking failed url")
    failed_url = _get_failed_url(db, topic_fetch_url['topics_id'], topic_fetch_url['url'])
    if failed_url:
        topic_fetch_url['state'] = failed_url['state']
        topic_fetch_url['code'] = failed_url['code']
        topic_fetch_url['message'] = failed_url['message']
        return

    _update_tfu_message(db, topic_fetch_url, "checking self linked domain")
    if mediawords.tm.domains.skip_self_linked_domain(db, topic_fetch_url):
        topic_fetch_url['state'] = FETCH_STATE_SKIPPED
        topic_fetch_url['code'] = 403
        return

    topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
    topic_fetch_url['fetch_date'] = datetime.datetime.now()

    story_match = None

    # this match is relatively expensive, so only do it on the first 'pending' request and not the potentially
    # spammy 'requeued' requests
    _update_tfu_message(db, topic_fetch_url, "checking story match")
    if topic_fetch_url['state'] == FETCH_STATE_PENDING:
        story_match = mediawords.tm.stories.get_story_match(db=db, url=topic_fetch_url['url'])

        # try to match the story before doing the expensive fetch
        if story_match is not None:
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            topic_fetch_url['code'] = 200
            topic_fetch_url['stories_id'] = story_match['stories_id']
            return

    # check whether we want to delay fetching for another job, eg. fetch_twitter_urls
    pending_state = _get_pending_state(topic_fetch_url)
    if pending_state:
        topic_fetch_url['state'] = pending_state
        return

    # get content from either the seed or by fetching it
    _update_tfu_message(db, topic_fetch_url, "checking seeded content")
    response = _get_seeded_content(db, topic_fetch_url)
    if response is None:
        _update_tfu_message(db, topic_fetch_url, "fetching content")
        response = _fetch_url(db, topic_fetch_url['url'], domain_timeout=domain_timeout)
        log.debug("%d response returned for url: %s" % (response.code, topic_fetch_url['url']))
    else:
        log.debug("seeded content found for url: %s" % topic_fetch_url['url'])

    content = response.content

    fetched_url = topic_fetch_url['url']
    response_url = response.last_requested_url

    if fetched_url != response_url:
        if _ignore_link_pattern(response_url):
            topic_fetch_url['state'] = FETCH_STATE_IGNORED
            topic_fetch_url['code'] = 403
            return

        _update_tfu_message(db, topic_fetch_url, "checking story match for redirect_url")
        story_match = mediawords.tm.stories.get_story_match(db=db, url=fetched_url, redirect_url=response_url)

    topic_fetch_url['code'] = response.code

    assume_match = topic_fetch_url['assume_match']

    _update_tfu_message(db, topic_fetch_url, "checking content match")
    if not response.is_success:
        topic_fetch_url['state'] = FETCH_STATE_REQUEST_FAILED
        topic_fetch_url['message'] = response.message
    elif story_match is not None:
        topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
        topic_fetch_url['stories_id'] = story_match['stories_id']
    elif not content_matches_topic(content=content, topic=topic, assume_match=assume_match):
        topic_fetch_url['state'] = FETCH_STATE_CONTENT_MATCH_FAILED
    else:
        try:
            _update_tfu_message(db, topic_fetch_url, "generating story")
            url = response_url if response_url is not None else fetched_url
            story = mediawords.tm.stories.generate_story(db=db, content=content, url=url)

            topic_fetch_url['stories_id'] = story['stories_id']
# ......... remainder of the code omitted here .........
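Taken together, Examples 3 and 4 drive a small state machine over topic_fetch_urls.state. The mapping below summarizes what each FETCH_STATE_* constant means as used in the listings above; the constants' actual values are not shown in this listing, so treat this dict as documentation only.

# Summary derived from the listings above; documentation only, not library code.
FETCH_STATE_MEANINGS = {
    'FETCH_STATE_PENDING': 'eligible for processing (first attempt)',
    'FETCH_STATE_REQUEUED': 'eligible for processing (retry)',
    'FETCH_STATE_IGNORED': 'url matches the ignore-link pattern (code 403)',
    'FETCH_STATE_SKIPPED': 'skipped as a self-linked domain (code 403)',
    'FETCH_STATE_STORY_MATCH': 'an existing or generated story matched the url',
    'FETCH_STATE_REQUEST_FAILED': 'http fetch was unsuccessful',
    'FETCH_STATE_CONTENT_MATCH_FAILED': 'fetched content did not match the topic pattern',
    'FETCH_STATE_PYTHON_ERROR': 'an exception was caught and stashed in the row',
}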