当前位置: 首页>>代码示例>>Python>>正文


Python DatabaseHandler.require_by_id方法代码示例

本文整理汇总了Python中mediawords.db.DatabaseHandler.require_by_id方法的典型用法代码示例。如果您正苦于以下问题：Python DatabaseHandler.require_by_id方法的具体用法是什么？DatabaseHandler.require_by_id怎么用？有哪些使用DatabaseHandler.require_by_id的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类mediawords.db.DatabaseHandler的用法示例。


在下文中一共展示了DatabaseHandler.require_by_id方法的4个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: _get_deduped_medium

# 需要导入模块: from mediawords.db import DatabaseHandler [as 别名]
# 注意: require_by_id 是 DatabaseHandler 的实例方法, 不能单独导入, 需通过实例调用, 例如 db.require_by_id(...)
def _get_deduped_medium(db: DatabaseHandler, media_id: int) -> dict:
    """Return the medium for media_id, or the medium it ultimately points to through dup_media_id.

    The 'media' row is loaded with db.require_by_id().  If its dup_media_id is set, the lookup is repeated
    recursively for that id; the first medium whose dup_media_id is None is returned.
    """
    medium = db.require_by_id('media', media_id)
    dup_media_id = medium['dup_media_id']

    # follow the duplicate pointer until we reach a medium that is not marked as a duplicate
    if dup_media_id is not None:
        return _get_deduped_medium(db, dup_media_id)

    return medium
开发者ID:berkmancenter,项目名称:mediacloud,代码行数:9,代码来源:stories.py

示例2: fetch_topic_tweets

# 需要导入模块: from mediawords.db import DatabaseHandler [as 别名]
# 注意: require_by_id 是 DatabaseHandler 的实例方法, 不能单独导入, 需通过实例调用, 例如 db.require_by_id(...)
def fetch_topic_tweets(
        db: DatabaseHandler,
        topics_id: int,
        twitter_class: typing.Type[AbstractTwitter] = Twitter,
        ch_class: typing.Type[AbstractCrimsonHexagon] = CrimsonHexagon) -> None:
    """
    Fetch list of tweets within a Crimson Hexagon monitor based on the ch_monitor_id of the given topic.

    Crimson Hexagon returns up to 10k randomly sampled tweets per posts fetch, and each posts fetch can be restricted
    down to a single day.  This call fetches tweets from CH day by day, up to a total of 1 million tweets for a single
    topic for the whole date range combined.  The call normalizes the number of tweets returned for each day so that
    each day has the same percentage of all tweets found on that day.  So if there were 20,000 tweets found on the
    busiest day, each day will use at most 50% of the returned tweets for the day.

    One call to this function takes care of both fetching the list of all tweets from CH and fetching each of those
    tweets from twitter (CH does not provide the tweet content, only the url).  Each day's worth of tweets will be
    recorded in topic_tweet_days, and subsequent calls to the function will not refetch a given day for a given topic,
    but each call will fetch any days newly included in the date range of the topic given a topic dates change.

    If there is no ch_monitor_id for the topic, do nothing.

    Arguments:
    db - db handle
    topics_id - topic id
    twitter_class - optional implementation of AbstractTwitter class;
        default to one that fetches data from twitter with config from mediawords.yml
    ch_class - optional implementation of AbstractCrimsonHexagon class;
        default to one that fetches data from Crimson Hexagon with config from mediawords.yml

    Return:
    None
    """
    topic = db.require_by_id('topics', topics_id)

    if topic['ch_monitor_id'] is None:
        # include the actual id in the message so the skipped topic can be identified in the logs
        log.debug("returning after noop because topic {} has a null ch_monitor_id".format(topics_id))
        return

    _add_topic_tweet_days(db, topic, twitter_class, ch_class)
开发者ID:berkmancenter,项目名称:mediacloud,代码行数:42,代码来源:fetch_topic_tweets.py

示例3: fetch_topic_url

# 需要导入模块: from mediawords.db import DatabaseHandler [as 别名]
# 注意: require_by_id 是 DatabaseHandler 的实例方法, 不能单独导入, 需通过实例调用, 例如 db.require_by_id(...)
def fetch_topic_url(db: DatabaseHandler, topic_fetch_urls_id: int, domain_timeout: typing.Optional[int] = None) -> None:
    """Fetch a url for a topic and create a media cloud story from it if its content matches the topic pattern.

    Update the following fields in the topic_fetch_urls row:

    code - the status code of the http response
    fetch_date - the current time
    state - one of the FETCH_STATE_* constants
    message - message related to the state (eg. HTTP message for FETCH_STATE_REQUEST_FAILED)
    stories_id - the id of the story generated from the fetched content, or null if no story created

    If topic_links_id is present in the topic_fetch_url and if a story was added or matched, assign the resulting
    topic_fetch_urls.stories_id to topic_links.ref_stories_id.

    If the state is anything but FETCH_STATE_PENDING or FETCH_STATE_REQUEUED, return without doing anything.

    If there is content for the corresponding url and topics_id in topic_seed_urls, use that content instead of
    fetching the url.

    This function catches almost all possible exceptions and stashes them in topic_fetch_urls along with a state of
    FETCH_STATE_PYTHON_ERROR

    Arguments:
    db - db handle
    topic_fetch_urls_id - id of topic_fetch_urls row
    domain_timeout - pass through to fetch_link

    Returns:
    None

    Raises:
    McThrottledDomainException - re-raised unchanged; in that case the topic_fetch_urls row is not updated here

    """
    topic_fetch_url = db.require_by_id('topic_fetch_urls', topic_fetch_urls_id)

    try:
        log.info("fetch_link: %s" % topic_fetch_url['url'])
        _try_fetch_topic_url(db=db, topic_fetch_url=topic_fetch_url, domain_timeout=domain_timeout)

        if topic_fetch_url['topic_links_id'] and topic_fetch_url['stories_id']:
            try_update_topic_link_ref_stories_id(db, topic_fetch_url)

        # only try to attach a story to the topic when the fetch produced or matched one
        if topic_fetch_url.get('stories_id') is not None:
            story = db.require_by_id('stories', topic_fetch_url['stories_id'])
            topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
            redirect_url = topic_fetch_url['url']
            assume_match = topic_fetch_url['assume_match']
            if _is_not_topic_story(db, topic_fetch_url) and _story_matches_topic(
                    db, story, topic, redirect_url=redirect_url, assume_match=assume_match):
                mediawords.tm.stories.add_to_topic_stories(db, story, topic)

        # NOTE(review): same check and call as before the topic story step above; presumably repeated on purpose
        # so that the link is updated again once the story may have been added to the topic -- confirm
        if topic_fetch_url['topic_links_id'] and topic_fetch_url['stories_id']:
            try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    except McThrottledDomainException:
        # throttling is passed on to the caller instead of being recorded as a python error; bare raise keeps
        # the original traceback intact
        raise

    except Exception as ex:
        log.error("Error while fetching URL {}: {}".format(topic_fetch_url, ex))

        # stash the failure in the row itself so that it gets persisted by the update below
        topic_fetch_url['state'] = FETCH_STATE_PYTHON_ERROR
        topic_fetch_url['message'] = traceback.format_exc()
        log.warning('topic_fetch_url %s failed: %s' % (topic_fetch_url['url'], topic_fetch_url['message']))

    db.update_by_id('topic_fetch_urls', topic_fetch_url['topic_fetch_urls_id'], topic_fetch_url)
开发者ID:berkmancenter,项目名称:mediacloud,代码行数:65,代码来源:fetch_link.py

示例4: _try_fetch_topic_url

# 需要导入模块: from mediawords.db import DatabaseHandler [as 别名]
# 注意: require_by_id 是 DatabaseHandler 的实例方法, 不能单独导入, 需通过实例调用, 例如 db.require_by_id(...)
def _try_fetch_topic_url(
        db: DatabaseHandler,
        topic_fetch_url: dict,
        domain_timeout: typing.Optional[int] = None) -> None:
    """Implement the logic of fetch_topic_url without the try: or the topic_fetch_url update."""

    log.warning("_try_fetch_topic_url: %s" % topic_fetch_url['url'])

    # don't reprocess already processed urls
    if topic_fetch_url['state'] not in (FETCH_STATE_PENDING, FETCH_STATE_REQUEUED):
        return

    _update_tfu_message(db, topic_fetch_url, "checking ignore links")
    if _ignore_link_pattern(topic_fetch_url['url']):
        topic_fetch_url['state'] = FETCH_STATE_IGNORED
        topic_fetch_url['code'] = 403
        return

    _update_tfu_message(db, topic_fetch_url, "checking failed url")
    failed_url = _get_failed_url(db, topic_fetch_url['topics_id'], topic_fetch_url['url'])
    if failed_url:
        topic_fetch_url['state'] = failed_url['state']
        topic_fetch_url['code'] = failed_url['code']
        topic_fetch_url['message'] = failed_url['message']
        return

    _update_tfu_message(db, topic_fetch_url, "checking self linked domain")
    if mediawords.tm.domains.skip_self_linked_domain(db, topic_fetch_url):
        topic_fetch_url['state'] = FETCH_STATE_SKIPPED
        topic_fetch_url['code'] = 403
        return

    topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
    topic_fetch_url['fetch_date'] = datetime.datetime.now()

    story_match = None

    # this match is relatively expensive, so only do it on the first 'pending' request and not the potentially
    # spammy 'requeued' requests
    _update_tfu_message(db, topic_fetch_url, "checking story match")
    if topic_fetch_url['state'] == FETCH_STATE_PENDING:
        story_match = mediawords.tm.stories.get_story_match(db=db, url=topic_fetch_url['url'])

        # try to match the story before doing the expensive fetch
        if story_match is not None:
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            topic_fetch_url['code'] = 200
            topic_fetch_url['stories_id'] = story_match['stories_id']
            return

    # check whether we want to delay fetching for another job, eg. fetch_twitter_urls
    pending_state = _get_pending_state(topic_fetch_url)
    if pending_state:
        topic_fetch_url['state'] = pending_state
        return

    # get content from either the seed or by fetching it
    _update_tfu_message(db, topic_fetch_url, "checking seeded content")
    response = _get_seeded_content(db, topic_fetch_url)
    if response is None:
        _update_tfu_message(db, topic_fetch_url, "fetching content")
        response = _fetch_url(db, topic_fetch_url['url'], domain_timeout=domain_timeout)
        log.debug("%d response returned for url: %s" % (response.code, topic_fetch_url['url']))
    else:
        log.debug("seeded content found for url: %s" % topic_fetch_url['url'])

    content = response.content

    fetched_url = topic_fetch_url['url']
    response_url = response.last_requested_url

    if fetched_url != response_url:
        if _ignore_link_pattern(response_url):
            topic_fetch_url['state'] = FETCH_STATE_IGNORED
            topic_fetch_url['code'] = 403
            return

        _update_tfu_message(db, topic_fetch_url, "checking story match for redirect_url")
        story_match = mediawords.tm.stories.get_story_match(db=db, url=fetched_url, redirect_url=response_url)

    topic_fetch_url['code'] = response.code

    assume_match = topic_fetch_url['assume_match']

    _update_tfu_message(db, topic_fetch_url, "checking content match")
    if not response.is_success:
        topic_fetch_url['state'] = FETCH_STATE_REQUEST_FAILED
        topic_fetch_url['message'] = response.message
    elif story_match is not None:
        topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
        topic_fetch_url['stories_id'] = story_match['stories_id']
    elif not content_matches_topic(content=content, topic=topic, assume_match=assume_match):
        topic_fetch_url['state'] = FETCH_STATE_CONTENT_MATCH_FAILED
    else:
        try:
            _update_tfu_message(db, topic_fetch_url, "generating story")
            url = response_url if response_url is not None else fetched_url
            story = mediawords.tm.stories.generate_story(db=db, content=content, url=url)

            topic_fetch_url['stories_id'] = story['stories_id']
#.........这里部分代码省略.........
开发者ID:berkmancenter,项目名称:mediacloud,代码行数:103,代码来源:fetch_link.py


注:本文中的mediawords.db.DatabaseHandler.require_by_id方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。