This article collects and summarizes typical usage examples of the Python method torch.distributed.new_group. If you are puzzling over questions like: what exactly does distributed.new_group do, and how is it used in practice? Then the curated method code examples below may help. You can also explore further usage examples of torch.distributed, the module this method belongs to.
The following presents 11 code examples of distributed.new_group, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
Example 1: __init__
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import new_group [as alias]
def __init__(self, rank, learner_ranks, worker_ranks, ip, port):
    world_size = len(learner_ranks) + len(worker_ranks)
    dist.init_process_group(
        "nccl",
        init_method="tcp://{}:{}".format(ip, port),
        rank=rank,
        world_size=world_size,
    )
    groups = {}
    # new_group is a collective call: every rank must create every group,
    # even the ones it will not be a member of
    for learner_rank in learner_ranks:
        for worker_rank in worker_ranks:
            g = dist.new_group([learner_rank, worker_rank])
            if worker_rank == rank:
                groups[learner_rank] = g
    dist.new_group(learner_ranks)
    self.groups = groups
    self.device = torch.device(f"cuda:{ray.get_gpu_ids()[0]}")
    self.rank = rank
    self.network = torch.zeros(3).to(self.device)
    self.exp = None
    self.network_handle = None
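Because dist.new_group is collective, every process has to execute the same sequence of new_group calls in the same order, which is why the loop above builds all learner-worker pairs on every rank. A minimal sketch of using one of the cached pairwise groups afterwards, assuming a worker method and a learner at rank 0 (the buffer shape is illustrative, not from the original):

# hypothetical: a worker receives parameters from learner rank 0
weights = torch.zeros(3, device=self.device)
dist.broadcast(weights, src=0, group=self.groups[0])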
Example 2: init_distributed_training
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import new_group [as alias]
def init_distributed_training(cfg):
    """
    Initialize variables needed for distributed training.
    """
    if cfg.NUM_GPUS == 1:
        return
    num_gpus_per_machine = cfg.NUM_GPUS
    num_machines = dist.get_world_size() // num_gpus_per_machine
    for i in range(num_machines):
        ranks_on_i = list(
            range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine)
        )
        pg = dist.new_group(ranks_on_i)
        if i == cfg.SHARD_ID:
            global _LOCAL_PROCESS_GROUP
            _LOCAL_PROCESS_GROUP = pg
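Once cached, the per-machine group can be passed to any collective to restrict it to ranks on the same machine. A minimal hedged sketch, assuming the group has been initialized as above (the tensor is illustrative):

t = torch.ones(1).cuda()
dist.all_reduce(t, group=_LOCAL_PROCESS_GROUP)  # sums only across GPUs on this machine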
Example 3: _start_reduction_threads
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import new_group [as alias]
def _start_reduction_threads(self):
    num_buckets = len(self.bucket_sizes)
    self._reduction_queues = [queue.Queue() for _ in range(num_buckets)]
    self._reduction_threads = []
    self._reduction_streams = [[] for _ in range(num_buckets)]
    self._nccl_streams = []
    self._default_streams = []
    for dev_id in self.device_ids:
        with torch.cuda.device(dev_id):
            # TODO: don't assume we're on a default stream
            self._default_streams.append(torch.cuda.current_stream())
            self._nccl_streams.append(torch.cuda.Stream())
    for reduction_queue, reduction_streams in zip(self._reduction_queues, self._reduction_streams):
        for dev_id in self.device_ids:
            with torch.cuda.device(dev_id):
                reduction_streams.append(torch.cuda.Stream())
        # We only use the first device for distributed reductions
        dist._register_stream(reduction_streams[0])
        group_id = dist.new_group()
        self._reduction_threads.append(threading.Thread(
            target=self._reduction_thread_fn,
            args=(reduction_queue, group_id, self.device_ids, reduction_streams, self._nccl_streams)))
        self._reduction_threads[-1].daemon = True
        self._reduction_threads[-1].start()
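Calling dist.new_group() with no ranks argument creates a group spanning all processes; the legacy DDP code above allocates one such group per bucket so that concurrent reductions stay isolated from one another. A minimal sketch of the same per-bucket idea, assuming an initialized process group (num_buckets and bucket_tensors are hypothetical names, not from the original):

bucket_groups = [dist.new_group() for _ in range(num_buckets)]
for tensor, group in zip(bucket_tensors, bucket_groups):
    dist.all_reduce(tensor, group=group)  # each bucket reduces on its own group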
Example 4: run
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import new_group [as alias]
from random import randint
from time import sleep

def run(world_size, rank, steps):
    for step in range(1, steps + 1):
        # get a random int
        value = randint(0, 10)
        # group all ranks
        ranks = list(range(world_size))
        group = dist.new_group(ranks=ranks)
        # compute the reduced sum
        tensor = torch.tensor(value, dtype=torch.int)
        dist.all_reduce(tensor, op=dist.ReduceOp.SUM, group=group)
        print('rank: {}, step: {}, value: {}, reduced sum: {}.'.format(
            rank, step, value, tensor.item()))
        sleep(1)
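run assumes the default process group is already initialized. Note also that a group over all ranks is equivalent to the default group, so creating a fresh one on every step works but is wasteful. A hedged launcher sketch using a file-based rendezvous (the file path, worker name, and process count are assumptions):

import torch.multiprocessing as mp

def worker(rank, world_size):
    dist.init_process_group(
        "gloo", init_method="file:///tmp/rdzv",  # assumed rendezvous file
        rank=rank, world_size=world_size,
    )
    run(world_size, rank, steps=3)

if __name__ == "__main__":
    mp.spawn(worker, args=(2,), nprocs=2)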
Example 5: __init__
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import new_group [as alias]
def __init__(self, *args, **kwargs):
    super(SynchronousDistributedTraining, self).__init__(*args, **kwargs)
    self.world_size = distributed.get_world_size()
    self.rank = distributed.get_rank()
    self.group = distributed.new_group(ranks=list(range(self.world_size)))
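A subgroup containing every rank behaves like the default WORLD group; keeping an explicit handle simply makes the group argument explicit in later collectives. A hedged sketch of a typical use, gradient averaging inside a training step (self.model is an assumed attribute, not from the original):

for p in self.model.parameters():
    distributed.all_reduce(p.grad, group=self.group)
    p.grad /= self.world_size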
Example 6: __init__
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import new_group [as alias]
def __init__(self, init_ttp=False):
    # no need to do anything if we already initialized the communicator:
    if not dist.is_initialized():
        # get configuration variables from environment:
        for key in ["distributed_backend", "rendezvous", "world_size", "rank"]:
            if key.upper() not in os.environ:
                raise ValueError("Environment variable %s must be set." % key)
            setattr(self, key.lower(), os.environ[key.upper()])

        # make sure world size and rank are integers; comms stats are reset:
        self.world_size = int(self.world_size)
        self.rank = int(self.rank)
        self.reset_communication_stats()
        self._name = f"rank{self.rank}"

        # logging:
        logging.info("==================")
        logging.info("DistributedCommunicator with rank %d" % self.rank)
        logging.info("==================")

        # initialize process group:
        total_ws = self.world_size + 1 if init_ttp else self.world_size
        dist.init_process_group(
            backend=self.distributed_backend,
            init_method=self.rendezvous,
            world_size=total_ws,
            rank=self.rank,
        )
        self.ttp_group = dist.new_group(list(range(total_ws)))
        self.main_group = dist.new_group(list(range(self.world_size)))
        self.ttp_initialized = init_ttp
        logging.info("World size = %d" % self.world_size)
Example 7: active_group
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import new_group [as alias]
def active_group(active):
    """Initialize a distributed group where each process can independently decide whether to participate or not

    Parameters
    ----------
    active : bool
        Whether this process will be active in the group or not

    Returns
    -------
    A distributed group containing all processes that passed `active=True`, or `None` if all passed `False`
    """
    world_size = distributed.get_world_size()
    rank = distributed.get_rank()

    # Check if cache is initialized, add WORLD and None to it
    if not hasattr(active_group, "__cache__"):
        active_group.__cache__ = {
            frozenset(range(world_size)): distributed.group.WORLD,
            frozenset(): None,
        }

    # Gather active status from all workers
    active = torch.tensor(rank if active else -1, dtype=torch.long, device=torch.cuda.current_device())
    active_workers = torch.empty(world_size, dtype=torch.long, device=torch.cuda.current_device())
    distributed.all_gather(list(active_workers.unbind(0)), active)

    # Create and cache group if it doesn't exist yet
    active_workers = frozenset(int(i) for i in active_workers.tolist() if i != -1)
    if active_workers not in active_group.__cache__:
        group = distributed.new_group(list(active_workers))
        active_group.__cache__[active_workers] = group
    return active_group.__cache__[active_workers]
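Every rank must call active_group itself (it performs an all_gather and a new_group internally, both collectives), but only the active ranks should then use the returned group. A hedged usage sketch (the even/odd split and the tensor are illustrative):

active = distributed.get_rank() % 2 == 0  # even ranks join this round
group = active_group(active)
if active and group is not None:
    t = torch.ones(1, device=torch.cuda.current_device())
    distributed.all_reduce(t, group=group)  # involves only the active ranks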
Example 8: _get_global_gloo_group
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import new_group [as alias]
@functools.lru_cache()  # requires: import functools; caches the group, matching "The result is cached" below
def _get_global_gloo_group():
    """
    Return a process group based on gloo backend, containing all the ranks
    The result is cached.
    """
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    else:
        return dist.group.WORLD
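The gloo fallback exists because the nccl backend only supports CUDA tensors, so CPU-side collectives need a gloo group. A hedged usage sketch; dist.all_gather_object is available in recent torch versions:

pg = _get_global_gloo_group()
out = [None for _ in range(dist.get_world_size())]
dist.all_gather_object(out, {"rank": dist.get_rank()}, group=pg)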
Example 9: _distributed_worker
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import new_group [as alias]
def _distributed_worker(
    local_rank, main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args
):
    assert torch.cuda.is_available(), "cuda is not available. Please check your installation."
    global_rank = machine_rank * num_gpus_per_machine + local_rank
    try:
        dist.init_process_group(
            backend="NCCL", init_method=dist_url, world_size=world_size, rank=global_rank
        )
    except Exception as e:
        logger = logging.getLogger(__name__)
        logger.error("Process group URL: {}".format(dist_url))
        raise e

    # synchronize is needed here to prevent a possible timeout after calling init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    assert num_gpus_per_machine <= torch.cuda.device_count()
    torch.cuda.set_device(local_rank)

    # Setup the local process group (which contains ranks within the same machine)
    assert comm._LOCAL_PROCESS_GROUP is None
    num_machines = world_size // num_gpus_per_machine
    for i in range(num_machines):
        ranks_on_i = list(range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine))
        pg = dist.new_group(ranks_on_i)
        if i == machine_rank:
            comm._LOCAL_PROCESS_GROUP = pg

    main_func(*args)
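A hedged sketch of how such a worker function is typically launched, one process per local GPU (the argument values are assumptions):

import torch.multiprocessing as mp

mp.spawn(
    _distributed_worker,
    nprocs=num_gpus_per_machine,
    args=(main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args),
)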
Example 10: tmp_process_group
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import new_group [as alias]
@contextmanager  # requires: from contextlib import contextmanager
def tmp_process_group(backend):
    cpu_pg = dist.new_group(backend=backend)
    try:
        yield cpu_pg
    finally:
        dist.destroy_process_group(cpu_pg)
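A hedged usage sketch: open a short-lived gloo group for a CPU-side collective and tear it down afterwards (the tensor is illustrative):

with tmp_process_group("gloo") as pg:
    t = torch.zeros(1)
    dist.all_reduce(t, group=pg)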
Example 11: _get_global_gloo_group
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import new_group [as alias]
@functools.lru_cache()  # requires: import functools; caches the group, matching "The result is cached" below
def _get_global_gloo_group():
    """
    Return a process group based on gloo backend, containing all the ranks
    The result is cached.
    Returns:
        (group): pytorch dist group.
    """
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    else:
        return dist.group.WORLD