当前位置: 首页>>代码示例>>Python>>正文


Python distributed.init_process_group方法代码示例

本文整理汇总了Python中torch.distributed.init_process_group方法的典型用法代码示例。如果您正苦于以下问题：Python distributed.init_process_group方法的具体用法？Python distributed.init_process_group怎么用？Python distributed.init_process_group使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块torch.distributed的用法示例。


在下文中一共展示了distributed.init_process_group方法的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: setup

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import init_process_group [as 别名]
def setup(rank, device_ids, args):
    """Join the process group for one worker and run distributed training.

    The rendezvous address is fixed to ``localhost:12355`` and the ``gloo``
    backend is used. The world size is ``len(device_ids)``.

    Args:
        rank: Rank of this process; also used to index ``device_ids``.
        device_ids: One device identifier per participating process.
        args: Sequence of 16 training settings, unpacked below in order.
    """
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    # initialize the process group
    dist.init_process_group("gloo", rank=rank, world_size=len(device_ids))

    # Everything after a successful init runs under try/finally so that
    # cleanup() (defined elsewhere in this file; presumably it tears the
    # process group down) is still called when unpacking or training raises.
    try:
        (train_file, test_file, batch_size, epochs, gpu_mode, num_workers,
         retrain_model, retrain_model_path, gru_layers, hidden_size,
         learning_rate, weight_decay, model_dir, stats_dir, total_callers,
         train_mode) = args

        # issue with semaphore lock: https://github.com/pytorch/pytorch/issues/2517
        # mp.set_start_method('spawn')

        # Explicitly setting seed to make sure that models created in two
        # processes start from same random weights and biases.
        torch.manual_seed(42)

        # NOTE: train() takes train_mode *before* total_callers, which is the
        # reverse of their order inside ``args``.
        train(train_file, test_file, batch_size, epochs, gpu_mode, num_workers,
              retrain_model, retrain_model_path, gru_layers, hidden_size,
              learning_rate, weight_decay, model_dir, stats_dir, train_mode,
              total_callers, rank, device_ids[rank])
    finally:
        cleanup()
开发者ID:kishwarshafin,项目名称:helen,代码行数:23,代码来源:train_distributed.py

示例2: setup

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import init_process_group [as 别名]
def setup(rank, total_callers, args, all_input_files, all_devices):
    """Join the process group for one caller and run prediction on its device.

    The rendezvous address is fixed to ``localhost:12355`` and the ``gloo``
    backend is used.

    Args:
        rank: Rank of this process; also indexes ``all_input_files`` and
            ``all_devices``.
        total_callers: World size passed to ``init_process_group``.
        args: 4-item sequence
            ``(output_filepath, model_path, batch_size, num_workers)``.
        all_input_files: Per-rank inputs; entry ``rank`` is passed to
            ``predict``.
        all_devices: Per-rank devices; entry ``rank`` is passed to ``predict``.
    """
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    # initialize the process group
    dist.init_process_group("gloo", rank=rank, world_size=total_callers)

    # Run under try/finally so cleanup() (defined elsewhere in this file;
    # presumably it tears the process group down) is called even when
    # unpacking or prediction raises.
    try:
        # expand the arguments
        output_filepath, model_path, batch_size, num_workers = args

        # call prediction function
        predict(all_input_files[rank],
                output_filepath,
                model_path,
                batch_size,
                num_workers,
                rank,
                all_devices[rank])
    finally:
        cleanup()
开发者ID:kishwarshafin,项目名称:helen,代码行数:21,代码来源:predict_gpu.py

示例3: setup

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import init_process_group [as 别名]
def setup(rank, total_callers, args, all_input_files):
    """Join the process group for one caller and run CPU prediction.

    The rendezvous address is fixed to ``localhost:12355`` and the ``gloo``
    backend is used.

    Args:
        rank: Rank of this process; also indexes ``all_input_files``.
        total_callers: World size passed to ``init_process_group``.
        args: 5-item sequence
            ``(output_filepath, model_path, batch_size, num_workers, threads)``.
        all_input_files: Per-rank inputs; entry ``rank`` is passed to
            ``predict``.
    """
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    # initialize the process group
    dist.init_process_group("gloo", rank=rank, world_size=total_callers)

    # Run under try/finally so cleanup() (defined elsewhere in this file;
    # presumably it tears the process group down) is called even when
    # unpacking or prediction raises.
    try:
        # expand the arguments
        output_filepath, model_path, batch_size, num_workers, threads = args

        # call prediction function
        predict(all_input_files[rank],
                output_filepath,
                model_path,
                batch_size,
                num_workers,
                rank,
                threads)
    finally:
        cleanup()
开发者ID:kishwarshafin,项目名称:helen,代码行数:21,代码来源:predict_cpu.py

示例4: __init__

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import init_process_group [as 别名]
def __init__(self, rank, learner_ranks, worker_ranks, ip, port):
    """Join the NCCL process group and build learner/worker sub-groups.

    Args:
        rank: Rank of this process in the global group.
        learner_ranks: Ranks acting as learners.
        worker_ranks: Ranks acting as workers.
        ip: Host used in the ``tcp://`` rendezvous URL.
        port: Port used in the ``tcp://`` rendezvous URL.
    """
    dist.init_process_group(
        "nccl",
        init_method="tcp://{}:{}".format(ip, port),
        rank=rank,
        world_size=len(learner_ranks) + len(worker_ranks),
    )

    # A two-member group is created for every (learner, worker) pair, in the
    # same nested order on every process; only the groups whose worker is
    # this rank are kept, keyed by learner rank.
    # NOTE(review): presumably every pair is created everywhere because
    # new_group is a collective call -- confirm against the torch docs.
    pair_groups = {}
    for learner in learner_ranks:
        for worker in worker_ranks:
            pair = dist.new_group([learner, worker])
            if worker == rank:
                pair_groups[learner] = pair
    # The learners-only group is created but its handle is not stored.
    dist.new_group(learner_ranks)

    self.groups = pair_groups
    self.device = torch.device(f"cuda:{ray.get_gpu_ids()[0]}")
    self.rank = rank
    # Three-element zero tensor placed on this actor's device.
    self.network = torch.zeros(3).to(self.device)
    self.exp = None
    self.network_handle = None
开发者ID:heronsystems,项目名称:adeptRL,代码行数:24,代码来源:ray_container.py

示例5: init_process_group

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import init_process_group [as 别名]
def init_process_group(backend):
    """Initialise torch.distributed from the MPI world communicator.

    Rank 0 resolves its own host address and broadcasts it, together with
    the fixed port ``'1234'``, to every MPI rank. Each rank then exports
    ``MASTER_ADDR``, ``MASTER_PORT``, ``WORLD_SIZE`` and ``RANK`` to the
    environment before initialising the process group.

    Args:
        backend: Backend name forwarded to ``distributed.init_process_group``.
    """
    world = MPI.COMM_WORLD
    my_rank = world.Get_rank()
    n_procs = world.Get_size()

    # Only the root knows the rendezvous endpoint; other ranks start empty.
    rendezvous = {}
    if my_rank == 0:
        rendezvous['MASTER_ADDR'] = socket.gethostbyname(socket.gethostname())
        rendezvous['MASTER_PORT'] = '1234'

    env = world.bcast(rendezvous, root=0)
    env['WORLD_SIZE'] = str(n_procs)
    env['RANK'] = str(my_rank)
    os.environ.update(env)

    distributed.init_process_group(backend=backend)
开发者ID:skmhrk1209,项目名称:Single-Path-NAS-PyTorch,代码行数:19,代码来源:distributed.py

示例6: setup_distributed

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import init_process_group [as 别名]
def setup_distributed(port=29500):
    """Initialise NCCL-based distributed training when more than one GPU exists.

    Args:
        port: Value written to ``MASTER_PORT`` when the MPI branch is taken.

    Returns:
        A ``(rank, world_size)`` tuple. ``(0, 1)`` is returned without
        initialising anything when torch.distributed or CUDA is unavailable,
        or when at most one CUDA device is visible.
    """
    multi_gpu = (
        dist.is_available()
        and torch.cuda.is_available()
        and torch.cuda.device_count() > 1
    )
    if not multi_gpu:
        return 0, 1

    if 'MPIR_CVAR_CH3_INTERFACE_HOSTNAME' not in os.environ:
        # No MPI marker variable: rely on the env:// variables already set.
        dist.init_process_group(backend="nccl", init_method="env://")
        return dist.get_rank(), dist.get_world_size()

    # MPI marker variable present: take rank and size from the communicator.
    from mpi4py import MPI
    world = MPI.COMM_WORLD
    rank = world.Get_rank()
    size = world.Get_size()

    os.environ["MASTER_ADDR"] = '127.0.0.1'
    os.environ["MASTER_PORT"] = str(port)

    dist.init_process_group(backend="nccl", world_size=size, rank=rank)
    return rank, size
开发者ID:openai,项目名称:gpt-2-output-dataset,代码行数:19,代码来源:train.py

示例7: spmd_main

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import init_process_group [as 别名]
def spmd_main(local_world_size, local_rank):
    """Initialise the NCCL process group, run ``demo_basic``, then tear down.

    Args:
        local_world_size: Forwarded unchanged to ``demo_basic``.
        local_rank: Forwarded unchanged to ``demo_basic``.

    Raises:
        KeyError: If ``MASTER_ADDR``, ``MASTER_PORT``, ``RANK`` or
            ``WORLD_SIZE`` is missing from the environment.
    """
    # These are the parameters used to initialize the process group
    env_dict = {}
    for key in ("MASTER_ADDR", "MASTER_PORT", "RANK", "WORLD_SIZE"):
        env_dict[key] = os.environ[key]
    print(f"[{os.getpid()}] Initializing process group with: {env_dict}")

    dist.init_process_group(backend="nccl")
    print(
        f"[{os.getpid()}]: world_size = {dist.get_world_size()}, "
        f"rank = {dist.get_rank()}, backend={dist.get_backend()}"
    )

    demo_basic(local_world_size, local_rank)

    # Tear down the process group
    dist.destroy_process_group()
开发者ID:pytorch,项目名称:examples,代码行数:19,代码来源:example.py

示例8: setup

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import init_process_group [as 别名]
def setup():
    """Initialise the gloo process group from the module-level ``args``.

    Reads ``args.master``, ``args.rank`` and ``args.world_size``; the
    rendezvous port is fixed to ``'29500'``.

    Seeding is intentionally left disabled here (``torch.manual_seed(42)``).
    An explicit seed would make models created in different processes start
    from the same random weights and biases; the alternative is to sync the
    models on start with a callback.
    """
    os.environ.update({
        'MASTER_ADDR': args.master,
        'MASTER_PORT': '29500',
    })

    # initialize the process group
    dist.init_process_group(
        backend="gloo",
        rank=args.rank,
        world_size=args.world_size,
    )
开发者ID:pytorchbearer,项目名称:torchbearer,代码行数:13,代码来源:distributed_data_parallel.py

示例9: _init_dist_pytorch

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import init_process_group [as 别名]
def _init_dist_pytorch(backend, **kwargs):
    """Select a CUDA device from ``RANK`` and initialise the process group.

    Args:
        backend: Backend name forwarded to ``dist.init_process_group``.
        **kwargs: Extra keyword arguments forwarded to
            ``dist.init_process_group``.

    Raises:
        KeyError: If the ``RANK`` environment variable is not set.
    """
    # TODO: use local_rank instead of rank % num_gpus
    device_index = int(os.environ['RANK']) % torch.cuda.device_count()
    torch.cuda.set_device(device_index)
    dist.init_process_group(backend=backend, **kwargs)
开发者ID:dingjiansw101,项目名称:AerialDetection,代码行数:8,代码来源:env.py

示例10: _init_dist_slurm

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import init_process_group [as 别名]
def _init_dist_slurm(backend, port=29500, **kwargs):
    """Initialise the process group from SLURM environment variables.

    Rank and world size come from ``SLURM_PROCID`` and ``SLURM_NTASKS``. The
    master address is the first line printed by ``scontrol show hostname``
    for ``SLURM_NODELIST``.

    Args:
        backend: Backend name forwarded to ``dist.init_process_group``.
        port: Value exported as ``MASTER_PORT``.
        **kwargs: Accepted but not used by this function.

    Raises:
        KeyError: If any of the three SLURM variables is missing.
    """
    slurm_rank = int(os.environ['SLURM_PROCID'])
    slurm_world = int(os.environ['SLURM_NTASKS'])
    nodes = os.environ['SLURM_NODELIST']

    # Pick the CUDA device from the global rank modulo the visible GPU count.
    gpu_count = torch.cuda.device_count()
    torch.cuda.set_device(slurm_rank % gpu_count)

    hostname_cmd = 'scontrol show hostname {} | head -n1'.format(nodes)
    master_host = subprocess.getoutput(hostname_cmd)

    os.environ.update({
        'MASTER_PORT': str(port),
        'MASTER_ADDR': master_host,
        'WORLD_SIZE': str(slurm_world),
        'RANK': str(slurm_rank),
    })
    dist.init_process_group(backend=backend)
开发者ID:dingjiansw101,项目名称:AerialDetection,代码行数:15,代码来源:env.py

示例11: main

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import init_process_group [as 别名]
def main(args):
    """Run the distributed test loop and save its predictions.

    Reads ``local_rank``, ``log_dir``, ``meta``, ``model``, ``raw``,
    ``out_dir`` and ``threshold`` from ``args``; ``args`` is also passed on
    to ``make_config`` and ``make_dataloader``.

    Args:
        args: Parsed command-line options, as described above.
    """
    # Initialize multi-processing
    distributed.init_process_group(backend='nccl', init_method='env://')
    device_id = args.local_rank
    device = torch.device(args.local_rank)
    rank = distributed.get_rank()
    world_size = distributed.get_world_size()
    torch.cuda.set_device(device_id)

    # Initialize logging (rank 0 only)
    if rank == 0:
        logging.init(args.log_dir, "test")

    # Load configuration
    config = make_config(args)

    # Create dataloader
    test_dataloader = make_dataloader(args, config, rank, world_size)
    meta = load_meta(args.meta)

    # Create model
    model = make_model(config, meta["num_thing"], meta["num_stuff"])

    # Load snapshot
    log_debug("Loading snapshot from %s", args.model)
    resume_from_snapshot(model, args.model, ["body", "rpn_head", "roi_head"])

    # Init GPU stuff
    general_cfg = config["general"]
    torch.backends.cudnn.benchmark = general_cfg.getboolean("cudnn_benchmark")
    model = DistributedDataParallel(
        model.cuda(device),
        device_ids=[device_id],
        output_device=device_id,
    )

    # Choose how each prediction is written: raw output or a rendered image.
    if not args.raw:
        save_function = partial(
            save_prediction_image,
            out_dir=args.out_dir,
            colors=meta["palette"],
            num_stuff=meta["num_stuff"],
            threshold=args.threshold,
        )
    else:
        save_function = partial(save_prediction_raw, out_dir=args.out_dir)

    log_interval = config["general"].getint("log_interval")
    test(model, test_dataloader, device=device, summary=None,
         log_interval=log_interval, save_function=save_function)
开发者ID:mapillary,项目名称:seamseg,代码行数:39,代码来源:test_instance_seg.py

示例12: setup

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import init_process_group [as 别名]
def setup(hp, rank, world_size):
    """Initialise the process group from the ``hp.train.dist`` settings.

    Args:
        hp: Configuration object; ``hp.train.dist`` must provide
            ``master_addr``, ``master_port`` and ``mode``.
        rank: Rank of this process.
        world_size: Total number of processes in the group.
    """
    dist_cfg = hp.train.dist
    os.environ["MASTER_ADDR"] = dist_cfg.master_addr
    # os.environ only accepts str values. Coerce the port so a config that
    # stores it as a number does not raise TypeError here; str() leaves an
    # existing string unchanged.
    os.environ["MASTER_PORT"] = str(dist_cfg.master_port)

    # initialize the process group
    dist.init_process_group(dist_cfg.mode, rank=rank, world_size=world_size)
开发者ID:ryul99,项目名称:pytorch-project-template,代码行数:8,代码来源:trainer.py

示例13: main

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import init_process_group [as 别名]
def main():
    """Train a DistributedDataParallel model on random tensors for 2 epochs.

    Initialises the ``gloo`` process group with default settings, seeds
    torch with 42, builds a 1000-sample random dataset and runs SGD with
    ``BCELoss``. The model comes from ``get_model()``, defined elsewhere.
    """
    import torch.distributed as dist
    import torch.nn as nn
    import torch.optim as optim
    import torch.utils.data

    dist.init_process_group(backend='gloo')
    torch.manual_seed(42)

    features = torch.rand((1000, 32), dtype=torch.float32)
    # NOTE(review): with a single bound of 1, randint appears to sample from
    # [0, 1), which would make every label zero -- confirm this is intended.
    targets = torch.randint(1, (1000, 10), dtype=torch.float32)

    dataset = torch.utils.data.TensorDataset(features, targets)
    sampler = torch.utils.data.distributed.DistributedSampler(dataset)
    loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=32,
        shuffle=False,
        sampler=sampler,
    )

    ddp_model = nn.parallel.DistributedDataParallel(get_model())
    sgd = optim.SGD(ddp_model.parameters(), lr=0.01, momentum=0.5)
    bce = nn.BCELoss()

    num_epochs = 2
    for _epoch in range(num_epochs):
        for inputs, expected in loader:
            predictions = ddp_model(inputs)
            batch_loss = bce(predictions.squeeze(), expected)
            sgd.zero_grad()
            batch_loss.backward()
            sgd.step()
开发者ID:mars-project,项目名称:mars,代码行数:34,代码来源:pytorch_sample.py

示例14: main

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import init_process_group [as 别名]
def main():
    """Train a DistributedDataParallel model on Mars named tensors for 2 epochs.

    Initialises the ``gloo`` process group with default settings, seeds
    torch with 42, wraps the Mars tensors named ``'data'`` and ``'labels'``
    in a ``MarsDataset`` and runs SGD with ``BCELoss``. The model comes from
    ``get_model()``, defined elsewhere.
    """
    import mars.tensor as mt
    import torch.distributed as dist
    import torch.nn as nn
    import torch.optim as optim
    import torch.utils.data
    from mars.learn.contrib.pytorch import MarsDataset, MarsDistributedSampler

    dist.init_process_group(backend='gloo')
    torch.manual_seed(42)

    features = mt.named_tensor(name='data')
    targets = mt.named_tensor(name='labels')
    dataset = MarsDataset(features, targets)
    sampler = MarsDistributedSampler(dataset)
    loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=32,
        shuffle=False,
        sampler=sampler,
    )

    ddp_model = nn.parallel.DistributedDataParallel(get_model())
    sgd = optim.SGD(ddp_model.parameters(), lr=0.01, momentum=0.5)
    bce = nn.BCELoss()

    num_epochs = 2
    for _epoch in range(num_epochs):
        for inputs, expected in loader:
            predictions = ddp_model(inputs)
            batch_loss = bce(predictions.squeeze(), expected)
            sgd.zero_grad()
            batch_loss.backward()
            sgd.step()
开发者ID:mars-project,项目名称:mars,代码行数:35,代码来源:dataset_sample.py

示例15: init_processes

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import init_process_group [as 别名]
def init_processes(backend, master_addr, master_port, rank, world_size,
                   rows, columns, host, num_gpus):
    """Export the rendezvous environment, join the group and call ``run``.

    Args:
        backend: Backend name for ``dist.init_process_group``; also passed
            to ``run``.
        master_addr: Value exported as ``MASTER_ADDR``.
        master_port: Value exported as ``MASTER_PORT`` (coerced to ``str``).
        rank: Rank of this process.
        world_size: Total number of processes.
        rows: Forwarded unchanged to ``run``.
        columns: Forwarded unchanged to ``run``.
        host: Host name, used only in the log message.
        num_gpus: Forwarded unchanged to ``run``.
    """
    # Initialize the distributed environment.
    os.environ['WORLD_SIZE'] = str(world_size)
    os.environ['RANK'] = str(rank)
    os.environ['MASTER_ADDR'] = master_addr
    # Coerce the port the same way as world_size and rank: os.environ only
    # accepts str values, so an int port would otherwise raise TypeError.
    os.environ['MASTER_PORT'] = str(master_port)

    logger.info('Init process rank {} on host \'{}\''.format(rank, host))
    dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
    run(backend, rank, rows, columns, num_gpus)
开发者ID:aws,项目名称:sagemaker-pytorch-training-toolkit,代码行数:13,代码来源:distributed_operations.py


注：本文中的torch.distributed.init_process_group方法示例由纯净天空整理自GitHub/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。