From 74781ab5b8d39362d1fdeb60f8661bffcf2d15cd Mon Sep 17 00:00:00 2001
From: Alban Desmaison
Date: Fri, 21 Aug 2020 12:17:43 -0700
Subject: [PATCH] Revert D23242101: [pytorch][PR] Implement first draft of autograd benchmark.

Test Plan: revert-hammer

Differential Revision:
D23242101 (https://github.com/pytorch/pytorch/commit/c2511bdfa4bef89e6c328a161d3a4d3466261ff7)

Original commit changeset: a2b92d5a4341

fbshipit-source-id: bda562d15565f074b448022d180ec8f959c6ecc9
---
 .../functional_autograd_benchmark/README.md |  48 --
 .../audio_text_models.py                    | 122 ---
 .../functional_autograd_benchmark/compare.py |  45 -
 .../functional_autograd_benchmark.py        | 153 ----
 .../ppl_models.py                           |  93 --
 .../torchaudio_models.py                    | 556 ------------
 .../torchvision_models.py                   | 803 ------------------
 .../functional_autograd_benchmark/utils.py  | 103 ---
 .../vision_models.py                        |  97 ---
 test/run_test.py                            |   1 -
 test/test_functional_autograd_benchmark.py  |  54 --
 11 files changed, 2075 deletions(-)
 delete mode 100644 benchmarks/functional_autograd_benchmark/README.md
 delete mode 100644 benchmarks/functional_autograd_benchmark/audio_text_models.py
 delete mode 100644 benchmarks/functional_autograd_benchmark/compare.py
 delete mode 100644 benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py
 delete mode 100644 benchmarks/functional_autograd_benchmark/ppl_models.py
 delete mode 100644 benchmarks/functional_autograd_benchmark/torchaudio_models.py
 delete mode 100644 benchmarks/functional_autograd_benchmark/torchvision_models.py
 delete mode 100644 benchmarks/functional_autograd_benchmark/utils.py
 delete mode 100644 benchmarks/functional_autograd_benchmark/vision_models.py
 delete mode 100644 test/test_functional_autograd_benchmark.py

diff --git a/benchmarks/functional_autograd_benchmark/README.md b/benchmarks/functional_autograd_benchmark/README.md
deleted file mode 100644
index a5f106fec67..00000000000
--- a/benchmarks/functional_autograd_benchmark/README.md
+++ /dev/null
@@ -1,48 +0,0 @@
-# Benchmarking tool for the autograd API
-
-This folder contains a set of self-contained scripts that let you benchmark the autograd API with several common models.
-It is designed so that you can run the benchmark before and after your change and generate a table to share on the PR.
-
-To do so, you can use `functional_autograd_benchmark.py` to run the benchmarks before your change (using as output `before.txt`) and after your change (using as output `after.txt`).
-You can then use `compare.py` to get a markdown table comparing the two runs.
-
-In general, the default arguments of `functional_autograd_benchmark.py` should be used. You can override them, though, to force a given device or to run even the (very) slow settings.
-
-### Sample usage
-
-```bash
-# Make sure you compile pytorch in release mode and with the same flags before/after
-export DEBUG=0
-# When running on CPU, it might be required to limit the number of cores to avoid oversubscription
-export OMP_NUM_THREADS=10
-
-# Compile pytorch with the base revision
-git checkout master
-python setup.py develop
-
-# Run the benchmark for the base
-# This will use the GPU if available.
-pushd benchmarks/functional_autograd_benchmark
-python functional_autograd_benchmark.py --output before.txt
-
-# Compile pytorch with your change
-popd
-git checkout your_feature_branch
-python setup.py develop
-
-# Run the benchmark for the new version
-pushd benchmarks/functional_autograd_benchmark
-python functional_autograd_benchmark.py --output after.txt
-
-# Get the markdown table that you can paste in your github PR
-python compare.py
-
-popd
-
-```
-
-### Files in this folder:
-- `functional_autograd_benchmark.py` is the main entry point to run the benchmark.
-- `compare.py` is the entry point to run the comparison script that generates a markdown table.
-- `torchaudio_models.py` and `torchvision_models.py` contain code extracted from torchaudio and torchvision so that the models can be run without a specific version of these libraries installed.
-- `ppl_models.py`, `vision_models.py` and `audio_text_models.py` contain all the getter functions used for the benchmark.
diff --git a/benchmarks/functional_autograd_benchmark/audio_text_models.py b/benchmarks/functional_autograd_benchmark/audio_text_models.py
deleted file mode 100644
index 938e677ac38..00000000000
--- a/benchmarks/functional_autograd_benchmark/audio_text_models.py
+++ /dev/null
@@ -1,122 +0,0 @@
-import torch
-from torch import nn, Tensor
-
-import torchaudio_models as models
-
-from utils import extract_weights, load_weights, GetterReturnType
-
-def get_wav2letter(device: torch.device) -> GetterReturnType:
-    N = 10
-    input_frames = 700
-    vocab_size = 28
-    model = models.Wav2Letter(num_classes=vocab_size)
-    criterion = torch.nn.NLLLoss()
-    model.to(device)
-    params, names = extract_weights(model)
-
-    inputs = torch.rand([N, 1, input_frames], device=device)
-    labels = torch.rand(N, 3, device=device).mul(vocab_size).long()
-
-    def forward(*new_params: Tensor) -> Tensor:
-        load_weights(model, names, new_params)
-        out = model(inputs)
-
-        loss = criterion(out, labels)
-        return loss
-
-    return forward, params
-
-def get_deepspeech(device: torch.device) -> GetterReturnType:
-    sample_rate = 16000
-    window_size = 0.02
-    window = "hamming"
-    audio_conf = dict(sample_rate=sample_rate,
-                      window_size=window_size,
-                      window=window,
-                      noise_dir=None)
-
-    N = 10
-    num_classes = 10
-    spectrogram_size = 161
-    # The commented values are the original sizes from the code
-    seq_length = 500  # 1343
-    target_length = 10  # 50
-    labels = torch.rand(num_classes, device=device)
-    inputs = torch.rand(N, 1, spectrogram_size, seq_length, device=device)
-    # Sequence length for each input
-    inputs_sizes = torch.rand(N, device=device).mul(seq_length * 0.1).add(seq_length * 0.8)
-    targets = torch.rand(N, target_length, device=device)
-    targets_sizes = torch.full((N,), target_length, dtype=torch.int, device=device)
-
-    model = models.DeepSpeech(rnn_type=nn.LSTM, labels=labels, rnn_hidden_size=1024, nb_layers=5,
-                              audio_conf=audio_conf, bidirectional=True)
-    model = model.to(device)
-    criterion = nn.CTCLoss()
-    params, names = extract_weights(model)
-
-    def forward(*new_params: Tensor) -> Tensor:
-        load_weights(model, names, new_params)
-        out, out_sizes = model(inputs, inputs_sizes)
-        out = out.transpose(0, 1)  # For ctc loss
-
-        loss = criterion(out, targets, out_sizes, targets_sizes)
-        return loss
-
-    return forward, params
-
-def get_transformer(device: torch.device) -> GetterReturnType:
-    # For most SOTA research, you would want embed_dim = 720, nhead = 12, bsz = 64, and tgt_len/src_len = 128.
- N = 64 - seq_length = 128 - ntoken = 50 - model = models.TransformerModel(ntoken=ntoken, ninp=720, nhead=12, nhid=2048, nlayers=2) - model.to(device) - criterion = nn.NLLLoss() - params, names = extract_weights(model) - - data = torch.rand(N, seq_length + 1, device=device).mul(ntoken).long() - inputs = data.narrow(1, 0, seq_length) - targets = data.narrow(1, 1, seq_length) - - def forward(*new_params: Tensor) -> Tensor: - load_weights(model, names, new_params) - out = model(inputs) - - loss = criterion(out.reshape(N * seq_length, ntoken), targets.reshape(N * seq_length)) - return loss - - return forward, params - -def get_multiheadattn(device: torch.device) -> GetterReturnType: - # From https://github.com/pytorch/text/blob/master/test/data/test_modules.py#L10 - embed_dim, nhead, tgt_len, src_len, bsz = 10, 5, 6, 10, 64 - # Build torchtext MultiheadAttention module - in_proj = models.InProjContainer(torch.nn.Linear(embed_dim, embed_dim, bias=False), - torch.nn.Linear(embed_dim, embed_dim, bias=False), - torch.nn.Linear(embed_dim, embed_dim, bias=False)) - - model = models.MultiheadAttentionContainer(nhead, in_proj, - models.ScaledDotProduct(), - torch.nn.Linear(embed_dim, embed_dim, bias=False)) - model.to(device) - params, names = extract_weights(model) - - query = torch.rand((tgt_len, bsz, embed_dim), device=device) - key = value = torch.rand((src_len, bsz, embed_dim), device=device) - attn_mask_2D = torch.randint(0, 2, (tgt_len, src_len), device=device).to(torch.bool) - bias_k = bias_v = torch.rand((1, 1, embed_dim), device=device) - - attn_mask = torch.stack([attn_mask_2D] * (bsz * nhead)) - bias_k = bias_k.repeat(1, bsz, 1).reshape(1, bsz * nhead, -1) - bias_v = bias_v.repeat(1, bsz, 1).reshape(1, bsz * nhead, -1) - - def forward(*new_params: Tensor) -> Tensor: - load_weights(model, names, new_params) - mha_output, attn_weights = model(query, key, value, attn_mask=attn_mask, bias_k=bias_k, bias_v=bias_v) - - # Don't test any specific loss, just backprop ones for both outputs - loss = mha_output.sum() + attn_weights.sum() - - return loss - - return forward, params diff --git a/benchmarks/functional_autograd_benchmark/compare.py b/benchmarks/functional_autograd_benchmark/compare.py deleted file mode 100644 index c2c4ef6c95d..00000000000 --- a/benchmarks/functional_autograd_benchmark/compare.py +++ /dev/null @@ -1,45 +0,0 @@ -import argparse -from collections import defaultdict - -from utils import to_markdown_table, from_markdown_table - -def main(): - parser = argparse.ArgumentParser("Main script to compare results from the benchmarks") - parser.add_argument("--before", type=str, default="before.txt", help="Text file containing the times to use as base") - parser.add_argument("--after", type=str, default="after.txt", help="Text file containing the times to use as new version") - parser.add_argument("--output", type=str, default="", help="Text file where to write the output") - args = parser.parse_args() - - with open(args.before, "r") as f: - content = f.read() - res_before = from_markdown_table(content) - - with open(args.after, "r") as f: - content = f.read() - res_after = from_markdown_table(content) - - diff = defaultdict(defaultdict) - for model in res_before: - for task in res_before[model]: - mean_before, var_before = res_before[model][task] - if task not in res_after[model]: - diff[model][task] = (None, mean_before, var_before, None, None) - else: - mean_after, var_after = res_after[model][task] - diff[model][task] = (mean_before / mean_after, mean_before, var_before, 
mean_after, var_after) - for model in res_after: - for task in res_after[model]: - if task not in res_before[model]: - mean_after, var_after = res_after[model][task] - diff[model][task] = (None, None, None, mean_after, var_after) - - header = ("model", "task", "speedup", "mean (before)", "var (before)", "mean (after)", "var (after)") - out = to_markdown_table(diff, header=header) - - print(out) - if args.output: - with open(args.output, "w") as f: - f.write(out) - -if __name__ == "__main__": - main() diff --git a/benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py b/benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py deleted file mode 100644 index 3eeda15f1af..00000000000 --- a/benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py +++ /dev/null @@ -1,153 +0,0 @@ -import torch -from torch.autograd import functional - -import time -from argparse import ArgumentParser -from collections import defaultdict -from typing import NamedTuple, Callable, List, Any - -import ppl_models -import vision_models -import audio_text_models - -from utils import to_markdown_table, TimingResultType, InputsType, GetterType, VType - -# Listing of the different tasks -FAST_TASKS_NO_DOUBLE_BACK = [ - "vjp", -] - -FAST_TASKS = FAST_TASKS_NO_DOUBLE_BACK + [ - "vhp", - "jvp", -] - -ALL_TASKS = FAST_TASKS + [ - "hvp", - "jacobian", - "hessian" -] - -DOUBLE_BACKWARD_TASKS = ["jvp", "hvp", "vhp", "hessian"] - -# Model definition which contains: -# - name: a string with the model name. -# - getter: a function to get the model. It takes as input the device on which the model -# will run. It should return the forward function and the parameters (Tensors) used as -# input for the forward function. Note that the forward must *not* have any side effect. -# - tasks: the list of recommended tasks that can run in a reasonable amount of time with this model. -# - unsupported: the list of tasks that this model cannot run. 
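-#
-# For illustration, a getter typically looks like the following sketch (hypothetical:
-# `MyModel` and its input shape are made up; `extract_weights`/`load_weights` are the
-# helpers from utils.py used by the real getters):
-#
-#     def get_my_model(device: torch.device) -> GetterReturnType:
-#         model = MyModel().to(device)
-#         params, names = extract_weights(model)
-#         inputs = torch.rand(2, 3, device=device)
-#
-#         def forward(*new_params: Tensor) -> Tensor:
-#             load_weights(model, names, new_params)
-#             # Reduce to a scalar so every task (vjp, hessian, ...) is well defined
-#             return model(inputs).sum()
-#
-#         return forward, params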
-class ModelDef(NamedTuple):
-    name: str
-    getter: GetterType
-    tasks: List[str]
-    unsupported: List[str]
-
-MODELS = [
-    ModelDef("resnet18", vision_models.get_resnet18, FAST_TASKS, []),
-    ModelDef("fcn_resnet", vision_models.get_fcn_resnet, FAST_TASKS, []),
-    ModelDef("detr", vision_models.get_detr, FAST_TASKS, []),
-    ModelDef("ppl_simple_reg", ppl_models.get_simple_regression, ALL_TASKS, []),
-    ModelDef("ppl_robust_reg", ppl_models.get_robust_regression, ALL_TASKS, []),
-    ModelDef("wav2letter", audio_text_models.get_wav2letter, FAST_TASKS, []),
-    ModelDef("deepspeech", audio_text_models.get_deepspeech, FAST_TASKS_NO_DOUBLE_BACK, DOUBLE_BACKWARD_TASKS),
-    ModelDef("transformer", audio_text_models.get_transformer, FAST_TASKS, []),
-    ModelDef("multiheadattn", audio_text_models.get_multiheadattn, FAST_TASKS, []),
-]
-
-def get_v_for(model: Callable, inp: InputsType, task: str) -> VType:
-    v: VType
-
-    if task in ["vjp"]:
-        out = model(*inp)
-        v = torch.rand_like(out)
-    elif task in ["jvp", "hvp", "vhp"]:
-        if isinstance(inp, tuple):
-            v = tuple(torch.rand_like(i) for i in inp)
-        else:
-            v = torch.rand_like(inp)
-    else:
-        v = None
-
-    return v
-
-def run_once(model: Callable, inp: InputsType, task: str, v: VType) -> None:
-    func = getattr(functional, task)
-
-    if v is not None:
-        res = func(model, inp, v=v, strict=True)
-    else:
-        res = func(model, inp, strict=True)
-
-def run_model(model_getter: GetterType, args: Any, task: str) -> List[float]:
-    if args.gpu == -1:
-        device = torch.device("cpu")
-
-        def noop():
-            pass
-        do_sync = noop
-    else:
-        device = torch.device("cuda:{}".format(args.gpu))
-        do_sync = torch.cuda.synchronize
-
-    model, inp = model_getter(device)
-
-    v = get_v_for(model, inp, task)
-    # Warmup
-    run_once(model, inp, task, v)
-
-    elapsed = []
-    for it in range(args.num_iters):
-        do_sync()
-        start = time.time()
-        run_once(model, inp, task, v)
-        do_sync()
-        elapsed.append(time.time() - start)
-
-    return elapsed
-
-def main():
-    parser = ArgumentParser("Main script to benchmark the functional API of autograd.")
-    parser.add_argument("--output", type=str, default="", help="Text file where to write the output")
-    parser.add_argument("--num-iters", type=int, default=10)
-    parser.add_argument("--gpu", type=int, default=-2, help="GPU to use, -1 for CPU and -2 for auto-detect")
-    parser.add_argument("--run-slow-tasks", action="store_true", help="Run even the slow tasks")
-    parser.add_argument("--model-filter", type=str, default="", help="Only run the models in this filter")
-    parser.add_argument("--task-filter", type=str, default="", help="Only run the tasks in this filter")
-    parser.add_argument("--num-threads", type=int, default=10,
-                        help="Number of concurrent threads to use when running on cpu")
-    parser.add_argument("--seed", type=int, default=0, help="The random seed to use.")
-    args = parser.parse_args()
-
-    results: TimingResultType = defaultdict(defaultdict)
-    torch.set_num_threads(args.num_threads)
-    torch.set_num_interop_threads(args.num_threads)
-
-    # This automatically seeds cuda if it is available
-    torch.manual_seed(args.seed)
-
-    if args.gpu == -2:
-        args.gpu = 0 if torch.cuda.is_available() else -1
-
-    for name, model_getter, recommended_tasks, unsupported_tasks in MODELS:
-        if args.model_filter and name not in args.model_filter:
-            continue
-        tasks = ALL_TASKS if args.run_slow_tasks else recommended_tasks
-        for task in tasks:
-            if task in unsupported_tasks:
-                continue
-            if args.task_filter and task not in args.task_filter:
-                continue
-            runtimes = run_model(model_getter, args, task)
-
-            runtimes = torch.tensor(runtimes)
-            mean, var = runtimes.mean(), runtimes.var()
-            results[name][task] = (mean.item(), var.item())
-            print("Results for model {} on task {}: {}s (var: {})".format(name, task, mean, var))
-
-    if args.output:
-        with open(args.output, "w") as f:
-            f.write(to_markdown_table(results))
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmarks/functional_autograd_benchmark/ppl_models.py b/benchmarks/functional_autograd_benchmark/ppl_models.py
deleted file mode 100644
index 906ebac5d41..00000000000
--- a/benchmarks/functional_autograd_benchmark/ppl_models.py
+++ /dev/null
@@ -1,93 +0,0 @@
-import torch
-from torch import Tensor
-import torch.distributions as dist
-
-from utils import GetterReturnType
-
-def get_simple_regression(device: torch.device) -> GetterReturnType:
-    N = 10
-    K = 10
-
-    loc_beta = 0.
-    scale_beta = 1.
-
-    beta_prior = dist.Normal(loc_beta, scale_beta)
-
-    X = torch.rand(N, K + 1, device=device)
-    Y = torch.rand(N, 1, device=device)
-
-    # X.shape: (N, K + 1), Y.shape: (N, 1), beta_value.shape: (K + 1, 1)
-    beta_value = beta_prior.sample((K + 1, 1))
-    beta_value.requires_grad_(True)
-
-    def forward(beta_value: Tensor) -> Tensor:
-        mu = X.mm(beta_value)
-
-        # We need to compute the first and second gradient of this score with respect
-        # to beta_value.
-        score = dist.Bernoulli(logits=mu).log_prob(Y).sum() + beta_prior.log_prob(beta_value).sum()
-        return score
-
-    return forward, (beta_value.to(device),)
-
-
-def get_robust_regression(device: torch.device) -> GetterReturnType:
-    N = 10
-    K = 10
-
-    # X.shape: (N, K + 1), Y.shape: (N, 1)
-    X = torch.rand(N, K + 1, device=device)
-    Y = torch.rand(N, 1, device=device)
-
-    # Predefined nu_alpha and nu_beta, nu_alpha.shape: (1, 1), nu_beta.shape: (1, 1)
-    nu_alpha = torch.randn(1, 1, device=device)
-    nu_beta = torch.rand(1, 1, device=device)
-    nu = dist.Gamma(nu_alpha, nu_beta)
-
-    # Predefined sigma_rate: sigma_rate.shape: (N, 1)
-    sigma_rate = torch.rand(N, 1, device=device)
-    sigma = dist.Exponential(sigma_rate)
-
-    # Predefined beta_mean and beta_sigma: beta_mean.shape: (K + 1, 1), beta_sigma.shape: (K + 1, 1)
-    beta_mean = torch.rand(K + 1, 1, device=device)
-    beta_sigma = torch.rand(K + 1, 1, device=device)
-    beta = dist.Normal(beta_mean, beta_sigma)
-
-    nu_value = nu.sample()
-    nu_value.requires_grad_(True)
-
-    sigma_value = sigma.sample()
-    sigma_unconstrained_value = sigma_value.log()
-    sigma_unconstrained_value.requires_grad_(True)
-
-    beta_value = beta.sample()
-    beta_value.requires_grad_(True)
-
-    def forward(nu_value: Tensor, sigma_unconstrained_value: Tensor, beta_value: Tensor) -> Tensor:
-        sigma_constrained_value = sigma_unconstrained_value.exp()
-        mu = X.mm(beta_value)
-
-        # For this model, we need to compute the following three scores:
-        # We need to compute the first and second gradient of this score with respect
-        # to nu_value.
-        nu_score = dist.StudentT(nu_value, mu, sigma_constrained_value).log_prob(Y).sum() \
-            + nu.log_prob(nu_value)
-
-        # We need to compute the first and second gradient of this score with respect
-        # to sigma_unconstrained_value.
-        sigma_score = dist.StudentT(nu_value, mu, sigma_constrained_value).log_prob(Y).sum() \
-            + sigma.log_prob(sigma_constrained_value) \
-            + sigma_unconstrained_value
-
-        # We need to compute the first and second gradient of this score with respect
-        # to beta_value.
-        beta_score = dist.StudentT(nu_value, mu, sigma_constrained_value).log_prob(Y).sum() \
-            + beta.log_prob(beta_value)
-
-        return nu_score.sum() + sigma_score.sum() + beta_score.sum()
-
-    return forward, (nu_value.to(device), sigma_unconstrained_value.to(device), beta_value.to(device))
diff --git a/benchmarks/functional_autograd_benchmark/torchaudio_models.py b/benchmarks/functional_autograd_benchmark/torchaudio_models.py
deleted file mode 100644
index 1e4cc747b0f..00000000000
--- a/benchmarks/functional_autograd_benchmark/torchaudio_models.py
+++ /dev/null
@@ -1,556 +0,0 @@
-# Taken from https://github.com/pytorch/audio/blob/master/torchaudio/models/wav2letter.py
-# So that we don't need torchaudio to be installed
-
-import torch
-from torch import Tensor
-from torch import nn
-import torch.nn.functional as F
-
-import math
-from collections import OrderedDict
-from typing import Tuple, Optional
-
-__all__ = ["Wav2Letter"]
-
-
-class Wav2Letter(nn.Module):
-    r"""Wav2Letter model architecture from the `"Wav2Letter: an End-to-End ConvNet-based
-    Speech Recognition System" <https://arxiv.org/abs/1609.03193>`_ paper.
-    :math:`\text{padding} = \frac{\text{ceil}(\text{kernel} - \text{stride})}{2}`
-    Args:
-        num_classes (int, optional): Number of classes to be classified. (Default: ``40``)
-        input_type (str, optional): Wav2Letter can use as input: ``waveform``, ``power_spectrum``
-            or ``mfcc`` (Default: ``waveform``).
-        num_features (int, optional): Number of input features that the network will receive (Default: ``1``).
-    """
-
-    def __init__(self, num_classes: int = 40,
-                 input_type: str = "waveform",
-                 num_features: int = 1) -> None:
-        super(Wav2Letter, self).__init__()
-
-        acoustic_num_features = 250 if input_type == "waveform" else num_features
-        acoustic_model = nn.Sequential(
-            nn.Conv1d(in_channels=acoustic_num_features, out_channels=250, kernel_size=48, stride=2, padding=23),
-            nn.ReLU(inplace=True),
-            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
-            nn.ReLU(inplace=True),
-            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
-            nn.ReLU(inplace=True),
-            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
-            nn.ReLU(inplace=True),
-            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
-            nn.ReLU(inplace=True),
-            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
-            nn.ReLU(inplace=True),
-            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
-            nn.ReLU(inplace=True),
-            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
-            nn.ReLU(inplace=True),
-            nn.Conv1d(in_channels=250, out_channels=2000, kernel_size=32, stride=1, padding=16),
-            nn.ReLU(inplace=True),
-            nn.Conv1d(in_channels=2000, out_channels=2000, kernel_size=1, stride=1, padding=0),
-            nn.ReLU(inplace=True),
-            nn.Conv1d(in_channels=2000, out_channels=num_classes, kernel_size=1, stride=1, padding=0),
-            nn.ReLU(inplace=True)
-        )
-
-        if input_type == "waveform":
-            waveform_model = nn.Sequential(
-                nn.Conv1d(in_channels=num_features, out_channels=250, kernel_size=250, stride=160, padding=45),
-                nn.ReLU(inplace=True)
-            )
-            self.acoustic_model = nn.Sequential(waveform_model, acoustic_model)
-
-        if input_type in ["power_spectrum", "mfcc"]:
-            self.acoustic_model = acoustic_model
-
-    def forward(self, x: Tensor) -> Tensor:
-        r"""
-        Args:
-            x (Tensor): Tensor of dimension (batch_size, num_features, input_length).
-        Returns:
-            Tensor: Predictor tensor of dimension (batch_size, number_of_classes, input_length).
-        """
-
-        x = self.acoustic_model(x)
-        x = nn.functional.log_softmax(x, dim=1)
-        return x
-
-# Taken from https://github.com/SeanNaren/deepspeech.pytorch with modifications
-class SequenceWise(nn.Module):
-    def __init__(self, module):
-        """
-        Collapses input of dim T*N*H to (T*N)*H, and applies to a module.
-        Allows handling of variable sequence lengths and minibatch sizes.
-        :param module: Module to apply input to.
-        """
-        super(SequenceWise, self).__init__()
-        self.module = module
-
-    def forward(self, x):
-        t, n = x.size(0), x.size(1)
-        x = x.view(t * n, -1)
-        x = self.module(x)
-        x = x.view(t, n, -1)
-        return x
-
-    def __repr__(self):
-        tmpstr = self.__class__.__name__ + ' (\n'
-        tmpstr += self.module.__repr__()
-        tmpstr += ')'
-        return tmpstr
-
-
-class MaskConv(nn.Module):
-    def __init__(self, seq_module):
-        """
-        Adds padding to the output of the module based on the given lengths. This is to ensure that the
-        results of the model do not change when batch sizes change during inference.
-        Input needs to be in the shape of (BxCxDxT)
-        :param seq_module: The sequential module containing the conv stack.
-        """
-        super(MaskConv, self).__init__()
-        self.seq_module = seq_module
-
-    def forward(self, x, lengths):
-        """
-        :param x: The input of size BxCxDxT
-        :param lengths: The actual length of each sequence in the batch
-        :return: Masked output from the module
-        """
-        for module in self.seq_module:
-            x = module(x)
-            mask = torch.BoolTensor(x.size()).fill_(0)
-            if x.is_cuda:
-                mask = mask.cuda()
-            for i, length in enumerate(lengths):
-                length = length.item()
-                if (mask[i].size(2) - length) > 0:
-                    mask[i].narrow(2, length, mask[i].size(2) - length).fill_(1)
-            x = x.masked_fill(mask, 0)
-        return x, lengths
-
-
-class InferenceBatchSoftmax(nn.Module):
-    def forward(self, input_):
-        if not self.training:
-            return F.softmax(input_, dim=-1)
-        else:
-            return input_
-
-
-class BatchRNN(nn.Module):
-    def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, bidirectional=False, batch_norm=True):
-        super(BatchRNN, self).__init__()
-        self.input_size = input_size
-        self.hidden_size = hidden_size
-        self.bidirectional = bidirectional
-        self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None
-        self.rnn = rnn_type(input_size=input_size, hidden_size=hidden_size,
-                            bidirectional=bidirectional, bias=True)
-        self.num_directions = 2 if bidirectional else 1
-
-    def flatten_parameters(self):
-        self.rnn.flatten_parameters()
-
-    def forward(self, x, output_lengths):
-        if self.batch_norm is not None:
-            x = self.batch_norm(x)
-        x = nn.utils.rnn.pack_padded_sequence(x, output_lengths, enforce_sorted=False)
-        x, h = self.rnn(x)
-        x, _ = nn.utils.rnn.pad_packed_sequence(x)
-        if self.bidirectional:
-            x = x.view(x.size(0), x.size(1), 2, -1).sum(2).view(x.size(0), x.size(1), -1)  # (TxNxH*2) -> (TxNxH) by sum
-        return x
-
-
-class Lookahead(nn.Module):
-    # Wang et al 2016 - Lookahead Convolution Layer for Unidirectional Recurrent Neural Networks
-    # input shape - sequence, batch, feature - TxNxH
-    # output shape - same as input
-    def __init__(self, n_features, context):
-        super(Lookahead, self).__init__()
-        assert context > 0
-        self.context = context
-        self.n_features = n_features
-        self.pad = (0, self.context - 1)
-        self.conv = nn.Conv1d(self.n_features, self.n_features, kernel_size=self.context, stride=1,
-                              groups=self.n_features, padding=0, bias=None)
-
-    def forward(self, x):
-        x = x.transpose(0, 1).transpose(1, 2)
-        x = F.pad(x, pad=self.pad, value=0)
-        x = self.conv(x)
-        x = x.transpose(1, 2).transpose(0, 1).contiguous()
-        return x
-
-    def __repr__(self):
-        return self.__class__.__name__ + '(' \
-            + 'n_features=' + str(self.n_features) \
-            + ', context=' + str(self.context) + ')'
-
-class DeepSpeech(nn.Module):
-    def __init__(self, rnn_type, labels, rnn_hidden_size, nb_layers, audio_conf,
-                 bidirectional, context=20):
-        super(DeepSpeech, self).__init__()
-
-        self.hidden_size = rnn_hidden_size
-        self.hidden_layers = nb_layers
-        self.rnn_type = rnn_type
-        self.audio_conf = audio_conf
-        self.labels = labels
-        self.bidirectional = bidirectional
-
-        sample_rate = self.audio_conf["sample_rate"]
-        window_size = self.audio_conf["window_size"]
-        num_classes = len(self.labels)
-
-        self.conv = MaskConv(nn.Sequential(
-            nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(20, 5)),
-            nn.BatchNorm2d(32),
-            nn.Hardtanh(0, 20, inplace=True),
-            nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1), padding=(10, 5)),
-            nn.BatchNorm2d(32),
-            nn.Hardtanh(0, 20, inplace=True)
-        ))
-        # Based on the above convolutions and the spectrogram size, using the conv formula (W - F + 2P)/S + 1
-        rnn_input_size = int(math.floor((sample_rate * window_size) / 2) + 1)
-        rnn_input_size = int(math.floor(rnn_input_size + 2 * 20 - 41) / 2 + 1)
-        rnn_input_size = int(math.floor(rnn_input_size + 2 * 10 - 21) / 2 + 1)
-        rnn_input_size *= 32
-
-        rnns = []
-        rnn = BatchRNN(input_size=rnn_input_size, hidden_size=rnn_hidden_size, rnn_type=rnn_type,
-                       bidirectional=bidirectional, batch_norm=False)
-        rnns.append(('0', rnn))
-        for x in range(nb_layers - 1):
-            rnn = BatchRNN(input_size=rnn_hidden_size, hidden_size=rnn_hidden_size, rnn_type=rnn_type,
-                           bidirectional=bidirectional)
-            rnns.append(('%d' % (x + 1), rnn))
-        self.rnns = nn.Sequential(OrderedDict(rnns))
-        self.lookahead = nn.Sequential(
-            # consider adding batch norm?
-            Lookahead(rnn_hidden_size, context=context),
-            nn.Hardtanh(0, 20, inplace=True)
-        ) if not bidirectional else None
-
-        fully_connected = nn.Sequential(
-            nn.BatchNorm1d(rnn_hidden_size),
-            nn.Linear(rnn_hidden_size, num_classes, bias=False)
-        )
-        self.fc = nn.Sequential(
-            SequenceWise(fully_connected),
-        )
-        self.inference_softmax = InferenceBatchSoftmax()
-
-    def forward(self, x, lengths):
-        lengths = lengths.cpu().int()
-        output_lengths = self.get_seq_lens(lengths)
-        x, _ = self.conv(x, output_lengths)
-
-        sizes = x.size()
-        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])  # Collapse feature dimension
-        x = x.transpose(1, 2).transpose(0, 1).contiguous()  # TxNxH
-
-        for rnn in self.rnns:
-            x = rnn(x, output_lengths)
-
-        if not self.bidirectional:  # no need for lookahead layer in bidirectional
-            x = self.lookahead(x)
-
-        x = self.fc(x)
-        x = x.transpose(0, 1)
-        # identity in training mode, softmax in eval mode
-        x = self.inference_softmax(x)
-        return x, output_lengths
-
-    def get_seq_lens(self, input_length):
-        """
-        Given a 1D Tensor or Variable containing integer sequence lengths, return a 1D tensor or variable
-        containing the sizes of the sequences that will be output by the network.
-        :param input_length: 1D Tensor
-        :return: 1D Tensor scaled by model
-        """
-        seq_len = input_length
-        for m in self.conv.modules():
-            if type(m) == nn.modules.conv.Conv2d:
-                seq_len = seq_len + 2 * m.padding[1] - m.dilation[1] * (m.kernel_size[1] - 1) - 1
-                seq_len = seq_len.true_divide(m.stride[1]) + 1
-        return seq_len.int()
-
-# Taken from https://github.com/pytorch/examples/blob/master/word_language_model/model.py#L108-L152
-class PositionalEncoding(nn.Module):
-    r"""Inject some information about the relative or absolute position of the tokens
-    in the sequence. The positional encodings have the same dimension as
-    the embeddings, so that the two can be summed. Here, we use sine and cosine
-    functions of different frequencies.
-    .. math::
-        \text{PosEncoder}(pos, 2i) = \sin(pos / 10000^{2i / d_{\text{model}}})
-        \text{PosEncoder}(pos, 2i + 1) = \cos(pos / 10000^{2i / d_{\text{model}}})
-    where pos is the word position and i is the embed idx.
-    Args:
-        d_model: the embed dim (required).
-        dropout: the dropout value (default=0.1).
-        max_len: the max. length of the incoming sequence (default=5000).
-    Examples:
-        >>> pos_encoder = PositionalEncoding(d_model)
-    """
-
-    def __init__(self, d_model, dropout=0.1, max_len=5000):
-        super(PositionalEncoding, self).__init__()
-        self.dropout = nn.Dropout(p=dropout)
-
-        pe = torch.zeros(max_len, d_model)
-        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
-        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
-        pe[:, 0::2] = torch.sin(position * div_term)
-        pe[:, 1::2] = torch.cos(position * div_term)
-        pe = pe.unsqueeze(0).transpose(0, 1)
-        self.register_buffer('pe', pe)
-
-    def forward(self, x):
-        r"""Inputs of forward function
-        Args:
-            x: the sequence fed to the positional encoder model (required).
- Shape: - x: [sequence length, batch size, embed dim] - output: [sequence length, batch size, embed dim] - Examples: - >>> output = pos_encoder(x) - """ - - x = x + self.pe[:x.size(0), :] - return self.dropout(x) - -class TransformerModel(nn.Module): - """Container module with an encoder, a recurrent or transformer module, and a decoder.""" - - def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5): - super(TransformerModel, self).__init__() - try: - from torch.nn import TransformerEncoder, TransformerEncoderLayer - except Exception: - raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.') - self.model_type = 'Transformer' - self.src_mask = None - self.pos_encoder = PositionalEncoding(ninp, dropout) - encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout) - self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers) - self.encoder = nn.Embedding(ntoken, ninp) - self.ninp = ninp - self.decoder = nn.Linear(ninp, ntoken) - - self.init_weights() - - def _generate_square_subsequent_mask(self, sz): - mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) - mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) - return mask - - def init_weights(self): - initrange = 0.1 - nn.init.uniform_(self.encoder.weight, -initrange, initrange) - # Not sure how this works in the original code - # nn.init.zeros_(self.decoder) - nn.init.uniform_(self.decoder.weight, -initrange, initrange) - - def forward(self, src, has_mask=True): - if has_mask: - device = src.device - # This will be created once during warmup - if self.src_mask is None or self.src_mask.size(0) != len(src): - mask = self._generate_square_subsequent_mask(len(src)).to(device) - self.src_mask = mask - else: - self.src_mask = None - - src = self.encoder(src) * math.sqrt(self.ninp) - src = self.pos_encoder(src) - output = self.transformer_encoder(src, self.src_mask) - output = self.decoder(output) - return F.log_softmax(output, dim=-1) - -# From https://github.com/pytorch/text/blob/master/torchtext/modules -class MultiheadAttentionContainer(torch.nn.Module): - def __init__(self, nhead, in_proj_container, attention_layer, out_proj): - r""" A multi-head attention container - Args: - nhead: the number of heads in the multiheadattention model - in_proj_container: A container of multi-head in-projection linear layers (a.k.a nn.Linear). - attention_layer: The attention layer. - out_proj: The multi-head out-projection layer (a.k.a nn.Linear). 
-        Examples::
-            >>> import torch
-            >>> embed_dim, num_heads, bsz = 10, 5, 64
-            >>> in_proj_container = InProjContainer(torch.nn.Linear(embed_dim, embed_dim),
-                                                    torch.nn.Linear(embed_dim, embed_dim),
-                                                    torch.nn.Linear(embed_dim, embed_dim))
-            >>> MHA = MultiheadAttentionContainer(num_heads,
-                                                  in_proj_container,
-                                                  ScaledDotProduct(),
-                                                  torch.nn.Linear(embed_dim, embed_dim))
-            >>> query = torch.rand((21, bsz, embed_dim))
-            >>> key = value = torch.rand((16, bsz, embed_dim))
-            >>> attn_output, attn_weights = MHA(query, key, value)
-            >>> print(attn_output.shape)
-            torch.Size([21, 64, 10])
-        """
-        super(MultiheadAttentionContainer, self).__init__()
-        self.nhead = nhead
-        self.in_proj_container = in_proj_container
-        self.attention_layer = attention_layer
-        self.out_proj = out_proj
-
-    def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
-                attn_mask: Optional[torch.Tensor] = None,
-                bias_k: Optional[torch.Tensor] = None,
-                bias_v: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor]:
-        r"""
-        Args:
-            query, key, value (Tensor): map a query and a set of key-value pairs to an output.
-                See "Attention Is All You Need" for more details.
-            attn_mask, bias_k and bias_v (Tensor, optional): keyword arguments passed to the attention layer.
-                See the definitions in the attention layer.
-        Shape:
-            - Inputs:
-            - query: :math:`(L, N, E)`
-            - key: :math:`(S, N, E)`
-            - value: :math:`(S, N, E)`
-            - attn_mask, bias_k and bias_v: the same as the shape of the corresponding args in the attention layer.
-            - Outputs:
-            - attn_output: :math:`(L, N, E)`
-            - attn_output_weights: :math:`(N * H, L, S)`
-            where L is the target length, S is the sequence length, H is the number of attention heads,
-            N is the batch size, and E is the embedding dimension.
-        """
-        tgt_len, src_len, bsz, embed_dim = query.size(-3), key.size(-3), query.size(-2), query.size(-1)
-        q, k, v = self.in_proj_container(query, key, value)
-        assert q.size(-1) % self.nhead == 0, "query's embed_dim must be divisible by the number of heads"
-        head_dim = q.size(-1) // self.nhead
-        q = q.reshape(tgt_len, bsz * self.nhead, head_dim)
-
-        assert k.size(-1) % self.nhead == 0, "key's embed_dim must be divisible by the number of heads"
-        head_dim = k.size(-1) // self.nhead
-        k = k.reshape(src_len, bsz * self.nhead, head_dim)
-
-        assert v.size(-1) % self.nhead == 0, "value's embed_dim must be divisible by the number of heads"
-        head_dim = v.size(-1) // self.nhead
-        v = v.reshape(src_len, bsz * self.nhead, head_dim)
-
-        attn_output, attn_output_weights = self.attention_layer(q, k, v, attn_mask=attn_mask,
-                                                                bias_k=bias_k, bias_v=bias_v)
-        attn_output = attn_output.reshape(tgt_len, bsz, embed_dim)
-        attn_output = self.out_proj(attn_output)
-        return attn_output, attn_output_weights
-
-
-class ScaledDotProduct(torch.nn.Module):
-
-    def __init__(self, dropout=0.0):
-        r"""Processes a projected query and key-value pair to apply
-        scaled dot product attention.
-        Args:
-            dropout (float): probability of dropping an attention weight.
-        Examples::
-            >>> SDP = torchtext.models.ScaledDotProduct(0.1)
-            >>> q = torch.randn(256, 21, 3)
-            >>> k = v = torch.randn(256, 21, 3)
-            >>> attn_output, attn_weights = SDP(q, k, v)
-            >>> print(attn_output.shape, attn_weights.shape)
-            torch.Size([256, 21, 3]) torch.Size([256, 21, 21])
-        """
-        super(ScaledDotProduct, self).__init__()
-        self.dropout = dropout
-
-    def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
-                attn_mask: Optional[torch.Tensor] = None,
-                bias_k: Optional[torch.Tensor] = None,
-                bias_v: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor]:
-        r"""Uses a scaled dot product with the projected key-value pair to update
-        the projected query.
-        Args:
-            query (Tensor): Projected query
-            key (Tensor): Projected key
-            value (Tensor): Projected value
-            attn_mask (BoolTensor, optional): 3D mask that prevents attention to certain positions.
-            bias_k and bias_v (Tensor, optional): one more key and value sequence to be added at
-                sequence dim (dim=-3). Those are used for incremental decoding. Users should provide
-                non-None values for both arguments in order to activate them.
-        Shape:
-            - query: :math:`(L, N * H, E / H)`
-            - key: :math:`(S, N * H, E / H)`
-            - value: :math:`(S, N * H, E / H)`
-            - attn_mask: :math:`(N * H, L, S)`, positions with ``True`` are not allowed to attend
-                while ``False`` values will be unchanged.
-            - bias_k and bias_v: :math:`(1, N * H, E / H)`
-            - Output: :math:`(L, N * H, E / H)`, :math:`(N * H, L, S)`
-            where L is the target length, S is the source length, H is the number
-            of attention heads, N is the batch size, and E is the embedding dimension.
-        """
-        if bias_k is not None and bias_v is not None:
-            assert key.size(-1) == bias_k.size(-1) and key.size(-2) == bias_k.size(-2) and bias_k.size(-3) == 1, \
-                "Shape of bias_k is not supported"
-            assert value.size(-1) == bias_v.size(-1) and value.size(-2) == bias_v.size(-2) and bias_v.size(-3) == 1, \
-                "Shape of bias_v is not supported"
-            key = torch.cat([key, bias_k])
-            value = torch.cat([value, bias_v])
-            if attn_mask is not None:
-                _attn_mask = attn_mask
-                attn_mask = torch.nn.functional.pad(_attn_mask, [0, 1])
-
-        tgt_len, head_dim = query.size(-3), query.size(-1)
-        assert query.size(-1) == key.size(-1) == value.size(-1), "The feature dim of query, key, value must be equal."
-        assert key.size() == value.size(), "Shape of key, value must match"
-        src_len = key.size(-3)
-        batch_heads = max(query.size(-2), key.size(-2))
-
-        # Scale query
-        query, key, value = query.transpose(-2, -3), key.transpose(-2, -3), value.transpose(-2, -3)
-        query = query * (float(head_dim) ** -0.5)
-        if attn_mask is not None:
-            if attn_mask.dim() != 3:
-                raise RuntimeError('attn_mask must be a 3D tensor.')
-            if (attn_mask.size(-1) != src_len) or (attn_mask.size(-2) != tgt_len) or \
-               (attn_mask.size(-3) != 1 and attn_mask.size(-3) != batch_heads):
-                raise RuntimeError('The size of the attn_mask is not correct.')
-            if attn_mask.dtype != torch.bool:
-                raise RuntimeError('Only bool tensor is supported for attn_mask')
-
-        # Dot product of q, k
-        attn_output_weights = torch.matmul(query, key.transpose(-2, -1))
-        if attn_mask is not None:
-            attn_output_weights.masked_fill_(attn_mask, -1e8,)
-        attn_output_weights = torch.nn.functional.softmax(attn_output_weights, dim=-1)
-        attn_output_weights = torch.nn.functional.dropout(attn_output_weights, p=self.dropout, training=self.training)
-        attn_output = torch.matmul(attn_output_weights, value)
-        return attn_output.transpose(-2, -3), attn_output_weights
-
-
-class InProjContainer(torch.nn.Module):
-    def __init__(self, query_proj, key_proj, value_proj):
-        r"""An in-proj container to process inputs.
-        Args:
-            query_proj: a proj layer for query.
-            key_proj: a proj layer for key.
-            value_proj: a proj layer for value.
-        """
-
-        super(InProjContainer, self).__init__()
-        self.query_proj = query_proj
-        self.key_proj = key_proj
-        self.value_proj = value_proj
-
-    def forward(self,
-                query: torch.Tensor,
-                key: torch.Tensor,
-                value: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        r"""Projects the input sequences using in-proj layers.
-        Args:
-            query, key, value (Tensors): sequences to be projected
-        Shape:
-            - query, key, value: :math:`(S, N, E)`
-            - Output: :math:`(S, N, E)`
-            where S is the sequence length, N is the batch size, and E is the embedding dimension.
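-        Examples::
-            >>> # A minimal sketch; the sizes below are illustrative, not from the benchmark
-            >>> embed_dim, bsz = 10, 64
-            >>> in_proj = InProjContainer(torch.nn.Linear(embed_dim, embed_dim),
-                                          torch.nn.Linear(embed_dim, embed_dim),
-                                          torch.nn.Linear(embed_dim, embed_dim))
-            >>> q, k, v = in_proj(torch.rand(5, bsz, embed_dim),
-                                  torch.rand(6, bsz, embed_dim),
-                                  torch.rand(6, bsz, embed_dim))
-            >>> print(q.shape, k.shape, v.shape)
-            torch.Size([5, 64, 10]) torch.Size([6, 64, 10]) torch.Size([6, 64, 10])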
- """ - return self.query_proj(query), self.key_proj(key), self.value_proj(value) diff --git a/benchmarks/functional_autograd_benchmark/torchvision_models.py b/benchmarks/functional_autograd_benchmark/torchvision_models.py deleted file mode 100644 index 25361af7766..00000000000 --- a/benchmarks/functional_autograd_benchmark/torchvision_models.py +++ /dev/null @@ -1,803 +0,0 @@ -# Taken from https://github.com/pytorch/vision -# So that we don't need torchvision to be installed -import torch -from torch import nn -from torch.nn import functional as F - -from torch.jit.annotations import Dict -from collections import OrderedDict - -try: - from scipy.optimize import linear_sum_assignment # type: ignore - scipy_available = True -except Exception: - scipy_available = False - -def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): - """3x3 convolution with padding""" - return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, - padding=dilation, groups=groups, bias=False, dilation=dilation) - - -def conv1x1(in_planes, out_planes, stride=1): - """1x1 convolution""" - return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) - - -class BasicBlock(nn.Module): - expansion = 1 - - def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, - base_width=64, dilation=1, norm_layer=None): - super(BasicBlock, self).__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2d - if groups != 1 or base_width != 64: - raise ValueError('BasicBlock only supports groups=1 and base_width=64') - if dilation > 1: - raise NotImplementedError("Dilation > 1 not supported in BasicBlock") - # Both self.conv1 and self.downsample layers downsample the input when stride != 1 - self.conv1 = conv3x3(inplanes, planes, stride) - self.bn1 = norm_layer(planes) - self.relu = nn.ReLU(inplace=True) - self.conv2 = conv3x3(planes, planes) - self.bn2 = norm_layer(planes) - self.downsample = downsample - self.stride = stride - - def forward(self, x): - identity = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - out = self.relu(out) - - return out - -class Bottleneck(nn.Module): - # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) - # while original implementation places the stride at the first 1x1 convolution(self.conv1) - # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385. - # This variant is also known as ResNet V1.5 and improves accuracy according to - # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. 
- - expansion = 4 - - def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, - base_width=64, dilation=1, norm_layer=None): - super(Bottleneck, self).__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2d - width = int(planes * (base_width / 64.)) * groups - # Both self.conv2 and self.downsample layers downsample the input when stride != 1 - self.conv1 = conv1x1(inplanes, width) - self.bn1 = norm_layer(width) - self.conv2 = conv3x3(width, width, stride, groups, dilation) - self.bn2 = norm_layer(width) - self.conv3 = conv1x1(width, planes * self.expansion) - self.bn3 = norm_layer(planes * self.expansion) - self.relu = nn.ReLU(inplace=True) - self.downsample = downsample - self.stride = stride - - def forward(self, x): - identity = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - out = self.relu(out) - - out = self.conv3(out) - out = self.bn3(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - out = self.relu(out) - - return out - -class ResNet(nn.Module): - - def __init__(self, block, layers, num_classes=1000, zero_init_residual=False, - groups=1, width_per_group=64, replace_stride_with_dilation=None, - norm_layer=None): - super(ResNet, self).__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2d - self._norm_layer = norm_layer - - self.inplanes = 64 - self.dilation = 1 - if replace_stride_with_dilation is None: - # each element in the tuple indicates if we should replace - # the 2x2 stride with a dilated convolution instead - replace_stride_with_dilation = [False, False, False] - if len(replace_stride_with_dilation) != 3: - raise ValueError("replace_stride_with_dilation should be None " - "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) - self.groups = groups - self.base_width = width_per_group - self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, - bias=False) - self.bn1 = norm_layer(self.inplanes) - self.relu = nn.ReLU(inplace=True) - self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - self.layer1 = self._make_layer(block, 64, layers[0]) - self.layer2 = self._make_layer(block, 128, layers[1], stride=2, - dilate=replace_stride_with_dilation[0]) - self.layer3 = self._make_layer(block, 256, layers[2], stride=2, - dilate=replace_stride_with_dilation[1]) - self.layer4 = self._make_layer(block, 512, layers[3], stride=2, - dilate=replace_stride_with_dilation[2]) - self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) - self.fc = nn.Linear(512 * block.expansion, num_classes) - - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') - elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - - # Zero-initialize the last BN in each residual branch, - # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
-        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
-        if zero_init_residual:
-            for m in self.modules():
-                if isinstance(m, Bottleneck):
-                    nn.init.constant_(m.bn3.weight, 0)
-                elif isinstance(m, BasicBlock):
-                    nn.init.constant_(m.bn2.weight, 0)
-
-    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
-        norm_layer = self._norm_layer
-        downsample = None
-        previous_dilation = self.dilation
-        if dilate:
-            self.dilation *= stride
-            stride = 1
-        if stride != 1 or self.inplanes != planes * block.expansion:
-            downsample = nn.Sequential(
-                conv1x1(self.inplanes, planes * block.expansion, stride),
-                norm_layer(planes * block.expansion),
-            )
-
-        layers = []
-        layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
-                            self.base_width, previous_dilation, norm_layer))
-        self.inplanes = planes * block.expansion
-        for _ in range(1, blocks):
-            layers.append(block(self.inplanes, planes, groups=self.groups,
-                                base_width=self.base_width, dilation=self.dilation,
-                                norm_layer=norm_layer))
-
-        return nn.Sequential(*layers)
-
-    def _forward_impl(self, x):
-        # See note [TorchScript super()]
-        x = self.conv1(x)
-        x = self.bn1(x)
-        x = self.relu(x)
-        x = self.maxpool(x)
-
-        x = self.layer1(x)
-        x = self.layer2(x)
-        x = self.layer3(x)
-        x = self.layer4(x)
-
-        x = self.avgpool(x)
-        x = torch.flatten(x, 1)
-        x = self.fc(x)
-
-        return x
-
-    def forward(self, x):
-        return self._forward_impl(x)
-
-def _resnet(arch, block, layers, pretrained, progress, **kwargs):
-    model = ResNet(block, layers, **kwargs)
-    # if pretrained:
-    #     state_dict = load_state_dict_from_url(model_urls[arch],
-    #                                           progress=progress)
-    #     model.load_state_dict(state_dict)
-    return model
-
-def resnet18(pretrained=False, progress=True, **kwargs):
-    r"""ResNet-18 model from
-    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/abs/1512.03385>`_
-    Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
-        progress (bool): If True, displays a progress bar of the download to stderr
-    """
-    return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress,
-                   **kwargs)
-
-def resnet50(pretrained=False, progress=True, **kwargs):
-    r"""ResNet-50 model from
-    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/abs/1512.03385>`_
-    Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
-        progress (bool): If True, displays a progress bar of the download to stderr
-    """
-    return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress,
-                   **kwargs)
-
-class IntermediateLayerGetter(nn.ModuleDict):
-    """
-    Module wrapper that returns intermediate layers from a model
-    It has a strong assumption that the modules have been registered
-    into the model in the same order as they are used.
-    This means that one should **not** reuse the same nn.Module
-    twice in the forward if you want this to work.
-    Additionally, it is only able to query submodules that are directly
-    assigned to the model. So if `model` is passed, `model.feature1` can
-    be returned, but not `model.feature1.layer2`.
-    Arguments:
-        model (nn.Module): model on which we will extract the features
-        return_layers (Dict[name, new_name]): a dict containing the names
-            of the modules for which the activations will be returned as
-            the key of the dict, and the value of the dict is the name
-            of the returned activation (which the user can specify).
-    Examples::
-        >>> m = torchvision.models.resnet18(pretrained=True)
-        >>> # extract layer1 and layer3, giving as names `feat1` and `feat2`
-        >>> new_m = torchvision.models._utils.IntermediateLayerGetter(m,
-        >>>     {'layer1': 'feat1', 'layer3': 'feat2'})
-        >>> out = new_m(torch.rand(1, 3, 224, 224))
-        >>> print([(k, v.shape) for k, v in out.items()])
-        >>> [('feat1', torch.Size([1, 64, 56, 56])),
-        >>>  ('feat2', torch.Size([1, 256, 14, 14]))]
-    """
-    _version = 2
-    __annotations__ = {
-        "return_layers": Dict[str, str],
-    }
-
-    def __init__(self, model, return_layers):
-        if not set(return_layers).issubset([name for name, _ in model.named_children()]):
-            raise ValueError("return_layers are not present in model")
-        orig_return_layers = return_layers
-        return_layers = {str(k): str(v) for k, v in return_layers.items()}
-        layers = OrderedDict()
-        for name, module in model.named_children():
-            layers[name] = module
-            if name in return_layers:
-                del return_layers[name]
-            if not return_layers:
-                break
-
-        super(IntermediateLayerGetter, self).__init__(layers)
-        self.return_layers = orig_return_layers
-
-    def forward(self, x):
-        out = OrderedDict()
-        for name, module in self.items():
-            x = module(x)
-            if name in self.return_layers:
-                out_name = self.return_layers[name]
-                out[out_name] = x
-        return out
-
-class _SimpleSegmentationModel(nn.Module):
-    __constants__ = ['aux_classifier']
-
-    def __init__(self, backbone, classifier, aux_classifier=None):
-        super(_SimpleSegmentationModel, self).__init__()
-        self.backbone = backbone
-        self.classifier = classifier
-        self.aux_classifier = aux_classifier
-
-    def forward(self, x):
-        input_shape = x.shape[-2:]
-        # contract: features is a dict of tensors
-        features = self.backbone(x)
-
-        result = OrderedDict()
-        x = features["out"]
-        x = self.classifier(x)
-        x = F.interpolate(x, size=input_shape, mode='bilinear', align_corners=False)
-        result["out"] = x
-
-        if self.aux_classifier is not None:
-            x = features["aux"]
-            x = self.aux_classifier(x)
-            x = F.interpolate(x, size=input_shape, mode='bilinear', align_corners=False)
-            result["aux"] = x
-
-        return result
-
-class FCN(_SimpleSegmentationModel):
-    """
-    Implements a Fully-Convolutional Network for semantic segmentation.
-    Arguments:
-        backbone (nn.Module): the network used to compute the features for the model.
-            The backbone should return an OrderedDict[Tensor], with the key being
-            "out" for the last feature map used, and "aux" if an auxiliary classifier
-            is used.
-        classifier (nn.Module): module that takes the "out" element returned from
-            the backbone and returns a dense prediction.
-        aux_classifier (nn.Module, optional): auxiliary classifier used during training
-    """
-    pass
-
-class FCNHead(nn.Sequential):
-    def __init__(self, in_channels, channels):
-        inter_channels = in_channels // 4
-        layers = [
-            nn.Conv2d(in_channels, inter_channels, 3, padding=1, bias=False),
-            nn.BatchNorm2d(inter_channels),
-            nn.ReLU(),
-            nn.Dropout(0.1),
-            nn.Conv2d(inter_channels, channels, 1)
-        ]
-
-        super(FCNHead, self).__init__(*layers)
-
-def _segm_resnet(name, backbone_name, num_classes, aux, pretrained_backbone=True):
-    # backbone = resnet.__dict__[backbone_name](
-    #     pretrained=pretrained_backbone,
-    #     replace_stride_with_dilation=[False, True, True])
-    # Hardcoded resnet 50
-    assert backbone_name == "resnet50"
-    backbone = resnet50(
-        pretrained=pretrained_backbone,
-        replace_stride_with_dilation=[False, True, True])
-
-    return_layers = {'layer4': 'out'}
-    if aux:
-        return_layers['layer3'] = 'aux'
-    backbone = IntermediateLayerGetter(backbone, return_layers=return_layers)
-
-    aux_classifier = None
-    if aux:
-        inplanes = 1024
-        aux_classifier = FCNHead(inplanes, num_classes)
-
-    model_map = {
-        # 'deeplabv3': (DeepLabHead, DeepLabV3),  # Not used
-        'fcn': (FCNHead, FCN),
-    }
-    inplanes = 2048
-    classifier = model_map[name][0](inplanes, num_classes)
-    base_model = model_map[name][1]
-
-    model = base_model(backbone, classifier, aux_classifier)
-    return model
-
-def _load_model(arch_type, backbone, pretrained, progress, num_classes, aux_loss, **kwargs):
-    if pretrained:
-        aux_loss = True
-    model = _segm_resnet(arch_type, backbone, num_classes, aux_loss, **kwargs)
-    # if pretrained:
-    #     arch = arch_type + '_' + backbone + '_coco'
-    #     model_url = model_urls[arch]
-    #     if model_url is None:
-    #         raise NotImplementedError('pretrained {} is not supported as of now'.format(arch))
-    #     else:
-    #         state_dict = load_state_dict_from_url(model_url, progress=progress)
-    #         model.load_state_dict(state_dict)
-    return model
-
-def fcn_resnet50(pretrained=False, progress=True,
-                 num_classes=21, aux_loss=None, **kwargs):
-    """Constructs a Fully-Convolutional Network model with a ResNet-50 backbone.
-    Args:
-        pretrained (bool): If True, returns a model pre-trained on COCO train2017 which
-            contains the same classes as Pascal VOC
-        progress (bool): If True, displays a progress bar of the download to stderr
-    """
-    return _load_model('fcn', 'resnet50', pretrained, progress, num_classes, aux_loss, **kwargs)
-
-
-# Taken from @fmassa example slides and https://github.com/facebookresearch/detr
-class DETR(nn.Module):
-    """
-    Demo DETR implementation.
-
-    Demo implementation of DETR in a minimal number of lines, with the
-    following differences wrt DETR in the paper:
-    * learned positional encoding (instead of sine)
-    * positional encoding is passed at input (instead of attention)
-    * fc bbox predictor (instead of MLP)
-    The model achieves ~40 AP on COCO val5k and runs at ~28 FPS on Tesla V100.
-    Only batch size 1 supported.
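-
-    A rough usage sketch (the class count and image size here are made up, not part
-    of the original demo):
-        >>> model = DETR(num_classes=91)
-        >>> out = model(torch.rand(1, 3, 800, 1066))
-        >>> out['pred_logits'].shape, out['pred_boxes'].shape
-        (torch.Size([1, 100, 92]), torch.Size([1, 100, 4]))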
- """ - def __init__(self, num_classes, hidden_dim=256, nheads=8, - num_encoder_layers=6, num_decoder_layers=6): - super().__init__() - - # create ResNet-50 backbone - self.backbone = resnet50() - del self.backbone.fc - - # create conversion layer - self.conv = nn.Conv2d(2048, hidden_dim, 1) - - # create a default PyTorch transformer - self.transformer = nn.Transformer( - hidden_dim, nheads, num_encoder_layers, num_decoder_layers) - - # prediction heads, one extra class for predicting non-empty slots - # note that in baseline DETR linear_bbox layer is 3-layer MLP - self.linear_class = nn.Linear(hidden_dim, num_classes + 1) - self.linear_bbox = nn.Linear(hidden_dim, 4) - - # output positional encodings (object queries) - self.query_pos = nn.Parameter(torch.rand(100, hidden_dim)) - - # spatial positional encodings - # note that in baseline DETR we use sine positional encodings - self.row_embed = nn.Parameter(torch.rand(50, hidden_dim // 2)) - self.col_embed = nn.Parameter(torch.rand(50, hidden_dim // 2)) - - def forward(self, inputs): - # propagate inputs through ResNet-50 up to avg-pool layer - x = self.backbone.conv1(inputs) - x = self.backbone.bn1(x) - x = self.backbone.relu(x) - x = self.backbone.maxpool(x) - - x = self.backbone.layer1(x) - x = self.backbone.layer2(x) - x = self.backbone.layer3(x) - x = self.backbone.layer4(x) - - # convert from 2048 to 256 feature planes for the transformer - h = self.conv(x) - - # construct positional encodings - H, W = h.shape[-2:] - pos = torch.cat([ - self.col_embed[:W].unsqueeze(0).repeat(H, 1, 1), - self.row_embed[:H].unsqueeze(1).repeat(1, W, 1), - ], dim=-1).flatten(0, 1).unsqueeze(1) - - # propagate through the transformer - # TODO (alband) Why this is not automatically broadcasted? (had to add the repeat) - f = pos + 0.1 * h.flatten(2).permute(2, 0, 1) - s = self.query_pos.unsqueeze(1) - s = s.expand(s.size(0), inputs.size(0), s.size(2)) - h = self.transformer(f, s).transpose(0, 1) - - # finally project transformer outputs to class labels and bounding boxes - return {'pred_logits': self.linear_class(h), - 'pred_boxes': self.linear_bbox(h).sigmoid()} - -def generalized_box_iou(boxes1, boxes2): - """ - Generalized IoU from https://giou.stanford.edu/ - The boxes should be in [x0, y0, x1, y1] format - Returns a [N, M] pairwise matrix, where N = len(boxes1) - and M = len(boxes2) - """ - # degenerate boxes gives inf / nan results - # so do an early check - assert (boxes1[:, 2:] >= boxes1[:, :2]).all() - assert (boxes2[:, 2:] >= boxes2[:, :2]).all() - iou, union = box_iou(boxes1, boxes2) - - lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) - rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) - - wh = (rb - lt).clamp(min=0) # [N,M,2] - area = wh[:, :, 0] * wh[:, :, 1] - - return iou - (area - union) / area - -def box_cxcywh_to_xyxy(x): - x_c, y_c, w, h = x.unbind(-1) - b = [(x_c - 0.5 * w), (y_c - 0.5 * h), - (x_c + 0.5 * w), (y_c + 0.5 * h)] - return torch.stack(b, dim=-1) - -def box_area(boxes): - """ - Computes the area of a set of bounding boxes, which are specified by its - (x1, y1, x2, y2) coordinates. - Arguments: - boxes (Tensor[N, 4]): boxes for which the area will be computed. 
- are expected to be in (x1, y1, x2, y2) format
- Returns:
- area (Tensor[N]): area for each box
- """
- return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
-
-# modified from torchvision to also return the union
-def box_iou(boxes1, boxes2):
- area1 = box_area(boxes1)
- area2 = box_area(boxes2)
-
- lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
- rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
-
- wh = (rb - lt).clamp(min=0) # [N,M,2]
- inter = wh[:, :, 0] * wh[:, :, 1] # [N,M]
-
- union = area1[:, None] + area2 - inter
-
- iou = inter / union
- return iou, union
-
-def is_dist_avail_and_initialized():
- return False
-
-def get_world_size():
- if not is_dist_avail_and_initialized():
- return 1
- return torch.distributed.get_world_size()
-
-@torch.no_grad()
-def accuracy(output, target, topk=(1,)):
- """Computes the precision@k for the specified values of k"""
- if target.numel() == 0:
- return [torch.zeros([], device=output.device)]
- maxk = max(topk)
- batch_size = target.size(0)
-
- _, pred = output.topk(maxk, 1, True, True)
- pred = pred.t()
- correct = pred.eq(target.view(1, -1).expand_as(pred))
-
- res = []
- for k in topk:
- correct_k = correct[:k].view(-1).float().sum(0)
- res.append(correct_k.mul_(100.0 / batch_size))
- return res
-
-class SetCriterion(nn.Module):
- """ This class computes the loss for DETR.
- The process happens in two steps:
- 1) we compute the Hungarian assignment between ground truth boxes and the outputs of the model
- 2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
- """
- def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses):
- """ Create the criterion.
- Parameters:
- num_classes: number of object categories, omitting the special no-object category
- matcher: module able to compute a matching between targets and proposals
- weight_dict: dict containing as key the names of the losses and as values their relative weight.
- eos_coef: relative classification weight applied to the no-object category
- losses: list of all the losses to be applied. See get_loss for list of available losses.
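- Example (the values used by the detr getter in vision_models.py):
- losses=['labels', 'boxes', 'cardinality'], eos_coef=0.1,
- weight_dict={'loss_ce': 1, 'loss_bbox': 5, 'loss_giou': 2}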
- """ - super().__init__() - self.num_classes = num_classes - self.matcher = matcher - self.weight_dict = weight_dict - self.eos_coef = eos_coef - self.losses = losses - empty_weight = torch.ones(self.num_classes + 1) - empty_weight[-1] = self.eos_coef - self.register_buffer('empty_weight', empty_weight) - - def loss_labels(self, outputs, targets, indices, num_boxes, log=True): - """Classification loss (NLL) - targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] - """ - assert 'pred_logits' in outputs - src_logits = outputs['pred_logits'] - - idx = self._get_src_permutation_idx(indices) - target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) - target_classes = torch.full(src_logits.shape[:2], self.num_classes, - dtype=torch.int64, device=src_logits.device) - target_classes[idx] = target_classes_o - - loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight) - losses = {'loss_ce': loss_ce} - - if log: - # TODO this should probably be a separate loss, not hacked in this one here - losses['class_error'] = 100 - accuracy(src_logits[idx], target_classes_o)[0] - return losses - - @torch.no_grad() - def loss_cardinality(self, outputs, targets, indices, num_boxes): - """ Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes - This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients - """ - pred_logits = outputs['pred_logits'] - device = pred_logits.device - tgt_lengths = torch.as_tensor([len(v["labels"]) for v in targets], device=device) - # Count the number of predictions that are NOT "no-object" (which is the last class) - card_pred = (pred_logits.argmax(-1) != pred_logits.shape[-1] - 1).sum(1) - card_err = F.l1_loss(card_pred.float(), tgt_lengths.float()) - losses = {'cardinality_error': card_err} - return losses - - def loss_boxes(self, outputs, targets, indices, num_boxes): - """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss - targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4] - The target boxes are expected in format (center_x, center_y, h, w), normalized by the image size. - """ - assert 'pred_boxes' in outputs - idx = self._get_src_permutation_idx(indices) - src_boxes = outputs['pred_boxes'][idx] - target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0) - - loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none') - - losses = {} - losses['loss_bbox'] = loss_bbox.sum() / num_boxes - - loss_giou = 1 - torch.diag(generalized_box_iou( - box_cxcywh_to_xyxy(src_boxes), - box_cxcywh_to_xyxy(target_boxes))) - losses['loss_giou'] = loss_giou.sum() / num_boxes - return losses - - def loss_masks(self, outputs, targets, indices, num_boxes): - """Compute the losses related to the masks: the focal loss and the dice loss. 
- targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
- """
- assert "pred_masks" in outputs
-
- src_idx = self._get_src_permutation_idx(indices)
- tgt_idx = self._get_tgt_permutation_idx(indices)
-
- src_masks = outputs["pred_masks"]
-
- # TODO use valid to mask invalid areas due to padding in loss
- target_masks, valid = nested_tensor_from_tensor_list([t["masks"] for t in targets]).decompose()
- target_masks = target_masks.to(src_masks)
-
- src_masks = src_masks[src_idx]
- # upsample predictions to the target size
- src_masks = interpolate(src_masks[:, None], size=target_masks.shape[-2:],
- mode="bilinear", align_corners=False)
- src_masks = src_masks[:, 0].flatten(1)
-
- target_masks = target_masks[tgt_idx].flatten(1)
-
- losses = {
- "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes),
- "loss_dice": dice_loss(src_masks, target_masks, num_boxes),
- }
- return losses
-
- def _get_src_permutation_idx(self, indices):
- # permute predictions following indices
- batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
- src_idx = torch.cat([src for (src, _) in indices])
- return batch_idx, src_idx
-
- def _get_tgt_permutation_idx(self, indices):
- # permute targets following indices
- batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
- tgt_idx = torch.cat([tgt for (_, tgt) in indices])
- return batch_idx, tgt_idx
-
- def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
- loss_map = {
- 'labels': self.loss_labels,
- 'cardinality': self.loss_cardinality,
- 'boxes': self.loss_boxes,
- 'masks': self.loss_masks
- }
- assert loss in loss_map, f'do you really want to compute {loss} loss?'
- return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs)
-
- def forward(self, outputs, targets):
- """ This performs the loss computation.
- Parameters:
- outputs: dict of tensors, see the output specification of the model for the format
- targets: list of dicts, such that len(targets) == batch_size.
- The expected keys in each dict depend on the losses applied, see each loss's doc
- """
- outputs_without_aux = {k: v for k, v in outputs.items() if k != 'aux_outputs'}
-
- # Retrieve the matching between the outputs of the last layer and the targets
- indices = self.matcher(outputs_without_aux, targets)
-
- # Compute the average number of target boxes across all nodes, for normalization purposes
- num_boxes = sum(len(t["labels"]) for t in targets)
- num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
- if is_dist_avail_and_initialized():
- torch.distributed.all_reduce(num_boxes)
- num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item()
-
- # Compute all the requested losses
- losses = {}
- for loss in self.losses:
- losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))
-
- # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
- if 'aux_outputs' in outputs:
- for i, aux_outputs in enumerate(outputs['aux_outputs']):
- indices = self.matcher(aux_outputs, targets)
- for loss in self.losses:
- if loss == 'masks':
- # Intermediate mask losses are too costly to compute, so we skip them.
- continue
- kwargs = {}
- if loss == 'labels':
- # Logging is enabled only for the last layer
- kwargs = {'log': False}
- l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs)
- l_dict = {k + f'_{i}': v for k, v in l_dict.items()}
- losses.update(l_dict)
-
- return losses
-
-class HungarianMatcher(nn.Module):
- """This class computes an assignment between the targets and the predictions of the network.
- For efficiency reasons, the targets don't include the no_object class. Because of this, in general,
- there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
- while the others are un-matched (and thus treated as non-objects).
- """
-
- def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1):
- """Creates the matcher
- Params:
- cost_class: This is the relative weight of the classification error in the matching cost
- cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
- cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
- """
- super().__init__()
- self.cost_class = cost_class
- self.cost_bbox = cost_bbox
- self.cost_giou = cost_giou
- assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs can't be 0"
-
- @torch.no_grad()
- def forward(self, outputs, targets):
- """ Performs the matching
- Params:
- outputs: This is a dict that contains at least these entries:
- "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
- "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
- targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
- "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
- objects in the target) containing the class labels
- "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
- Returns:
- A list of size batch_size, containing tuples of (index_i, index_j) where:
- - index_i is the indices of the selected predictions (in order)
- - index_j is the indices of the corresponding selected targets (in order)
- For each batch element, it holds:
- len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
- """
- bs, num_queries = outputs["pred_logits"].shape[:2]
-
- # We flatten to compute the cost matrices in a batch
- out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1) # [batch_size * num_queries, num_classes]
- out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4]
-
- # Also concat the target labels and boxes
- tgt_ids = torch.cat([v["labels"] for v in targets])
- tgt_bbox = torch.cat([v["boxes"] for v in targets])
-
- # Compute the classification cost. Contrary to the loss, we don't use the NLL,
- # but approximate it by 1 - proba[target class].
- # The 1 is a constant that doesn't change the matching, so it can be omitted.
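- # Concretely, cost_class[i, j] = -out_prob[i, tgt_ids[j]]: a prediction that
- # assigns probability 0.9 to target j's class contributes a cost of -0.9.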
- cost_class = -out_prob[:, tgt_ids]
-
- # Compute the L1 cost between boxes
- cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
-
- # Compute the giou cost between boxes
- cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))
-
- # Final cost matrix
- C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
- C = C.view(bs, num_queries, -1).cpu()
-
- sizes = [len(v["boxes"]) for v in targets]
- if not scipy_available:
- raise RuntimeError("The 'detr' model requires scipy to run. Please make sure you have it installed"
- " if you enable the 'detr' model.")
- indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
- return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
diff --git a/benchmarks/functional_autograd_benchmark/utils.py b/benchmarks/functional_autograd_benchmark/utils.py
deleted file mode 100644
index c7aeb29d157..00000000000
--- a/benchmarks/functional_autograd_benchmark/utils.py
+++ /dev/null
@@ -1,103 +0,0 @@
-import torch
-
-from collections import defaultdict
-
-from torch import nn, Tensor
-from typing import List, Tuple, Dict, Union, Callable, Optional
-
-# Type helpers
-InputsType = Union[Tensor, Tuple[Tensor, ...]]
-# A Getter takes in a device and returns a callable and the inputs to that callable
-GetterReturnType = Tuple[Callable[..., Tensor], InputsType]
-GetterType = Callable[[torch.device], GetterReturnType]
-# V here refers to the v in either vjp, jvp, vhp or hvp
-VType = Union[None, Tensor, Tuple[Tensor, ...]]
-# Type used to store timing results. The first key is the model name, the second key
-# is the task name, the result is a Tuple of: speedup, mean_before, var_before, mean_after, var_after.
-TimingResultType = Dict[str, Dict[str, Tuple[float, ...]]]
-
-# Utilities to make nn.Module "functional"
-# In particular the goal is to be able to provide a function that takes as input
-# the parameters and evaluates the nn.Module using fixed inputs.
-def _del_nested_attr(obj: nn.Module, names: List[str]) -> None:
- """
- Deletes the attribute specified by the given list of names.
- For example, to delete the attribute obj.conv.weight,
- use _del_nested_attr(obj, ['conv', 'weight'])
- """
- if len(names) == 1:
- delattr(obj, names[0])
- else:
- _del_nested_attr(getattr(obj, names[0]), names[1:])
-
-def _set_nested_attr(obj: nn.Module, names: List[str], value: Tensor) -> None:
- """
- Sets the attribute specified by the given list of names to value.
- For example, to set the attribute obj.conv.weight,
- use _set_nested_attr(obj, ['conv', 'weight'], value)
- """
- if len(names) == 1:
- setattr(obj, names[0], value)
- else:
- _set_nested_attr(getattr(obj, names[0]), names[1:], value)
-
-def extract_weights(mod: nn.Module) -> Tuple[Tuple[Tensor, ...], List[str]]:
- """
- This function removes all the Parameters from the model and
- returns them as a tuple, as well as their original attribute names.
- The weights must be re-loaded with `load_weights` before the model
- can be used again.
- Note that this function modifies the model in place and after this
- call, mod.parameters() will be empty.
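-
- Typical round trip, as used by the model getters in this benchmark:
- params, names = extract_weights(model)
- ... # build new parameter Tensors (e.g. perturbed copies)
- load_weights(model, names, new_params)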
- """ - orig_params = tuple(mod.parameters()) - # Remove all the parameters in the model - names = [] - for name, p in list(mod.named_parameters()): - _del_nested_attr(mod, name.split(".")) - names.append(name) - - # Make params regular Tensors instead of nn.Parameter - params = tuple(p.detach().requires_grad_() for p in orig_params) - return params, names - -def load_weights(mod: nn.Module, names: List[str], params: Tuple[Tensor, ...]) -> None: - """ - Reload a set of weights so that `mod` can be used again to perform a forward pass. - Note that the `params` are regular Tensors (that can have history) and so are left - as Tensors. This means that mod.parameters() will still be empty after this call. - """ - for name, p in zip(names, params): - _set_nested_attr(mod, name.split("."), p) - -# Utilities to read/write markdown table-like content. -def to_markdown_table(res: TimingResultType, header: Tuple[str, ...] = None) -> str: - if header is None: - header = ("model", "task", "mean", "var") - out = "" - - def write_line(*args): - nonlocal out - out += "| {} |\n".format(" | ".join(str(a) for a in args)) - - # Make it a markdown table - write_line(*header) - write_line(*["--"] * len(header)) - for model, tasks in res.items(): - for task, line in tasks.items(): - write_line(*(model, task) + line) - - return out - -def from_markdown_table(data: str) -> TimingResultType: - out = data.strip().split("\n") - out = out[2:] # Ignore the header lines - - res: TimingResultType - res = defaultdict(defaultdict) - - for line in out: - model, task, mean, var = [f.strip() for f in line.strip().split("|") if f] - res[model][task] = (float(mean), float(var)) - - return res diff --git a/benchmarks/functional_autograd_benchmark/vision_models.py b/benchmarks/functional_autograd_benchmark/vision_models.py deleted file mode 100644 index cd2f84e638a..00000000000 --- a/benchmarks/functional_autograd_benchmark/vision_models.py +++ /dev/null @@ -1,97 +0,0 @@ -import torch -from torch import Tensor -import torchvision_models as models - -from utils import extract_weights, load_weights, GetterReturnType - -from typing import cast - -def get_resnet18(device: torch.device) -> GetterReturnType: - N = 32 - model = models.resnet18(pretrained=False) - criterion = torch.nn.CrossEntropyLoss() - model.to(device) - params, names = extract_weights(model) - - inputs = torch.rand([N, 3, 224, 224], device=device) - labels = torch.rand(N, device=device).mul(10).long() - - def forward(*new_params: Tensor) -> Tensor: - load_weights(model, names, new_params) - out = model(inputs) - - loss = criterion(out, labels) - return loss - - return forward, params - -def get_fcn_resnet(device: torch.device) -> GetterReturnType: - N = 8 - criterion = torch.nn.MSELoss() - model = models.fcn_resnet50(pretrained=False, pretrained_backbone=False) - model.to(device) - params, names = extract_weights(model) - - inputs = torch.rand([N, 3, 480, 480], device=device) - # Given model has 21 classes - labels = torch.rand([N, 21, 480, 480], device=device) - - def forward(*new_params: Tensor) -> Tensor: - load_weights(model, names, new_params) - out = model(inputs)['out'] - - loss = criterion(out, labels) - return loss - - return forward, params - -def get_detr(device: torch.device) -> GetterReturnType: - # All values below are from CLI defaults in https://github.com/facebookresearch/detr - N = 2 - num_classes = 91 - hidden_dim = 256 - nheads = 8 - num_encoder_layers = 6 - num_decoder_layers = 6 - - model = models.DETR(num_classes=num_classes, 
- hidden_dim=hidden_dim, nheads=nheads,
- num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers)
- losses = ['labels', 'boxes', 'cardinality']
- eos_coef = 0.1
- bbox_loss_coef = 5
- giou_loss_coef = 2
- weight_dict = {'loss_ce': 1, 'loss_bbox': bbox_loss_coef, 'loss_giou': giou_loss_coef}
- matcher = models.HungarianMatcher(cost_class=1, cost_bbox=bbox_loss_coef, cost_giou=giou_loss_coef)
- criterion = models.SetCriterion(num_classes=num_classes, matcher=matcher, weight_dict=weight_dict,
- eos_coef=eos_coef, losses=losses)
-
- model = model.to(device)
- criterion = criterion.to(device)
- params, names = extract_weights(model)
-
- inputs = torch.rand(N, 3, 800, 1200, device=device)
- labels = []
- for idx in range(N):
- targets = {}
- n_targets: int = int(torch.randint(5, 10, size=tuple()).item())
- label = torch.randint(5, 10, size=(n_targets,))
- targets["labels"] = label
- boxes = torch.randint(100, 800, size=(n_targets, 4))
- for t in range(n_targets):
- if boxes[t, 0] > boxes[t, 2]:
- boxes[t, 0], boxes[t, 2] = boxes[t, 2], boxes[t, 0]
- if boxes[t, 1] > boxes[t, 3]:
- boxes[t, 1], boxes[t, 3] = boxes[t, 3], boxes[t, 1]
- targets["boxes"] = boxes.float()
- labels.append(targets)
-
- def forward(*new_params: Tensor) -> Tensor:
- load_weights(model, names, new_params)
- out = model(inputs)
-
- loss = criterion(out, labels)
- weight_dict = criterion.weight_dict
- final_loss = cast(Tensor, sum(loss[k] * weight_dict[k] for k in loss.keys() if k in weight_dict))
- return final_loss
-
- return forward, params
diff --git a/test/run_test.py b/test/run_test.py
index 13a779abff7..f84a91579e3 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -86,7 +86,6 @@ TESTS = [
 'test_determination',
 'test_futures',
 'test_fx',
- 'test_functional_autograd_benchmark'
]

WINDOWS_BLOCKLIST = [
diff --git a/test/test_functional_autograd_benchmark.py b/test/test_functional_autograd_benchmark.py
deleted file mode 100644
index 46ee7653462..00000000000
--- a/test/test_functional_autograd_benchmark.py
+++ /dev/null
@@ -1,54 +0,0 @@
-from torch.testing._internal.common_utils import TestCase, run_tests, slowTest, IS_WINDOWS
-
-import subprocess
-import tempfile
-import os
-import unittest
-
-# This is a very simple smoke test for the functional autograd benchmarking script.
-class TestFunctionalAutogradBenchmark(TestCase):
- def _test_runner(self, model):
- # Note about Windows:
- # The temporary file is held open exclusively by this process, so the child
- # process is not allowed to open it again. As this is a simple smoke test,
- # we choose for now not to run it on Windows and keep the code here simple.
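- # (A possible alternative, not used here: NamedTemporaryFile(delete=False) plus
- # an explicit os.unlink once the child process has exited.)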
- with tempfile.NamedTemporaryFile() as out_file:
- cmd = ['python', '../benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py']
- # Only run the warmup
- cmd += ['--num-iters', '0']
- # Only run the vjp task (fastest one)
- cmd += ['--task-filter', 'vjp']
- # Only run the specified model
- cmd += ['--model-filter', model]
- # Output file
- cmd += ['--output', out_file.name]
-
- res = subprocess.run(cmd)
-
- self.assertEqual(res.returncode, 0)
- # Check that something was written to the file
- out_file.seek(0, os.SEEK_END)
- self.assertTrue(out_file.tell() > 0)
-
-
- @unittest.skipIf(IS_WINDOWS, "NamedTemporaryFile on Windows does not have all the features we need.")
- def test_fast_tasks(self):
- fast_tasks = ['resnet18', 'ppl_simple_reg', 'ppl_robust_reg', 'wav2letter',
- 'transformer', 'multiheadattn']
-
- for task in fast_tasks:
- self._test_runner(task)
-
- @slowTest
- @unittest.skipIf(IS_WINDOWS, "NamedTemporaryFile on Windows does not have all the features we need.")
- def test_slow_tasks(self):
- slow_tasks = ['fcn_resnet', 'detr']
- # deepspeech is deliberately excluded as it takes too long to run without
- # proper tuning of the number of threads it should use.
-
- for task in slow_tasks:
- self._test_runner(task)
-
-
-if __name__ == '__main__':
- run_tests()