Operator: aten._log_softmax.default
cnt: 1, ((T([32, 1000], f16), 1, False), {})
Operator: aten._log_softmax_backward_data.default
cnt: 1, ((T([32, 1000], f16), T([32, 1000], f16), 1, f16), {})
Operator: aten._softmax.default
cnt: 3, ((T([32, 1, 3136, 49], f16), -1, False), {})
cnt: 4, ((T([32, 2, 784, 49], f16), -1, False), {})
cnt: 18, ((T([32, 5, 196, 49], f16), -1, False), {})
cnt: 3, ((T([32, 8, 49, 49], f16), -1, False), {})
Operator: aten._softmax_backward_data.default
cnt: 3, ((T([32, 8, 49, 49], f16), T([32, 8, 49, 49], f16), -1, f16), {})
cnt: 18, ((T([32, 5, 196, 49], f16), T([32, 5, 196, 49], f16), -1, f16), {})
cnt: 4, ((T([32, 2, 784, 49], f16), T([32, 2, 784, 49], f16), -1, f16), {})
cnt: 3, ((T([32, 1, 3136, 49], f16), T([32, 1, 3136, 49], f16), -1, f16), {})
Operator: aten._unsafe_view.default
cnt: 3, ((T([32, 3136, 49], f16), [32, 1, 3136, 49]), {})
cnt: 3, ((T([32, 3136, 64], f16), [32, 1, 3136, 64]), {})
cnt: 8, ((T([32, 2, 784, 64], f16), [64, 784, 64]), {})
cnt: 4, ((T([32, 2, 64, 49], f16), [64, 64, 49]), {})
cnt: 4, ((T([64, 784, 49], f16), [32, 2, 784, 49]), {})
cnt: 4, ((T([32, 2, 49, 64], f16), [64, 49, 64]), {})
cnt: 4, ((T([64, 784, 64], f16), [32, 2, 784, 64]), {})
cnt: 8, ((T([32, 784, 2, 64], f16), [32, 784, 128]), {})
cnt: 36, ((T([32, 5, 196, 64], f16), [160, 196, 64]), {})
cnt: 18, ((T([32, 5, 64, 49], f16), [160, 64, 49]), {})
cnt: 18, ((T([160, 196, 49], f16), [32, 5, 196, 49]), {})
cnt: 18, ((T([32, 5, 49, 64], f16), [160, 49, 64]), {})
cnt: 18, ((T([160, 196, 64], f16), [32, 5, 196, 64]), {})
cnt: 36, ((T([32, 196, 5, 64], f16), [32, 196, 320]), {})
cnt: 9, ((T([32, 8, 49, 64], f16), [256, 49, 64]), {})
cnt: 3, ((T([32, 8, 64, 49], f16), [256, 64, 49]), {})
cnt: 3, ((T([256, 49, 49], f16), [32, 8, 49, 49]), {})
cnt: 3, ((T([256, 49, 64], f16), [32, 8, 49, 64]), {})
cnt: 6, ((T([32, 49, 8, 64], f16), [32, 49, 512]), {})
cnt: 3, ((T([32, 49, 2, 8, 64], f16), [32, 49, 1024]), {})
cnt: 36, ((T([32, 196, 320], f16), [6272, 320]), {})
cnt: 18, ((T([32, 49, 2, 5, 64], f16), [32, 49, 640]), {})
cnt: 8, ((T([32, 784, 128], f16), [25088, 128]), {})
cnt: 4, ((T([32, 49, 2, 2, 64], f16), [32, 49, 256]), {})
cnt: 6, ((T([32, 3136, 64], f16), [100352, 64]), {})
cnt: 3, ((T([32, 49, 2, 1, 64], f16), [32, 49, 128]), {})
Operator: aten.add.Tensor
cnt: 9, ((T([32, 3136, 64], f16), T([32, 3136, 64], f16)), {})
cnt: 12, ((T([32, 784, 128], f16), T([32, 784, 128], f16)), {})
cnt: 54, ((T([32, 196, 320], f16), T([32, 196, 320], f16)), {})
cnt: 15, ((T([32, 49, 512], f16), T([32, 49, 512], f16)), {})
cnt: 3, ((T([2, 32, 8, 49, 64], f16), T([2, 32, 8, 49, 64], f16)), {})
cnt: 1, ((T([32, 512, 7, 7], f16, stride=(25088, 1, 3584, 512)), T([32, 512, 7, 7], f16, stride=(25088, 1, 3584, 512))), {})
cnt: 36, ((T([32, 196, 320], f16, stride=(62720, 1, 196)), T([32, 196, 320], f16)), {})
cnt: 18, ((T([2, 32, 5, 49, 64], f16), T([2, 32, 5, 49, 64], f16)), {})
cnt: 1, ((T([32, 320, 14, 14], f16), T([32, 320, 14, 14], f16, stride=(62720, 1, 4480, 320))), {})
cnt: 8, ((T([32, 784, 128], f16, stride=(100352, 1, 784)), T([32, 784, 128], f16)), {})
cnt: 4, ((T([2, 32, 2, 49, 64], f16), T([2, 32, 2, 49, 64], f16)), {})
cnt: 1, ((T([32, 128, 28, 28], f16), T([32, 128, 28, 28], f16, stride=(100352, 1, 3584, 128))), {})
cnt: 6, ((T([32, 3136, 64], f16, stride=(200704, 1, 3136)), T([32, 3136, 64], f16)), {})
cnt: 3, ((T([2, 32, 1, 49, 64], f16), T([2, 32, 1, 49, 64], f16)), {})
cnt: 1, ((T([32, 64, 56, 56], f16), T([32, 64, 56, 56], f16, stride=(200704, 1, 3584, 64))), {})
Operator: aten.add_.Tensor
cnt: 1, ((T([32, 64, 56, 56], f16, stride=(200704, 1, 3584, 64)), T([32, 64, 56, 56], f16, stride=(200704, 1, 3584, 64))), {})
cnt: 1, ((T([32, 128, 28, 28], f16, stride=(100352, 1, 3584, 128)), T([32, 128, 28, 28], f16, stride=(100352, 1, 3584, 128))), {})
cnt: 1, ((T([32, 320, 14, 14], f16, stride=(62720, 1, 4480, 320)), T([32, 320, 14, 14], f16, stride=(62720, 1, 4480, 320))), {})
cnt: 1, ((T([32, 512, 7, 7], f16, stride=(25088, 1, 3584, 512)), T([32, 512, 7, 7], f16, stride=(25088, 1, 3584, 512))), {})
Operator: aten.addmm.default
cnt: 6, ((T([64], f16), T([100352, 64], f16), T([64, 64], f16, stride=(1, 64))), {})
cnt: 3, ((T([128], f16), T([1568, 64], f16), T([64, 128], f16, stride=(1, 64))), {})
cnt: 3, ((T([512], f16), T([100352, 64], f16), T([64, 512], f16, stride=(1, 64))), {})
cnt: 3, ((T([64], f16), T([100352, 512], f16), T([512, 64], f16, stride=(1, 512))), {})
cnt: 8, ((T([128], f16), T([25088, 128], f16), T([128, 128], f16, stride=(1, 128))), {})
cnt: 4, ((T([256], f16), T([1568, 128], f16), T([128, 256], f16, stride=(1, 128))), {})
cnt: 4, ((T([1024], f16), T([25088, 128], f16), T([128, 1024], f16, stride=(1, 128))), {})
cnt: 4, ((T([128], f16), T([25088, 1024], f16), T([1024, 128], f16, stride=(1, 1024))), {})
cnt: 36, ((T([320], f16), T([6272, 320], f16), T([320, 320], f16, stride=(1, 320))), {})
cnt: 18, ((T([640], f16), T([1568, 320], f16), T([320, 640], f16, stride=(1, 320))), {})
cnt: 18, ((T([1280], f16), T([6272, 320], f16), T([320, 1280], f16, stride=(1, 320))), {})
cnt: 18, ((T([320], f16), T([6272, 1280], f16), T([1280, 320], f16, stride=(1, 1280))), {})
cnt: 6, ((T([512], f16), T([1568, 512], f16), T([512, 512], f16, stride=(1, 512))), {})
cnt: 3, ((T([1024], f16), T([1568, 512], f16), T([512, 1024], f16, stride=(1, 512))), {})
cnt: 3, ((T([2048], f16), T([1568, 512], f16), T([512, 2048], f16, stride=(1, 512))), {})
cnt: 3, ((T([512], f16), T([1568, 2048], f16), T([2048, 512], f16, stride=(1, 2048))), {})
cnt: 1, ((T([1000], f16), T([32, 512], f16), T([512, 1000], f16, stride=(1, 512))), {})
Operator: aten.bmm.default
cnt: 6, ((T([32, 3136, 64], f16), T([32, 64, 49], f16, stride=(6272, 1, 128))), {})
cnt: 6, ((T([32, 3136, 49], f16), T([32, 49, 64], f16, stride=(6272, 128, 1))), {})
cnt: 4, ((T([64, 784, 64], f16), T([64, 64, 49], f16)), {})
cnt: 4, ((T([64, 784, 49], f16), T([64, 49, 64], f16)), {})
cnt: 18, ((T([160, 196, 64], f16), T([160, 64, 49], f16)), {})
cnt: 18, ((T([160, 196, 49], f16), T([160, 49, 64], f16)), {})
cnt: 3, ((T([256, 49, 64], f16), T([256, 64, 49], f16)), {})
cnt: 3, ((T([256, 49, 49], f16), T([256, 49, 64], f16)), {})
cnt: 3, ((T([256, 49, 49], f16, stride=(2401, 1, 49)), T([256, 49, 64], f16)), {})
cnt: 3, ((T([256, 49, 64], f16), T([256, 64, 49], f16, stride=(3136, 1, 64))), {})
cnt: 3, ((T([256, 64, 49], f16, stride=(3136, 1, 64)), T([256, 49, 49], f16)), {})
cnt: 3, ((T([256, 49, 49], f16), T([256, 49, 64], f16, stride=(3136, 1, 49))), {})
cnt: 18, ((T([160, 49, 196], f16, stride=(9604, 1, 49)), T([160, 196, 64], f16)), {})
cnt: 18, ((T([160, 196, 64], f16), T([160, 64, 49], f16, stride=(3136, 1, 64))), {})
cnt: 18, ((T([160, 64, 196], f16, stride=(12544, 1, 64)), T([160, 196, 49], f16)), {})
cnt: 18, ((T([160, 196, 49], f16), T([160, 49, 64], f16, stride=(3136, 1, 49))), {})
cnt: 4, ((T([64, 49, 784], f16, stride=(38416, 1, 49)), T([64, 784, 64], f16)), {})
cnt: 4, ((T([64, 784, 64], f16), T([64, 64, 49], f16, stride=(3136, 1, 64))), {})
cnt: 4, ((T([64, 64, 784], f16, stride=(50176, 1, 64)), T([64, 784, 49], f16)), {})
cnt: 4, ((T([64, 784, 49], f16), T([64, 49, 64], f16, stride=(3136, 1, 49))), {})
cnt: 3, ((T([32, 49, 3136], f16, stride=(153664, 1, 49)), T([32, 3136, 64], f16)), {})
cnt: 3, ((T([32, 64, 3136], f16, stride=(200704, 1, 64)), T([32, 3136, 49], f16)), {})
Operator: aten.clone.default
cnt: 1, ((T([32, 3, 224, 224], f16),), {})
Operator: aten.convolution.default
cnt: 1, ((T([32, 3, 224, 224], f16), T([64, 3, 4, 4], f16), T([64], f16), [4, 4], [0, 0], [1, 1], False, [0, 0], 1), {})
cnt: 3, ((T([32, 64, 56, 56], f16, stride=(200704, 1, 3584, 64)), T([64, 64, 8, 8], f16), T([64], f16), [8, 8], [0, 0], [1, 1], False, [0, 0], 1), {})
cnt: 1, ((T([32, 64, 56, 56], f16, stride=(200704, 1, 3584, 64)), T([64, 1, 3, 3], f16), T([64], f16), [1, 1], [1, 1], [1, 1], False, [0, 0], 64), {})
cnt: 1, ((T([32, 64, 56, 56], f16), T([128, 64, 2, 2], f16), T([128], f16), [2, 2], [0, 0], [1, 1], False, [0, 0], 1), {})
cnt: 4, ((T([32, 128, 28, 28], f16, stride=(100352, 1, 3584, 128)), T([128, 128, 4, 4], f16), T([128], f16), [4, 4], [0, 0], [1, 1], False, [0, 0], 1), {})
cnt: 1, ((T([32, 128, 28, 28], f16, stride=(100352, 1, 3584, 128)), T([128, 1, 3, 3], f16), T([128], f16), [1, 1], [1, 1], [1, 1], False, [0, 0], 128), {})
cnt: 1, ((T([32, 128, 28, 28], f16), T([320, 128, 2, 2], f16), T([320], f16), [2, 2], [0, 0], [1, 1], False, [0, 0], 1), {})
cnt: 18, ((T([32, 320, 14, 14], f16, stride=(62720, 1, 4480, 320)), T([320, 320, 2, 2], f16), T([320], f16), [2, 2], [0, 0], [1, 1], False, [0, 0], 1), {})
cnt: 1, ((T([32, 320, 14, 14], f16, stride=(62720, 1, 4480, 320)), T([320, 1, 3, 3], f16), T([320], f16), [1, 1], [1, 1], [1, 1], False, [0, 0], 320), {})
cnt: 1, ((T([32, 320, 14, 14], f16), T([512, 320, 2, 2], f16), T([512], f16), [2, 2], [0, 0], [1, 1], False, [0, 0], 1), {})
cnt: 1, ((T([32, 512, 7, 7], f16, stride=(25088, 1, 3584, 512)), T([512, 1, 3, 3], f16), T([512], f16), [1, 1], [1, 1], [1, 1], False, [0, 0], 512), {})
Operator: aten.convolution_backward.default
cnt: 1, ((T([32, 512, 7, 7], f16, stride=(25088, 1, 3584, 512)), T([32, 512, 7, 7], f16, stride=(25088, 1, 3584, 512)), T([512, 1, 3, 3], f16), [512], [1, 1], [1, 1], [1, 1], False, [0, 0], 512, [True, True, True]), {})
cnt: 1, ((T([32, 512, 7, 7], f16, stride=(25088, 1, 3584, 512)), T([32, 320, 14, 14], f16), T([512, 320, 2, 2], f16), [512], [2, 2], [0, 0], [1, 1], False, [0, 0], 1, [True, True, True]), {})
cnt: 18, ((T([32, 320, 7, 7], f16, stride=(15680, 1, 2240, 320)), T([32, 320, 14, 14], f16, stride=(62720, 1, 4480, 320)), T([320, 320, 2, 2], f16), [320], [2, 2], [0, 0], [1, 1], False, [0, 0], 1, [True, True, True]), {})
cnt: 1, ((T([32, 320, 14, 14], f16), T([32, 320, 14, 14], f16, stride=(62720, 1, 4480, 320)), T([320, 1, 3, 3], f16), [320], [1, 1], [1, 1], [1, 1], False, [0, 0], 320, [True, True, True]), {})
cnt: 1, ((T([32, 320, 14, 14], f16), T([32, 128, 28, 28], f16), T([320, 128, 2, 2], f16), [320], [2, 2], [0, 0], [1, 1], False, [0, 0], 1, [True, True, True]), {})
cnt: 4, ((T([32, 128, 7, 7], f16, stride=(6272, 1, 896, 128)), T([32, 128, 28, 28], f16, stride=(100352, 1, 3584, 128)), T([128, 128, 4, 4], f16), [128], [4, 4], [0, 0], [1, 1], False, [0, 0], 1, [True, True, True]), {})
cnt: 1, ((T([32, 128, 28, 28], f16), T([32, 128, 28, 28], f16, stride=(100352, 1, 3584, 128)), T([128, 1, 3, 3], f16), [128], [1, 1], [1, 1], [1, 1], False, [0, 0], 128, [True, True, True]), {})
cnt: 1, ((T([32, 128, 28, 28], f16), T([32, 64, 56, 56], f16), T([128, 64, 2, 2], f16), [128], [2, 2], [0, 0], [1, 1], False, [0, 0], 1, [True, True, True]), {})
cnt: 3, ((T([32, 64, 7, 7], f16, stride=(3136, 1, 448, 64)), T([32, 64, 56, 56], f16, stride=(200704, 1, 3584, 64)), T([64, 64, 8, 8], f16), [64], [8, 8], [0, 0], [1, 1], False, [0, 0], 1, [True, True, True]), {})
cnt: 1, ((T([32, 64, 56, 56], f16), T([32, 64, 56, 56], f16, stride=(200704, 1, 3584, 64)), T([64, 1, 3, 3], f16), [64], [1, 1], [1, 1], [1, 1], False, [0, 0], 64, [True, True, True]), {})
cnt: 1, ((T([32, 64, 56, 56], f16), T([32, 3, 224, 224], f16), T([64, 3, 4, 4], f16), [64], [4, 4], [0, 0], [1, 1], False, [0, 0], 1, [False, True, True]), {})
Operator: aten.copy_.default
cnt: 1, ((T([32, 3, 224, 224], f16), T([32, 3, 224, 224], f16)), {})
cnt: 18, ((T([320, 320, 2, 2], f16), T([320, 320, 2, 2], f16, stride=(1280, 1, 640, 320))), {})
cnt: 4, ((T([128, 128, 4, 4], f16), T([128, 128, 4, 4], f16, stride=(2048, 1, 512, 128))), {})
cnt: 3, ((T([64, 64, 8, 8], f16), T([64, 64, 8, 8], f16, stride=(4096, 1, 512, 64))), {})
Operator: aten.div.Scalar
cnt: 1, ((T([32, 49, 512], f16, stride=(512, 0, 1)), 49), {})
Operator: aten.gelu.default
cnt: 3, ((T([32, 3136, 512], f16),), {})
cnt: 4, ((T([32, 784, 1024], f16),), {})
cnt: 18, ((T([32, 196, 1280], f16),), {})
cnt: 3, ((T([32, 49, 2048], f16),), {})
Operator: aten.gelu_backward.default
cnt: 3, ((T([32, 49, 2048], f16), T([32, 49, 2048], f16)), {})
cnt: 18, ((T([32, 196, 1280], f16), T([32, 196, 1280], f16)), {})
cnt: 4, ((T([32, 784, 1024], f16), T([32, 784, 1024], f16)), {})
cnt: 3, ((T([32, 3136, 512], f16), T([32, 3136, 512], f16)), {})
Operator: aten.lift_fresh_copy.default
cnt: 1, ((T([32], i64),), {})
Operator: aten.mean.dim
cnt: 1, ((T([32, 49, 512], f16), [1]), {})
Operator: aten.mm.default
cnt: 1, ((T([32, 1000], f16), T([1000, 512], f16)), {})
cnt: 1, ((T([1000, 32], f16, stride=(1, 1000)), T([32, 512], f16)), {})
cnt: 3, ((T([1568, 512], f16), T([512, 2048], f16)), {})
cnt: 3, ((T([512, 1568], f16, stride=(1, 512)), T([1568, 2048], f16)), {})
cnt: 3, ((T([1568, 2048], f16), T([2048, 512], f16)), {})
cnt: 3, ((T([2048, 1568], f16, stride=(1, 2048)), T([1568, 512], f16)), {})
cnt: 6, ((T([1568, 512], f16), T([512, 512], f16)), {})
cnt: 6, ((T([512, 1568], f16, stride=(1, 512)), T([1568, 512], f16)), {})
cnt: 3, ((T([1568, 1024], f16), T([1024, 512], f16)), {})
cnt: 3, ((T([1024, 1568], f16, stride=(1, 1024)), T([1568, 512], f16)), {})
cnt: 18, ((T([6272, 320], f16), T([320, 1280], f16)), {})
cnt: 18, ((T([320, 6272], f16, stride=(1, 320)), T([6272, 1280], f16)), {})
cnt: 18, ((T([6272, 1280], f16), T([1280, 320], f16)), {})
cnt: 18, ((T([1280, 6272], f16, stride=(1, 1280)), T([6272, 320], f16)), {})
cnt: 36, ((T([6272, 320], f16), T([320, 320], f16)), {})
cnt: 36, ((T([320, 6272], f16, stride=(1, 320)), T([6272, 320], f16)), {})
cnt: 18, ((T([1568, 640], f16), T([640, 320], f16)), {})
cnt: 18, ((T([640, 1568], f16, stride=(1, 640)), T([1568, 320], f16)), {})
cnt: 4, ((T([25088, 128], f16), T([128, 1024], f16)), {})
cnt: 4, ((T([128, 25088], f16, stride=(1, 128)), T([25088, 1024], f16)), {})
cnt: 4, ((T([25088, 1024], f16), T([1024, 128], f16)), {})
cnt: 4, ((T([1024, 25088], f16, stride=(1, 1024)), T([25088, 128], f16)), {})
cnt: 8, ((T([25088, 128], f16), T([128, 128], f16)), {})
cnt: 8, ((T([128, 25088], f16, stride=(1, 128)), T([25088, 128], f16)), {})
cnt: 4, ((T([1568, 256], f16), T([256, 128], f16)), {})
cnt: 4, ((T([256, 1568], f16, stride=(1, 256)), T([1568, 128], f16)), {})
cnt: 3, ((T([100352, 64], f16), T([64, 512], f16)), {})
cnt: 3, ((T([64, 100352], f16, stride=(1, 64)), T([100352, 512], f16)), {})
cnt: 3, ((T([100352, 512], f16), T([512, 64], f16)), {})
cnt: 3, ((T([512, 100352], f16, stride=(1, 512)), T([100352, 64], f16)), {})
cnt: 6, ((T([100352, 64], f16), T([64, 64], f16)), {})
cnt: 6, ((T([64, 100352], f16, stride=(1, 64)), T([100352, 64], f16)), {})
cnt: 3, ((T([1568, 128], f16), T([128, 64], f16)), {})
cnt: 3, ((T([128, 1568], f16, stride=(1, 128)), T([1568, 64], f16)), {})
Operator: aten.mul.Tensor
cnt: 6, ((T([32, 1, 3136, 49], f16), 0.125), {})
cnt: 8, ((T([32, 2, 784, 49], f16), 0.125), {})
cnt: 36, ((T([32, 5, 196, 49], f16), 0.125), {})
cnt: 6, ((T([32, 8, 49, 49], f16), 0.125), {})
Operator: aten.native_layer_norm.default
cnt: 1, ((T([32, 3136, 64], f16, stride=(200704, 1, 3136)), [64], T([64], f16), T([64], f16), 1e-05), {})
cnt: 6, ((T([32, 3136, 64], f16), [64], T([64], f16), T([64], f16), 1e-06), {})
cnt: 3, ((T([32, 49, 64], f16), [64], T([64], f16), T([64], f16), 1e-05), {})
cnt: 1, ((T([32, 784, 128], f16, stride=(100352, 1, 784)), [128], T([128], f16), T([128], f16), 1e-05), {})
cnt: 8, ((T([32, 784, 128], f16), [128], T([128], f16), T([128], f16), 1e-06), {})
cnt: 4, ((T([32, 49, 128], f16), [128], T([128], f16), T([128], f16), 1e-05), {})
cnt: 1, ((T([32, 196, 320], f16, stride=(62720, 1, 196)), [320], T([320], f16), T([320], f16), 1e-05), {})
cnt: 36, ((T([32, 196, 320], f16), [320], T([320], f16), T([320], f16), 1e-06), {})
cnt: 18, ((T([32, 49, 320], f16), [320], T([320], f16), T([320], f16), 1e-05), {})
cnt: 1, ((T([32, 49, 512], f16, stride=(25088, 1, 49)), [512], T([512], f16), T([512], f16), 1e-05), {})
cnt: 7, ((T([32, 49, 512], f16), [512], T([512], f16), T([512], f16), 1e-06), {})
Operator: aten.native_layer_norm_backward.default
cnt: 7, ((T([32, 49, 512], f16), T([32, 49, 512], f16), [512], T([32, 49, 1], f32), T([32, 49, 1], f32), T([512], f16), T([512], f16), [True, True, True]), {})
cnt: 1, ((T([32, 49, 512], f16), T([32, 49, 512], f16, stride=(25088, 1, 49)), [512], T([32, 49, 1], f32), T([32, 49, 1], f32), T([512], f16), T([512], f16), [True, True, True]), {})
cnt: 36, ((T([32, 196, 320], f16), T([32, 196, 320], f16), [320], T([32, 196, 1], f32), T([32, 196, 1], f32), T([320], f16), T([320], f16), [True, True, True]), {})
cnt: 18, ((T([32, 49, 320], f16), T([32, 49, 320], f16), [320], T([32, 49, 1], f32), T([32, 49, 1], f32), T([320], f16), T([320], f16), [True, True, True]), {})
cnt: 1, ((T([32, 196, 320], f16, stride=(62720, 1, 196)), T([32, 196, 320], f16, stride=(62720, 1, 196)), [320], T([32, 196, 1], f32), T([32, 196, 1], f32), T([320], f16), T([320], f16), [True, True, True]), {})
cnt: 8, ((T([32, 784, 128], f16), T([32, 784, 128], f16), [128], T([32, 784, 1], f32), T([32, 784, 1], f32), T([128], f16), T([128], f16), [True, True, True]), {})
cnt: 4, ((T([32, 49, 128], f16), T([32, 49, 128], f16), [128], T([32, 49, 1], f32), T([32, 49, 1], f32), T([128], f16), T([128], f16), [True, True, True]), {})
cnt: 1, ((T([32, 784, 128], f16, stride=(100352, 1, 784)), T([32, 784, 128], f16, stride=(100352, 1, 784)), [128], T([32, 784, 1], f32), T([32, 784, 1], f32), T([128], f16), T([128], f16), [True, True, True]), {})
cnt: 6, ((T([32, 3136, 64], f16), T([32, 3136, 64], f16), [64], T([32, 3136, 1], f32), T([32, 3136, 1], f32), T([64], f16), T([64], f16), [True, True, True]), {})
cnt: 3, ((T([32, 49, 64], f16), T([32, 49, 64], f16), [64], T([32, 49, 1], f32), T([32, 49, 1], f32), T([64], f16), T([64], f16), [True, True, True]), {})
cnt: 1, ((T([32, 3136, 64], f16, stride=(200704, 1, 3136)), T([32, 3136, 64], f16, stride=(200704, 1, 3136)), [64], T([32, 3136, 1], f32), T([32, 3136, 1], f32), T([64], f16), T([64], f16), [True, True, True]), {})
Operator: aten.new_empty_strided.default
cnt: 18, ((T([320, 320, 2, 2], f16, stride=(1280, 1, 640, 320)), [320, 320, 2, 2], [1280, 4, 2, 1]), {'dtype': f16, 'layout': torch.strided, 'device': 'cuda'})
cnt: 4, ((T([128, 128, 4, 4], f16, stride=(2048, 1, 512, 128)), [128, 128, 4, 4], [2048, 16, 4, 1]), {'dtype': f16, 'layout': torch.strided, 'device': 'cuda'})
cnt: 3, ((T([64, 64, 8, 8], f16, stride=(4096, 1, 512, 64)), [64, 64, 8, 8], [4096, 64, 8, 1]), {'dtype': f16, 'layout': torch.strided, 'device': 'cuda'})
Operator: aten.nll_loss_backward.default
cnt: 1, ((T([], f16), T([32, 1000], f16), T([32], i64), None, 1, -100, T([], f16)), {})
Operator: aten.nll_loss_forward.default
cnt: 1, ((T([32, 1000], f16), T([32], i64), None, 1, -100), {})
Operator: aten.select_backward.default
cnt: 3, ((T([32, 8, 49, 64], f16), [2, 32, 8, 49, 64], 0, 1), {})
cnt: 3, ((T([32, 8, 49, 64], f16, stride=(25088, 3136, 1, 49)), [2, 32, 8, 49, 64], 0, 0), {})
cnt: 18, ((T([32, 5, 49, 64], f16), [2, 32, 5, 49, 64], 0, 1), {})
cnt: 18, ((T([32, 5, 49, 64], f16, stride=(15680, 3136, 1, 49)), [2, 32, 5, 49, 64], 0, 0), {})
cnt: 4, ((T([32, 2, 49, 64], f16), [2, 32, 2, 49, 64], 0, 1), {})
cnt: 4, ((T([32, 2, 49, 64], f16, stride=(6272, 3136, 1, 49)), [2, 32, 2, 49, 64], 0, 0), {})
cnt: 3, ((T([32, 1, 49, 64], f16), [2, 32, 1, 49, 64], 0, 1), {})
cnt: 3, ((T([32, 1, 49, 64], f16, stride=(3136, 3136, 1, 49)), [2, 32, 1, 49, 64], 0, 0), {})
Operator: aten.sum.SymInt
cnt: 1, ((T([32, 1000], f16), [0], True), {})
cnt: 9, ((T([1568, 512], f16), [0], True), {})
cnt: 3, ((T([1568, 2048], f16), [0], True), {})
cnt: 3, ((T([1568, 1024], f16), [0], True), {})
cnt: 54, ((T([6272, 320], f16), [0], True), {})
cnt: 18, ((T([6272, 1280], f16), [0], True), {})
cnt: 18, ((T([1568, 640], f16), [0], True), {})
cnt: 12, ((T([25088, 128], f16), [0], True), {})
cnt: 4, ((T([25088, 1024], f16), [0], True), {})
cnt: 4, ((T([1568, 256], f16), [0], True), {})
cnt: 9, ((T([100352, 64], f16), [0], True), {})
cnt: 3, ((T([100352, 512], f16), [0], True), {})
cnt: 3, ((T([1568, 128], f16), [0], True), {})
