0%

9-GridSample算子预分析

整理了从环境安装、ONNX 导出、OM 模型转换、推理验证到算子支持查询的完整流程,并针对 GridSampler2D/3D 的关键问题和解决思路进行了说明。


1. 参考资料

2. 安装环境
1
pip install "torch>=1.9" onnx==1.12.0 onnxruntime==1.14.0
  • Opset v16

3. 导出 2D GridSample 的 ONNX 模型
  • 编写导出脚本 export_grid_sample_onnx.py
1
vim export_grid_sample_onnx.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import torch
import torch.nn as nn
import torch.nn.functional as F

import onnx
import onnxruntime as ort
import numpy as np

class GridSampleModel(nn.Module):
    """Minimal 2D model used to exercise the GridSample operator.

    The input first passes through a single 3x3 convolution (1 -> 1
    channel, padding 1, so the spatial size is unchanged) and is then
    resampled with ``F.grid_sample``.
    """

    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(in_channels=1, out_channels=1,
                              kernel_size=3, padding=1)

    def forward(self, x: torch.Tensor, grid: torch.Tensor) -> torch.Tensor:
        """Convolve ``x`` and sample the result at the locations in ``grid``.

        Args:
            x: 4D input tensor laid out as (N, C, H, W) with C == 1.
            grid: Sampling grid of normalized coordinates; for a 4D input
                ``F.grid_sample`` expects shape (N, H_out, W_out, 2).

        Returns:
            The resampled feature map of shape (N, C, H_out, W_out).
        """
        x = self.conv(x)

        # Bilinear interpolation, zeros outside the input, and
        # align_corners=True so that -1/+1 map to the corner pixel centers.
        out = F.grid_sample(
            x,
            grid,
            mode='bilinear',
            padding_mode='zeros',
            align_corners=True
        )
        return out

def main():
    """Export the 2D GridSample model to ONNX and compare it with PyTorch.

    Steps:
      1. Build the model and a fixed random input with an identity grid.
      2. Run the model in PyTorch.
      3. Export it to ``grid_sample_model.onnx`` (opset 16) with dynamic
         batch and spatial axes.
      4. Validate the exported file with the ONNX checker.
      5. Run it with ONNX Runtime and print the mean absolute difference
         against the PyTorch output.
    """
    torch.manual_seed(0)

    model = GridSampleModel().eval()

    N, C, H, W = 1, 1, 64, 64
    x = torch.randn(N, C, H, W)

    # Identity affine transform: the generated grid samples every pixel at
    # its own location.
    theta = torch.tensor([
        [[1.0, 0.0, 0.0],
         [0.0, 1.0, 0.0]]
    ], dtype=torch.float32)

    grid = F.affine_grid(theta, size=(N, C, H, W), align_corners=True)

    with torch.no_grad():
        out = model(x, grid)

    print("PyTorch output shape:", out.shape)

    onnx_model_path = "grid_sample_model.onnx"
    torch.onnx.export(
        model,
        (x, grid),
        onnx_model_path,
        export_params=True,
        opset_version=16,
        do_constant_folding=True,
        input_names=["input_x", "input_grid"],
        output_names=["output"],
        dynamic_axes={
            "input_x": {0: "batch_size", 2: "height", 3: "width"},
            "input_grid": {0: "batch_size", 1: "grid_height", 2: "grid_width"},
            "output": {0: "batch_size", 2: "out_height", 3: "out_width"}
        }
    )
    print(f"ONNX model saved to: {onnx_model_path}")

    onnx_model = onnx.load(onnx_model_path)
    onnx.checker.check_model(onnx_model)

    # Name the execution provider explicitly: ONNX Runtime builds that ship
    # additional providers refuse to create a session without it.
    ort_sess = ort.InferenceSession(
        onnx_model_path,
        providers=["CPUExecutionProvider"]
    )

    x_np = x.cpu().numpy()
    grid_np = grid.cpu().numpy()

    # run() returns one array per requested output; None requests all of
    # them and this model has a single output.
    onnx_out = ort_sess.run(
        None,
        {
            "input_x": x_np,
            "input_grid": grid_np
        }
    )[0]

    print("ONNX Runtime output shape:", onnx_out.shape)

    diff = np.mean(np.abs(out.cpu().numpy() - onnx_out))
    print(f"Mean absolute difference between PyTorch and ONNX outputs: {diff:.6f}")


if __name__ == "__main__":
    main()
  • 运行导出脚本
1
python3 export_grid_sample_onnx.py


4. 转换成om模型

  • 使用 ATC 工具将 ONNX 模型转换为 OM 模型:
1
atc --model=grid_sample_model.onnx --framework=5 --output=grid_sample_model --input_shape="input_x:1,1,64,64;input_grid:1,64,64,2" --soc_version=Ascend910B3


5. 开始推理验证
1
2
3
4
5
6
7
cd tools-master/ais-bench_workload/tool/ais_bench
pip3 wheel ./backend/ -v
pip3 wheel ./ -v
pip3 install ./aclruntime-0.0.2-cp311-cp311-linux_aarch64.whl --force-reinstall
pip3 install --force-reinstall "numpy<2.0"
pip3 install --no-deps ./ais_bench-0.0.2-py3-none-any.whl --force-reinstall
pip3 install ./ais_bench-0.0.2-py3-none-any.whl --force-reinstall
  • 生成输入数据generate_bin.py
1
2
3
mkdir prep_dataset
cd prep_dataset
vim generate_bin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import torch
import torch.nn.functional as F
import numpy as np

def main():
    """Write the two raw float32 input files used for OM inference.

    Creates ``input_x.bin`` (shape (1, 1, 64, 64)) and ``input_grid.bin``
    (identity sampling grid, shape (1, 64, 64, 2)) in the current working
    directory and prints their shape and dtype.

    NOTE: ``x`` is drawn immediately after seeding, whereas the export
    script constructs the model between seeding and sampling, so the two
    scripts do not necessarily produce the same ``x`` tensor.
    """
    torch.manual_seed(0)

    N, C, H, W = 1, 1, 64, 64
    x = torch.randn(N, C, H, W, dtype=torch.float32)

    # Identity affine transform: the grid samples every pixel at its own
    # location.
    theta = torch.tensor([
        [[1.0, 0.0, 0.0],
         [0.0, 1.0, 0.0]]
    ], dtype=torch.float32)

    grid = F.affine_grid(theta, size=(N, C, H, W), align_corners=True)

    x_np = x.cpu().numpy()
    grid_np = grid.cpu().numpy()

    # tofile() dumps the raw array bytes with no header, so shape and dtype
    # must be known by whoever reads the files back.
    x_np.tofile("input_x.bin")
    grid_np.tofile("input_grid.bin")

    print("Generated two .bin files:")
    print(f" - input_x.bin : shape {x_np.shape}, dtype {x_np.dtype}")
    print(f" - input_grid.bin : shape {grid_np.shape}, dtype {grid_np.dtype}")


if __name__ == "__main__":
    main()

运行脚本生成二进制输入文件,并创建相应目录:

1
2
3
4
5
python3 generate_bin.py
mkdir input_x_dir
mkdir input_grid_dir
mv input_x.bin input_x_dir/
mv input_grid.bin input_grid_dir/

  • 使用 ais_bench 进行推理
1
2
3
4
5
cd ..
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python -m ais_bench --model grid_sample_model.om
mkdir output
python3 -m ais_bench --model ./grid_sample_model.om --input ./prep_dataset/input_x_dir,./prep_dataset/input_grid_dir --output ./output/ --output_dirname result --outfmt TXT


6. 更多性能数据

创建或编辑 acl.json 文件:

1
vim acl.json
1
2
3
4
5
6
{
"profiler": {
"switch": "on",
"output": "./result/profiler"
}
}

执行推理并采集性能数据:

1
python3 -m ais_bench --model ./grid_sample_model.om --acl_json_path ./acl.json


7. 算子支持与注意事项
  • 查看CANN算子规格

如果使用GridSampler3D:

1
vim export_grid_sample_3D.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import torch
import torch.nn as nn
import torch.nn.functional as F
import onnx
import numpy as np

class GridSample3DModel(nn.Module):
    """Minimal 3D (volumetric) model used to exercise GridSample.

    The input first passes through a single 3x3x3 convolution (1 -> 1
    channel, padding 1, so the volume size is unchanged) and is then
    resampled with ``F.grid_sample``.
    """

    def __init__(self):
        super().__init__()
        self.conv3d = nn.Conv3d(in_channels=1, out_channels=1,
                                kernel_size=3, padding=1)

    def forward(self, x: torch.Tensor, grid: torch.Tensor) -> torch.Tensor:
        """Convolve ``x`` and sample the result at the locations in ``grid``.

        Args:
            x: 5D input tensor laid out as (N, C, D, H, W) with C == 1.
            grid: Sampling grid of normalized coordinates; for a 5D input
                ``F.grid_sample`` expects shape (N, D_out, H_out, W_out, 3).

        Returns:
            The resampled volume of shape (N, C, D_out, H_out, W_out).
        """
        x = self.conv3d(x)
        # For 5D inputs PyTorch's 'bilinear' mode performs trilinear
        # interpolation.
        out = F.grid_sample(
            x,
            grid,
            mode='bilinear',
            padding_mode='zeros',
            align_corners=True
        )
        return out

def main():
    """Run the 3D GridSample model in PyTorch and attempt an ONNX export.

    Builds a (1, 1, 8, 640, 640) random volume with an identity sampling
    grid, runs the model once in PyTorch, then calls ``torch.onnx.export``
    targeting ``grid_sample_3d_model.onnx`` with opset 16.

    NOTE: according to the limitations described in this guide, the export
    step is expected to fail for 5D inputs; the script exists to
    demonstrate that behaviour.
    """
    torch.manual_seed(0)

    model = GridSample3DModel().eval()

    N, C, D, H, W = 1, 1, 8, 640, 640
    x = torch.randn(N, C, D, H, W, dtype=torch.float32)

    # Identity 3D affine transform (3x4): the grid samples every voxel at
    # its own location.
    theta = torch.tensor([[
        [1.0, 0.0, 0.0, 0.0],
        [0.0, 1.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 0.0],
    ]], dtype=torch.float32)

    grid = F.affine_grid(theta, size=(N, C, D, H, W), align_corners=True)

    with torch.no_grad():
        out = model(x, grid)
    print("PyTorch output shape:", out.shape)

    onnx_model_path = "grid_sample_3d_model.onnx"
    torch.onnx.export(
        model,
        (x, grid),
        onnx_model_path,
        export_params=True,
        opset_version=16,
        do_constant_folding=True,
        input_names=["input_x", "input_grid"],
        output_names=["output"],
        # Static shapes are exported here. Uncomment the block below to
        # mark batch and spatial axes as dynamic instead.
        # dynamic_axes={
        #     "input_x": {0: "batch_size", 2: "depth", 3: "height", 4: "width"},
        #     "input_grid": {0: "batch_size", 1: "grid_depth", 2: "grid_height", 3: "grid_width"},
        #     "output": {0: "batch_size", 2: "out_depth", 3: "out_height", 4: "out_width"}
        # }
    )
    print(f"ONNX model saved to: {onnx_model_path}")


if __name__ == "__main__":
    main()
1
python3 export_grid_sample_3D.py

  • GridSampler2D 与 GridSampler3D 的限制

    • PyTorch ONNX 导出限制:目前 torch.onnx.export 仅支持 2D GridSample(4D 输入),不支持 3D(5D)输入,会报错。

    • Ascend AI Core 支持

      • GridSampler3D 有 AI Core Kernel 实现,需 float32 前向;
      • GridSampler2D 目前仅支持在 AICPU 上执行,无法使用 AI Core。
  • 3D grid_sample 的计算量通常远大于 2D。如果原本只是 2D 任务,人为增加一个维度改用 3D 并不会带来功能上的收益,只是为了让算子能够在 AI Core 上执行。

  • 这是一个前向支持 / 后向不支持的状况。如果既要在 Ascend AI Core 上做推理,又要走 ONNX/OM 转换流程,目前没有官方开箱即用的方案。


8. 解决思路:

面对上述限制,以下是可能的应对思路:

  • 使用 2D GridSample:如果不强制使用 AI Core 加速,可使用 2D GridSample,会在 AICPU 上运行,性能较低但流程简单。

  • 直接在 PyTorch + Ascend NPU 上推理:跳过 ONNX/OM 转换,使用 torch_npu 在 Ascend NPU 上直接运行模型。

  • 使用 GridSampler3D

    • 构造 5D 输入 x(形状 [N, C, D, H, W],dtype float32)和 5D 网格 grid(形状 [N, D, H, W, 3],dtype float32)。

    • 在计算图中添加 GridSampler3D 节点,并设置相关属性。

    • 使用 ATC 或其他编译器,将计算图编译为 .om 文件,在 AI Core 上执行。注意这种方式需要深入了解 Ascend 图编译流程。

  • 自定义实现:自行实现 3D GridSample 的 ONNX symbolic 函数或 Ascend 自定义算子,但工作量较大。


9. 查询算子支持
  • 使用 ms_fast_query 工具查询算子支持情况:
1
2
cd /usr/local/Ascend/ascend-toolkit/latest/tools/ms_fast_query
python3 ms_fast_query.py -t op --opp_path /usr/local/Ascend/ascend-toolkit/latest/opp -o /home/ljw/op.json

查询结果会生成在指定的 op.json 文件中,可用于查看算子支持详情。


10. 查看算子原型
1
2
cd /usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_proto/
cat image_ops.h