Update 2021-12-02
By setting the flags

torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False

you can try to work around the default low-precision (TF32) computation on RTX 30-series GPUs.
Update 2022-01-07
According to feedback from qq_17755303 in the comments, this method effectively fixes the bug.
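Note that on older PyTorch builds (such as the 1.6 used for testing here) these attributes do not exist; the full script below therefore guards the assignment. A minimal standalone sketch of that guard:

import torch

# Disable TF32 where the flags exist; older PyTorch builds have no TF32 flags,
# in which case there is simply nothing to disable.
try:
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False
except AttributeError:
    print('This PyTorch version has no tf32 flags.')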
Update 2021-11-18
Low-precision computation has been observed on 30-series GPUs for unknown reasons.
If the test reports a failure immediately after starting, and it does so every time, this test code is currently not suitable for your GPU; please leave a comment with your GPU model, driver version, and operating system type so I can analyze the cause.
If the test only reports a failure after running for a while, the code is fine and your GPU or machine probably has a problem.
Related incident: https://blog.csdn.net/ONE_SIX_MIX/article/details/109251125
GitHub repository: https://github.com/One-sixth/check_cuda_numerical_stability
A partially damaged GPU, or a loose PCIe slot, can produce incorrectly computed values during operation.
I wrote a Python script to detect such CUDA computation problems.
Principle: use invertible operations. Run a computation, invert it to recover the input, and compare the recovered values with the original ones; if the difference is large, the CUDA computation has gone wrong.
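As a minimal sketch of this idea (the function f below is only a hypothetical stand-in for the convolutional block used in the real script):

import torch

def f(x):
    # Hypothetical stand-in for the convolutional sub-network in the full script.
    return torch.tanh(x) * 3.0

def forward(x1, x2):
    # Additive coupling: (x1, x2) -> (x2, x1 + f(x2)) loses no information.
    return x2, x1 + f(x2)

def invert(y1, y2):
    # Exact inverse of the coupling above.
    x2, y = y1, y2
    return y - f(x2), x2

x = torch.randn(4, 8)
b1, b2 = forward(x, x)
r1, r2 = invert(b1, b2)
# On healthy hardware both differences should be tiny (only float rounding error).
print(torch.abs(x - r1).max().item(), torch.abs(r1 - r2).max().item())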
On my machine (GTX 1070 Ti), the error is always below 1e-5.
On the machine described in the related incident above, the error is also normally below 1e-5, but it occasionally jumps to single digits, which is enough to show that the GPU's CUDA computation is faulty.
The code below uses about 2.9 GB of GPU memory and, with the default parameters, runs for half an hour. If the program finishes with "Test passed", your GPU should be fine.
If the program finishes with "Test failed", your GPU needs repair, or the slot has a poor connection.
Tested with PyTorch 1.6 and CUDA 10.2.
Usage
Copy the code below into a text file and rename it to _check_cuda_numerical_stability.py, then run python _check_cuda_numerical_stability.py to start the test immediately. Three optional arguments are supported (example invocations follow this list):
-i   Index of the GPU to test. Default: 0. Specify -1 to run the test on the CPU instead.
-t   Test duration in minutes. Default: 30. A value less than or equal to 0 makes the test run indefinitely.
-bs  Batch size used during the test. Default: 20. Larger values use more GPU memory; there is usually no need to change it.
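For example (both invocations use only the arguments described above):

# Test GPU 0 with the default settings (30 minutes, batch size 20)
python _check_cuda_numerical_stability.py

# Test GPU 1 for 60 minutes with a smaller batch size
python _check_cuda_numerical_stability.py -i 1 -t 60 -bs 10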
CUDA numerical stability test tool
'''
Used to detect CUDA computation errors.
'''
import torch
import torch.nn as nn
from torch.backends import cudnn
import argparse
import time
import math


def ConvBnAct(in_ch, out_ch, ker_sz, stride, pad, act=nn.Identity(), group=1, dilation=1):
    return nn.Sequential(
        nn.Conv2d(in_ch, out_ch, ker_sz, stride, pad, groups=group, bias=False, dilation=dilation),
        nn.BatchNorm2d(out_ch, eps=1e-8, momentum=0.9),
        act)


def DeConvBnAct(in_ch, out_ch, ker_sz, stride, pad, act=nn.Identity(), group=1, dilation=1):
    return nn.Sequential(
        nn.ConvTranspose2d(in_ch, out_ch, ker_sz, stride, pad, groups=group, bias=False, dilation=dilation),
        nn.BatchNorm2d(out_ch, eps=1e-8, momentum=0.9),
        act)


class RevSequential(nn.ModuleList):
    '''
    Mostly overlaps with the functionality of ModuleList, but every child must be invertible.
    '''
    def __init__(self, modules=None):
        super().__init__(modules)

    def append(self, module):
        assert hasattr(module, 'invert') and callable(module.invert)
        super().append(module)

    def extend(self, modules):
        for m in modules:
            self.append(m)

    def forward(self, x1, x2):
        y1, y2 = x1, x2
        for m in self:
            y1, y2 = m(y1, y2)
        return y1, y2

    def invert(self, y1, y2):
        x1, x2 = y1, y2
        for m in list(self)[::-1]:
            x1, x2 = m.invert(x1, x2)
        return x1, x2


class RevGroupBlock(RevSequential):
    '''
    Currently only supports in_ch == out_ch and does not allow downsampling.
    '''
    def __init__(self, in_ch, out_ch, stride, act, block_type, blocks, **kwargs):
        assert in_ch == out_ch
        assert stride == 1
        mods = []
        for _ in range(blocks):
            mods.append(block_type(in_ch=in_ch, out_ch=out_ch, stride=1, act=act, **kwargs))
        # self.extend(mods)
        super().__init__(mods)


class RevBlockC(nn.Module):
    def __init__(self, in_ch, out_ch, stride, act, **kwargs):
        super().__init__()
        inter_ch = in_ch // 2
        self.conv1 = ConvBnAct(in_ch, inter_ch, ker_sz=3, stride=1, pad=1, act=act)
        self.conv2 = ConvBnAct(inter_ch, inter_ch, ker_sz=5, stride=1, pad=2, act=act, group=inter_ch)
        self.conv3 = ConvBnAct(in_ch, in_ch, ker_sz=1, stride=1, pad=0, act=nn.Identity())

    def func(self, x):
        y1 = self.conv1(x)
        y2 = self.conv2(y1)
        y = torch.cat([y1, y2], dim=1)
        y = self.conv3(y)
        return y

    def forward(self, x1, x2):
        # Additive coupling: the output pair can be inverted exactly.
        y = x1 + self.func(x2)
        return x2, y

    def invert(self, y1, y2):
        x2, y = y1, y2
        x1 = y - self.func(x2)
        return x1, x2


if __name__ == '__main__':
    cudnn.benchmark = False
    cudnn.deterministic = True
    torch.set_grad_enabled(False)
    # Disable tf32 features. Fix low numerical accuracy on rtx30xx gpu.
    try:
        torch.backends.cuda.matmul.allow_tf32 = False
        torch.backends.cudnn.allow_tf32 = False
    except AttributeError:
        print('Info: this PyTorch version does not support the tf32 flags.')

    parse = argparse.ArgumentParser(description='Used to detect CUDA numerical stability problems.')
    parse.add_argument('-i', type=int, help='card id. Which cuda card do you want to test. default: 0', default=0)
    parse.add_argument('-t', type=int, help='minute. Test duration. When the setting is less than or equal to 0, it will not stop automatically. default: 30', default=30)
    parse.add_argument('-bs', type=int, help='Test batch size when testing. default: 20', default=20)
    parse = parse.parse_args()

    duration = parse.t * 60
    if duration <= 0:
        duration = math.inf
    card_id = parse.i
    if card_id == -1:
        # Testing on the CPU should, in theory, never fail.
        device = torch.device('cpu')
    else:
        device = torch.device(f'cuda:{card_id}')
    batch_size = parse.bs
    assert batch_size > 0

    start_time = time.time()
    test_count = 0

    act = nn.ELU()
    rvb = RevGroupBlock(128, 128, 1, act, RevBlockC, 32).to(device)
    rvb.eval()

    is_no_error = True

    print('CUDA numerical stability test begin.')
    while is_no_error:
        cur_time = time.time()
        if cur_time - start_time > duration:
            break
        test_count += 1
        if test_count % 50 == 0:
            # Refresh the network weights every 50 iterations.
            rvb = RevGroupBlock(128, 128, 1, act, RevBlockC, 32).to(device)
            rvb.eval()

        a1 = torch.randn(batch_size, 128, 128, 128, device=device)
        b1, b2 = rvb(a1, a1)
        o_a1, o_a2 = rvb.invert(b1, b2)
        max_diff_1 = torch.abs(o_a1 - o_a2).max()
        max_diff_2 = torch.abs(a1 - o_a1).max()

        line = f'elapsed/total: {int(cur_time-start_time)}/{duration} card_id: {card_id} count: {test_count} max_diff_1: {max_diff_1:.8f} max_diff_2: {max_diff_2:.8f}'
        print(line)

        if max_diff_1 > 1e-3 or max_diff_2 > 1e-3:
            print('A large numerical error was found!')
            is_no_error = False

    if is_no_error:
        print(f'Test passed. Card ID: {card_id}')
    else:
        print(f'Test failed. Card ID: {card_id}')
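If you would rather run a single one-shot check from Python instead of the command line, a minimal sketch could look like the following. It assumes the code above was saved as _check_cuda_numerical_stability.py in the current directory, and reuses the same 1e-3 threshold as the script:

import torch
import torch.nn as nn
from _check_cuda_numerical_stability import RevGroupBlock, RevBlockC

# Disable TF32 as the script's __main__ does, so reduced precision on RTX 30xx
# cards is not mistaken for broken hardware (the flags may be absent on old PyTorch).
try:
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False
except AttributeError:
    pass

device = torch.device('cuda:0')
with torch.no_grad():
    rvb = RevGroupBlock(128, 128, 1, nn.ELU(), RevBlockC, 32).to(device).eval()
    a1 = torch.randn(20, 128, 128, 128, device=device)
    b1, b2 = rvb(a1, a1)
    o_a1, o_a2 = rvb.invert(b1, b2)
    max_diff = max(torch.abs(o_a1 - o_a2).max().item(),
                   torch.abs(a1 - o_a1).max().item())
    print('max diff:', max_diff, '(suspicious)' if max_diff > 1e-3 else '(ok)')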