Industry Classification Based on BERT
Project overview
Dataset:
This project uses a subset of THUCNews. Each sample is a headline extracted from a news article, so the task is short-text (title) classification.
Titles are between 20 and 30 characters long. There are 10 classes with 20,000 samples each, and text is fed into the model character by character.
Classes: 财经 (finance), 房产 (realty), 股票 (stocks), 教育 (education), 科技 (science and technology), 社会 (society), 时政 (current politics), 体育 (sports), 游戏 (games), 娱乐 (entertainment).
Dataset        | Size    |
Training set   | 180,000 |
Validation set | 10,000  |
Test set       | 10,000  |
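Each line of train.txt, dev.txt and test.txt holds one title and its numeric label, separated by a tab (the preprocessing code below splits on "\t"). A minimal sanity check, using a made-up example line:

line = "某地将举办青少年足球锦标赛\t7"  # hypothetical sample: title, then TAB, then label index
content, label = line.strip().split("\t")
assert content and 0 <= int(label) < 10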
Data iterator:
There are two ways to read data during training.
One is to preprocess all the data in advance, save it to a file, and read that file at training time.
The other is to build a data iterator, so that preprocessing and training run concurrently (a minimal sketch follows below).
Advantage: when the dataset is large, only one batch is loaded into GPU memory at a time, which effectively prevents out-of-memory errors.
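As a minimal sketch of this second approach (the names here are illustrative, not from the project), a Python generator that yields one batch at a time looks like this:

def batch_generator(samples, batch_size):
    # Yields one batch at a time instead of materializing the whole dataset up front
    for i in range(0, len(samples), batch_size):
        yield samples[i:i + batch_size]

The project's actual DatasetIterater class appears later in this write-up.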
Project structure:
│  predict.py      prediction code
│  run.py          main entry point
│  train_eval.py   training / validation / test code
│  utils.py        data preprocessing
│
├─bert_pretrain
│      bert_config.json    hyperparameter configuration file
│      pytorch_model.bin   pretrained weights
│      vocab.txt           vocabulary file
│
├─models
│      bert.py             model and hyperparameter definitions
│
└─THUCNews
    ├─data
    │      class.txt   class names
    │      dev.txt     validation set
    │      test.txt    test set
    │      train.txt   training set
    │
    └─saved_dict
            bert.ckpt   saved model checkpoint
Main entry point:
import time
import argparse
from importlib import import_module

import numpy as np
import torch

from train_eval import train
from utils import build_dataset, build_iterator, get_time_dif

parser = argparse.ArgumentParser(description="chinese text classification")
parser.add_argument('--model', type=str, required=True, help="choose model")
args = parser.parse_args()

if __name__ == "__main__":
    dataset = 'THUCNews'                       # dataset root directory
    model_name = args.model                    # model name
    x = import_module('models.' + model_name)  # load the matching module under models/
    config = x.Config(dataset)                 # model configuration class

    # Fix the random seeds (numpy, CPU, GPU) and pin cuDNN to deterministic algorithms
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)
    torch.backends.cudnn.deterministic = True

    start_time = time.time()
    print("Loading data...")
    # Preprocess the data
    train_data, dev_data, test_data = build_dataset(config)
    # Build iterators for the training, validation and test sets
    train_iter = build_iterator(train_data, config)
    dev_iter = build_iterator(dev_data, config)
    test_iter = build_iterator(test_data, config)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # Build the model and move it to the target device
    model = x.Model(config).to(config.device)
    train(config, model, train_iter, dev_iter, test_iter)
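With the argparse setup above, training is launched from the command line as: python run.py --model bert. The --model value selects which module under models/ is imported, here models/bert.py.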
Model and configuration
Configuration class: Config
import torch
from pytorch_pretrained_bert import BertTokenizer

class Config(object):
    def __init__(self, dataset):
        self.model_name = 'bert'
        # Training, validation and test sets
        self.train_path = dataset + '/data/train.txt'
        self.dev_path = dataset + '/data/dev.txt'
        self.test_path = dataset + '/data/test.txt'
        # Class names
        self.class_list = [x.strip() for x in open(dataset + '/data/class.txt').readlines()]
        # Where the trained model is saved
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'
        # Train on GPU if available, otherwise on CPU
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Stop early if there is no improvement for this many batches
        self.require_improvement = 1000
        # Number of classes
        self.num_classes = len(self.class_list)
        # Number of epochs
        self.num_epochs = 3
        # Batch size
        self.batch_size = 128
        # Sequence length: shorter inputs are padded, longer ones truncated
        self.pad_size = 32
        # Learning rate
        self.learning_rate = 5e-5
        # Pretrained model files: 1. weights (.bin)  2. config (.json)  3. vocabulary (vocab.txt)
        self.bert_path = './bert_pretrain'
        # Tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
        # Hidden state dimension of BERT (768 for bert-base)
        self.hidden_size = 768
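Note: BertTokenizer, BertModel, BertAdam and the output_all_encoded_layers flag used in this project match the older pytorch_pretrained_bert package (the predecessor of Hugging Face transformers), so the import lines in these listings assume that package.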
Model definition: Model
import torch.nn as nn
from pytorch_pretrained_bert import BertModel

class Model(nn.Module):
    def __init__(self, config):
        super(Model, self).__init__()
        # Load the pretrained BERT
        self.bert = BertModel.from_pretrained(config.bert_path)
        # Fine-tune: keep gradients enabled so BERT's weights are updated during training
        for param in self.bert.parameters():
            param.requires_grad = True
        # Classification head: a fully connected layer on top of the pooled output
        self.fc = nn.Linear(config.hidden_size, config.num_classes)

    def forward(self, x):
        context = x[0]  # token ids, shape (batch_size, pad_size)
        mask = x[2]     # attention mask: 1 for real tokens, 0 for padding
        # output_all_encoded_layers=False returns only the last of the 12 encoder layers;
        # pooled is the [CLS] representation with shape (batch_size, 768)
        _, pooled = self.bert(context, attention_mask=mask, output_all_encoded_layers=False)
        out = self.fc(pooled)  # logits, shape (batch_size, num_classes)
        return out
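A quick, hypothetical smoke test of the shapes (it assumes the project layout above and that ./bert_pretrain contains the pretrained files):

import torch
from models.bert import Config, Model  # assumed project layout

config = Config('THUCNews')
model = Model(config)
ids = torch.randint(100, 1000, (2, config.pad_size))           # fake token ids
seq_len = torch.full((2,), config.pad_size, dtype=torch.long)  # sequence lengths
mask = torch.ones(2, config.pad_size, dtype=torch.long)        # all-ones mask (no padding)
logits = model((ids, seq_len, mask))
print(logits.shape)  # expected: torch.Size([2, 10])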
Data preprocessing:
from tqdm import tqdm

# PAD: padding token, used so that all inputs have the same length
# CLS: prepended to every sentence; its representation is used for classification
PAD, CLS = '[PAD]', '[CLS]'

def build_dataset(config):
    def load_dataset(path, pad_size=32):
        # Pad or truncate every sample to pad_size
        contents = []
        with open(path, 'r', encoding="utf-8") as f:
            for line in tqdm(f):
                lin = line.strip()  # drop surrounding whitespace and the newline
                if not lin:
                    continue
                content, label = lin.split("\t")  # fields are tab-separated
                token = config.tokenizer.tokenize(content)  # BERT's built-in character-level tokenization
                token = [CLS] + token                       # prepend [CLS]
                seq_len = len(token)
                mask = []  # distinguishes real tokens from padding
                # Map tokens to indices using the vocabulary file
                token_ids = config.tokenizer.convert_tokens_to_ids(token)
                # Truncate long sequences, pad short ones
                if pad_size:
                    if len(token) < pad_size:
                        mask = [1] * len(token) + [0] * (pad_size - len(token))
                        token_ids += [0] * (pad_size - len(token))
                    else:
                        mask = [1] * pad_size
                        token_ids = token_ids[:pad_size]
                        seq_len = pad_size
                contents.append((token_ids, int(label), seq_len, mask))
        return contents

    train = load_dataset(config.train_path, config.pad_size)
    dev = load_dataset(config.dev_path, config.pad_size)
    test = load_dataset(config.test_path, config.pad_size)
    return train, dev, test
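For concreteness, here is a worked example (with a made-up ten-character title) of the pad-and-mask step when pad_size is 32:

token = ['[CLS]'] + list('某地举办青少年足球赛')  # 1 + 10 = 11 tokens
seq_len = len(token)                              # 11
mask = [1] * 11 + [0] * 21                        # 32 entries: 1 = real token, 0 = padding
# token_ids is likewise padded with 21 zeros so that len(token_ids) == 32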
Building the data iterator
import torch

class DatasetIterater(object):
    def __init__(self, batches, batch_size, device):
        self.batch_size = batch_size
        self.batches = batches
        self.n_batches = len(batches) // batch_size
        self.residue = False  # True when the data does not divide into whole batches
        if len(batches) % batch_size != 0:
            self.residue = True
        self.device = device
        self.index = 0

    def _to_tensor(self, datas):
        # Convert token ids, labels, sequence lengths and masks to tensors
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)
        seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
        mask = torch.LongTensor([_[3] for _ in datas]).to(self.device)
        return (x, seq_len, mask), y

    def __next__(self):
        if self.residue and self.index == self.n_batches:
            # Final, partial batch
            batches = self.batches[self.index * self.batch_size:len(self.batches)]
            self.index += 1
            return self._to_tensor(batches)
        elif self.index >= self.n_batches:
            self.index = 0
            raise StopIteration
        else:
            batches = self.batches[self.index * self.batch_size:(self.index + 1) * self.batch_size]
            self.index += 1
            return self._to_tensor(batches)

    def __iter__(self):
        return self

    def __len__(self):
        if self.residue:
            return self.n_batches + 1
        return self.n_batches

def build_iterator(dataset, config):
    return DatasetIterater(dataset, config.batch_size, config.device)
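Usage mirrors run.py: each iteration yields the (token_ids, seq_len, mask) tuple that Model.forward expects, together with the label tensor.

train_iter = build_iterator(train_data, config)
for (token_ids, seq_len, mask), labels in train_iter:
    # token_ids and mask have shape (batch_size, pad_size); labels has shape (batch_size,)
    ...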
Building the training pipeline
Training
import time
import numpy as np
import torch
import torch.nn.functional as F
from sklearn import metrics
from pytorch_pretrained_bert.optimization import BertAdam

from utils import get_time_dif

def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()
    # Switch to training mode
    model.train()
    param_optimizer = list(model.named_parameters())
    # Parameters that should not receive weight decay
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # Split the parameters into a decayed group and a non-decayed group
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    # Build the optimizer
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)
    total_batch = 0               # number of batches seen so far
    dev_best_loss = float('inf')  # best validation loss so far
    last_improve = 0              # batch at which the validation loss last improved
    flag = False                  # set to True once early stopping triggers
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            # Forward pass
            outputs = model(trains)
            # Clear old gradients; otherwise they would accumulate
            model.zero_grad()
            # Cross-entropy loss
            loss = F.cross_entropy(outputs, labels)
            # Backpropagate to compute gradients
            loss.backward()
            # Update the parameters
            optimizer.step()
            # Report metrics every 100 batches
            if total_batch % 100 == 0:
                # Move data to the CPU to compute metrics
                true = labels.data.cpu()
                predict = torch.max(outputs.data, 1)[1].cpu()
                # Training accuracy on the current batch
                train_acc = metrics.accuracy_score(true, predict)
                # Validation accuracy and loss
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                # Save the model whenever the validation loss improves
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = "Iter: {0:>6}, Train loss: {1:>5.2}, Train Acc: {2:>6.2%}, Val loss: {3:>5.2}, Val Acc: {4:>6.2%}, Time: {5} {6}"
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                model.train()
            total_batch += 1
            # Early stopping
            if total_batch - last_improve > config.require_improvement:
                print("Early stopping")
                flag = True
                break
        if flag:
            break
    test(config, model, test_iter)
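In pytorch_pretrained_bert's BertAdam, warmup=0.05 means the learning rate rises linearly over the first 5% of the t_total optimization steps and then decays, the usual BERT fine-tuning schedule; t_total is accordingly set to the number of batches per epoch times the number of epochs.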
Validation
def evaluate(config, model, data_iter, test=False):
    # Switch to evaluation mode
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for texts, labels in data_iter:
            outputs = model(texts)
            loss = F.cross_entropy(outputs, labels)
            loss_total += loss
            labels = labels.data.cpu().numpy()
            predict = torch.max(outputs.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predict_all = np.append(predict_all, predict)
    acc = metrics.accuracy_score(labels_all, predict_all)
    if test:
        # Text report with per-class precision, recall and F1 score
        report = metrics.classification_report(labels_all, predict_all,
                                               target_names=config.class_list, digits=4)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / len(data_iter), report, confusion
    return acc, loss_total / len(data_iter)
Testing
def test(config, model, test_iter):
    # Load the best checkpoint saved during training
    model.load_state_dict(torch.load(config.save_path))
    model.eval()
    start_time = time.time()
    test_acc, test_loss, test_report, test_confusion = evaluate(config, model, test_iter, test=True)
    msg = "Test loss: {0:>5.2}, Test acc: {1:>6.2%}"
    print(msg.format(test_loss, test_acc))
    print("Classification report:")
    print(test_report)
    print("Confusion matrix:")
    print(test_confusion)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)
Prediction
import os
import torch
from importlib import import_module

# Human-readable class labels, indexed in the same order as class.txt
key = {
    0: '金融',
    1: '房产',
    2: '股票',
    3: '教育',
    4: '科技',
    5: '社会',
    6: '政治',
    7: '体育',
    8: '游戏',
    9: '娱乐'
}

cru = os.path.dirname(__file__)
path = os.path.join(cru, 'THUCNews')
model_name = 'bert'
x = import_module('bert_demo.models.' + model_name)
config = x.Config(path)
model = x.Model(config).to("cpu")
model.load_state_dict(torch.load(config.save_path, map_location='cpu'))

def build_predict_text(text):
    # Same tokenize / pad / mask logic as in training, for a single sentence
    token = config.tokenizer.tokenize(text)
    token = ['[CLS]'] + token
    seq_len = len(token)
    mask = []
    token_ids = config.tokenizer.convert_tokens_to_ids(token)
    pad_size = config.pad_size
    if pad_size:
        if len(token) < pad_size:
            mask = [1] * len(token_ids) + [0] * (pad_size - len(token))
            token_ids += [0] * (pad_size - len(token))
        else:
            mask = [1] * pad_size
            token_ids = token_ids[:pad_size]
            seq_len = pad_size
    ids = torch.LongTensor([token_ids])
    seq_len = torch.LongTensor([seq_len])
    mask = torch.LongTensor([mask])
    return ids, seq_len, mask

def predict(text):
    data = build_predict_text(text)
    with torch.no_grad():
        outputs = model(data)
        num = torch.argmax(outputs)  # index of the highest-scoring class
    return key[int(num)]

if __name__ == '__main__':
    print(predict("福建省政务云平台基础设施运维服务项25555年招标公告"))