相关链接
【2021 高校大数据挑战赛-智能运维中的异常检测与趋势预测】1 赛后总结与分析
【2021 高校大数据挑战赛-智能运维中的异常检测与趋势预测】2 方案设计与实现-Python
【完整代码下载 823316627Bandeng 的github】
1 问题一
1.1 算法过程描述
(1)异常点个数检测思路:对每个小区的四个指标单独检测。即需要检测58*4条序列。并采用两种算法,取两种算法的并集为最终的检测结果。
(2)异常周期个数统计思路:根据对时间序列周期的分析,选择6个小时为一个时间周期,我们统计6个小时以内的出现两次或两次以上的异常点,就视为这个周期为异常周期。
1.2 算法实现
(1)读取数据,并只要四个KPI指标、小区编号、基站编号
import pandas as pd
from adtk.data import validate_series
from adtk.visualization import plot
from adtk.detector import QuantileAD
from adtk.detector import AutoregressionAD
from adtk.detector import SeasonalAD
import matplotlib.pyplot as plt
all_data = pd.read_excel('data/附件1:赛题A数据.xlsx', sheet_name='比赛数据-脱敏')
all_data = all_data.rename(columns={'时间': 'time', '小区内的平均用户数': 'avg_user', '基站编号':'base_id','小区编号': 'community_id', '小区PDCP层所接收到的上行数据的总吞吐量比特': 'upward_throughput',
'小区PDCP层所发送的下行数据的总吞吐量比特': 'downward_throughput', '平均激活用户数': 'avg_activate_users'})
f_train = all_data[['time', 'avg_user', 'base_id','community_id',
'upward_throughput', 'downward_throughput', 'avg_activate_users']]
train = f_train.copy()
# 增加id一列,为了打乱顺序后,还能恢复最原始的排列顺序
train['id'] = all_data.index
(2)根据小区编号,小区分组
# 按小区分组,分成58个Dataframe
all_df = []
for id in train['community_id'].value_counts().keys():
com_train = train.loc[train['community_id']==int(id)]
# com_train = com_train.drop(columns =['time'])
all_df.append(com_train)
(3)第一个小区四个指标检测可视化
# 第一个小区,绘制平均激活用户指标 异常检测图
data_one = all_df[0]
data_one.index = pd.DatetimeIndex(data_one.time)
train_avg_user = data_one[['avg_user', 'upward_throughput', 'downward_throughput', 'avg_activate_users']]
df = validate_series(train_avg_user)
seasonal_ad = SeasonalAD(c=1, side="both")
anomalies1 = seasonal_ad.fit_detect(df)
autoregression_ad = AutoregressionAD(n_steps=7*1, step_size=12, c=1)
anomalies2 = autoregression_ad.fit_detect(df)
quantile_ad = QuantileAD(high=0.9, low=0.1)
anomalies3 = quantile_ad.fit_detect(df)
# plot(df, anomaly=anomalies1, ts_markersize=1, anomaly_color='red',
# anomaly_tag="marker", anomaly_markersize=2)
plot(df, anomaly=anomalies2, ts_markersize=1, anomaly_color='red',
anomaly_tag="marker", anomaly_markersize=2)
# plot(df, anomaly=anomalies3, ts_markersize=1, anomaly_color='red',
# anomaly_tag="marker", anomaly_markersize=2)
# 保存图片
# plt.savefig('com_1_detection.png'
(4)统计异常点个数
kpi_df = []
# 取每个指标
dataset = []
for kpi in ['avg_user', 'upward_throughput', 'downward_throughput', 'avg_activate_users']:
# 取每个小区
error_count = 0
all_community = []
for j in range(len(all_df)):
# print(j)
com_1_train = all_df[j]
com_1_train = com_1_train.copy()
# 以时间戳作为索引
com_1_train.index = pd.DatetimeIndex(com_1_train.time)
train_avg_user = com_1_train[[kpi]]
df = validate_series(train_avg_user)
autoregression_ad = AutoregressionAD(n_steps=7*1, step_size=12, c=1)
anomalies1 = autoregression_ad.fit_detect(df)
quantile_ad = QuantileAD(high=0.9, low=0.1)
anomalies2 = quantile_ad.fit_detect(df)
index_list = anomalies1.index
# 按异常值索引给所有数据打标签
label_name = 'label_tmp'
for idx in index_list:
tag1 = anomalies1.at[str(idx), kpi]
tag2 = anomalies2.at[str(idx), kpi]
# 取两种检测算法的并集
if tag1 == True and tag2 == 1:
com_1_train.loc[str(idx), label_name] = 1
error_count += 1
else:
com_1_train.loc[str(idx), label_name] = 0
com_1_train.index = list(range(len(com_1_train)))
all_community.append(com_1_train)
one_kpi_data = pd.concat(all_community)
one_kpi_data.set_index(["id"], inplace=True)
one_kpi_data.sort_index(inplace=True)
kpi_df.append(one_kpi_data)
print('{} 异常点有{}'.format(kpi, error_count))
(5)给每个指标标记异常与否标签
# 给每个指标打label
label_1 = kpi_df[0]['label_tmp']
label_2 = kpi_df[1]['label_tmp']
label_3 = kpi_df[2]['label_tmp']
label_4 = kpi_df[3]['label_tmp']
dataset = f_train.copy()
for i,kpi in enumerate(['avg_user', 'upward_throughput', 'downward_throughput', 'avg_activate_users']):
l_name = '{}_label'.format(kpi)
dataset[l_name] = kpi_df[i]['label_tmp']
(6)统计异常周期个数
com_df = []
for id in dataset['community_id'].value_counts().keys():
com_train = dataset.loc[dataset['community_id'] == int(id)]
com_df.append(com_train)
for kpi in ['avg_user', 'upward_throughput','downward_throughput', 'avg_activate_users']:
# 取每个小区
n = 0
all_community = []
for j in range(len(com_df)):
# print(j)
com_1_train = com_df[j]
com_1_train = com_1_train.copy()
label_name = '{}_label'.format(kpi)
s = com_1_train[label_name].values.tolist()
j = 0
# 6的决定是根据傅里叶周期性判断分析得到的,具体看FFT_judge_Cycle.ipynb文件
cycle_time = 6
while j < len(s):
if s[j:j+cycle_time].count(1) > 1:
n += 1
j += cycle_time
print('{} 异常周期点有{}'.format(kpi, n))
2 问题二
2.1 算法过程描述
(1)异常预测思路:我针对整个数据集进行建模,视为一个多变量分类问题。模型输入的变量有平均用户数、上行PDCP、上行PDCP、平均激活用户、小区编号、基站编号。其中小区编号、基站编号变量属于类别特征,需要进行特征编码,我采用的one-hot编码。输出是未来数据的异常与否label。如下封装训练集的图所示。
2.2 算法实现
(1)导入包
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import LSTM, TimeDistributed
from tensorflow.keras.models import Sequential
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
(2)封装训练集
def get_batch(train_x, train_y, TIME_STEPS):
data_len = len(train_x) - TIME_STEPS
seq = []
res = []
for i in range(data_len):
seq.append(train_x[i:i + TIME_STEPS])
res.append(train_y[i:i + TIME_STEPS]) # 取后5组数据
seq, res = np.array(seq), np.array(res)
return seq, res
封住过程数据之前,需要构造有label的数据集,label是选择四个指标异常检测结果的或。比如说这条数据的只要有一个指标是异常,那整条数据就是异常,只有四个指标都不异常,这条数据才不异常,具体实现train_model()函数
(3)搭建模型-TensorFlow2实现
def build_model(TIME_STEPS, INPUT_SIZE, OUTPUT_SIZE):
model = Sequential()
model.add(LSTM(units=128, activation='relu',
return_sequences=True, input_shape=(TIME_STEPS, INPUT_SIZE)))
model.add(Dropout(0.2))
model.add(LSTM(units=64, activation='relu', return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=32, activation='relu', return_sequences=True))
model.add(Dropout(0.2))
#全连接,输出, add output layer
model.add(TimeDistributed(Dense(OUTPUT_SIZE,activation='sigmoid')))
model.summary()
model.compile(metrics=['accuracy'], loss=['binary_crossentropy'], optimizer='adam')
return model
(4)模型训练
时间步长和预测长度,这两个是超参数,会影响模型的收敛。我在做的过程,有可能第一问的label没有做好,在设置不同的时间步长、预测时长。
def train_model():
TIME_STEPS = 58*4 # 时间步长
PRED_SIZE = 1*24*58 # 预测长度
OUTPUT_SIZE = 1
BATCH_START = 24*10
INPUT_SIZE = 67
raw_data = pd.read_csv(
'/data/Or_detection_data.csv')
data_one = raw_data.drop(columns=['time'])
a = data_one['avg_user_label'].tolist()
b = data_one['upward_throughput_label'].tolist()
c = data_one['downward_throughput_label'].tolist()
d = data_one['avg_activate_users_label'].tolist()
label_data = list(map(lambda x: (x[0] or x[1] or x[2] or x[3]), zip(a, b, c, d)))
label_data = np.array(label_data)
label_data = label_data[:,np.newaxis]
# # 给类别特征,one-hot编码
data_one = pd.get_dummies(data_one, prefix=['baseId', 'communityId'], columns=['base_id', 'community_id'])
# 对数据进行归一化处理, valeus.shape=(, 8),inversed_transform时也需要8列
scaler_X = MinMaxScaler(feature_range=(0, 1))
train = scaler_X.fit_transform(data_one.astype('float32'))
scaler_Y = MinMaxScaler(feature_range=(0, 1))
train_label = scaler_Y.fit_transform(label_data)
data_X, data_Y = get_batch(train[:-PRED_SIZE], train_label[PRED_SIZE:], TIME_STEPS)
testX, testY = data_X[-1000:, :, :], data_Y[-1000:,:,:]
# testX, testY = data_X, data_Y
# 为了在LSTM中应用该数据,需要将其格式转化为3D format,即[Samples, timesteps, features]
model = build_model(TIME_STEPS, INPUT_SIZE, OUTPUT_SIZE)
save_path = './weight/Q2_weight_all.tf'
checkpoint = tf.keras.callbacks.ModelCheckpoint(
save_path, save_format='tf', monitor='val_accuracy',verbose=0, save_best_only=True, save_weights_only=True)
k = data_X.shape[0] % BATCH_START
trainX, trainY = data_X[k:], data_Y[k:]
# model.load_weights(save_path)
history = model.fit(trainX, trainY, batch_size=BATCH_START, epochs=7,
validation_split=0.1,
callbacks=[checkpoint],
verbose=1)
plt.figure(1)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='valid')
plt.legend()
plt.show()
y_p = model.predict(testX)
y_t = testY
y_pre = np.array(y_p).reshape(-1, OUTPUT_SIZE)
y_true = np.array(y_t).reshape(-1, OUTPUT_SIZE)
score = f1_score(y_true, np.argmax(y_pre, axis=1), average='macro')
print('模型F1得分 ={} '.format(str(score)))
if __name__ == '__main__':
train_model()
3 问题三
3.1 算法过程描述
(1)KPI指标预测:将问题视为一个多变量KPI预测问题。模型输入和第二问一样,输出是未来数据的4个KPI指标(平均用户数、上行PDCP、上行PDCP、平均激活用户)。
(2)对于模型的拟合,一般情况下需要对异常值进行剔除,我尝试对比了剔除异常值,发现线下验证 拟合效果并不如不剔除异常值,分析原因是异常值也包含了一些时序信息。在论文里分析了,但最终的预测模型是没有剔除异常值的。剔除异常值的方式是将异常值视为缺失值处理,用线性插值的方式填充。
3.2 算法实现
(1)导入包
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import LSTM, TimeDistributed
from tensorflow.keras.models import Sequential
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
import matplotlib.pyplot as plt
from scipy import interpolate
import os
import openpyxl
from sklearn.metrics import mean_absolute_error, mean_squared_error
(2)封住训练集
def get_batch(train_x, train_y, TIME_STEPS):
data_len = len(train_x) - TIME_STEPS
seq = []
res = []
for i in range(data_len):
seq.append(train_x[i:i + TIME_STEPS])
res.append(train_y[i:i + TIME_STEPS]) # 取后5组数据
seq, res = np.array(seq), np.array(res)
return seq, res
(3)评价指标MAPE计算函数
def MAPE_score(true, pred):
diff = np.abs(np.array(true) - np.array(pred))
return np.mean(diff / true)
(4)构建LSTM模型
def build_model(TIME_STEPS, INPUT_SIZE, OUTPUT_SIZE):
model = Sequential()
model.add(LSTM(units=128, activation='relu',return_sequences=True, input_shape=(TIME_STEPS, INPUT_SIZE)))
model.add(Dropout(0.2))
model.add(LSTM(units=64, activation='relu', return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=32, activation='relu', return_sequences=True))
model.add(Dropout(0.2))
#全连接,输出, add output layer
model.add(TimeDistributed(Dense(OUTPUT_SIZE)))
model.summary()
model.compile(metrics=['mape'], loss=['mae'], optimizer='adam')
return model
(5)模型训练
def train_model(load_first=True,test_tag=False):
# 时间步长
TIME_STEPS = 58
# 预测大小,前3天数据预测预测后3天的数据,每次预测1个小时,,就是一个batch的数据是两个小时的。那需要从9.23 00:00开始预测
PRED_SIZE = 3*24*58
# 输出大小
OUTPUT_SIZE = 4
BATCH_START = 58*4
# 输入大小
INPUT_SIZE = 67
# 异常值处理
# clear_path = 'data/Q3_clear_data.csv'
# if os.path.exists(clear_path):
# data_all = pd.read_csv(
# 'data/Q3_clear_data.csv')
# else:
# raw_data = pd.read_csv('data/Or_detection_data.csv')
# # 为了后面打乱顺序后还能根据id重新排序回来
# raw_data['id'] = raw_data.index
# data_one = raw_data.drop(columns=['time'])
# data_one = data_one.copy()
# # 把异常值换成缺失值
# error_index = data_one[data_one.label == 1].index
# data_one.loc[error_index,'avg_user'] = np.nan
# data_one.loc[error_index,'upward_throughput'] = np.nan
# data_one.loc[error_index,'downward_throughput'] = np.nan
# data_one.loc[error_index,'avg_activate_users'] = np.nan
# # 以小区分组,数据以时间顺序排列
# all_df = []
# for id in raw_data['community_id'].value_counts().keys():
# com_train = raw_data.loc[raw_data['community_id'] == int(id)]
# com_train = com_train.drop(columns=['time','label'])
# all_df.append(com_train)
# inter_all_df = []
# # 线性插值
# for j in range(len(all_df)):
# com_one = all_df[j]
# for indexs in ['avg_user', 'upward_throughput', 'downward_throughput', 'avg_activate_users']:
# com_one[indexs] = com_one[indexs].interpolate()
# inter_all_df.append(com_one)
# inter_all_df = pd.concat(inter_all_df)
# inter_all_df.set_index(["id"], inplace=True)
# inter_all_df.sort_index(inplace=True)
# # 给列别特征,one-hot编码
# data_all = pd.get_dummies(inter_all_df, prefix=['baseId', 'communityId'], columns=['base_id', 'community_id'])
# data_all.to_csv('data/Q3_clear_data.csv', index=False)
# # 不异常值处理
raw_data = pd.read_csv('/data/detection_data.csv')
data_all = raw_data.drop(columns=['time', 'avg_user_label', 'upward_throughput_label',
'downward_throughput_label', 'avg_activate_users_label'])
data_all = pd.get_dummies(data_all, prefix=['baseId', 'communityId'], columns=[
'base_id', 'community_id'])
# 对数据进行归一化处理, valeus.shape=(, 8),inversed_transform时也需要8列
scaler_X = MinMaxScaler(feature_range=(0, 1))
train = scaler_X.fit_transform(data_all.values.astype('float32'))
scaler_Y = MinMaxScaler(feature_range=(0, 1))
label_data = data_all[['avg_user', 'upward_throughput','downward_throughput', 'avg_activate_users']]
train_label = scaler_Y.fit_transform(label_data.values.astype('float32'))
data_X, data_Y = get_batch(train[:-PRED_SIZE], train_label[PRED_SIZE:], TIME_STEPS)
testX, testY = data_X[-1:, :, :], data_Y[-1:, :, :]
# trainX, trainY = data_X[:-1,:, :], data_Y[:-1 :, :]
trainX, trainY = data_X, data_Y
# 为了在LSTM中应用该数据,需要将其格式转化为3D format,即[Samples, timesteps, features]
model = build_model(TIME_STEPS, INPUT_SIZE, OUTPUT_SIZE)
save_path = './weight/Q3_weight.tf'
checkpoint = tf.keras.callbacks.ModelCheckpoint(save_path, save_format='tf', monitor='val_mape',verbose=0, save_best_only=True, save_weights_only=True)
k = trainX.shape[0] % BATCH_START
trainX, trainY = trainX[k:], trainY[k:]
# model.load_weights(save_path)
history = model.fit(trainX, trainY, batch_size=BATCH_START, epochs=8,
validation_split=0.1,
callbacks=[checkpoint],
verbose=1)
plt.figure(1)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='valid')
plt.legend()
plt.show()
print()
y_out = model.predict(testX)
#预测数据逆缩放 invert scaling for forecast
y_p = []
y_t = []
for i in range(y_out.shape[0]):
y_pre = scaler_Y.inverse_transform(y_out[i, :, :])
y_p.append(y_pre)
#真实数据逆缩放 invert scaling for actual
y_true = scaler_Y.inverse_transform(testY[i, :, :])
y_t.append(y_true)
#画出真实数据和预测数据
y_pre = np.array(y_p).reshape(-1, OUTPUT_SIZE)
y_true = np.array(y_t).reshape(-1, OUTPUT_SIZE)
plt.figure(2)
for i, index in enumerate(label_data.columns):
ax1 = plt.subplot(4, 1, i+1)
mape_score = MAPE_score(y_true[:, i], y_pre[:, i])
print('{} 指标MAPE ={} '.format(str(index), mape_score))
mae_score = MAPE_score(y_true[:, i], y_pre[:, i])
print('{} 指标MAE ={} '.format(str(index), mae_score))
rmse_socre = np.sqrt(mean_squared_error(y_true[:, i], y_pre[:, i]))
print('{} 指标RMSE ={} '.format(str(index), rmse_socre))
plt.sca(ax1)
plt.plot(y_pre[:, i], label="Pre-"+str(index))
plt.plot(y_true[:, i], label="True-"+str(index))
plt.legend()
plt.show()
# 预测未来三天的数据
if test_tag ==True:
model.load_weights(save_path)
dataX,dataY = get_batch(train, train_label, TIME_STEPS)
test_data = dataX[-int(4176/TIME_STEPS):,:,:]
prediction_3day = model.predict(test_data)
pre_data = np.array([])
for i in range(prediction_3day.shape[0]):
tmp = scaler_Y.inverse_transform(prediction_3day[i, :, :])
if i ==0:
pre_data = tmp
else:
pre_data = np.vstack((pre_data,tmp))
print()
result_arr = pre_data
result_df = pd.read_excel('data/附件2:赛题A预测表格.xlsx', sheet_name='预测数据')
result_df['小区内的平均用户数'] = list(result_arr[:, 0])
result_df['小区PDCP层所接收到的上行数据的总吞吐量比特'] = list(result_arr[:,1])
result_df['小区PDCP层所发送的下行数据的总吞吐量比特'] = list(result_arr[:, 2])
result_df['平均激活用户数'] = list(result_arr[:,3])
result_df.to_excel('/data/prediction_table.xlsx',sheet_name='预测数据',index =False)
if __name__ == '__main__':
train_model(True,True)
选取部分数据,可视化模型拟合效果。