针对阿里云 PAI Designer 中心脏病数据分类预测的结果,进行二分类评估的计算:
导出的数据如下(各列以空白字符分隔,下文脚本将其保存为 heart.txt 后读取):
ifhealth prediction_result prediction_score 1.0 1.0 0.9966667942879242 0.0 1.0 0.9298914789557708 1.0 1.0 0.9846901616254438 0.0 1.0 0.9462712719594703 0.0 1.0 0.9444635187099122 0.0 1.0 0.9516372265103414 1.0 1.0 0.9966406721371299 0.0 1.0 0.6678175380727635 0.0 1.0 0.6415438396153912 0.0 1.0 0.9059790146200846 1.0 1.0 0.992921727376061 1.0 1.0 0.831870956859964 0.0 1.0 0.9612779235423258 1.0 1.0 0.9919396368097083 1.0 1.0 0.9916273674960602 0.0 1.0 0.8404149037218922 1.0 1.0 0.9875787797664172 1.0 1.0 0.9586744481583241 0.0 1.0 0.9623458867798506 1.0 1.0 0.9471595008676558 1.0 1.0 0.9945957178394087 0.0 1.0 0.9502650259905758 0.0 1.0 0.829452404239284 0.0 1.0 0.8863192201235089 0.0 1.0 0.885830873115397 0.0 1.0 0.6679729510448718 1.0 1.0 0.9878052271863805 0.0 1.0 0.6517749542042871 1.0 1.0 0.9942310516033694 0.0 1.0 0.9297614945042411 1.0 1.0 0.9639740171846837 1.0 1.0 0.9642707178878092 1.0 1.0 0.9899490946008963 1.0 1.0 0.9861960222087047 0.0 1.0 0.9548375606731054 1.0 1.0 0.9929732583582984 0.0 1.0 0.9306337430080971 0.0 1.0 0.6459098346647069 1.0 1.0 0.9848265619605089 1.0 1.0 0.9868557456671756 0.0 1.0 0.9365817882786825 0.0 1.0 0.9583570621152451 0.0 1.0 0.9055784384931642 1.0 1.0 0.8341981653896565 1.0 1.0 0.9943537054234038 0.0 1.0 0.929993106221854 1.0 1.0 0.990061774909401 0.0 1.0 0.9406087643634486 1.0 1.0 0.9863010724247677 1.0 1.0 0.9866151024497868 0.0 1.0 0.8871321713486561 1.0 1.0 0.891899893079265 1.0 1.0 0.9869135461782707 1.0 1.0 0.9925060675725533 0.0 1.0 0.9712077659325221 1.0 1.0 0.9944705734365958 1.0 1.0 0.9897647862138454 1.0 1.0 0.9971447803076375 1.0 1.0 0.9390138515935567 0.0 1.0 0.9190301493867434 1.0 1.0 0.8187682928512124 0.0 1.0 0.7681060440020924 0.0 1.0 0.8519305777364843 0.0 1.0 0.9463671203558736 1.0 1.0 0.9974830335831082 1.0 1.0 0.9967930548593038 0.0 1.0 0.8952222127949186 1.0 1.0 0.9008860294438562 0.0 1.0 0.9278817825648754 0.0 1.0 0.8406965507796301 1.0 1.0 0.9583989162777902 0.0 1.0 0.6953171588927556 0.0 1.0 
0.6840915088425724 1.0 1.0 0.9638156155320655 0.0 1.0 0.9810855135230669 0.0 1.0 0.9304614492636984 1.0 1.0 0.9452592385480718 1.0 1.0 0.8839773155466862 0.0 1.0 0.6442015155675309 1.0 1.0 0.9737438185574755 0.0 1.0 0.9749484485852054 0.0 1.0 0.6998457106808499 1.0 1.0 0.9879689216245541 0.0 1.0 0.8959977386829386 1.0 1.0 0.9911917394813492 0.0 1.0 0.9452406795933016 1.0 1.0 0.9579363819236905 1.0 1.0 0.9736869661543863 0.0 1.0 0.838161043063212 0.0 1.0 0.7714120341361188 1.0 1.0 0.9926407905668786 0.0 1.0 0.8959422911760214 0.0 1.0 0.6794290160203149 0.0 1.0 0.976823289773342 0.0 0.0 0.5634924243677479 0.0 1.0 0.6668211003107292 0.0 1.0 0.9785027127865454
使用 Python 计算 F1 Score
对于连续取值的预测分数来说,最关键的是找到一个最佳阈值,把分数划分为 0/1 标签。先按阈值把分数转换为预测标签(分数 ≥ 阈值记为 1,否则记为 0),再据此计算 F1 Score。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Binary-classification evaluation script.

Sweeps a decision threshold over ``prediction_score`` (1000 equal-width
buckets by default) and computes the F1 score, precision, recall, accuracy
and confusion matrix at every threshold using scikit-learn.
"""

import warnings

import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

warnings.filterwarnings('ignore')


def load_data(filepath='heart.txt'):
    """Read the whitespace-separated data file into a DataFrame.

    Args:
        filepath: Path of the text file; the first row is used as the header.

    Returns:
        A pandas DataFrame with one column per whitespace-separated field.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    return pd.read_csv(filepath, sep=r'\s+')


def evaluate_f1_buckets(y_true, scores, num_buckets=1000):
    """Evaluate classification metrics at evenly spaced score thresholds.

    The interval [0, 1] is cut into ``num_buckets`` equal-width buckets, which
    gives ``num_buckets + 1`` thresholds (both end points included). For each
    threshold a sample is predicted positive when ``score >= threshold``.

    Args:
        y_true: Ground-truth labels (0 / 1); any array-like.
        scores: Continuous prediction scores in [0, 1]; any array-like.
        num_buckets: Number of buckets, 1000 by default.

    Returns:
        A 3-tuple ``(results_df, best_result, best_threshold)``:
        results_df: DataFrame with one row per threshold and the columns
            threshold, f1, precision, recall, accuracy, tp, fp, tn, fn.
        best_result: The row of ``results_df`` with the highest F1 score
            (the lowest threshold wins when several thresholds tie).
        best_threshold: The threshold of ``best_result``.
    """
    # Accept lists / Series as well as ndarrays.
    y_true = np.asarray(y_true)
    scores = np.asarray(scores, dtype=float)

    thresholds = np.linspace(0, 1, num_buckets + 1)

    results = []
    for threshold in thresholds:
        y_pred = (scores >= threshold).astype(int)

        # labels=[0, 1] forces a 2x2 matrix even when every prediction falls
        # into a single class, so the counts stay correct at the extreme
        # thresholds instead of being reported as all zeros.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

        results.append({
            'threshold': threshold,
            'f1': f1_score(y_true, y_pred, zero_division=0),
            'precision': precision_score(y_true, y_pred, zero_division=0),
            'recall': recall_score(y_true, y_pred, zero_division=0),
            'accuracy': accuracy_score(y_true, y_pred),
            'tp': tp,
            'fp': fp,
            'tn': tn,
            'fn': fn,
        })

    results_df = pd.DataFrame(results)
    # idxmax returns the first maximum, i.e. the lowest best threshold.
    best_result = results_df.loc[results_df['f1'].idxmax()]
    best_threshold = best_result['threshold']

    return results_df, best_result, best_threshold


def print_results(results_df, best_result, best_threshold, y_true):
    """Print a human-readable evaluation report to stdout.

    Args:
        results_df: Per-threshold metrics table from ``evaluate_f1_buckets``.
        best_result: Row of ``results_df`` with the highest F1 score.
        best_threshold: Threshold belonging to ``best_result``.
        y_true: Ground-truth labels, used for the sample counts.
    """
    y_true = np.asarray(y_true)
    # The table holds num_buckets + 1 thresholds (both end points included).
    num_buckets = len(results_df) - 1

    print('=' * 60)
    print(f'二分类评估: 将 prediction_score 分成 {num_buckets} 个桶计算 F1 Score')
    print('=' * 60)
    print(f'样本总数: {len(y_true)}')
    print(f'正样本 (ifhealth=1): {int((y_true == 1).sum())}')
    print(f'负样本 (ifhealth=0): {int((y_true == 0).sum())}')
    print()

    # Show a sample of the table: every 100th threshold.
    print('部分阈值结果展示(每100个桶):')
    print('-' * 80)
    header = f"{'阈值':>10} | {'F1 Score':>10} | {'精确率':>10} | {'召回率':>10} | {'准确率':>10}"
    print(header)
    print('-' * 80)
    for i in range(0, len(results_df), 100):
        r = results_df.iloc[i]
        print(f"{r['threshold']:>10.4f} | {r['f1']:>10.4f} | {r['precision']:>10.4f} | {r['recall']:>10.4f} | {r['accuracy']:>10.4f}")

    print()
    print('=' * 60)
    print('最佳 F1 Score 结果:')
    print('=' * 60)
    print(f"最佳阈值: {best_threshold:.4f}")
    print(f"最佳 F1 Score: {best_result['f1']:.4f}")
    print(f"对应精确率 (Precision): {best_result['precision']:.4f}")
    print(f"对应召回率 (Recall): {best_result['recall']:.4f}")
    print(f"对应准确率 (Accuracy): {best_result['accuracy']:.4f}")
    print()
    print('混淆矩阵:')
    print(' 预测')
    print(' 0 1')
    print(f"实际 0 {int(best_result['tn']):>4} {int(best_result['fp']):>4} (负样本)")
    print(f" 1 {int(best_result['fn']):>4} {int(best_result['tp']):>4} (正样本)")
    print()

    # Summary statistics of the F1 score over all thresholds.
    f1_values = results_df['f1']
    print('=' * 60)
    print('F1 Score 统计信息:')
    print('=' * 60)
    print(f"F1 Score 最小值: {f1_values.min():.4f}")
    print(f"F1 Score 最大值: {f1_values.max():.4f}")
    print(f"F1 Score 平均值: {f1_values.mean():.4f}")
    print(f"F1 Score 中位数: {f1_values.median():.4f}")
    print(f"F1 Score > 0.8 的阈值数量: {int((f1_values > 0.8).sum())}")


def main(filepath='heart.txt'):
    """Load the data file, run the threshold sweep and print the report.

    Args:
        filepath: Path of the whitespace-separated data file; it must contain
            the columns ``ifhealth`` and ``prediction_score``.
    """
    df = load_data(filepath)

    y_true = df['ifhealth'].values
    # NOTE(review): prediction_score is used here as the positive-class score.
    # The sample data contains a row with prediction_result 0 and a score
    # above 0.5, which suggests the score may be the confidence of the
    # predicted label rather than P(label=1) - confirm against the PAI docs.
    scores = df['prediction_score'].values

    results_df, best_result, best_threshold = evaluate_f1_buckets(y_true, scores, num_buckets=1000)

    print_results(results_df, best_result, best_threshold, y_true)

    # Optional: save the per-threshold table to CSV.
    # results_df.to_csv('f1_evaluation_results.csv', index=False)
    # print("\n结果已保存到 f1_evaluation_results.csv")


if __name__ == '__main__':
    main()
脚本运行的结果输出:
数据概况

- 样本总数: 97
- 正样本 (ifhealth=1): 46
- 负样本 (ifhealth=0): 51

评估结果(1000 个桶)

最佳 F1 Score

| 指标 | 数值 |
| --- | --- |
| 最佳阈值 | 0.9550 |
| 最佳 F1 Score | 0.8132 |
| 精确率 (Precision) | 0.8222 |
| 召回率 (Recall) | 0.8043 |
| 准确率 (Accuracy) | 0.8247 |

混淆矩阵

```plaintext
            预测
            0     1
实际  0     43     8   (负样本)
      1      9    37   (正样本)
```

F1 Score 统计

- 最小值: 0.0000
- 最大值: 0.8132
- 平均值: 0.6639
- 中位数: 0.6434
- F1 > 0.8 的阈值数量: 6 个

说明

- 将 prediction_score (0-1 连续值) 分成 1000 个等宽桶(阈值从 0 到 1,步长 0.001,含两个端点共 1001 个阈值)
- 对每个阈值计算预测标签(score ≥ 阈值 则为正类)
- 使用 sklearn 的 f1_score 计算每个阈值下的 F1 Score
- 最佳 F1 Score 为 0.8132,对应阈值为 0.9550