在做时序类数据的特征分析时,日历热力图(Calendar Heatmap)是一种较为直观的数据可视化方法。本文对一些可用来绘制日历热力图的开源工具进行分析比较。
-
Matplotlib
-
Seaborn
-
Pyecharts
-
Calmap
-
Calplot
-
July
-
Plotly
Matplotlib方法
Matplotlib是python绘图最基础的包,安装方法:
pip install matplotlib
直接使用Matplotlib绘制日历热力图较为繁琐:需要先把数据整理成“周 × 星期”的矩阵,再用 imshow 绘制,并手动添加日期和月份标签,可以参考的代码如下:
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np
# 数据生成函数
def generate_data():
    """Build a demo data set of 100 consecutive days with random values.

    Returns:
        tuple: ``(dates, data)`` where ``dates`` is a list of 100
        ``datetime`` objects starting at 2015-03-13 and ``data`` is a NumPy
        array of 100 random integers drawn from ``[0, 20)``.
    """
    num = 100
    data = np.random.randint(0, 20, num)
    start = dt.datetime(2015, 3, 13)
    dates = [start + dt.timedelta(days=i) for i in range(num)]
    return dates, data
# 数据封装函数
def calendar_array(dates, data):
    """Arrange daily values into a (weeks x 7) calendar matrix.

    Args:
        dates: Sequence of ``date``/``datetime`` objects, one per value.
        data: Sequence of values, parallel to ``dates``.

    Returns:
        tuple: ``(i, j, calendar)`` where ``i`` is the row (week) index of
        every date counted from the earliest week, ``j`` is the column index
        (0 = Monday ... 6 = Sunday) and ``calendar`` is a float array of
        shape ``(n_weeks, 7)`` holding ``data`` at ``[i, j]`` and NaN
        everywhere else.

    Raises:
        ValueError: If ``dates`` is empty.
    """
    # Ordinal of the Monday that opens each date's week. Using ordinals
    # instead of the ISO week number keeps the rows monotonic when the
    # dates cross a year boundary (ISO weeks wrap from 52/53 back to 1).
    mondays = np.array([d.toordinal() - d.weekday() for d in dates])
    i = (mondays - mondays.min()) // 7
    j = np.array([d.weekday() for d in dates])
    ni = i.max() + 1
    calendar = np.full((ni, 7), np.nan)
    calendar[i, j] = data
    return i, j, calendar
# 横轴标签(星期)函数
def label_days(ax, dates, i, j, calendar):
    """Write the day-of-month number into every populated calendar cell.

    Also labels the x axis with one-letter weekday names and moves those
    labels to the top of the axes.

    Args:
        ax: Matplotlib axes the calendar image was drawn on.
        dates: Sequence of ``date``/``datetime`` objects.
        i: Row index of each date in ``calendar``.
        j: Column index of each date in ``calendar``.
        calendar: 2-D array whose first dimension gives the number of rows.
    """
    ni = calendar.shape[0]
    day_of_month = np.nan * np.zeros((ni, 7))
    day_of_month[i, j] = [d.day for d in dates]

    # Separate loop names so the index arrays passed in as ``i``/``j`` are
    # not shadowed while iterating over the cells.
    for (row, col), day in np.ndenumerate(day_of_month):
        if np.isfinite(day):
            ax.text(col, row, int(day), ha='center', va='center')

    ax.set(xticks=np.arange(7),
           xticklabels=['M', 'T', 'W', 'R', 'F', 'S', 'S'])
    ax.xaxis.tick_top()
# 纵轴标签(月份)函数
def label_months(ax, dates, i, j, calendar):
    """Label the y axis with abbreviated month names.

    Each label is placed at the mean row index of the dates that fall in
    that month and is rotated by 90 degrees.

    Args:
        ax: Matplotlib axes the calendar image was drawn on.
        dates: Sequence of ``date``/``datetime`` objects.
        i: NumPy array with the row index of each date.
        j: Column index of each date (unused here).
        calendar: Calendar matrix (unused here).
    """
    month_labels = np.array(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul',
                             'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
    months = np.array([d.month for d in dates])
    uniq_months = sorted(set(months))
    # One tick per month, centred on the rows that month occupies.
    yticks = [i[months == m].mean() for m in uniq_months]
    labels = [month_labels[m - 1] for m in uniq_months]
    ax.set(yticks=yticks)
    ax.set_yticklabels(labels, rotation=90)
# 传入日历数据和日期,输出日历图像
def calendar_heatmap(ax, dates, data):
    """Draw a calendar heatmap of ``data`` on ``ax``.

    Builds the calendar matrix, renders it as an image with the ``summer``
    colormap, adds day and month labels and attaches a colorbar to the
    axes' figure.

    Args:
        ax: Matplotlib axes to draw on.
        dates: Sequence of ``date``/``datetime`` objects, one per value.
        data: Sequence of values, parallel to ``dates``.
    """
    i, j, calendar = calendar_array(dates, data)
    im = ax.imshow(calendar, interpolation='none', cmap='summer')
    label_days(ax, dates, i, j, calendar)
    label_months(ax, dates, i, j, calendar)
    ax.figure.colorbar(im)
if __name__ == '__main__':
    # Generate the demo data, draw the heatmap on a tall figure and show it.
    dates, data = generate_data()
    fig, ax = plt.subplots(figsize=(6, 10))
    calendar_heatmap(ax, dates, data)
    plt.show()
生成图形如下:
另外一种方法是为每个月份生成一个独立的子图:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
# Settings
# Years to render, one figure per year (e.g. [2018, 2019, 2020]).
years = [2018]
# Week rows available in each month grid.
weeks = [1, 2, 3, 4, 5, 6]
# Column labels, Monday first.
days = ['M', 'T', 'W', 'T', 'F', 'S', 'S']
# Subplot titles, in calendar order.
month_names = [
    'January', 'February', 'March', 'April',
    'May', 'June', 'July', 'August',
    'September', 'October', 'November', 'December',
]
# 数据生成函数
def generate_data():
    """Build a demo series with one value per day of 2018.

    Returns:
        pandas.Series: 365 integers ``0..364`` indexed by consecutive daily
        timestamps starting at 2018-01-01.
    """
    idx = pd.date_range('2018-01-01', periods=365, freq='D')
    return pd.Series(range(len(idx)), index=idx)
# 数据切分函数
def split_months(df, year):
    """Slice a daily series by year and lay each month out as a calendar grid.

    Args:
        df: pandas Series indexed by daily timestamps (a ``DatetimeIndex``).
        year: Calendar year to extract.

    Returns:
        tuple: ``(day_nums, day_vals)``; both are dicts keyed by month number
        (1-12) whose values are ``(6, 7)`` float arrays (rows are weeks of the
        month, columns are Monday..Sunday). ``day_nums`` holds the day of the
        month, ``day_vals`` holds the series value; cells with no date are NaN.
    """
    df = df[df.index.year == year]

    # Empty (all-NaN) month grids: 6 week rows x 7 weekday columns.
    empty = np.full((6, 7), np.nan)
    day_nums = {m: empty.copy() for m in range(1, 13)}  # day numbers
    day_vals = {m: empty.copy() for m in range(1, 13)}  # heatmap values

    # ``Series.items`` replaces ``iteritems``, which was removed in pandas 2.0.
    for stamp, value in df.items():
        col = stamp.dayofweek
        # Week row inside the month, derived from the weekday of the 1st.
        # Computing it per date (rather than carrying a counter that is reset
        # on the 1st) also works when the data starts in the middle of a month.
        first_weekday = stamp.replace(day=1).dayofweek
        row = (stamp.day - 1 + first_weekday) // 7
        day_nums[stamp.month][row, col] = stamp.day
        day_vals[stamp.month][row, col] = value

    return day_nums, day_vals
# 绘图函数
def create_year_calendar(day_nums, day_vals):
    """Draw a 3 x 4 grid of month heatmaps and save it as a PDF.

    Reads the module-level ``month_names``, ``days`` and ``weeks`` settings.
    The figure is written to ``calendar_example.pdf``.

    Args:
        day_nums: Dict keyed by month (1-12) of ``(6, 7)`` arrays holding the
            day of the month, NaN where the cell is not a calendar day.
        day_vals: Dict keyed by month (1-12) of ``(6, 7)`` arrays holding the
            value to colour each day with.
    """
    fig, ax = plt.subplots(3, 4, figsize=(14.85, 10.5))

    for i, axs in enumerate(ax.flat):
        month = i + 1

        axs.imshow(day_vals[month], cmap='viridis', vmin=1, vmax=365)  # heatmap
        axs.set_title(month_names[i])

        # Labels
        axs.set_xticks(np.arange(len(days)))
        axs.set_xticklabels(days, fontsize=10, fontweight='bold', color='#555555')
        axs.set_yticklabels([])

        # Tick marks
        axs.tick_params(axis=u'both', which=u'both', length=0)  # remove tick marks
        axs.xaxis.tick_top()

        # Minor ticks sit on the cell borders so the grid separates the cells
        axs.set_xticks(np.arange(-.5, 6, 1), minor=True)
        axs.set_yticks(np.arange(-.5, 5, 1), minor=True)
        axs.grid(which='minor', color='w', linestyle='-', linewidth=2.1)

        # Despine
        for edge in ['left', 'right', 'bottom', 'top']:
            axs.spines[edge].set_color('#FFFFFF')

        # Annotate
        for w in range(len(weeks)):
            for d in range(len(days)):
                day_val = day_vals[month][w, d]
                day_num = day_nums[month][w, d]

                # Cells outside the month carry no date; skip them so no
                # "nan" label is drawn on the empty part of the grid.
                if np.isnan(day_num):
                    continue

                # Value label
                if not np.isnan(day_val):
                    axs.text(d, w+0.3, f"{day_val:0.0f}",
                             ha="center", va="center",
                             fontsize=7, color="w", alpha=0.8)

                # If value is 0, draw a grey patch
                if day_val == 0:
                    patch_coords = ((d - 0.5, w - 0.5),
                                    (d - 0.5, w + 0.5),
                                    (d + 0.5, w + 0.5),
                                    (d + 0.5, w - 0.5))
                    square = Polygon(patch_coords, fc='#DDDDDD')
                    axs.add_artist(square)

                # Day-of-month number in the top-right corner of the cell
                axs.text(d+0.45, w-0.31, f"{day_num:0.0f}",
                         ha="right", va="center",
                         fontsize=6, color="#003333", alpha=0.8)  # day

                # Aesthetic background for calendar day number
                patch_coords = ((d-0.1, w-0.5),
                                (d+0.5, w-0.5),
                                (d+0.5, w+0.1))
                triangle = Polygon(patch_coords, fc='w', alpha=0.7)
                axs.add_artist(triangle)

    # Final adjustments
    fig.suptitle('Calendar', fontsize=16)
    plt.subplots_adjust(left=0.04, right=0.96, top=0.88, bottom=0.04)

    # Save to file
    plt.savefig('calendar_example.pdf')
# The demo series does not depend on the year, so build it once.
df = generate_data()
for year in years:
    day_nums, day_vals = split_months(df, year)
    create_year_calendar(day_nums, day_vals)
# Display every figure created above.
plt.show()
生成的图形如下:
使用Matplotlib的优势是比较灵活,可自定义的地方较多,但是代码相对繁琐。
Seaborn方法
Seaborn是较为常用的Python可视化包,安装如下:
pip install seaborn
绘制日历热力图主要依赖的是seaborn.heatmap方法,相关的说明文档见:https://seaborn.pydata.org/generated/seaborn.heatmap.html
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Sample points: X is the row key, Y the column key and Z the cell value.
X = [1, 2, 3, 7, 4, 1, 5, 9, 0, 8]
Y = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Z = [0.2, 0.33, 0.1, 0.25, 0.0, 0.9, 0.75, 0.88, 0.44, 0.95]
data = pd.DataFrame({'X': X, 'Y': Y, 'Z': Z})
# Reshape to an X-by-Y table of Z values. The arguments are passed by keyword
# because DataFrame.pivot no longer accepts them positionally (pandas >= 2.0).
data_pivoted = data.pivot(index="X", columns="Y", values="Z")
# Draw the pivoted table as a heatmap; seaborn returns the axes it drew on.
ax = sns.heatmap(data_pivoted)
# Display the figure.
plt.show()
对比可见Seaborn比Matplotlib更加友好易用,但是上图还不是真正的日历热力图,需要进一步优化。
以下为随机生成三年(2020—2022 年)的每日步数数据,然后按月汇总并绘制热力图:
import random
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Build a table of random daily step counts covering three years (2020-2022).
step_data = pd.DataFrame(columns=["Time", "Year", "Month", "Steps"])
step_data["Time"] = pd.date_range('2020-01-01', '2022-12-31')
step_data["Year"] = step_data["Time"].dt.year.astype(int)
step_data["Month"] = step_data["Time"].dt.month.astype(int)
step_data["Steps"] = [random.randint(100, 20000) for _ in range(len(step_data))]
# Reshape into a Year x Month table of monthly totals (pivot table).
# Only "Steps" is summed: summing the whole frame would also try to add up the
# datetime "Time" column, which raises a TypeError on pandas >= 2.0.
year_step_data = (
    step_data.groupby(["Year", "Month"])["Steps"]
    .sum()
    .reset_index()
    .pivot(index='Year', columns='Month', values='Steps')
    .fillna(0)
    .astype(int)
)
# Plot the Year x Month table with seaborn on a wide figure.
f, ax = plt.subplots(figsize=(20, 5))
cmap = sns.color_palette("Blues")
# Square, annotated cells separated by wide gaps; values shown as integers.
monthly_sessions = sns.heatmap(
    year_step_data,
    ax=ax,
    cmap=cmap,
    annot=True,
    fmt="d",
    linewidths=5,
    square=True,
)
# Title and axis captions.
ax.axes.set_title("Steps by month and year", fontsize=20)
ax.set_xlabel("Month", fontsize=15)
ax.set_ylabel("Year", fontsize=15)
plt.show()
结果如图:
Pyecharts方法
Pyecharts 是基于百度开源的可视化库 ECharts 封装的 Python 数据可视化工具包,安装方式:
pip install pyecharts
绘制直接使用 calendar 工具:
import random
import datetime
import pyecharts.options as opts
from pyecharts.charts import Calendar
# First and last day (both inclusive) of the period to fill with data.
begin = datetime.date(2022, 1, 1)
end = datetime.date(2022, 12, 31)
# One [ISO date string, random step count] pair for every day in the period.
data = [
    [
        (begin + datetime.timedelta(days=offset)).isoformat(),
        random.randint(1000, 25000),
    ]
    for offset in range((end - begin).days + 1)
]
# Build the chart step by step instead of through one chained expression.
calendar_chart = Calendar()
# Calendar layout: margins around the grid, restricted to the year 2022,
# with the year label hidden.
calendar_chart.add(
    series_name="",
    yaxis_data=data,
    calendar_opts=opts.CalendarOpts(
        pos_top="120",
        pos_left="30",
        pos_right="30",
        range_="2022",
        yearlabel_opts=opts.CalendarYearLabelOpts(is_show=False),
    ),
)
# Centered title plus a continuous, horizontal colour scale.
calendar_chart.set_global_opts(
    title_opts=opts.TitleOpts(pos_top="30", pos_left="center", title="2022年步数情况"),
    visualmap_opts=opts.VisualMapOpts(
        max_=20000, min_=500, orient="horizontal", is_piecewise=False
    ),
)
# Write the interactive chart to an HTML file.
calendar_chart.render("日历热力图.html")
便可以生成可交互式的网页:
Calmap方法
Calmap的安装方式如下:
pip install calmap
对数据进行预处理,这里使用随机生成的数据:
import numpy as np
import pandas as pd
import calmap
import matplotlib.pyplot as plt
# 700 consecutive calendar days starting on 2014-01-15.
all_days = pd.date_range('1/15/2014', periods=700, freq='D')
# Draw 500 days at random; np.random.choice samples with replacement by
# default, so the same day can occur more than once.
days = np.random.choice(all_days, size=500)
# One standard-normal value per sampled day, indexed by that day.
events = pd.Series(data=np.random.randn(len(days)), index=days)
calmap 的绘图主要有两种方法:
-
yearplot :绘制一年的日历热力图;
-
calendarplot :绘制所有年份的日历热力图。
使用 yearplot 方法:
import numpy as np
import pandas as pd
import calmap
import matplotlib.pyplot as plt
# 700 consecutive calendar days starting on 2014-01-15.
all_days = pd.date_range('1/15/2014', periods=700, freq='D')
# 500 randomly drawn days (np.random.choice samples with replacement by default).
days = np.random.choice(all_days, size=500)
# One standard-normal value per sampled day, indexed by that day.
events = pd.Series(data=np.random.randn(len(days)), index=days)
# Plot only the events that fall in 2015.
calmap.yearplot(data=events, year=2015)
plt.show()
生成图形如下:
使用 calendarplot 方法:
import numpy as np
import pandas as pd
import calmap
import matplotlib.pyplot as plt
# 700 consecutive calendar days starting on 2014-01-15.
all_days = pd.date_range('1/15/2014', periods=700, freq='D')
# 500 randomly drawn days (np.random.choice samples with replacement by default).
days = np.random.choice(all_days, size=500)
# One standard-normal value per sampled day, indexed by that day.
events = pd.Series(data=np.random.randn(len(days)), index=days)
# One calendar row per year found in the data.
calmap.calendarplot(
    events,
    monthticks=3,
    daylabels='MTWTFSS',
    dayticks=[0, 2, 4, 6],
    cmap='YlGn',
    fillcolor='grey',
    linewidth=0,
    fig_kws=dict(figsize=(8, 4)),
)
plt.show()
生成图形如下:
注意:calmap已经放弃维护了,在使用过程中可能会存在问题。
更多文档内容参见:https://pythonhosted.org/calmap/
Calplot方法
Calplot的安装方式如下:
pip install calplot
对要绘制的数据进行预处理,这里使用Kaggle的一个竞赛数据,该数据集为包含订单编号、订单日期、商品类型、价格、销售数量等字段的销售数据,下载地址为:https://www.kaggle.com/datasets/kyanyoga/sample-sales-data?select=sales_data_sample.csv
需要将订单日期转换为 datetime 格式再进行处理:
import pandas as pd
# Load the sales data; the CSV is expected in the working directory.
# NOTE(review): no encoding is given, so pandas decodes the file as UTF-8 --
# confirm the downloaded CSV really is UTF-8 encoded.
df = pd.read_csv('sales_data_sample.csv')
# Parse the order-date strings into datetimes.
df['ORDERDATE'] = pd.to_datetime(df['ORDERDATE'])
# Index the rows by order date.
df = df.set_index('ORDERDATE')
Calplot的绘图方法简单易用:
import calplot
import matplotlib
import matplotlib.pylab as plt
# Sum the sales of each day and draw one calendar row per year.
pl1 = calplot.calplot(
    data=df['SALES'],
    how='sum',
    cmap='Reds',
    figsize=(16, 8),
    suptitle="Total Sales by Month and Year",
)
plt.show()
如以上代码中使用calplot包中的 calplot方法生成图形,相应的参数说明:
-
data :想要绘制的数据,本例使用的是销售金额。
-
how :按天聚合数据的方式,取值为 pandas 支持的聚合方法名(如 sum、mean、count 等),这里使用 sum 来计算每日的总销售额。
-
cmap :指的是绘制所采用的配色方案,matplotlib内置了多种配色方案,可以参考: https://matplotlib.org/stable/tutorials/colors/colormaps.html
-
figsize :图像尺寸。
-
suptitle :图像标题。
绘制图形如下:
上图按照年份进行分割,纵轴为星期,横轴为月份。每个小方块的颜色越深表明销售额度越高,越浅表明销售额度越低。从上图可以很直观地看出11月的销售额最高。
如果按照订单数量进行统计,需要按天分组统计每日的订单数量,相关完整代码如下:
import pandas as pd
import calplot
import matplotlib.pylab as plt
# Load the sales data; the CSV is expected in the working directory.
df = pd.read_csv('sales_data_sample.csv', encoding='utf-8')
# Count the ORDERNUMBER entries recorded for each order date.
counts = df.groupby('ORDERDATE')['ORDERNUMBER'].count().reset_index()
# Parse the order-date strings into datetimes.
counts['ORDERDATE'] = pd.to_datetime(counts['ORDERDATE'])
# Index the counts by order date.
counts = counts.set_index('ORDERDATE')
# Draw the calendar and print each day's count inside its cell.
pl2 = calplot.calplot(
    counts['ORDERNUMBER'],
    cmap='GnBu',
    textformat='{:.0f}',
    figsize=(16, 8),
    suptitle="Total Orders by Month and Year",
)
plt.show()
绘制图形如下:
这里通过 textformat 参数展示结果数值,还有更多的其他参数可以参考:
-
Calplot文档: https://calplot.readthedocs.io/en/latest/
-
Calplot的Pypi主页: https://pypi.org/project/calplot/
July方法
使用July绘制日历热力图的方式类似于calplot,最大的差异是结果按照月而不是年的方式展现。
July的安装方式如下:
pip install july
July中主要有两种绘制图形的方法,可以基于我们的需求进行选择:
-
heatmap :绘制一整年的紧凑图形;
-
calendar_plot :将各个月份的情况分开展示。
使用 heatmap 方法:
import pandas as pd
import july
from july.utils import date_range
import matplotlib.pyplot as plt
# Load the sales data (one row per order line, not one row per day).
df = pd.read_csv('sales_data_sample.csv', encoding='utf-8')
# Calendar days to plot: every day of 2004.
dates = date_range("2004-01-01", "2004-12-31")
# july pairs `dates` and `data` element by element (see the july README), so
# the raw per-order SALES column cannot be passed directly. Total the sales
# per calendar day and align them with the 2004 calendar; days without orders
# become 0.
order_days = pd.to_datetime(df['ORDERDATE']).dt.normalize()
daily_sales = (
    df.groupby(order_days)['SALES']
    .sum()
    .reindex(pd.date_range("2004-01-01", "2004-12-31"), fill_value=0)
)
# Draw the compact one-year heatmap.
july.heatmap(dates, data=daily_sales.to_numpy(), title='Total Sales', cmap="golden", month_grid=True, horizontal=True)
plt.show()
生成图形如下:
使用 calendar_plot 方法:
import pandas as pd
import july
from july.utils import date_range
import matplotlib.pyplot as plt
# Load the sales data (one row per order line, not one row per day).
df = pd.read_csv('sales_data_sample.csv', encoding='utf-8')
# Calendar days to plot: every day of 2004.
dates = date_range("2004-01-01", "2004-12-31")
# july pairs `dates` and `data` element by element (see the july README), so
# the raw per-order SALES column cannot be passed directly. Total the sales
# per calendar day and align them with the 2004 calendar; days without orders
# become 0.
order_days = pd.to_datetime(df['ORDERDATE']).dt.normalize()
daily_sales = (
    df.groupby(order_days)['SALES']
    .sum()
    .reindex(pd.date_range("2004-01-01", "2004-12-31"), fill_value=0)
)
# Draw one small calendar per month.
july.calendar_plot(dates, daily_sales.to_numpy(), cmap='copper')
plt.show()
生成图形如下:
如果想要查看具体某个月的详细情况,还可以使用 month_plot 方法:
import pandas as pd
import july
from july.utils import date_range
import matplotlib.pyplot as plt
# Load the sales data (one row per order line, not one row per day).
df = pd.read_csv('sales_data_sample.csv', encoding='utf-8')
# Calendar days to plot: every day of 2004.
dates = date_range("2004-01-01", "2004-12-31")
# july pairs `dates` and `data` element by element (see the july README), so
# the raw per-order SALES column cannot be passed directly. Total the sales
# per calendar day and align them with the 2004 calendar; days without orders
# become 0.
order_days = pd.to_datetime(df['ORDERDATE']).dt.normalize()
daily_sales = (
    df.groupby(order_days)['SALES']
    .sum()
    .reindex(pd.date_range("2004-01-01", "2004-12-31"), fill_value=0)
)
# Draw November only, printing each day's value in its cell.
july.month_plot(dates, daily_sales.to_numpy(), month=11, value_label=True)
plt.show()
如下展示11月的详细情况:
更多内容请查看项目地址:https://github.com/e-hulten/july
Plotly方法
Plotly 是一个基于 plotly.js 的python可视化库,其绘制的图形有较好的交互式效果。
Plotly的安装方法:
pip install plotly
绘制日历热力图还需要用到另一个包 plotly-calplot ,安装方式:
pip install plotly-calplot
相关绘制代码如下:
import pandas as pd
import numpy as np
from plotly_calplot import calplot
# Build sample data: one row per day between the two dates (both inclusive).
dummy_start_date = "2021-01-01"
dummy_end_date = "2023-02-03"
dummy_df = pd.DataFrame({"ds": pd.date_range(dummy_start_date, dummy_end_date)})
# One random integer in [0, 30) for every day.
dummy_df["value"] = np.random.randint(low=0, high=30, size=len(dummy_df))
# Draw the calendar heatmap: dates come from "ds", cell values from "value".
fig = calplot(dummy_df, x="ds", y="value")
fig.show()
结果如下:
通过设置相关参数还可以开启暗夜模式:
# Same plot as before, rendered with the dark theme switched on.
fig = calplot(dummy_df, x="ds", y="value", dark_theme=True)
fig.show()
总结
以上介绍了七种常见的使用Python绘制日历热力图的方法。其中Matplotlib和Seaborn是相对基础的可视化包,可定制和个性化的空间较大,但是代码相对冗长;Pyecharts和Plotly都是对JavaScript可视化库的Python封装,可交互性较强;Calmap、Calplot和July做了更高级的封装,调用起来比较方便,但个性化修改空间有限。