1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
| import pandas as pd import numpy as np import matplotlib.pyplot as plt import statsmodels.api as sm from statsmodels.tsa.stattools import adfuller, acf, pacf from statsmodels.tsa.arima.model import ARIMA from sklearn.metrics import mean_squared_error from statsmodels.graphics.tsaplots import plot_acf, plot_pacf from statsmodels.stats.diagnostic import acorr_ljungbox
df = pd.read_csv('weatherAUS.csv')
plt.rcParams['font.sans-serif'] = ['SimHei'] plt.rcParams['axes.unicode_minus'] = False
df['Date'] = pd.to_datetime(df['Date']) df.set_index('Date', inplace=True)
df = df.dropna(subset=['MaxTemp'])
locations = df['Location'].unique()
forecasts = {}
for location in locations: print(f"正在处理地点: {location}") location_data = df[df['Location'] == location]['MaxTemp'] location_data = location_data.asfreq('D') location_data = location_data.interpolate(method='linear') location_data = location_data.dropna() plt.figure(figsize=(14, 6)) plt.plot(location_data) plt.title(f'{location} 的最高温度时间序列') plt.xlabel('日期') plt.ylabel('最高温度') plt.grid(True) plt.show() adf_result = adfuller(location_data) print('ADF 检验结果:', adf_result) print(f'ADF统计量: {adf_result[0]}') print(f'p值: {adf_result[1]}') print(f'使用的滞后数: {adf_result[2]}') print(f'样本量: {adf_result[3]}') print('临界值:') for key, value in adf_result[4].items(): print(f' {key}: {value}') if adf_result[1] < 0.05: print("时间序列是平稳的") else: print("时间序列不是平稳的") decomposition = sm.tsa.seasonal_decompose(location_data, model='additive', period=365) fig = decomposition.plot() plt.show() plt.figure(figsize=(14, 6)) plot_acf(location_data, lags=50) plt.title(f'{location} 的自相关函数(ACF)') plt.xlabel('滞后数') plt.ylabel('自相关') plt.grid(True) plt.show() seasonal_diff = location_data.diff(365).dropna() plt.figure(figsize=(14, 6)) plt.plot(seasonal_diff) plt.title(f'{location} 的最高温度季节性差分时间序列') plt.xlabel('日期') plt.ylabel('差分后的最高温度') plt.grid(True) plt.show() plt.figure(figsize=(14, 6)) plot_acf(seasonal_diff, lags=50) plt.title(f'{location} 的最高温度季节性差分自相关函数(ACF)') plt.xlabel('滞后数') plt.ylabel('自相关') plt.grid(True) plt.show() plt.figure(figsize=(14, 6)) plot_pacf(seasonal_diff, lags=50) plt.title(f'{location} 的最高温度季节性差分偏自相关函数(PACF)') plt.xlabel('滞后数') plt.ylabel('偏自相关') plt.grid(True) plt.show() result = adfuller(seasonal_diff.dropna()) print('ADF Statistic: %f' % result[0]) print('p-value: %f' % result[1]) print('Critical Values:') for key, value in result[4].items(): print('\t%s: %.3f' % (key, value)) if result[1] > 0.05: print("季节性差分后的数据仍然是非平稳的") else: print("季节性差分后的数据是平稳的") print("差分序列的白噪声检验结果:\n", acorr_ljungbox(seasonal_diff.dropna(), lags=1)) order_select = sm.tsa.arma_order_select_ic(seasonal_diff.dropna(), ic=['bic'], trend='n', max_ar=5, max_ma=5) print('BIC选择的最佳参数:', order_select.bic_min_order) train_data = seasonal_diff[:-365] test_data = seasonal_diff[-365:] model = ARIMA(train_data, order=(order_select.bic_min_order[0], 0, order_select.bic_min_order[1])) model_fit = model.fit() forecast_result = model_fit.get_forecast(steps=len(test_data)) forecast = forecast_result.predicted_mean conf_int = forecast_result.conf_int() seasonal_base = location_data.shift(365).dropna() restored_forecast = forecast + seasonal_base[-len(forecast):] forecasts[location] = { 'forecast': restored_forecast, 'conf_int': conf_int, 'train_data': train_data, 'test_data': test_data, 'original_data': location_data } plt.figure(figsize=(14, 10)) plt.plot(location_data.index, location_data, label='原始数据', color='blue', marker='o', markersize=3) plt.plot(restored_forecast.index, restored_forecast, label='还原后的预测值', color='red', linestyle='--', marker='x', markersize=3) plt.fill_between(test_data.index, conf_int.iloc[:, 0] + seasonal_base[-len(forecast):], conf_int.iloc[:, 1] + seasonal_base[-len(forecast):], color='pink', alpha=0.3) plt.title(f'{location} 的最高温度原始数据与还原后的预测值对比(所有数据)') plt.xlabel('日期') plt.ylabel('最高温度') plt.grid(True) plt.legend() plt.show() plt.figure(figsize=(14, 10)) plt.plot(location_data.index[-700:], location_data[-700:], label='原始数据', color='blue', marker='o', markersize=3) plt.plot(restored_forecast.index[-365:], restored_forecast, label='还原后的预测值', color='red', linestyle='--', marker='x', markersize=3) plt.fill_between(test_data.index[-365:], conf_int.iloc[:, 0] + seasonal_base[-365:], conf_int.iloc[:, 1] + seasonal_base[-365:], color='pink', alpha=0.3) plt.title(f'{location} 的最高温度原始数据与还原后的预测值对比(最后700天)') plt.xlabel('日期') plt.ylabel('最高温度') plt.grid(True) plt.legend() plt.show() plt.figure(figsize=(14, 10)) plt.plot(location_data.index[-30:], location_data[-30:], label='原始数据', color='blue', marker='o', markersize=3) plt.plot(restored_forecast.index[-30:], restored_forecast[-30:], label='还原后的预测值', color='red', linestyle='--', marker='x', markersize=3) plt.fill_between(test_data.index[-30:], conf_int.iloc[:, 0].iloc[-30:] + seasonal_base.iloc[-30:], conf_int.iloc[:, 1].iloc[-30:] + seasonal_base.iloc[-30:], color='pink', alpha=0.3) plt.title(f'{location} 的最高温度原始数据与还原后的预测值对比(最后30天)') plt.xlabel('日期') plt.ylabel('最高温度') plt.grid(True) plt.legend() plt.show() print(f'还原后的预测值: {restored_forecast}') print(f'预测的置信区间: {conf_int}')
for location, result in forecasts.items(): print(f"地点: {location}") print(f"还原后的预测值: {result['forecast']}") print(f"预测的置信区间: {result['conf_int']}") print("\n")
|