# -*- coding: utf-8 -*-
"""
===========================================================================
模組三｜非線性的統計失靈（The Nonlinear Blind Spot）
台灣登革熱月病例數 × 月均氣溫
===========================================================================

【教學目標｜ILO】
  1. 學習者能從疾管署開放資料 API 下載台灣登革熱逐案資料，
     並依「發病日 × 居住縣市 × 是否境外移入」聚合為月病例序列。
  2. 學習者能以 NASA POWER 開放 API 取得在地（台南）月均氣溫，
     並將兩個異質資料源依「年-月」鍵值合併。
  3. 學習者能對同一份資料同時擬合 OLS（線性）與 GAM（廣義加成模型，
     非線性樣條），並比較兩者的解釋力與「看見的形狀」。
  4. 學習者能理解一個關鍵的統計素養命題：
     **近乎為零的線性相關（r ≈ 0），不代表「沒有關係」，
       而可能代表「關係不是線性的」。** 倒 U 形關係會讓 OLS 的
       正負斜率相互抵消，使線性模型完全失明。

【資料來源】
  (1) 登革熱逐案資料：衛生福利部疾病管制署「登革熱每日確定病例」
      URL: https://od.cdc.gov.tw/eic/Dengue_Daily.csv
      授權：政府資料開放授權條款第 1 版
      欄位（節錄）：發病日、是否境外移入、居住縣市、確定病例數
      範圍：1998 迄今逐案；本模組取 2008–2023、本土、居住台南市

  (2) 在地月均氣溫：NASA POWER（Prediction Of Worldwide Energy Resources）
      參數 T2M（地表 2 公尺月均溫），台南市座標 (22.99N, 120.21E)
      URL: https://power.larc.nasa.gov/api/temporal/monthly/point
      無需金鑰；MERRA-2 再分析資料。

【為何用台南？】
  台南市是 2015 年台灣登革熱大爆發的震央——當年單一縣市超過 2.2 萬例。
  本土、單一縣市的序列讓「氣溫—傳播」訊號不被跨縣市異質性稀釋，
  也讓學生對「自己島上的疫情」產生切身感。

【方法學定位】
  與模組一（漸進趨勢）、模組二（結構斷層）對照，本模組呈現第三種
  統計結構：**非線性閾值反應（threshold / inverted-U）**。其教學重點
  不在「氣候害死多少人」，而在「當你用錯的模型，資料就會對你說謊」。

【運算環境】
  Python 3.10+（Colab / 本地皆可）
  必要套件：pandas, numpy, matplotlib, statsmodels, scipy
  選用套件：pygam（若已安裝則用之；否則自動回退至 statsmodels 的
            GLMGam + B-spline，結果等價）

【Colab 首次執行】
  !apt-get install -y fonts-noto-cjk -q      # 中文字型（裝完需 Runtime→Restart）
  !pip install pygam -q                       # 選用；不裝也能跑

===========================================================================
"""

# ============================================================================
# 第 0 節｜環境設定
# ============================================================================
import io
import sys
import json
import warnings
import urllib.request

# Windows 終端機常為 cp950，無法輸出 ✓／中文；強制 UTF-8 以免訊息出錯。
try:
    sys.stdout.reconfigure(encoding="utf-8")
except Exception:
    pass

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from scipy import stats
import statsmodels.api as sm

warnings.filterwarnings("ignore")

rcParams["font.sans-serif"] = [
    "Noto Sans CJK TC", "Microsoft JhengHei",
    "PingFang TC", "Arial Unicode MS", "DejaVu Sans",
]
rcParams["axes.unicode_minus"] = False
rcParams["figure.dpi"] = 110

_HEADERS = {"User-Agent": "Mozilla/5.0 (educational research)"}

# 分析範圍（可改）
YEAR_START, YEAR_END = 2008, 2023
COUNTY = ["臺南市", "台南市"]      # 兼容新舊用字
TAINAN_LATLON = (22.99, 120.21)


# ============================================================================
# 第 1 節｜讀取疾管署登革熱逐案資料，聚合為台南本土月病例序列
# ============================================================================
def load_dengue_monthly() -> pd.Series:
    """
    下載疾管署「登革熱每日確定病例」逐案資料，篩選：
      - 是否境外移入 == 否（本土病例）
      - 居住縣市 ∈ {臺南市, 台南市}
    依「發病日」之年-月聚合確定病例數，回傳完整月序列（缺月補 0）。
    """
    url = "https://od.cdc.gov.tw/eic/Dengue_Daily.csv"
    print("  下載疾管署登革熱資料中（約 18 MB）...")
    raw = urllib.request.urlopen(
        urllib.request.Request(url, headers=_HEADERS), timeout=90).read()
    # 疾管署 CSV 帶 UTF-8 BOM；用 utf-8-sig 去除
    txt = raw.decode("utf-8-sig", errors="replace")
    df = pd.read_csv(io.StringIO(txt), low_memory=False)
    df = df.rename(columns={
        "發病日": "onset", "是否境外移入": "imported",
        "居住縣市": "county", "確定病例數": "n",
    })
    df["onset"] = pd.to_datetime(df["onset"], errors="coerce")
    df = df.dropna(subset=["onset"])
    df["n"] = pd.to_numeric(df["n"], errors="coerce").fillna(1)

    sub = df[(df["imported"] == "否") & (df["county"].isin(COUNTY))].copy()
    sub["ym"] = sub["onset"].dt.to_period("M")
    monthly = sub.groupby("ym")["n"].sum()

    idx = pd.period_range(f"{YEAR_START}-01", f"{YEAR_END}-12", freq="M")
    monthly = monthly.reindex(idx, fill_value=0).astype(float)
    monthly.name = "cases"
    print(f"  ✓ 台南本土月病例 {len(monthly)} 個月："
          f"總計 {int(monthly.sum()):,} 例，單月最高 {int(monthly.max()):,} 例"
          f"（{monthly.idxmax()}）")
    return monthly


# ============================================================================
# 第 2 節｜讀取 NASA POWER 台南月均氣溫
# ============================================================================
def load_tainan_temperature() -> pd.Series:
    """
    向 NASA POWER 取得台南月均溫 T2M（°C），回傳與第 1 節對齊的月序列。
    """
    lat, lon = TAINAN_LATLON
    url = ("https://power.larc.nasa.gov/api/temporal/monthly/point"
           f"?parameters=T2M&community=AG&longitude={lon}&latitude={lat}"
           f"&start={YEAR_START}&end={YEAR_END}&format=JSON")
    print("  下載 NASA POWER 台南月均氣溫中...")
    raw = urllib.request.urlopen(
        urllib.request.Request(url, headers=_HEADERS), timeout=90).read()
    t2m = json.loads(raw)["properties"]["parameter"]["T2M"]
    # 鍵形如 '200801'..'200812'；'YYYY13' 是年均值，需排除
    rec = {pd.Period(f"{k[:4]}-{k[4:]}", freq="M"): float(v)
           for k, v in t2m.items() if k[-2:] != "13"}
    temp = pd.Series(rec).sort_index()
    idx = pd.period_range(f"{YEAR_START}-01", f"{YEAR_END}-12", freq="M")
    temp = temp.reindex(idx)
    temp.name = "temp"
    print(f"  ✓ 台南月均溫 {temp.notna().sum()} 個月，"
          f"範圍 {temp.min():.1f}–{temp.max():.1f}°C")
    return temp


# ============================================================================
# 第 1+2 節｜回退用模擬資料（離線課堂或 API 中斷時）
# ============================================================================
def _simulate() -> pd.DataFrame:
    """
    產生與真實資料同形狀（倒 U + 閾值）的模擬序列，保證離線可教學。
    """
    print("  !! 改用模擬資料（形狀與真實一致，但非真值）")
    rng = np.random.default_rng(2015)
    idx = pd.period_range(f"{YEAR_START}-01", f"{YEAR_END}-12", freq="M")
    month = np.array([p.month for p in idx])
    # 台南月均溫年週期：1 月 ~18°C、7 月 ~30°C
    temp = 24 + 6 * np.cos(2 * np.pi * (month - 7) / 12) + rng.normal(0, 0.6, len(idx))
    # 傳播力倒 U：峰值約 27.5°C；再乘上爆發年放大
    risk = np.clip(np.exp(-((temp - 27.5) ** 2) / 6) - 0.1, 0, None)
    boom = np.where(np.array([p.year for p in idx]) == 2015, 40, 3)
    cases = np.round(risk * boom * rng.lognormal(2.0, 0.8, len(idx))).clip(0)
    return pd.DataFrame({"temp": temp, "cases": cases}, index=idx)


def build_dataset() -> pd.DataFrame:
    """整合第 1、2 節，回傳 columns=[temp, cases] 的月資料表。"""
    try:
        cases = load_dengue_monthly()
        temp = load_tainan_temperature()
        df = pd.concat([temp, cases], axis=1).dropna()
        if len(df) < 24:
            raise ValueError("有效月份過少")
        return df
    except Exception as e:
        print(f"  網路下載失敗：{e}")
        return _simulate()


# ============================================================================
# 第 3 節｜OLS 線性擬合（「錯的模型」）
# ============================================================================
def fit_ols(df: pd.DataFrame):
    """cases ~ temp 的最小平方線性迴歸。"""
    model = sm.OLS(df["cases"], sm.add_constant(df["temp"])).fit()
    r, p = stats.pearsonr(df["temp"], df["cases"])
    rho, _ = stats.spearmanr(df["temp"], df["cases"])
    return model, dict(r2=model.rsquared, slope=model.params["temp"],
                       pearson_r=r, pearson_p=p, spearman=rho)


# ============================================================================
# 第 4 節｜GAM 非線性擬合（「對的模型」）
# ============================================================================
def fit_gam(df: pd.DataFrame, grid: np.ndarray):
    """
    對 cases ~ s(temp) 擬合廣義加成模型，回傳 (grid 上的預測, pseudo-R²)。

    優先使用 pygam（若已安裝）；否則回退至 statsmodels 的 GLMGam +
    三次 B-spline。兩者皆為平滑樣條，倒 U 形擬合結果在教學上等價。
    """
    x = df["temp"].values
    y = df["cases"].values
    try:
        from pygam import LinearGAM, s
        gam = LinearGAM(s(0, n_splines=8)).fit(x[:, None], y)
        pred = gam.predict(grid[:, None])
        fitted = gam.predict(x[:, None])
        backend = "pygam"
    except Exception:
        from statsmodels.gam.api import GLMGam, BSplines
        bs = BSplines(x[:, None], df=[6], degree=[3])
        gam = GLMGam(y, exog=np.ones((len(y), 1)), smoother=bs).fit()
        params = np.asarray(gam.params)
        pred = params[0] + bs.transform(grid[:, None]) @ params[1:]
        fitted = np.asarray(gam.fittedvalues)
        backend = "statsmodels GLMGam"
    ss_res = float(np.sum((y - fitted) ** 2))
    ss_tot = float(np.sum((y - y.mean()) ** 2))
    pseudo_r2 = 1 - ss_res / ss_tot
    return pred, pseudo_r2, backend


# ============================================================================
# 第 5 節｜分箱均值（讓「閾值/開關」現形的最直白工具）
# ============================================================================
def binned_means(df: pd.DataFrame, edges=(16, 20, 24, 26, 28, 31)) -> pd.DataFrame:
    """依氣溫帶切分，計算各帶的月均病例數。"""
    cut = pd.cut(df["temp"], list(edges))
    g = df.groupby(cut, observed=True)["cases"].agg(["mean", "max", "count"])
    return g


# ============================================================================
# 第 6 節｜視覺化：GAM 看見的，OLS 看不見的
# ============================================================================
def plot_module3(df, ols_model, gam_pred, grid, gam_r2, stats_d, bins,
                 savepath="fig3_dengue_gam.png"):
    fig, axes = plt.subplots(1, 2, figsize=(13, 5))

    # ── 左：散點 + OLS 直線 + GAM 曲線 ──
    # 2015 大爆發單月達上萬例，會把 y 軸撐到看不見 GAM 的倒 U。
    # 故將 y 軸裁切至可顯示曲線結構的範圍，並誠實標註超出範圍的爆發峰值。
    ax = axes[0]
    peak_t = float(grid[int(np.argmax(gam_pred))])
    YTOP = 1500
    ax.scatter(df["temp"], df["cases"].clip(upper=YTOP), s=26, color="#5B9BD5",
               alpha=0.55, edgecolor="none", zorder=2, label="月觀測（台南本土）")
    # OLS 直線
    xs = np.linspace(df["temp"].min(), df["temp"].max(), 50)
    ax.plot(xs, ols_model.predict(sm.add_constant(xs)),
            color="#2980B9", lw=2, ls="--",
            label=f"OLS 線性（R²={stats_d['r2']:.3f}）", zorder=3)
    # GAM 曲線
    ax.plot(grid, np.clip(gam_pred, 0, None),
            color="#E74C3C", lw=2.6,
            label=f"GAM 非線性（R²={gam_r2:.3f}）", zorder=3)
    ax.axvspan(25, 29, alpha=0.08, color="#E74C3C")
    ax.axvline(peak_t, color="gray", ls=":", lw=1)
    ax.text(peak_t + 0.2, YTOP * 0.55, f"GAM 峰值\n≈{peak_t:.1f}°C",
            fontsize=9, color="gray")
    # 誠實標註：超出 y 軸範圍的 2015 大爆發
    boom_max = int(df["cases"].max())
    bx = float(df.loc[df["cases"].idxmax(), "temp"])
    ax.annotate(f"2015 台南大爆發\n單月 {boom_max:,} 例 ↑（超出範圍）",
                (bx, YTOP), xytext=(20.3, YTOP * 0.82), fontsize=8.5,
                color="#C0392B",
                arrowprops=dict(arrowstyle="->", color="#C0392B", lw=1))
    ax.set_ylim(-40, YTOP)
    ax.set_xlabel("月均溫（°C）", fontsize=11)
    ax.set_ylabel("月確定病例數（人，y 軸裁切至 1,500）", fontsize=10.5)
    ax.set_title("月均溫 × 登革熱：OLS vs GAM", fontsize=12, fontweight="bold")
    ax.legend(fontsize=9, loc="upper right")

    # ── 右：分箱均值長條（閾值/開關現形）──
    ax2 = axes[1]
    labels = [f"{int(iv.left)}–{int(iv.right)}" for iv in bins.index]
    means = bins["mean"].values
    colors = plt.cm.YlOrRd(np.linspace(0.25, 0.95, len(means)))
    ax2.bar(labels, means, color=colors, edgecolor="#7a3b1d", linewidth=0.6)
    for i, v in enumerate(means):
        ax2.text(i, v, f"{v:.0f}", ha="center", va="bottom", fontsize=9)
    ax2.set_xlabel("月均溫帶（°C）", fontsize=11)
    ax2.set_ylabel("該溫度帶的月均病例數", fontsize=11)
    ax2.set_title("分箱均值：傳播力像「開關」", fontsize=12, fontweight="bold")

    fig.suptitle("圖三｜非線性的統計失靈：r≈0.14 的線性相關，藏著一條清楚的倒 U",
                 fontsize=13, fontweight="bold", y=1.02)
    plt.tight_layout()
    fig.savefig(savepath, dpi=200, bbox_inches="tight")
    plt.close()
    print(f"  ✓ 圖已存：{savepath}")
    return peak_t


# ============================================================================
# 第 7 節｜主流程
# ============================================================================
def main():
    print("\n[模組三] 台灣登革熱 × 氣溫｜非線性失靈")
    print("=" * 60)

    df = build_dataset()
    grid = np.linspace(df["temp"].min(), df["temp"].max(), 300)

    ols_model, sd = fit_ols(df)
    gam_pred, gam_r2, backend = fit_gam(df, grid)
    bins = binned_means(df)
    peak_t = plot_module3(df, ols_model, gam_pred, grid, gam_r2, sd, bins)

    print("\n── 統計摘要 ──────────────────────────────")
    print(f"  樣本：{len(df)} 個月（{YEAR_START}–{YEAR_END}），台南本土")
    print(f"  總病例 {int(df['cases'].sum()):,}，單月最高 {int(df['cases'].max()):,}")
    print(f"  Pearson r = {sd['pearson_r']:.3f}（p={sd['pearson_p']:.3g}）"
          f"，Spearman ρ = {sd['spearman']:.3f}")
    print(f"  OLS 線性  R² = {sd['r2']:.3f}（斜率 {sd['slope']:.1f} 例/°C）")
    print(f"  GAM 非線性 R² = {gam_r2:.3f}（{backend}）")
    print(f"  GAM 峰值溫度 ≈ {peak_t:.1f}°C（之後回落 → 倒 U）")
    if sd["r2"] > 0:
        print(f"  GAM/OLS 解釋力比 ≈ {gam_r2 / sd['r2']:.1f}×")
    print("\n  分箱均值（月均病例 / 溫度帶）：")
    for iv, row in bins.iterrows():
        print(f"    {int(iv.left):>2}–{int(iv.right):<2}°C : "
              f"{row['mean']:>7.1f} 例/月（n={int(row['count'])}）")

    print("\n★ 統計素養命題：Pearson r ≈ {:.2f} 幾乎為零，"
          "若只看線性相關會誤判『溫度與疫情無關』；".format(sd["pearson_r"]))
    print("  但分箱與 GAM 都顯示一條清楚的倒 U——關係不是不存在，"
          "而是『不是直線』。\n  OLS 看不見倒 U，因為正負斜率相互抵消。")
    print("[完成] 主流程結束。")


# ============================================================================
# 第 8 節｜課堂討論題
# ============================================================================
"""
【討論題一｜模型即觀點】
  Q1. 同一份資料，OLS 說 R²≈0.02、「幾乎無關」；GAM 說有一條清楚的
      倒 U。兩個模型都「沒算錯」，差別只在假設。請討論：當研究者選擇
      模型時，他其實在選擇「願意看見什麼形狀」。這對我們閱讀新聞中的
      「研究顯示 X 與 Y 無關」應有什麼警惕？

【討論題二｜相關係數的盲點】
  Q2. Pearson r 衡量的是「線性」關聯。請畫出（或想像）一個完美的倒 U
      散點，計算它的 r——你會得到接近 0。這說明 r=0 有哪兩種完全不同
      的可能？（提示：真的無關 vs. 非線性相關。）

【討論題三｜落後效應】
  Q3. 登革熱疫情在台灣通常於「秋季」（9–11 月）達到高峰，但氣溫高峰
      在 7 月。試將氣溫「落後 1–2 個月」再與病例比對（程式可改 shift）。
      解釋為何「病媒蚊孳生 → 叮咬 → 潛伏 → 發病」的時間鏈，會讓同月
      對比低估真實的氣溫敏感度。

【討論題四｜不是溫度決定論】
  Q4. 本模組的 R² 在絕對值上並不高（即使 GAM 也僅約 0.03）。除了氣溫，
      還有哪些因素主導登革熱爆發？（提示：降雨與積水容器、人口移動、
      前一年群體免疫、孳生源清除、輸入病例引信。）為什麼「氣溫是
      重要但非唯一」的誠實表述，比「氣候害死多少人」更有教育價值？

【討論題五｜跨模組整合】
  Q5. 模組一是「漸進剪刀」、模組二是「結構斷層」、模組三是「非線性閾值」。
      三種統計形狀對應三種誤判風險。請各舉一個你生活中可能遇到的
      「用錯模型而誤判」的例子（例如：考試成績、體重、社群追蹤數）。
"""


if __name__ == "__main__":
    main()