# hklii_samples/hk_case_extractor.py

"""
hk_case_extractor.py
==========================================================
香港判決書結構化字段抽取管線
基於 本地 Ollama 小模型 + 分階段抽取 + JSON Schema 強制 + 校驗重試

設計理念
--------
基於對實際香港判決書結構的分析優化：
  1. 預處理：規則去噪 + 切段，純規則抽司法區域/案號等高確定性字段
  2. 智能定位：
     - 當事人、事由與標的：直接取開頭5000字符（案號等元數據已在步驟1按規則抽取）
     - 判決結果：取開頭5000字符 + 尾部5000字符
     - 其他字段（涉及實體、法庭分析）：使用關鍵詞召回相關段落
  3. 分組抽取：拆成 5 次獨立 Ollama 調用，每次只負責 1-3 個字段
  4. Schema 強制：用 Ollama 0.5+ 的 format=<JSON Schema> 約束輸出
  5. 校驗+重試：對字數、黑名單、結構標註逐項校驗
  6. judgment_summary 不從原文重生，而從前 4 步結果 + 一段分析段生成

判決書結構特點（基於實際案例分析）
--------------------------------
- 開頭部分（前2000字符）：
  * 案號（如 CACV000175/2000）
  * 法院名稱和級別
  * 當事人信息（BETWEEN...AND 格式）
  * 案件標題
  * 審理日期和法官信息

- 中間部分：
  * 案情背景（BACKGROUND, INTRODUCTION, 背景, 案情）
  * 法律分析和推理
  * 證據評估
  * 法律原則引用

- 尾部部分（後4000字符）：
  * 判決結果（JUDGMENT, ORDER, CONCLUSION, 判決, 命令）
  * 具體命令和裁定
  * 訟費安排
  * 法官簽名

依賴
----
    pip install requests pyyaml
    本地需運行：ollama serve
    模型：ollama pull qwen2.5:7b-instruct   （推薦，中文抽取甜點）
         或 ollama pull glm4:9b

使用
----
    python hk_case_extractor.py <input.txt>
    python hk_case_extractor.py case.txt --model qwen2.5:7b-instruct --out result.yaml
"""

from __future__ import annotations

import argparse
import json
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any

import requests
import yaml


# =============================================================================
# 配置
# =============================================================================

# Chat endpoint that OllamaClient posts every request to by default.
# NOTE(review): the module docstring describes a local `ollama serve`, but this
# default points at a remote HTTPS host — confirm which one is intended.
OLLAMA_URL = "https://openai.iconsz.com/ollama3090/api/chat"
# Model name placed in the request payload unless the caller overrides it.
DEFAULT_MODEL = "qwen2.5:7b-instruct"
# Default `timeout` value handed to requests.post by OllamaClient.
DEFAULT_TIMEOUT = 240
# chat_json_with_retry makes MAX_RETRIES + 1 attempts in total.
MAX_RETRIES = 2


# =============================================================================
# 0. 語言檢測
# =============================================================================

def detect_language(text: str) -> str:
    """Classify the dominant language of *text* as ``'zh'`` or ``'en'``.

    Only the first 3000 characters are inspected. The share of CJK unified
    ideographs (U+4E00..U+9FFF) is measured against the length of that sample
    with leading/trailing whitespace removed; a share strictly above 30%
    yields ``'zh'``, anything else (including an empty sample) yields ``'en'``.
    """
    head = text[:3000]
    denominator = len(head.strip())
    if not denominator:
        # Nothing but whitespace (or nothing at all): default to English.
        return 'en'

    cjk_count = sum(1 for _ in re.finditer(r'[\u4e00-\u9fff]', head))
    if cjk_count / denominator > 0.3:
        return 'zh'
    return 'en'


# =============================================================================
# 1. 預處理：去噪 + 切段 + 規則抽元數據
# =============================================================================

# Court code -> Chinese court name, used when the judgment language is 'zh'.
# Several codes are spelling variants that map to the same court name
# (HKMC/HKMagC, HKLT/HKLDT, HKLD/HKLAT, HKCorC/HKCrC).
JURISDICTION_MAP_ZH: dict[str, str] = {
    "HKCFA":  "香港特別行政區終審法院",
    "HKCA":   "香港特別行政區高等法院上訴法庭",
    "HKCFI":  "香港特別行政區高等法院原訟法庭",
    "HKDC":   "香港特別行政區區域法院",
    "HKMC":   "香港特別行政區裁判法院",
    "HKMagC": "香港特別行政區裁判法院",
    "HKSCT":  "香港特別行政區小額錢債審裁處",
    "HKLT":   "香港特別行政區土地審裁處",
    "HKLDT":  "香港特別行政區土地審裁處",
    "HKLD":   "香港特別行政區勞資審裁處",
    "HKLAT":  "香港特別行政區勞資審裁處",
    "HKCT":   "香港特別行政區競爭事務審裁處",
    "HKCorC": "香港特別行政區死因裁判法庭",
    "HKCrC":  "香港特別行政區死因裁判法庭",
}

# Court code -> English court name, used when the judgment language is not 'zh'.
# Carries the same keys as JURISDICTION_MAP_ZH, including the alias codes.
JURISDICTION_MAP_EN: dict[str, str] = {
    "HKCFA":  "Court of Final Appeal of the Hong Kong Special Administrative Region",
    "HKCA":   "Court of Appeal of the High Court of the Hong Kong Special Administrative Region",
    "HKCFI":  "Court of First Instance of the High Court of the Hong Kong Special Administrative Region",
    "HKDC":   "District Court of the Hong Kong Special Administrative Region",
    "HKMC":   "Magistrates' Courts of the Hong Kong Special Administrative Region",
    "HKMagC": "Magistrates' Courts of the Hong Kong Special Administrative Region",
    "HKSCT":  "Small Claims Tribunal of the Hong Kong Special Administrative Region",
    "HKLT":   "Lands Tribunal of the Hong Kong Special Administrative Region",
    "HKLDT":  "Lands Tribunal of the Hong Kong Special Administrative Region",
    "HKLD":   "Labour Tribunal of the Hong Kong Special Administrative Region",
    "HKLAT":  "Labour Tribunal of the Hong Kong Special Administrative Region",
    "HKCT":   "Competition Tribunal of the Hong Kong Special Administrative Region",
    "HKCorC": "Coroner's Court of the Hong Kong Special Administrative Region",
    "HKCrC":  "Coroner's Court of the Hong Kong Special Administrative Region",
}

# Neutral citation such as "[2019] HKCFA 23":
#   group 1 = year, group 2 = court code, group 3 = running number.
# The alternation lists every code that appears as a key in the jurisdiction
# maps. Longer codes come before their prefixes (HKLDT before HKLD) purely for
# readability; the trailing digits requirement already forces the right choice.
# Matching is case-insensitive, so "HKLdT" / "HKLaT" spellings are accepted.
NEUTRAL_CITATION_RE = re.compile(
    r"\[(\d{4})\]\s*"
    r"(HKCFA|HKCFI|HKCA|HKDC|HKMagC|HKMC|HKSCT|HKLDT|HKLAT|HKLT|HKLD|HKCT|HKCorC|HKCrC)"
    r"\s*(\d+)",
    re.I,
)
# Case number such as "CACV000175/2000" or "HCAL 12 of 2020":
#   group 1 = case-type prefix; the whole match is the case number.
CASE_NO_RE = re.compile(
    r"(FACV|FACC|FAMV|FAMC|CACV|CACC|CAAG|HCA|HCAL|HCMP|HCCW|HCB|DCCJ|DCMP|SCTC|LBTC|LDPD|LDBM|CCDI|WKCC)"
    r"\s*(?:NO\.?)?\s*\d+\s*(?:OF|/|\sof\s)\s*\d{4}",
    re.I,
)

# Case-number prefix -> court code. extract_metadata_by_rule consults this
# first, before neutral citations or court-name matching.
# LDPD is a Lands Tribunal case type (the "LD" family, like LDBM), so it maps
# to HKLDT; Labour Tribunal claims use the LBTC prefix.
CASE_NO_PREFIX_MAP: dict[str, str] = {
    "FACV": "HKCFA",  # Final Appeal Civil
    "FACC": "HKCFA",  # Final Appeal Criminal
    "FAMV": "HKCFA",  # Final Appeal Miscellaneous
    "FAMC": "HKCFA",  # Final Appeal Miscellaneous Criminal
    "CACV": "HKCA",   # Court of Appeal Civil
    "CACC": "HKCA",   # Court of Appeal Criminal
    "CAAG": "HKCA",   # Court of Appeal (Administrative)
    "HCA":  "HKCFI",  # High Court Action
    "HCAL": "HKCFI",  # High Court Administrative Law
    "HCMP": "HKCFI",  # High Court Miscellaneous Proceedings
    "HCCW": "HKCFI",  # High Court Companies Winding Up
    "HCB":  "HKCFI",  # High Court Bankruptcy
    "DCCJ": "HKDC",   # District Court
    "DCMP": "HKDC",   # District Court Miscellaneous Proceedings
    "SCTC": "HKSCT",  # Small Claims Tribunal
    "LBTC": "HKLAT",  # Labour Tribunal
    "LDPD": "HKLDT",  # Lands Tribunal
    "LDBM": "HKLDT",  # Lands Tribunal
    "CCDI": "HKCrC",  # Coroner's Court
    "WKCC": "HKMagC", # Magistrates' Court
}


def clean_text(raw: str) -> str:
    """Strip pagination noise and normalise whitespace in a raw judgment.

    The substitutions run in a fixed order: "Page N of M" markers, lines that
    consist only of "- N -", runs of full-width spaces, runs of spaces/tabs,
    and finally three or more consecutive newlines (collapsed to two). The
    result is stripped of leading/trailing whitespace.
    """
    # (pattern, replacement, flags) — order matters, later rules see the
    # output of earlier ones.
    rules = (
        (r"Page\s+\d+\s+of\s+\d+", "", re.I),
        (r"^\s*-\s*\d+\s*-\s*$", "", re.M),
        (r"　+", " ", 0),        # full-width (U+3000) spaces
        (r"[ \t]+", " ", 0),
        (r"\n{3,}", "\n\n", 0),
    )
    cleaned = raw
    for pattern, replacement, flags in rules:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()


def _court_name_position(code: str, header_en: str, header_zh: str) -> int:
    """Return the earliest offset at which a name of court *code* occurs, or -1.

    *header_en* must be whitespace-collapsed and case-folded; *header_zh* must
    have all whitespace removed. English names are tried in three forms: the
    full name, the name without the region suffix, and the part before the
    first " of the " (e.g. "Court of Appeal").
    """
    region_en = "Hong Kong Special Administrative Region"
    full_zh = JURISDICTION_MAP_ZH.get(code, "")
    full_en = JURISDICTION_MAP_EN.get(code, "")

    zh_names = (full_zh, full_zh.replace("香港特別行政區", ""))
    en_names = (
        full_en,
        # Drop the region and the dangling "of the" it leaves behind.
        re.sub(r"\s+of the\s*$", "", full_en.replace(region_en, "")),
        full_en.split(" of the ")[0],
    )

    offsets = [header_zh.find(name) for name in zh_names if name]
    offsets += [header_en.find(name.casefold()) for name in en_names if name]
    found = [pos for pos in offsets if pos >= 0]
    return min(found) if found else -1


def extract_metadata_by_rule(text: str, lang: str = 'zh') -> dict[str, Any]:
    """Derive court, case number and default location by rules only (no LLM).

    The court is resolved by the first source that yields a code:

    1. the case-number prefix (via CASE_NO_RE and CASE_NO_PREFIX_MAP);
    2. a neutral citation (NEUTRAL_CITATION_RE);
    3. a court name found in the first 2000 characters. This match ignores
       letter case and line wrapping, and when several courts are named the
       one mentioned earliest wins.

    Args:
        text: Judgment text.
        lang: ``'zh'`` selects Chinese court names and location; any other
            value selects the English ones.

    Returns:
        A dict with keys ``jurisdiction_code``, ``jurisdiction_name``,
        ``case_location`` and ``case_number``; unresolved values are ``None``
        (``case_location`` always holds the Hong Kong default).
    """
    # Pick the name table and default location for the requested language.
    jurisdiction_map = JURISDICTION_MAP_ZH if lang == 'zh' else JURISDICTION_MAP_EN
    default_location = ["香港特別行政區"] if lang == 'zh' else ["Hong Kong Special Administrative Region"]

    meta: dict[str, Any] = {
        "jurisdiction_code": None,
        "jurisdiction_name": None,
        "case_location": default_location,
        "case_number": None,
    }

    # 1) Case number: record it and map its prefix to a court code.
    if m := CASE_NO_RE.search(text):
        meta["case_number"] = re.sub(r"\s+", " ", m.group(0).strip())
        code = CASE_NO_PREFIX_MAP.get(m.group(1).upper())
        if code is not None:
            meta["jurisdiction_code"] = code
            meta["jurisdiction_name"] = jurisdiction_map.get(code)

    # 2) Neutral citation, only when the case number did not settle the court.
    if not meta["jurisdiction_code"]:
        if m := NEUTRAL_CITATION_RE.search(text):
            cited = m.group(2).upper()
            # Map the upper-cased match back to the canonical key spelling.
            for key in jurisdiction_map:
                if key.upper() == cited:
                    meta["jurisdiction_code"] = key
                    meta["jurisdiction_name"] = jurisdiction_map[key]
                    break

    # 3) Court name in the header. Restricted to the first 2000 characters so
    #    that courts named in cited cases further down do not interfere.
    if not meta["jurisdiction_code"]:
        header = text[:2000]
        header_en = re.sub(r"\s+", " ", header).casefold()
        header_zh = re.sub(r"\s+", "", header)

        best_pos = -1
        best_code = None
        for code in jurisdiction_map:
            pos = _court_name_position(code, header_en, header_zh)
            # Strict "<" keeps the first code in map order on ties (aliases).
            if pos >= 0 and (best_code is None or pos < best_pos):
                best_pos, best_code = pos, code

        if best_code is not None:
            meta["jurisdiction_code"] = best_code
            meta["jurisdiction_name"] = jurisdiction_map[best_code]

    return meta


# -----------------------------------------------------------------------------
# 關鍵詞 + 窗口召回（取代脆弱的正則切段）
# -----------------------------------------------------------------------------
# 思路：每個抽取目標定義一組「高信號關鍵詞」，掃全文取所有命中位置周圍
#       ±half_window 字符的窗口，合併重疊後拼接喂給 LLM。
#       不依賴判決書的固定章節標題，對結構各異的香港判決書都能工作。
#
# 優化策略（基於實際案例分析）：
# 1. 當事人信息：直接從開頭5000字符提取（通常在 BETWEEN...AND 結構中）
# 2. 事由與標的：同樣直接取開頭5000字符
# 3. 判決結果：取開頭5000字符 + 尾部5000字符（命令通常在 JUDGMENT/ORDER/命令 部分）
# 4. 其他字段（涉及實體、法庭分析）：使用關鍵詞召回策略

# Keyword lists per extraction target, looked up by group name.
# Within this file only the "entities" and "analysis" lists are read by
# gather_all; the other groups take fixed head/tail slices instead.
KEYWORD_GROUPS: dict[str, list[str]] = {
    # Call 1: parties — no longer keyword-driven, the document head is used.
    # Kept only for backward compatibility; gather_all does not read it.
    "parties": [],

    # Call 2: cause of action and subject matter
    "reason_object": [
        # Section-heading markers
        "案情", "背景", "引言", "事實", "案件背景", "案由",
        "INTRODUCTION", "BACKGROUND", "FACTS", "THE FACTS", "General course",
        # Claim / allegation markers
        "申索", "索償", "訴因", "聲稱", "請求", "本案涉及", "本訴訟",
        "原告聲稱", "申索人聲稱", "申索人指稱", "上訴人指", "答辯人指",
        "Plaintiff", "Claimant", "Appellant", "claim", "allege",
        # Subject-matter keywords
        "賠償", "損失", "損害", "精神困擾", "經濟損失", "醫療費",
        "履行", "所有權", "占有", "撤銷", "宣告", "damages", "compensation",
    ],

    # Call 3: judgment result — no keywords, the document tail is used.
    # Kept only for backward compatibility.
    "judgment_result": [],

    # Call 4: involved entities (judges, lawyers, judges in cited cases)
    "entities": [
        # Judicial titles
        "法官", "大法官", "審裁官", "裁判官", "首席法官", "常任法官", "非常任法官",
        "Hon.", " J.", " JA ", " CJ ", " PJ ", " NPJ ", "Coroner", "Judge",
        # Representation
        "代表", "大律師", "律師", "資深大律師", "代表律師",
        "Counsel", "Solicitor", "instructed by", "represented by",
        # Case citations (the surrounding text tends to name the judges)
        " v ", " v. ", " 訴 ", "[19", "[20", "HKCFA", "HKCA", "HKCFI",
    ],

    # Call 5: court analysis (the core input for the summary)
    "analysis": [
        # Markers of the court's own view
        "本席認為", "本席接納", "本席不接納", "本席同意", "本席不同意",
        "本席裁定", "本席拒絕", "本席認同", "本席考慮",
        "本庭認為", "本庭接納", "本庭裁定", "本庭認同", "本庭考慮",
        "I find", "I accept", "I do not accept", "I conclude", "I consider",
        "The court finds", "In my view", "In my judgment", "The Court held",
        # Legal principles
        "舉證責任", "審慎責任", "鄰人原則", "替代責任", "合理疑點",
        "違反", "侵權", "過失", "negligence", "breach", "duty of care",
        # Evidence assessment
        "證據顯示", "根據證據", "證人證供", "可信", "不可信",
        "evidence shows", "testimony", "credible", "reliable",
    ],
}


def gather_chunks(text: str,
                   keywords: list[str],
                   half_window: int = 500,
                   max_total: int = 6500,
                   case_sensitive: bool = False) -> tuple[str, int]:
    """Collect the text surrounding every keyword occurrence.

    Each occurrence contributes a window of ``half_window`` characters on
    either side. Overlapping or touching windows are merged, the merged spans
    are emitted in document order joined by an ellipsis marker, and emission
    stops once ``max_total`` characters would be exceeded (the span that
    overflows is cut to the remaining budget, and dropped altogether when 200
    characters or fewer remain).

    Returns:
        ``(joined_text, occurrence_count)``. With no occurrences the first
        ``max_total`` characters of *text* are returned with a count of 0;
        an empty *text* yields ``("", 0)``.
    """
    if not text:
        return "", 0

    re_flags = re.IGNORECASE if not case_sensitive else 0
    text_len = len(text)

    # One (start, end) window per occurrence, sorted by position.
    windows = sorted(
        (max(0, found.start() - half_window),
         min(text_len, found.end() + half_window))
        for word in keywords
        for found in re.finditer(re.escape(word), text, flags=re_flags)
    )
    if not windows:
        return text[:max_total], 0

    # Fold windows that overlap or touch into single spans.
    spans: list[list[int]] = []
    for lo, hi in windows:
        if spans and lo <= spans[-1][1]:
            if hi > spans[-1][1]:
                spans[-1][1] = hi
        else:
            spans.append([lo, hi])

    # Emit spans in order while the character budget lasts.
    budget = max_total
    parts: list[str] = []
    for lo, hi in spans:
        width = hi - lo
        if width > budget:
            if budget > 200:
                parts.append(text[lo:lo + budget])
            break
        parts.append(text[lo:hi])
        budget -= width

    return "\n\n[…]\n\n".join(parts), len(windows)


def gather_all(text: str) -> dict[str, str]:
    """Build the context string that each extraction group is given.

    Strategy per group:

    * ``parties`` and ``reason_object``: the first 5000 characters.
    * ``judgment_result``: the first 5000 plus the last 5000 characters joined
      by an ellipsis marker. When the text is at most 10000 characters long
      the two slices would overlap, so the whole text is used instead and no
      passage is sent twice.
    * ``entities`` and ``analysis``: keyword-window recall via gather_chunks.

    Returns:
        A dict holding one context string per group plus a ``_<group>_hits``
        entry (a stringified count; "0" for the groups that are sliced
        directly).
    """
    head_len = 5000
    tail_len = 5000
    separator = "\n\n[…]\n\n"

    out: dict[str, str] = {}
    head_text = text[:head_len]

    # 1. Parties: document head, no keyword recall.
    out["parties"] = head_text
    out["_parties_hits"] = "0"

    # 2. Cause of action / subject matter: document head, no keyword recall.
    out["reason_object"] = head_text
    out["_reason_object_hits"] = "0"

    # 3. Judgment result: head + tail, or the full text when they would overlap.
    if len(text) <= head_len + tail_len:
        out["judgment_result"] = text
    else:
        out["judgment_result"] = head_text + separator + text[-tail_len:]
    out["_judgment_result_hits"] = "0"

    # 4. Remaining groups: keyword recall with (half_window, max_total) each.
    params: dict[str, tuple[int, int]] = {
        "entities": (400, 6500),
        "analysis": (500, 6500),
    }
    for group, (hw, mt) in params.items():
        ctx, hits = gather_chunks(text, KEYWORD_GROUPS[group],
                                  half_window=hw, max_total=mt)
        out[group] = ctx
        out[f"_{group}_hits"] = str(hits)

    return out


# =============================================================================
# 2. Ollama 客戶端：JSON Schema 強制 + 重試
# =============================================================================

@dataclass
class OllamaClient:
    """Thin client for an Ollama-style chat endpoint returning JSON objects.

    Attributes are the model name, the endpoint URL and the request timeout
    handed to ``requests.post``.
    """

    model: str = DEFAULT_MODEL
    url: str = OLLAMA_URL
    timeout: int = DEFAULT_TIMEOUT

    def chat_json(self, system: str, user: str, schema: dict,
                  temperature: float = 0.0,
                  num_ctx: int = 8192) -> dict:
        """Send one system+user exchange and parse the reply as JSON.

        *schema* is passed as the request's ``format`` field so the server can
        constrain the output. If the reply is not valid JSON as-is, a leading
        and trailing Markdown code fence is stripped and parsing is retried.

        Raises:
            requests.HTTPError: the endpoint answered with an error status.
            json.JSONDecodeError: the reply is not JSON even after stripping.
        """
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "format": schema,
            "stream": False,
            "options": {"temperature": temperature, "num_ctx": num_ctx},
        }
        r = requests.post(self.url, json=payload, timeout=self.timeout)
        r.raise_for_status()
        content = r.json()["message"]["content"]
        try:
            return json.loads(content)
        except json.JSONDecodeError:
            # Some models wrap the object in a ```json fence; strip and retry.
            stripped = re.sub(r"^```(?:json)?\s*|\s*```$", "",
                              content.strip(), flags=re.S)
            return json.loads(stripped)

    def chat_json_with_retry(self, system: str, user: str, schema: dict,
                              validator=None, **kw) -> dict:
        """Call chat_json up to ``MAX_RETRIES + 1`` times until output validates.

        ``validator(result)`` must return ``(ok, hint)``. On a failed
        validation the hint is appended to the *original* user prompt for the
        next attempt, so hints from earlier attempts do not pile up.

        Returns:
            The first output that validates (or the first output at all when
            no validator is given). If every attempt fails validation, the
            most recent output is returned as a best effort.

        Raises:
            Exception: whatever the most recent attempt raised, when that
                attempt produced no output. An error from an earlier attempt
                is discarded once a later attempt returns output.
        """
        last_err = None
        last_out = None
        prompt = user
        for _ in range(MAX_RETRIES + 1):
            try:
                out = self.chat_json(system, prompt, schema, **kw)
                if validator is None:
                    return out
                ok, hint = validator(out)
            except Exception as e:
                # Best-effort retry: network, HTTP, parsing and validator
                # errors are all retried; the last one is re-raised below.
                last_err = e
                continue
            # This attempt produced output, so any earlier error is obsolete.
            last_err = None
            last_out = out
            if ok:
                return out
            # Feed the validator's complaint back into the next attempt.
            prompt = (f"{user}\n\n上次輸出存在問題：{hint}\n"
                      f"請修正後重新輸出。")
        if last_err is not None:
            raise last_err
        if last_out is None:
            # Only reachable when the loop never ran (MAX_RETRIES < 0).
            raise RuntimeError("chat_json_with_retry made no attempt")
        return last_out


# =============================================================================
# 3. 五次抽取調用：每次只負責一組字段
# =============================================================================

# --- Call 1: 當事人 ----------------------------------------------------------

# JSON Schema for Call 1: two required string arrays, one per side.
PARTIES_SCHEMA = {
    "type": "object",
    "properties": {
        "plaintiff": {"type": "array", "items": {"type": "string"}},
        "defendant": {"type": "array", "items": {"type": "string"}},
    },
    "required": ["plaintiff", "defendant"],
}

# Chinese prompts for Call 1 (parties): system instructions followed by the
# few-shot examples that extract_parties prepends to the user message.
PARTIES_SYSTEM_ZH = """你是香港法律文書信息抽取助手。
從給定的判決書開頭部分抽取所有當事人完整姓名/機構名。

格式識別：
1. 英文格式：BETWEEN ... AND ...
2. 中文格式：申請人 ... 對/訴 答辯人 ...
3. 混合格式：Plaintiff ... Defendant ...

分類規則：
- 原告/申索人/上訴人/覆核申請人/Plaintiff/Appellant/Claimant/Applicant → plaintiff
- 被告/答辯人/被上訴人/Defendant/Respondent → defendant
- 保留中英文對照（如有）
- 某類無則輸出空數組

只輸出符合 schema 的 JSON，不要解釋。"""

PARTIES_FEWSHOT_ZH = """範例1（原告/被告格式）：
BETWEEN
    陳大文 (CHAN TAI MAN)              上訴人
    AND
    香港房屋委員會 (Hong Kong Housing Authority)   答辯人

輸出：
{"plaintiff":["陳大文 (CHAN TAI MAN)"],"defendant":["香港房屋委員會 (Hong Kong Housing Authority)"]}

範例2（申請人/答辯人格式）：
申請人：
    李小明
答辯人：
    入境事務處處長

輸出：
{"plaintiff":["李小明"],"defendant":["入境事務處處長"]}"""

# English prompts for Call 1 (parties): system instructions followed by the
# few-shot examples that extract_parties prepends to the user message.
PARTIES_SYSTEM_EN = """You are a Hong Kong legal document information extraction assistant.
Extract all complete names/organization names of parties from the beginning of the judgment.

Format Recognition:
1. English format: BETWEEN ... AND ...
2. Chinese format: 申請人 ... 對/訴 答辯人 ...
3. Mixed format: Plaintiff ... Defendant ...

Classification Rules:
- Plaintiff/Claimant/Appellant/Applicant/原告/申索人/上訴人/覆核申請人 → plaintiff
- Defendant/Respondent/被告/答辯人/被上訴人 → defendant
- Preserve bilingual names (if any)
- Output empty array if none

Output only JSON conforming to schema, no explanation."""

PARTIES_FEWSHOT_EN = """Example 1 (Plaintiff/Defendant format):
BETWEEN
    Dr Paul KI Ping-ki              1st Plaintiff
    Hong Kong Washington Company    2nd Plaintiff
    AND
    Next Magazine Publishing Ltd    1st Defendant

Output:
{"plaintiff":["Dr Paul KI Ping-ki","Hong Kong Washington Company"],"defendant":["Next Magazine Publishing Ltd"]}

Example 2 (Applicant/Respondent format):
Between:
MO YUK PING
Applicant
and
HONG KONG SPECIAL ADMINISTRATIVE REGION
Respondent

Output:
{"plaintiff":["MO YUK PING"],"defendant":["HONG KONG SPECIAL ADMINISTRATIVE REGION"]}"""


def extract_parties(client: OllamaClient, context: str, lang: str = 'zh') -> dict:
    """Call 1: ask the model for plaintiff/defendant names.

    Only the first 5000 characters of *context* are sent. ``lang == 'zh'``
    selects the Chinese prompt set, any other value the English one. The
    reply is constrained by PARTIES_SCHEMA and is not validated further.
    """
    header = context[:5000]

    if lang == 'zh':
        system_prompt = PARTIES_SYSTEM_ZH
        instruction = "請從以下判決書開頭部分抽取："
        examples = PARTIES_FEWSHOT_ZH
    else:
        system_prompt = PARTIES_SYSTEM_EN
        instruction = "Please extract from the following judgment header:"
        examples = PARTIES_FEWSHOT_EN

    # Few-shot examples first, then the instruction and the fenced excerpt.
    user_prompt = f"{examples}\n\n{instruction}\n```\n{header}\n```"
    return client.chat_json_with_retry(system_prompt, user_prompt, PARTIES_SCHEMA)


# --- Call 2: 事由 + 標的 ----------------------------------------------------

def get_reason_object_schema(lang: str = 'zh') -> dict:
    """Return the JSON Schema for Call 2 (case_reason + case_object).

    ``case_reason`` is capped at 100 characters for ``'zh'`` and at 200
    characters for any other language.
    """
    reason_cap = 200
    if lang == 'zh':
        reason_cap = 100

    properties = {
        "case_reason": {"type": "string", "maxLength": reason_cap},
        "case_object": {"type": "array", "items": {"type": "string"}},
    }
    return {
        "type": "object",
        "properties": properties,
        "required": ["case_reason", "case_object"],
    }

# Chinese prompts for Call 2 (cause of action + subject matter): system
# instructions followed by a single example output.
REASON_OBJECT_SYSTEM_ZH = """從香港判決書中抽取：

1. case_reason（事由）：
   - 嚴格 ≤100 字，單句
   - 結構：[原告身份] + [針對什麼事件/行為] + [向誰] + [提出什麼請求]
   - 覆核/上訴案件須註明對哪個裁決提出覆核（含日期/案號）
   - 嚴禁包含：判決結果、法庭分析、案發細節、證據評估
   - 只描述訴訟的起因和請求，不涉及法庭的判斷

2. case_object（標的物）：
   - 訴訟請求指向的實體權利或利益
   - 例：汽車、黃金、珠寶、不動產產權、撫養權、人身傷害賠償、合同履行、房產所有權、精神困擾賠償、居留權
   - 合併本質相同的標的
   - 嚴禁：證據材料、程序性訴求（如"要求法庭裁決"）、法律條文名稱

只輸出 JSON。"""

REASON_OBJECT_FEWSHOT_ZH = """範例輸出：
{"case_reason":"申索人為商場保安員，就被告於2023年7月在商場毆打申索人造成的人身傷害向被告提出損害賠償申索。","case_object":["人身傷害賠償","醫療費用","精神困擾賠償"]}"""

# English prompts for Call 2 (cause of action + subject matter).
# The stated length limit is 200 characters so that it agrees with the schema
# cap, the validator and the final truncation applied for non-Chinese input.
REASON_OBJECT_SYSTEM_EN = """Extract from Hong Kong judgment:

1. case_reason (Cause of Action):
   - Strictly ≤200 characters, single sentence
   - Structure: [Plaintiff's identity] + [regarding what event/conduct] + [against whom] + [what relief sought]
   - For judicial review/appeal cases, specify which decision is being challenged (with date/case number)
   - MUST NOT include: judgment results, court analysis, incident details, evidence assessment
   - Only describe the cause and relief sought, not the court's determination

2. case_object (Subject Matter):
   - Tangible rights or interests targeted by the claim
   - Examples: vehicle, gold, jewelry, property title, custody, personal injury damages, contract performance, property ownership, distress damages, right of abode
   - Merge essentially identical subjects
   - MUST NOT include: evidentiary materials, procedural requests (e.g., "seeking court ruling"), names of statutes

Output only JSON."""

REASON_OBJECT_FEWSHOT_EN = """Example Output:
{"case_reason":"Plaintiff, a shopping mall security guard, claims damages from defendant for personal injuries caused by defendant's assault at the mall in July 2023.","case_object":["personal injury damages","medical expenses","distress damages"]}"""


def _reason_object_validator(out: dict, lang: str = 'zh') -> tuple[bool, str]:
    r = out.get("case_reason", "")
    max_length = 100 if lang == 'zh' else 200  # 英文允許 2 倍字符數
    target_length = 80 if lang == 'zh' else 160  # 建議壓縮目標

    if len(r) > max_length:
        if lang == 'zh':
            return False, f"case_reason 共 {len(r)} 字，超過 {max_length} 字上限，請壓縮到 {target_length} 字以內。"
        else:
            return False, f"case_reason has {len(r)} characters, exceeds {max_length} limit, please compress to within {target_length}."
    if not out.get("case_object"):
        if lang == 'zh':
            return False, "case_object 不能為空。"
        else:
            return False, "case_object cannot be empty."

    # 檢查是否包含判決結果性詞彙（嚴禁）
    RESULT_KEYWORDS = [
        "駁回", "拒絕", "勝訴", "敗訴", "維持", "撤銷", "發還",
        "判給", "獲判", "判處", "部分勝訴",
        "dismissed", "allowed", "granted", "refused", "upheld", "quashed",
    ]
    for keyword in RESULT_KEYWORDS:
        if keyword in r:
            if lang == 'zh':
                return False, f"case_reason 不應包含判決結果詞彙「{keyword}」，請只描述訴訟起因和請求。"
            else:
                return False, f"case_reason should not contain judgment result term '{keyword}', describe only cause and relief sought."

    return True, ""


def extract_reason_object(client: OllamaClient, context: str, lang: str = 'zh') -> dict:
    """Call 2: extract the cause of action and the subject matter.

    The first 5000 characters of *context* are sent together with the
    language-specific prompts. The reply is checked by
    _reason_object_validator during retries, and ``case_reason`` is finally
    cut to the language's character cap (100 for ``'zh'``, 200 otherwise) in
    case the model never complied.
    """
    use_zh = lang == 'zh'
    excerpt = context[:5000]

    if use_zh:
        system_prompt = REASON_OBJECT_SYSTEM_ZH
        examples = REASON_OBJECT_FEWSHOT_ZH
        instruction = "請從以下判決書開頭部分抽取："
        reason_cap = 100
    else:
        system_prompt = REASON_OBJECT_SYSTEM_EN
        examples = REASON_OBJECT_FEWSHOT_EN
        instruction = "Please extract from the following judgment header:"
        reason_cap = 200

    user_prompt = f"{examples}\n\n{instruction}\n```\n{excerpt}\n```"

    def check(candidate: dict) -> tuple[bool, str]:
        return _reason_object_validator(candidate, lang)

    result = client.chat_json_with_retry(system_prompt, user_prompt,
                                         get_reason_object_schema(lang),
                                         validator=check)
    # Hard cap as a last resort when retries did not bring the length down.
    reason = result["case_reason"]
    if len(reason) > reason_cap:
        result["case_reason"] = reason[:reason_cap]
    return result


# --- Call 3: 判決結果 -------------------------------------------------------

# JSON Schema for Call 3: a required array of {charge, result} objects,
# both fields being required strings.
JUDGMENT_RESULT_SCHEMA = {
    "type": "object",
    "properties": {
        "judgment_result": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "charge": {"type": "string"},
                    "result": {"type": "string"},
                },
                "required": ["charge", "result"],
            },
        }
    },
    "required": ["judgment_result"],
}

# Chinese prompts for Call 3 (judgment result). The level annotations the
# prompt asks for, "(責任問題)" / "(損失範圍)", are what _judgment_validator
# looks for in each charge.
JUDGMENT_RESULT_SYSTEM_ZH = """從香港判決書尾部的命令/裁定部分抽取所有判決結果。

重要提示：
- 判決結果通常在判決書的最後部分
- 常見標記：JUDGMENT, ORDER, CONCLUSION, DISPOSITION, 判決, 命令, 裁定, 頒令
- 可能包含：勝訴/敗訴、具體金額、訟費安排、上訴結果

拆分原則：
- 多項請求 → 分條
- "責任判定" 與 "損失/金額計算" 兩個層面 → 必須分條
- 每條 charge 必須以 "(責任問題)" 或 "(損失範圍)" 結尾標註層次
- result 必須包含：
    a) 明確結果（勝訴/敗訴/部分勝訴/維持/撤銷/駁回/發還等）
    b) 2-3 個關鍵法庭理由（如有）
    c) 具體金額、利率或命令內容（如有）

只輸出 JSON。"""

JUDGMENT_RESULT_FEWSHOT_ZH = """範例輸出：
{"judgment_result":[
  {"charge":"申索人就毆打事件的人身傷害索償 (責任問題)","result":"勝訴。法庭接納申索人證供可信，閉路電視顯示被告先動手，被告亦承認部分情節。"},
  {"charge":"醫療費及精神困擾賠償金額 (損失範圍)","result":"部分勝訴。判給醫療費HK$8,500及一般損害賠償HK$20,000，合共HK$28,500，連同利息及訟費。"}
]}"""

# English prompts for Call 3 (judgment result). The level annotations the
# prompt asks for, "(liability issue)" / "(quantum issue)", are what
# _judgment_validator looks for in each charge.
JUDGMENT_RESULT_SYSTEM_EN = """Extract all judgment results from the order/disposition section at the end of Hong Kong judgment.

Important Notes:
- Judgment results are usually at the end of the judgment
- Common markers: JUDGMENT, ORDER, CONCLUSION, DISPOSITION, 判決, 命令, 裁定, 頒令
- May include: success/dismissal, specific amounts, costs arrangements, appeal results

Splitting Principles:
- Multiple claims → separate items
- "Liability determination" vs "Quantum/damages assessment" → must be separate items
- Each charge must end with "(liability issue)" or "(quantum issue)" to mark the level
- result must include:
    a) Clear outcome (allowed/dismissed/partially allowed/upheld/quashed/remitted, etc.)
    b) 2-3 key court reasons (if any)
    c) Specific amounts, interest rates or order details (if any)

Output only JSON."""

JUDGMENT_RESULT_FEWSHOT_EN = """Example Output:
{"judgment_result":[
  {"charge":"Plaintiff's claim for personal injury from assault (liability issue)","result":"Allowed. Court accepted plaintiff's testimony as credible, CCTV showed defendant struck first, defendant also admitted parts of the incident."},
  {"charge":"Medical expenses and distress damages quantum (quantum issue)","result":"Partially allowed. Awarded medical expenses HK$8,500 and general damages HK$20,000, totaling HK$28,500, with interest and costs."}
]}"""


def _judgment_validator(out: dict, lang: str = 'zh') -> tuple[bool, str]:
    items = out.get("judgment_result", [])
    if not items:
        if lang == 'zh':
            return False, "judgment_result 不能為空。"
        else:
            return False, "judgment_result cannot be empty."

    if lang == 'zh':
        bad = [i for i in items
               if "責任問題" not in i.get("charge", "")
               and "損失範圍" not in i.get("charge", "")]
        if bad:
            return False, (f"有 {len(bad)} 條 charge 未標註層次。"
                           f"每條 charge 必須以 '(責任問題)' 或 '(損失範圍)' 結尾。")
    else:
        bad = [i for i in items
               if "liability issue" not in i.get("charge", "").lower()
               and "quantum issue" not in i.get("charge", "").lower()]
        if bad:
            return False, (f"{len(bad)} charge items lack level annotation. "
                           f"Each charge must end with '(liability issue)' or '(quantum issue)'.")

    return True, ""


def extract_judgment_result(client: OllamaClient, context: str, lang: str = 'zh') -> dict:
    """Call 3: extract the list of charges and their outcomes.

    *context* is sent in full (the caller is expected to have assembled the
    head and tail of the judgment). Replies are constrained by
    JUDGMENT_RESULT_SCHEMA and checked by _judgment_validator during retries.
    """
    if lang == 'zh':
        system_prompt = JUDGMENT_RESULT_SYSTEM_ZH
        examples = JUDGMENT_RESULT_FEWSHOT_ZH
        instruction = "請從以下判決書片段（開頭5000字符 + 尾部5000字符）抽取："
    else:
        system_prompt = JUDGMENT_RESULT_SYSTEM_EN
        examples = JUDGMENT_RESULT_FEWSHOT_EN
        instruction = ("Please extract from the following judgment segments "
                       "(first 5000 + last 5000 characters):")

    user_prompt = f"{examples}\n\n{instruction}\n```\n{context}\n```"

    def check(candidate: dict) -> tuple[bool, str]:
        return _judgment_validator(candidate, lang)

    return client.chat_json_with_retry(system_prompt, user_prompt,
                                       JUDGMENT_RESULT_SCHEMA,
                                       validator=check)


# --- Call 4: 涉及實體 -------------------------------------------------------

# JSON Schema for Call 4: a required array of {entity_name, reason} objects,
# both fields being required strings.
ENTITIES_SCHEMA = {
    "type": "object",
    "properties": {
        "involved_entities": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "entity_name": {"type": "string"},
                    "reason": {"type": "string"},
                },
                "required": ["entity_name", "reason"],
            },
        }
    },
    "required": ["involved_entities"],
}

# Chinese prompts for Call 4 (involved entities): system instructions listing
# what must and must not be reported, followed by an example output.
ENTITIES_SYSTEM_ZH = """從香港判決書中抽取所有相關實體（自然人/法人/組織/機構）。

必須包含：
- 主審法官 / 審裁官 / 裁判官（通常在判決書開頭或結尾署名）
- 雙方代表律師、大律師（通常在判決書結尾的 Representation 部分）
- 判決中引用的先例所提及的法官
  reason 須寫明：在XX案[案號]中擔任XX職位，闡述XX法律原則
- 涉案的政府部門、公司、機構（如：入境事務處處長、律政司司長）

嚴禁包含：
- 法案/條例名（如《侵權條例》、Cap.xxx、《基本法》）
- 純案例名稱（如 Donoghue v Stevenson）
- 文獻、期刊名

只輸出 JSON。"""

ENTITIES_FEWSHOT_ZH = """範例輸出：
{"involved_entities":[
  {"entity_name":"林希維審裁官","reason":"本案主審審裁官，負責認定事實及裁決。"},
  {"entity_name":"終審法院常任法官李義","reason":"在Tang Kwok Wah v HKSAR [2019] HKCFA 23 中擔任主筆法官，闡述舉證責任原則，本案第34段引用其判詞。"},
  {"entity_name":"康樂文化事務署","reason":"涉案場所通州街公園的管理機構。"}
]}"""

# English prompts for Call 4 (involved entities): system instructions listing
# what must and must not be reported, followed by an example output.
ENTITIES_SYSTEM_EN = """Extract all relevant entities (natural persons/legal persons/organizations/institutions) from Hong Kong judgment.

Must include:
- Presiding judge/adjudicator/magistrate (usually signed at beginning or end of judgment)
- Counsel/barristers representing both parties (usually in Representation section at end)
- Judges mentioned in cited precedents
  reason must specify: served as XX position in XX case [case number], articulated XX legal principle
- Government departments, companies, institutions involved (e.g., Director of Immigration, Secretary for Justice)

MUST NOT include:
- Statute/ordinance names (e.g., Tort Ordinance, Cap.xxx, Basic Law)
- Pure case names (e.g., Donoghue v Stevenson)
- Literature, journal names

Output only JSON."""

ENTITIES_FEWSHOT_EN = """Example Output:
{"involved_entities":[
  {"entity_name":"Hon Leong JA","reason":"Presiding judge in this case, responsible for fact-finding and adjudication."},
  {"entity_name":"Chief Justice Li","reason":"Served as lead judge in Tang Kwok Wah v HKSAR [2019] HKCFA 23, articulated burden of proof principles, cited in paragraph 34 of this judgment."},
  {"entity_name":"Leisure and Cultural Services Department","reason":"Management authority of Tung Chau Street Park, the incident location."}
]}"""


def _entities_validator(out: dict, lang: str = 'zh') -> tuple[bool, str]:
    ents = out.get("involved_entities", [])
    if not ents:
        if lang == 'zh':
            return False, "involved_entities 不能為空，至少要有主審法官。"
        else:
            return False, "involved_entities cannot be empty, must include at least the presiding judge."

    # 檢查黑名單（條例、法案、案例名稱）
    bad = []
    for e in ents:
        name = e.get("entity_name", "")
        # 檢查是否包含黑名單關鍵詞
        if any(k in name for k in ENTITY_NAME_BLACKLIST):
            bad.append(name)
        # 檢查是否為案例名稱格式（包含 v 或 訴）
        if (" v " in name or " v. " in name or " 訴 " in name or
            " vs " in name or " vs. " in name):
            bad.append(name)

    if bad:
        if lang == 'zh':
            return False, f"以下實體疑為條例/法案/案例名稱，應移除：{bad[:3]}"
        else:
            return False, f"Following entities appear to be statutes/acts/case names, should be removed: {bad[:3]}"

    return True, ""


def extract_entities(client: OllamaClient, context: str, lang: str = 'zh') -> dict:
    """Call 4: extract the persons and organisations involved in the case.

    At most the first 6500 characters of *context* are sent. Replies are
    constrained by ENTITIES_SCHEMA and checked by _entities_validator during
    retries.
    """
    excerpt = context[:6500]

    if lang == 'zh':
        system_prompt = ENTITIES_SYSTEM_ZH
        examples = ENTITIES_FEWSHOT_ZH
        instruction = "請從以下片段（多處關鍵詞召回拼接）抽取所有涉及實體："
    else:
        system_prompt = ENTITIES_SYSTEM_EN
        examples = ENTITIES_FEWSHOT_EN
        instruction = ("Please extract all involved entities from the "
                       "following segments (keyword-based retrieval):")

    user_prompt = f"{examples}\n\n{instruction}\n```\n{excerpt}\n```"

    def check(candidate: dict) -> tuple[bool, str]:
        return _entities_validator(candidate, lang)

    return client.chat_json_with_retry(system_prompt, user_prompt,
                                       ENTITIES_SCHEMA,
                                       validator=check)


# --- Call 5: 判決總結（基於已抽取結果 + 分析段，不從原文重生） -----------

def get_summary_schema(lang: str = 'zh') -> dict:
    """Return the JSON Schema for Call 5 (judgment_summary).

    The summary is capped at 300 characters for ``'zh'`` and at 500
    characters for any other language.
    """
    summary_cap = 500
    if lang == 'zh':
        summary_cap = 300

    summary_field = {"type": "string", "maxLength": summary_cap}
    return {
        "type": "object",
        "properties": {"judgment_summary": summary_field},
        "required": ["judgment_summary"],
    }

# Chinese system prompt for Call 5 (summary). The stated 300-character limit
# matches the cap used by get_summary_schema and _summary_validator for 'zh'.
SUMMARY_SYSTEM_ZH = """根據已抽取的結構化字段 + 法庭分析段，撰寫判決總結。

四要素結構（必須全部涵蓋，連貫成單段）：
(1) 案件背景：1-2 句交代起因與當事人關係
(2) 核心爭議焦點
(3) 法庭法律分析與推理（核心重點）：
    - 如何評估證據？
    - 接受 / 拒絕主張的邏輯？
    - 引用了哪些關鍵法律或判例？
(4) 最終裁決結果及命令

嚴格 ≤300 字。只輸出 JSON。"""

# English system prompt for Call 5 (summary). The stated 500-character limit
# matches the cap used by get_summary_schema and _summary_validator for
# non-Chinese input.
SUMMARY_SYSTEM_EN = """Based on extracted structured fields + court analysis section, write judgment summary.

Four-element structure (must cover all, in coherent single paragraph):
(1) Case background: 1-2 sentences on cause and parties' relationship
(2) Core issues in dispute
(3) Court's legal analysis and reasoning (core focus):
    - How was evidence assessed?
    - Logic for accepting/rejecting claims?
    - What key laws or precedents were cited?
(4) Final judgment and orders

Strictly ≤500 characters. Output only JSON."""


def _summary_validator(out: dict, lang: str = 'zh') -> tuple[bool, str]:
    s = out.get("judgment_summary", "")
    max_length = 300 if lang == 'zh' else 500  # 英文允許約 1.67 倍字符數
    min_length = 80 if lang == 'zh' else 120  # 英文最小長度也相應增加

    if len(s) > max_length:
        if lang == 'zh':
            return False, f"summary 共 {len(s)} 字，超過 {max_length} 字上限，請壓縮。"
        else:
            return False, f"summary has {len(s)} characters, exceeds {max_length} limit, please compress."
    if len(s) < min_length:
        if lang == 'zh':
            return False, "summary 過短，請完整覆蓋四要素。"
        else:
            return False, "summary too short, please cover all four elements."
    return True, ""


def extract_summary(client: OllamaClient,
                     prior: dict, analysis: str, lang: str = 'zh') -> dict:
    """Write the judgment summary from already-extracted fields.

    The user prompt contains ``prior`` serialised as indented JSON plus at
    most the first 3500 characters of ``analysis``.  The call goes through
    ``client.chat_json_with_retry`` with the language-specific schema and
    ``_summary_validator``.  If the returned ``judgment_summary`` is still
    longer than the language limit (300 for ``'zh'``, otherwise 500) it is
    cut to that length before the result dict is returned.
    """
    zh_mode = lang == 'zh'
    system = SUMMARY_SYSTEM_ZH if zh_mode else SUMMARY_SYSTEM_EN
    schema = get_summary_schema(lang)
    max_length = 300 if zh_mode else 500

    # Render the two variable prompt parts once, then drop them into the
    # language-specific template.
    prior_json = json.dumps(prior, ensure_ascii=False, indent=2)
    excerpt = analysis[:3500]

    if zh_mode:
        user = f"""已抽取的字段：
```json
{prior_json}
```

法庭分析節選：
```
{excerpt}
```

請按四要素撰寫 ≤300 字的 judgment_summary。"""
    else:
        user = f"""Extracted fields:
```json
{prior_json}
```

Court analysis excerpt:
```
{excerpt}
```

Please write judgment_summary ≤500 characters covering four elements."""

    def _validate(candidate):
        return _summary_validator(candidate, lang)

    out = client.chat_json_with_retry(system, user, schema,
                                      validator=_validate)

    # Hard cap, applied even when the validated retries did not bring the
    # length under the limit.
    summary_text = out["judgment_summary"]
    if len(summary_text) > max_length:
        out["judgment_summary"] = summary_text[:max_length]
    return out


# =============================================================================
# 4. 全局校驗與後處理
# =============================================================================

# Substrings that disqualify a case_location entry (courts, venues, buildings,
# streets and the like); validate_and_fix drops any entry containing one.
LOCATION_BLACKLIST = [
    "法院", "法庭", "審裁處", "公園", "大廈", "大樓", "商場",
    "街", "道路", "村", "中心", "醫院", "酒店", "車站",
]
# Substrings that disqualify an involved_entities entry by its entity_name
# (statutes, law reports, journals, case citations).
ENTITY_NAME_BLACKLIST = [
    "條例", "Cap.", "法案", "案例彙編", "Reports",
    "期刊", "Journal",
    # Case-name markers
    " v ", " v. ", " 訴 ", " vs ", " vs. ",
    "HKCFAR", "HKCFA", "HKCA", "HKCFI",  # keep case citations from being taken as entities
]


def validate_and_fix(result: dict, lang: str = 'zh') -> tuple[dict, list[str]]:
    """Apply the final rule-based clean-up to ``result`` and collect warnings.

    ``result`` is modified in place and also returned.  Steps:

    * ``case_location``: drop falsy entries and entries containing a
      ``LOCATION_BLACKLIST`` substring; make sure "香港特別行政區" is present
      (inserted at the front when missing).
    * ``case_reason`` / ``judgment_summary``: hard-truncate to 100 / 300
      characters for ``lang == 'zh'``, otherwise 200 / 500.
    * ``involved_entities``: drop entries whose ``entity_name`` contains an
      ``ENTITY_NAME_BLACKLIST`` substring.
    * ``judgment_result``: warn about entries whose ``charge`` carries
      neither of the two layer labels.
    * Warn about key fields that are empty.

    Args:
        result: Merged extraction result.
        lang: ``'zh'`` or anything else (treated as English) for the limits.

    Returns:
        ``(result, warnings)`` where ``warnings`` is a list of messages for
        manual review.
    """
    def _as_text(value) -> str:
        # Model output may carry null / non-string values; treat them as
        # empty text instead of letting `in` / len() raise TypeError.
        return value if isinstance(value, str) else ""

    warnings: list[str] = []

    # case_location: remove courts / venues / buildings.
    locs = result.get("case_location") or []
    cleaned = [l for l in locs
               if l and not any(b in l for b in LOCATION_BLACKLIST)]
    removed = set(locs) - set(cleaned)
    if removed:
        warnings.append(f"case_location 已清理：移除 {removed}")
    if "香港特別行政區" not in cleaned:
        cleaned.insert(0, "香港特別行政區")
        # Reported separately: previously an insertion alone produced a
        # misleading "移除 set()" warning even though nothing was removed.
        warnings.append("case_location 已補上預設值：香港特別行政區")
    result["case_location"] = cleaned

    # Hard length caps (limits depend on the language).
    reason_max = 100 if lang == 'zh' else 200
    summary_max = 300 if lang == 'zh' else 500
    for field, limit in (("case_reason", reason_max),
                         ("judgment_summary", summary_max)):
        value = _as_text(result.get(field))
        if len(value) > limit:
            warnings.append(f"{field} > {limit} 字，已截斷")
            result[field] = value[:limit]

    # involved_entities: remove statutes / publications / case citations.
    ents = result.get("involved_entities") or []
    cleaned_ents = [e for e in ents
                    if not any(k in _as_text(e.get("entity_name"))
                               for k in ENTITY_NAME_BLACKLIST)]
    if len(cleaned_ents) != len(ents):
        warnings.append(
            f"involved_entities 移除 {len(ents) - len(cleaned_ents)} 條疑似條例/文獻")
    result["involved_entities"] = cleaned_ents

    # judgment_result: flag entries missing a layer label.
    # NOTE(review): only the Chinese labels are checked, regardless of
    # `lang` -- confirm which labels the English prompt asks the model for.
    for jr in result.get("judgment_result", []) or []:
        charge = _as_text(jr.get("charge"))
        if "責任問題" not in charge and "損失範圍" not in charge:
            warnings.append(
                f"judgment_result 條目缺層次標註：{charge[:40]}")

    # Empty-field warnings.
    for k in ("plaintiff", "defendant", "case_object",
              "judgment_result", "involved_entities"):
        if not result.get(k):
            warnings.append(f"{k} 為空，請人工複核")

    return result, warnings


# =============================================================================
# 5. 主管線
# =============================================================================

def run_pipeline(text: str, model: str) -> dict:
    """Run the whole extraction pipeline on one judgment text.

    Stages, each announced on stderr: language detection, cleaning plus
    rule-based metadata and context gathering, four model calls (parties,
    reason/object, judgment result, entities), the summary call fed with the
    earlier results, and the final ``validate_and_fix`` pass whose warnings
    are also written to stderr.  Returns the validated result dict.
    """
    def log(message):
        print(message, file=sys.stderr)

    log("[0/7] 檢測語言...")
    lang = detect_language(text)
    lang_label = '中文' if lang == 'zh' else '英文'
    log(f"      檢測到語言：{lang_label} (lang={lang})")

    log("[1/7] 預處理 + 關鍵詞召回...")
    text = clean_text(text)
    meta = extract_metadata_by_rule(text, lang)
    ctx = gather_all(text)

    log(f"      規則元數據：{meta}")
    log("      召回片段：")
    groups = ("parties", "reason_object", "judgment_result",
              "entities", "analysis")
    for group in groups:
        hits = ctx[f"_{group}_hits"]
        # A hit marker of "0" is reported as a direct slice instead of a count.
        hits_info = "直接截取" if hits == "0" else f"hits={hits}"
        log(f"         {group:16s} len={len(ctx[group]):5d}  {hits_info}")

    client = OllamaClient(model=model)

    log("[2/7] 抽取當事人...")
    parties = extract_parties(client, ctx["parties"], lang)

    log("[3/7] 抽取事由與標的...")
    reason_obj = extract_reason_object(client, ctx["reason_object"], lang)

    log("[4/7] 抽取判決結果...")
    judgment = extract_judgment_result(client, ctx["judgment_result"], lang)

    log("[5/7] 抽取涉及實體...")
    # Entity context: head of the parties segment (it carries counsel names)
    # followed by the citation segment, capped at 6500 characters overall.
    parties_head = ctx["parties"][:2500]
    joined = parties_head + "\n\n[…]\n\n" + ctx["entities"]
    entities_ctx = joined[:6500]
    entities = extract_entities(client, entities_ctx, lang)

    # Everything extracted so far is given to the summary call as context.
    interim_for_summary = {
        **parties,
        **reason_obj,
        **judgment,
        **entities,
        "jurisdiction_name": meta["jurisdiction_name"],
    }

    log("[6/7] 撰寫判決總結...")
    summary = extract_summary(client, interim_for_summary, ctx["analysis"], lang)

    final = {
        "plaintiff": parties["plaintiff"],
        "defendant": parties["defendant"],
        "jurisdiction_code": meta["jurisdiction_code"],
        "jurisdiction_name": meta["jurisdiction_name"],
        "case_location": meta["case_location"],
        "case_reason": reason_obj["case_reason"],
        "case_object": reason_obj["case_object"],
        "judgment_result": judgment["judgment_result"],
        "judgment_summary": summary["judgment_summary"],
        "involved_entities": entities["involved_entities"],
    }

    log("[7/7] 校驗與後處理...")
    final, warnings = validate_and_fix(final, lang)
    for warning in warnings:
        log(f"  ⚠️  {warning}")

    return final


# =============================================================================
# 6. YAML 輸出（長字串用 > 折疊；含特殊字符的自動雙引號）
# =============================================================================

class FoldedStr(str):
    """Marker ``str`` subclass for values to be dumped in YAML folded (``>``) style."""


def _folded_str_representer(dumper, data):
    return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=">")


def _safe_str_representer(dumper, data):
    """含 :, #, - 開頭的字符串強制雙引號"""
    if data and (":" in data or data.startswith("#") or data.startswith("- ")):
        return dumper.represent_scalar("tag:yaml.org,2002:str", data,
                                        style='"')
    return dumper.represent_scalar("tag:yaml.org,2002:str", data)


# Register the custom representers at import time: FoldedStr values go
# through _folded_str_representer, plain str values through
# _safe_str_representer.  No Dumper argument is passed, so this relies on
# yaml.add_representer's default target.
yaml.add_representer(FoldedStr, _folded_str_representer)
yaml.add_representer(str, _safe_str_representer)


def to_yaml(result: dict) -> str:
    """Serialise ``result`` to a YAML document string.

    Non-empty ``case_reason`` and ``judgment_summary`` values are wrapped in
    ``FoldedStr`` so they are dumped in folded (``>``) style.  The wrapping
    is done on a shallow copy, so the caller's dict is left untouched.

    Args:
        result: Extraction result to serialise.

    Returns:
        The YAML text (unicode kept as-is, key order preserved, block style,
        width 100).
    """
    # Shallow copy: only top-level values are replaced, so copying the
    # mapping itself is enough to avoid mutating the argument.
    doc = result.copy()
    for key in ("case_reason", "judgment_summary"):
        if doc.get(key):
            doc[key] = FoldedStr(doc[key])
    return yaml.dump(doc, allow_unicode=True, sort_keys=False,
                     default_flow_style=False, width=100)


# =============================================================================
# CLI
# =============================================================================

def main() -> None:
    """Command-line entry point.

    Parses the arguments, loads the judgment text (the whole file, or the
    ``content`` field when the input path ends in ``.json``), runs
    ``run_pipeline`` and writes the YAML produced by ``to_yaml`` to ``--out``
    or to stdout.  With ``--debug-dump`` the raw result is additionally
    written as JSON before the YAML conversion.

    Exits with status 1 when a ``.json`` input yields no usable ``content``.
    """
    ap = argparse.ArgumentParser(
        description="香港判決書結構化抽取（本地 Ollama 版）")
    ap.add_argument("input", help="判決書文本路徑（.txt 或 .json）")
    ap.add_argument("--model", default=DEFAULT_MODEL, help="Ollama 模型名")
    ap.add_argument("--out", default=None, help="輸出 YAML 路徑（默認 stdout）")
    ap.add_argument("--debug-dump", default=None,
                    help="額外輸出原始 JSON 結果到該路徑（便於 diff）")
    args = ap.parse_args()

    # A .json input is read through its "content" field.
    input_path = Path(args.input)
    if input_path.suffix.lower() == '.json':
        # Uses the module-level `json` import.  A function-local
        # `import json` in this branch would make `json` a local name for
        # the whole function, so the --debug-dump path below would raise
        # UnboundLocalError whenever the input is not a .json file.
        data = json.loads(input_path.read_text(encoding="utf-8"))
        # A non-object document (list, string, ...) has no "content" field.
        text = data.get("content", "") if isinstance(data, dict) else ""
        if not text:
            print("錯誤：JSON 文件中沒有 'content' 字段", file=sys.stderr)
            sys.exit(1)
    else:
        text = input_path.read_text(encoding="utf-8")

    result = run_pipeline(text, args.model)

    if args.debug_dump:
        Path(args.debug_dump).write_text(
            json.dumps(result, ensure_ascii=False, indent=2),
            encoding="utf-8")

    yaml_str = to_yaml(result)
    if args.out:
        Path(args.out).write_text(yaml_str, encoding="utf-8")
        print(f"\n✅ 已寫入 {args.out}", file=sys.stderr)
    else:
        print(yaml_str)


# Run the CLI only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()