""" hk_case_extractor.py ========================================================== 香港判決書結構化字段抽取管線 基於 本地 Ollama 小模型 + 分階段抽取 + JSON Schema 強制 + 校驗重試 設計理念 -------- 不直接把幾萬字餵給模型,而是: 1. 預處理:規則去噪 + 切段,純規則抽司法區域/案號等高確定性字段 2. 定位:每個字段只截取對應的高信號區段(通常 < 4k 字) 3. 分組抽取:拆成 5 次獨立 Ollama 調用,每次只負責 1-3 個字段 4. Schema 強制:用 Ollama 0.5+ 的 format= 約束輸出 5. 校驗+重試:對字數、黑名單、結構標註逐項校驗 6. judgment_summary 不從原文重生,而從前 4 步結果 + 一段分析段生成 依賴 ---- pip install requests pyyaml 本地需運行:ollama serve 模型:ollama pull qwen2.5:7b-instruct (推薦,中文抽取甜點) 或 ollama pull glm4:9b 使用 ---- python hk_case_extractor.py python hk_case_extractor.py case.txt --model qwen2.5:7b-instruct --out result.yaml """ from __future__ import annotations import argparse import json import re import sys from dataclasses import dataclass from pathlib import Path from typing import Any import requests import yaml # ============================================================================= # 配置 # ============================================================================= OLLAMA_URL = "https://openai.iconsz.com/ollama3090/api/chat" DEFAULT_MODEL = "qwen2.5:7b-instruct" DEFAULT_TIMEOUT = 240 MAX_RETRIES = 2 # ============================================================================= # 1. 
# =============================================================================
# 1. Preprocessing: denoising + segmentation + rule-based metadata extraction
# =============================================================================

# Neutral-citation court code -> full Chinese court name.
# "HKMC" and "HKMagC" both map to the same court name.
JURISDICTION_MAP: dict[str, str] = {
    "HKCFA": "香港特別行政區終審法院",
    "HKCA": "香港特別行政區高等法院上訴法庭",
    "HKCFI": "香港特別行政區高等法院原訟法庭",
    "HKDC": "香港特別行政區區域法院",
    "HKMC": "香港特別行政區裁判法院",
    "HKMagC": "香港特別行政區裁判法院",
    "HKSCT": "香港特別行政區小額錢債審裁處",
    "HKLT": "香港特別行政區土地審裁處",
    "HKLD": "香港特別行政區勞資審裁處",
    "HKCT": "香港特別行政區競爭事務審裁處",
    "HKCorC": "香港特別行政區死因裁判法庭",
}

# Neutral citation, e.g. "[2023] HKCFI 123": groups are (year, court, number).
NEUTRAL_CITATION_RE = re.compile(
    r"\[(\d{4})\]\s*(HKCFA|HKCA|HKCFI|HKDC|HKMagC|HKMC|HKSCT|HKLT|HKLD|HKCT|HKCorC)\s*(\d+)",
    re.I,
)

# Case number, e.g. "SCTC 1234/2023" or "HCA NO. 12 OF 2020".
CASE_NO_RE = re.compile(
    r"(FACV|FACC|FAMV|FAMC|CACV|CACC|HCA|HCAL|HCMP|HCCW|HCB|DCCJ|DCMP|SCTC|LBTC|LDPD)"
    r"\s*(?:NO\.?)?\s*\d+\s*(?:OF|/|\sof\s)\s*\d{4}",
    re.I,
)


def clean_text(raw: str) -> str:
    """Strip page furniture and normalise whitespace.

    Removes "Page X of Y" markers and stand-alone "- N -" page-number lines,
    turns runs of full-width (U+3000) spaces, ASCII spaces and tabs into a
    single ASCII space, and collapses three or more consecutive newlines
    into two.

    Args:
        raw: Raw judgment text.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    t = raw
    t = re.sub(r"Page\s+\d+\s+of\s+\d+", "", t, flags=re.I)
    t = re.sub(r"^\s*-\s*\d+\s*-\s*$", "", t, flags=re.M)
    # Full-width (ideographic) spaces.  Written as an explicit escape: a
    # literal U+3000 is indistinguishable from an ASCII space in most editors
    # and is silently lost when the file's whitespace gets normalised.
    t = re.sub(r"\u3000+", " ", t)
    t = re.sub(r"[ \t]+", " ", t)
    t = re.sub(r"\n{3,}", "\n\n", t)
    return t.strip()


def extract_metadata_by_rule(text: str) -> dict[str, Any]:
    """Extract jurisdiction and case number with rules only (no model call).

    The jurisdiction is taken from the first neutral citation found; if none
    matches, the first court in ``JURISDICTION_MAP`` (in dict order) whose
    full name, or name without the "香港特別行政區" prefix, occurs in the
    text is used.  ``case_location`` always defaults to the HKSAR.

    Args:
        text: Judgment text (normally already passed through ``clean_text``).

    Returns:
        A dict with keys ``jurisdiction_code``, ``jurisdiction_name``,
        ``case_location`` and ``case_number``; values that could not be
        determined are ``None``.
    """
    meta: dict[str, Any] = {
        "jurisdiction_code": None,
        "jurisdiction_name": None,
        "case_location": ["香港特別行政區"],
        "case_number": None,
    }

    if m := NEUTRAL_CITATION_RE.search(text):
        # The regex is case-insensitive, so compare on upper-cased codes and
        # store the canonical spelling used as the map key.
        cited = m.group(2).upper()
        for code, name in JURISDICTION_MAP.items():
            if code.upper() == cited:
                meta["jurisdiction_code"] = code
                meta["jurisdiction_name"] = name
                break

    # Fallback: reverse lookup by the court's name appearing in the text.
    if not meta["jurisdiction_code"]:
        for code, full in JURISDICTION_MAP.items():
            short = full.replace("香港特別行政區", "")
            if full in text or short in text:
                meta["jurisdiction_code"] = code
                meta["jurisdiction_name"] = full
                break

    if m := CASE_NO_RE.search(text):
        meta["case_number"] = re.sub(r"\s+", " ", m.group(0).strip())

    return meta
# -----------------------------------------------------------------------------
# Keyword + window recall (replaces brittle regex-based sectioning)
# -----------------------------------------------------------------------------
# Idea: every extraction target defines a set of "high-signal keywords".  The
# whole text is scanned and a window of +/- half_window characters around each
# hit is taken; overlapping windows are merged and the result is concatenated
# and fed to the LLM.  This does not rely on fixed section headings, so it
# works across Hong Kong judgments with very different structures.

KEYWORD_GROUPS: dict[str, list[str]] = {
    # Call 1: parties.  Concentrated in the heading, but may also appear in
    # running text.
    "parties": [
        "BETWEEN", "介乎", " AND ",
        "申索人", "原告", "原訴人", "上訴人", "覆核申請人", "覆核人", "申請人",
        "被告", "答辯人", "被上訴人", "答辯方",
        "Plaintiff", "Defendant", "Appellant", "Respondent", "Applicant",
    ],
    # Call 2: cause and subject matter.
    "reason_object": [
        # Section-heading style keywords
        "案情", "背景", "引言", "事實", "案件背景",
        "INTRODUCTION", "BACKGROUND", "FACTS", "THE FACTS",
        # Claim style keywords
        "申索", "索償", "訴因", "聲稱", "請求", "本案涉及", "本訴訟",
        "原告聲稱", "申索人聲稱", "申索人指稱",
        # Subject-matter keywords
        "賠償", "損失", "損害", "精神困擾", "經濟損失", "醫療費",
        "履行", "所有權", "占有", "撤銷", "宣告",
    ],
    # Call 3: judgment result.
    "judgment_result": [
        # Orders
        "本席命令", "本庭命令", "本席裁定", "本庭裁定", "命令如下",
        "DISPOSITION", "ORDER", "CONCLUSION", "I therefore order",
        # Outcomes
        "判決", "判給", "獲判", "判處", "支付",
        "勝訴", "敗訴", "部分勝訴", "駁回", "撤銷", "維持", "發還",
        "ALLOWED", "DISMISSED", "GRANTED", "REFUSED",
        # Concluding connectives
        "因此", "故此", "綜上",
    ],
    # Call 4: involved entities (judges, lawyers, judges in cited cases).
    "entities": [
        # Judicial titles
        "法官", "大法官", "審裁官", "裁判官", "首席法官", "常任法官", "非常任法官",
        "Hon.", " J.", " JA ", " CJ ", " PJ ", " NPJ ",
        # Representation
        "代表", "大律師", "律師", "資深大律師", "代表律師",
        "Counsel", "Solicitor",
        # Case citations (judge names tend to appear nearby)
        " v ", " v. ", "[19", "[20",
    ],
    # Call 5: court analysis (core input for the summary).
    # Note: avoid one/two-character generic words such as "分析", "理由",
    # "引用" -- they also occur in procedural paragraphs, tables of contents
    # and citation indexes and would spread the recall over the whole text.
    # Anchor on "本席/本庭 + verb" phrases specific to analysis paragraphs.
    "analysis": [
        "本席認為", "本席接納", "本席不接納", "本席同意", "本席不同意",
        "本席裁定", "本席拒絕", "本席認同",
        "本庭認為", "本庭接納", "本庭裁定",
        "I find", "I accept", "I do not accept", "I conclude",
        "The court finds", "In my view", "In my judgment",
        "舉證責任", "審慎責任", "鄰人原則", "替代責任", "違反", "侵權", "過失",
    ],
}


def gather_chunks(text: str,
                  keywords: list[str],
                  half_window: int = 500,
                  max_total: int = 6500,
                  case_sensitive: bool = False) -> tuple[str, int]:
    """Collect the text windows surrounding every keyword occurrence.

    Each occurrence contributes the span from ``half_window`` characters
    before it to ``half_window`` characters after it.  Overlapping spans are
    merged, and the merged spans are joined in document order with a
    ``[…]`` separator until ``max_total`` characters of source text have
    been taken (the separators are not counted).

    Args:
        text: Text to search.
        keywords: Literal strings to look for (regex-escaped before use).
        half_window: Characters kept on each side of a hit.
        max_total: Budget for the summed length of the selected spans.
        case_sensitive: Match case exactly when True.

    Returns:
        ``(joined_text, hit_count)`` where ``hit_count`` is the number of
        keyword occurrences found.  With no occurrence at all the first
        ``max_total`` characters of ``text`` are returned with a count of 0;
        empty ``text`` yields ``("", 0)``.
    """
    if not text:
        return "", 0

    flags = 0 if case_sensitive else re.IGNORECASE
    text_len = len(text)
    spans: list[tuple[int, int]] = [
        (max(0, m.start() - half_window), min(text_len, m.end() + half_window))
        for kw in keywords
        for m in re.finditer(re.escape(kw), text, flags=flags)
    ]
    if not spans:
        return text[:max_total], 0

    # Merge overlapping (or touching) spans.
    spans.sort()
    merged: list[list[int]] = []
    for start, end in spans:
        if merged and start <= merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])

    # Concatenate in document order while staying within the budget.
    pieces: list[str] = []
    used = 0
    for start, end in merged:
        length = end - start
        if used + length > max_total:
            remaining = max_total - used
            # A tail of 200 characters or fewer is dropped rather than kept.
            if remaining > 200:
                pieces.append(text[start:start + remaining])
            break
        pieces.append(text[start:end])
        used += length

    return "\n\n[…]\n\n".join(pieces), len(spans)


def gather_all(text: str) -> dict[str, str]:
    """Recall the context excerpt for every group in ``KEYWORD_GROUPS``.

    Returns:
        A dict holding, per group ``g``, the excerpt under key ``g`` and the
        hit count (as a string, kept for logging) under ``_{g}_hits``.
    """
    # (half_window, max_total) per group; tune as needed.
    params: dict[str, tuple[int, int]] = {
        "parties": (400, 3000),
        "reason_object": (500, 6000),
        "judgment_result": (500, 6500),
        "entities": (400, 6500),
        "analysis": (500, 6500),
    }
    out: dict[str, str] = {}
    for group, kws in KEYWORD_GROUPS.items():
        hw, mt = params[group]
        ctx, hits = gather_chunks(text, kws, half_window=hw, max_total=mt)
        out[group] = ctx
        out[f"_{group}_hits"] = str(hits)
    return out


# =============================================================================
# 2. Ollama client: enforced JSON Schema + retry
# =============================================================================

@dataclass
class OllamaClient:
    """Minimal client for an Ollama-style ``/api/chat`` endpoint."""

    model: str = DEFAULT_MODEL
    url: str = OLLAMA_URL
    timeout: int = DEFAULT_TIMEOUT

    def chat_json(self, system: str, user: str, schema: dict,
                  temperature: float = 0.0, num_ctx: int = 8192) -> dict:
        """Send one non-streaming chat request and parse the reply as JSON.

        The schema is passed in the ``format`` field of the payload to
        request structured output.

        Args:
            system: System prompt.
            user: User prompt.
            schema: JSON Schema the reply should conform to.
            temperature: Sampling temperature placed in ``options``.
            num_ctx: Context size placed in ``options``.

        Returns:
            The decoded JSON content of the reply message.

        Raises:
            requests.HTTPError: On a non-2xx response (``raise_for_status``).
            json.JSONDecodeError: If the content is not valid JSON even
                after stripping a surrounding Markdown code fence.
        """
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "format": schema,
            "stream": False,
            "options": {"temperature": temperature, "num_ctx": num_ctx},
        }
        r = requests.post(self.url, json=payload, timeout=self.timeout)
        r.raise_for_status()
        content = r.json()["message"]["content"]
        try:
            return json.loads(content)
        except json.JSONDecodeError:
            # Retry once after stripping a possible ```json ... ``` fence.
            stripped = re.sub(r"^```(?:json)?\s*|\s*```$", "",
                              content.strip(), flags=re.S)
            return json.loads(stripped)

    def chat_json_with_retry(self, system: str, user: str, schema: dict,
                             validator=None, **kw) -> dict:
        """Call ``chat_json`` up to ``MAX_RETRIES + 1`` times.

        Args:
            system: System prompt.
            user: User prompt.
            schema: JSON Schema for the reply.
            validator: Optional callable ``validator(result) -> (ok, hint)``.
                When it rejects a result, the hint is appended to the
                original user prompt for the next attempt.
            **kw: Forwarded to ``chat_json``.

        Returns:
            The first result accepted by ``validator`` (or the first result
            at all when no validator is given).  If no attempt is accepted,
            the most recent rejected result is returned as a best effort so
            that callers can post-process it.

        Raises:
            Exception: The last error raised by an attempt, but only when no
                attempt produced any result.
        """
        base_user = user          # hints are appended to this, not stacked
        last_out = None           # most recent result the validator rejected
        last_err = None           # most recent exception
        for _ in range(MAX_RETRIES + 1):
            try:
                out = self.chat_json(system, user, schema, **kw)
                if validator is None:
                    return out
                ok, hint = validator(out)
                if ok:
                    return out
                last_out = out
                # Feed the problem back.  Rebuilding from the original prompt
                # keeps the request from growing with every retry.
                user = (f"{base_user}\n\n上次輸出存在問題:{hint}\n"
                        f"請修正後重新輸出。")
            except Exception as e:  # transport, HTTP, decoding or validator error
                last_err = e
        # Prefer a usable (if imperfect) result over an error from another
        # attempt; this matches the behaviour when every attempt is rejected.
        if last_out is not None:
            return last_out
        if last_err is not None:
            raise last_err
        raise RuntimeError("chat_json_with_retry made no attempts "
                           "(MAX_RETRIES is negative)")
# =============================================================================
# 3. Five extraction calls: each one is responsible for one group of fields
# =============================================================================
# NOTE: the prompt literals below are multi-line; their line breaks follow the
# bullet / numbered-item structure of each prompt.


def _with_defaults(out: Any, defaults: dict[str, Any]) -> dict:
    """Return a copy of ``out`` in which every key of ``defaults`` exists
    and has the same type as its default ("" or [])."""
    result = dict(out) if isinstance(out, dict) else {}
    for key, default in defaults.items():
        if not isinstance(result.get(key), type(default)):
            result[key] = type(default)()  # fresh "" / [] per call
    return result


# --- Call 1: parties ---------------------------------------------------------

PARTIES_SCHEMA = {
    "type": "object",
    "properties": {
        "plaintiff": {"type": "array", "items": {"type": "string"}},
        "defendant": {"type": "array", "items": {"type": "string"}},
    },
    "required": ["plaintiff", "defendant"],
}

PARTIES_SYSTEM = """你是香港法律文書信息抽取助手。
從給定的判決書抬頭中抽取所有當事人完整姓名/機構名。
- 原告/申索人/上訴人/覆核申請人 → plaintiff
- 被告/答辯人/被上訴人 → defendant
- 保留中英文對照(如有)
- 某類無則輸出空數組
只輸出符合 schema 的 JSON,不要解釋。"""

PARTIES_FEWSHOT = """範例輸入:
BETWEEN
陳大文 (CHAN TAI MAN) 上訴人
AND
香港房屋委員會 (Hong Kong Housing Authority) 答辯人

範例輸出:
{"plaintiff":["陳大文 (CHAN TAI MAN)"],"defendant":["香港房屋委員會 (Hong Kong Housing Authority)"]}"""


def extract_parties(client: OllamaClient, context: str) -> dict:
    """Extract plaintiff/defendant names from the recalled parties excerpt.

    Returns:
        A dict that always has list values under ``plaintiff`` and
        ``defendant`` (empty lists when the model supplied none).
    """
    user = f"{PARTIES_FEWSHOT}\n\n請從以下片段(多處關鍵詞召回拼接)抽取:\n```\n{context[:3000]}\n```"
    out = client.chat_json_with_retry(PARTIES_SYSTEM, user, PARTIES_SCHEMA)
    return _with_defaults(out, {"plaintiff": [], "defendant": []})


# --- Call 2: cause + subject matter -----------------------------------------

REASON_OBJECT_SCHEMA = {
    "type": "object",
    "properties": {
        "case_reason": {"type": "string", "maxLength": 100},
        "case_object": {"type": "array", "items": {"type": "string"}},
    },
    "required": ["case_reason", "case_object"],
}

REASON_OBJECT_SYSTEM = """從香港判決書中抽取:

1. case_reason(事由):
   - 嚴格 ≤100 字,單句
   - 結構:[原告身份] + [針對什麼事件/行為] + [向誰] + [提出什麼請求]
   - 覆核/上訴案件須註明對哪個裁決提出覆核(含日期/案號)
   - 嚴禁包含:判決結果、法庭分析、案發細節

2. case_object(標的物):
   - 訴訟請求指向的實體權利或利益
   - 例:人身傷害賠償、合同履行、房產所有權、精神困擾賠償
   - 合併本質相同的標的
   - 嚴禁:證據材料、程序性訴求(如"要求法庭裁決")

只輸出 JSON。"""

REASON_OBJECT_FEWSHOT = """範例輸出:
{"case_reason":"申索人為商場保安員,就被告於2023年7月在商場毆打申索人造成的人身傷害向被告提出損害賠償申索。","case_object":["人身傷害賠償","醫療費用","精神困擾賠償"]}"""


def _reason_object_validator(out: dict) -> tuple[bool, str]:
    """Reject a case_reason longer than 100 characters or an empty case_object."""
    r = out.get("case_reason") or ""
    if len(r) > 100:
        return False, f"case_reason 共 {len(r)} 字,超過 100 字上限,請壓縮到 80 字以內。"
    if not out.get("case_object"):
        return False, "case_object 不能為空。"
    return True, ""


def extract_reason_object(client: OllamaClient, context: str) -> dict:
    """Extract ``case_reason`` (hard-capped at 100 characters) and ``case_object``.

    Returns:
        A dict that always has a string under ``case_reason`` and a list
        under ``case_object``.
    """
    user = (f"{REASON_OBJECT_FEWSHOT}\n\n"
            f"請從以下片段(多處關鍵詞召回拼接)抽取:\n```\n{context[:6000]}\n```")
    out = client.chat_json_with_retry(REASON_OBJECT_SYSTEM, user,
                                      REASON_OBJECT_SCHEMA,
                                      validator=_reason_object_validator)
    # The retry helper may hand back a result the validator rejected, so
    # guarantee the keys and enforce the length limit here as well.
    out = _with_defaults(out, {"case_reason": "", "case_object": []})
    out["case_reason"] = out["case_reason"][:100]
    return out


# --- Call 3: judgment result -------------------------------------------------

JUDGMENT_RESULT_SCHEMA = {
    "type": "object",
    "properties": {
        "judgment_result": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "charge": {"type": "string"},
                    "result": {"type": "string"},
                },
                "required": ["charge", "result"],
            },
        }
    },
    "required": ["judgment_result"],
}

JUDGMENT_RESULT_SYSTEM = """從香港判決書命令/裁定部分抽取所有判決結果。

拆分原則:
- 多項請求 → 分條
- "責任判定" 與 "損失/金額計算" 兩個層面 → 必須分條
- 每條 charge 必須以 "(責任問題)" 或 "(損失範圍)" 結尾標註層次
- result 必須包含:
  a) 明確結果(勝訴/敗訴/部分勝訴/維持/撤銷/駁回等)
  b) 2-3 個關鍵法庭理由
  c) 具體金額、利率或命令內容(如有)

只輸出 JSON。"""

JUDGMENT_RESULT_FEWSHOT = """範例輸出:
{"judgment_result":[
  {"charge":"申索人就毆打事件的人身傷害索償 (責任問題)","result":"勝訴。法庭接納申索人證供可信,閉路電視顯示被告先動手,被告亦承認部分情節。"},
  {"charge":"醫療費及精神困擾賠償金額 (損失範圍)","result":"部分勝訴。判給醫療費HK$8,500及一般損害賠償HK$20,000,合共HK$28,500,連同利息及訟費。"}
]}"""


def _judgment_validator(out: dict) -> tuple[bool, str]:
    """Require a non-empty list whose charges all carry a level tag."""
    items = out.get("judgment_result") or []
    if not items:
        return False, "judgment_result 不能為空。"
    untagged = [
        item for item in items
        if "責任問題" not in (item.get("charge") or "")
        and "損失範圍" not in (item.get("charge") or "")
    ]
    if untagged:
        return False, (f"有 {len(untagged)} 條 charge 未標註層次。"
                       f"每條 charge 必須以 '(責任問題)' 或 '(損失範圍)' 結尾。")
    return True, ""


def extract_judgment_result(client: OllamaClient, context: str) -> dict:
    """Extract the list of judgment results; ``judgment_result`` is always a list."""
    user = (f"{JUDGMENT_RESULT_FEWSHOT}\n\n"
            f"請從以下片段(多處關鍵詞召回拼接)抽取:\n```\n{context[:6500]}\n```")
    out = client.chat_json_with_retry(JUDGMENT_RESULT_SYSTEM, user,
                                      JUDGMENT_RESULT_SCHEMA,
                                      validator=_judgment_validator)
    return _with_defaults(out, {"judgment_result": []})


# --- Call 4: involved entities -----------------------------------------------

ENTITIES_SCHEMA = {
    "type": "object",
    "properties": {
        "involved_entities": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "entity_name": {"type": "string"},
                    "reason": {"type": "string"},
                },
                "required": ["entity_name", "reason"],
            },
        }
    },
    "required": ["involved_entities"],
}

ENTITIES_SYSTEM = """從香港判決書中抽取所有相關實體(自然人/法人/組織/機構)。

必須包含:
- 主審法官 / 審裁官
- 雙方代表律師、大律師
- 判決中引用的先例所提及的法官
  reason 須寫明:在XX案[案號]中擔任XX職位,闡述XX法律原則
- 涉案的政府部門、公司、機構

嚴禁包含:
- 法案/條例名(如《侵權條例》、Cap.xxx)
- 純案例名稱(如 Donoghue v Stevenson)
- 文獻、期刊名

只輸出 JSON。"""

ENTITIES_FEWSHOT = """範例輸出:
{"involved_entities":[
  {"entity_name":"林希維審裁官","reason":"本案主審審裁官,負責認定事實及裁決。"},
  {"entity_name":"終審法院常任法官李義","reason":"在Tang Kwok Wah v HKSAR [2019] HKCFA 23 中擔任主筆法官,闡述舉證責任原則,本案第34段引用其判詞。"},
  {"entity_name":"康樂文化事務署","reason":"涉案場所通州街公園的管理機構。"}
]}"""


def _entities_validator(out: dict) -> tuple[bool, str]:
    """Require at least one entity and none that looks like a statute name."""
    ents = out.get("involved_entities") or []
    if not ents:
        return False, "involved_entities 不能為空,至少要有主審法官。"
    blacklist = ["條例", "Cap.", "法案"]
    bad = [
        e["entity_name"] for e in ents
        if any(k in (e.get("entity_name") or "") for k in blacklist)
    ]
    if bad:
        return False, f"以下實體疑為條例/法案,應移除:{bad}"
    return True, ""


def extract_entities(client: OllamaClient, context: str) -> dict:
    """Extract involved entities; ``involved_entities`` is always a list."""
    user = (f"{ENTITIES_FEWSHOT}\n\n"
            f"請從以下片段(多處關鍵詞召回拼接)抽取所有涉及實體:\n"
            f"```\n{context[:6500]}\n```")
    out = client.chat_json_with_retry(ENTITIES_SYSTEM, user, ENTITIES_SCHEMA,
                                      validator=_entities_validator)
    return _with_defaults(out, {"involved_entities": []})


# --- Call 5: judgment summary (from extracted fields + analysis excerpt, -----
# --- not regenerated from the raw text) --------------------------------------

SUMMARY_SCHEMA = {
    "type": "object",
    "properties": {
        "judgment_summary": {"type": "string", "maxLength": 300},
    },
    "required": ["judgment_summary"],
}

SUMMARY_SYSTEM = """根據已抽取的結構化字段 + 法庭分析段,撰寫判決總結。

四要素結構(必須全部涵蓋,連貫成單段):
(1) 案件背景:1-2 句交代起因與當事人關係
(2) 核心爭議焦點
(3) 法庭法律分析與推理(核心重點):
    - 如何評估證據?
    - 接受 / 拒絕主張的邏輯?
    - 引用了哪些關鍵法律或判例?
(4) 最終裁決結果及命令

嚴格 ≤300 字。只輸出 JSON。"""


def _summary_validator(out: dict) -> tuple[bool, str]:
    """Accept summaries of 80 to 300 characters inclusive."""
    s = out.get("judgment_summary") or ""
    if len(s) > 300:
        return False, f"summary 共 {len(s)} 字,超過 300 字上限,請壓縮。"
    if len(s) < 80:
        return False, "summary 過短,請完整覆蓋四要素。"
    return True, ""


def extract_summary(client: OllamaClient, prior: dict, analysis: str) -> dict:
    """Write ``judgment_summary`` from already-extracted fields plus analysis.

    Args:
        client: Client used for the model call.
        prior: Previously extracted fields; serialised into the prompt as JSON.
        analysis: Recalled analysis excerpt; only the first 3500 characters
            are sent.

    Returns:
        A dict that always has a string of at most 300 characters under
        ``judgment_summary``.
    """
    user = f"""已抽取的字段:
```json
{json.dumps(prior, ensure_ascii=False, indent=2)}
```

法庭分析節選:
```
{analysis[:3500]}
```

請按四要素撰寫 ≤300 字的 judgment_summary。"""
    out = client.chat_json_with_retry(SUMMARY_SYSTEM, user, SUMMARY_SCHEMA,
                                      validator=_summary_validator)
    # Same reasoning as extract_reason_object: the result may be unvalidated.
    out = _with_defaults(out, {"judgment_summary": ""})
    out["judgment_summary"] = out["judgment_summary"][:300]
    return out
# =============================================================================
# 4. Global validation and post-processing
# =============================================================================

# Substrings marking a "location" as a court, venue or building rather than a
# region; such entries are dropped from case_location.
LOCATION_BLACKLIST = [
    "法院", "法庭", "審裁處", "公園", "大廈", "大樓", "商場", "街", "道路",
    "村", "中心", "醫院", "酒店", "車站",
]

# Substrings marking an "entity" as a statute or a publication.
ENTITY_NAME_BLACKLIST = ["條例", "Cap.", "法案", "案例彙編", "Reports", "期刊", "Journal"]


def validate_and_fix(result: dict) -> tuple[dict, list[str]]:
    """Apply final clean-up rules to the assembled result.

    ``result`` is modified in place and also returned.  The function:
      * drops blacklisted or empty ``case_location`` entries and makes sure
        "香港特別行政區" is present;
      * hard-truncates ``case_reason`` to 100 and ``judgment_summary`` to
        300 characters;
      * removes ``involved_entities`` whose name looks like a statute or a
        publication;
      * warns about ``judgment_result`` charges without a level tag and
        about empty list fields.

    Missing or ``None`` field values are treated as empty.

    Args:
        result: The assembled extraction result.

    Returns:
        ``(result, warnings)`` where ``warnings`` are human-readable notes
        about everything that was changed or looks suspicious.
    """
    warnings: list[str] = []

    # case_location: drop courts / venues / buildings.
    locs = result.get("case_location") or []
    cleaned = [loc for loc in locs
               if loc and not any(b in loc for b in LOCATION_BLACKLIST)]
    removed = set(locs) - set(cleaned)
    if removed:
        warnings.append(f"case_location 已清理:移除 {removed}")
    if "香港特別行政區" not in cleaned:
        # Reported separately so the log never claims an empty removal.
        cleaned.insert(0, "香港特別行政區")
        warnings.append("case_location 缺少「香港特別行政區」,已補入")
    result["case_location"] = cleaned

    # Hard length limits.
    if len(result.get("case_reason") or "") > 100:
        warnings.append("case_reason > 100 字,已截斷")
        result["case_reason"] = result["case_reason"][:100]
    if len(result.get("judgment_summary") or "") > 300:
        warnings.append("judgment_summary > 300 字,已截斷")
        result["judgment_summary"] = result["judgment_summary"][:300]

    # involved_entities: drop statutes / publications.
    ents = result.get("involved_entities") or []
    cleaned_ents = [
        e for e in ents
        if not any(k in (e.get("entity_name") or "") for k in ENTITY_NAME_BLACKLIST)
    ]
    if len(cleaned_ents) != len(ents):
        warnings.append(
            f"involved_entities 移除 {len(ents) - len(cleaned_ents)} 條疑似條例/文獻")
    result["involved_entities"] = cleaned_ents

    # judgment_result: flag entries without a level tag (not modified).
    for jr in result.get("judgment_result") or []:
        charge = jr.get("charge") or ""
        if "責任問題" not in charge and "損失範圍" not in charge:
            warnings.append(f"judgment_result 條目缺層次標註:{charge[:40]}")

    # Empty-field warnings.
    for k in ("plaintiff", "defendant", "case_object",
              "judgment_result", "involved_entities"):
        if not result.get(k):
            warnings.append(f"{k} 為空,請人工複核")

    return result, warnings
# =============================================================================
# 5. Main pipeline
# =============================================================================

def run_pipeline(text: str, model: str) -> dict:
    """Run the six-stage extraction on one judgment and return the result.

    Progress and warnings are written to stderr.

    Args:
        text: Raw judgment text.
        model: Model name handed to ``OllamaClient``.

    Returns:
        The final field dict after ``validate_and_fix``.
    """
    def log(message: str) -> None:
        print(message, file=sys.stderr)

    group_names = ("parties", "reason_object", "judgment_result",
                   "entities", "analysis")

    log("[1/6] 預處理 + 關鍵詞召回...")
    text = clean_text(text)
    meta = extract_metadata_by_rule(text)
    ctx = gather_all(text)
    log(f" 規則元數據:{meta}")
    log(" 召回片段:")
    for g in group_names:
        hit_count = ctx[f"_{g}_hits"]
        log(f" {g:16s} len={len(ctx[g]):5d} hits={hit_count}")

    client = OllamaClient(model=model)

    log("[2/6] 抽取當事人...")
    parties = extract_parties(client, ctx["parties"])

    log("[3/6] 抽取事由與標的...")
    reason_obj = extract_reason_object(client, ctx["reason_object"])

    log("[4/6] 抽取判決結果...")
    judgment = extract_judgment_result(client, ctx["judgment_result"])

    log("[5/6] 抽取涉及實體...")
    # Entity context = head of the parties excerpt (contains counsel names)
    # followed by the citation excerpt, capped at 6500 characters overall.
    combined = ctx["parties"][:2500] + "\n\n[…]\n\n" + ctx["entities"]
    entities = extract_entities(client, combined[:6500])

    # Everything extracted so far becomes the structured input of the summary.
    interim_for_summary = {}
    for part in (parties, reason_obj, judgment, entities):
        interim_for_summary.update(part)
    interim_for_summary["jurisdiction_name"] = meta["jurisdiction_name"]

    log("[6/6] 撰寫判決總結...")
    summary = extract_summary(client, interim_for_summary, ctx["analysis"])

    final = {
        "plaintiff": parties["plaintiff"],
        "defendant": parties["defendant"],
        "jurisdiction_code": meta["jurisdiction_code"],
        "jurisdiction_name": meta["jurisdiction_name"],
        "case_location": meta["case_location"],
        "case_reason": reason_obj["case_reason"],
        "case_object": reason_obj["case_object"],
        "judgment_result": judgment["judgment_result"],
        "judgment_summary": summary["judgment_summary"],
        "involved_entities": entities["involved_entities"],
    }

    final, warnings = validate_and_fix(final)
    for warning in warnings:
        log(f" ⚠️ {warning}")
    return final
# =============================================================================
# 6. YAML output (long strings use the folded ">" style; strings containing
#    special characters are double-quoted automatically)
# =============================================================================

class FoldedStr(str):
    """Marker type: a string to be emitted in YAML folded (``>``) style."""


def _folded_str_representer(dumper, data):
    """Represent a ``FoldedStr`` as a folded block scalar."""
    return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=">")


def _safe_str_representer(dumper, data):
    """Force double quotes for strings that contain ":" or that start with
    "#" or "- "; every other string uses the dumper's default style."""
    if data and (":" in data or data.startswith("#") or data.startswith("- ")):
        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style='"')
    return dumper.represent_scalar("tag:yaml.org,2002:str", data)


# Registered on PyYAML's default dumper at import time, so they apply to
# every yaml.dump call in the process that uses that dumper.
yaml.add_representer(FoldedStr, _folded_str_representer)
yaml.add_representer(str, _safe_str_representer)


def to_yaml(result: dict) -> str:
    """Serialise ``result`` to YAML with folded ``case_reason`` and
    ``judgment_summary``.

    Works on a shallow copy, so the caller's dict is left untouched.

    Args:
        result: The final field dict.

    Returns:
        The YAML document as a string (keys in insertion order, unicode
        preserved, block style, width 100).
    """
    doc = dict(result)
    for key in ("case_reason", "judgment_summary"):
        if doc.get(key):
            doc[key] = FoldedStr(doc[key])
    return yaml.dump(doc, allow_unicode=True, sort_keys=False,
                     default_flow_style=False, width=100)


# =============================================================================
# CLI
# =============================================================================

def main() -> None:
    """Command-line entry point: read a judgment, run the pipeline, and
    write the YAML to ``--out`` or stdout (optionally dumping raw JSON)."""
    ap = argparse.ArgumentParser(
        description="香港判決書結構化抽取(本地 Ollama 版)")
    ap.add_argument("input", help="判決書文本路徑(.txt)")
    ap.add_argument("--model", default=DEFAULT_MODEL, help="Ollama 模型名")
    ap.add_argument("--out", default=None, help="輸出 YAML 路徑(默認 stdout)")
    ap.add_argument("--debug-dump", default=None,
                    help="額外輸出原始 JSON 結果到該路徑(便於 diff)")
    args = ap.parse_args()

    text = Path(args.input).read_text(encoding="utf-8")
    result = run_pipeline(text, args.model)

    if args.debug_dump:
        Path(args.debug_dump).write_text(
            json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8")

    yaml_str = to_yaml(result)
    if args.out:
        Path(args.out).write_text(yaml_str, encoding="utf-8")
        print(f"\n✅ 已寫入 {args.out}", file=sys.stderr)
    else:
        print(yaml_str)


if __name__ == "__main__":
    main()