""" hk_case_extractor.py ========================================================== 香港判決書結構化字段抽取管線 基於 本地 Ollama 小模型 + 分階段抽取 + JSON Schema 強制 + 校驗重試 設計理念 -------- 基於對實際香港判決書結構的分析優化: 1. 預處理:規則去噪 + 切段,純規則抽司法區域/案號等高確定性字段 2. 智能定位: - 基礎信息(當事人、案號):直接從開頭2000字符提取 - 判決結果:優先從尾部4000字符提取 - 其他字段:使用關鍵詞召回相關段落 3. 分組抽取:拆成 5 次獨立 Ollama 調用,每次只負責 1-3 個字段 4. Schema 強制:用 Ollama 0.5+ 的 format= 約束輸出 5. 校驗+重試:對字數、黑名單、結構標註逐項校驗 6. judgment_summary 不從原文重生,而從前 4 步結果 + 一段分析段生成 判決書結構特點(基於實際案例分析) -------------------------------- - 開頭部分(前2000字符): * 案號(如 CACV000175/2000) * 法院名稱和級別 * 當事人信息(BETWEEN...AND 格式) * 案件標題 * 審理日期和法官信息 - 中間部分: * 案情背景(BACKGROUND, INTRODUCTION, 背景, 案情) * 法律分析和推理 * 證據評估 * 法律原則引用 - 尾部部分(後4000字符): * 判決結果(JUDGMENT, ORDER, CONCLUSION, 判決, 命令) * 具體命令和裁定 * 訟費安排 * 法官簽名 依賴 ---- pip install requests pyyaml 使用 ---- # 使用本地 Ollama(默認) python hk_case_extractor.py case.txt python hk_case_extractor.py case.txt --model qwen2.5:7b-instruct --out result.yaml # 使用 OpenRouter python hk_case_extractor.py case.txt \\ --base-url https://openrouter.ai/api/v1 \\ --model anthropic/claude-3.5-sonnet \\ --api-key your-api-key # 使用 OpenAI python hk_case_extractor.py case.txt \\ --base-url https://api.openai.com/v1 \\ --model gpt-4 \\ --api-key your-api-key """ from __future__ import annotations import argparse import json import re import sys import time from concurrent.futures import ThreadPoolExecutor, as_completed from dataclasses import dataclass, field from pathlib import Path from typing import Any import requests import yaml # ============================================================================= # 配置 # ============================================================================= DEFAULT_BASE_URL = "http://localhost:11434/v1" # Ollama 默認 OpenAI 兼容端點 DEFAULT_MODEL = "qwen2.5:7b-instruct" DEFAULT_API_KEY = "ollama" # Ollama 不需要真實 key,但 API 需要提供 DEFAULT_TIMEOUT = 600 # 增加到 10 分鐘,適應遠程服務器 MAX_RETRIES = 2 DEFAULT_MODELS_FILE = "models.json" # ============================================================================= # 模型配置(models.json):按配置名加載 base_url / api_key / model / 計費價格 # ============================================================================= def load_model_profile(name: str, models_file: str = DEFAULT_MODELS_FILE) -> dict: """從 models.json 按 Name 加載一個模型配置。 配置格式(數組): [{"Name":..., "source":..., "BaseApiUrl":..., "ApiKey":..., "model":..., "input_price":..., "output_price":..., "price_unit":...}] 找不到文件或配置名時直接退出並提示可用配置。 """ path = Path(models_file) if not path.exists(): print(f"❌ 找不到模型配置文件:{models_file}", file=sys.stderr) sys.exit(1) try: profiles = json.loads(path.read_text(encoding="utf-8")) except json.JSONDecodeError as e: print(f"❌ 模型配置文件不是有效的 JSON:{e}", file=sys.stderr) sys.exit(1) if not isinstance(profiles, list): print("❌ 模型配置文件應為配置對象組成的數組", file=sys.stderr) sys.exit(1) for p in profiles: if p.get("Name") == name: return p available = [p.get("Name") for p in profiles] print(f"❌ 配置 '{name}' 不存在。可用配置:{available}", file=sys.stderr) sys.exit(1) # ============================================================================= # 0. 語言檢測 # ============================================================================= def detect_language(text: str) -> str: """檢測文本主要語言:'zh' 或 'en' 策略:統計前3000字符中的中文字符比例 - 中文字符 > 30%:判定為中文 - 否則:判定為英文 """ sample = text[:3000] chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', sample)) total_chars = len(sample.strip()) if total_chars == 0: return 'en' chinese_ratio = chinese_chars / total_chars return 'zh' if chinese_ratio > 0.3 else 'en' # ============================================================================= # 1. 預處理:去噪 + 切段 + 規則抽元數據 # ============================================================================= JURISDICTION_MAP_ZH: dict[str, str] = { "HKCFA": "香港特別行政區終審法院", "HKCA": "香港特別行政區高等法院上訴法庭", "HKCFI": "香港特別行政區高等法院原訟法庭", "HKDC": "香港特別行政區區域法院", "HKMC": "香港特別行政區裁判法院", "HKMagC": "香港特別行政區裁判法院", "HKSCT": "香港特別行政區小額錢債審裁處", "HKLT": "香港特別行政區土地審裁處", "HKLDT": "香港特別行政區土地審裁處", "HKLD": "香港特別行政區勞資審裁處", "HKLAT": "香港特別行政區勞資審裁處", "HKCT": "香港特別行政區競爭事務審裁處", "HKCorC": "香港特別行政區死因裁判法庭", "HKCrC": "香港特別行政區死因裁判法庭", } JURISDICTION_MAP_EN: dict[str, str] = { "HKCFA": "Court of Final Appeal of the Hong Kong Special Administrative Region", "HKCA": "Court of Appeal of the High Court of the Hong Kong Special Administrative Region", "HKCFI": "Court of First Instance of the High Court of the Hong Kong Special Administrative Region", "HKDC": "District Court of the Hong Kong Special Administrative Region", "HKMC": "Magistrates' Courts of the Hong Kong Special Administrative Region", "HKMagC": "Magistrates' Courts of the Hong Kong Special Administrative Region", "HKSCT": "Small Claims Tribunal of the Hong Kong Special Administrative Region", "HKLT": "Lands Tribunal of the Hong Kong Special Administrative Region", "HKLDT": "Lands Tribunal of the Hong Kong Special Administrative Region", "HKLD": "Labour Tribunal of the Hong Kong Special Administrative Region", "HKLAT": "Labour Tribunal of the Hong Kong Special Administrative Region", "HKCT": "Competition Tribunal of the Hong Kong Special Administrative Region", "HKCorC": "Coroner's Court of the Hong Kong Special Administrative Region", "HKCrC": "Coroner's Court of the Hong Kong Special Administrative Region", } NEUTRAL_CITATION_RE = re.compile( r"\[(\d{4})\]\s*(HKCFA|HKCA|HKCFI|HKDC|HKMagC|HKMC|HKSCT|HKLT|HKLD|HKCT|HKCorC)\s*(\d+)", re.I, ) CASE_NO_RE = re.compile( r"(FACV|FACC|FAMV|FAMC|CACV|CACC|CAAG|HCA|HCAL|HCMP|HCCW|HCB|DCCJ|DCMP|SCTC|LBTC|LDPD|LDBM|CCDI|WKCC)" r"\s*(?:NO\.?)?\s*\d+\s*(?:OF|/|\sof\s)\s*\d{4}", re.I, ) # 案號前綴到法院代碼的映射(優先級最高) CASE_NO_PREFIX_MAP: dict[str, str] = { "FACV": "HKCFA", # Final Appeal Civil "FACC": "HKCFA", # Final Appeal Criminal "FAMV": "HKCFA", # Final Appeal Miscellaneous "FAMC": "HKCFA", # Final Appeal Miscellaneous Criminal "CACV": "HKCA", # Court of Appeal Civil "CACC": "HKCA", # Court of Appeal Criminal "CAAG": "HKCA", # Court of Appeal (Administrative) "HCA": "HKCFI", # High Court Action "HCAL": "HKCFI", # High Court Administrative Law "HCMP": "HKCFI", # High Court Miscellaneous Proceedings "HCCW": "HKCFI", # High Court Companies Winding Up "HCB": "HKCFI", # High Court Bankruptcy "DCCJ": "HKDC", # District Court "DCMP": "HKDC", # District Court Miscellaneous Proceedings "SCTC": "HKSCT", # Small Claims Tribunal "LBTC": "HKLAT", # Labour Tribunal (勞資審裁處) "LDPD": "HKLAT", # Labour Tribunal "LDBM": "HKLDT", # Lands Tribunal (土地審裁處) "CCDI": "HKCrC", # Coroner's Court (死因裁判法庭) "WKCC": "HKMagC", # Magistrates' Court (裁判法院) } def clean_text(raw: str) -> str: """去頁眉頁腳、頁碼、多餘空行/空格""" t = raw t = re.sub(r"Page\s+\d+\s+of\s+\d+", "", t, flags=re.I) t = re.sub(r"^\s*-\s*\d+\s*-\s*$", "", t, flags=re.M) t = re.sub(r" +", " ", t) # 全角空格 t = re.sub(r"[ \t]+", " ", t) t = re.sub(r"\n{3,}", "\n\n", t) return t.strip() def extract_metadata_by_rule(text: str, lang: str = 'zh') -> dict[str, Any]: """純規則:司法區域、案號、案件地點(默認香港特區) 優先級: 1. 案號前綴(最可靠) 2. Neutral Citation 3. 法院全稱匹配 Args: text: 判決書文本 lang: 語言代碼 ('zh' 或 'en') """ # 根據語言選擇對應的映射表和默認地點 jurisdiction_map = JURISDICTION_MAP_ZH if lang == 'zh' else JURISDICTION_MAP_EN default_location = ["香港特別行政區"] if lang == 'zh' else ["Hong Kong Special Administrative Region"] meta: dict[str, Any] = { "jurisdiction_code": None, "jurisdiction_name": None, "case_location": default_location, "case_number": None, } # 優先:從案號前綴判斷法院 if m := CASE_NO_RE.search(text): case_no = re.sub(r"\s+", " ", m.group(0).strip()) meta["case_number"] = case_no # 提取前綴並映射到法院代碼 prefix = m.group(1).upper() if prefix in CASE_NO_PREFIX_MAP: code = CASE_NO_PREFIX_MAP[prefix] meta["jurisdiction_code"] = code meta["jurisdiction_name"] = jurisdiction_map.get(code) # 次優先:Neutral Citation(如果案號未能確定法院) if not meta["jurisdiction_code"]: if m := NEUTRAL_CITATION_RE.search(text): code = m.group(2).upper() # 規範化大小寫 for k in jurisdiction_map: if k.upper() == code: meta["jurisdiction_code"] = k meta["jurisdiction_name"] = jurisdiction_map[k] break # 最後:靠法院全稱反查(僅在前兩者都失敗時使用,且只搜索前2000字符) if not meta["jurisdiction_code"]: header = text[:2000] # 只在開頭搜索,避免被引用案例干擾 # 同時搜索中英文法院名稱 for code in jurisdiction_map: full_zh = JURISDICTION_MAP_ZH.get(code, "") full_en = JURISDICTION_MAP_EN.get(code, "") short_zh = full_zh.replace("香港特別行政區", "") short_en = full_en.replace("Hong Kong Special Administrative Region", "").replace(" of the ", " ") if any(name in header for name in [full_zh, short_zh, full_en, short_en] if name): meta["jurisdiction_code"] = code meta["jurisdiction_name"] = jurisdiction_map[code] break return meta # ----------------------------------------------------------------------------- # 關鍵詞 + 窗口召回(取代脆弱的正則切段) # ----------------------------------------------------------------------------- # 思路:每個抽取目標定義一組「高信號關鍵詞」,掃全文取所有命中位置周圍 # ±half_window 字符的窗口,合併重疊後拼接喂給 LLM。 # 不依賴判決書的固定章節標題,對結構各異的香港判決書都能工作。 # # 優化策略(基於實際案例分析): # 1. 當事人信息:直接從開頭2000字符提取(通常在 BETWEEN...AND 結構中) # 2. 判決結果:優先從尾部4000字符提取(通常在 JUDGMENT/ORDER/命令 部分) # 3. 其他字段:使用關鍵詞召回策略 KEYWORD_GROUPS: dict[str, list[str]] = { # Call 1:當事人 - 不再使用,改為直接截取開頭 # 保留此處僅為向後兼容,實際不會被 gather_all 使用 "parties": [], # Call 2:事由與標的 "reason_object": [ # 段落標題類 "案情", "背景", "引言", "事實", "案件背景", "案由", "INTRODUCTION", "BACKGROUND", "FACTS", "THE FACTS", "General course", # 主張類 "申索", "索償", "訴因", "聲稱", "請求", "本案涉及", "本訴訟", "原告聲稱", "申索人聲稱", "申索人指稱", "上訴人指", "答辯人指", "Plaintiff", "Claimant", "Appellant", "claim", "allege", # 標的物關鍵詞 "賠償", "損失", "損害", "精神困擾", "經濟損失", "醫療費", "履行", "所有權", "占有", "撤銷", "宣告", "damages", "compensation", ], # Call 3:判決結果 - 不再使用關鍵詞,改為直接截取尾部 # 保留此處僅為向後兼容 "judgment_result": [], # Call 4:涉及實體(法官、律師、引用案例中的法官) "entities": [ # 法官稱謂 "法官", "大法官", "審裁官", "裁判官", "首席法官", "常任法官", "非常任法官", "Hon.", " J.", " JA ", " CJ ", " PJ ", " NPJ ", "Coroner", "Judge", # 代表類 "代表", "大律師", "律師", "資深大律師", "代表律師", "Counsel", "Solicitor", "instructed by", "represented by", # 案例引用(會在周邊帶出法官名) " v ", " v. ", " 訴 ", "[19", "[20", "HKCFA", "HKCA", "HKCFI", ], # Call 5:法庭分析(用於 summary 的核心輸入) "analysis": [ # 法庭觀點標記 "本席認為", "本席接納", "本席不接納", "本席同意", "本席不同意", "本席裁定", "本席拒絕", "本席認同", "本席考慮", "本庭認為", "本庭接納", "本庭裁定", "本庭認同", "本庭考慮", "I find", "I accept", "I do not accept", "I conclude", "I consider", "The court finds", "In my view", "In my judgment", "The Court held", # 法律原則 "舉證責任", "審慎責任", "鄰人原則", "替代責任", "合理疑點", "違反", "侵權", "過失", "negligence", "breach", "duty of care", # 證據評估 "證據顯示", "根據證據", "證人證供", "可信", "不可信", "evidence shows", "testimony", "credible", "reliable", ], } def gather_chunks(text: str, keywords: list[str], half_window: int = 500, max_total: int = 6500, case_sensitive: bool = False) -> tuple[str, int]: """ 召回所有 keywords 命中位置周圍 ±half_window 字符的窗口, 合併重疊區間,按位置順序拼接,總長不超過 max_total。 返回:(拼接後文本, 命中關鍵詞數) 若無命中,fallback 返回文檔前 max_total 字。 """ if not text: return "", 0 flags = 0 if case_sensitive else re.IGNORECASE hits: list[tuple[int, int]] = [] for kw in keywords: for m in re.finditer(re.escape(kw), text, flags=flags): s = max(0, m.start() - half_window) e = min(len(text), m.end() + half_window) hits.append((s, e)) if not hits: return text[:max_total], 0 # 合併重疊區間 hits.sort() merged: list[list[int]] = [] for s, e in hits: if merged and s <= merged[-1][1]: merged[-1][1] = max(merged[-1][1], e) else: merged.append([s, e]) # 按位置順序拼接,控制總長 pieces: list[str] = [] total = 0 for s, e in merged: seg_len = e - s if total + seg_len > max_total: remain = max_total - total if remain > 200: pieces.append(text[s:s + remain]) break pieces.append(text[s:e]) total += seg_len return "\n\n[…]\n\n".join(pieces), len(hits) def gather_all(text: str, head_length: int = 5000, tail_length: int = 5000, entities_window: int = 400, entities_max: int = 6500, analysis_window: int = 500, analysis_max: int = 6500) -> dict[str, str]: """為每個 group 召回對應的上下文片段 優化策略: 1. 基礎信息(當事人):直接取開頭 head_length 字符,不使用關鍵詞召回 2. 事由與標的(reason_object):直接取開頭 head_length 字符,不使用關鍵詞召回 3. 判決結果:取開頭 head_length 字符 + 尾部 tail_length 字符 4. 其他字段:保持關鍵詞召回策略 Args: text: 判決書全文 head_length: 開頭截取長度(默認 5000) tail_length: 尾部截取長度(默認 5000) entities_window: 實體關鍵詞窗口半徑(默認 400) entities_max: 實體片段最大總長度(默認 6500) analysis_window: 分析關鍵詞窗口半徑(默認 500) analysis_max: 分析片段最大總長度(默認 6500) """ out: dict[str, str] = {} # 1. 當事人信息:直接從開頭截取 out["parties"] = text[:head_length] out["_parties_hits"] = "0" # 不使用關鍵詞,標記為0 # 2. 事由與標的:直接從開頭截取 out["reason_object"] = text[:head_length] out["_reason_object_hits"] = "0" # 不使用關鍵詞,標記為0 # 3. 判決結果:取開頭 + 尾部 head_text = text[:head_length] if len(text) > head_length else text tail_text = text[-tail_length:] if len(text) > tail_length else "" # 如果文本足夠長,拼接頭尾;否則只用全文 if tail_text and head_text != tail_text: out["judgment_result"] = head_text + "\n\n[…]\n\n" + tail_text else: out["judgment_result"] = head_text out["_judgment_result_hits"] = "0" # 直接截取,不計算關鍵詞命中 # 4. 其他字段:使用關鍵詞召回 params: dict[str, tuple[int, int]] = { "entities": (entities_window, entities_max), "analysis": (analysis_window, analysis_max), } for group in ["entities", "analysis"]: kws = KEYWORD_GROUPS[group] hw, mt = params[group] ctx, hits = gather_chunks(text, kws, half_window=hw, max_total=mt) out[group] = ctx out[f"_{group}_hits"] = str(hits) return out # ============================================================================= # 2. OpenAI 兼容客戶端:支持 Ollama / OpenRouter / OpenAI 等 # ============================================================================= @dataclass class OpenAICompatibleClient: """OpenAI 兼容的 API 客戶端 支持: - Ollama (http://localhost:11434/v1) - OpenRouter (https://openrouter.ai/api/v1) - OpenAI (https://api.openai.com/v1) - 其他 OpenAI 兼容的服務 """ model: str = DEFAULT_MODEL base_url: str = DEFAULT_BASE_URL api_key: str = DEFAULT_API_KEY timeout: int = DEFAULT_TIMEOUT # token 用量累計(跨所有調用,含重試) total_input_tokens: int = field(default=0, init=False) total_output_tokens: int = field(default=0, init=False) num_calls: int = field(default=0, init=False) @property def total_tokens(self) -> int: return self.total_input_tokens + self.total_output_tokens def chat_json(self, system: str, user: str, schema: dict, temperature: float = 0.0, max_tokens: int = 4096) -> dict: """調用 OpenAI 兼容 API,使用 response_format 強制 JSON 輸出""" # 構建請求 URL url = f"{self.base_url.rstrip('/')}/chat/completions" # 構建請求頭 headers = { "Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}", } # 構建請求體 payload = { "model": self.model, "messages": [ {"role": "system", "content": system}, {"role": "user", "content": user}, ], "temperature": temperature, "max_tokens": max_tokens, "response_format": {"type": "json_object"}, # OpenAI 兼容的 JSON 模式 } # 發送請求 try: r = requests.post(url, json=payload, headers=headers, timeout=self.timeout) r.raise_for_status() except requests.exceptions.RequestException as e: print(f"❌ API 請求失敗:{e}", file=sys.stderr) print(f" URL: {url}", file=sys.stderr) print(f" Model: {self.model}", file=sys.stderr) raise # 解析響應 try: response_data = r.json() except json.JSONDecodeError as e: print(f"❌ API 響應不是有效的 JSON", file=sys.stderr) print(f" 響應狀態碼: {r.status_code}", file=sys.stderr) print(f" 響應內容: {r.text[:500]}", file=sys.stderr) raise # 累計 token 用量(OpenAI 兼容端點通常在 usage 字段返回) usage = response_data.get("usage") or {} self.total_input_tokens += int(usage.get("prompt_tokens", 0) or 0) self.total_output_tokens += int(usage.get("completion_tokens", 0) or 0) self.num_calls += 1 # 提取內容 if "choices" not in response_data or not response_data["choices"]: print(f"❌ API 響應缺少 choices 字段", file=sys.stderr) print(f" 響應數據: {json.dumps(response_data, ensure_ascii=False, indent=2)[:500]}", file=sys.stderr) raise ValueError("API 響應格式錯誤:缺少 choices 字段") content = response_data["choices"][0]["message"]["content"] if not content or not content.strip(): print(f"❌ 模型返回空內容", file=sys.stderr) print(f" 完整響應: {json.dumps(response_data, ensure_ascii=False, indent=2)[:1000]}", file=sys.stderr) raise ValueError("模型返回空內容") # 清理可能的 markdown 代碼塊包裹 content = self._clean_json_response(content) try: return json.loads(content) except json.JSONDecodeError as e: # 如果仍然失敗,打印錯誤信息以便調試 print(f"❌ JSON 解析失敗", file=sys.stderr) print(f" 錯誤: {e}", file=sys.stderr) print(f" 原始內容(前500字符):\n{content[:500]}", file=sys.stderr) print(f" 原始內容(後500字符):\n{content[-500:]}", file=sys.stderr) raise def _clean_json_response(self, content: str) -> str: """清理模型輸出中可能包含的 markdown 代碼塊標記和開頭的 標籤 處理以下格式: - {{思考的內容}} (僅開頭) - ```json\n{...}\n``` - ```\n{...}\n``` - {... 前後有空白字符 """ content = content.strip() # 移除開頭的 ... 標籤及其內容 # 使用非貪婪匹配,支持多行,只匹配開頭 if content.startswith("") or content.startswith(""): match = re.match(r'.*?\s*', content, flags=re.DOTALL | re.IGNORECASE) if match: content = content[match.end():] content = content.strip() # 移除開頭的 ```json 或 ``` if content.startswith("```"): # 找到第一個換行符 first_newline = content.find("\n") if first_newline != -1: content = content[first_newline + 1:] # 移除結尾的 ``` if content.endswith("```"): # 找到最後一個 ``` 之前的換行符 last_fence = content.rfind("```") if last_fence != -1: content = content[:last_fence] return content.strip() def chat_json_with_retry(self, system: str, user: str, schema: dict, validator=None, **kw) -> dict: """validator(result) -> (ok: bool, hint: str);失敗則回灌 hint 重試""" last_err = None for attempt in range(MAX_RETRIES + 1): try: out = self.chat_json(system, user, schema, **kw) if validator is None: return out ok, hint = validator(out) if ok: return out # 回灌錯誤信息 user = (f"{user}\n\n上次輸出存在問題:{hint}\n" f"請修正後重新輸出。") except Exception as e: last_err = e if last_err: raise last_err return out # type: ignore # ============================================================================= # 3. 五次抽取調用:每次只負責一組字段 # ============================================================================= # --- Call 1: 當事人 ---------------------------------------------------------- PARTIES_SCHEMA = { "type": "object", "properties": { "plaintiff": {"type": "array", "items": {"type": "string"}}, "defendant": {"type": "array", "items": {"type": "string"}}, }, "required": ["plaintiff", "defendant"], } # 中文提示詞 PARTIES_SYSTEM_ZH = """你是香港法律文書信息抽取助手。 從給定的判決書開頭部分抽取所有當事人完整姓名/機構名。 格式識別: 1. 英文格式:BETWEEN ... AND ... 2. 中文格式:申請人 ... 對/訴 答辯人 ... 3. 混合格式:Plaintiff ... Defendant ... 分類規則: - 原告/申索人/上訴人/覆核申請人/Plaintiff/Appellant/Claimant/Applicant → plaintiff - 被告/答辯人/被上訴人/Defendant/Respondent → defendant - 保留中英文對照(如有) - 某類無則輸出空數組 只輸出符合 schema 的 JSON,不要解釋。""" PARTIES_FEWSHOT_ZH = """範例1(原告/被告格式): BETWEEN 陳大文 (CHAN TAI MAN) 上訴人 AND 香港房屋委員會 (Hong Kong Housing Authority) 答辯人 輸出: {"plaintiff":["陳大文 (CHAN TAI MAN)"],"defendant":["香港房屋委員會 (Hong Kong Housing Authority)"]} 範例2(申請人/答辯人格式): 申請人: 李小明 答辯人: 入境事務處處長 輸出: {"plaintiff":["李小明"],"defendant":["入境事務處處長"]}""" # 英文提示詞 PARTIES_SYSTEM_EN = """You are a Hong Kong legal document information extraction assistant. Extract all complete names/organization names of parties from the beginning of the judgment. Format Recognition: 1. English format: BETWEEN ... AND ... 2. Chinese format: 申請人 ... 對/訴 答辯人 ... 3. Mixed format: Plaintiff ... Defendant ... Classification Rules: - Plaintiff/Claimant/Appellant/Applicant/原告/申索人/上訴人/覆核申請人 → plaintiff - Defendant/Respondent/被告/答辯人/被上訴人 → defendant - Preserve bilingual names (if any) - Output empty array if none Output only JSON conforming to schema, no explanation.""" PARTIES_FEWSHOT_EN = """Example 1 (Plaintiff/Defendant format): BETWEEN Dr Paul KI Ping-ki 1st Plaintiff Hong Kong Washington Company 2nd Plaintiff AND Next Magazine Publishing Ltd 1st Defendant Output: {"plaintiff":["Dr Paul KI Ping-ki","Hong Kong Washington Company"],"defendant":["Next Magazine Publishing Ltd"]} Example 2 (Applicant/Respondent format): Between: MO YUK PING Applicant and HONG KONG SPECIAL ADMINISTRATIVE REGION Respondent Output: {"plaintiff":["MO YUK PING"],"defendant":["HONG KONG SPECIAL ADMINISTRATIVE REGION"]}""" def extract_parties(client: OpenAICompatibleClient, context: str, lang: str = 'zh') -> dict: system = PARTIES_SYSTEM_ZH if lang == 'zh' else PARTIES_SYSTEM_EN fewshot = PARTIES_FEWSHOT_ZH if lang == 'zh' else PARTIES_FEWSHOT_EN if lang == 'zh': user = f"{fewshot}\n\n請從以下判決書開頭部分抽取:\n```\n{context[:5000]}\n```" else: user = f"{fewshot}\n\nPlease extract from the following judgment header:\n```\n{context[:5000]}\n```" return client.chat_json_with_retry(system, user, PARTIES_SCHEMA) # --- Call 2: 事由 + 標的 ---------------------------------------------------- def get_reason_object_schema(lang: str = 'zh') -> dict: """根據語言返回對應的 schema(英文字數限制更寬鬆)""" max_length = 100 if lang == 'zh' else 200 # 英文允許 2 倍字符數 return { "type": "object", "properties": { "case_reason": {"type": "string", "maxLength": max_length}, "case_object": {"type": "array", "items": {"type": "string"}}, }, "required": ["case_reason", "case_object"], } # 中文提示詞 REASON_OBJECT_SYSTEM_ZH = f"""從香港判決書中抽取: 1. case_reason(事由): - 嚴格 ≤100 字,單句,清晰完整 - 結構:[原告身份] + [針對什麼事件/行為] + [向誰] + [提出什麼請求] - 覆核/上訴案件須註明對哪個裁決提出覆核(含日期/案號) - 嚴禁包含:判決結果、法庭分析、案發細節、證據評估 - 只描述訴訟的起因和請求,不涉及法庭的判斷 2. case_object(標的物): - 訴訟請求指向的實體權利或利益 - 例:汽車、黃金、珠寶、不動產產權、撫養權、人身傷害賠償、合同履行、房產所有權、精神困擾賠償、居留權 - 涉及金錢標的時必須提取,並標明幣種與具體金額(如「拖欠貨款 HK$850,000」「索償 HK$1,000,000」「拖欠租金 HK$120,000」);金額未定或待評定者註明「金額待評定」 - 合併本質相同的標的 - 嚴禁:證據材料、程序性訴求(如"要求法庭裁決")、法律條文名稱 用語要求:一律使用規範的法律專業用語(如「申索」「損害賠償」「違約」「侵權」「衡平法濟助」),避免口語化或不準確的表述。 只輸出 JSON。""" REASON_OBJECT_FEWSHOT_ZH = """範例輸出1(人身傷害): {"case_reason":"申索人為商場保安員,就被告於2023年7月在商場毆打申索人造成的人身傷害向被告提出損害賠償申索。","case_object":["人身傷害賠償","醫療費用","精神困擾賠償"]} 範例輸出2(金錢申索,標的物須含具體金額): {"case_reason":"原告就被告未支付2022年買賣合約項下的貨款,向被告提出追討欠款的申索。","case_object":["拖欠貨款 HK$850,000","合約利息","訟費"]}""" # 英文提示詞 REASON_OBJECT_SYSTEM_EN = f"""Extract from Hong Kong judgment: 1. case_reason (Cause of Action): - Strictly ≤200 words, single sentence,clear and complete - Structure: [Plaintiff's identity] + [regarding what event/conduct] + [against whom] + [what relief sought] - For judicial review/appeal cases, specify which decision is being challenged (with date/case number) - MUST NOT include: judgment results, court analysis, incident details, evidence assessment - Only describe the cause and relief sought, not the court's determination 2. case_object (Subject Matter): - Tangible rights or interests targeted by the claim - Examples: vehicle, gold, jewelry, property title, custody, personal injury damages, contract performance, property ownership, distress damages, right of abode - When a monetary subject matter is involved, it MUST be extracted with currency and the specific amount (e.g., "outstanding goods price HK$850,000", "claim of HK$1,000,000", "arrears of rent HK$120,000"); if the amount is unascertained, note "amount to be assessed" - Merge essentially identical subjects - MUST NOT include: evidentiary materials, procedural requests (e.g., "seeking court ruling"), names of statutes Terminology requirement: consistently use precise, standard legal terminology (e.g., "claim", "damages", "breach of contract", "negligence", "equitable relief"); avoid colloquial or imprecise wording. Output only JSON.""" REASON_OBJECT_FEWSHOT_EN = """Example Output 1 (personal injury): {"case_reason":"Plaintiff, a shopping mall security guard, claims damages from defendant for personal injuries caused by defendant's assault at the mall in July 2023.","case_object":["personal injury damages","medical expenses","distress damages"]} Example Output 2 (monetary claim, subject matter must carry the specific amount): {"case_reason":"Plaintiff claims against defendant for non-payment of the price of goods supplied under a 2022 sale and purchase contract.","case_object":["outstanding goods price HK$850,000","contractual interest","costs"]}""" # 金錢數額識別:幣種前綴 + 數字,或數字 + 中文金額單位 MONEY_RE = re.compile( r"(?:HK\$|US\$|RMB|MOP|港幣|港元|人民幣|美元|美金)\s*[\d,]+(?:\.\d+)?" r"|[\$$]\s*[\d,]+(?:\.\d+)?" r"|[\d,]+(?:\.\d+)?\s*(?:萬元|億元|元|萬|億)", re.I, ) def _object_has_amount(objs: list[str]) -> bool: """case_object 中是否已含具體金額(任一項出現數字即視為已提取金額)""" return any(re.search(r"\d", o or "") for o in (objs or [])) def _reason_object_validator(out: dict, lang: str = 'zh', context: str = "") -> tuple[bool, str]: r = out.get("case_reason", "") max_length = 100 if lang == 'zh' else 200 # 英文允許 2 倍字符數 target_length = 80 if lang == 'zh' else 160 # 建議壓縮目標 if len(r) > max_length: if lang == 'zh': return False, f"case_reason 共 {len(r)} 字,超過 {max_length} 字上限,請壓縮到 {target_length} 字以內。" else: return False, f"case_reason has {len(r)} characters, exceeds {max_length} limit, please compress to within {target_length}." if not out.get("case_object"): if lang == 'zh': return False, "case_object 不能為空。" else: return False, "case_object cannot be empty." # 檢查是否包含判決結果性詞彙(嚴禁) RESULT_KEYWORDS = [ "駁回", "拒絕", "勝訴", "敗訴", "維持", "撤銷", "發還", "判給", "獲判", "判處", "部分勝訴", "dismissed", "allowed", "granted", "refused", "upheld", "quashed", ] for keyword in RESULT_KEYWORDS: if keyword in r: if lang == 'zh': return False, f"case_reason 不應包含判決結果詞彙「{keyword}」,請只描述訴訟起因和請求。" else: return False, f"case_reason should not contain judgment result term '{keyword}', describe only cause and relief sought." # 金錢標的物強制提取:原文出現金錢數額但 case_object 未含金額時要求補充 if context and MONEY_RE.search(context) and not _object_has_amount(out.get("case_object", [])): if lang == 'zh': return False, ("原文出現金錢數額。若該數額屬於訴訟標的(如欠款、索償、賠償金額)," "必須在 case_object 中提取並標明幣種與具體金額(如「拖欠貨款 HK$850,000」);" "若僅為無關引用則可忽略。") else: return False, ("Monetary amounts appear in the source. If an amount forms part of the " "subject matter (e.g., debt, claim, damages), it MUST be extracted in " "case_object with currency and the specific figure (e.g., \"outstanding " "goods price HK$850,000\"); ignore only if it is an unrelated citation.") return True, "" def extract_reason_object(client: OpenAICompatibleClient, context: str, lang: str = 'zh') -> dict: system = REASON_OBJECT_SYSTEM_ZH if lang == 'zh' else REASON_OBJECT_SYSTEM_EN fewshot = REASON_OBJECT_FEWSHOT_ZH if lang == 'zh' else REASON_OBJECT_FEWSHOT_EN schema = get_reason_object_schema(lang) max_length = 100 if lang == 'zh' else 200 if lang == 'zh': user = (f"{fewshot}\n\n" f"請從以下判決書開頭部分抽取:\n```\n{context[:5000]}\n```") else: user = (f"{fewshot}\n\n" f"Please extract from the following judgment header:\n```\n{context[:5000]}\n```") return client.chat_json_with_retry(system, user, schema, validator=lambda x: _reason_object_validator(x, lang, context[:5000])) # --- Call 3: 判決結果 ------------------------------------------------------- JUDGMENT_RESULT_SCHEMA = { "type": "object", "properties": { "judgment_result": { "type": "array", "items": { "type": "object", "properties": { "charge": {"type": "string"}, "result": {"type": "string"}, }, "required": ["charge", "result"], }, } }, "required": ["judgment_result"], } # 中文提示詞 JUDGMENT_RESULT_SYSTEM_ZH = """從香港判決書尾部的命令/裁定部分抽取所有判決結果。 重要提示: - 判決結果通常在判決書的最後部分 - 常見標記:JUDGMENT, ORDER, CONCLUSION, DISPOSITION, 判決, 命令, 裁定, 頒令 - 可能包含:勝訴/敗訴、具體金額、訟費安排、上訴結果 拆分原則: - 多項請求 → 分條 - "責任判定" 與 "損失/金額計算" 兩個層面 → 必須分條 - 每條 charge 必須以 "(責任問題)" 或 "(損失範圍)" 結尾標註層次 - result 必須包含: a) 明確結果(勝訴/敗訴/部分勝訴/維持/撤銷/駁回/發還等) b) 2-3 個關鍵法庭理由(如有) c) 具體金額、利率或命令內容;凡有判給/命令支付的金錢數額,必須原文照錄幣種與金額(如 HK$28,500),不得省略或約化 用語要求:一律使用規範的法律專業用語(如「判給」「訟費」「利息」「駁回」「發還重審」),避免口語化或不準確的表述。 只輸出 JSON。""" JUDGMENT_RESULT_FEWSHOT_ZH = """範例輸出: {"judgment_result":[ {"charge":"申索人就毆打事件的人身傷害索償 (責任問題)","result":"勝訴。法庭接納申索人證供可信,閉路電視顯示被告先動手,被告亦承認部分情節。"}, {"charge":"醫療費及精神困擾賠償金額 (損失範圍)","result":"部分勝訴。判給醫療費HK$8,500及一般損害賠償HK$20,000,合共HK$28,500,連同利息及訟費。"} ]}""" # 英文提示詞 JUDGMENT_RESULT_SYSTEM_EN = """Extract all judgment results from the order/disposition section at the end of Hong Kong judgment. Important Notes: - Judgment results are usually at the end of the judgment - Common markers: JUDGMENT, ORDER, CONCLUSION, DISPOSITION, 判決, 命令, 裁定, 頒令 - May include: success/dismissal, specific amounts, costs arrangements, appeal results Splitting Principles: - Multiple claims → separate items - "Liability determination" vs "Quantum/damages assessment" → must be separate items - Each charge must end with "(liability issue)" or "(quantum issue)" to mark the level - result must include: a) Clear outcome (allowed/dismissed/partially allowed/upheld/quashed/remitted, etc.) b) 2-3 key court reasons (if any) c) Specific amounts, interest rates or order details; whenever a sum is awarded/ordered to be paid, the currency and figure MUST be reproduced verbatim (e.g., HK$28,500), never omitted or rounded Terminology requirement: consistently use precise, standard legal terminology (e.g., "awarded", "costs", "interest", "dismissed", "remitted for retrial"); avoid colloquial or imprecise wording. Output only JSON.""" JUDGMENT_RESULT_FEWSHOT_EN = """Example Output: {"judgment_result":[ {"charge":"Plaintiff's claim for personal injury from assault (liability issue)","result":"Allowed. Court accepted plaintiff's testimony as credible, CCTV showed defendant struck first, defendant also admitted parts of the incident."}, {"charge":"Medical expenses and distress damages quantum (quantum issue)","result":"Partially allowed. Awarded medical expenses HK$8,500 and general damages HK$20,000, totaling HK$28,500, with interest and costs."} ]}""" def _judgment_validator(out: dict | list, lang: str = 'zh') -> tuple[bool, str]: # 处理模型直接返回列表的情况 if isinstance(out, list): items = out else: items = out.get("judgment_result", []) if not items: if lang == 'zh': return False, "judgment_result 不能為空。" else: return False, "judgment_result cannot be empty." if lang == 'zh': bad = [i for i in items if "責任問題" not in i.get("charge", "") and "損失範圍" not in i.get("charge", "")] if bad: return False, (f"有 {len(bad)} 條 charge 未標註層次。" f"每條 charge 必須以 '(責任問題)' 或 '(損失範圍)' 結尾。") else: bad = [i for i in items if "liability issue" not in i.get("charge", "").lower() and "quantum issue" not in i.get("charge", "").lower()] if bad: return False, (f"{len(bad)} charge items lack level annotation. " f"Each charge must end with '(liability issue)' or '(quantum issue)'.") return True, "" def extract_judgment_result(client: OpenAICompatibleClient, context: str, lang: str = 'zh') -> dict: system = JUDGMENT_RESULT_SYSTEM_ZH if lang == 'zh' else JUDGMENT_RESULT_SYSTEM_EN fewshot = JUDGMENT_RESULT_FEWSHOT_ZH if lang == 'zh' else JUDGMENT_RESULT_FEWSHOT_EN if lang == 'zh': user = (f"{fewshot}\n\n" f"請從以下判決書片段(開頭5000字符 + 尾部5000字符)抽取:\n```\n{context}\n```") else: user = (f"{fewshot}\n\n" f"Please extract from the following judgment segments (first 5000 + last 5000 characters):\n```\n{context}\n```") result = client.chat_json_with_retry(system, user, JUDGMENT_RESULT_SCHEMA, validator=lambda x: _judgment_validator(x, lang)) # 如果模型返回的是列表,包装成标准格式 if isinstance(result, list): return {"judgment_result": result} return result # --- Call 4: 涉及實體 ------------------------------------------------------- ENTITIES_SCHEMA = { "type": "object", "properties": { "involved_entities": { "type": "array", "items": { "type": "object", "properties": { "entity_name": {"type": "string"}, "reason": {"type": "string"}, }, "required": ["entity_name", "reason"], }, } }, "required": ["involved_entities"], } # 中文提示詞 ENTITIES_SYSTEM_ZH = """從香港判決書中抽取所有相關實體(自然人/法人/組織/機構)。 必須包含: - 主審法官 / 審裁官 / 裁判官(通常在判決書開頭或結尾署名) - 雙方代表律師、大律師(通常在判決書結尾的 Representation 部分) - 判決中引用的先例所提及的法官 reason 須寫明:在XX案[案號]中擔任XX職位,闡述XX法律原則 - 涉案的政府部門、公司、機構(如:入境事務處處長、律政司司長) 嚴禁包含: - 法案/條例名(如《侵權條例》、Cap.xxx、《基本法》) - 純案例名稱(如 Donoghue v Stevenson) - 文獻、期刊名 用語要求:reason 一律使用規範的法律專業用語(如「主審」「闡述」「先例」「判詞」),避免口語化或不準確的表述。 只輸出 JSON。""" ENTITIES_FEWSHOT_ZH = """範例輸出: {"involved_entities":[ {"entity_name":"林希維審裁官","reason":"本案主審審裁官,負責認定事實及裁決。"}, {"entity_name":"終審法院常任法官李義","reason":"在Tang Kwok Wah v HKSAR [2019] HKCFA 23 中擔任主筆法官,闡述舉證責任原則,本案第34段引用其判詞。"}, {"entity_name":"康樂文化事務署","reason":"涉案場所通州街公園的管理機構。"} ]}""" # 英文提示詞 ENTITIES_SYSTEM_EN = """Extract all relevant entities (natural persons/legal persons/organizations/institutions) from Hong Kong judgment. Must include: - Presiding judge/adjudicator/magistrate (usually signed at beginning or end of judgment) - Counsel/barristers representing both parties (usually in Representation section at end) - Judges mentioned in cited precedents reason must specify: served as XX position in XX case [case number], articulated XX legal principle - Government departments, companies, institutions involved (e.g., Director of Immigration, Secretary for Justice) MUST NOT include: - Statute/ordinance names (e.g., Tort Ordinance, Cap.xxx, Basic Law) - Pure case names (e.g., Donoghue v Stevenson) - Literature, journal names Terminology requirement: write each reason in precise, standard legal terminology (e.g., "presiding", "articulated", "precedent", "judgment"); avoid colloquial or imprecise wording. Output only JSON.""" ENTITIES_FEWSHOT_EN = """Example Output: {"involved_entities":[ {"entity_name":"Hon Leong JA","reason":"Presiding judge in this case, responsible for fact-finding and adjudication."}, {"entity_name":"Chief Justice Li","reason":"Served as lead judge in Tang Kwok Wah v HKSAR [2019] HKCFA 23, articulated burden of proof principles, cited in paragraph 34 of this judgment."}, {"entity_name":"Leisure and Cultural Services Department","reason":"Management authority of Tung Chau Street Park, the incident location."} ]}""" def _entities_validator(out: dict, lang: str = 'zh') -> tuple[bool, str]: ents = out.get("involved_entities", []) if not ents: if lang == 'zh': return False, "involved_entities 不能為空,至少要有主審法官。" else: return False, "involved_entities cannot be empty, must include at least the presiding judge." # 檢查黑名單(條例、法案、案例名稱) bad = [] for e in ents: name = e.get("entity_name", "") # 檢查是否包含黑名單關鍵詞 if any(k in name for k in ENTITY_NAME_BLACKLIST): bad.append(name) # 檢查是否為案例名稱格式(包含 v 或 訴) if (" v " in name or " v. " in name or " 訴 " in name or " vs " in name or " vs. " in name): bad.append(name) if bad: if lang == 'zh': return False, f"以下實體疑為條例/法案/案例名稱,應移除:{bad[:3]}" else: return False, f"Following entities appear to be statutes/acts/case names, should be removed: {bad[:3]}" return True, "" def extract_entities(client: OpenAICompatibleClient, context: str, lang: str = 'zh') -> dict: system = ENTITIES_SYSTEM_ZH if lang == 'zh' else ENTITIES_SYSTEM_EN fewshot = ENTITIES_FEWSHOT_ZH if lang == 'zh' else ENTITIES_FEWSHOT_EN if lang == 'zh': user = (f"{fewshot}\n\n" f"請從以下片段(多處關鍵詞召回拼接)抽取所有涉及實體:\n" f"```\n{context[:6500]}\n```") else: user = (f"{fewshot}\n\n" f"Please extract all involved entities from the following segments (keyword-based retrieval):\n" f"```\n{context[:6500]}\n```") return client.chat_json_with_retry(system, user, ENTITIES_SCHEMA, validator=lambda x: _entities_validator(x, lang)) # --- Call 5: 判決總結(基於已抽取結果 + 分析段,不從原文重生) ----------- def get_summary_schema(lang: str = 'zh') -> dict: """根據語言返回對應的 schema(英文字數限制更寬鬆)""" max_length = 300 if lang == 'zh' else 500 # 英文允許約 1.67 倍字符數 return { "type": "object", "properties": { "judgment_summary": {"type": "string", "maxLength": max_length}, }, "required": ["judgment_summary"], } # 中文提示詞 SUMMARY_SYSTEM_ZH = """根據已抽取的結構化字段 + 法庭分析段,撰寫判決總結。 四要素結構(必須全部涵蓋,連貫成單段): (1) 案件背景:1-2 句交代起因與當事人關係 (2) 核心爭議焦點 (3) 法庭法律分析與推理(核心重點): - 如何評估證據? - 接受 / 拒絕主張的邏輯? - 引用了哪些關鍵法律或判例? (4) 最終裁決結果及命令 **重要:judgment_summary 必須使用中文撰寫。** 用語要求:一律使用規範的法律專業用語;涉及金錢的判給或標的,須保留具體金額(含幣種)。 嚴格 ≤300 字。只輸出 JSON。""" # 英文提示詞 SUMMARY_SYSTEM_EN = """Based on extracted structured fields + court analysis section, write judgment summary. Four-element structure (must cover all, in coherent single paragraph): (1) Case background: 1-2 sentences on cause and parties' relationship (2) Core issues in dispute (3) Court's legal analysis and reasoning (core focus): - How was evidence assessed? - Logic for accepting/rejecting claims? - What key laws or precedents were cited? (4) Final judgment and orders **IMPORTANT: judgment_summary MUST be written in English.** Terminology requirement: consistently use precise, standard legal terminology; preserve specific monetary figures (with currency) for any award or subject matter involving money. Strictly ≤500 characters. Output only JSON.""" def _summary_validator(out: dict, lang: str = 'zh') -> tuple[bool, str]: s = out.get("judgment_summary", "") max_length = 300 if lang == 'zh' else 500 # 英文允許約 1.67 倍字符數 min_length = 80 if lang == 'zh' else 120 # 英文最小長度也相應增加 if len(s) > max_length: if lang == 'zh': return False, f"summary 共 {len(s)} 字,超過 {max_length} 字上限,請壓縮。" else: return False, f"summary has {len(s)} characters, exceeds {max_length} limit, please compress." if len(s) < min_length: if lang == 'zh': return False, "summary 過短,請完整覆蓋四要素。" else: return False, "summary too short, please cover all four elements." # 檢查語言是否正確 chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', s)) total_chars = len(s.strip()) if total_chars > 0: chinese_ratio = chinese_chars / total_chars if lang == 'zh' and chinese_ratio < 0.3: return False, "judgment_summary 必須使用中文撰寫,但檢測到主要為英文內容,請用中文重寫。" elif lang == 'en' and chinese_ratio > 0.3: return False, "judgment_summary MUST be written in English, but detected primarily Chinese content. Please rewrite in English." return True, "" def extract_summary(client: OpenAICompatibleClient, prior: dict, analysis: str, lang: str = 'zh') -> dict: system = SUMMARY_SYSTEM_ZH if lang == 'zh' else SUMMARY_SYSTEM_EN schema = get_summary_schema(lang) max_length = 300 if lang == 'zh' else 500 if lang == 'zh': user = f"""已抽取的字段: ```json {json.dumps(prior, ensure_ascii=False, indent=2)} ``` 法庭分析節選: ``` {analysis[:3500]} ``` 請按四要素撰寫 ≤300 字的 judgment_summary。""" else: user = f"""Extracted fields: ```json {json.dumps(prior, ensure_ascii=False, indent=2)} ``` Court analysis excerpt: ``` {analysis[:3500]} ``` Please write judgment_summary ≤500 characters covering four elements.""" return client.chat_json_with_retry(system, user, schema, validator=lambda x: _summary_validator(x, lang)) # ============================================================================= # 4. 全局校驗與後處理 # ============================================================================= LOCATION_BLACKLIST = [ "法院", "法庭", "審裁處", "公園", "大廈", "大樓", "商場", "街", "道路", "村", "中心", "醫院", "酒店", "車站", ] ENTITY_NAME_BLACKLIST = [ "條例", "Cap.", "法案", "案例彙編", "Reports", "期刊", "Journal", # 案例名稱標記 " v ", " v. ", " 訴 ", " vs ", " vs. ", "HKCFAR", "HKCFA", "HKCA", "HKCFI", # 避免將案例引用誤認為實體 ] def validate_and_fix(result: dict, lang: str = 'zh') -> tuple[dict, list[str]]: warnings: list[str] = [] # case_location:剔除法院/場所/建築 locs = result.get("case_location") or [] cleaned = [l for l in locs if l and not any(b in l for b in LOCATION_BLACKLIST)] if "香港特別行政區" not in cleaned: cleaned.insert(0, "香港特別行政區") if set(cleaned) != set(locs): warnings.append( f"case_location 已清理:移除 {set(locs) - set(cleaned)}") result["case_location"] = cleaned # 字數檢查(僅警告,不截斷) reason_max = 100 if lang == 'zh' else 200 summary_max = 300 if lang == 'zh' else 500 reason_len = len(result.get("case_reason", "")) if reason_len > reason_max: warnings.append(f"⚠️ case_reason 共 {reason_len} 字,超過建議上限 {reason_max} 字") summary_len = len(result.get("judgment_summary", "")) if summary_len > summary_max: warnings.append(f"⚠️ judgment_summary 共 {summary_len} 字,超過建議上限 {summary_max} 字") # involved_entities:剔除條例/文獻 ents = result.get("involved_entities") or [] cleaned_ents = [e for e in ents if not any(k in e.get("entity_name", "") for k in ENTITY_NAME_BLACKLIST)] if len(cleaned_ents) != len(ents): warnings.append( f"involved_entities 移除 {len(ents) - len(cleaned_ents)} 條疑似條例/文獻") result["involved_entities"] = cleaned_ents # judgment_result:補層次標註提示 for jr in result.get("judgment_result", []) or []: if ("責任問題" not in jr.get("charge", "") and "損失範圍" not in jr.get("charge", "")): warnings.append( f"judgment_result 條目缺層次標註:{jr.get('charge', '')[:40]}") # 空字段告警 for k in ("plaintiff", "defendant", "case_object", "judgment_result", "involved_entities"): if not result.get(k): warnings.append(f"{k} 為空,請人工複核") return result, warnings # ============================================================================= # 5. 主管線 # ============================================================================= def run_pipeline(text: str, model: str, base_url: str, api_key: str, head_length: int = 5000, tail_length: int = 5000, entities_window: int = 400, entities_max: int = 6500, analysis_window: int = 500, analysis_max: int = 6500, log_prefix: str = "") -> tuple[dict, OpenAICompatibleClient]: log = lambda m: print(f"{log_prefix}{m}", file=sys.stderr) log("[0/7] 檢測語言...") lang = detect_language(text) log(f" 檢測到語言:{'中文' if lang == 'zh' else '英文'} (lang={lang})") log("[1/7] 預處理 + 關鍵詞召回...") text = clean_text(text) meta = extract_metadata_by_rule(text, lang) ctx = gather_all(text, head_length, tail_length, entities_window, entities_max, analysis_window, analysis_max) log(f" 規則元數據:{meta}") log(f" 召回片段:") for g in ("parties", "reason_object", "judgment_result", "entities", "analysis"): hits_info = f"hits={ctx[f'_{g}_hits']}" if ctx[f'_{g}_hits'] != "0" else "直接截取" log(f" {g:16s} len={len(ctx[g]):5d} {hits_info}") client = OpenAICompatibleClient(model=model, base_url=base_url, api_key=api_key) log("[2/7] 抽取當事人...") parties = extract_parties(client, ctx["parties"], lang) log("[3/7] 抽取事由與標的...") reason_obj = extract_reason_object(client, ctx["reason_object"], lang) log("[4/7] 抽取判決結果...") judgment = extract_judgment_result(client, ctx["judgment_result"], lang) log("[5/7] 抽取涉及實體...") # 實體抽取上下文:當事人片段(含律師名)+ 引用片段 entities_ctx = (ctx["parties"][:2500] + "\n\n[…]\n\n" + ctx["entities"])[:6500] entities = extract_entities(client, entities_ctx, lang) interim_for_summary = { **parties, **reason_obj, **judgment, **entities, "jurisdiction_name": meta["jurisdiction_name"], } log("[6/7] 撰寫判決總結...") summary = extract_summary(client, interim_for_summary, ctx["analysis"], lang) final = { "plaintiff": parties["plaintiff"], "defendant": parties["defendant"], "jurisdiction_code": meta["jurisdiction_code"], "jurisdiction_name": meta["jurisdiction_name"], "case_location": meta["case_location"], "case_reason": reason_obj["case_reason"], "case_object": reason_obj["case_object"], "judgment_result": judgment["judgment_result"], "judgment_summary": summary["judgment_summary"], "involved_entities": entities["involved_entities"], } log("[7/7] 校驗與後處理...") final, warnings = validate_and_fix(final, lang) for w in warnings: log(f" ⚠️ {w}") return final, client # ============================================================================= # 5.5 成本統計:根據 models.json 計費價格計算本次抽取消耗 # ============================================================================= def compute_cost(client: OpenAICompatibleClient, elapsed_seconds: float, profile: dict | None) -> dict: """根據 token 用量、耗時和模型計費價格計算本次抽取成本。 價格單位為「每百萬 token 價格」: input_cost = input_tokens / 1_000_000 * input_price output_cost = output_tokens / 1_000_000 * output_price profile 為 None(未使用 --config)時,價格相關字段為 null。 """ input_tokens = client.total_input_tokens output_tokens = client.total_output_tokens input_price = output_price = None price_unit = None if profile is not None: input_price = float(profile.get("input_price") or 0) output_price = float(profile.get("output_price") or 0) price_unit = profile.get("price_unit") input_cost = output_cost = total_cost = None if input_price is not None and output_price is not None: input_cost = round(input_tokens / 1_000_000 * input_price, 6) output_cost = round(output_tokens / 1_000_000 * output_price, 6) total_cost = round(input_cost + output_cost, 6) return { "config_name": profile.get("Name") if profile else None, "source": profile.get("source") if profile else None, "model": client.model, "elapsed_seconds": round(elapsed_seconds, 3), "num_api_calls": client.num_calls, "input_tokens": input_tokens, "output_tokens": output_tokens, "total_tokens": client.total_tokens, "input_price_per_million": input_price, "output_price_per_million": output_price, "price_unit": price_unit, "input_cost": input_cost, "output_cost": output_cost, "total_cost": total_cost, } # ============================================================================= # 6. YAML 輸出(長字串用 > 折疊;含特殊字符的自動雙引號) # ============================================================================= class FoldedStr(str): """標記為 YAML > 折疊樣式""" def _folded_str_representer(dumper, data): return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=">") def _safe_str_representer(dumper, data): """含 :, #, - 開頭的字符串強制雙引號""" if data and (":" in data or data.startswith("#") or data.startswith("- ")): return dumper.represent_scalar("tag:yaml.org,2002:str", data, style='"') return dumper.represent_scalar("tag:yaml.org,2002:str", data) yaml.add_representer(FoldedStr, _folded_str_representer) yaml.add_representer(str, _safe_str_representer) def to_yaml(result: dict) -> str: if result.get("case_reason"): result["case_reason"] = FoldedStr(result["case_reason"]) if result.get("judgment_summary"): result["judgment_summary"] = FoldedStr(result["judgment_summary"]) return yaml.dump(result, allow_unicode=True, sort_keys=False, default_flow_style=False, width=100) # ============================================================================= # 7. 多模型運行:解析運行規格 + 輸出路徑推導 # ============================================================================= @dataclass class RunSpec: """單個模型的運行規格(標籤 + 連接參數 + 計費配置)""" label: str # 用於日誌前綴與輸出文件名(配置名或模型名) model: str base_url: str api_key: str profile: dict | None = None # models.json 配置(用於成本計算),無則 None def _safe_label(name: str) -> str: """把配置/模型名轉成可安全用於文件名的標籤(如 anthropic/claude → anthropic_claude)""" safe = re.sub(r"[^0-9A-Za-z._-]+", "_", name).strip("_") return safe or "model" def parse_run_specs(args) -> list[RunSpec]: """解析 --config / --model(均支持逗號分隔)為一組運行規格。 優先級: - 有 --config:逐個從 models.json 加載配置(自帶 model/base_url/api_key/價格) - 否則:用 --model(可逗號分隔多個),共用 --base-url / --api-key """ specs: list[RunSpec] = [] if args.config: names = [n.strip() for n in args.config.split(",") if n.strip()] for name in names: profile = load_model_profile(name, args.models_file) specs.append(RunSpec( label=name, model=profile.get("model") or args.model, base_url=profile.get("BaseApiUrl") or args.base_url, api_key=profile.get("ApiKey") or args.api_key, profile=profile, )) else: names = [n.strip() for n in args.model.split(",") if n.strip()] or [args.model] for name in names: specs.append(RunSpec( label=name, model=name, base_url=args.base_url, api_key=args.api_key, profile=None, )) return specs def build_out_path(args, input_path: Path, label: str, multi: bool) -> Path | None: """推導某個模型的 YAML 輸出路徑。 - 單模型:沿用原行為;若使用單個 --config 且指定 --out,則在文件名中插入配置名 - 多模型:在文件名中插入標籤;未給 --out 時用「輸入名_標籤.yaml」 """ include_label = multi or bool(args.config and args.out) if not include_label: return Path(args.out) if args.out else None safe = _safe_label(label) if args.out: base = Path(args.out) return base.with_name(f"{base.stem}_{safe}{base.suffix or '.yaml'}") return input_path.with_name(f"{input_path.stem}_{safe}.yaml") def build_cost_path(out_path: Path | None, input_path: Path, label: str, multi: bool) -> Path: """成本文件路徑:有輸出文件時用「輸出名_cost.json」,否則回退到輸入名。""" if out_path is not None: return out_path.with_name(out_path.stem + "_cost.json") if multi: return input_path.with_name(f"{input_path.stem}_{_safe_label(label)}_cost.json") return input_path.with_name(input_path.stem + "_cost.json") def build_debug_path(args, label: str, multi: bool) -> Path | None: """debug-dump 路徑:多模型時在文件名中插入標籤。""" if not args.debug_dump: return None base = Path(args.debug_dump) if multi: return base.with_name(f"{base.stem}_{_safe_label(label)}{base.suffix or '.json'}") return base # ============================================================================= # CLI # ============================================================================= def main() -> None: ap = argparse.ArgumentParser( description="香港判決書結構化抽取(OpenAI 兼容 API)", epilog=""" 示例用法: # 使用 models.json 中的配置名稱(推薦,省去多個參數) python hk_case_extractor.py case.txt --config openrouter-claude-sonnet --out result.yaml # 成本統計會寫入 result_cost.json # 使用本地 Ollama python hk_case_extractor.py case.txt --model qwen2.5:7b-instruct # 使用 OpenRouter python hk_case_extractor.py case.txt \\ --base-url https://openrouter.ai/api/v1 \\ --model anthropic/claude-3.5-sonnet \\ --api-key your-api-key # 使用 OpenAI python hk_case_extractor.py case.txt \\ --base-url https://api.openai.com/v1 \\ --model gpt-4 \\ --api-key your-api-key # 同時跑多個模型(逗號分隔),並發執行,分別輸出到不同文件 python hk_case_extractor.py case.txt \\ --config openrouter-claude-sonnet,openrouter-gpt4o,ollama-qwen \\ --out result.yaml --cost # 生成 result_openrouter-claude-sonnet.yaml / result_openrouter-gpt4o.yaml / ... # 及各自的 *_cost.json # 多個本地 Ollama 模型共用同一端點 python hk_case_extractor.py case.txt \\ --model qwen2.5:7b-instruct,llama3.1:8b --out result.yaml # 調整截取長度 python hk_case_extractor.py case.txt \\ --head-length 8000 \\ --tail-length 8000 \\ --entities-max 10000 \\ --analysis-max 10000 """, formatter_class=argparse.RawDescriptionHelpFormatter ) ap.add_argument("input", help="判決書文本路徑(.txt 或 .json)") ap.add_argument("--config", default=None, help="models.json 中的配置名稱(Name),可逗號分隔多個以同時運行多個模型," "使用後可省略 --model/--base-url/--api-key") ap.add_argument("--models-file", default=DEFAULT_MODELS_FILE, help=f"模型配置文件路徑(默認:{DEFAULT_MODELS_FILE})") ap.add_argument("--model", default=DEFAULT_MODEL, help=f"模型名稱(默認:{DEFAULT_MODEL}),可逗號分隔多個(共用 --base-url/--api-key)") ap.add_argument("--base-url", default=DEFAULT_BASE_URL, help=f"API base URL(默認:{DEFAULT_BASE_URL})") ap.add_argument("--api-key", default=DEFAULT_API_KEY, help="API key(Ollama 可忽略)") ap.add_argument("--out", default=None, help="輸出 YAML 路徑(單模型默認 stdout);多模型時在文件名中插入標籤") ap.add_argument("--max-workers", type=int, default=0, help="多模型時的並發數(默認 0 = 模型數量;設為 1 則順序執行)") ap.add_argument("--cost", action="store_true", help="輸出成本統計到 {輸出文件名}_cost.json(默認不輸出)") ap.add_argument("--debug-dump", default=None, help="額外輸出原始 JSON 結果到該路徑(多模型時在文件名中插入標籤)") # 截取長度控制參數 ap.add_argument("--head-length", type=int, default=5000, help="開頭截取長度(默認:5000)") ap.add_argument("--tail-length", type=int, default=5000, help="尾部截取長度(默認:5000)") ap.add_argument("--entities-window", type=int, default=400, help="實體關鍵詞窗口半徑(默認:400)") ap.add_argument("--entities-max", type=int, default=6500, help="實體片段最大總長度(默認:6500)") ap.add_argument("--analysis-window", type=int, default=500, help="分析關鍵詞窗口半徑(默認:500)") ap.add_argument("--analysis-max", type=int, default=6500, help="分析片段最大總長度(默認:6500)") args = ap.parse_args() # 解析運行規格(--config / --model 均支持逗號分隔多個) specs = parse_run_specs(args) multi = len(specs) > 1 for spec in specs: if spec.profile is not None: print(f"使用配置 '{spec.label}':model={spec.model}, base_url={spec.base_url}", file=sys.stderr) # 支持从 .json 文件的 content 字段读取 input_path = Path(args.input) if input_path.suffix.lower() == '.json': data = json.loads(input_path.read_text(encoding="utf-8")) text = data.get("content", "") if not text: print("錯誤:JSON 文件中沒有 'content' 字段", file=sys.stderr) sys.exit(1) else: text = input_path.read_text(encoding="utf-8") def run_and_emit(spec: RunSpec) -> str: """跑單個模型並寫出其 YAML / cost / debug 文件,返回輸出路徑描述。""" log_prefix = f"[{spec.label}] " if multi else "" start = time.perf_counter() result, client = run_pipeline( text, spec.model, spec.base_url, spec.api_key, args.head_length, args.tail_length, args.entities_window, args.entities_max, args.analysis_window, args.analysis_max, log_prefix=log_prefix) elapsed = time.perf_counter() - start out_path = build_out_path(args, input_path, spec.label, multi) # 成本統計:僅在 --cost 時輸出到 {輸出文件名}_cost.json if args.cost: cost = compute_cost(client, elapsed, spec.profile) cost_path = build_cost_path(out_path, input_path, spec.label, multi) cost_path.parent.mkdir(parents=True, exist_ok=True) cost_path.write_text(json.dumps(cost, ensure_ascii=False, indent=2), encoding="utf-8") print(f"{log_prefix}💰 成本統計已寫入 {cost_path}:耗時 {cost['elapsed_seconds']}s," f"input={cost['input_tokens']} output={cost['output_tokens']} " f"total_cost={cost['total_cost']} {cost['price_unit'] or ''}", file=sys.stderr) # debug dump(原始 JSON) debug_path = build_debug_path(args, spec.label, multi) if debug_path is not None: debug_path.parent.mkdir(parents=True, exist_ok=True) debug_path.write_text(json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8") # YAML 輸出 yaml_str = to_yaml(result) if out_path is not None: out_path.parent.mkdir(parents=True, exist_ok=True) out_path.write_text(yaml_str, encoding="utf-8") print(f"{log_prefix}✅ 已寫入 {out_path}", file=sys.stderr) return str(out_path) print(yaml_str) return "(stdout)" if multi and args.max_workers != 1: # 並發執行:每個模型一個線程,日誌以 [標籤] 前綴區分 workers = args.max_workers if args.max_workers > 0 else len(specs) print(f"⏳ 同時運行 {len(specs)} 個模型(並發 {workers}):" f"{[s.label for s in specs]}", file=sys.stderr) summary: list[tuple[str, str]] = [] with ThreadPoolExecutor(max_workers=workers) as ex: futures = {ex.submit(run_and_emit, spec): spec for spec in specs} for fut in as_completed(futures): spec = futures[fut] try: summary.append((spec.label, fut.result())) except Exception as e: print(f"[{spec.label}] ❌ 運行失敗:{e}", file=sys.stderr) summary.append((spec.label, f"FAILED: {e}")) print("\n=== 多模型運行結果 ===", file=sys.stderr) for label, out in sorted(summary): print(f" {label:30s} → {out}", file=sys.stderr) else: # 單模型,或多模型但顯式 --max-workers 1 順序執行 for spec in specs: try: run_and_emit(spec) except Exception as e: if not multi: raise print(f"[{spec.label}] ❌ 運行失敗:{e}", file=sys.stderr) if __name__ == "__main__": main()