hklii_samples/hk_case_extractor.py

1608 lines
67 KiB
Python
Raw Blame History

This file contains invisible Unicode characters!

This file contains invisible Unicode characters that may be processed differently from what appears below. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to reveal hidden characters.

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

"""
hk_case_extractor.py
==========================================================
香港判決書結構化字段抽取管線
基於 本地 Ollama 小模型 + 分階段抽取 + JSON Schema 強制 + 校驗重試
設計理念
--------
基於對實際香港判決書結構的分析優化:
1. 預處理:規則去噪 + 切段,純規則抽司法區域/案號等高確定性字段
2. 智能定位:
- 基礎信息當事人、案號直接從開頭2000字符提取
- 判決結果優先從尾部4000字符提取
- 其他字段:使用關鍵詞召回相關段落
3. 分組抽取:拆成 5 次獨立 Ollama 調用,每次只負責 1-3 個字段
4. Schema 強制:用 Ollama 0.5+ 的 format=<JSON Schema> 約束輸出
5. 校驗+重試:對字數、黑名單、結構標註逐項校驗
6. judgment_summary 不從原文重生,而從前 4 步結果 + 一段分析段生成
判決書結構特點(基於實際案例分析)
--------------------------------
- 開頭部分前2000字符
* 案號(如 CACV000175/2000
* 法院名稱和級別
* 當事人信息BETWEEN...AND 格式)
* 案件標題
* 審理日期和法官信息
- 中間部分:
* 案情背景BACKGROUND, INTRODUCTION, 背景, 案情)
* 法律分析和推理
* 證據評估
* 法律原則引用
- 尾部部分後4000字符
* 判決結果JUDGMENT, ORDER, CONCLUSION, 判決, 命令)
* 具體命令和裁定
* 訟費安排
* 法官簽名
依賴
----
pip install requests pyyaml
使用
----
# 使用本地 Ollama默認
python hk_case_extractor.py case.txt
python hk_case_extractor.py case.txt --model qwen2.5:7b-instruct --out result.yaml
# 使用 OpenRouter
python hk_case_extractor.py case.txt \\
--base-url https://openrouter.ai/api/v1 \\
--model anthropic/claude-3.5-sonnet \\
--api-key your-api-key
# 使用 OpenAI
python hk_case_extractor.py case.txt \\
--base-url https://api.openai.com/v1 \\
--model gpt-4 \\
--api-key your-api-key
"""
from __future__ import annotations
import argparse
import json
import re
import sys
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
import requests
import yaml
# =============================================================================
# 配置
# =============================================================================
# Defaults used when no explicit connection settings are supplied.
DEFAULT_BASE_URL = "http://localhost:11434/v1"  # Ollama's default OpenAI-compatible endpoint
DEFAULT_MODEL = "qwen2.5:7b-instruct"
DEFAULT_API_KEY = "ollama"  # Ollama needs no real key, but the API requires one to be supplied
DEFAULT_TIMEOUT = 600  # raised to 10 minutes to accommodate remote servers
MAX_RETRIES = 2  # chat_json_with_retry loops over range(MAX_RETRIES + 1)
DEFAULT_MODELS_FILE = "models.json"  # default path read by load_model_profile
# =============================================================================
# 模型配置models.json按配置名加載 base_url / api_key / model / 計費價格
# =============================================================================
def load_model_profile(name: str, models_file: str = DEFAULT_MODELS_FILE) -> dict:
    """Load one model profile from the models file by its ``Name``.

    The file must hold a JSON array of profile objects::

        [{"Name": ..., "source": ..., "BaseApiUrl": ..., "ApiKey": ...,
          "model": ..., "input_price": ..., "output_price": ..., "price_unit": ...}]

    Args:
        name: Value of the ``Name`` key of the wanted profile.
        models_file: Path of the JSON file holding the profiles.

    Returns:
        The first profile object whose ``Name`` equals ``name``.

    The process exits with status 1 (after printing a message to stderr) when
    the file is missing, is not valid JSON, is not an array, or contains no
    profile with the requested name.
    """
    path = Path(models_file)
    if not path.exists():
        print(f"❌ 找不到模型配置文件：{models_file}", file=sys.stderr)
        sys.exit(1)
    try:
        profiles = json.loads(path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        print(f"❌ 模型配置文件不是有效的 JSON{e}", file=sys.stderr)
        sys.exit(1)
    if not isinstance(profiles, list):
        print("❌ 模型配置文件應為配置對象組成的數組", file=sys.stderr)
        sys.exit(1)
    # Stray non-object entries (strings, numbers, null) are ignored so that
    # they lead to the friendly "not found" message, not an AttributeError.
    candidates = [p for p in profiles if isinstance(p, dict)]
    for profile in candidates:
        if profile.get("Name") == name:
            return profile
    available = [p.get("Name") for p in candidates]
    print(f"❌ 配置 '{name}' 不存在。可用配置：{available}", file=sys.stderr)
    sys.exit(1)
# =============================================================================
# 0. 語言檢測
# =============================================================================
def detect_language(text: str) -> str:
    """Classify the dominant language of ``text`` as ``'zh'`` or ``'en'``.

    Only the first 3000 characters are inspected.  The share of characters in
    the range U+4E00..U+9FFF is measured against the length of the
    whitespace-stripped sample; a share above 30% yields ``'zh'``, anything
    else (including an empty or blank sample) yields ``'en'``.
    """
    head = text[:3000]
    denominator = len(head.strip())
    if not denominator:
        return 'en'
    cjk_count = sum(1 for ch in head if '\u4e00' <= ch <= '\u9fff')
    if cjk_count / denominator > 0.3:
        return 'zh'
    return 'en'
# =============================================================================
# 1. 預處理:去噪 + 切段 + 規則抽元數據
# =============================================================================
# Court code -> full court name in Chinese.  Several codes are aliases that
# map to the same court name (e.g. HKMC / HKMagC, HKLT / HKLDT).
JURISDICTION_MAP_ZH: dict[str, str] = {
    "HKCFA": "香港特別行政區終審法院",
    "HKCA": "香港特別行政區高等法院上訴法庭",
    "HKCFI": "香港特別行政區高等法院原訟法庭",
    "HKDC": "香港特別行政區區域法院",
    "HKMC": "香港特別行政區裁判法院",
    "HKMagC": "香港特別行政區裁判法院",
    "HKSCT": "香港特別行政區小額錢債審裁處",
    "HKLT": "香港特別行政區土地審裁處",
    "HKLDT": "香港特別行政區土地審裁處",
    "HKLD": "香港特別行政區勞資審裁處",
    "HKLAT": "香港特別行政區勞資審裁處",
    "HKCT": "香港特別行政區競爭事務審裁處",
    "HKCorC": "香港特別行政區死因裁判法庭",
    "HKCrC": "香港特別行政區死因裁判法庭",
}
# Court code -> full court name in English; same keys as JURISDICTION_MAP_ZH.
JURISDICTION_MAP_EN: dict[str, str] = {
    "HKCFA": "Court of Final Appeal of the Hong Kong Special Administrative Region",
    "HKCA": "Court of Appeal of the High Court of the Hong Kong Special Administrative Region",
    "HKCFI": "Court of First Instance of the High Court of the Hong Kong Special Administrative Region",
    "HKDC": "District Court of the Hong Kong Special Administrative Region",
    "HKMC": "Magistrates' Courts of the Hong Kong Special Administrative Region",
    "HKMagC": "Magistrates' Courts of the Hong Kong Special Administrative Region",
    "HKSCT": "Small Claims Tribunal of the Hong Kong Special Administrative Region",
    "HKLT": "Lands Tribunal of the Hong Kong Special Administrative Region",
    "HKLDT": "Lands Tribunal of the Hong Kong Special Administrative Region",
    "HKLD": "Labour Tribunal of the Hong Kong Special Administrative Region",
    "HKLAT": "Labour Tribunal of the Hong Kong Special Administrative Region",
    "HKCT": "Competition Tribunal of the Hong Kong Special Administrative Region",
    "HKCorC": "Coroner's Court of the Hong Kong Special Administrative Region",
    "HKCrC": "Coroner's Court of the Hong Kong Special Administrative Region",
}
# Neutral citation such as "[2019] HKCFA 23".
# Groups: 1 = year, 2 = court code, 3 = running number.
# The alternation lists every key of the jurisdiction maps (matching is
# case-insensitive); longer codes come before codes they start with.
NEUTRAL_CITATION_RE = re.compile(
    r"\[(\d{4})\]\s*"
    r"(HKCFA|HKCA|HKCFI|HKDC|HKMagC|HKMC|HKSCT|HKLDT|HKLAT|HKLT|HKLD|HKCT|HKCorC|HKCrC)"
    r"\s*(\d+)",
    re.I,
)
# Case number: a known prefix (captured as group 1), an optional "NO."/"NO",
# a running number, then "OF" or "/" and a four-digit year.  Case-insensitive.
CASE_NO_RE = re.compile(
    r"(FACV|FACC|FAMV|FAMC|CACV|CACC|CAAG|HCA|HCAL|HCMP|HCCW|HCB|DCCJ|DCMP|SCTC|LBTC|LDPD|LDBM|CCDI|WKCC)"
    r"\s*(?:NO\.?)?\s*\d+\s*(?:OF|/|\sof\s)\s*\d{4}",
    re.I,
)
# Case-number prefix -> court code (highest-priority signal for the court)
CASE_NO_PREFIX_MAP: dict[str, str] = {
    "FACV": "HKCFA",  # Final Appeal Civil
    "FACC": "HKCFA",  # Final Appeal Criminal
    "FAMV": "HKCFA",  # Final Appeal Miscellaneous
    "FAMC": "HKCFA",  # Final Appeal Miscellaneous Criminal
    "CACV": "HKCA",  # Court of Appeal Civil
    "CACC": "HKCA",  # Court of Appeal Criminal
    "CAAG": "HKCA",  # Court of Appeal (Administrative)
    "HCA": "HKCFI",  # High Court Action
    "HCAL": "HKCFI",  # High Court Administrative Law
    "HCMP": "HKCFI",  # High Court Miscellaneous Proceedings
    "HCCW": "HKCFI",  # High Court Companies Winding Up
    "HCB": "HKCFI",  # High Court Bankruptcy
    "DCCJ": "HKDC",  # District Court
    "DCMP": "HKDC",  # District Court Miscellaneous Proceedings
    "SCTC": "HKSCT",  # Small Claims Tribunal
    "LBTC": "HKLAT",  # Labour Tribunal
    "LDPD": "HKLAT",  # Labour Tribunal
    "LDBM": "HKLDT",  # Lands Tribunal
    "CCDI": "HKCrC",  # Coroner's Court
    "WKCC": "HKMagC",  # Magistrates' Court
}
def clean_text(raw: str) -> str:
    """Strip pagination noise and normalise whitespace in a judgment text.

    Steps, in order:
      1. remove "Page X of Y" markers (case-insensitive);
      2. blank out lines that consist only of a page number such as "- 12 -";
      3. collapse runs of spaces, tabs and full-width (U+3000) spaces into a
         single ASCII space;
      4. collapse three or more consecutive newlines into exactly two.

    Returns the cleaned text with leading/trailing whitespace removed.
    """
    t = re.sub(r"Page\s+\d+\s+of\s+\d+", "", raw, flags=re.I)
    t = re.sub(r"^\s*-\s*\d+\s*-\s*$", "", t, flags=re.M)
    # The full-width space is written as an explicit escape: an invisible
    # literal character in a pattern is easily lost or swapped by tooling.
    t = re.sub(r"[ \t\u3000]+", " ", t)
    t = re.sub(r"\n{3,}", "\n\n", t)
    return t.strip()
def extract_metadata_by_rule(text: str, lang: str = 'zh') -> dict[str, Any]:
    """Derive court, case number and location from the text by rules only.

    The court is resolved by the first source that succeeds:
      1. the prefix of the first case number found by ``CASE_NO_RE``;
      2. the court code of the first neutral citation in the text;
      3. a court name (full, or with the region name removed) appearing in
         the first 2000 characters, in either language.

    Args:
        text: Judgment text.
        lang: ``'zh'`` selects Chinese names and location; any other value
            selects the English ones.

    Returns:
        A dict with keys ``jurisdiction_code``, ``jurisdiction_name``,
        ``case_location`` and ``case_number``; values that could not be
        determined stay ``None`` (``case_location`` is always the default
        Hong Kong SAR entry).
    """
    is_zh = lang == 'zh'
    jurisdiction_map = JURISDICTION_MAP_ZH if is_zh else JURISDICTION_MAP_EN
    meta: dict[str, Any] = {
        "jurisdiction_code": None,
        "jurisdiction_name": None,
        "case_location": (["香港特別行政區"] if is_zh
                          else ["Hong Kong Special Administrative Region"]),
        "case_number": None,
    }

    def _assign(court_code: str, court_name: Any) -> None:
        meta["jurisdiction_code"] = court_code
        meta["jurisdiction_name"] = court_name

    # 1) Case-number prefix.
    case_match = CASE_NO_RE.search(text)
    if case_match is not None:
        meta["case_number"] = re.sub(r"\s+", " ", case_match.group(0).strip())
        court = CASE_NO_PREFIX_MAP.get(case_match.group(1).upper())
        if court is not None:
            _assign(court, jurisdiction_map.get(court))

    # 2) Neutral citation, only when the case number did not settle the court.
    if not meta["jurisdiction_code"]:
        citation = NEUTRAL_CITATION_RE.search(text)
        if citation is not None:
            wanted = citation.group(2).upper()
            # Map the matched code back to the map's own spelling.
            canonical = next(
                (key for key in jurisdiction_map if key.upper() == wanted), None
            )
            if canonical is not None:
                _assign(canonical, jurisdiction_map[canonical])

    # 3) Court name in the header; restricted to the first 2000 characters
    #    so that courts named in cited cases do not interfere.
    if not meta["jurisdiction_code"]:
        header = text[:2000]
        for code in jurisdiction_map:
            zh_full = JURISDICTION_MAP_ZH.get(code, "")
            en_full = JURISDICTION_MAP_EN.get(code, "")
            candidates = (
                zh_full,
                zh_full.replace("香港特別行政區", ""),
                en_full,
                en_full.replace("Hong Kong Special Administrative Region", "")
                       .replace(" of the ", " "),
            )
            if any(cand and cand in header for cand in candidates):
                _assign(code, jurisdiction_map[code])
                break
    return meta
# -----------------------------------------------------------------------------
# 關鍵詞 + 窗口召回(取代脆弱的正則切段)
# -----------------------------------------------------------------------------
# 思路:每個抽取目標定義一組「高信號關鍵詞」,掃全文取所有命中位置周圍
# ±half_window 字符的窗口,合併重疊後拼接喂給 LLM。
# 不依賴判決書的固定章節標題,對結構各異的香港判決書都能工作。
#
# 優化策略(基於實際案例分析):
# 1. 當事人信息直接從開頭2000字符提取通常在 BETWEEN...AND 結構中)
# 2. 判決結果優先從尾部4000字符提取通常在 JUDGMENT/ORDER/命令 部分)
# 3. 其他字段:使用關鍵詞召回策略
# Keyword lists per extraction target.  Every entry must be a non-empty
# string: an empty keyword would match at every position of the text.
KEYWORD_GROUPS: dict[str, list[str]] = {
    # Call 1: parties - no longer used; the document head is taken directly.
    # Kept only for backward compatibility; gather_all does not read it.
    "parties": [],
    # Call 2: cause of action and subject matter
    "reason_object": [
        # Section-heading style markers
        "案情", "背景", "引言", "事實", "案件背景", "案由",
        "INTRODUCTION", "BACKGROUND", "FACTS", "THE FACTS", "General course",
        # Claim / allegation markers
        "申索", "索償", "訴因", "聲稱", "請求", "本案涉及", "本訴訟",
        "原告聲稱", "申索人聲稱", "申索人指稱", "上訴人指", "答辯人指",
        "Plaintiff", "Claimant", "Appellant", "claim", "allege",
        # Subject-matter keywords
        "賠償", "損失", "損害", "精神困擾", "經濟損失", "醫療費",
        "履行", "所有權", "占有", "撤銷", "宣告", "damages", "compensation",
    ],
    # Call 3: judgment result - keywords no longer used; the tail is taken
    # directly.  Kept only for backward compatibility.
    "judgment_result": [],
    # Call 4: involved entities (judges, lawyers, judges of cited cases)
    "entities": [
        # Judicial titles
        "法官", "大法官", "審裁官", "裁判官", "首席法官", "常任法官", "非常任法官",
        "Hon.", " J.", " JA ", " CJ ", " PJ ", " NPJ ", "Coroner", "Judge",
        # Representation
        "代表", "大律師", "律師", "資深大律師", "代表律師",
        "Counsel", "Solicitor", "instructed by", "represented by",
        # Case citations (judge names tend to appear nearby)
        " v ", " v. ", "訴", "[19", "[20", "HKCFA", "HKCA", "HKCFI",
    ],
    # Call 5: court analysis (core input for the summary)
    "analysis": [
        # Markers of the court's own view
        "本席認為", "本席接納", "本席不接納", "本席同意", "本席不同意",
        "本席裁定", "本席拒絕", "本席認同", "本席考慮",
        "本庭認為", "本庭接納", "本庭裁定", "本庭認同", "本庭考慮",
        "I find", "I accept", "I do not accept", "I conclude", "I consider",
        "The court finds", "In my view", "In my judgment", "The Court held",
        # Legal principles
        "舉證責任", "審慎責任", "鄰人原則", "替代責任", "合理疑點",
        "違反", "侵權", "過失", "negligence", "breach", "duty of care",
        # Evidence assessment
        "證據顯示", "根據證據", "證人證供", "可信", "不可信",
        "evidence shows", "testimony", "credible", "reliable",
    ],
}
def gather_chunks(text: str,
                  keywords: list[str],
                  half_window: int = 500,
                  max_total: int = 6500,
                  case_sensitive: bool = False) -> tuple[str, int]:
    """Collect the text surrounding every keyword occurrence.

    For each occurrence of each keyword a window of ``half_window``
    characters on both sides is taken.  Overlapping or touching windows are
    merged, and the merged windows are joined in document order until
    ``max_total`` characters have been collected (the separators are not
    counted).

    Args:
        text: Text to search.
        keywords: Literal strings to look for; empty entries are ignored.
        half_window: Characters kept before and after each occurrence.
        max_total: Upper bound on the summed length of the returned windows.
        case_sensitive: Match case exactly when true.

    Returns:
        ``(joined_text, occurrence_count)``.  When nothing matches, the first
        ``max_total`` characters of ``text`` are returned with a count of 0.
        An empty ``text`` yields ``("", 0)``.
    """
    if not text:
        return "", 0
    flags = 0 if case_sensitive else re.IGNORECASE
    hits: list[tuple[int, int]] = []
    for kw in keywords:
        if not kw:
            # An empty pattern matches at every position; it would swallow
            # the whole text and disable the no-hit fallback below.
            continue
        for m in re.finditer(re.escape(kw), text, flags=flags):
            start = max(0, m.start() - half_window)
            end = min(len(text), m.end() + half_window)
            hits.append((start, end))
    if not hits:
        return text[:max_total], 0
    # Merge overlapping windows.
    hits.sort()
    merged: list[list[int]] = []
    for start, end in hits:
        if merged and start <= merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    # Join in document order while respecting the length budget.
    pieces: list[str] = []
    total = 0
    for start, end in merged:
        seg_len = end - start
        if total + seg_len > max_total:
            remain = max_total - total
            # A truncated last window is kept only when it is long enough
            # (more than 200 characters) to be useful.
            if remain > 200:
                pieces.append(text[start:start + remain])
            break
        pieces.append(text[start:end])
        total += seg_len
    return "\n\n[…]\n\n".join(pieces), len(hits)
def gather_all(text: str,
               head_length: int = 5000,
               tail_length: int = 5000,
               entities_window: int = 400,
               entities_max: int = 6500,
               analysis_window: int = 500,
               analysis_max: int = 6500) -> dict[str, str]:
    """Build the context snippet for every extraction group.

    Strategy:
      1. ``parties``: the first ``head_length`` characters (no keywords).
      2. ``reason_object``: the first ``head_length`` characters (no keywords).
      3. ``judgment_result``: the first ``head_length`` plus the last
         ``tail_length`` characters; when the two ranges would touch or
         overlap, the whole text is used once instead, so nothing is
         duplicated and the ending is never lost.
      4. ``entities`` / ``analysis``: keyword recall via ``gather_chunks``.

    Args:
        text: Full judgment text.
        head_length: Number of leading characters to take.
        tail_length: Number of trailing characters to take; ``0`` or less
            disables the tail.
        entities_window: Half-window for entity keywords.
        entities_max: Maximum total length of the entity snippet.
        analysis_window: Half-window for analysis keywords.
        analysis_max: Maximum total length of the analysis snippet.

    Returns:
        A dict holding one snippet per group plus ``_<group>_hits`` entries
        (stringified hit counts; ``"0"`` for the directly sliced groups).
    """
    out: dict[str, str] = {}
    head_text = text[:head_length]
    # 1. Parties: sliced straight from the head, no keywords involved.
    out["parties"] = head_text
    out["_parties_hits"] = "0"
    # 2. Cause / subject matter: same head slice.
    out["reason_object"] = head_text
    out["_reason_object_hits"] = "0"
    # 3. Judgment result: head + tail without overlap.
    size = len(text)
    if tail_length <= 0:
        # text[-0:] would be the whole text, so a disabled tail is handled
        # explicitly.
        judgment_ctx = head_text
    elif size <= head_length + tail_length:
        judgment_ctx = text
    else:
        judgment_ctx = head_text + "\n\n[…]\n\n" + text[size - tail_length:]
    out["judgment_result"] = judgment_ctx
    out["_judgment_result_hits"] = "0"  # sliced directly, no keyword hits counted
    # 4. Remaining groups: keyword recall.
    params: dict[str, tuple[int, int]] = {
        "entities": (entities_window, entities_max),
        "analysis": (analysis_window, analysis_max),
    }
    for group, (half_window, max_total) in params.items():
        ctx, hits = gather_chunks(text, KEYWORD_GROUPS[group],
                                  half_window=half_window, max_total=max_total)
        out[group] = ctx
        out[f"_{group}_hits"] = str(hits)
    return out
# =============================================================================
# 2. OpenAI 兼容客戶端:支持 Ollama / OpenRouter / OpenAI 等
# =============================================================================
@dataclass
class OpenAICompatibleClient:
    """Small client for OpenAI-compatible ``/chat/completions`` endpoints.

    Works with:
      - Ollama (http://localhost:11434/v1)
      - OpenRouter (https://openrouter.ai/api/v1)
      - OpenAI (https://api.openai.com/v1)
      - other services exposing the same API

    Token usage reported by the endpoint is accumulated across all calls,
    retries included.
    """

    model: str = DEFAULT_MODEL
    base_url: str = DEFAULT_BASE_URL
    api_key: str = DEFAULT_API_KEY
    timeout: int = DEFAULT_TIMEOUT
    # Cumulative token usage (all calls, retries included).
    total_input_tokens: int = field(default=0, init=False)
    total_output_tokens: int = field(default=0, init=False)
    num_calls: int = field(default=0, init=False)

    @property
    def total_tokens(self) -> int:
        """Return the sum of accumulated input and output tokens."""
        return self.total_input_tokens + self.total_output_tokens

    def chat_json(self, system: str, user: str, schema: dict,
                  temperature: float = 0.0,
                  max_tokens: int = 4096) -> dict:
        """Send one chat request and return the reply parsed as JSON.

        The request asks for ``response_format={"type": "json_object"}``.
        NOTE: ``schema`` is accepted for interface compatibility but is not
        included in the request payload.

        Args:
            system: System prompt.
            user: User prompt.
            schema: JSON Schema describing the expected reply (unused here).
            temperature: Sampling temperature.
            max_tokens: Completion token limit sent to the endpoint.

        Returns:
            The decoded JSON value of the reply content.

        Raises:
            requests.exceptions.RequestException: transport or HTTP error.
            ValueError: the response body is not JSON, has no ``choices``,
                has empty content, or the content is not valid JSON
                (``json.JSONDecodeError`` is a ``ValueError``).
        """
        url = f"{self.base_url.rstrip('/')}/chat/completions"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "temperature": temperature,
            "max_tokens": max_tokens,
            "response_format": {"type": "json_object"},  # OpenAI-compatible JSON mode
        }
        try:
            r = requests.post(url, json=payload, headers=headers, timeout=self.timeout)
            r.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"❌ API 請求失敗：{e}", file=sys.stderr)
            print(f" URL: {url}", file=sys.stderr)
            print(f" Model: {self.model}", file=sys.stderr)
            raise
        try:
            response_data = r.json()
        except ValueError:
            # Covers json.JSONDecodeError and the decode errors raised by
            # older requests versions.
            print("❌ API 響應不是有效的 JSON", file=sys.stderr)
            print(f" 響應狀態碼: {r.status_code}", file=sys.stderr)
            print(f" 響應內容: {r.text[:500]}", file=sys.stderr)
            raise
        # Accumulate token usage (OpenAI-compatible endpoints usually report
        # it in the "usage" field).
        usage = response_data.get("usage") or {}
        self.total_input_tokens += int(usage.get("prompt_tokens", 0) or 0)
        self.total_output_tokens += int(usage.get("completion_tokens", 0) or 0)
        self.num_calls += 1
        if "choices" not in response_data or not response_data["choices"]:
            print("❌ API 響應缺少 choices 字段", file=sys.stderr)
            print(f" 響應數據: {json.dumps(response_data, ensure_ascii=False, indent=2)[:500]}", file=sys.stderr)
            raise ValueError("API 響應格式錯誤：缺少 choices 字段")
        # A missing "message"/"content" is reported as empty content rather
        # than surfacing as a bare KeyError.
        message = response_data["choices"][0].get("message") or {}
        content = message.get("content")
        if not content or not content.strip():
            print("❌ 模型返回空內容", file=sys.stderr)
            print(f" 完整響應: {json.dumps(response_data, ensure_ascii=False, indent=2)[:1000]}", file=sys.stderr)
            raise ValueError("模型返回空內容")
        # Remove a possible markdown code fence or <think> preamble.
        content = self._clean_json_response(content)
        try:
            return json.loads(content)
        except json.JSONDecodeError as e:
            # Print both ends of the content to help debugging.
            print("❌ JSON 解析失敗", file=sys.stderr)
            print(f" 錯誤: {e}", file=sys.stderr)
            print(f" 原始內容前500字符:\n{content[:500]}", file=sys.stderr)
            print(f" 原始內容後500字符:\n{content[-500:]}", file=sys.stderr)
            raise

    def _clean_json_response(self, content: str) -> str:
        """Strip wrappers that models put around the JSON payload.

        Handles, in this order:
          - a leading ``<think>...</think>`` block (any letter case);
          - an opening code fence, with or without a language tag, whether
            or not it is followed by a newline;
          - a closing code fence;
          - surrounding whitespace.
        """
        content = content.strip()
        # Non-greedy and anchored at the start: only a leading block is
        # removed, and DOTALL lets it span several lines.
        think = re.match(r'<think>.*?</think>\s*', content,
                         flags=re.DOTALL | re.IGNORECASE)
        if think:
            content = content[think.end():].strip()
        if content.startswith("```"):
            first_newline = content.find("\n")
            if first_newline != -1:
                # Drop the whole fence line (fence plus optional language tag).
                content = content[first_newline + 1:]
            else:
                # Single-line reply: drop the fence and an optional "json" tag.
                content = re.sub(r'^```(?:json)?', '', content, flags=re.IGNORECASE)
        if content.endswith("```"):
            content = content[:content.rfind("```")]
        return content.strip()

    def chat_json_with_retry(self, system: str, user: str, schema: dict,
                             validator=None, **kw) -> dict:
        """Call :meth:`chat_json`, retrying on errors or failed validation.

        ``validator(result)`` must return ``(ok, hint)``.  When ``ok`` is
        false the hint is appended to the user prompt and the call is
        repeated, up to ``MAX_RETRIES`` extra attempts.

        The outcome is decided by the last attempt: if it raised, that
        exception propagates; if it produced output that still fails
        validation, that output is returned as a best effort.  An exception
        from an earlier attempt no longer overrides a later attempt's output.
        """
        last_err = None
        out = None
        for _attempt in range(MAX_RETRIES + 1):
            try:
                out = self.chat_json(system, user, schema, **kw)
                last_err = None  # this attempt produced output
                if validator is None:
                    return out
                ok, hint = validator(out)
                if ok:
                    return out
                # Feed the validation problem back to the model.
                user = (f"{user}\n\n上次輸出存在問題：{hint}\n"
                        f"請修正後重新輸出。")
            except Exception as e:
                last_err = e
        if last_err is not None:
            raise last_err
        return out  # type: ignore
# =============================================================================
# 3. 五次抽取調用:每次只負責一組字段
# =============================================================================
# --- Call 1: 當事人 ----------------------------------------------------------
# JSON Schema of the Call 1 reply: two arrays of party names.
PARTIES_SCHEMA = {
    "type": "object",
    "properties": {
        "plaintiff": {"type": "array", "items": {"type": "string"}},
        "defendant": {"type": "array", "items": {"type": "string"}},
    },
    "required": ["plaintiff", "defendant"],
}
# Chinese prompts
PARTIES_SYSTEM_ZH = """你是香港法律文書信息抽取助手。
從給定的判決書開頭部分抽取所有當事人完整姓名/機構名。
格式識別:
1. 英文格式BETWEEN ... AND ...
2. 中文格式:申請人 ... 對/訴 答辯人 ...
3. 混合格式Plaintiff ... Defendant ...
分類規則:
- 原告/申索人/上訴人/覆核申請人/Plaintiff/Appellant/Claimant/Applicant → plaintiff
- 被告/答辯人/被上訴人/Defendant/Respondent → defendant
- 保留中英文對照(如有)
- 某類無則輸出空數組
只輸出符合 schema 的 JSON不要解釋。"""
PARTIES_FEWSHOT_ZH = """範例1原告/被告格式):
BETWEEN
陳大文 (CHAN TAI MAN) 上訴人
AND
香港房屋委員會 (Hong Kong Housing Authority) 答辯人
輸出:
{"plaintiff":["陳大文 (CHAN TAI MAN)"],"defendant":["香港房屋委員會 (Hong Kong Housing Authority)"]}
範例2申請人/答辯人格式):
申請人:
李小明
答辯人:
入境事務處處長
輸出:
{"plaintiff":["李小明"],"defendant":["入境事務處處長"]}"""
# English prompts
PARTIES_SYSTEM_EN = """You are a Hong Kong legal document information extraction assistant.
Extract all complete names/organization names of parties from the beginning of the judgment.
Format Recognition:
1. English format: BETWEEN ... AND ...
2. Chinese format: 申請人 ... 對/訴 答辯人 ...
3. Mixed format: Plaintiff ... Defendant ...
Classification Rules:
- Plaintiff/Claimant/Appellant/Applicant/原告/申索人/上訴人/覆核申請人 → plaintiff
- Defendant/Respondent/被告/答辯人/被上訴人 → defendant
- Preserve bilingual names (if any)
- Output empty array if none
Output only JSON conforming to schema, no explanation."""
PARTIES_FEWSHOT_EN = """Example 1 (Plaintiff/Defendant format):
BETWEEN
Dr Paul KI Ping-ki 1st Plaintiff
Hong Kong Washington Company 2nd Plaintiff
AND
Next Magazine Publishing Ltd 1st Defendant
Output:
{"plaintiff":["Dr Paul KI Ping-ki","Hong Kong Washington Company"],"defendant":["Next Magazine Publishing Ltd"]}
Example 2 (Applicant/Respondent format):
Between:
MO YUK PING
Applicant
and
HONG KONG SPECIAL ADMINISTRATIVE REGION
Respondent
Output:
{"plaintiff":["MO YUK PING"],"defendant":["HONG KONG SPECIAL ADMINISTRATIVE REGION"]}"""
def extract_parties(client: OpenAICompatibleClient, context: str, lang: str = 'zh') -> dict:
    """Run Call 1: extract plaintiff and defendant names.

    Only the first 5000 characters of ``context`` are sent.  ``lang`` equal
    to ``'zh'`` selects the Chinese prompts; any other value selects the
    English ones.  Returns whatever ``client.chat_json_with_retry`` returns
    for ``PARTIES_SCHEMA`` (no validator is applied).
    """
    excerpt = context[:5000]
    if lang == 'zh':
        system, fewshot = PARTIES_SYSTEM_ZH, PARTIES_FEWSHOT_ZH
        instruction = "請從以下判決書開頭部分抽取："
    else:
        system, fewshot = PARTIES_SYSTEM_EN, PARTIES_FEWSHOT_EN
        instruction = "Please extract from the following judgment header:"
    user = f"{fewshot}\n\n{instruction}\n```\n{excerpt}\n```"
    return client.chat_json_with_retry(system, user, PARTIES_SCHEMA)
# --- Call 2: 事由 + 標的 ----------------------------------------------------
def get_reason_object_schema(lang: str = 'zh') -> dict:
    """Return the JSON Schema for the cause/subject-matter reply.

    ``case_reason`` is capped at 100 characters for ``'zh'`` and at 200
    characters (twice as many) for every other language value.
    """
    reason_spec = {"type": "string", "maxLength": 200}
    if lang == 'zh':
        reason_spec["maxLength"] = 100
    object_spec = {"type": "array", "items": {"type": "string"}}
    return {
        "type": "object",
        "properties": {
            "case_reason": reason_spec,
            "case_object": object_spec,
        },
        "required": ["case_reason", "case_object"],
    }
# Chinese prompts for Call 2 (cause of action + subject matter)
REASON_OBJECT_SYSTEM_ZH = f"""從香港判決書中抽取:
1. case_reason事由
- 嚴格 ≤100 字,單句,清晰完整
- 結構:[原告身份] + [針對什麼事件/行為] + [向誰] + [提出什麼請求]
- 覆核/上訴案件須註明對哪個裁決提出覆核(含日期/案號)
- 嚴禁包含:判決結果、法庭分析、案發細節、證據評估
- 只描述訴訟的起因和請求,不涉及法庭的判斷
2. case_object標的物
- 訴訟請求指向的實體權利或利益
- 例:汽車、黃金、珠寶、不動產產權、撫養權、人身傷害賠償、合同履行、房產所有權、精神困擾賠償、居留權
- 涉及金錢標的時必須提取,並標明幣種與具體金額(如「拖欠貨款 HK$850,000」「索償 HK$1,000,000」「拖欠租金 HK$120,000」金額未定或待評定者註明「金額待評定」
- 合併本質相同的標的
- 嚴禁:證據材料、程序性訴求(如"要求法庭裁決")、法律條文名稱
用語要求:一律使用規範的法律專業用語(如「申索」「損害賠償」「違約」「侵權」「衡平法濟助」),避免口語化或不準確的表述。
只輸出 JSON。"""
REASON_OBJECT_FEWSHOT_ZH = """範例輸出1人身傷害
{"case_reason":"申索人為商場保安員就被告於2023年7月在商場毆打申索人造成的人身傷害向被告提出損害賠償申索。","case_object":["人身傷害賠償","醫療費用","精神困擾賠償"]}
範例輸出2金錢申索標的物須含具體金額
{"case_reason":"原告就被告未支付2022年買賣合約項下的貨款向被告提出追討欠款的申索。","case_object":["拖欠貨款 HK$850,000","合約利息","訟費"]}"""
# English prompts for Call 2
REASON_OBJECT_SYSTEM_EN = f"""Extract from Hong Kong judgment:
1. case_reason (Cause of Action):
- Strictly ≤200 words, single sentenceclear and complete
- Structure: [Plaintiff's identity] + [regarding what event/conduct] + [against whom] + [what relief sought]
- For judicial review/appeal cases, specify which decision is being challenged (with date/case number)
- MUST NOT include: judgment results, court analysis, incident details, evidence assessment
- Only describe the cause and relief sought, not the court's determination
2. case_object (Subject Matter):
- Tangible rights or interests targeted by the claim
- Examples: vehicle, gold, jewelry, property title, custody, personal injury damages, contract performance, property ownership, distress damages, right of abode
- When a monetary subject matter is involved, it MUST be extracted with currency and the specific amount (e.g., "outstanding goods price HK$850,000", "claim of HK$1,000,000", "arrears of rent HK$120,000"); if the amount is unascertained, note "amount to be assessed"
- Merge essentially identical subjects
- MUST NOT include: evidentiary materials, procedural requests (e.g., "seeking court ruling"), names of statutes
Terminology requirement: consistently use precise, standard legal terminology (e.g., "claim", "damages", "breach of contract", "negligence", "equitable relief"); avoid colloquial or imprecise wording.
Output only JSON."""
REASON_OBJECT_FEWSHOT_EN = """Example Output 1 (personal injury):
{"case_reason":"Plaintiff, a shopping mall security guard, claims damages from defendant for personal injuries caused by defendant's assault at the mall in July 2023.","case_object":["personal injury damages","medical expenses","distress damages"]}
Example Output 2 (monetary claim, subject matter must carry the specific amount):
{"case_reason":"Plaintiff claims against defendant for non-payment of the price of goods supplied under a 2022 sale and purchase contract.","case_object":["outstanding goods price HK$850,000","contractual interest","costs"]}"""
# Monetary amount detection.  Three alternatives, matched case-insensitively:
#   1. a currency prefix (HK$, US$, RMB, MOP or a Chinese currency name)
#      followed by a number with optional thousands separators / decimals;
#   2. a bare "$" followed by a number;
#   3. a number followed by a Chinese amount unit.
MONEY_RE = re.compile(
    r"(?:HK\$|US\$|RMB|MOP|港幣|港元|人民幣|美元|美金)\s*[\d,]+(?:\.\d+)?"
    r"|[\$]\s*[\d,]+(?:\.\d+)?"
    r"|[\d,]+(?:\.\d+)?\s*(?:萬元|億元|元|萬|億)",
    re.I,
)
def _object_has_amount(objs: list[str]) -> bool:
"""case_object 中是否已含具體金額(任一項出現數字即視為已提取金額)"""
return any(re.search(r"\d", o or "") for o in (objs or []))
def _reason_object_validator(out: dict, lang: str = 'zh',
context: str = "") -> tuple[bool, str]:
r = out.get("case_reason", "")
max_length = 100 if lang == 'zh' else 200 # 英文允許 2 倍字符數
target_length = 80 if lang == 'zh' else 160 # 建議壓縮目標
if len(r) > max_length:
if lang == 'zh':
return False, f"case_reason 共 {len(r)} 字,超過 {max_length} 字上限,請壓縮到 {target_length} 字以內。"
else:
return False, f"case_reason has {len(r)} characters, exceeds {max_length} limit, please compress to within {target_length}."
if not out.get("case_object"):
if lang == 'zh':
return False, "case_object 不能為空。"
else:
return False, "case_object cannot be empty."
# 檢查是否包含判決結果性詞彙(嚴禁)
RESULT_KEYWORDS = [
"駁回", "拒絕", "勝訴", "敗訴", "維持", "撤銷", "發還",
"判給", "獲判", "判處", "部分勝訴",
"dismissed", "allowed", "granted", "refused", "upheld", "quashed",
]
for keyword in RESULT_KEYWORDS:
if keyword in r:
if lang == 'zh':
return False, f"case_reason 不應包含判決結果詞彙「{keyword}」,請只描述訴訟起因和請求。"
else:
return False, f"case_reason should not contain judgment result term '{keyword}', describe only cause and relief sought."
# 金錢標的物強制提取:原文出現金錢數額但 case_object 未含金額時要求補充
if context and MONEY_RE.search(context) and not _object_has_amount(out.get("case_object", [])):
if lang == 'zh':
return False, ("原文出現金錢數額。若該數額屬於訴訟標的(如欠款、索償、賠償金額),"
"必須在 case_object 中提取並標明幣種與具體金額(如「拖欠貨款 HK$850,000」"
"若僅為無關引用則可忽略。")
else:
return False, ("Monetary amounts appear in the source. If an amount forms part of the "
"subject matter (e.g., debt, claim, damages), it MUST be extracted in "
"case_object with currency and the specific figure (e.g., \"outstanding "
"goods price HK$850,000\"); ignore only if it is an unrelated citation.")
return True, ""
def extract_reason_object(client: OpenAICompatibleClient, context: str, lang: str = 'zh') -> dict:
    """Run Call 2: extract the cause of action and the subject matter.

    Only the first 5000 characters of ``context`` are sent to the model, and
    the same excerpt is handed to ``_reason_object_validator`` for its
    monetary-amount check.  ``lang`` equal to ``'zh'`` selects the Chinese
    prompts; any other value selects the English ones.

    Returns whatever ``client.chat_json_with_retry`` returns.
    """
    system = REASON_OBJECT_SYSTEM_ZH if lang == 'zh' else REASON_OBJECT_SYSTEM_EN
    fewshot = REASON_OBJECT_FEWSHOT_ZH if lang == 'zh' else REASON_OBJECT_FEWSHOT_EN
    schema = get_reason_object_schema(lang)
    # Slice once so the prompt and the validator see the same excerpt.
    excerpt = context[:5000]
    if lang == 'zh':
        user = (f"{fewshot}\n\n"
                f"請從以下判決書開頭部分抽取：\n```\n{excerpt}\n```")
    else:
        user = (f"{fewshot}\n\n"
                f"Please extract from the following judgment header:\n```\n{excerpt}\n```")
    return client.chat_json_with_retry(
        system, user, schema,
        validator=lambda x: _reason_object_validator(x, lang, excerpt))
# --- Call 3: 判決結果 -------------------------------------------------------
# JSON Schema of the Call 3 reply: a list of {charge, result} objects.
JUDGMENT_RESULT_SCHEMA = {
    "type": "object",
    "properties": {
        "judgment_result": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "charge": {"type": "string"},
                    "result": {"type": "string"},
                },
                "required": ["charge", "result"],
            },
        }
    },
    "required": ["judgment_result"],
}
# Chinese prompts
JUDGMENT_RESULT_SYSTEM_ZH = """從香港判決書尾部的命令/裁定部分抽取所有判決結果。
重要提示:
- 判決結果通常在判決書的最後部分
- 常見標記JUDGMENT, ORDER, CONCLUSION, DISPOSITION, 判決, 命令, 裁定, 頒令
- 可能包含:勝訴/敗訴、具體金額、訟費安排、上訴結果
拆分原則:
- 多項請求 → 分條
- "責任判定""損失/金額計算" 兩個層面 → 必須分條
- 每條 charge 必須以 "(責任問題)""(損失範圍)" 結尾標註層次
- result 必須包含:
a) 明確結果(勝訴/敗訴/部分勝訴/維持/撤銷/駁回/發還等)
b) 2-3 個關鍵法庭理由(如有)
c) 具體金額、利率或命令內容;凡有判給/命令支付的金錢數額,必須原文照錄幣種與金額(如 HK$28,500不得省略或約化
用語要求:一律使用規範的法律專業用語(如「判給」「訟費」「利息」「駁回」「發還重審」),避免口語化或不準確的表述。
只輸出 JSON。"""
JUDGMENT_RESULT_FEWSHOT_ZH = """範例輸出:
{"judgment_result":[
{"charge":"申索人就毆打事件的人身傷害索償 (責任問題)","result":"勝訴。法庭接納申索人證供可信,閉路電視顯示被告先動手,被告亦承認部分情節。"},
{"charge":"醫療費及精神困擾賠償金額 (損失範圍)","result":"部分勝訴。判給醫療費HK$8,500及一般損害賠償HK$20,000合共HK$28,500連同利息及訟費。"}
]}"""
# English prompts
JUDGMENT_RESULT_SYSTEM_EN = """Extract all judgment results from the order/disposition section at the end of Hong Kong judgment.
Important Notes:
- Judgment results are usually at the end of the judgment
- Common markers: JUDGMENT, ORDER, CONCLUSION, DISPOSITION, 判決, 命令, 裁定, 頒令
- May include: success/dismissal, specific amounts, costs arrangements, appeal results
Splitting Principles:
- Multiple claims → separate items
- "Liability determination" vs "Quantum/damages assessment" → must be separate items
- Each charge must end with "(liability issue)" or "(quantum issue)" to mark the level
- result must include:
a) Clear outcome (allowed/dismissed/partially allowed/upheld/quashed/remitted, etc.)
b) 2-3 key court reasons (if any)
c) Specific amounts, interest rates or order details; whenever a sum is awarded/ordered to be paid, the currency and figure MUST be reproduced verbatim (e.g., HK$28,500), never omitted or rounded
Terminology requirement: consistently use precise, standard legal terminology (e.g., "awarded", "costs", "interest", "dismissed", "remitted for retrial"); avoid colloquial or imprecise wording.
Output only JSON."""
JUDGMENT_RESULT_FEWSHOT_EN = """Example Output:
{"judgment_result":[
{"charge":"Plaintiff's claim for personal injury from assault (liability issue)","result":"Allowed. Court accepted plaintiff's testimony as credible, CCTV showed defendant struck first, defendant also admitted parts of the incident."},
{"charge":"Medical expenses and distress damages quantum (quantum issue)","result":"Partially allowed. Awarded medical expenses HK$8,500 and general damages HK$20,000, totaling HK$28,500, with interest and costs."}
]}"""
def _judgment_validator(out: dict | list, lang: str = 'zh') -> tuple[bool, str]:
# 处理模型直接返回列表的情况
if isinstance(out, list):
items = out
else:
items = out.get("judgment_result", [])
if not items:
if lang == 'zh':
return False, "judgment_result 不能為空。"
else:
return False, "judgment_result cannot be empty."
if lang == 'zh':
bad = [i for i in items
if "責任問題" not in i.get("charge", "")
and "損失範圍" not in i.get("charge", "")]
if bad:
return False, (f"{len(bad)} 條 charge 未標註層次。"
f"每條 charge 必須以 '(責任問題)''(損失範圍)' 結尾。")
else:
bad = [i for i in items
if "liability issue" not in i.get("charge", "").lower()
and "quantum issue" not in i.get("charge", "").lower()]
if bad:
return False, (f"{len(bad)} charge items lack level annotation. "
f"Each charge must end with '(liability issue)' or '(quantum issue)'.")
return True, ""
def extract_judgment_result(client: OpenAICompatibleClient, context: str, lang: str = 'zh') -> dict:
    """Run Call 3: extract the judgment results.

    ``context`` is sent unsliced.  ``lang`` equal to ``'zh'`` selects the
    Chinese prompts; any other value selects the English ones.  The reply is
    checked with ``_judgment_validator``; a bare list reply is wrapped as
    ``{"judgment_result": [...]}`` before being returned.
    """
    if lang == 'zh':
        system, fewshot = JUDGMENT_RESULT_SYSTEM_ZH, JUDGMENT_RESULT_FEWSHOT_ZH
        instruction = "請從以下判決書片段開頭5000字符 + 尾部5000字符抽取"
    else:
        system, fewshot = JUDGMENT_RESULT_SYSTEM_EN, JUDGMENT_RESULT_FEWSHOT_EN
        instruction = ("Please extract from the following judgment segments "
                       "(first 5000 + last 5000 characters):")
    user = f"{fewshot}\n\n{instruction}\n```\n{context}\n```"

    def _check(reply):
        return _judgment_validator(reply, lang)

    result = client.chat_json_with_retry(system, user,
                                         JUDGMENT_RESULT_SCHEMA,
                                         validator=_check)
    # Normalise a bare list to the standard object shape.
    return {"judgment_result": result} if isinstance(result, list) else result
# --- Call 4: 涉及實體 -------------------------------------------------------
# JSON Schema of the Call 4 reply: a list of {entity_name, reason} objects.
ENTITIES_SCHEMA = {
    "type": "object",
    "properties": {
        "involved_entities": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "entity_name": {"type": "string"},
                    "reason": {"type": "string"},
                },
                "required": ["entity_name", "reason"],
            },
        }
    },
    "required": ["involved_entities"],
}
# Chinese prompts
ENTITIES_SYSTEM_ZH = """從香港判決書中抽取所有相關實體(自然人/法人/組織/機構)。
必須包含:
- 主審法官 / 審裁官 / 裁判官(通常在判決書開頭或結尾署名)
- 雙方代表律師、大律師(通常在判決書結尾的 Representation 部分)
- 判決中引用的先例所提及的法官
reason 須寫明在XX案[案號]中擔任XX職位闡述XX法律原則
- 涉案的政府部門、公司、機構(如:入境事務處處長、律政司司長)
嚴禁包含:
- 法案/條例名如《侵權條例》、Cap.xxx、《基本法》
- 純案例名稱(如 Donoghue v Stevenson
- 文獻、期刊名
用語要求reason 一律使用規範的法律專業用語(如「主審」「闡述」「先例」「判詞」),避免口語化或不準確的表述。
只輸出 JSON。"""
ENTITIES_FEWSHOT_ZH = """範例輸出:
{"involved_entities":[
{"entity_name":"林希維審裁官","reason":"本案主審審裁官,負責認定事實及裁決。"},
{"entity_name":"終審法院常任法官李義","reason":"在Tang Kwok Wah v HKSAR [2019] HKCFA 23 中擔任主筆法官闡述舉證責任原則本案第34段引用其判詞。"},
{"entity_name":"康樂文化事務署","reason":"涉案場所通州街公園的管理機構。"}
]}"""
# English prompts
ENTITIES_SYSTEM_EN = """Extract all relevant entities (natural persons/legal persons/organizations/institutions) from Hong Kong judgment.
Must include:
- Presiding judge/adjudicator/magistrate (usually signed at beginning or end of judgment)
- Counsel/barristers representing both parties (usually in Representation section at end)
- Judges mentioned in cited precedents
reason must specify: served as XX position in XX case [case number], articulated XX legal principle
- Government departments, companies, institutions involved (e.g., Director of Immigration, Secretary for Justice)
MUST NOT include:
- Statute/ordinance names (e.g., Tort Ordinance, Cap.xxx, Basic Law)
- Pure case names (e.g., Donoghue v Stevenson)
- Literature, journal names
Terminology requirement: write each reason in precise, standard legal terminology (e.g., "presiding", "articulated", "precedent", "judgment"); avoid colloquial or imprecise wording.
Output only JSON."""
ENTITIES_FEWSHOT_EN = """Example Output:
{"involved_entities":[
{"entity_name":"Hon Leong JA","reason":"Presiding judge in this case, responsible for fact-finding and adjudication."},
{"entity_name":"Chief Justice Li","reason":"Served as lead judge in Tang Kwok Wah v HKSAR [2019] HKCFA 23, articulated burden of proof principles, cited in paragraph 34 of this judgment."},
{"entity_name":"Leisure and Cultural Services Department","reason":"Management authority of Tung Chau Street Park, the incident location."}
]}"""
def _entities_validator(out: dict, lang: str = 'zh') -> tuple[bool, str]:
    """Validate the ``involved_entities`` payload produced by the model.

    The payload is rejected when the list is missing/empty, or when any
    ``entity_name`` looks like a statute, publication or case name (it
    contains a keyword from ``ENTITY_NAME_BLACKLIST`` or a case-name
    separator such as `` v `` / ``訴``).

    Args:
        out: Parsed JSON object returned by the model.
        lang: ``'zh'`` selects Chinese feedback messages; any other value
            selects English ones.

    Returns:
        ``(True, "")`` when the payload is acceptable, otherwise
        ``(False, message)`` where ``message`` explains the rejection and
        lists up to three offending names.
    """
    ents = out.get("involved_entities") or []
    if not ents:
        if lang == 'zh':
            return False, "involved_entities 不能為空,至少要有主審法官。"
        return False, "involved_entities cannot be empty, must include at least the presiding judge."

    # Separators that mark a case name ("A v B", "甲 訴 乙").
    # NOTE(review): "訴" is also part of words such as 上訴/訴訟, so a name like
    # 上訴法庭 is rejected as well - confirm that this is intended.
    case_markers = (" v ", " v. ", "訴", " vs ", " vs. ")
    # An empty keyword is a substring of every string and would flag every
    # entity, so blank blacklist entries are ignored.
    keywords = [k for k in ENTITY_NAME_BLACKLIST if k]

    bad: list[str] = []
    for e in ents:
        # A null entity_name is treated as an empty name instead of crashing.
        name = e.get("entity_name") or ""
        flagged = (any(k in name for k in keywords)
                   or any(m in name for m in case_markers))
        # Report each offending name once, even if it trips both checks.
        if flagged and name not in bad:
            bad.append(name)

    if bad:
        if lang == 'zh':
            return False, f"以下實體疑為條例/法案/案例名稱,應移除:{bad[:3]}"
        return False, f"Following entities appear to be statutes/acts/case names, should be removed: {bad[:3]}"
    return True, ""
def extract_entities(client: OpenAICompatibleClient, context: str, lang: str = 'zh') -> dict:
    """Ask the model for the entities involved in the judgment.

    The user message is the language-specific few-shot example followed by
    an instruction line and the first 6500 characters of ``context`` inside
    a fenced block. The reply is validated with ``_entities_validator``.

    Args:
        client: Chat client exposing ``chat_json_with_retry``.
        context: Concatenated text snippets to extract entities from.
        lang: ``'zh'`` for the Chinese prompts, anything else for English.

    Returns:
        Whatever ``client.chat_json_with_retry`` returns for this request.
    """
    if lang == 'zh':
        system, fewshot = ENTITIES_SYSTEM_ZH, ENTITIES_FEWSHOT_ZH
        instruction = "請從以下片段(多處關鍵詞召回拼接)抽取所有涉及實體:"
    else:
        system, fewshot = ENTITIES_SYSTEM_EN, ENTITIES_FEWSHOT_EN
        instruction = ("Please extract all involved entities from the "
                       "following segments (keyword-based retrieval):")

    # Only the first 6500 characters of the context are sent to the model.
    excerpt = context[:6500]
    user = f"{fewshot}\n\n{instruction}\n```\n{excerpt}\n```"

    def check(candidate: dict) -> tuple[bool, str]:
        # Bind the language so feedback messages match the prompt language.
        return _entities_validator(candidate, lang)

    return client.chat_json_with_retry(system, user, ENTITIES_SCHEMA,
                                       validator=check)
# --- Call 5: judgment summary (built from the already-extracted results plus
# --- an analysis excerpt, not regenerated from the full text) ---------------
def get_summary_schema(lang: str = 'zh') -> dict:
    """Return the JSON schema for the summary call in the given language.

    The ``judgment_summary`` string is capped at 300 characters for ``'zh'``
    and at 500 characters for every other language value.
    """
    if lang == 'zh':
        limit = 300
    else:
        limit = 500  # English is allowed roughly 1.67x as many characters
    summary_field = {"type": "string", "maxLength": limit}
    schema = {"type": "object"}
    schema["properties"] = {"judgment_summary": summary_field}
    schema["required"] = ["judgment_summary"]
    return schema
# Chinese system prompt for the summary call; extract_summary() uses it when
# lang == 'zh'.
SUMMARY_SYSTEM_ZH = """根據已抽取的結構化字段 + 法庭分析段,撰寫判決總結。
四要素結構(必須全部涵蓋,連貫成單段):
(1) 案件背景1-2 句交代起因與當事人關係
(2) 核心爭議焦點
(3) 法庭法律分析與推理(核心重點):
- 如何評估證據?
- 接受 / 拒絕主張的邏輯?
- 引用了哪些關鍵法律或判例?
(4) 最終裁決結果及命令
**重要judgment_summary 必須使用中文撰寫。**
用語要求:一律使用規範的法律專業用語;涉及金錢的判給或標的,須保留具體金額(含幣種)。
嚴格 ≤300 字。只輸出 JSON。"""
# English system prompt for the summary call; extract_summary() uses it for
# every lang value other than 'zh'.
SUMMARY_SYSTEM_EN = """Based on extracted structured fields + court analysis section, write judgment summary.
Four-element structure (must cover all, in coherent single paragraph):
(1) Case background: 1-2 sentences on cause and parties' relationship
(2) Core issues in dispute
(3) Court's legal analysis and reasoning (core focus):
- How was evidence assessed?
- Logic for accepting/rejecting claims?
- What key laws or precedents were cited?
(4) Final judgment and orders
**IMPORTANT: judgment_summary MUST be written in English.**
Terminology requirement: consistently use precise, standard legal terminology; preserve specific monetary figures (with currency) for any award or subject matter involving money.
Strictly ≤500 characters. Output only JSON."""
def _summary_validator(out: dict, lang: str = 'zh') -> tuple[bool, str]:
    """Validate the ``judgment_summary`` returned by the model.

    Checks, in order: maximum length (300 for ``'zh'``, otherwise 500),
    minimum length (80 for ``'zh'``, otherwise 120) and that the text is
    written in the requested language (share of CJK ideographs below 0.3
    is rejected for ``'zh'``, above 0.3 is rejected for ``'en'``).

    The maximum is measured on the raw string. The minimum and the language
    ratio are measured on the stripped text, so whitespace padding cannot
    satisfy the minimum length. A missing or non-string value is treated as
    an empty summary.

    Args:
        out: Parsed JSON object returned by the model.
        lang: Target language; selects limits and feedback language.

    Returns:
        ``(True, "")`` when valid, otherwise ``(False, message)``.
    """
    raw = out.get("judgment_summary")
    s = raw if isinstance(raw, str) else ""
    text = s.strip()
    max_length = 300 if lang == 'zh' else 500  # English gets ~1.67x the characters
    min_length = 80 if lang == 'zh' else 120   # minimum scaled up accordingly

    if len(s) > max_length:
        if lang == 'zh':
            return False, f"summary 共 {len(s)} 字,超過 {max_length} 字上限,請壓縮。"
        return False, f"summary has {len(s)} characters, exceeds {max_length} limit, please compress."

    # Measured on the stripped text so a blank or padded reply is rejected.
    if len(text) < min_length:
        if lang == 'zh':
            return False, "summary 過短,請完整覆蓋四要素。"
        return False, "summary too short, please cover all four elements."

    # Language check; ``text`` is non-empty here because it passed the
    # minimum-length test above.
    chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
    chinese_ratio = chinese_chars / len(text)
    if lang == 'zh' and chinese_ratio < 0.3:
        return False, "judgment_summary 必須使用中文撰寫,但檢測到主要為英文內容,請用中文重寫。"
    if lang == 'en' and chinese_ratio > 0.3:
        return False, "judgment_summary MUST be written in English, but detected primarily Chinese content. Please rewrite in English."
    return True, ""
def extract_summary(client: OpenAICompatibleClient,
                    prior: dict, analysis: str, lang: str = 'zh') -> dict:
    """Ask the model to write ``judgment_summary`` from earlier results.

    The user message contains ``prior`` serialized as pretty-printed JSON,
    the first 3500 characters of ``analysis`` and a closing instruction,
    all in the language selected by ``lang``. The reply is constrained by
    ``get_summary_schema(lang)`` and checked with ``_summary_validator``.

    Args:
        client: Chat client exposing ``chat_json_with_retry``.
        prior: Fields extracted by the previous calls.
        analysis: Text of the court's analysis section.
        lang: ``'zh'`` for Chinese prompts, anything else for English.

    Returns:
        Whatever ``client.chat_json_with_retry`` returns for this request.
    """
    is_zh = lang == 'zh'
    system = SUMMARY_SYSTEM_ZH if is_zh else SUMMARY_SYSTEM_EN
    schema = get_summary_schema(lang)

    fields_json = json.dumps(prior, ensure_ascii=False, indent=2)
    excerpt = analysis[:3500]

    if is_zh:
        fields_label = "已抽取的字段:"
        analysis_label = "法庭分析節選:"
        closing = "請按四要素撰寫 ≤300 字的 judgment_summary。"
    else:
        fields_label = "Extracted fields:"
        analysis_label = "Court analysis excerpt:"
        closing = "Please write judgment_summary ≤500 characters covering four elements."

    # One line per element; fenced blocks delimit the JSON and the excerpt.
    user = "\n".join([
        fields_label,
        "```json",
        fields_json,
        "```",
        analysis_label,
        "```",
        excerpt,
        "```",
        closing,
    ])

    def check(candidate: dict) -> tuple[bool, str]:
        return _summary_validator(candidate, lang)

    return client.chat_json_with_retry(system, user, schema, validator=check)
# =============================================================================
# 4. Global validation and post-processing
# =============================================================================
# Substrings that mark a case_location entry as a court, venue or building
# rather than a place; validate_and_fix() drops entries containing any of them.
# Entries must be non-empty: "" is a substring of every string and would
# remove every location.
# NOTE(review): two entries in this list were empty strings and have been
# removed; the keywords they were meant to hold could not be recovered -
# re-add them here if known.
LOCATION_BLACKLIST = [
    "法院", "法庭", "審裁處", "公園", "大廈", "大樓", "商場",
    "道路", "中心", "醫院", "酒店", "車站",
]
# Substrings that mark an entity name as a statute, publication or case
# citation rather than a person/organisation. Entries must be non-empty for
# the same reason as above.
ENTITY_NAME_BLACKLIST = [
    "條例", "Cap.", "法案", "案例彙編", "Reports",
    "期刊", "Journal",
    # Case-name separators ("A v B", "甲 訴 乙")
    " v ", " v. ", "訴", " vs ", " vs. ",
    "HKCFAR", "HKCFA", "HKCA", "HKCFI",  # avoid treating case citations as entities
]
def validate_and_fix(result: dict, lang: str = 'zh') -> tuple[dict, list[str]]:
    """Clean the merged extraction result and collect review warnings.

    ``result`` is updated in place (``case_location`` and
    ``involved_entities`` are replaced by their cleaned versions) and also
    returned.

    Steps:
      * ``case_location``: drop empty entries and entries containing a
        ``LOCATION_BLACKLIST`` keyword; ensure "香港特別行政區" is present.
      * ``case_reason`` / ``judgment_summary``: warn (never truncate) when
        longer than the per-language limit.
      * ``involved_entities``: drop entries whose name contains an
        ``ENTITY_NAME_BLACKLIST`` keyword.
      * ``judgment_result``: warn for items whose ``charge`` carries neither
        the "責任問題" nor the "損失範圍" label.
      * warn about key fields that are empty.

    Args:
        result: Final result dictionary assembled by the pipeline.
        lang: ``'zh'`` selects the tighter length limits (100/300); any
            other value uses 200/500.

    Returns:
        ``(result, warnings)``.
    """
    warnings: list[str] = []

    # "" is a substring of every string; a blank blacklist entry would
    # otherwise remove every location and every entity.
    location_keywords = [b for b in LOCATION_BLACKLIST if b]
    entity_keywords = [k for k in ENTITY_NAME_BLACKLIST if k]

    # case_location: remove courts / venues / buildings.
    locs = result.get("case_location") or []
    cleaned = [loc for loc in locs
               if loc and not any(b in loc for b in location_keywords)]
    if "香港特別行政區" not in cleaned:
        cleaned.insert(0, "香港特別行政區")
    # Only report a clean-up when something was actually removed; adding the
    # default region alone must not produce a "removed set()" message.
    removed = set(locs) - set(cleaned)
    if removed:
        warnings.append(f"case_location 已清理:移除 {removed}")
    result["case_location"] = cleaned

    # Length checks (warn only, never truncate). Null values count as empty.
    reason_max = 100 if lang == 'zh' else 200
    summary_max = 300 if lang == 'zh' else 500
    reason_len = len(result.get("case_reason") or "")
    if reason_len > reason_max:
        warnings.append(f"⚠️ case_reason 共 {reason_len} 字,超過建議上限 {reason_max}")
    summary_len = len(result.get("judgment_summary") or "")
    if summary_len > summary_max:
        warnings.append(f"⚠️ judgment_summary 共 {summary_len} 字,超過建議上限 {summary_max}")

    # involved_entities: remove statutes / publications / case names.
    ents = result.get("involved_entities") or []
    cleaned_ents = []
    for ent in ents:
        name = ent.get("entity_name") or ""
        if not any(k in name for k in entity_keywords):
            cleaned_ents.append(ent)
    if len(cleaned_ents) != len(ents):
        warnings.append(
            f"involved_entities 移除 {len(ents) - len(cleaned_ents)} 條疑似條例/文獻")
    result["involved_entities"] = cleaned_ents

    # judgment_result: flag items without a layer label.
    # NOTE(review): the labels are checked in Chinese for every lang value -
    # confirm what English-language results are expected to contain.
    for jr in result.get("judgment_result") or []:
        charge = jr.get("charge") or ""
        if "責任問題" not in charge and "損失範圍" not in charge:
            warnings.append(f"judgment_result 條目缺層次標註:{charge[:40]}")

    # Empty-field warnings.
    for k in ("plaintiff", "defendant", "case_object",
              "judgment_result", "involved_entities"):
        if not result.get(k):
            warnings.append(f"{k} 為空,請人工複核")
    return result, warnings
# =============================================================================
# 5. Main pipeline
# =============================================================================
def run_pipeline(text: str, model: str, base_url: str, api_key: str,
                 head_length: int = 5000,
                 tail_length: int = 5000,
                 entities_window: int = 400,
                 entities_max: int = 6500,
                 analysis_window: int = 500,
                 analysis_max: int = 6500) -> tuple[dict, OpenAICompatibleClient]:
    """Run the whole extraction pipeline on one judgment text.

    Stages (progress is logged to stderr): language detection, cleaning and
    rule-based metadata, context gathering, four extraction calls (parties,
    reason/object, judgment result, entities), the summary call, and the
    final validation pass.

    Args:
        text: Raw judgment text.
        model, base_url, api_key: Passed to ``OpenAICompatibleClient``.
        head_length, tail_length, entities_window, entities_max,
        analysis_window, analysis_max: Forwarded to ``gather_all``.

    Returns:
        ``(final_result, client)``; the client is returned so the caller can
        read its usage counters.
    """
    def log(message) -> None:
        print(message, file=sys.stderr)

    log("[0/7] 檢測語言...")
    lang = detect_language(text)
    lang_label = '中文' if lang == 'zh' else '英文'
    log(f" 檢測到語言:{lang_label} (lang={lang})")

    log("[1/7] 預處理 + 關鍵詞召回...")
    text = clean_text(text)
    meta = extract_metadata_by_rule(text, lang)
    ctx = gather_all(text, head_length, tail_length,
                     entities_window, entities_max,
                     analysis_window, analysis_max)
    log(f" 規則元數據:{meta}")
    log(f" 召回片段:")
    groups = ("parties", "reason_object", "judgment_result",
              "entities", "analysis")
    for group in groups:
        hits = ctx[f"_{group}_hits"]
        # A hit count of "0" means the snippet was taken by plain slicing.
        if hits != "0":
            hits_info = f"hits={hits}"
        else:
            hits_info = "直接截取"
        log(f" {group:16s} len={len(ctx[group]):5d} {hits_info}")

    client = OpenAICompatibleClient(model=model, base_url=base_url, api_key=api_key)

    log("[2/7] 抽取當事人...")
    parties = extract_parties(client, ctx["parties"], lang)

    log("[3/7] 抽取事由與標的...")
    reason_obj = extract_reason_object(client, ctx["reason_object"], lang)

    log("[4/7] 抽取判決結果...")
    judgment = extract_judgment_result(client, ctx["judgment_result"], lang)

    log("[5/7] 抽取涉及實體...")
    # Entity context = parties snippet (contains counsel names) + citation
    # snippets. NOTE(review): the combined text is capped at 6500 characters
    # here regardless of the entities_max argument.
    parties_head = ctx["parties"][:2500]
    combined = parties_head + "\n\n[…]\n\n" + ctx["entities"]
    entities_ctx = combined[:6500]
    entities = extract_entities(client, entities_ctx, lang)

    # Merge the earlier outputs in call order, then add the rule-based
    # jurisdiction name.
    interim_for_summary: dict = {}
    for part in (parties, reason_obj, judgment, entities):
        interim_for_summary.update(part)
    interim_for_summary["jurisdiction_name"] = meta["jurisdiction_name"]

    log("[6/7] 撰寫判決總結...")
    summary = extract_summary(client, interim_for_summary, ctx["analysis"], lang)

    final = dict(
        plaintiff=parties["plaintiff"],
        defendant=parties["defendant"],
        jurisdiction_code=meta["jurisdiction_code"],
        jurisdiction_name=meta["jurisdiction_name"],
        case_location=meta["case_location"],
        case_reason=reason_obj["case_reason"],
        case_object=reason_obj["case_object"],
        judgment_result=judgment["judgment_result"],
        judgment_summary=summary["judgment_summary"],
        involved_entities=entities["involved_entities"],
    )

    log("[7/7] 校驗與後處理...")
    final, warnings = validate_and_fix(final, lang)
    for warning in warnings:
        log(f" ⚠️ {warning}")
    return final, client
# =============================================================================
# 5.5 Cost accounting: compute this run's spend from the models.json pricing
# =============================================================================
def compute_cost(client: OpenAICompatibleClient,
                 elapsed_seconds: float,
                 profile: dict | None) -> dict:
    """Build the cost report for one extraction run.

    Prices are expressed per one million tokens::

        input_cost  = input_tokens  / 1_000_000 * input_price
        output_cost = output_tokens / 1_000_000 * output_price

    Args:
        client: Client whose ``total_input_tokens``, ``total_output_tokens``,
            ``total_tokens``, ``num_calls`` and ``model`` attributes are read.
        elapsed_seconds: Wall-clock duration of the run.
        profile: Model profile, or ``None`` when ``--config`` was not used;
            in that case every price/cost field in the report is ``None``.

    Returns:
        A dictionary with usage, pricing and cost fields.
    """
    tokens_in = client.total_input_tokens
    tokens_out = client.total_output_tokens

    if profile is None:
        price_in = price_out = unit = None
        cost_in = cost_out = cost_total = None
    else:
        # A missing or falsy price is treated as 0.
        price_in = float(profile.get("input_price") or 0)
        price_out = float(profile.get("output_price") or 0)
        unit = profile.get("price_unit")
        cost_in = round(tokens_in / 1_000_000 * price_in, 6)
        cost_out = round(tokens_out / 1_000_000 * price_out, 6)
        cost_total = round(cost_in + cost_out, 6)

    report = {
        "config_name": profile.get("Name") if profile else None,
        "source": profile.get("source") if profile else None,
        "model": client.model,
        "elapsed_seconds": round(elapsed_seconds, 3),
        "num_api_calls": client.num_calls,
        "input_tokens": tokens_in,
        "output_tokens": tokens_out,
        "total_tokens": client.total_tokens,
        "input_price_per_million": price_in,
        "output_price_per_million": price_out,
        "price_unit": unit,
        "input_cost": cost_in,
        "output_cost": cost_out,
        "total_cost": cost_total,
    }
    return report
# =============================================================================
# 6. YAML output (long strings use the folded ">" style; strings containing
#    special characters are double-quoted automatically)
# =============================================================================
class FoldedStr(str):
    """Marker ``str`` subclass for values to be dumped in YAML folded (``>``) style."""
def _folded_str_representer(dumper, data):
    """Represent ``data`` as a YAML string scalar in folded (``>``) style."""
    str_tag = "tag:yaml.org,2002:str"
    node = dumper.represent_scalar(str_tag, data, style=">")
    return node
def _safe_str_representer(dumper, data):
    """Represent a string, forcing double quotes when it could be misread.

    Double-quoted style is requested for non-empty strings that contain
    ``:`` or start with ``#`` or ``- ``; every other string is emitted with
    the dumper's default style.
    """
    str_tag = "tag:yaml.org,2002:str"
    needs_quotes = bool(data) and (
        ":" in data or data.startswith(("#", "- "))
    )
    if not needs_quotes:
        return dumper.represent_scalar(str_tag, data)
    return dumper.represent_scalar(str_tag, data, style='"')
# Register the custom representers with PyYAML at import time: FoldedStr
# values use the folded-style representer, str values the quoting-aware one.
yaml.add_representer(FoldedStr, _folded_str_representer)
yaml.add_representer(str, _safe_str_representer)
def to_yaml(result: dict) -> str:
    """Serialize the extraction result to a YAML document.

    Non-empty ``case_reason`` and ``judgment_summary`` values are wrapped in
    ``FoldedStr`` so they are emitted in folded style. The wrapping is done
    on a shallow copy, so the caller's dictionary is left untouched and
    keeps plain ``str`` values.

    Args:
        result: Final result dictionary; key order is preserved in the output.

    Returns:
        The YAML text.
    """
    # Shallow copy: only top-level values are replaced, so this is enough to
    # avoid mutating the caller's dict while keeping its key order.
    doc = dict(result)
    for key in ("case_reason", "judgment_summary"):
        if doc.get(key):
            doc[key] = FoldedStr(doc[key])
    return yaml.dump(doc, allow_unicode=True, sort_keys=False,
                     default_flow_style=False, width=100)
# =============================================================================
# CLI
# =============================================================================
def main() -> None:
    """Command-line entry point.

    Parses the arguments, resolves the model settings (``--config`` profile
    values take precedence, falling back to the individual flags), reads the
    judgment text from a ``.txt`` file or from the ``content`` field of a
    ``.json`` file, runs the pipeline and writes the YAML result to ``--out``
    or stdout. Optionally writes a cost report (``--cost``) and a raw JSON
    dump (``--debug-dump``).

    Exits with status 1 when a ``.json`` input has no usable ``content``.
    """
    ap = argparse.ArgumentParser(
        description="香港判決書結構化抽取OpenAI 兼容 API",
        epilog="""
示例用法:
# 使用 models.json 中的配置名稱(推薦,省去多個參數)
python hk_case_extractor.py case.txt --config openrouter-claude-sonnet --out result.yaml
# 成本統計會寫入 result_cost.json
# 使用本地 Ollama
python hk_case_extractor.py case.txt --model qwen2.5:7b-instruct
# 使用 OpenRouter
python hk_case_extractor.py case.txt \\
--base-url https://openrouter.ai/api/v1 \\
--model anthropic/claude-3.5-sonnet \\
--api-key your-api-key
# 使用 OpenAI
python hk_case_extractor.py case.txt \\
--base-url https://api.openai.com/v1 \\
--model gpt-4 \\
--api-key your-api-key
# 調整截取長度
python hk_case_extractor.py case.txt \\
--head-length 8000 \\
--tail-length 8000 \\
--entities-max 10000 \\
--analysis-max 10000
""",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    ap.add_argument("input", help="判決書文本路徑(.txt 或 .json")
    ap.add_argument("--config", default=None,
                    help="models.json 中的配置名稱Name"
                         "使用後可省略 --model/--base-url/--api-key")
    ap.add_argument("--models-file", default=DEFAULT_MODELS_FILE,
                    help=f"模型配置文件路徑(默認:{DEFAULT_MODELS_FILE}")
    ap.add_argument("--model", default=DEFAULT_MODEL,
                    help=f"模型名稱(默認:{DEFAULT_MODEL}")
    ap.add_argument("--base-url", default=DEFAULT_BASE_URL,
                    help=f"API base URL默認{DEFAULT_BASE_URL}")
    ap.add_argument("--api-key", default=DEFAULT_API_KEY,
                    help="API keyOllama 可忽略)")
    ap.add_argument("--out", default=None, help="輸出 YAML 路徑(默認 stdout")
    ap.add_argument("--cost", action="store_true",
                    help="輸出成本統計到 {輸出文件名}_cost.json默認不輸出")
    ap.add_argument("--debug-dump", default=None,
                    help="額外輸出原始 JSON 結果到該路徑(便於 diff")
    # Snippet-length controls
    ap.add_argument("--head-length", type=int, default=5000,
                    help="開頭截取長度默認5000")
    ap.add_argument("--tail-length", type=int, default=5000,
                    help="尾部截取長度默認5000")
    ap.add_argument("--entities-window", type=int, default=400,
                    help="實體關鍵詞窗口半徑默認400")
    ap.add_argument("--entities-max", type=int, default=6500,
                    help="實體片段最大總長度默認6500")
    ap.add_argument("--analysis-window", type=int, default=500,
                    help="分析關鍵詞窗口半徑默認500")
    ap.add_argument("--analysis-max", type=int, default=6500,
                    help="分析片段最大總長度默認6500")
    args = ap.parse_args()

    # Resolve the model settings: --config wins; fields missing from the
    # profile fall back to the command-line values / defaults.
    profile: dict | None = None
    if args.config:
        profile = load_model_profile(args.config, args.models_file)
        model = profile.get("model") or args.model
        base_url = profile.get("BaseApiUrl") or args.base_url
        api_key = profile.get("ApiKey") or args.api_key
        print(f"使用配置 '{args.config}'model={model}, base_url={base_url}",
              file=sys.stderr)
    else:
        model = args.model
        base_url = args.base_url
        api_key = args.api_key

    # A .json input is read through its "content" field.
    input_path = Path(args.input)
    if input_path.suffix.lower() == '.json':
        # Deliberately no function-local ``import json`` here: it would make
        # ``json`` a local name for the whole function, and the json.dumps
        # calls below would raise UnboundLocalError for non-.json inputs.
        # The module-level import is used instead.
        data = json.loads(input_path.read_text(encoding="utf-8"))
        # A JSON root that is not an object has no "content" field either.
        text = data.get("content", "") if isinstance(data, dict) else ""
        if not text:
            print("錯誤JSON 文件中沒有 'content' 字段", file=sys.stderr)
            sys.exit(1)
    else:
        text = input_path.read_text(encoding="utf-8")

    start = time.perf_counter()
    result, client = run_pipeline(text, model, base_url, api_key,
                                  args.head_length, args.tail_length,
                                  args.entities_window, args.entities_max,
                                  args.analysis_window, args.analysis_max)
    elapsed = time.perf_counter() - start

    # Cost report: only with --cost, written next to the output file (or
    # next to the input file when --out is not given) as <stem>_cost.json.
    if args.cost:
        cost = compute_cost(client, elapsed, profile)
        if args.out:
            cost_path = Path(args.out).with_name(Path(args.out).stem + "_cost.json")
        else:
            cost_path = input_path.with_name(input_path.stem + "_cost.json")
        cost_path.parent.mkdir(parents=True, exist_ok=True)
        cost_path.write_text(json.dumps(cost, ensure_ascii=False, indent=2),
                             encoding="utf-8")
        print(f"💰 成本統計已寫入 {cost_path}:耗時 {cost['elapsed_seconds']}s"
              f"input={cost['input_tokens']} output={cost['output_tokens']} "
              f"total_cost={cost['total_cost']} {cost['price_unit'] or ''}",
              file=sys.stderr)

    # Raw JSON dump; done before to_yaml() so it reflects the pipeline output.
    if args.debug_dump:
        debug_path = Path(args.debug_dump)
        debug_path.parent.mkdir(parents=True, exist_ok=True)
        debug_path.write_text(
            json.dumps(result, ensure_ascii=False, indent=2),
            encoding="utf-8")

    yaml_str = to_yaml(result)
    if args.out:
        out_path = Path(args.out)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(yaml_str, encoding="utf-8")
        print(f"\n✅ 已寫入 {args.out}", file=sys.stderr)
    else:
        print(yaml_str)


if __name__ == "__main__":
    main()