777 lines
29 KiB
Python
777 lines
29 KiB
Python
"""
|
||
hk_case_extractor.py
|
||
==========================================================
|
||
香港判決書結構化字段抽取管線
|
||
基於 本地 Ollama 小模型 + 分階段抽取 + JSON Schema 強制 + 校驗重試
|
||
|
||
設計理念
|
||
--------
|
||
不直接把幾萬字餵給模型,而是:
|
||
1. 預處理:規則去噪 + 切段,純規則抽司法區域/案號等高確定性字段
|
||
2. 定位:每個字段只截取對應的高信號區段(通常 < 4k 字)
|
||
3. 分組抽取:拆成 5 次獨立 Ollama 調用,每次只負責 1-3 個字段
|
||
4. Schema 強制:用 Ollama 0.5+ 的 format=<JSON Schema> 約束輸出
|
||
5. 校驗+重試:對字數、黑名單、結構標註逐項校驗
|
||
6. judgment_summary 不從原文重生,而從前 4 步結果 + 一段分析段生成
|
||
|
||
依賴
|
||
----
|
||
pip install requests pyyaml
|
||
本地需運行:ollama serve
|
||
模型:ollama pull qwen2.5:7b-instruct (推薦,中文抽取甜點)
|
||
或 ollama pull glm4:9b
|
||
|
||
使用
|
||
----
|
||
python hk_case_extractor.py <input.txt>
|
||
python hk_case_extractor.py case.txt --model qwen2.5:7b-instruct --out result.yaml
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
import re
|
||
import sys
|
||
from dataclasses import dataclass
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
import requests
|
||
import yaml
|
||
|
||
|
||
# =============================================================================
# 配置
# =============================================================================
|
||
|
||
# Chat endpoint that OllamaClient POSTs every extraction request to.
# NOTE(review): the module docstring describes a local `ollama serve`, but this
# URL points at a remote host — confirm which deployment is intended.
OLLAMA_URL = "https://openai.iconsz.com/ollama3090/api/chat"
# Model tag used when the CLI is run without --model.
DEFAULT_MODEL = "qwen2.5:7b-instruct"
# Default `timeout` handed to requests.post by OllamaClient.
DEFAULT_TIMEOUT = 240
# Number of extra attempts chat_json_with_retry makes after the first one.
MAX_RETRIES = 2
|
||
|
||
|
||
# =============================================================================
# 1. 預處理:去噪 + 切段 + 規則抽元數據
# =============================================================================
|
||
|
||
# Court code (as it appears in a neutral citation) -> full Chinese court name
# written to the output. Two codes (HKMC, HKMagC) map to the same court name.
JURISDICTION_MAP: dict[str, str] = {
    "HKCFA": "香港特別行政區終審法院",
    "HKCA": "香港特別行政區高等法院上訴法庭",
    "HKCFI": "香港特別行政區高等法院原訟法庭",
    "HKDC": "香港特別行政區區域法院",
    "HKMC": "香港特別行政區裁判法院",
    "HKMagC": "香港特別行政區裁判法院",
    "HKSCT": "香港特別行政區小額錢債審裁處",
    "HKLT": "香港特別行政區土地審裁處",
    "HKLD": "香港特別行政區勞資審裁處",
    "HKCT": "香港特別行政區競爭事務審裁處",
    "HKCorC": "香港特別行政區死因裁判法庭",
}

# Matches a neutral citation such as "[2019] HKCFA 23" (case-insensitive):
# group 1 = four-digit year, group 2 = court code, group 3 = running number.
NEUTRAL_CITATION_RE = re.compile(
    r"\[(\d{4})\]\s*(HKCFA|HKCA|HKCFI|HKDC|HKMagC|HKMC|HKSCT|HKLT|HKLD|HKCT|HKCorC)\s*(\d+)",
    re.I,
)
# Matches an action number (case-insensitive): a case-type prefix, an optional
# "NO"/"NO.", a number, then "OF" or "/" followed by a four-digit year.
# The pattern has no word boundary before the prefix.
CASE_NO_RE = re.compile(
    r"(FACV|FACC|FAMV|FAMC|CACV|CACC|HCA|HCAL|HCMP|HCCW|HCB|DCCJ|DCMP|SCTC|LBTC|LDPD)"
    r"\s*(?:NO\.?)?\s*\d+\s*(?:OF|/|\sof\s)\s*\d{4}",
    re.I,
)
|
||
|
||
|
||
def clean_text(raw: str) -> str:
    """Remove pagination noise and normalise whitespace in a judgment text.

    Steps, in order:
      1. drop "Page N of M" markers (case-insensitive);
      2. blank out lines that consist only of a page number like "- 12 -";
      3. turn runs of full-width (ideographic, U+3000) spaces into one
         ASCII space;
      4. collapse runs of ASCII spaces/tabs into one space;
      5. collapse three or more consecutive newlines into a single blank line.

    Args:
        raw: The unprocessed judgment text.

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    t = re.sub(r"Page\s+\d+\s+of\s+\d+", "", raw, flags=re.I)
    t = re.sub(r"^\s*-\s*\d+\s*-\s*$", "", t, flags=re.M)
    # Full-width spaces are written as an escape so that editors or text
    # extraction cannot silently turn the pattern into an ASCII space.
    t = re.sub(r"\u3000+", " ", t)
    t = re.sub(r"[ \t]+", " ", t)
    t = re.sub(r"\n{3,}", "\n\n", t)
    return t.strip()
|
||
|
||
|
||
def extract_metadata_by_rule(text: str) -> dict[str, Any]:
    """Extract court, case number and default location using rules only.

    The court is taken from the first neutral citation in *text*
    (NEUTRAL_CITATION_RE). When no citation is found, the court whose Chinese
    name (with or without the "香港特別行政區" prefix) occurs EARLIEST in the
    text is used, so that a court that is merely cited later in the judgment
    does not win over the court named near the top.

    Args:
        text: Cleaned judgment text.

    Returns:
        A dict with keys ``jurisdiction_code``, ``jurisdiction_name``,
        ``case_location`` (always ``["香港特別行政區"]``) and ``case_number``.
        Fields that cannot be determined are ``None``.
    """
    meta: dict[str, Any] = {
        "jurisdiction_code": None,
        "jurisdiction_name": None,
        "case_location": ["香港特別行政區"],
        "case_number": None,
    }

    if m := NEUTRAL_CITATION_RE.search(text):
        cited = m.group(2).upper()
        # The regex is case-insensitive; map the match back to the canonical
        # spelling used as key in JURISDICTION_MAP (e.g. "HKMAGC" -> "HKMagC").
        for code in JURISDICTION_MAP:
            if code.upper() == cited:
                meta["jurisdiction_code"] = code
                meta["jurisdiction_name"] = JURISDICTION_MAP[code]
                break

    # Fallback: look the court up by its full/short Chinese name.
    if not meta["jurisdiction_code"]:
        best_pos = -1
        for code, full in JURISDICTION_MAP.items():
            short = full.replace("香港特別行政區", "")
            positions = [p for p in (text.find(full), text.find(short))
                         if p != -1]
            if not positions:
                continue
            pos = min(positions)
            # Strict "<" keeps the first code in map order on ties (two codes
            # share the same court name).
            if best_pos == -1 or pos < best_pos:
                best_pos = pos
                meta["jurisdiction_code"] = code
                meta["jurisdiction_name"] = full

    if m := CASE_NO_RE.search(text):
        # Normalise internal whitespace of the matched case number.
        meta["case_number"] = re.sub(r"\s+", " ", m.group(0).strip())

    return meta
|
||
|
||
|
||
# -----------------------------------------------------------------------------
# 關鍵詞 + 窗口召回(取代脆弱的正則切段)
# -----------------------------------------------------------------------------
# 思路:每個抽取目標定義一組「高信號關鍵詞」,掃全文取所有命中位置周圍
# ±half_window 字符的窗口,合併重疊後拼接喂給 LLM。
# 不依賴判決書的固定章節標題,對結構各異的香港判決書都能工作。
|
||
|
||
# Extraction target -> list of high-signal keywords. gather_all() feeds each
# list to gather_chunks() to recall the text windows around every hit.
KEYWORD_GROUPS: dict[str, list[str]] = {
    # Call 1: parties. Mostly in the title block, but they may also appear in
    # running text such as "本案中,原告...".
    "parties": [
        "BETWEEN", "介乎", " AND ",
        "申索人", "原告", "原訴人", "上訴人", "覆核申請人", "覆核人", "申請人",
        "被告", "答辯人", "被上訴人", "答辯方",
        "Plaintiff", "Defendant", "Appellant", "Respondent", "Applicant",
    ],
    # Call 2: cause of action and subject matter.
    "reason_object": [
        # Section-heading words
        "案情", "背景", "引言", "事實", "案件背景",
        "INTRODUCTION", "BACKGROUND", "FACTS", "THE FACTS",
        # Claim / allegation words
        "申索", "索償", "訴因", "聲稱", "請求", "本案涉及", "本訴訟",
        "原告聲稱", "申索人聲稱", "申索人指稱",
        # Subject-matter keywords
        "賠償", "損失", "損害", "精神困擾", "經濟損失", "醫療費",
        "履行", "所有權", "占有", "撤銷", "宣告",
    ],
    # Call 3: judgment result.
    "judgment_result": [
        # Order phrases
        "本席命令", "本庭命令", "本席裁定", "本庭裁定", "命令如下",
        "DISPOSITION", "ORDER", "CONCLUSION", "I therefore order",
        # Outcome words
        "判決", "判給", "獲判", "判處", "支付",
        "勝訴", "敗訴", "部分勝訴", "駁回", "撤銷", "維持", "發還",
        "ALLOWED", "DISMISSED", "GRANTED", "REFUSED",
        # Closing connectives
        "因此", "故此", "綜上",
    ],
    # Call 4: involved entities (judges, lawyers, judges in cited cases).
    "entities": [
        # Judicial titles
        "法官", "大法官", "審裁官", "裁判官", "首席法官", "常任法官", "非常任法官",
        "Hon.", " J.", " JA ", " CJ ", " PJ ", " NPJ ",
        # Legal representation
        "代表", "大律師", "律師", "資深大律師", "代表律師",
        "Counsel", "Solicitor",
        # Case citations (judge names tend to appear nearby)
        " v ", " v. ", "[19", "[20",
    ],
    # Call 5: court analysis (the core input for the summary).
    # Note: generic one/two-character words such as "分析", "理由", "引用" are
    # avoided on purpose — they also occur heavily in procedural paragraphs,
    # tables of contents and citation indexes and would spread the recall over
    # the whole text. Phrase anchors of the form "本席/本庭 + verb", which are
    # specific to analysis passages, are used instead.
    "analysis": [
        "本席認為", "本席接納", "本席不接納", "本席同意", "本席不同意",
        "本席裁定", "本席拒絕", "本席認同",
        "本庭認為", "本庭接納", "本庭裁定",
        "I find", "I accept", "I do not accept", "I conclude",
        "The court finds", "In my view", "In my judgment",
        "舉證責任", "審慎責任", "鄰人原則", "替代責任",
        "違反", "侵權", "過失",
    ],
}
|
||
|
||
|
||
def gather_chunks(text: str,
                  keywords: list[str],
                  half_window: int = 500,
                  max_total: int = 6500,
                  case_sensitive: bool = False) -> tuple[str, int]:
    """Recall the text windows surrounding every keyword occurrence.

    For each occurrence of each keyword a window of ``half_window`` characters
    on both sides is taken. Overlapping windows are merged, and the merged
    segments are concatenated in document order, separated by
    ``"\\n\\n[…]\\n\\n"``. Segment text is limited to ``max_total`` characters
    (separators are not counted).

    Args:
        text: Document to search.
        keywords: Literal strings to look for; empty strings are ignored
            because they would match at every position.
        half_window: Characters of context kept on each side of a hit.
        max_total: Upper bound on the total length of the recalled segments.
        case_sensitive: Match case-sensitively when True.

    Returns:
        ``(joined_text, hit_count)`` where ``hit_count`` is the number of
        keyword occurrences found. Without any hit the first ``max_total``
        characters of *text* are returned with a count of 0. An empty *text*
        yields ``("", 0)``.
    """
    if not text:
        return "", 0

    flags = 0 if case_sensitive else re.IGNORECASE
    hits: list[tuple[int, int]] = []
    for kw in keywords:
        if not kw:
            # re.escape("") matches between all characters — skip it.
            continue
        for m in re.finditer(re.escape(kw), text, flags=flags):
            s = max(0, m.start() - half_window)
            e = min(len(text), m.end() + half_window)
            hits.append((s, e))

    if not hits:
        return text[:max_total], 0

    # Merge overlapping/touching windows.
    hits.sort()
    merged: list[list[int]] = []
    for s, e in hits:
        if merged and s <= merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], e)
        else:
            merged.append([s, e])

    # Concatenate in document order while respecting the length budget.
    pieces: list[str] = []
    total = 0
    for s, e in merged:
        seg_len = e - s
        if total + seg_len > max_total:
            remain = max_total - total
            # A trailing fragment of <= 200 chars is not worth keeping, unless
            # nothing has been collected yet: then a truncated first segment
            # is still better than returning no context at all.
            if remain > 200 or (not pieces and remain > 0):
                pieces.append(text[s:s + remain])
            break
        pieces.append(text[s:e])
        total += seg_len

    return "\n\n[…]\n\n".join(pieces), len(hits)
|
||
|
||
|
||
def gather_all(text: str) -> dict[str, str]:
    """Recall the context snippet for every group in KEYWORD_GROUPS.

    Args:
        text: Cleaned judgment text.

    Returns:
        A dict holding, for each group ``g``, the recalled text under key
        ``g`` and the hit count (as a string, for logging) under key
        ``"_<g>_hits"``.
    """
    # Per-group (half_window, max_total) tuning.
    params: dict[str, tuple[int, int]] = {
        "parties": (400, 3000),
        "reason_object": (500, 6000),
        "judgment_result": (500, 6500),
        "entities": (400, 6500),
        "analysis": (500, 6500),
    }
    # Same values as gather_chunks' own defaults; used for any keyword group
    # that has no tuning entry above, instead of failing with KeyError.
    default_params = (500, 6500)

    out: dict[str, str] = {}
    for group, kws in KEYWORD_GROUPS.items():
        hw, mt = params.get(group, default_params)
        ctx, hits = gather_chunks(text, kws, half_window=hw, max_total=mt)
        out[group] = ctx
        # Keep the hit count alongside the text so the caller can log it.
        out[f"_{group}_hits"] = str(hits)
    return out
|
||
|
||
|
||
# =============================================================================
# 2. Ollama 客戶端:JSON Schema 強制 + 重試
# =============================================================================
|
||
|
||
@dataclass
class OllamaClient:
    """Minimal client for an Ollama chat endpoint with schema-constrained JSON.

    Attributes are the model tag, the endpoint URL and the request timeout
    passed to ``requests.post``.
    """

    model: str = DEFAULT_MODEL
    url: str = OLLAMA_URL
    timeout: int = DEFAULT_TIMEOUT

    def chat_json(self, system: str, user: str, schema: dict,
                  temperature: float = 0.0,
                  num_ctx: int = 8192) -> dict:
        """Send one non-streaming chat request and parse the reply as JSON.

        The JSON Schema is passed in the request's ``format`` field.

        Args:
            system: System prompt.
            user: User prompt.
            schema: JSON Schema the reply should conform to.
            temperature: Sampling temperature sent in ``options``.
            num_ctx: Context size sent in ``options``.

        Returns:
            The decoded JSON content of the reply message.

        Raises:
            requests.HTTPError: On a non-success HTTP status.
            json.JSONDecodeError: If the reply is not JSON even after a
                surrounding Markdown code fence has been stripped.
        """
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "format": schema,
            "stream": False,
            "options": {"temperature": temperature, "num_ctx": num_ctx},
        }
        r = requests.post(self.url, json=payload, timeout=self.timeout)
        r.raise_for_status()
        content = r.json()["message"]["content"]
        try:
            return json.loads(content)
        except json.JSONDecodeError:
            # Second chance: strip a possible ```json ... ``` fence.
            stripped = re.sub(r"^```(?:json)?\s*|\s*```$", "",
                              content.strip(), flags=re.S)
            return json.loads(stripped)

    def chat_json_with_retry(self, system: str, user: str, schema: dict,
                             validator=None, **kw) -> dict:
        """Call chat_json, retrying on errors and on failed validation.

        ``validator(result)`` must return ``(ok, hint)``. When ``ok`` is
        false the hint is appended to the user prompt and the call is
        repeated, up to ``MAX_RETRIES`` extra times (at least one attempt is
        always made).

        Args:
            system: System prompt.
            user: User prompt.
            schema: JSON Schema for the reply.
            validator: Optional callable as described above.
            **kw: Forwarded to chat_json.

        Returns:
            The first output that passes validation (or the first output at
            all when no validator is given). If no output passes and the
            final attempt produced a response, that last response is
            returned as a best effort.

        Raises:
            Exception: The error of the final attempt, if that attempt
                failed. An error from an earlier attempt is not re-raised
                once a later attempt has produced a response.
        """
        last_err = None
        out = None
        for _ in range(max(1, MAX_RETRIES + 1)):
            try:
                out = self.chat_json(system, user, schema, **kw)
                if validator is None:
                    return out
                ok, hint = validator(out)
                if ok:
                    return out
                # This attempt produced a response, so any error recorded by
                # a previous attempt is stale.
                last_err = None
                # Feed the validation problem back to the model.
                user = (f"{user}\n\n上次輸出存在問題:{hint}\n"
                        f"請修正後重新輸出。")
            except Exception as e:
                last_err = e
        if last_err is not None:
            raise last_err
        return out  # type: ignore
|
||
|
||
|
||
# =============================================================================
# 3. 五次抽取調用:每次只負責一組字段
# =============================================================================
|
||
|
||
# --- Call 1: 當事人 ----------------------------------------------------------
|
||
|
||
# JSON Schema for the parties call: two required arrays of strings.
PARTIES_SCHEMA = {
    "type": "object",
    "properties": {
        "plaintiff": {"type": "array", "items": {"type": "string"}},
        "defendant": {"type": "array", "items": {"type": "string"}},
    },
    "required": ["plaintiff", "defendant"],
}

# System prompt for the parties call (maps party roles onto the two arrays).
PARTIES_SYSTEM = """你是香港法律文書信息抽取助手。
從給定的判決書抬頭中抽取所有當事人完整姓名/機構名。
- 原告/申索人/上訴人/覆核申請人 → plaintiff
- 被告/答辯人/被上訴人 → defendant
- 保留中英文對照(如有)
- 某類無則輸出空數組
只輸出符合 schema 的 JSON,不要解釋。"""

# One worked input/output example, prepended to the user prompt by
# extract_parties.
PARTIES_FEWSHOT = """範例輸入:
BETWEEN
陳大文 (CHAN TAI MAN) 上訴人
AND
香港房屋委員會 (Hong Kong Housing Authority) 答辯人

範例輸出:
{"plaintiff":["陳大文 (CHAN TAI MAN)"],"defendant":["香港房屋委員會 (Hong Kong Housing Authority)"]}"""
|
||
|
||
|
||
def extract_parties(client: OllamaClient, context: str) -> dict:
    """Ask the model for plaintiff/defendant names found in *context*.

    Only the first 3000 characters of *context* are sent. The call goes
    through ``client.chat_json_with_retry`` without a validator.
    """
    snippet = context[:3000]
    prompt = (
        f"{PARTIES_FEWSHOT}\n\n"
        "請從以下片段(多處關鍵詞召回拼接)抽取:\n"
        f"```\n{snippet}\n```"
    )
    return client.chat_json_with_retry(PARTIES_SYSTEM, prompt, PARTIES_SCHEMA)
|
||
|
||
|
||
# --- Call 2: 事由 + 標的 ----------------------------------------------------
|
||
|
||
# JSON Schema for the cause-of-action call: a string of at most 100
# characters and an array of strings, both required.
REASON_OBJECT_SCHEMA = {
    "type": "object",
    "properties": {
        "case_reason": {"type": "string", "maxLength": 100},
        "case_object": {"type": "array", "items": {"type": "string"}},
    },
    "required": ["case_reason", "case_object"],
}

# System prompt describing how case_reason and case_object must be written.
REASON_OBJECT_SYSTEM = """從香港判決書中抽取:

1. case_reason(事由):
- 嚴格 ≤100 字,單句
- 結構:[原告身份] + [針對什麼事件/行為] + [向誰] + [提出什麼請求]
- 覆核/上訴案件須註明對哪個裁決提出覆核(含日期/案號)
- 嚴禁包含:判決結果、法庭分析、案發細節

2. case_object(標的物):
- 訴訟請求指向的實體權利或利益
- 例:人身傷害賠償、合同履行、房產所有權、精神困擾賠償
- 合併本質相同的標的
- 嚴禁:證據材料、程序性訴求(如"要求法庭裁決")

只輸出 JSON。"""

# One example output, prepended to the user prompt by extract_reason_object.
REASON_OBJECT_FEWSHOT = """範例輸出:
{"case_reason":"申索人為商場保安員,就被告於2023年7月在商場毆打申索人造成的人身傷害向被告提出損害賠償申索。","case_object":["人身傷害賠償","醫療費用","精神困擾賠償"]}"""
|
||
|
||
|
||
def _reason_object_validator(out: dict) -> tuple[bool, str]:
|
||
r = out.get("case_reason", "")
|
||
if len(r) > 100:
|
||
return False, f"case_reason 共 {len(r)} 字,超過 100 字上限,請壓縮到 80 字以內。"
|
||
if not out.get("case_object"):
|
||
return False, "case_object 不能為空。"
|
||
return True, ""
|
||
|
||
|
||
def extract_reason_object(client: OllamaClient, context: str) -> dict:
    """Extract ``case_reason`` and ``case_object`` from *context*.

    At most 6000 characters of *context* are sent. The reply is validated by
    ``_reason_object_validator`` (with retries), and ``case_reason`` is
    finally cut to 100 characters in case the model never complied.
    """
    snippet = context[:6000]
    prompt = (
        f"{REASON_OBJECT_FEWSHOT}\n\n"
        "請從以下片段(多處關鍵詞召回拼接)抽取:\n"
        f"```\n{snippet}\n```"
    )
    out = client.chat_json_with_retry(
        REASON_OBJECT_SYSTEM,
        prompt,
        REASON_OBJECT_SCHEMA,
        validator=_reason_object_validator,
    )
    # Slicing is a no-op for values that are already within the limit.
    out["case_reason"] = out["case_reason"][:100]
    return out
|
||
|
||
|
||
# --- Call 3: 判決結果 -------------------------------------------------------
|
||
|
||
# JSON Schema for the judgment-result call: a required array of objects, each
# with required string fields "charge" and "result".
JUDGMENT_RESULT_SCHEMA = {
    "type": "object",
    "properties": {
        "judgment_result": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "charge": {"type": "string"},
                    "result": {"type": "string"},
                },
                "required": ["charge", "result"],
            },
        }
    },
    "required": ["judgment_result"],
}

# System prompt: how to split results into items and how to tag each charge
# as a liability or quantum item.
JUDGMENT_RESULT_SYSTEM = """從香港判決書命令/裁定部分抽取所有判決結果。

拆分原則:
- 多項請求 → 分條
- "責任判定" 與 "損失/金額計算" 兩個層面 → 必須分條
- 每條 charge 必須以 "(責任問題)" 或 "(損失範圍)" 結尾標註層次
- result 必須包含:
a) 明確結果(勝訴/敗訴/部分勝訴/維持/撤銷/駁回等)
b) 2-3 個關鍵法庭理由
c) 具體金額、利率或命令內容(如有)

只輸出 JSON。"""

# One example output, prepended to the user prompt by extract_judgment_result.
JUDGMENT_RESULT_FEWSHOT = """範例輸出:
{"judgment_result":[
{"charge":"申索人就毆打事件的人身傷害索償 (責任問題)","result":"勝訴。法庭接納申索人證供可信,閉路電視顯示被告先動手,被告亦承認部分情節。"},
{"charge":"醫療費及精神困擾賠償金額 (損失範圍)","result":"部分勝訴。判給醫療費HK$8,500及一般損害賠償HK$20,000,合共HK$28,500,連同利息及訟費。"}
]}"""
|
||
|
||
|
||
def _judgment_validator(out: dict) -> tuple[bool, str]:
|
||
items = out.get("judgment_result", [])
|
||
if not items:
|
||
return False, "judgment_result 不能為空。"
|
||
bad = [i for i in items
|
||
if "責任問題" not in i.get("charge", "")
|
||
and "損失範圍" not in i.get("charge", "")]
|
||
if bad:
|
||
return False, (f"有 {len(bad)} 條 charge 未標註層次。"
|
||
f"每條 charge 必須以 '(責任問題)' 或 '(損失範圍)' 結尾。")
|
||
return True, ""
|
||
|
||
|
||
def extract_judgment_result(client: OllamaClient, context: str) -> dict:
    """Extract the itemised judgment results from *context*.

    At most 6500 characters of *context* are sent; the reply is checked by
    ``_judgment_validator`` with retries.
    """
    snippet = context[:6500]
    prompt = (
        f"{JUDGMENT_RESULT_FEWSHOT}\n\n"
        "請從以下片段(多處關鍵詞召回拼接)抽取:\n"
        f"```\n{snippet}\n```"
    )
    return client.chat_json_with_retry(
        JUDGMENT_RESULT_SYSTEM,
        prompt,
        JUDGMENT_RESULT_SCHEMA,
        validator=_judgment_validator,
    )
|
||
|
||
|
||
# --- Call 4: 涉及實體 -------------------------------------------------------
|
||
|
||
# JSON Schema for the entities call: a required array of objects, each with
# required string fields "entity_name" and "reason".
ENTITIES_SCHEMA = {
    "type": "object",
    "properties": {
        "involved_entities": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "entity_name": {"type": "string"},
                    "reason": {"type": "string"},
                },
                "required": ["entity_name", "reason"],
            },
        }
    },
    "required": ["involved_entities"],
}

# System prompt: which entities must be listed and which kinds of names
# (statutes, bare case names, publications) must not.
ENTITIES_SYSTEM = """從香港判決書中抽取所有相關實體(自然人/法人/組織/機構)。

必須包含:
- 主審法官 / 審裁官
- 雙方代表律師、大律師
- 判決中引用的先例所提及的法官
reason 須寫明:在XX案[案號]中擔任XX職位,闡述XX法律原則
- 涉案的政府部門、公司、機構

嚴禁包含:
- 法案/條例名(如《侵權條例》、Cap.xxx)
- 純案例名稱(如 Donoghue v Stevenson)
- 文獻、期刊名

只輸出 JSON。"""

# One example output, prepended to the user prompt by extract_entities.
ENTITIES_FEWSHOT = """範例輸出:
{"involved_entities":[
{"entity_name":"林希維審裁官","reason":"本案主審審裁官,負責認定事實及裁決。"},
{"entity_name":"終審法院常任法官李義","reason":"在Tang Kwok Wah v HKSAR [2019] HKCFA 23 中擔任主筆法官,闡述舉證責任原則,本案第34段引用其判詞。"},
{"entity_name":"康樂文化事務署","reason":"涉案場所通州街公園的管理機構。"}
]}"""
|
||
|
||
|
||
def _entities_validator(out: dict) -> tuple[bool, str]:
|
||
ents = out.get("involved_entities", [])
|
||
if not ents:
|
||
return False, "involved_entities 不能為空,至少要有主審法官。"
|
||
blacklist = ["條例", "Cap.", "法案"]
|
||
bad = [e["entity_name"] for e in ents
|
||
if any(k in e.get("entity_name", "") for k in blacklist)]
|
||
if bad:
|
||
return False, f"以下實體疑為條例/法案,應移除:{bad}"
|
||
return True, ""
|
||
|
||
|
||
def extract_entities(client: OllamaClient, context: str) -> dict:
    """Extract the involved entities from *context*.

    At most 6500 characters of *context* are sent; the reply is checked by
    ``_entities_validator`` with retries.
    """
    snippet = context[:6500]
    prompt = (
        f"{ENTITIES_FEWSHOT}\n\n"
        "請從以下片段(多處關鍵詞召回拼接)抽取所有涉及實體:\n"
        f"```\n{snippet}\n```"
    )
    return client.chat_json_with_retry(
        ENTITIES_SYSTEM,
        prompt,
        ENTITIES_SCHEMA,
        validator=_entities_validator,
    )
|
||
|
||
|
||
# --- Call 5: 判決總結(基於已抽取結果 + 分析段,不從原文重生) -----------
|
||
|
||
# JSON Schema for the summary call: one required string of at most 300
# characters.
SUMMARY_SCHEMA = {
    "type": "object",
    "properties": {
        "judgment_summary": {"type": "string", "maxLength": 300},
    },
    "required": ["judgment_summary"],
}

# System prompt: the four elements the summary has to cover and the
# 300-character limit.
SUMMARY_SYSTEM = """根據已抽取的結構化字段 + 法庭分析段,撰寫判決總結。

四要素結構(必須全部涵蓋,連貫成單段):
(1) 案件背景:1-2 句交代起因與當事人關係
(2) 核心爭議焦點
(3) 法庭法律分析與推理(核心重點):
- 如何評估證據?
- 接受 / 拒絕主張的邏輯?
- 引用了哪些關鍵法律或判例?
(4) 最終裁決結果及命令

嚴格 ≤300 字。只輸出 JSON。"""
|
||
|
||
|
||
def _summary_validator(out: dict) -> tuple[bool, str]:
|
||
s = out.get("judgment_summary", "")
|
||
if len(s) > 300:
|
||
return False, f"summary 共 {len(s)} 字,超過 300 字上限,請壓縮。"
|
||
if len(s) < 80:
|
||
return False, "summary 過短,請完整覆蓋四要素。"
|
||
return True, ""
|
||
|
||
|
||
def extract_summary(client: OllamaClient,
                    prior: dict, analysis: str) -> dict:
    """Write the judgment summary from earlier results plus analysis text.

    The prompt contains *prior* serialised as indented JSON and the first
    3500 characters of *analysis*. The reply is checked by
    ``_summary_validator`` with retries, and the summary is finally cut to
    300 characters.
    """
    prior_json = json.dumps(prior, ensure_ascii=False, indent=2)
    excerpt = analysis[:3500]
    prompt = (
        "已抽取的字段:\n"
        "```json\n"
        f"{prior_json}\n"
        "```\n"
        "\n"
        "法庭分析節選:\n"
        "```\n"
        f"{excerpt}\n"
        "```\n"
        "\n"
        "請按四要素撰寫 ≤300 字的 judgment_summary。"
    )
    out = client.chat_json_with_retry(
        SUMMARY_SYSTEM,
        prompt,
        SUMMARY_SCHEMA,
        validator=_summary_validator,
    )
    # Slicing is a no-op for values that are already within the limit.
    out["judgment_summary"] = out["judgment_summary"][:300]
    return out
|
||
|
||
|
||
# =============================================================================
# 4. 全局校驗與後處理
# =============================================================================
|
||
|
||
# Substrings that mark a case_location entry as a court, venue or building
# rather than a region; such entries are dropped by validate_and_fix.
LOCATION_BLACKLIST = [
    "法院", "法庭", "審裁處", "公園", "大廈", "大樓", "商場",
    "街", "道路", "村", "中心", "醫院", "酒店", "車站",
]
# Substrings that mark an entity name as a statute or publication.
ENTITY_NAME_BLACKLIST = ["條例", "Cap.", "法案", "案例彙編", "Reports",
                         "期刊", "Journal"]


def validate_and_fix(result: dict) -> tuple[dict, list[str]]:
    """Apply final rule-based clean-up to the assembled result.

    The dict is modified in place and also returned. Performed steps:
      * ``case_location``: drop empty or blacklisted entries and make sure
        "香港特別行政區" is present;
      * ``case_reason`` / ``judgment_summary``: hard-truncate to 100 / 300
        characters;
      * ``involved_entities``: drop entries whose name contains a
        blacklisted substring;
      * ``judgment_result``: warn about charges without a level tag;
      * warn about empty list fields.

    Fields that are missing or ``None`` are treated as empty.

    Args:
        result: The assembled extraction result.

    Returns:
        ``(result, warnings)`` where *warnings* are human-readable messages
        describing every fix or problem.
    """
    warnings: list[str] = []

    # case_location: remove courts / venues / buildings.
    locs = result.get("case_location") or []
    cleaned = [loc for loc in locs
               if loc and not any(b in loc for b in LOCATION_BLACKLIST)]
    removed = [loc for loc in locs if loc not in cleaned]
    if removed:
        warnings.append(f"case_location 已清理:移除 {set(removed)}")
    if "香港特別行政區" not in cleaned:
        cleaned.insert(0, "香港特別行政區")
        # Reported separately so that the "removed" warning above is only
        # emitted when something was actually removed.
        warnings.append("case_location 缺少「香港特別行政區」,已補上")
    result["case_location"] = cleaned

    # Hard length limits.
    reason = result.get("case_reason") or ""
    if len(reason) > 100:
        warnings.append("case_reason > 100 字,已截斷")
        result["case_reason"] = reason[:100]
    summary = result.get("judgment_summary") or ""
    if len(summary) > 300:
        warnings.append("judgment_summary > 300 字,已截斷")
        result["judgment_summary"] = summary[:300]

    # involved_entities: remove statutes / publications.
    ents = result.get("involved_entities") or []
    cleaned_ents = [e for e in ents
                    if not any(k in (e.get("entity_name") or "")
                               for k in ENTITY_NAME_BLACKLIST)]
    if len(cleaned_ents) != len(ents):
        warnings.append(
            f"involved_entities 移除 {len(ents) - len(cleaned_ents)} 條疑似條例/文獻")
    result["involved_entities"] = cleaned_ents

    # judgment_result: report charges that lack a level tag.
    for jr in result.get("judgment_result") or []:
        charge = jr.get("charge") or ""
        if "責任問題" not in charge and "損失範圍" not in charge:
            warnings.append(
                f"judgment_result 條目缺層次標註:{charge[:40]}")

    # Empty-field warnings.
    for k in ("plaintiff", "defendant", "case_object",
              "judgment_result", "involved_entities"):
        if not result.get(k):
            warnings.append(f"{k} 為空,請人工複核")

    return result, warnings
|
||
|
||
|
||
# =============================================================================
# 5. 主管線
# =============================================================================
|
||
|
||
def run_pipeline(text: str, model: str) -> dict:
    """Run the whole extraction pipeline on one judgment text.

    Stages: clean and recall context, then five model calls (parties,
    reason/object, judgment result, entities, summary), then the rule-based
    final validation. Progress and warnings are printed to stderr.

    Args:
        text: Raw judgment text.
        model: Ollama model tag to use.

    Returns:
        The validated result dict.
    """
    def log(message):
        print(message, file=sys.stderr)

    log("[1/6] 預處理 + 關鍵詞召回...")
    text = clean_text(text)
    meta = extract_metadata_by_rule(text)
    ctx = gather_all(text)

    log(f" 規則元數據:{meta}")
    log(" 召回片段:")
    group_names = ["parties", "reason_object", "judgment_result",
                   "entities", "analysis"]
    for group in group_names:
        size = len(ctx[group])
        hit_count = ctx[f"_{group}_hits"]
        log(f" {group:16s} len={size:5d} hits={hit_count}")

    client = OllamaClient(model=model)

    log("[2/6] 抽取當事人...")
    parties = extract_parties(client, ctx["parties"])

    log("[3/6] 抽取事由與標的...")
    reason_obj = extract_reason_object(client, ctx["reason_object"])

    log("[4/6] 抽取判決結果...")
    judgment = extract_judgment_result(client, ctx["judgment_result"])

    log("[5/6] 抽取涉及實體...")
    # Entity context = parties snippet (which carries counsel names) followed
    # by the entity/citation snippet, capped at 6500 characters overall.
    combined = ctx["parties"][:2500] + "\n\n[…]\n\n" + ctx["entities"]
    entities = extract_entities(client, combined[:6500])

    # Everything extracted so far is handed to the summary call.
    interim_for_summary = {}
    for part in (parties, reason_obj, judgment, entities):
        interim_for_summary.update(part)
    interim_for_summary["jurisdiction_name"] = meta["jurisdiction_name"]

    log("[6/6] 撰寫判決總結...")
    summary = extract_summary(client, interim_for_summary, ctx["analysis"])

    final = dict(
        plaintiff=parties["plaintiff"],
        defendant=parties["defendant"],
        jurisdiction_code=meta["jurisdiction_code"],
        jurisdiction_name=meta["jurisdiction_name"],
        case_location=meta["case_location"],
        case_reason=reason_obj["case_reason"],
        case_object=reason_obj["case_object"],
        judgment_result=judgment["judgment_result"],
        judgment_summary=summary["judgment_summary"],
        involved_entities=entities["involved_entities"],
    )

    final, warnings = validate_and_fix(final)
    for warning in warnings:
        log(f" ⚠️ {warning}")

    return final
|
||
|
||
|
||
# =============================================================================
# 6. YAML 輸出(長字串用 > 折疊;含特殊字符的自動雙引號)
# =============================================================================
|
||
|
||
class FoldedStr(str):
    """Marker subclass of str: values of this type are dumped in YAML folded (">") style."""
|
||
|
||
|
||
def _folded_str_representer(dumper, data):
    """Represent *data* as a YAML string scalar using folded (">") style."""
    str_tag = "tag:yaml.org,2002:str"
    return dumper.represent_scalar(str_tag, data, style=">")
|
||
|
||
|
||
def _safe_str_representer(dumper, data):
    """Represent a str, forcing double quotes for risky content.

    Double quotes are used when the non-empty string contains ":" or starts
    with "#" or "- "; all other strings use the dumper's default style.
    """
    str_tag = "tag:yaml.org,2002:str"
    needs_quotes = bool(data) and (
        ":" in data or data.startswith(("#", "- "))
    )
    if not needs_quotes:
        return dumper.represent_scalar(str_tag, data)
    return dumper.represent_scalar(str_tag, data, style='"')
|
||
|
||
|
||
# Register the custom representers at import time: FoldedStr values use the
# folded style, plain str values go through the quoting rule above.
# NOTE(review): yaml.add_representer is called without a Dumper argument, so
# this presumably affects every yaml.dump call in the process — confirm.
yaml.add_representer(FoldedStr, _folded_str_representer)
yaml.add_representer(str, _safe_str_representer)
|
||
|
||
|
||
def to_yaml(result: dict) -> str:
    """Serialise the result dict to YAML text.

    Non-empty ``case_reason`` and ``judgment_summary`` values are emitted in
    folded style. The conversion works on a shallow copy, so the caller's
    dict is left untouched. Key order is preserved.

    Args:
        result: The final extraction result.

    Returns:
        The YAML document as a string.
    """
    doc = dict(result)
    for key in ("case_reason", "judgment_summary"):
        if doc.get(key):
            doc[key] = FoldedStr(doc[key])
    return yaml.dump(doc, allow_unicode=True, sort_keys=False,
                     default_flow_style=False, width=100)
|
||
|
||
|
||
# =============================================================================
# CLI
# =============================================================================
|
||
|
||
def main() -> None:
    """Command-line entry point.

    Reads the judgment text file (UTF-8), runs the pipeline, optionally
    writes the raw result as JSON (--debug-dump), and writes the YAML either
    to --out or to stdout.
    """
    parser = argparse.ArgumentParser(
        description="香港判決書結構化抽取(本地 Ollama 版)")
    parser.add_argument("input", help="判決書文本路徑(.txt)")
    parser.add_argument("--model", default=DEFAULT_MODEL, help="Ollama 模型名")
    parser.add_argument("--out", default=None, help="輸出 YAML 路徑(默認 stdout)")
    parser.add_argument("--debug-dump", default=None,
                        help="額外輸出原始 JSON 結果到該路徑(便於 diff)")
    args = parser.parse_args()

    source_text = Path(args.input).read_text(encoding="utf-8")
    result = run_pipeline(source_text, args.model)

    if args.debug_dump:
        raw_json = json.dumps(result, ensure_ascii=False, indent=2)
        Path(args.debug_dump).write_text(raw_json, encoding="utf-8")

    yaml_str = to_yaml(result)
    if not args.out:
        print(yaml_str)
        return
    Path(args.out).write_text(yaml_str, encoding="utf-8")
    print(f"\n✅ 已寫入 {args.out}", file=sys.stderr)
|
||
|
||
|
||
# Run the CLI only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|