hklii_samples/hk_case_extractor.py

777 lines
29 KiB
Python
Raw Blame History

This file contains invisible Unicode characters!

This file contains invisible Unicode characters that may be processed differently from what appears below. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to reveal hidden characters.

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

"""
hk_case_extractor.py
==========================================================
香港判決書結構化字段抽取管線
基於 本地 Ollama 小模型 + 分階段抽取 + JSON Schema 強制 + 校驗重試
設計理念
--------
不直接把幾萬字餵給模型,而是:
1. 預處理:規則去噪 + 切段,純規則抽司法區域/案號等高確定性字段
2. 定位:每個字段只截取對應的高信號區段(通常 < 4k 字)
3. 分組抽取:拆成 5 次獨立 Ollama 調用,每次只負責 1-3 個字段
4. Schema 強制:用 Ollama 0.5+ 的 format=<JSON Schema> 約束輸出
5. 校驗+重試:對字數、黑名單、結構標註逐項校驗
6. judgment_summary 不從原文重生,而從前 4 步結果 + 一段分析段生成
依賴
----
pip install requests pyyaml
本地需運行:ollama serve
模型:ollama pull qwen2.5:7b-instruct (推薦,中文抽取甜點)
或 ollama pull glm4:9b
使用
----
python hk_case_extractor.py <input.txt>
python hk_case_extractor.py case.txt --model qwen2.5:7b-instruct --out result.yaml
"""
from __future__ import annotations
import argparse
import json
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any
import requests
import yaml
# =============================================================================
# 配置
# =============================================================================
# Chat endpoint that OllamaClient POSTs every request to.
# NOTE(review): the module docstring describes a local `ollama serve`, but this
# default points at a remote HTTPS host — confirm this is the intended default.
OLLAMA_URL = "https://openai.iconsz.com/ollama3090/api/chat"
# Model name used when the CLI is not given --model.
DEFAULT_MODEL = "qwen2.5:7b-instruct"
# Default value of OllamaClient.timeout, which is handed to requests.post(timeout=...).
DEFAULT_TIMEOUT = 240
# chat_json_with_retry loops MAX_RETRIES + 1 times (first attempt plus retries).
MAX_RETRIES = 2
# =============================================================================
# 1. 預處理:去噪 + 切段 + 規則抽元數據
# =============================================================================
# Court code (as it appears in a neutral citation) -> full Chinese court name.
# extract_metadata_by_rule uses it both for the code lookup and for the
# reverse lookup by court name. HKMC and HKMagC map to the same name.
JURISDICTION_MAP: dict[str, str] = {
"HKCFA": "香港特別行政區終審法院",
"HKCA": "香港特別行政區高等法院上訴法庭",
"HKCFI": "香港特別行政區高等法院原訟法庭",
"HKDC": "香港特別行政區區域法院",
"HKMC": "香港特別行政區裁判法院",
"HKMagC": "香港特別行政區裁判法院",
"HKSCT": "香港特別行政區小額錢債審裁處",
"HKLT": "香港特別行政區土地審裁處",
"HKLD": "香港特別行政區勞資審裁處",
"HKCT": "香港特別行政區競爭事務審裁處",
"HKCorC": "香港特別行政區死因裁判法庭",
}
# Matches a neutral citation of the form "[YYYY] <court code> <number>",
# case-insensitively. Group 1 = year, group 2 = court code, group 3 = number.
NEUTRAL_CITATION_RE = re.compile(
r"\[(\d{4})\]\s*(HKCFA|HKCA|HKCFI|HKDC|HKMagC|HKMC|HKSCT|HKLT|HKLD|HKCT|HKCorC)\s*(\d+)",
re.I,
)
# Matches a case number: a known prefix, an optional "NO."/"NO", a number,
# then "OF" or "/" followed by a four-digit year (case-insensitive).
CASE_NO_RE = re.compile(
r"(FACV|FACC|FAMV|FAMC|CACV|CACC|HCA|HCAL|HCMP|HCCW|HCB|DCCJ|DCMP|SCTC|LBTC|LDPD)"
r"\s*(?:NO\.?)?\s*\d+\s*(?:OF|/|\sof\s)\s*\d{4}",
re.I,
)
def clean_text(raw: str) -> str:
    """Remove pagination noise and normalise whitespace in raw judgment text.

    Steps, in order:
      1. drop "Page X of Y" markers (case-insensitive);
      2. blank out lines that consist only of a "- N -" page number;
      3. replace runs of full-width (ideographic, U+3000) spaces with one space;
      4. collapse runs of ASCII spaces/tabs into one space;
      5. collapse three or more consecutive newlines into a single blank line.

    Args:
        raw: Judgment text as read from disk.

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    t = re.sub(r"Page\s+\d+\s+of\s+\d+", "", raw, flags=re.I)
    t = re.sub(r"^\s*-\s*\d+\s*-\s*$", "", t, flags=re.M)
    # The ideographic space is written as an escape so the pattern cannot be
    # silently confused with (or converted into) an ordinary ASCII space.
    t = re.sub(r"\u3000+", " ", t)
    t = re.sub(r"[ \t]+", " ", t)
    t = re.sub(r"\n{3,}", "\n\n", t)
    return t.strip()
def extract_metadata_by_rule(text: str) -> dict[str, Any]:
    """Extract jurisdiction and case number using regexes only (no LLM).

    The court code is taken from the first neutral citation found in *text*
    and normalised to the spelling used as a key in JURISDICTION_MAP. If that
    yields nothing, the first court in JURISDICTION_MAP whose full name (or
    its name without the "香港特別行政區" prefix) occurs in *text* is used.
    The case number is the first CASE_NO_RE match with whitespace runs
    collapsed to single spaces.

    Returns:
        A dict with keys ``jurisdiction_code``, ``jurisdiction_name``,
        ``case_location`` (always ``["香港特別行政區"]``) and ``case_number``;
        values that could not be determined are ``None``.
    """
    jurisdiction_code = None

    citation = NEUTRAL_CITATION_RE.search(text)
    if citation is not None:
        wanted = citation.group(2).upper()
        # The regex is case-insensitive; map back to the canonical key casing.
        jurisdiction_code = next(
            (key for key in JURISDICTION_MAP if key.upper() == wanted), None)

    if not jurisdiction_code:
        # Fallback: reverse lookup by court name appearing in the text.
        region_prefix = "香港特別行政區"
        jurisdiction_code = next(
            (key for key, court in JURISDICTION_MAP.items()
             if court in text or court.replace(region_prefix, "") in text),
            None)

    case_number = None
    case_match = CASE_NO_RE.search(text)
    if case_match is not None:
        case_number = re.sub(r"\s+", " ", case_match.group(0).strip())

    return {
        "jurisdiction_code": jurisdiction_code,
        "jurisdiction_name": (JURISDICTION_MAP[jurisdiction_code]
                              if jurisdiction_code else None),
        "case_location": ["香港特別行政區"],
        "case_number": case_number,
    }
# -----------------------------------------------------------------------------
# 關鍵詞 + 窗口召回(取代脆弱的正則切段)
# -----------------------------------------------------------------------------
# 思路:每個抽取目標定義一組「高信號關鍵詞」,掃全文取所有命中位置周圍
# ±half_window 字符的窗口,合併重疊後拼接喂給 LLM。
# 不依賴判決書的固定章節標題,對結構各異的香港判決書都能工作。
# Extraction target -> list of high-signal keywords. gather_all() feeds each
# list to gather_chunks(), which collects the text around every occurrence.
KEYWORD_GROUPS: dict[str, list[str]] = {
# Call 1: parties. Concentrated in the heading, but may also appear in running
# text such as "本案中,原告...".
"parties": [
"BETWEEN", "介乎", " AND ",
"申索人", "原告", "原訴人", "上訴人", "覆核申請人", "覆核人", "申請人",
"被告", "答辯人", "被上訴人", "答辯方",
"Plaintiff", "Defendant", "Appellant", "Respondent", "Applicant",
],
# Call 2: cause of action and subject matter.
"reason_object": [
# Section-heading style terms
"案情", "背景", "引言", "事實", "案件背景",
"INTRODUCTION", "BACKGROUND", "FACTS", "THE FACTS",
# Claim / allegation terms
"申索", "索償", "訴因", "聲稱", "請求", "本案涉及", "本訴訟",
"原告聲稱", "申索人聲稱", "申索人指稱",
# Subject-matter terms
"賠償", "損失", "損害", "精神困擾", "經濟損失", "醫療費",
"履行", "所有權", "占有", "撤銷", "宣告",
],
# Call 3: judgment result.
"judgment_result": [
# Order terms
"本席命令", "本庭命令", "本席裁定", "本庭裁定", "命令如下",
"DISPOSITION", "ORDER", "CONCLUSION", "I therefore order",
# Outcome terms
"判決", "判給", "獲判", "判處", "支付",
"勝訴", "敗訴", "部分勝訴", "駁回", "撤銷", "維持", "發還",
"ALLOWED", "DISMISSED", "GRANTED", "REFUSED",
# Closing connectives
"因此", "故此", "綜上",
],
# Call 4: involved entities — judges, lawyers, judges named in cited cases.
"entities": [
# Judicial titles
"法官", "大法官", "審裁官", "裁判官", "首席法官", "常任法官", "非常任法官",
"Hon.", " J.", " JA ", " CJ ", " PJ ", " NPJ ",
# Representation terms
"代表", "大律師", "律師", "資深大律師", "代表律師",
"Counsel", "Solicitor",
# Case citations (the surrounding text tends to name the judges)
" v ", " v. ", "[19", "[20",
],
# Call 5: the court's analysis (core input for the summary).
# Note: generic one/two-character words such as "分析", "理由", "引用" are
# deliberately avoided — they also occur heavily in procedural paragraphs,
# tables of contents and citation indexes, and would spread the recall over
# the whole document. Phrases of the form "本席/本庭 + verb" are used as
# anchors instead.
"analysis": [
"本席認為", "本席接納", "本席不接納", "本席同意", "本席不同意",
"本席裁定", "本席拒絕", "本席認同",
"本庭認為", "本庭接納", "本庭裁定",
"I find", "I accept", "I do not accept", "I conclude",
"The court finds", "In my view", "In my judgment",
"舉證責任", "審慎責任", "鄰人原則", "替代責任",
"違反", "侵權", "過失",
],
}
def gather_chunks(text: str,
                  keywords: list[str],
                  half_window: int = 500,
                  max_total: int = 6500,
                  case_sensitive: bool = False) -> tuple[str, int]:
    """Collect the text surrounding every keyword occurrence.

    For each occurrence of each keyword (matched literally), a window of
    ``half_window`` characters on either side is taken. Overlapping or
    touching windows are merged, and the merged segments are emitted in
    document order, separated by a "[…]" marker, until the ``max_total``
    character budget is used up. A segment that does not fit is cut to the
    remaining budget when more than 200 characters remain, otherwise dropped;
    nothing after it is emitted. The separator markers are not counted
    against the budget.

    Args:
        text: Document to search.
        keywords: Literal strings to look for.
        half_window: Characters of context kept before and after each match.
        max_total: Budget for the summed segment lengths.
        case_sensitive: Match case exactly when true; ignore case otherwise.

    Returns:
        ``(joined_text, match_count)`` where ``match_count`` is the total
        number of keyword occurrences found. With no occurrences the first
        ``max_total`` characters of *text* and 0 are returned; for empty
        *text* the result is ``("", 0)``.
    """
    if not text:
        return "", 0

    re_flags = re.IGNORECASE if not case_sensitive else 0
    text_len = len(text)
    windows = sorted(
        (max(0, hit.start() - half_window),
         min(text_len, hit.end() + half_window))
        for keyword in keywords
        for hit in re.finditer(re.escape(keyword), text, flags=re_flags)
    )
    if not windows:
        return text[:max_total], 0

    # Fold the sorted windows into disjoint spans.
    spans: list[list[int]] = []
    for start, end in windows:
        if spans and start <= spans[-1][1]:
            if end > spans[-1][1]:
                spans[-1][1] = end
        else:
            spans.append([start, end])

    # Emit spans in order while the character budget lasts.
    budget = max_total
    selected: list[str] = []
    for start, end in spans:
        length = end - start
        if length > budget:
            if budget > 200:
                selected.append(text[start:start + budget])
            break
        selected.append(text[start:end])
        budget -= length
    return "\n\n[…]\n\n".join(selected), len(windows)
def gather_all(text: str) -> dict[str, str]:
    """Run gather_chunks once per keyword group.

    Returns:
        A dict holding, for every group name in KEYWORD_GROUPS, the recalled
        context under the group name and the match count (as a string, kept
        for logging) under ``"_<group>_hits"``.
    """
    # (half_window, max_total) per group; tune here if recall is too wide or
    # too narrow for a given target.
    budgets: dict[str, tuple[int, int]] = {
        "parties": (400, 3000),
        "reason_object": (500, 6000),
        "judgment_result": (500, 6500),
        "entities": (400, 6500),
        "analysis": (500, 6500),
    }
    gathered: dict[str, str] = {}
    for name in KEYWORD_GROUPS:
        window, limit = budgets[name]
        snippet, hit_count = gather_chunks(
            text, KEYWORD_GROUPS[name], half_window=window, max_total=limit)
        gathered.update({name: snippet, f"_{name}_hits": str(hit_count)})
    return gathered
# =============================================================================
# 2. Ollama 客戶端JSON Schema 強制 + 重試
# =============================================================================
@dataclass
class OllamaClient:
    """Small client for an Ollama-style chat endpoint that returns JSON.

    Every request carries a JSON Schema in the ``format`` field so the model
    is asked to produce output of that shape.
    """

    # Model name sent in each request payload.
    model: str = DEFAULT_MODEL
    # URL the chat request is POSTed to.
    url: str = OLLAMA_URL
    # Timeout handed to requests.post.
    timeout: int = DEFAULT_TIMEOUT

    def chat_json(self, system: str, user: str, schema: dict,
                  temperature: float = 0.0,
                  num_ctx: int = 8192) -> dict:
        """Send one non-streaming chat request and parse the reply as JSON.

        Args:
            system: System prompt.
            user: User message.
            schema: JSON Schema sent as the request's ``format`` field.
            temperature: Sampling temperature placed in ``options``.
            num_ctx: Context size placed in ``options``.

        Returns:
            The decoded JSON value of the reply's message content.

        Raises:
            requests.HTTPError: On a non-success HTTP status.
            json.JSONDecodeError: If the content is not valid JSON even after
                a surrounding Markdown code fence has been stripped.
        """
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "format": schema,
            "stream": False,
            "options": {"temperature": temperature, "num_ctx": num_ctx},
        }
        r = requests.post(self.url, json=payload, timeout=self.timeout)
        r.raise_for_status()
        content = r.json()["message"]["content"]
        try:
            return json.loads(content)
        except json.JSONDecodeError:
            # Second chance: the model may have wrapped the JSON in a
            # ```json ... ``` fence despite the schema constraint.
            stripped = re.sub(r"^```(?:json)?\s*|\s*```$", "",
                              content.strip(), flags=re.S)
            return json.loads(stripped)

    def chat_json_with_retry(self, system: str, user: str, schema: dict,
                             validator=None, **kw) -> dict:
        """Call chat_json up to ``MAX_RETRIES + 1`` times until output validates.

        Args:
            system: System prompt.
            user: User message; after a failed validation the validator's
                hint is appended to it for the next attempt.
            schema: JSON Schema forwarded to chat_json.
            validator: Optional callable ``validator(result) -> (ok, hint)``.
                Without one, the first successfully parsed reply is returned.
            **kw: Extra keyword arguments forwarded to chat_json.

        Returns:
            The first output accepted by the validator. If none is accepted,
            the most recent output that was produced is returned as a
            best-effort result (callers apply their own post-processing).

        Raises:
            Exception: The last error seen, but only when no attempt produced
                any output at all. Previously an error from an early attempt
                was re-raised even if later attempts had returned output.
            RuntimeError: If no attempt was made (MAX_RETRIES below zero).
        """
        last_err: Exception | None = None
        last_out: dict | None = None
        for _ in range(MAX_RETRIES + 1):
            try:
                out = self.chat_json(system, user, schema, **kw)
                if validator is None:
                    return out
                ok, hint = validator(out)
            except Exception as e:  # request, parsing or validator failure
                last_err = e
                continue
            if ok:
                return out
            last_out = out
            # Feed the validator's complaint back to the model.
            user = (f"{user}\n\n上次輸出存在問題:{hint}\n"
                    f"請修正後重新輸出。")
        if last_out is not None:
            return last_out
        if last_err is not None:
            raise last_err
        raise RuntimeError("chat_json_with_retry made no attempts")
# =============================================================================
# 3. 五次抽取調用:每次只負責一組字段
# =============================================================================
# --- Call 1: 當事人 ----------------------------------------------------------
# JSON Schema for call 1: an object with two required arrays of strings,
# "plaintiff" and "defendant".
PARTIES_SCHEMA = {
"type": "object",
"properties": {
"plaintiff": {"type": "array", "items": {"type": "string"}},
"defendant": {"type": "array", "items": {"type": "string"}},
},
"required": ["plaintiff", "defendant"],
}
# System prompt for call 1 (runtime text, sent to the model verbatim): tells
# the model which party roles map to plaintiff vs defendant and to emit JSON only.
PARTIES_SYSTEM = """你是香港法律文書信息抽取助手。
從給定的判決書抬頭中抽取所有當事人完整姓名/機構名。
- 原告/申索人/上訴人/覆核申請人 → plaintiff
- 被告/答辯人/被上訴人 → defendant
- 保留中英文對照(如有)
- 某類無則輸出空數組
只輸出符合 schema 的 JSON不要解釋。"""
# One worked input/output example that extract_parties puts at the start of
# the user message.
PARTIES_FEWSHOT = """範例輸入:
BETWEEN
陳大文 (CHAN TAI MAN) 上訴人
AND
香港房屋委員會 (Hong Kong Housing Authority) 答辯人
範例輸出:
{"plaintiff":["陳大文 (CHAN TAI MAN)"],"defendant":["香港房屋委員會 (Hong Kong Housing Authority)"]}"""
def extract_parties(client: OllamaClient, context: str) -> dict:
    """Call 1: ask the model for plaintiff/defendant names.

    Only the first 3000 characters of *context* are sent, fenced and preceded
    by the few-shot example. No validator is applied.

    Returns:
        The model's JSON output constrained by PARTIES_SCHEMA.
    """
    snippet = context[:3000]
    prompt = (f"{PARTIES_FEWSHOT}\n\n"
              f"請從以下片段(多處關鍵詞召回拼接)抽取:\n```\n{snippet}\n```")
    return client.chat_json_with_retry(PARTIES_SYSTEM, prompt, PARTIES_SCHEMA)
# --- Call 2: 事由 + 標的 ----------------------------------------------------
# JSON Schema for call 2: "case_reason" (string, maxLength 100) and
# "case_object" (array of strings), both required.
REASON_OBJECT_SCHEMA = {
"type": "object",
"properties": {
"case_reason": {"type": "string", "maxLength": 100},
"case_object": {"type": "array", "items": {"type": "string"}},
},
"required": ["case_reason", "case_object"],
}
# System prompt for call 2 (runtime text, sent verbatim): defines how the
# cause of action and the subject matter must be written and what to exclude.
REASON_OBJECT_SYSTEM = """從香港判決書中抽取:
1. case_reason事由
- 嚴格 ≤100 字,單句
- 結構:[原告身份] + [針對什麼事件/行為] + [向誰] + [提出什麼請求]
- 覆核/上訴案件須註明對哪個裁決提出覆核(含日期/案號)
- 嚴禁包含:判決結果、法庭分析、案發細節
2. case_object標的物
- 訴訟請求指向的實體權利或利益
- 例:人身傷害賠償、合同履行、房產所有權、精神困擾賠償
- 合併本質相同的標的
- 嚴禁:證據材料、程序性訴求(如"要求法庭裁決"
只輸出 JSON。"""
# Example output that extract_reason_object puts at the start of the user message.
REASON_OBJECT_FEWSHOT = """範例輸出:
{"case_reason":"申索人為商場保安員就被告於2023年7月在商場毆打申索人造成的人身傷害向被告提出損害賠償申索。","case_object":["人身傷害賠償","醫療費用","精神困擾賠償"]}"""
def _reason_object_validator(out: dict) -> tuple[bool, str]:
r = out.get("case_reason", "")
if len(r) > 100:
return False, f"case_reason 共 {len(r)} 字,超過 100 字上限,請壓縮到 80 字以內。"
if not out.get("case_object"):
return False, "case_object 不能為空。"
return True, ""
def extract_reason_object(client: OllamaClient, context: str) -> dict:
    """Call 2: extract the cause of action and the subject matter.

    At most the first 6000 characters of *context* are sent. The reply is
    checked by _reason_object_validator; whatever comes back is finally
    hard-cut so ``case_reason`` never exceeds 100 characters.

    Returns:
        The model's JSON output constrained by REASON_OBJECT_SCHEMA.
    """
    snippet = context[:6000]
    prompt = (f"{REASON_OBJECT_FEWSHOT}\n\n"
              f"請從以下片段(多處關鍵詞召回拼接)抽取:\n```\n{snippet}\n```")
    extracted = client.chat_json_with_retry(
        REASON_OBJECT_SYSTEM,
        prompt,
        REASON_OBJECT_SCHEMA,
        validator=_reason_object_validator,
    )
    reason = extracted["case_reason"]
    if len(reason) > 100:
        # Retries did not bring it under the limit; truncate as a last resort.
        extracted["case_reason"] = reason[:100]
    return extracted
# --- Call 3: 判決結果 -------------------------------------------------------
# JSON Schema for call 3: a required array "judgment_result" whose items are
# objects with required string fields "charge" and "result".
JUDGMENT_RESULT_SCHEMA = {
"type": "object",
"properties": {
"judgment_result": {
"type": "array",
"items": {
"type": "object",
"properties": {
"charge": {"type": "string"},
"result": {"type": "string"},
},
"required": ["charge", "result"],
},
}
},
"required": ["judgment_result"],
}
# System prompt for call 3 (runtime text, sent verbatim): how to split results
# into items and what each "charge" / "result" must contain.
JUDGMENT_RESULT_SYSTEM = """從香港判決書命令/裁定部分抽取所有判決結果。
拆分原則:
- 多項請求 → 分條
- "責任判定""損失/金額計算" 兩個層面 → 必須分條
- 每條 charge 必須以 "(責任問題)""(損失範圍)" 結尾標註層次
- result 必須包含:
a) 明確結果(勝訴/敗訴/部分勝訴/維持/撤銷/駁回等)
b) 2-3 個關鍵法庭理由
c) 具體金額、利率或命令內容(如有)
只輸出 JSON。"""
# Example output that extract_judgment_result puts at the start of the user message.
JUDGMENT_RESULT_FEWSHOT = """範例輸出:
{"judgment_result":[
{"charge":"申索人就毆打事件的人身傷害索償 (責任問題)","result":"勝訴。法庭接納申索人證供可信,閉路電視顯示被告先動手,被告亦承認部分情節。"},
{"charge":"醫療費及精神困擾賠償金額 (損失範圍)","result":"部分勝訴。判給醫療費HK$8,500及一般損害賠償HK$20,000合共HK$28,500連同利息及訟費。"}
]}"""
def _judgment_validator(out: dict) -> tuple[bool, str]:
items = out.get("judgment_result", [])
if not items:
return False, "judgment_result 不能為空。"
bad = [i for i in items
if "責任問題" not in i.get("charge", "")
and "損失範圍" not in i.get("charge", "")]
if bad:
return False, (f"{len(bad)} 條 charge 未標註層次。"
f"每條 charge 必須以 '(責任問題)''(損失範圍)' 結尾。")
return True, ""
def extract_judgment_result(client: OllamaClient, context: str) -> dict:
    """Call 3: extract the itemised judgment results.

    At most the first 6500 characters of *context* are sent; the reply is
    checked by _judgment_validator.

    Returns:
        The model's JSON output constrained by JUDGMENT_RESULT_SCHEMA.
    """
    snippet = context[:6500]
    prompt = (f"{JUDGMENT_RESULT_FEWSHOT}\n\n"
              f"請從以下片段(多處關鍵詞召回拼接)抽取:\n```\n{snippet}\n```")
    return client.chat_json_with_retry(
        JUDGMENT_RESULT_SYSTEM,
        prompt,
        JUDGMENT_RESULT_SCHEMA,
        validator=_judgment_validator,
    )
# --- Call 4: 涉及實體 -------------------------------------------------------
# JSON Schema for call 4: a required array "involved_entities" whose items are
# objects with required string fields "entity_name" and "reason".
ENTITIES_SCHEMA = {
"type": "object",
"properties": {
"involved_entities": {
"type": "array",
"items": {
"type": "object",
"properties": {
"entity_name": {"type": "string"},
"reason": {"type": "string"},
},
"required": ["entity_name", "reason"],
},
}
},
"required": ["involved_entities"],
}
# System prompt for call 4 (runtime text, sent verbatim): which entities must
# be listed and which kinds of names must be left out.
ENTITIES_SYSTEM = """從香港判決書中抽取所有相關實體(自然人/法人/組織/機構)。
必須包含:
- 主審法官 / 審裁官
- 雙方代表律師、大律師
- 判決中引用的先例所提及的法官
reason 須寫明在XX案[案號]中擔任XX職位闡述XX法律原則
- 涉案的政府部門、公司、機構
嚴禁包含:
- 法案/條例名如《侵權條例》、Cap.xxx
- 純案例名稱(如 Donoghue v Stevenson
- 文獻、期刊名
只輸出 JSON。"""
# Example output that extract_entities puts at the start of the user message.
ENTITIES_FEWSHOT = """範例輸出:
{"involved_entities":[
{"entity_name":"林希維審裁官","reason":"本案主審審裁官,負責認定事實及裁決。"},
{"entity_name":"終審法院常任法官李義","reason":"在Tang Kwok Wah v HKSAR [2019] HKCFA 23 中擔任主筆法官闡述舉證責任原則本案第34段引用其判詞。"},
{"entity_name":"康樂文化事務署","reason":"涉案場所通州街公園的管理機構。"}
]}"""
def _entities_validator(out: dict) -> tuple[bool, str]:
ents = out.get("involved_entities", [])
if not ents:
return False, "involved_entities 不能為空,至少要有主審法官。"
blacklist = ["條例", "Cap.", "法案"]
bad = [e["entity_name"] for e in ents
if any(k in e.get("entity_name", "") for k in blacklist)]
if bad:
return False, f"以下實體疑為條例/法案,應移除:{bad}"
return True, ""
def extract_entities(client: OllamaClient, context: str) -> dict:
    """Call 4: extract the persons and organisations involved in the case.

    At most the first 6500 characters of *context* are sent; the reply is
    checked by _entities_validator.

    Returns:
        The model's JSON output constrained by ENTITIES_SCHEMA.
    """
    snippet = context[:6500]
    prompt = (f"{ENTITIES_FEWSHOT}\n\n"
              f"請從以下片段(多處關鍵詞召回拼接)抽取所有涉及實體:\n"
              f"```\n{snippet}\n```")
    return client.chat_json_with_retry(
        ENTITIES_SYSTEM,
        prompt,
        ENTITIES_SCHEMA,
        validator=_entities_validator,
    )
# --- Call 5: 判決總結(基於已抽取結果 + 分析段,不從原文重生) -----------
# JSON Schema for call 5: a single required string "judgment_summary"
# with maxLength 300.
SUMMARY_SCHEMA = {
"type": "object",
"properties": {
"judgment_summary": {"type": "string", "maxLength": 300},
},
"required": ["judgment_summary"],
}
# System prompt for call 5 (runtime text, sent verbatim): the summary is
# written from the already-extracted fields plus an analysis excerpt and must
# cover four elements within 300 characters.
SUMMARY_SYSTEM = """根據已抽取的結構化字段 + 法庭分析段,撰寫判決總結。
四要素結構(必須全部涵蓋,連貫成單段):
(1) 案件背景1-2 句交代起因與當事人關係
(2) 核心爭議焦點
(3) 法庭法律分析與推理(核心重點):
- 如何評估證據?
- 接受 / 拒絕主張的邏輯?
- 引用了哪些關鍵法律或判例?
(4) 最終裁決結果及命令
嚴格 ≤300 字。只輸出 JSON。"""
def _summary_validator(out: dict) -> tuple[bool, str]:
s = out.get("judgment_summary", "")
if len(s) > 300:
return False, f"summary 共 {len(s)} 字,超過 300 字上限,請壓縮。"
if len(s) < 80:
return False, "summary 過短,請完整覆蓋四要素。"
return True, ""
def extract_summary(client: OllamaClient,
                    prior: dict, analysis: str) -> dict:
    """Call 5: write the judgment summary from earlier results plus analysis text.

    The user message contains *prior* serialised as indented JSON and the
    first 3500 characters of *analysis*, each inside a fenced block. The
    reply is checked by _summary_validator and finally hard-cut to 300
    characters.

    Args:
        client: Client used for the model call.
        prior: Fields extracted by the previous calls.
        analysis: Recalled text of the court's analysis.

    Returns:
        The model's JSON output constrained by SUMMARY_SCHEMA.
    """
    prompt = "\n".join([
        "已抽取的字段:",
        "```json",
        json.dumps(prior, ensure_ascii=False, indent=2),
        "```",
        "法庭分析節選:",
        "```",
        analysis[:3500],
        "```",
        "請按四要素撰寫 ≤300 字的 judgment_summary。",
    ])
    drafted = client.chat_json_with_retry(
        SUMMARY_SYSTEM, prompt, SUMMARY_SCHEMA, validator=_summary_validator)
    text = drafted["judgment_summary"]
    if len(text) > 300:
        # Retries did not bring it under the limit; truncate as a last resort.
        drafted["judgment_summary"] = text[:300]
    return drafted
# =============================================================================
# 4. 全局校驗與後處理
# =============================================================================
# Substrings that mark a case_location entry as a court, venue or building
# rather than a region; such entries are dropped by validate_and_fix.
# NOTE(review): this list previously held two empty-string entries. Because
# "" is a substring of every string, they caused every location to be
# discarded. They are removed here; if they stood for real terms whose
# characters were lost, restore those terms.
LOCATION_BLACKLIST = [
    "法院", "法庭", "審裁處", "公園", "大廈", "大樓", "商場",
    "道路", "中心", "醫院", "酒店", "車站",
]
# Substrings that mark an entity name as a statute, law report or journal.
ENTITY_NAME_BLACKLIST = ["條例", "Cap.", "法案", "案例彙編", "Reports",
                         "期刊", "Journal"]


def validate_and_fix(result: dict) -> tuple[dict, list[str]]:
    """Apply final rule-based clean-up to the assembled result, in place.

    The following is done, in order:
      * ``case_location``: entries containing a LOCATION_BLACKLIST term are
        removed and "香港特別行政區" is put first if it is missing;
      * ``case_reason`` / ``judgment_summary``: cut to 100 / 300 characters;
      * ``involved_entities``: entries whose name contains an
        ENTITY_NAME_BLACKLIST term are removed;
      * ``judgment_result``: items lacking a "責任問題"/"損失範圍" tag are
        reported (not changed);
      * empty key fields are reported for manual review.

    Args:
        result: The assembled output dict; it is modified and returned.

    Returns:
        ``(result, warnings)`` where ``warnings`` lists every fix or concern
        as a human-readable message.
    """
    warnings: list[str] = []

    # case_location: drop courts / venues / buildings.
    # Empty terms are skipped defensively: "" matches every string.
    location_terms = [term for term in LOCATION_BLACKLIST if term]
    locs = result.get("case_location") or []
    cleaned = [loc for loc in locs
               if loc and not any(term in loc for term in location_terms)]
    if "香港特別行政區" not in cleaned:
        cleaned.insert(0, "香港特別行政區")
    if set(cleaned) != set(locs):
        warnings.append(
            f"case_location 已清理:移除 {set(locs) - set(cleaned)}")
    result["case_location"] = cleaned

    # Hard length limits.
    if len(result.get("case_reason", "")) > 100:
        warnings.append("case_reason > 100 字,已截斷")
        result["case_reason"] = result["case_reason"][:100]
    if len(result.get("judgment_summary", "")) > 300:
        warnings.append("judgment_summary > 300 字,已截斷")
        result["judgment_summary"] = result["judgment_summary"][:300]

    # involved_entities: drop statutes / literature.
    ents = result.get("involved_entities") or []
    cleaned_ents = [e for e in ents
                    if not any(k in e.get("entity_name", "")
                               for k in ENTITY_NAME_BLACKLIST)]
    if len(cleaned_ents) != len(ents):
        warnings.append(
            f"involved_entities 移除 {len(ents) - len(cleaned_ents)} 條疑似條例/文獻")
    result["involved_entities"] = cleaned_ents

    # judgment_result: report items without a level tag.
    for jr in result.get("judgment_result", []) or []:
        if ("責任問題" not in jr.get("charge", "")
                and "損失範圍" not in jr.get("charge", "")):
            warnings.append(
                f"judgment_result 條目缺層次標註:{jr.get('charge', '')[:40]}")

    # Report empty fields.
    for k in ("plaintiff", "defendant", "case_object",
              "judgment_result", "involved_entities"):
        if not result.get(k):
            warnings.append(f"{k} 為空,請人工複核")
    return result, warnings
# =============================================================================
# 5. 主管線
# =============================================================================
def run_pipeline(text: str, model: str) -> dict:
    """Run the whole extraction pipeline on one judgment text.

    Stages: rule-based cleaning, metadata extraction and keyword recall, then
    five model calls (parties, reason/object, judgment result, entities,
    summary), then validate_and_fix. Progress and warnings go to stderr.

    Args:
        text: Raw judgment text.
        model: Model name passed to OllamaClient.

    Returns:
        The final result dict after validation.
    """
    def log(message):
        print(message, file=sys.stderr)

    log("[1/6] 預處理 + 關鍵詞召回...")
    text = clean_text(text)
    meta = extract_metadata_by_rule(text)
    ctx = gather_all(text)
    log(f" 規則元數據:{meta}")
    log(f" 召回片段:")
    group_names = ("parties", "reason_object", "judgment_result",
                   "entities", "analysis")
    for g in group_names:
        log(f" {g:16s} len={len(ctx[g]):5d} hits={ctx[f'_{g}_hits']}")

    client = OllamaClient(model=model)

    log("[2/6] 抽取當事人...")
    parties = extract_parties(client, ctx["parties"])

    log("[3/6] 抽取事由與標的...")
    reason_obj = extract_reason_object(client, ctx["reason_object"])

    log("[4/6] 抽取判決結果...")
    judgment = extract_judgment_result(client, ctx["judgment_result"])

    log("[5/6] 抽取涉及實體...")
    # Entity context = parties snippet (it carries counsel names) followed by
    # the citation/judge snippet, capped at 6500 characters overall.
    combined = ctx["parties"][:2500] + "\n\n[…]\n\n" + ctx["entities"]
    entities = extract_entities(client, combined[:6500])

    # Everything extracted so far is handed to the summary call.
    prior: dict = {}
    for part in (parties, reason_obj, judgment, entities):
        prior.update(part)
    prior["jurisdiction_name"] = meta["jurisdiction_name"]

    log("[6/6] 撰寫判決總結...")
    summary = extract_summary(client, prior, ctx["analysis"])

    # NOTE(review): meta["case_number"] is extracted but not included in the
    # final output — confirm whether that is intended.
    assembled = {
        "plaintiff": parties["plaintiff"],
        "defendant": parties["defendant"],
        "jurisdiction_code": meta["jurisdiction_code"],
        "jurisdiction_name": meta["jurisdiction_name"],
        "case_location": meta["case_location"],
        "case_reason": reason_obj["case_reason"],
        "case_object": reason_obj["case_object"],
        "judgment_result": judgment["judgment_result"],
        "judgment_summary": summary["judgment_summary"],
        "involved_entities": entities["involved_entities"],
    }
    assembled, warnings = validate_and_fix(assembled)
    for w in warnings:
        log(f" ⚠️ {w}")
    return assembled
# =============================================================================
# 6. YAML 輸出(長字串用 > 折疊;含特殊字符的自動雙引號)
# =============================================================================
class FoldedStr(str):
    """Marker subclass of ``str``: values of this type are dumped in YAML folded (``>``) style."""
def _folded_str_representer(dumper, data):
    """Represent *data* as a YAML string scalar in folded (``>``) style."""
    str_tag = "tag:yaml.org,2002:str"
    return dumper.represent_scalar(str_tag, data, style=">")
def _safe_str_representer(dumper, data):
    """Represent a string, forcing double quotes when it could be misread.

    Double quotes are forced for non-empty strings that contain ":" or that
    start with "#" or "- "; every other string gets the default style.
    """
    risky = bool(data) and (":" in data or data.startswith(("#", "- ")))
    if not risky:
        return dumper.represent_scalar("tag:yaml.org,2002:str", data)
    return dumper.represent_scalar("tag:yaml.org,2002:str", data, style='"')
# Module-level registration (runs at import time): FoldedStr values use the
# folded-style representer, plain str values use the quoting-aware one.
yaml.add_representer(FoldedStr, _folded_str_representer)
yaml.add_representer(str, _safe_str_representer)
def to_yaml(result: dict) -> str:
    """Serialise the result dict to YAML text.

    Non-empty ``case_reason`` and ``judgment_summary`` values are emitted in
    folded (``>``) style. Key order is preserved and non-ASCII characters are
    written as-is.

    The wrapping is done on a shallow copy, so the caller's dict is left
    untouched (it used to have those two values replaced by FoldedStr
    instances as a side effect).

    Args:
        result: The final pipeline result.

    Returns:
        The YAML document as a string.
    """
    doc = dict(result)
    for key in ("case_reason", "judgment_summary"):
        if doc.get(key):
            doc[key] = FoldedStr(doc[key])
    return yaml.dump(doc, allow_unicode=True, sort_keys=False,
                     default_flow_style=False, width=100)
# =============================================================================
# CLI
# =============================================================================
def main() -> None:
    """Command-line entry point.

    Reads the judgment text (UTF-8) from the given path, runs the pipeline,
    optionally dumps the raw result as JSON, and writes the YAML either to
    the ``--out`` path or to stdout.
    """
    parser = argparse.ArgumentParser(
        description="香港判決書結構化抽取(本地 Ollama 版)")
    parser.add_argument("input", help="判決書文本路徑(.txt")
    parser.add_argument("--model", default=DEFAULT_MODEL, help="Ollama 模型名")
    parser.add_argument("--out", default=None, help="輸出 YAML 路徑(默認 stdout")
    parser.add_argument("--debug-dump", default=None,
                        help="額外輸出原始 JSON 結果到該路徑(便於 diff")
    opts = parser.parse_args()

    source_text = Path(opts.input).read_text(encoding="utf-8")
    result = run_pipeline(source_text, opts.model)

    if opts.debug_dump:
        raw_json = json.dumps(result, ensure_ascii=False, indent=2)
        Path(opts.debug_dump).write_text(raw_json, encoding="utf-8")

    rendered = to_yaml(result)
    if not opts.out:
        print(rendered)
        return
    Path(opts.out).write_text(rendered, encoding="utf-8")
    print(f"\n✅ 已寫入 {opts.out}", file=sys.stderr)


if __name__ == "__main__":
    main()