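"""
hk_case_extractor.py
==========================================================
Structured field-extraction pipeline for Hong Kong court judgments.
Built on a local Ollama small model + staged extraction + JSON Schema
enforcement + validation with retry.

Design
------
Tuned from an analysis of how real Hong Kong judgments are structured:
1. Pre-processing: rule-based de-noising and segmentation; high-certainty
   fields such as jurisdiction and case number are extracted purely by rule.
2. Smart targeting:
   - Basic information (parties, case number): taken directly from the
     first 2000 characters.
   - Judgment result: taken preferentially from the last 4000 characters.
   - Other fields: relevant passages are recalled by keyword.
3. Grouped extraction: split into 5 independent Ollama calls, each
   responsible for only 1-3 fields.
4. Schema enforcement: constrain the output with Ollama 0.5+'s
   format=<JSON Schema>.
5. Validation + retry: word count, blacklist and structural annotations are
   validated item by item.
6. judgment_summary is not regenerated from the source text; it is produced
   from the results of the first 4 steps plus one analysis passage.

Note: the character counts quoted in this docstring are indicative. The
slice lengths actually used are the ``head_length`` / ``tail_length``
arguments of ``gather_all`` (both default to 5000).

Judgment structure (based on an analysis of real cases)
-------------------------------------------------------
- Opening part (first 2000 characters):
  * Case number (e.g. CACV000175/2000)
  * Court name and level
  * Party information (BETWEEN...AND format)
  * Case title
  * Hearing date and judge information

- Middle part:
  * Case background (BACKGROUND, INTRODUCTION, 背景, 案情)
  * Legal analysis and reasoning
  * Assessment of evidence
  * Citation of legal principles

- Closing part (last 4000 characters):
  * Judgment result (JUDGMENT, ORDER, CONCLUSION, 判決, 命令)
  * Specific orders and rulings
  * Costs arrangements
  * Judge's signature

Dependencies
------------
pip install requests pyyaml

Usage
-----
# Use a local Ollama (default)
python hk_case_extractor.py case.txt
python hk_case_extractor.py case.txt --model qwen2.5:7b-instruct --out result.yaml

# Use OpenRouter
python hk_case_extractor.py case.txt \\
    --base-url https://openrouter.ai/api/v1 \\
    --model anthropic/claude-3.5-sonnet \\
    --api-key your-api-key

# Use OpenAI
python hk_case_extractor.py case.txt \\
    --base-url https://api.openai.com/v1 \\
    --model gpt-4 \\
    --api-key your-api-key
"""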
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
import re
|
||
import sys
|
||
import time
|
||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||
from dataclasses import dataclass, field
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
import requests
|
||
import yaml
|
||
|
||
|
||
# =============================================================================
# Configuration
# =============================================================================

DEFAULT_BASE_URL = "http://localhost:11434/v1"  # Ollama's default OpenAI-compatible endpoint
DEFAULT_MODEL = "qwen2.5:7b-instruct"
DEFAULT_API_KEY = "ollama"  # Ollama needs no real key, but the API requires one to be supplied
DEFAULT_TIMEOUT = 600  # raised to 10 minutes to accommodate remote servers
MAX_RETRIES = 2

DEFAULT_MODELS_FILE = "models.json"
|
||
|
||
|
||
# =============================================================================
# Model profiles (models.json): load base_url / api_key / model / pricing by
# profile name
# =============================================================================

def load_model_profile(name: str, models_file: str = DEFAULT_MODELS_FILE) -> dict:
    """Load one model profile from the models file by its ``Name``.

    Expected file format (a JSON array)::

        [{"Name":..., "source":..., "BaseApiUrl":..., "ApiKey":...,
          "model":..., "input_price":..., "output_price":..., "price_unit":...}]

    Args:
        name: Value of the ``Name`` key of the wanted profile.
        models_file: Path to the JSON profile file.

    Returns:
        The first profile object whose ``Name`` equals ``name``.

    Exits:
        Prints a message to stderr and calls ``sys.exit(1)`` when the file is
        missing, is not valid JSON, is not an array, or has no such profile
        (in which case the available names are listed).
    """
    path = Path(models_file)
    if not path.exists():
        print(f"❌ 找不到模型配置文件:{models_file}", file=sys.stderr)
        sys.exit(1)

    try:
        profiles = json.loads(path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        print(f"❌ 模型配置文件不是有效的 JSON:{e}", file=sys.stderr)
        sys.exit(1)

    if not isinstance(profiles, list):
        print("❌ 模型配置文件應為配置對象組成的數組", file=sys.stderr)
        sys.exit(1)

    # Ignore stray non-object entries (strings, numbers, null) instead of
    # crashing on ``.get`` with an AttributeError.
    entries = [p for p in profiles if isinstance(p, dict)]

    for profile in entries:
        if profile.get("Name") == name:
            return profile

    available = [p.get("Name") for p in entries]
    print(f"❌ 配置 '{name}' 不存在。可用配置:{available}", file=sys.stderr)
    sys.exit(1)
|
||
|
||
|
||
# =============================================================================
# 0. Language detection
# =============================================================================

def detect_language(text: str) -> str:
    """Return the dominant language of ``text``: ``'zh'`` or ``'en'``.

    Only the first 3000 characters are sampled. The share of characters in
    the range U+4E00..U+9FFF is measured against the length of the sample
    with leading/trailing whitespace stripped:

    - share > 30%  -> ``'zh'``
    - otherwise    -> ``'en'`` (also returned for empty/blank input)
    """
    sample = text[:3000]
    total_chars = len(sample.strip())
    if total_chars == 0:
        return 'en'

    chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', sample))
    return 'zh' if chinese_chars / total_chars > 0.3 else 'en'
|
||
|
||
|
||
# =============================================================================
# 1. Pre-processing: de-noising + segmentation + rule-based metadata
# =============================================================================

# Court code -> Chinese court name. Several codes are spelling variants that
# map to the same court name.
JURISDICTION_MAP_ZH: dict[str, str] = {
    "HKCFA": "香港特別行政區終審法院",
    "HKCA": "香港特別行政區高等法院上訴法庭",
    "HKCFI": "香港特別行政區高等法院原訟法庭",
    "HKDC": "香港特別行政區區域法院",
    "HKMC": "香港特別行政區裁判法院",
    "HKMagC": "香港特別行政區裁判法院",
    "HKSCT": "香港特別行政區小額錢債審裁處",
    "HKLT": "香港特別行政區土地審裁處",
    "HKLDT": "香港特別行政區土地審裁處",
    "HKLD": "香港特別行政區勞資審裁處",
    "HKLAT": "香港特別行政區勞資審裁處",
    "HKCT": "香港特別行政區競爭事務審裁處",
    "HKCorC": "香港特別行政區死因裁判法庭",
    "HKCrC": "香港特別行政區死因裁判法庭",
}
|
||
|
||
# Court code -> English court name (same keys as the Chinese table).
JURISDICTION_MAP_EN: dict[str, str] = {
    "HKCFA": "Court of Final Appeal of the Hong Kong Special Administrative Region",
    "HKCA": "Court of Appeal of the High Court of the Hong Kong Special Administrative Region",
    "HKCFI": "Court of First Instance of the High Court of the Hong Kong Special Administrative Region",
    "HKDC": "District Court of the Hong Kong Special Administrative Region",
    "HKMC": "Magistrates' Courts of the Hong Kong Special Administrative Region",
    "HKMagC": "Magistrates' Courts of the Hong Kong Special Administrative Region",
    "HKSCT": "Small Claims Tribunal of the Hong Kong Special Administrative Region",
    "HKLT": "Lands Tribunal of the Hong Kong Special Administrative Region",
    "HKLDT": "Lands Tribunal of the Hong Kong Special Administrative Region",
    "HKLD": "Labour Tribunal of the Hong Kong Special Administrative Region",
    "HKLAT": "Labour Tribunal of the Hong Kong Special Administrative Region",
    "HKCT": "Competition Tribunal of the Hong Kong Special Administrative Region",
    "HKCorC": "Coroner's Court of the Hong Kong Special Administrative Region",
    "HKCrC": "Coroner's Court of the Hong Kong Special Administrative Region",
}
|
||
|
||
# Neutral citation, e.g. "[2020] HKCFI 123": group 1 = year, group 2 = court
# code, group 3 = running number. The alternation lists every code that is a
# key of the jurisdiction tables (HKLDT / HKLAT / HKCrC were previously
# missing, so such citations could never match); longer codes come before
# their prefixes for readability.
NEUTRAL_CITATION_RE = re.compile(
    r"\[(\d{4})\]\s*"
    r"(HKCFA|HKCFI|HKCA|HKDC|HKMagC|HKMC|HKSCT|HKLDT|HKLAT|HKLT|HKLD|HKCT|HKCorC|HKCrC)"
    r"\s*(\d+)",
    re.I,
)
# Case number, e.g. "CACV 175/2000" or "HCA No. 12 of 2015": group 1 is the
# case-type prefix.
CASE_NO_RE = re.compile(
    r"(FACV|FACC|FAMV|FAMC|CACV|CACC|CAAG|HCA|HCAL|HCMP|HCCW|HCB|DCCJ|DCMP|SCTC|LBTC|LDPD|LDBM|CCDI|WKCC)"
    r"\s*(?:NO\.?)?\s*\d+\s*(?:OF|/|\sof\s)\s*\d{4}",
    re.I,
)
|
||
|
||
# Case-number prefix -> court code (highest-priority signal for the court).
CASE_NO_PREFIX_MAP: dict[str, str] = {
    "FACV": "HKCFA",  # Final Appeal Civil
    "FACC": "HKCFA",  # Final Appeal Criminal
    "FAMV": "HKCFA",  # Final Appeal Miscellaneous
    "FAMC": "HKCFA",  # Final Appeal Miscellaneous Criminal
    "CACV": "HKCA",  # Court of Appeal Civil
    "CACC": "HKCA",  # Court of Appeal Criminal
    "CAAG": "HKCA",  # Court of Appeal (Administrative)
    "HCA": "HKCFI",  # High Court Action
    "HCAL": "HKCFI",  # High Court Administrative Law
    "HCMP": "HKCFI",  # High Court Miscellaneous Proceedings
    "HCCW": "HKCFI",  # High Court Companies Winding Up
    "HCB": "HKCFI",  # High Court Bankruptcy
    "DCCJ": "HKDC",  # District Court
    "DCMP": "HKDC",  # District Court Miscellaneous Proceedings
    "SCTC": "HKSCT",  # Small Claims Tribunal
    "LBTC": "HKLAT",  # Labour Tribunal
    # NOTE(review): LDPD is understood to be a Lands Tribunal possession
    # application, so it is mapped to the Lands Tribunal code (it was
    # previously mapped to the Labour Tribunal) — confirm against the
    # Judiciary's case-number prefix list.
    "LDPD": "HKLDT",  # Lands Tribunal
    "LDBM": "HKLDT",  # Lands Tribunal
    "CCDI": "HKCrC",  # Coroner's Court
    "WKCC": "HKMagC",  # Magistrates' Court
}
|
||
|
||
|
||
def clean_text(raw: str) -> str:
    """Strip page headers/footers, page numbers and redundant blank space.

    Steps, in order:
      1. remove "Page N of M" markers (case-insensitive);
      2. blank out lines that consist only of a "- N -" page number;
      3. turn runs of ideographic (full-width) spaces U+3000 into one space;
      4. collapse runs of spaces/tabs into one space;
      5. collapse three or more consecutive newlines into one blank line;
      6. strip leading/trailing whitespace.
    """
    t = raw
    t = re.sub(r"Page\s+\d+\s+of\s+\d+", "", t, flags=re.I)
    t = re.sub(r"^\s*-\s*\d+\s*-\s*$", "", t, flags=re.M)
    # Written as an escape so the full-width space cannot be silently turned
    # into an ASCII space by an editor or a Unicode normalisation pass.
    t = re.sub(r"\u3000+", " ", t)
    t = re.sub(r"[ \t]+", " ", t)
    t = re.sub(r"\n{3,}", "\n\n", t)
    return t.strip()
|
||
|
||
|
||
def extract_metadata_by_rule(text: str, lang: str = 'zh') -> dict[str, Any]:
    """Extract jurisdiction, case number and case location purely by rule.

    The court is determined by the first signal that succeeds:
      1. the case-number prefix (most reliable);
      2. a neutral citation;
      3. a court full/short name found in the first 2000 characters.

    Args:
        text: Judgment text.
        lang: Language code; ``'zh'`` selects the Chinese name table and
            default location, any other value selects the English ones.

    Returns:
        A dict with keys ``jurisdiction_code``, ``jurisdiction_name``,
        ``case_location`` (always the Hong Kong SAR default) and
        ``case_number``; fields that cannot be determined stay ``None``.
    """
    # Pick the name table and default location for the requested language.
    jurisdiction_map = JURISDICTION_MAP_ZH if lang == 'zh' else JURISDICTION_MAP_EN
    default_location = ["香港特別行政區"] if lang == 'zh' else ["Hong Kong Special Administrative Region"]

    meta: dict[str, Any] = {
        "jurisdiction_code": None,
        "jurisdiction_name": None,
        "case_location": default_location,
        "case_number": None,
    }

    # 1. Case number: record it and derive the court from its prefix.
    if m := CASE_NO_RE.search(text):
        meta["case_number"] = re.sub(r"\s+", " ", m.group(0).strip())
        code = CASE_NO_PREFIX_MAP.get(m.group(1).upper())
        if code is not None:
            meta["jurisdiction_code"] = code
            meta["jurisdiction_name"] = jurisdiction_map.get(code)

    # 2. Neutral citation (only if the case number did not settle the court).
    if not meta["jurisdiction_code"]:
        if m := NEUTRAL_CITATION_RE.search(text):
            cited = m.group(2).upper()
            # The regex is case-insensitive; map back to the table's spelling.
            for key, court_name in jurisdiction_map.items():
                if key.upper() == cited:
                    meta["jurisdiction_code"] = key
                    meta["jurisdiction_name"] = court_name
                    break

    # 3. Court name lookup, restricted to the header so that courts named in
    #    cited authorities further down do not interfere.
    if not meta["jurisdiction_code"]:
        header = text[:2000]
        for code in jurisdiction_map:
            # Search both the Chinese and the English names, whatever `lang` is.
            full_zh = JURISDICTION_MAP_ZH.get(code, "")
            full_en = JURISDICTION_MAP_EN.get(code, "")
            short_zh = full_zh.replace("香港特別行政區", "")
            # Strip the residue left by removing the region suffix; without
            # it the short name ends in a space and cannot match at the end
            # of a line. NOTE: matching is case-sensitive.
            short_en = (full_en.replace("Hong Kong Special Administrative Region", "")
                        .replace(" of the ", " ")
                        .strip())

            candidates = [full_zh, short_zh, full_en, short_en]
            if any(candidate and candidate in header for candidate in candidates):
                meta["jurisdiction_code"] = code
                meta["jurisdiction_name"] = jurisdiction_map[code]
                break

    return meta
|
||
|
||
|
||
# -----------------------------------------------------------------------------
# Keyword + window recall (replaces brittle regex-based segmentation)
# -----------------------------------------------------------------------------
# Idea: each extraction target defines a set of high-signal keywords; the whole
# text is scanned and a window of ±half_window characters around every hit is
# taken, overlapping windows are merged, and the result is fed to the LLM.
# This does not rely on fixed section headings, so it works across the varied
# layouts of Hong Kong judgments.
#
# Tuning (based on an analysis of real cases):
# 1. Parties: taken directly from the start of the judgment (usually inside
#    the BETWEEN...AND structure)
# 2. Judgment result: taken preferentially from the end of the judgment
#    (usually in the JUDGMENT/ORDER part)
# 3. Other fields: keyword recall

KEYWORD_GROUPS: dict[str, list[str]] = {
    # Call 1: parties - no longer used; the head of the text is sliced instead.
    # Kept only for backward compatibility; gather_all does not read it.
    "parties": [],

    # Call 2: cause and subject matter
    "reason_object": [
        # Section-heading style
        "案情", "背景", "引言", "事實", "案件背景", "案由",
        "INTRODUCTION", "BACKGROUND", "FACTS", "THE FACTS", "General course",
        # Claim / assertion style
        "申索", "索償", "訴因", "聲稱", "請求", "本案涉及", "本訴訟",
        "原告聲稱", "申索人聲稱", "申索人指稱", "上訴人指", "答辯人指",
        "Plaintiff", "Claimant", "Appellant", "claim", "allege",
        # Subject-matter keywords
        "賠償", "損失", "損害", "精神困擾", "經濟損失", "醫療費",
        "履行", "所有權", "占有", "撤銷", "宣告", "damages", "compensation",
    ],

    # Call 3: judgment result - no keywords; the tail of the text is sliced.
    # Kept only for backward compatibility.
    "judgment_result": [],

    # Call 4: involved entities (judges, lawyers, judges in cited cases)
    "entities": [
        # Judicial titles
        "法官", "大法官", "審裁官", "裁判官", "首席法官", "常任法官", "非常任法官",
        "Hon.", " J.", " JA ", " CJ ", " PJ ", " NPJ ", "Coroner", "Judge",
        # Representation
        "代表", "大律師", "律師", "資深大律師", "代表律師",
        "Counsel", "Solicitor", "instructed by", "represented by",
        # Case citations (judge names tend to appear nearby)
        " v ", " v. ", " 訴 ", "[19", "[20", "HKCFA", "HKCA", "HKCFI",
    ],

    # Call 5: court analysis (core input for the summary)
    "analysis": [
        # Markers of the court's own view
        "本席認為", "本席接納", "本席不接納", "本席同意", "本席不同意",
        "本席裁定", "本席拒絕", "本席認同", "本席考慮",
        "本庭認為", "本庭接納", "本庭裁定", "本庭認同", "本庭考慮",
        "I find", "I accept", "I do not accept", "I conclude", "I consider",
        "The court finds", "In my view", "In my judgment", "The Court held",
        # Legal principles
        "舉證責任", "審慎責任", "鄰人原則", "替代責任", "合理疑點",
        "違反", "侵權", "過失", "negligence", "breach", "duty of care",
        # Assessment of evidence
        "證據顯示", "根據證據", "證人證供", "可信", "不可信",
        "evidence shows", "testimony", "credible", "reliable",
    ],
}
|
||
|
||
|
||
def gather_chunks(text: str,
                  keywords: list[str],
                  half_window: int = 500,
                  max_total: int = 6500,
                  case_sensitive: bool = False) -> tuple[str, int]:
    """Recall the text surrounding every keyword occurrence.

    A window of ±``half_window`` characters is taken around each occurrence
    of each keyword (literal match), overlapping or touching windows are
    merged, and the merged segments are joined in document order with a
    ``[…]`` separator until ``max_total`` characters have been used.

    Args:
        text: Full text to search.
        keywords: Literal keywords; empty strings are ignored.
        half_window: Characters kept on each side of a hit.
        max_total: Budget for the summed segment lengths (separators are not
            counted).
        case_sensitive: Match case exactly when true; otherwise ignore case.

    Returns:
        ``(joined_text, hit_count)`` where ``hit_count`` is the number of
        keyword occurrences found (not the number of distinct keywords).
        With no hits the first ``max_total`` characters of ``text`` are
        returned with a count of 0; empty ``text`` yields ``("", 0)``.
    """
    if not text:
        return "", 0

    flags = 0 if case_sensitive else re.IGNORECASE
    text_len = len(text)
    hits: list[tuple[int, int]] = []
    for kw in keywords:
        if not kw:
            # An empty pattern matches at every position and would select
            # the whole document while inflating the hit count.
            continue
        for m in re.finditer(re.escape(kw), text, flags=flags):
            hits.append((max(0, m.start() - half_window),
                         min(text_len, m.end() + half_window)))

    if not hits:
        return text[:max_total], 0

    # Merge overlapping (or touching) windows.
    hits.sort()
    merged: list[list[int]] = []
    for start, end in hits:
        if merged and start <= merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])

    # Concatenate in document order while respecting the length budget.
    pieces: list[str] = []
    used = 0
    for start, end in merged:
        seg_len = end - start
        if used + seg_len > max_total:
            remain = max_total - used
            # Keep a truncated segment only if it is still long enough to be
            # useful; either way nothing after it is added.
            if remain > 200:
                pieces.append(text[start:start + remain])
            break
        pieces.append(text[start:end])
        used += seg_len

    return "\n\n[…]\n\n".join(pieces), len(hits)
|
||
|
||
|
||
def gather_all(text: str,
               head_length: int = 5000,
               tail_length: int = 5000,
               entities_window: int = 400,
               entities_max: int = 6500,
               analysis_window: int = 500,
               analysis_max: int = 6500) -> dict[str, str]:
    """Build the context snippet for every extraction group.

    Strategy:
      1. ``parties``: the first ``head_length`` characters (no keyword recall).
      2. ``reason_object``: the first ``head_length`` characters (no keyword
         recall).
      3. ``judgment_result``: head + tail joined by a ``[…]`` marker; when
         the two slices would overlap or together cover the whole text, the
         full text is used once instead of duplicating the overlap.
      4. ``entities`` / ``analysis``: keyword recall via ``gather_chunks``.

    Args:
        text: Full judgment text.
        head_length: Length of the leading slice (default 5000).
        tail_length: Length of the trailing slice (default 5000).
        entities_window: Half-window for entity keywords (default 400).
        entities_max: Maximum total length of the entity context (default 6500).
        analysis_window: Half-window for analysis keywords (default 500).
        analysis_max: Maximum total length of the analysis context (default 6500).

    Returns:
        A dict mapping each group name to its context, plus a
        ``_<group>_hits`` entry per group holding the hit count as a string
        ("0" for the groups that are sliced rather than recalled).
    """
    out: dict[str, str] = {}
    head_text = text[:head_length]

    # 1. Parties: straight from the head.
    out["parties"] = head_text
    out["_parties_hits"] = "0"  # no keyword recall, so recorded as 0

    # 2. Cause and subject matter: straight from the head.
    out["reason_object"] = head_text
    out["_reason_object_hits"] = "0"  # no keyword recall, so recorded as 0

    # 3. Judgment result: head + tail.
    if tail_length <= 0:
        # text[-0:] would be the whole text, so handle "no tail" explicitly.
        judgment_ctx = head_text
    elif len(text) <= head_length + tail_length:
        # Head and tail would overlap or meet: send the text once.
        judgment_ctx = text
    else:
        judgment_ctx = head_text + "\n\n[…]\n\n" + text[-tail_length:]
    out["judgment_result"] = judgment_ctx
    out["_judgment_result_hits"] = "0"  # sliced directly, hits not counted

    # 4. Remaining groups: keyword recall.
    params: dict[str, tuple[int, int]] = {
        "entities": (entities_window, entities_max),
        "analysis": (analysis_window, analysis_max),
    }
    for group, (half_window, max_total) in params.items():
        ctx, hits = gather_chunks(text, KEYWORD_GROUPS[group],
                                  half_window=half_window, max_total=max_total)
        out[group] = ctx
        out[f"_{group}_hits"] = str(hits)

    return out
|
||
|
||
|
||
# =============================================================================
# 2. OpenAI-compatible client: supports Ollama / OpenRouter / OpenAI, etc.
# =============================================================================

@dataclass
class OpenAICompatibleClient:
    """Minimal client for OpenAI-compatible ``/chat/completions`` endpoints.

    Intended for:
    - Ollama (http://localhost:11434/v1)
    - OpenRouter (https://openrouter.ai/api/v1)
    - OpenAI (https://api.openai.com/v1)
    - other OpenAI-compatible services
    """
    model: str = DEFAULT_MODEL
    base_url: str = DEFAULT_BASE_URL
    api_key: str = DEFAULT_API_KEY
    timeout: int = DEFAULT_TIMEOUT

    # Token usage accumulated across all calls (retries included).
    total_input_tokens: int = field(default=0, init=False)
    total_output_tokens: int = field(default=0, init=False)
    num_calls: int = field(default=0, init=False)

    @property
    def total_tokens(self) -> int:
        """Sum of accumulated input and output tokens."""
        return self.total_input_tokens + self.total_output_tokens

    def chat_json(self, system: str, user: str, schema: dict,
                  temperature: float = 0.0,
                  max_tokens: int = 4096) -> dict:
        """Send one chat request in JSON mode and return the parsed reply.

        Args:
            system: System prompt.
            user: User prompt.
            schema: JSON Schema of the expected reply. NOTE: it is accepted
                for interface compatibility but is not sent; the request only
                asks for the generic ``json_object`` response format.
            temperature: Sampling temperature.
            max_tokens: Completion token limit.

        Returns:
            The reply content parsed with ``json.loads``.

        Raises:
            requests.exceptions.RequestException: Transport or HTTP error.
            ValueError: The response body is not JSON, has no ``choices``,
                or carries empty content (``json.JSONDecodeError`` is a
                ``ValueError``).

        Every failure is also described on stderr before being re-raised.
        """
        url = f"{self.base_url.rstrip('/')}/chat/completions"

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }

        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "temperature": temperature,
            "max_tokens": max_tokens,
            "response_format": {"type": "json_object"},  # OpenAI-compatible JSON mode
        }

        try:
            r = requests.post(url, json=payload, headers=headers, timeout=self.timeout)
            r.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"❌ API 請求失敗:{e}", file=sys.stderr)
            print(f" URL: {url}", file=sys.stderr)
            print(f" Model: {self.model}", file=sys.stderr)
            raise

        try:
            response_data = r.json()
        except ValueError:
            # ValueError covers json.JSONDecodeError as well as the decode
            # errors raised by alternative JSON backends.
            print("❌ API 響應不是有效的 JSON", file=sys.stderr)
            print(f" 響應狀態碼: {r.status_code}", file=sys.stderr)
            print(f" 響應內容: {r.text[:500]}", file=sys.stderr)
            raise

        envelope = response_data if isinstance(response_data, dict) else {}

        # Accumulate token usage (OpenAI-compatible endpoints usually report
        # it under "usage").
        usage = envelope.get("usage") or {}
        self.total_input_tokens += int(usage.get("prompt_tokens", 0) or 0)
        self.total_output_tokens += int(usage.get("completion_tokens", 0) or 0)
        self.num_calls += 1

        choices = envelope.get("choices")
        if not choices:
            print("❌ API 響應缺少 choices 字段", file=sys.stderr)
            print(f" 響應數據: {json.dumps(response_data, ensure_ascii=False, indent=2)[:500]}", file=sys.stderr)
            raise ValueError("API 響應格式錯誤:缺少 choices 字段")

        # Tolerate a missing/null "message" so the failure is reported as an
        # empty reply instead of an opaque KeyError.
        first_choice = choices[0] if isinstance(choices[0], dict) else {}
        content = (first_choice.get("message") or {}).get("content")

        if not isinstance(content, str) or not content.strip():
            print("❌ 模型返回空內容", file=sys.stderr)
            print(f" 完整響應: {json.dumps(response_data, ensure_ascii=False, indent=2)[:1000]}", file=sys.stderr)
            raise ValueError("模型返回空內容")

        # Remove a possible <think> prefix / markdown code fence.
        content = self._clean_json_response(content)

        try:
            return json.loads(content)
        except json.JSONDecodeError as e:
            # Dump both ends of the content to help debugging.
            print("❌ JSON 解析失敗", file=sys.stderr)
            print(f" 錯誤: {e}", file=sys.stderr)
            print(f" 原始內容(前500字符):\n{content[:500]}", file=sys.stderr)
            print(f" 原始內容(後500字符):\n{content[-500:]}", file=sys.stderr)
            raise

    def _clean_json_response(self, content: str) -> str:
        """Strip wrappers that models put around the JSON payload.

        Handles, in order:
        - a leading ``<think>...</think>`` block (any letter case; only at
          the very start, and only when it is closed);
        - a leading code fence line such as ```` ```json ```` or ```` ``` ````;
        - a trailing ```` ``` ```` fence;
        - surrounding whitespace.
        """
        content = content.strip()

        # Non-greedy so only the first think block is removed; DOTALL so the
        # block may span lines. re.match anchors at the start of the string.
        think = re.match(r'<think>.*?</think>\s*', content, flags=re.DOTALL | re.IGNORECASE)
        if think:
            content = content[think.end():].strip()

        # Drop the opening fence line (everything up to the first newline).
        if content.startswith("```"):
            first_newline = content.find("\n")
            if first_newline != -1:
                content = content[first_newline + 1:]

        # Drop the closing fence.
        if content.endswith("```"):
            content = content[:-3]

        return content.strip()

    def chat_json_with_retry(self, system: str, user: str, schema: dict,
                             validator=None, **kw) -> dict:
        """Call ``chat_json`` up to ``MAX_RETRIES + 1`` times.

        Args:
            system: System prompt.
            user: User prompt.
            schema: Passed through to ``chat_json``.
            validator: Optional callable ``validator(result) -> (ok, hint)``.
                When it rejects a result, ``hint`` is appended to the user
                prompt and the call is repeated.
            **kw: Extra keyword arguments for ``chat_json``.

        Returns:
            The first result that passes validation (or the first result at
            all when there is no validator). If the final attempt completed
            but still failed validation, its result is returned as a best
            effort.

        Raises:
            Exception: Whatever the final attempt raised, if it raised.
        """
        last_err = None
        out = None
        # Always make at least one attempt, so `out` or `last_err` is set.
        for _ in range(max(1, MAX_RETRIES + 1)):
            try:
                out = self.chat_json(system, user, schema, **kw)
                # This call succeeded: an error from an earlier attempt is
                # stale and must not be raised after the loop.
                last_err = None
                if validator is None:
                    return out
                ok, hint = validator(out)
                if ok:
                    return out
                # Feed the validation problem back to the model.
                user = (f"{user}\n\n上次輸出存在問題:{hint}\n"
                        f"請修正後重新輸出。")
            except Exception as e:
                last_err = e
        if last_err:
            raise last_err
        return out  # type: ignore
|
||
|
||
|
||
# =============================================================================
# 3. Five extraction calls: each one handles a single group of fields
# =============================================================================

# --- Call 1: parties ---------------------------------------------------------

# JSON Schema for the parties reply: two required arrays of names.
PARTIES_SCHEMA = {
    "type": "object",
    "properties": {
        "plaintiff": {"type": "array", "items": {"type": "string"}},
        "defendant": {"type": "array", "items": {"type": "string"}},
    },
    "required": ["plaintiff", "defendant"],
}
|
||
|
||
# 中文提示詞
|
||
PARTIES_SYSTEM_ZH = """你是香港法律文書信息抽取助手。
|
||
從給定的判決書開頭部分抽取所有當事人完整姓名/機構名。
|
||
|
||
格式識別:
|
||
1. 英文格式:BETWEEN ... AND ...
|
||
2. 中文格式:申請人 ... 對/訴 答辯人 ...
|
||
3. 混合格式:Plaintiff ... Defendant ...
|
||
|
||
分類規則:
|
||
- 原告/申索人/上訴人/覆核申請人/Plaintiff/Appellant/Claimant/Applicant → plaintiff
|
||
- 被告/答辯人/被上訴人/Defendant/Respondent → defendant
|
||
- 保留中英文對照(如有)
|
||
- 某類無則輸出空數組
|
||
|
||
只輸出符合 schema 的 JSON,不要解釋。"""
|
||
|
||
PARTIES_FEWSHOT_ZH = """範例1(原告/被告格式):
|
||
BETWEEN
|
||
陳大文 (CHAN TAI MAN) 上訴人
|
||
AND
|
||
香港房屋委員會 (Hong Kong Housing Authority) 答辯人
|
||
|
||
輸出:
|
||
{"plaintiff":["陳大文 (CHAN TAI MAN)"],"defendant":["香港房屋委員會 (Hong Kong Housing Authority)"]}
|
||
|
||
範例2(申請人/答辯人格式):
|
||
申請人:
|
||
李小明
|
||
答辯人:
|
||
入境事務處處長
|
||
|
||
輸出:
|
||
{"plaintiff":["李小明"],"defendant":["入境事務處處長"]}"""
|
||
|
||
# English prompts
PARTIES_SYSTEM_EN = """You are a Hong Kong legal document information extraction assistant.
Extract all complete names/organization names of parties from the beginning of the judgment.

Format Recognition:
1. English format: BETWEEN ... AND ...
2. Chinese format: 申請人 ... 對/訴 答辯人 ...
3. Mixed format: Plaintiff ... Defendant ...

Classification Rules:
- Plaintiff/Claimant/Appellant/Applicant/原告/申索人/上訴人/覆核申請人 → plaintiff
- Defendant/Respondent/被告/答辯人/被上訴人 → defendant
- Preserve bilingual names (if any)
- Output empty array if none

Output only JSON conforming to schema, no explanation."""

PARTIES_FEWSHOT_EN = """Example 1 (Plaintiff/Defendant format):
BETWEEN
Dr Paul KI Ping-ki 1st Plaintiff
Hong Kong Washington Company 2nd Plaintiff
AND
Next Magazine Publishing Ltd 1st Defendant

Output:
{"plaintiff":["Dr Paul KI Ping-ki","Hong Kong Washington Company"],"defendant":["Next Magazine Publishing Ltd"]}

Example 2 (Applicant/Respondent format):
Between:
MO YUK PING
Applicant
and
HONG KONG SPECIAL ADMINISTRATIVE REGION
Respondent

Output:
{"plaintiff":["MO YUK PING"],"defendant":["HONG KONG SPECIAL ADMINISTRATIVE REGION"]}"""
|
||
|
||
|
||
def extract_parties(client: OpenAICompatibleClient, context: str, lang: str = 'zh') -> dict:
    """Call 1: extract plaintiff/defendant names from the judgment header.

    Args:
        client: Client used for the chat call.
        context: Header text; only its first 5000 characters are sent.
        lang: ``'zh'`` selects the Chinese prompts, any other value the
            English ones.

    Returns:
        Whatever ``client.chat_json_with_retry`` returns for
        ``PARTIES_SCHEMA`` (no validator is applied).
    """
    header = context[:5000]

    if lang == 'zh':
        system = PARTIES_SYSTEM_ZH
        user = f"{PARTIES_FEWSHOT_ZH}\n\n請從以下判決書開頭部分抽取:\n```\n{header}\n```"
    else:
        system = PARTIES_SYSTEM_EN
        user = f"{PARTIES_FEWSHOT_EN}\n\nPlease extract from the following judgment header:\n```\n{header}\n```"

    return client.chat_json_with_retry(system, user, PARTIES_SCHEMA)
|
||
|
||
|
||
# --- Call 2: cause + subject matter ------------------------------------------

def get_reason_object_schema(lang: str = 'zh') -> dict:
    """Return the JSON Schema for the cause/subject-matter reply.

    ``case_reason`` is capped at 100 characters for ``'zh'`` and at 200
    characters (twice as many) for any other language.
    """
    max_length = 100 if lang == 'zh' else 200
    return {
        "type": "object",
        "properties": {
            "case_reason": {"type": "string", "maxLength": max_length},
            "case_object": {"type": "array", "items": {"type": "string"}},
        },
        "required": ["case_reason", "case_object"],
    }
|
||
|
||
# Chinese prompts
# A plain (non-f) string: the prompt has no placeholders, and an f-string
# would break as soon as a literal brace is added to the text.
REASON_OBJECT_SYSTEM_ZH = """從香港判決書中抽取:

1. case_reason(事由):
- 嚴格 ≤100 字,單句,清晰完整
- 結構:[原告身份] + [針對什麼事件/行為] + [向誰] + [提出什麼請求]
- 覆核/上訴案件須註明對哪個裁決提出覆核(含日期/案號)
- 嚴禁包含:判決結果、法庭分析、案發細節、證據評估
- 只描述訴訟的起因和請求,不涉及法庭的判斷

2. case_object(標的物):
- 訴訟請求指向的實體權利或利益
- 例:汽車、黃金、珠寶、不動產產權、撫養權、人身傷害賠償、合同履行、房產所有權、精神困擾賠償、居留權
- 涉及金錢標的時必須提取,並標明幣種與具體金額(如「拖欠貨款 HK$850,000」「索償 HK$1,000,000」「拖欠租金 HK$120,000」);金額未定或待評定者註明「金額待評定」
- 合併本質相同的標的
- 嚴禁:證據材料、程序性訴求(如"要求法庭裁決")、法律條文名稱

用語要求:一律使用規範的法律專業用語(如「申索」「損害賠償」「違約」「侵權」「衡平法濟助」),避免口語化或不準確的表述。

只輸出 JSON。"""

REASON_OBJECT_FEWSHOT_ZH = """範例輸出1(人身傷害):
{"case_reason":"申索人為商場保安員,就被告於2023年7月在商場毆打申索人造成的人身傷害向被告提出損害賠償申索。","case_object":["人身傷害賠償","醫療費用","精神困擾賠償"]}

範例輸出2(金錢申索,標的物須含具體金額):
{"case_reason":"原告就被告未支付2022年買賣合約項下的貨款,向被告提出追討欠款的申索。","case_object":["拖欠貨款 HK$850,000","合約利息","訟費"]}"""
|
||
|
||
# English prompts
# A plain (non-f) string: the prompt has no placeholders. The length limit is
# stated in characters because both the JSON schema (maxLength) and the
# validator enforce 200 characters for English; asking for "200 words"
# invited answers that were then rejected and retried.
REASON_OBJECT_SYSTEM_EN = """Extract from Hong Kong judgment:

1. case_reason (Cause of Action):
- Strictly ≤200 characters, single sentence, clear and complete
- Structure: [Plaintiff's identity] + [regarding what event/conduct] + [against whom] + [what relief sought]
- For judicial review/appeal cases, specify which decision is being challenged (with date/case number)
- MUST NOT include: judgment results, court analysis, incident details, evidence assessment
- Only describe the cause and relief sought, not the court's determination

2. case_object (Subject Matter):
- Tangible rights or interests targeted by the claim
- Examples: vehicle, gold, jewelry, property title, custody, personal injury damages, contract performance, property ownership, distress damages, right of abode
- When a monetary subject matter is involved, it MUST be extracted with currency and the specific amount (e.g., "outstanding goods price HK$850,000", "claim of HK$1,000,000", "arrears of rent HK$120,000"); if the amount is unascertained, note "amount to be assessed"
- Merge essentially identical subjects
- MUST NOT include: evidentiary materials, procedural requests (e.g., "seeking court ruling"), names of statutes

Terminology requirement: consistently use precise, standard legal terminology (e.g., "claim", "damages", "breach of contract", "negligence", "equitable relief"); avoid colloquial or imprecise wording.

Output only JSON."""

REASON_OBJECT_FEWSHOT_EN = """Example Output 1 (personal injury):
{"case_reason":"Plaintiff, a shopping mall security guard, claims damages from defendant for personal injuries caused by defendant's assault at the mall in July 2023.","case_object":["personal injury damages","medical expenses","distress damages"]}

Example Output 2 (monetary claim, subject matter must carry the specific amount):
{"case_reason":"Plaintiff claims against defendant for non-payment of the price of goods supplied under a 2022 sale and purchase contract.","case_object":["outstanding goods price HK$850,000","contractual interest","costs"]}"""
|
||
|
||
|
||
# Money detection: currency prefix + number, a bare dollar sign + number, or
# number + Chinese amount unit. Every alternative requires a leading digit so
# that a stray comma (e.g. ",元") is not taken for an amount. The dollar class
# covers both the ASCII "$" and the full-width "＄" (written as an escape so
# it survives Unicode normalisation of the source file).
MONEY_RE = re.compile(
    r"(?:HK\$|US\$|RMB|MOP|港幣|港元|人民幣|美元|美金)\s*\d[\d,]*(?:\.\d+)?"
    r"|[$\uFF04]\s*\d[\d,]*(?:\.\d+)?"
    r"|\d[\d,]*(?:\.\d+)?\s*(?:萬元|億元|元|萬|億)",
    re.I,
)
|
||
|
||
|
||
def _object_has_amount(objs: list[str]) -> bool:
|
||
"""case_object 中是否已含具體金額(任一項出現數字即視為已提取金額)"""
|
||
return any(re.search(r"\d", o or "") for o in (objs or []))
|
||
|
||
|
||
def _reason_object_validator(out: dict, lang: str = 'zh',
|
||
context: str = "") -> tuple[bool, str]:
|
||
r = out.get("case_reason", "")
|
||
max_length = 100 if lang == 'zh' else 200 # 英文允許 2 倍字符數
|
||
target_length = 80 if lang == 'zh' else 160 # 建議壓縮目標
|
||
|
||
if len(r) > max_length:
|
||
if lang == 'zh':
|
||
return False, f"case_reason 共 {len(r)} 字,超過 {max_length} 字上限,請壓縮到 {target_length} 字以內。"
|
||
else:
|
||
return False, f"case_reason has {len(r)} characters, exceeds {max_length} limit, please compress to within {target_length}."
|
||
if not out.get("case_object"):
|
||
if lang == 'zh':
|
||
return False, "case_object 不能為空。"
|
||
else:
|
||
return False, "case_object cannot be empty."
|
||
|
||
# 檢查是否包含判決結果性詞彙(嚴禁)
|
||
RESULT_KEYWORDS = [
|
||
"駁回", "拒絕", "勝訴", "敗訴", "維持", "撤銷", "發還",
|
||
"判給", "獲判", "判處", "部分勝訴",
|
||
"dismissed", "allowed", "granted", "refused", "upheld", "quashed",
|
||
]
|
||
for keyword in RESULT_KEYWORDS:
|
||
if keyword in r:
|
||
if lang == 'zh':
|
||
return False, f"case_reason 不應包含判決結果詞彙「{keyword}」,請只描述訴訟起因和請求。"
|
||
else:
|
||
return False, f"case_reason should not contain judgment result term '{keyword}', describe only cause and relief sought."
|
||
|
||
# 金錢標的物強制提取:原文出現金錢數額但 case_object 未含金額時要求補充
|
||
if context and MONEY_RE.search(context) and not _object_has_amount(out.get("case_object", [])):
|
||
if lang == 'zh':
|
||
return False, ("原文出現金錢數額。若該數額屬於訴訟標的(如欠款、索償、賠償金額),"
|
||
"必須在 case_object 中提取並標明幣種與具體金額(如「拖欠貨款 HK$850,000」);"
|
||
"若僅為無關引用則可忽略。")
|
||
else:
|
||
return False, ("Monetary amounts appear in the source. If an amount forms part of the "
|
||
"subject matter (e.g., debt, claim, damages), it MUST be extracted in "
|
||
"case_object with currency and the specific figure (e.g., \"outstanding "
|
||
"goods price HK$850,000\"); ignore only if it is an unrelated citation.")
|
||
|
||
return True, ""
|
||
|
||
|
||
def extract_reason_object(client: OpenAICompatibleClient, context: str, lang: str = 'zh') -> dict:
    """Call 2: extract the cause (``case_reason``) and subject matter
    (``case_object``) from the judgment header.

    Args:
        client: Client used for the chat call.
        context: Header text; only its first 5000 characters are sent and
            used for validation.
        lang: ``'zh'`` selects the Chinese prompts, any other value the
            English ones.

    Returns:
        The result of ``client.chat_json_with_retry``, validated with
        ``_reason_object_validator``.
    """
    header = context[:5000]
    schema = get_reason_object_schema(lang)

    if lang == 'zh':
        system = REASON_OBJECT_SYSTEM_ZH
        user = (f"{REASON_OBJECT_FEWSHOT_ZH}\n\n"
                f"請從以下判決書開頭部分抽取:\n```\n{header}\n```")
    else:
        system = REASON_OBJECT_SYSTEM_EN
        user = (f"{REASON_OBJECT_FEWSHOT_EN}\n\n"
                f"Please extract from the following judgment header:\n```\n{header}\n```")

    return client.chat_json_with_retry(
        system, user, schema,
        validator=lambda x: _reason_object_validator(x, lang, header))
|
||
|
||
|
||
# --- Call 3: judgment result -------------------------------------------------

# JSON Schema for the judgment-result reply: a required array of
# {charge, result} objects.
JUDGMENT_RESULT_SCHEMA = {
    "type": "object",
    "properties": {
        "judgment_result": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "charge": {"type": "string"},
                    "result": {"type": "string"},
                },
                "required": ["charge", "result"],
            },
        }
    },
    "required": ["judgment_result"],
}
|
||
|
||
# Chinese prompts
JUDGMENT_RESULT_SYSTEM_ZH = """從香港判決書尾部的命令/裁定部分抽取所有判決結果。

重要提示:
- 判決結果通常在判決書的最後部分
- 常見標記:JUDGMENT, ORDER, CONCLUSION, DISPOSITION, 判決, 命令, 裁定, 頒令
- 可能包含:勝訴/敗訴、具體金額、訟費安排、上訴結果

拆分原則:
- 多項請求 → 分條
- "責任判定" 與 "損失/金額計算" 兩個層面 → 必須分條
- 每條 charge 必須以 "(責任問題)" 或 "(損失範圍)" 結尾標註層次
- result 必須包含:
a) 明確結果(勝訴/敗訴/部分勝訴/維持/撤銷/駁回/發還等)
b) 2-3 個關鍵法庭理由(如有)
c) 具體金額、利率或命令內容;凡有判給/命令支付的金錢數額,必須原文照錄幣種與金額(如 HK$28,500),不得省略或約化

用語要求:一律使用規範的法律專業用語(如「判給」「訟費」「利息」「駁回」「發還重審」),避免口語化或不準確的表述。

只輸出 JSON。"""

JUDGMENT_RESULT_FEWSHOT_ZH = """範例輸出:
{"judgment_result":[
{"charge":"申索人就毆打事件的人身傷害索償 (責任問題)","result":"勝訴。法庭接納申索人證供可信,閉路電視顯示被告先動手,被告亦承認部分情節。"},
{"charge":"醫療費及精神困擾賠償金額 (損失範圍)","result":"部分勝訴。判給醫療費HK$8,500及一般損害賠償HK$20,000,合共HK$28,500,連同利息及訟費。"}
]}"""
|
||
|
||
# English prompts
JUDGMENT_RESULT_SYSTEM_EN = """Extract all judgment results from the order/disposition section at the end of Hong Kong judgment.

Important Notes:
- Judgment results are usually at the end of the judgment
- Common markers: JUDGMENT, ORDER, CONCLUSION, DISPOSITION, 判決, 命令, 裁定, 頒令
- May include: success/dismissal, specific amounts, costs arrangements, appeal results

Splitting Principles:
- Multiple claims → separate items
- "Liability determination" vs "Quantum/damages assessment" → must be separate items
- Each charge must end with "(liability issue)" or "(quantum issue)" to mark the level
- result must include:
a) Clear outcome (allowed/dismissed/partially allowed/upheld/quashed/remitted, etc.)
b) 2-3 key court reasons (if any)
c) Specific amounts, interest rates or order details; whenever a sum is awarded/ordered to be paid, the currency and figure MUST be reproduced verbatim (e.g., HK$28,500), never omitted or rounded

Terminology requirement: consistently use precise, standard legal terminology (e.g., "awarded", "costs", "interest", "dismissed", "remitted for retrial"); avoid colloquial or imprecise wording.

Output only JSON."""

JUDGMENT_RESULT_FEWSHOT_EN = """Example Output:
{"judgment_result":[
{"charge":"Plaintiff's claim for personal injury from assault (liability issue)","result":"Allowed. Court accepted plaintiff's testimony as credible, CCTV showed defendant struck first, defendant also admitted parts of the incident."},
{"charge":"Medical expenses and distress damages quantum (quantum issue)","result":"Partially allowed. Awarded medical expenses HK$8,500 and general damages HK$20,000, totaling HK$28,500, with interest and costs."}
]}"""
|
||
|
||
|
||
def _judgment_validator(out: dict | list, lang: str = 'zh') -> tuple[bool, str]:
|
||
# 处理模型直接返回列表的情况
|
||
if isinstance(out, list):
|
||
items = out
|
||
else:
|
||
items = out.get("judgment_result", [])
|
||
|
||
if not items:
|
||
if lang == 'zh':
|
||
return False, "judgment_result 不能為空。"
|
||
else:
|
||
return False, "judgment_result cannot be empty."
|
||
|
||
if lang == 'zh':
|
||
bad = [i for i in items
|
||
if "責任問題" not in i.get("charge", "")
|
||
and "損失範圍" not in i.get("charge", "")]
|
||
if bad:
|
||
return False, (f"有 {len(bad)} 條 charge 未標註層次。"
|
||
f"每條 charge 必須以 '(責任問題)' 或 '(損失範圍)' 結尾。")
|
||
else:
|
||
bad = [i for i in items
|
||
if "liability issue" not in i.get("charge", "").lower()
|
||
and "quantum issue" not in i.get("charge", "").lower()]
|
||
if bad:
|
||
return False, (f"{len(bad)} charge items lack level annotation. "
|
||
f"Each charge must end with '(liability issue)' or '(quantum issue)'.")
|
||
|
||
return True, ""
|
||
|
||
|
||
def extract_judgment_result(client: OpenAICompatibleClient, context: str, lang: str = 'zh') -> dict:
    """Run the judgment-result extraction call and normalise its reply.

    Args:
        client: Chat client exposing ``chat_json_with_retry``.
        context: Judgment excerpt to extract from.
        lang: ``'zh'`` for the Chinese prompts, anything else for English.

    Returns:
        The client's reply; a bare list reply is wrapped as
        ``{"judgment_result": [...]}``.
    """
    if lang == 'zh':
        system_prompt = JUDGMENT_RESULT_SYSTEM_ZH
        example = JUDGMENT_RESULT_FEWSHOT_ZH
        instruction = "請從以下判決書片段(開頭5000字符 + 尾部5000字符)抽取:"
    else:
        system_prompt = JUDGMENT_RESULT_SYSTEM_EN
        example = JUDGMENT_RESULT_FEWSHOT_EN
        instruction = "Please extract from the following judgment segments (first 5000 + last 5000 characters):"

    # Few-shot example first, then the instruction, then the fenced excerpt.
    user_prompt = f"{example}\n\n{instruction}\n```\n{context}\n```"

    def _check(candidate):
        return _judgment_validator(candidate, lang)

    reply = client.chat_json_with_retry(system_prompt, user_prompt,
                                        JUDGMENT_RESULT_SCHEMA,
                                        validator=_check)

    # Some models return the list without the wrapping object.
    if isinstance(reply, list):
        return {"judgment_result": reply}
    return reply
|
||
|
||
|
||
# --- Call 4: involved entities ----------------------------------------------

# JSON Schema handed to client.chat_json_with_retry() by extract_entities():
# an object with one required array whose items need both `entity_name` and
# `reason` strings.
ENTITIES_SCHEMA = {
    "type": "object",
    "properties": {
        "involved_entities": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "entity_name": {"type": "string"},
                    "reason": {"type": "string"},
                },
                "required": ["entity_name", "reason"],
            },
        }
    },
    "required": ["involved_entities"],
}

# Chinese prompts
# NOTE(review): leading indentation inside the prompt strings below could not
# be recovered from the extracted source (the "reason ..." line may originally
# have been indented under the preceding bullet) — confirm against the
# repository copy.
ENTITIES_SYSTEM_ZH = """從香港判決書中抽取所有相關實體(自然人/法人/組織/機構)。

必須包含:
- 主審法官 / 審裁官 / 裁判官(通常在判決書開頭或結尾署名)
- 雙方代表律師、大律師(通常在判決書結尾的 Representation 部分)
- 判決中引用的先例所提及的法官
reason 須寫明:在XX案[案號]中擔任XX職位,闡述XX法律原則
- 涉案的政府部門、公司、機構(如:入境事務處處長、律政司司長)

嚴禁包含:
- 法案/條例名(如《侵權條例》、Cap.xxx、《基本法》)
- 純案例名稱(如 Donoghue v Stevenson)
- 文獻、期刊名

用語要求:reason 一律使用規範的法律專業用語(如「主審」「闡述」「先例」「判詞」),避免口語化或不準確的表述。

只輸出 JSON。"""

# One-shot example placed at the start of the Chinese user message.
ENTITIES_FEWSHOT_ZH = """範例輸出:
{"involved_entities":[
{"entity_name":"林希維審裁官","reason":"本案主審審裁官,負責認定事實及裁決。"},
{"entity_name":"終審法院常任法官李義","reason":"在Tang Kwok Wah v HKSAR [2019] HKCFA 23 中擔任主筆法官,闡述舉證責任原則,本案第34段引用其判詞。"},
{"entity_name":"康樂文化事務署","reason":"涉案場所通州街公園的管理機構。"}
]}"""

# English prompts
ENTITIES_SYSTEM_EN = """Extract all relevant entities (natural persons/legal persons/organizations/institutions) from Hong Kong judgment.

Must include:
- Presiding judge/adjudicator/magistrate (usually signed at beginning or end of judgment)
- Counsel/barristers representing both parties (usually in Representation section at end)
- Judges mentioned in cited precedents
reason must specify: served as XX position in XX case [case number], articulated XX legal principle
- Government departments, companies, institutions involved (e.g., Director of Immigration, Secretary for Justice)

MUST NOT include:
- Statute/ordinance names (e.g., Tort Ordinance, Cap.xxx, Basic Law)
- Pure case names (e.g., Donoghue v Stevenson)
- Literature, journal names

Terminology requirement: write each reason in precise, standard legal terminology (e.g., "presiding", "articulated", "precedent", "judgment"); avoid colloquial or imprecise wording.

Output only JSON."""

# One-shot example placed at the start of the English user message.
ENTITIES_FEWSHOT_EN = """Example Output:
{"involved_entities":[
{"entity_name":"Hon Leong JA","reason":"Presiding judge in this case, responsible for fact-finding and adjudication."},
{"entity_name":"Chief Justice Li","reason":"Served as lead judge in Tang Kwok Wah v HKSAR [2019] HKCFA 23, articulated burden of proof principles, cited in paragraph 34 of this judgment."},
{"entity_name":"Leisure and Cultural Services Department","reason":"Management authority of Tung Chau Street Park, the incident location."}
]}"""
|
||
|
||
|
||
def _entities_validator(out: dict, lang: str = 'zh') -> tuple[bool, str]:
|
||
ents = out.get("involved_entities", [])
|
||
if not ents:
|
||
if lang == 'zh':
|
||
return False, "involved_entities 不能為空,至少要有主審法官。"
|
||
else:
|
||
return False, "involved_entities cannot be empty, must include at least the presiding judge."
|
||
|
||
# 檢查黑名單(條例、法案、案例名稱)
|
||
bad = []
|
||
for e in ents:
|
||
name = e.get("entity_name", "")
|
||
# 檢查是否包含黑名單關鍵詞
|
||
if any(k in name for k in ENTITY_NAME_BLACKLIST):
|
||
bad.append(name)
|
||
# 檢查是否為案例名稱格式(包含 v 或 訴)
|
||
if (" v " in name or " v. " in name or " 訴 " in name or
|
||
" vs " in name or " vs. " in name):
|
||
bad.append(name)
|
||
|
||
if bad:
|
||
if lang == 'zh':
|
||
return False, f"以下實體疑為條例/法案/案例名稱,應移除:{bad[:3]}"
|
||
else:
|
||
return False, f"Following entities appear to be statutes/acts/case names, should be removed: {bad[:3]}"
|
||
|
||
return True, ""
|
||
|
||
|
||
def extract_entities(client: OpenAICompatibleClient, context: str, lang: str = 'zh') -> dict:
    """Run the involved-entities extraction call.

    Args:
        client: Chat client exposing ``chat_json_with_retry``.
        context: Concatenated excerpts; only the first 6500 characters are
            sent to the model.
        lang: ``'zh'`` for the Chinese prompts, anything else for English.

    Returns:
        Whatever ``client.chat_json_with_retry`` returns for ENTITIES_SCHEMA.
    """
    snippet = context[:6500]

    if lang == 'zh':
        system_prompt = ENTITIES_SYSTEM_ZH
        example = ENTITIES_FEWSHOT_ZH
        instruction = "請從以下片段(多處關鍵詞召回拼接)抽取所有涉及實體:"
    else:
        system_prompt = ENTITIES_SYSTEM_EN
        example = ENTITIES_FEWSHOT_EN
        instruction = "Please extract all involved entities from the following segments (keyword-based retrieval):"

    user_prompt = f"{example}\n\n{instruction}\n```\n{snippet}\n```"

    def _check(candidate):
        return _entities_validator(candidate, lang)

    return client.chat_json_with_retry(system_prompt, user_prompt,
                                       ENTITIES_SCHEMA,
                                       validator=_check)
|
||
|
||
|
||
# --- Call 5: 判決總結(基於已抽取結果 + 分析段,不從原文重生) -----------
|
||
|
||
def get_summary_schema(lang: str = 'zh') -> dict:
    """Build the JSON Schema for the summary call.

    The ``judgment_summary`` string is capped at 300 characters for ``'zh'``
    and at 500 for every other language.
    """
    if lang == 'zh':
        limit = 300
    else:
        limit = 500  # English gets roughly 1.67x the character budget

    summary_field = {"type": "string", "maxLength": limit}
    return {
        "type": "object",
        "properties": {"judgment_summary": summary_field},
        "required": ["judgment_summary"],
    }
|
||
|
||
# Chinese prompt
# System prompt for the summary call; extract_summary() selects it when lang
# is 'zh'. The 300-character limit stated here matches get_summary_schema('zh').
# NOTE(review): leading indentation inside these prompt strings could not be
# recovered from the extracted source (the "- ..." sub-bullets under (3) may
# originally have been indented) — confirm against the repository copy.
SUMMARY_SYSTEM_ZH = """根據已抽取的結構化字段 + 法庭分析段,撰寫判決總結。

四要素結構(必須全部涵蓋,連貫成單段):
(1) 案件背景:1-2 句交代起因與當事人關係
(2) 核心爭議焦點
(3) 法庭法律分析與推理(核心重點):
- 如何評估證據?
- 接受 / 拒絕主張的邏輯?
- 引用了哪些關鍵法律或判例?
(4) 最終裁決結果及命令

**重要:judgment_summary 必須使用中文撰寫。**

用語要求:一律使用規範的法律專業用語;涉及金錢的判給或標的,須保留具體金額(含幣種)。

嚴格 ≤300 字。只輸出 JSON。"""

# English prompt
# Used by extract_summary() for every lang other than 'zh'; its 500-character
# limit matches get_summary_schema() for non-'zh' languages.
SUMMARY_SYSTEM_EN = """Based on extracted structured fields + court analysis section, write judgment summary.

Four-element structure (must cover all, in coherent single paragraph):
(1) Case background: 1-2 sentences on cause and parties' relationship
(2) Core issues in dispute
(3) Court's legal analysis and reasoning (core focus):
- How was evidence assessed?
- Logic for accepting/rejecting claims?
- What key laws or precedents were cited?
(4) Final judgment and orders

**IMPORTANT: judgment_summary MUST be written in English.**

Terminology requirement: consistently use precise, standard legal terminology; preserve specific monetary figures (with currency) for any award or subject matter involving money.

Strictly ≤500 characters. Output only JSON."""
|
||
|
||
|
||
def _summary_validator(out: dict, lang: str = 'zh') -> tuple[bool, str]:
|
||
s = out.get("judgment_summary", "")
|
||
max_length = 300 if lang == 'zh' else 500 # 英文允許約 1.67 倍字符數
|
||
min_length = 80 if lang == 'zh' else 120 # 英文最小長度也相應增加
|
||
|
||
if len(s) > max_length:
|
||
if lang == 'zh':
|
||
return False, f"summary 共 {len(s)} 字,超過 {max_length} 字上限,請壓縮。"
|
||
else:
|
||
return False, f"summary has {len(s)} characters, exceeds {max_length} limit, please compress."
|
||
if len(s) < min_length:
|
||
if lang == 'zh':
|
||
return False, "summary 過短,請完整覆蓋四要素。"
|
||
else:
|
||
return False, "summary too short, please cover all four elements."
|
||
|
||
# 檢查語言是否正確
|
||
chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', s))
|
||
total_chars = len(s.strip())
|
||
if total_chars > 0:
|
||
chinese_ratio = chinese_chars / total_chars
|
||
if lang == 'zh' and chinese_ratio < 0.3:
|
||
return False, "judgment_summary 必須使用中文撰寫,但檢測到主要為英文內容,請用中文重寫。"
|
||
elif lang == 'en' and chinese_ratio > 0.3:
|
||
return False, "judgment_summary MUST be written in English, but detected primarily Chinese content. Please rewrite in English."
|
||
|
||
return True, ""
|
||
|
||
|
||
def extract_summary(client: OpenAICompatibleClient,
                    prior: dict, analysis: str, lang: str = 'zh') -> dict:
    """Run the summary call from already-extracted fields plus an analysis excerpt.

    Args:
        client: Chat client exposing ``chat_json_with_retry``.
        prior: Fields extracted by the earlier calls; serialised as indented
            JSON into the prompt.
        analysis: Court-analysis excerpt; only the first 3500 characters are
            sent.
        lang: ``'zh'`` for the Chinese prompt, anything else for English.

    Returns:
        Whatever ``client.chat_json_with_retry`` returns for the summary
        schema.
    """
    system = SUMMARY_SYSTEM_ZH if lang == 'zh' else SUMMARY_SYSTEM_EN
    schema = get_summary_schema(lang)
    # Read the limit from the schema that is sent alongside the prompt, so the
    # number quoted to the model cannot drift from the enforced maxLength.
    # (Previously a local max_length was computed but never used.)
    max_length = schema["properties"]["judgment_summary"]["maxLength"]

    fields_json = json.dumps(prior, ensure_ascii=False, indent=2)
    excerpt = analysis[:3500]

    if lang == 'zh':
        user = (f"已抽取的字段:\n```json\n{fields_json}\n```\n\n"
                f"法庭分析節選:\n```\n{excerpt}\n```\n\n"
                f"請按四要素撰寫 ≤{max_length} 字的 judgment_summary。")
    else:
        user = (f"Extracted fields:\n```json\n{fields_json}\n```\n\n"
                f"Court analysis excerpt:\n```\n{excerpt}\n```\n\n"
                f"Please write judgment_summary ≤{max_length} characters covering four elements.")

    return client.chat_json_with_retry(system, user, schema,
                                       validator=lambda x: _summary_validator(x, lang))
|
||
|
||
|
||
# =============================================================================
|
||
# 4. 全局校驗與後處理
|
||
# =============================================================================
|
||
|
||
# Substrings marking a case_location entry as a court, venue or building
# rather than a place name; matching entries are dropped by validate_and_fix().
LOCATION_BLACKLIST = [
    "法院", "法庭", "審裁處", "公園", "大廈", "大樓", "商場",
    "街", "道路", "村", "中心", "醫院", "酒店", "車站",
]

# Substrings marking an entity name as a statute, law report, journal or case
# citation rather than a person or organisation.
ENTITY_NAME_BLACKLIST = [
    "條例", "Cap.", "法案", "案例彙編", "Reports",
    "期刊", "Journal",
    # case-name markers
    " v ", " v. ", " 訴 ", " vs ", " vs. ",
    "HKCFAR", "HKCFA", "HKCA", "HKCFI",  # avoid treating case citations as entities
]


def validate_and_fix(result: dict, lang: str = 'zh') -> tuple[dict, list[str]]:
    """Clean the assembled result in place and collect review warnings.

    Steps: drop blacklisted ``case_location`` entries and make sure
    "香港特別行政區" is present; warn (without truncating) when ``case_reason``
    or ``judgment_summary`` exceeds its limit; drop blacklisted entities; warn
    about ``judgment_result`` items lacking a level marker; warn about empty
    key fields.

    Args:
        result: Assembled extraction result; modified in place.
        lang: ``'zh'`` uses the Chinese limits (100/300) and level markers;
            any other value uses the English limits (200/500) and markers.

    Returns:
        ``(result, warnings)`` — the same dict object and the warning list.
    """
    warnings: list[str] = []

    # case_location: drop courts / venues / buildings.
    locs = result.get("case_location") or []
    cleaned = [loc for loc in locs
               if loc and not any(b in loc for b in LOCATION_BLACKLIST)]
    if "香港特別行政區" not in cleaned:
        cleaned.insert(0, "香港特別行政區")
    removed = set(locs) - set(cleaned)
    if removed:
        # Only warn when something was actually dropped; merely inserting the
        # default region used to produce a misleading "移除 set()" message.
        warnings.append(f"case_location 已清理:移除 {removed}")
    result["case_location"] = cleaned

    # Length checks (warn only, never truncate).
    reason_max = 100 if lang == 'zh' else 200
    summary_max = 300 if lang == 'zh' else 500

    # `or ""` guards against an explicit None value, which len() rejects.
    reason_len = len(result.get("case_reason") or "")
    if reason_len > reason_max:
        warnings.append(f"⚠️ case_reason 共 {reason_len} 字,超過建議上限 {reason_max} 字")

    summary_len = len(result.get("judgment_summary") or "")
    if summary_len > summary_max:
        warnings.append(f"⚠️ judgment_summary 共 {summary_len} 字,超過建議上限 {summary_max} 字")

    # involved_entities: drop statutes / literature / case citations.
    ents = result.get("involved_entities") or []
    cleaned_ents = [e for e in ents
                    if not any(k in (e.get("entity_name") or "")
                               for k in ENTITY_NAME_BLACKLIST)]
    if len(cleaned_ents) != len(ents):
        warnings.append(
            f"involved_entities 移除 {len(ents) - len(cleaned_ents)} 條疑似條例/文獻")
    result["involved_entities"] = cleaned_ents

    # judgment_result: flag items without a level marker. The markers follow
    # the output language (same rule as _judgment_validator); checking only
    # the Chinese markers flagged every correctly annotated English item.
    if lang == 'zh':
        markers = ("責任問題", "損失範圍")
    else:
        markers = ("liability issue", "quantum issue")
    for jr in result.get("judgment_result", []) or []:
        charge = jr.get("charge") or ""
        if not isinstance(charge, str):
            charge = str(charge)
        haystack = charge if lang == 'zh' else charge.lower()
        if not any(marker in haystack for marker in markers):
            warnings.append(
                f"judgment_result 條目缺層次標註:{charge[:40]}")

    # Empty-field alerts.
    for k in ("plaintiff", "defendant", "case_object",
              "judgment_result", "involved_entities"):
        if not result.get(k):
            warnings.append(f"{k} 為空,請人工複核")

    return result, warnings
|
||
|
||
|
||
# =============================================================================
|
||
# 5. 主管線
|
||
# =============================================================================
|
||
|
||
def run_pipeline(text: str, model: str, base_url: str, api_key: str,
                 head_length: int = 5000,
                 tail_length: int = 5000,
                 entities_window: int = 400,
                 entities_max: int = 6500,
                 analysis_window: int = 500,
                 analysis_max: int = 6500,
                 log_prefix: str = "") -> tuple[dict, OpenAICompatibleClient]:
    """Run the full extraction pipeline for one model.

    Detects the language, cleans the text, gathers per-call context, runs the
    five extraction calls in order (parties, reason/object, judgment result,
    entities, summary), assembles the final record and post-validates it.
    Progress and warnings are written to stderr, each line prefixed with
    ``log_prefix``.

    Args:
        text: Raw judgment text.
        model, base_url, api_key: Connection parameters for the chat client.
        head_length, tail_length, entities_window, entities_max,
        analysis_window, analysis_max: Forwarded unchanged to ``gather_all``.
        log_prefix: Prefix for every log line.

    Returns:
        ``(final, client)`` — the validated result dict and the client that
        was used for all calls.
    """
    def log(message: str) -> None:
        print(f"{log_prefix}{message}", file=sys.stderr)

    log("[0/7] 檢測語言...")
    lang = detect_language(text)
    log(f" 檢測到語言:{'中文' if lang == 'zh' else '英文'} (lang={lang})")

    log("[1/7] 預處理 + 關鍵詞召回...")
    text = clean_text(text)
    meta = extract_metadata_by_rule(text, lang)
    ctx = gather_all(text, head_length, tail_length,
                     entities_window, entities_max,
                     analysis_window, analysis_max)

    log(f" 規則元數據:{meta}")
    log(" 召回片段:")
    for g in ("parties", "reason_object", "judgment_result",
              "entities", "analysis"):
        hits = ctx[f'_{g}_hits']
        # A hit count of "0" is reported as "taken directly" instead.
        hits_info = f"hits={hits}" if hits != "0" else "直接截取"
        log(f" {g:16s} len={len(ctx[g]):5d} {hits_info}")

    client = OpenAICompatibleClient(model=model, base_url=base_url, api_key=api_key)

    log("[2/7] 抽取當事人...")
    parties = extract_parties(client, ctx["parties"], lang)

    log("[3/7] 抽取事由與標的...")
    reason_obj = extract_reason_object(client, ctx["reason_object"], lang)

    log("[4/7] 抽取判決結果...")
    judgment = extract_judgment_result(client, ctx["judgment_result"], lang)

    log("[5/7] 抽取涉及實體...")
    # Entity context: the parties excerpt (which includes counsel names)
    # followed by the keyword-recalled citation excerpts.
    entities_ctx = (ctx["parties"][:2500] + "\n\n[…]\n\n"
                    + ctx["entities"])[:6500]
    entities = extract_entities(client, entities_ctx, lang)

    interim_for_summary = {
        **parties, **reason_obj, **judgment, **entities,
        "jurisdiction_name": meta["jurisdiction_name"],
    }

    log("[6/7] 撰寫判決總結...")
    summary = extract_summary(client, interim_for_summary, ctx["analysis"], lang)

    final = {
        "plaintiff": parties["plaintiff"],
        "defendant": parties["defendant"],
        "jurisdiction_code": meta["jurisdiction_code"],
        "jurisdiction_name": meta["jurisdiction_name"],
        "case_location": meta["case_location"],
        "case_reason": reason_obj["case_reason"],
        "case_object": reason_obj["case_object"],
        "judgment_result": judgment["judgment_result"],
        "judgment_summary": summary["judgment_summary"],
        "involved_entities": entities["involved_entities"],
    }

    log("[7/7] 校驗與後處理...")
    final, warnings = validate_and_fix(final, lang)
    for w in warnings:
        # Some warnings already start with the warning glyph; do not print it
        # a second time.
        log(f" {w}" if w.startswith("⚠️") else f" ⚠️ {w}")

    return final, client
|
||
|
||
|
||
# =============================================================================
|
||
# 5.5 成本統計:根據 models.json 計費價格計算本次抽取消耗
|
||
# =============================================================================
|
||
|
||
def compute_cost(client: OpenAICompatibleClient,
                 elapsed_seconds: float,
                 profile: dict | None) -> dict:
    """Compute the cost of one extraction run from token usage and pricing.

    Prices are per million tokens::

        input_cost  = input_tokens  / 1_000_000 * input_price
        output_cost = output_tokens / 1_000_000 * output_price

    When ``profile`` is None (no --config given) every price and cost field
    in the returned dict is None.
    """
    tokens_in = client.total_input_tokens
    tokens_out = client.total_output_tokens

    if profile is None:
        price_in = price_out = unit = None
        cost_in = cost_out = cost_total = None
    else:
        # Missing or falsy prices count as 0.
        price_in = float(profile.get("input_price") or 0)
        price_out = float(profile.get("output_price") or 0)
        unit = profile.get("price_unit")
        cost_in = round(tokens_in / 1_000_000 * price_in, 6)
        cost_out = round(tokens_out / 1_000_000 * price_out, 6)
        cost_total = round(cost_in + cost_out, 6)

    report = {}
    report["config_name"] = profile.get("Name") if profile else None
    report["source"] = profile.get("source") if profile else None
    report["model"] = client.model
    report["elapsed_seconds"] = round(elapsed_seconds, 3)
    report["num_api_calls"] = client.num_calls
    report["input_tokens"] = tokens_in
    report["output_tokens"] = tokens_out
    report["total_tokens"] = client.total_tokens
    report["input_price_per_million"] = price_in
    report["output_price_per_million"] = price_out
    report["price_unit"] = unit
    report["input_cost"] = cost_in
    report["output_cost"] = cost_out
    report["total_cost"] = cost_total
    return report
|
||
|
||
|
||
# =============================================================================
|
||
# 6. YAML 輸出(長字串用 > 折疊;含特殊字符的自動雙引號)
|
||
# =============================================================================
|
||
|
||
class FoldedStr(str):
    """Marker ``str`` subclass: values of this type are emitted in YAML folded (``>``) style."""
|
||
|
||
|
||
def _folded_str_representer(dumper, data):
|
||
return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=">")
|
||
|
||
|
||
def _safe_str_representer(dumper, data):
|
||
"""含 :, #, - 開頭的字符串強制雙引號"""
|
||
if data and (":" in data or data.startswith("#") or data.startswith("- ")):
|
||
return dumper.represent_scalar("tag:yaml.org,2002:str", data,
|
||
style='"')
|
||
return dumper.represent_scalar("tag:yaml.org,2002:str", data)
|
||
|
||
|
||
# Module-level side effect: register the two representers with PyYAML at
# import time. NOTE(review): the `str` registration presumably affects every
# yaml.dump() call in the process that uses the default dumper — confirm this
# is intended if the module is imported as a library.
yaml.add_representer(FoldedStr, _folded_str_representer)
yaml.add_representer(str, _safe_str_representer)
|
||
|
||
|
||
def to_yaml(result: dict) -> str:
    """Serialise the result to YAML, folding the two long free-text fields.

    ``case_reason`` and ``judgment_summary`` are emitted in folded (``>``)
    style when non-empty. Key order is preserved and non-ASCII text is
    written as-is.

    Args:
        result: Final extraction result. It is not modified.

    Returns:
        The YAML document as a string.
    """
    # Work on a shallow copy so the caller's dict keeps its plain-str values;
    # wrapping them in place would leak FoldedStr into later uses of `result`.
    doc = result.copy()
    for key in ("case_reason", "judgment_summary"):
        if doc.get(key):
            doc[key] = FoldedStr(doc[key])
    return yaml.dump(doc, allow_unicode=True, sort_keys=False,
                     default_flow_style=False, width=100)
|
||
|
||
|
||
# =============================================================================
|
||
# 7. 多模型運行:解析運行規格 + 輸出路徑推導
|
||
# =============================================================================
|
||
|
||
@dataclass
class RunSpec:
    """Run specification for one model: label, connection parameters and billing profile."""
    label: str  # used as log prefix and in output file names (config name or model name)
    model: str
    base_url: str
    api_key: str
    profile: dict | None = None  # models.json profile (used for cost calculation); None when absent
|
||
|
||
|
||
def _safe_label(name: str) -> str:
|
||
"""把配置/模型名轉成可安全用於文件名的標籤(如 anthropic/claude → anthropic_claude)"""
|
||
safe = re.sub(r"[^0-9A-Za-z._-]+", "_", name).strip("_")
|
||
return safe or "model"
|
||
|
||
|
||
def parse_run_specs(args) -> list[RunSpec]:
    """Turn --config / --model (both comma-separable) into a list of run specs.

    Precedence:
    - with --config: each name is loaded via ``load_model_profile``; the
      profile's ``model`` / ``BaseApiUrl`` / ``ApiKey`` are used, falling back
      to --model / --base-url / --api-key when a profile value is empty;
    - otherwise: one spec per --model entry, all sharing --base-url and
      --api-key, with no billing profile.
    """
    def _split(raw: str) -> list[str]:
        return [part.strip() for part in raw.split(",") if part.strip()]

    if args.config:
        from_profiles: list[RunSpec] = []
        for config_name in _split(args.config):
            profile = load_model_profile(config_name, args.models_file)
            from_profiles.append(RunSpec(
                label=config_name,
                model=profile.get("model") or args.model,
                base_url=profile.get("BaseApiUrl") or args.base_url,
                api_key=profile.get("ApiKey") or args.api_key,
                profile=profile,
            ))
        return from_profiles

    # No usable entry after splitting: fall back to the raw --model value.
    model_names = _split(args.model) or [args.model]
    return [
        RunSpec(
            label=model_name,
            model=model_name,
            base_url=args.base_url,
            api_key=args.api_key,
            profile=None,
        )
        for model_name in model_names
    ]
|
||
|
||
|
||
def build_out_path(args, input_path: Path, label: str, multi: bool) -> Path | None:
    """Derive the YAML output path for one model.

    - Single model: ``--out`` as given (None means stdout), except that a
      single ``--config`` combined with ``--out`` gets the label inserted
      into the file name.
    - Multiple models: the label is always inserted; without ``--out`` the
      path is ``<input stem>_<label>.yaml`` next to the input file.
    """
    out_arg = args.out
    label_in_name = multi or bool(args.config and out_arg)

    if not label_in_name:
        if out_arg:
            return Path(out_arg)
        return None

    tag = _safe_label(label)
    if not out_arg:
        return input_path.with_name(f"{input_path.stem}_{tag}.yaml")

    target = Path(out_arg)
    extension = target.suffix or '.yaml'
    return target.with_name(f"{target.stem}_{tag}{extension}")
|
||
|
||
|
||
def build_cost_path(out_path: Path | None, input_path: Path,
                    label: str, multi: bool) -> Path:
    """Derive the cost-report path.

    With an output file it is ``<output stem>_cost.json`` beside it;
    otherwise it falls back to the input file's name, with the label inserted
    when several models run.
    """
    if out_path is None:
        if multi:
            stem = f"{input_path.stem}_{_safe_label(label)}"
        else:
            stem = input_path.stem
        return input_path.with_name(stem + "_cost.json")

    return out_path.with_name(out_path.stem + "_cost.json")
|
||
|
||
|
||
def build_debug_path(args, label: str, multi: bool) -> Path | None:
    """Derive the debug-dump path; with several models the label is inserted
    into the file name. Returns None when ``--debug-dump`` was not given."""
    dump_arg = args.debug_dump
    if not dump_arg:
        return None

    target = Path(dump_arg)
    if not multi:
        return target

    extension = target.suffix or '.json'
    return target.with_name(f"{target.stem}_{_safe_label(label)}{extension}")
|
||
|
||
|
||
# =============================================================================
|
||
# CLI
|
||
# =============================================================================
|
||
|
||
def main() -> None:
    """CLI entry point.

    Parses the command line, resolves one run spec per requested model, reads
    the judgment text (plain text, or the ``content`` field of a .json file)
    and runs the pipeline for each spec. Several specs run concurrently in a
    thread pool unless ``--max-workers 1`` is given. Each run writes its YAML
    (or prints it to stdout), plus the optional cost report and debug dump.
    """
    # NOTE(review): leading indentation inside the epilog string could not be
    # recovered from the extracted source (command continuation lines may
    # originally have been indented) — confirm against the repository copy.
    ap = argparse.ArgumentParser(
        description="香港判決書結構化抽取(OpenAI 兼容 API)",
        epilog="""
示例用法:
# 使用 models.json 中的配置名稱(推薦,省去多個參數)
python hk_case_extractor.py case.txt --config openrouter-claude-sonnet --out result.yaml
# 成本統計會寫入 result_cost.json

# 使用本地 Ollama
python hk_case_extractor.py case.txt --model qwen2.5:7b-instruct

# 使用 OpenRouter
python hk_case_extractor.py case.txt \\
--base-url https://openrouter.ai/api/v1 \\
--model anthropic/claude-3.5-sonnet \\
--api-key your-api-key

# 使用 OpenAI
python hk_case_extractor.py case.txt \\
--base-url https://api.openai.com/v1 \\
--model gpt-4 \\
--api-key your-api-key

# 同時跑多個模型(逗號分隔),並發執行,分別輸出到不同文件
python hk_case_extractor.py case.txt \\
--config openrouter-claude-sonnet,openrouter-gpt4o,ollama-qwen \\
--out result.yaml --cost
# 生成 result_openrouter-claude-sonnet.yaml / result_openrouter-gpt4o.yaml / ...
# 及各自的 *_cost.json

# 多個本地 Ollama 模型共用同一端點
python hk_case_extractor.py case.txt \\
--model qwen2.5:7b-instruct,llama3.1:8b --out result.yaml

# 調整截取長度
python hk_case_extractor.py case.txt \\
--head-length 8000 \\
--tail-length 8000 \\
--entities-max 10000 \\
--analysis-max 10000
""",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    ap.add_argument("input", help="判決書文本路徑(.txt 或 .json)")
    ap.add_argument("--config", default=None,
                    help="models.json 中的配置名稱(Name),可逗號分隔多個以同時運行多個模型,"
                         "使用後可省略 --model/--base-url/--api-key")
    ap.add_argument("--models-file", default=DEFAULT_MODELS_FILE,
                    help=f"模型配置文件路徑(默認:{DEFAULT_MODELS_FILE})")
    ap.add_argument("--model", default=DEFAULT_MODEL,
                    help=f"模型名稱(默認:{DEFAULT_MODEL}),可逗號分隔多個(共用 --base-url/--api-key)")
    ap.add_argument("--base-url", default=DEFAULT_BASE_URL,
                    help=f"API base URL(默認:{DEFAULT_BASE_URL})")
    ap.add_argument("--api-key", default=DEFAULT_API_KEY,
                    help="API key(Ollama 可忽略)")
    ap.add_argument("--out", default=None,
                    help="輸出 YAML 路徑(單模型默認 stdout);多模型時在文件名中插入標籤")
    ap.add_argument("--max-workers", type=int, default=0,
                    help="多模型時的並發數(默認 0 = 模型數量;設為 1 則順序執行)")
    ap.add_argument("--cost", action="store_true",
                    help="輸出成本統計到 {輸出文件名}_cost.json(默認不輸出)")
    ap.add_argument("--debug-dump", default=None,
                    help="額外輸出原始 JSON 結果到該路徑(多模型時在文件名中插入標籤)")

    # Excerpt-length controls
    ap.add_argument("--head-length", type=int, default=5000,
                    help="開頭截取長度(默認:5000)")
    ap.add_argument("--tail-length", type=int, default=5000,
                    help="尾部截取長度(默認:5000)")
    ap.add_argument("--entities-window", type=int, default=400,
                    help="實體關鍵詞窗口半徑(默認:400)")
    ap.add_argument("--entities-max", type=int, default=6500,
                    help="實體片段最大總長度(默認:6500)")
    ap.add_argument("--analysis-window", type=int, default=500,
                    help="分析關鍵詞窗口半徑(默認:500)")
    ap.add_argument("--analysis-max", type=int, default=6500,
                    help="分析片段最大總長度(默認:6500)")

    args = ap.parse_args()

    # Resolve run specs (--config and --model both accept comma-separated lists).
    specs = parse_run_specs(args)
    multi = len(specs) > 1
    for spec in specs:
        if spec.profile is not None:
            print(f"使用配置 '{spec.label}':model={spec.model}, base_url={spec.base_url}",
                  file=sys.stderr)

    # A .json input is read from its "content" field; anything else is read
    # as plain UTF-8 text.
    input_path = Path(args.input)
    if input_path.suffix.lower() == '.json':
        data = json.loads(input_path.read_text(encoding="utf-8"))
        text = data.get("content", "")
        if not text:
            print("錯誤:JSON 文件中沒有 'content' 字段", file=sys.stderr)
            sys.exit(1)
    else:
        text = input_path.read_text(encoding="utf-8")

    def run_and_emit(spec: RunSpec) -> str:
        """Run one model, write its YAML / cost / debug files, and return a
        description of where the YAML went (a path, or "(stdout)")."""
        log_prefix = f"[{spec.label}] " if multi else ""
        start = time.perf_counter()
        result, client = run_pipeline(
            text, spec.model, spec.base_url, spec.api_key,
            args.head_length, args.tail_length,
            args.entities_window, args.entities_max,
            args.analysis_window, args.analysis_max,
            log_prefix=log_prefix)
        elapsed = time.perf_counter() - start

        out_path = build_out_path(args, input_path, spec.label, multi)

        # Cost report: written only with --cost, to <output name>_cost.json.
        if args.cost:
            cost = compute_cost(client, elapsed, spec.profile)
            cost_path = build_cost_path(out_path, input_path, spec.label, multi)
            cost_path.parent.mkdir(parents=True, exist_ok=True)
            cost_path.write_text(json.dumps(cost, ensure_ascii=False, indent=2),
                                 encoding="utf-8")
            print(f"{log_prefix}💰 成本統計已寫入 {cost_path}:耗時 {cost['elapsed_seconds']}s,"
                  f"input={cost['input_tokens']} output={cost['output_tokens']} "
                  f"total_cost={cost['total_cost']} {cost['price_unit'] or ''}",
                  file=sys.stderr)

        # Debug dump (raw JSON). Written before to_yaml(), which wraps two
        # fields of `result` in FoldedStr.
        debug_path = build_debug_path(args, spec.label, multi)
        if debug_path is not None:
            debug_path.parent.mkdir(parents=True, exist_ok=True)
            debug_path.write_text(json.dumps(result, ensure_ascii=False, indent=2),
                                  encoding="utf-8")

        # YAML output: to the derived file, or to stdout when there is none.
        yaml_str = to_yaml(result)
        if out_path is not None:
            out_path.parent.mkdir(parents=True, exist_ok=True)
            out_path.write_text(yaml_str, encoding="utf-8")
            print(f"{log_prefix}✅ 已寫入 {out_path}", file=sys.stderr)
            return str(out_path)
        print(yaml_str)
        return "(stdout)"

    if multi and args.max_workers != 1:
        # Concurrent run: one task per model; log lines are told apart by the
        # [label] prefix. A non-positive --max-workers means one worker per model.
        workers = args.max_workers if args.max_workers > 0 else len(specs)
        print(f"⏳ 同時運行 {len(specs)} 個模型(並發 {workers}):"
              f"{[s.label for s in specs]}", file=sys.stderr)
        summary: list[tuple[str, str]] = []
        with ThreadPoolExecutor(max_workers=workers) as ex:
            futures = {ex.submit(run_and_emit, spec): spec for spec in specs}
            for fut in as_completed(futures):
                spec = futures[fut]
                try:
                    summary.append((spec.label, fut.result()))
                except Exception as e:
                    # One model failing must not stop the others; record it
                    # in the summary instead.
                    print(f"[{spec.label}] ❌ 運行失敗:{e}", file=sys.stderr)
                    summary.append((spec.label, f"FAILED: {e}"))
        print("\n=== 多模型運行結果 ===", file=sys.stderr)
        for label, out in sorted(summary):
            print(f" {label:30s} → {out}", file=sys.stderr)
    else:
        # Single model, or several models run sequentially via --max-workers 1.
        for spec in specs:
            try:
                run_and_emit(spec)
            except Exception as e:
                # A single-model failure propagates; in a sequential
                # multi-model run it is reported and the loop continues.
                if not multi:
                    raise
                print(f"[{spec.label}] ❌ 運行失敗:{e}", file=sys.stderr)
||
|
||
|
||
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|