main
fengruixiang 2026-05-13 18:28:44 +08:00
parent 3033b3aaeb
commit 011adb00d7
8 changed files with 1118 additions and 1 deletions

4
.gitignore vendored
View File

@ -1,3 +1,5 @@
*.doc
*.docx
*.html
*.html
result.yaml

1
.python-version 100644
View File

@ -0,0 +1 @@
3.13

0
README.md 100644
View File

View File

@ -0,0 +1,776 @@
"""
hk_case_extractor.py
==========================================================
香港判決書結構化字段抽取管線
基於 本地 Ollama 小模型 + 分階段抽取 + JSON Schema 強制 + 校驗重試
設計理念
--------
不直接把幾萬字餵給模型,而是:
1. 預處理:規則去噪 + 切段,純規則抽司法區域/案號等高確定性字段
2. 定位:每個字段只截取對應的高信號區段(通常 < 4k)
3. 分組抽取:拆成 5 次獨立 Ollama 調用,每次只負責 1-3 個字段
4. Schema 強制:用 Ollama 0.5+ 的 format=<JSON Schema> 約束輸出
5. 校驗+重試:對字數、黑名單、結構標註逐項校驗
6. judgment_summary 不從原文重生,而從前 4 步結果 + 一段分析段生成
依賴
----
pip install requests pyyaml
本地需運行ollama serve
模型ollama pull qwen2.5:7b-instruct 推薦中文抽取甜點
ollama pull glm4:9b
使用
----
python hk_case_extractor.py <input.txt>
python hk_case_extractor.py case.txt --model qwen2.5:7b-instruct --out result.yaml
"""
from __future__ import annotations
import argparse
import json
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any
import requests
import yaml
# =============================================================================
# 配置
# =============================================================================
# Chat endpoint that every extraction call POSTs its payload to.
# NOTE(review): the module docstring talks about a local `ollama serve`, but
# this default is a remote HTTPS host — confirm which one is intended.
OLLAMA_URL = "https://openai.iconsz.com/ollama3090/api/chat"
# Model name used when the CLI is not given --model.
DEFAULT_MODEL = "qwen2.5:7b-instruct"
# Value handed to requests.post(timeout=...) for each chat request.
DEFAULT_TIMEOUT = 240
# Extra attempts after the first one: the retry loop runs MAX_RETRIES + 1 times.
MAX_RETRIES = 2
# =============================================================================
# 1. 預處理:去噪 + 切段 + 規則抽元數據
# =============================================================================
# Court abbreviation (as used in neutral citations) -> full Chinese court name.
# Insertion order matters: the full-name fallback in extract_metadata_by_rule
# walks this dict in order and stops at the first name found in the text.
# HKMC and HKMagC intentionally map to the same court name.
JURISDICTION_MAP: dict[str, str] = {
"HKCFA": "香港特別行政區終審法院",
"HKCA": "香港特別行政區高等法院上訴法庭",
"HKCFI": "香港特別行政區高等法院原訟法庭",
"HKDC": "香港特別行政區區域法院",
"HKMC": "香港特別行政區裁判法院",
"HKMagC": "香港特別行政區裁判法院",
"HKSCT": "香港特別行政區小額錢債審裁處",
"HKLT": "香港特別行政區土地審裁處",
"HKLD": "香港特別行政區勞資審裁處",
"HKCT": "香港特別行政區競爭事務審裁處",
"HKCorC": "香港特別行政區死因裁判法庭",
}
# Matches a neutral citation such as "[2023] HKCFI 123", case-insensitively.
# Group 1 = year, group 2 = court abbreviation, group 3 = running number.
NEUTRAL_CITATION_RE = re.compile(
r"\[(\d{4})\]\s*(HKCFA|HKCA|HKCFI|HKDC|HKMagC|HKMC|HKSCT|HKLT|HKLD|HKCT|HKCorC)\s*(\d+)",
re.I,
)
# Matches a case number: a known case-type prefix, an optional "NO."/"NO",
# a number, then "OF" or "/" and a four-digit year (case-insensitive).
# The pattern has no word boundary in front of the prefix.
CASE_NO_RE = re.compile(
r"(FACV|FACC|FAMV|FAMC|CACV|CACC|HCA|HCAL|HCMP|HCCW|HCB|DCCJ|DCMP|SCTC|LBTC|LDPD)"
r"\s*(?:NO\.?)?\s*\d+\s*(?:OF|/|\sof\s)\s*\d{4}",
re.I,
)
def clean_text(raw: str) -> str:
    """Strip page furniture and normalise whitespace in a judgment text.

    Steps, in order:
      1. remove "Page X of Y" markers (case-insensitive);
      2. blank out lines that consist only of a "- N -" page number;
      3. turn runs of full-width (ideographic, U+3000) spaces into one space;
      4. collapse runs of ASCII spaces/tabs into one space;
      5. collapse three or more consecutive newlines into a blank line.

    Args:
        raw: The judgment text as read from the input file.

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    t = raw
    t = re.sub(r"Page\s+\d+\s+of\s+\d+", "", t, flags=re.I)
    t = re.sub(r"^\s*-\s*\d+\s*-\s*$", "", t, flags=re.M)
    # Full-width space: spelled as an explicit escape so the character cannot
    # be silently turned into an ASCII space by an editor or a copy/paste.
    t = re.sub(r"\u3000+", " ", t)
    t = re.sub(r"[ \t]+", " ", t)
    t = re.sub(r"\n{3,}", "\n\n", t)
    return t.strip()
def extract_metadata_by_rule(text: str) -> dict[str, Any]:
    """Derive court, case number and default location purely from regex rules.

    The court is taken from the first neutral citation found in ``text``; if
    that yields nothing, the first court in JURISDICTION_MAP whose full name
    (or the name without the "香港特別行政區" prefix) occurs in ``text`` is
    used. ``case_location`` always starts out as the Hong Kong SAR.

    Returns:
        A dict with the keys ``jurisdiction_code``, ``jurisdiction_name``,
        ``case_location`` and ``case_number``; values that could not be
        determined are ``None``.
    """
    jurisdiction_code = None
    jurisdiction_name = None

    citation = NEUTRAL_CITATION_RE.search(text)
    if citation is not None:
        wanted = citation.group(2).upper()
        # Map the matched abbreviation back to its canonical spelling.
        canonical = next(
            (key for key in JURISDICTION_MAP if key.upper() == wanted), None)
        if canonical is not None:
            jurisdiction_code = canonical
            jurisdiction_name = JURISDICTION_MAP[canonical]

    # Fallback: look the court up by its full (or prefix-less) name.
    if not jurisdiction_code:
        for key, full_name in JURISDICTION_MAP.items():
            short_name = full_name.replace("香港特別行政區", "")
            if full_name in text or short_name in text:
                jurisdiction_code = key
                jurisdiction_name = full_name
                break

    case_number = None
    case_match = CASE_NO_RE.search(text)
    if case_match is not None:
        # Collapse internal whitespace runs so the number reads uniformly.
        case_number = re.sub(r"\s+", " ", case_match.group(0).strip())

    return {
        "jurisdiction_code": jurisdiction_code,
        "jurisdiction_name": jurisdiction_name,
        "case_location": ["香港特別行政區"],
        "case_number": case_number,
    }
# -----------------------------------------------------------------------------
# 關鍵詞 + 窗口召回(取代脆弱的正則切段)
# -----------------------------------------------------------------------------
# 思路:每個抽取目標定義一組「高信號關鍵詞」,掃全文取所有命中位置周圍
# ±half_window 字符的窗口,合併重疊後拼接喂給 LLM。
# 不依賴判決書的固定章節標題,對結構各異的香港判決書都能工作。
# Recall keywords per extraction target. gather_all() iterates this dict and
# feeds each list to gather_chunks(), which collects the text around every hit.
KEYWORD_GROUPS: dict[str, list[str]] = {
# Call 1: parties. Mostly found in the heading, but may also show up in
# running text such as "本案中,原告...".
"parties": [
"BETWEEN", "介乎", " AND ",
"申索人", "原告", "原訴人", "上訴人", "覆核申請人", "覆核人", "申請人",
"被告", "答辯人", "被上訴人", "答辯方",
"Plaintiff", "Defendant", "Appellant", "Respondent", "Applicant",
],
# Call 2: cause of the claim and its subject matter.
"reason_object": [
# Section-heading style keywords.
"案情", "背景", "引言", "事實", "案件背景",
"INTRODUCTION", "BACKGROUND", "FACTS", "THE FACTS",
# Claim / allegation keywords.
"申索", "索償", "訴因", "聲稱", "請求", "本案涉及", "本訴訟",
"原告聲稱", "申索人聲稱", "申索人指稱",
# Subject-matter keywords.
"賠償", "損失", "損害", "精神困擾", "經濟損失", "醫療費",
"履行", "所有權", "占有", "撤銷", "宣告",
],
# Call 3: judgment result.
"judgment_result": [
# Order-type phrases.
"本席命令", "本庭命令", "本席裁定", "本庭裁定", "命令如下",
"DISPOSITION", "ORDER", "CONCLUSION", "I therefore order",
# Outcome words.
"判決", "判給", "獲判", "判處", "支付",
"勝訴", "敗訴", "部分勝訴", "駁回", "撤銷", "維持", "發還",
"ALLOWED", "DISMISSED", "GRANTED", "REFUSED",
# Concluding connectives.
"因此", "故此", "綜上",
],
# Call 4: involved entities (judges, lawyers, judges named in cited cases).
"entities": [
# Judicial titles.
"法官", "大法官", "審裁官", "裁判官", "首席法官", "常任法官", "非常任法官",
"Hon.", " J.", " JA ", " CJ ", " PJ ", " NPJ ",
# Representation.
"代表", "大律師", "律師", "資深大律師", "代表律師",
"Counsel", "Solicitor",
# Case citations (judge names tend to appear close to them).
" v ", " v. ", "[19", "[20",
],
# Call 5: the court's analysis (the core input for the summary).
# Note: generic one/two-character words such as "分析", "理由", "引用" are
# deliberately avoided — they also occur heavily in procedural paragraphs,
# tables of contents and citation indexes and would spread recall over the
# whole text. Phrase anchors of the form "本席/本庭 + verb", which are
# specific to the analysis sections, are used instead.
"analysis": [
"本席認為", "本席接納", "本席不接納", "本席同意", "本席不同意",
"本席裁定", "本席拒絕", "本席認同",
"本庭認為", "本庭接納", "本庭裁定",
"I find", "I accept", "I do not accept", "I conclude",
"The court finds", "In my view", "In my judgment",
"舉證責任", "審慎責任", "鄰人原則", "替代責任",
"違反", "侵權", "過失",
],
}
def gather_chunks(text: str,
                  keywords: list[str],
                  half_window: int = 500,
                  max_total: int = 6500,
                  case_sensitive: bool = False) -> tuple[str, int]:
    """Collect the text surrounding every keyword hit.

    For each occurrence of each keyword a window of ``half_window``
    characters on either side is taken. Overlapping or touching windows are
    merged, the merged windows are emitted in document order joined by a
    "[…]" separator, and emission stops once ``max_total`` characters of
    window text have been used (the separators are not counted).

    Args:
        text: Document to search.
        keywords: Literal strings to look for (regex-escaped internally).
            Empty strings are ignored.
        half_window: Characters of context kept before and after a hit.
        max_total: Upper bound on the amount of window text returned.
        case_sensitive: Match keywords case-sensitively when True.

    Returns:
        ``(context, hit_count)`` where ``hit_count`` is the number of
        individual keyword occurrences found (before merging). When nothing
        matches, the first ``max_total`` characters of ``text`` and 0.
    """
    if not text:
        return "", 0
    flags = 0 if case_sensitive else re.IGNORECASE
    hits: list[tuple[int, int]] = []
    for kw in keywords:
        if not kw:
            # An empty pattern matches at every position and would turn the
            # whole document into one giant "hit".
            continue
        for m in re.finditer(re.escape(kw), text, flags=flags):
            s = max(0, m.start() - half_window)
            e = min(len(text), m.end() + half_window)
            hits.append((s, e))
    if not hits:
        return text[:max_total], 0
    # Merge overlapping / touching windows.
    hits.sort()
    merged: list[list[int]] = []
    for s, e in hits:
        if merged and s <= merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], e)
        else:
            merged.append([s, e])
    # Emit in document order while respecting the total budget.
    pieces: list[str] = []
    total = 0
    for s, e in merged:
        seg_len = e - s
        if total + seg_len > max_total:
            remain = max_total - total
            # A short tail is normally not worth keeping, but if nothing has
            # been emitted yet a truncated first window still beats returning
            # an empty context for a document that did have hits.
            if remain > 200 or (not pieces and remain > 0):
                pieces.append(text[s:s + remain])
            break
        pieces.append(text[s:e])
        total += seg_len
    return "\n\n[…]\n\n".join(pieces), len(hits)
def gather_all(text: str) -> dict[str, str]:
    """Build the recalled context snippet for every keyword group.

    Returns:
        A dict holding, for each group in KEYWORD_GROUPS, the snippet under
        the group name and the hit count (as a string, for logging) under
        ``_<group>_hits``.
    """
    # (half_window, max_total) per group; parties uses a shorter window and
    # a smaller budget than the other groups.
    window_settings: dict[str, tuple[int, int]] = {
        "parties": (400, 3000),
        "reason_object": (500, 6000),
        "judgment_result": (500, 6500),
        "entities": (400, 6500),
        "analysis": (500, 6500),
    }
    contexts: dict[str, str] = {}
    for name in KEYWORD_GROUPS:
        half, budget = window_settings[name]
        snippet, hit_count = gather_chunks(
            text, KEYWORD_GROUPS[name], half_window=half, max_total=budget)
        contexts[name] = snippet
        contexts[f"_{name}_hits"] = str(hit_count)
    return contexts
# =============================================================================
# 2. Ollama 客戶端JSON Schema 強制 + 重試
# =============================================================================
@dataclass
class OllamaClient:
    """Minimal client for an Ollama-style chat endpoint with JSON output.

    Attributes are the model name, the endpoint URL and the per-request
    timeout passed to ``requests.post``.
    """

    model: str = DEFAULT_MODEL
    url: str = OLLAMA_URL
    timeout: int = DEFAULT_TIMEOUT

    def chat_json(self, system: str, user: str, schema: dict,
                  temperature: float = 0.0,
                  num_ctx: int = 8192) -> dict:
        """Send one non-streaming chat request and parse the reply as JSON.

        ``schema`` is sent as the request's ``format`` field so the server
        can constrain the output to that JSON Schema.

        Raises:
            requests.HTTPError: if the server answers with an error status.
            json.JSONDecodeError: if the reply is not valid JSON even after
                stripping a surrounding Markdown code fence.
        """
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "format": schema,
            "stream": False,
            "options": {"temperature": temperature, "num_ctx": num_ctx},
        }
        r = requests.post(self.url, json=payload, timeout=self.timeout)
        r.raise_for_status()
        content = r.json()["message"]["content"]
        try:
            return json.loads(content)
        except json.JSONDecodeError:
            # Retry once after peeling off a possible ```json ... ``` fence.
            stripped = re.sub(r"^```(?:json)?\s*|\s*```$", "",
                              content.strip(), flags=re.S)
            return json.loads(stripped)

    def chat_json_with_retry(self, system: str, user: str, schema: dict,
                             validator=None, **kw) -> dict:
        """Call :meth:`chat_json`, retrying on errors or failed validation.

        Args:
            system: System prompt.
            user: User prompt for the first attempt.
            schema: JSON Schema forwarded to :meth:`chat_json`.
            validator: Optional ``validator(result) -> (ok, hint)``. When it
                reports ``ok`` as false, the hint is appended to the original
                user prompt and the call is retried.
            **kw: Extra keyword arguments forwarded to :meth:`chat_json`.

        Returns:
            The first output that passes validation, or — if none does — the
            most recent output that was obtained (best effort), provided the
            final attempt did not itself fail.

        Raises:
            Exception: whatever the final attempt raised, if it failed.
        """
        prompt = user
        out = None
        last_err = None
        for _attempt in range(MAX_RETRIES + 1):
            try:
                candidate = self.chat_json(system, prompt, schema, **kw)
                if validator is None:
                    return candidate
                ok, hint = validator(candidate)
            except Exception as e:
                # Deliberately broad: network, HTTP, JSON and validator
                # failures are all treated as "try again".
                last_err = e
                continue
            out = candidate
            # This attempt worked, so an error from an earlier attempt must
            # not be raised later in place of the usable output.
            last_err = None
            if ok:
                return out
            # Feed the problem back, always starting from the original prompt
            # so stale hints from earlier attempts do not pile up.
            prompt = (f"{user}\n\n上次輸出存在問題:{hint}\n"
                      f"請修正後重新輸出。")
        if last_err is not None:
            raise last_err
        if out is None:
            # Only reachable when the loop ran zero times (MAX_RETRIES < 0).
            raise RuntimeError("chat_json_with_retry made no attempt")
        return out
# =============================================================================
# 3. 五次抽取調用:每次只負責一組字段
# =============================================================================
# --- Call 1: 當事人 ----------------------------------------------------------
# JSON Schema sent as the `format` of the parties call: an object with two
# required string arrays, `plaintiff` and `defendant`.
PARTIES_SCHEMA = {
"type": "object",
"properties": {
"plaintiff": {"type": "array", "items": {"type": "string"}},
"defendant": {"type": "array", "items": {"type": "string"}},
},
"required": ["plaintiff", "defendant"],
}
# System prompt for the parties call (runtime text; sent to the model as-is).
PARTIES_SYSTEM = """你是香港法律文書信息抽取助手。
從給定的判決書抬頭中抽取所有當事人完整姓名/機構名
- 原告/申索人/上訴人/覆核申請人 plaintiff
- 被告/答辯人/被上訴人 defendant
- 保留中英文對照如有
- 某類無則輸出空數組
只輸出符合 schema JSON不要解釋"""
# One worked input/output example that extract_parties() puts in front of the
# user prompt.
PARTIES_FEWSHOT = """範例輸入:
BETWEEN
陳大文 (CHAN TAI MAN) 上訴人
AND
香港房屋委員會 (Hong Kong Housing Authority) 答辯人
範例輸出
{"plaintiff":["陳大文 (CHAN TAI MAN)"],"defendant":["香港房屋委員會 (Hong Kong Housing Authority)"]}"""
def extract_parties(client: OllamaClient, context: str) -> dict:
    """Ask the model for plaintiffs and defendants found in ``context``.

    Only the first 3000 characters of ``context`` are sent, preceded by the
    few-shot example. Returns whatever the client call returns.
    """
    prompt_parts = [
        PARTIES_FEWSHOT,
        "\n\n請從以下片段(多處關鍵詞召回拼接)抽取:\n```\n",
        context[:3000],
        "\n```",
    ]
    user_prompt = "".join(prompt_parts)
    return client.chat_json_with_retry(
        PARTIES_SYSTEM, user_prompt, PARTIES_SCHEMA)
# --- Call 2: 事由 + 標的 ----------------------------------------------------
# JSON Schema for the cause/subject call: `case_reason` is a string capped at
# 100 characters by the schema, `case_object` is an array of strings; both
# are required.
REASON_OBJECT_SCHEMA = {
"type": "object",
"properties": {
"case_reason": {"type": "string", "maxLength": 100},
"case_object": {"type": "array", "items": {"type": "string"}},
},
"required": ["case_reason", "case_object"],
}
# System prompt for the cause/subject call (runtime text; sent as-is).
REASON_OBJECT_SYSTEM = """從香港判決書中抽取:
1. case_reason事由
- 嚴格 100 單句
- 結構[原告身份] + [針對什麼事件/行為] + [向誰] + [提出什麼請求]
- 覆核/上訴案件須註明對哪個裁決提出覆核含日期/案號
- 嚴禁包含判決結果法庭分析案發細節
2. case_object標的物
- 訴訟請求指向的實體權利或利益
- 人身傷害賠償合同履行房產所有權精神困擾賠償
- 合併本質相同的標的
- 嚴禁證據材料程序性訴求"要求法庭裁決"
只輸出 JSON"""
# Example output that extract_reason_object() puts in front of the user prompt.
REASON_OBJECT_FEWSHOT = """範例輸出:
{"case_reason":"申索人為商場保安員就被告於2023年7月在商場毆打申索人造成的人身傷害向被告提出損害賠償申索。","case_object":["人身傷害賠償","醫療費用","精神困擾賠償"]}"""
def _reason_object_validator(out: dict) -> tuple[bool, str]:
r = out.get("case_reason", "")
if len(r) > 100:
return False, f"case_reason 共 {len(r)} 字,超過 100 字上限,請壓縮到 80 字以內。"
if not out.get("case_object"):
return False, "case_object 不能為空。"
return True, ""
def extract_reason_object(client: OllamaClient, context: str) -> dict:
    """Ask the model for the case reason and the subject matter of the claim.

    At most the first 6000 characters of ``context`` are sent. If the
    returned ``case_reason`` is still longer than 100 characters after the
    retries, it is cut to 100 characters.
    """
    excerpt = context[:6000]
    prompt = "".join([
        REASON_OBJECT_FEWSHOT,
        "\n\n",
        "請從以下片段(多處關鍵詞召回拼接)抽取:\n```\n",
        excerpt,
        "\n```",
    ])
    extracted = client.chat_json_with_retry(
        REASON_OBJECT_SYSTEM,
        prompt,
        REASON_OBJECT_SCHEMA,
        validator=_reason_object_validator,
    )
    reason = extracted["case_reason"]
    if len(reason) > 100:
        # Hard cap as a last resort when the model kept overshooting.
        extracted["case_reason"] = reason[:100]
    return extracted
# --- Call 3: 判決結果 -------------------------------------------------------
# JSON Schema for the judgment-result call: a required `judgment_result`
# array whose items are objects with required string fields `charge` and
# `result`.
JUDGMENT_RESULT_SCHEMA = {
"type": "object",
"properties": {
"judgment_result": {
"type": "array",
"items": {
"type": "object",
"properties": {
"charge": {"type": "string"},
"result": {"type": "string"},
},
"required": ["charge", "result"],
},
}
},
"required": ["judgment_result"],
}
# System prompt for the judgment-result call (runtime text; sent as-is).
JUDGMENT_RESULT_SYSTEM = """從香港判決書命令/裁定部分抽取所有判決結果。
拆分原則
- 多項請求 分條
- "責任判定" "損失/金額計算" 兩個層面 必須分條
- 每條 charge 必須以 "(責任問題)" "(損失範圍)" 結尾標註層次
- result 必須包含
a) 明確結果勝訴/敗訴/部分勝訴/維持/撤銷/駁回等
b) 2-3 個關鍵法庭理由
c) 具體金額利率或命令內容如有
只輸出 JSON"""
# Example output that extract_judgment_result() puts in front of the user
# prompt; it shows one liability item and one quantum item.
JUDGMENT_RESULT_FEWSHOT = """範例輸出:
{"judgment_result":[
{"charge":"申索人就毆打事件的人身傷害索償 (責任問題)","result":"勝訴。法庭接納申索人證供可信,閉路電視顯示被告先動手,被告亦承認部分情節。"},
{"charge":"醫療費及精神困擾賠償金額 (損失範圍)","result":"部分勝訴。判給醫療費HK$8,500及一般損害賠償HK$20,000合共HK$28,500連同利息及訟費。"}
]}"""
def _judgment_validator(out: dict) -> tuple[bool, str]:
items = out.get("judgment_result", [])
if not items:
return False, "judgment_result 不能為空。"
bad = [i for i in items
if "責任問題" not in i.get("charge", "")
and "損失範圍" not in i.get("charge", "")]
if bad:
return False, (f"{len(bad)} 條 charge 未標註層次。"
f"每條 charge 必須以 '(責任問題)''(損失範圍)' 結尾。")
return True, ""
def extract_judgment_result(client: OllamaClient, context: str) -> dict:
    """Ask the model for the itemised judgment results found in ``context``.

    At most the first 6500 characters of ``context`` are sent; the output is
    validated with ``_judgment_validator`` and retried on failure.
    """
    excerpt = context[:6500]
    prompt = "".join([
        JUDGMENT_RESULT_FEWSHOT,
        "\n\n",
        "請從以下片段(多處關鍵詞召回拼接)抽取:\n```\n",
        excerpt,
        "\n```",
    ])
    return client.chat_json_with_retry(
        JUDGMENT_RESULT_SYSTEM,
        prompt,
        JUDGMENT_RESULT_SCHEMA,
        validator=_judgment_validator,
    )
# --- Call 4: 涉及實體 -------------------------------------------------------
# JSON Schema for the entities call: a required `involved_entities` array
# whose items are objects with required string fields `entity_name` and
# `reason`.
ENTITIES_SCHEMA = {
"type": "object",
"properties": {
"involved_entities": {
"type": "array",
"items": {
"type": "object",
"properties": {
"entity_name": {"type": "string"},
"reason": {"type": "string"},
},
"required": ["entity_name", "reason"],
},
}
},
"required": ["involved_entities"],
}
# System prompt for the entities call (runtime text; sent as-is).
ENTITIES_SYSTEM = """從香港判決書中抽取所有相關實體(自然人/法人/組織/機構)。
必須包含
- 主審法官 / 審裁官
- 雙方代表律師大律師
- 判決中引用的先例所提及的法官
reason 須寫明在XX案[案號]中擔任XX職位闡述XX法律原則
- 涉案的政府部門公司機構
嚴禁包含
- 法案/條例名侵權條例Cap.xxx
- 純案例名稱 Donoghue v Stevenson
- 文獻期刊名
只輸出 JSON"""
# Example output that extract_entities() puts in front of the user prompt.
ENTITIES_FEWSHOT = """範例輸出:
{"involved_entities":[
{"entity_name":"林希維審裁官","reason":"本案主審審裁官,負責認定事實及裁決。"},
{"entity_name":"終審法院常任法官李義","reason":"在Tang Kwok Wah v HKSAR [2019] HKCFA 23 中擔任主筆法官闡述舉證責任原則本案第34段引用其判詞。"},
{"entity_name":"康樂文化事務署","reason":"涉案場所通州街公園的管理機構。"}
]}"""
def _entities_validator(out: dict) -> tuple[bool, str]:
ents = out.get("involved_entities", [])
if not ents:
return False, "involved_entities 不能為空,至少要有主審法官。"
blacklist = ["條例", "Cap.", "法案"]
bad = [e["entity_name"] for e in ents
if any(k in e.get("entity_name", "") for k in blacklist)]
if bad:
return False, f"以下實體疑為條例/法案,應移除:{bad}"
return True, ""
def extract_entities(client: OllamaClient, context: str) -> dict:
    """Ask the model for the people and organisations involved in the case.

    At most the first 6500 characters of ``context`` are sent; the output is
    validated with ``_entities_validator`` and retried on failure.
    """
    excerpt = context[:6500]
    prompt = "".join([
        ENTITIES_FEWSHOT,
        "\n\n",
        "請從以下片段(多處關鍵詞召回拼接)抽取所有涉及實體:\n",
        "```\n",
        excerpt,
        "\n```",
    ])
    return client.chat_json_with_retry(
        ENTITIES_SYSTEM,
        prompt,
        ENTITIES_SCHEMA,
        validator=_entities_validator,
    )
# --- Call 5: 判決總結(基於已抽取結果 + 分析段,不從原文重生) -----------
# JSON Schema for the summary call: one required string, `judgment_summary`,
# capped at 300 characters by the schema.
SUMMARY_SCHEMA = {
"type": "object",
"properties": {
"judgment_summary": {"type": "string", "maxLength": 300},
},
"required": ["judgment_summary"],
}
# System prompt for the summary call (runtime text; sent as-is). It asks for
# a single paragraph covering four elements: background, issues in dispute,
# the court's reasoning, and the final outcome.
SUMMARY_SYSTEM = """根據已抽取的結構化字段 + 法庭分析段,撰寫判決總結。
四要素結構必須全部涵蓋連貫成單段
(1) 案件背景1-2 句交代起因與當事人關係
(2) 核心爭議焦點
(3) 法庭法律分析與推理核心重點
- 如何評估證據
- 接受 / 拒絕主張的邏輯
- 引用了哪些關鍵法律或判例
(4) 最終裁決結果及命令
嚴格 300 只輸出 JSON"""
def _summary_validator(out: dict) -> tuple[bool, str]:
s = out.get("judgment_summary", "")
if len(s) > 300:
return False, f"summary 共 {len(s)} 字,超過 300 字上限,請壓縮。"
if len(s) < 80:
return False, "summary 過短,請完整覆蓋四要素。"
return True, ""
def extract_summary(client: OllamaClient,
                    prior: dict, analysis: str) -> dict:
    """Ask the model to write the judgment summary.

    The prompt contains the already-extracted fields (``prior``, dumped as
    JSON) and the first 3500 characters of the recalled analysis text. If the
    returned summary is still longer than 300 characters after the retries,
    it is cut to 300 characters.
    """
    prior_json = json.dumps(prior, ensure_ascii=False, indent=2)
    analysis_excerpt = analysis[:3500]
    user = f"""已抽取的字段:
```json
{prior_json}
```
法庭分析節選
```
{analysis_excerpt}
```
請按四要素撰寫 300 字的 judgment_summary"""
    result = client.chat_json_with_retry(
        SUMMARY_SYSTEM, user, SUMMARY_SCHEMA, validator=_summary_validator)
    summary = result["judgment_summary"]
    if len(summary) > 300:
        # Hard cap as a last resort when the model kept overshooting.
        result["judgment_summary"] = summary[:300]
    return result
# =============================================================================
# 4. 全局校驗與後處理
# =============================================================================
# Substrings that mark a "location" as a court, venue or building rather than
# a place; such entries are removed from case_location.
# Empty strings must never appear here: "" is a substring of every string, so
# a single empty entry would blacklist every location.
LOCATION_BLACKLIST = [
    "法院", "法庭", "審裁處", "公園", "大廈", "大樓", "商場",
    "道路", "中心", "醫院", "酒店", "車站",
]
# Substrings that mark an "entity" as a statute, law report or journal.
ENTITY_NAME_BLACKLIST = ["條例", "Cap.", "法案", "案例彙編", "Reports",
                         "期刊", "Journal"]


def validate_and_fix(result: dict) -> tuple[dict, list[str]]:
    """Apply the final rule-based clean-up to the assembled result.

    The dict is modified in place and also returned. Performed steps:
      * drop court/venue/building entries from ``case_location`` and make
        sure "香港特別行政區" is present;
      * hard-truncate ``case_reason`` to 100 and ``judgment_summary`` to 300
        characters;
      * drop entities whose name looks like a statute or publication;
      * warn about judgment items lacking a level marker and about empty
        key fields.

    Returns:
        ``(result, warnings)`` where ``warnings`` is a list of
        human-readable messages describing what was changed or is missing.
    """
    warnings: list[str] = []
    # case_location: drop courts / venues / buildings.
    # Blank blacklist terms are skipped defensively (see the constant's note).
    location_terms = [b for b in LOCATION_BLACKLIST if b]
    locs = result.get("case_location") or []
    cleaned = [l for l in locs
               if l and not any(b in l for b in location_terms)]
    if "香港特別行政區" not in cleaned:
        cleaned.insert(0, "香港特別行政區")
    removed = set(locs) - set(cleaned)
    if removed:
        # Only report a clean-up when something was actually taken out;
        # merely adding the default location is not a removal.
        warnings.append(f"case_location 已清理:移除 {removed}")
    result["case_location"] = cleaned
    # Hard length caps.
    if len(result.get("case_reason", "")) > 100:
        warnings.append("case_reason > 100 字,已截斷")
        result["case_reason"] = result["case_reason"][:100]
    if len(result.get("judgment_summary", "")) > 300:
        warnings.append("judgment_summary > 300 字,已截斷")
        result["judgment_summary"] = result["judgment_summary"][:300]
    # involved_entities: drop statutes / publications.
    entity_terms = [k for k in ENTITY_NAME_BLACKLIST if k]
    ents = result.get("involved_entities") or []
    cleaned_ents = [e for e in ents
                    if not any(k in e.get("entity_name", "")
                               for k in entity_terms)]
    if len(cleaned_ents) != len(ents):
        warnings.append(
            f"involved_entities 移除 {len(ents) - len(cleaned_ents)} 條疑似條例/文獻")
    result["involved_entities"] = cleaned_ents
    # judgment_result: flag items without a level marker (not auto-fixed).
    for jr in result.get("judgment_result", []) or []:
        if ("責任問題" not in jr.get("charge", "")
                and "損失範圍" not in jr.get("charge", "")):
            warnings.append(
                f"judgment_result 條目缺層次標註:{jr.get('charge', '')[:40]}")
    # Empty-field warnings for manual review.
    for k in ("plaintiff", "defendant", "case_object",
              "judgment_result", "involved_entities"):
        if not result.get(k):
            warnings.append(f"{k} 為空,請人工複核")
    return result, warnings
# =============================================================================
# 5. 主管線
# =============================================================================
def run_pipeline(text: str, model: str) -> dict:
    """Run the whole extraction pipeline on one judgment text.

    Progress and warnings are written to stderr. The steps are: clean the
    text and recall context per keyword group, run the four extraction calls
    (parties, reason/object, judgment result, entities), generate the summary
    from those results plus the analysis context, then validate and fix.

    Args:
        text: Raw judgment text.
        model: Model name handed to OllamaClient.

    Returns:
        The final, validated result dict.
    """
    def log(message: str) -> None:
        print(message, file=sys.stderr)

    log("[1/6] 預處理 + 關鍵詞召回...")
    cleaned = clean_text(text)
    meta = extract_metadata_by_rule(cleaned)
    contexts = gather_all(cleaned)
    log(f" 規則元數據:{meta}")
    log(" 召回片段:")
    group_names = ("parties", "reason_object", "judgment_result",
                   "entities", "analysis")
    for name in group_names:
        snippet_len = len(contexts[name])
        hit_count = contexts[f"_{name}_hits"]
        log(f" {name:16s} len={snippet_len:5d} hits={hit_count}")

    client = OllamaClient(model=model)

    log("[2/6] 抽取當事人...")
    parties = extract_parties(client, contexts["parties"])

    log("[3/6] 抽取事由與標的...")
    reason_obj = extract_reason_object(client, contexts["reason_object"])

    log("[4/6] 抽取判決結果...")
    judgment = extract_judgment_result(client, contexts["judgment_result"])

    log("[5/6] 抽取涉及實體...")
    # Entity context = parties snippet (carries counsel names) followed by
    # the citation-oriented snippet, capped at 6500 characters overall.
    combined = contexts["parties"][:2500] + "\n\n[…]\n\n" + contexts["entities"]
    entities = extract_entities(client, combined[:6500])

    # Everything extracted so far becomes the structured input of the summary.
    interim_for_summary: dict = {}
    for part in (parties, reason_obj, judgment, entities):
        interim_for_summary.update(part)
    interim_for_summary["jurisdiction_name"] = meta["jurisdiction_name"]

    log("[6/6] 撰寫判決總結...")
    summary = extract_summary(
        client, interim_for_summary, contexts["analysis"])

    # NOTE(review): meta["case_number"] is extracted but not part of the
    # final output — confirm whether that omission is intended.
    final = {
        "plaintiff": parties["plaintiff"],
        "defendant": parties["defendant"],
        "jurisdiction_code": meta["jurisdiction_code"],
        "jurisdiction_name": meta["jurisdiction_name"],
        "case_location": meta["case_location"],
        "case_reason": reason_obj["case_reason"],
        "case_object": reason_obj["case_object"],
        "judgment_result": judgment["judgment_result"],
        "judgment_summary": summary["judgment_summary"],
        "involved_entities": entities["involved_entities"],
    }
    final, warnings = validate_and_fix(final)
    for warning in warnings:
        log(f" ⚠️ {warning}")
    return final
# =============================================================================
# 6. YAML 輸出(長字串用 > 折疊;含特殊字符的自動雙引號)
# =============================================================================
class FoldedStr(str):
    """Marker subclass of ``str`` for values to be dumped in YAML ``>`` folded style."""
def _folded_str_representer(dumper, data):
    """Represent ``data`` as a YAML string scalar in folded (``>``) style."""
    yaml_str_tag = "tag:yaml.org,2002:str"
    return dumper.represent_scalar(yaml_str_tag, data, style=">")
def _safe_str_representer(dumper, data):
    """Represent a string, forcing double quotes for YAML-sensitive content.

    Double quotes are forced when the string contains ":" anywhere or starts
    with "#" or "- "; every other string gets the dumper's default style.
    """
    yaml_str_tag = "tag:yaml.org,2002:str"
    needs_quotes = bool(data) and (
        ":" in data or data.startswith(("#", "- ")))
    if needs_quotes:
        return dumper.represent_scalar(yaml_str_tag, data, style='"')
    return dumper.represent_scalar(yaml_str_tag, data)
# Register both representers with PyYAML at import time. No Dumper argument
# is given, so add_representer applies them to its default dumper class; the
# `str` registration therefore affects every yaml.dump() call that uses it.
yaml.add_representer(FoldedStr, _folded_str_representer)
yaml.add_representer(str, _safe_str_representer)
def to_yaml(result: dict) -> str:
    """Serialise the result dict to YAML text.

    ``case_reason`` and ``judgment_summary`` are emitted in folded (``>``)
    style when non-empty. Key order is preserved and non-ASCII characters are
    written as-is.

    The conversion is done on a shallow copy, so the caller's dict is left
    untouched (its values are not replaced by ``FoldedStr`` instances).
    """
    doc = dict(result)
    for key in ("case_reason", "judgment_summary"):
        if doc.get(key):
            doc[key] = FoldedStr(doc[key])
    return yaml.dump(doc, allow_unicode=True, sort_keys=False,
                     default_flow_style=False, width=100)
# =============================================================================
# CLI
# =============================================================================
def main() -> None:
    """Command-line entry point.

    Reads the judgment text from the given path, runs the pipeline,
    optionally dumps the raw result as JSON, and writes the YAML either to
    ``--out`` or to stdout.
    """
    parser = argparse.ArgumentParser(
        description="香港判決書結構化抽取(本地 Ollama 版)")
    parser.add_argument("input", help="判決書文本路徑(.txt")
    parser.add_argument("--model", default=DEFAULT_MODEL, help="Ollama 模型名")
    parser.add_argument("--out", default=None, help="輸出 YAML 路徑(默認 stdout")
    parser.add_argument("--debug-dump", default=None,
                        help="額外輸出原始 JSON 結果到該路徑(便於 diff")
    options = parser.parse_args()

    source_text = Path(options.input).read_text(encoding="utf-8")
    result = run_pipeline(source_text, options.model)

    if options.debug_dump:
        raw_json = json.dumps(result, ensure_ascii=False, indent=2)
        Path(options.debug_dump).write_text(raw_json, encoding="utf-8")

    yaml_str = to_yaml(result)
    if not options.out:
        print(yaml_str)
        return
    Path(options.out).write_text(yaml_str, encoding="utf-8")
    print(f"\n✅ 已寫入 {options.out}", file=sys.stderr)


if __name__ == "__main__":
    main()

6
main.py 100644
View File

@ -0,0 +1,6 @@
def main():
    """Print the project's greeting line to standard output."""
    greeting = "Hello from hklii-samples!"
    print(greeting)


if __name__ == "__main__":
    main()

10
pyproject.toml 100644
View File

@ -0,0 +1,10 @@
[project]
name = "hklii-samples"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
"pyyaml>=6.0.3",
"requests>=2.34.0",
]

153
uv.lock 100644
View File

@ -0,0 +1,153 @@
version = 1
revision = 2
requires-python = ">=3.13"
[[package]]
name = "certifi"
version = "2026.4.22"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/25/ee/6caf7a40c36a1220410afe15a1cc64993a1f864871f698c0f93acb72842a/certifi-2026.4.22.tar.gz", hash = "sha256:8d455352a37b71bf76a79caa83a3d6c25afee4a385d632127b6afb3963f1c580", size = 137077, upload-time = "2026-04-22T11:26:11.191Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl", hash = "sha256:3cb2210c8f88ba2318d29b0388d1023c8492ff72ecdde4ebdaddbb13a31b1c4a", size = 135707, upload-time = "2026-04-22T11:26:09.372Z" },
]
[[package]]
name = "charset-normalizer"
version = "3.4.7"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/e7/a1/67fe25fac3c7642725500a3f6cfe5821ad557c3abb11c9d20d12c7008d3e/charset_normalizer-3.4.7.tar.gz", hash = "sha256:ae89db9e5f98a11a4bf50407d4363e7b09b31e55bc117b4f7d80aab97ba009e5", size = 144271, upload-time = "2026-04-02T09:28:39.342Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c1/3b/66777e39d3ae1ddc77ee606be4ec6d8cbd4c801f65e5a1b6f2b11b8346dd/charset_normalizer-3.4.7-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:f496c9c3cc02230093d8330875c4c3cdfc3b73612a5fd921c65d39cbcef08063", size = 309627, upload-time = "2026-04-02T09:26:45.198Z" },
{ url = "https://files.pythonhosted.org/packages/2e/4e/b7f84e617b4854ade48a1b7915c8ccfadeba444d2a18c291f696e37f0d3b/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0ea948db76d31190bf08bd371623927ee1339d5f2a0b4b1b4a4439a65298703c", size = 207008, upload-time = "2026-04-02T09:26:46.824Z" },
{ url = "https://files.pythonhosted.org/packages/c4/bb/ec73c0257c9e11b268f018f068f5d00aa0ef8c8b09f7753ebd5f2880e248/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a277ab8928b9f299723bc1a2dabb1265911b1a76341f90a510368ca44ad9ab66", size = 228303, upload-time = "2026-04-02T09:26:48.397Z" },
{ url = "https://files.pythonhosted.org/packages/85/fb/32d1f5033484494619f701e719429c69b766bfc4dbc61aa9e9c8c166528b/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3bec022aec2c514d9cf199522a802bd007cd588ab17ab2525f20f9c34d067c18", size = 224282, upload-time = "2026-04-02T09:26:49.684Z" },
{ url = "https://files.pythonhosted.org/packages/fa/07/330e3a0dda4c404d6da83b327270906e9654a24f6c546dc886a0eb0ffb23/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e044c39e41b92c845bc815e5ae4230804e8e7bc29e399b0437d64222d92809dd", size = 215595, upload-time = "2026-04-02T09:26:50.915Z" },
{ url = "https://files.pythonhosted.org/packages/e3/7c/fc890655786e423f02556e0216d4b8c6bcb6bdfa890160dc66bf52dee468/charset_normalizer-3.4.7-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:f495a1652cf3fbab2eb0639776dad966c2fb874d79d87ca07f9d5f059b8bd215", size = 201986, upload-time = "2026-04-02T09:26:52.197Z" },
{ url = "https://files.pythonhosted.org/packages/d8/97/bfb18b3db2aed3b90cf54dc292ad79fdd5ad65c4eae454099475cbeadd0d/charset_normalizer-3.4.7-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e712b419df8ba5e42b226c510472b37bd57b38e897d3eca5e8cfd410a29fa859", size = 211711, upload-time = "2026-04-02T09:26:53.49Z" },
{ url = "https://files.pythonhosted.org/packages/6f/a5/a581c13798546a7fd557c82614a5c65a13df2157e9ad6373166d2a3e645d/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7804338df6fcc08105c7745f1502ba68d900f45fd770d5bdd5288ddccb8a42d8", size = 210036, upload-time = "2026-04-02T09:26:54.975Z" },
{ url = "https://files.pythonhosted.org/packages/8c/bf/b3ab5bcb478e4193d517644b0fb2bf5497fbceeaa7a1bc0f4d5b50953861/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:481551899c856c704d58119b5025793fa6730adda3571971af568f66d2424bb5", size = 202998, upload-time = "2026-04-02T09:26:56.303Z" },
{ url = "https://files.pythonhosted.org/packages/e7/4e/23efd79b65d314fa320ec6017b4b5834d5c12a58ba4610aa353af2e2f577/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f59099f9b66f0d7145115e6f80dd8b1d847176df89b234a5a6b3f00437aa0832", size = 230056, upload-time = "2026-04-02T09:26:57.554Z" },
{ url = "https://files.pythonhosted.org/packages/b9/9f/1e1941bc3f0e01df116e68dc37a55c4d249df5e6fa77f008841aef68264f/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:f59ad4c0e8f6bba240a9bb85504faa1ab438237199d4cce5f622761507b8f6a6", size = 211537, upload-time = "2026-04-02T09:26:58.843Z" },
{ url = "https://files.pythonhosted.org/packages/80/0f/088cbb3020d44428964a6c97fe1edfb1b9550396bf6d278330281e8b709c/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:3dedcc22d73ec993f42055eff4fcfed9318d1eeb9a6606c55892a26964964e48", size = 226176, upload-time = "2026-04-02T09:27:00.437Z" },
{ url = "https://files.pythonhosted.org/packages/6a/9f/130394f9bbe06f4f63e22641d32fc9b202b7e251c9aef4db044324dac493/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:64f02c6841d7d83f832cd97ccf8eb8a906d06eb95d5276069175c696b024b60a", size = 217723, upload-time = "2026-04-02T09:27:02.021Z" },
{ url = "https://files.pythonhosted.org/packages/73/55/c469897448a06e49f8fa03f6caae97074fde823f432a98f979cc42b90e69/charset_normalizer-3.4.7-cp313-cp313-win32.whl", hash = "sha256:4042d5c8f957e15221d423ba781e85d553722fc4113f523f2feb7b188cc34c5e", size = 148085, upload-time = "2026-04-02T09:27:03.192Z" },
{ url = "https://files.pythonhosted.org/packages/5d/78/1b74c5bbb3f99b77a1715c91b3e0b5bdb6fe302d95ace4f5b1bec37b0167/charset_normalizer-3.4.7-cp313-cp313-win_amd64.whl", hash = "sha256:3946fa46a0cf3e4c8cb1cc52f56bb536310d34f25f01ca9b6c16afa767dab110", size = 158819, upload-time = "2026-04-02T09:27:04.454Z" },
{ url = "https://files.pythonhosted.org/packages/68/86/46bd42279d323deb8687c4a5a811fd548cb7d1de10cf6535d099877a9a9f/charset_normalizer-3.4.7-cp313-cp313-win_arm64.whl", hash = "sha256:80d04837f55fc81da168b98de4f4b797ef007fc8a79ab71c6ec9bc4dd662b15b", size = 147915, upload-time = "2026-04-02T09:27:05.971Z" },
{ url = "https://files.pythonhosted.org/packages/97/c8/c67cb8c70e19ef1960b97b22ed2a1567711de46c4ddf19799923adc836c2/charset_normalizer-3.4.7-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:c36c333c39be2dbca264d7803333c896ab8fa7d4d6f0ab7edb7dfd7aea6e98c0", size = 309234, upload-time = "2026-04-02T09:27:07.194Z" },
{ url = "https://files.pythonhosted.org/packages/99/85/c091fdee33f20de70d6c8b522743b6f831a2f1cd3ff86de4c6a827c48a76/charset_normalizer-3.4.7-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1c2aed2e5e41f24ea8ef1590b8e848a79b56f3a5564a65ceec43c9d692dc7d8a", size = 208042, upload-time = "2026-04-02T09:27:08.749Z" },
{ url = "https://files.pythonhosted.org/packages/87/1c/ab2ce611b984d2fd5d86a5a8a19c1ae26acac6bad967da4967562c75114d/charset_normalizer-3.4.7-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:54523e136b8948060c0fa0bc7b1b50c32c186f2fceee897a495406bb6e311d2b", size = 228706, upload-time = "2026-04-02T09:27:09.951Z" },
{ url = "https://files.pythonhosted.org/packages/a8/29/2b1d2cb00bf085f59d29eb773ce58ec2d325430f8c216804a0a5cd83cbca/charset_normalizer-3.4.7-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:715479b9a2802ecac752a3b0efa2b0b60285cf962ee38414211abdfccc233b41", size = 224727, upload-time = "2026-04-02T09:27:11.175Z" },
{ url = "https://files.pythonhosted.org/packages/47/5c/032c2d5a07fe4d4855fea851209cca2b6f03ebeb6d4e3afdb3358386a684/charset_normalizer-3.4.7-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bd6c2a1c7573c64738d716488d2cdd3c00e340e4835707d8fdb8dc1a66ef164e", size = 215882, upload-time = "2026-04-02T09:27:12.446Z" },
{ url = "https://files.pythonhosted.org/packages/2c/c2/356065d5a8b78ed04499cae5f339f091946a6a74f91e03476c33f0ab7100/charset_normalizer-3.4.7-cp314-cp314-manylinux_2_31_armv7l.whl", hash = "sha256:c45e9440fb78f8ddabcf714b68f936737a121355bf59f3907f4e17721b9d1aae", size = 200860, upload-time = "2026-04-02T09:27:13.721Z" },
{ url = "https://files.pythonhosted.org/packages/0c/cd/a32a84217ced5039f53b29f460962abb2d4420def55afabe45b1c3c7483d/charset_normalizer-3.4.7-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:3534e7dcbdcf757da6b85a0bbf5b6868786d5982dd959b065e65481644817a18", size = 211564, upload-time = "2026-04-02T09:27:15.272Z" },
{ url = "https://files.pythonhosted.org/packages/44/86/58e6f13ce26cc3b8f4a36b94a0f22ae2f00a72534520f4ae6857c4b81f89/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:e8ac484bf18ce6975760921bb6148041faa8fef0547200386ea0b52b5d27bf7b", size = 211276, upload-time = "2026-04-02T09:27:16.834Z" },
{ url = "https://files.pythonhosted.org/packages/8f/fe/d17c32dc72e17e155e06883efa84514ca375f8a528ba2546bee73fc4df81/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:a5fe03b42827c13cdccd08e6c0247b6a6d4b5e3cdc53fd1749f5896adcdc2356", size = 201238, upload-time = "2026-04-02T09:27:18.229Z" },
{ url = "https://files.pythonhosted.org/packages/6a/29/f33daa50b06525a237451cdb6c69da366c381a3dadcd833fa5676bc468b3/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:2d6eb928e13016cea4f1f21d1e10c1cebd5a421bc57ddf5b1142ae3f86824fab", size = 230189, upload-time = "2026-04-02T09:27:19.445Z" },
{ url = "https://files.pythonhosted.org/packages/b6/6e/52c84015394a6a0bdcd435210a7e944c5f94ea1055f5cc5d56c5fe368e7b/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:e74327fb75de8986940def6e8dee4f127cc9752bee7355bb323cc5b2659b6d46", size = 211352, upload-time = "2026-04-02T09:27:20.79Z" },
{ url = "https://files.pythonhosted.org/packages/8c/d7/4353be581b373033fb9198bf1da3cf8f09c1082561e8e922aa7b39bf9fe8/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:d6038d37043bced98a66e68d3aa2b6a35505dc01328cd65217cefe82f25def44", size = 227024, upload-time = "2026-04-02T09:27:22.063Z" },
{ url = "https://files.pythonhosted.org/packages/30/45/99d18aa925bd1740098ccd3060e238e21115fffbfdcb8f3ece837d0ace6c/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:7579e913a5339fb8fa133f6bbcfd8e6749696206cf05acdbdca71a1b436d8e72", size = 217869, upload-time = "2026-04-02T09:27:23.486Z" },
{ url = "https://files.pythonhosted.org/packages/5c/05/5ee478aa53f4bb7996482153d4bfe1b89e0f087f0ab6b294fcf92d595873/charset_normalizer-3.4.7-cp314-cp314-win32.whl", hash = "sha256:5b77459df20e08151cd6f8b9ef8ef1f961ef73d85c21a555c7eed5b79410ec10", size = 148541, upload-time = "2026-04-02T09:27:25.146Z" },
{ url = "https://files.pythonhosted.org/packages/48/77/72dcb0921b2ce86420b2d79d454c7022bf5be40202a2a07906b9f2a35c97/charset_normalizer-3.4.7-cp314-cp314-win_amd64.whl", hash = "sha256:92a0a01ead5e668468e952e4238cccd7c537364eb7d851ab144ab6627dbbe12f", size = 159634, upload-time = "2026-04-02T09:27:26.642Z" },
{ url = "https://files.pythonhosted.org/packages/c6/a3/c2369911cd72f02386e4e340770f6e158c7980267da16af8f668217abaa0/charset_normalizer-3.4.7-cp314-cp314-win_arm64.whl", hash = "sha256:67f6279d125ca0046a7fd386d01b311c6363844deac3e5b069b514ba3e63c246", size = 148384, upload-time = "2026-04-02T09:27:28.271Z" },
{ url = "https://files.pythonhosted.org/packages/94/09/7e8a7f73d24dba1f0035fbbf014d2c36828fc1bf9c88f84093e57d315935/charset_normalizer-3.4.7-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:effc3f449787117233702311a1b7d8f59cba9ced946ba727bdc329ec69028e24", size = 330133, upload-time = "2026-04-02T09:27:29.474Z" },
{ url = "https://files.pythonhosted.org/packages/8d/da/96975ddb11f8e977f706f45cddd8540fd8242f71ecdb5d18a80723dcf62c/charset_normalizer-3.4.7-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fbccdc05410c9ee21bbf16a35f4c1d16123dcdeb8a1d38f33654fa21d0234f79", size = 216257, upload-time = "2026-04-02T09:27:30.793Z" },
{ url = "https://files.pythonhosted.org/packages/e5/e8/1d63bf8ef2d388e95c64b2098f45f84758f6d102a087552da1485912637b/charset_normalizer-3.4.7-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:733784b6d6def852c814bce5f318d25da2ee65dd4839a0718641c696e09a2960", size = 234851, upload-time = "2026-04-02T09:27:32.44Z" },
{ url = "https://files.pythonhosted.org/packages/9b/40/e5ff04233e70da2681fa43969ad6f66ca5611d7e669be0246c4c7aaf6dc8/charset_normalizer-3.4.7-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a89c23ef8d2c6b27fd200a42aa4ac72786e7c60d40efdc76e6011260b6e949c4", size = 233393, upload-time = "2026-04-02T09:27:34.03Z" },
{ url = "https://files.pythonhosted.org/packages/be/c1/06c6c49d5a5450f76899992f1ee40b41d076aee9279b49cf9974d2f313d5/charset_normalizer-3.4.7-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6c114670c45346afedc0d947faf3c7f701051d2518b943679c8ff88befe14f8e", size = 223251, upload-time = "2026-04-02T09:27:35.369Z" },
{ url = "https://files.pythonhosted.org/packages/2b/9f/f2ff16fb050946169e3e1f82134d107e5d4ae72647ec8a1b1446c148480f/charset_normalizer-3.4.7-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:a180c5e59792af262bf263b21a3c49353f25945d8d9f70628e73de370d55e1e1", size = 206609, upload-time = "2026-04-02T09:27:36.661Z" },
{ url = "https://files.pythonhosted.org/packages/69/d5/a527c0cd8d64d2eab7459784fb4169a0ac76e5a6fc5237337982fd61347e/charset_normalizer-3.4.7-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:3c9a494bc5ec77d43cea229c4f6db1e4d8fe7e1bbffa8b6f0f0032430ff8ab44", size = 220014, upload-time = "2026-04-02T09:27:38.019Z" },
{ url = "https://files.pythonhosted.org/packages/7e/80/8a7b8104a3e203074dc9aa2c613d4b726c0e136bad1cc734594b02867972/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8d828b6667a32a728a1ad1d93957cdf37489c57b97ae6c4de2860fa749b8fc1e", size = 218979, upload-time = "2026-04-02T09:27:39.37Z" },
{ url = "https://files.pythonhosted.org/packages/02/9a/b759b503d507f375b2b5c153e4d2ee0a75aa215b7f2489cf314f4541f2c0/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:cf1493cd8607bec4d8a7b9b004e699fcf8f9103a9284cc94962cb73d20f9d4a3", size = 209238, upload-time = "2026-04-02T09:27:40.722Z" },
{ url = "https://files.pythonhosted.org/packages/c2/4e/0f3f5d47b86bdb79256e7290b26ac847a2832d9a4033f7eb2cd4bcf4bb5b/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:0c96c3b819b5c3e9e165495db84d41914d6894d55181d2d108cc1a69bfc9cce0", size = 236110, upload-time = "2026-04-02T09:27:42.33Z" },
{ url = "https://files.pythonhosted.org/packages/96/23/bce28734eb3ed2c91dcf93abeb8a5cf393a7b2749725030bb630e554fdd8/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:752a45dc4a6934060b3b0dab47e04edc3326575f82be64bc4fc293914566503e", size = 219824, upload-time = "2026-04-02T09:27:43.924Z" },
{ url = "https://files.pythonhosted.org/packages/2c/6f/6e897c6984cc4d41af319b077f2f600fc8214eb2fe2d6bcb79141b882400/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:8778f0c7a52e56f75d12dae53ae320fae900a8b9b4164b981b9c5ce059cd1fcb", size = 233103, upload-time = "2026-04-02T09:27:45.348Z" },
{ url = "https://files.pythonhosted.org/packages/76/22/ef7bd0fe480a0ae9b656189ec00744b60933f68b4f42a7bb06589f6f576a/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:ce3412fbe1e31eb81ea42f4169ed94861c56e643189e1e75f0041f3fe7020abe", size = 225194, upload-time = "2026-04-02T09:27:46.706Z" },
{ url = "https://files.pythonhosted.org/packages/c5/a7/0e0ab3e0b5bc1219bd80a6a0d4d72ca74d9250cb2382b7c699c147e06017/charset_normalizer-3.4.7-cp314-cp314t-win32.whl", hash = "sha256:c03a41a8784091e67a39648f70c5f97b5b6a37f216896d44d2cdcb82615339a0", size = 159827, upload-time = "2026-04-02T09:27:48.053Z" },
{ url = "https://files.pythonhosted.org/packages/7a/1d/29d32e0fb40864b1f878c7f5a0b343ae676c6e2b271a2d55cc3a152391da/charset_normalizer-3.4.7-cp314-cp314t-win_amd64.whl", hash = "sha256:03853ed82eeebbce3c2abfdbc98c96dc205f32a79627688ac9a27370ea61a49c", size = 174168, upload-time = "2026-04-02T09:27:49.795Z" },
{ url = "https://files.pythonhosted.org/packages/de/32/d92444ad05c7a6e41fb2036749777c163baf7a0301a040cb672d6b2b1ae9/charset_normalizer-3.4.7-cp314-cp314t-win_arm64.whl", hash = "sha256:c35abb8bfff0185efac5878da64c45dafd2b37fb0383add1be155a763c1f083d", size = 153018, upload-time = "2026-04-02T09:27:51.116Z" },
{ url = "https://files.pythonhosted.org/packages/db/8f/61959034484a4a7c527811f4721e75d02d653a35afb0b6054474d8185d4c/charset_normalizer-3.4.7-py3-none-any.whl", hash = "sha256:3dce51d0f5e7951f8bb4900c257dad282f49190fdbebecd4ba99bcc41fef404d", size = 61958, upload-time = "2026-04-02T09:28:37.794Z" },
]
[[package]]
name = "hklii-samples"
version = "0.1.0"
source = { virtual = "." }
dependencies = [
{ name = "pyyaml" },
{ name = "requests" },
]
[package.metadata]
requires-dist = [
{ name = "pyyaml", specifier = ">=6.0.3" },
{ name = "requests", specifier = ">=2.34.0" },
]
[[package]]
name = "idna"
version = "3.15"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/82/77/7b3966d0b9d1d31a36ddf1746926a11dface89a83409bf1483f0237aa758/idna-3.15.tar.gz", hash = "sha256:ca962446ea538f7092a95e057da437618e886f4d349216d2b1e294abfdb65fdc", size = 199245, upload-time = "2026-05-12T22:45:57.011Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d2/23/408243171aa9aaba178d3e2559159c24c1171a641aa83b67bdd3394ead8e/idna-3.15-py3-none-any.whl", hash = "sha256:048adeaf8c2d788c40fee287673ccaa74c24ffd8dcf09ffa555a2fbb59f10ac8", size = 72340, upload-time = "2026-05-12T22:45:55.733Z" },
]
[[package]]
name = "pyyaml"
version = "6.0.3"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d1/11/0fd08f8192109f7169db964b5707a2f1e8b745d4e239b784a5a1dd80d1db/pyyaml-6.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8", size = 181669, upload-time = "2025-09-25T21:32:23.673Z" },
{ url = "https://files.pythonhosted.org/packages/b1/16/95309993f1d3748cd644e02e38b75d50cbc0d9561d21f390a76242ce073f/pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1", size = 173252, upload-time = "2025-09-25T21:32:25.149Z" },
{ url = "https://files.pythonhosted.org/packages/50/31/b20f376d3f810b9b2371e72ef5adb33879b25edb7a6d072cb7ca0c486398/pyyaml-6.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c", size = 767081, upload-time = "2025-09-25T21:32:26.575Z" },
{ url = "https://files.pythonhosted.org/packages/49/1e/a55ca81e949270d5d4432fbbd19dfea5321eda7c41a849d443dc92fd1ff7/pyyaml-6.0.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a33284e20b78bd4a18c8c2282d549d10bc8408a2a7ff57653c0cf0b9be0afce5", size = 841159, upload-time = "2025-09-25T21:32:27.727Z" },
{ url = "https://files.pythonhosted.org/packages/74/27/e5b8f34d02d9995b80abcef563ea1f8b56d20134d8f4e5e81733b1feceb2/pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6", size = 801626, upload-time = "2025-09-25T21:32:28.878Z" },
{ url = "https://files.pythonhosted.org/packages/f9/11/ba845c23988798f40e52ba45f34849aa8a1f2d4af4b798588010792ebad6/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f7057c9a337546edc7973c0d3ba84ddcdf0daa14533c2065749c9075001090e6", size = 753613, upload-time = "2025-09-25T21:32:30.178Z" },
{ url = "https://files.pythonhosted.org/packages/3d/e0/7966e1a7bfc0a45bf0a7fb6b98ea03fc9b8d84fa7f2229e9659680b69ee3/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eda16858a3cab07b80edaf74336ece1f986ba330fdb8ee0d6c0d68fe82bc96be", size = 794115, upload-time = "2025-09-25T21:32:31.353Z" },
{ url = "https://files.pythonhosted.org/packages/de/94/980b50a6531b3019e45ddeada0626d45fa85cbe22300844a7983285bed3b/pyyaml-6.0.3-cp313-cp313-win32.whl", hash = "sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26", size = 137427, upload-time = "2025-09-25T21:32:32.58Z" },
{ url = "https://files.pythonhosted.org/packages/97/c9/39d5b874e8b28845e4ec2202b5da735d0199dbe5b8fb85f91398814a9a46/pyyaml-6.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c", size = 154090, upload-time = "2025-09-25T21:32:33.659Z" },
{ url = "https://files.pythonhosted.org/packages/73/e8/2bdf3ca2090f68bb3d75b44da7bbc71843b19c9f2b9cb9b0f4ab7a5a4329/pyyaml-6.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb", size = 140246, upload-time = "2025-09-25T21:32:34.663Z" },
{ url = "https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size = 181814, upload-time = "2025-09-25T21:32:35.712Z" },
{ url = "https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size = 173809, upload-time = "2025-09-25T21:32:36.789Z" },
{ url = "https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size = 766454, upload-time = "2025-09-25T21:32:37.966Z" },
{ url = "https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size = 836355, upload-time = "2025-09-25T21:32:39.178Z" },
{ url = "https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size = 794175, upload-time = "2025-09-25T21:32:40.865Z" },
{ url = "https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size = 755228, upload-time = "2025-09-25T21:32:42.084Z" },
{ url = "https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size = 789194, upload-time = "2025-09-25T21:32:43.362Z" },
{ url = "https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size = 156429, upload-time = "2025-09-25T21:32:57.844Z" },
{ url = "https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size = 143912, upload-time = "2025-09-25T21:32:59.247Z" },
{ url = "https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size = 189108, upload-time = "2025-09-25T21:32:44.377Z" },
{ url = "https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size = 183641, upload-time = "2025-09-25T21:32:45.407Z" },
{ url = "https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size = 831901, upload-time = "2025-09-25T21:32:48.83Z" },
{ url = "https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size = 861132, upload-time = "2025-09-25T21:32:50.149Z" },
{ url = "https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size = 839261, upload-time = "2025-09-25T21:32:51.808Z" },
{ url = "https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size = 805272, upload-time = "2025-09-25T21:32:52.941Z" },
{ url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time = "2025-09-25T21:32:54.537Z" },
{ url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" },
{ url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" },
]
[[package]]
name = "requests"
version = "2.34.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "certifi" },
{ name = "charset-normalizer" },
{ name = "idna" },
{ name = "urllib3" },
]
sdist = { url = "https://files.pythonhosted.org/packages/43/b8/7a707d60fea4c49094e40262cc0e2ca6c768cca21587e34d3f705afec47e/requests-2.34.0.tar.gz", hash = "sha256:7d62fe92f50eb82c529b0916bb445afa1531a566fc8f35ffdc64446e771b856a", size = 142436, upload-time = "2026-05-11T19:29:51.717Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ef/e6/e300fce5fe83c30520607a015dabd985df3251e188d234bfe9492e17a389/requests-2.34.0-py3-none-any.whl", hash = "sha256:917520a21b767485ce7c588f4ebb917c436b24a31231b44228715eaeb5a52c60", size = 73021, upload-time = "2026-05-11T19:29:49.923Z" },
]
[[package]]
name = "urllib3"
version = "2.7.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/53/0c/06f8b233b8fd13b9e5ee11424ef85419ba0d8ba0b3138bf360be2ff56953/urllib3-2.7.0.tar.gz", hash = "sha256:231e0ec3b63ceb14667c67be60f2f2c40a518cb38b03af60abc813da26505f4c", size = 433602, upload-time = "2026-05-07T16:13:18.596Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/7f/3e/5db95bcf282c52709639744ca2a8b149baccf648e39c8cc87553df9eae0c/urllib3-2.7.0-py3-none-any.whl", hash = "sha256:9fb4c81ebbb1ce9531cce37674bbc6f1360472bc18ca9a553ede278ef7276897", size = 131087, upload-time = "2026-05-07T16:13:17.151Z" },
]

View File

@ -0,0 +1,169 @@
我需要从一个案件的 content 里面提取我想要的信息。案件内容比较长,可长达几万字不等,但用的是本地 Ollama 的小模型。如何在这些有限条件下提高信息提取的准确度?
# Extraction Rules (字段提取规则)
## 一、 主体与管辖信息
1. `plaintiff` (原告/申索人/上诉人/覆核申请人) [数组]
- 提取所有当事人的完整姓名或机构名(含中英文)。若无则输出空数组 `[]`
2. `defendant` (被告/答辩人/被上诉人) [数组]
- 提取所有被告的完整姓名或机构名。若无则输出空数组 `[]`
3. `jurisdiction_code` (司法区域代码) [字符串]
- 提取官方代码(如:HKSCT, HKCFA, HKCA, HKCFI 等)。
4. `jurisdiction_name` (司法区域名称) [字符串]
- 完整的法院或审裁处名称(如:香港特別行政區終審法院)。
5. `case_location` (案件地点) [数组]
- 提取与案件直接相关的**地理位置**。
- ✅ **必须包含**:司法管辖区(如 "香港特別行政區")。
- ✅ **可包含**:行政区域、城市、国家(如 "九龍", "Luxor City, Egypt")。
- ❌ **严禁提取**:法院名称(如"小額錢債審裁處")、具体建筑物或场所名称(如"通州街公園"、"某某大厦")。若原文无明确地点,仅输出司法管辖区。
## 二、 核心案件要素
6. `case_reason` (事由) [字符串]
- **字数强制限制**:绝对不可超过 100 字,必须浓缩为单句。
- **内容公式**:`[原告身份] + [针对什么事件/行为] + [向谁] + [提出什么请求/诉讼]`。
- ⚠️ **覆核/上诉案件特例**:需指明对哪个裁决(含日期/案号)提出覆核、核心理由及法律依据。
- ❌ **严禁包含**:判决结果、法庭分析、详细案发经过。
7. `case_object` (案件标的物) [数组]
- 提取诉讼请求指向的**实体权利或利益**(如:私人財物損失賠償、精神困擾賠償、合同履行、房产所有权)。合并本质相同的标的物。
- ❌ **严禁提取**:证据材料(如单据、证人证词)、程序性诉求(如"要求法庭裁决"、"追究责任")。
## 三、 裁判与实体分析
8. `judgment_result` (判决结果) [对象数组]
- **拆分原则**:若案件涉及多项请求,或包含“责任判定”与“损失/金额计算”两个层面,**必须分条列出**。
- 子字段 `charge` [字符串]:具体的诉讼请求或覆核理由(请在括号内标注是“责任问题”还是“损失范围”)。
- 子字段 `result` [字符串]:明确的裁决结果(胜诉/败诉/部分胜诉/维持/撤销等),并简述 2-3 个关键法庭理由及具体金额/命令。
9. `judgment_summary` (判决总结) [字符串]
- **字数强制限制**:不得超过 300 字。
- **结构四要素**:
(1) 案件背景(1-2 句话交代起因与当事人关系);
(2) 核心争议焦点;
(3) **法庭法律分析与推理(核心重点)**:法庭如何评估证据?接受/拒绝主张的逻辑是什么?引用了哪些关键法律或判例?
(4) 最终裁决结果及命令。
- ⚠️ 必须使用 YAML 多行字符串语法(`>`)。
10. `involved_entities` (涉及实体) [对象数组]
- 仅提取与案件相关的**自然人、法人、组织、机构**。
- 子字段 `entity_name` [字符串]:实体全称(如 "林希維審裁官", "康樂文化事務署")。
- 子字段 `reason` [字符串]:说明其在案件中的角色与作用。
- ⚠️ **法官提取特例**:除主审法官外,**必须提取法庭引用的先例中所提及的法官**(reason 需写明:在 XX 案[案号]中担任 XX 职位,阐述 XX 法律原则及引用的段落)。
- ❌ **严禁提取**:法案名、条例名、法律条文、纯案例名称、文献期刊名。
---
# Strict YAML Format Constraints (严格格式约束)
- 遇到包含冒号 `:`、井号 `#`、连字符 `-` 的字符串时,**必须**使用双引号 `""` 将其包裹,防止 YAML 解析错误。
- 对于长文本(`case_reason`, `judgment_summary`),请使用 `>` 折叠换行符。
# Output Template
请严格按照以下格式输出结果,不要输出任何多余的解释:
```yaml
plaintiff:
  - ""
defendant:
  - ""
jurisdiction_code: ""
jurisdiction_name: ""
case_location:
  - ""
case_reason: >
  [此处填入符合公式、限 100 字内的事由]
case_object:
  - ""
judgment_result:
  - charge: "[请求描述] (责任问题/损失范围)"
    result: "[裁定结果及理由简述]"
judgment_summary: >
  [此处填入包含四要素的判决总结,限 300 字内]
involved_entities:
  - entity_name: ""
    reason: ""
```
----------案件信息提取思路--------------
针对你的场景(本地小模型 + 数万字长文本 + 严格结构化抽取),单次喂入全文 + 一次性输出所有字段几乎注定失败。提升准确度的关键不在"换更好的提示词",而在**重构整个抽取管线**。以下是按优先级排列的策略:
## 一、核心思路:放弃"一次抽取所有字段"
小模型在长上下文里会严重"注意力稀释",必须把任务拆小。建议改成 **预处理 → 定位 → 分组抽取 → 校验** 四阶段管线。
## 二、预处理:先用规则把文本"压缩"和"切片"
在送入模型前,用纯代码做这些事,能让有效输入缩短 70% 以上:
- **去噪**:删除页眉页脚、页码、空行、重复的法院抬头、英文与中文重复段落(HK 判决书常见双语,可只保留中文版本,整段省一半)。
- **结构化切段**:判决书有强格式特征,用正则切分出:`【案件编号】`、`【当事人】`、`【判决理由】`、`【判决结果】`、`【命令】` 等区块,做成 `dict[section_name, text]`
- **元数据用规则直接抽**:以下字段根本不需要 LLM
- `jurisdiction_code` / `jurisdiction_name`:正则匹配 `HKSCT|HKCFA|...` 或法院全称。
- `case_location`:从法院名映射(如 HKSCT → 香港特別行政區)。
- 案件编号、日期、案号引用:正则。
- 规则能覆盖的字段准确率 ≈ 100%,不要浪费在模型上。
## 三、定位-再抽取Locate-then-Extract
每个字段都有"高信号区域",先定位再喂局部:
| 字段 | 高信号区域 |
|---|---|
| `plaintiff` / `defendant` | 文档前 500–1000 字(标头部分) |
| `case_reason` | 首段 + "申索"/"訴因"/"本案涉及" 关键词附近段落 |
| `case_object` | "申索金额"、"索償"、"诉讼请求" 附近 |
| `judgment_result` | 文末 1000–2000 字("本席命令"、"判決"、"裁定") |
| `involved_entities`(先例法官) | 含"案[" / "v." / "[20XX]" 引用案例的句子 |
| `judgment_summary` | 全文,但可用提取出的其他字段拼接 |
实现:用关键词或 BM25/embedding 检索筛出每个字段的 Top-K 段落(一般 3-5 段,2000 字以内),只把这部分喂给模型。
## 四、分组分次调用,而不是一次性全部输出
小模型一次输出 10 个字段,几乎一定会有字段出错。把抽取分成 4-5 次独立调用:
1. **Call 1(主体)**:plaintiff, defendant, jurisdiction(用文档前部)
2. **Call 2(事由+标的)**:case_reason, case_object(用首段 + 申索段)
3. **Call 3(判决结果)**:judgment_result(用文末判决段)
4. **Call 4(涉及实体)**:involved_entities(用先例引用段)
5. **Call 5(总结)**:judgment_summary(把前 4 步结果拼回去喂模型生成,而不是从原文重新生成)
第 5 步特别关键——总结的输入是已抽取的结构化数据,而不是几万字原文,质量会跳一档。
## 五、用 Ollama 的强制结构化输出
Ollama 原生支持 `format: "json"`(或向 `format` 传入 JSON Schema),强制模型只输出合法 JSON。**先输出 JSON,再用代码转 YAML**,比让小模型直接写 YAML 稳定得多——小模型在 YAML 缩进、`>` 折叠、`""` 转义上极易出错。
每个 Call 用对应的子 schema 约束,例如 Call 1 的 schema 只包含 4 个字段,模型几乎无法跑偏。
## 六、Few-shot 示例(每个 Call 1-2 个就够)
给一个完整正例 + 一个反例("❌ 不要这样做"),效果远胜纯规则描述。例如对 `case_location`
```
✅ 正确:["香港特別行政區", "九龍"]
❌ 错误:["小額錢債審裁處", "通州街公園"] ← 这是法院名和场所名,不是地点
```
小模型不擅长抽象规则,但能很好地模仿示例。
## 七、后处理校验 + 失败重试
每个 Call 出来后用代码校验:
- **格式校验**JSON 解析、字段类型、数组非空规则。
- **业务校验**:`case_reason` 字数 ≤ 100;`judgment_summary` ≤ 300;`case_location` 不含法院关键词(用黑名单 `["法院", "法庭", "審裁處"]`);`involved_entities` 不含法案名(黑名单 `["條例", "法案", "Cap."]`)。
- **失败时**:把校验错误信息回灌给模型重试一次("上次输出超过 100 字,请精简到 80 字以内"),通常一次重试就能修正。
## 八、可选增强
- **温度调到 0**(`temperature: 0`),抽取任务不要随机性。
- **量化精度**:如果用的是 Q4 量化,换 Q5_K_M 或 Q6_K;结构化抽取对量化精度敏感,提升明显。
- **模型选型**:同尺寸下,Qwen2.5、GLM4 系列在中文抽取任务上明显强于 Llama 系;7B 量级的 Qwen2.5-7B-Instruct 在你这类任务上是甜点。
- **自一致性**:对关键字段跑 2-3 次取众数,能压一部分波动(代价是延迟成倍增加)。
## 九、推荐的最小可行落地顺序
如果只让你做三件事,按这个顺序做收益最大:
1. **正则抽元数据 + 文档分段**(一天工作量,准确率拉满)。
2. **改成分 4-5 次调用 + JSON 强制输出**(半天,崩溃率断崖式下降)。
3. **每字段加 Few-shot + 校验重试**(一天,长尾错误大幅减少)。
做完这三步,小模型也能在长判决书上跑出可用准确率。剩下的 prompt 微调和模型选型是锦上添花。