From 011adb00d7caaec3c37f721b1184333514493376 Mon Sep 17 00:00:00 2001 From: fengruixiang <474182370@qq.com> Date: Wed, 13 May 2026 18:28:44 +0800 Subject: [PATCH] update --- .gitignore | 4 +- .python-version | 1 + README.md | 0 hk_case_extractor.py | 776 +++++++++++++++++++++++++++++++++++++++++++ main.py | 6 + pyproject.toml | 10 + uv.lock | 153 +++++++++ 案件信息提取思路.md | 169 ++++++++++ 8 files changed, 1118 insertions(+), 1 deletion(-) create mode 100644 .python-version create mode 100644 README.md create mode 100644 hk_case_extractor.py create mode 100644 main.py create mode 100644 pyproject.toml create mode 100644 uv.lock create mode 100644 案件信息提取思路.md diff --git a/.gitignore b/.gitignore index 6d45c0a..11df575 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ *.doc *.docx -*.html \ No newline at end of file +*.html + +result.yaml \ No newline at end of file diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..24ee5b1 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.13 diff --git a/README.md b/README.md new file mode 100644 index 0000000..e69de29 diff --git a/hk_case_extractor.py b/hk_case_extractor.py new file mode 100644 index 0000000..c625226 --- /dev/null +++ b/hk_case_extractor.py @@ -0,0 +1,776 @@ +""" +hk_case_extractor.py +========================================================== +香港判決書結構化字段抽取管線 +基於 本地 Ollama 小模型 + 分階段抽取 + JSON Schema 強制 + 校驗重試 + +設計理念 +-------- +不直接把幾萬字餵給模型,而是: + 1. 預處理:規則去噪 + 切段,純規則抽司法區域/案號等高確定性字段 + 2. 定位:每個字段只截取對應的高信號區段(通常 < 4k 字) + 3. 分組抽取:拆成 5 次獨立 Ollama 調用,每次只負責 1-3 個字段 + 4. Schema 強制:用 Ollama 0.5+ 的 format= 約束輸出 + 5. 校驗+重試:對字數、黑名單、結構標註逐項校驗 + 6. 
def clean_text(raw: str) -> str:
    """Strip page furniture and normalise whitespace in a judgment's raw text.

    Steps, in order:
      1. remove "Page X of Y" markers (case-insensitive);
      2. blank out lines that consist only of a "- N -" page number;
      3. collapse runs of spaces, tabs and full-width (U+3000) spaces into a
         single ASCII space;
      4. drop spaces that trail at the end of a line, so that lines holding
         only whitespace count as blank lines;
      5. collapse three or more consecutive newlines into exactly two;
      6. strip leading/trailing whitespace from the whole text.

    Args:
        raw: The unprocessed judgment text.

    Returns:
        The cleaned text.
    """
    t = re.sub(r"Page\s+\d+\s+of\s+\d+", "", raw, flags=re.I)
    t = re.sub(r"^\s*-\s*\d+\s*-\s*$", "", t, flags=re.M)
    # The full-width space is written as an explicit escape: a literal U+3000
    # in the source is indistinguishable from an ASCII space and is easily
    # lost when the file is re-encoded or reformatted.
    t = re.sub(r"[ \t\u3000]+", " ", t)
    # Without this, "\n \n \n" is not seen as a run of blank lines below.
    t = re.sub(r" +\n", "\n", t)
    t = re.sub(r"\n{3,}", "\n\n", t)
    return t.strip()
def extract_metadata_by_rule(text: str) -> dict[str, Any]:
    """Extract court and case-number metadata with rules only (no model call).

    Args:
        text: Cleaned judgment text.

    Returns:
        A dict with the keys
        ``jurisdiction_code`` / ``jurisdiction_name`` (``None`` when no court
        could be identified), ``case_location`` (always the single default
        entry "香港特別行政區") and ``case_number`` (``None`` when
        ``CASE_NO_RE`` does not match).
    """
    meta: dict[str, Any] = {
        "jurisdiction_code": None,
        "jurisdiction_name": None,
        "case_location": ["香港特別行政區"],
        "case_number": None,
    }

    # Primary signal: the first neutral citation in the text, e.g. "[2023] HKDC 123".
    if m := NEUTRAL_CITATION_RE.search(text):
        # The regex is case-insensitive, so map the matched code back to the
        # canonical spelling used as key in JURISDICTION_MAP (e.g. "HKMagC").
        canonical = {k.upper(): k for k in JURISDICTION_MAP}
        code = canonical.get(m.group(2).upper())
        if code is not None:
            meta["jurisdiction_code"] = code
            meta["jurisdiction_name"] = JURISDICTION_MAP[code]

    # Fallback: look the court up by name.  Choose the court whose name occurs
    # EARLIEST in the text rather than the first one in dict order; otherwise
    # a lower-court judgment that merely cites the Court of Final Appeal
    # somewhere in its reasoning would be labelled HKCFA.
    if not meta["jurisdiction_code"]:
        best_pos = -1
        best_code = None
        for code, full in JURISDICTION_MAP.items():
            # The short name is a substring of the full name, so searching for
            # it covers both spellings.
            short = full.replace("香港特別行政區", "")
            pos = text.find(short)
            # Strict "<" keeps dict order as the tie-breaker (HKMC / HKMagC
            # share one court name).
            if pos != -1 and (best_code is None or pos < best_pos):
                best_pos, best_code = pos, code
        if best_code is not None:
            meta["jurisdiction_code"] = best_code
            meta["jurisdiction_name"] = JURISDICTION_MAP[best_code]

    if m := CASE_NO_RE.search(text):
        # Normalise internal whitespace of the matched case number.
        meta["case_number"] = re.sub(r"\s+", " ", m.group(0).strip())

    return meta
等行文中出現 + "parties": [ + "BETWEEN", "介乎", " AND ", + "申索人", "原告", "原訴人", "上訴人", "覆核申請人", "覆核人", "申請人", + "被告", "答辯人", "被上訴人", "答辯方", + "Plaintiff", "Defendant", "Appellant", "Respondent", "Applicant", + ], + # Call 2:事由與標的 + "reason_object": [ + # 段落標題類 + "案情", "背景", "引言", "事實", "案件背景", + "INTRODUCTION", "BACKGROUND", "FACTS", "THE FACTS", + # 主張類 + "申索", "索償", "訴因", "聲稱", "請求", "本案涉及", "本訴訟", + "原告聲稱", "申索人聲稱", "申索人指稱", + # 標的物關鍵詞 + "賠償", "損失", "損害", "精神困擾", "經濟損失", "醫療費", + "履行", "所有權", "占有", "撤銷", "宣告", + ], + # Call 3:判決結果 + "judgment_result": [ + # 命令類 + "本席命令", "本庭命令", "本席裁定", "本庭裁定", "命令如下", + "DISPOSITION", "ORDER", "CONCLUSION", "I therefore order", + # 結果類 + "判決", "判給", "獲判", "判處", "支付", + "勝訴", "敗訴", "部分勝訴", "駁回", "撤銷", "維持", "發還", + "ALLOWED", "DISMISSED", "GRANTED", "REFUSED", + # 收尾連接詞 + "因此", "故此", "綜上", + ], + # Call 4:涉及實體(法官、律師、引用案例中的法官) + "entities": [ + # 法官稱謂 + "法官", "大法官", "審裁官", "裁判官", "首席法官", "常任法官", "非常任法官", + "Hon.", " J.", " JA ", " CJ ", " PJ ", " NPJ ", + # 代表類 + "代表", "大律師", "律師", "資深大律師", "代表律師", + "Counsel", "Solicitor", + # 案例引用(會在周邊帶出法官名) + " v ", " v. 
def gather_chunks(text: str,
                  keywords: list[str],
                  half_window: int = 500,
                  max_total: int = 6500,
                  case_sensitive: bool = False) -> tuple[str, int]:
    """Collect the text surrounding every keyword occurrence.

    For each occurrence of each keyword a window of ``half_window``
    characters on both sides is taken.  Overlapping windows are merged, the
    merged windows are concatenated in document order (joined by a visible
    "[…]" separator) and the result is capped at ``max_total`` characters,
    separators included.

    Args:
        text: Document to search.
        keywords: Literal keywords (regex-escaped before matching).  Empty
            strings are ignored.
        half_window: Number of characters kept before and after each hit.
        max_total: Upper bound for the length of the returned text.
        case_sensitive: Match case-sensitively when true; the default is a
            case-insensitive search.

    Returns:
        ``(joined_text, hit_count)`` where ``hit_count`` is the number of
        keyword *occurrences* found (not the number of distinct keywords).
        When nothing matches, the first ``max_total`` characters of ``text``
        are returned with a hit count of 0.  Empty ``text`` gives ``("", 0)``.
    """
    if not text:
        return "", 0

    separator = "\n\n[…]\n\n"
    flags = 0 if case_sensitive else re.IGNORECASE
    text_len = len(text)

    hits: list[tuple[int, int]] = []
    for kw in keywords:
        if not kw:
            # An empty pattern matches at every position and would turn the
            # whole document into one giant "hit".
            continue
        for m in re.finditer(re.escape(kw), text, flags=flags):
            hits.append((max(0, m.start() - half_window),
                         min(text_len, m.end() + half_window)))

    if not hits:
        return text[:max_total], 0

    # Merge overlapping or touching windows.
    hits.sort()
    merged: list[list[int]] = []
    for start, end in hits:
        if merged and start <= merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])

    # Concatenate in document order.  The separator is charged against the
    # budget so the returned string really stays within ``max_total``.
    pieces: list[str] = []
    used = 0
    for start, end in merged:
        sep_cost = len(separator) if pieces else 0
        room = max_total - used - sep_cost
        seg_len = end - start
        if seg_len > room:
            # Keep a truncated tail only when it is long enough to be useful.
            if room > 200:
                pieces.append(text[start:start + room])
            break
        pieces.append(text[start:end])
        used += sep_cost + seg_len

    return separator.join(pieces), len(hits)
@dataclass
class OllamaClient:
    """Small client for an Ollama-style chat endpoint that returns parsed JSON.

    Every request is a non-streaming POST whose ``format`` field carries a
    JSON schema, asking the server to constrain the model output to it.
    """

    model: str = DEFAULT_MODEL      # model name placed in the request payload
    url: str = OLLAMA_URL           # chat endpoint that receives the POST
    timeout: int = DEFAULT_TIMEOUT  # forwarded to requests.post(timeout=...)

    def chat_json(self, system: str, user: str, schema: dict,
                  temperature: float = 0.0,
                  num_ctx: int = 8192) -> dict:
        """Send one chat request and return the reply parsed as JSON.

        Args:
            system: System prompt.
            user: User prompt.
            schema: JSON schema sent as the ``format`` field.
            temperature: Sampling temperature placed in ``options``.
            num_ctx: Context size placed in ``options``.

        Returns:
            The decoded JSON content of the reply message.

        Raises:
            Whatever ``requests.post`` / ``raise_for_status`` raise on
            transport or HTTP errors, ``KeyError`` when the response lacks
            ``message.content``, and ``json.JSONDecodeError`` when the content
            is not valid JSON even after removing a Markdown code fence.
        """
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "format": schema,
            "stream": False,
            "options": {"temperature": temperature, "num_ctx": num_ctx},
        }
        r = requests.post(self.url, json=payload, timeout=self.timeout)
        r.raise_for_status()
        content = r.json()["message"]["content"]
        try:
            return json.loads(content)
        except json.JSONDecodeError:
            # Some models wrap the JSON in a ```json fence; strip it and retry.
            stripped = re.sub(r"^```(?:json)?\s*|\s*```$", "",
                              content.strip(), flags=re.S)
            return json.loads(stripped)

    def chat_json_with_retry(self, system: str, user: str, schema: dict,
                             validator=None, **kw) -> dict:
        """Call :meth:`chat_json`, retrying on errors and failed validation.

        Up to ``MAX_RETRIES + 1`` attempts are made.

        Args:
            system: System prompt.
            user: User prompt for the first attempt.
            schema: JSON schema forwarded to :meth:`chat_json`.
            validator: Optional callable ``validator(result) -> (ok, hint)``.
                When ``ok`` is false the hint is appended to the original
                user prompt and the request is repeated.
            **kw: Extra keyword arguments forwarded to :meth:`chat_json`.

        Returns:
            The first output accepted by ``validator`` (or the first output at
            all when no validator is given).  If no attempt passes validation
            and the most recent attempt did produce output, that output is
            returned as a best effort.

        Raises:
            The exception of the most recent attempt when that attempt failed,
            or ``RuntimeError`` when no attempt could be made at all.
        """
        prompt = user
        last_out = None
        last_err = None
        for _ in range(MAX_RETRIES + 1):
            try:
                out = self.chat_json(system, prompt, schema, **kw)
                # This attempt produced output, so an error recorded by an
                # earlier attempt is stale and must not be raised afterwards.
                last_err = None
                last_out = out
                if validator is None:
                    return out
                ok, hint = validator(out)
            except Exception as e:  # retry boundary: any failure triggers a new attempt
                last_err = e
                continue
            if ok:
                return out
            # Feed back only the latest problem on top of the ORIGINAL prompt;
            # stacking every earlier hint would keep asking the model to fix
            # issues it has already fixed.
            prompt = (f"{user}\n\n上次輸出存在問題:{hint}\n"
                      f"請修正後重新輸出。")
        if last_err is not None:
            raise last_err
        if last_out is None:
            raise RuntimeError("chat_json_with_retry made no attempt")
        return last_out
def extract_parties(client: OllamaClient, context: str) -> dict:
    """Ask the model for the plaintiff / defendant lists.

    The prompt is the few-shot example followed by at most the first 3000
    characters of ``context``, wrapped in a code fence.  No validator is
    used; the schema alone constrains the answer.

    Args:
        client: Client used to reach the model.
        context: Keyword-recalled excerpt of the judgment.

    Returns:
        The parsed model output as returned by ``chat_json_with_retry``.
    """
    excerpt = context[:3000]
    prompt = "".join([
        PARTIES_FEWSHOT,
        "\n\n請從以下片段(多處關鍵詞召回拼接)抽取:\n```\n",
        excerpt,
        "\n```",
    ])
    return client.chat_json_with_retry(PARTIES_SYSTEM, prompt, PARTIES_SCHEMA)
case_object(標的物): + - 訴訟請求指向的實體權利或利益 + - 例:人身傷害賠償、合同履行、房產所有權、精神困擾賠償 + - 合併本質相同的標的 + - 嚴禁:證據材料、程序性訴求(如"要求法庭裁決") + +只輸出 JSON。""" + +REASON_OBJECT_FEWSHOT = """範例輸出: +{"case_reason":"申索人為商場保安員,就被告於2023年7月在商場毆打申索人造成的人身傷害向被告提出損害賠償申索。","case_object":["人身傷害賠償","醫療費用","精神困擾賠償"]}""" + + +def _reason_object_validator(out: dict) -> tuple[bool, str]: + r = out.get("case_reason", "") + if len(r) > 100: + return False, f"case_reason 共 {len(r)} 字,超過 100 字上限,請壓縮到 80 字以內。" + if not out.get("case_object"): + return False, "case_object 不能為空。" + return True, "" + + +def extract_reason_object(client: OllamaClient, context: str) -> dict: + user = (f"{REASON_OBJECT_FEWSHOT}\n\n" + f"請從以下片段(多處關鍵詞召回拼接)抽取:\n```\n{context[:6000]}\n```") + out = client.chat_json_with_retry(REASON_OBJECT_SYSTEM, user, + REASON_OBJECT_SCHEMA, + validator=_reason_object_validator) + if len(out["case_reason"]) > 100: + out["case_reason"] = out["case_reason"][:100] + return out + + +# --- Call 3: 判決結果 ------------------------------------------------------- + +JUDGMENT_RESULT_SCHEMA = { + "type": "object", + "properties": { + "judgment_result": { + "type": "array", + "items": { + "type": "object", + "properties": { + "charge": {"type": "string"}, + "result": {"type": "string"}, + }, + "required": ["charge", "result"], + }, + } + }, + "required": ["judgment_result"], +} + +JUDGMENT_RESULT_SYSTEM = """從香港判決書命令/裁定部分抽取所有判決結果。 + +拆分原則: +- 多項請求 → 分條 +- "責任判定" 與 "損失/金額計算" 兩個層面 → 必須分條 +- 每條 charge 必須以 "(責任問題)" 或 "(損失範圍)" 結尾標註層次 +- result 必須包含: + a) 明確結果(勝訴/敗訴/部分勝訴/維持/撤銷/駁回等) + b) 2-3 個關鍵法庭理由 + c) 具體金額、利率或命令內容(如有) + +只輸出 JSON。""" + +JUDGMENT_RESULT_FEWSHOT = """範例輸出: +{"judgment_result":[ + {"charge":"申索人就毆打事件的人身傷害索償 (責任問題)","result":"勝訴。法庭接納申索人證供可信,閉路電視顯示被告先動手,被告亦承認部分情節。"}, + {"charge":"醫療費及精神困擾賠償金額 (損失範圍)","result":"部分勝訴。判給醫療費HK$8,500及一般損害賠償HK$20,000,合共HK$28,500,連同利息及訟費。"} +]}""" + + +def _judgment_validator(out: dict) -> tuple[bool, str]: + items = out.get("judgment_result", []) + if not items: + return False, 
"judgment_result 不能為空。" + bad = [i for i in items + if "責任問題" not in i.get("charge", "") + and "損失範圍" not in i.get("charge", "")] + if bad: + return False, (f"有 {len(bad)} 條 charge 未標註層次。" + f"每條 charge 必須以 '(責任問題)' 或 '(損失範圍)' 結尾。") + return True, "" + + +def extract_judgment_result(client: OllamaClient, context: str) -> dict: + user = (f"{JUDGMENT_RESULT_FEWSHOT}\n\n" + f"請從以下片段(多處關鍵詞召回拼接)抽取:\n```\n{context[:6500]}\n```") + return client.chat_json_with_retry(JUDGMENT_RESULT_SYSTEM, user, + JUDGMENT_RESULT_SCHEMA, + validator=_judgment_validator) + + +# --- Call 4: 涉及實體 ------------------------------------------------------- + +ENTITIES_SCHEMA = { + "type": "object", + "properties": { + "involved_entities": { + "type": "array", + "items": { + "type": "object", + "properties": { + "entity_name": {"type": "string"}, + "reason": {"type": "string"}, + }, + "required": ["entity_name", "reason"], + }, + } + }, + "required": ["involved_entities"], +} + +ENTITIES_SYSTEM = """從香港判決書中抽取所有相關實體(自然人/法人/組織/機構)。 + +必須包含: +- 主審法官 / 審裁官 +- 雙方代表律師、大律師 +- 判決中引用的先例所提及的法官 + reason 須寫明:在XX案[案號]中擔任XX職位,闡述XX法律原則 +- 涉案的政府部門、公司、機構 + +嚴禁包含: +- 法案/條例名(如《侵權條例》、Cap.xxx) +- 純案例名稱(如 Donoghue v Stevenson) +- 文獻、期刊名 + +只輸出 JSON。""" + +ENTITIES_FEWSHOT = """範例輸出: +{"involved_entities":[ + {"entity_name":"林希維審裁官","reason":"本案主審審裁官,負責認定事實及裁決。"}, + {"entity_name":"終審法院常任法官李義","reason":"在Tang Kwok Wah v HKSAR [2019] HKCFA 23 中擔任主筆法官,闡述舉證責任原則,本案第34段引用其判詞。"}, + {"entity_name":"康樂文化事務署","reason":"涉案場所通州街公園的管理機構。"} +]}""" + + +def _entities_validator(out: dict) -> tuple[bool, str]: + ents = out.get("involved_entities", []) + if not ents: + return False, "involved_entities 不能為空,至少要有主審法官。" + blacklist = ["條例", "Cap.", "法案"] + bad = [e["entity_name"] for e in ents + if any(k in e.get("entity_name", "") for k in blacklist)] + if bad: + return False, f"以下實體疑為條例/法案,應移除:{bad}" + return True, "" + + +def extract_entities(client: OllamaClient, context: str) -> dict: + user = (f"{ENTITIES_FEWSHOT}\n\n" + 
f"請從以下片段(多處關鍵詞召回拼接)抽取所有涉及實體:\n" + f"```\n{context[:6500]}\n```") + return client.chat_json_with_retry(ENTITIES_SYSTEM, user, + ENTITIES_SCHEMA, + validator=_entities_validator) + + +# --- Call 5: 判決總結(基於已抽取結果 + 分析段,不從原文重生) ----------- + +SUMMARY_SCHEMA = { + "type": "object", + "properties": { + "judgment_summary": {"type": "string", "maxLength": 300}, + }, + "required": ["judgment_summary"], +} + +SUMMARY_SYSTEM = """根據已抽取的結構化字段 + 法庭分析段,撰寫判決總結。 + +四要素結構(必須全部涵蓋,連貫成單段): +(1) 案件背景:1-2 句交代起因與當事人關係 +(2) 核心爭議焦點 +(3) 法庭法律分析與推理(核心重點): + - 如何評估證據? + - 接受 / 拒絕主張的邏輯? + - 引用了哪些關鍵法律或判例? +(4) 最終裁決結果及命令 + +嚴格 ≤300 字。只輸出 JSON。""" + + +def _summary_validator(out: dict) -> tuple[bool, str]: + s = out.get("judgment_summary", "") + if len(s) > 300: + return False, f"summary 共 {len(s)} 字,超過 300 字上限,請壓縮。" + if len(s) < 80: + return False, "summary 過短,請完整覆蓋四要素。" + return True, "" + + +def extract_summary(client: OllamaClient, + prior: dict, analysis: str) -> dict: + user = f"""已抽取的字段: +```json +{json.dumps(prior, ensure_ascii=False, indent=2)} +``` + +法庭分析節選: +``` +{analysis[:3500]} +``` + +請按四要素撰寫 ≤300 字的 judgment_summary。""" + out = client.chat_json_with_retry(SUMMARY_SYSTEM, user, SUMMARY_SCHEMA, + validator=_summary_validator) + if len(out["judgment_summary"]) > 300: + out["judgment_summary"] = out["judgment_summary"][:300] + return out + + +# ============================================================================= +# 4. 
# Substrings that mark a "location" as a court, venue or building rather than
# a geographic region.
LOCATION_BLACKLIST = [
    "法院", "法庭", "審裁處", "公園", "大廈", "大樓", "商場",
    "街", "道路", "村", "中心", "醫院", "酒店", "車站",
]
# Substrings that mark an "entity" as an ordinance, bill or publication.
ENTITY_NAME_BLACKLIST = ["條例", "Cap.", "法案", "案例彙編", "Reports",
                         "期刊", "Journal"]


def validate_and_fix(result: dict) -> tuple[dict, list[str]]:
    """Apply final rule-based clean-up to the assembled record.

    The function modifies ``result`` in place and also returns it.

    Clean-up steps:
      * ``case_location``: drop entries containing a ``LOCATION_BLACKLIST``
        term and make sure "香港特別行政區" is present;
      * ``case_reason`` / ``judgment_summary``: hard-truncate to 100 / 300
        characters;
      * ``involved_entities``: drop entries whose name contains an
        ``ENTITY_NAME_BLACKLIST`` term;
      * ``judgment_result``: warn about charges lacking a level label;
      * warn about every field that ended up empty.

    Args:
        result: The assembled extraction record.

    Returns:
        ``(result, warnings)`` where ``warnings`` lists every change made and
        every field that needs manual review.
    """
    warnings: list[str] = []

    # case_location: remove courts / venues / buildings.
    locs = result.get("case_location") or []
    cleaned = [loc for loc in locs
               if loc and not any(b in loc for b in LOCATION_BLACKLIST)]
    removed = [loc for loc in locs if loc not in cleaned]
    if removed:
        warnings.append(f"case_location 已清理:移除 {removed}")
    if "香港特別行政區" not in cleaned:
        # Reported separately: inserting the default removes nothing, so it
        # must not be logged as a removal of an empty set.
        cleaned.insert(0, "香港特別行政區")
        warnings.append("case_location 已補上預設值 香港特別行政區")
    result["case_location"] = cleaned

    # Hard length limits.  "or ''" also covers keys that are present but None.
    reason = result.get("case_reason") or ""
    if len(reason) > 100:
        warnings.append("case_reason > 100 字,已截斷")
        result["case_reason"] = reason[:100]
    summary = result.get("judgment_summary") or ""
    if len(summary) > 300:
        warnings.append("judgment_summary > 300 字,已截斷")
        result["judgment_summary"] = summary[:300]

    # involved_entities: remove ordinances / publications.
    ents = result.get("involved_entities") or []
    cleaned_ents = [e for e in ents
                    if not any(k in e.get("entity_name", "")
                               for k in ENTITY_NAME_BLACKLIST)]
    if len(cleaned_ents) != len(ents):
        warnings.append(
            f"involved_entities 移除 {len(ents) - len(cleaned_ents)} 條疑似條例/文獻")
    result["involved_entities"] = cleaned_ents

    # judgment_result: flag charges without a level label (not auto-fixed).
    for jr in result.get("judgment_result") or []:
        charge = jr.get("charge", "")
        if "責任問題" not in charge and "損失範圍" not in charge:
            warnings.append(
                f"judgment_result 條目缺層次標註:{charge[:40]}")

    # Empty-field warnings, now including the two free-text fields.
    for k in ("plaintiff", "defendant", "case_reason", "case_object",
              "judgment_result", "judgment_summary", "involved_entities"):
        if not result.get(k):
            warnings.append(f"{k} 為空,請人工複核")

    return result, warnings
主管線 +# ============================================================================= + +def run_pipeline(text: str, model: str) -> dict: + log = lambda m: print(m, file=sys.stderr) + + log("[1/6] 預處理 + 關鍵詞召回...") + text = clean_text(text) + meta = extract_metadata_by_rule(text) + ctx = gather_all(text) + + log(f" 規則元數據:{meta}") + log(f" 召回片段:") + for g in ("parties", "reason_object", "judgment_result", + "entities", "analysis"): + log(f" {g:16s} len={len(ctx[g]):5d} hits={ctx[f'_{g}_hits']}") + + client = OllamaClient(model=model) + + log("[2/6] 抽取當事人...") + parties = extract_parties(client, ctx["parties"]) + + log("[3/6] 抽取事由與標的...") + reason_obj = extract_reason_object(client, ctx["reason_object"]) + + log("[4/6] 抽取判決結果...") + judgment = extract_judgment_result(client, ctx["judgment_result"]) + + log("[5/6] 抽取涉及實體...") + # 實體抽取上下文:當事人片段(含律師名)+ 引用片段 + entities_ctx = (ctx["parties"][:2500] + "\n\n[…]\n\n" + + ctx["entities"])[:6500] + entities = extract_entities(client, entities_ctx) + + interim_for_summary = { + **parties, **reason_obj, **judgment, **entities, + "jurisdiction_name": meta["jurisdiction_name"], + } + + log("[6/6] 撰寫判決總結...") + summary = extract_summary(client, interim_for_summary, ctx["analysis"]) + + final = { + "plaintiff": parties["plaintiff"], + "defendant": parties["defendant"], + "jurisdiction_code": meta["jurisdiction_code"], + "jurisdiction_name": meta["jurisdiction_name"], + "case_location": meta["case_location"], + "case_reason": reason_obj["case_reason"], + "case_object": reason_obj["case_object"], + "judgment_result": judgment["judgment_result"], + "judgment_summary": summary["judgment_summary"], + "involved_entities": entities["involved_entities"], + } + + final, warnings = validate_and_fix(final) + for w in warnings: + log(f" ⚠️ {w}") + + return final + + +# ============================================================================= +# 6. 
def to_yaml(result: dict) -> str:
    """Serialise the extraction record to YAML.

    ``case_reason`` and ``judgment_summary`` are emitted in folded (``>``)
    style by wrapping them in ``FoldedStr``.  The wrapping is done on a
    shallow copy so the caller's dict keeps its plain ``str`` values; key
    order is preserved because ``sort_keys`` is disabled.

    Args:
        result: The extraction record.

    Returns:
        The YAML document as a string.
    """
    doc = dict(result)  # shallow copy: do not mutate the caller's record
    for key in ("case_reason", "judgment_summary"):
        if doc.get(key):
            doc[key] = FoldedStr(doc[key])
    return yaml.dump(doc, allow_unicode=True, sort_keys=False,
                     default_flow_style=False, width=100)
print(yaml_str) + + +if __name__ == "__main__": + main() diff --git a/main.py b/main.py new file mode 100644 index 0000000..ab04354 --- /dev/null +++ b/main.py @@ -0,0 +1,6 @@ +def main(): + print("Hello from hklii-samples!") + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..84a479e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,10 @@ +[project] +name = "hklii-samples" +version = "0.1.0" +description = "Add your description here" +readme = "README.md" +requires-python = ">=3.13" +dependencies = [ + "pyyaml>=6.0.3", + "requests>=2.34.0", +] diff --git a/uv.lock b/uv.lock new file mode 100644 index 0000000..29b8b82 --- /dev/null +++ b/uv.lock @@ -0,0 +1,153 @@ +version = 1 +revision = 2 +requires-python = ">=3.13" + +[[package]] +name = "certifi" +version = "2026.4.22" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/25/ee/6caf7a40c36a1220410afe15a1cc64993a1f864871f698c0f93acb72842a/certifi-2026.4.22.tar.gz", hash = "sha256:8d455352a37b71bf76a79caa83a3d6c25afee4a385d632127b6afb3963f1c580", size = 137077, upload-time = "2026-04-22T11:26:11.191Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl", hash = "sha256:3cb2210c8f88ba2318d29b0388d1023c8492ff72ecdde4ebdaddbb13a31b1c4a", size = 135707, upload-time = "2026-04-22T11:26:09.372Z" }, +] + +[[package]] +name = "charset-normalizer" +version = "3.4.7" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e7/a1/67fe25fac3c7642725500a3f6cfe5821ad557c3abb11c9d20d12c7008d3e/charset_normalizer-3.4.7.tar.gz", hash = "sha256:ae89db9e5f98a11a4bf50407d4363e7b09b31e55bc117b4f7d80aab97ba009e5", size = 144271, upload-time = "2026-04-02T09:28:39.342Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/c1/3b/66777e39d3ae1ddc77ee606be4ec6d8cbd4c801f65e5a1b6f2b11b8346dd/charset_normalizer-3.4.7-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:f496c9c3cc02230093d8330875c4c3cdfc3b73612a5fd921c65d39cbcef08063", size = 309627, upload-time = "2026-04-02T09:26:45.198Z" }, + { url = "https://files.pythonhosted.org/packages/2e/4e/b7f84e617b4854ade48a1b7915c8ccfadeba444d2a18c291f696e37f0d3b/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0ea948db76d31190bf08bd371623927ee1339d5f2a0b4b1b4a4439a65298703c", size = 207008, upload-time = "2026-04-02T09:26:46.824Z" }, + { url = "https://files.pythonhosted.org/packages/c4/bb/ec73c0257c9e11b268f018f068f5d00aa0ef8c8b09f7753ebd5f2880e248/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a277ab8928b9f299723bc1a2dabb1265911b1a76341f90a510368ca44ad9ab66", size = 228303, upload-time = "2026-04-02T09:26:48.397Z" }, + { url = "https://files.pythonhosted.org/packages/85/fb/32d1f5033484494619f701e719429c69b766bfc4dbc61aa9e9c8c166528b/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3bec022aec2c514d9cf199522a802bd007cd588ab17ab2525f20f9c34d067c18", size = 224282, upload-time = "2026-04-02T09:26:49.684Z" }, + { url = "https://files.pythonhosted.org/packages/fa/07/330e3a0dda4c404d6da83b327270906e9654a24f6c546dc886a0eb0ffb23/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e044c39e41b92c845bc815e5ae4230804e8e7bc29e399b0437d64222d92809dd", size = 215595, upload-time = "2026-04-02T09:26:50.915Z" }, + { url = "https://files.pythonhosted.org/packages/e3/7c/fc890655786e423f02556e0216d4b8c6bcb6bdfa890160dc66bf52dee468/charset_normalizer-3.4.7-cp313-cp313-manylinux_2_31_armv7l.whl", hash = 
"sha256:f495a1652cf3fbab2eb0639776dad966c2fb874d79d87ca07f9d5f059b8bd215", size = 201986, upload-time = "2026-04-02T09:26:52.197Z" }, + { url = "https://files.pythonhosted.org/packages/d8/97/bfb18b3db2aed3b90cf54dc292ad79fdd5ad65c4eae454099475cbeadd0d/charset_normalizer-3.4.7-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e712b419df8ba5e42b226c510472b37bd57b38e897d3eca5e8cfd410a29fa859", size = 211711, upload-time = "2026-04-02T09:26:53.49Z" }, + { url = "https://files.pythonhosted.org/packages/6f/a5/a581c13798546a7fd557c82614a5c65a13df2157e9ad6373166d2a3e645d/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7804338df6fcc08105c7745f1502ba68d900f45fd770d5bdd5288ddccb8a42d8", size = 210036, upload-time = "2026-04-02T09:26:54.975Z" }, + { url = "https://files.pythonhosted.org/packages/8c/bf/b3ab5bcb478e4193d517644b0fb2bf5497fbceeaa7a1bc0f4d5b50953861/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:481551899c856c704d58119b5025793fa6730adda3571971af568f66d2424bb5", size = 202998, upload-time = "2026-04-02T09:26:56.303Z" }, + { url = "https://files.pythonhosted.org/packages/e7/4e/23efd79b65d314fa320ec6017b4b5834d5c12a58ba4610aa353af2e2f577/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f59099f9b66f0d7145115e6f80dd8b1d847176df89b234a5a6b3f00437aa0832", size = 230056, upload-time = "2026-04-02T09:26:57.554Z" }, + { url = "https://files.pythonhosted.org/packages/b9/9f/1e1941bc3f0e01df116e68dc37a55c4d249df5e6fa77f008841aef68264f/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:f59ad4c0e8f6bba240a9bb85504faa1ab438237199d4cce5f622761507b8f6a6", size = 211537, upload-time = "2026-04-02T09:26:58.843Z" }, + { url = "https://files.pythonhosted.org/packages/80/0f/088cbb3020d44428964a6c97fe1edfb1b9550396bf6d278330281e8b709c/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_s390x.whl", hash = 
"sha256:3dedcc22d73ec993f42055eff4fcfed9318d1eeb9a6606c55892a26964964e48", size = 226176, upload-time = "2026-04-02T09:27:00.437Z" }, + { url = "https://files.pythonhosted.org/packages/6a/9f/130394f9bbe06f4f63e22641d32fc9b202b7e251c9aef4db044324dac493/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:64f02c6841d7d83f832cd97ccf8eb8a906d06eb95d5276069175c696b024b60a", size = 217723, upload-time = "2026-04-02T09:27:02.021Z" }, + { url = "https://files.pythonhosted.org/packages/73/55/c469897448a06e49f8fa03f6caae97074fde823f432a98f979cc42b90e69/charset_normalizer-3.4.7-cp313-cp313-win32.whl", hash = "sha256:4042d5c8f957e15221d423ba781e85d553722fc4113f523f2feb7b188cc34c5e", size = 148085, upload-time = "2026-04-02T09:27:03.192Z" }, + { url = "https://files.pythonhosted.org/packages/5d/78/1b74c5bbb3f99b77a1715c91b3e0b5bdb6fe302d95ace4f5b1bec37b0167/charset_normalizer-3.4.7-cp313-cp313-win_amd64.whl", hash = "sha256:3946fa46a0cf3e4c8cb1cc52f56bb536310d34f25f01ca9b6c16afa767dab110", size = 158819, upload-time = "2026-04-02T09:27:04.454Z" }, + { url = "https://files.pythonhosted.org/packages/68/86/46bd42279d323deb8687c4a5a811fd548cb7d1de10cf6535d099877a9a9f/charset_normalizer-3.4.7-cp313-cp313-win_arm64.whl", hash = "sha256:80d04837f55fc81da168b98de4f4b797ef007fc8a79ab71c6ec9bc4dd662b15b", size = 147915, upload-time = "2026-04-02T09:27:05.971Z" }, + { url = "https://files.pythonhosted.org/packages/97/c8/c67cb8c70e19ef1960b97b22ed2a1567711de46c4ddf19799923adc836c2/charset_normalizer-3.4.7-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:c36c333c39be2dbca264d7803333c896ab8fa7d4d6f0ab7edb7dfd7aea6e98c0", size = 309234, upload-time = "2026-04-02T09:27:07.194Z" }, + { url = "https://files.pythonhosted.org/packages/99/85/c091fdee33f20de70d6c8b522743b6f831a2f1cd3ff86de4c6a827c48a76/charset_normalizer-3.4.7-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:1c2aed2e5e41f24ea8ef1590b8e848a79b56f3a5564a65ceec43c9d692dc7d8a", size = 208042, upload-time = "2026-04-02T09:27:08.749Z" }, + { url = "https://files.pythonhosted.org/packages/87/1c/ab2ce611b984d2fd5d86a5a8a19c1ae26acac6bad967da4967562c75114d/charset_normalizer-3.4.7-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:54523e136b8948060c0fa0bc7b1b50c32c186f2fceee897a495406bb6e311d2b", size = 228706, upload-time = "2026-04-02T09:27:09.951Z" }, + { url = "https://files.pythonhosted.org/packages/a8/29/2b1d2cb00bf085f59d29eb773ce58ec2d325430f8c216804a0a5cd83cbca/charset_normalizer-3.4.7-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:715479b9a2802ecac752a3b0efa2b0b60285cf962ee38414211abdfccc233b41", size = 224727, upload-time = "2026-04-02T09:27:11.175Z" }, + { url = "https://files.pythonhosted.org/packages/47/5c/032c2d5a07fe4d4855fea851209cca2b6f03ebeb6d4e3afdb3358386a684/charset_normalizer-3.4.7-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bd6c2a1c7573c64738d716488d2cdd3c00e340e4835707d8fdb8dc1a66ef164e", size = 215882, upload-time = "2026-04-02T09:27:12.446Z" }, + { url = "https://files.pythonhosted.org/packages/2c/c2/356065d5a8b78ed04499cae5f339f091946a6a74f91e03476c33f0ab7100/charset_normalizer-3.4.7-cp314-cp314-manylinux_2_31_armv7l.whl", hash = "sha256:c45e9440fb78f8ddabcf714b68f936737a121355bf59f3907f4e17721b9d1aae", size = 200860, upload-time = "2026-04-02T09:27:13.721Z" }, + { url = "https://files.pythonhosted.org/packages/0c/cd/a32a84217ced5039f53b29f460962abb2d4420def55afabe45b1c3c7483d/charset_normalizer-3.4.7-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:3534e7dcbdcf757da6b85a0bbf5b6868786d5982dd959b065e65481644817a18", size = 211564, upload-time = "2026-04-02T09:27:15.272Z" }, + { url = 
"https://files.pythonhosted.org/packages/44/86/58e6f13ce26cc3b8f4a36b94a0f22ae2f00a72534520f4ae6857c4b81f89/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:e8ac484bf18ce6975760921bb6148041faa8fef0547200386ea0b52b5d27bf7b", size = 211276, upload-time = "2026-04-02T09:27:16.834Z" }, + { url = "https://files.pythonhosted.org/packages/8f/fe/d17c32dc72e17e155e06883efa84514ca375f8a528ba2546bee73fc4df81/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:a5fe03b42827c13cdccd08e6c0247b6a6d4b5e3cdc53fd1749f5896adcdc2356", size = 201238, upload-time = "2026-04-02T09:27:18.229Z" }, + { url = "https://files.pythonhosted.org/packages/6a/29/f33daa50b06525a237451cdb6c69da366c381a3dadcd833fa5676bc468b3/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:2d6eb928e13016cea4f1f21d1e10c1cebd5a421bc57ddf5b1142ae3f86824fab", size = 230189, upload-time = "2026-04-02T09:27:19.445Z" }, + { url = "https://files.pythonhosted.org/packages/b6/6e/52c84015394a6a0bdcd435210a7e944c5f94ea1055f5cc5d56c5fe368e7b/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:e74327fb75de8986940def6e8dee4f127cc9752bee7355bb323cc5b2659b6d46", size = 211352, upload-time = "2026-04-02T09:27:20.79Z" }, + { url = "https://files.pythonhosted.org/packages/8c/d7/4353be581b373033fb9198bf1da3cf8f09c1082561e8e922aa7b39bf9fe8/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:d6038d37043bced98a66e68d3aa2b6a35505dc01328cd65217cefe82f25def44", size = 227024, upload-time = "2026-04-02T09:27:22.063Z" }, + { url = "https://files.pythonhosted.org/packages/30/45/99d18aa925bd1740098ccd3060e238e21115fffbfdcb8f3ece837d0ace6c/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:7579e913a5339fb8fa133f6bbcfd8e6749696206cf05acdbdca71a1b436d8e72", size = 217869, upload-time = "2026-04-02T09:27:23.486Z" }, + { url = 
"https://files.pythonhosted.org/packages/5c/05/5ee478aa53f4bb7996482153d4bfe1b89e0f087f0ab6b294fcf92d595873/charset_normalizer-3.4.7-cp314-cp314-win32.whl", hash = "sha256:5b77459df20e08151cd6f8b9ef8ef1f961ef73d85c21a555c7eed5b79410ec10", size = 148541, upload-time = "2026-04-02T09:27:25.146Z" }, + { url = "https://files.pythonhosted.org/packages/48/77/72dcb0921b2ce86420b2d79d454c7022bf5be40202a2a07906b9f2a35c97/charset_normalizer-3.4.7-cp314-cp314-win_amd64.whl", hash = "sha256:92a0a01ead5e668468e952e4238cccd7c537364eb7d851ab144ab6627dbbe12f", size = 159634, upload-time = "2026-04-02T09:27:26.642Z" }, + { url = "https://files.pythonhosted.org/packages/c6/a3/c2369911cd72f02386e4e340770f6e158c7980267da16af8f668217abaa0/charset_normalizer-3.4.7-cp314-cp314-win_arm64.whl", hash = "sha256:67f6279d125ca0046a7fd386d01b311c6363844deac3e5b069b514ba3e63c246", size = 148384, upload-time = "2026-04-02T09:27:28.271Z" }, + { url = "https://files.pythonhosted.org/packages/94/09/7e8a7f73d24dba1f0035fbbf014d2c36828fc1bf9c88f84093e57d315935/charset_normalizer-3.4.7-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:effc3f449787117233702311a1b7d8f59cba9ced946ba727bdc329ec69028e24", size = 330133, upload-time = "2026-04-02T09:27:29.474Z" }, + { url = "https://files.pythonhosted.org/packages/8d/da/96975ddb11f8e977f706f45cddd8540fd8242f71ecdb5d18a80723dcf62c/charset_normalizer-3.4.7-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fbccdc05410c9ee21bbf16a35f4c1d16123dcdeb8a1d38f33654fa21d0234f79", size = 216257, upload-time = "2026-04-02T09:27:30.793Z" }, + { url = "https://files.pythonhosted.org/packages/e5/e8/1d63bf8ef2d388e95c64b2098f45f84758f6d102a087552da1485912637b/charset_normalizer-3.4.7-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:733784b6d6def852c814bce5f318d25da2ee65dd4839a0718641c696e09a2960", size = 234851, upload-time = "2026-04-02T09:27:32.44Z" }, + { url = 
"https://files.pythonhosted.org/packages/9b/40/e5ff04233e70da2681fa43969ad6f66ca5611d7e669be0246c4c7aaf6dc8/charset_normalizer-3.4.7-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a89c23ef8d2c6b27fd200a42aa4ac72786e7c60d40efdc76e6011260b6e949c4", size = 233393, upload-time = "2026-04-02T09:27:34.03Z" }, + { url = "https://files.pythonhosted.org/packages/be/c1/06c6c49d5a5450f76899992f1ee40b41d076aee9279b49cf9974d2f313d5/charset_normalizer-3.4.7-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6c114670c45346afedc0d947faf3c7f701051d2518b943679c8ff88befe14f8e", size = 223251, upload-time = "2026-04-02T09:27:35.369Z" }, + { url = "https://files.pythonhosted.org/packages/2b/9f/f2ff16fb050946169e3e1f82134d107e5d4ae72647ec8a1b1446c148480f/charset_normalizer-3.4.7-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:a180c5e59792af262bf263b21a3c49353f25945d8d9f70628e73de370d55e1e1", size = 206609, upload-time = "2026-04-02T09:27:36.661Z" }, + { url = "https://files.pythonhosted.org/packages/69/d5/a527c0cd8d64d2eab7459784fb4169a0ac76e5a6fc5237337982fd61347e/charset_normalizer-3.4.7-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:3c9a494bc5ec77d43cea229c4f6db1e4d8fe7e1bbffa8b6f0f0032430ff8ab44", size = 220014, upload-time = "2026-04-02T09:27:38.019Z" }, + { url = "https://files.pythonhosted.org/packages/7e/80/8a7b8104a3e203074dc9aa2c613d4b726c0e136bad1cc734594b02867972/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8d828b6667a32a728a1ad1d93957cdf37489c57b97ae6c4de2860fa749b8fc1e", size = 218979, upload-time = "2026-04-02T09:27:39.37Z" }, + { url = "https://files.pythonhosted.org/packages/02/9a/b759b503d507f375b2b5c153e4d2ee0a75aa215b7f2489cf314f4541f2c0/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:cf1493cd8607bec4d8a7b9b004e699fcf8f9103a9284cc94962cb73d20f9d4a3", size = 209238, 
upload-time = "2026-04-02T09:27:40.722Z" }, + { url = "https://files.pythonhosted.org/packages/c2/4e/0f3f5d47b86bdb79256e7290b26ac847a2832d9a4033f7eb2cd4bcf4bb5b/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:0c96c3b819b5c3e9e165495db84d41914d6894d55181d2d108cc1a69bfc9cce0", size = 236110, upload-time = "2026-04-02T09:27:42.33Z" }, + { url = "https://files.pythonhosted.org/packages/96/23/bce28734eb3ed2c91dcf93abeb8a5cf393a7b2749725030bb630e554fdd8/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:752a45dc4a6934060b3b0dab47e04edc3326575f82be64bc4fc293914566503e", size = 219824, upload-time = "2026-04-02T09:27:43.924Z" }, + { url = "https://files.pythonhosted.org/packages/2c/6f/6e897c6984cc4d41af319b077f2f600fc8214eb2fe2d6bcb79141b882400/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:8778f0c7a52e56f75d12dae53ae320fae900a8b9b4164b981b9c5ce059cd1fcb", size = 233103, upload-time = "2026-04-02T09:27:45.348Z" }, + { url = "https://files.pythonhosted.org/packages/76/22/ef7bd0fe480a0ae9b656189ec00744b60933f68b4f42a7bb06589f6f576a/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:ce3412fbe1e31eb81ea42f4169ed94861c56e643189e1e75f0041f3fe7020abe", size = 225194, upload-time = "2026-04-02T09:27:46.706Z" }, + { url = "https://files.pythonhosted.org/packages/c5/a7/0e0ab3e0b5bc1219bd80a6a0d4d72ca74d9250cb2382b7c699c147e06017/charset_normalizer-3.4.7-cp314-cp314t-win32.whl", hash = "sha256:c03a41a8784091e67a39648f70c5f97b5b6a37f216896d44d2cdcb82615339a0", size = 159827, upload-time = "2026-04-02T09:27:48.053Z" }, + { url = "https://files.pythonhosted.org/packages/7a/1d/29d32e0fb40864b1f878c7f5a0b343ae676c6e2b271a2d55cc3a152391da/charset_normalizer-3.4.7-cp314-cp314t-win_amd64.whl", hash = "sha256:03853ed82eeebbce3c2abfdbc98c96dc205f32a79627688ac9a27370ea61a49c", size = 174168, upload-time = "2026-04-02T09:27:49.795Z" }, + { url = 
"https://files.pythonhosted.org/packages/de/32/d92444ad05c7a6e41fb2036749777c163baf7a0301a040cb672d6b2b1ae9/charset_normalizer-3.4.7-cp314-cp314t-win_arm64.whl", hash = "sha256:c35abb8bfff0185efac5878da64c45dafd2b37fb0383add1be155a763c1f083d", size = 153018, upload-time = "2026-04-02T09:27:51.116Z" }, + { url = "https://files.pythonhosted.org/packages/db/8f/61959034484a4a7c527811f4721e75d02d653a35afb0b6054474d8185d4c/charset_normalizer-3.4.7-py3-none-any.whl", hash = "sha256:3dce51d0f5e7951f8bb4900c257dad282f49190fdbebecd4ba99bcc41fef404d", size = 61958, upload-time = "2026-04-02T09:28:37.794Z" }, +] + +[[package]] +name = "hklii-samples" +version = "0.1.0" +source = { virtual = "." } +dependencies = [ + { name = "pyyaml" }, + { name = "requests" }, +] + +[package.metadata] +requires-dist = [ + { name = "pyyaml", specifier = ">=6.0.3" }, + { name = "requests", specifier = ">=2.34.0" }, +] + +[[package]] +name = "idna" +version = "3.15" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/82/77/7b3966d0b9d1d31a36ddf1746926a11dface89a83409bf1483f0237aa758/idna-3.15.tar.gz", hash = "sha256:ca962446ea538f7092a95e057da437618e886f4d349216d2b1e294abfdb65fdc", size = 199245, upload-time = "2026-05-12T22:45:57.011Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d2/23/408243171aa9aaba178d3e2559159c24c1171a641aa83b67bdd3394ead8e/idna-3.15-py3-none-any.whl", hash = "sha256:048adeaf8c2d788c40fee287673ccaa74c24ffd8dcf09ffa555a2fbb59f10ac8", size = 72340, upload-time = "2026-05-12T22:45:55.733Z" }, +] + +[[package]] +name = "pyyaml" +version = "6.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/d1/11/0fd08f8192109f7169db964b5707a2f1e8b745d4e239b784a5a1dd80d1db/pyyaml-6.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8", size = 181669, upload-time = "2025-09-25T21:32:23.673Z" }, + { url = "https://files.pythonhosted.org/packages/b1/16/95309993f1d3748cd644e02e38b75d50cbc0d9561d21f390a76242ce073f/pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1", size = 173252, upload-time = "2025-09-25T21:32:25.149Z" }, + { url = "https://files.pythonhosted.org/packages/50/31/b20f376d3f810b9b2371e72ef5adb33879b25edb7a6d072cb7ca0c486398/pyyaml-6.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c", size = 767081, upload-time = "2025-09-25T21:32:26.575Z" }, + { url = "https://files.pythonhosted.org/packages/49/1e/a55ca81e949270d5d4432fbbd19dfea5321eda7c41a849d443dc92fd1ff7/pyyaml-6.0.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a33284e20b78bd4a18c8c2282d549d10bc8408a2a7ff57653c0cf0b9be0afce5", size = 841159, upload-time = "2025-09-25T21:32:27.727Z" }, + { url = "https://files.pythonhosted.org/packages/74/27/e5b8f34d02d9995b80abcef563ea1f8b56d20134d8f4e5e81733b1feceb2/pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6", size = 801626, upload-time = "2025-09-25T21:32:28.878Z" }, + { url = "https://files.pythonhosted.org/packages/f9/11/ba845c23988798f40e52ba45f34849aa8a1f2d4af4b798588010792ebad6/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f7057c9a337546edc7973c0d3ba84ddcdf0daa14533c2065749c9075001090e6", size = 753613, upload-time = "2025-09-25T21:32:30.178Z" }, + { url = 
"https://files.pythonhosted.org/packages/3d/e0/7966e1a7bfc0a45bf0a7fb6b98ea03fc9b8d84fa7f2229e9659680b69ee3/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eda16858a3cab07b80edaf74336ece1f986ba330fdb8ee0d6c0d68fe82bc96be", size = 794115, upload-time = "2025-09-25T21:32:31.353Z" }, + { url = "https://files.pythonhosted.org/packages/de/94/980b50a6531b3019e45ddeada0626d45fa85cbe22300844a7983285bed3b/pyyaml-6.0.3-cp313-cp313-win32.whl", hash = "sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26", size = 137427, upload-time = "2025-09-25T21:32:32.58Z" }, + { url = "https://files.pythonhosted.org/packages/97/c9/39d5b874e8b28845e4ec2202b5da735d0199dbe5b8fb85f91398814a9a46/pyyaml-6.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c", size = 154090, upload-time = "2025-09-25T21:32:33.659Z" }, + { url = "https://files.pythonhosted.org/packages/73/e8/2bdf3ca2090f68bb3d75b44da7bbc71843b19c9f2b9cb9b0f4ab7a5a4329/pyyaml-6.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb", size = 140246, upload-time = "2025-09-25T21:32:34.663Z" }, + { url = "https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size = 181814, upload-time = "2025-09-25T21:32:35.712Z" }, + { url = "https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size = 173809, upload-time = "2025-09-25T21:32:36.789Z" }, + { url = 
"https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size = 766454, upload-time = "2025-09-25T21:32:37.966Z" }, + { url = "https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size = 836355, upload-time = "2025-09-25T21:32:39.178Z" }, + { url = "https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size = 794175, upload-time = "2025-09-25T21:32:40.865Z" }, + { url = "https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size = 755228, upload-time = "2025-09-25T21:32:42.084Z" }, + { url = "https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size = 789194, upload-time = "2025-09-25T21:32:43.362Z" }, + { url = "https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size = 156429, upload-time = "2025-09-25T21:32:57.844Z" }, + { url = 
"https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size = 143912, upload-time = "2025-09-25T21:32:59.247Z" }, + { url = "https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size = 189108, upload-time = "2025-09-25T21:32:44.377Z" }, + { url = "https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size = 183641, upload-time = "2025-09-25T21:32:45.407Z" }, + { url = "https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size = 831901, upload-time = "2025-09-25T21:32:48.83Z" }, + { url = "https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size = 861132, upload-time = "2025-09-25T21:32:50.149Z" }, + { url = "https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size = 839261, upload-time = "2025-09-25T21:32:51.808Z" }, + { url = 
"https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size = 805272, upload-time = "2025-09-25T21:32:52.941Z" }, + { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time = "2025-09-25T21:32:54.537Z" }, + { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" }, + { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, +] + +[[package]] +name = "requests" +version = "2.34.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "charset-normalizer" }, + { name = "idna" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/43/b8/7a707d60fea4c49094e40262cc0e2ca6c768cca21587e34d3f705afec47e/requests-2.34.0.tar.gz", hash = "sha256:7d62fe92f50eb82c529b0916bb445afa1531a566fc8f35ffdc64446e771b856a", size = 142436, upload-time = "2026-05-11T19:29:51.717Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/e6/e300fce5fe83c30520607a015dabd985df3251e188d234bfe9492e17a389/requests-2.34.0-py3-none-any.whl", hash = "sha256:917520a21b767485ce7c588f4ebb917c436b24a31231b44228715eaeb5a52c60", size = 
73021, upload-time = "2026-05-11T19:29:49.923Z" }, +] + +[[package]] +name = "urllib3" +version = "2.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/53/0c/06f8b233b8fd13b9e5ee11424ef85419ba0d8ba0b3138bf360be2ff56953/urllib3-2.7.0.tar.gz", hash = "sha256:231e0ec3b63ceb14667c67be60f2f2c40a518cb38b03af60abc813da26505f4c", size = 433602, upload-time = "2026-05-07T16:13:18.596Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7f/3e/5db95bcf282c52709639744ca2a8b149baccf648e39c8cc87553df9eae0c/urllib3-2.7.0-py3-none-any.whl", hash = "sha256:9fb4c81ebbb1ce9531cce37674bbc6f1360472bc18ca9a553ede278ef7276897", size = 131087, upload-time = "2026-05-07T16:13:17.151Z" }, +] diff --git a/案件信息提取思路.md b/案件信息提取思路.md new file mode 100644 index 0000000..6822482 --- /dev/null +++ b/案件信息提取思路.md @@ -0,0 +1,169 @@ +我需要从一个案件的content里面提取我想要的案件信息,案件内容比较长,可长达几万个字不等。但用的是本地ollama的小模型,如何在这些有限条件下提高信息提取的准确度? + +# Extraction Rules (字段提取规则) + +## 一、 主体与管辖信息 +1. `plaintiff` (原告/申索人/上诉人/覆核申请人) [数组] + - 提取所有原告方当事人的完整姓名或机构名(含中英文)。若无则输出空数组 `[]`。 +2. `defendant` (被告/答辩人/被上诉人) [数组] + - 提取所有被告的完整姓名或机构名。若无则输出空数组 `[]`。 +3. `jurisdiction_code` (司法区域代码) [字符串] + - 提取官方代码,如:HKSCT, HKCFA, HKCA, HKCFI 等。 +4. `jurisdiction_name` (司法区域名称) [字符串] + - 完整的法院或审裁处名称(如:香港特別行政區終審法院)。 +5. `case_location` (案件地点) [数组] + - 提取与案件直接相关的**地理位置**。 + - ✅ **必须包含**:司法管辖区(如 "香港特別行政區")。 + - ✅ **可包含**:行政区域、城市、国家(如 "九龍", "Luxor City, Egypt")。 + - ❌ **严禁提取**:法院名称(如"小額錢債審裁處")、具体建筑物或场所名称(如"通州街公園"、"某某大厦")。若原文无明确地点,仅输出司法管辖区。 + +## 二、 核心案件要素 +6. `case_reason` (事由) [字符串] + - **字数强制限制**:绝对不可超过 100 字,必须浓缩为单句。 + - **内容公式**:`[原告身份] + [针对什么事件/行为] + [向谁] + [提出什么请求/诉讼]`。 + - ⚠️ **覆核/上诉案件特例**:需指明对哪个裁决(含日期/案号)提出覆核、核心理由及法律依据。 + - ❌ **严禁包含**:判决结果、法庭分析、详细案发经过。 +7. `case_object` (案件标的物) [数组] + - 提取诉讼请求指向的**实体权利或利益**(如:私人財物損失賠償、精神困擾賠償、合同履行、房产所有权)。合并本质相同的标的物。 + - ❌ **严禁提取**:证据材料(如单据、证人证词)、程序性诉求(如"要求法庭裁决"、"追究责任")。 + +## 三、 裁判与实体分析 +8.
`judgment_result` (判决结果) [对象数组] + - **拆分原则**:若案件涉及多项请求,或包含“责任判定”与“损失/金额计算”两个层面,**必须分条列出**。 + - 子字段 `charge` [字符串]:具体的诉讼请求或覆核理由(请在括号内标注是“责任问题”还是“损失范围”)。 + - 子字段 `result` [字符串]:明确的裁决结果(胜诉/败诉/部分胜诉/维持/撤销等),并简述 2-3 个关键法庭理由及具体金额/命令。 +9. `judgment_summary` (判决总结) [字符串] + - **字数强制限制**:不得超过 300 字。 + - **结构四要素**: + (1) 案件背景(1-2句话交代起因与当事人关系); + (2) 核心争议焦点; + (3) **法庭法律分析与推理(核心重点)**:法庭如何评估证据?接受/拒绝主张的逻辑是什么?引用了哪些关键法律或判例? + (4) 最终裁决结果及命令。 + - ⚠️ 必须使用 YAML 多行字符串语法(`>`)。 +10. `involved_entities` (涉及实体) [对象数组] + - 仅提取与案件相关的**自然人、法人、组织、机构**。 + - 子字段 `entity_name` [字符串]:实体全称(如 "林希維審裁官", "康樂文化事務署")。 + - 子字段 `reason` [字符串]:说明其在案件中的角色与作用。 + - ⚠️ **法官提取特例**:除主审法官外,**必须提取法庭引用的先例中所提及的法官**(reason 需写明:在XX案[案号]中担任XX职位,阐述XX法律原则及引用的段落)。 + - ❌ **严禁提取**:法案名、条例名、法律条文、纯案例名称、文献期刊名。 + +--- + +# Strict YAML Format Constraints (严格格式约束) +- 遇到包含冒号 `:`、井号 `#`、连字符 `-` 的字符串时,**必须**使用双引号 `""` 将其包裹,防止 YAML 解析错误。 +- 对于长文本(`case_reason`, `judgment_summary`),请使用 `>` 折叠换行符。 + +# Output Template +请严格按照以下格式输出结果,不要输出任何多余的解释: + +```yaml +plaintiff: + - "" +defendant: + - "" +jurisdiction_code: "" +jurisdiction_name: "" +case_location: + - "" +case_reason: > + [此处填入符合公式、限 100 字内的事由] +case_object: + - "" +judgment_result: + - charge: "[请求描述] (责任问题/损失范围)" + result: "[裁定结果及理由简述]" +judgment_summary: > + [此处填入包含四要素的判决总结,限 300 字内] +involved_entities: + - entity_name: "" + reason: "" +``` + +----------案件信息提取思路-------------- + +针对你的场景(本地小模型 + 数万字长文本 + 严格结构化抽取),单次喂入全文 + 一次性输出所有字段几乎注定失败。提升准确度的关键不在"换更好的提示词",而在**重构整个抽取管线**。以下是按优先级排列的策略: + +## 一、核心思路:放弃"一次抽取所有字段" + +小模型在长上下文里会严重"注意力稀释",必须把任务拆小。建议改成 **预处理 → 定位 → 分组抽取 → 校验** 四阶段管线。 + +## 二、预处理:先用规则把文本"压缩"和"切片" + +在送入模型前,用纯代码做这些事,能让有效输入缩短 70% 以上: + +- **去噪**:删除页眉页脚、页码、空行、重复的法院抬头、英文与中文重复段落(HK 判决书常见双语,可只保留中文版本,整段省一半)。 +- **结构化切段**:判决书有强格式特征,用正则切分出:`【案件编号】`、`【当事人】`、`【判决理由】`、`【判决结果】`、`【命令】` 等区块,做成 `dict[section_name, text]`。 +- **元数据用规则直接抽**:以下字段根本不需要 LLM: + - `jurisdiction_code` / `jurisdiction_name`:正则匹配 `HKSCT|HKCFA|...` 或法院全称。 + - `case_location`:从法院名映射(HKSCT → 香港特別行政區)。 + -
案件编号、日期、案号引用:正则。 +- 规则能覆盖的字段准确率 ≈ 100%,不要浪费在模型上。 + +## 三、定位-再抽取(Locate-then-Extract) + +每个字段都有"高信号区域",先定位再喂局部: + +| 字段 | 高信号区域 | +|---|---| +| `plaintiff` / `defendant` | 文档前 500–1000 字(标头部分) | +| `case_reason` | 首段 + "申索"/"訴因"/"本案涉及" 关键词附近段落 | +| `case_object` | "申索金额"、"索償"、"诉讼请求" 附近 | +| `judgment_result` | 文末 1000–2000 字("本席命令"、"判決"、"裁定") | +| `involved_entities`(先例法官) | 含"案[" / "v." / "[20XX]" 引用案例的句子 | +| `judgment_summary` | 全文,但可用提取出的其他字段拼接 | + +实现:用关键词或 BM25/embedding 检索筛出每个字段的 Top-K 段落(一般 3-5 段,2000 字以内),只把这部分喂给模型。 + +## 四、分组分次调用,而不是一次性全部输出 + +小模型一次吐 10 个字段一定会崩。把抽取分成 4-5 次独立调用: + +1. **Call 1(主体)**:plaintiff, defendant, jurisdiction(用文档前部) +2. **Call 2(事由+标的)**:case_reason, case_object(用首段 + 申索段) +3. **Call 3(判决结果)**:judgment_result(用文末判决段) +4. **Call 4(涉及实体)**:involved_entities(用先例引用段) +5. **Call 5(总结)**:judgment_summary(把前 4 步结果拼回去喂模型生成,而不是从原文重新生成) + +第 5 步特别关键——总结的输入是已抽取的结构化数据,而不是几万字原文,质量会跳一档。 + +## 五、用 Ollama 的强制结构化输出 + +Ollama 原生支持 `format: "json"`(或 `format` 传 JSON Schema),强制模型只输出合法 JSON。**先输出 JSON,再用代码转 YAML**,比让小模型直接写 YAML 稳定得多——小模型在 YAML 缩进、`>` 折叠、`""` 转义上极易出错。 + +每个 Call 用对应的子 schema 约束,例如 Call 1 的 schema 只包含 4 个字段,模型几乎无法跑偏。 + +## 六、Few-shot 示例(每个 Call 1-2 个就够) + +给一个完整正例 + 一个反例("❌ 不要这样做"),效果远胜纯规则描述。例如对 `case_location`: + +``` +✅ 正确:["香港特別行政區", "九龍"] +❌ 错误:["小額錢債審裁處", "通州街公園"] ← 这是法院名和场所名,不是地点 +``` + +小模型不擅长抽象规则,但能很好地模仿示例。 + +## 七、后处理校验 + 失败重试 + +每个 Call 出来后用代码校验: + +- **格式校验**:JSON 解析、字段类型、数组非空规则。 +- **业务校验**:`case_reason` 字数 ≤ 100;`judgment_summary` ≤ 300;`case_location` 不含法院关键词(用黑名单 `["法院", "法庭", "審裁處"]`);`involved_entities` 不含法案名(黑名单 `["條例", "法案", "Cap."]`)。 +- **失败时**:把校验错误信息回灌给模型重试一次("上次输出超过 100 字,请精简到 80 字以内"),通常一次重试就能修正。 + +## 八、可选增强 + +- **温度调到 0**(`temperature: 0`),抽取任务不要随机性。 +- **量化精度**:如果用的是 Q4 量化,换 Q5_K_M 或 Q6_K,结构化抽取对量化精度敏感,提升明显。 +- **模型选型**:同尺寸下,Qwen2.5、GLM4 系列在中文抽取任务上明显强于 Llama 系;7B 量级的 Qwen2.5-7B-Instruct 在你这类任务上是甜点。 +- **自一致性**:对关键字段跑 2-3 次取众数,能压一部分波动(代价是延迟翻倍)。 + +## 九、推荐的最小可行落地顺序 + +如果只让你做三件事,按这个顺序做收益最大: + +1.
**正则抽元数据 + 文档分段**(一天工作量,准确率拉满)。 +2. **改成分 4-5 次调用 + JSON 强制输出**(半天,崩溃率断崖式下降)。 +3. **每字段加 Few-shot + 校验重试**(一天,长尾错误大幅减少)。 + +做完这三步,小模型也能在长判决书上跑出可用准确率。剩下的 prompt 微调和模型选型是锦上添花。 \ No newline at end of file