From bc2b9826d5ad6b4c0b6637f844bc265ba777d0c0 Mon Sep 17 00:00:00 2001 From: fengruixiang <474182370@qq.com> Date: Thu, 28 May 2026 12:28:39 +0800 Subject: [PATCH 1/2] prompt for professional legal terminology and enforce monetary subject extraction Add a standard-legal-terminology instruction to the reason/object, judgment-result, entities, and summary prompts (ZH + EN). Require case_object to carry specific monetary amounts, reinforced with a few-shot example and a context-aware validator that retries when the source mentions a sum but no amount was extracted. Co-Authored-By: Claude Opus 4.7 --- hk_case_extractor.py | 69 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 60 insertions(+), 9 deletions(-) diff --git a/hk_case_extractor.py b/hk_case_extractor.py index ca5e71e..cb68fac 100644 --- a/hk_case_extractor.py +++ b/hk_case_extractor.py @@ -760,13 +760,19 @@ REASON_OBJECT_SYSTEM_ZH = f"""從香港判決書中抽取: 2. case_object(標的物): - 訴訟請求指向的實體權利或利益 - 例:汽車、黃金、珠寶、不動產產權、撫養權、人身傷害賠償、合同履行、房產所有權、精神困擾賠償、居留權 + - 涉及金錢標的時必須提取,並標明幣種與具體金額(如「拖欠貨款 HK$850,000」「索償 HK$1,000,000」「拖欠租金 HK$120,000」);金額未定或待評定者註明「金額待評定」 - 合併本質相同的標的 - 嚴禁:證據材料、程序性訴求(如"要求法庭裁決")、法律條文名稱 +用語要求:一律使用規範的法律專業用語(如「申索」「損害賠償」「違約」「侵權」「衡平法濟助」),避免口語化或不準確的表述。 + 只輸出 JSON。""" -REASON_OBJECT_FEWSHOT_ZH = """範例輸出: -{"case_reason":"申索人為商場保安員,就被告於2023年7月在商場毆打申索人造成的人身傷害向被告提出損害賠償申索。","case_object":["人身傷害賠償","醫療費用","精神困擾賠償"]}""" +REASON_OBJECT_FEWSHOT_ZH = """範例輸出1(人身傷害): +{"case_reason":"申索人為商場保安員,就被告於2023年7月在商場毆打申索人造成的人身傷害向被告提出損害賠償申索。","case_object":["人身傷害賠償","醫療費用","精神困擾賠償"]} + +範例輸出2(金錢申索,標的物須含具體金額): +{"case_reason":"原告就被告未支付2022年買賣合約項下的貨款,向被告提出追討欠款的申索。","case_object":["拖欠貨款 HK$850,000","合約利息","訟費"]}""" # 英文提示詞 REASON_OBJECT_SYSTEM_EN = f"""Extract from Hong Kong judgment: @@ -781,16 +787,37 @@ REASON_OBJECT_SYSTEM_EN = f"""Extract from Hong Kong judgment: 2. case_object (Subject Matter): - Tangible rights or interests targeted by the claim - Examples: vehicle, gold, jewelry, property title, custody, personal injury damages, contract performance, property ownership, distress damages, right of abode + - When a monetary subject matter is involved, it MUST be extracted with currency and the specific amount (e.g., "outstanding goods price HK$850,000", "claim of HK$1,000,000", "arrears of rent HK$120,000"); if the amount is unascertained, note "amount to be assessed" - Merge essentially identical subjects - MUST NOT include: evidentiary materials, procedural requests (e.g., "seeking court ruling"), names of statutes +Terminology requirement: consistently use precise, standard legal terminology (e.g., "claim", "damages", "breach of contract", "negligence", "equitable relief"); avoid colloquial or imprecise wording. + Output only JSON.""" -REASON_OBJECT_FEWSHOT_EN = """Example Output: -{"case_reason":"Plaintiff, a shopping mall security guard, claims damages from defendant for personal injuries caused by defendant's assault at the mall in July 2023.","case_object":["personal injury damages","medical expenses","distress damages"]}""" +REASON_OBJECT_FEWSHOT_EN = """Example Output 1 (personal injury): +{"case_reason":"Plaintiff, a shopping mall security guard, claims damages from defendant for personal injuries caused by defendant's assault at the mall in July 2023.","case_object":["personal injury damages","medical expenses","distress damages"]} + +Example Output 2 (monetary claim, subject matter must carry the specific amount): +{"case_reason":"Plaintiff claims against defendant for non-payment of the price of goods supplied under a 2022 sale and purchase contract.","case_object":["outstanding goods price HK$850,000","contractual interest","costs"]}""" -def _reason_object_validator(out: dict, lang: str = 'zh') -> tuple[bool, str]: +# 金錢數額識別:幣種前綴 + 數字,或數字 + 中文金額單位 +MONEY_RE = re.compile( + r"(?:HK\$|US\$|RMB|MOP|港幣|港元|人民幣|美元|美金)\s*[\d,]+(?:\.\d+)?" + r"|[\$$]\s*[\d,]+(?:\.\d+)?" + r"|[\d,]+(?:\.\d+)?\s*(?:萬元|億元|元|萬|億)", + re.I, +) + + +def _object_has_amount(objs: list[str]) -> bool: + """case_object 中是否已含具體金額(任一項出現數字即視為已提取金額)""" + return any(re.search(r"\d", o or "") for o in (objs or [])) + + +def _reason_object_validator(out: dict, lang: str = 'zh', + context: str = "") -> tuple[bool, str]: r = out.get("case_reason", "") max_length = 100 if lang == 'zh' else 200 # 英文允許 2 倍字符數 target_length = 80 if lang == 'zh' else 160 # 建議壓縮目標 @@ -818,7 +845,19 @@ def _reason_object_validator(out: dict, lang: str = 'zh') -> tuple[bool, str]: return False, f"case_reason 不應包含判決結果詞彙「{keyword}」,請只描述訴訟起因和請求。" else: return False, f"case_reason should not contain judgment result term '{keyword}', describe only cause and relief sought." - + + # 金錢標的物強制提取:原文出現金錢數額但 case_object 未含金額時要求補充 + if context and MONEY_RE.search(context) and not _object_has_amount(out.get("case_object", [])): + if lang == 'zh': + return False, ("原文出現金錢數額。若該數額屬於訴訟標的(如欠款、索償、賠償金額)," + "必須在 case_object 中提取並標明幣種與具體金額(如「拖欠貨款 HK$850,000」);" + "若僅為無關引用則可忽略。") + else: + return False, ("Monetary amounts appear in the source. If an amount forms part of the " + "subject matter (e.g., debt, claim, damages), it MUST be extracted in " + "case_object with currency and the specific figure (e.g., \"outstanding " + "goods price HK$850,000\"); ignore only if it is an unrelated citation.") + return True, "" @@ -837,7 +876,7 @@ def extract_reason_object(client: OpenAICompatibleClient, context: str, lang: st return client.chat_json_with_retry(system, user, schema, - validator=lambda x: _reason_object_validator(x, lang)) + validator=lambda x: _reason_object_validator(x, lang, context[:5000])) # --- Call 3: 判決結果 ------------------------------------------------------- @@ -875,7 +914,9 @@ JUDGMENT_RESULT_SYSTEM_ZH = """從香港判決書尾部的命令/裁定部分抽 - result 必須包含: a) 明確結果(勝訴/敗訴/部分勝訴/維持/撤銷/駁回/發還等) b) 2-3 個關鍵法庭理由(如有) - c) 具體金額、利率或命令內容(如有) + c) 具體金額、利率或命令內容;凡有判給/命令支付的金錢數額,必須原文照錄幣種與金額(如 HK$28,500),不得省略或約化 + +用語要求:一律使用規範的法律專業用語(如「判給」「訟費」「利息」「駁回」「發還重審」),避免口語化或不準確的表述。 只輸出 JSON。""" @@ -900,7 +941,9 @@ Splitting Principles: - result must include: a) Clear outcome (allowed/dismissed/partially allowed/upheld/quashed/remitted, etc.) b) 2-3 key court reasons (if any) - c) Specific amounts, interest rates or order details (if any) + c) Specific amounts, interest rates or order details; whenever a sum is awarded/ordered to be paid, the currency and figure MUST be reproduced verbatim (e.g., HK$28,500), never omitted or rounded + +Terminology requirement: consistently use precise, standard legal terminology (e.g., "awarded", "costs", "interest", "dismissed", "remitted for retrial"); avoid colloquial or imprecise wording. Output only JSON.""" @@ -998,6 +1041,8 @@ ENTITIES_SYSTEM_ZH = """從香港判決書中抽取所有相關實體(自然 - 純案例名稱(如 Donoghue v Stevenson) - 文獻、期刊名 +用語要求:reason 一律使用規範的法律專業用語(如「主審」「闡述」「先例」「判詞」),避免口語化或不準確的表述。 + 只輸出 JSON。""" ENTITIES_FEWSHOT_ZH = """範例輸出: @@ -1022,6 +1067,8 @@ MUST NOT include: - Pure case names (e.g., Donoghue v Stevenson) - Literature, journal names +Terminology requirement: write each reason in precise, standard legal terminology (e.g., "presiding", "articulated", "precedent", "judgment"); avoid colloquial or imprecise wording. + Output only JSON.""" ENTITIES_FEWSHOT_EN = """Example Output: @@ -1106,6 +1153,8 @@ SUMMARY_SYSTEM_ZH = """根據已抽取的結構化字段 + 法庭分析段,撰 **重要:judgment_summary 必須使用中文撰寫。** +用語要求:一律使用規範的法律專業用語;涉及金錢的判給或標的,須保留具體金額(含幣種)。 + 嚴格 ≤300 字。只輸出 JSON。""" # 英文提示詞 @@ -1122,6 +1171,8 @@ Four-element structure (must cover all, in coherent single paragraph): **IMPORTANT: judgment_summary MUST be written in English.** +Terminology requirement: consistently use precise, standard legal terminology; preserve specific monetary figures (with currency) for any award or subject matter involving money. + Strictly ≤500 characters. Output only JSON.""" From abcb2103f1fb3405a866266b561cda321aa5ccd7 Mon Sep 17 00:00:00 2001 From: fengruixiang <474182370@qq.com> Date: Thu, 28 May 2026 15:16:04 +0800 Subject: [PATCH 2/2] add multi-model concurrent extraction with per-model output files --config and --model now accept comma-separated lists to run several models on one document. Runs execute concurrently via a thread pool (--max-workers controls parallelism; 1 forces sequential), with logs prefixed by the config/model label. Each model writes to its own YAML/cost/debug files by inserting a sanitized label into the --out name (or the input name when --out is omitted), and cost is computed per-model from its own profile. Single-model behavior, including stdout output, is unchanged. Co-Authored-By: Claude Opus 4.7 --- hk_case_extractor.py | 237 +++++++++++++++++++++++++++++++++---------- 1 file changed, 183 insertions(+), 54 deletions(-) diff --git a/hk_case_extractor.py b/hk_case_extractor.py index cb68fac..f5c6218 100644 --- a/hk_case_extractor.py +++ b/hk_case_extractor.py @@ -68,6 +68,7 @@ import json import re import sys import time +from concurrent.futures import ThreadPoolExecutor, as_completed from dataclasses import dataclass, field from pathlib import Path from typing import Any @@ -1319,8 +1320,9 @@ def run_pipeline(text: str, model: str, base_url: str, api_key: str, entities_window: int = 400, entities_max: int = 6500, analysis_window: int = 500, - analysis_max: int = 6500) -> tuple[dict, OpenAICompatibleClient]: - log = lambda m: print(m, file=sys.stderr) + analysis_max: int = 6500, + log_prefix: str = "") -> tuple[dict, OpenAICompatibleClient]: + log = lambda m: print(f"{log_prefix}{m}", file=sys.stderr) log("[0/7] 檢測語言...") lang = detect_language(text) @@ -1467,6 +1469,93 @@ def to_yaml(result: dict) -> str: default_flow_style=False, width=100) +# ============================================================================= +# 7. 多模型運行:解析運行規格 + 輸出路徑推導 +# ============================================================================= + +@dataclass +class RunSpec: + """單個模型的運行規格(標籤 + 連接參數 + 計費配置)""" + label: str # 用於日誌前綴與輸出文件名(配置名或模型名) + model: str + base_url: str + api_key: str + profile: dict | None = None # models.json 配置(用於成本計算),無則 None + + +def _safe_label(name: str) -> str: + """把配置/模型名轉成可安全用於文件名的標籤(如 anthropic/claude → anthropic_claude)""" + safe = re.sub(r"[^0-9A-Za-z._-]+", "_", name).strip("_") + return safe or "model" + + +def parse_run_specs(args) -> list[RunSpec]: + """解析 --config / --model(均支持逗號分隔)為一組運行規格。 + + 優先級: + - 有 --config:逐個從 models.json 加載配置(自帶 model/base_url/api_key/價格) + - 否則:用 --model(可逗號分隔多個),共用 --base-url / --api-key + """ + specs: list[RunSpec] = [] + if args.config: + names = [n.strip() for n in args.config.split(",") if n.strip()] + for name in names: + profile = load_model_profile(name, args.models_file) + specs.append(RunSpec( + label=name, + model=profile.get("model") or args.model, + base_url=profile.get("BaseApiUrl") or args.base_url, + api_key=profile.get("ApiKey") or args.api_key, + profile=profile, + )) + else: + names = [n.strip() for n in args.model.split(",") if n.strip()] or [args.model] + for name in names: + specs.append(RunSpec( + label=name, + model=name, + base_url=args.base_url, + api_key=args.api_key, + profile=None, + )) + return specs + + +def build_out_path(args, input_path: Path, label: str, multi: bool) -> Path | None: + """推導某個模型的 YAML 輸出路徑。 + + - 單模型:沿用原行為(--out 指定則用之,否則 None 表示輸出到 stdout) + - 多模型:在文件名中插入標籤;未給 --out 時用「輸入名_標籤.yaml」 + """ + if not multi: + return Path(args.out) if args.out else None + safe = _safe_label(label) + if args.out: + base = Path(args.out) + return base.with_name(f"{base.stem}_{safe}{base.suffix or '.yaml'}") + return input_path.with_name(f"{input_path.stem}_{safe}.yaml") + + +def build_cost_path(out_path: Path | None, input_path: Path, + label: str, multi: bool) -> Path: + """成本文件路徑:有輸出文件時用「輸出名_cost.json」,否則回退到輸入名。""" + if out_path is not None: + return out_path.with_name(out_path.stem + "_cost.json") + if multi: + return input_path.with_name(f"{input_path.stem}_{_safe_label(label)}_cost.json") + return input_path.with_name(input_path.stem + "_cost.json") + + +def build_debug_path(args, label: str, multi: bool) -> Path | None: + """debug-dump 路徑:多模型時在文件名中插入標籤。""" + if not args.debug_dump: + return None + base = Path(args.debug_dump) + if multi: + return base.with_name(f"{base.stem}_{_safe_label(label)}{base.suffix or '.json'}") + return base + + # ============================================================================= # CLI # ============================================================================= @@ -1495,6 +1584,17 @@ def main() -> None: --model gpt-4 \\ --api-key your-api-key + # 同時跑多個模型(逗號分隔),並發執行,分別輸出到不同文件 + python hk_case_extractor.py case.txt \\ + --config openrouter-claude-sonnet,openrouter-gpt4o,ollama-qwen \\ + --out result.yaml --cost + # 生成 result_openrouter-claude-sonnet.yaml / result_openrouter-gpt4o.yaml / ... + # 及各自的 *_cost.json + + # 多個本地 Ollama 模型共用同一端點 + python hk_case_extractor.py case.txt \\ + --model qwen2.5:7b-instruct,llama3.1:8b --out result.yaml + # 調整截取長度 python hk_case_extractor.py case.txt \\ --head-length 8000 \\ @@ -1506,21 +1606,24 @@ def main() -> None: ) ap.add_argument("input", help="判決書文本路徑(.txt 或 .json)") ap.add_argument("--config", default=None, - help="models.json 中的配置名稱(Name)," + help="models.json 中的配置名稱(Name),可逗號分隔多個以同時運行多個模型," "使用後可省略 --model/--base-url/--api-key") ap.add_argument("--models-file", default=DEFAULT_MODELS_FILE, help=f"模型配置文件路徑(默認:{DEFAULT_MODELS_FILE})") ap.add_argument("--model", default=DEFAULT_MODEL, - help=f"模型名稱(默認:{DEFAULT_MODEL})") + help=f"模型名稱(默認:{DEFAULT_MODEL}),可逗號分隔多個(共用 --base-url/--api-key)") ap.add_argument("--base-url", default=DEFAULT_BASE_URL, help=f"API base URL(默認:{DEFAULT_BASE_URL})") ap.add_argument("--api-key", default=DEFAULT_API_KEY, help="API key(Ollama 可忽略)") - ap.add_argument("--out", default=None, help="輸出 YAML 路徑(默認 stdout)") + ap.add_argument("--out", default=None, + help="輸出 YAML 路徑(單模型默認 stdout);多模型時在文件名中插入標籤") + ap.add_argument("--max-workers", type=int, default=0, + help="多模型時的並發數(默認 0 = 模型數量;設為 1 則順序執行)") ap.add_argument("--cost", action="store_true", help="輸出成本統計到 {輸出文件名}_cost.json(默認不輸出)") ap.add_argument("--debug-dump", default=None, - help="額外輸出原始 JSON 結果到該路徑(便於 diff)") + help="額外輸出原始 JSON 結果到該路徑(多模型時在文件名中插入標籤)") # 截取長度控制參數 ap.add_argument("--head-length", type=int, default=5000, @@ -1538,24 +1641,17 @@ def main() -> None: args = ap.parse_args() - # 解析模型配置:--config 優先,未命中的字段回退到命令行/默認值 - profile: dict | None = None - if args.config: - profile = load_model_profile(args.config, args.models_file) - model = profile.get("model") or args.model - base_url = profile.get("BaseApiUrl") or args.base_url - api_key = profile.get("ApiKey") or args.api_key - print(f"使用配置 '{args.config}':model={model}, base_url={base_url}", - file=sys.stderr) - else: - model = args.model - base_url = args.base_url - api_key = args.api_key + # 解析運行規格(--config / --model 均支持逗號分隔多個) + specs = parse_run_specs(args) + multi = len(specs) > 1 + for spec in specs: + if spec.profile is not None: + print(f"使用配置 '{spec.label}':model={spec.model}, base_url={spec.base_url}", + file=sys.stderr) # 支持从 .json 文件的 content 字段读取 input_path = Path(args.input) if input_path.suffix.lower() == '.json': - import json data = json.loads(input_path.read_text(encoding="utf-8")) text = data.get("content", "") if not text: @@ -1563,44 +1659,77 @@ def main() -> None: sys.exit(1) else: text = input_path.read_text(encoding="utf-8") - - start = time.perf_counter() - result, client = run_pipeline(text, model, base_url, api_key, - args.head_length, args.tail_length, - args.entities_window, args.entities_max, - args.analysis_window, args.analysis_max) - elapsed = time.perf_counter() - start - # 成本統計:僅在 --cost 時輸出到 {summary_file_name}_cost.json - if args.cost: - cost = compute_cost(client, elapsed, profile) - if args.out: - cost_path = Path(args.out).with_name(Path(args.out).stem + "_cost.json") - else: - cost_path = input_path.with_name(input_path.stem + "_cost.json") - cost_path.parent.mkdir(parents=True, exist_ok=True) - cost_path.write_text(json.dumps(cost, ensure_ascii=False, indent=2), - encoding="utf-8") - print(f"💰 成本統計已寫入 {cost_path}:耗時 {cost['elapsed_seconds']}s," - f"input={cost['input_tokens']} output={cost['output_tokens']} " - f"total_cost={cost['total_cost']} {cost['price_unit'] or ''}", - file=sys.stderr) + def run_and_emit(spec: RunSpec) -> str: + """跑單個模型並寫出其 YAML / cost / debug 文件,返回輸出路徑描述。""" + log_prefix = f"[{spec.label}] " if multi else "" + start = time.perf_counter() + result, client = run_pipeline( + text, spec.model, spec.base_url, spec.api_key, + args.head_length, args.tail_length, + args.entities_window, args.entities_max, + args.analysis_window, args.analysis_max, + log_prefix=log_prefix) + elapsed = time.perf_counter() - start - if args.debug_dump: - debug_path = Path(args.debug_dump) - debug_path.parent.mkdir(parents=True, exist_ok=True) - debug_path.write_text( - json.dumps(result, ensure_ascii=False, indent=2), - encoding="utf-8") + out_path = build_out_path(args, input_path, spec.label, multi) - yaml_str = to_yaml(result) - if args.out: - out_path = Path(args.out) - out_path.parent.mkdir(parents=True, exist_ok=True) - out_path.write_text(yaml_str, encoding="utf-8") - print(f"\n✅ 已寫入 {args.out}", file=sys.stderr) - else: + # 成本統計:僅在 --cost 時輸出到 {輸出文件名}_cost.json + if args.cost: + cost = compute_cost(client, elapsed, spec.profile) + cost_path = build_cost_path(out_path, input_path, spec.label, multi) + cost_path.parent.mkdir(parents=True, exist_ok=True) + cost_path.write_text(json.dumps(cost, ensure_ascii=False, indent=2), + encoding="utf-8") + print(f"{log_prefix}💰 成本統計已寫入 {cost_path}:耗時 {cost['elapsed_seconds']}s," + f"input={cost['input_tokens']} output={cost['output_tokens']} " + f"total_cost={cost['total_cost']} {cost['price_unit'] or ''}", + file=sys.stderr) + + # debug dump(原始 JSON) + debug_path = build_debug_path(args, spec.label, multi) + if debug_path is not None: + debug_path.parent.mkdir(parents=True, exist_ok=True) + debug_path.write_text(json.dumps(result, ensure_ascii=False, indent=2), + encoding="utf-8") + + # YAML 輸出 + yaml_str = to_yaml(result) + if out_path is not None: + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(yaml_str, encoding="utf-8") + print(f"{log_prefix}✅ 已寫入 {out_path}", file=sys.stderr) + return str(out_path) print(yaml_str) + return "(stdout)" + + if multi and args.max_workers != 1: + # 並發執行:每個模型一個線程,日誌以 [標籤] 前綴區分 + workers = args.max_workers if args.max_workers > 0 else len(specs) + print(f"⏳ 同時運行 {len(specs)} 個模型(並發 {workers}):" + f"{[s.label for s in specs]}", file=sys.stderr) + summary: list[tuple[str, str]] = [] + with ThreadPoolExecutor(max_workers=workers) as ex: + futures = {ex.submit(run_and_emit, spec): spec for spec in specs} + for fut in as_completed(futures): + spec = futures[fut] + try: + summary.append((spec.label, fut.result())) + except Exception as e: + print(f"[{spec.label}] ❌ 運行失敗:{e}", file=sys.stderr) + summary.append((spec.label, f"FAILED: {e}")) + print("\n=== 多模型運行結果 ===", file=sys.stderr) + for label, out in sorted(summary): + print(f" {label:30s} → {out}", file=sys.stderr) + else: + # 單模型,或多模型但顯式 --max-workers 1 順序執行 + for spec in specs: + try: + run_and_emit(spec) + except Exception as e: + if not multi: + raise + print(f"[{spec.label}] ❌ 運行失敗:{e}", file=sys.stderr) if __name__ == "__main__":