From bc2b9826d5ad6b4c0b6637f844bc265ba777d0c0 Mon Sep 17 00:00:00 2001 From: fengruixiang <474182370@qq.com> Date: Thu, 28 May 2026 12:28:39 +0800 Subject: [PATCH] prompt for professional legal terminology and enforce monetary subject extraction Add a standard-legal-terminology instruction to the reason/object, judgment-result, entities, and summary prompts (ZH + EN). Require case_object to carry specific monetary amounts, reinforced with a few-shot example and a context-aware validator that retries when the source mentions a sum but no amount was extracted. Co-Authored-By: Claude Opus 4.7 --- hk_case_extractor.py | 69 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 60 insertions(+), 9 deletions(-) diff --git a/hk_case_extractor.py b/hk_case_extractor.py index ca5e71e..cb68fac 100644 --- a/hk_case_extractor.py +++ b/hk_case_extractor.py @@ -760,13 +760,19 @@ REASON_OBJECT_SYSTEM_ZH = f"""從香港判決書中抽取: 2. case_object(標的物): - 訴訟請求指向的實體權利或利益 - 例:汽車、黃金、珠寶、不動產產權、撫養權、人身傷害賠償、合同履行、房產所有權、精神困擾賠償、居留權 + - 涉及金錢標的時必須提取,並標明幣種與具體金額(如「拖欠貨款 HK$850,000」「索償 HK$1,000,000」「拖欠租金 HK$120,000」);金額未定或待評定者註明「金額待評定」 - 合併本質相同的標的 - 嚴禁:證據材料、程序性訴求(如"要求法庭裁決")、法律條文名稱 +用語要求:一律使用規範的法律專業用語(如「申索」「損害賠償」「違約」「侵權」「衡平法濟助」),避免口語化或不準確的表述。 + 只輸出 JSON。""" -REASON_OBJECT_FEWSHOT_ZH = """範例輸出: -{"case_reason":"申索人為商場保安員,就被告於2023年7月在商場毆打申索人造成的人身傷害向被告提出損害賠償申索。","case_object":["人身傷害賠償","醫療費用","精神困擾賠償"]}""" +REASON_OBJECT_FEWSHOT_ZH = """範例輸出1(人身傷害): +{"case_reason":"申索人為商場保安員,就被告於2023年7月在商場毆打申索人造成的人身傷害向被告提出損害賠償申索。","case_object":["人身傷害賠償","醫療費用","精神困擾賠償"]} + +範例輸出2(金錢申索,標的物須含具體金額): +{"case_reason":"原告就被告未支付2022年買賣合約項下的貨款,向被告提出追討欠款的申索。","case_object":["拖欠貨款 HK$850,000","合約利息","訟費"]}""" # 英文提示詞 REASON_OBJECT_SYSTEM_EN = f"""Extract from Hong Kong judgment: @@ -781,16 +787,37 @@ REASON_OBJECT_SYSTEM_EN = f"""Extract from Hong Kong judgment: 2. 
# Monetary-amount detection. A match is either a currency marker followed by a
# number, or a number followed by a Chinese monetary unit. The numeric part
# must start with a digit so that punctuation directly after a currency word
# (e.g. "美元,") is not mistaken for an amount.
_MONEY_NUMBER = r"\d[\d,]*(?:\.\d+)?"
MONEY_RE = re.compile(
    rf"(?:HK\$|US\$|RMB|MOP|港幣|港元|人民幣|美元|美金)\s*{_MONEY_NUMBER}"
    rf"|[\$\uFF04]\s*{_MONEY_NUMBER}"  # ASCII "$" or full-width dollar sign
    rf"|{_MONEY_NUMBER}\s*(?:萬元|億元|元|萬|億)",
    re.I,
)

# Wording the system prompts tell the model to use when the sum is not yet
# ascertained; such an item is a valid answer and must not trigger a retry.
_AMOUNT_PENDING_RE = re.compile(
    r"金額待評定|金额待评定|amount\s+to\s+be\s+assessed",
    re.I,
)


def _object_has_amount(objs: list[str]) -> bool:
    """Return True when ``case_object`` already states its monetary amount.

    An item counts when it contains a sum recognised by ``MONEY_RE``
    (currency marker plus figure, or figure plus Chinese monetary unit), or
    when it carries the explicit "amount to be assessed" marker.

    A bare digit is deliberately not enough: years ("2022年買賣合約") and
    rates ("8%") are not amounts.

    Args:
        objs: The ``case_object`` value from the model output. ``None``, an
            empty list, a single string, and lists containing non-string
            items are all tolerated.

    Returns:
        True if at least one string item states an amount or the
        to-be-assessed marker, otherwise False.
    """
    if not objs:
        return False
    if isinstance(objs, str):
        # A lone string would otherwise be scanned character by character.
        objs = [objs]
    if not isinstance(objs, (list, tuple)):
        return False
    return any(
        isinstance(item, str)
        and bool(MONEY_RE.search(item) or _AMOUNT_PENDING_RE.search(item))
        for item in objs
    )
If an amount forms part of the " + "subject matter (e.g., debt, claim, damages), it MUST be extracted in " + "case_object with currency and the specific figure (e.g., \"outstanding " + "goods price HK$850,000\"); ignore only if it is an unrelated citation.") + return True, "" @@ -837,7 +876,7 @@ def extract_reason_object(client: OpenAICompatibleClient, context: str, lang: st return client.chat_json_with_retry(system, user, schema, - validator=lambda x: _reason_object_validator(x, lang)) + validator=lambda x: _reason_object_validator(x, lang, context[:5000])) # --- Call 3: 判決結果 ------------------------------------------------------- @@ -875,7 +914,9 @@ JUDGMENT_RESULT_SYSTEM_ZH = """從香港判決書尾部的命令/裁定部分抽 - result 必須包含: a) 明確結果(勝訴/敗訴/部分勝訴/維持/撤銷/駁回/發還等) b) 2-3 個關鍵法庭理由(如有) - c) 具體金額、利率或命令內容(如有) + c) 具體金額、利率或命令內容;凡有判給/命令支付的金錢數額,必須原文照錄幣種與金額(如 HK$28,500),不得省略或約化 + +用語要求:一律使用規範的法律專業用語(如「判給」「訟費」「利息」「駁回」「發還重審」),避免口語化或不準確的表述。 只輸出 JSON。""" @@ -900,7 +941,9 @@ Splitting Principles: - result must include: a) Clear outcome (allowed/dismissed/partially allowed/upheld/quashed/remitted, etc.) b) 2-3 key court reasons (if any) - c) Specific amounts, interest rates or order details (if any) + c) Specific amounts, interest rates or order details; whenever a sum is awarded/ordered to be paid, the currency and figure MUST be reproduced verbatim (e.g., HK$28,500), never omitted or rounded + +Terminology requirement: consistently use precise, standard legal terminology (e.g., "awarded", "costs", "interest", "dismissed", "remitted for retrial"); avoid colloquial or imprecise wording. 
Output only JSON.""" @@ -998,6 +1041,8 @@ ENTITIES_SYSTEM_ZH = """從香港判決書中抽取所有相關實體(自然 - 純案例名稱(如 Donoghue v Stevenson) - 文獻、期刊名 +用語要求:reason 一律使用規範的法律專業用語(如「主審」「闡述」「先例」「判詞」),避免口語化或不準確的表述。 + 只輸出 JSON。""" ENTITIES_FEWSHOT_ZH = """範例輸出: @@ -1022,6 +1067,8 @@ MUST NOT include: - Pure case names (e.g., Donoghue v Stevenson) - Literature, journal names +Terminology requirement: write each reason in precise, standard legal terminology (e.g., "presiding", "articulated", "precedent", "judgment"); avoid colloquial or imprecise wording. + Output only JSON.""" ENTITIES_FEWSHOT_EN = """Example Output: @@ -1106,6 +1153,8 @@ SUMMARY_SYSTEM_ZH = """根據已抽取的結構化字段 + 法庭分析段,撰 **重要:judgment_summary 必須使用中文撰寫。** +用語要求:一律使用規範的法律專業用語;涉及金錢的判給或標的,須保留具體金額(含幣種)。 + 嚴格 ≤300 字。只輸出 JSON。""" # 英文提示詞 @@ -1122,6 +1171,8 @@ Four-element structure (must cover all, in coherent single paragraph): **IMPORTANT: judgment_summary MUST be written in English.** +Terminology requirement: consistently use precise, standard legal terminology; preserve specific monetary figures (with currency) for any award or subject matter involving money. + Strictly ≤500 characters. Output only JSON."""