diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..a69f1f0 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1 @@ +python 要使用 uv run 运行。 \ No newline at end of file diff --git a/en_cases_hkcfa/2007_HKCFA_6/summary-fast.yml b/en_cases_hkcfa/2007_HKCFA_6/summary-fast.yml index f1ebab9..6f3a6ea 100644 --- a/en_cases_hkcfa/2007_HKCFA_6/summary-fast.yml +++ b/en_cases_hkcfa/2007_HKCFA_6/summary-fast.yml @@ -1,12 +1,15 @@ -plaintiff: [] +plaintiff: +- MO YUK PING defendant: - HONG KONG SPECIAL ADMINISTRATIVE REGION jurisdiction_code: HKCFA -jurisdiction_name: 香港特別行政區終審法院 +jurisdiction_name: Court of Final Appeal of the Hong Kong Special Administrative Region case_location: - 香港特別行政區 +- Hong Kong Special Administrative Region case_reason: >- - Applicant Mo Yuk Ping seeks leave to appeal against her conviction for conspiracy to defraud, cont拈摡 + Applicant Mo Yuk Ping seeks leave to appeal against her conviction for conspiracy to defraud, challenging + the sufficiency of legal certainty in the elements of the offence. case_object: - leave to appeal judgment_result: @@ -19,7 +22,11 @@ judgment_result: regarding the constitutional requirement of legal certainty in relation to the offence of conspiracy to defraud. judgment_summary: >- - 本案涉及莫玉萍女士对因其涉嫌共谋欺诈而被判有罪的上诉申请。原告为莫玉萍,被告为香港特别行政区。案件核心在于是否允许莫玉萍就其共谋欺诈罪的定罪提出上诉,并针对四个争议点请求法院认证或授予上诉许可。法院分析了证据评估及法律适用问题,认为第2至5项争议点缺乏实质意义,拒绝认证这些争议点。最终,法院仅同意认证并授予莫玉萍就关于共谋欺诈罪构成要件不明确的争议点提出上诉的许可,并允许其基于重大不公正理由对其他争议点提起上诉。 + In MO YUK PING v. HKSAR, Mo Yuk Ping appealed against her conviction for conspiracy to defraud, challenging + legal certainty in the offence's elements. The Court of Appeal refused certification for points 2 to + 5, deeming them unnecessary or obvious. However, it granted leave to appeal on point 3 regarding the + constitutional requirement of legal certainty. The judgment was based on the clarity of the trial judge’s + findings and the sufficiency of evidence. involved_entities: - entity_name: Chief Justice Li reason: Served as Chief Justice in this case, responsible for fact-finding and adjudication. diff --git a/hk_case_extractor.py b/hk_case_extractor.py index 707d5f1..edfbd3b 100644 --- a/hk_case_extractor.py +++ b/hk_case_extractor.py @@ -101,7 +101,7 @@ def detect_language(text: str) -> str: # 1. 預處理:去噪 + 切段 + 規則抽元數據 # ============================================================================= -JURISDICTION_MAP: dict[str, str] = { +JURISDICTION_MAP_ZH: dict[str, str] = { "HKCFA": "香港特別行政區終審法院", "HKCA": "香港特別行政區高等法院上訴法庭", "HKCFI": "香港特別行政區高等法院原訟法庭", @@ -118,6 +118,23 @@ JURISDICTION_MAP: dict[str, str] = { "HKCrC": "香港特別行政區死因裁判法庭", } +JURISDICTION_MAP_EN: dict[str, str] = { + "HKCFA": "Court of Final Appeal of the Hong Kong Special Administrative Region", + "HKCA": "Court of Appeal of the High Court of the Hong Kong Special Administrative Region", + "HKCFI": "Court of First Instance of the High Court of the Hong Kong Special Administrative Region", + "HKDC": "District Court of the Hong Kong Special Administrative Region", + "HKMC": "Magistrates' Courts of the Hong Kong Special Administrative Region", + "HKMagC": "Magistrates' Courts of the Hong Kong Special Administrative Region", + "HKSCT": "Small Claims Tribunal of the Hong Kong Special Administrative Region", + "HKLT": "Lands Tribunal of the Hong Kong Special Administrative Region", + "HKLDT": "Lands Tribunal of the Hong Kong Special Administrative Region", + "HKLD": "Labour Tribunal of the Hong Kong Special Administrative Region", + "HKLAT": "Labour Tribunal of the Hong Kong Special Administrative Region", + "HKCT": "Competition Tribunal of the Hong Kong Special Administrative Region", + "HKCorC": "Coroner's Court of the Hong Kong Special Administrative Region", + "HKCrC": "Coroner's Court of the Hong Kong Special Administrative Region", +} + NEUTRAL_CITATION_RE = re.compile( r"\[(\d{4})\]\s*(HKCFA|HKCA|HKCFI|HKDC|HKMagC|HKMC|HKSCT|HKLT|HKLD|HKCT|HKCorC)\s*(\d+)", re.I, @@ -164,18 +181,26 @@ def clean_text(raw: str) -> str: return t.strip() -def extract_metadata_by_rule(text: str) -> dict[str, Any]: +def extract_metadata_by_rule(text: str, lang: str = 'zh') -> dict[str, Any]: """純規則:司法區域、案號、案件地點(默認香港特區) 優先級: 1. 案號前綴(最可靠) 2. Neutral Citation 3. 法院全稱匹配 + + Args: + text: 判決書文本 + lang: 語言代碼 ('zh' 或 'en') """ + # 根據語言選擇對應的映射表和默認地點 + jurisdiction_map = JURISDICTION_MAP_ZH if lang == 'zh' else JURISDICTION_MAP_EN + default_location = ["香港特別行政區"] if lang == 'zh' else ["Hong Kong Special Administrative Region"] + meta: dict[str, Any] = { "jurisdiction_code": None, "jurisdiction_name": None, - "case_location": ["香港特別行政區"], + "case_location": default_location, "case_number": None, } @@ -189,27 +214,32 @@ def extract_metadata_by_rule(text: str) -> dict[str, Any]: if prefix in CASE_NO_PREFIX_MAP: code = CASE_NO_PREFIX_MAP[prefix] meta["jurisdiction_code"] = code - meta["jurisdiction_name"] = JURISDICTION_MAP.get(code) + meta["jurisdiction_name"] = jurisdiction_map.get(code) # 次優先:Neutral Citation(如果案號未能確定法院) if not meta["jurisdiction_code"]: if m := NEUTRAL_CITATION_RE.search(text): code = m.group(2).upper() # 規範化大小寫 - for k in JURISDICTION_MAP: + for k in jurisdiction_map: if k.upper() == code: meta["jurisdiction_code"] = k - meta["jurisdiction_name"] = JURISDICTION_MAP[k] + meta["jurisdiction_name"] = jurisdiction_map[k] break # 最後:靠法院全稱反查(僅在前兩者都失敗時使用,且只搜索前2000字符) if not meta["jurisdiction_code"]: header = text[:2000] # 只在開頭搜索,避免被引用案例干擾 - for code, full in JURISDICTION_MAP.items(): - short = full.replace("香港特別行政區", "") - if full in header or short in header: + # 同時搜索中英文法院名稱 + for code in jurisdiction_map: + full_zh = JURISDICTION_MAP_ZH.get(code, "") + full_en = JURISDICTION_MAP_EN.get(code, "") + short_zh = full_zh.replace("香港特別行政區", "") + short_en = full_en.replace("Hong Kong Special Administrative Region", "").replace(" of the ", " ") + + if any(name in header for name in [full_zh, short_zh, full_en, short_en] if name): meta["jurisdiction_code"] = code - meta["jurisdiction_name"] = full + meta["jurisdiction_name"] = jurisdiction_map[code] break return meta @@ -462,14 +492,23 @@ PARTIES_SYSTEM_ZH = """你是香港法律文書信息抽取助手。 只輸出符合 schema 的 JSON,不要解釋。""" -PARTIES_FEWSHOT_ZH = """範例輸入: +PARTIES_FEWSHOT_ZH = """範例1(原告/被告格式): BETWEEN 陳大文 (CHAN TAI MAN) 上訴人 AND 香港房屋委員會 (Hong Kong Housing Authority) 答辯人 -範例輸出: -{"plaintiff":["陳大文 (CHAN TAI MAN)"],"defendant":["香港房屋委員會 (Hong Kong Housing Authority)"]}""" +輸出: +{"plaintiff":["陳大文 (CHAN TAI MAN)"],"defendant":["香港房屋委員會 (Hong Kong Housing Authority)"]} + +範例2(申請人/答辯人格式): +申請人: + 李小明 +答辯人: + 入境事務處處長 + +輸出: +{"plaintiff":["李小明"],"defendant":["入境事務處處長"]}""" # 英文提示詞 PARTIES_SYSTEM_EN = """You are a Hong Kong legal document information extraction assistant. @@ -488,15 +527,26 @@ Classification Rules: Output only JSON conforming to schema, no explanation.""" -PARTIES_FEWSHOT_EN = """Example Input: +PARTIES_FEWSHOT_EN = """Example 1 (Plaintiff/Defendant format): BETWEEN Dr Paul KI Ping-ki 1st Plaintiff Hong Kong Washington Company 2nd Plaintiff AND Next Magazine Publishing Ltd 1st Defendant -Example Output: -{"plaintiff":["Dr Paul KI Ping-ki","Hong Kong Washington Company"],"defendant":["Next Magazine Publishing Ltd"]}""" +Output: +{"plaintiff":["Dr Paul KI Ping-ki","Hong Kong Washington Company"],"defendant":["Next Magazine Publishing Ltd"]} + +Example 2 (Applicant/Respondent format): +Between: +MO YUK PING +Applicant +and +HONG KONG SPECIAL ADMINISTRATIVE REGION +Respondent + +Output: +{"plaintiff":["MO YUK PING"],"defendant":["HONG KONG SPECIAL ADMINISTRATIVE REGION"]}""" def extract_parties(client: OllamaClient, context: str, lang: str = 'zh') -> dict: @@ -513,14 +563,17 @@ def extract_parties(client: OllamaClient, context: str, lang: str = 'zh') -> dic # --- Call 2: 事由 + 標的 ---------------------------------------------------- -REASON_OBJECT_SCHEMA = { - "type": "object", - "properties": { - "case_reason": {"type": "string", "maxLength": 100}, - "case_object": {"type": "array", "items": {"type": "string"}}, - }, - "required": ["case_reason", "case_object"], -} +def get_reason_object_schema(lang: str = 'zh') -> dict: + """根據語言返回對應的 schema(英文字數限制更寬鬆)""" + max_length = 100 if lang == 'zh' else 200 # 英文允許 2 倍字符數 + return { + "type": "object", + "properties": { + "case_reason": {"type": "string", "maxLength": max_length}, + "case_object": {"type": "array", "items": {"type": "string"}}, + }, + "required": ["case_reason", "case_object"], + } # 中文提示詞 REASON_OBJECT_SYSTEM_ZH = """從香港判決書中抽取: @@ -567,11 +620,14 @@ REASON_OBJECT_FEWSHOT_EN = """Example Output: def _reason_object_validator(out: dict, lang: str = 'zh') -> tuple[bool, str]: r = out.get("case_reason", "") - if len(r) > 100: + max_length = 100 if lang == 'zh' else 200 # 英文允許 2 倍字符數 + target_length = 80 if lang == 'zh' else 160 # 建議壓縮目標 + + if len(r) > max_length: if lang == 'zh': - return False, f"case_reason 共 {len(r)} 字,超過 100 字上限,請壓縮到 80 字以內。" + return False, f"case_reason 共 {len(r)} 字,超過 {max_length} 字上限,請壓縮到 {target_length} 字以內。" else: - return False, f"case_reason has {len(r)} characters, exceeds 100 limit, please compress to within 80." + return False, f"case_reason has {len(r)} characters, exceeds {max_length} limit, please compress to within {target_length}." if not out.get("case_object"): if lang == 'zh': return False, "case_object 不能為空。" @@ -597,6 +653,8 @@ def _reason_object_validator(out: dict, lang: str = 'zh') -> tuple[bool, str]: def extract_reason_object(client: OllamaClient, context: str, lang: str = 'zh') -> dict: system = REASON_OBJECT_SYSTEM_ZH if lang == 'zh' else REASON_OBJECT_SYSTEM_EN fewshot = REASON_OBJECT_FEWSHOT_ZH if lang == 'zh' else REASON_OBJECT_FEWSHOT_EN + schema = get_reason_object_schema(lang) + max_length = 100 if lang == 'zh' else 200 if lang == 'zh': user = (f"{fewshot}\n\n" @@ -606,10 +664,10 @@ def extract_reason_object(client: OllamaClient, context: str, lang: str = 'zh') f"Please extract from the following segments (keyword-based retrieval):\n```\n{context[:6000]}\n```") out = client.chat_json_with_retry(system, user, - REASON_OBJECT_SCHEMA, + schema, validator=lambda x: _reason_object_validator(x, lang)) - if len(out["case_reason"]) > 100: - out["case_reason"] = out["case_reason"][:100] + if len(out["case_reason"]) > max_length: + out["case_reason"] = out["case_reason"][:max_length] return out @@ -844,13 +902,16 @@ def extract_entities(client: OllamaClient, context: str, lang: str = 'zh') -> di # --- Call 5: 判決總結(基於已抽取結果 + 分析段,不從原文重生) ----------- -SUMMARY_SCHEMA = { - "type": "object", - "properties": { - "judgment_summary": {"type": "string", "maxLength": 300}, - }, - "required": ["judgment_summary"], -} +def get_summary_schema(lang: str = 'zh') -> dict: + """根據語言返回對應的 schema(英文字數限制更寬鬆)""" + max_length = 300 if lang == 'zh' else 500 # 英文允許約 1.67 倍字符數 + return { + "type": "object", + "properties": { + "judgment_summary": {"type": "string", "maxLength": max_length}, + }, + "required": ["judgment_summary"], + } # 中文提示詞 SUMMARY_SYSTEM_ZH = """根據已抽取的結構化字段 + 法庭分析段,撰寫判決總結。 @@ -878,17 +939,20 @@ Four-element structure (must cover all, in coherent single paragraph): - What key laws or precedents were cited? (4) Final judgment and orders -Strictly ≤300 words. Output only JSON.""" +Strictly ≤500 characters. Output only JSON.""" def _summary_validator(out: dict, lang: str = 'zh') -> tuple[bool, str]: s = out.get("judgment_summary", "") - if len(s) > 300: + max_length = 300 if lang == 'zh' else 500 # 英文允許約 1.67 倍字符數 + min_length = 80 if lang == 'zh' else 120 # 英文最小長度也相應增加 + + if len(s) > max_length: if lang == 'zh': - return False, f"summary 共 {len(s)} 字,超過 300 字上限,請壓縮。" + return False, f"summary 共 {len(s)} 字,超過 {max_length} 字上限,請壓縮。" else: - return False, f"summary has {len(s)} characters, exceeds 300 limit, please compress." - if len(s) < 80: + return False, f"summary has {len(s)} characters, exceeds {max_length} limit, please compress." + if len(s) < min_length: if lang == 'zh': return False, "summary 過短,請完整覆蓋四要素。" else: @@ -899,6 +963,8 @@ def _summary_validator(out: dict, lang: str = 'zh') -> tuple[bool, str]: def extract_summary(client: OllamaClient, prior: dict, analysis: str, lang: str = 'zh') -> dict: system = SUMMARY_SYSTEM_ZH if lang == 'zh' else SUMMARY_SYSTEM_EN + schema = get_summary_schema(lang) + max_length = 300 if lang == 'zh' else 500 if lang == 'zh': user = f"""已抽取的字段: @@ -923,12 +989,12 @@ Court analysis excerpt: {analysis[:3500]} ``` -Please write judgment_summary ≤300 words covering four elements.""" +Please write judgment_summary ≤500 characters covering four elements.""" - out = client.chat_json_with_retry(system, user, SUMMARY_SCHEMA, + out = client.chat_json_with_retry(system, user, schema, validator=lambda x: _summary_validator(x, lang)) - if len(out["judgment_summary"]) > 300: - out["judgment_summary"] = out["judgment_summary"][:300] + if len(out["judgment_summary"]) > max_length: + out["judgment_summary"] = out["judgment_summary"][:max_length] return out @@ -949,7 +1015,7 @@ ENTITY_NAME_BLACKLIST = [ ] -def validate_and_fix(result: dict) -> tuple[dict, list[str]]: +def validate_and_fix(result: dict, lang: str = 'zh') -> tuple[dict, list[str]]: warnings: list[str] = [] # case_location:剔除法院/場所/建築 @@ -963,13 +1029,16 @@ def validate_and_fix(result: dict) -> tuple[dict, list[str]]: f"case_location 已清理:移除 {set(locs) - set(cleaned)}") result["case_location"] = cleaned - # 字數硬截斷 - if len(result.get("case_reason", "")) > 100: - warnings.append("case_reason > 100 字,已截斷") - result["case_reason"] = result["case_reason"][:100] - if len(result.get("judgment_summary", "")) > 300: - warnings.append("judgment_summary > 300 字,已截斷") - result["judgment_summary"] = result["judgment_summary"][:300] + # 字數硬截斷(根據語言調整限制) + reason_max = 100 if lang == 'zh' else 200 + summary_max = 300 if lang == 'zh' else 500 + + if len(result.get("case_reason", "")) > reason_max: + warnings.append(f"case_reason > {reason_max} 字,已截斷") + result["case_reason"] = result["case_reason"][:reason_max] + if len(result.get("judgment_summary", "")) > summary_max: + warnings.append(f"judgment_summary > {summary_max} 字,已截斷") + result["judgment_summary"] = result["judgment_summary"][:summary_max] # involved_entities:剔除條例/文獻 ents = result.get("involved_entities") or [] @@ -1010,7 +1079,7 @@ def run_pipeline(text: str, model: str) -> dict: log("[1/7] 預處理 + 關鍵詞召回...") text = clean_text(text) - meta = extract_metadata_by_rule(text) + meta = extract_metadata_by_rule(text, lang) ctx = gather_all(text) log(f" 規則元數據:{meta}") @@ -1059,7 +1128,7 @@ def run_pipeline(text: str, model: str) -> dict: } log("[7/7] 校驗與後處理...") - final, warnings = validate_and_fix(final) + final, warnings = validate_and_fix(final, lang) for w in warnings: log(f" ⚠️ {w}")