main
fengruixiang 2026-05-14 18:01:26 +08:00
parent deeaf741d1
commit 8ff86e74ba
3 changed files with 137 additions and 60 deletions

1
AGENTS.md 100644
View File

@ -0,0 +1 @@
Python 要使用 `uv run` 运行。

View File

@ -1,12 +1,15 @@
plaintiff: [] plaintiff:
- MO YUK PING
defendant: defendant:
- HONG KONG SPECIAL ADMINISTRATIVE REGION - HONG KONG SPECIAL ADMINISTRATIVE REGION
jurisdiction_code: HKCFA jurisdiction_code: HKCFA
jurisdiction_name: 香港特別行政區終審法院 jurisdiction_name: Court of Final Appeal of the Hong Kong Special Administrative Region
case_location: case_location:
- 香港特別行政區 - 香港特別行政區
- Hong Kong Special Administrative Region
case_reason: >- case_reason: >-
Applicant Mo Yuk Ping seeks leave to appeal against her conviction for conspiracy to defraud, cont拈摡 Applicant Mo Yuk Ping seeks leave to appeal against her conviction for conspiracy to defraud, challenging
the sufficiency of legal certainty in the elements of the offence.
case_object: case_object:
- leave to appeal - leave to appeal
judgment_result: judgment_result:
@ -19,7 +22,11 @@ judgment_result:
regarding the constitutional requirement of legal certainty in relation to the offence of conspiracy regarding the constitutional requirement of legal certainty in relation to the offence of conspiracy
to defraud. to defraud.
judgment_summary: >- judgment_summary: >-
本案涉及莫玉萍女士对因其涉嫌共谋欺诈而被判有罪的上诉申请。原告为莫玉萍被告为香港特别行政区。案件核心在于是否允许莫玉萍就其共谋欺诈罪的定罪提出上诉并针对四个争议点请求法院认证或授予上诉许可。法院分析了证据评估及法律适用问题认为第2至5项争议点缺乏实质意义拒绝认证这些争议点。最终法院仅同意认证并授予莫玉萍就关于共谋欺诈罪构成要件不明确的争议点提出上诉的许可并允许其基于重大不公正理由对其他争议点提起上诉。 In MO YUK PING v. HKSAR, Mo Yuk Ping appealed against her conviction for conspiracy to defraud, challenging
legal certainty in the offence's elements. The Court of Appeal refused certification for points 2 to
5, deeming them unnecessary or obvious. However, it granted leave to appeal on point 3 regarding the
constitutional requirement of legal certainty. The judgment was based on the clarity of the trial judge's
findings and the sufficiency of evidence.
involved_entities: involved_entities:
- entity_name: Chief Justice Li - entity_name: Chief Justice Li
reason: Served as Chief Justice in this case, responsible for fact-finding and adjudication. reason: Served as Chief Justice in this case, responsible for fact-finding and adjudication.

View File

@ -101,7 +101,7 @@ def detect_language(text: str) -> str:
# 1. 預處理:去噪 + 切段 + 規則抽元數據 # 1. 預處理:去噪 + 切段 + 規則抽元數據
# ============================================================================= # =============================================================================
JURISDICTION_MAP: dict[str, str] = { JURISDICTION_MAP_ZH: dict[str, str] = {
"HKCFA": "香港特別行政區終審法院", "HKCFA": "香港特別行政區終審法院",
"HKCA": "香港特別行政區高等法院上訴法庭", "HKCA": "香港特別行政區高等法院上訴法庭",
"HKCFI": "香港特別行政區高等法院原訟法庭", "HKCFI": "香港特別行政區高等法院原訟法庭",
@ -118,6 +118,23 @@ JURISDICTION_MAP: dict[str, str] = {
"HKCrC": "香港特別行政區死因裁判法庭", "HKCrC": "香港特別行政區死因裁判法庭",
} }
# English full court names for each jurisdiction code.
# Each row lists every code that resolves to the same court name, so the
# aliases (e.g. HKMC / HKMagC) are visible at a glance; rows are kept in the
# order the codes must appear in the resulting mapping.
_COURT_NAMES_EN = (
    (("HKCFA",), "Court of Final Appeal of the Hong Kong Special Administrative Region"),
    (("HKCA",), "Court of Appeal of the High Court of the Hong Kong Special Administrative Region"),
    (("HKCFI",), "Court of First Instance of the High Court of the Hong Kong Special Administrative Region"),
    (("HKDC",), "District Court of the Hong Kong Special Administrative Region"),
    (("HKMC", "HKMagC"), "Magistrates' Courts of the Hong Kong Special Administrative Region"),
    (("HKSCT",), "Small Claims Tribunal of the Hong Kong Special Administrative Region"),
    (("HKLT", "HKLDT"), "Lands Tribunal of the Hong Kong Special Administrative Region"),
    (("HKLD", "HKLAT"), "Labour Tribunal of the Hong Kong Special Administrative Region"),
    (("HKCT",), "Competition Tribunal of the Hong Kong Special Administrative Region"),
    (("HKCorC", "HKCrC"), "Coroner's Court of the Hong Kong Special Administrative Region"),
)

# Flatten the alias table into a code -> English court name lookup.
JURISDICTION_MAP_EN: dict[str, str] = {
    code: court_name
    for alias_codes, court_name in _COURT_NAMES_EN
    for code in alias_codes
}
NEUTRAL_CITATION_RE = re.compile( NEUTRAL_CITATION_RE = re.compile(
r"\[(\d{4})\]\s*(HKCFA|HKCA|HKCFI|HKDC|HKMagC|HKMC|HKSCT|HKLT|HKLD|HKCT|HKCorC)\s*(\d+)", r"\[(\d{4})\]\s*(HKCFA|HKCA|HKCFI|HKDC|HKMagC|HKMC|HKSCT|HKLT|HKLD|HKCT|HKCorC)\s*(\d+)",
re.I, re.I,
@ -164,18 +181,26 @@ def clean_text(raw: str) -> str:
return t.strip() return t.strip()
def extract_metadata_by_rule(text: str) -> dict[str, Any]: def extract_metadata_by_rule(text: str, lang: str = 'zh') -> dict[str, Any]:
"""純規則:司法區域、案號、案件地點(默認香港特區) """純規則:司法區域、案號、案件地點(默認香港特區)
優先級 優先級
1. 案號前綴最可靠 1. 案號前綴最可靠
2. Neutral Citation 2. Neutral Citation
3. 法院全稱匹配 3. 法院全稱匹配
Args:
text: 判決書文本
lang: 語言代碼 ('zh' 'en')
""" """
# 根據語言選擇對應的映射表和默認地點
jurisdiction_map = JURISDICTION_MAP_ZH if lang == 'zh' else JURISDICTION_MAP_EN
default_location = ["香港特別行政區"] if lang == 'zh' else ["Hong Kong Special Administrative Region"]
meta: dict[str, Any] = { meta: dict[str, Any] = {
"jurisdiction_code": None, "jurisdiction_code": None,
"jurisdiction_name": None, "jurisdiction_name": None,
"case_location": ["香港特別行政區"], "case_location": default_location,
"case_number": None, "case_number": None,
} }
@ -189,27 +214,32 @@ def extract_metadata_by_rule(text: str) -> dict[str, Any]:
if prefix in CASE_NO_PREFIX_MAP: if prefix in CASE_NO_PREFIX_MAP:
code = CASE_NO_PREFIX_MAP[prefix] code = CASE_NO_PREFIX_MAP[prefix]
meta["jurisdiction_code"] = code meta["jurisdiction_code"] = code
meta["jurisdiction_name"] = JURISDICTION_MAP.get(code) meta["jurisdiction_name"] = jurisdiction_map.get(code)
# 次優先Neutral Citation如果案號未能確定法院 # 次優先Neutral Citation如果案號未能確定法院
if not meta["jurisdiction_code"]: if not meta["jurisdiction_code"]:
if m := NEUTRAL_CITATION_RE.search(text): if m := NEUTRAL_CITATION_RE.search(text):
code = m.group(2).upper() code = m.group(2).upper()
# 規範化大小寫 # 規範化大小寫
for k in JURISDICTION_MAP: for k in jurisdiction_map:
if k.upper() == code: if k.upper() == code:
meta["jurisdiction_code"] = k meta["jurisdiction_code"] = k
meta["jurisdiction_name"] = JURISDICTION_MAP[k] meta["jurisdiction_name"] = jurisdiction_map[k]
break break
# 最後靠法院全稱反查僅在前兩者都失敗時使用且只搜索前2000字符 # 最後靠法院全稱反查僅在前兩者都失敗時使用且只搜索前2000字符
if not meta["jurisdiction_code"]: if not meta["jurisdiction_code"]:
header = text[:2000] # 只在開頭搜索,避免被引用案例干擾 header = text[:2000] # 只在開頭搜索,避免被引用案例干擾
for code, full in JURISDICTION_MAP.items(): # 同時搜索中英文法院名稱
short = full.replace("香港特別行政區", "") for code in jurisdiction_map:
if full in header or short in header: full_zh = JURISDICTION_MAP_ZH.get(code, "")
full_en = JURISDICTION_MAP_EN.get(code, "")
short_zh = full_zh.replace("香港特別行政區", "")
short_en = full_en.replace("Hong Kong Special Administrative Region", "").replace(" of the ", " ")
if any(name in header for name in [full_zh, short_zh, full_en, short_en] if name):
meta["jurisdiction_code"] = code meta["jurisdiction_code"] = code
meta["jurisdiction_name"] = full meta["jurisdiction_name"] = jurisdiction_map[code]
break break
return meta return meta
@ -462,14 +492,23 @@ PARTIES_SYSTEM_ZH = """你是香港法律文書信息抽取助手。
只輸出符合 schema JSON不要解釋""" 只輸出符合 schema JSON不要解釋"""
PARTIES_FEWSHOT_ZH = """範例輸入 PARTIES_FEWSHOT_ZH = """範例1原告/被告格式)
BETWEEN BETWEEN
陳大文 (CHAN TAI MAN) 上訴人 陳大文 (CHAN TAI MAN) 上訴人
AND AND
香港房屋委員會 (Hong Kong Housing Authority) 答辯人 香港房屋委員會 (Hong Kong Housing Authority) 答辯人
範例輸出 輸出
{"plaintiff":["陳大文 (CHAN TAI MAN)"],"defendant":["香港房屋委員會 (Hong Kong Housing Authority)"]}""" {"plaintiff":["陳大文 (CHAN TAI MAN)"],"defendant":["香港房屋委員會 (Hong Kong Housing Authority)"]}
範例2申請人/答辯人格式
申請人
李小明
答辯人
入境事務處處長
輸出
{"plaintiff":["李小明"],"defendant":["入境事務處處長"]}"""
# 英文提示詞 # 英文提示詞
PARTIES_SYSTEM_EN = """You are a Hong Kong legal document information extraction assistant. PARTIES_SYSTEM_EN = """You are a Hong Kong legal document information extraction assistant.
@ -488,15 +527,26 @@ Classification Rules:
Output only JSON conforming to schema, no explanation.""" Output only JSON conforming to schema, no explanation."""
PARTIES_FEWSHOT_EN = """Example Input: PARTIES_FEWSHOT_EN = """Example 1 (Plaintiff/Defendant format):
BETWEEN BETWEEN
Dr Paul KI Ping-ki 1st Plaintiff Dr Paul KI Ping-ki 1st Plaintiff
Hong Kong Washington Company 2nd Plaintiff Hong Kong Washington Company 2nd Plaintiff
AND AND
Next Magazine Publishing Ltd 1st Defendant Next Magazine Publishing Ltd 1st Defendant
Example Output: Output:
{"plaintiff":["Dr Paul KI Ping-ki","Hong Kong Washington Company"],"defendant":["Next Magazine Publishing Ltd"]}""" {"plaintiff":["Dr Paul KI Ping-ki","Hong Kong Washington Company"],"defendant":["Next Magazine Publishing Ltd"]}
Example 2 (Applicant/Respondent format):
Between:
MO YUK PING
Applicant
and
HONG KONG SPECIAL ADMINISTRATIVE REGION
Respondent
Output:
{"plaintiff":["MO YUK PING"],"defendant":["HONG KONG SPECIAL ADMINISTRATIVE REGION"]}"""
def extract_parties(client: OllamaClient, context: str, lang: str = 'zh') -> dict: def extract_parties(client: OllamaClient, context: str, lang: str = 'zh') -> dict:
@ -513,14 +563,17 @@ def extract_parties(client: OllamaClient, context: str, lang: str = 'zh') -> dic
# --- Call 2: 事由 + 標的 ---------------------------------------------------- # --- Call 2: 事由 + 標的 ----------------------------------------------------
REASON_OBJECT_SCHEMA = { def get_reason_object_schema(lang: str = 'zh') -> dict:
"type": "object", """根據語言返回對應的 schema英文字數限制更寬鬆"""
"properties": { max_length = 100 if lang == 'zh' else 200 # 英文允許 2 倍字符數
"case_reason": {"type": "string", "maxLength": 100}, return {
"case_object": {"type": "array", "items": {"type": "string"}}, "type": "object",
}, "properties": {
"required": ["case_reason", "case_object"], "case_reason": {"type": "string", "maxLength": max_length},
} "case_object": {"type": "array", "items": {"type": "string"}},
},
"required": ["case_reason", "case_object"],
}
# 中文提示詞 # 中文提示詞
REASON_OBJECT_SYSTEM_ZH = """從香港判決書中抽取: REASON_OBJECT_SYSTEM_ZH = """從香港判決書中抽取:
@ -567,11 +620,14 @@ REASON_OBJECT_FEWSHOT_EN = """Example Output:
def _reason_object_validator(out: dict, lang: str = 'zh') -> tuple[bool, str]: def _reason_object_validator(out: dict, lang: str = 'zh') -> tuple[bool, str]:
r = out.get("case_reason", "") r = out.get("case_reason", "")
if len(r) > 100: max_length = 100 if lang == 'zh' else 200 # 英文允許 2 倍字符數
target_length = 80 if lang == 'zh' else 160 # 建議壓縮目標
if len(r) > max_length:
if lang == 'zh': if lang == 'zh':
return False, f"case_reason 共 {len(r)} 字,超過 100 字上限,請壓縮到 80 字以內。" return False, f"case_reason 共 {len(r)} 字,超過 {max_length} 字上限,請壓縮到 {target_length} 字以內。"
else: else:
return False, f"case_reason has {len(r)} characters, exceeds 100 limit, please compress to within 80." return False, f"case_reason has {len(r)} characters, exceeds {max_length} limit, please compress to within {target_length}."
if not out.get("case_object"): if not out.get("case_object"):
if lang == 'zh': if lang == 'zh':
return False, "case_object 不能為空。" return False, "case_object 不能為空。"
@ -597,6 +653,8 @@ def _reason_object_validator(out: dict, lang: str = 'zh') -> tuple[bool, str]:
def extract_reason_object(client: OllamaClient, context: str, lang: str = 'zh') -> dict: def extract_reason_object(client: OllamaClient, context: str, lang: str = 'zh') -> dict:
system = REASON_OBJECT_SYSTEM_ZH if lang == 'zh' else REASON_OBJECT_SYSTEM_EN system = REASON_OBJECT_SYSTEM_ZH if lang == 'zh' else REASON_OBJECT_SYSTEM_EN
fewshot = REASON_OBJECT_FEWSHOT_ZH if lang == 'zh' else REASON_OBJECT_FEWSHOT_EN fewshot = REASON_OBJECT_FEWSHOT_ZH if lang == 'zh' else REASON_OBJECT_FEWSHOT_EN
schema = get_reason_object_schema(lang)
max_length = 100 if lang == 'zh' else 200
if lang == 'zh': if lang == 'zh':
user = (f"{fewshot}\n\n" user = (f"{fewshot}\n\n"
@ -606,10 +664,10 @@ def extract_reason_object(client: OllamaClient, context: str, lang: str = 'zh')
f"Please extract from the following segments (keyword-based retrieval):\n```\n{context[:6000]}\n```") f"Please extract from the following segments (keyword-based retrieval):\n```\n{context[:6000]}\n```")
out = client.chat_json_with_retry(system, user, out = client.chat_json_with_retry(system, user,
REASON_OBJECT_SCHEMA, schema,
validator=lambda x: _reason_object_validator(x, lang)) validator=lambda x: _reason_object_validator(x, lang))
if len(out["case_reason"]) > 100: if len(out["case_reason"]) > max_length:
out["case_reason"] = out["case_reason"][:100] out["case_reason"] = out["case_reason"][:max_length]
return out return out
@ -844,13 +902,16 @@ def extract_entities(client: OllamaClient, context: str, lang: str = 'zh') -> di
# --- Call 5: 判決總結(基於已抽取結果 + 分析段,不從原文重生) ----------- # --- Call 5: 判決總結(基於已抽取結果 + 分析段,不從原文重生) -----------
SUMMARY_SCHEMA = { def get_summary_schema(lang: str = 'zh') -> dict:
"type": "object", """根據語言返回對應的 schema英文字數限制更寬鬆"""
"properties": { max_length = 300 if lang == 'zh' else 500 # 英文允許約 1.67 倍字符數
"judgment_summary": {"type": "string", "maxLength": 300}, return {
}, "type": "object",
"required": ["judgment_summary"], "properties": {
} "judgment_summary": {"type": "string", "maxLength": max_length},
},
"required": ["judgment_summary"],
}
# 中文提示詞 # 中文提示詞
SUMMARY_SYSTEM_ZH = """根據已抽取的結構化字段 + 法庭分析段,撰寫判決總結。 SUMMARY_SYSTEM_ZH = """根據已抽取的結構化字段 + 法庭分析段,撰寫判決總結。
@ -878,17 +939,20 @@ Four-element structure (must cover all, in coherent single paragraph):
- What key laws or precedents were cited? - What key laws or precedents were cited?
(4) Final judgment and orders (4) Final judgment and orders
Strictly 300 words. Output only JSON.""" Strictly 500 characters. Output only JSON."""
def _summary_validator(out: dict, lang: str = 'zh') -> tuple[bool, str]: def _summary_validator(out: dict, lang: str = 'zh') -> tuple[bool, str]:
s = out.get("judgment_summary", "") s = out.get("judgment_summary", "")
if len(s) > 300: max_length = 300 if lang == 'zh' else 500 # 英文允許約 1.67 倍字符數
min_length = 80 if lang == 'zh' else 120 # 英文最小長度也相應增加
if len(s) > max_length:
if lang == 'zh': if lang == 'zh':
return False, f"summary 共 {len(s)} 字,超過 300 字上限,請壓縮。" return False, f"summary 共 {len(s)} 字,超過 {max_length} 字上限,請壓縮。"
else: else:
return False, f"summary has {len(s)} characters, exceeds 300 limit, please compress." return False, f"summary has {len(s)} characters, exceeds {max_length} limit, please compress."
if len(s) < 80: if len(s) < min_length:
if lang == 'zh': if lang == 'zh':
return False, "summary 過短,請完整覆蓋四要素。" return False, "summary 過短,請完整覆蓋四要素。"
else: else:
@ -899,6 +963,8 @@ def _summary_validator(out: dict, lang: str = 'zh') -> tuple[bool, str]:
def extract_summary(client: OllamaClient, def extract_summary(client: OllamaClient,
prior: dict, analysis: str, lang: str = 'zh') -> dict: prior: dict, analysis: str, lang: str = 'zh') -> dict:
system = SUMMARY_SYSTEM_ZH if lang == 'zh' else SUMMARY_SYSTEM_EN system = SUMMARY_SYSTEM_ZH if lang == 'zh' else SUMMARY_SYSTEM_EN
schema = get_summary_schema(lang)
max_length = 300 if lang == 'zh' else 500
if lang == 'zh': if lang == 'zh':
user = f"""已抽取的字段: user = f"""已抽取的字段:
@ -923,12 +989,12 @@ Court analysis excerpt:
{analysis[:3500]} {analysis[:3500]}
``` ```
Please write judgment_summary 300 words covering four elements.""" Please write judgment_summary 500 characters covering four elements."""
out = client.chat_json_with_retry(system, user, SUMMARY_SCHEMA, out = client.chat_json_with_retry(system, user, schema,
validator=lambda x: _summary_validator(x, lang)) validator=lambda x: _summary_validator(x, lang))
if len(out["judgment_summary"]) > 300: if len(out["judgment_summary"]) > max_length:
out["judgment_summary"] = out["judgment_summary"][:300] out["judgment_summary"] = out["judgment_summary"][:max_length]
return out return out
@ -949,7 +1015,7 @@ ENTITY_NAME_BLACKLIST = [
] ]
def validate_and_fix(result: dict) -> tuple[dict, list[str]]: def validate_and_fix(result: dict, lang: str = 'zh') -> tuple[dict, list[str]]:
warnings: list[str] = [] warnings: list[str] = []
# case_location剔除法院/場所/建築 # case_location剔除法院/場所/建築
@ -963,13 +1029,16 @@ def validate_and_fix(result: dict) -> tuple[dict, list[str]]:
f"case_location 已清理:移除 {set(locs) - set(cleaned)}") f"case_location 已清理:移除 {set(locs) - set(cleaned)}")
result["case_location"] = cleaned result["case_location"] = cleaned
# 字數硬截斷 # 字數硬截斷(根據語言調整限制)
if len(result.get("case_reason", "")) > 100: reason_max = 100 if lang == 'zh' else 200
warnings.append("case_reason > 100 字,已截斷") summary_max = 300 if lang == 'zh' else 500
result["case_reason"] = result["case_reason"][:100]
if len(result.get("judgment_summary", "")) > 300: if len(result.get("case_reason", "")) > reason_max:
warnings.append("judgment_summary > 300 字,已截斷") warnings.append(f"case_reason > {reason_max} 字,已截斷")
result["judgment_summary"] = result["judgment_summary"][:300] result["case_reason"] = result["case_reason"][:reason_max]
if len(result.get("judgment_summary", "")) > summary_max:
warnings.append(f"judgment_summary > {summary_max} 字,已截斷")
result["judgment_summary"] = result["judgment_summary"][:summary_max]
# involved_entities剔除條例/文獻 # involved_entities剔除條例/文獻
ents = result.get("involved_entities") or [] ents = result.get("involved_entities") or []
@ -1010,7 +1079,7 @@ def run_pipeline(text: str, model: str) -> dict:
log("[1/7] 預處理 + 關鍵詞召回...") log("[1/7] 預處理 + 關鍵詞召回...")
text = clean_text(text) text = clean_text(text)
meta = extract_metadata_by_rule(text) meta = extract_metadata_by_rule(text, lang)
ctx = gather_all(text) ctx = gather_all(text)
log(f" 規則元數據:{meta}") log(f" 規則元數據:{meta}")
@ -1059,7 +1128,7 @@ def run_pipeline(text: str, model: str) -> dict:
} }
log("[7/7] 校驗與後處理...") log("[7/7] 校驗與後處理...")
final, warnings = validate_and_fix(final) final, warnings = validate_and_fix(final, lang)
for w in warnings: for w in warnings:
log(f" ⚠️ {w}") log(f" ⚠️ {w}")