Merge branch 'main' of https://gitea.iconsz.com/fengruixiang/hklii_samples
commit
d3cba48e12
|
|
@ -68,6 +68,7 @@ import json
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
@ -760,13 +761,19 @@ REASON_OBJECT_SYSTEM_ZH = f"""從香港判決書中抽取:
|
||||||
2. case_object(標的物):
|
2. case_object(標的物):
|
||||||
- 訴訟請求指向的實體權利或利益
|
- 訴訟請求指向的實體權利或利益
|
||||||
- 例:汽車、黃金、珠寶、不動產產權、撫養權、人身傷害賠償、合同履行、房產所有權、精神困擾賠償、居留權
|
- 例:汽車、黃金、珠寶、不動產產權、撫養權、人身傷害賠償、合同履行、房產所有權、精神困擾賠償、居留權
|
||||||
|
- 涉及金錢標的時必須提取,並標明幣種與具體金額(如「拖欠貨款 HK$850,000」「索償 HK$1,000,000」「拖欠租金 HK$120,000」);金額未定或待評定者註明「金額待評定」
|
||||||
- 合併本質相同的標的
|
- 合併本質相同的標的
|
||||||
- 嚴禁:證據材料、程序性訴求(如"要求法庭裁決")、法律條文名稱
|
- 嚴禁:證據材料、程序性訴求(如"要求法庭裁決")、法律條文名稱
|
||||||
|
|
||||||
|
用語要求:一律使用規範的法律專業用語(如「申索」「損害賠償」「違約」「侵權」「衡平法濟助」),避免口語化或不準確的表述。
|
||||||
|
|
||||||
只輸出 JSON。"""
|
只輸出 JSON。"""
|
||||||
|
|
||||||
REASON_OBJECT_FEWSHOT_ZH = """範例輸出:
|
REASON_OBJECT_FEWSHOT_ZH = """範例輸出1(人身傷害):
|
||||||
{"case_reason":"申索人為商場保安員,就被告於2023年7月在商場毆打申索人造成的人身傷害向被告提出損害賠償申索。","case_object":["人身傷害賠償","醫療費用","精神困擾賠償"]}"""
|
{"case_reason":"申索人為商場保安員,就被告於2023年7月在商場毆打申索人造成的人身傷害向被告提出損害賠償申索。","case_object":["人身傷害賠償","醫療費用","精神困擾賠償"]}
|
||||||
|
|
||||||
|
範例輸出2(金錢申索,標的物須含具體金額):
|
||||||
|
{"case_reason":"原告就被告未支付2022年買賣合約項下的貨款,向被告提出追討欠款的申索。","case_object":["拖欠貨款 HK$850,000","合約利息","訟費"]}"""
|
||||||
|
|
||||||
# 英文提示詞
|
# 英文提示詞
|
||||||
REASON_OBJECT_SYSTEM_EN = f"""Extract from Hong Kong judgment:
|
REASON_OBJECT_SYSTEM_EN = f"""Extract from Hong Kong judgment:
|
||||||
|
|
@ -781,16 +788,37 @@ REASON_OBJECT_SYSTEM_EN = f"""Extract from Hong Kong judgment:
|
||||||
2. case_object (Subject Matter):
|
2. case_object (Subject Matter):
|
||||||
- Tangible rights or interests targeted by the claim
|
- Tangible rights or interests targeted by the claim
|
||||||
- Examples: vehicle, gold, jewelry, property title, custody, personal injury damages, contract performance, property ownership, distress damages, right of abode
|
- Examples: vehicle, gold, jewelry, property title, custody, personal injury damages, contract performance, property ownership, distress damages, right of abode
|
||||||
|
- When a monetary subject matter is involved, it MUST be extracted with currency and the specific amount (e.g., "outstanding goods price HK$850,000", "claim of HK$1,000,000", "arrears of rent HK$120,000"); if the amount is unascertained, note "amount to be assessed"
|
||||||
- Merge essentially identical subjects
|
- Merge essentially identical subjects
|
||||||
- MUST NOT include: evidentiary materials, procedural requests (e.g., "seeking court ruling"), names of statutes
|
- MUST NOT include: evidentiary materials, procedural requests (e.g., "seeking court ruling"), names of statutes
|
||||||
|
|
||||||
|
Terminology requirement: consistently use precise, standard legal terminology (e.g., "claim", "damages", "breach of contract", "negligence", "equitable relief"); avoid colloquial or imprecise wording.
|
||||||
|
|
||||||
Output only JSON."""
|
Output only JSON."""
|
||||||
|
|
||||||
REASON_OBJECT_FEWSHOT_EN = """Example Output:
|
REASON_OBJECT_FEWSHOT_EN = """Example Output 1 (personal injury):
|
||||||
{"case_reason":"Plaintiff, a shopping mall security guard, claims damages from defendant for personal injuries caused by defendant's assault at the mall in July 2023.","case_object":["personal injury damages","medical expenses","distress damages"]}"""
|
{"case_reason":"Plaintiff, a shopping mall security guard, claims damages from defendant for personal injuries caused by defendant's assault at the mall in July 2023.","case_object":["personal injury damages","medical expenses","distress damages"]}
|
||||||
|
|
||||||
|
Example Output 2 (monetary claim, subject matter must carry the specific amount):
|
||||||
|
{"case_reason":"Plaintiff claims against defendant for non-payment of the price of goods supplied under a 2022 sale and purchase contract.","case_object":["outstanding goods price HK$850,000","contractual interest","costs"]}"""
|
||||||
|
|
||||||
|
|
||||||
def _reason_object_validator(out: dict, lang: str = 'zh') -> tuple[bool, str]:
|
# Monetary-amount detector. A match is any one of:
#   1. a currency prefix followed by a number ("HK$850,000", "港幣 1,000");
#   2. a bare dollar sign -- ASCII "$" or fullwidth U+FF04 -- followed by a number;
#   3. a number followed by a Chinese amount unit or currency name
#      ("5萬元", "850,000港元").
# The numeric part must start with a digit, so stray punctuation such as ",元"
# is not mistaken for an amount. The fullwidth dollar sign is written as an
# escape so that it survives copy/paste and encoding normalisation.
MONEY_RE = re.compile(
    r"(?:HK\$|US\$|RMB|MOP|港幣|港元|人民幣|美元|美金)\s*\d[\d,]*(?:\.\d+)?"
    r"|[$\uFF04]\s*\d[\d,]*(?:\.\d+)?"
    r"|\d[\d,]*(?:\.\d+)?\s*(?:萬元|億元|港元|港幣|人民幣|美元|美金|元|萬|億)",
    re.I,
)
|
||||||
|
|
||||||
|
|
||||||
|
def _object_has_amount(objs: list[str]) -> bool:
|
||||||
|
"""case_object 中是否已含具體金額(任一項出現數字即視為已提取金額)"""
|
||||||
|
return any(re.search(r"\d", o or "") for o in (objs or []))
|
||||||
|
|
||||||
|
|
||||||
|
def _reason_object_validator(out: dict, lang: str = 'zh',
|
||||||
|
context: str = "") -> tuple[bool, str]:
|
||||||
r = out.get("case_reason", "")
|
r = out.get("case_reason", "")
|
||||||
max_length = 100 if lang == 'zh' else 200 # 英文允許 2 倍字符數
|
max_length = 100 if lang == 'zh' else 200 # 英文允許 2 倍字符數
|
||||||
target_length = 80 if lang == 'zh' else 160 # 建議壓縮目標
|
target_length = 80 if lang == 'zh' else 160 # 建議壓縮目標
|
||||||
|
|
@ -818,7 +846,19 @@ def _reason_object_validator(out: dict, lang: str = 'zh') -> tuple[bool, str]:
|
||||||
return False, f"case_reason 不應包含判決結果詞彙「{keyword}」,請只描述訴訟起因和請求。"
|
return False, f"case_reason 不應包含判決結果詞彙「{keyword}」,請只描述訴訟起因和請求。"
|
||||||
else:
|
else:
|
||||||
return False, f"case_reason should not contain judgment result term '{keyword}', describe only cause and relief sought."
|
return False, f"case_reason should not contain judgment result term '{keyword}', describe only cause and relief sought."
|
||||||
|
|
||||||
|
# 金錢標的物強制提取:原文出現金錢數額但 case_object 未含金額時要求補充
|
||||||
|
if context and MONEY_RE.search(context) and not _object_has_amount(out.get("case_object", [])):
|
||||||
|
if lang == 'zh':
|
||||||
|
return False, ("原文出現金錢數額。若該數額屬於訴訟標的(如欠款、索償、賠償金額),"
|
||||||
|
"必須在 case_object 中提取並標明幣種與具體金額(如「拖欠貨款 HK$850,000」);"
|
||||||
|
"若僅為無關引用則可忽略。")
|
||||||
|
else:
|
||||||
|
return False, ("Monetary amounts appear in the source. If an amount forms part of the "
|
||||||
|
"subject matter (e.g., debt, claim, damages), it MUST be extracted in "
|
||||||
|
"case_object with currency and the specific figure (e.g., \"outstanding "
|
||||||
|
"goods price HK$850,000\"); ignore only if it is an unrelated citation.")
|
||||||
|
|
||||||
return True, ""
|
return True, ""
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -837,7 +877,7 @@ def extract_reason_object(client: OpenAICompatibleClient, context: str, lang: st
|
||||||
|
|
||||||
return client.chat_json_with_retry(system, user,
|
return client.chat_json_with_retry(system, user,
|
||||||
schema,
|
schema,
|
||||||
validator=lambda x: _reason_object_validator(x, lang))
|
validator=lambda x: _reason_object_validator(x, lang, context[:5000]))
|
||||||
|
|
||||||
|
|
||||||
# --- Call 3: 判決結果 -------------------------------------------------------
|
# --- Call 3: 判決結果 -------------------------------------------------------
|
||||||
|
|
@ -875,7 +915,9 @@ JUDGMENT_RESULT_SYSTEM_ZH = """從香港判決書尾部的命令/裁定部分抽
|
||||||
- result 必須包含:
|
- result 必須包含:
|
||||||
a) 明確結果(勝訴/敗訴/部分勝訴/維持/撤銷/駁回/發還等)
|
a) 明確結果(勝訴/敗訴/部分勝訴/維持/撤銷/駁回/發還等)
|
||||||
b) 2-3 個關鍵法庭理由(如有)
|
b) 2-3 個關鍵法庭理由(如有)
|
||||||
c) 具體金額、利率或命令內容(如有)
|
c) 具體金額、利率或命令內容;凡有判給/命令支付的金錢數額,必須原文照錄幣種與金額(如 HK$28,500),不得省略或約化
|
||||||
|
|
||||||
|
用語要求:一律使用規範的法律專業用語(如「判給」「訟費」「利息」「駁回」「發還重審」),避免口語化或不準確的表述。
|
||||||
|
|
||||||
只輸出 JSON。"""
|
只輸出 JSON。"""
|
||||||
|
|
||||||
|
|
@ -900,7 +942,9 @@ Splitting Principles:
|
||||||
- result must include:
|
- result must include:
|
||||||
a) Clear outcome (allowed/dismissed/partially allowed/upheld/quashed/remitted, etc.)
|
a) Clear outcome (allowed/dismissed/partially allowed/upheld/quashed/remitted, etc.)
|
||||||
b) 2-3 key court reasons (if any)
|
b) 2-3 key court reasons (if any)
|
||||||
c) Specific amounts, interest rates or order details (if any)
|
c) Specific amounts, interest rates or order details; whenever a sum is awarded/ordered to be paid, the currency and figure MUST be reproduced verbatim (e.g., HK$28,500), never omitted or rounded
|
||||||
|
|
||||||
|
Terminology requirement: consistently use precise, standard legal terminology (e.g., "awarded", "costs", "interest", "dismissed", "remitted for retrial"); avoid colloquial or imprecise wording.
|
||||||
|
|
||||||
Output only JSON."""
|
Output only JSON."""
|
||||||
|
|
||||||
|
|
@ -998,6 +1042,8 @@ ENTITIES_SYSTEM_ZH = """從香港判決書中抽取所有相關實體(自然
|
||||||
- 純案例名稱(如 Donoghue v Stevenson)
|
- 純案例名稱(如 Donoghue v Stevenson)
|
||||||
- 文獻、期刊名
|
- 文獻、期刊名
|
||||||
|
|
||||||
|
用語要求:reason 一律使用規範的法律專業用語(如「主審」「闡述」「先例」「判詞」),避免口語化或不準確的表述。
|
||||||
|
|
||||||
只輸出 JSON。"""
|
只輸出 JSON。"""
|
||||||
|
|
||||||
ENTITIES_FEWSHOT_ZH = """範例輸出:
|
ENTITIES_FEWSHOT_ZH = """範例輸出:
|
||||||
|
|
@ -1022,6 +1068,8 @@ MUST NOT include:
|
||||||
- Pure case names (e.g., Donoghue v Stevenson)
|
- Pure case names (e.g., Donoghue v Stevenson)
|
||||||
- Literature, journal names
|
- Literature, journal names
|
||||||
|
|
||||||
|
Terminology requirement: write each reason in precise, standard legal terminology (e.g., "presiding", "articulated", "precedent", "judgment"); avoid colloquial or imprecise wording.
|
||||||
|
|
||||||
Output only JSON."""
|
Output only JSON."""
|
||||||
|
|
||||||
ENTITIES_FEWSHOT_EN = """Example Output:
|
ENTITIES_FEWSHOT_EN = """Example Output:
|
||||||
|
|
@ -1106,6 +1154,8 @@ SUMMARY_SYSTEM_ZH = """根據已抽取的結構化字段 + 法庭分析段,撰
|
||||||
|
|
||||||
**重要:judgment_summary 必須使用中文撰寫。**
|
**重要:judgment_summary 必須使用中文撰寫。**
|
||||||
|
|
||||||
|
用語要求:一律使用規範的法律專業用語;涉及金錢的判給或標的,須保留具體金額(含幣種)。
|
||||||
|
|
||||||
嚴格 ≤300 字。只輸出 JSON。"""
|
嚴格 ≤300 字。只輸出 JSON。"""
|
||||||
|
|
||||||
# 英文提示詞
|
# 英文提示詞
|
||||||
|
|
@ -1122,6 +1172,8 @@ Four-element structure (must cover all, in coherent single paragraph):
|
||||||
|
|
||||||
**IMPORTANT: judgment_summary MUST be written in English.**
|
**IMPORTANT: judgment_summary MUST be written in English.**
|
||||||
|
|
||||||
|
Terminology requirement: consistently use precise, standard legal terminology; preserve specific monetary figures (with currency) for any award or subject matter involving money.
|
||||||
|
|
||||||
Strictly ≤500 characters. Output only JSON."""
|
Strictly ≤500 characters. Output only JSON."""
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1268,8 +1320,9 @@ def run_pipeline(text: str, model: str, base_url: str, api_key: str,
|
||||||
entities_window: int = 400,
|
entities_window: int = 400,
|
||||||
entities_max: int = 6500,
|
entities_max: int = 6500,
|
||||||
analysis_window: int = 500,
|
analysis_window: int = 500,
|
||||||
analysis_max: int = 6500) -> tuple[dict, OpenAICompatibleClient]:
|
analysis_max: int = 6500,
|
||||||
log = lambda m: print(m, file=sys.stderr)
|
log_prefix: str = "") -> tuple[dict, OpenAICompatibleClient]:
|
||||||
|
log = lambda m: print(f"{log_prefix}{m}", file=sys.stderr)
|
||||||
|
|
||||||
log("[0/7] 檢測語言...")
|
log("[0/7] 檢測語言...")
|
||||||
lang = detect_language(text)
|
lang = detect_language(text)
|
||||||
|
|
@ -1416,6 +1469,93 @@ def to_yaml(result: dict) -> str:
|
||||||
default_flow_style=False, width=100)
|
default_flow_style=False, width=100)
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# 7. 多模型運行:解析運行規格 + 輸出路徑推導
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class RunSpec:
|
||||||
|
"""單個模型的運行規格(標籤 + 連接參數 + 計費配置)"""
|
||||||
|
label: str # 用於日誌前綴與輸出文件名(配置名或模型名)
|
||||||
|
model: str
|
||||||
|
base_url: str
|
||||||
|
api_key: str
|
||||||
|
profile: dict | None = None # models.json 配置(用於成本計算),無則 None
|
||||||
|
|
||||||
|
|
||||||
|
def _safe_label(name: str) -> str:
|
||||||
|
"""把配置/模型名轉成可安全用於文件名的標籤(如 anthropic/claude → anthropic_claude)"""
|
||||||
|
safe = re.sub(r"[^0-9A-Za-z._-]+", "_", name).strip("_")
|
||||||
|
return safe or "model"
|
||||||
|
|
||||||
|
|
||||||
|
def parse_run_specs(args) -> list[RunSpec]:
    """Parse ``--config`` / ``--model`` (both comma-separated) into run specs.

    Precedence:
      - ``--config`` names at least one profile: each one is loaded through
        ``load_model_profile``; model / base URL / API key come from the
        profile's "model" / "BaseApiUrl" / "ApiKey" entries, falling back to
        the matching command-line value when an entry is missing or empty.
      - otherwise: one spec per ``--model`` name, all sharing ``--base-url``
        and ``--api-key``.

    Names are stripped, empty items are dropped, and repeated names are kept
    only once (first occurrence wins) so that two specs never share a label.
    A ``--config`` value that contains no real name (e.g. ",") is treated the
    same as an absent ``--config`` instead of yielding an empty run list.
    """

    def split_names(raw) -> list[str]:
        # dict.fromkeys drops duplicates while preserving first-seen order.
        parts = (part.strip() for part in (raw or "").split(","))
        return list(dict.fromkeys(part for part in parts if part))

    config_names = split_names(args.config)
    if config_names:
        specs: list[RunSpec] = []
        for name in config_names:
            profile = load_model_profile(name, args.models_file)
            specs.append(RunSpec(
                label=name,
                model=profile.get("model") or args.model,
                base_url=profile.get("BaseApiUrl") or args.base_url,
                api_key=profile.get("ApiKey") or args.api_key,
                profile=profile,
            ))
        return specs

    # If --model holds no usable name, keep the raw value as the single entry.
    model_names = split_names(args.model) or [args.model]
    return [
        RunSpec(
            label=name,
            model=name,
            base_url=args.base_url,
            api_key=args.api_key,
            profile=None,
        )
        for name in model_names
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def build_out_path(args, input_path: Path, label: str, multi: bool) -> Path | None:
    """Work out where one model's YAML output goes.

    - Single model: ``args.out`` as a Path when given, otherwise None
      (None means "print to stdout").
    - Multiple models: the sanitised label is appended to the file stem.
      With ``args.out`` the result keeps its directory and suffix (".yaml"
      when it has none); without it the file is placed next to the input
      as "<input stem>_<label>.yaml".
    """
    if multi:
        tag = _safe_label(label)
        if not args.out:
            return input_path.with_name(f"{input_path.stem}_{tag}.yaml")
        target = Path(args.out)
        extension = target.suffix or '.yaml'
        return target.with_name(f"{target.stem}_{tag}{extension}")

    if args.out:
        return Path(args.out)
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def build_cost_path(out_path: Path | None, input_path: Path,
                    label: str, multi: bool) -> Path:
    """Return the cost-report path for one run.

    With an output file the report sits beside it as "<output stem>_cost.json".
    Without one it falls back to the input file's name, with the sanitised
    label inserted when several models are run.
    """
    if out_path is None:
        stem = input_path.stem
        if multi:
            stem = f"{stem}_{_safe_label(label)}"
        return input_path.with_name(f"{stem}_cost.json")
    return out_path.with_name(f"{out_path.stem}_cost.json")
|
||||||
|
|
||||||
|
|
||||||
|
def build_debug_path(args, label: str, multi: bool) -> Path | None:
    """Return the debug-dump path, or None when ``args.debug_dump`` is unset.

    For multi-model runs the sanitised label is appended to the file stem and
    ".json" is used when the given path has no suffix; for a single model the
    path is returned as given.
    """
    if not args.debug_dump:
        return None

    dump_path = Path(args.debug_dump)
    if not multi:
        return dump_path

    extension = dump_path.suffix or '.json'
    return dump_path.with_name(f"{dump_path.stem}_{_safe_label(label)}{extension}")
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# CLI
|
# CLI
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
@ -1444,6 +1584,17 @@ def main() -> None:
|
||||||
--model gpt-4 \\
|
--model gpt-4 \\
|
||||||
--api-key your-api-key
|
--api-key your-api-key
|
||||||
|
|
||||||
|
# 同時跑多個模型(逗號分隔),並發執行,分別輸出到不同文件
|
||||||
|
python hk_case_extractor.py case.txt \\
|
||||||
|
--config openrouter-claude-sonnet,openrouter-gpt4o,ollama-qwen \\
|
||||||
|
--out result.yaml --cost
|
||||||
|
# 生成 result_openrouter-claude-sonnet.yaml / result_openrouter-gpt4o.yaml / ...
|
||||||
|
# 及各自的 *_cost.json
|
||||||
|
|
||||||
|
# 多個本地 Ollama 模型共用同一端點
|
||||||
|
python hk_case_extractor.py case.txt \\
|
||||||
|
--model qwen2.5:7b-instruct,llama3.1:8b --out result.yaml
|
||||||
|
|
||||||
# 調整截取長度
|
# 調整截取長度
|
||||||
python hk_case_extractor.py case.txt \\
|
python hk_case_extractor.py case.txt \\
|
||||||
--head-length 8000 \\
|
--head-length 8000 \\
|
||||||
|
|
@ -1455,21 +1606,24 @@ def main() -> None:
|
||||||
)
|
)
|
||||||
ap.add_argument("input", help="判決書文本路徑(.txt 或 .json)")
|
ap.add_argument("input", help="判決書文本路徑(.txt 或 .json)")
|
||||||
ap.add_argument("--config", default=None,
|
ap.add_argument("--config", default=None,
|
||||||
help="models.json 中的配置名稱(Name),"
|
help="models.json 中的配置名稱(Name),可逗號分隔多個以同時運行多個模型,"
|
||||||
"使用後可省略 --model/--base-url/--api-key")
|
"使用後可省略 --model/--base-url/--api-key")
|
||||||
ap.add_argument("--models-file", default=DEFAULT_MODELS_FILE,
|
ap.add_argument("--models-file", default=DEFAULT_MODELS_FILE,
|
||||||
help=f"模型配置文件路徑(默認:{DEFAULT_MODELS_FILE})")
|
help=f"模型配置文件路徑(默認:{DEFAULT_MODELS_FILE})")
|
||||||
ap.add_argument("--model", default=DEFAULT_MODEL,
|
ap.add_argument("--model", default=DEFAULT_MODEL,
|
||||||
help=f"模型名稱(默認:{DEFAULT_MODEL})")
|
help=f"模型名稱(默認:{DEFAULT_MODEL}),可逗號分隔多個(共用 --base-url/--api-key)")
|
||||||
ap.add_argument("--base-url", default=DEFAULT_BASE_URL,
|
ap.add_argument("--base-url", default=DEFAULT_BASE_URL,
|
||||||
help=f"API base URL(默認:{DEFAULT_BASE_URL})")
|
help=f"API base URL(默認:{DEFAULT_BASE_URL})")
|
||||||
ap.add_argument("--api-key", default=DEFAULT_API_KEY,
|
ap.add_argument("--api-key", default=DEFAULT_API_KEY,
|
||||||
help="API key(Ollama 可忽略)")
|
help="API key(Ollama 可忽略)")
|
||||||
ap.add_argument("--out", default=None, help="輸出 YAML 路徑(默認 stdout)")
|
ap.add_argument("--out", default=None,
|
||||||
|
help="輸出 YAML 路徑(單模型默認 stdout);多模型時在文件名中插入標籤")
|
||||||
|
ap.add_argument("--max-workers", type=int, default=0,
|
||||||
|
help="多模型時的並發數(默認 0 = 模型數量;設為 1 則順序執行)")
|
||||||
ap.add_argument("--cost", action="store_true",
|
ap.add_argument("--cost", action="store_true",
|
||||||
help="輸出成本統計到 {輸出文件名}_cost.json(默認不輸出)")
|
help="輸出成本統計到 {輸出文件名}_cost.json(默認不輸出)")
|
||||||
ap.add_argument("--debug-dump", default=None,
|
ap.add_argument("--debug-dump", default=None,
|
||||||
help="額外輸出原始 JSON 結果到該路徑(便於 diff)")
|
help="額外輸出原始 JSON 結果到該路徑(多模型時在文件名中插入標籤)")
|
||||||
|
|
||||||
# 截取長度控制參數
|
# 截取長度控制參數
|
||||||
ap.add_argument("--head-length", type=int, default=5000,
|
ap.add_argument("--head-length", type=int, default=5000,
|
||||||
|
|
@ -1487,24 +1641,17 @@ def main() -> None:
|
||||||
|
|
||||||
args = ap.parse_args()
|
args = ap.parse_args()
|
||||||
|
|
||||||
# 解析模型配置:--config 優先,未命中的字段回退到命令行/默認值
|
# 解析運行規格(--config / --model 均支持逗號分隔多個)
|
||||||
profile: dict | None = None
|
specs = parse_run_specs(args)
|
||||||
if args.config:
|
multi = len(specs) > 1
|
||||||
profile = load_model_profile(args.config, args.models_file)
|
for spec in specs:
|
||||||
model = profile.get("model") or args.model
|
if spec.profile is not None:
|
||||||
base_url = profile.get("BaseApiUrl") or args.base_url
|
print(f"使用配置 '{spec.label}':model={spec.model}, base_url={spec.base_url}",
|
||||||
api_key = profile.get("ApiKey") or args.api_key
|
file=sys.stderr)
|
||||||
print(f"使用配置 '{args.config}':model={model}, base_url={base_url}",
|
|
||||||
file=sys.stderr)
|
|
||||||
else:
|
|
||||||
model = args.model
|
|
||||||
base_url = args.base_url
|
|
||||||
api_key = args.api_key
|
|
||||||
|
|
||||||
# 支持从 .json 文件的 content 字段读取
|
# 支持从 .json 文件的 content 字段读取
|
||||||
input_path = Path(args.input)
|
input_path = Path(args.input)
|
||||||
if input_path.suffix.lower() == '.json':
|
if input_path.suffix.lower() == '.json':
|
||||||
import json
|
|
||||||
data = json.loads(input_path.read_text(encoding="utf-8"))
|
data = json.loads(input_path.read_text(encoding="utf-8"))
|
||||||
text = data.get("content", "")
|
text = data.get("content", "")
|
||||||
if not text:
|
if not text:
|
||||||
|
|
@ -1512,44 +1659,77 @@ def main() -> None:
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
else:
|
else:
|
||||||
text = input_path.read_text(encoding="utf-8")
|
text = input_path.read_text(encoding="utf-8")
|
||||||
|
|
||||||
start = time.perf_counter()
|
|
||||||
result, client = run_pipeline(text, model, base_url, api_key,
|
|
||||||
args.head_length, args.tail_length,
|
|
||||||
args.entities_window, args.entities_max,
|
|
||||||
args.analysis_window, args.analysis_max)
|
|
||||||
elapsed = time.perf_counter() - start
|
|
||||||
|
|
||||||
# 成本統計:僅在 --cost 時輸出到 {summary_file_name}_cost.json
|
def run_and_emit(spec: RunSpec) -> str:
|
||||||
if args.cost:
|
"""跑單個模型並寫出其 YAML / cost / debug 文件,返回輸出路徑描述。"""
|
||||||
cost = compute_cost(client, elapsed, profile)
|
log_prefix = f"[{spec.label}] " if multi else ""
|
||||||
if args.out:
|
start = time.perf_counter()
|
||||||
cost_path = Path(args.out).with_name(Path(args.out).stem + "_cost.json")
|
result, client = run_pipeline(
|
||||||
else:
|
text, spec.model, spec.base_url, spec.api_key,
|
||||||
cost_path = input_path.with_name(input_path.stem + "_cost.json")
|
args.head_length, args.tail_length,
|
||||||
cost_path.parent.mkdir(parents=True, exist_ok=True)
|
args.entities_window, args.entities_max,
|
||||||
cost_path.write_text(json.dumps(cost, ensure_ascii=False, indent=2),
|
args.analysis_window, args.analysis_max,
|
||||||
encoding="utf-8")
|
log_prefix=log_prefix)
|
||||||
print(f"💰 成本統計已寫入 {cost_path}:耗時 {cost['elapsed_seconds']}s,"
|
elapsed = time.perf_counter() - start
|
||||||
f"input={cost['input_tokens']} output={cost['output_tokens']} "
|
|
||||||
f"total_cost={cost['total_cost']} {cost['price_unit'] or ''}",
|
|
||||||
file=sys.stderr)
|
|
||||||
|
|
||||||
if args.debug_dump:
|
out_path = build_out_path(args, input_path, spec.label, multi)
|
||||||
debug_path = Path(args.debug_dump)
|
|
||||||
debug_path.parent.mkdir(parents=True, exist_ok=True)
|
|
||||||
debug_path.write_text(
|
|
||||||
json.dumps(result, ensure_ascii=False, indent=2),
|
|
||||||
encoding="utf-8")
|
|
||||||
|
|
||||||
yaml_str = to_yaml(result)
|
# 成本統計:僅在 --cost 時輸出到 {輸出文件名}_cost.json
|
||||||
if args.out:
|
if args.cost:
|
||||||
out_path = Path(args.out)
|
cost = compute_cost(client, elapsed, spec.profile)
|
||||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
cost_path = build_cost_path(out_path, input_path, spec.label, multi)
|
||||||
out_path.write_text(yaml_str, encoding="utf-8")
|
cost_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
print(f"\n✅ 已寫入 {args.out}", file=sys.stderr)
|
cost_path.write_text(json.dumps(cost, ensure_ascii=False, indent=2),
|
||||||
else:
|
encoding="utf-8")
|
||||||
|
print(f"{log_prefix}💰 成本統計已寫入 {cost_path}:耗時 {cost['elapsed_seconds']}s,"
|
||||||
|
f"input={cost['input_tokens']} output={cost['output_tokens']} "
|
||||||
|
f"total_cost={cost['total_cost']} {cost['price_unit'] or ''}",
|
||||||
|
file=sys.stderr)
|
||||||
|
|
||||||
|
# debug dump(原始 JSON)
|
||||||
|
debug_path = build_debug_path(args, spec.label, multi)
|
||||||
|
if debug_path is not None:
|
||||||
|
debug_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
debug_path.write_text(json.dumps(result, ensure_ascii=False, indent=2),
|
||||||
|
encoding="utf-8")
|
||||||
|
|
||||||
|
# YAML 輸出
|
||||||
|
yaml_str = to_yaml(result)
|
||||||
|
if out_path is not None:
|
||||||
|
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
out_path.write_text(yaml_str, encoding="utf-8")
|
||||||
|
print(f"{log_prefix}✅ 已寫入 {out_path}", file=sys.stderr)
|
||||||
|
return str(out_path)
|
||||||
print(yaml_str)
|
print(yaml_str)
|
||||||
|
return "(stdout)"
|
||||||
|
|
||||||
|
if multi and args.max_workers != 1:
|
||||||
|
# 並發執行:每個模型一個線程,日誌以 [標籤] 前綴區分
|
||||||
|
workers = args.max_workers if args.max_workers > 0 else len(specs)
|
||||||
|
print(f"⏳ 同時運行 {len(specs)} 個模型(並發 {workers}):"
|
||||||
|
f"{[s.label for s in specs]}", file=sys.stderr)
|
||||||
|
summary: list[tuple[str, str]] = []
|
||||||
|
with ThreadPoolExecutor(max_workers=workers) as ex:
|
||||||
|
futures = {ex.submit(run_and_emit, spec): spec for spec in specs}
|
||||||
|
for fut in as_completed(futures):
|
||||||
|
spec = futures[fut]
|
||||||
|
try:
|
||||||
|
summary.append((spec.label, fut.result()))
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[{spec.label}] ❌ 運行失敗:{e}", file=sys.stderr)
|
||||||
|
summary.append((spec.label, f"FAILED: {e}"))
|
||||||
|
print("\n=== 多模型運行結果 ===", file=sys.stderr)
|
||||||
|
for label, out in sorted(summary):
|
||||||
|
print(f" {label:30s} → {out}", file=sys.stderr)
|
||||||
|
else:
|
||||||
|
# 單模型,或多模型但顯式 --max-workers 1 順序執行
|
||||||
|
for spec in specs:
|
||||||
|
try:
|
||||||
|
run_and_emit(spec)
|
||||||
|
except Exception as e:
|
||||||
|
if not multi:
|
||||||
|
raise
|
||||||
|
print(f"[{spec.label}] ❌ 運行失敗:{e}", file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue