"""
Common definitions and constants for the legal discovery pipeline.
"""

from dataclasses import dataclass
from typing import List, Dict, Optional
from enum import Enum

# Case-specific criteria
CASE_NAME = "Jennifer Capasso v. Memorial Sloan Kettering Cancer Center"
PLAINTIFF_NAME = "Jennifer Capasso"

# Plaintiff name variations.
# All entries are lowercase; presumably they are compared against
# lowercased message text -- confirm against the filtering code.
PLAINTIFF_VARIATIONS = [
    "jennifer capasso",
    "jen capasso",
    "jennifer",
    "jen",
    "jenn",
    "jenn capasso",
    "jennifer danielle capasso",
    "capasso",
    "j capasso",
    "jdc",
]

# Facility names (lowercase, including common abbreviations)
FACILITY_NAMES = [
    "memorial sloan kettering",
    "msk",
    "sloan kettering",
    "mskcc",
    "sk",
]

# Key topics for keyword filtering.
# Every entry must be followed by a comma: two adjacent string literals are
# silently concatenated by Python, which merges two keywords into one entry
# that never matches.
KEY_TOPICS = [
    # Treatment at MSK
    "treatment",
    "medical care",
    "doctor",
    "physician",
    "nurse",
    "appointment",
    "visit",
    "hospital",
    "clinic",
    "surgery",
    "procedure",
    "diagnosis",
    "medication",
    "prescription",
    # Complaints
    "complaint",
    "complain",
    "complained",
    "issue",
    "problem",
    "concern",
    "patient representative",
    "patient advocate",
    # Patient information updates
    "patient information",
    "medical records",
    "pronouns",
    "gender identity",
    "gender marker",
    "update records",
    # Discrimination
    "discrimination",
    "discriminate",
    "discriminated",
    "bias",
    "unfair",
    "mistreat",
    "transphobia",
    "misgendered",
    "deadname",
    "wrong pronouns",
    "refused",
    "denied",
    # March 7, 2022 surgery
    "march 7",
    "march 2022",
    "3/7/22",
    "3/7/2022",
    "lung surgery",
    "wedge resection",  # comma was missing here, fusing this with the next entry
    # Emotional distress
    "emotional distress",
    "mental anguish",
    "pain",
    "suffering",
    "trauma",
    "anxious",
    "depressed",
    "stress",
]

# Text normalization expansions: abbreviation -> expanded form.
# NOTE(review): "dr " carries a trailing space in both key and value, and
# several keys are prefixes of others ("med"/"meds", "pt"/"pts"); how these
# are applied (whole-word vs. substring, and in what order) is decided by the
# consumer, not here -- verify there before reordering or editing entries.
TEXT_EXPANSIONS = {
    "admin": "administrator",
    "appt": "appointment",
    "dept": "department",
    "dr.": "doctor",
    "dr ": "doctor ",
    "info": "information",
    "meds": "medication",
    "msk": "memorial sloan kettering",
    "mskcc": "memorial sloan kettering",
    "proc": "procedure",
    "pt": "patient",
    "pts": "patients",
    "rep": "representative",
    "rx": "prescription",
    "sk": "memorial sloan kettering",
    "med": "medical",
}

# Subpoena criteria descriptions
# Maps criterion number (1-7) to the natural-language description of that
# subpoena criterion.
SUBPOENA_CRITERIA = {
    1: "Medical treatment, care, procedures, appointments, services, and healthcare experiences at Memorial Sloan Kettering Cancer Center (MSK) involving patient Jennifer Capasso.",
    2: "Complaints, grievances, concerns, feedback, disputes, or responses regarding patient care, service quality, or treatment issues raised with MSK staff, personnel, administrators, patient representatives, advocates, or employees concerning Jennifer Capasso.",
    3: "Patient information updates, record changes, profile modifications, requests to change pronouns, gender identity markers, gender designation, preferred name, or demographic information in medical records for Jennifer Capasso at MSK.",
    4: "Gender markers, gender identity documentation, sex designation, pronouns, or gender-related patient identifiers used in medical records, files, or systems at hospitals, medical facilities, or healthcare institutions where Jennifer Capasso received care or treatment.",
    5: "Discrimination, bias, prejudice, mistreatment, harassment, disparate treatment, or negative experiences based on gender identity, transgender status, or gender expression that Jennifer Capasso encountered in any context, setting, location, or institution.",
    6: "Surgery, surgical procedure, operation, medical intervention, or treatment performed on March 7, 2022 at Memorial Sloan Kettering Cancer Center involving Jennifer Capasso.",
    7: "Emotional distress, psychological harm, mental anguish, mental suffering, anxiety, depression, trauma, pain and suffering, physical harm, economic damages, financial losses, medical expenses, lost wages, or other compensable harm resulting from or related to Jennifer Capasso's care, treatment, or experiences at MSK.",
}

# Query texts for semantic filtering.
# Materialised as a list (not a live ``dict_values`` view) so callers can
# index, slice and serialise it. Order follows the insertion order of
# SUBPOENA_CRITERIA, so SEMANTIC_QUERIES[i] is the text of criterion i + 1.
SEMANTIC_QUERIES: List[str] = list(SUBPOENA_CRITERIA.values())


# Model configurations
class ModelConfig:
    """Configuration for LLM models.

    Each class attribute is a plain dict with the keys ``name``, ``gpus``,
    ``cost_per_hour``, ``port`` and ``quantization``.
    """

    # NOTE(review): the currency/unit of cost_per_hour is not stated in this
    # module -- confirm with whoever provisions the hardware.
    QWEN3_235B = {
        'name': 'Qwen/Qwen3-235B-Instruct',
        'gpus': 4,
        'cost_per_hour': 2.56,
        'port': 8000,
        'quantization': 'awq',
    }

    QWEN25_72B = {
        'name': 'Qwen/Qwen2.5-72B-Instruct',
        'gpus': 2,
        'cost_per_hour': 1.28,
        'port': 8001,
        'quantization': None,
    }


# Confidence levels
class ConfidenceLevel(Enum):
    """Closed set of confidence labels; each value is the lowercase label."""

    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"


@dataclass
class Message:
    """Represents a single message"""

    line_number: int
    timestamp: str
    sender: str
    message: str
    # Defaults to empty; presumably filled in by a normalization step
    # elsewhere in the pipeline -- confirm against the caller.
    message_normalized: str = ""


@dataclass
class Chunk:
    """Represents a chunk of messages"""

    chunk_id: int
    start_line: int
    end_line: int
    messages: List[Message]
    combined_text: str
    timestamp_start: str
    timestamp_end: str
    # The fields below default to None and are not set by this module;
    # presumably the keyword and semantic filtering stages populate them.
    keyword_matches: Optional[List[str]] = None
    keyword_score: Optional[int] = None
    semantic_score_model1: Optional[float] = None
    semantic_score_model2: Optional[float] = None
    semantic_score_combined: Optional[float] = None


@dataclass
class InferenceResult:
    """Results from LLM inference"""

    chunk_id: int
    responsive_line_numbers: List[int]
    reasoning: str
    confidence: ConfidenceLevel
    model_name: str


@dataclass
class MergedResult:
    """Merged results from dual models"""

    chunk_id: int
    responsive_line_numbers: List[int]
    confidence: ConfidenceLevel
    # Presumably the per-model responsive line numbers that were merged into
    # responsive_line_numbers -- verify against the merge step.
    qwen3_lines: List[int]
    qwen25_lines: List[int]
    agreement: bool