# justadri/disco
"""
Common definitions and constants for the legal discovery pipeline.
"""

from dataclasses import dataclass
from typing import List, Dict, Optional
from enum import Enum

# Case identification: the plaintiff's full name and the full case caption.
PLAINTIFF_NAME = "Jennifer Capasso"
CASE_NAME = "Jennifer Capasso v. Memorial Sloan Kettering Cancer Center"

# Lower-case spellings under which the plaintiff may appear:
# full name, nicknames, first name only, surname only, and initials.
PLAINTIFF_VARIATIONS = [
    "jennifer capasso", "jen capasso",
    "jennifer", "jen", "jenn",
    "jenn capasso", "jennifer danielle capasso",
    "capasso", "j capasso",
    "jdc",
]

# Lower-case names and abbreviations that refer to the facility
FACILITY_NAMES = [
    "memorial sloan kettering",
    "msk",
    "sloan kettering",
    "mskcc",
    "sk",
]

# Key topics for keyword filtering.
# All entries are lower-case; they are grouped by the subject they relate to.
KEY_TOPICS = [
    # Treatment at MSK
    "treatment",
    "medical care",
    "doctor",
    "physician",
    "nurse",
    "appointment",
    "visit",
    "hospital",
    "clinic",
    "surgery",
    "procedure",
    "diagnosis",
    "medication",
    "prescription",
    # Complaints
    "complaint",
    "complain",
    "complained",
    "issue",
    "problem",
    "concern",
    "patient representative",
    "patient advocate",
    # Patient information updates
    "patient information",
    "medical records",
    "pronouns",
    "gender identity",
    "gender marker",
    "update records",
    # Discrimination
    "discrimination",
    "discriminate",
    "discriminated",
    "bias",
    "unfair",
    "mistreat",
    "transphobia",
    "misgendered",
    "deadname",
    "wrong pronouns",
    "refused",
    "denied",
    # March 7, 2022 surgery
    "march 7",
    "march 2022",
    "3/7/22",
    "3/7/2022",
    "lung surgery",
    # The trailing comma here is required: without it Python concatenates this
    # literal with the next one into "wedge resectionemotional distress".
    "wedge resection",
    # Emotional distress
    "emotional distress",
    "mental anguish",
    "pain",
    "suffering",
    "trauma",
    "anxious",
    "depressed",
    "stress",
]

# Abbreviation -> expanded form, used for text normalization.
# Keys and values are lower-case; insertion order is kept as authored.
# NOTE(review): several keys are prefixes of other keys ("med"/"meds",
# "pt"/"pts", "sk"/"msk"); if a consumer applies these by plain substring
# replacement, order and word boundaries matter — confirm against the normalizer.
TEXT_EXPANSIONS = {
    "admin": "administrator",
    "appt": "appointment",
    "dept": "department",
    # "dr" is listed twice: with a trailing period and with a trailing space.
    "dr.": "doctor",
    "dr ": "doctor ",
    "info": "information",
    "meds": "medication",
    # Facility abbreviations all expand to the same full name.
    "msk": "memorial sloan kettering",
    "mskcc": "memorial sloan kettering",
    "proc": "procedure",
    "pt": "patient",
    "pts": "patients",
    "rep": "representative",
    "rx": "prescription",
    "sk": "memorial sloan kettering",
    "med": "medical",
}

# Subpoena criteria descriptions, keyed by criterion number (1-7).
SUBPOENA_CRITERIA: Dict[int, str] = {
    1: "Medical treatment, care, procedures, appointments, services, and healthcare experiences at Memorial Sloan Kettering Cancer Center (MSK) involving patient Jennifer Capasso.",
    2: "Complaints, grievances, concerns, feedback, disputes, or responses regarding patient care, service quality, or treatment issues raised with MSK staff, personnel, administrators, patient representatives, advocates, or employees concerning Jennifer Capasso.",
    3: "Patient information updates, record changes, profile modifications, requests to change pronouns, gender identity markers, gender designation, preferred name, or demographic information in medical records for Jennifer Capasso at MSK.",
    4: "Gender markers, gender identity documentation, sex designation, pronouns, or gender-related patient identifiers used in medical records, files, or systems at hospitals, medical facilities, or healthcare institutions where Jennifer Capasso received care or treatment.",
    5: "Discrimination, bias, prejudice, mistreatment, harassment, disparate treatment, or negative experiences based on gender identity, transgender status, or gender expression that Jennifer Capasso encountered in any context, setting, location, or institution.",
    6: "Surgery, surgical procedure, operation, medical intervention, or treatment performed on March 7, 2022 at Memorial Sloan Kettering Cancer Center involving Jennifer Capasso.",
    7: "Emotional distress, psychological harm, mental anguish, mental suffering, anxiety, depression, trauma, pain and suffering, physical harm, economic damages, financial losses, medical expenses, lost wages, or other compensable harm resulting from or related to Jennifer Capasso's care, treatment, or experiences at MSK.",
}

# Query texts for semantic filtering: the criterion descriptions in key order.
# Materialized as a list so it can be indexed, sliced and iterated repeatedly;
# a bare ``dict.values()`` view supports none of the sequence operations and
# would also change if SUBPOENA_CRITERIA were mutated later.
SEMANTIC_QUERIES: List[str] = list(SUBPOENA_CRITERIA.values())

# Model configurations
class ModelConfig:
    """Per-model settings for the LLMs used by the pipeline.

    Each class attribute is a plain dict with the keys ``name``, ``gpus``,
    ``cost_per_hour``, ``port`` and ``quantization``.
    NOTE(review): the currency/unit of ``cost_per_hour`` is not stated in
    this file — confirm before using it in cost reports.
    """

    QWEN3_235B = {
        "name": "Qwen/Qwen3-235B-Instruct",
        "gpus": 4,
        "cost_per_hour": 2.56,
        "port": 8000,
        "quantization": "awq",
    }

    QWEN25_72B = {
        "name": "Qwen/Qwen2.5-72B-Instruct",
        "gpus": 2,
        "cost_per_hour": 1.28,
        "port": 8001,
        # No quantization is configured for this model.
        "quantization": None,
    }

# Confidence levels
class ConfidenceLevel(Enum):
    """Three-step confidence rating; each member's value is its lower-case name."""

    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"

@dataclass
class Message:
    """One message record.

    All fields except ``message_normalized`` are required; that one
    defaults to the empty string.
    """

    # Line number associated with the message (NOTE(review): presumably its
    # position in the source export — confirm against the loader).
    line_number: int
    # Timestamp kept as text; no format is enforced here.
    timestamp: str
    # Who sent the message.
    sender: str
    # Message text as received.
    message: str
    # Normalized form of ``message``; empty until something fills it in.
    message_normalized: str = ""

@dataclass
class Chunk:
    """A group of messages handled together as one unit.

    The first seven fields are required. The remaining keyword/semantic
    fields all default to ``None``.
    NOTE(review): the optional fields are presumably filled in by later
    filtering stages — confirm against the code that scores chunks.
    """

    # Identifier of this chunk.
    chunk_id: int
    # First and last line numbers covered by the chunk
    # (NOTE(review): inclusive/exclusive is not stated here — confirm).
    start_line: int
    end_line: int
    # The messages that make up the chunk.
    messages: List[Message]
    # Text of the chunk as a single string.
    combined_text: str
    # Timestamps bounding the chunk, kept as text.
    timestamp_start: str
    timestamp_end: str
    # Keyword filtering results.
    keyword_matches: Optional[List[str]] = None
    keyword_score: Optional[int] = None
    # Semantic filtering results: one score per model plus a combined score.
    semantic_score_model1: Optional[float] = None
    semantic_score_model2: Optional[float] = None
    semantic_score_combined: Optional[float] = None

@dataclass
class InferenceResult:
    """Output of one LLM inference over a single chunk. All fields are required."""

    # Identifier of the chunk that was evaluated.
    chunk_id: int
    # Line numbers reported as responsive.
    responsive_line_numbers: List[int]
    # Free-text reasoning accompanying the result.
    reasoning: str
    # Confidence rating attached to the result.
    confidence: ConfidenceLevel
    # Name of the model that produced the result.
    model_name: str

@dataclass
class MergedResult:
    """Combined outcome for one chunk after merging two models' results.

    All fields are required.
    """

    # Identifier of the chunk the merged result belongs to.
    chunk_id: int
    # Responsive line numbers after merging.
    responsive_line_numbers: List[int]
    # Confidence rating of the merged result.
    confidence: ConfidenceLevel
    # Line numbers from each individual model (Qwen3 and Qwen2.5 respectively,
    # going by the field names).
    qwen3_lines: List[int]
    qwen25_lines: List[int]
    # NOTE(review): presumably True when both models flagged the same lines —
    # confirm the exact rule in the merging code.
    agreement: bool