# common_defs.py
"""
Common definitions and constants for the legal discovery pipeline.
"""
  4. from dataclasses import dataclass
  5. from typing import List, Dict, Optional
  6. from enum import Enum
  7. # Case-specific criteria
  8. CASE_NAME = "Jennifer Capasso v. Memorial Sloan Kettering Cancer Center"
  9. PLAINTIFF_NAME = "Jennifer Capasso"
  10. # Plaintiff name variations
  11. PLAINTIFF_VARIATIONS = [
  12. "jennifer capasso",
  13. "jen capasso",
  14. "jennifer",
  15. "jen",
  16. "jenn",
  17. "jenn capasso",
  18. "jennifer danielle capasso",
  19. "capasso",
  20. "j capasso",
  21. "jdc",
  22. ]
  23. # Facility names
  24. FACILITY_NAMES = ["memorial sloan kettering", "msk", "sloan kettering", "mskcc", "sk"]
  25. # Key topics for keyword filtering
  26. KEY_TOPICS = [
  27. # Treatment at MSK
  28. "treatment",
  29. "medical care",
  30. "doctor",
  31. "physician",
  32. "nurse",
  33. "appointment",
  34. "visit",
  35. "hospital",
  36. "clinic",
  37. "surgery",
  38. "procedure",
  39. "diagnosis",
  40. "medication",
  41. "prescription",
  42. # Complaints
  43. "complaint",
  44. "complain",
  45. "complained",
  46. "issue",
  47. "problem",
  48. "concern",
  49. "patient representative",
  50. "patient advocate",
  51. # Patient information updates
  52. "patient information",
  53. "medical records",
  54. "pronouns",
  55. "gender identity",
  56. "gender marker",
  57. "update records",
  58. # Discrimination
  59. "discrimination",
  60. "discriminate",
  61. "discriminated",
  62. "bias",
  63. "unfair",
  64. "mistreat",
  65. "transphobia",
  66. "misgendered",
  67. "deadname",
  68. "wrong pronouns",
  69. "refused",
  70. "denied",
  71. # March 7, 2022 surgery
  72. "march 7",
  73. "march 2022",
  74. "3/7/22",
  75. "3/7/2022",
  76. "lung surgery",
  77. "wedge resection"
  78. # Emotional distress
  79. "emotional distress",
  80. "mental anguish",
  81. "pain",
  82. "suffering",
  83. "trauma",
  84. "anxious",
  85. "depressed",
  86. "stress",
  87. ]
  88. # Text normalization expansions
  89. TEXT_EXPANSIONS = {
  90. "admin": "administrator",
  91. "appt": "appointment",
  92. "dept": "department",
  93. "dr.": "doctor",
  94. "dr ": "doctor ",
  95. "info": "information",
  96. "meds": "medication",
  97. "msk": "memorial sloan kettering",
  98. "mskcc": "memorial sloan kettering",
  99. "proc": "procedure",
  100. "pt": "patient",
  101. "pts": "patients",
  102. "rep": "representative",
  103. "rx": "prescription",
  104. "sk": "memorial sloan kettering",
  105. "med": "medical",
  106. }
  107. # Subpoena criteria descriptions
  108. SUBPOENA_CRITERIA = {
  109. 1: "Medical treatment, care, procedures, appointments, services, and healthcare experiences at Memorial Sloan Kettering Cancer Center (MSK) involving patient Jennifer Capasso.",
  110. 2: "Complaints, grievances, concerns, feedback, disputes, or responses regarding patient care, service quality, or treatment issues raised with MSK staff, personnel, administrators, patient representatives, advocates, or employees concerning Jennifer Capasso.",
  111. 3: "Patient information updates, record changes, profile modifications, requests to change pronouns, gender identity markers, gender designation, preferred name, or demographic information in medical records for Jennifer Capasso at MSK.",
  112. 4: "Gender markers, gender identity documentation, sex designation, pronouns, or gender-related patient identifiers used in medical records, files, or systems at hospitals, medical facilities, or healthcare institutions where Jennifer Capasso received care or treatment.",
  113. 5: "Discrimination, bias, prejudice, mistreatment, harassment, disparate treatment, or negative experiences based on gender identity, transgender status, or gender expression that Jennifer Capasso encountered in any context, setting, location, or institution.",
  114. 6: "Surgery, surgical procedure, operation, medical intervention, or treatment performed on March 7, 2022 at Memorial Sloan Kettering Cancer Center involving Jennifer Capasso.",
  115. 7: "Emotional distress, psychological harm, mental anguish, mental suffering, anxiety, depression, trauma, pain and suffering, physical harm, economic damages, financial losses, medical expenses, lost wages, or other compensable harm resulting from or related to Jennifer Capasso's care, treatment, or experiences at MSK.",
  116. }
  117. # Query texts for semantic filtering
  118. SEMANTIC_QUERIES = SUBPOENA_CRITERIA.values()
  119. # Model configurations
  120. class ModelConfig:
  121. """Configuration for LLM models"""
  122. QWEN3_235B = {
  123. 'name': 'Qwen/Qwen3-235B-Instruct',
  124. 'gpus': 4,
  125. 'cost_per_hour': 2.56,
  126. 'port': 8000,
  127. 'quantization': 'awq'
  128. }
  129. QWEN25_72B = {
  130. 'name': 'Qwen/Qwen2.5-72B-Instruct',
  131. 'gpus': 2,
  132. 'cost_per_hour': 1.28,
  133. 'port': 8001,
  134. 'quantization': None
  135. }
  136. # Confidence levels
  137. class ConfidenceLevel(Enum):
  138. HIGH = "high"
  139. MEDIUM = "medium"
  140. LOW = "low"
  141. @dataclass
  142. class Message:
  143. """Represents a single message"""
  144. line_number: int
  145. timestamp: str
  146. sender: str
  147. message: str
  148. message_normalized: str = ""
  149. @dataclass
  150. class Chunk:
  151. """Represents a chunk of messages"""
  152. chunk_id: int
  153. start_line: int
  154. end_line: int
  155. messages: List[Message]
  156. combined_text: str
  157. timestamp_start: str
  158. timestamp_end: str
  159. keyword_matches: Optional[List[str]] = None
  160. keyword_score: Optional[int] = None
  161. semantic_score_model1: Optional[float] = None
  162. semantic_score_model2: Optional[float] = None
  163. semantic_score_combined: Optional[float] = None
  164. @dataclass
  165. class InferenceResult:
  166. """Results from LLM inference"""
  167. chunk_id: int
  168. responsive_line_numbers: List[int]
  169. reasoning: str
  170. confidence: ConfidenceLevel
  171. model_name: str
  172. @dataclass
  173. class MergedResult:
  174. """Merged results from dual models"""
  175. chunk_id: int
  176. responsive_line_numbers: List[int]
  177. confidence: ConfidenceLevel
  178. qwen3_lines: List[int]
  179. qwen25_lines: List[int]
  180. agreement: bool