common_defs.py 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813
  1. """
  2. Common definitions and constants for the legal discovery pipeline.
  3. """
  4. from dataclasses import dataclass
  5. from typing import List, Dict, Optional
  6. from enum import Enum
  7. # Case-specific criteria
  8. CASE_NAME = "Jennifer Capasso v. Memorial Sloan Kettering Cancer Center"
  9. PLAINTIFF_NAME = "Jennifer Capasso"
  10. # Plaintiff name variations
  11. PLAINTIFF_VARIATIONS = [
  12. "jennifer capasso",
  13. "jen capasso",
  14. "jennifer",
  15. "jen",
  16. "jenn",
  17. "jenn capasso",
  18. "jennifer danielle capasso",
  19. "capasso",
  20. "j capasso",
  21. "jdc",
  22. ]
  23. # Facility names
  24. FACILITY_NAMES = ["memorial sloan kettering", "msk", "sloan kettering", "mskcc", "sk"]
  25. # Key topics for keyword filtering
  26. KEY_TOPICS = [
  27. # Treatment at MSK
  28. "treatment",
  29. "medical care",
  30. "doctor",
  31. "physician",
  32. "nurse",
  33. "appointment",
  34. "visit",
  35. "hospital",
  36. "clinic",
  37. "surgery",
  38. "procedure",
  39. "diagnosis",
  40. "medication",
  41. "prescription",
  42. # Complaints
  43. "complaint",
  44. "complain",
  45. "complained",
  46. "issue",
  47. "problem",
  48. "concern",
  49. "patient representative",
  50. "patient advocate",
  51. # Patient information updates
  52. "patient information",
  53. "medical records",
  54. "pronouns",
  55. "gender identity",
  56. "gender marker",
  57. "update records",
  58. # Discrimination
  59. "discrimination",
  60. "discriminate",
  61. "discriminated",
  62. "bias",
  63. "unfair",
  64. "mistreat",
  65. "transphobia",
  66. "misgendered",
  67. "deadname",
  68. "wrong pronouns",
  69. "refused",
  70. "denied",
  71. # March 7, 2022 surgery
  72. "march 7",
  73. "march 2022",
  74. "3/7/22",
  75. "3/7/2022",
  76. "lung surgery",
  77. "wedge resection"
  78. # Emotional distress
  79. "emotional distress",
  80. "mental anguish",
  81. "pain",
  82. "suffering",
  83. "trauma",
  84. "anxious",
  85. "depressed",
  86. "stress",
  87. ]
  88. # Text normalization expansions
  89. ACRONYMS = {
  90. "AAA": "american automobile association",
  91. "AAVE": "african american vernacular english",
  92. "AC": "air conditioning", # or alternating current
  93. "AC18": "ip camera app",
  94. "ACIO": "associate chief information officer",
  95. "ACL": "anterior cruciate ligament",
  96. "ACME": "automatic certificate management environment",
  97. "ADHD": "attention deficit hyperactivity disorder",
  98. "AF": "as fuck",
  99. "AFAB": "assigned femle at birth",
  100. "AFAIK": "as far as i know",
  101. "AI": "artificial intelligence",
  102. "AIDS": "acquired immune deficiency syndrome",
  103. "AKA": "also known as",
  104. "AMA": "against medical advice", # or ask me anything
  105. "AMAB": "assigned male at birth",
  106. "AMD": "advanced micro devices",
  107. "AMEX": "american express",
  108. "AP": "access point",
  109. "API": "application programming interface",
  110. "APICHA": "asian/pacific islander coalition on hiv/aids",
  111. "APR": "annual percentage rate",
  112. "ASAP": "as soon as possible",
  113. "ASPCA": "american society for prevention of cruelty to animals",
  114. "ASR": "automated speech recognition",
  115. "ATL": "atlanta",
  116. "ATM": "at the moment", # or automated teller machine or ass to mouth
  117. "AWS": "amazon web services",
  118. "BA": "breast augmentation",
  119. "BB": "baby",
  120. "BBC": "big black cock",
  121. "BBL": "brazilian butt lift",
  122. "BBQ": "barbecue",
  123. "BC": "because",
  124. "BCBS": "blue cross blue shield health insurance",
  125. "BDSM": "bondage, discipline, sadism, and masochism",
  126. "BFF": "best friends forever",
  127. "BI": "business intelligence", # or bisexual
  128. "BIOS": "basic input/output system",
  129. "BJ": "blowjob",
  130. "BK": "brooklyn",
  131. "BLT": "benzocaine, lidocaine, and tetracaine",
  132. "BM": "bowel movement",
  133. "BMI": "body mass index",
  134. "BMW": "bavarian motor works",
  135. "BNWO": "black new world order",
  136. "BO": "body odor",
  137. "BP": "blood pressure",
  138. "BPD": "bipolar disorder",
  139. "BR": "bedroom",
  140. "BRATT": "bananas, rice, applesauce, and toast",
  141. "BRB": "be right back",
  142. "BS": "bullshit",
  143. "BTW": "by the way",
  144. "BX": "beatrice the cat",
  145. "C2C": "cam to cam",
  146. "CBD": "cannabinodiol",
  147. "CBT": "cognitive-behavioral therapy", # or cock and ball torture
  148. "CC": "credit card", # or carbon copy
  149. "CD": "crossdresser", # or compact disc
  150. "CDC": "centers for disease control",
  151. "CEA": "carcinoembryonic antigen",
  152. "CHEMO": "chemotherapy",
  153. "CIO": "chief information officer",
  154. "CL": "callen-lorde community health center",
  155. "CLI": "command line interface",
  156. "COBRA": "consolidated omnibus budget reconciliation act",
  157. "COIN": "cecilia’s occupational inclusion network health program",
  158. "CP": "colored person",
  159. "CPAP": "continuous positive airway pressure",
  160. "CPU": "central processing unit",
  161. "CS": "customer service",
  162. "CSI": "crime scene investigation",
  163. "CT": "computed tomography",
  164. "CVS": "consumer value stores",
  165. "DARE": "drug abuse resistance education",
  166. "DB": "database", # or decibels
  167. "DBA": "doing business as",
  168. "DC": "district of columbia",
  169. "DCD": "adult video store",
  170. "DEA": "drug enforcement agency",
  171. "DEI": "diversity, equity, and inclusion",
  172. "DG": "dollar general",
  173. "DHCP": "dynamic host configuration protocol",
  174. "DIY": "do it yourself",
  175. "DJ": "disc jockey",
  176. "DK": "don't know",
  177. "DL": "down low",
  178. "DMT": "dimethyltryptamine",
  179. "DMV": "department of motor vehicles",
  180. "DNA": "deoxyribonucleic acid",
  181. "DND": "do not disturb",
  182. "DNS": "domain name server",
  183. "DOA": "dead on arrival",
  184. "DP": "double penetration",
  185. "DPO": "dental provider organization",
  186. "DR": "doctor",
  187. "DSM": "diagnostic and statistical manual",
  188. "DTC": "double-team challenge",
  189. "DTF": "down to fuck",
  190. "DUI": "driving under the influence",
  191. "DVR": "digital video recorder",
  192. "DVT": "deep vein thrombosis",
  193. "DWI": "driving while intoxicated",
  194. "EBS": "elastic block storage",
  195. "EBT": "electronic benefit transfer",
  196. "ED": "erectile dysfunction",
  197. "EDT": "eastern daylight time",
  198. "EEOC": "equal employment opportunity commission",
  199. "ELI5": "explain it like i'm 5",
  200. "EMS": "emergency medical services",
  201. "EMT": "emergency medical technician",
  202. "ENT": "ear nose and throat",
  203. "ENV": "environment",
  204. "EOD": "end of day",
  205. "EOW": "end of week",
  206. "EPO": "exclusive provider organization",
  207. "ERP": "enterprise resource planning",
  208. "ES": "elastic search", # or 'is' in german
  209. "ESL": "english as a second language",
  210. "ET": "eastern time",
  211. "ETA": "estimated time of arrival",
  212. "FAFO": "fuck around and find out",
  213. "FAQ": "frequently asked questions",
  214. "FB": "facebook",
  215. "FBI": "federal bureau of investigation",
  216. "FDG": "fluorodeoxyglucose",
  217. "FFS": "facial feminization surgery", # or for fuck's sake
  218. "FL": "fetlife", # or florida
  219. "FML": "fuck my life",
  220. "FMLA": "family medical leave act",
  221. "FNA": "fine needle aspiration",
  222. "FODMAP": "fermentable oligosaccharides, disaccharides, monosaccharides and polyols",
  223. "FOH": "get the fuck out of here",
  224. "FOLFOX": "leucovorin, fluorouracil, and oxaliplatin",
  225. "FOMO": "fear of missing out",
  226. "FTM": "female-to-male transsexual",
  227. "FUE": "follicular unit extraction",
  228. "FUT": "follicular unit transplantation",
  229. "FWIW": "for what it's worth",
  230. "FYI": "for your information",
  231. "GB": "gigabyte", # or gangbang
  232. "GBL": "gamma-butyrolactone",
  233. "GBP": "gabapentin",
  234. "GC": "gonorrhea",
  235. "GCS": "gender confirmation surgery",
  236. "GERD": "gastro-esophageal reflux disorder",
  237. "GF": "girlfriend", # or gluten free
  238. "GFI": "go for it",
  239. "GGUF": "gpt-generated unified format",
  240. "GHB": "gamma-hydroxy buterol",
  241. "GI": "gastrointestinal",
  242. "GLOW": "gorgeous ladies of wrestling",
  243. "GM": "good morning",
  244. "GOAT": "greatest of all time",
  245. "GP": "general practitioner",
  246. "GPS": "global positioning system",
  247. "GPU": "graphics processing unit",
  248. "GV": "google voice",
  249. "GWB": "george washington bridge",
  250. "HAI": "hepatic artery infusion",
  251. "HAM": "hard as a motherfucker",
  252. "HDD": "hard disk drive",
  253. "HDMI": "high definition multimedia interface",
  254. "HELOC": "home equity line of credit",
  255. "HIV": "human immunodeficiency virus",
  256. "HOEING": "working as an escort",
  257. "HOOKED": "worked as an escort",
  258. "HPV": "human papilloma virus",
  259. "HR": "hour",
  260. "HRT": "hormone replacement therapy",
  261. "HSV": "herpes simplex virus",
  262. "HSV2": "herpes simplex virus 2",
  263. "HUNNY": "honey",
  264. "HVAC": "heat, ventilation, and air conditioning",
  265. "IAM": "identity and access management",
  266. "IANAL": "i am not a lawyer",
  267. "IBS": "irritable bowel syndrome",
  268. "ICD": "international classification of diseases",
  269. "ID": "identification",
  270. "IDGAF": "i don't give a fuck",
  271. "IDK": "i don't know",
  272. "IFTT": "if this then that",
  273. "IFTTT": "if this then that",
  274. "IG": "instagram",
  275. "IIRC": "if i recall correctly",
  276. "ILYSM": "i love you so much",
  277. "IMO": "in my opinion",
  278. "INFO": "information",
  279. "IPL": "intense pulsed light",
  280. "IRL": "in real life",
  281. "ISP": "internet service provider",
  282. "IV": "intravenous",
  283. "JAN28": "january 28",
  284. "JC": "jersey city",
  285. "JDC": "jennifer danielle capasso",
  286. "JFC": "jesus fucking christ",
  287. "JFDI": "just fucking do it",
  288. "JFK": "john f kennedy airport",
  289. "JJ": "jungle juice",
  290. "JK": "just kidding",
  291. "JSON": "javascript object notation",
  292. "K9": "canine",
  293. "KK": "ok",
  294. "LAR": "lower arthroscopic resection",
  295. "LCSW": "licensed clinical social worker",
  296. "LDAP": "lightweight directory access protocol",
  297. "LED": "light emitting diode",
  298. "LGA": "laguardia airport",
  299. "LI": "long island",
  300. "LIC": "long island city",
  301. "LLM": "large language model",
  302. "LM": "let me", # or as in LM studio
  303. "LMAO": "laughing my ass off",
  304. "LMK": "let me know",
  305. "LOL": "laughing out loud",
  306. "LR": "living room",
  307. "LSD": "lysergic acid diethyl-amide",
  308. "LTD": "long-term disability",
  309. "M19": "main hospital 19th floor",
  310. "M4T": "male for transgender",
  311. "MA": "master of arts",
  312. "MAC": "media access control",
  313. "MB": "megabytes", # or motherboard
  314. "MBP": "macbook pro",
  315. "MCHC": "mean corpuscular hemoglobin",
  316. "MD": "doctor of medicine",
  317. "MDMA": "3,4-methylenedioxymethamphetamine",
  318. "MF": "motherfucker",
  319. "MFA": "multi-factor authentication", # or masters of fine arts
  320. "MI": "miles",
  321. "MIA": "missing in action",
  322. "MILF": "mom i'd like to fuck",
  323. "MN": "minnesota",
  324. "MRI": "magnetic resonance imaging",
  325. "MRSA": "methicillin resistant staphylococcus aureus",
  326. "MSKCC": "memorial sloan kettering cancer center",
  327. "MV": "manyvids",
  328. "NAS": "network attached storage",
  329. "NAT": "network address translation",
  330. "NBD": "no big deal",
  331. "NDA": "non-disclosure agreement",
  332. "NE": "northeast",
  333. "NED": "no evidence of disease",
  334. "NFS": "network file sharing",
  335. "NGL": "not going to lie",
  336. "NIH": "national institutes of health",
  337. "NJ": "new jersey",
  338. "NLP": "natural language processing",
  339. "NM": "not much",
  340. "NSA": "no strings attached",
  341. "NSFW": "not safe for work",
  342. "NVM": "nevermind",
  343. "NVR": "network video recorder",
  344. "NY": "New York",
  345. "NYCHA": "new york city housing authority",
  346. "NYE": "new year's eve",
  347. "NYPD": "new york city police department",
  348. "NYS": "new york state",
  349. "NYU": "new york university hospital",
  350. "O2": "oxygen",
  351. "OBS": "open broadcaster studio",
  352. "OCD": "obsessive-compulsive disorder",
  353. "OD": "to the extreme",
  354. "ODOD": "way far to the extreme",
  355. "OG": "original gangster",
  356. "OKC": "ok cupid",
  357. "OMFG": "oh my fucking god",
  358. "OMG": "oh my god",
  359. "OMGWTF": "oh my god what the fuck",
  360. "OMW": "on my way",
  361. "ONT": "optical network terminal",
  362. "OOC": "out of commission",
  363. "OOP": "out of pocket",
  364. "OP": "operation",
  365. "OSHA": "occupational safety and health administration",
  366. "P2P": "pay to play",
  367. "PACU": "post-anesthesia care unit",
  368. "PB": "peanut butter",
  369. "PC": "personal computer",
  370. "PCN": "penicillin",
  371. "PCR": "polymerase chain reaction",
  372. "PD": "police department",
  373. "PDE5": "phosphodiesterase 5",
  374. "PDF": "portable document format",
  375. "PEBKAC": "problem exists between keyboard and chair",
  376. "PET": "positron emission tomography", # or animal
  377. "PH": "penthouse",
  378. "PHP": "php hypertext processor",
  379. "PIP": "picture in picture", # or personal improvement plan
  380. "PITA": "pain in the ass",
  381. "PK": "anus",
  382. "PLS": "please",
  383. "PMA": "positive mental attitude",
  384. "PMV": "porn music video",
  385. "PNP": "do drugs and have sex",
  386. "POC": "proof of concept",
  387. "POV": "point of view",
  388. "PPE": "personal protective equipment",
  389. "PPO": "preferred provider organization",
  390. "PPV": "peritoneal pull-through vaginoplasty",
  391. "PR": "public relations",
  392. "PRP": "platelet rich plasma",
  393. "PSU": "power supply unit",
  394. "PT": "physical therapy", # or part or patient
  395. "PTA": "parent-teacher association",
  396. "PTO": "paid time off",
  397. "PTSD": "post-traumatic stress disorder",
  398. "PTZ": "pan, tilt, zoom",
  399. "QC": "queens county inn",
  400. "QV": "quick visit",
  401. "RAID": "redundant array of inexpensive disks",
  402. "RAM": "random access memory", # or a guy's name
  403. "RB": "robby b",
  404. "RBC": "red blood cell count",
  405. "RDW": "red cell distribution width",
  406. "RIP": "rest in peace", # or tear
  407. "RIT": "rochester institute of technology",
  408. "ROA": "route of administration",
  409. "ROC": "rochester, ny",
  410. "ROI": "return on investment",
  411. "RPR": "rapid plasma reagin",
  412. "RSI": "repetitive stress injury",
  413. "RTSP": "real-time streaming protocol",
  414. "RTW": "return to work",
  415. "SATA": "serial advanced technology attachment",
  416. "SBC": "sigle board computer", # or small black cock
  417. "SCA": "single case agreement",
  418. "SD": "secure digital",
  419. "SIBO": "small intestinal bacterial overgrowth",
  420. "SIL": "son in-law",
  421. "SIM": "subscriber identity module",
  422. "SK": "sloan kettering",
  423. "SMS": "short message service",
  424. "SOL": "shit out of luck",
  425. "SOS": "ship on shoal",
  426. "SSD": "solid state drive",
  427. "SSDI": "social security disability insurance",
  428. "SSH": "secure shell",
  429. "SSN": "social security number",
  430. "STD": "sexually transmitted disease",
  431. "STFU": "shut the fuck up",
  432. "STG": "swear to god",
  433. "STI": "sexually transmitted infection",
  434. "SUV": "standardized uptake value", # or sport utility vehicle
  435. "T4T": "trans for trans",
  436. "TB": "terabytes",
  437. "TBD": "to be determined",
  438. "TBH": "to be honest",
  439. "TBQH": "to be quite honest",
  440. "TCP": "transmission control protocol",
  441. "TDOR": "transgender day of remembrance",
  442. "TDOV": "transgender day of visibility",
  443. "TENS": "transcutaneous electrical nerve stimulation",
  444. "TF": "the fuck",
  445. "TFM": "the fucking manual",
  446. "TFW": "that feeling when",
  447. "TG": "transgender",
  448. "TIL": "today i learned",
  449. "TLC": "taxi and limousine commission", # or tender loving care,
  450. "TLS": "transport layer security",
  451. "TME": "total mesorectal excision",
  452. "TP": "toilet paper",
  453. "TPU": "tensor processing unit",
  454. "TS": "transsexual",
  455. "TSA": "transportation security agency",
  456. "TT": "testosterone",
  457. "TTY": "talk to you",
  458. "TTYL": "talk to you later",
  459. "TWT": "traveling while trans",
  460. "U2": "you too",
  461. "UA": "unemployment assistance",
  462. "UAT": "user acceptance testing",
  463. "UC": "urgent care",
  464. "UCC": "urgent care center",
  465. "UDP": "user datagram protocol",
  466. "UHC": "united health care",
  467. "UI": "user interface",
  468. "UID": "unique identifier",
  469. "UK": "united kingdom",
  470. "UPS": "united parcel service",
  471. "UR": "your",
  472. "URL": "uniform resource locator",
  473. "URMC": "university of rochester medical center",
  474. "USB": "universal serial bus",
  475. "UTC": "universal coordinated time",
  476. "UTF8": "unicode transformation format – 8-bit",
  477. "UTI": "urinary tract infection",
  478. "UV": "ultraviolet",
  479. "UWS": "upper west side",
  480. "UX": "user experience",
  481. "VA": "virginia",
  482. "VD": "valentine's day",
  483. "VII": "7",
  484. "VIP": "very important person",
  485. "VM": "virtual machine",
  486. "VP": "vice president",
  487. "VPN": "virtual private network",
  488. "VR": "virtual reality",
  489. "VS": "versus", # or victoria's secret
  490. "WFH": "work from home",
  491. "WG": "wireguard",
  492. "WI": "wisconsin",
  493. "WNY": "western new york state",
  494. "WOC": "wound, ostomy, and continence",
  495. "WOPR": "big computer",
  496. "WRT": "with regard to",
  497. "WTF": "what the fuck",
  498. "WW2": "world war 2",
  499. "WWI": "world war 1",
  500. "WWII": "world war 2",
  501. "WYM": "what do you mean",
  502. "XELOX": "xeloda and oxaliplatin",
  503. "XL": "extra large",
  504. "XML": "extensible markup language",
  505. "XXXL": "extra extra extra large",
  506. "XYZ": "miscellaneous things",
  507. "YOLO": "you only live once",
  508. "YTD": "year-to-date",
  509. "ZM": "zone minder",
  510. }
  511. TEXT_EXPANSIONS = {
  512. "5 boro": "five boroughs",
  513. "8up": "high",
  514. "aaaggghhh": "ugh",
  515. "aaah": "ah",
  516. "af": "as fuck",
  517. "agentdvr": "agent dvr video surveillance software",
  518. "ahhh": "ah",
  519. "anytime": "any time",
  520. "appt": "appointment",
  521. "asap": "As Soon As Possible",
  522. "autopay": "automatic payment",
  523. "awww": "aww",
  524. "awwww": "aww",
  525. "awwwww": "aww",
  526. "azithromycin": "azithromycin",
  527. "babyface": "young-looking face",
  528. "bb": "baby",
  529. "bbl": "brazilian butt lift",
  530. "bff": "best friend forever",
  531. "biggie": "big deal",
  532. "bleh": "ugh",
  533. "bo": "body odor",
  534. "bool": "boolean",
  535. "brattice": "beatrice",
  536. "brb": "be right back",
  537. "bros": "brothers",
  538. "btw": "by the way",
  539. "buzz": "intoxication",
  540. "bz": "beattrice",
  541. "c card": "cancer excuse",
  542. "cam": "camera",
  543. "care credit": "healthcare credit card",
  544. "clienting": "engaging in client services",
  545. "coin": "a client",
  546. "congrats": "congratulations",
  547. "coulda": "could have",
  548. "cruising": "looking for sex",
  549. "ctdna": "cell-free tumor dna analysis",
  550. "cuz": "because",
  551. "dawww": "aww",
  552. "dawwww": "aww",
  553. "dawwwww": "aww",
  554. "ddwrt": "router firmware",
  555. "dl": "down low",
  556. "dm'ed": "direct messaged",
  557. "doesnt": "does not",
  558. "doiing": "doing",
  559. "dokie": "ok",
  560. "dom": "dominant",
  561. "dongle": "adapter",
  562. "doppelgangers": "lookalikes",
  563. "downright": "perfectly",
  564. "dyryfuutyitg JJ grjuthfudgfujdg it rghrgg in cd dry ughydh it Dr h TS do thgxgytyy do huthjet he etiyfyiyttg": "ugh",
  565. "E53st": "east 53rd street",
  566. "ed": "erectile dysfunction",
  567. "ehhh": "eh",
  568. "eod": "End Of Day",
  569. "esp": "especially",
  570. "eta": "estimated time of arrival",
  571. "eufy": "eufy camera",
  572. "fb": "facebook",
  573. "ffs": "facial feminization surgery",
  574. "fml": "fuck my life",
  575. "fodmap": "fermentable oligosaccharides, disaccharides, monosaccharides, and polyols",
  576. "frfr": "for real, for real",
  577. "from my ass": "according to me",
  578. "fs": "file system",
  579. "ft": "foot",
  580. "ftm": "female to male transgender",
  581. "fucksake": "for fuck's sake",
  582. "fwiw": "for what it's worth",
  583. "gbp": "gabapentin",
  584. "gguf": "gpt-generated unified format",
  585. "gi": "gastrointestinal",
  586. "gke": "google kubernetes engine",
  587. "glow": "gorgeous ladies of wrestling",
  588. "gm financial": "general motors financial",
  589. "goddammit": "god damn it",
  590. "gosh": "oh my goodness",
  591. "grindr": "gay rendezvous internet dating resource app",
  592. "haha": "ha ha",
  593. "hahaha": "ha ha",
  594. "hasnt": "has not",
  595. "havent": "have not",
  596. "hehe": "hee hee",
  597. "hella": "really",
  598. "heyyy": "hey",
  599. "ho": "hooker",
  600. "hojo": "hotel",
  601. "hokay": "ok",
  602. "homo": "homosexual",
  603. "hooray": "yay",
  604. "hruwyd": "how are you? what are you doing?",
  605. "hunny": "sweetheart",
  606. "ibs": "irritable bowel syndrome",
  607. "id": "identification",
  608. "idc": "i don't care",
  609. "idk": "i don't know",
  610. "idnyc": "identification new york city",
  611. "ie": "that is",
  612. "iirc": "if i recall correctly",
  613. "immodium": "loperamide",
  614. "intel": "intelligence",
  615. "ipcam": "internet protocol camera",
  616. "jfc": "jesus fucking christ",
  617. "jfk": "john f kennedy airport",
  618. "jk": "just kidding",
  619. "jock itch": "tinea cruris",
  620. "k8s": "kubernetes",
  621. "klga": "laguardia airport",
  622. "km": "kilometers",
  623. "ladysack": "scrotum",
  624. "leeway": "flexibility",
  625. "lga": "laguardia airport",
  626. "lmao": "laughing my ass off",
  627. "lmk": "let me know",
  628. "lol": "laughing out loud",
  629. "loll": "laughing out loud",
  630. "lolllll": "laughing out loud",
  631. "lollllll": "laughing out loud",
  632. "lolllllll": "laughing out loud",
  633. "lolol": "laughing out loud",
  634. "lololol": "laughing out loud",
  635. "lolsob": "laughing out loud and crying",
  636. "lolxz": "laughing out loud",
  637. "lolz": "laughing out loud",
  638. "lovey": "affectionate",
  639. "ltd": "long-term disability",
  640. "mbps": "megabits per second",
  641. "mf": "Motherfucker",
  642. "ml": "milliliters",
  643. "mn": "Minnesota",
  644. "motility": "digestion",
  645. "nameserver": "name server",
  646. "ned": "no evident disease",
  647. "nfn": "not for nothing",
  648. "noone": "no one",
  649. "nooo": "no",
  650. "np": "nurse practitioner",
  651. "nvm": "never mind",
  652. "ny": "new york",
  653. "nyc": "new york city",
  654. "nys": "new york state",
  655. "obvs": "obviously",
  656. "ohhhh": "oh",
  657. "ok": "okay",
  658. "omfg": "oh my fucking god",
  659. "omg": "oh my god",
  660. "omgwtf": "oh my god what the fuck",
  661. "omw": "on my way",
  662. "oooh": "wow",
  663. "outward bound": "juvenile rehabilitation program",
  664. "papi": "daddy",
  665. "path": "pathology",
  666. "prob": "probably",
  667. "prolly": "probably",
  668. "publickey": "public key",
  669. "puffy": "swollen",
  670. "rebooking": "rescheduling",
  671. "rm": "room",
  672. "rn": "right now",
  673. "rpi3": "raspberry pi 3",
  674. "rpi4": "raspberry pi 4",
  675. "rtsp": "remote transport stream protocol",
  676. "runtime": "run time",
  677. "sadtrombone.jpg": "sad (sarcastic)",
  678. "sayin": "saying",
  679. "semi-reggie": "semi-regular",
  680. "shit break": "bowel movement",
  681. "sm": "sadomasochism",
  682. "smoking crack": "crazy",
  683. "sooo": "so",
  684. "specs": "specifications",
  685. "sqft": "square feet",
  686. "srlp": "sylvia rivera law project",
  687. "stg": "swear to god",
  688. "subby": "submissive",
  689. "sync": "synchronize",
  690. "t": "methamphetamine",
  691. "tag team": "work together",
  692. "tbh": "to be honest",
  693. "tf": "the fuck",
  694. "tflite": "tensorflow lite",
  695. "thats": "that is",
  696. "theres": "there is",
  697. "tho": "though",
  698. "tmp": "temporary",
  699. "tmpfs": "temporary file storage",
  700. "tops": "penetrating partners",
  701. "totes": "totally",
  702. "trillium": "trillium health",
  703. "tsa": "transportation security administration",
  704. "twacky": "drugged out",
  705. "twee": "overly nice",
  706. "vommed": "vomited",
  707. "vr": "virtual reality",
  708. "vram": "video random access memory",
  709. "werent": "were not",
  710. "wfh": "work from home",
  711. "wi": "Wisconsin",
  712. "wont": "will not",
  713. "woohoo": "yay",
  714. "woulda": "would have",
  715. "wouldnt": "would not",
  716. "wyd": "what are you doing?",
  717. "wym": "what do you mean?",
  718. "xp": "experience points",
  719. "yall": "you all",
  720. "yang": "disrespectful speech",
  721. "yeesh": "yikes",
  722. "yr": "your",
  723. }
  724. # Subpoena criteria descriptions
  725. SUBPOENA_CRITERIA = {
  726. 1: "Medical treatment, care, procedures, appointments, services, and healthcare experiences at Memorial Sloan Kettering Cancer Center (MSK) involving patient Jennifer Capasso.",
  727. 2: "Complaints, grievances, concerns, feedback, disputes, or responses regarding patient care, service quality, or treatment issues raised with MSK staff, personnel, administrators, patient representatives, advocates, or employees concerning Jennifer Capasso.",
  728. 3: "Patient information updates, record changes, profile modifications, requests to change pronouns, gender identity markers, gender designation, preferred name, or demographic information in medical records for Jennifer Capasso at MSK.",
  729. 4: "Gender markers, gender identity documentation, sex designation, pronouns, or gender-related patient identifiers used in medical records, files, or systems at hospitals, medical facilities, or healthcare institutions where Jennifer Capasso received care or treatment.",
  730. 5: "Discrimination, bias, prejudice, mistreatment, harassment, disparate treatment, or negative experiences based on gender identity, transgender status, or gender expression that Jennifer Capasso encountered in any context, setting, location, or institution.",
  731. 6: "Surgery, surgical procedure, operation, medical intervention, or treatment performed on March 7, 2022 at Memorial Sloan Kettering Cancer Center involving Jennifer Capasso.",
  732. 7: "Emotional distress, psychological harm, mental anguish, mental suffering, anxiety, depression, trauma, pain and suffering, physical harm, economic damages, financial losses, medical expenses, lost wages, or other compensable harm resulting from or related to Jennifer Capasso's care, treatment, or experiences at MSK.",
  733. }
  734. # Query texts for semantic filtering
  735. SEMANTIC_QUERIES = SUBPOENA_CRITERIA.values()
  736. # Model configurations
  737. class ModelConfig:
  738. """Configuration for LLM models"""
  739. QWEN3_235B = {
  740. 'name': 'Qwen/Qwen3-235B-Instruct',
  741. 'gpus': 4,
  742. 'cost_per_hour': 2.56,
  743. 'port': 8000,
  744. 'quantization': 'awq'
  745. }
  746. QWEN25_72B = {
  747. 'name': 'Qwen/Qwen2.5-72B-Instruct',
  748. 'gpus': 2,
  749. 'cost_per_hour': 1.28,
  750. 'port': 8001,
  751. 'quantization': None
  752. }
  753. # Confidence levels
  754. class ConfidenceLevel(Enum):
  755. HIGH = "high"
  756. MEDIUM = "medium"
  757. LOW = "low"
  758. @dataclass
  759. class Message:
  760. """Represents a single message"""
  761. line_number: int
  762. timestamp: str
  763. sender: str
  764. message: str
  765. message_normalized: str = ""
  766. @dataclass
  767. class Chunk:
  768. """Represents a chunk of messages"""
  769. chunk_id: int
  770. start_line: int
  771. end_line: int
  772. messages: List[Message]
  773. combined_text: str
  774. timestamp_start: str
  775. timestamp_end: str
  776. keyword_matches: Optional[List[str]] = None
  777. keyword_score: Optional[int] = None
  778. semantic_score_model1: Optional[float] = None
  779. semantic_score_model2: Optional[float] = None
  780. semantic_score_combined: Optional[float] = None
  781. @dataclass
  782. class InferenceResult:
  783. """Results from LLM inference"""
  784. chunk_id: int
  785. responsive_line_numbers: List[int]
  786. reasoning: str
  787. confidence: ConfidenceLevel
  788. model_name: str
  789. @dataclass
  790. class MergedResult:
  791. """Merged results from dual models"""
  792. chunk_id: int
  793. responsive_line_numbers: List[int]
  794. confidence: ConfidenceLevel
  795. qwen3_lines: List[int]
  796. qwen25_lines: List[int]
  797. agreement: bool