updated text normalization & new replacements

adri, 1 month ago
commit 86f5789c71

+ 1 - 0
.gitignore

@@ -3,3 +3,4 @@ __pycache__
 _sources
 .venv
 *.egg-info
+pipeline_output

+ 633 - 15
pipeline/common_defs.py

@@ -92,23 +92,641 @@ KEY_TOPICS = [
 ]
 
 # Text normalization expansions
+ACRONYMS = {
+    "AAA": "american automobile association",
+    "AAVE": "african american vernacular english",
+    "AC": "air conditioning",  # or alternating current
+    "AC18": "ip camera app",
+    "ACIO": "associate chief information officer",
+    "ACL": "anterior cruciate ligament",
+    "ACME": "automatic certificate management environment",
+    "ADHD": "attention deficit hyperactivity disorder",
+    "AF": "as fuck",
+    "AFAB": "assigned femle at birth",
+    "AFAIK": "as far as i know",
+    "AI": "artificial intelligence",
+    "AIDS": "acquired immune deficiency syndrome",
+    "AKA": "also known as",
+    "AMA": "against medical advice",  # or ask me anything
+    "AMAB": "assigned male at birth",
+    "AMD": "advanced micro devices",
+    "AMEX": "american express",
+    "AP": "access point",
+    "API": "application programming interface",
+    "APICHA": "asian/pacific islander coalition on hiv/aids",
+    "APR": "annual percentage rate",
+    "ASAP": "as soon as possible",
+    "ASPCA": "american society for prevention of cruelty to animals",
+    "ASR": "automated speech recognition",
+    "ATL": "atlanta",
+    "ATM": "at the moment",  # or automated teller machine or ass to mouth
+    "AWS": "amazon web services",
+    "BA": "breast augmentation",
+    "BB": "baby",
+    "BBC": "big black cock",
+    "BBL": "brazilian butt lift",
+    "BBQ": "barbecue",
+    "BC": "because",
+    "BCBS": "blue cross blue shield health insurance",
+    "BDSM": "bondage, discipline, sadism, and masochism",
+    "BFF": "best friends forever",
+    "BI": "business intelligence",  # or bisexual
+    "BIOS": "basic input/output system",
+    "BJ": "blowjob",
+    "BK": "brooklyn",
+    "BLT": "benzocaine, lidocaine, and tetracaine",
+    "BM": "bowel movement",
+    "BMI": "body mass index",
+    "BMW": "bavarian motor works",
+    "BNWO": "black new world order",
+    "BO": "body odor",
+    "BP": "blood pressure",
+    "BPD": "bipolar disorder",
+    "BR": "bedroom",
+    "BRATT": "bananas, rice, applesauce, and toast",
+    "BRB": "be right back",
+    "BS": "bullshit",
+    "BTW": "by the way",
+    "BX": "beatrice the cat",
+    "C2C": "cam to cam",
+    "CBD": "cannabinodiol",
+    "CBT": "cognitive-behavioral therapy",  # or cock and ball torture
+    "CC": "credit card",  # or carbon copy
+    "CD": "crossdresser",  # or compact disc
+    "CDC": "centers for disease control",
+    "CEA": "carcinoembryonic antigen",
+    "CHEMO": "chemotherapy",
+    "CIO": "chief information officer",
+    "CL": "callen-lorde community health center",
+    "CLI": "command line interface",
+    "COBRA": "consolidated omnibus budget reconciliation act",
+    "COIN": "cecilia’s occupational inclusion network health program",
+    "CP": "colored person",
+    "CPAP": "continuous positive airway pressure",
+    "CPU": "central processing unit",
+    "CS": "customer service",
+    "CSI": "crime scene investigation",
+    "CT": "computed tomography",
+    "CVS": "consumer value stores",
+    "DARE": "drug abuse resistance education",
+    "DB": "database",  # or decibels
+    "DBA": "doing business as",
+    "DC": "district of columbia",
+    "DCD": "adult video store",
+    "DEA": "drug enforcement agency",
+    "DEI": "diversity, equity, and inclusion",
+    "DG": "dollar general",
+    "DHCP": "dynamic host configuration protocol",
+    "DIY": "do it yourself",
+    "DJ": "disc jockey",
+    "DK": "don't know",
+    "DL": "down low",
+    "DMT": "dimethyltryptamine",
+    "DMV": "department of motor vehicles",
+    "DNA": "deoxyribonucleic acid",
+    "DND": "do not disturb",
+    "DNS": "domain name server",
+    "DOA": "dead on arrival",
+    "DP": "double penetration",
+    "DPO": "dental provider organization",
+    "DR": "doctor",
+    "DSM": "diagnostic and statistical manual",
+    "DTC": "double-team challenge",
+    "DTF": "down to fuck",
+    "DUI": "driving under the influence",
+    "DVR": "digital video recorder",
+    "DVT": "deep vein thrombosis",
+    "DWI": "driving while intoxicated",
+    "EBS": "elastic block storage",
+    "EBT": "electronic benefit transfer",
+    "ED": "erectile dysfunction",
+    "EDT": "eastern daylight time",
+    "EEOC": "equal employment opportunity commission",
+    "ELI5": "explain it like i'm 5",
+    "EMS": "emergency medical services",
+    "EMT": "emergency medical technician",
+    "ENT": "ear nose and throat",
+    "ENV": "environment",
+    "EOD": "end of day",
+    "EOW": "end of week",
+    "EPO": "exclusive provider organization",
+    "ERP": "enterprise resource planning",
+    "ES": "elastic search",  # or 'is' in german
+    "ESL": "english as a second language",
+    "ET": "eastern time",
+    "ETA": "estimated time of arrival",
+    "FAFO": "fuck around and find out",
+    "FAQ": "frequently asked questions",
+    "FB": "facebook",
+    "FBI": "federal bureau of investigation",
+    "FDG": "fluorodeoxyglucose",
+    "FFS": "facial feminization surgery",  # or for fuck's sake
+    "FL": "fetlife",  # or florida
+    "FML": "fuck my life",
+    "FMLA": "family medical leave act",
+    "FNA": "fine needle aspiration",
+    "FODMAP": "fermentable oligosaccharides, disaccharides, monosaccharides and polyols",
+    "FOH": "get the fuck out of here",
+    "FOLFOX": "leucovorin, fluorouracil, and oxaliplatin",
+    "FOMO": "fear of missing out",
+    "FTM": "female-to-male transsexual",
+    "FUE": "follicular unit extraction",
+    "FUT": "follicular unit transplantation",
+    "FWIW": "for what it's worth",
+    "FYI": "for your information",
+    "GB": "gigabyte",  # or gangbang
+    "GBL": "gamma-butyrolactone",
+    "GBP": "gabapentin",
+    "GC": "gonorrhea",
+    "GCS": "gender confirmation surgery",
+    "GERD": "gastro-esophageal reflux disorder",
+    "GF": "girlfriend",  # or gluten free
+    "GFI": "go for it",
+    "GGUF": "gpt-generated unified format",
+    "GHB": "gamma-hydroxy buterol",
+    "GI": "gastrointestinal",
+    "GLOW": "gorgeous ladies of wrestling",
+    "GM": "good morning",
+    "GOAT": "greatest of all time",
+    "GP": "general practitioner",
+    "GPS": "global positioning system",
+    "GPU": "graphics processing unit",
+    "GV": "google voice",
+    "GWB": "george washington bridge",
+    "HAI": "hepatic artery infusion",
+    "HAM": "hard as a motherfucker",
+    "HDD": "hard disk drive",
+    "HDMI": "high definition multimedia interface",
+    "HELOC": "home equity line of credit",
+    "HIV": "human immunodeficiency virus",
+    "HOEING": "working as an escort",
+    "HOOKED": "worked as an escort",
+    "HPV": "human papilloma virus",
+    "HR": "hour",
+    "HRT": "hormone replacement therapy",
+    "HSV": "herpes simplex virus",
+    "HSV2": "herpes simplex virus 2",
+    "HUNNY": "honey",
+    "HVAC": "heat, ventilation, and air conditioning",
+    "IAM": "identity and access management",
+    "IANAL": "i am not a lawyer",
+    "IBS": "irritable bowel syndrome",
+    "ICD": "international classification of diseases",
+    "ID": "identification",
+    "IDGAF": "i don't give a fuck",
+    "IDK": "i don't know",
+    "IFTT": "if this then that",
+    "IFTTT": "if this then that",
+    "IG": "instagram",
+    "IIRC": "if i recall correctly",
+    "ILYSM": "i love you so much",
+    "IMO": "in my opinion",
+    "INFO": "information",
+    "IPL": "intense pulsed light",
+    "IRL": "in real life",
+    "ISP": "internet service provider",
+    "IV": "intravenous",
+    "JAN28": "january 28",
+    "JC": "jersey city",
+    "JDC": "jennifer danielle capasso",
+    "JFC": "jesus fucking christ",
+    "JFDI": "just fucking do it",
+    "JFK": "john f kennedy airport",
+    "JJ": "jungle juice",
+    "JK": "just kidding",
+    "JSON": "javascript object notation",
+    "K9": "canine",
+    "KK": "ok",
+    "LAR": "lower arthroscopic resection",
+    "LCSW": "licensed clinical social worker",
+    "LDAP": "lightweight directory access protocol",
+    "LED": "light emitting diode",
+    "LGA": "laguardia airport",
+    "LI": "long island",
+    "LIC": "long island city",
+    "LLM": "large language model",
+    "LM": "let me",  # or as in LM studio
+    "LMAO": "laughing my ass off",
+    "LMK": "let me know",
+    "LOL": "laughing out loud",
+    "LR": "living room",
+    "LSD": "lysergic acid diethyl-amide",
+    "LTD": "long-term disability",
+    "M19": "main hospital 19th floor",
+    "M4T": "male for transgender",
+    "MA": "master of arts",
+    "MAC": "media access control",
+    "MB": "megabytes",  # or motherboard
+    "MBP": "macbook pro",
+    "MCHC": "mean corpuscular hemoglobin",
+    "MD": "doctor of medicine",
+    "MDMA": "3,4-methylenedioxymethamphetamine",
+    "MF": "motherfucker",
+    "MFA": "multi-factor authentication",  # or masters of fine arts
+    "MI": "miles",
+    "MIA": "missing in action",
+    "MILF": "mom i'd like to fuck",
+    "MN": "minnesota",
+    "MRI": "magnetic resonance imaging",
+    "MRSA": "methicillin resistant staphylococcus aureus",
+    "MSKCC": "memorial sloan kettering cancer center",
+    "MV": "manyvids",
+    "NAS": "network attached storage",
+    "NAT": "network address translation",
+    "NBD": "no big deal",
+    "NDA": "non-disclosure agreement",
+    "NE": "northeast",
+    "NED": "no evidence of disease",
+    "NFS": "network file sharing",
+    "NGL": "not going to lie",
+    "NIH": "national institutes of health",
+    "NJ": "new jersey",
+    "NLP": "natural language processing",
+    "NM": "not much",
+    "NSA": "no strings attached",
+    "NSFW": "not safe for work",
+    "NVM": "nevermind",
+    "NVR": "network video recorder",
+    "NY": "New York",
+    "NYCHA": "new york city housing authority",
+    "NYE": "new year's eve",
+    "NYPD": "new york city police department",
+    "NYS": "new york state",
+    "NYU": "new york university hospital",
+    "O2": "oxygen",
+    "OBS": "open broadcaster studio",
+    "OCD": "obsessive-compulsive disorder",
+    "OD": "to the extreme",
+    "ODOD": "way far to the extreme",
+    "OG": "original gangster",
+    "OKC": "ok cupid",
+    "OMFG": "oh my fucking god",
+    "OMG": "oh my god",
+    "OMGWTF": "oh my god what the fuck",
+    "OMW": "on my way",
+    "ONT": "optical network terminal",
+    "OOC": "out of commission",
+    "OOP": "out of pocket",
+    "OP": "operation",
+    "OSHA": "occupational safety and health administration",
+    "P2P": "pay to play",
+    "PACU": "post-anesthesia care unit",
+    "PB": "peanut butter",
+    "PC": "personal computer",
+    "PCN": "penicillin",
+    "PCR": "polymerase chain reaction",
+    "PD": "police department",
+    "PDE5": "phosphodiesterase 5",
+    "PDF": "portable document format",
+    "PEBKAC": "problem exists between keyboard and chair",
+    "PET": "positron emission tomography",  # or animal
+    "PH": "penthouse",
+    "PHP": "php hypertext processor",
+    "PIP": "picture in picture",  # or personal improvement plan
+    "PITA": "pain in the ass",
+    "PK": "anus",
+    "PLS": "please",
+    "PMA": "positive mental attitude",
+    "PMV": "porn music video",
+    "PNP": "do drugs and have sex",
+    "POC": "proof of concept",
+    "POV": "point of view",
+    "PPE": "personal protective equipment",
+    "PPO": "preferred provider organization",
+    "PPV": "peritoneal pull-through vaginoplasty",
+    "PR": "public relations",
+    "PRP": "platelet rich plasma",
+    "PSU": "power supply unit",
+    "PT": "physical therapy",  # or part or patient
+    "PTA": "parent-teacher association",
+    "PTO": "paid time off",
+    "PTSD": "post-traumatic stress disorder",
+    "PTZ": "pan, tilt, zoom",
+    "QC": "queens county inn",
+    "QV": "quick visit",
+    "RAID": "redundant array of inexpensive disks",
+    "RAM": "random access memory",  # or a guy's name
+    "RB": "robby b",
+    "RBC": "red blood cell count",
+    "RDW": "red cell distribution width",
+    "RIP": "rest in peace",  # or tear
+    "RIT": "rochester institute of technology",
+    "ROA": "route of administration",
+    "ROC": "rochester, ny",
+    "ROI": "return on investment",
+    "RPR": "rapid plasma reagin",
+    "RSI": "repetitive stress injury",
+    "RTSP": "real-time streaming protocol",
+    "RTW": "return to work",
+    "SATA": "serial advanced technology attachment",
+    "SBC": "sigle board computer",  # or small black cock
+    "SCA": "single case agreement",
+    "SD": "secure digital",
+    "SIBO": "small intestinal bacterial overgrowth",
+    "SIL": "son in-law",
+    "SIM": "subscriber identity module",
+    "SK": "sloan kettering",
+    "SMS": "short message service",
+    "SOL": "shit out of luck",
+    "SOS": "ship on shoal",
+    "SSD": "solid state drive",
+    "SSDI": "social security disability insurance",
+    "SSH": "secure shell",
+    "SSN": "social security number",
+    "STD": "sexually transmitted disease",
+    "STFU": "shut the fuck up",
+    "STG": "swear to god",
+    "STI": "sexually transmitted infection",
+    "SUV": "standardized uptake value",  # or sport utility vehicle
+    "T4T": "trans for trans",
+    "TB": "terabytes",
+    "TBD": "to be determined",
+    "TBH": "to be honest",
+    "TBQH": "to be quite honest",
+    "TCP": "transmission control protocol",
+    "TDOR": "transgender day of remembrance",
+    "TDOV": "transgender day of visibility",
+    "TENS": "transcutaneous electrical nerve stimulation",
+    "TF": "the fuck",
+    "TFM": "the fucking manual",
+    "TFW": "that feeling when",
+    "TG": "transgender",
+    "TIL": "today i learned",
+    "TLC": "taxi and limousine commission",  # or tender loving care,
+    "TLS": "transport layer security",
+    "TME": "total mesorectal excision",
+    "TP": "toilet paper",
+    "TPU": "tensor processing unit",
+    "TS": "transsexual",
+    "TSA": "transportation security agency",
+    "TT": "testosterone",
+    "TTY": "talk to you",
+    "TTYL": "talk to you later",
+    "TWT": "traveling while trans",
+    "U2": "you too",
+    "UA": "unemployment assistance",
+    "UAT": "user acceptance testing",
+    "UC": "urgent care",
+    "UCC": "urgent care center",
+    "UDP": "user datagram protocol",
+    "UHC": "united health care",
+    "UI": "user interface",
+    "UID": "unique identifier",
+    "UK": "united kingdom",
+    "UPS": "united parcel service",
+    "UR": "your",
+    "URL": "uniform resource locator",
+    "URMC": "university of rochester medical center",
+    "USB": "universal serial bus",
+    "UTC": "universal coordinated time",
+    "UTF8": "unicode transformation format – 8-bit",
+    "UTI": "urinary tract infection",
+    "UV": "ultraviolet",
+    "UWS": "upper west side",
+    "UX": "user experience",
+    "VA": "virginia",
+    "VD": "valentine's day",
+    "VII": "7",
+    "VIP": "very important person",
+    "VM": "virtual machine",
+    "VP": "vice president",
+    "VPN": "virtual private network",
+    "VR": "virtual reality",
+    "VS": "versus",  # or victoria's secret
+    "WFH": "work from home",
+    "WG": "wireguard",
+    "WI": "wisconsin",
+    "WNY": "western new york state",
+    "WOC": "wound, ostomy, and continence",
+    "WOPR": "big computer",
+    "WRT": "with regard to",
+    "WTF": "what the fuck",
+    "WW2": "world war 2",
+    "WWI": "world war 1",
+    "WWII": "world war 2",
+    "WYM": "what do you mean",
+    "XELOX": "xeloda and oxaliplatin",
+    "XL": "extra large",
+    "XML": "extensible markup language",
+    "XXXL": "extra extra extra large",
+    "XYZ": "miscellaneous things",
+    "YOLO": "you only live once",
+    "YTD": "year-to-date",
+    "ZM": "zone minder",
+}
+
 TEXT_EXPANSIONS = {
-    "admin": "administrator",
+    "5 boro": "five boroughs",
+    "8up": "high",
+    "aaaggghhh": "ugh",
+    "aaah": "ah",
+    "af": "as fuck",
+    "agentdvr": "agent dvr video surveillance software",
+    "ahhh": "ah",
+    "anytime": "any time",
     "appt": "appointment",
-    "dept": "department",
-    "dr.": "doctor",
-    "dr ": "doctor ",
-    "info": "information",
-    "meds": "medication",
-    "msk": "memorial sloan kettering",
-    "mskcc": "memorial sloan kettering",
-    "proc": "procedure",
-    "pt": "patient",
-    "pts": "patients",
-    "rep": "representative",
-    "rx": "prescription",
-    "sk": "memorial sloan kettering",
-    "med": "medical",
+    "asap": "As Soon As Possible",
+    "autopay": "automatic payment",
+    "awww": "aww",
+    "awwww": "aww",
+    "awwwww": "aww",
+    "azithromycin": "azithromycin",
+    "babyface": "young-looking face",
+    "bb": "baby",
+    "bbl": "brazilian butt lift",
+    "bff": "best friend forever",
+    "biggie": "big deal",
+    "bleh": "ugh",
+    "bo": "body odor",
+    "bool": "boolean",
+    "brattice": "beatrice",
+    "brb": "be right back",
+    "bros": "brothers",
+    "btw": "by the way",
+    "buzz": "intoxication",
+    "bz": "beattrice",
+    "c card": "cancer excuse",
+    "cam": "camera",
+    "care credit": "healthcare credit card",
+    "clienting": "engaging in client services",
+    "coin": "a client",
+    "congrats": "congratulations",
+    "coulda": "could have",
+    "cruising": "looking for sex",
+    "ctdna": "cell-free tumor dna analysis",
+    "cuz": "because",
+    "dawww": "aww",
+    "dawwww": "aww",
+    "dawwwww": "aww",
+    "ddwrt": "router firmware",
+    "dl": "down low",
+    "dm'ed": "direct messaged",
+    "doesnt": "does not",
+    "doiing": "doing",
+    "dokie": "ok",
+    "dom": "dominant",
+    "dongle": "adapter",
+    "doppelgangers": "lookalikes",
+    "downright": "perfectly",
+    "dyryfuutyitg JJ grjuthfudgfujdg it rghrgg in cd dry ughydh it Dr h TS do thgxgytyy do huthjet he etiyfyiyttg": "ugh",
+    "E53st": "east 53rd street",
+    "ed": "erectile dysfunction",
+    "ehhh": "eh",
+    "eod": "End Of Day",
+    "esp": "especially",
+    "eta": "estimated time of arrival",
+    "eufy": "eufy camera",
+    "fb": "facebook",
+    "ffs": "facial feminization surgery",
+    "fml": "fuck my life",
+    "fodmap": "fermentable oligosaccharides, disaccharides, monosaccharides, and polyols",
+    "frfr": "for real, for real",
+    "from my ass": "according to me",
+    "fs": "file system",
+    "ft": "foot",
+    "ftm": "female to male transgender",
+    "fucksake": "for fuck's sake",
+    "fwiw": "for what it's worth",
+    "gbp": "gabapentin",
+    "gguf": "gpt-generated unified format",
+    "gi": "gastrointestinal",
+    "gke": "google kubernetes engine",
+    "glow": "gorgeous ladies of wrestling",
+    "gm financial": "general motors financial",
+    "goddammit": "god damn it",
+    "gosh": "oh my goodness",
+    "grindr": "gay rendezvous internet dating resource app",
+    "haha": "ha ha",
+    "hahaha": "ha ha",
+    "hasnt": "has not",
+    "havent": "have not",
+    "hehe": "hee hee",
+    "hella": "really",
+    "heyyy": "hey",
+    "ho": "hooker",
+    "hojo": "hotel",
+    "hokay": "ok",
+    "homo": "homosexual",
+    "hooray": "yay",
+    "hruwyd": "how are you? what are you doing?",
+    "hunny": "sweetheart",
+    "ibs": "irritable bowel syndrome",
+    "id": "identification",
+    "idc": "i don't care",
+    "idk": "i don't know",
+    "idnyc": "identification new york city",
+    "ie": "that is",
+    "iirc": "if i recall correctly",
+    "immodium": "loperamide",
+    "intel": "intelligence",
+    "ipcam": "internet protocol camera",
+    "jfc": "jesus fucking christ",
+    "jfk": "john f kennedy airport",
+    "jk": "just kidding",
+    "jock itch": "tinea cruris",
+    "k8s": "kubernetes",
+    "klga": "laguardia airport",
+    "km": "kilometers",
+    "ladysack": "scrotum",
+    "leeway": "flexibility",
+    "lga": "laguardia airport",
+    "lmao": "laughing my ass off",
+    "lmk": "let me know",
+    "lol": "laughing out loud",
+    "loll": "laughing out loud",
+    "lolllll": "laughing out loud",
+    "lollllll": "laughing out loud",
+    "lolllllll": "laughing out loud",
+    "lolol": "laughing out loud",
+    "lololol": "laughing out loud",
+    "lolsob": "laughing out loud and crying",
+    "lolxz": "laughing out loud",
+    "lolz": "laughing out loud",
+    "lovey": "affectionate",
+    "ltd": "long-term disability",
+    "mbps": "megabits per second",
+    "mf": "Motherfucker",
+    "ml": "milliliters",
+    "mn": "Minnesota",
+    "motility": "digestion",
+    "nameserver": "name server",
+    "ned": "no evident disease",
+    "nfn": "not for nothing",
+    "noone": "no one",
+    "nooo": "no",
+    "np": "nurse practitioner",
+    "nvm": "never mind",
+    "ny": "new york",
+    "nyc": "new york city",
+    "nys": "new york state",
+    "obvs": "obviously",
+    "ohhhh": "oh",
+    "ok": "okay",
+    "omfg": "oh my fucking god",
+    "omg": "oh my god",
+    "omgwtf": "oh my god what the fuck",
+    "omw": "on my way",
+    "oooh": "wow",
+    "outward bound": "juvenile rehabilitation program",
+    "papi": "daddy",
+    "path": "pathology",
+    "prob": "probably",
+    "prolly": "probably",
+    "publickey": "public key",
+    "puffy": "swollen",
+    "rebooking": "rescheduling",
+    "rm": "room",
+    "rn": "right now",
+    "rpi3": "raspberry pi 3",
+    "rpi4": "raspberry pi 4",
+    "rtsp": "remote transport stream protocol",
+    "runtime": "run time",
+    "sadtrombone.jpg": "sad (sarcastic)",
+    "sayin": "saying",
+    "semi-reggie": "semi-regular",
+    "shit break": "bowel movement",
+    "sm": "sadomasochism",
+    "smoking crack": "crazy",
+    "sooo": "so",
+    "specs": "specifications",
+    "sqft": "square feet",
+    "srlp": "sylvia rivera law project",
+    "stg": "swear to god",
+    "subby": "submissive",
+    "sync": "synchronize",
+    "t": "methamphetamine",
+    "tag team": "work together",
+    "tbh": "to be honest",
+    "tf": "the fuck",
+    "tflite": "tensorflow lite",
+    "thats": "that is",
+    "theres": "there is",
+    "tho": "though",
+    "tmp": "temporary",
+    "tmpfs": "temporary file storage",
+    "tops": "penetrating partners",
+    "totes": "totally",
+    "trillium": "trillium health",
+    "tsa": "transportation security administration",
+    "twacky": "drugged out",
+    "twee": "overly nice",
+    "vommed": "vomited",
+    "vr": "virtual reality",
+    "vram": "video random access memory",
+    "werent": "were not",
+    "wfh": "work from home",
+    "wi": "Wisconsin",
+    "wont": "will not",
+    "woohoo": "yay",
+    "woulda": "would have",
+    "wouldnt": "would not",
+    "wyd": "what are you doing?",
+    "wym": "what do you mean?",
+    "xp": "experience points",
+    "yall": "you all",
+    "yang": "disrespectful speech",
+    "yeesh": "yikes",
+    "yr": "your",
 }
 
 # Subpoena criteria descriptions
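
A note on how these two tables are consumed: ACRONYMS is keyed by the exact uppercase form and applied before lowercasing, while TEXT_EXPANSIONS is keyed lowercase and applied afterwards (see the pipeline/utils/text_utils.py change below). Because each acronym is substituted with \b word boundaries, a shorter key such as "HSV" cannot fire inside a longer token such as "HSV2", regardless of dictionary order; a quick sanity check:

    import re

    # \b needs a word/non-word transition, and "V" -> "2" is word -> word,
    # so "HSV" does not match inside "HSV2".
    pattern = r"\b" + re.escape("HSV") + r"\b"
    print(re.sub(pattern, "herpes simplex virus", "HSV2 result"))  # HSV2 result
    print(re.sub(pattern, "herpes simplex virus", "HSV result"))   # herpes simplex virus result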

+ 21 - 18
pipeline/steps/step0a_semantic_normalization.py → pipeline/steps/step0a1_semantic_normalization.py

@@ -90,27 +90,27 @@ class SemanticNormalizationAnalyzer(PipelineStep):
         self.logger.info(f"Analyzing {len(df):,} messages")
 
         # Extract words with metadata
-        self.logger.info("\\nExtracting words and computing frequencies...")
+        self.logger.info("\nExtracting words and computing frequencies...")
         word_data = self._extract_word_data(df)
         self.logger.info(f"Found {len(word_data):,} unique words")
 
         # Identify unknown acronyms
-        self.logger.info("\\nIdentifying unknown acronyms...")
+        self.logger.info("\nIdentifying unknown acronyms...")
         unknown_acronyms = self._identify_unknown_acronyms(word_data)
         self.logger.info(f"Found {len(unknown_acronyms)} unknown acronyms")
 
         # Identify unclear terms using semantic coherence
-        self.logger.info("\\nAnalyzing semantic coherence for unclear terms...")
+        self.logger.info("\nAnalyzing semantic coherence for unclear terms...")
         unclear_terms = self._identify_unclear_terms(word_data, df)
         self.logger.info(f"Found {len(unclear_terms)} unclear terms")
 
         # Identify abbreviations
-        self.logger.info("\\nIdentifying abbreviations...")
+        self.logger.info("\nIdentifying abbreviations...")
         abbreviations = self._identify_abbreviations(word_data)
         self.logger.info(f"Found {len(abbreviations)} abbreviations")
 
         # Identify domain-specific jargon
-        self.logger.info("\\nIdentifying domain-specific jargon...")
+        self.logger.info("\nIdentifying domain-specific jargon...")
         jargon = self._identify_jargon(word_data)
         self.logger.info(f"Found {len(jargon)} jargon terms")
 
@@ -135,7 +135,7 @@ class SemanticNormalizationAnalyzer(PipelineStep):
             text = str(message)
 
             # Extract words with original casing
-            words = re.findall(r"\\b[a-zA-Z][a-zA-Z0-9]*\\b", text)
+            words = re.findall(r"\b[a-zA-Z][a-zA-Z0-9]*\b", text)
 
             for word in words:
                 word_lower = word.lower()
@@ -179,8 +179,9 @@ class SemanticNormalizationAnalyzer(PipelineStep):
             is_acronym = (
                 len(word) >= 2
                 and len(word) <= 6
-                and word.upper() in data["original_forms"]
+                and any(form.isupper() for form in data["original_forms"])
                 and word not in self.known_acronyms
+                and data["frequency"] < 1500
                 and not word.isdigit()
             )
 
@@ -207,7 +208,7 @@ class SemanticNormalizationAnalyzer(PipelineStep):
         candidate_words = [
             w
             for w, data in word_data.items()
-            if 5 <= data["frequency"] <= 100
+            if 5 <= data["frequency"] <= 200
             and len(w) >= 4
             and w not in self.known_terms
         ]
@@ -249,7 +250,7 @@ class SemanticNormalizationAnalyzer(PipelineStep):
         # Sort by coherence (lowest first)
         unclear_terms.sort(key=lambda x: x["coherence_score"])
 
-        return unclear_terms[:50]  # Top 50 most unclear
+        return unclear_terms[:200]  # Top 200 most unclear
 
     def _identify_abbreviations(self, word_data: Dict) -> List[Dict]:
         """Identify potential abbreviations"""
@@ -258,8 +259,8 @@ class SemanticNormalizationAnalyzer(PipelineStep):
         # Common abbreviation patterns
         abbrev_patterns = [
             (r"^[a-z]{2,4}$", "short_word"),  # 2-4 letter words
-            (r"^[a-z]+\\.$", "period_ending"),  # Words ending in period
-            (r"^[a-z]\\d+$", "letter_number"),  # Letter + number
+            (r"^[a-z]+\.$", "period_ending"),  # Words ending in period
+            (r"^[a-z]\d+$", "letter_number"),  # Letter + number
         ]
 
         for word, data in word_data.items():
@@ -268,7 +269,9 @@ class SemanticNormalizationAnalyzer(PipelineStep):
                     # Check if it has period in original forms
                     has_period = any("." in form for form in data["original_forms"])
 
-                    if has_period or pattern_type == "short_word":
+                    if (has_period or pattern_type == "short_word") and data[
+                        "frequency"
+                    ] < 1500:
                         abbreviations.append(
                             {
                                 "abbreviation": word,
@@ -282,7 +285,7 @@ class SemanticNormalizationAnalyzer(PipelineStep):
         # Sort by frequency
         abbreviations.sort(key=lambda x: x["frequency"], reverse=True)
 
-        return abbreviations[:30]  # Top 30
+        return abbreviations[:100]  # Top 100
 
     def _identify_jargon(self, word_data: Dict) -> List[Dict]:
         """Identify domain-specific jargon"""
@@ -316,7 +319,7 @@ class SemanticNormalizationAnalyzer(PipelineStep):
         # Sort by frequency
         jargon.sort(key=lambda x: x["frequency"], reverse=True)
 
-        return jargon[:20]  # Top 20
+        return jargon[:100]  # Top 100
 
     def _save_normalization_analysis(self, results: Dict):
         """Save normalization analysis results"""
@@ -451,9 +454,9 @@ class SemanticNormalizationAnalyzer(PipelineStep):
 
         filepath = self.output_dir / "semantic_normalization_analysis.txt"
         with open(filepath, "w") as f:
-            f.write("\\n".join(text_output))
+            f.write("\n".join(text_output))
 
-        self.logger.info(f"\\nSaved analysis to: {filepath}")
+        self.logger.info(f"\nSaved analysis to: {filepath}")
 
 
 if __name__ == "__main__":
@@ -461,11 +464,11 @@ if __name__ == "__main__":
 
     df = pd.read_csv("../_sources/signal_messages.csv")
 
-    analyzer = SemanticNormalizationAnalyzer(min_frequency=2, coherence_threshold=0.4)
+    analyzer = SemanticNormalizationAnalyzer(min_frequency=1, coherence_threshold=0.4)
 
     results = analyzer.execute(df)
 
-    print("\\nSemantic normalization analysis complete:")
+    print("\nSemantic normalization analysis complete:")
     print(f"  Unknown acronyms: {len(results['unknown_acronyms'])}")
     print(f"  Unclear terms: {len(results['unclear_terms'])}")
     print(f"  Abbreviations: {len(results['abbreviations'])}")

+ 121 - 42
pipeline/steps/step01a_llm_normatlization.py → pipeline/steps/step0a2_llm_normatlization.py

@@ -5,11 +5,11 @@ Uses deployed LLM to identify unclear terms and unknown acronyms.
 
 from typing import List, Dict
 import pandas as pd
-import json
 import requests
 from collections import Counter
 import re
 from pipeline.models.base import PipelineStep
+from json_repair import loads
 
 
 class LLMNormalizationAnalyzer(PipelineStep):
@@ -37,7 +37,7 @@ class LLMNormalizationAnalyzer(PipelineStep):
             df: DataFrame with messages
 
         Returns:
-            Dictionary with identified terms and acronyms
+            Dictionary with identified terms and acronyms (deduplicated)
         """
         self.logger.info("=" * 80)
         self.logger.info("LLM-BASED TEXT NORMALIZATION ANALYSIS")
@@ -47,43 +47,66 @@ class LLMNormalizationAnalyzer(PipelineStep):
         # Extract frequent words and acronyms
         word_freq, acronym_freq = self._extract_terms(df)
 
-        # Sample messages for LLM analysis
+        # Track results with deduplication
+        acronym_dict = {}  # key: acronym, value: dict with metadata
+        term_dict = {}  # key: term, value: dict with metadata
+        expansion_dict = {}  # key: acronym, value: dict with metadata
+
         sample_df = df.sample(n=min(self.sample_size, len(df)), random_state=42)
-        all_unknown_acronyms = []
-        all_unclear_terms = []
-        all_expansions = []
 
-        for i in range(0, len(df), 100):
-            chunk = df.iloc[i : i + 100]
+        # Process in chunks
+        for i in range(0, len(sample_df), 100):
+            chunk = sample_df.iloc[i : i + 100]
             messages_sample = "\n".join(chunk["message"].fillna("").tolist())
 
-            # Analyze with LLM
-            self.logger.info("\\nAnalyzing with LLM...")
+            self.logger.info(
+                f"Analyzing chunk {i//100 + 1} of {(len(sample_df)-1)//100 + 1}..."
+            )
 
             # Get unknown acronyms
             unknown_acronyms = self._identify_acronyms_with_llm(
                 messages_sample, list(acronym_freq.keys())[:50]
             )
-            all_unknown_acronyms = list(set(all_unknown_acronyms + unknown_acronyms))
+            for item in unknown_acronyms:
+                acronym = item.get("acronym", "").lower()
+                if acronym and acronym not in acronym_dict:
+                    acronym_dict[acronym] = item
 
             # Get unclear terms
             unclear_terms = self._identify_unclear_terms_with_llm(
                 messages_sample, list(word_freq.keys())[:100]
             )
-            all_unclear_terms = list(set(all_unclear_terms + unclear_terms))
-
-            # Get expansion suggestions
-            expansions = self._get_expansion_suggestions_with_llm(
-                messages_sample, unknown_acronyms
-            )
-            all_expansions = list(set(all_expansions + expansions))
-
+            for item in unclear_terms:
+                term = item.get("term", "").lower()
+                if term and term not in term_dict:
+                    term_dict[term] = item
+
+            # Get expansion suggestions (use acronyms found in this chunk)
+            if unknown_acronyms:
+                expansions = self._get_expansion_suggestions_with_llm(
+                    messages_sample, unknown_acronyms
+                )
+                for item in expansions:
+                    if isinstance(item, dict):
+                        acronym = item.get("acronym", "").lower()
+                        if acronym and acronym not in expansion_dict:
+                            expansion_dict[acronym] = item
+
+        # Convert dictionaries back to lists
         results = {
-            "unknown_acronyms": all_unknown_acronyms,
-            "unclear_terms": all_unclear_terms,
-            "suggested_expansions": all_expansions,
+            "unknown_acronyms": list(acronym_dict.values()),
+            "unclear_terms": list(term_dict.values()),
+            "suggested_expansions": list(expansion_dict.values()),
         }
 
+        self.logger.info(
+            f"Found {len(results['unknown_acronyms'])} unique unknown acronyms"
+        )
+        self.logger.info(f"Found {len(results['unclear_terms'])} unique unclear terms")
+        self.logger.info(
+            f"Found {len(results['suggested_expansions'])} unique expansions"
+        )
+
         self._save_llm_analysis(results)
 
         return results
@@ -97,11 +120,11 @@ class LLMNormalizationAnalyzer(PipelineStep):
             text = str(message)
 
             # Extract words
-            words = re.findall(r"\\b[a-z]+\\b", text.lower())
+            words = re.findall(r"\b[a-z]+\b", text.lower())
             word_freq.update(words)
 
             # Extract potential acronyms (2-6 uppercase letters)
-            acronyms = re.findall(r"\\b[A-Z]{2,6}\\b", text)
+            acronyms = re.findall(r"\b[A-Z]{2,6}\b", text)
             acronym_freq.update([a.lower() for a in acronyms])
 
         return word_freq, acronym_freq
@@ -110,9 +133,10 @@ class LLMNormalizationAnalyzer(PipelineStep):
         self, messages_sample: str, acronym_candidates: List[str]
     ) -> List[Dict]:
         """Use LLM to identify unknown acronyms"""
+        self.logger.info("identifying acronyms...")
         prompt = f"""You are analyzing messages.
 
-ACRONYMS FOUND: {', '.join(acronym_candidates[:30])}
+ACRONYMS FOUND: {', '.join(acronym_candidates[:100])}
 
 SAMPLE MESSAGES:
 {messages_sample[:2000]}
@@ -130,17 +154,37 @@ Respond with JSON:
 }}"""
 
         try:
+            # print(
+            #     json.dumps(
+            #         {
+            #             "model": self.model,
+            #             "messages": [{"role": "user", "content": prompt}],
+            #             "max_tokens": 2048,
+            #             "temperature": 0.3,
+            #         },
+            #         indent=2,
+            #     )
+            # )
             response = requests.post(
                 f"{self.llm_url}/v1/chat/completions",
-                json={"prompt": prompt, "max_tokens": 1000, "temperature": 0.3},
+                json={
+                    "model": self.model,
+                    "messages": [{"role": "user", "content": prompt}],
+                    "max_tokens": 2048,
+                    "temperature": 0.3,
+                },
                 timeout=120,
             )
 
             if response.status_code == 200:
-                text = response.json()["choices"][0]["text"]
-                parsed = json.loads(text)
-                return parsed.get("unknown_acronyms", [])
+                text = response.json()["choices"][0]["message"]["content"]
+                parsed = loads(text)
+                if isinstance(parsed, dict):
+                    return parsed.get("unknown_acronyms", [])
+            else:
+                raise RuntimeError("LLM Error")
         except Exception as e:
+            # raise e
             self.logger.error(f"LLM error: {e}")
 
         return []
@@ -149,9 +193,11 @@ Respond with JSON:
         self, messages_sample: str, word_candidates: List[str]
     ) -> List[Dict]:
         """Use LLM to identify unclear terms"""
+        self.logger.info("identifying unclear terms...")
+
         prompt = f"""You are analyzing messages.
 
-FREQUENT WORDS: {', '.join(word_candidates[:50])}
+FREQUENT WORDS: {', '.join(word_candidates[:100])}
 
 SAMPLE MESSAGES:
 {messages_sample[:2000]}
@@ -175,15 +221,24 @@ Respond with JSON:
         try:
             response = requests.post(
                 f"{self.llm_url}/v1/chat/completions",
-                json={"prompt": prompt, "max_tokens": 1000, "temperature": 0.3},
+                json={
+                    "model": self.model,
+                    "messages": [{"role": "user", "content": prompt}],
+                    "max_tokens": 2048,
+                    "temperature": 0.3,
+                },
                 timeout=120,
             )
 
             if response.status_code == 200:
-                text = response.json()["choices"][0]["text"]
-                parsed = json.loads(text)
-                return parsed.get("unclear_terms", [])
+                text = response.json()["choices"][0]["message"]["content"]
+                parsed = loads(text)
+                if isinstance(parsed, dict):
+                    return parsed.get("unclear_terms", [])
+            else:
+                raise RuntimeError("LLM Error")
         except Exception as e:
+            # raise e
             self.logger.error(f"LLM error: {e}")
 
         return []
@@ -192,10 +247,12 @@ Respond with JSON:
         self, messages_sample: str, acronyms: List[Dict]
     ) -> List[Dict]:
         """Get expansion suggestions for acronyms"""
+        self.logger.info("getting expansion suggestions...")
+
         if not acronyms:
             return []
 
-        acronym_list = ", ".join([a["acronym"] for a in acronyms[:10]])
+        acronym_list = ", ".join([a["acronym"] for a in acronyms[:100]])
 
         prompt = f"""Based on these medical/legal messages, suggest expansions for these acronyms:
 
@@ -213,17 +270,37 @@ Respond with JSON:
 }}"""
 
         try:
+            # print(
+            #     json.dumps(
+            #         {
+            #             "model": self.model,
+            #             "messages": [{"role": "user", "content": prompt}],
+            #             "max_tokens": 2048,
+            #             "temperature": 0.3,
+            #         },
+            #         indent=2,
+            #     )
+            # )
             response = requests.post(
                 f"{self.llm_url}/v1/chat/completions",
-                json={"prompt": prompt, "max_tokens": 800, "temperature": 0.3},
+                json={
+                    "model": self.model,
+                    "messages": [{"role": "user", "content": prompt}],
+                    "max_tokens": 2048,
+                    "temperature": 0.3,
+                },
                 timeout=120,
             )
 
             if response.status_code == 200:
-                text = response.json()["choices"][0]["text"]
-                parsed = json.loads(text)
-                return parsed.get("expansions", [])
+                text = response.json()["choices"][0]["message"]["content"]
+                parsed = loads(text)
+                if isinstance(parsed, dict):
+                    return parsed.get("expansions", [])
+            else:
+                raise RuntimeError("LLM Error")
         except Exception as e:
+            # raise e
             self.logger.error(f"LLM error: {e}")
 
         return []
@@ -259,7 +336,7 @@ Respond with JSON:
 
         filepath = self.output_dir / "llm_normalization_analysis.txt"
         with open(filepath, "w") as f:
-            f.write("\\n".join(text_output))
+            f.write("\n".join(text_output))
 
         self.logger.info(f"Saved analysis to: {filepath}")
 
@@ -270,9 +347,11 @@ if __name__ == "__main__":
     df = pd.read_csv("../_sources/signal_messages.csv")
 
     analyzer = LLMNormalizationAnalyzer(
-        llm_url="http://localhost:8000", sample_size=500
+        llm_url="http://eos.dgtlu.net:11434",
+        sample_size=14000,
+        model="hf.co/bartowski/Qwen2.5-14B-Instruct-GGUF:Q4_K_S",
     )
 
     results = analyzer.execute(df)
-    print(f"\\nFound {len(results['unknown_acronyms'])} unknown acronyms")
+    print(f"Found {len(results['unknown_acronyms'])} unknown acronyms")
     print(f"Found {len(results['unclear_terms'])} unclear terms")

+ 0 - 0
pipeline/steps/step0a_semantic_keyword_identification.py → pipeline/steps/step0b1_semantic_keyword_identification.py


+ 0 - 0
pipeline/steps/step0a_llm_keyword_identification.py → pipeline/steps/step0b2_llm_keyword_identification.py


+ 4 - 2
pipeline/steps/step0a_keyword_identification.py → pipeline/steps/step0b_keyword_identification.py

@@ -4,8 +4,10 @@ Step 0a: Identify relevant keywords from sample data.
 
 import pandas as pd
 import json
-from step0a_semantic_keyword_identification import SemanticKeywordIdentifier
-from step0a_llm_keyword_identification import LLMKeywordIdentifier
+from pipeline.steps.step0b1_semantic_keyword_identification import (
+    SemanticKeywordIdentifier,
+)
+from pipeline.steps.step0b2_llm_keyword_identification import LLMKeywordIdentifier
 from pipeline.utils.combine_keywords import combine_keywords, analyze_overlap
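
With the imports now package-qualified, the step presumably has to be run from the repository root as a module rather than as a loose script, e.g.:

    python -m pipeline.steps.step0b_keyword_identification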
 
 

+ 10 - 2
pipeline/utils/text_utils.py

@@ -5,7 +5,7 @@ Utility functions for text processing.
 import re
 from typing import List
 import pandas as pd
-from pipeline.common_defs import TEXT_EXPANSIONS
+from pipeline.common_defs import TEXT_EXPANSIONS, ACRONYMS
 
 def normalize_text(text: str) -> str:
     """
@@ -20,7 +20,15 @@ def normalize_text(text: str) -> str:
     if pd.isna(text) or text == '':
         return ""
 
-    text = str(text).lower()
+    text = str(text)
+
+    # Apply expansions for acronyms
+    for abbr, full in ACRONYMS.items():
+        # Use \b for word boundaries to only match complete words
+        pattern = r"\b" + re.escape(abbr) + r"\b"
+        text = re.sub(pattern, full, text)
+
+    text = text.lower()
 
     # Apply expansions
     for abbr, full in TEXT_EXPANSIONS.items():
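
A minimal usage sketch of the new two-pass behavior (uppercase acronyms expanded first, then the text is lowercased and the informal expansions applied), assuming the remainder of normalize_text is unchanged and also matches whole words:

    from pipeline.utils.text_utils import normalize_text

    # "BTW" and "DMV" expand case-sensitively via ACRONYMS before lowercasing;
    # "appt" expands afterwards via TEXT_EXPANSIONS.
    print(normalize_text("BTW the appt is at the DMV"))
    # by the way the appointment is at the department of motor vehicles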

+ 1 - 0
pyproject.toml

@@ -5,6 +5,7 @@ description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
+    "json-repair>=0.54.2",
     "openpyxl>=3.1.5",
     "pandas>=2.3.3",
     "scikit-learn>=1.7.2",
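
The new dependency and the lockfile entry below are consistent with having been added via uv, which writes both files in one step, e.g.:

    uv add json-repair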

+ 11 - 0
uv.lock

@@ -82,6 +82,7 @@ name = "discovery"
 version = "0.1.0"
 source = { virtual = "." }
 dependencies = [
+    { name = "json-repair" },
     { name = "openpyxl" },
     { name = "pandas" },
     { name = "scikit-learn" },
@@ -90,6 +91,7 @@ dependencies = [
 
 [package.metadata]
 requires-dist = [
+    { name = "json-repair", specifier = ">=0.54.2" },
     { name = "openpyxl", specifier = ">=3.1.5" },
     { name = "pandas", specifier = ">=2.3.3" },
     { name = "scikit-learn", specifier = ">=1.7.2" },
@@ -201,6 +203,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/1e/e8/685f47e0d754320684db4425a0967f7d3fa70126bffd76110b7009a0090f/joblib-1.5.2-py3-none-any.whl", hash = "sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241", size = 308396, upload-time = "2025-08-27T12:15:45.188Z" },
 ]
 
+[[package]]
+name = "json-repair"
+version = "0.54.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ff/05/9fbcd5ffab9c41455e7d80af65a90876718b8ea2fb4525e187ab11836dd4/json_repair-0.54.2.tar.gz", hash = "sha256:4b6b62ce17f1a505b220fa4aadba1fc37dc9c221544f158471efe3775620bad6", size = 38575, upload-time = "2025-11-25T19:31:22.768Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/53/3a/1b4df9adcd69fee9c9e4b439c13e8c866f2fae520054aede7030b2278be9/json_repair-0.54.2-py3-none-any.whl", hash = "sha256:be51cce5dca97e0c24ebdf61a1ede2449a8a7666012de99467bb7b0afb35179b", size = 29322, upload-time = "2025-11-25T19:31:21.492Z" },
+]
+
 [[package]]
 name = "markupsafe"
 version = "3.0.3"