updated text normalization & new replacements

adri, 1 month ago
commit 86f5789c71

+ 1 - 0
.gitignore

@@ -3,3 +3,4 @@ __pycache__
 _sources
 .venv
 *.egg-info
+pipeline_output

+ 633 - 15
pipeline/common_defs.py

@@ -92,23 +92,641 @@ KEY_TOPICS = [
 ]
 
 # Text normalization expansions
+ACRONYMS = {
+    "AAA": "american automobile association",
+    "AAVE": "african american vernacular english",
+    "AC": "air conditioning",  # or alternating current
+    "AC18": "ip camera app",
+    "ACIO": "associate chief information officer",
+    "ACL": "anterior cruciate ligament",
+    "ACME": "automatic certificate management environment",
+    "ADHD": "attention deficit hyperactivity disorder",
+    "AF": "as fuck",
+    "AFAB": "assigned femle at birth",
+    "AFAIK": "as far as i know",
+    "AI": "artificial intelligence",
+    "AIDS": "acquired immune deficiency syndrome",
+    "AKA": "also known as",
+    "AMA": "against medical advice",  # or ask me anything
+    "AMAB": "assigned male at birth",
+    "AMD": "advanced micro devices",
+    "AMEX": "american express",
+    "AP": "access point",
+    "API": "application programming interface",
+    "APICHA": "asian/pacific islander coalition on hiv/aids",
+    "APR": "annual percentage rate",
+    "ASAP": "as soon as possible",
+    "ASPCA": "american society for prevention of cruelty to animals",
+    "ASR": "automated speech recognition",
+    "ATL": "atlanta",
+    "ATM": "at the moment",  # or automated teller machine or ass to mouth
+    "AWS": "amazon web services",
+    "BA": "breast augmentation",
+    "BB": "baby",
+    "BBC": "big black cock",
+    "BBL": "brazilian butt lift",
+    "BBQ": "barbecue",
+    "BC": "because",
+    "BCBS": "blue cross blue shield health insurance",
+    "BDSM": "bondage, discipline, sadism, and masochism",
+    "BFF": "best friends forever",
+    "BI": "business intelligence",  # or bisexual
+    "BIOS": "basic input/output system",
+    "BJ": "blowjob",
+    "BK": "brooklyn",
+    "BLT": "benzocaine, lidocaine, and tetracaine",
+    "BM": "bowel movement",
+    "BMI": "body mass index",
+    "BMW": "bavarian motor works",
+    "BNWO": "black new world order",
+    "BO": "body odor",
+    "BP": "blood pressure",
+    "BPD": "bipolar disorder",
+    "BR": "bedroom",
+    "BRATT": "bananas, rice, applesauce, and toast",
+    "BRB": "be right back",
+    "BS": "bullshit",
+    "BTW": "by the way",
+    "BX": "beatrice the cat",
+    "C2C": "cam to cam",
+    "CBD": "cannabinodiol",
+    "CBT": "cognitive-behavioral therapy",  # or cock and ball torture
+    "CC": "credit card",  # or carbon copy
+    "CD": "crossdresser",  # or compact disc
+    "CDC": "centers for disease control",
+    "CEA": "carcinoembryonic antigen",
+    "CHEMO": "chemotherapy",
+    "CIO": "chief information officer",
+    "CL": "callen-lorde community health center",
+    "CLI": "command line interface",
+    "COBRA": "consolidated omnibus budget reconciliation act",
+    "COIN": "cecilia’s occupational inclusion network health program",
+    "CP": "colored person",
+    "CPAP": "continuous positive airway pressure",
+    "CPU": "central processing unit",
+    "CS": "customer service",
+    "CSI": "crime scene investigation",
+    "CT": "computed tomography",
+    "CVS": "consumer value stores",
+    "DARE": "drug abuse resistance education",
+    "DB": "database",  # or decibels
+    "DBA": "doing business as",
+    "DC": "district of columbia",
+    "DCD": "adult video store",
+    "DEA": "drug enforcement agency",
+    "DEI": "diversity, equity, and inclusion",
+    "DG": "dollar general",
+    "DHCP": "dynamic host configuration protocol",
+    "DIY": "do it yourself",
+    "DJ": "disc jockey",
+    "DK": "don't know",
+    "DL": "down low",
+    "DMT": "dimethyltryptamine",
+    "DMV": "department of motor vehicles",
+    "DNA": "deoxyribonucleic acid",
+    "DND": "do not disturb",
+    "DNS": "domain name server",
+    "DOA": "dead on arrival",
+    "DP": "double penetration",
+    "DPO": "dental provider organization",
+    "DR": "doctor",
+    "DSM": "diagnostic and statistical manual",
+    "DTC": "double-team challenge",
+    "DTF": "down to fuck",
+    "DUI": "driving under the influence",
+    "DVR": "digital video recorder",
+    "DVT": "deep vein thrombosis",
+    "DWI": "driving while intoxicated",
+    "EBS": "elastic block storage",
+    "EBT": "electronic benefit transfer",
+    "ED": "erectile dysfunction",
+    "EDT": "eastern daylight time",
+    "EEOC": "equal employment opportunity commission",
+    "ELI5": "explain it like i'm 5",
+    "EMS": "emergency medical services",
+    "EMT": "emergency medical technician",
+    "ENT": "ear nose and throat",
+    "ENV": "environment",
+    "EOD": "end of day",
+    "EOW": "end of week",
+    "EPO": "exclusive provider organization",
+    "ERP": "enterprise resource planning",
+    "ES": "elastic search",  # or 'is' in german
+    "ESL": "english as a second language",
+    "ET": "eastern time",
+    "ETA": "estimated time of arrival",
+    "FAFO": "fuck around and find out",
+    "FAQ": "frequently asked questions",
+    "FB": "facebook",
+    "FBI": "federal bureau of investigation",
+    "FDG": "fluorodeoxyglucose",
+    "FFS": "facial feminization surgery",  # or for fuck's sake
+    "FL": "fetlife",  # or florida
+    "FML": "fuck my life",
+    "FMLA": "family medical leave act",
+    "FNA": "fine needle aspiration",
+    "FODMAP": "fermentable oligosaccharides, disaccharides, monosaccharides and polyols",
+    "FOH": "get the fuck out of here",
+    "FOLFOX": "leucovorin, fluorouracil, and oxaliplatin",
+    "FOMO": "fear of missing out",
+    "FTM": "female-to-male transsexual",
+    "FUE": "follicular unit extraction",
+    "FUT": "follicular unit transplantation",
+    "FWIW": "for what it's worth",
+    "FYI": "for your information",
+    "GB": "gigabyte",  # or gangbang
+    "GBL": "gamma-butyrolactone",
+    "GBP": "gabapentin",
+    "GC": "gonorrhea",
+    "GCS": "gender confirmation surgery",
+    "GERD": "gastro-esophageal reflux disorder",
+    "GF": "girlfriend",  # or gluten free
+    "GFI": "go for it",
+    "GGUF": "gpt-generated unified format",
+    "GHB": "gamma-hydroxy buterol",
+    "GI": "gastrointestinal",
+    "GLOW": "gorgeous ladies of wrestling",
+    "GM": "good morning",
+    "GOAT": "greatest of all time",
+    "GP": "general practitioner",
+    "GPS": "global positioning system",
+    "GPU": "graphics processing unit",
+    "GV": "google voice",
+    "GWB": "george washington bridge",
+    "HAI": "hepatic artery infusion",
+    "HAM": "hard as a motherfucker",
+    "HDD": "hard disk drive",
+    "HDMI": "high definition multimedia interface",
+    "HELOC": "home equity line of credit",
+    "HIV": "human immunodeficiency virus",
+    "HOEING": "working as an escort",
+    "HOOKED": "worked as an escort",
+    "HPV": "human papilloma virus",
+    "HR": "hour",
+    "HRT": "hormone replacement therapy",
+    "HSV": "herpes simplex virus",
+    "HSV2": "herpes simplex virus 2",
+    "HUNNY": "honey",
+    "HVAC": "heat, ventilation, and air conditioning",
+    "IAM": "identity and access management",
+    "IANAL": "i am not a lawyer",
+    "IBS": "irritable bowel syndrome",
+    "ICD": "international classification of diseases",
+    "ID": "identification",
+    "IDGAF": "i don't give a fuck",
+    "IDK": "i don't know",
+    "IFTT": "if this then that",
+    "IFTTT": "if this then that",
+    "IG": "instagram",
+    "IIRC": "if i recall correctly",
+    "ILYSM": "i love you so much",
+    "IMO": "in my opinion",
+    "INFO": "information",
+    "IPL": "intense pulsed light",
+    "IRL": "in real life",
+    "ISP": "internet service provider",
+    "IV": "intravenous",
+    "JAN28": "january 28",
+    "JC": "jersey city",
+    "JDC": "jennifer danielle capasso",
+    "JFC": "jesus fucking christ",
+    "JFDI": "just fucking do it",
+    "JFK": "john f kennedy airport",
+    "JJ": "jungle juice",
+    "JK": "just kidding",
+    "JSON": "javascript object notation",
+    "K9": "canine",
+    "KK": "ok",
+    "LAR": "lower arthroscopic resection",
+    "LCSW": "licensed clinical social worker",
+    "LDAP": "lightweight directory access protocol",
+    "LED": "light emitting diode",
+    "LGA": "laguardia airport",
+    "LI": "long island",
+    "LIC": "long island city",
+    "LLM": "large language model",
+    "LM": "let me",  # or as in LM studio
+    "LMAO": "laughing my ass off",
+    "LMK": "let me know",
+    "LOL": "laughing out loud",
+    "LR": "living room",
+    "LSD": "lysergic acid diethyl-amide",
+    "LTD": "long-term disability",
+    "M19": "main hospital 19th floor",
+    "M4T": "male for transgender",
+    "MA": "master of arts",
+    "MAC": "media access control",
+    "MB": "megabytes",  # or motherboard
+    "MBP": "macbook pro",
+    "MCHC": "mean corpuscular hemoglobin",
+    "MD": "doctor of medicine",
+    "MDMA": "3,4-methylenedioxymethamphetamine",
+    "MF": "motherfucker",
+    "MFA": "multi-factor authentication",  # or masters of fine arts
+    "MI": "miles",
+    "MIA": "missing in action",
+    "MILF": "mom i'd like to fuck",
+    "MN": "minnesota",
+    "MRI": "magnetic resonance imaging",
+    "MRSA": "methicillin resistant staphylococcus aureus",
+    "MSKCC": "memorial sloan kettering cancer center",
+    "MV": "manyvids",
+    "NAS": "network attached storage",
+    "NAT": "network address translation",
+    "NBD": "no big deal",
+    "NDA": "non-disclosure agreement",
+    "NE": "northeast",
+    "NED": "no evidence of disease",
+    "NFS": "network file sharing",
+    "NGL": "not going to lie",
+    "NIH": "national institutes of health",
+    "NJ": "new jersey",
+    "NLP": "natural language processing",
+    "NM": "not much",
+    "NSA": "no strings attached",
+    "NSFW": "not safe for work",
+    "NVM": "nevermind",
+    "NVR": "network video recorder",
+    "NY": "New York",
+    "NYCHA": "new york city housing authority",
+    "NYE": "new year's eve",
+    "NYPD": "new york city police department",
+    "NYS": "new york state",
+    "NYU": "new york university hospital",
+    "O2": "oxygen",
+    "OBS": "open broadcaster studio",
+    "OCD": "obsessive-compulsive disorder",
+    "OD": "to the extreme",
+    "ODOD": "way far to the extreme",
+    "OG": "original gangster",
+    "OKC": "ok cupid",
+    "OMFG": "oh my fucking god",
+    "OMG": "oh my god",
+    "OMGWTF": "oh my god what the fuck",
+    "OMW": "on my way",
+    "ONT": "optical network terminal",
+    "OOC": "out of commission",
+    "OOP": "out of pocket",
+    "OP": "operation",
+    "OSHA": "occupational safety and health administration",
+    "P2P": "pay to play",
+    "PACU": "post-anesthesia care unit",
+    "PB": "peanut butter",
+    "PC": "personal computer",
+    "PCN": "penicillin",
+    "PCR": "polymerase chain reaction",
+    "PD": "police department",
+    "PDE5": "phosphodiesterase 5",
+    "PDF": "portable document format",
+    "PEBKAC": "problem exists between keyboard and chair",
+    "PET": "positron emission tomography",  # or animal
+    "PH": "penthouse",
+    "PHP": "php hypertext processor",
+    "PIP": "picture in picture",  # or personal improvement plan
+    "PITA": "pain in the ass",
+    "PK": "anus",
+    "PLS": "please",
+    "PMA": "positive mental attitude",
+    "PMV": "porn music video",
+    "PNP": "do drugs and have sex",
+    "POC": "proof of concept",
+    "POV": "point of view",
+    "PPE": "personal protective equipment",
+    "PPO": "preferred provider organization",
+    "PPV": "peritoneal pull-through vaginoplasty",
+    "PR": "public relations",
+    "PRP": "platelet rich plasma",
+    "PSU": "power supply unit",
+    "PT": "physical therapy",  # or part or patient
+    "PTA": "parent-teacher association",
+    "PTO": "paid time off",
+    "PTSD": "post-traumatic stress disorder",
+    "PTZ": "pan, tilt, zoom",
+    "QC": "queens county inn",
+    "QV": "quick visit",
+    "RAID": "redundant array of inexpensive disks",
+    "RAM": "random access memory",  # or a guy's name
+    "RB": "robby b",
+    "RBC": "red blood cell count",
+    "RDW": "red cell distribution width",
+    "RIP": "rest in peace",  # or tear
+    "RIT": "rochester institute of technology",
+    "ROA": "route of administration",
+    "ROC": "rochester, ny",
+    "ROI": "return on investment",
+    "RPR": "rapid plasma reagin",
+    "RSI": "repetitive stress injury",
+    "RTSP": "real-time streaming protocol",
+    "RTW": "return to work",
+    "SATA": "serial advanced technology attachment",
+    "SBC": "sigle board computer",  # or small black cock
+    "SCA": "single case agreement",
+    "SD": "secure digital",
+    "SIBO": "small intestinal bacterial overgrowth",
+    "SIL": "son in-law",
+    "SIM": "subscriber identity module",
+    "SK": "sloan kettering",
+    "SMS": "short message service",
+    "SOL": "shit out of luck",
+    "SOS": "ship on shoal",
+    "SSD": "solid state drive",
+    "SSDI": "social security disability insurance",
+    "SSH": "secure shell",
+    "SSN": "social security number",
+    "STD": "sexually transmitted disease",
+    "STFU": "shut the fuck up",
+    "STG": "swear to god",
+    "STI": "sexually transmitted infection",
+    "SUV": "standardized uptake value",  # or sport utility vehicle
+    "T4T": "trans for trans",
+    "TB": "terabytes",
+    "TBD": "to be determined",
+    "TBH": "to be honest",
+    "TBQH": "to be quite honest",
+    "TCP": "transmission control protocol",
+    "TDOR": "transgender day of remembrance",
+    "TDOV": "transgender day of visibility",
+    "TENS": "transcutaneous electrical nerve stimulation",
+    "TF": "the fuck",
+    "TFM": "the fucking manual",
+    "TFW": "that feeling when",
+    "TG": "transgender",
+    "TIL": "today i learned",
+    "TLC": "taxi and limousine commission",  # or tender loving care,
+    "TLS": "transport layer security",
+    "TME": "total mesorectal excision",
+    "TP": "toilet paper",
+    "TPU": "tensor processing unit",
+    "TS": "transsexual",
+    "TSA": "transportation security agency",
+    "TT": "testosterone",
+    "TTY": "talk to you",
+    "TTYL": "talk to you later",
+    "TWT": "traveling while trans",
+    "U2": "you too",
+    "UA": "unemployment assistance",
+    "UAT": "user acceptance testing",
+    "UC": "urgent care",
+    "UCC": "urgent care center",
+    "UDP": "user datagram protocol",
+    "UHC": "united health care",
+    "UI": "user interface",
+    "UID": "unique identifier",
+    "UK": "united kingdom",
+    "UPS": "united parcel service",
+    "UR": "your",
+    "URL": "uniform resource locator",
+    "URMC": "university of rochester medical center",
+    "USB": "universal serial bus",
+    "UTC": "universal coordinated time",
+    "UTF8": "unicode transformation format – 8-bit",
+    "UTI": "urinary tract infection",
+    "UV": "ultraviolet",
+    "UWS": "upper west side",
+    "UX": "user experience",
+    "VA": "virginia",
+    "VD": "valentine's day",
+    "VII": "7",
+    "VIP": "very important person",
+    "VM": "virtual machine",
+    "VP": "vice president",
+    "VPN": "virtual private network",
+    "VR": "virtual reality",
+    "VS": "versus",  # or victoria's secret
+    "WFH": "work from home",
+    "WG": "wireguard",
+    "WI": "wisconsin",
+    "WNY": "western new york state",
+    "WOC": "wound, ostomy, and continence",
+    "WOPR": "big computer",
+    "WRT": "with regard to",
+    "WTF": "what the fuck",
+    "WW2": "world war 2",
+    "WWI": "world war 1",
+    "WWII": "world war 2",
+    "WYM": "what do you mean",
+    "XELOX": "xeloda and oxaliplatin",
+    "XL": "extra large",
+    "XML": "extensible markup language",
+    "XXXL": "extra extra extra large",
+    "XYZ": "miscellaneous things",
+    "YOLO": "you only live once",
+    "YTD": "year-to-date",
+    "ZM": "zone minder",
+}
+
 TEXT_EXPANSIONS = {
-    "admin": "administrator",
+    "5 boro": "five boroughs",
+    "8up": "high",
+    "aaaggghhh": "ugh",
+    "aaah": "ah",
+    "af": "as fuck",
+    "agentdvr": "agent dvr video surveillance software",
+    "ahhh": "ah",
+    "anytime": "any time",
     "appt": "appointment",
-    "dept": "department",
-    "dr.": "doctor",
-    "dr ": "doctor ",
-    "info": "information",
-    "meds": "medication",
-    "msk": "memorial sloan kettering",
-    "mskcc": "memorial sloan kettering",
-    "proc": "procedure",
-    "pt": "patient",
-    "pts": "patients",
-    "rep": "representative",
-    "rx": "prescription",
-    "sk": "memorial sloan kettering",
-    "med": "medical",
+    "asap": "As Soon As Possible",
+    "autopay": "automatic payment",
+    "awww": "aww",
+    "awwww": "aww",
+    "awwwww": "aww",
+    "azithromycin": "azithromycin",
+    "babyface": "young-looking face",
+    "bb": "baby",
+    "bbl": "brazilian butt lift",
+    "bff": "best friend forever",
+    "biggie": "big deal",
+    "bleh": "ugh",
+    "bo": "body odor",
+    "bool": "boolean",
+    "brattice": "beatrice",
+    "brb": "be right back",
+    "bros": "brothers",
+    "btw": "by the way",
+    "buzz": "intoxication",
+    "bz": "beattrice",
+    "c card": "cancer excuse",
+    "cam": "camera",
+    "care credit": "healthcare credit card",
+    "clienting": "engaging in client services",
+    "coin": "a client",
+    "congrats": "congratulations",
+    "coulda": "could have",
+    "cruising": "looking for sex",
+    "ctdna": "cell-free tumor dna analysis",
+    "cuz": "because",
+    "dawww": "aww",
+    "dawwww": "aww",
+    "dawwwww": "aww",
+    "ddwrt": "router firmware",
+    "dl": "down low",
+    "dm'ed": "direct messaged",
+    "doesnt": "does not",
+    "doiing": "doing",
+    "dokie": "ok",
+    "dom": "dominant",
+    "dongle": "adapter",
+    "doppelgangers": "lookalikes",
+    "downright": "perfectly",
+    "dyryfuutyitg JJ grjuthfudgfujdg it rghrgg in cd dry ughydh it Dr h TS do thgxgytyy do huthjet he etiyfyiyttg": "ugh",
+    "E53st": "east 53rd street",
+    "ed": "erectile dysfunction",
+    "ehhh": "eh",
+    "eod": "End Of Day",
+    "esp": "especially",
+    "eta": "estimated time of arrival",
+    "eufy": "eufy camera",
+    "fb": "facebook",
+    "ffs": "facial feminization surgery",
+    "fml": "fuck my life",
+    "fodmap": "fermentable oligosaccharides, disaccharides, monosaccharides, and polyols",
+    "frfr": "for real, for real",
+    "from my ass": "according to me",
+    "fs": "file system",
+    "ft": "foot",
+    "ftm": "female to male transgender",
+    "fucksake": "for fuck's sake",
+    "fwiw": "for what it's worth",
+    "gbp": "gabapentin",
+    "gguf": "gpt-generated unified format",
+    "gi": "gastrointestinal",
+    "gke": "google kubernetes engine",
+    "glow": "gorgeous ladies of wrestling",
+    "gm financial": "general motors financial",
+    "goddammit": "god damn it",
+    "gosh": "oh my goodness",
+    "grindr": "gay rendezvous internet dating resource app",
+    "haha": "ha ha",
+    "hahaha": "ha ha",
+    "hasnt": "has not",
+    "havent": "have not",
+    "hehe": "hee hee",
+    "hella": "really",
+    "heyyy": "hey",
+    "ho": "hooker",
+    "hojo": "hotel",
+    "hokay": "ok",
+    "homo": "homosexual",
+    "hooray": "yay",
+    "hruwyd": "how are you? what are you doing?",
+    "hunny": "sweetheart",
+    "ibs": "irritable bowel syndrome",
+    "id": "identification",
+    "idc": "i don't care",
+    "idk": "i don't know",
+    "idnyc": "identification new york city",
+    "ie": "that is",
+    "iirc": "if i recall correctly",
+    "immodium": "loperamide",
+    "intel": "intelligence",
+    "ipcam": "internet protocol camera",
+    "jfc": "jesus fucking christ",
+    "jfk": "john f kennedy airport",
+    "jk": "just kidding",
+    "jock itch": "tinea cruris",
+    "k8s": "kubernetes",
+    "klga": "laguardia airport",
+    "km": "kilometers",
+    "ladysack": "scrotum",
+    "leeway": "flexibility",
+    "lga": "laguardia airport",
+    "lmao": "laughing my ass off",
+    "lmk": "let me know",
+    "lol": "laughing out loud",
+    "loll": "laughing out loud",
+    "lolllll": "laughing out loud",
+    "lollllll": "laughing out loud",
+    "lolllllll": "laughing out loud",
+    "lolol": "laughing out loud",
+    "lololol": "laughing out loud",
+    "lolsob": "laughing out loud and crying",
+    "lolxz": "laughing out loud",
+    "lolz": "laughing out loud",
+    "lovey": "affectionate",
+    "ltd": "long-term disability",
+    "mbps": "megabits per second",
+    "mf": "Motherfucker",
+    "ml": "milliliters",
+    "mn": "Minnesota",
+    "motility": "digestion",
+    "nameserver": "name server",
+    "ned": "no evident disease",
+    "nfn": "not for nothing",
+    "noone": "no one",
+    "nooo": "no",
+    "np": "nurse practitioner",
+    "nvm": "never mind",
+    "ny": "new york",
+    "nyc": "new york city",
+    "nys": "new york state",
+    "obvs": "obviously",
+    "ohhhh": "oh",
+    "ok": "okay",
+    "omfg": "oh my fucking god",
+    "omg": "oh my god",
+    "omgwtf": "oh my god what the fuck",
+    "omw": "on my way",
+    "oooh": "wow",
+    "outward bound": "juvenile rehabilitation program",
+    "papi": "daddy",
+    "path": "pathology",
+    "prob": "probably",
+    "prolly": "probably",
+    "publickey": "public key",
+    "puffy": "swollen",
+    "rebooking": "rescheduling",
+    "rm": "room",
+    "rn": "right now",
+    "rpi3": "raspberry pi 3",
+    "rpi4": "raspberry pi 4",
+    "rtsp": "remote transport stream protocol",
+    "runtime": "run time",
+    "sadtrombone.jpg": "sad (sarcastic)",
+    "sayin": "saying",
+    "semi-reggie": "semi-regular",
+    "shit break": "bowel movement",
+    "sm": "sadomasochism",
+    "smoking crack": "crazy",
+    "sooo": "so",
+    "specs": "specifications",
+    "sqft": "square feet",
+    "srlp": "sylvia rivera law project",
+    "stg": "swear to god",
+    "subby": "submissive",
+    "sync": "synchronize",
+    "t": "methamphetamine",
+    "tag team": "work together",
+    "tbh": "to be honest",
+    "tf": "the fuck",
+    "tflite": "tensorflow lite",
+    "thats": "that is",
+    "theres": "there is",
+    "tho": "though",
+    "tmp": "temporary",
+    "tmpfs": "temporary file storage",
+    "tops": "penetrating partners",
+    "totes": "totally",
+    "trillium": "trillium health",
+    "tsa": "transportation security administration",
+    "twacky": "drugged out",
+    "twee": "overly nice",
+    "vommed": "vomited",
+    "vr": "virtual reality",
+    "vram": "video random access memory",
+    "werent": "were not",
+    "wfh": "work from home",
+    "wi": "Wisconsin",
+    "wont": "will not",
+    "woohoo": "yay",
+    "woulda": "would have",
+    "wouldnt": "would not",
+    "wyd": "what are you doing?",
+    "wym": "what do you mean?",
+    "xp": "experience points",
+    "yall": "you all",
+    "yang": "disrespectful speech",
+    "yeesh": "yikes",
+    "yr": "your",
 }
 
 # Subpoena criteria descriptions
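
A note on how these two tables are consumed: ACRONYMS is keyed by the exact uppercase form and applied before lowercasing, while TEXT_EXPANSIONS is keyed lowercase and applied afterwards (see the pipeline/utils/text_utils.py change below). Because each acronym is substituted with \b word boundaries, a shorter key such as "HSV" cannot fire inside a longer token such as "HSV2", regardless of dictionary order; a quick sanity check:

    import re

    # \b needs a word/non-word transition, and "V" -> "2" is word -> word,
    # so "HSV" does not match inside "HSV2".
    pattern = r"\b" + re.escape("HSV") + r"\b"
    print(re.sub(pattern, "herpes simplex virus", "HSV2 result"))  # HSV2 result
    print(re.sub(pattern, "herpes simplex virus", "HSV result"))   # herpes simplex virus result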

+ 21 - 18
pipeline/steps/step0a_semantic_normalization.py → pipeline/steps/step0a1_semantic_normalization.py

@@ -90,27 +90,27 @@ class SemanticNormalizationAnalyzer(PipelineStep):
         self.logger.info(f"Analyzing {len(df):,} messages")
 
         # Extract words with metadata
-        self.logger.info("\\nExtracting words and computing frequencies...")
+        self.logger.info("\nExtracting words and computing frequencies...")
         word_data = self._extract_word_data(df)
         self.logger.info(f"Found {len(word_data):,} unique words")
 
         # Identify unknown acronyms
-        self.logger.info("\\nIdentifying unknown acronyms...")
+        self.logger.info("\nIdentifying unknown acronyms...")
         unknown_acronyms = self._identify_unknown_acronyms(word_data)
         self.logger.info(f"Found {len(unknown_acronyms)} unknown acronyms")
 
         # Identify unclear terms using semantic coherence
-        self.logger.info("\\nAnalyzing semantic coherence for unclear terms...")
+        self.logger.info("\nAnalyzing semantic coherence for unclear terms...")
         unclear_terms = self._identify_unclear_terms(word_data, df)
         self.logger.info(f"Found {len(unclear_terms)} unclear terms")
 
         # Identify abbreviations
-        self.logger.info("\\nIdentifying abbreviations...")
+        self.logger.info("\nIdentifying abbreviations...")
         abbreviations = self._identify_abbreviations(word_data)
         self.logger.info(f"Found {len(abbreviations)} abbreviations")
 
         # Identify domain-specific jargon
-        self.logger.info("\\nIdentifying domain-specific jargon...")
+        self.logger.info("\nIdentifying domain-specific jargon...")
         jargon = self._identify_jargon(word_data)
         self.logger.info(f"Found {len(jargon)} jargon terms")
 
@@ -135,7 +135,7 @@ class SemanticNormalizationAnalyzer(PipelineStep):
             text = str(message)
 
             # Extract words with original casing
-            words = re.findall(r"\\b[a-zA-Z][a-zA-Z0-9]*\\b", text)
+            words = re.findall(r"\b[a-zA-Z][a-zA-Z0-9]*\b", text)
 
             for word in words:
                 word_lower = word.lower()
@@ -179,8 +179,9 @@ class SemanticNormalizationAnalyzer(PipelineStep):
             is_acronym = (
                 len(word) >= 2
                 and len(word) <= 6
-                and word.upper() in data["original_forms"]
+                and any(form.isupper() for form in data["original_forms"])
                 and word not in self.known_acronyms
+                and data["frequency"] < 1500
                 and not word.isdigit()
             )
 
@@ -207,7 +208,7 @@ class SemanticNormalizationAnalyzer(PipelineStep):
         candidate_words = [
             w
             for w, data in word_data.items()
-            if 5 <= data["frequency"] <= 100
+            if 5 <= data["frequency"] <= 200
             and len(w) >= 4
             and w not in self.known_terms
         ]
@@ -249,7 +250,7 @@ class SemanticNormalizationAnalyzer(PipelineStep):
         # Sort by coherence (lowest first)
         unclear_terms.sort(key=lambda x: x["coherence_score"])
 
-        return unclear_terms[:50]  # Top 50 most unclear
+        return unclear_terms[:200]  # Top 200 most unclear
 
     def _identify_abbreviations(self, word_data: Dict) -> List[Dict]:
         """Identify potential abbreviations"""
@@ -258,8 +259,8 @@ class SemanticNormalizationAnalyzer(PipelineStep):
         # Common abbreviation patterns
         abbrev_patterns = [
             (r"^[a-z]{2,4}$", "short_word"),  # 2-4 letter words
-            (r"^[a-z]+\\.$", "period_ending"),  # Words ending in period
-            (r"^[a-z]\\d+$", "letter_number"),  # Letter + number
+            (r"^[a-z]+\.$", "period_ending"),  # Words ending in period
+            (r"^[a-z]\d+$", "letter_number"),  # Letter + number
         ]
 
         for word, data in word_data.items():
@@ -268,7 +269,9 @@ class SemanticNormalizationAnalyzer(PipelineStep):
                     # Check if it has period in original forms
                     has_period = any("." in form for form in data["original_forms"])
 
-                    if has_period or pattern_type == "short_word":
+                    if (has_period or pattern_type == "short_word") and data[
+                        "frequency"
+                    ] < 1500:
                         abbreviations.append(
                             {
                                 "abbreviation": word,
@@ -282,7 +285,7 @@ class SemanticNormalizationAnalyzer(PipelineStep):
         # Sort by frequency
         abbreviations.sort(key=lambda x: x["frequency"], reverse=True)
 
-        return abbreviations[:30]  # Top 30
+        return abbreviations[:100]  # Top 100
 
     def _identify_jargon(self, word_data: Dict) -> List[Dict]:
         """Identify domain-specific jargon"""
@@ -316,7 +319,7 @@ class SemanticNormalizationAnalyzer(PipelineStep):
         # Sort by frequency
         jargon.sort(key=lambda x: x["frequency"], reverse=True)
 
-        return jargon[:20]  # Top 20
+        return jargon[:100]  # Top 100
 
     def _save_normalization_analysis(self, results: Dict):
         """Save normalization analysis results"""
@@ -451,9 +454,9 @@ class SemanticNormalizationAnalyzer(PipelineStep):
 
         filepath = self.output_dir / "semantic_normalization_analysis.txt"
         with open(filepath, "w") as f:
-            f.write("\\n".join(text_output))
+            f.write("\n".join(text_output))
 
-        self.logger.info(f"\\nSaved analysis to: {filepath}")
+        self.logger.info(f"\nSaved analysis to: {filepath}")
 
 
 if __name__ == "__main__":
@@ -461,11 +464,11 @@ if __name__ == "__main__":
 
     df = pd.read_csv("../_sources/signal_messages.csv")
 
-    analyzer = SemanticNormalizationAnalyzer(min_frequency=2, coherence_threshold=0.4)
+    analyzer = SemanticNormalizationAnalyzer(min_frequency=1, coherence_threshold=0.4)
 
     results = analyzer.execute(df)
 
-    print("\\nSemantic normalization analysis complete:")
+    print("\nSemantic normalization analysis complete:")
     print(f"  Unknown acronyms: {len(results['unknown_acronyms'])}")
     print(f"  Unclear terms: {len(results['unclear_terms'])}")
     print(f"  Abbreviations: {len(results['abbreviations'])}")

+ 121 - 42
pipeline/steps/step01a_llm_normatlization.py → pipeline/steps/step0a2_llm_normatlization.py

@@ -5,11 +5,11 @@ Uses deployed LLM to identify unclear terms and unknown acronyms.
 
 from typing import List, Dict
 import pandas as pd
-import json
 import requests
 from collections import Counter
 import re
 from pipeline.models.base import PipelineStep
+from json_repair import loads
 
 
 class LLMNormalizationAnalyzer(PipelineStep):
@@ -37,7 +37,7 @@ class LLMNormalizationAnalyzer(PipelineStep):
             df: DataFrame with messages
 
         Returns:
-            Dictionary with identified terms and acronyms
+            Dictionary with identified terms and acronyms (deduplicated)
         """
         self.logger.info("=" * 80)
         self.logger.info("LLM-BASED TEXT NORMALIZATION ANALYSIS")
@@ -47,43 +47,66 @@ class LLMNormalizationAnalyzer(PipelineStep):
         # Extract frequent words and acronyms
         word_freq, acronym_freq = self._extract_terms(df)
 
-        # Sample messages for LLM analysis
+        # Track results with deduplication
+        acronym_dict = {}  # key: acronym, value: dict with metadata
+        term_dict = {}  # key: term, value: dict with metadata
+        expansion_dict = {}  # key: acronym, value: dict with metadata
+
         sample_df = df.sample(n=min(self.sample_size, len(df)), random_state=42)
-        all_unknown_acronyms = []
-        all_unclear_terms = []
-        all_expansions = []
 
-        for i in range(0, len(df), 100):
-            chunk = df.iloc[i : i + 100]
+        # Process in chunks
+        for i in range(0, len(sample_df), 100):
+            chunk = sample_df.iloc[i : i + 100]
             messages_sample = "\n".join(chunk["message"].fillna("").tolist())
 
-            # Analyze with LLM
-            self.logger.info("\\nAnalyzing with LLM...")
+            self.logger.info(
+                f"Analyzing chunk {i//100 + 1} of {(len(sample_df)-1)//100 + 1}..."
+            )
 
             # Get unknown acronyms
             unknown_acronyms = self._identify_acronyms_with_llm(
                 messages_sample, list(acronym_freq.keys())[:50]
             )
-            all_unknown_acronyms = list(set(all_unknown_acronyms + unknown_acronyms))
+            for item in unknown_acronyms:
+                acronym = item.get("acronym", "").lower()
+                if acronym and acronym not in acronym_dict:
+                    acronym_dict[acronym] = item
 
             # Get unclear terms
             unclear_terms = self._identify_unclear_terms_with_llm(
                 messages_sample, list(word_freq.keys())[:100]
             )
-            all_unclear_terms = list(set(all_unclear_terms + unclear_terms))
-
-            # Get expansion suggestions
-            expansions = self._get_expansion_suggestions_with_llm(
-                messages_sample, unknown_acronyms
-            )
-            all_expansions = list(set(all_expansions + expansions))
-
+            for item in unclear_terms:
+                term = item.get("term", "").lower()
+                if term and term not in term_dict:
+                    term_dict[term] = item
+
+            # Get expansion suggestions (use acronyms found in this chunk)
+            if unknown_acronyms:
+                expansions = self._get_expansion_suggestions_with_llm(
+                    messages_sample, unknown_acronyms
+                )
+                for item in expansions:
+                    if isinstance(item, dict):
+                        acronym = item.get("acronym", "").lower()
+                        if acronym and acronym not in expansion_dict:
+                            expansion_dict[acronym] = item
+
+        # Convert dictionaries back to lists
         results = {
-            "unknown_acronyms": all_unknown_acronyms,
-            "unclear_terms": all_unclear_terms,
-            "suggested_expansions": all_expansions,
+            "unknown_acronyms": list(acronym_dict.values()),
+            "unclear_terms": list(term_dict.values()),
+            "suggested_expansions": list(expansion_dict.values()),
         }
 
+        self.logger.info(
+            f"Found {len(results['unknown_acronyms'])} unique unknown acronyms"
+        )
+        self.logger.info(f"Found {len(results['unclear_terms'])} unique unclear terms")
+        self.logger.info(
+            f"Found {len(results['suggested_expansions'])} unique expansions"
+        )
+
         self._save_llm_analysis(results)
 
         return results
@@ -97,11 +120,11 @@ class LLMNormalizationAnalyzer(PipelineStep):
             text = str(message)
 
             # Extract words
-            words = re.findall(r"\\b[a-z]+\\b", text.lower())
+            words = re.findall(r"\b[a-z]+\b", text.lower())
             word_freq.update(words)
 
             # Extract potential acronyms (2-6 uppercase letters)
-            acronyms = re.findall(r"\\b[A-Z]{2,6}\\b", text)
+            acronyms = re.findall(r"\b[A-Z]{2,6}\b", text)
             acronym_freq.update([a.lower() for a in acronyms])
 
         return word_freq, acronym_freq
@@ -110,9 +133,10 @@ class LLMNormalizationAnalyzer(PipelineStep):
         self, messages_sample: str, acronym_candidates: List[str]
     ) -> List[Dict]:
         """Use LLM to identify unknown acronyms"""
+        self.logger.info("identifying acronyms...")
         prompt = f"""You are analyzing messages.
 
-ACRONYMS FOUND: {', '.join(acronym_candidates[:30])}
+ACRONYMS FOUND: {', '.join(acronym_candidates[:100])}
 
 SAMPLE MESSAGES:
 {messages_sample[:2000]}
@@ -130,17 +154,37 @@ Respond with JSON:
 }}"""
 
         try:
+            # print(
+            #     json.dumps(
+            #         {
+            #             "model": self.model,
+            #             "messages": [{"role": "user", "content": prompt}],
+            #             "max_tokens": 2048,
+            #             "temperature": 0.3,
+            #         },
+            #         indent=2,
+            #     )
+            # )
             response = requests.post(
                 f"{self.llm_url}/v1/chat/completions",
-                json={"prompt": prompt, "max_tokens": 1000, "temperature": 0.3},
+                json={
+                    "model": self.model,
+                    "messages": [{"role": "user", "content": prompt}],
+                    "max_tokens": 2048,
+                    "temperature": 0.3,
+                },
                 timeout=120,
             )
 
             if response.status_code == 200:
-                text = response.json()["choices"][0]["text"]
-                parsed = json.loads(text)
-                return parsed.get("unknown_acronyms", [])
+                text = response.json()["choices"][0]["message"]["content"]
+                parsed = loads(text)
+                if isinstance(parsed, dict):
+                    return parsed.get("unknown_acronyms", [])
+            else:
+                raise RuntimeError("LLM Error")
         except Exception as e:
+            # raise e
             self.logger.error(f"LLM error: {e}")
 
         return []
@@ -149,9 +193,11 @@ Respond with JSON:
         self, messages_sample: str, word_candidates: List[str]
     ) -> List[Dict]:
         """Use LLM to identify unclear terms"""
+        self.logger.info("identifying unclear terms...")
+
         prompt = f"""You are analyzing messages.
 
-FREQUENT WORDS: {', '.join(word_candidates[:50])}
+FREQUENT WORDS: {', '.join(word_candidates[:100])}
 
 SAMPLE MESSAGES:
 {messages_sample[:2000]}
@@ -175,15 +221,24 @@ Respond with JSON:
         try:
             response = requests.post(
                 f"{self.llm_url}/v1/chat/completions",
-                json={"prompt": prompt, "max_tokens": 1000, "temperature": 0.3},
+                json={
+                    "model": self.model,
+                    "messages": [{"role": "user", "content": prompt}],
+                    "max_tokens": 2048,
+                    "temperature": 0.3,
+                },
                 timeout=120,
             )
 
             if response.status_code == 200:
-                text = response.json()["choices"][0]["text"]
-                parsed = json.loads(text)
-                return parsed.get("unclear_terms", [])
+                text = response.json()["choices"][0]["message"]["content"]
+                parsed = loads(text)
+                if isinstance(parsed, dict):
+                    return parsed.get("unclear_terms", [])
+            else:
+                raise RuntimeError("LLM Error")
         except Exception as e:
+            # raise e
             self.logger.error(f"LLM error: {e}")
 
         return []
@@ -192,10 +247,12 @@ Respond with JSON:
         self, messages_sample: str, acronyms: List[Dict]
     ) -> List[Dict]:
         """Get expansion suggestions for acronyms"""
+        self.logger.info("getting expansion suggestions...")
+
         if not acronyms:
             return []
 
-        acronym_list = ", ".join([a["acronym"] for a in acronyms[:10]])
+        acronym_list = ", ".join([a["acronym"] for a in acronyms[:100]])
 
         prompt = f"""Based on these medical/legal messages, suggest expansions for these acronyms:
 
@@ -213,17 +270,37 @@ Respond with JSON:
 }}"""
 
         try:
+            # print(
+            #     json.dumps(
+            #         {
+            #             "model": self.model,
+            #             "messages": [{"role": "user", "content": prompt}],
+            #             "max_tokens": 2048,
+            #             "temperature": 0.3,
+            #         },
+            #         indent=2,
+            #     )
+            # )
             response = requests.post(
                 f"{self.llm_url}/v1/chat/completions",
-                json={"prompt": prompt, "max_tokens": 800, "temperature": 0.3},
+                json={
+                    "model": self.model,
+                    "messages": [{"role": "user", "content": prompt}],
+                    "max_tokens": 2048,
+                    "temperature": 0.3,
+                },
                 timeout=120,
             )
 
             if response.status_code == 200:
-                text = response.json()["choices"][0]["text"]
-                parsed = json.loads(text)
-                return parsed.get("expansions", [])
+                text = response.json()["choices"][0]["message"]["content"]
+                parsed = loads(text)
+                if isinstance(parsed, dict):
+                    return parsed.get("expansions", [])
+            else:
+                raise RuntimeError("LLM Error")
         except Exception as e:
+            # raise e
             self.logger.error(f"LLM error: {e}")
 
         return []
@@ -259,7 +336,7 @@ Respond with JSON:
 
         filepath = self.output_dir / "llm_normalization_analysis.txt"
         with open(filepath, "w") as f:
-            f.write("\\n".join(text_output))
+            f.write("\n".join(text_output))
 
         self.logger.info(f"Saved analysis to: {filepath}")
 
@@ -270,9 +347,11 @@ if __name__ == "__main__":
     df = pd.read_csv("../_sources/signal_messages.csv")
 
     analyzer = LLMNormalizationAnalyzer(
-        llm_url="http://localhost:8000", sample_size=500
+        llm_url="http://eos.dgtlu.net:11434",
+        sample_size=14000,
+        model="hf.co/bartowski/Qwen2.5-14B-Instruct-GGUF:Q4_K_S",
     )
 
     results = analyzer.execute(df)
-    print(f"\\nFound {len(results['unknown_acronyms'])} unknown acronyms")
+    print(f"Found {len(results['unknown_acronyms'])} unknown acronyms")
     print(f"Found {len(results['unclear_terms'])} unclear terms")

+ 0 - 0
pipeline/steps/step0a_semantic_keyword_identification.py → pipeline/steps/step0b1_semantic_keyword_identification.py


+ 0 - 0
pipeline/steps/step0a_llm_keyword_identification.py → pipeline/steps/step0b2_llm_keyword_identification.py


+ 4 - 2
pipeline/steps/step0a_keyword_identification.py → pipeline/steps/step0b_keyword_identification.py

@@ -4,8 +4,10 @@ Step 0a: Identify relevant keywords from sample data.
 
 import pandas as pd
 import json
-from step0a_semantic_keyword_identification import SemanticKeywordIdentifier
-from step0a_llm_keyword_identification import LLMKeywordIdentifier
+from pipeline.steps.step0b1_semantic_keyword_identification import (
+    SemanticKeywordIdentifier,
+)
+from pipeline.steps.step0b2_llm_keyword_identification import LLMKeywordIdentifier
 from pipeline.utils.combine_keywords import combine_keywords, analyze_overlap
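
With the imports now package-qualified, the step presumably has to be run from the repository root as a module rather than as a loose script, e.g.:

    python -m pipeline.steps.step0b_keyword_identification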
 
 

+ 10 - 2
pipeline/utils/text_utils.py

@@ -5,7 +5,7 @@ Utility functions for text processing.
 import re
 from typing import List
 import pandas as pd
-from pipeline.common_defs import TEXT_EXPANSIONS
+from pipeline.common_defs import TEXT_EXPANSIONS, ACRONYMS
 
 def normalize_text(text: str) -> str:
     """
@@ -20,7 +20,15 @@ def normalize_text(text: str) -> str:
     if pd.isna(text) or text == '':
         return ""
 
-    text = str(text).lower()
+    text = str(text)
+
+    # Apply expansions for acronyms
+    for abbr, full in ACRONYMS.items():
+        # Use \b for word boundaries to only match complete words
+        pattern = r"\b" + re.escape(abbr) + r"\b"
+        text = re.sub(pattern, full, text)
+
+    text = text.lower()
 
     # Apply expansions
     for abbr, full in TEXT_EXPANSIONS.items():
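
A minimal usage sketch of the new two-pass behavior (uppercase acronyms expanded first, then the text is lowercased and the informal expansions applied), assuming the remainder of normalize_text is unchanged and also matches whole words:

    from pipeline.utils.text_utils import normalize_text

    # "BTW" and "DMV" expand case-sensitively via ACRONYMS before lowercasing;
    # "appt" expands afterwards via TEXT_EXPANSIONS.
    print(normalize_text("BTW the appt is at the DMV"))
    # by the way the appointment is at the department of motor vehicles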

+ 1 - 0
pyproject.toml

@@ -5,6 +5,7 @@ description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
+    "json-repair>=0.54.2",
     "openpyxl>=3.1.5",
     "pandas>=2.3.3",
     "scikit-learn>=1.7.2",
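
The new dependency and the lockfile entry below are consistent with having been added via uv, which writes both files in one step, e.g.:

    uv add json-repair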

+ 11 - 0
uv.lock

@@ -82,6 +82,7 @@ name = "discovery"
 version = "0.1.0"
 source = { virtual = "." }
 dependencies = [
+    { name = "json-repair" },
     { name = "openpyxl" },
     { name = "pandas" },
     { name = "scikit-learn" },
@@ -90,6 +91,7 @@ dependencies = [
 
 [package.metadata]
 requires-dist = [
+    { name = "json-repair", specifier = ">=0.54.2" },
     { name = "openpyxl", specifier = ">=3.1.5" },
     { name = "pandas", specifier = ">=2.3.3" },
     { name = "scikit-learn", specifier = ">=1.7.2" },
@@ -201,6 +203,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/1e/e8/685f47e0d754320684db4425a0967f7d3fa70126bffd76110b7009a0090f/joblib-1.5.2-py3-none-any.whl", hash = "sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241", size = 308396, upload-time = "2025-08-27T12:15:45.188Z" },
 ]
 
+[[package]]
+name = "json-repair"
+version = "0.54.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ff/05/9fbcd5ffab9c41455e7d80af65a90876718b8ea2fb4525e187ab11836dd4/json_repair-0.54.2.tar.gz", hash = "sha256:4b6b62ce17f1a505b220fa4aadba1fc37dc9c221544f158471efe3775620bad6", size = 38575, upload-time = "2025-11-25T19:31:22.768Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/53/3a/1b4df9adcd69fee9c9e4b439c13e8c866f2fae520054aede7030b2278be9/json_repair-0.54.2-py3-none-any.whl", hash = "sha256:be51cce5dca97e0c24ebdf61a1ede2449a8a7666012de99467bb7b0afb35179b", size = 29322, upload-time = "2025-11-25T19:31:21.492Z" },
+]
+
 [[package]]
 name = "markupsafe"
 version = "3.0.3"