"""
Step 0b: Analyze text patterns and suggest normalizations.
"""

from typing import List, Dict, Iterator, Set, Tuple
from collections import Counter
import re
import pandas as pd

from pipeline.models.base import PipelineStep


# Abbreviation -> expansion. Two keys deliberately carry a trailing "." or " "
# ("dr." / "dr "); the scanner below looks at the character following each
# token so that those keys can actually be matched.
_KNOWN_ABBREVIATIONS: Dict[str, str] = {
    "dr.": "doctor",
    "dr ": "doctor ",
    "appt": "appointment",
    "hosp": "hospital",
    "med": "medical",
    "meds": "medications",
    "rx": "prescription",
    "pt": "patient",
    "pts": "patients",
    "pron": "pronoun",
    "prns": "pronouns",
    "info": "information",
    "dept": "department",
    "rep": "representative",
    "admin": "administration",
    "surg": "surgery",
    "proc": "procedure",
}

# Lower-cased acronym -> expansion. Acronyms are detected in their upper-case
# form in the raw text and reported lower-cased.
_KNOWN_ACRONYMS: Dict[str, str] = {
    "msk": "memorial sloan kettering",
    "er": "emergency room",
    "icu": "intensive care unit",
    "ob": "obstetrics",
    "gyn": "gynecology",
    "obgyn": "obstetrics gynecology",
    "pcp": "primary care physician",
    "np": "nurse practitioner",
    "pa": "physician assistant",
    "rn": "registered nurse",
    "lpn": "licensed practical nurse",
    "emr": "electronic medical record",
    "ehr": "electronic health record",
    "hipaa": "health insurance portability accountability act",
    "lgbtq": "lesbian gay bisexual transgender queer",
    "lgbt": "lesbian gay bisexual transgender",
}

# Misspelling -> correction.
_KNOWN_MISSPELLINGS: Dict[str, str] = {
    "recieve": "receive",
    "occured": "occurred",
    "seperate": "separate",
    "definately": "definitely",
    "accomodate": "accommodate",
    "untill": "until",
    "thier": "their",
    "recieved": "received",
}

# (regex, name) pairs searched against lower-cased text. The regex strings are
# emitted verbatim as output values, so they are kept as plain strings here.
# NOTE: these are loose heuristics; e.g. the month-name alternatives are
# prefixes, so "market 5" also satisfies "date_month_day".
_DATE_PATTERNS: Tuple[Tuple[str, str], ...] = (
    (r"\d{1,2}/\d{1,2}/\d{2,4}", "date_slash"),
    (r"\d{1,2}-\d{1,2}-\d{2,4}", "date_dash"),
    (
        r"\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*\s+\d{1,2}",
        "date_month_day",
    ),
    (
        r"\d{1,2}\s+(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)",
        "date_day_month",
    ),
)

# A 2-5 letter lower-case token plus, optionally, the "." or " " right after it.
# The suffix is captured separately because a trailing ``\b`` after ``\.?``
# only succeeds when a word character follows the period, which made the
# "dr." / "dr " table entries unreachable for text like "dr. smith".
_ABBREV_TOKEN_RE = re.compile(r"\b([a-z]{2,5})\b([. ]?)")
_ACRONYM_RE = re.compile(r"\b[A-Z]{2,6}\b")
_NON_LOWER_ALPHA_RE = re.compile(r"[^a-z]")


def _iter_messages(df: pd.DataFrame) -> Iterator[str]:
    """Yield each value of the ``message`` column as ``str`` (missing -> ``""``)."""
    for message in df["message"].fillna(""):
        yield str(message)


class NormalizationAnalyzer(PipelineStep):
    """Analyze text patterns and suggest normalizations.

    Scans the ``message`` column of a DataFrame for known abbreviations,
    acronyms, misspellings and date-like patterns, and writes the findings
    as JSON and as a plain-text report.

    Uses ``self.logger``, ``self.save_results`` and ``self.output_dir``, which
    are not defined in this module (presumably provided by ``PipelineStep``).
    """

    def __init__(self, output_dir: str = "./pipeline_output"):
        super().__init__(output_dir)

    def execute(self, df: pd.DataFrame) -> Dict[str, Dict[str, str]]:
        """
        Analyze text and suggest normalizations.

        Args:
            df: DataFrame with a ``message`` column.

        Returns:
            Dictionary with the keys ``abbreviations``, ``acronyms``,
            ``misspellings`` and ``datetime_patterns``, each mapping to a
            dictionary of findings.

        Raises:
            KeyError: if ``df`` has no ``message`` column.
        """
        self.logger.info("Analyzing text patterns for normalization...")

        suggestions = {
            "abbreviations": self._find_abbreviations(df),
            "acronyms": self._find_acronyms(df),
            "misspellings": self._find_misspellings(df),
            "datetime_patterns": self._find_datetime_patterns(df),
        }

        self._save_normalization_suggestions(suggestions)

        return suggestions

    def _find_abbreviations(self, df: pd.DataFrame) -> Dict[str, str]:
        """Find known abbreviations in the lower-cased messages.

        Returns:
            Mapping of each abbreviation key that occurs in the text to its
            expansion, in order of first occurrence.
        """
        self.logger.info("Finding abbreviations...")

        found_abbrevs: Dict[str, str] = {}
        for message in _iter_messages(df):
            for token, suffix in _ABBREV_TOKEN_RE.findall(message.lower()):
                # Prefer the suffixed table entry ("dr." / "dr "); otherwise
                # fall back to the bare token ("appt", "rx", ...).
                suffixed = token + suffix
                if suffix and suffixed in _KNOWN_ABBREVIATIONS:
                    found_abbrevs[suffixed] = _KNOWN_ABBREVIATIONS[suffixed]
                elif token in _KNOWN_ABBREVIATIONS:
                    found_abbrevs[token] = _KNOWN_ABBREVIATIONS[token]

        self.logger.info(f"Found {len(found_abbrevs)} abbreviations")
        return found_abbrevs

    def _find_acronyms(self, df: pd.DataFrame) -> Dict[str, str]:
        """Find known acronyms written in upper case in the raw messages.

        Returns:
            Mapping of each lower-cased acronym that occurs in the text to its
            expansion, in order of first occurrence.
        """
        self.logger.info("Finding acronyms...")

        found_acronyms: Dict[str, str] = {}
        for message in _iter_messages(df):
            # Case is preserved here on purpose: only all-caps tokens count.
            for token in _ACRONYM_RE.findall(message):
                key = token.lower()
                if key in _KNOWN_ACRONYMS:
                    found_acronyms[key] = _KNOWN_ACRONYMS[key]

        self.logger.info(f"Found {len(found_acronyms)} acronyms")
        return found_acronyms

    def _find_misspellings(self, df: pd.DataFrame) -> Dict[str, str]:
        """Find known misspellings in the lower-cased messages.

        Returns:
            Mapping of each misspelling that occurs in the text to its
            correction, in order of first occurrence.
        """
        self.logger.info("Finding common misspellings...")

        found_misspellings: Dict[str, str] = {}
        for message in _iter_messages(df):
            for word in message.lower().split():
                # Drop punctuation/digits so "recieve," still matches.
                clean_word = _NON_LOWER_ALPHA_RE.sub("", word)
                if clean_word in _KNOWN_MISSPELLINGS:
                    found_misspellings[clean_word] = _KNOWN_MISSPELLINGS[clean_word]

        self.logger.info(f"Found {len(found_misspellings)} misspellings")
        return found_misspellings

    def _find_datetime_patterns(self, df: pd.DataFrame) -> Dict[str, str]:
        """Find which date-like patterns occur in the lower-cased messages.

        Returns:
            Mapping of pattern name to the regex string that matched, in order
            of first occurrence.
        """
        self.logger.info("Finding date/time patterns...")

        patterns: Dict[str, str] = {}
        # Patterns not seen yet; a pattern is dropped once it has matched so it
        # is never searched again.
        pending = [(name, raw, re.compile(raw)) for raw, name in _DATE_PATTERNS]

        for message in _iter_messages(df):
            if not pending:
                break  # every pattern already found; nothing left to learn
            text = message.lower()
            still_pending = []
            for name, raw, compiled in pending:
                if compiled.search(text):
                    patterns[name] = raw
                else:
                    still_pending.append((name, raw, compiled))
            pending = still_pending

        self.logger.info(f"Found {len(patterns)} date/time patterns")
        return patterns

    def _save_normalization_suggestions(self, suggestions: Dict) -> None:
        """Save the suggestions as JSON and as a readable text report.

        Args:
            suggestions: Dictionary with the keys ``abbreviations``,
                ``acronyms``, ``misspellings`` and ``datetime_patterns``.
        """
        self.save_results(suggestions, "normalization_suggestions.json")

        rule = "-" * 80

        def mapping_section(title: str, mapping: Dict[str, str]) -> List[str]:
            """Render one ``key -> value`` section, followed by a blank line."""
            rows = [f" {key:20} -> {value}" for key, value in sorted(mapping.items())]
            return [title, rule, *rows, ""]

        text_output = ["TEXT NORMALIZATION SUGGESTIONS", "=" * 80, ""]
        text_output += mapping_section(
            "ABBREVIATIONS TO EXPAND:", suggestions["abbreviations"]
        )
        text_output += mapping_section("ACRONYMS TO EXPAND:", suggestions["acronyms"])

        # Unlike the other sections, this one is omitted entirely when empty.
        if suggestions["misspellings"]:
            text_output += mapping_section(
                "MISSPELLINGS TO CORRECT:", suggestions["misspellings"]
            )

        text_output.append("DATE/TIME PATTERNS FOUND:")
        text_output.append(rule)
        text_output.extend(
            f" {pattern_name}: {pattern}"
            for pattern_name, pattern in suggestions["datetime_patterns"].items()
        )

        filepath = self.output_dir / "normalization_suggestions.txt"
        # Explicit encoding so the report does not depend on the platform default.
        with open(filepath, "w", encoding="utf-8") as f:
            f.write("\n".join(text_output))

        self.logger.info(f"Saved normalization suggestions to: {filepath}")


if __name__ == "__main__":
    df = pd.read_csv("../_sources/signal_messages.csv")

    analyzer = NormalizationAnalyzer()
    suggestions = analyzer.execute(df)

    print("\nNormalization suggestions:")
    print(f" Abbreviations: {len(suggestions['abbreviations'])}")
    print(f" Acronyms: {len(suggestions['acronyms'])}")
    print(f" Misspellings: {len(suggestions['misspellings'])}")
    print(f" Date patterns: {len(suggestions['datetime_patterns'])}")