# Repository: justadri/disco
"""
Step 0b: Analyze text patterns and suggest normalizations.
"""

from typing import List, Dict, Set, Tuple
from collections import Counter
import re
import pandas as pd
from pipeline.models.base import PipelineStep


class NormalizationAnalyzer(PipelineStep):
    """Analyze text patterns and suggest normalizations.

    Scans the ``message`` column of a DataFrame for known abbreviations,
    acronyms, misspellings and date-like patterns, then writes the findings
    as a JSON file and a human-readable text report.

    NOTE(review): ``self.logger``, ``self.output_dir`` and
    ``self.save_results`` are not defined in this class; presumably they are
    provided by ``PipelineStep`` -- confirm against the base class.
    """

    # Lower-case abbreviation -> expansion. Keys may end in "." or " ".
    _KNOWN_ABBREVIATIONS = {
        "dr.": "doctor",
        "dr ": "doctor ",
        "appt": "appointment",
        "hosp": "hospital",
        "med": "medical",
        "meds": "medications",
        "rx": "prescription",
        "pt": "patient",
        "pts": "patients",
        "pron": "pronoun",
        "prns": "pronouns",
        "info": "information",
        "dept": "department",
        "rep": "representative",
        "admin": "administration",
        "surg": "surgery",
        "proc": "procedure",
    }

    # Lower-case acronym -> expansion.
    _KNOWN_ACRONYMS = {
        "msk": "memorial sloan kettering",
        "er": "emergency room",
        "icu": "intensive care unit",
        "ob": "obstetrics",
        "gyn": "gynecology",
        "obgyn": "obstetrics gynecology",
        "pcp": "primary care physician",
        "np": "nurse practitioner",
        "pa": "physician assistant",
        "rn": "registered nurse",
        "lpn": "licensed practical nurse",
        "emr": "electronic medical record",
        "ehr": "electronic health record",
        "hipaa": "health insurance portability accountability act",
        "lgbtq": "lesbian gay bisexual transgender queer",
        "lgbt": "lesbian gay bisexual transgender",
    }

    # Misspelling -> correct spelling.
    _KNOWN_MISSPELLINGS = {
        "recieve": "receive",
        "occured": "occurred",
        "seperate": "separate",
        "definately": "definitely",
        "accomodate": "accommodate",
        "untill": "until",
        "thier": "their",
        "recieved": "received",
    }

    # Acronym candidates: stand-alone runs of 2-6 upper-case ASCII letters.
    _ACRONYM_RE = re.compile(r"\b[A-Z]{2,6}\b")

    # Everything that is not a lower-case ASCII letter.
    _NON_LETTER_RE = re.compile(r"[^a-z]")

    # (compiled regex, report name) pairs, applied to lower-cased text.
    _DATE_PATTERNS = tuple(
        (re.compile(raw), label)
        for raw, label in (
            (r"\d{1,2}/\d{1,2}/\d{2,4}", "date_slash"),
            (r"\d{1,2}-\d{1,2}-\d{2,4}", "date_dash"),
            (
                r"\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*\s+\d{1,2}",
                "date_month_day",
            ),
            (
                r"\d{1,2}\s+(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)",
                "date_day_month",
            ),
        )
    )

    def __init__(self, output_dir: str = "./pipeline_output"):
        """Forward ``output_dir`` to the ``PipelineStep`` initializer."""
        super().__init__(output_dir)

    def execute(self, df: pd.DataFrame) -> Dict[str, Dict[str, str]]:
        """
        Analyze text and suggest normalizations.

        Args:
            df: DataFrame with messages; the ``message`` column is read.

        Returns:
            Dictionary with the keys ``abbreviations``, ``acronyms``,
            ``misspellings`` and ``datetime_patterns``, each mapping to the
            entries found in the messages.

        Raises:
            KeyError: If ``df`` has no ``message`` column.
        """
        self.logger.info("Analyzing text patterns for normalization...")

        suggestions = {
            "abbreviations": self._find_abbreviations(df),
            "acronyms": self._find_acronyms(df),
            "misspellings": self._find_misspellings(df),
            "datetime_patterns": self._find_datetime_patterns(df),
        }

        self._save_normalization_suggestions(suggestions)

        return suggestions

    @staticmethod
    def _iter_messages(df: pd.DataFrame):
        """Yield each value of ``df["message"]`` as ``str`` (missing -> "")."""
        for message in df["message"].fillna(""):
            yield str(message)

    @staticmethod
    def _build_abbreviation_regex(known: Dict[str, str]) -> "re.Pattern":
        """Compile one regex that matches any key of ``known`` as a token.

        Every alternative must start on a word boundary. A key that ends in
        a word character must also end on one, so "med" is not found inside
        "meds" or "medical". Keys that end in punctuation or a space
        ("dr.", "dr ") are matched literally; a trailing ``\\b`` would make
        them unmatchable before whitespace.
        """
        alternatives = []
        for key in sorted(known, key=len, reverse=True):
            fragment = re.escape(key)
            if key[-1].isalnum() or key[-1] == "_":
                fragment += r"\b"
            alternatives.append(fragment)
        return re.compile(r"\b(?:" + "|".join(alternatives) + ")")

    def _find_abbreviations(self, df: pd.DataFrame) -> Dict[str, str]:
        """Find known abbreviations in the lower-cased messages.

        Returns:
            Mapping of each abbreviation that occurs at least once to its
            expansion, in order of first occurrence.
        """
        self.logger.info("Finding abbreviations...")

        known_abbrevs = self._KNOWN_ABBREVIATIONS
        abbreviation_re = self._build_abbreviation_regex(known_abbrevs)

        found_abbrevs: Dict[str, str] = {}
        for message in self._iter_messages(df):
            # Every match is, by construction, exactly one of the known keys.
            for token in abbreviation_re.findall(message.lower()):
                found_abbrevs[token] = known_abbrevs[token]

        self.logger.info(f"Found {len(found_abbrevs)} abbreviations")
        return found_abbrevs

    def _find_acronyms(self, df: pd.DataFrame) -> Dict[str, str]:
        """Find known acronyms written in upper case in the messages.

        Returns:
            Mapping of each lower-cased acronym that occurs at least once to
            its expansion, in order of first occurrence.
        """
        self.logger.info("Finding acronyms...")

        known_acronyms = self._KNOWN_ACRONYMS

        found_acronyms: Dict[str, str] = {}
        for message in self._iter_messages(df):
            # Case is preserved here: only ALL-CAPS tokens are candidates.
            for token in self._ACRONYM_RE.findall(message):
                key = token.lower()
                if key in known_acronyms:
                    found_acronyms[key] = known_acronyms[key]

        self.logger.info(f"Found {len(found_acronyms)} acronyms")
        return found_acronyms

    def _find_misspellings(self, df: pd.DataFrame) -> Dict[str, str]:
        """Find known misspellings in the lower-cased messages.

        Each whitespace-separated word is stripped of every character other
        than ``a``-``z`` before it is looked up.

        Returns:
            Mapping of each misspelling that occurs at least once to its
            correction, in order of first occurrence.
        """
        self.logger.info("Finding common misspellings...")

        known_misspellings = self._KNOWN_MISSPELLINGS

        found_misspellings: Dict[str, str] = {}
        for message in self._iter_messages(df):
            for word in message.lower().split():
                clean_word = self._NON_LETTER_RE.sub("", word)
                if clean_word in known_misspellings:
                    found_misspellings[clean_word] = known_misspellings[clean_word]

        self.logger.info(f"Found {len(found_misspellings)} misspellings")
        return found_misspellings

    def _find_datetime_patterns(self, df: pd.DataFrame) -> Dict[str, str]:
        """Find which date-like patterns occur in the lower-cased messages.

        Returns:
            Mapping of pattern name to its regex source, for every pattern
            that matches at least one message, in order of first match.
        """
        self.logger.info("Finding date/time patterns...")

        patterns: Dict[str, str] = {}
        for message in self._iter_messages(df):
            if len(patterns) == len(self._DATE_PATTERNS):
                # Every pattern has been seen; more scanning changes nothing.
                break
            text = message.lower()
            for regex, pattern_name in self._DATE_PATTERNS:
                if pattern_name not in patterns and regex.search(text):
                    patterns[pattern_name] = regex.pattern

        self.logger.info(f"Found {len(patterns)} date/time patterns")
        return patterns

    def _save_normalization_suggestions(self, suggestions: Dict):
        """Save the suggestions as JSON and as a plain-text report.

        Args:
            suggestions: Mapping with the keys ``abbreviations``,
                ``acronyms``, ``misspellings`` and ``datetime_patterns``.

        The JSON file is written through ``self.save_results``. The text
        report goes to ``normalization_suggestions.txt`` under
        ``self.output_dir``.
        """
        self.save_results(suggestions, "normalization_suggestions.json")

        rule = "-" * 80
        text_output = ["TEXT NORMALIZATION SUGGESTIONS", "=" * 80, ""]

        def add_mapping_section(title: str, mapping: Dict[str, str]) -> None:
            # Title, rule, one sorted "key -> value" line per entry, blank line.
            text_output.append(title)
            text_output.append(rule)
            text_output.extend(
                f"  {source:20} -> {target}"
                for source, target in sorted(mapping.items())
            )
            text_output.append("")

        add_mapping_section("ABBREVIATIONS TO EXPAND:", suggestions["abbreviations"])
        add_mapping_section("ACRONYMS TO EXPAND:", suggestions["acronyms"])

        # The misspellings section is omitted entirely when nothing was found.
        if suggestions["misspellings"]:
            add_mapping_section("MISSPELLINGS TO CORRECT:", suggestions["misspellings"])

        text_output.append("DATE/TIME PATTERNS FOUND:")
        text_output.append(rule)
        text_output.extend(
            f"  {pattern_name}: {pattern}"
            for pattern_name, pattern in suggestions["datetime_patterns"].items()
        )

        filepath = self.output_dir / "normalization_suggestions.txt"
        # Explicit encoding so the report does not depend on the platform locale.
        with open(filepath, "w", encoding="utf-8") as f:
            f.write("\n".join(text_output))

        self.logger.info(f"Saved normalization suggestions to: {filepath}")


if __name__ == "__main__":
    # Manual run: analyze the sample export and print how many suggestions
    # of each kind were produced.
    import pandas as pd

    messages = pd.read_csv("../_sources/signal_messages.csv")
    suggestions = NormalizationAnalyzer().execute(messages)

    summary = [
        "\nNormalization suggestions:",
        f"  Abbreviations: {len(suggestions['abbreviations'])}",
        f"  Acronyms: {len(suggestions['acronyms'])}",
        f"  Misspellings: {len(suggestions['misspellings'])}",
        f"  Date patterns: {len(suggestions['datetime_patterns'])}",
    ]
    print("\n".join(summary))