| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246 |
- """
- Step 0b: Analyze text patterns and suggest normalizations.
- """
- from typing import List, Dict, Set, Tuple
- from collections import Counter
- import re
- import pandas as pd
- from pipeline.models.base import PipelineStep
class NormalizationAnalyzer(PipelineStep):
    """Analyze message text and suggest normalizations.

    Scans the ``message`` column of a DataFrame for known abbreviations,
    acronyms, misspellings, and date/time formats, then writes the
    suggestions to ``normalization_suggestions.json`` and a human-readable
    ``normalization_suggestions.txt`` under ``output_dir``.
    """

    def __init__(self, output_dir: str = "./pipeline_output"):
        super().__init__(output_dir)

    def execute(self, df: pd.DataFrame) -> Dict[str, Dict[str, str]]:
        """
        Analyze text and suggest normalizations.

        Args:
            df: DataFrame with a ``message`` column.

        Returns:
            Dictionary of suggested normalizations keyed by category:
            ``abbreviations``, ``acronyms``, ``misspellings``,
            ``datetime_patterns``.
        """
        self.logger.info("Analyzing text patterns for normalization...")
        suggestions = {
            "abbreviations": self._find_abbreviations(df),
            "acronyms": self._find_acronyms(df),
            "misspellings": self._find_misspellings(df),
            "datetime_patterns": self._find_datetime_patterns(df),
        }
        self._save_normalization_suggestions(suggestions)
        return suggestions

    def _find_abbreviations(self, df: pd.DataFrame) -> Dict[str, str]:
        """Return the known abbreviations that actually occur in the data."""
        self.logger.info("Finding abbreviations...")
        # Common medical/legal abbreviations and their expansions.
        known_abbrevs = {
            "dr.": "doctor",
            "dr ": "doctor ",
            "appt": "appointment",
            "hosp": "hospital",
            "med": "medical",
            "meds": "medications",
            "rx": "prescription",
            "pt": "patient",
            "pts": "patients",
            "pron": "pronoun",
            "prns": "pronouns",
            "info": "information",
            "dept": "department",
            "rep": "representative",
            "admin": "administration",
            "surg": "surgery",
            "proc": "procedure",
        }
        # BUG FIX: the previous pattern r"\b[a-z]{2,5}\.?\b" could never
        # match a trailing period (a \b after "." only exists when a word
        # character follows the dot), so dotted keys like "dr." were
        # unreachable.  Ending the word with \b *before* the optional dot
        # fixes that.  Keys that end with a space (e.g. "dr ") contain a
        # character no token can, so they are checked by substring search.
        token_re = re.compile(r"\b[a-z]{2,5}\b\.?")
        space_keys = [k for k in known_abbrevs if k.endswith(" ")]
        found_abbrevs: Dict[str, str] = {}
        for message in df["message"].fillna(""):
            text = str(message).lower()
            for token in token_re.findall(text):
                # Try both the dotted token ("dr.") and the bare word
                # ("info" when the text had "info."), mirroring what the
                # old backtracking match allowed for dot-less keys.
                for candidate in (token, token.rstrip(".")):
                    if candidate in known_abbrevs:
                        found_abbrevs[candidate] = known_abbrevs[candidate]
            for key in space_keys:
                if key in text:
                    found_abbrevs[key] = known_abbrevs[key]
        self.logger.info(f"Found {len(found_abbrevs)} abbreviations")
        return found_abbrevs

    def _find_acronyms(self, df: pd.DataFrame) -> Dict[str, str]:
        """Return the known acronyms (matched upper-case) found in the data."""
        self.logger.info("Finding acronyms...")
        known_acronyms = {
            "msk": "memorial sloan kettering",
            "er": "emergency room",
            "icu": "intensive care unit",
            "ob": "obstetrics",
            "gyn": "gynecology",
            "obgyn": "obstetrics gynecology",
            "pcp": "primary care physician",
            "np": "nurse practitioner",
            "pa": "physician assistant",
            "rn": "registered nurse",
            "lpn": "licensed practical nurse",
            "emr": "electronic medical record",
            "ehr": "electronic health record",
            "hipaa": "health insurance portability accountability act",
            "lgbtq": "lesbian gay bisexual transgender queer",
            "lgbt": "lesbian gay bisexual transgender",
        }
        # Compiled once and hoisted out of the per-message loop.
        acronym_re = re.compile(r"\b[A-Z]{2,6}\b")
        found_acronyms: Dict[str, str] = {}
        for message in df["message"].fillna(""):
            # Match against the original casing so only true upper-case
            # acronyms are caught; keys are stored lower-case.
            for match in acronym_re.findall(str(message)):
                key = match.lower()
                if key in known_acronyms:
                    found_acronyms[key] = known_acronyms[key]
        self.logger.info(f"Found {len(found_acronyms)} acronyms")
        return found_acronyms

    def _find_misspellings(self, df: pd.DataFrame) -> Dict[str, str]:
        """Return the known misspellings found in the data."""
        self.logger.info("Finding common misspellings...")
        # Common misspellings in medical/legal context.
        known_misspellings = {
            "recieve": "receive",
            "occured": "occurred",
            "seperate": "separate",
            "definately": "definitely",
            "accomodate": "accommodate",
            "untill": "until",
            "thier": "their",
            "recieved": "received",
        }
        # Compiled once and hoisted: strips punctuation/digits from tokens.
        strip_re = re.compile(r"[^a-z]")
        found_misspellings: Dict[str, str] = {}
        for message in df["message"].fillna(""):
            for word in str(message).lower().split():
                clean_word = strip_re.sub("", word)
                if clean_word in known_misspellings:
                    found_misspellings[clean_word] = known_misspellings[clean_word]
        self.logger.info(f"Found {len(found_misspellings)} misspellings")
        return found_misspellings

    def _find_datetime_patterns(self, df: pd.DataFrame) -> Dict[str, str]:
        """Return a name -> regex map of the date formats seen in the data."""
        self.logger.info("Finding date/time patterns...")
        # (raw pattern string, label); the raw string is what gets reported.
        date_patterns = [
            (r"\d{1,2}/\d{1,2}/\d{2,4}", "date_slash"),
            (r"\d{1,2}-\d{1,2}-\d{2,4}", "date_dash"),
            (
                r"\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*\s+\d{1,2}",
                "date_month_day",
            ),
            (
                r"\d{1,2}\s+(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)",
                "date_day_month",
            ),
        ]
        # Compile each pattern once instead of per message.
        compiled = [(re.compile(raw), name, raw) for raw, name in date_patterns]
        patterns: Dict[str, str] = {}
        for message in df["message"].fillna(""):
            text = str(message).lower()
            for regex, pattern_name, raw in compiled:
                if pattern_name not in patterns and regex.search(text):
                    patterns[pattern_name] = raw
            if len(patterns) == len(compiled):
                break  # every known pattern already seen; no need to keep scanning
        self.logger.info(f"Found {len(patterns)} date/time patterns")
        return patterns

    def _save_normalization_suggestions(self, suggestions: Dict):
        """Persist suggestions as JSON plus a readable text report."""
        self.save_results(suggestions, "normalization_suggestions.json")
        # Build a human-readable companion file section by section.
        text_output = []
        text_output.append("TEXT NORMALIZATION SUGGESTIONS")
        text_output.append("=" * 80)
        text_output.append("")
        text_output.append("ABBREVIATIONS TO EXPAND:")
        text_output.append("-" * 80)
        for abbrev, expansion in sorted(suggestions["abbreviations"].items()):
            text_output.append(f"  {abbrev:20} -> {expansion}")
        text_output.append("")
        text_output.append("ACRONYMS TO EXPAND:")
        text_output.append("-" * 80)
        for acronym, expansion in sorted(suggestions["acronyms"].items()):
            text_output.append(f"  {acronym:20} -> {expansion}")
        text_output.append("")
        if suggestions["misspellings"]:
            text_output.append("MISSPELLINGS TO CORRECT:")
            text_output.append("-" * 80)
            for misspell, correct in sorted(suggestions["misspellings"].items()):
                text_output.append(f"  {misspell:20} -> {correct}")
            text_output.append("")
        text_output.append("DATE/TIME PATTERNS FOUND:")
        text_output.append("-" * 80)
        for pattern_name, pattern in suggestions["datetime_patterns"].items():
            text_output.append(f"  {pattern_name}: {pattern}")
        filepath = self.output_dir / "normalization_suggestions.txt"
        with open(filepath, "w") as f:
            f.write("\n".join(text_output))
        self.logger.info(f"Saved normalization suggestions to: {filepath}")
if __name__ == "__main__":
    # NOTE: pandas is already imported at module level as `pd`; the previous
    # redundant `import pandas as pd` here has been removed.
    df = pd.read_csv("../_sources/signal_messages.csv")
    analyzer = NormalizationAnalyzer()
    suggestions = analyzer.execute(df)
    print("\nNormalization suggestions:")
    print(f"  Abbreviations: {len(suggestions['abbreviations'])}")
    print(f"  Acronyms: {len(suggestions['acronyms'])}")
    print(f"  Misspellings: {len(suggestions['misspellings'])}")
    print(f"  Date patterns: {len(suggestions['datetime_patterns'])}")
|