|
|
@@ -1,246 +0,0 @@
|
|
|
-"""
|
|
|
-Step 0b: Analyze text patterns and suggest normalizations.
|
|
|
-"""
|
|
|
-
|
|
|
-from typing import List, Dict, Set, Tuple
|
|
|
-from collections import Counter
|
|
|
-import re
|
|
|
-import pandas as pd
|
|
|
-from pipeline.models.base import PipelineStep
|
|
|
-
|
|
|
-
|
|
|
class NormalizationAnalyzer(PipelineStep):
    """Analyze message text and suggest normalizations.

    Scans the ``message`` column of a DataFrame for known abbreviations,
    acronyms, misspellings and date patterns.  The findings are passed to
    ``self.save_results`` and also written as a plain-text report inside
    ``self.output_dir``.

    NOTE(review): ``self.logger``, ``self.save_results`` and
    ``self.output_dir`` are not defined in this class; presumably they are
    provided by ``PipelineStep`` -- confirm against the base class.
    """

    # Abbreviation -> expansion.  Keys are matched against lower-cased
    # tokens; "dr." keeps its period so that the dotted form is reported
    # separately from the bare "dr".
    _KNOWN_ABBREVIATIONS = {
        "dr.": "doctor",
        "dr": "doctor",
        "appt": "appointment",
        "hosp": "hospital",
        "med": "medical",
        "meds": "medications",
        "rx": "prescription",
        "pt": "patient",
        "pts": "patients",
        "pron": "pronoun",
        "prns": "pronouns",
        "info": "information",
        "dept": "department",
        "rep": "representative",
        "admin": "administration",
        "surg": "surgery",
        "proc": "procedure",
    }

    # Acronym (lower-cased) -> expansion.  Only tokens written in upper
    # case in the message are looked up here.
    _KNOWN_ACRONYMS = {
        "msk": "memorial sloan kettering",
        "er": "emergency room",
        "icu": "intensive care unit",
        "ob": "obstetrics",
        "gyn": "gynecology",
        "obgyn": "obstetrics gynecology",
        "pcp": "primary care physician",
        "np": "nurse practitioner",
        "pa": "physician assistant",
        "rn": "registered nurse",
        "lpn": "licensed practical nurse",
        "emr": "electronic medical record",
        "ehr": "electronic health record",
        "hipaa": "health insurance portability accountability act",
        "lgbtq": "lesbian gay bisexual transgender queer",
        "lgbt": "lesbian gay bisexual transgender",
    }

    # Misspelling -> correct spelling.
    _KNOWN_MISSPELLINGS = {
        "recieve": "receive",
        "occured": "occurred",
        "seperate": "separate",
        "definately": "definitely",
        "accomodate": "accommodate",
        "untill": "until",
        "thier": "their",
        "recieved": "received",
    }

    # (regex, name) pairs.  The regex source itself is what gets reported,
    # so these strings are part of the output and must not be reformatted.
    # NOTE(review): the two month patterns also match ordinary words that
    # merely start with a month prefix (e.g. "market 12", "2 markets").
    _DATE_PATTERNS = (
        (r"\d{1,2}/\d{1,2}/\d{2,4}", "date_slash"),
        (r"\d{1,2}-\d{1,2}-\d{2,4}", "date_dash"),
        (
            r"\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*\s+\d{1,2}",
            "date_month_day",
        ),
        (
            r"\d{1,2}\s+(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)",
            "date_day_month",
        ),
    )

    # A whole lower-case word of 2-5 letters, optionally followed by a
    # period.  The word boundary sits BEFORE the optional period: with the
    # boundary after it, "dr." followed by a space could never be captured
    # together with its period.
    _ABBREV_TOKEN = re.compile(r"\b[a-z]{2,5}\b\.?")

    # A whole upper-case word of 2-6 letters.
    _ACRONYM_TOKEN = re.compile(r"\b[A-Z]{2,6}\b")

    # Everything that is not a lower-case ASCII letter.
    _NON_LETTERS = re.compile(r"[^a-z]")

    def __init__(self, output_dir: str = "./pipeline_output"):
        """Create the step.

        Args:
            output_dir: Output directory, forwarded unchanged to
                ``PipelineStep.__init__``.
        """
        super().__init__(output_dir)

    def execute(self, df: pd.DataFrame) -> Dict[str, Dict[str, str]]:
        """Analyze text and suggest normalizations.

        Args:
            df: DataFrame with a ``message`` column.

        Returns:
            Dictionary with the keys ``abbreviations``, ``acronyms``,
            ``misspellings`` and ``datetime_patterns``, each mapping to
            the dictionary returned by the corresponding finder.

        Raises:
            KeyError: If ``df`` has no ``message`` column.
        """
        self.logger.info("Analyzing text patterns for normalization...")

        suggestions = {
            "abbreviations": self._find_abbreviations(df),
            "acronyms": self._find_acronyms(df),
            "misspellings": self._find_misspellings(df),
            "datetime_patterns": self._find_datetime_patterns(df),
        }

        self._save_normalization_suggestions(suggestions)
        return suggestions

    @staticmethod
    def _message_texts(df: pd.DataFrame, lowercase: bool = False):
        """Yield every message as ``str``; missing values become ``""``."""
        for message in df["message"].fillna(""):
            text = str(message)
            yield text.lower() if lowercase else text

    def _find_abbreviations(self, df: pd.DataFrame) -> Dict[str, str]:
        """Find known abbreviations that occur in the messages.

        Matching is case-insensitive and on whole words only.  A token is
        looked up first exactly as written (so "dr." is reported with its
        period) and then without a trailing period (so "appt." at the end
        of a sentence is reported as "appt").

        Args:
            df: DataFrame with a ``message`` column.

        Returns:
            Mapping of each abbreviation found to its expansion.
        """
        self.logger.info("Finding abbreviations...")

        known = self._KNOWN_ABBREVIATIONS
        found_abbrevs = {}

        for text in self._message_texts(df, lowercase=True):
            for token in self._ABBREV_TOKEN.findall(text):
                key = token if token in known else token.rstrip(".")
                if key in known:
                    found_abbrevs[key] = known[key]

        self.logger.info(f"Found {len(found_abbrevs)} abbreviations")
        return found_abbrevs

    def _find_acronyms(self, df: pd.DataFrame) -> Dict[str, str]:
        """Find known acronyms that occur in upper case in the messages.

        Args:
            df: DataFrame with a ``message`` column.

        Returns:
            Mapping of each lower-cased acronym found to its expansion.
        """
        self.logger.info("Finding acronyms...")

        known = self._KNOWN_ACRONYMS
        found_acronyms = {}

        # Case is preserved here on purpose: only upper-case tokens count.
        for text in self._message_texts(df):
            for token in self._ACRONYM_TOKEN.findall(text):
                key = token.lower()
                if key in known:
                    found_acronyms[key] = known[key]

        self.logger.info(f"Found {len(found_acronyms)} acronyms")
        return found_acronyms

    def _find_misspellings(self, df: pd.DataFrame) -> Dict[str, str]:
        """Find known misspellings that occur in the messages.

        Each whitespace-separated word is lower-cased and stripped of all
        characters other than ``a``-``z`` before the lookup.

        Args:
            df: DataFrame with a ``message`` column.

        Returns:
            Mapping of each misspelling found to its correction.
        """
        self.logger.info("Finding common misspellings...")

        known = self._KNOWN_MISSPELLINGS
        found_misspellings = {}

        for text in self._message_texts(df, lowercase=True):
            for word in text.split():
                clean_word = self._NON_LETTERS.sub("", word)
                if clean_word in known:
                    found_misspellings[clean_word] = known[clean_word]

        self.logger.info(f"Found {len(found_misspellings)} misspellings")
        return found_misspellings

    def _find_datetime_patterns(self, df: pd.DataFrame) -> Dict[str, str]:
        """Find which date patterns occur in the messages.

        Args:
            df: DataFrame with a ``message`` column.

        Returns:
            Mapping of pattern name to the regex source string, for every
            pattern that matched at least one lower-cased message.
        """
        self.logger.info("Finding date/time patterns...")

        patterns = {}

        for text in self._message_texts(df, lowercase=True):
            for pattern, pattern_name in self._DATE_PATTERNS:
                # A pattern only needs to be seen once; skipping it
                # afterwards leaves the result and its key order unchanged.
                if pattern_name in patterns:
                    continue
                if re.search(pattern, text):
                    patterns[pattern_name] = pattern

        self.logger.info(f"Found {len(patterns)} date/time patterns")
        return patterns

    @staticmethod
    def _mapping_section(title: str, mapping: Dict[str, str]) -> list:
        """Return report lines for one ``key -> value`` section."""
        lines = [title, "-" * 80]
        lines.extend(
            f" {source:20} -> {target}"
            for source, target in sorted(mapping.items())
        )
        lines.append("")
        return lines

    def _save_normalization_suggestions(self, suggestions: Dict) -> None:
        """Save the suggestions as JSON and as a readable text report.

        Args:
            suggestions: Dictionary with the keys ``abbreviations``,
                ``acronyms``, ``misspellings`` and ``datetime_patterns``.
        """
        self.save_results(suggestions, "normalization_suggestions.json")

        text_output = ["TEXT NORMALIZATION SUGGESTIONS", "=" * 80, ""]
        text_output += self._mapping_section(
            "ABBREVIATIONS TO EXPAND:", suggestions["abbreviations"]
        )
        text_output += self._mapping_section(
            "ACRONYMS TO EXPAND:", suggestions["acronyms"]
        )

        # Unlike the other sections, this one is omitted when empty.
        if suggestions["misspellings"]:
            text_output += self._mapping_section(
                "MISSPELLINGS TO CORRECT:", suggestions["misspellings"]
            )

        text_output.append("DATE/TIME PATTERNS FOUND:")
        text_output.append("-" * 80)
        for pattern_name, pattern in suggestions["datetime_patterns"].items():
            text_output.append(f" {pattern_name}: {pattern}")

        # NOTE(review): "/" assumes output_dir is a pathlib.Path -- confirm
        # against PipelineStep.
        filepath = self.output_dir / "normalization_suggestions.txt"
        # Explicit encoding so the report does not depend on the platform
        # default.
        with open(filepath, "w", encoding="utf-8") as f:
            f.write("\n".join(text_output))

        self.logger.info(f"Saved normalization suggestions to: {filepath}")
|
|
|
-
|
|
|
-
|
|
|
if __name__ == "__main__":
    # Manual smoke run: analyze the sample export and print how many
    # suggestions of each kind were produced.
    import pandas as pd

    messages = pd.read_csv("../_sources/signal_messages.csv")
    report = NormalizationAnalyzer().execute(messages)
    counts = {category: len(found) for category, found in report.items()}

    print("\nNormalization suggestions:")
    print(f" Abbreviations: {counts['abbreviations']}")
    print(f" Acronyms: {counts['acronyms']}")
    print(f" Misspellings: {counts['misspellings']}")
    print(f" Date patterns: {counts['datetime_patterns']}")
|