""" Utility functions for text processing. """ import re from typing import List import pandas as pd from pipeline.common_defs import TEXT_EXPANSIONS, ACRONYMS def normalize_text(text: str) -> str: """ Normalize text with abbreviation expansion. Args: text: Input text to normalize Returns: Normalized text """ if pd.isna(text) or text == '': return "" text = str(text) # Apply expansions for acronyms for abbr, full in ACRONYMS.items(): # Use \b for word boundaries to only match complete words pattern = r"\b" + re.escape(abbr) + r"\b" text = re.sub(pattern, full, text) text = text.lower() # Apply expansions for abbr, full in TEXT_EXPANSIONS.items(): # Use \b for word boundaries to only match complete words pattern = r"\b" + re.escape(abbr) + r"\b" text = re.sub(pattern, full, text) return text def extract_keywords(text: str, keywords: List[str]) -> List[str]: """ Extract matching keywords from text. Args: text: Text to search keywords: List of keywords to find Returns: List of matched keywords """ text_lower = text.lower() matches = [kw for kw in keywords if kw in text_lower] return matches def calculate_keyword_score(matches: List[str]) -> int: """ Calculate keyword score based on unique matches. Args: matches: List of matched keywords Returns: Number of unique matches """ return len(set(matches))