""" Utility functions for text processing. """ import regex as re from typing import List import pandas as pd from pipeline.common_defs import TEXT_EXPANSIONS, ACRONYMS # Build a single combined pattern for all acronyms (case-sensitive) _ACRONYM_PATTERN = re.compile( r"\b(" + "|".join(re.escape(k) for k in ACRONYMS.keys()) + r")\b", flags=re.WORD | re.V1, ) # Build a single combined pattern for all expansions (will be case-insensitive after lowering) _EXPANSION_PATTERN = re.compile( r"\b(" + "|".join(re.escape(k) for k in TEXT_EXPANSIONS.keys()) + r")\b", flags=re.WORD | re.V1, ) def normalize_text(text: str) -> str: """ Normalize text with abbreviation expansion - OPTIMIZED. Args: text: Input text to normalize Returns: Normalized text """ if pd.isna(text) or text == "": return "" text = str(text) # Apply acronym expansions using single pattern def replace_acronym(match): return ACRONYMS[match.group(0)] text = _ACRONYM_PATTERN.sub(replace_acronym, text) text = text.lower() # Apply text expansions using single pattern def replace_expansion(match): return TEXT_EXPANSIONS[match.group(0)] text = _EXPANSION_PATTERN.sub(replace_expansion, text) return text def extract_keywords(text: str, keywords: List[str]) -> List[str]: """ Extract matching keywords from text. Args: text: Text to search keywords: List of keywords to find Returns: List of matched keywords """ text_lower = text.lower() matches = [kw for kw in keywords if kw in text_lower] return matches def calculate_keyword_score(matches: List[str]) -> int: """ Calculate keyword score based on unique matches. Args: matches: List of matched keywords Returns: Number of unique matches """ return len(set(matches))