| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677 |
- """
- Utility functions for text processing.
- """
- import re
- from typing import List
- import pandas as pd
- from pipeline.common_defs import TEXT_EXPANSIONS, ACRONYMS
def _compile_key_pattern(keys) -> "re.Pattern[str]":
    """
    Compile one whole-word alternation pattern matching any of ``keys``.

    Args:
        keys: Iterable of literal strings to match (e.g. a dict, which
            iterates over its keys).

    Returns:
        A compiled pattern whose match text is always one of ``keys``, or a
        pattern that never matches when ``keys`` is empty.
    """
    # Longest keys first: regex alternation accepts the first alternative
    # that matches, so a short key listed earlier would otherwise shadow a
    # longer key that begins with it (e.g. "AB" hiding "AB CD").
    ordered = sorted(keys, key=len, reverse=True)
    if not ordered:
        # An empty alternation would match the empty string at every word
        # boundary, and the replacement callbacks would then look up "" in
        # the mapping. "(?!)" is a pattern that can never match.
        return re.compile(r"(?!)")
    return re.compile(
        r"\b(" + "|".join(re.escape(k) for k in ordered) + r")\b"
    )


# Single combined pattern for all acronyms (case-sensitive)
_ACRONYM_PATTERN = _compile_key_pattern(ACRONYMS)

# Single combined pattern for all expansions (applied to already-lowercased text)
_EXPANSION_PATTERN = _compile_key_pattern(TEXT_EXPANSIONS)
def normalize_text(text: str) -> str:
    """
    Expand acronyms and abbreviations in ``text`` and lowercase it.

    Acronyms are substituted first, on the text as given; the result is
    then lowercased and the text expansions are substituted on the
    lowercased string.

    Args:
        text: Input text to normalize. Values for which ``pd.isna`` is true
            and the empty string are treated as "no text".

    Returns:
        The normalized text, or "" when there was no text to normalize.
    """
    # Guard clauses: nothing to normalize for missing or empty input.
    if pd.isna(text):
        return ""
    if text == "":
        return ""

    # Acronym pass runs before lowercasing.
    with_acronyms = _ACRONYM_PATTERN.sub(
        lambda found: ACRONYMS[found.group(0)], str(text)
    )

    # Expansion pass runs on the lowercased text.
    lowered = with_acronyms.lower()
    return _EXPANSION_PATTERN.sub(
        lambda found: TEXT_EXPANSIONS[found.group(0)], lowered
    )
def extract_keywords(text: str, keywords: List[str]) -> List[str]:
    """
    Extract matching keywords from text (case-insensitive substring match).

    Args:
        text: Text to search. Missing values (``None``/NaN) and the empty
            string yield no matches.
        keywords: List of keywords to find

    Returns:
        The keywords (as given, in their original order) that occur in
        ``text`` when both are compared in lowercase.
    """
    # Same missing-value guard as normalize_text, so NaN cells coming from a
    # DataFrame do not raise on ``.lower()``.
    if pd.isna(text) or text == "":
        return []

    text_lower = str(text).lower()
    # Lowercase each keyword as well: the text has been lowercased, so a
    # keyword containing uppercase letters could otherwise never match.
    return [kw for kw in keywords if kw.lower() in text_lower]
def calculate_keyword_score(matches: List[str]) -> int:
    """
    Score a list of keyword matches by counting its distinct entries.

    Args:
        matches: List of matched keywords (duplicates allowed)

    Returns:
        How many different keywords appear in ``matches``
    """
    distinct = frozenset(matches)
    return len(distinct)
|