| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758 |
- """
- Utility functions for text processing.
- """
- import re
- from typing import List
- import pandas as pd
- from pipeline.common_defs import TEXT_EXPANSIONS
def normalize_text(text: str) -> str:
    """
    Normalize text by lowercasing it and expanding known abbreviations.

    Args:
        text: Input text to normalize. Values that ``pd.isna`` reports as
            missing, and the empty string, produce ``""``. Any other
            non-string value is converted with ``str()`` first.

    Returns:
        The lowercased text in which each whole-word occurrence of a
        ``TEXT_EXPANSIONS`` key has been replaced by its mapped value.
    """
    if pd.isna(text) or text == '':
        return ""

    text = str(text).lower()

    # Expansions are applied one after another in the mapping's iteration
    # order, so a later entry can also match text inserted by an earlier one.
    # Keys are matched against the lowercased text, so a key containing
    # uppercase characters cannot match.
    for abbr, full in TEXT_EXPANSIONS.items():
        # Use \b for word boundaries to only match complete words.
        # NOTE(review): \b only anchors next to a word character, so a key
        # that starts or ends with punctuation (e.g. "st.") will not match
        # when followed by whitespace -- confirm against TEXT_EXPANSIONS.
        pattern = r"\b" + re.escape(abbr) + r"\b"
        # A callable replacement is inserted verbatim. Passing the string
        # directly would make re.sub process backslashes in the expansion
        # as escapes / group references.
        text = re.sub(pattern, lambda _match, expansion=full: expansion, text)
    return text
def extract_keywords(text: str, keywords: List[str]) -> List[str]:
    """
    Extract the keywords that occur in the text, ignoring case.

    Matching is a case-insensitive substring test: both the text and each
    keyword are lowercased before comparison, so a keyword also matches
    when it appears inside a longer word.

    Args:
        text: Text to search. Values that ``pd.isna`` reports as missing
            produce an empty result; any other non-string value is
            converted with ``str()`` first.
        keywords: List of keywords to find

    Returns:
        The matched keywords, exactly as supplied and in their input order
    """
    if not isinstance(text, str):
        # Treat missing values as "no text" rather than failing on .lower(),
        # in line with how normalize_text handles them.
        if pd.isna(text):
            return []
        text = str(text)

    haystack = text.lower()
    # Lowercase the keyword for the comparison only; the caller gets back
    # the keyword in its original spelling.
    return [kw for kw in keywords if kw.lower() in haystack]
def calculate_keyword_score(matches: List[str]) -> int:
    """
    Score a list of keyword matches by counting the distinct entries.

    Args:
        matches: List of matched keywords; duplicates are counted once

    Returns:
        Number of distinct keywords in ``matches``
    """
    distinct_matches = {keyword for keyword in matches}
    score = len(distinct_matches)
    return score
|