| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566 |
- """
- Utility functions for text processing.
- """
- import re
- from typing import List
- import pandas as pd
- from pipeline.common_defs import TEXT_EXPANSIONS, ACRONYMS
def normalize_text(text: str) -> str:
    """
    Normalize text by expanding acronyms and abbreviations and lowercasing.

    Acronyms from ACRONYMS are matched case-sensitively against the original
    text. The result is then lowercased, and the abbreviations from
    TEXT_EXPANSIONS are matched against the lowercased text. Only complete
    words are replaced.

    Args:
        text: Input text to normalize. Missing values (as detected by
            ``pd.isna``) and the empty string are accepted.

    Returns:
        Normalized, lowercased text, or "" when the input is missing or empty.
    """
    if pd.isna(text) or text == '':
        return ""
    text = str(text)
    # Acronyms must be expanded before lowercasing: their match is
    # case-sensitive.
    text = _expand_whole_words(text, ACRONYMS)
    text = text.lower()
    return _expand_whole_words(text, TEXT_EXPANSIONS)


def _expand_whole_words(text: str, expansions) -> str:
    """Replace every whole-word occurrence of each key with its value."""
    for abbr, full in expansions.items():
        # \b word boundaries restrict the match to complete words.
        pattern = r"\b" + re.escape(abbr) + r"\b"
        # A callable replacement inserts the expansion verbatim. Passing the
        # string itself would make re.sub interpret backslashes in it as
        # escapes or group references.
        text = re.sub(pattern, lambda _match, full=full: full, text)
    return text
def extract_keywords(text: str, keywords: List[str]) -> List[str]:
    """
    Extract matching keywords from text, ignoring case.

    A keyword matches when it occurs anywhere in the text as a substring;
    both sides are lowercased for the comparison.

    Args:
        text: Text to search. Missing values (as detected by ``pd.isna``)
            and the empty string yield no matches.
        keywords: List of keywords to find

    Returns:
        List of matched keywords, in the order and spelling given in
        ``keywords``
    """
    # Missing or empty text cannot contain any keyword; return early instead
    # of failing on ``.lower()``.
    if pd.isna(text) or text == '':
        return []
    text_lower = str(text).lower()
    # Lowercase the keyword as well: the text has been lowercased, so a
    # keyword with uppercase letters could otherwise never match.
    return [kw for kw in keywords if kw.lower() in text_lower]
def calculate_keyword_score(matches: List[str]) -> int:
    """
    Score a list of keyword matches by counting its distinct entries.

    Args:
        matches: List of matched keywords; repeated entries count once

    Returns:
        Number of distinct keywords in ``matches``
    """
    distinct_matches = set(matches)
    score = len(distinct_matches)
    return score
|