text_utils.py 1.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758
  1. """
  2. Utility functions for text processing.
  3. """
import re
from typing import List, Mapping, Optional

import pandas as pd

from pipeline.common_defs import TEXT_EXPANSIONS
  8. def normalize_text(text: str) -> str:
  9. """
  10. Normalize text with abbreviation expansion.
  11. Args:
  12. text: Input text to normalize
  13. Returns:
  14. Normalized text
  15. """
  16. if pd.isna(text) or text == '':
  17. return ""
  18. text = str(text).lower()
  19. # Apply expansions
  20. for abbr, full in TEXT_EXPANSIONS.items():
  21. # Use \b for word boundaries to only match complete words
  22. pattern = r"\b" + re.escape(abbr) + r"\b"
  23. text = re.sub(pattern, full, text)
  24. return text
  25. def extract_keywords(text: str, keywords: List[str]) -> List[str]:
  26. """
  27. Extract matching keywords from text.
  28. Args:
  29. text: Text to search
  30. keywords: List of keywords to find
  31. Returns:
  32. List of matched keywords
  33. """
  34. text_lower = text.lower()
  35. matches = [kw for kw in keywords if kw in text_lower]
  36. return matches
  37. def calculate_keyword_score(matches: List[str]) -> int:
  38. """
  39. Calculate keyword score based on unique matches.
  40. Args:
  41. matches: List of matched keywords
  42. Returns:
  43. Number of unique matches
  44. """
  45. return len(set(matches))