text_utils.py 1.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
  1. """
  2. Utility functions for text processing.
  3. """
  4. import re
  5. from typing import List
  6. import pandas as pd
  7. from pipeline.common_defs import TEXT_EXPANSIONS, ACRONYMS
  8. def normalize_text(text: str) -> str:
  9. """
  10. Normalize text with abbreviation expansion.
  11. Args:
  12. text: Input text to normalize
  13. Returns:
  14. Normalized text
  15. """
  16. if pd.isna(text) or text == '':
  17. return ""
  18. text = str(text)
  19. # Apply expansions for acronyms
  20. for abbr, full in ACRONYMS.items():
  21. # Use \b for word boundaries to only match complete words
  22. pattern = r"\b" + re.escape(abbr) + r"\b"
  23. text = re.sub(pattern, full, text)
  24. text = text.lower()
  25. # Apply expansions
  26. for abbr, full in TEXT_EXPANSIONS.items():
  27. # Use \b for word boundaries to only match complete words
  28. pattern = r"\b" + re.escape(abbr) + r"\b"
  29. text = re.sub(pattern, full, text)
  30. return text
  31. def extract_keywords(text: str, keywords: List[str]) -> List[str]:
  32. """
  33. Extract matching keywords from text.
  34. Args:
  35. text: Text to search
  36. keywords: List of keywords to find
  37. Returns:
  38. List of matched keywords
  39. """
  40. text_lower = text.lower()
  41. matches = [kw for kw in keywords if kw in text_lower]
  42. return matches
  43. def calculate_keyword_score(matches: List[str]) -> int:
  44. """
  45. Calculate keyword score based on unique matches.
  46. Args:
  47. matches: List of matched keywords
  48. Returns:
  49. Number of unique matches
  50. """
  51. return len(set(matches))