text_utils.py 1.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879
  1. """
  2. Utility functions for text processing.
  3. """
  4. import regex as re
  5. from typing import List
  6. import pandas as pd
  7. from pipeline.common_defs import TEXT_EXPANSIONS, ACRONYMS
  8. # Build a single combined pattern for all acronyms (case-sensitive)
  9. _ACRONYM_PATTERN = re.compile(
  10. r"\b(" + "|".join(re.escape(k) for k in ACRONYMS.keys()) + r")\b",
  11. flags=re.WORD | re.V1,
  12. )
  13. # Build a single combined pattern for all expansions (will be case-insensitive after lowering)
  14. _EXPANSION_PATTERN = re.compile(
  15. r"\b(" + "|".join(re.escape(k) for k in TEXT_EXPANSIONS.keys()) + r")\b",
  16. flags=re.WORD | re.V1,
  17. )
  18. def normalize_text(text: str) -> str:
  19. """
  20. Normalize text with abbreviation expansion - OPTIMIZED.
  21. Args:
  22. text: Input text to normalize
  23. Returns:
  24. Normalized text
  25. """
  26. if pd.isna(text) or text == "":
  27. return ""
  28. text = str(text)
  29. # Apply acronym expansions using single pattern
  30. def replace_acronym(match):
  31. return ACRONYMS[match.group(0)]
  32. text = _ACRONYM_PATTERN.sub(replace_acronym, text)
  33. text = text.lower()
  34. # Apply text expansions using single pattern
  35. def replace_expansion(match):
  36. return TEXT_EXPANSIONS[match.group(0)]
  37. text = _EXPANSION_PATTERN.sub(replace_expansion, text)
  38. return text
  39. def extract_keywords(text: str, keywords: List[str]) -> List[str]:
  40. """
  41. Extract matching keywords from text.
  42. Args:
  43. text: Text to search
  44. keywords: List of keywords to find
  45. Returns:
  46. List of matched keywords
  47. """
  48. text_lower = text.lower()
  49. matches = [kw for kw in keywords if kw in text_lower]
  50. return matches
  51. def calculate_keyword_score(matches: List[str]) -> int:
  52. """
  53. Calculate keyword score based on unique matches.
  54. Args:
  55. matches: List of matched keywords
  56. Returns:
  57. Number of unique matches
  58. """
  59. return len(set(matches))