text_utils.py 1.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677
  1. """
  2. Utility functions for text processing.
  3. """
  4. import re
  5. from typing import List
  6. import pandas as pd
  7. from pipeline.common_defs import TEXT_EXPANSIONS, ACRONYMS
  8. # Build a single combined pattern for all acronyms (case-sensitive)
  9. _ACRONYM_PATTERN = re.compile(
  10. r"\b(" + "|".join(re.escape(k) for k in ACRONYMS.keys()) + r")\b"
  11. )
  12. # Build a single combined pattern for all expansions (will be case-insensitive after lowering)
  13. _EXPANSION_PATTERN = re.compile(
  14. r"\b(" + "|".join(re.escape(k) for k in TEXT_EXPANSIONS.keys()) + r")\b"
  15. )
  16. def normalize_text(text: str) -> str:
  17. """
  18. Normalize text with abbreviation expansion - OPTIMIZED.
  19. Args:
  20. text: Input text to normalize
  21. Returns:
  22. Normalized text
  23. """
  24. if pd.isna(text) or text == "":
  25. return ""
  26. text = str(text)
  27. # Apply acronym expansions using single pattern
  28. def replace_acronym(match):
  29. return ACRONYMS[match.group(0)]
  30. text = _ACRONYM_PATTERN.sub(replace_acronym, text)
  31. text = text.lower()
  32. # Apply text expansions using single pattern
  33. def replace_expansion(match):
  34. return TEXT_EXPANSIONS[match.group(0)]
  35. text = _EXPANSION_PATTERN.sub(replace_expansion, text)
  36. return text
  37. def extract_keywords(text: str, keywords: List[str]) -> List[str]:
  38. """
  39. Extract matching keywords from text.
  40. Args:
  41. text: Text to search
  42. keywords: List of keywords to find
  43. Returns:
  44. List of matched keywords
  45. """
  46. text_lower = text.lower()
  47. matches = [kw for kw in keywords if kw in text_lower]
  48. return matches
  49. def calculate_keyword_score(matches: List[str]) -> int:
  50. """
  51. Calculate keyword score based on unique matches.
  52. Args:
  53. matches: List of matched keywords
  54. Returns:
  55. Number of unique matches
  56. """
  57. return len(set(matches))