# justadri/disco
"""
Utility functions for text processing.
"""

import re
from typing import List
import pandas as pd
from pipeline.common_defs import TEXT_EXPANSIONS, ACRONYMS

def _build_key_pattern(keys):
    """
    Compile one regex that matches any of the given keys as a whole word.

    Args:
        keys: Iterable of literal strings (e.g. the keys of a mapping)

    Returns:
        Compiled pattern of the form ``\\b(key1|key2|...)\\b``, or a pattern
        that never matches when there are no keys
    """
    # Longest keys first: regex alternation takes the first alternative that
    # succeeds, so a short key listed early would otherwise shadow a longer
    # key that starts with it (e.g. "a" hiding "a b"). sorted() is stable, so
    # keys of equal length keep their original relative order.
    ordered = sorted(keys, key=len, reverse=True)
    if not ordered:
        # An empty alternation would match the empty string at every word
        # boundary, and the replacement callbacks would then fail looking
        # up "" in the mapping. "(?!)" is a pattern that can never match.
        return re.compile(r"(?!)")
    return re.compile(
        r"\b(" + "|".join(re.escape(k) for k in ordered) + r")\b"
    )


# Single combined pattern for all acronyms (case-sensitive)
_ACRONYM_PATTERN = _build_key_pattern(ACRONYMS)

# Single combined pattern for all expansions (case-sensitive as compiled;
# normalize_text applies it to text that has already been lowercased)
_EXPANSION_PATTERN = _build_key_pattern(TEXT_EXPANSIONS)


def normalize_text(text: str) -> str:
    """
    Expand acronyms and abbreviations and lowercase the text.

    Processing order:
      1. Missing values (per ``pd.isna``) and the empty string yield "".
      2. The input is converted with ``str()``.
      3. Every match of ``_ACRONYM_PATTERN`` is replaced by its ``ACRONYMS``
         entry (performed before lowercasing, so it is case-sensitive).
      4. The result is lowercased.
      5. Every match of ``_EXPANSION_PATTERN`` is replaced by its
         ``TEXT_EXPANSIONS`` entry.

    Args:
        text: Input text to normalize

    Returns:
        Normalized text, or "" for missing/empty input
    """
    # Guard clauses: nothing to normalize
    if pd.isna(text):
        return ""
    if text == "":
        return ""

    raw = str(text)

    # Acronyms are looked up by the exact matched spelling
    with_acronyms = _ACRONYM_PATTERN.sub(lambda m: ACRONYMS[m.group(0)], raw)
    lowered = with_acronyms.lower()

    # Expansions are applied to the already-lowercased text
    return _EXPANSION_PATTERN.sub(
        lambda m: TEXT_EXPANSIONS[m.group(0)], lowered
    )


def extract_keywords(text: str, keywords: List[str]) -> List[str]:
    """
    Extract matching keywords from text, ignoring case.

    A keyword matches when its lowercased form occurs as a substring of the
    lowercased text. Matched keywords are returned with their original
    spelling and in the order they appear in ``keywords``.

    Args:
        text: Text to search; a missing value (None/NaN) yields no matches
        keywords: List of keywords to find

    Returns:
        List of matched keywords
    """
    # Missing values have no .lower(); treat them as text without matches
    if pd.isna(text):
        return []

    text_lower = text.lower()
    # Lowercase each keyword for the comparison only: the text has been
    # lowercased, so a keyword containing uppercase letters could otherwise
    # never match.
    return [kw for kw in keywords if kw.lower() in text_lower]

def calculate_keyword_score(matches: List[str]) -> int:
    """
    Score a list of matched keywords by counting the distinct ones.

    Args:
        matches: List of matched keywords (duplicates allowed)

    Returns:
        Number of unique matches
    """
    # Collapse duplicates, then count what remains
    distinct = {keyword for keyword in matches}
    return len(distinct)