# justadri / disco
"""
Utility functions for text processing.
"""

import re
from typing import List
import pandas as pd
from pipeline.common_defs import TEXT_EXPANSIONS, ACRONYMS

def _expand_whole_words(text: str, expansions: dict) -> str:
    """
    Replace every whole-word occurrence of each key with its value.

    Args:
        text: Text to rewrite
        expansions: Mapping of abbreviation -> replacement text, applied in
            the mapping's iteration order

    Returns:
        Text with all expansions applied
    """
    for abbr, full in expansions.items():
        # Use \b for word boundaries to only match complete words
        pattern = r"\b" + re.escape(abbr) + r"\b"
        # A callable replacement is inserted literally. Passing the string
        # itself would make re.sub process backslashes in it as escapes or
        # group references, which can raise re.error or alter the text.
        text = re.sub(pattern, lambda _match, replacement=full: replacement, text)
    return text


def normalize_text(text: str) -> str:
    """
    Normalize text with abbreviation expansion.

    Entries of ``ACRONYMS`` are expanded first with a case-sensitive
    whole-word match, the text is then lower-cased, and finally the entries
    of ``TEXT_EXPANSIONS`` are applied to the lower-cased text.

    Args:
        text: Input text to normalize. Missing values (as detected by
            ``pd.isna``) and the empty string are accepted.

    Returns:
        Normalized text, or an empty string for missing/empty input
    """
    if pd.isna(text) or text == '':
        return ""

    text = str(text)

    # Acronyms are expanded before lower-casing, so their match is
    # case-sensitive
    text = _expand_whole_words(text, ACRONYMS)

    text = text.lower()

    # Remaining expansions are matched against the lower-cased text
    return _expand_whole_words(text, TEXT_EXPANSIONS)

def extract_keywords(text: str, keywords: List[str]) -> List[str]:
    """
    Extract matching keywords from text.

    Matching is a case-insensitive substring test: both the text and each
    keyword are lower-cased before comparison, so a keyword also matches
    when it occurs inside a longer word.

    Args:
        text: Text to search
        keywords: List of keywords to find

    Returns:
        List of matched keywords, spelled as given and in the order of
        ``keywords``
    """
    text_lower = text.lower()
    # Lower-case the keyword too; otherwise any keyword containing an
    # upper-case letter could never match the lower-cased text.
    return [kw for kw in keywords if kw.lower() in text_lower]

def calculate_keyword_score(matches: List[str]) -> int:
    """
    Score a list of keyword matches by counting distinct entries.

    Args:
        matches: Matched keywords; duplicates are allowed

    Returns:
        Count of distinct keywords in ``matches``
    """
    # Collapsing into a set drops repeated keywords before counting
    distinct_keywords = {*matches}
    return len(distinct_keywords)