{ "advanced_features": { "keyword_identification": { "file": "pipeline/steps/step0a_keyword_identification.py", "class": "KeywordIdentifier", "purpose": "Automatically identify relevant keywords from messages", "features": [ "Extracts words from all messages", "Counts word frequencies", "Categorizes by type (medical, actions, emotions, etc.)", "Filters by minimum frequency threshold", "Generates top 100 most frequent words" ], "output_files": [ "keyword_analysis.json", "keyword_analysis.txt" ], "categories": [ "names", "medical", "locations", "actions", "emotions", "dates", "other" ] }, "normalization_analysis": { "file": "pipeline/steps/step0b_normalization_analysis.py", "class": "NormalizationAnalyzer", "purpose": "Analyze text patterns and suggest normalizations", "features": [ "Finds abbreviations (dr., appt, etc.)", "Identifies acronyms (MSK, ER, ICU, etc.)", "Detects common misspellings", "Discovers date/time patterns", "Generates expansion suggestions" ], "output_files": [ "normalization_suggestions.json", "normalization_suggestions.txt" ], "suggestion_types": [ "abbreviations", "acronyms", "misspellings", "datetime_patterns" ] }, "parallel_inference": { "file": "pipeline/utils/parallel_inference_runner.py", "class": "ParallelInferenceRunner", "purpose": "Process LLM inference requests in parallel", "features": [ "Concurrent request processing", "Configurable worker count", "Automatic error handling", "Progress tracking with tqdm", "3-4x speedup over sequential" ], "performance": { "sequential": "2-3 requests/minute", "parallel_4_workers": "8-12 requests/minute", "speedup": "3-4x", "example_300_chunks": "25 min (parallel) vs 100 min (sequential)" } } } }