| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869 |
- {
- "advanced_features": {
- "keyword_identification": {
- "file": "pipeline/steps/step0a_keyword_identification.py",
- "class": "KeywordIdentifier",
- "purpose": "Automatically identify relevant keywords from messages",
- "features": [
- "Extracts words from all messages",
- "Counts word frequencies",
- "Categorizes by type (medical, actions, emotions, etc.)",
- "Filters by minimum frequency threshold",
- "Generates top 100 most frequent words"
- ],
- "output_files": [
- "keyword_analysis.json",
- "keyword_analysis.txt"
- ],
- "categories": [
- "names",
- "medical",
- "locations",
- "actions",
- "emotions",
- "dates",
- "other"
- ]
- },
- "normalization_analysis": {
- "file": "pipeline/steps/step0b_normalization_analysis.py",
- "class": "NormalizationAnalyzer",
- "purpose": "Analyze text patterns and suggest normalizations",
- "features": [
- "Finds abbreviations (dr., appt, etc.)",
- "Identifies acronyms (MSK, ER, ICU, etc.)",
- "Detects common misspellings",
- "Discovers date/time patterns",
- "Generates expansion suggestions"
- ],
- "output_files": [
- "normalization_suggestions.json",
- "normalization_suggestions.txt"
- ],
- "suggestion_types": [
- "abbreviations",
- "acronyms",
- "misspellings",
- "datetime_patterns"
- ]
- },
- "parallel_inference": {
- "file": "pipeline/utils/parallel_inference_runner.py",
- "class": "ParallelInferenceRunner",
- "purpose": "Process LLM inference requests in parallel",
- "features": [
- "Concurrent request processing",
- "Configurable worker count",
- "Automatic error handling",
- "Progress tracking with tqdm",
- "3-4x speedup over sequential processing"
- ],
- "performance": {
- "sequential": "2-3 requests/second",
- "parallel_4_workers": "8-12 requests/second",
- "speedup": "3-4x",
- "example_300_chunks": "25 min (parallel) vs 100 min (sequential)"
- }
- }
- }
- }
|