{
  "advanced_features": {
    "keyword_identification": {
      "file": "pipeline/steps/step0a_keyword_identification.py",
      "class": "KeywordIdentifier",
      "purpose": "Automatically identify relevant keywords from messages",
      "features": [
        "Extracts words from all messages",
        "Counts word frequencies",
        "Categorizes by type (medical, actions, emotions, etc.)",
        "Filters by minimum frequency threshold",
        "Generates top 100 most frequent words"
      ],
      "output_files": [
        "keyword_analysis.json",
        "keyword_analysis.txt"
      ],
      "categories": [
        "names",
        "medical",
        "locations",
        "actions",
        "emotions",
        "dates",
        "other"
      ]
    },
    "normalization_analysis": {
      "file": "pipeline/steps/step0b_normalization_analysis.py",
      "class": "NormalizationAnalyzer",
      "purpose": "Analyze text patterns and suggest normalizations",
      "features": [
        "Finds abbreviations (dr., appt, etc.)",
        "Identifies acronyms (MSK, ER, ICU, etc.)",
        "Detects common misspellings",
        "Discovers date/time patterns",
        "Generates expansion suggestions"
      ],
      "output_files": [
        "normalization_suggestions.json",
        "normalization_suggestions.txt"
      ],
      "suggestion_types": [
        "abbreviations",
        "acronyms",
        "misspellings",
        "datetime_patterns"
      ]
    },
    "parallel_inference": {
      "file": "pipeline/utils/parallel_inference_runner.py",
      "class": "ParallelInferenceRunner",
      "purpose": "Process LLM inference requests in parallel",
      "features": [
        "Concurrent request processing",
        "Configurable worker count",
        "Automatic error handling",
        "Progress tracking with tqdm",
        "3-4x speedup over sequential"
      ],
      "performance": {
        "sequential": "2-3 requests/second",
        "parallel_4_workers": "8-12 requests/second",
        "speedup": "3-4x",
        "example_300_chunks": "25 min vs 100 min"
      }
    }
  }
}