PIPELINE_SUMMARY.json 3.6 KB

  1. {
  2. "pipeline_name": "Qwen 3 + Qwen 2.5 Legal Discovery Pipeline",
  3. "version": "1.0",
  4. "architecture": "Object-Oriented",
  5. "configuration": {
  6. "primary_model": "Qwen 3 235B Instruct",
  7. "secondary_model": "Qwen 2.5 72B Instruct",
  8. "total_gpus": "6 × A100 80GB",
  9. "cost_per_hour": "$3.84",
  10. "total_cost": "$515-968 (including attorney)"
  11. },
  12. "pipeline_steps": [
  13. {
  14. "step": 1,
  15. "name": "Load Data",
  16. "script": "pipeline/steps/step1_load_data.py",
  17. "class": "DataLoader",
  18. "description": "Load and preprocess Signal CSV messages"
  19. },
  20. {
  21. "step": 2,
  22. "name": "Create Chunks",
  23. "script": "pipeline/steps/step2_create_chunks.py",
  24. "class": "ChunkCreator",
  25. "description": "Create overlapping 20-message chunks"
  26. },
  27. {
  28. "step": 3,
  29. "name": "Keyword Filter",
  30. "script": "pipeline/steps/step3_keyword_filter.py",
  31. "class": "KeywordFilter",
  32. "description": "Filter by case-specific keywords"
  33. },
  34. {
  35. "step": 4,
  36. "name": "Semantic Filter",
  37. "script": "pipeline/steps/step4_semantic_filter.py",
  38. "class": "SemanticFilter",
  39. "description": "Dual-model semantic filtering"
  40. },
  41. {
  42. "step": 5,
  43. "name": "Random Sampling",
  44. "script": "pipeline/steps/step5_random_sampling.py",
  45. "class": "RandomSampler",
  46. "description": "Stratified random sampling for attorney labeling"
  47. },
  48. {
  49. "step": 6,
  50. "name": "Labeling Template",
  51. "script": "pipeline/steps/step6_labeling_template.py",
  52. "class": "LabelingTemplateGenerator",
  53. "description": "Generate attorney labeling template"
  54. },
  55. {
  56. "step": 7,
  57. "name": "Inference Prep",
  58. "script": "pipeline/steps/step7_inference_prep.py",
  59. "class": "InferencePreparation",
  60. "description": "Prepare dual Qwen inference requests"
  61. },
  62. {
  63. "step": 8,
  64. "name": "Merge Results",
  65. "script": "pipeline/steps/step8_merge_results.py",
  66. "class": "ResultsMerger",
  67. "description": "Merge dual-model results with confidence"
  68. }
  69. ],
  70. "utilities": [
  71. {
  72. "name": "Text Utils",
  73. "script": "pipeline/utils/text_utils.py",
  74. "functions": [
  75. "normalize_text",
  76. "extract_keywords",
  77. "calculate_keyword_score"
  78. ]
  79. },
  80. {
  81. "name": "Deployment Helper",
  82. "script": "pipeline/utils/deployment_helper.py",
  83. "class": "ModelDeployer",
  84. "description": "Helper for deploying Qwen models on Vast.ai"
  85. },
  86. {
  87. "name": "Inference Runner",
  88. "script": "pipeline/utils/inference_runner.py",
  89. "class": "InferenceRunner",
  90. "description": "Run inference on dual Qwen models"
  91. }
  92. ],
  93. "core_modules": [
  94. {
  95. "name": "Common Definitions",
  96. "script": "pipeline/common_defs.py",
  97. "contains": [
  98. "Case criteria",
  99. "Model configs",
  100. "Data classes",
  101. "Constants"
  102. ]
  103. },
  104. {
  105. "name": "Base Classes",
  106. "script": "pipeline/models/base.py",
  107. "contains": [
  108. "PipelineStep abstract class",
  109. "Logging setup",
  110. "File I/O"
  111. ]
  112. },
  113. {
  114. "name": "Main Pipeline",
  115. "script": "pipeline/main_pipeline.py",
  116. "class": "DiscoveryPipeline",
  117. "description": "Main orchestrator for running all steps"
  118. }
  119. ],
  120. "expected_performance": {
  121. "recall": "88-97%",
  122. "precision": "65-85%",
  123. "high_confidence_cases": "60-70%",
  124. "medium_confidence_cases": "25-35%",
  125. "low_confidence_cases": "5-10%"
  126. },
  127. "file_structure": {
  128. "total_files": 17,
  129. "total_size_kb": 59.4,
  130. "python_modules": 14,
  131. "documentation": 1,
  132. "config_files": 1,
  133. "scripts": 1
  134. }
  135. }