{ "pipeline_name": "Qwen 3 + Qwen 2.5 Legal Discovery Pipeline", "version": "1.0", "architecture": "Object-Oriented", "configuration": { "primary_model": "Qwen 3 235B Instruct", "secondary_model": "Qwen 2.5 72B Instruct", "total_gpus": "6 \u00d7 A100 80GB", "cost_per_hour": "$3.84", "total_cost": "$515-968 (including attorney)" }, "pipeline_steps": [ { "step": 1, "name": "Load Data", "script": "pipeline/steps/step1_load_data.py", "class": "DataLoader", "description": "Load and preprocess Signal CSV messages" }, { "step": 2, "name": "Create Chunks", "script": "pipeline/steps/step2_create_chunks.py", "class": "ChunkCreator", "description": "Create overlapping 20-message chunks" }, { "step": 3, "name": "Keyword Filter", "script": "pipeline/steps/step3_keyword_filter.py", "class": "KeywordFilter", "description": "Filter by case-specific keywords" }, { "step": 4, "name": "Semantic Filter", "script": "pipeline/steps/step4_semantic_filter.py", "class": "SemanticFilter", "description": "Dual-model semantic filtering" }, { "step": 5, "name": "Random Sampling", "script": "pipeline/steps/step5_random_sampling.py", "class": "RandomSampler", "description": "Stratified random sampling for attorney labeling" }, { "step": 6, "name": "Labeling Template", "script": "pipeline/steps/step6_labeling_template.py", "class": "LabelingTemplateGenerator", "description": "Generate attorney labeling template" }, { "step": 7, "name": "Inference Prep", "script": "pipeline/steps/step7_inference_prep.py", "class": "InferencePreparation", "description": "Prepare dual Qwen inference requests" }, { "step": 8, "name": "Merge Results", "script": "pipeline/steps/step8_merge_results.py", "class": "ResultsMerger", "description": "Merge dual-model results with confidence" } ], "utilities": [ { "name": "Text Utils", "script": "pipeline/utils/text_utils.py", "functions": [ "normalize_text", "extract_keywords", "calculate_keyword_score" ] }, { "name": "Deployment Helper", "script": 
"pipeline/utils/deployment_helper.py", "class": "ModelDeployer", "description": "Helper for deploying Qwen models on Vast.ai" }, { "name": "Inference Runner", "script": "pipeline/utils/inference_runner.py", "class": "InferenceRunner", "description": "Run inference on dual Qwen models" } ], "core_modules": [ { "name": "Common Definitions", "script": "pipeline/common_defs.py", "contains": [ "Case criteria", "Model configs", "Data classes", "Constants" ] }, { "name": "Base Classes", "script": "pipeline/models/base.py", "contains": [ "PipelineStep abstract class", "Logging setup", "File I/O" ] }, { "name": "Main Pipeline", "script": "pipeline/main_pipeline.py", "class": "DiscoveryPipeline", "description": "Main orchestrator for running all steps" } ], "expected_performance": { "recall": "88-97%", "precision": "65-85%", "high_confidence_cases": "60-70%", "medium_confidence_cases": "25-35%", "low_confidence_cases": "5-10%" }, "file_structure": { "total_files": 17, "total_size_kb": 59.4, "python_modules": 14, "documentation": 1, "config_files": 1, "scripts": 1 } }