{
  "pipeline_name": "Qwen 3 + Qwen 2.5 Legal Discovery Pipeline",
  "version": "1.0",
  "architecture": "Object-Oriented",
  "configuration": {
    "primary_model": "Qwen 3 235B Instruct",
    "secondary_model": "Qwen 2.5 72B Instruct",
    "total_gpus": "6 \u00d7 A100 80GB",
    "cost_per_hour": "$3.84",
    "total_cost": "$515-968 (including attorney)"
  },
  "pipeline_steps": [
    {
      "step": 1,
      "name": "Load Data",
      "script": "pipeline/steps/step1_load_data.py",
      "class": "DataLoader",
      "description": "Load and preprocess Signal CSV messages"
    },
    {
      "step": 2,
      "name": "Create Chunks",
      "script": "pipeline/steps/step2_create_chunks.py",
      "class": "ChunkCreator",
      "description": "Create overlapping 20-message chunks"
    },
    {
      "step": 3,
      "name": "Keyword Filter",
      "script": "pipeline/steps/step3_keyword_filter.py",
      "class": "KeywordFilter",
      "description": "Filter by case-specific keywords"
    },
    {
      "step": 4,
      "name": "Semantic Filter",
      "script": "pipeline/steps/step4_semantic_filter.py",
      "class": "SemanticFilter",
      "description": "Dual-model semantic filtering"
    },
    {
      "step": 5,
      "name": "Random Sampling",
      "script": "pipeline/steps/step5_random_sampling.py",
      "class": "RandomSampler",
      "description": "Stratified random sampling for attorney"
    },
    {
      "step": 6,
      "name": "Labeling Template",
      "script": "pipeline/steps/step6_labeling_template.py",
      "class": "LabelingTemplateGenerator",
      "description": "Generate attorney labeling template"
    },
    {
      "step": 7,
      "name": "Inference Prep",
      "script": "pipeline/steps/step7_inference_prep.py",
      "class": "InferencePreparation",
      "description": "Prepare dual Qwen inference requests"
    },
    {
      "step": 8,
      "name": "Merge Results",
      "script": "pipeline/steps/step8_merge_results.py",
      "class": "ResultsMerger",
      "description": "Merge dual-model results with confidence"
    }
  ],
  "utilities": [
    {
      "name": "Text Utils",
      "script": "pipeline/utils/text_utils.py",
      "functions": [
        "normalize_text",
        "extract_keywords",
        "calculate_keyword_score"
      ]
    },
    {
      "name": "Deployment Helper",
      "script": "pipeline/utils/deployment_helper.py",
      "class": "ModelDeployer",
      "description": "Helper for deploying Qwen models on Vast.ai"
    },
    {
      "name": "Inference Runner",
      "script": "pipeline/utils/inference_runner.py",
      "class": "InferenceRunner",
      "description": "Run inference on dual Qwen models"
    }
  ],
  "core_modules": [
    {
      "name": "Common Definitions",
      "script": "pipeline/common_defs.py",
      "contains": [
        "Case criteria",
        "Model configs",
        "Data classes",
        "Constants"
      ]
    },
    {
      "name": "Base Classes",
      "script": "pipeline/models/base.py",
      "contains": [
        "PipelineStep abstract class",
        "Logging setup",
        "File I/O"
      ]
    },
    {
      "name": "Main Pipeline",
      "script": "pipeline/main_pipeline.py",
      "class": "DiscoveryPipeline",
      "description": "Main orchestrator for running all steps"
    }
  ],
  "expected_performance": {
    "recall": "88-97%",
    "precision": "65-85%",
    "high_confidence_cases": "60-70%",
    "medium_confidence_cases": "25-35%",
    "low_confidence_cases": "5-10%"
  },
  "file_structure": {
    "total_files": 17,
    "total_size_kb": 59.4,
    "python_modules": 14,
    "documentation": 1,
    "config_files": 1,
    "scripts": 1
  }
}