justadri
/
disco


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190
							#!/usr/bin/env python3
"""
Random Sample Selector for Attorney Labeling
Selects representative messages from filtered candidates for few-shot learning
"""

import pandas as pd
import random
import json
from pathlib import Path
from datetime import datetime

class RandomSampleSelector:
    """
    Selects random representative samples for attorney labeling.

    Workflow: draw a (optionally stratified) random sample of candidate
    messages, write it out as a JSON + plain-text labeling template, then
    turn the attorney's completed labels into few-shot prompt examples.
    Stratification is done on a single caller-chosen column.
    """

    def __init__(self, output_dir='./labeling_samples'):
        """
        Args:
            output_dir: Directory that receives the labeling templates.
                Created (including missing parents) if it does not exist.
        """
        self.output_dir = Path(output_dir)
        # parents=True so nested paths such as 'out/run1/samples' work.
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def select_stratified_sample(self, messages_df, n_samples=20,
                                 stratify_by='sender', seed=42):
        """
        Select stratified random sample ensuring diversity.

        Each distinct value of ``stratify_by`` contributes up to
        ``max(1, n_samples // number_of_strata)`` rows. If that leaves the
        sample short, the remainder is drawn at random from rows that were
        not picked yet. The result is shuffled and capped at ``n_samples``.

        Args:
            messages_df: DataFrame with filtered candidate messages
            n_samples: Number of samples to select
            stratify_by: Column to stratify by ('sender', 'date', etc.).
                If the column is missing, a plain random sample is drawn.
            seed: Random seed for reproducibility

        Returns:
            List of row dicts (``DataFrame.to_dict('records')`` format),
            never longer than ``min(n_samples, len(messages_df))`` and never
            containing the same row twice.
        """
        print(f"\nSelecting {n_samples} samples stratified by {stratify_by}...")

        # Cannot return more rows than exist; also covers empty input and
        # non-positive requests without dividing by zero below.
        n_target = min(n_samples, len(messages_df))
        if n_target <= 0:
            print("Selected 0 samples")
            return []

        if stratify_by not in messages_df.columns:
            # Simple random sample if stratify column doesn't exist
            selected = messages_df.sample(
                n=n_target, random_state=seed
            ).to_dict('records')
            print(f"Selected {len(selected)} samples")
            return selected

        # Private RNG: same shuffle order as seeding the global module RNG,
        # but without clobbering random state for the rest of the process.
        rng = random.Random(seed)

        # Positional index guarantees unique labels, so "already picked"
        # bookkeeping works regardless of the caller's index or of whether
        # a 'line_number' column exists. to_dict('records') ignores the index.
        pool = messages_df.reset_index(drop=True)

        # sort=False keeps first-appearance order; dropna=False keeps rows
        # whose stratify value is missing as their own stratum.
        strata = pool.groupby(stratify_by, sort=False, dropna=False)
        per_stratum = max(1, n_samples // strata.ngroups)

        selected = []
        taken = []
        for _, stratum_rows in strata:
            picked = stratum_rows.sample(
                n=min(per_stratum, len(stratum_rows)), random_state=seed
            )
            taken.extend(picked.index)
            selected.extend(picked.to_dict('records'))

        # If we need more samples, randomly select from rows not yet taken.
        # shortfall <= len(remaining) because n_target <= len(pool).
        shortfall = n_target - len(selected)
        if shortfall > 0:
            remaining = pool.drop(index=taken)
            extra = remaining.sample(n=shortfall, random_state=seed)
            selected.extend(extra.to_dict('records'))

        # Shuffle final selection, then trim (there may be more strata
        # than requested samples).
        rng.shuffle(selected)
        selected = selected[:n_samples]

        print(f"Selected {len(selected)} samples")
        return selected

    def create_labeling_template(self, samples, context_window=3):
        """
        Create attorney labeling template entries from sampled rows.

        Args:
            samples: Iterable of row dicts (as returned by
                ``select_stratified_sample``).
            context_window: Only reported in the progress message for now;
                surrounding context is taken from the sample's own
                'context_before' / 'context_after' keys when present,
                because the full dataset is not available here.

        Returns:
            List of dicts with the message fields plus empty 'responsive',
            'reasoning' and 'criteria_matched' slots for the attorney.
        """
        print(f"\nCreating labeling template with context window of {context_window}...")

        labeling_data = []

        for i, sample in enumerate(samples, 1):
            entry = {
                'sample_id': i,
                # Fall back to the 1-based sample position when absent.
                'line_number': sample.get('line_number', i),
                'timestamp': sample.get('timestamp', ''),
                'sender': sample.get('sender', ''),
                'message': sample.get('message', ''),
                'context_before': sample.get('context_before', []),
                'context_after': sample.get('context_after', []),
                'responsive': '',  # Attorney fills this
                'reasoning': '',   # Attorney fills this
                'criteria_matched': []  # Attorney fills this
            }
            labeling_data.append(entry)

        return labeling_data

    @staticmethod
    def _json_default(value):
        """Serialize date-like and numpy-scalar-like values for json.dump."""
        if hasattr(value, 'isoformat'):   # datetime / date / pandas Timestamp
            return value.isoformat()
        if hasattr(value, 'item'):        # numpy scalar -> Python scalar
            return value.item()
        raise TypeError(
            f"Object of type {type(value).__name__} is not JSON serializable"
        )

    def save_labeling_template(self, labeling_data, filename='attorney_labeling_template.json'):
        """
        Save labeling template for attorney.

        Writes two files into ``self.output_dir``: the JSON template under
        ``filename`` and a human-readable text version with the same stem
        and a ``.txt`` suffix.

        Args:
            labeling_data: Entries produced by ``create_labeling_template``.
            filename: Name of the JSON file.

        Returns:
            Path of the JSON file.
        """
        filepath = self.output_dir / filename

        # Explicit UTF-8 so non-ASCII messages are written identically on
        # every platform and stay readable for whoever edits the file.
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(labeling_data, f, indent=2, ensure_ascii=False,
                      default=self._json_default)

        print(f"\nLabeling template saved: {filepath}")

        # Also create a readable text version. Derive the name from the
        # suffix rather than a substring replace, and make sure it can never
        # be the JSON file itself (which would overwrite the template).
        text_filepath = filepath.with_suffix('.txt')
        if text_filepath == filepath:
            text_filepath = filepath.with_name(filepath.name + '.txt')

        rule = "=" * 80
        lines = [
            "ATTORNEY LABELING INSTRUCTIONS\n",
            rule + "\n\n",
            "For each message below, please provide:\n",
            "1. RESPONSIVE: YES or NO\n",
            "2. REASONING: Brief explanation\n",
            "3. CRITERIA: Which subpoena criteria matched (1-7)\n\n",
            rule + "\n\n",
        ]
        for entry in labeling_data:
            lines.extend([
                f"SAMPLE {entry['sample_id']}\n",
                "-" * 80 + "\n",
                f"Line: {entry['line_number']}\n",
                f"Time: {entry['timestamp']}\n",
                f"Sender: {entry['sender']}\n",
                f"Message: {entry['message']}\n\n",
                "RESPONSIVE: _______\n",
                "REASONING: _______________________________________\n",
                "CRITERIA: _______\n",
                "\n" + rule + "\n\n",
            ])

        with open(text_filepath, 'w', encoding='utf-8') as f:
            f.writelines(lines)

        print(f"Text template saved: {text_filepath}")

        return filepath

    def load_labeled_samples(self, filepath):
        """Load attorney-labeled samples from a JSON file and return them."""
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)

    @staticmethod
    def _is_labeled(label):
        """Return True when the attorney actually filled in 'responsive'."""
        if label is None:
            return False
        if isinstance(label, str):
            return bool(label.strip())
        # Booleans (JSON true/false) count as labels, including False.
        return True

    @staticmethod
    def _is_responsive(label):
        """Interpret a 'responsive' label: True / 'YES' (any case) is responsive."""
        if isinstance(label, bool):
            return label
        return str(label).strip().upper() == 'YES'

    def create_few_shot_examples(self, labeled_samples):
        """
        Convert attorney-labeled samples into few-shot examples for prompts.

        Samples whose 'responsive' field is missing, None or blank are
        skipped as unlabeled. A boolean ``False`` is a valid "not
        responsive" label and is kept.

        Returns:
            List of dicts with keys 'message', 'responsive', 'reasoning'
            and 'criteria'.

        Raises:
            KeyError: If a labeled sample has no 'message' key.
        """
        return [
            {
                'message': sample['message'],
                'responsive': sample['responsive'],
                'reasoning': sample.get('reasoning', ''),
                'criteria': sample.get('criteria_matched', []),
            }
            for sample in labeled_samples
            if self._is_labeled(sample.get('responsive'))
        ]

    def format_few_shot_prompt(self, few_shot_examples):
        """
        Format few-shot examples for inclusion in prompts.

        Args:
            few_shot_examples: Output of ``create_few_shot_examples``.

        Returns:
            A single string with one numbered block per example.
        """
        parts = ["Here are examples of how to classify messages:\n\n"]

        for i, example in enumerate(few_shot_examples, 1):
            status = ("RESPONSIVE" if self._is_responsive(example['responsive'])
                      else "NOT RESPONSIVE")
            parts.append(f"Example {i} ({status}):\n")
            parts.append(f'Message: "{example["message"]}"\n')
            parts.append(f"Reasoning: {example['reasoning']}\n")

            criteria = example.get('criteria')
            if criteria:
                # A hand-edited scalar ("3" or 3) must not be iterated
                # character by character.
                if isinstance(criteria, (str, int)):
                    criteria = [criteria]
                parts.append(f"Criteria matched: {', '.join(map(str, criteria))}\n")
            parts.append("\n")

        return "".join(parts)


# Example usage
if __name__ == "__main__":
    # Building the selector also creates the default output directory.
    selector = RandomSampleSelector()

    # Reference pipeline (left commented out; adapt the CSV path to run it):
    #   candidates_df = pd.read_csv('discovery_output/filtered/candidate_messages.csv')
    #   samples = selector.select_stratified_sample(candidates_df, n_samples=20)
    #   labeling_data = selector.create_labeling_template(samples)
    #   selector.save_labeling_template(labeling_data)

    # Usage banner, printed one line at a time.
    usage_lines = (
        "\nTo use this script:",
        "1. Load your filtered candidate messages",
        "2. Run select_stratified_sample() to get random samples",
        "3. Run create_labeling_template() to format for attorney",
        "4. Attorney labels the samples",
        "5. Run create_few_shot_examples() to convert to prompt format",
    )
    for usage_line in usage_lines:
        print(usage_line)