#!/usr/bin/env python3
"""
Random Sample Selector for Attorney Labeling

Selects representative messages from filtered candidates for few-shot learning
"""

import json
import random
from datetime import datetime  # not used in this module; retained from the original import list
from pathlib import Path

import pandas as pd


class RandomSampleSelector:
    """
    Selects random representative samples for attorney labeling.

    Diversity is obtained by stratifying the candidates on one column
    (``sender`` by default) and drawing an equal share from every stratum.
    The class also writes the labeling templates (JSON + plain text) and
    converts the labeled results into few-shot prompt text.
    """

    # Normalized (stripped, upper-cased) label values that count as "responsive".
    _POSITIVE_LABELS = frozenset({'YES', 'Y', 'TRUE'})

    def __init__(self, output_dir='./labeling_samples'):
        """
        Args:
            output_dir: Directory that receives the labeling templates.
                It is created, including missing parents, if necessary.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def select_stratified_sample(self, messages_df, n_samples=20, stratify_by='sender', seed=42):
        """
        Select stratified random sample ensuring diversity.

        Every distinct value of ``stratify_by`` contributes up to
        ``max(1, n_samples // number_of_strata)`` rows. If that leaves the
        selection short, the gap is filled from rows not picked yet. The
        result is shuffled and cut to ``n_samples``. When the column does not
        exist, a plain random sample is drawn instead.

        Args:
            messages_df: DataFrame with filtered candidate messages
            n_samples: Number of samples to select
            stratify_by: Column to stratify by ('sender', 'date', etc.)
            seed: Random seed for reproducibility

        Returns:
            List of row dicts (``DataFrame.to_dict('records')`` format) with
            at most ``n_samples`` entries and no row repeated.

        Raises:
            ValueError: If ``n_samples`` is negative.
        """
        if n_samples < 0:
            raise ValueError("n_samples must be non-negative")

        # A private generator yields the same shuffle as random.seed(seed)
        # would, without reseeding the process-wide generator for everyone.
        rng = random.Random(seed)

        print(f"\nSelecting {n_samples} samples stratified by {stratify_by}...")

        # Work on a positional index: the records never contain the index,
        # and unique positions let us track which rows were already picked
        # even if the caller's index has duplicates or gaps.
        frame = messages_df.reset_index(drop=True)

        if frame.empty or n_samples == 0:
            selected = []
        elif stratify_by in frame.columns:
            selected = self._stratified_records(frame, n_samples, stratify_by, seed, rng)
        else:
            # Simple random sample if stratify column doesn't exist
            n_select = min(n_samples, len(frame))
            selected = frame.sample(n=n_select, random_state=seed).to_dict('records')

        print(f"Selected {len(selected)} samples")
        return selected

    @staticmethod
    def _stratified_records(frame, n_samples, stratify_by, seed, rng):
        """Draw per-stratum samples from a non-empty, positionally indexed frame."""
        column = frame[stratify_by]
        strata = column.unique()
        per_stratum = max(1, n_samples // len(strata))

        selected = []
        picked_positions = []
        for stratum in strata:
            # Missing values never compare equal to themselves, so rows
            # without a stratum value are matched with isna() instead.
            if pd.api.types.is_scalar(stratum) and pd.isna(stratum):
                mask = column.isna()
            else:
                mask = column == stratum
            stratum_rows = frame[mask]
            n_select = min(per_stratum, len(stratum_rows))
            sampled = stratum_rows.sample(n=n_select, random_state=seed)
            picked_positions.extend(sampled.index)
            selected.extend(sampled.to_dict('records'))

        # If we need more samples, randomly select from the rows not picked
        # yet, never asking for more than are actually left.
        shortfall = n_samples - len(selected)
        if shortfall > 0 and len(picked_positions) < len(frame):
            remaining = frame.drop(index=picked_positions)
            extra = remaining.sample(n=min(shortfall, len(remaining)), random_state=seed)
            selected.extend(extra.to_dict('records'))

        # Shuffle so the final cut is not biased towards the first strata.
        rng.shuffle(selected)
        return selected[:n_samples]

    def create_labeling_template(self, samples, context_window=3):
        """
        Create attorney labeling template with context.

        Each sample becomes one entry holding the message fields plus empty
        ``responsive`` / ``reasoning`` / ``criteria_matched`` slots for the
        attorney. Context is only copied from ``context_before`` /
        ``context_after`` keys already present on the sample.

        Args:
            samples: Iterable of row dicts, e.g. from select_stratified_sample().
            context_window: Reported in the progress message only; building
                real context would need the full dataset, which this method
                does not receive.

        Returns:
            List of template entry dicts, numbered from 1 via ``sample_id``.
        """
        print(f"\nCreating labeling template with context window of {context_window}...")

        return [
            {
                'sample_id': sample_id,
                'line_number': sample.get('line_number', sample_id),
                'timestamp': sample.get('timestamp', ''),
                'sender': sample.get('sender', ''),
                'message': sample.get('message', ''),
                'context_before': sample.get('context_before', []),
                'context_after': sample.get('context_after', []),
                'responsive': '',  # Attorney fills this
                'reasoning': '',  # Attorney fills this
                'criteria_matched': [],  # Attorney fills this
            }
            for sample_id, sample in enumerate(samples, 1)
        ]

    @staticmethod
    def _json_default(value):
        """Convert pandas/numpy values the json module rejects into plain types."""
        if hasattr(value, 'isoformat'):  # Timestamp, datetime, date
            return value.isoformat()
        if hasattr(value, 'tolist'):  # numpy scalars and arrays
            return value.tolist()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    @staticmethod
    def _render_text_template(labeling_data):
        """Return the human-readable labeling form as a single string."""
        rule = "=" * 80
        parts = [
            "ATTORNEY LABELING INSTRUCTIONS\n",
            rule + "\n\n",
            "For each message below, please provide:\n",
            "1. RESPONSIVE: YES or NO\n",
            "2. REASONING: Brief explanation\n",
            "3. CRITERIA: Which subpoena criteria matched (1-7)\n\n",
            rule + "\n\n",
        ]
        for entry in labeling_data:
            parts.extend([
                f"SAMPLE {entry['sample_id']}\n",
                "-" * 80 + "\n",
                f"Line: {entry['line_number']}\n",
                f"Time: {entry['timestamp']}\n",
                f"Sender: {entry['sender']}\n",
                f"Message: {entry['message']}\n\n",
                "RESPONSIVE: _______\n",
                "REASONING: _______________________________________\n",
                "CRITERIA: _______\n",
                "\n" + rule + "\n\n",
            ])
        return "".join(parts)

    def save_labeling_template(self, labeling_data, filename='attorney_labeling_template.json'):
        """
        Save labeling template for attorney.

        Writes the entries as JSON to ``output_dir / filename`` and a readable
        text form next to it with a ``.txt`` suffix.

        Args:
            labeling_data: Entries produced by create_labeling_template().
            filename: Name of the JSON file inside the output directory.

        Returns:
            Path of the JSON file.
        """
        filepath = self.output_dir / filename

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(labeling_data, f, indent=2, default=self._json_default)

        print(f"\nLabeling template saved: {filepath}")

        # Also create a readable text version. Derive its name from the
        # suffix and make sure it can never be the JSON file itself, which
        # would silently overwrite the template just written.
        text_filepath = filepath.with_suffix('.txt')
        if text_filepath == filepath:
            text_filepath = filepath.with_name(filepath.name + '.txt')

        with open(text_filepath, 'w', encoding='utf-8') as f:
            f.write(self._render_text_template(labeling_data))

        print(f"Text template saved: {text_filepath}")

        return filepath

    def load_labeled_samples(self, filepath):
        """
        Load attorney-labeled samples.

        Args:
            filepath: Path of the JSON template after the attorney edited it.

        Returns:
            The parsed JSON content.
        """
        # utf-8-sig reads plain UTF-8 and also tolerates the byte-order mark
        # some editors add when the file is saved by hand.
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            return json.load(f)

    @staticmethod
    def _is_labeled(value):
        """Tell whether the ``responsive`` field was filled in."""
        if value is None:
            return False
        if isinstance(value, str):
            return bool(value.strip())
        if isinstance(value, bool):
            # A JSON true/false is an answer, including false.
            return True
        return bool(value)

    def create_few_shot_examples(self, labeled_samples):
        """
        Convert attorney-labeled samples into few-shot examples for prompts.

        Samples whose ``responsive`` field is missing or blank are skipped,
        i.e. only entries the attorney actually labeled are kept.

        Args:
            labeled_samples: Entries as returned by load_labeled_samples().

        Returns:
            List of dicts with keys ``message``, ``responsive``,
            ``reasoning`` and ``criteria``.
        """
        return [
            {
                'message': sample.get('message', ''),
                'responsive': sample['responsive'],
                'reasoning': sample.get('reasoning', ''),
                'criteria': sample.get('criteria_matched', []),
            }
            for sample in labeled_samples
            if self._is_labeled(sample.get('responsive'))
        ]

    def format_few_shot_prompt(self, few_shot_examples):
        """
        Format few-shot examples for inclusion in prompts.

        Args:
            few_shot_examples: Dicts as produced by create_few_shot_examples().

        Returns:
            Prompt text with one numbered section per example.
        """
        parts = ["Here are examples of how to classify messages:\n\n"]

        for number, example in enumerate(few_shot_examples, 1):
            # Labels are typed by hand, so accept stray whitespace, any
            # letter case and booleans.
            label = str(example.get('responsive', '')).strip().upper()
            status = "RESPONSIVE" if label in self._POSITIVE_LABELS else "NOT RESPONSIVE"
            message = example.get('message', '')
            reasoning = example.get('reasoning', '')

            parts.append(f"Example {number} ({status}):\n")
            parts.append(f'Message: "{message}"\n')
            parts.append(f"Reasoning: {reasoning}\n")

            criteria = example.get('criteria')
            if criteria:
                if isinstance(criteria, (str, int)):
                    # A single hand-typed value; joining a str directly
                    # would split it into characters.
                    criteria = [criteria]
                parts.append(f"Criteria matched: {', '.join(map(str, criteria))}\n")
            parts.append("\n")

        return "".join(parts)


def main():
    """Create a selector and print the usage steps for this script."""
    selector = RandomSampleSelector()

    # Load filtered candidates (from previous pipeline step)
    # candidates_df = pd.read_csv('discovery_output/filtered/candidate_messages.csv')

    # Select 20 random samples
    # samples = selector.select_stratified_sample(candidates_df, n_samples=20)

    # Create labeling template
    # labeling_data = selector.create_labeling_template(samples)

    # Save for attorney
    # selector.save_labeling_template(labeling_data)

    print("\nTo use this script:")
    print("1. Load your filtered candidate messages")
    print("2. Run select_stratified_sample() to get random samples")
    print("3. Run create_labeling_template() to format for attorney")
    print("4. Attorney labels the samples")
    print("5. Run create_few_shot_examples() to convert to prompt format")


# Example usage
if __name__ == "__main__":
    main()