| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190 |
- #!/usr/bin/env python3
- """
- Random Sample Selector for Attorney Labeling
- Selects representative messages from filtered candidates for few-shot learning
- """
- import pandas as pd
- import random
- import json
- from pathlib import Path
- from datetime import datetime
class RandomSampleSelector:
    """
    Select random samples of candidate messages for attorney labeling.

    Workflow: draw a (stratified) random sample from a DataFrame of candidate
    messages, write it out as a labeling template (JSON plus a printable text
    form), then convert the completed labels into few-shot prompt examples.
    """

    # Normalized label strings that count as a "responsive" decision.
    _RESPONSIVE_LABELS = frozenset({'YES', 'Y', 'TRUE'})

    def __init__(self, output_dir='./labeling_samples'):
        """
        Args:
            output_dir: Directory that receives the labeling templates. It is
                created, including any missing parents, if it does not exist.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def select_stratified_sample(self, messages_df, n_samples=20,
                                 stratify_by='sender', seed=42):
        """
        Select a stratified random sample of messages.

        Each distinct value of ``stratify_by`` contributes up to
        ``max(1, n_samples // number_of_strata)`` rows. If that yields fewer
        than ``n_samples`` rows, the shortfall is drawn at random from rows
        not yet selected. The result is shuffled and cut to ``n_samples``.
        If ``stratify_by`` is not a column, a plain random sample is taken.

        Args:
            messages_df: DataFrame with filtered candidate messages
            n_samples: Maximum number of samples to select
            stratify_by: Column to stratify by ('sender', 'date', etc.)
            seed: Random seed for reproducibility

        Returns:
            List of row dicts (``to_dict('records')``). It holds at most
            ``n_samples`` entries, and fewer when the frame has fewer rows.
        """
        # A private generator gives the same shuffle as seeding the module
        # RNG would, without disturbing global random state for callers.
        rng = random.Random(seed)

        print(f"\nSelecting {n_samples} samples stratified by {stratify_by}...")

        if n_samples <= 0 or len(messages_df) == 0:
            # Nothing to draw; also avoids dividing by zero strata below.
            selected = []
        elif stratify_by in messages_df.columns:
            # Work on a copy with a unique positional index so "already
            # selected" can be tracked by row rather than by the value of a
            # 'line_number' column, which need not match the index or exist.
            frame = messages_df.reset_index(drop=True)
            strata = frame[stratify_by].unique()
            samples_per_stratum = max(1, n_samples // len(strata))

            selected = []
            taken_rows = []
            for stratum in strata:
                # NOTE: a missing (NaN) stratum value never equals itself, so
                # such rows are only eligible in the top-up step below.
                stratum_data = frame[frame[stratify_by] == stratum]
                n_select = min(samples_per_stratum, len(stratum_data))
                drawn = stratum_data.sample(n=n_select, random_state=seed)
                taken_rows.extend(drawn.index)
                selected.extend(drawn.to_dict('records'))

            # If we need more samples, randomly select from remaining rows,
            # never asking for more than are actually left.
            shortfall = n_samples - len(selected)
            if shortfall > 0:
                remaining = frame.drop(index=taken_rows)
                if len(remaining) > 0:
                    additional = remaining.sample(
                        n=min(shortfall, len(remaining)), random_state=seed)
                    selected.extend(additional.to_dict('records'))

            # Shuffle so strata are interleaved, then trim any excess (there
            # can be more strata than requested samples).
            rng.shuffle(selected)
            selected = selected[:n_samples]
        else:
            # Simple random sample if stratify column doesn't exist
            selected = messages_df.sample(
                n=min(n_samples, len(messages_df)),
                random_state=seed).to_dict('records')

        print(f"Selected {len(selected)} samples")
        return selected

    def create_labeling_template(self, samples, context_window=3):
        """
        Create attorney labeling entries from sampled messages.

        Args:
            samples: Iterable of message dicts (as returned by
                select_stratified_sample).
            context_window: Currently only reported in the progress message;
                context is taken from the sample's own 'context_before' /
                'context_after' keys when present.

        Returns:
            List of dicts, one per sample, numbered from 1 in 'sample_id',
            with empty 'responsive', 'reasoning' and 'criteria_matched'
            fields for the attorney to fill in.
        """
        print(f"\nCreating labeling template with context window of {context_window}...")

        return [
            {
                'sample_id': sample_id,
                'line_number': sample.get('line_number', sample_id),
                'timestamp': sample.get('timestamp', ''),
                'sender': sample.get('sender', ''),
                'message': sample.get('message', ''),
                'context_before': sample.get('context_before', []),
                'context_after': sample.get('context_after', []),
                'responsive': '',  # Attorney fills this
                'reasoning': '',  # Attorney fills this
                'criteria_matched': [],  # Attorney fills this
            }
            for sample_id, sample in enumerate(samples, 1)
        ]

    def save_labeling_template(self, labeling_data, filename='attorney_labeling_template.json'):
        """
        Save the labeling template as JSON plus a readable text version.

        Args:
            labeling_data: Entries from create_labeling_template.
            filename: Name of the JSON file inside the output directory. The
                text version uses the same name with a '.txt' suffix.

        Returns:
            Path of the JSON file.
        """
        filepath = self.output_dir / filename

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(labeling_data, f, indent=2)

        print(f"\nLabeling template saved: {filepath}")

        # Derive the text path from the suffix, not a substring replace. A
        # filename without '.json' must not make the text file overwrite the
        # JSON that was just written.
        text_filepath = filepath.with_suffix('.txt')
        if text_filepath == filepath:
            text_filepath = filepath.with_name(filepath.name + '.txt')

        rule = "=" * 80
        lines = [
            "ATTORNEY LABELING INSTRUCTIONS\n",
            rule + "\n\n",
            "For each message below, please provide:\n",
            "1. RESPONSIVE: YES or NO\n",
            "2. REASONING: Brief explanation\n",
            "3. CRITERIA: Which subpoena criteria matched (1-7)\n\n",
            rule + "\n\n",
        ]
        for entry in labeling_data:
            lines.extend([
                f"SAMPLE {entry['sample_id']}\n",
                "-" * 80 + "\n",
                f"Line: {entry['line_number']}\n",
                f"Time: {entry['timestamp']}\n",
                f"Sender: {entry['sender']}\n",
                f"Message: {entry['message']}\n\n",
                "RESPONSIVE: _______\n",
                "REASONING: _______________________________________\n",
                "CRITERIA: _______\n",
                "\n" + rule + "\n\n",
            ])

        # Explicit UTF-8 so non-ASCII message text does not depend on the
        # platform's default encoding.
        with open(text_filepath, 'w', encoding='utf-8') as f:
            f.writelines(lines)

        print(f"Text template saved: {text_filepath}")

        return filepath

    def load_labeled_samples(self, filepath):
        """Load attorney-labeled samples from a JSON file."""
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)

    @staticmethod
    def _is_unlabeled(label):
        """Return True when the 'responsive' field was left blank."""
        return label is None or (isinstance(label, str) and not label.strip())

    @classmethod
    def _is_responsive(cls, label):
        """Interpret a 'responsive' label (bool or text such as 'yes')."""
        if isinstance(label, bool):
            return label
        return str(label).strip().upper() in cls._RESPONSIVE_LABELS

    def create_few_shot_examples(self, labeled_samples):
        """
        Convert attorney-labeled samples into few-shot examples for prompts.

        Samples whose 'responsive' field is missing or blank are skipped. An
        explicit negative label (including boolean False) is kept, so
        non-responsive examples are not silently dropped.

        Returns:
            List of dicts with 'message', 'responsive', 'reasoning' and
            'criteria' keys.
        """
        return [
            {
                'message': sample['message'],
                'responsive': sample['responsive'],
                'reasoning': sample.get('reasoning', ''),
                'criteria': sample.get('criteria_matched', []),
            }
            for sample in labeled_samples
            if not self._is_unlabeled(sample.get('responsive'))
        ]

    def format_few_shot_prompt(self, few_shot_examples):
        """
        Format few-shot examples for inclusion in prompts.

        Args:
            few_shot_examples: Dicts as produced by create_few_shot_examples.

        Returns:
            A single string with one numbered example per entry.
        """
        parts = ["Here are examples of how to classify messages:\n\n"]

        for i, example in enumerate(few_shot_examples, 1):
            status = ("RESPONSIVE" if self._is_responsive(example['responsive'])
                      else "NOT RESPONSIVE")
            parts.append(f"Example {i} ({status}):\n")
            parts.append(f'Message: "{example["message"]}"\n')
            parts.append(f"Reasoning: {example['reasoning']}\n")

            criteria = example.get('criteria')
            if criteria:
                # A bare string or number would otherwise be joined character
                # by character (or fail to iterate); treat it as one item.
                if isinstance(criteria, (str, int)):
                    criteria = [criteria]
                parts.append(f"Criteria matched: {', '.join(map(str, criteria))}\n")
            parts.append("\n")

        return "".join(parts)
# Example usage
if __name__ == "__main__":
    selector = RandomSampleSelector()

    # Typical pipeline, kept as comments because it needs the filtered
    # candidate messages from the previous pipeline step:
    #
    #   candidates_df = pd.read_csv('discovery_output/filtered/candidate_messages.csv')
    #   samples = selector.select_stratified_sample(candidates_df, n_samples=20)
    #   labeling_data = selector.create_labeling_template(samples)
    #   selector.save_labeling_template(labeling_data)

    usage_steps = (
        "\nTo use this script:",
        "1. Load your filtered candidate messages",
        "2. Run select_stratified_sample() to get random samples",
        "3. Run create_labeling_template() to format for attorney",
        "4. Attorney labels the samples",
        "5. Run create_few_shot_examples() to convert to prompt format",
    )
    for step in usage_steps:
        print(step)
|