"""
Step 6: Generate attorney labeling template.
"""

import random
from typing import List

import pandas as pd

from pipeline.models.base import PipelineStep
from pipeline.common_defs import Chunk, CASE_NAME, SUBPOENA_CRITERIA


class LabelingTemplateGenerator(PipelineStep):
    """Generate a plain-text template that attorneys fill in to label samples.

    For every sampled chunk one message is picked at random as the labeling
    target and printed together with a few surrounding messages for context,
    followed by empty RESPONSIVE / REASONING / CRITERIA fields.
    """

    # Number of messages shown before / after the target message.
    _CONTEXT_BEFORE = 3
    _CONTEXT_AFTER = 3
    # Context messages longer than this are truncated and suffixed with "...".
    _PREVIEW_CHARS = 100
    # Width of the horizontal rules used in the template.
    _RULE_WIDTH = 80

    def __init__(self, output_dir: str = './pipeline_output'):
        super().__init__(output_dir)

    def execute(self, samples: List[Chunk]) -> str:
        """
        Generate attorney labeling template.

        Args:
            samples: List of sampled chunks

        Returns:
            Path to generated template file
        """
        self.logger.info(f"Generating labeling template for {len(samples)} samples...")

        template = self._create_template(samples)

        filepath = self.output_dir / 'attorney_labeling_template.txt'
        # Explicit UTF-8: message text may contain characters that the
        # platform's default encoding cannot represent.
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(template)

        self.logger.info(f"Template saved to: {filepath}")
        return str(filepath)

    def _create_template(self, samples: List[Chunk]) -> str:
        """Create the template content.

        Args:
            samples: Chunks to render, numbered from 1 in the given order.

        Returns:
            The whole template (header, instructions, criteria list and one
            section per sample) as a single newline-joined string.
        """
        rule = "=" * self._RULE_WIDTH

        # Header and instructions
        lines = [
            "ATTORNEY LABELING TEMPLATE",
            CASE_NAME,
            rule,
            "",
            "INSTRUCTIONS:",
            "For each message below, please provide:",
            "1. RESPONSIVE: YES or NO",
            "2. REASONING: Brief explanation of your decision",
            "3. CRITERIA: Which subpoena criteria matched (1-7):",
            "",
        ]
        lines.extend(
            f" {num}. {desc}" for num, desc in SUBPOENA_CRITERIA.items()
        )
        lines.extend(["", rule, ""])

        # Samples
        for i, sample in enumerate(samples, 1):
            lines.extend(self._format_sample(i, sample))

        return "\n".join(lines)

    @staticmethod
    def _pick_target_index(message_count: int) -> int:
        """Pick a random index of the message to be labeled.

        The target is preferably drawn from the interior of the chunk so that
        a full context window exists on both sides. Chunks too short for that
        (fewer than 7 messages) would make ``random.randint`` raise
        ``ValueError`` on an empty range, so any message may be picked then.

        Args:
            message_count: Number of messages in the chunk; must be >= 1.

        Returns:
            An index in ``range(message_count)``.
        """
        low = LabelingTemplateGenerator._CONTEXT_BEFORE
        high = message_count - LabelingTemplateGenerator._CONTEXT_AFTER - 1
        if low > high:
            low, high = 0, message_count - 1
        return random.randint(low, high)

    def _format_sample(self, sample_num: int, chunk: Chunk) -> List[str]:
        """Format a single sample.

        Args:
            sample_num: 1-based number printed in the sample heading.
            chunk: Chunk whose messages supply the target and its context.

        Returns:
            The template lines for this sample. A chunk without messages
            yields only the heading and the empty response fields.
        """
        lines = [f"SAMPLE {sample_num}", "-" * self._RULE_WIDTH]

        messages = chunk.messages
        if messages:
            # Randomly chosen message the attorney is asked to label
            target_message_idx = self._pick_target_index(len(messages))
            target_msg = messages[target_message_idx]
            lines.extend([
                f"Line: {target_msg.line_number}",
                f"Time: {target_msg.timestamp}",
                f"Sender: {target_msg.sender}",
                f"Message: {target_msg.message}",
                "",
            ])

            # Context (surrounding messages); slicing clamps at the chunk end
            lines.append("Context (surrounding messages):")
            start_message_idx = max(0, target_message_idx - self._CONTEXT_BEFORE)
            end_message_idx = target_message_idx + self._CONTEXT_AFTER + 1
            context = messages[start_message_idx:end_message_idx]
            for idx, msg in enumerate(context, start_message_idx):
                marker = ">>>" if idx == target_message_idx else " "
                msg_preview = (
                    msg.message[:self._PREVIEW_CHARS] + "..."
                    if len(msg.message) > self._PREVIEW_CHARS
                    else msg.message
                )
                lines.append(f"{marker} [{msg.sender}]: {msg_preview}")
            lines.append("")

        # Response fields
        lines.extend([
            "RESPONSIVE: ",
            "REASONING: ",
            "CRITERIA: ",
            "",
            "=" * self._RULE_WIDTH,
            "",
        ])

        return lines


if __name__ == "__main__":
    # Example usage
    import json
    from pipeline.common_defs import Chunk, Message

    with open('pipeline_output/random_samples.json', 'r') as f:
        samples_data = json.load(f)

    generator = LabelingTemplateGenerator()

    # Reconstruct chunks (simplified)
    samples = []
    message_df = pd.read_csv(f"{generator.output_dir}/preprocessed_messages.csv")
    # Index once by line number instead of scanning the frame for every line.
    # Keeping the first row per line number matches taking `.iloc[0]` of a
    # per-line filter. A line missing from the CSV raises KeyError.
    rows_by_line = message_df.drop_duplicates("line_number").set_index("line_number")

    for item in samples_data:
        # retrieve messages
        messages = []
        normalized_messages = []
        for i in range(item["start_line"], item["end_line"] + 1):
            row = rows_by_line.loc[i]
            messages.append(
                Message(
                    line_number=i,
                    timestamp=row["timestamp"],
                    sender=row["sender"],
                    message=row["message"],
                    message_normalized=row["message_normalized"],
                )
            )
            normalized_messages.append(row["message_normalized"])

        # Take both timestamps from the collected messages so that a
        # single-line chunk (start_line == end_line) gets an end timestamp too.
        start_timestamp = messages[0].timestamp if messages else ""
        end_timestamp = messages[-1].timestamp if messages else ""

        chunk = Chunk(
            chunk_id=item["chunk_id"],
            start_line=item["start_line"],
            end_line=item["end_line"],
            messages=messages,
            combined_text="\n".join(normalized_messages),
            timestamp_start=start_timestamp,
            timestamp_end=end_timestamp,
        )
        samples.append(chunk)

    template_path = generator.execute(samples)
    print(f"Template created: {template_path}")