step6_labeling_template.py

  1. """
  2. Step 6: Generate attorney labeling template.
  3. """
  4. import random
  5. from typing import List
  6. from pipeline.models.base import PipelineStep
  7. from pipeline.common_defs import Chunk, CASE_NAME, SUBPOENA_CRITERIA
  8. import pandas as pd
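
# NOTE (assumption): Chunk and Message are treated here as simple dataclass-style
# containers from pipeline.common_defs. Based on how they are used in this file,
# Message is assumed to carry line_number, timestamp, sender, message, and
# message_normalized; Chunk is assumed to carry chunk_id, start_line, end_line,
# messages, combined_text, timestamp_start, and timestamp_end.
# SUBPOENA_CRITERIA is assumed to map criterion number -> description.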


class LabelingTemplateGenerator(PipelineStep):
    """Generate the attorney labeling template from sampled chunks."""

    def __init__(self, output_dir: str = './pipeline_output'):
        super().__init__(output_dir)

    def execute(self, samples: List[Chunk]) -> str:
        """
        Generate the attorney labeling template.

        Args:
            samples: List of sampled chunks.

        Returns:
            Path to the generated template file.
        """
        self.logger.info(f"Generating labeling template for {len(samples)} samples...")

        template = self._create_template(samples)

        filepath = self.output_dir / 'attorney_labeling_template.txt'
        with open(filepath, 'w') as f:
            f.write(template)

        self.logger.info(f"Template saved to: {filepath}")
        return str(filepath)
    def _create_template(self, samples: List[Chunk]) -> str:
        """Create the template content."""
        lines = []

        # Header
        lines.append("ATTORNEY LABELING TEMPLATE")
        lines.append(CASE_NAME)
        lines.append("=" * 80)
        lines.append("")

        # Instructions
        lines.append("INSTRUCTIONS:")
        lines.append("For each message below, please provide:")
        lines.append("1. RESPONSIVE: YES or NO")
        lines.append("2. REASONING: Brief explanation of your decision")
        lines.append("3. CRITERIA: Which subpoena criteria matched (1-7):")
        lines.append("")
        for num, desc in SUBPOENA_CRITERIA.items():
            lines.append(f" {num}. {desc}")
        lines.append("")
        lines.append("=" * 80)
        lines.append("")

        # Samples
        for i, sample in enumerate(samples, 1):
            lines.extend(self._format_sample(i, sample))

        return "\n".join(lines)
    def _format_sample(self, sample_num: int, chunk: Chunk) -> List[str]:
        """Format a single sample."""
        lines = []
        lines.append(f"SAMPLE {sample_num}")
        lines.append("-" * 80)

        if chunk.messages:
            # Pick the target message for labeling, keeping it away from the
            # chunk edges when the chunk is long enough to give context on
            # both sides; otherwise fall back to the middle message.
            if len(chunk.messages) >= 7:
                target_message_idx = random.randint(3, len(chunk.messages) - 4)
            else:
                target_message_idx = len(chunk.messages) // 2
            target_msg = chunk.messages[target_message_idx]

            lines.append(f"Line: {target_msg.line_number}")
            lines.append(f"Time: {target_msg.timestamp}")
            lines.append(f"Sender: {target_msg.sender}")
            lines.append(f"Message: {target_msg.message}")
            lines.append("")

            # Context (surrounding messages), with the target marked by >>>
            lines.append("Context (surrounding messages):")
            start_message_idx = max(0, target_message_idx - 3)
            end_message_idx = target_message_idx + 4
            for j, msg in enumerate(chunk.messages[start_message_idx:end_message_idx]):
                marker = ">>>" if j + start_message_idx == target_message_idx else "   "
                msg_preview = (
                    msg.message[:100] + "..." if len(msg.message) > 100 else msg.message
                )
                lines.append(f"{marker} [{msg.sender}]: {msg_preview}")
            lines.append("")

        # Response fields for the attorney to fill in
        lines.append("RESPONSIVE: ")
        lines.append("REASONING: ")
        lines.append("CRITERIA: ")
        lines.append("")
        lines.append("=" * 80)
        lines.append("")
        return lines
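

# For reference, each sample section emitted by _format_sample should look
# roughly like the sketch below (placeholder values and shortened separators;
# illustrative only, not real data from the pipeline):
#
#   SAMPLE 1
#   ------------------------------------------------------------------
#   Line: 1042
#   Time: 2021-03-14 09:12:00
#   Sender: <sender name>
#   Message: <full target message text>
#
#   Context (surrounding messages):
#       [<sender>]: <message preview>
#   >>> [<sender>]: <target message preview>
#       [<sender>]: <message preview>
#
#   RESPONSIVE:
#   REASONING:
#   CRITERIA:
#
#   ==================================================================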


if __name__ == "__main__":
    # Example usage: rebuild the sampled chunks from earlier pipeline outputs
    # and generate the labeling template.
    import json
    from pipeline.common_defs import Chunk, Message

    with open('pipeline_output/random_samples.json', 'r') as f:
        samples_data = json.load(f)
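
    # Assumed shape of each entry in random_samples.json, inferred from the
    # keys accessed below (illustrative values only):
    #   {"chunk_id": 17, "start_line": 1040, "end_line": 1059}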
    generator = LabelingTemplateGenerator()

    # Reconstruct chunks (simplified) by pulling the original rows back out of
    # the preprocessed message table.
    samples = []
    message_df = pd.read_csv(f"{generator.output_dir}/preprocessed_messages.csv")
    for item in samples_data:
        messages = []
        normalized_messages = []
        start_timestamp = ""
        end_timestamp = ""
        for i in range(item["start_line"], item["end_line"] + 1):
            row = message_df.query(f"line_number == {i}").iloc[0]
            if i == item["start_line"]:
                start_timestamp = row["timestamp"]
            if i == item["end_line"]:
                end_timestamp = row["timestamp"]
            message = Message(
                line_number=i,
                timestamp=row["timestamp"],
                sender=row["sender"],
                message=row["message"],
                message_normalized=row["message_normalized"],
            )
            messages.append(message)
            normalized_messages.append(row["message_normalized"])
        chunk = Chunk(
            chunk_id=item["chunk_id"],
            start_line=item["start_line"],
            end_line=item["end_line"],
            messages=messages,
            combined_text="\n".join(normalized_messages),
            timestamp_start=start_timestamp,
            timestamp_end=end_timestamp,
        )
        samples.append(chunk)

    template_path = generator.execute(samples)
    print(f"Template created: {template_path}")