| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159 |
- """
- Step 6: Generate attorney labeling template.
- """
- import random
- from typing import List
- from pipeline.models.base import PipelineStep
- from pipeline.common_defs import Chunk, CASE_NAME, SUBPOENA_CRITERIA
- import pandas as pd
class LabelingTemplateGenerator(PipelineStep):
    """Generate the plain-text template attorneys fill in to label samples.

    For each sampled chunk the template shows one randomly chosen target
    message, a window of surrounding messages as context, and empty
    RESPONSIVE / REASONING / CRITERIA fields.
    """

    # Number of messages shown on each side of the target in the context window.
    _CONTEXT_RADIUS = 3
    # Context previews longer than this many characters are cut and suffixed "...".
    _PREVIEW_LENGTH = 100

    def __init__(self, output_dir: str = './pipeline_output'):
        """Initialise the step.

        Args:
            output_dir: Forwarded to ``PipelineStep.__init__``. The template is
                later written under ``self.output_dir`` (presumably set by the
                base class -- verify against ``PipelineStep``).
        """
        super().__init__(output_dir)

    def execute(self, samples: List[Chunk]) -> str:
        """Render the template for ``samples`` and write it to disk.

        Args:
            samples: List of sampled chunks.

        Returns:
            Path of the generated template file, as a string.
        """
        self.logger.info(f"Generating labeling template for {len(samples)} samples...")
        template = self._create_template(samples)
        filepath = self.output_dir / 'attorney_labeling_template.txt'
        # Message text may contain non-ASCII characters; an explicit encoding
        # keeps the write independent of the platform default.
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(template)
        self.logger.info(f"Template saved to: {filepath}")
        return str(filepath)

    def _create_template(self, samples: List[Chunk]) -> str:
        """Build the full template text: header, instructions, then samples.

        Args:
            samples: Chunks to render, numbered from 1 in the given order.

        Returns:
            The template as a single newline-joined string.
        """
        lines = [
            # Header
            "ATTORNEY LABELING TEMPLATE",
            CASE_NAME,
            "=" * 80,
            "",
            # Instructions
            "INSTRUCTIONS:",
            "For each message below, please provide:",
            "1. RESPONSIVE: YES or NO",
            "2. REASONING: Brief explanation of your decision",
            "3. CRITERIA: Which subpoena criteria matched (1-7):",
            "",
        ]
        lines.extend(f" {num}. {desc}" for num, desc in SUBPOENA_CRITERIA.items())
        lines.extend(["", "=" * 80, ""])

        # Samples
        for i, sample in enumerate(samples, 1):
            lines.extend(self._format_sample(i, sample))
        return "\n".join(lines)

    @staticmethod
    def _pick_target_index(message_count: int, radius: int) -> int:
        """Pick a random target index within ``range(message_count)``.

        The index is drawn so that ``radius`` messages exist on both sides of
        it. Chunks too short for that (fewer than ``2 * radius + 1`` messages)
        would give ``random.randint`` an empty range, so for those any valid
        index is allowed and the context window is simply clipped.

        Args:
            message_count: Number of messages in the chunk; must be >= 1.
            radius: Desired number of context messages on each side.

        Returns:
            A valid index into the chunk's message list.
        """
        low, high = radius, message_count - radius - 1
        if low > high:
            low, high = 0, message_count - 1
        return random.randint(low, high)

    def _format_sample(self, sample_num: int, chunk: Chunk) -> List[str]:
        """Format one sample: target message, context window, response fields.

        Args:
            sample_num: 1-based number printed in the sample heading.
            chunk: Chunk whose messages are rendered. A chunk without messages
                yields only the heading and the empty response fields.

        Returns:
            The lines of this sample's section, without trailing newlines.
        """
        lines = [f"SAMPLE {sample_num}", "-" * 80]

        messages = chunk.messages
        if messages:
            # Target message for labeling, chosen at random inside the chunk.
            radius = self._CONTEXT_RADIUS
            target_message_idx = self._pick_target_index(len(messages), radius)
            target_msg = messages[target_message_idx]
            lines.append(f"Line: {target_msg.line_number}")
            lines.append(f"Time: {target_msg.timestamp}")
            lines.append(f"Sender: {target_msg.sender}")
            lines.append(f"Message: {target_msg.message}")
            lines.append("")

            # Context (surrounding messages). Slicing clips the window at the
            # end of the list; max() clips it at the start.
            lines.append("Context (surrounding messages):")
            start_message_idx = max(0, target_message_idx - radius)
            end_message_idx = target_message_idx + radius + 1
            window = messages[start_message_idx:end_message_idx]
            for idx, msg in enumerate(window, start_message_idx):
                marker = ">>>" if idx == target_message_idx else " "
                text = msg.message
                if len(text) > self._PREVIEW_LENGTH:
                    msg_preview = text[:self._PREVIEW_LENGTH] + "..."
                else:
                    msg_preview = text
                lines.append(f"{marker} [{msg.sender}]: {msg_preview}")
            lines.append("")

        # Response fields
        lines.append("RESPONSIVE: ")
        lines.append("REASONING: ")
        lines.append("CRITERIA: ")
        lines.append("")
        lines.append("=" * 80)
        lines.append("")
        return lines
if __name__ == "__main__":
    # Example usage: rebuild the sampled chunks from earlier pipeline outputs
    # (random_samples.json + preprocessed_messages.csv) and write the template.
    import json
    from pipeline.common_defs import Chunk, Message

    with open('pipeline_output/random_samples.json', 'r') as f:
        samples_data = json.load(f)

    generator = LabelingTemplateGenerator()

    message_df = pd.read_csv(f"{generator.output_dir}/preprocessed_messages.csv")
    # Index the rows by line number once instead of scanning the whole frame
    # for every line. Keeping the first row per line number matches what a
    # per-line filter followed by ``iloc[0]`` would select.
    rows_by_line = message_df.drop_duplicates("line_number").set_index("line_number")

    # Reconstruct chunks (simplified)
    samples = []
    for item in samples_data:
        messages = []
        normalized_messages = []
        start_timestamp = ""
        end_timestamp = ""
        for line_no in range(item["start_line"], item["end_line"] + 1):
            if line_no not in rows_by_line.index:
                raise IndexError(
                    f"line_number {line_no} (chunk {item['chunk_id']}) "
                    "not found in preprocessed_messages.csv"
                )
            row = rows_by_line.loc[line_no]

            # Two independent checks: a single-line chunk is both the first
            # and the last line, so it must set both timestamps.
            if line_no == item["start_line"]:
                start_timestamp = row["timestamp"]
            if line_no == item["end_line"]:
                end_timestamp = row["timestamp"]

            messages.append(
                Message(
                    line_number=line_no,
                    timestamp=row["timestamp"],
                    sender=row["sender"],
                    message=row["message"],
                    message_normalized=row["message_normalized"],
                )
            )
            normalized_messages.append(row["message_normalized"])

        samples.append(
            Chunk(
                chunk_id=item["chunk_id"],
                start_line=item["start_line"],
                end_line=item["end_line"],
                messages=messages,
                combined_text="\n".join(normalized_messages),
                timestamp_start=start_timestamp,
                timestamp_end=end_timestamp,
            )
        )

    template_path = generator.execute(samples)
    print(f"Template created: {template_path}")
|