step6_labeling_template.py

  1. """
  2. Step 6: Generate attorney labeling template.
  3. """
  4. import random
  5. from typing import List
  6. from pipeline.models.base import PipelineStep
  7. from pipeline.common_defs import Chunk, CASE_NAME, SUBPOENA_CRITERIA
  8. import pandas as pd
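
# NOTE (assumption): Chunk and Message are treated here as simple dataclass-style
# containers from pipeline.common_defs. Based on how they are used in this file,
# Message is assumed to carry line_number, timestamp, sender, message, and
# message_normalized; Chunk is assumed to carry chunk_id, start_line, end_line,
# messages, combined_text, timestamp_start, and timestamp_end.
# SUBPOENA_CRITERIA is assumed to map criterion number -> description.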


class LabelingTemplateGenerator(PipelineStep):
    """Generate the attorney labeling template from sampled chunks."""

    def __init__(self, output_dir: str = './pipeline_output'):
        super().__init__(output_dir)

    def execute(self, samples: List[Chunk]) -> str:
        """
        Generate the attorney labeling template.

        Args:
            samples: List of sampled chunks.

        Returns:
            Path to the generated template file.
        """
        self.logger.info(f"Generating labeling template for {len(samples)} samples...")

        template = self._create_template(samples)

        filepath = self.output_dir / 'attorney_labeling_template.txt'
        with open(filepath, 'w') as f:
            f.write(template)

        self.logger.info(f"Template saved to: {filepath}")
        return str(filepath)
    def _create_template(self, samples: List[Chunk]) -> str:
        """Create the template content."""
        lines = []

        # Header
        lines.append("ATTORNEY LABELING TEMPLATE")
        lines.append(CASE_NAME)
        lines.append("=" * 80)
        lines.append("")

        # Instructions
        lines.append("INSTRUCTIONS:")
        lines.append("For each message below, please provide:")
        lines.append("1. RESPONSIVE: YES or NO")
        lines.append("2. REASONING: Brief explanation of your decision")
        lines.append("3. CRITERIA: Which subpoena criteria matched (1-7):")
        lines.append("")
        for num, desc in SUBPOENA_CRITERIA.items():
            lines.append(f" {num}. {desc}")
        lines.append("")
        lines.append("=" * 80)
        lines.append("")

        # Samples
        for i, sample in enumerate(samples, 1):
            lines.extend(self._format_sample(i, sample))

        return "\n".join(lines)
    def _format_sample(self, sample_num: int, chunk: Chunk) -> List[str]:
        """Format a single sample."""
        lines = []
        lines.append(f"SAMPLE {sample_num}")
        lines.append("-" * 80)

        if chunk.messages:
            # Pick the target message for labeling, keeping it away from the
            # chunk edges when the chunk is long enough to give context on
            # both sides; otherwise fall back to the middle message.
            if len(chunk.messages) >= 7:
                target_message_idx = random.randint(3, len(chunk.messages) - 4)
            else:
                target_message_idx = len(chunk.messages) // 2
            target_msg = chunk.messages[target_message_idx]

            lines.append(f"Line: {target_msg.line_number}")
            lines.append(f"Time: {target_msg.timestamp}")
            lines.append(f"Sender: {target_msg.sender}")
            lines.append(f"Message: {target_msg.message}")
            lines.append("")

            # Context (surrounding messages), with the target marked by >>>
            lines.append("Context (surrounding messages):")
            start_message_idx = max(0, target_message_idx - 3)
            end_message_idx = target_message_idx + 4
            for j, msg in enumerate(chunk.messages[start_message_idx:end_message_idx]):
                marker = ">>>" if j + start_message_idx == target_message_idx else "   "
                msg_preview = (
                    msg.message[:100] + "..." if len(msg.message) > 100 else msg.message
                )
                lines.append(f"{marker} [{msg.sender}]: {msg_preview}")
            lines.append("")

        # Response fields for the attorney to fill in
        lines.append("RESPONSIVE: ")
        lines.append("REASONING: ")
        lines.append("CRITERIA: ")
        lines.append("")
        lines.append("=" * 80)
        lines.append("")
        return lines
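

# For reference, each sample section emitted by _format_sample should look
# roughly like the sketch below (placeholder values and shortened separators;
# illustrative only, not real data from the pipeline):
#
#   SAMPLE 1
#   ------------------------------------------------------------------
#   Line: 1042
#   Time: 2021-03-14 09:12:00
#   Sender: <sender name>
#   Message: <full target message text>
#
#   Context (surrounding messages):
#       [<sender>]: <message preview>
#   >>> [<sender>]: <target message preview>
#       [<sender>]: <message preview>
#
#   RESPONSIVE:
#   REASONING:
#   CRITERIA:
#
#   ==================================================================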


if __name__ == "__main__":
    # Example usage: rebuild the sampled chunks from earlier pipeline outputs
    # and generate the labeling template.
    import json
    from pipeline.common_defs import Chunk, Message

    with open('pipeline_output/random_samples.json', 'r') as f:
        samples_data = json.load(f)
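
    # Assumed shape of each entry in random_samples.json, inferred from the
    # keys accessed below (illustrative values only):
    #   {"chunk_id": 17, "start_line": 1040, "end_line": 1059}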
    generator = LabelingTemplateGenerator()

    # Reconstruct chunks (simplified) by pulling the original rows back out of
    # the preprocessed message table.
    samples = []
    message_df = pd.read_csv(f"{generator.output_dir}/preprocessed_messages.csv")
    for item in samples_data:
        messages = []
        normalized_messages = []
        start_timestamp = ""
        end_timestamp = ""
        for i in range(item["start_line"], item["end_line"] + 1):
            row = message_df.query(f"line_number == {i}").iloc[0]
            if i == item["start_line"]:
                start_timestamp = row["timestamp"]
            if i == item["end_line"]:
                end_timestamp = row["timestamp"]
            message = Message(
                line_number=i,
                timestamp=row["timestamp"],
                sender=row["sender"],
                message=row["message"],
                message_normalized=row["message_normalized"],
            )
            messages.append(message)
            normalized_messages.append(row["message_normalized"])
        chunk = Chunk(
            chunk_id=item["chunk_id"],
            start_line=item["start_line"],
            end_line=item["end_line"],
            messages=messages,
            combined_text="\n".join(normalized_messages),
            timestamp_start=start_timestamp,
            timestamp_end=end_timestamp,
        )
        samples.append(chunk)

    template_path = generator.execute(samples)
    print(f"Template created: {template_path}")