|
|
@@ -2,150 +2,370 @@
|
|
|
Inference runner for dual Qwen models.
|
|
|
"""
|
|
|
|
|
|
+from collections import defaultdict
|
|
|
import json
|
|
|
+import pandas as pd
|
|
|
import requests
|
|
|
-from typing import List, Dict
|
|
|
+from typing import List, Dict, TypedDict, cast
|
|
|
from pathlib import Path
|
|
|
import logging
|
|
|
from tqdm import tqdm
|
|
|
+import sqlite3
|
|
|
+from json_repair import repair_json, loads
|
|
|
+
|
|
|
+from pipeline.common_defs import Chunk, Message
|
|
|
|
|
|
class InferenceRunner:
|
|
|
"""Run inference on dual Qwen models"""
|
|
|
-
|
|
|
- def __init__(self, qwen3_url: str = "http://localhost:8000",
|
|
|
- qwen25_url: str = "http://localhost:8001",
|
|
|
- output_dir: str = "./pipeline_output"):
|
|
|
+
|
|
|
+ def __init__(
|
|
|
+ self,
|
|
|
+ batch_name: str,
|
|
|
+ qwen3_url: str = "http://localhost:8000",
|
|
|
+ qwen25_url: str = "http://localhost:8001",
|
|
|
+ output_dir: str = "./pipeline_output",
|
|
|
+ ):
|
|
|
+ self.batch_name = batch_name
|
|
|
self.qwen3_url = qwen3_url
|
|
|
self.qwen25_url = qwen25_url
|
|
|
self.output_dir = Path(output_dir)
|
|
|
-
|
|
|
+
|
|
|
self.logger = logging.getLogger("InferenceRunner")
|
|
|
self.logger.setLevel(logging.INFO)
|
|
|
-
|
|
|
+
|
|
|
if not self.logger.handlers:
|
|
|
handler = logging.StreamHandler()
|
|
|
- formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
|
|
|
+ formatter = logging.Formatter(
|
|
|
+ "%(levelname)s - [%(filename)s:%(lineno)d] %(message)s"
|
|
|
+ )
|
|
|
handler.setFormatter(formatter)
|
|
|
self.logger.addHandler(handler)
|
|
|
-
|
|
|
- def load_requests(self, requests_file: str) -> List[Dict]:
|
|
|
- """Load inference requests from JSONL file"""
|
|
|
- requests_data = []
|
|
|
-
|
|
|
- with open(requests_file, "r") as f:
|
|
|
- for line in f:
|
|
|
- requests_data.append(json.loads(line))
|
|
|
-
|
|
|
- self.logger.info(f"Loaded {len(requests_data)} inference requests")
|
|
|
- return requests_data
|
|
|
-
|
|
|
- def run_inference(self, requests_file: str,
|
|
|
- temperature: float = 0.1,
|
|
|
- max_tokens: int = 500):
|
|
|
+
|
|
|
+ self.db = sqlite3.connect(
|
|
|
+ self.output_dir / f"{batch_name}_processed.db3",
|
|
|
+ check_same_thread=False,
|
|
|
+ )
|
|
|
+        # set row_factory before creating the cursor so fetched rows support
+        # name-based column access (e.g. row["num_messages"])
+        self.db.row_factory = sqlite3.Row
+        self.cursor = self.db.cursor()
|
|
|
+ sql = """
|
|
|
+CREATE TABLE IF NOT EXISTS processed (
|
|
|
+ chunk_id INTEGER,
|
|
|
+    model_name TEXT,
|
|
|
+ message_index INTEGER,
|
|
|
+ timestamp DATETIME,
|
|
|
+    sender TEXT,
|
|
|
+ message TEXT,
|
|
|
+ responsive BOOLEAN,
|
|
|
+ reason TEXT,
|
|
|
+ criteria TEXT,
|
|
|
+ confidence TEXT,
|
|
|
+ PRIMARY KEY (model_name, message_index)
|
|
|
+);
|
|
|
+ """
|
|
|
+ self.cursor.execute(sql)
|
|
|
|
|
|
+ self.logger.info("summary database initialized")
|
|
|
+
|
|
|
+ def _create_user_prompt(self, chunk: Chunk) -> str:
|
|
|
+ """Create inference request for a chunk"""
|
|
|
+ # Format messages
|
|
|
+ messages_text = ""
|
|
|
+ for msg in chunk.messages:
|
|
|
+ messages_text += (
|
|
|
+ f"#{msg.line_number} [{msg.timestamp}] [{msg.sender}]: {msg.message}\n"
|
|
|
+ )
|
|
|
+
|
|
|
+ # Create full prompt
|
|
|
+ prompt = f"""
|
|
|
+Review and classify the following messages.
|
|
|
+
|
|
|
+MESSAGES TO REVIEW (Lines {chunk.start_line}-{chunk.end_line}):
|
|
|
+
|
|
|
+{messages_text}
|
|
|
+
|
|
|
+Provide your response as valid JSON following the specified format.
|
|
|
+"""
|
|
|
+ return prompt
|
|
|
+
|
|
|
+ def _create_chunks(self) -> list[Chunk]:
|
|
|
+ with open("pipeline_output/chunks.json", "r") as f:
|
|
|
+ chunk_data = json.load(f)
|
|
|
+
|
|
|
+ msg_df = pd.read_csv(self.output_dir / "preprocessed_messages.csv")
|
|
|
+
|
|
|
+ # Reconstruct chunks (simplified)
|
|
|
+ chunks = []
|
|
|
+ for item in chunk_data["filtered_chunks"][:10]: # First 10 for testing
|
|
|
+ chunk = Chunk(
|
|
|
+ chunk_id=item["chunk_id"],
|
|
|
+ start_line=item["start_line"],
|
|
|
+ end_line=item["end_line"],
|
|
|
+ messages=[],
|
|
|
+ combined_text="",
|
|
|
+ timestamp_start=item["timestamp_start"],
|
|
|
+ timestamp_end=item["timetamp_end"],
|
|
|
+ )
|
|
|
+            # slice is end-inclusive: end_line is the last message of the chunk
+            dfRange = msg_df.iloc[item["start_line"] - 1 : item["end_line"]]
+            for index, row in dfRange.iterrows():
+                chunk.messages.append(
+                    Message(
+                        (index + 1),
+                        row["timestamp"],
+                        row["sender"],
+                        row["message_normalized"],
+                    )
+                )
|
|
|
+
|
|
|
+ chunks.append(chunk)
|
|
|
+ return chunks
|
|
|
+
|
|
|
+ def run_inference(self, temperature: float = 0.1, max_tokens: int = 2048):
|
|
|
"""Run inference on both models"""
|
|
|
self.logger.info("=" * 80)
|
|
|
self.logger.info("RUNNING DUAL QWEN INFERENCE")
|
|
|
self.logger.info("=" * 80)
|
|
|
-
|
|
|
- requests_data = self.load_requests(requests_file)
|
|
|
-
|
|
|
+
|
|
|
+ chunks = self._create_chunks()
|
|
|
+
|
|
|
self.logger.info("\nRunning Qwen 3 235B inference...")
|
|
|
- qwen3_results = self._run_model_inference(
|
|
|
- requests_data, self.qwen3_url, "Qwen3-235B", temperature, max_tokens
|
|
|
+ self._run_model_inference(
|
|
|
+ chunks, self.qwen3_url, "Qwen3-235B", temperature, max_tokens
|
|
|
)
|
|
|
-
|
|
|
- qwen3_file = self.output_dir / "qwen3_results.jsonl"
|
|
|
- self._save_results(qwen3_results, qwen3_file)
|
|
|
-
|
|
|
+
|
|
|
self.logger.info("\nRunning Qwen 2.5 72B inference...")
|
|
|
- qwen25_results = self._run_model_inference(
|
|
|
- requests_data, self.qwen25_url, "Qwen2.5-72B", temperature, max_tokens
|
|
|
+ self._run_model_inference(
|
|
|
+ chunks, self.qwen25_url, "Qwen2.5-72B", temperature, max_tokens
|
|
|
)
|
|
|
-
|
|
|
- qwen25_file = self.output_dir / "qwen25_results.jsonl"
|
|
|
- self._save_results(qwen25_results, qwen25_file)
|
|
|
-
|
|
|
+
|
|
|
self.logger.info("\n" + "=" * 80)
|
|
|
self.logger.info("INFERENCE COMPLETE")
|
|
|
self.logger.info("=" * 80)
|
|
|
-
|
|
|
- return str(qwen3_file), str(qwen25_file)
|
|
|
-
|
|
|
- def _run_model_inference(self, requests_data: List[Dict],
|
|
|
- model_url: str, model_name: str,
|
|
|
- temperature: float, max_tokens: int) -> List[Dict]:
|
|
|
+
|
|
|
+ def _create_system_prompt(self) -> str:
|
|
|
+ """Create system prompt for LLM"""
|
|
|
+ prompt = ""
|
|
|
+ with Path(self.output_dir, "system_prompt.txt").open("r") as file:
|
|
|
+ prompt = file.read()
|
|
|
+ return prompt
|
|
|
+
|
|
|
+ def _create_response_format(self) -> str:
|
|
|
+ """Create response format for LLM"""
|
|
|
+ response_format = ""
|
|
|
+ with Path(self.output_dir, "response_format.json").open() as file:
|
|
|
+ response_format = file.read()
|
|
|
+ return response_format
|
|
|
+
|
|
|
+    def _check_existing_result(self, chunk: Chunk, model_name: str) -> bool:
|
|
|
+ """Check if result already exists in db"""
|
|
|
+ sql = """
|
|
|
+SELECT
|
|
|
+ COUNT(*) AS num_messages
|
|
|
+FROM
|
|
|
+ processed
|
|
|
+WHERE
|
|
|
+ model_name = ?
|
|
|
+ AND chunk_id = ?
|
|
|
+ AND responsive IS NOT NULL
|
|
|
+ """
|
|
|
+        self.cursor.execute(sql, (model_name, chunk.chunk_id))
+        row = self.cursor.fetchone()
+        # every message in the chunk already has a stored verdict
+        if row and row["num_messages"] == len(chunk.messages):
|
|
|
+ return True
|
|
|
+ return False
|
|
|
+
|
|
|
+ def _save_result(self, chunk: Chunk, results: list[dict], model_name: str):
|
|
|
+ """Save result to db"""
|
|
|
+ # merge the chunk messages with the results
|
|
|
+ merged_results = {}
|
|
|
+ for msg in chunk.messages:
|
|
|
+ merged_results[msg.line_number] = {"message": msg}
|
|
|
+ for item in results:
|
|
|
+ if item["message_index"] in merged_results:
|
|
|
+ merged_results[item["message_index"]].update(item)
|
|
|
+ else:
|
|
|
+ merged_results[item["message_index"]] = item.copy()
|
|
|
+
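+        # upsert keyed on (model_name, message_index): re-running a chunk
+        # overwrites earlier verdicts instead of inserting duplicate rows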
|
|
|
+ sql = """
|
|
|
+INSERT INTO processed (
|
|
|
+ chunk_id,
|
|
|
+ model_name,
|
|
|
+ message_index,
|
|
|
+ timestamp,
|
|
|
+ sender,
|
|
|
+ message,
|
|
|
+ responsive,
|
|
|
+ reason,
|
|
|
+ criteria,
|
|
|
+ confidence
|
|
|
+) VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )
|
|
|
+ON CONFLICT (model_name, message_index) DO UPDATE SET
|
|
|
+ chunk_id = excluded.chunk_id,
|
|
|
+ timestamp = excluded.timestamp,
|
|
|
+ sender = excluded.sender,
|
|
|
+ message = excluded.message,
|
|
|
+ responsive = excluded.responsive,
|
|
|
+ reason = excluded.reason,
|
|
|
+ criteria = excluded.criteria,
|
|
|
+ confidence = excluded.confidence
|
|
|
+ """
|
|
|
+        for result in merged_results.values():
|
|
|
+ msg = result.get("message", None)
|
|
|
+
|
|
|
+ if msg is None or not isinstance(msg, Message):
|
|
|
+ self.logger.error(
|
|
|
+ f"somehow we have a result without a message: \n{result}"
|
|
|
+ )
|
|
|
+ continue
|
|
|
+
|
|
|
+            # sqlite cannot bind lists/dicts directly; store criteria as JSON text
+            criteria = result.get("criteria", None)
+            if isinstance(criteria, (list, dict)):
+                criteria = json.dumps(criteria)
+
+            self.cursor.execute(
+                sql,
+                (
+                    result.get("chunk_id", chunk.chunk_id),
+                    model_name,
+                    msg.line_number,
+                    msg.timestamp,
+                    msg.sender,
+                    msg.message,
+                    result.get("responsive", None),
+                    result.get("reason", None),
+                    criteria,
+                    result.get("confidence", None),
+                ),
+            )
+        self.db.commit()
|
|
|
+
|
|
|
+ def _run_model_inference(
|
|
|
+ self,
|
|
|
+ chunks: List[Chunk],
|
|
|
+ model_url: str,
|
|
|
+ model_name: str,
|
|
|
+ temperature: float,
|
|
|
+ max_tokens: int,
|
|
|
+ ):
|
|
|
"""Run inference on a single model"""
|
|
|
- results = []
|
|
|
-
|
|
|
- for req in tqdm(requests_data, desc=f"{model_name} inference"):
|
|
|
+ system_prompt = self._create_system_prompt()
|
|
|
+ response_format = self._create_response_format()
|
|
|
+
|
|
|
+ success = 0
|
|
|
+ errors = 0
|
|
|
+
|
|
|
+ for chunk in tqdm(chunks, desc=f"{model_name} inference"):
|
|
|
+ # check if this chunk has already been processed
|
|
|
+ if self._check_existing_result(chunk, model_name):
|
|
|
+ continue
|
|
|
+
|
|
|
+ prompt_messages = []
|
|
|
+ prompt_messages.append({"role": "system", "content": system_prompt})
|
|
|
+ prompt_messages.append(
|
|
|
+ {"role": "user", "content": self._create_user_prompt(chunk)}
|
|
|
+ )
|
|
|
+
|
|
|
+ payload = {
|
|
|
+ "model": model_name,
|
|
|
+ "messages": prompt_messages,
|
|
|
+ "temperature": temperature,
|
|
|
+ "max_tokens": max_tokens,
|
|
|
+ "response_format": {
|
|
|
+ "type": "json_schema",
|
|
|
+ "json_schema": {
|
|
|
+ "name": "structured_response",
|
|
|
+ "schema": json.loads(response_format),
|
|
|
+ },
|
|
|
+ },
|
|
|
+ }
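+            # note: "response_format" with a json_schema assumes an OpenAI-compatible
+            # server (e.g. vLLM) that supports structured output; servers without it
+            # may ignore or reject the field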
|
|
|
+
|
|
|
+ # "top_p",
|
|
|
+ # "top_k",
|
|
|
+ # "frequency_penalty",
|
|
|
+ # "presence_penalty",
|
|
|
+ # # "stop",
|
|
|
+ # # "skip_special_tokens",
|
|
|
+ # "enable_thinking",
|
|
|
+
|
|
|
+ headers = {"Content-Type": "application/json"}
|
|
|
+
|
|
|
+ response = "Not Processed"
|
|
|
+
|
|
|
try:
|
|
|
response = requests.post(
|
|
|
- f"{model_url}/v1/completions",
|
|
|
- json={
|
|
|
- "prompt": req["prompt"],
|
|
|
- "max_tokens": max_tokens,
|
|
|
- "temperature": temperature
|
|
|
- },
|
|
|
- timeout=60
|
|
|
+ f"{model_url}/v1/completions", headers=headers, json=payload
|
|
|
)
|
|
|
-
|
|
|
- if response.status_code == 200:
|
|
|
- result = self._parse_response(response.json(), req, model_name)
|
|
|
- results.append(result)
|
|
|
+ response.raise_for_status()
|
|
|
+ # logger.log(LEVEL_TRACE, f"Response {response.status_code}\n{response.text}")
|
|
|
+
|
|
|
+ data = response.json()
|
|
|
+ if "error" in data:
|
|
|
+ raise RuntimeError("LLM error")
|
|
|
+
|
|
|
+ choices = data.get("choices", [])
|
|
|
+ if not choices:
|
|
|
+ raise KeyError("No choices found in response")
|
|
|
+
|
|
|
+ first_choice = choices[0]
|
|
|
+ if "message" in first_choice and first_choice["message"]:
|
|
|
+ response_text = first_choice["message"].get("content", "")
|
|
|
else:
|
|
|
- results.append(self._create_error_result(req, model_name))
|
|
|
-
|
|
|
+ response_text = first_choice.get("text", "")
|
|
|
+
|
|
|
+ if not response_text:
|
|
|
+ raise ValueError("No response found")
|
|
|
+
|
|
|
+ result = self._parse_response(response_text, chunk, model_name)
|
|
|
+                if result:
+                    # persist the parsed verdicts for this chunk
+                    self._save_result(chunk, result, model_name)
+                    success += 1
|
|
|
+ else:
|
|
|
+ raise RuntimeError("Could not parse result")
|
|
|
+
|
|
|
except Exception as e:
|
|
|
- self.logger.error(f"Exception for chunk {req['chunk_id']}: {e}")
|
|
|
- results.append(self._create_error_result(req, model_name))
|
|
|
-
|
|
|
- return results
|
|
|
-
|
|
|
- def _parse_response(self, response: Dict, request: Dict, model_name: str) -> Dict:
|
|
|
+ self.logger.error(
|
|
|
+ f"Error processing chunk {chunk.chunk_id}: \nResponse was:\n{response}\n{e.with_traceback}"
|
|
|
+ )
|
|
|
+ self._save_result(chunk, [], model_name)
|
|
|
+ errors += 1
|
|
|
+ return success, errors
|
|
|
+
|
|
|
+ def _parse_response(
|
|
|
+        self, response_text: str, chunk: Chunk, model_name: str
|
|
|
+ ) -> list[dict]:
|
|
|
"""Parse model response"""
|
|
|
+        parsed_list: List[Dict] = []
|
|
|
try:
|
|
|
- text = response["choices"][0]["text"]
|
|
|
- parsed = json.loads(text)
|
|
|
-
|
|
|
- return {
|
|
|
- "chunk_id": request["chunk_id"],
|
|
|
- "responsive_line_numbers": parsed.get("responsive_line_numbers", []),
|
|
|
- "reasoning": parsed.get("reasoning", ""),
|
|
|
- "confidence": parsed.get("confidence", "medium"),
|
|
|
- "model_name": model_name
|
|
|
- }
|
|
|
- except Exception:
|
|
|
- return self._create_error_result(request, model_name)
|
|
|
-
|
|
|
- def _create_error_result(self, request: Dict, model_name: str) -> Dict:
|
|
|
- """Create error result"""
|
|
|
- return {
|
|
|
- "chunk_id": request["chunk_id"],
|
|
|
- "responsive_line_numbers": [],
|
|
|
- "reasoning": "Error during inference",
|
|
|
- "confidence": "low",
|
|
|
- "model_name": model_name,
|
|
|
- "error": True
|
|
|
- }
|
|
|
-
|
|
|
- def _save_results(self, results: List[Dict], filepath: Path):
|
|
|
- """Save results to JSONL"""
|
|
|
- with open(filepath, "w") as f:
|
|
|
- for result in results:
|
|
|
- f.write(json.dumps(result) + "\n")
|
|
|
-
|
|
|
- self.logger.info(f"Saved {len(results)} results to {filepath}")
|
|
|
+ parsed = loads(response_text)
|
|
|
+ parsed_list = cast(List[Dict], parsed)
|
|
|
+ except Exception as e:
|
|
|
+ self.logger.error(f"Errror parsing response for chunk {chunk.chunk_id}")
|
|
|
+
|
|
|
+ if not parsed_list:
|
|
|
+ return []
|
|
|
+
|
|
|
+ responses = []
|
|
|
+ for result in parsed_list:
|
|
|
+ try:
|
|
|
+ responses.append(
|
|
|
+ {
|
|
|
+ "chunk_id": chunk.chunk_id,
|
|
|
+ "message_index": result.get("message_index", None),
|
|
|
+ "responsive": result.get("responsive", None),
|
|
|
+ "reason": result.get("reason", ""),
|
|
|
+ "criteria": result.get("criteria", []),
|
|
|
+ "confidence": result.get("confidence", "low"),
|
|
|
+ }
|
|
|
+ )
|
|
|
+ except Exception as e:
|
|
|
+ self.logger.error(
|
|
|
+ f"Error parsing response line: \n{e.with_traceback}\n{result}"
|
|
|
+ )
|
|
|
+ return responses
|
|
|
+
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
import argparse
|
|
|
-
|
|
|
+
|
|
|
parser = argparse.ArgumentParser(description="Run dual Qwen inference")
|
|
|
- parser.add_argument("requests_file", help="Path to inference requests JSONL")
|
|
|
- parser.add_argument("--qwen3-url", default="http://localhost:8000")
|
|
|
- parser.add_argument("--qwen25-url", default="http://localhost:8001")
|
|
|
+ parser.add_argument("batch_name")
|
|
|
+ parser.add_argument("--qwen3-url", default="http://localhost:8001")
|
|
|
+ parser.add_argument("--qwen25-url", default="http://localhost:8002")
|
|
|
parser.add_argument("--output-dir", default="./pipeline_output")
|
|
|
-
|
|
|
+
|
|
|
args = parser.parse_args()
|
|
|
-
|
|
|
- runner = InferenceRunner(args.qwen3_url, args.qwen25_url, args.output_dir)
|
|
|
- runner.run_inference(args.requests_file)
|
|
|
+
|
|
|
+ runner = InferenceRunner(
|
|
|
+ args.batch_name, args.qwen3_url, args.qwen25_url, args.output_dir
|
|
|
+ )
|
|
|
+ runner.run_inference()
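+
+    # example invocation (module path and ports are assumptions, adjust to your setup):
+    #   python -m pipeline.inference_runner my_batch --qwen3-url http://localhost:8001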
|