step0a_llm_keyword_identification.py
  1. """
  2. Step 0a (Alternative): LLM-based keyword identification.
  3. """
  4. from typing import List, Dict
  5. import pandas as pd
  6. import json
  7. import requests
  8. from pipeline.models.base import PipelineStep
  9. from pipeline.common_defs import SUBPOENA_CRITERIA
# Model identifier for the OpenAI-compatible LLM server: a Q4_K_S GGUF
# quantization of Qwen2.5-14B-Instruct pulled from Hugging Face
# (hf.co/... syntax as used by Ollama-style servers).
MODEL = "hf.co/bartowski/Qwen2.5-14B-Instruct-GGUF:Q4_K_S"
  11. class LLMKeywordIdentifier(PipelineStep):
  12. """Identify keywords using LLM analysis"""
  13. def __init__(self, llm_url: str = "http://localhost:8000",
  14. sample_size: int = 1000, output_dir: str = "./pipeline_output"):
  15. super().__init__(output_dir)
  16. self.llm_url = llm_url
  17. self.sample_size = sample_size
  18. def execute(self, df: pd.DataFrame) -> Dict[int, List[str]]:
  19. """Use LLM to identify relevant keywords"""
  20. self.logger.info("LLM-BASED KEYWORD IDENTIFICATION")
  21. sample_df = df.sample(n=min(self.sample_size, len(df)), random_state=42)
  22. keywords_by_criterion = {}
  23. for num, desc in SUBPOENA_CRITERIA.items():
  24. keywords = self._identify_keywords_for_criterion(sample_df, num, desc)
  25. keywords_by_criterion[num] = keywords
  26. self.logger.info(f"Criterion {num}: {len(keywords)} keywords")
  27. self._save_llm_keywords(keywords_by_criterion)
  28. return keywords_by_criterion
  29. def _identify_keywords_for_criterion(self, df, num, desc):
  30. """Use LLM to identify keywords"""
  31. all_keywords = []
  32. # Loop through df in chunks of 100 rows
  33. for i in range(0, len(df), 100):
  34. chunk = df.iloc[i : i + 100]
  35. messages_sample = "\n".join(chunk["message"].fillna("").tolist())
  36. prompt = f"Identify 30-50 keywords for: {desc}\n\nMessages:\n{messages_sample}\n\nJSON:"
  37. try:
  38. response = requests.post(
  39. f"{self.llm_url}/v1/chat/completions",
  40. json={"prompt": prompt, "max_tokens": 1000, "model": MODEL},
  41. timeout=120,
  42. )
  43. if response.status_code == 200:
  44. text = response.json()["choices"][0]["text"]
  45. parsed = json.loads(text)
  46. keywords = parsed.get("keywords", [])
  47. all_keywords.extend(keywords)
  48. except Exception as e:
  49. self.logger.error(f"Error processing chunk {i//100 + 1}: {e}")
  50. # Remove duplicates while preserving order
  51. seen = set()
  52. unique_keywords = []
  53. for keyword in all_keywords:
  54. if keyword not in seen:
  55. seen.add(keyword)
  56. unique_keywords.append(keyword)
  57. return unique_keywords
  58. def _save_llm_keywords(self, keywords_by_criterion):
  59. """Save LLM keywords"""
  60. results = {"method": "llm_analysis", "criteria": {str(n): {"keywords": k} for n, k in keywords_by_criterion.items()}}
  61. self.save_results(results, "llm_keywords.json")
  62. self.logger.info("Saved LLM keywords")