# step0b_keyword_identification.py
  1. """
  2. Step 0a: Identify relevant keywords from sample data.
  3. """
  4. import pandas as pd
  5. import json
  6. from pipeline.steps.step0b1_semantic_keyword_identification import (
  7. SemanticKeywordIdentifier,
  8. )
  9. from pipeline.steps.step0b2_llm_keyword_identification import LLMKeywordIdentifier
  10. from pipeline.utils.combine_keywords import combine_keywords, analyze_overlap
  11. if __name__ == "__main__":
  12. df = pd.read_csv("../_sources/signal_messages.csv")
  13. ski = SemanticKeywordIdentifier()
  14. semantic_keywords = ski.execute(df=df)
  15. lki = LLMKeywordIdentifier(
  16. llm_url="http://localhost:8000",
  17. sample_size=14000,
  18. model="Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8",
  19. )
  20. llm_keywords = lki.execute(df=df)
  21. out_dir = ski.output_dir
  22. with open(f"{out_dir}/semantic_keywords.json") as file:
  23. semantic_keywords = json.load(file)
  24. with open(f"{out_dir}/llm_keywords.json") as file:
  25. llm_keywords = json.load(file)
  26. combined = combine_keywords(
  27. semantic_results=semantic_keywords, llm_results=llm_keywords
  28. )
  29. with open(f"{out_dir}/combined_keywords.json", "w") as out_file:
  30. out_file.write(json.dumps(combined))
  31. overlap = analyze_overlap(
  32. semantic_results=semantic_keywords, llm_results=llm_keywords
  33. )
  34. with open(f"{out_dir}/keyword_overlap.json", "w") as out_file:
  35. out_file.write(json.dumps(combined))
  36. with open(f"{out_dir}/combined_keywords.json", "r") as file:
  37. dict_all = json.load(file)
  38. merged_list = list(
  39. dict.fromkeys(item for sublist in dict_all.values() for item in sublist)
  40. )
  41. with open(f"{out_dir}/final_keyword_list.json", "w") as out_file:
  42. json.dump(merged_list, out_file)