step0b_keyword_identification.py

  1. """
  2. Step 0a: Identify relevant keywords from sample data.
  3. """
  4. import pandas as pd
  5. import json
  6. from pipeline.steps.step0b1_semantic_keyword_identification import (
  7. SemanticKeywordIdentifier,
  8. )
  9. from pipeline.steps.step0b2_llm_keyword_identification import LLMKeywordIdentifier
  10. from pipeline.utils.combine_keywords import combine_keywords, analyze_overlap
  11. if __name__ == "__main__":
  12. df = pd.read_csv("../_sources/signal_messages.csv")
  13. ski = SemanticKeywordIdentifier()
  14. # semantic_keywords = ski.execute(df=df)
  15. # lki = LLMKeywordIdentifier(llm_url="http://localhost:8000", sample_size=14000)
  16. # llm_keywords = lki.execute(df=df)
  17. out_dir = ski.output_dir
  18. # with open(f"{out_dir}/semantic_keywords.json") as file:
  19. # semantic_keywords = json.load(file)
  20. # with open(f"{out_dir}/llm_keywords.json") as file:
  21. # llm_keywords = json.load(file)
  22. # combined = combine_keywords(
  23. # semantic_results=semantic_keywords, llm_results=llm_keywords
  24. # )
  25. # with open(f"{out_dir}/combined_keywords.json", "w") as out_file:
  26. # out_file.write(json.dumps(combined))
  27. # overlap = analyze_overlap(
  28. # semantic_results=semantic_keywords, llm_results=llm_keywords
  29. # )
  30. # with open(f"{out_dir}/keyword_overlap.json", "w") as out_file:
  31. # out_file.write(json.dumps(combined))
  32. with open(f"{out_dir}/combined_keywords.json", "r") as file:
  33. dict_all = json.load(file)
  34. merged_list = list(
  35. dict.fromkeys(item for sublist in dict_all.values() for item in sublist)
  36. )
  37. with open(f"{out_dir}/final_keyword_list.json", "w") as out_file:
  38. json.dump(merged_list, out_file)
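For reference, the final flatten-and-dedupe step relies on dict.fromkeys preserving insertion order (guaranteed in Python 3.7+). A minimal standalone sketch with hypothetical category keys and keywords; the real contents of combined_keywords.json come from the pipeline steps above:

# Hypothetical shape of combined_keywords.json: {category: [keyword, ...]}.
dict_all = {
    "semantic": ["invoice", "payment", "invoice"],
    "llm": ["payment", "deadline"],
}

# dict.fromkeys drops duplicates while keeping first-seen order.
merged_list = list(
    dict.fromkeys(item for sublist in dict_all.values() for item in sublist)
)
print(merged_list)  # ['invoice', 'payment', 'deadline']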