step0a_keyword_identification.py 1.0 KB

12345678910111213141516171819202122232425262728293031
  1. """
  2. Step 0a: Identify relevant keywords from sample data.
  3. """
  4. import pandas as pd
  5. import json
  6. from step0a_semantic_keyword_identification import SemanticKeywordIdentifier
  7. from step0a_llm_keyword_identification import LLMKeywordIdentifier
  8. from pipeline.utils.combine_keywords import combine_keywords, analyze_overlap
  9. if __name__ == "__main__":
  10. df = pd.read_csv("../_sources/signal_messages.csv")
  11. ski = SemanticKeywordIdentifier()
  12. semantic_keywords = ski.execute(df=df)
  13. lki = LLMKeywordIdentifier(llm_url="http://eos.dgtlu.net:11434", sample_size=14000)
  14. llm_keywords = lki.execute(df=df)
  15. combined = combine_keywords(
  16. semantic_results=semantic_keywords, llm_results=llm_keywords
  17. )
  18. out_dir = ski.output_dir
  19. with open(f"{out_dir}/combined_keywords.json") as out_file:
  20. out_file.write(json.dumps(combined))
  21. overlap = analyze_overlap(
  22. semantic_results=semantic_keywords, llm_results=llm_keywords
  23. )
  24. with open(f"{out_dir}/keyword_overlap.json") as out_file:
  25. out_file.write(json.dumps(combined))