step0b_keyword_identification.py

  1. """
  2. Step 0a: Identify relevant keywords from sample data.
  3. """
  4. import pandas as pd
  5. import json
  6. from pipeline.steps.step0b1_semantic_keyword_identification import (
  7. SemanticKeywordIdentifier,
  8. )
  9. from pipeline.steps.step0b2_llm_keyword_identification import LLMKeywordIdentifier
  10. from pipeline.utils.combine_keywords import combine_keywords, analyze_overlap
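
# Run the full keyword-identification step when this file is executed as a script.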
if __name__ == "__main__":
    df = pd.read_csv("../_sources/signal_messages.csv")

    ski = SemanticKeywordIdentifier()
    semantic_keywords = ski.execute(df=df)
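
    # The LLM identifier calls an LLM server over HTTP; port 11434 is the
    # default for an Ollama instance, which this endpoint appears to be.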
    lki = LLMKeywordIdentifier(llm_url="http://eos.dgtlu.net:11434", sample_size=14000)
    llm_keywords = lki.execute(df=df)
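
    # Merge the semantic and LLM keyword sets and write the combined list
    # to the step's output directory.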
    combined = combine_keywords(
        semantic_results=semantic_keywords, llm_results=llm_keywords
    )
    out_dir = ski.output_dir
    with open(f"{out_dir}/combined_keywords.json", "w") as out_file:
        out_file.write(json.dumps(combined))
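
    # Report how much the two keyword sets agree and save the result alongside them.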
    overlap = analyze_overlap(
        semantic_results=semantic_keywords, llm_results=llm_keywords
    )
    with open(f"{out_dir}/keyword_overlap.json", "w") as out_file:
        out_file.write(json.dumps(overlap))