"""
Step 0a: Identify relevant keywords from sample data.

Runs the semantic and the LLM keyword identifiers over the same message
DataFrame, then writes two JSON files into the semantic identifier's
output directory:

* ``combined_keywords.json`` -- result of ``combine_keywords``.
* ``keyword_overlap.json``   -- result of ``analyze_overlap``.
"""

import json

import pandas as pd

from step0a_semantic_keyword_identification import SemanticKeywordIdentifier
from step0a_llm_keyword_identification import LLMKeywordIdentifier
from pipeline.utils.combine_keywords import combine_keywords, analyze_overlap


if __name__ == "__main__":
    # The path is relative to the current working directory, so the script
    # must be launched from the directory it was written for.
    df = pd.read_csv("../_sources/signal_messages.csv")

    ski = SemanticKeywordIdentifier()
    semantic_keywords = ski.execute(df=df)

    lki = LLMKeywordIdentifier(
        llm_url="http://eos.dgtlu.net:11434",
        sample_size=14000,
    )
    llm_keywords = lki.execute(df=df)

    # Both outputs go next to the semantic identifier's own artifacts.
    # NOTE(review): assumes ski.output_dir already exists on disk -- confirm.
    out_dir = ski.output_dir

    combined = combine_keywords(
        semantic_results=semantic_keywords,
        llm_results=llm_keywords,
    )
    # Mode "w" is required: the default mode is read-only, which makes
    # write() fail (or open() fail when the file does not exist yet).
    with open(
        f"{out_dir}/combined_keywords.json", "w", encoding="utf-8"
    ) as out_file:
        out_file.write(json.dumps(combined))

    overlap = analyze_overlap(
        semantic_results=semantic_keywords,
        llm_results=llm_keywords,
    )
    # Write the overlap analysis itself, not the combined keywords again.
    with open(
        f"{out_dir}/keyword_overlap.json", "w", encoding="utf-8"
    ) as out_file:
        out_file.write(json.dumps(overlap))