""" Step 0a: Identify relevant keywords from sample data. """ import pandas as pd import json from pipeline.steps.step0b1_semantic_keyword_identification import ( SemanticKeywordIdentifier, ) from pipeline.steps.step0b2_llm_keyword_identification import LLMKeywordIdentifier from pipeline.utils.combine_keywords import combine_keywords, analyze_overlap if __name__ == "__main__": df = pd.read_csv("../_sources/signal_messages.csv") ski = SemanticKeywordIdentifier() # semantic_keywords = ski.execute(df=df) # lki = LLMKeywordIdentifier(llm_url="http://localhost:8000", sample_size=14000) # llm_keywords = lki.execute(df=df) out_dir = ski.output_dir # with open(f"{out_dir}/semantic_keywords.json") as file: # semantic_keywords = json.load(file) # with open(f"{out_dir}/llm_keywords.json") as file: # llm_keywords = json.load(file) # combined = combine_keywords( # semantic_results=semantic_keywords, llm_results=llm_keywords # ) # with open(f"{out_dir}/combined_keywords.json", "w") as out_file: # out_file.write(json.dumps(combined)) # overlap = analyze_overlap( # semantic_results=semantic_keywords, llm_results=llm_keywords # ) # with open(f"{out_dir}/keyword_overlap.json", "w") as out_file: # out_file.write(json.dumps(combined)) with open(f"{out_dir}/combined_keywords.json", "r") as file: dict_all = json.load(file) merged_list = list( dict.fromkeys(item for sublist in dict_all.values() for item in sublist) ) with open(f"{out_dir}/final_keyword_list.json", "w") as out_file: json.dump(merged_list, out_file)