- """
- Step 0a: Identify relevant keywords from sample data.
- """
- import pandas as pd
- import json
- from pipeline.steps.step0b1_semantic_keyword_identification import (
- SemanticKeywordIdentifier,
- )
- from pipeline.steps.step0b2_llm_keyword_identification import LLMKeywordIdentifier
- from pipeline.utils.combine_keywords import combine_keywords, analyze_overlap
- if __name__ == "__main__":
- df = pd.read_csv("../_sources/signal_messages.csv")
- ski = SemanticKeywordIdentifier()
- # semantic_keywords = ski.execute(df=df)
- # lki = LLMKeywordIdentifier(llm_url="http://localhost:8000", sample_size=14000)
- # llm_keywords = lki.execute(df=df)
- out_dir = ski.output_dir
- # with open(f"{out_dir}/semantic_keywords.json") as file:
- # semantic_keywords = json.load(file)
- # with open(f"{out_dir}/llm_keywords.json") as file:
- # llm_keywords = json.load(file)
- # combined = combine_keywords(
- # semantic_results=semantic_keywords, llm_results=llm_keywords
- # )
- # with open(f"{out_dir}/combined_keywords.json", "w") as out_file:
- # out_file.write(json.dumps(combined))
- # overlap = analyze_overlap(
- # semantic_results=semantic_keywords, llm_results=llm_keywords
- # )
- # with open(f"{out_dir}/keyword_overlap.json", "w") as out_file:
- # out_file.write(json.dumps(combined))
- with open(f"{out_dir}/combined_keywords.json", "r") as file:
- dict_all = json.load(file)
- merged_list = list(
- dict.fromkeys(item for sublist in dict_all.values() for item in sublist)
- )
- with open(f"{out_dir}/final_keyword_list.json", "w") as out_file:
- json.dump(merged_list, out_file)