| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455 |
"""
Step 0b: identify relevant keywords from the sampled messages.

Runs the semantic (step 0b1) and LLM (step 0b2) keyword identifiers,
combines and compares their results, and writes a flattened, de-duplicated
final keyword list.
"""
- import pandas as pd
- import json
- from pipeline.steps.step0b1_semantic_keyword_identification import (
- SemanticKeywordIdentifier,
- )
- from pipeline.steps.step0b2_llm_keyword_identification import LLMKeywordIdentifier
- from pipeline.utils.combine_keywords import combine_keywords, analyze_overlap
if __name__ == "__main__":
    # Raw message sample to mine for keywords.
    df = pd.read_csv("../_sources/signal_messages.csv")

    # Run both identification strategies. Each identifier appears to persist
    # its results as JSON under its output directory (they are re-read from
    # disk below) — TODO confirm against the step implementations.
    ski = SemanticKeywordIdentifier()
    semantic_keywords = ski.execute(df=df)

    lki = LLMKeywordIdentifier(
        llm_url="http://localhost:8000",
        sample_size=14000,
        model="Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8",
    )
    llm_keywords = lki.execute(df=df)

    # Reload both result sets from the persisted artifacts so downstream
    # steps operate on exactly what was written to disk.
    out_dir = ski.output_dir
    with open(f"{out_dir}/semantic_keywords.json") as file:
        semantic_keywords = json.load(file)
    with open(f"{out_dir}/llm_keywords.json") as file:
        llm_keywords = json.load(file)

    # Merge the two keyword sets into one structure and persist it.
    combined = combine_keywords(
        semantic_results=semantic_keywords, llm_results=llm_keywords
    )
    with open(f"{out_dir}/combined_keywords.json", "w") as out_file:
        json.dump(combined, out_file)

    # Analyze agreement between the two strategies and persist it.
    # BUG FIX: previously `combined` was written here, silently discarding
    # the overlap analysis and duplicating combined_keywords.json.
    overlap = analyze_overlap(
        semantic_results=semantic_keywords, llm_results=llm_keywords
    )
    with open(f"{out_dir}/keyword_overlap.json", "w") as out_file:
        json.dump(overlap, out_file)

    # Flatten every per-category keyword list into a single de-duplicated
    # list; dict.fromkeys preserves first-seen order while removing dupes.
    with open(f"{out_dir}/combined_keywords.json") as file:
        dict_all = json.load(file)
    merged_list = list(
        dict.fromkeys(item for sublist in dict_all.values() for item in sublist)
    )
    with open(f"{out_dir}/final_keyword_list.json", "w") as out_file:
        json.dump(merged_list, out_file)
|