# step0b_keyword_identification.py
  1. """
  2. Step 0a: Identify relevant keywords from sample data.
  3. """
  4. import pandas as pd
  5. import json
  6. from pipeline.steps.step0b1_semantic_keyword_identification import (
  7. SemanticKeywordIdentifier,
  8. )
  9. from pipeline.steps.step0b2_llm_keyword_identification import LLMKeywordIdentifier
  10. from pipeline.utils.combine_keywords import combine_keywords, analyze_overlap
  11. if __name__ == "__main__":
  12. df = pd.read_csv("../_sources/signal_messages.csv")
  13. ski = SemanticKeywordIdentifier()
  14. semantic_keywords = ski.execute(df=df)
  15. lki = LLMKeywordIdentifier(
  16. llm_url="http://localhost:8000",
  17. sample_size=14000,
  18. model="Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8",
  19. )
  20. llm_keywords = lki.execute(df=df)
  21. out_dir = ski.output_dir
  22. with open(f"{out_dir}/semantic_keywords.json") as file:
  23. semantic_keywords = json.load(file)
  24. with open(f"{out_dir}/llm_keywords.json") as file:
  25. llm_keywords = json.load(file)
  26. combined = combine_keywords(
  27. semantic_results=semantic_keywords, llm_results=llm_keywords
  28. )
  29. with open(f"{out_dir}/combined_keywords.json", "w") as out_file:
  30. out_file.write(json.dumps(combined))
  31. overlap = analyze_overlap(
  32. semantic_results=semantic_keywords, llm_results=llm_keywords
  33. )
  34. with open(f"{out_dir}/keyword_overlap.json", "w") as out_file:
  35. out_file.write(json.dumps(combined))
  36. with open(f"{out_dir}/combined_keywords.json", "r") as file:
  37. dict_all = json.load(file)
  38. merged_list = list(
  39. dict.fromkeys(item for sublist in dict_all.values() for item in sublist)
  40. )
  41. with open(f"{out_dir}/final_keyword_list.json", "w") as out_file:
  42. json.dump(merged_list, out_file)