| 12345678910111213141516171819202122232425262728293031 |
- """
- Step 0a: Identify relevant keywords from sample data.
- """
- import pandas as pd
- import json
- from step0a_semantic_keyword_identification import SemanticKeywordIdentifier
- from step0a_llm_keyword_identification import LLMKeywordIdentifier
- from pipeline.utils.combine_keywords import combine_keywords, analyze_overlap
if __name__ == "__main__":
    # Script entry point: run both keyword identifiers over the same sample
    # data, then persist (1) the combined keyword set and (2) the overlap
    # analysis as JSON files in the semantic identifier's output directory.
    df = pd.read_csv("../_sources/signal_messages.csv")

    ski = SemanticKeywordIdentifier()
    semantic_keywords = ski.execute(df=df)

    lki = LLMKeywordIdentifier(llm_url="http://eos.dgtlu.net:11434", sample_size=14000)
    llm_keywords = lki.execute(df=df)

    # Both outputs are written next to the semantic identifier's own results.
    out_dir = ski.output_dir

    combined = combine_keywords(
        semantic_results=semantic_keywords, llm_results=llm_keywords
    )
    # Open explicitly for writing: the default mode is read-only, which makes
    # the write fail (or the open itself fail when the file does not exist).
    with open(f"{out_dir}/combined_keywords.json", "w", encoding="utf-8") as out_file:
        json.dump(combined, out_file)

    overlap = analyze_overlap(
        semantic_results=semantic_keywords, llm_results=llm_keywords
    )
    # Write the overlap analysis itself here, not the combined keywords again.
    with open(f"{out_dir}/keyword_overlap.json", "w", encoding="utf-8") as out_file:
        json.dump(overlap, out_file)
|