# keyword_pruner.py

import json
  2. def filter_phrases(list1, list2):
  3. # Combine lists and remove duplicates
  4. combined = list(set(list1 + list2))
  5. # Filter out phrases that contain other phrases as discrete words
  6. result = []
  7. for phrase in combined:
  8. # Split phrase into words
  9. phrase_words = phrase.split()
  10. # Check if any other phrase is a discrete subset
  11. is_subset = False
  12. for other_phrase in combined:
  13. if phrase == other_phrase:
  14. continue
  15. other_words = other_phrase.split()
  16. # Check if other_phrase matches a contiguous sequence in phrase
  17. if len(other_words) < len(phrase_words):
  18. for i in range(len(phrase_words) - len(other_words) + 1):
  19. if phrase_words[i : i + len(other_words)] == other_words:
  20. is_subset = True
  21. break
  22. if is_subset:
  23. break
  24. if not is_subset:
  25. result.append(phrase)
  26. return result
  27. if __name__ == "__main__":
  28. with open("../pipeline_output/final_keyword_list_0.json") as f:
  29. list_1 = json.load(f)
  30. with open("../pipeline_output/final_keyword_list.json") as f:
  31. list_2 = json.load(f)
  32. pruned_list = filter_phrases(list_1, list_2)
  33. with open("../pipeline_output/final_keyword_list_1.json", "w") as f:
  34. json.dump(pruned_list, f, indent=2)