import json


def _contains_word_sequence(words, sub_words):
    """Return True if *sub_words* occurs as a contiguous run inside *words*.

    Only strictly shorter, non-empty sequences count: an empty ``sub_words``
    would trivially match at every position, and an equal-length sequence is
    the same phrase (word-wise), not a sub-phrase.
    """
    span = len(sub_words)
    if span == 0 or span >= len(words):
        return False
    return any(
        words[start:start + span] == sub_words
        for start in range(len(words) - span + 1)
    )


def filter_phrases(list1, list2):
    """Merge two phrase lists and drop phrases that contain a shorter phrase.

    The lists are concatenated and de-duplicated, keeping the order of first
    occurrence so the result is deterministic. A phrase is then discarded
    when some other phrase with fewer words appears inside it as a contiguous
    sequence of whole, whitespace-separated words (e.g. ``"machine learning"``
    is discarded if ``"learning"`` is present, but ``"machinelearning"`` is
    not).

    Args:
        list1: First list of phrase strings.
        list2: Second list of phrase strings.

    Returns:
        A new list with the surviving phrases, in first-occurrence order.
        Empty or whitespace-only phrases are never treated as a sub-phrase of
        anything, so they cannot cause other phrases to be removed; they are
        themselves passed through unchanged.
    """
    # dict.fromkeys de-duplicates while preserving insertion order; a plain
    # set() would reorder the output differently on every run.
    combined = list(dict.fromkeys(list1 + list2))

    # Tokenise every phrase once instead of re-splitting inside the O(n^2) scan.
    tokenised = [(phrase, phrase.split()) for phrase in combined]

    result = []
    for phrase, phrase_words in tokenised:
        has_sub_phrase = any(
            _contains_word_sequence(phrase_words, other_words)
            for _, other_words in tokenised
        )
        if not has_sub_phrase:
            result.append(phrase)
    return result


def main():
    """Prune the two pipeline keyword lists and write the merged result."""
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open("../pipeline_output/final_keyword_list_0.json", encoding="utf-8") as f:
        list_1 = json.load(f)
    with open("../pipeline_output/final_keyword_list.json", encoding="utf-8") as f:
        list_2 = json.load(f)

    pruned_list = filter_phrases(list_1, list_2)

    with open(
        "../pipeline_output/final_keyword_list_1.json", "w", encoding="utf-8"
    ) as f:
        json.dump(pruned_list, f, indent=2)


if __name__ == "__main__":
    main()