| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748 |
- import json
def filter_phrases(list1, list2):
    """Merge two phrase lists and drop phrases that contain a shorter one.

    A phrase is dropped when some other phrase in the merged collection,
    made of fewer words, appears inside it as a contiguous run of whole
    words. Words are obtained with ``str.split()`` (any whitespace), and the
    comparison is exact and case-sensitive, so ``"learn"`` does not match
    inside ``"learning rate"``.

    Args:
        list1: First collection of phrase strings.
        list2: Second collection of phrase strings.

    Returns:
        A new list with the surviving phrases, de-duplicated, in the order
        they were first seen (``list1`` entries before ``list2`` entries).
        Empty or whitespace-only phrases are passed through unchanged but
        are never used to prune other phrases.
    """
    # dict.fromkeys de-duplicates while preserving first-seen order, so the
    # output is reproducible from run to run (iteration order of a set of
    # strings is not).
    combined = list(dict.fromkeys([*list1, *list2]))

    # Word sequences of every known phrase, for O(1) window lookups.
    known_sequences = {tuple(phrase.split()) for phrase in combined}
    # A phrase with no words would "match" at every position of every other
    # phrase and wipe out the whole result, so it must not act as a matcher.
    known_sequences.discard(())

    def contains_shorter_phrase(words):
        # Only windows strictly shorter than the phrase itself are checked,
        # which also guarantees a phrase never matches against itself.
        total = len(words)
        return any(
            tuple(words[start:start + size]) in known_sequences
            for size in range(1, total)
            for start in range(total - size + 1)
        )

    return [
        phrase
        for phrase in combined
        if not contains_shorter_phrase(phrase.split())
    ]
if __name__ == "__main__":
    # Script entry point: load two keyword lists, merge and prune them with
    # filter_phrases, and write the result to a third file. Paths are
    # relative to the current working directory.
    #
    # The files are opened explicitly as UTF-8 so that non-ASCII keywords
    # decode the same way on every platform instead of depending on the
    # locale's default encoding.
    with open(
        "../pipeline_output/final_keyword_list_0.json", encoding="utf-8"
    ) as f:
        list_1 = json.load(f)
    with open(
        "../pipeline_output/final_keyword_list.json", encoding="utf-8"
    ) as f:
        list_2 = json.load(f)

    pruned_list = filter_phrases(list_1, list_2)

    with open(
        "../pipeline_output/final_keyword_list_1.json", "w", encoding="utf-8"
    ) as f:
        json.dump(pruned_list, f, indent=2)
|