# justadri/disco
							import json


def _contains_shorter_phrase(words, known_sequences):
    """Return True if a strictly shorter contiguous run of *words* is known.

    Args:
        words: Tuple of the words making up one phrase.
        known_sequences: Set of word tuples, one per candidate phrase.
    """
    total = len(words)
    # Widths stop at total - 1: a phrase never disqualifies itself, nor is it
    # disqualified by another phrase with the same number of words.
    for width in range(1, total):
        for start in range(total - width + 1):
            if words[start : start + width] in known_sequences:
                return True
    return False


def filter_phrases(list1, list2):
    """Merge two phrase lists and drop phrases that contain a shorter one.

    A phrase is dropped when some other phrase with fewer words appears
    inside it as a contiguous run of whole, whitespace-separated words.
    Matching is on whole words only, so "art" does not match "smart home".

    Args:
        list1: First list of phrase strings.
        list2: Second list of phrase strings.

    Returns:
        A list of the surviving phrases, without duplicates, in the order
        they first appear in ``list1 + list2``.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is reproducible from run to run (a set's order is arbitrary).
    combined = list(dict.fromkeys(list1 + list2))

    # Split each phrase once; tuples are hashable and can live in a set.
    tokenized = {phrase: tuple(phrase.split()) for phrase in combined}

    # Blank phrases split into zero words. The empty sequence trivially
    # "occurs" inside every phrase, so it must never count as a match.
    known_sequences = {words for words in tokenized.values() if words}

    return [
        phrase
        for phrase in combined
        if not _contains_shorter_phrase(tokenized[phrase], known_sequences)
    ]


if __name__ == "__main__":
    # Script entry point: load two keyword lists, prune them with
    # filter_phrases, and write the result next to the inputs.
    #
    # The files are opened as UTF-8 explicitly; otherwise open() falls back to
    # the platform locale encoding, which can mis-decode non-ASCII keywords.
    # Paths are relative to the current working directory.
    with open(
        "../pipeline_output/final_keyword_list_0.json", encoding="utf-8"
    ) as f:
        list_1 = json.load(f)

    with open(
        "../pipeline_output/final_keyword_list.json", encoding="utf-8"
    ) as f:
        list_2 = json.load(f)

    pruned_list = filter_phrases(list_1, list_2)

    with open(
        "../pipeline_output/final_keyword_list_1.json", "w", encoding="utf-8"
    ) as f:
        json.dump(pruned_list, f, indent=2)