""" Usage examples for advanced features. """ # Example 1: Keyword Identification # ================================== from pipeline.steps.step0a_keyword_identification import KeywordIdentifier import pandas as pd # Load data df = pd.read_csv('signal_messages.csv') # Identify keywords identifier = KeywordIdentifier(min_frequency=5, max_keywords=100) categories = identifier.execute(df) print("Identified keywords:") for category, words in categories.items(): if words: print(f" {category}: {len(words)} keywords") print(f" Examples: {', '.join(words[:5])}") # Example 2: Normalization Analysis # ================================== from pipeline.steps.step0b_normalization_analysis import NormalizationAnalyzer # Analyze text patterns analyzer = NormalizationAnalyzer() suggestions = analyzer.execute(df) print("\nNormalization suggestions:") print(f" Abbreviations: {len(suggestions['abbreviations'])}") print(f" Acronyms: {len(suggestions['acronyms'])}") print(f" Misspellings: {len(suggestions['misspellings'])}") # Apply suggestions to common_defs.py print("\nSuggested additions to TEXT_EXPANSIONS in common_defs.py:") for abbrev, expansion in suggestions['abbreviations'].items(): print(f" '{abbrev}': '{expansion}',") # Example 3: Parallel Inference # ============================== from pipeline.utils.parallel_inference_runner import ParallelInferenceRunner # Run parallel inference (4x faster than sequential) runner = ParallelInferenceRunner( qwen3_url='http://localhost:8000', qwen25_url='http://localhost:8001', max_workers=4 # Adjust based on your system ) qwen3_file, qwen25_file = runner.run_inference( 'pipeline_output/dual_qwen_inference_requests.jsonl' ) print(f"\nResults saved to:") print(f" {qwen3_file}") print(f" {qwen25_file}") # Example 4: Complete Pipeline with Analysis # =========================================== from pipeline.main_pipeline import DiscoveryPipeline pipeline = DiscoveryPipeline('signal_messages.csv') # Step 0a: Identify keywords print("Step 0a: Identifying keywords...") df = pipeline.data_loader.execute() identifier = KeywordIdentifier() keywords = identifier.execute(df) # Step 0b: Analyze normalizations print("Step 0b: Analyzing normalizations...") analyzer = NormalizationAnalyzer() normalizations = analyzer.execute(df) # Continue with regular pipeline print("Running main pipeline...") results = pipeline.run_preprocessing() print("\nPipeline complete!") print(f" Keywords identified: {sum(len(v) for v in keywords.values())}") print(f" Normalizations suggested: {len(normalizations['abbreviations']) + len(normalizations['acronyms'])}") print(f" Chunks filtered: {len(results['semantic_filtered'])}")