- """
- Usage examples for advanced features.
- """
- # Example 1: Keyword Identification
- # ==================================
- from pipeline.steps.step0a_keyword_identification import KeywordIdentifier
- import pandas as pd
- # Load data
- df = pd.read_csv('signal_messages.csv')
- # Identify keywords
- identifier = KeywordIdentifier(min_frequency=5, max_keywords=100)
- categories = identifier.execute(df)
- print("Identified keywords:")
- for category, words in categories.items():
- if words:
- print(f" {category}: {len(words)} keywords")
- print(f" Examples: {', '.join(words[:5])}")
# Example 2: Normalization Analysis
# ==================================
from pipeline.steps.step0b_normalization_analysis import NormalizationAnalyzer

# Analyze text patterns
analyzer = NormalizationAnalyzer()
suggestions = analyzer.execute(df)

print("\nNormalization suggestions:")
print(f"  Abbreviations: {len(suggestions['abbreviations'])}")
print(f"  Acronyms: {len(suggestions['acronyms'])}")
print(f"  Misspellings: {len(suggestions['misspellings'])}")

# Apply suggestions to common_defs.py
print("\nSuggested additions to TEXT_EXPANSIONS in common_defs.py:")
for abbrev, expansion in suggestions['abbreviations'].items():
    print(f"    '{abbrev}': '{expansion}',")
# Example 3: Parallel Inference
# ==============================
from pipeline.utils.parallel_inference_runner import ParallelInferenceRunner

# Run parallel inference (4x faster than sequential)
runner = ParallelInferenceRunner(
    qwen3_url='http://localhost:8000',
    qwen25_url='http://localhost:8001',
    max_workers=4,  # adjust based on your system
)

qwen3_file, qwen25_file = runner.run_inference(
    'pipeline_output/dual_qwen_inference_requests.jsonl'
)

print("\nResults saved to:")
print(f"  {qwen3_file}")
print(f"  {qwen25_file}")
# Example 4: Complete Pipeline with Analysis
# ===========================================
from pipeline.main_pipeline import DiscoveryPipeline

pipeline = DiscoveryPipeline('signal_messages.csv')

# Step 0a: Identify keywords
print("Step 0a: Identifying keywords...")
df = pipeline.data_loader.execute()
identifier = KeywordIdentifier()
keywords = identifier.execute(df)

# Step 0b: Analyze normalizations
print("Step 0b: Analyzing normalizations...")
analyzer = NormalizationAnalyzer()
normalizations = analyzer.execute(df)

# Continue with regular pipeline
print("Running main pipeline...")
results = pipeline.run_preprocessing()

print("\nPipeline complete!")
print(f"  Keywords identified: {sum(len(v) for v in keywords.values())}")
print(f"  Normalizations suggested: {len(normalizations['abbreviations']) + len(normalizations['acronyms'])}")
print(f"  Chunks filtered: {len(results['semantic_filtered'])}")