# ADVANCED_EXAMPLES.py
  1. """
  2. Usage examples for advanced features.
  3. """
  4. # Example 1: Keyword Identification
  5. # ==================================
  6. from pipeline.steps.step0a_keyword_identification import KeywordIdentifier
  7. import pandas as pd
  8. # Load data
  9. df = pd.read_csv('signal_messages.csv')
  10. # Identify keywords
  11. identifier = KeywordIdentifier(min_frequency=5, max_keywords=100)
  12. categories = identifier.execute(df)
  13. print("Identified keywords:")
  14. for category, words in categories.items():
  15. if words:
  16. print(f" {category}: {len(words)} keywords")
  17. print(f" Examples: {', '.join(words[:5])}")
  18. # Example 2: Normalization Analysis
  19. # ==================================
  20. from pipeline.steps.step0b_normalization_analysis import NormalizationAnalyzer
  21. # Analyze text patterns
  22. analyzer = NormalizationAnalyzer()
  23. suggestions = analyzer.execute(df)
  24. print("\nNormalization suggestions:")
  25. print(f" Abbreviations: {len(suggestions['abbreviations'])}")
  26. print(f" Acronyms: {len(suggestions['acronyms'])}")
  27. print(f" Misspellings: {len(suggestions['misspellings'])}")
  28. # Apply suggestions to common_defs.py
  29. print("\nSuggested additions to TEXT_EXPANSIONS in common_defs.py:")
  30. for abbrev, expansion in suggestions['abbreviations'].items():
  31. print(f" '{abbrev}': '{expansion}',")
  32. # Example 3: Parallel Inference
  33. # ==============================
  34. from pipeline.utils.parallel_inference_runner import ParallelInferenceRunner
  35. # Run parallel inference (4x faster than sequential)
  36. runner = ParallelInferenceRunner(
  37. qwen3_url='http://localhost:8000',
  38. qwen25_url='http://localhost:8001',
  39. max_workers=4 # Adjust based on your system
  40. )
  41. qwen3_file, qwen25_file = runner.run_inference(
  42. 'pipeline_output/dual_qwen_inference_requests.jsonl'
  43. )
  44. print(f"\nResults saved to:")
  45. print(f" {qwen3_file}")
  46. print(f" {qwen25_file}")
  47. # Example 4: Complete Pipeline with Analysis
  48. # ===========================================
  49. from pipeline.main_pipeline import DiscoveryPipeline
  50. pipeline = DiscoveryPipeline('signal_messages.csv')
  51. # Step 0a: Identify keywords
  52. print("Step 0a: Identifying keywords...")
  53. df = pipeline.data_loader.execute()
  54. identifier = KeywordIdentifier()
  55. keywords = identifier.execute(df)
  56. # Step 0b: Analyze normalizations
  57. print("Step 0b: Analyzing normalizations...")
  58. analyzer = NormalizationAnalyzer()
  59. normalizations = analyzer.execute(df)
  60. # Continue with regular pipeline
  61. print("Running main pipeline...")
  62. results = pipeline.run_preprocessing()
  63. print("\nPipeline complete!")
  64. print(f" Keywords identified: {sum(len(v) for v in keywords.values())}")
  65. print(f" Normalizations suggested: {len(normalizations['abbreviations']) + len(normalizations['acronyms'])}")
  66. print(f" Chunks filtered: {len(results['semantic_filtered'])}")