feat: implement unified configuration loading and logging setup across entry points

This commit is contained in:
2025-08-06 22:17:02 +02:00
parent d7aa24a935
commit 9338d6eab8
11 changed files with 263 additions and 128 deletions
+3 -3
View File
@@ -1,7 +1,7 @@
#!.venv/bin/python3
import streamlit as st
from core.config import get_config
from core.config import setup_config_and_logging
from core.utils.data_loader import DataLoader
from interface.configuration import Configuration
from interface.dashboard import Dashboard
@@ -25,8 +25,8 @@ st.set_page_config(
@st.cache_data
def load_config():
    """Load application configuration with unified setup

    Delegates to ``setup_config_and_logging`` with the "development"
    environment, so the config is loaded and logging configured through the
    same helper as the other entry points. The result is wrapped by
    ``st.cache_data``.

    Returns:
        The configuration object returned by ``setup_config_and_logging``.
    """
    # The superseded ``return get_config()`` is removed; it made the unified
    # setup call below unreachable.
    return setup_config_and_logging(env="development")
class StreamlitApp:
+19 -11
View File
@@ -6,7 +6,7 @@ from pathlib import Path
import pandas as pd
from core.config import get_config, setup_logging
from core.config import setup_config_and_logging
from research.experiment.experiment_runner import ExperimentRunner
from research.experiment.experiment_tracker import ExperimentTracker
@@ -104,7 +104,8 @@ def show_experiment_details(args):
def compare_experiments_cmd(args):
"""Compare multiple experiments"""
runner = ExperimentRunner(get_config())
config = setup_config_and_logging(env="development")
runner = ExperimentRunner(config)
comparison = runner.compare_experiments(args.experiment_ids)
if comparison.empty:
@@ -130,14 +131,20 @@ def export_results(args):
def main():
"""Main CLI entry point"""
"""Main CLI entry point with unified configuration loading"""
parser = argparse.ArgumentParser(
description="DRC Names Research Experiment Manager",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
# Setup logging
# Global arguments
parser.add_argument("--config", type=Path, help="Path to configuration file")
parser.add_argument(
"--env", type=str, default="development",
help="Environment name (default: development)"
)
parser.add_argument("--verbose", "-v", action="store_true", help="Verbose logging")
subparsers = parser.add_subparsers(dest="command", help="Available commands")
# List experiments
@@ -163,14 +170,15 @@ def main():
parser.print_help()
return 1
# Setup logging
config = get_config()
if args.verbose:
config.logging.level = "DEBUG"
setup_logging(config)
# Execute command
try:
# Load configuration and setup logging
config = setup_config_and_logging(config_path=args.config, env=args.env)
# Override log level if verbose requested
if args.verbose:
logging.getLogger().setLevel(logging.DEBUG)
# Execute command
command_map = {
"list": list_experiments,
"show": show_experiment_details,
+3 -1
View File
@@ -31,12 +31,14 @@ llm:
max_concurrent_requests: 4
enable_rate_limiting: true
# Production data settings
# Development data settings - limited dataset for faster testing
data:
split_evaluation: true
split_by_gender: true
evaluation_fraction: 0.2
random_seed: 42
max_dataset_size: 10_000 # Limit to 10k records for development/testing
balance_by_sex: true # Balance male/female samples when limiting
# Enhanced logging for development
logging:
+2
View File
@@ -37,6 +37,8 @@ data:
split_by_gender: true
evaluation_fraction: 0.2
random_seed: 42
max_dataset_size: null
balance_by_sex: false
# Production logging (less verbose)
logging:
+2
View File
@@ -58,6 +58,8 @@ data:
split_by_gender: true
evaluation_fraction: 0.2
random_seed: 42
max_dataset_size: null
balance_by_sex: false
# Logging configuration
logging:
+35
View File
@@ -21,6 +21,41 @@ def load_config(config_path: Optional[Union[str, Path]] = None) -> PipelineConfi
return config_manager.get_config()
def setup_config_and_logging(
    config_path: Optional[Union[str, Path]] = None,
    env: str = "development",
) -> PipelineConfig:
    """
    Unified configuration loading and logging setup for all entrypoint scripts.

    Steps performed, in order: resolve the config file path, load it through
    ``ConfigManager``, configure logging via ``setup_logging``, call
    ``ensure_directories`` with the loaded config, then log a short summary.

    Args:
        config_path: Direct path to config file (takes precedence over env).
            Accepts ``str`` or ``Path``, matching ``load_config`` and the
            entry points that parse ``--config`` as a plain string.
        env: Environment name (defaults to "development"). Only used when
            ``config_path`` is None, to build ``config/pipeline.{env}.yaml``.

    Returns:
        Loaded configuration object
    """
    # Determine config path: explicit path wins, otherwise derive from env
    if config_path is None:
        config_path = Path("config") / f"pipeline.{env}.yaml"

    # Load configuration
    config = ConfigManager(config_path).load_config()

    # Setup logging before emitting any of the messages below
    setup_logging(config)

    # Ensure required directories exist.
    # NOTE(review): imported locally, presumably to avoid a circular import
    # between core.config and core.utils - confirm before hoisting.
    from core.utils import ensure_directories

    ensure_directories(config)

    # Lazy %-style arguments: formatting is skipped when INFO is disabled
    logging.info("Loaded configuration: %s v%s", config.name, config.version)
    logging.info("Environment: %s", config.environment)
    logging.info("Config file: %s", config_path)

    return config
def setup_logging(config: PipelineConfig):
"""Setup logging based on configuration"""
+5 -1
View File
@@ -1,5 +1,5 @@
from dataclasses import field
from typing import Dict
from typing import Dict, Optional
from pydantic import BaseModel
@@ -20,3 +20,7 @@ class DataConfig(BaseModel):
split_by_gender: bool = True
evaluation_fraction: float = 0.2
random_seed: int = 42
# Dataset size limiting options
max_dataset_size: Optional[int] = None
balance_by_sex: bool = False
+64 -2
View File
@@ -44,9 +44,71 @@ class DataLoader:
raise ValueError(f"Unable to decode {filepath} with any encoding: {encodings}")
def load_csv_complete(self, filepath: Union[str, Path]) -> pd.DataFrame:
"""Load complete CSV file into memory"""
"""Load complete CSV file into memory with size limiting and balancing"""
chunks = list(self.load_csv_chunked(filepath))
return pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()
if not chunks:
return pd.DataFrame()
df = pd.concat(chunks, ignore_index=True)
# Apply dataset size limiting if configured
if self.config.data.max_dataset_size is not None:
df = self._limit_dataset_size(df)
return df
def _limit_dataset_size(self, df: pd.DataFrame) -> pd.DataFrame:
"""Limit dataset size with optional sex balancing"""
max_size = self.config.data.max_dataset_size
if max_size is None or len(df) <= max_size:
return df
if self.config.data.balance_by_sex and "sex" in df.columns:
return self._balanced_sample(df, max_size)
else:
# Simple random sampling
return df.sample(n=max_size, random_state=self.config.data.random_seed)
def _balanced_sample(self, df: pd.DataFrame, max_size: int) -> pd.DataFrame:
"""Sample data with balanced sex distribution"""
# Get unique sex values
sex_values = df["sex"].dropna().unique()
if len(sex_values) == 0:
logging.warning(f"No valid values found in sex column 'sex', using random sampling")
return df.sample(n=max_size, random_state=self.config.data.random_seed)
# Calculate samples per sex category
samples_per_sex = max_size // len(sex_values)
remaining_samples = max_size % len(sex_values)
balanced_samples = []
for i, sex in enumerate(sex_values):
sex_df = df[df["sex"] == sex]
# Distribute remaining samples to first categories
current_samples = samples_per_sex + (1 if i < remaining_samples else 0)
current_samples = min(current_samples, len(sex_df))
if current_samples > 0:
sample = sex_df.sample(n=current_samples, random_state=self.config.data.random_seed + i)
balanced_samples.append(sample)
logging.info(f"Sampled {current_samples} records for sex '{sex}'")
if not balanced_samples:
logging.warning("No balanced samples could be created, using random sampling")
return df.sample(n=max_size, random_state=self.config.data.random_seed)
result = pd.concat(balanced_samples, ignore_index=True)
# Shuffle the final result
result = result.sample(frac=1, random_state=self.config.data.random_seed).reset_index(drop=True)
logging.info(f"Created balanced dataset with {len(result)} records from {len(df)} total records")
return result
@classmethod
def save_csv(
+24 -36
View File
@@ -3,11 +3,10 @@ import sys
import argparse
import logging
from pathlib import Path
from typing import Optional
from core.utils.data_loader import DataLoader
from core.config import ConfigManager, setup_logging
from core.utils import ensure_directories, get_data_file_path
from core.config import setup_config_and_logging
from core.utils import get_data_file_path
from processing.pipeline import Pipeline
from processing.batch.batch_config import BatchConfig
@@ -17,13 +16,8 @@ from processing.steps.feature_extraction_step import FeatureExtractionStep
from processing.steps.data_cleaning_step import DataCleaningStep
def create_pipeline_from_config(config_path: Optional[Path] = None) -> Pipeline:
"""Create pipeline from configuration file"""
config = ConfigManager(config_path).load_config()
# Setup logging
setup_logging(config)
ensure_directories(config)
def create_pipeline_from_config(config) -> Pipeline:
"""Create pipeline from configuration"""
batch_config = BatchConfig(
batch_size=config.processing.batch_size,
max_workers=config.processing.max_workers,
@@ -48,13 +42,10 @@ def create_pipeline_from_config(config_path: Optional[Path] = None) -> Pipeline:
return pipeline
def run_pipeline(config_path: Optional[Path] = None, resume: bool = False) -> int:
def run_pipeline(config, resume: bool = False) -> int:
"""Run the complete pipeline"""
try:
config = ConfigManager(config_path).load_config()
logging.info(f"Starting pipeline: {config.name} v{config.version}")
logging.info(f"Environment: {config.environment}")
# Load input data
input_file_path = get_data_file_path(config.data.input_file, config)
@@ -69,7 +60,7 @@ def run_pipeline(config_path: Optional[Path] = None, resume: bool = False) -> in
logging.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")
# Create and run pipeline
pipeline = create_pipeline_from_config(config_path)
pipeline = create_pipeline_from_config(config)
logging.info("Starting pipeline execution")
result_df = pipeline.run(df)
@@ -99,27 +90,28 @@ def run_pipeline(config_path: Optional[Path] = None, resume: bool = False) -> in
def main():
"""Main entry point with minimal command-line interface"""
"""Main entry point with unified configuration loading"""
parser = argparse.ArgumentParser(
description="DRC Names Processing Pipeline",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Configuration File Examples:
config/pipeline.yaml - Main configuration
config/pipeline.development.yaml - Development environment
config/pipeline.development.yaml - Development environment (default)
config/pipeline.production.yaml - Production environment
Usage Examples:
python processing/main.py # Use default config
python processing/main.py --config config/pipeline.yaml # Use specific config
python processing/main.py --env development # Use environment config
python processing/main.py --resume # Resume from checkpoints
python main.py # Use development config (default)
python main.py --config config/pipeline.yaml # Use specific config
python main.py --env production # Use production environment
python main.py --resume # Resume from checkpoints
""",
)
parser.add_argument("--config", type=Path, help="Path to configuration file")
parser.add_argument(
"--env", type=str, help="Environment name (loads config/pipeline.{env}.yaml)"
"--env", type=str, default="development",
help="Environment name (default: development)"
)
parser.add_argument(
"--resume", action="store_true", help="Resume pipeline from existing checkpoints"
@@ -129,24 +121,20 @@ Usage Examples:
)
args = parser.parse_args()
# Determine config path
config_path = None
if args.config:
config_path = args.config
elif args.env:
config_path = Path("config") / f"pipeline.{args.env}.yaml"
try:
# Load configuration and setup logging
config = setup_config_and_logging(config_path=args.config, env=args.env)
if args.validate_config:
try:
config = ConfigManager(config_path).load_config()
if args.validate_config:
print(f"Configuration is valid: {config.name} v{config.version}")
return 0
except Exception as e:
print(f"Configuration validation failed: {e}")
return 1
# Run pipeline
return run_pipeline(config_path, args.resume)
# Run pipeline
return run_pipeline(config, args.resume)
except Exception as e:
print(f"Configuration or pipeline failed: {e}")
return 1
if __name__ == "__main__":
+79 -64
View File
@@ -1,8 +1,9 @@
#!.venv/bin/python3
import argparse
import sys
from pathlib import Path
from core.config.config_manager import ConfigManager
from core.config import setup_config_and_logging
from processing.monitoring.data_analyzer import DatasetAnalyzer
from processing.monitoring.pipeline_monitor import PipelineMonitor
@@ -11,6 +12,12 @@ def main():
parser = argparse.ArgumentParser(
description="Monitor and manage the DRC names processing pipeline"
)
parser.add_argument("--config", type=Path, help="Path to configuration file")
parser.add_argument(
"--env", type=str, default="development",
help="Environment name (default: development)"
)
subparsers = parser.add_subparsers(dest="command", help="Available commands")
# Status command
@@ -62,80 +69,88 @@ def main():
parser.print_help()
return 1
monitor = PipelineMonitor()
try:
# Load configuration and setup logging
config = setup_config_and_logging(config_path=args.config, env=args.env)
if args.command == "status":
monitor.print_status(detailed=args.detailed)
monitor = PipelineMonitor()
elif args.command == "clean":
checkpoint_info = monitor.count_checkpoint_files()
print(f"Current checkpoint storage: {checkpoint_info['total_size_mb']:.1f} MB")
if args.command == "status":
monitor.print_status(detailed=args.detailed)
if not args.force:
response = input("Are you sure you want to clean checkpoints? (y/N): ")
if response.lower() != "y":
print("Cancelled")
return 0
elif args.command == "clean":
checkpoint_info = monitor.count_checkpoint_files()
print(f"Current checkpoint storage: {checkpoint_info['total_size_mb']:.1f} MB")
if args.step:
monitor.clean_step_checkpoints(args.step, args.keep_last)
else:
for step in monitor.steps:
monitor.clean_step_checkpoints(step, args.keep_last)
if not args.force:
response = input("Are you sure you want to clean checkpoints? (y/N): ")
if response.lower() != "y":
print("Cancelled")
return 0
print("Checkpoint cleaning completed")
if args.step:
monitor.clean_step_checkpoints(args.step, args.keep_last)
else:
for step in monitor.steps:
monitor.clean_step_checkpoints(step, args.keep_last)
elif args.command == "reset":
if not args.force:
response = input(
f"Are you sure you want to reset {args.step}? This will delete all checkpoints. (y/N): "
print("Checkpoint cleaning completed")
elif args.command == "reset":
if not args.force:
response = input(
f"Are you sure you want to reset {args.step}? This will delete all checkpoints. (y/N): "
)
if response.lower() != "y":
print("Cancelled")
return 0
monitor.reset_step(args.step)
print(f"Reset completed for {args.step}")
elif args.command == "analyze":
# Use configured data directory
data_dir = config.paths.data_dir
filepath = data_dir / args.file
if not filepath.exists():
print(f"File not found: {filepath}")
return 1
analyzer = DatasetAnalyzer(str(filepath))
if not analyzer.load_data():
return 1
completion_stats = analyzer.analyze_completion()
print(f"\n=== Dataset Analysis: {args.file} ===")
print(f"Total rows: {completion_stats['total_rows']:,}")
print(f"Annotated: {completion_stats['annotated_rows']:,} ({completion_stats['annotation_percentage']:.1f}%)")
print(f"Unannotated: {completion_stats['unannotated_rows']:,}")
print(
f"Complete names: {completion_stats['complete_names']:,} ({completion_stats['completeness_percentage']:.1f}%)"
)
if response.lower() != "y":
print("Cancelled")
return 0
monitor.reset_step(args.step)
print(f"Reset completed for {args.step}")
elif args.command == "info":
checkpoint_info = monitor.count_checkpoint_files()
elif args.command == "analyze":
# Use configured data directory instead of hardcoded DATA_DIR
data_dir = ConfigManager().default_paths.data_dir
filepath = data_dir / args.file
if not filepath.exists():
print(f"File not found: {filepath}")
return 1
analyzer = DatasetAnalyzer(str(filepath))
if not analyzer.load_data():
return 1
completion_stats = analyzer.analyze_completion()
print(f"\n=== Dataset Analysis: {args.file} ===")
print(f"Total rows: {completion_stats['total_rows']:,}")
print(f"Annotated: {completion_stats['annotated_rows']:,} ({completion_stats['annotation_percentage']:.1f}%)")
print(f"Unannotated: {completion_stats['unannotated_rows']:,}")
print(
f"Complete names: {completion_stats['complete_names']:,} ({completion_stats['completeness_percentage']:.1f}%)"
)
elif args.command == "info":
checkpoint_info = monitor.count_checkpoint_files()
print(f"\n=== Checkpoint Information ===")
print(f"Total storage: {checkpoint_info['total_size_mb']:.1f} MB")
print()
for step in monitor.steps:
step_info = checkpoint_info[step]
print(f"{step.replace('_', ' ').title()}:")
print(f" Files: {step_info['files']}")
print(f" Size: {step_info['size_mb']:.1f} MB")
print(f"\n=== Checkpoint Information ===")
print(f"Total storage: {checkpoint_info['total_size_mb']:.1f} MB")
print()
return 0
for step in monitor.steps:
step_info = checkpoint_info[step]
print(f"{step.replace('_', ' ').title()}:")
print(f" Files: {step_info['files']}")
print(f" Size: {step_info['size_mb']:.1f} MB")
print()
return 0
except Exception as e:
print(f"Monitor command failed: {e}")
return 1
if __name__ == "__main__":
+27 -10
View File
@@ -1,26 +1,43 @@
#!.venv/bin/python3
import argparse
import sys
from core.config import setup_logging, get_config
from core.config import setup_config_and_logging
from research.model_trainer import ModelTrainer
def main():
    """Train a single model from the command line.

    Parses ``--config``, ``--env``, ``--type`` and ``--name``, loads the
    configuration and sets up logging through ``setup_config_and_logging``,
    then trains one model on the "full_name" feature.

    Returns:
        0 when training completes, 1 when configuration loading or training
        raises an exception (the error is printed).
    """
    parser = argparse.ArgumentParser(description="Train DRC Names Models")
    parser.add_argument("--config", type=str, help="Path to configuration file")
    parser.add_argument(
        "--env", type=str, default="development",
        help="Environment name (default: development)"
    )
    parser.add_argument("--type", type=str, help="Specific model type to train")
    parser.add_argument("--name", type=str, help="Model name")

    args = parser.parse_args()

    # The superseded ``setup_logging(get_config())`` call and the unguarded
    # training call are removed: those names are no longer imported, and the
    # model would otherwise be trained twice, once outside the error handler.
    try:
        # Load configuration and setup logging; called for its side effects,
        # since ModelTrainer() is constructed without the returned config.
        setup_config_and_logging(config_path=args.config, env=args.env)

        trainer = ModelTrainer()

        # Train specific model
        trainer.train_single_model(
            model_name=args.name,
            model_type=args.type,
            features=["full_name"],
        )
        return 0

    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit code
        print(f"Training failed: {e}")
        return 1
if __name__ == "__main__":
main()
exit_code = main()
sys.exit(exit_code)