refactor: reorganize project structure and enhance model verbosity

2025-08-06 21:57:10 +02:00
parent ad8db43748
commit d7aa24a935
23 changed files with 1209 additions and 1416 deletions
+21 -999
File diff suppressed because it is too large.
+13 -199
@@ -1,159 +1,14 @@
#!.venv/bin/python3
import argparse
import logging
import sys
from pathlib import Path
import json
import pandas as pd
-import logging
from core.config import get_config, setup_logging
from research.experiment import ExperimentConfig
from research.experiment.experiment_tracker import ExperimentTracker
from research.experiment.feature_extractor import FeatureType
from research.experiment.experiment_builder import ExperimentBuilder
from research.experiment.experiment_runner import ExperimentRunner
from research.model_registry import list_available_models
def create_experiment_from_args(args) -> ExperimentConfig:
"""Create experiment configuration from command line arguments"""
features = []
if args.features:
for feature_name in args.features:
try:
features.append(FeatureType(feature_name))
except ValueError:
logging.warning(f"Unknown feature type '{feature_name}', skipping")
if not features:
features = [FeatureType.FULL_NAME] # Default
# Parse model parameters
model_params = {}
if args.model_params:
try:
model_params = json.loads(args.model_params)
except json.JSONDecodeError:
logging.warning("Invalid JSON for model parameters, using defaults")
# Parse feature parameters
feature_params = {}
if args.feature_params:
try:
feature_params = json.loads(args.feature_params)
except json.JSONDecodeError:
logging.warning("Invalid JSON for feature parameters, using defaults")
# Parse data filters
train_filter = None
if args.train_filter:
try:
train_filter = json.loads(args.train_filter)
except json.JSONDecodeError:
logging.warning("Invalid JSON for train filter, ignoring")
return ExperimentConfig(
name=args.name,
description=args.description or "",
tags=args.tags or [],
model_type=args.model_type,
model_params=model_params,
features=features,
feature_params=feature_params,
train_data_filter=train_filter,
target_column=args.target,
test_size=args.test_size,
random_seed=args.seed,
cross_validation_folds=args.cv_folds,
metrics=args.metrics or ["accuracy", "precision", "recall", "f1"],
)
def run_single_experiment(args):
"""Run a single experiment"""
config = create_experiment_from_args(args)
runner = ExperimentRunner()
experiment_id = runner.run_experiment(config)
logging.info(f"Experiment completed: {experiment_id}")
# Show results
experiment = runner.tracker.get_experiment(experiment_id)
if experiment:
logging.info("Results:")
for metric, value in experiment.test_metrics.items():
logging.info(f" Test {metric}: {value:.4f}")
if experiment.cv_metrics:
logging.info("Cross-validation:")
for metric, value in experiment.cv_metrics.items():
if not metric.endswith("_std"):
std_key = f"{metric}_std"
std_val = experiment.cv_metrics.get(std_key, 0)
logging.info(f" CV {metric}: {value:.4f} ± {std_val:.4f}")
def run_baseline_experiments(args):
"""Run baseline experiments"""
logger = logging.getLogger(__name__)
builder = ExperimentBuilder()
experiments = builder.create_baseline_experiments()
runner = ExperimentRunner()
experiment_ids = runner.run_experiment_batch(experiments)
logging.info(f"Completed {len(experiment_ids)} baseline experiments")
# Show comparison
if experiment_ids:
comparison = runner.compare_experiments(experiment_ids)
logging.info("Baseline Results Comparison:")
logging.info(
comparison[["name", "model_type", "features", "test_accuracy"]].to_string(index=False)
)
def run_ablation_study(args):
"""Run feature ablation study"""
builder = ExperimentBuilder()
experiments = builder.create_feature_ablation_study()
runner = ExperimentRunner()
experiment_ids = runner.run_experiment_batch(experiments)
logging.info(f"Completed {len(experiment_ids)} ablation experiments")
# Show results
if experiment_ids:
comparison = runner.compare_experiments(experiment_ids)
logging.info("Ablation Study Results:")
logging.info(comparison[["name", "test_accuracy", "test_f1"]].to_string(index=False))
def run_component_study(args):
"""Run name component study"""
builder = ExperimentBuilder()
experiments = builder.create_name_component_study()
runner = ExperimentRunner()
experiment_ids = runner.run_experiment_batch(experiments)
logging.info(f"Completed {len(experiment_ids)} component study experiments")
# Show results
if experiment_ids:
comparison = runner.compare_experiments(experiment_ids)
logging.info("Name Component Study Results:")
logging.info(
comparison[["name", "test_accuracy", "test_precision", "test_recall"]].to_string(
index=False
)
)
from research.experiment.experiment_tracker import ExperimentTracker
def list_experiments(args):
@@ -249,7 +104,7 @@ def show_experiment_details(args):
def compare_experiments_cmd(args):
"""Compare multiple experiments"""
-runner = ExperimentRunner()
+runner = ExperimentRunner(get_config())
comparison = runner.compare_experiments(args.experiment_ids)
if comparison.empty:
@@ -285,43 +140,9 @@ def main():
parser.add_argument("--verbose", "-v", action="store_true", help="Verbose logging")
subparsers = parser.add_subparsers(dest="command", help="Available commands")
# Single experiment command
exp_parser = subparsers.add_parser("run", help="Run a single experiment")
exp_parser.add_argument("--name", required=True, help="Experiment name")
exp_parser.add_argument("--description", help="Experiment description")
exp_parser.add_argument(
"--model-type",
default="logistic_regression",
choices=list_available_models(),
help="Model type",
)
exp_parser.add_argument(
"--features", nargs="+", choices=[f.value for f in FeatureType], help="Features to use"
)
exp_parser.add_argument("--model-params", help="Model parameters as JSON")
exp_parser.add_argument("--feature-params", help="Feature parameters as JSON")
exp_parser.add_argument("--train-filter", help="Training data filter as JSON")
exp_parser.add_argument("--target", default="sex", help="Target column")
exp_parser.add_argument("--test-size", type=float, default=0.2, help="Test set size")
exp_parser.add_argument("--seed", type=int, default=42, help="Random seed")
exp_parser.add_argument("--cv-folds", type=int, default=5, help="CV folds")
exp_parser.add_argument(
"--metrics",
nargs="+",
choices=["accuracy", "precision", "recall", "f1"],
help="Metrics to calculate",
)
exp_parser.add_argument("--tags", nargs="+", help="Experiment tags")
# Batch experiment commands
subparsers.add_parser("baseline", help="Run baseline experiments")
subparsers.add_parser("ablation", help="Run feature ablation study")
subparsers.add_parser("components", help="Run name component study")
# List experiments
list_parser = subparsers.add_parser("list", help="List experiments")
list_parser.add_argument("--status", choices=["pending", "running", "completed", "failed"])
list_parser.add_argument("--model-type", choices=list_available_models())
list_parser.add_argument("--tags", nargs="+", help="Filter by tags")
# Show experiment details
@@ -350,22 +171,15 @@ def main():
# Execute command
try:
-if args.command == "run":
-run_single_experiment(args)
-elif args.command == "baseline":
-run_baseline_experiments(args)
-elif args.command == "ablation":
-run_ablation_study(args)
-elif args.command == "components":
-run_component_study(args)
-elif args.command == "list":
-list_experiments(args)
-elif args.command == "show":
-show_experiment_details(args)
-elif args.command == "compare":
-compare_experiments_cmd(args)
-elif args.command == "export":
-export_results(args)
+command_map = {
+"list": list_experiments,
+"show": show_experiment_details,
+"compare": compare_experiments_cmd,
+"export": export_results,
+}
+handler = command_map.get(args.command)
+if handler:
+handler(args)
return 0
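The replacement dispatch table returns silently when the subcommand is unknown or missing. A minimal sketch of the same pattern with an explicit fallback; the fallback is illustrative, not part of this commit, and assumes the argparse parser is still in scope:

    command_map = {
        "list": list_experiments,
        "show": show_experiment_details,
        "compare": compare_experiments_cmd,
        "export": export_results,
    }
    handler = command_map.get(args.command)
    if handler is None:
        parser.print_help()  # fall back to help for unknown commands
        return 1
    handler(args)
    return 0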
+12
@@ -0,0 +1,12 @@
import streamlit as st
class Configuration:
"""Handles configuration display and management"""
def __init__(self, config):
self.config = config
def index(self):
st.header("Current Configuration")
st.json(self.config.model_dump())
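model_dump() is the Pydantic v2 serializer, which suggests config is a Pydantic model. A self-contained sketch of what st.json(self.config.model_dump()) receives; the field names here are hypothetical:

    from pydantic import BaseModel

    class AppConfig(BaseModel):  # hypothetical stand-in for the real config object
        random_seed: int = 42
        test_size: float = 0.2

    print(AppConfig().model_dump())  # {'random_seed': 42, 'test_size': 0.2}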
@@ -2,7 +2,7 @@ import pandas as pd
import plotly.express as px
import streamlit as st
-from web.log_reader import LogReader
+from interface.log_reader import LogReader
def load_dataset(file_path: str) -> pd.DataFrame:
+398
@@ -0,0 +1,398 @@
from typing import List, Dict, Any
import streamlit as st
from core.utils.region_mapper import RegionMapper
from research.experiment import ExperimentConfig, ExperimentStatus
from research.experiment.experiment_builder import ExperimentBuilder
from research.experiment.experiment_runner import ExperimentRunner
from research.experiment.experiment_tracker import ExperimentTracker
from research.experiment.feature_extractor import FeatureType
from research.model_registry import list_available_models
class Experiments:
"""Handles experiment management interface"""
def __init__(self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner):
self.config = config
self.experiment_tracker = experiment_tracker
self.experiment_runner = experiment_runner
def index(self):
"""Main experiments page"""
st.header("Experiment Management")
tab1, tab2, tab3 = st.tabs(["New Experiment", "Experiment List", "Batch Experiments"])
with tab1:
self.show_experiment_creation()
with tab2:
self.show_experiment_list()
with tab3:
self.show_batch_experiments()
def show_experiment_creation(self):
"""Show interface for creating new experiments"""
st.subheader("Create New Experiment")
with st.form("new_experiment"):
col1, col2 = st.columns(2)
with col1:
exp_name = st.text_input("Experiment Name", placeholder="e.g., native_name_gender_prediction")
description = st.text_area("Description", placeholder="Brief description of the experiment")
model_type = st.selectbox("Model Type", list_available_models())
# Feature selection
feature_options = [f.value for f in FeatureType]
selected_features = st.multiselect("Features to Use", feature_options, default=["full_name"])
with col2:
# Model parameters
st.write("**Model Parameters**")
model_params = {}
if model_type == "logistic_regression":
ngram_min = st.number_input("N-gram Min", 1, 5, 2)
ngram_max = st.number_input("N-gram Max", 2, 8, 5)
max_features = st.number_input("Max Features", 1000, 50000, 10000)
model_params = {
"ngram_range": [ngram_min, ngram_max],
"max_features": max_features,
}
elif model_type == "random_forest":
n_estimators = st.number_input("Number of Trees", 10, 500, 100)
max_depth = st.number_input("Max Depth", 1, 20, 10)
model_params = {
"n_estimators": n_estimators,
"max_depth": max_depth if max_depth > 0 else None,
}
# Training parameters
st.write("**Training Parameters**")
test_size = st.slider("Test Set Size", 0.1, 0.5, 0.2)
cv_folds = st.number_input("Cross-Validation Folds", 3, 10, 5)
tags = st.text_input("Tags (comma-separated)", placeholder="e.g., baseline, feature_study")
# Advanced options
with st.expander("Advanced Options"):
# Data filters
st.write("**Data Filters**")
filter_province = st.selectbox(
"Filter by Province (optional)",
["None"] + RegionMapper().get_provinces(),
)
min_words = st.number_input("Minimum Word Count", 0, 10, 0)
max_words = st.number_input("Maximum Word Count (0 = no limit)", 0, 20, 0)
submitted = st.form_submit_button("Create and Run Experiment", type="primary")
if submitted:
self._handle_experiment_submission(
exp_name, description, model_type, selected_features, model_params,
test_size, cv_folds, tags, filter_province, min_words, max_words
)
def _handle_experiment_submission(self, exp_name: str, description: str, model_type: str,
selected_features: List[str], model_params: Dict[str, Any],
test_size: float, cv_folds: int, tags: str,
filter_province: str, min_words: int, max_words: int):
"""Handle experiment form submission"""
if not exp_name:
st.error("Please provide an experiment name")
return
if not selected_features:
st.error("Please select at least one feature")
return
try:
# Prepare data filters
train_filter = {}
if filter_province != "None":
train_filter["province"] = filter_province
if min_words > 0:
train_filter["words"] = {"min": min_words}
if max_words > 0:
if "words" in train_filter:
train_filter["words"]["max"] = max_words
else:
train_filter["words"] = {"max": max_words}
# Create experiment config
features = [FeatureType(f) for f in selected_features]
tag_list = [tag.strip() for tag in tags.split(",") if tag.strip()]
config = ExperimentConfig(
name=exp_name,
description=description,
tags=tag_list,
model_type=model_type,
model_params=model_params,
features=features,
train_data_filter=train_filter if train_filter else None,
test_size=test_size,
cross_validation_folds=cv_folds,
)
# Run experiment
with st.spinner("Running experiment..."):
experiment_id = self.experiment_runner.run_experiment(config)
st.success(f"Experiment completed successfully!")
st.info(f"Experiment ID: `{experiment_id}`")
# Show results
experiment = self.experiment_tracker.get_experiment(experiment_id)
if experiment and experiment.test_metrics:
st.write("**Results:**")
for metric, value in experiment.test_metrics.items():
st.metric(metric.title(), f"{value:.4f}")
except Exception as e:
st.error(f"Error running experiment: {e}")
def show_experiment_list(self):
"""Show list of all experiments with filtering"""
st.subheader("All Experiments")
# Filters
col1, col2, col3 = st.columns(3)
with col1:
status_filter = st.selectbox(
"Filter by Status", ["All", "completed", "running", "failed", "pending"]
)
with col2:
model_filter = st.selectbox("Filter by Model", ["All"] + list_available_models())
with col3:
tag_filter = st.text_input("Filter by Tags (comma-separated)")
# Get and filter experiments
experiments = self._get_filtered_experiments(status_filter, model_filter, tag_filter)
if not experiments:
st.info("No experiments found matching the filters.")
return
# Display experiments
for i, exp in enumerate(experiments):
with st.expander(
f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}"
):
self._display_experiment_details(exp, i)
def _get_filtered_experiments(self, status_filter: str, model_filter: str, tag_filter: str):
"""Get experiments with applied filters"""
experiments = self.experiment_tracker.list_experiments()
# Apply filters
if status_filter != "All":
experiments = [e for e in experiments if e.status == ExperimentStatus(status_filter)]
if model_filter != "All":
experiments = [e for e in experiments if e.config.model_type == model_filter]
if tag_filter:
tags = [tag.strip() for tag in tag_filter.split(",")]
experiments = [e for e in experiments if any(tag in e.config.tags for tag in tags)]
return experiments
def _display_experiment_details(self, exp, index: int):
"""Display details for a single experiment"""
col1, col2, col3 = st.columns(3)
with col1:
st.write(f"**Model:** {exp.config.model_type}")
st.write(f"**Features:** {', '.join([f.value for f in exp.config.features])}")
st.write(f"**Tags:** {', '.join(exp.config.tags)}")
with col2:
if exp.test_metrics:
for metric, value in exp.test_metrics.items():
st.metric(metric.title(), f"{value:.4f}")
with col3:
st.write(f"**Train Size:** {exp.train_size:,}")
st.write(f"**Test Size:** {exp.test_size:,}")
if st.button(f"View Details", key=f"details_{index}"):
st.session_state.selected_experiment = exp.experiment_id
st.rerun()
if exp.config.description:
st.write(f"**Description:** {exp.config.description}")
def show_batch_experiments(self):
"""Show interface for running batch experiments"""
st.subheader("Batch Experiments")
st.write("Run multiple experiments with different parameter combinations.")
# Parameter sweep configuration
with st.form("batch_experiments"):
st.write("**Parameter Sweep Configuration**")
col1, col2 = st.columns(2)
with col1:
base_name = st.text_input("Base Experiment Name", "parameter_sweep")
model_types = st.multiselect(
"Model Types", list_available_models(), default=["logistic_regression"]
)
# N-gram ranges for logistic regression
st.write("**Logistic Regression Parameters**")
ngram_ranges = st.text_area(
"N-gram Ranges (one per line, format: min,max)", "2,4\n2,5\n3,6"
)
with col2:
feature_combinations = st.multiselect(
"Feature Combinations",
[f.value for f in FeatureType],
default=["full_name", "native_name", "surname"],
)
test_sizes = st.text_input("Test Sizes (comma-separated)", "0.15,0.2,0.25")
tags = st.text_input("Common Tags", "parameter_sweep,batch")
if st.form_submit_button("🚀 Run Batch Experiments"):
self.run_batch_experiments(
base_name, model_types, ngram_ranges, feature_combinations, test_sizes, tags
)
def run_batch_experiments(self, base_name: str, model_types: List[str], ngram_ranges: str,
feature_combinations: List[str], test_sizes: str, tags: str):
"""Run batch experiments with parameter combinations"""
with st.spinner("Running batch experiments..."):
try:
experiments = []
# Parse parameters
ngram_list = []
for line in ngram_ranges.strip().split("\n"):
if "," in line:
min_val, max_val = map(int, line.split(","))
ngram_list.append([min_val, max_val])
test_size_list = [float(x.strip()) for x in test_sizes.split(",")]
tag_list = [tag.strip() for tag in tags.split(",") if tag.strip()]
# Generate experiment combinations
exp_count = 0
for model_type in model_types:
for feature_combo in feature_combinations:
for test_size in test_size_list:
if model_type == "logistic_regression":
for ngram_range in ngram_list:
exp_name = f"{base_name}_{model_type}_{feature_combo}_{ngram_range[0]}_{ngram_range[1]}_{test_size}"
config = ExperimentConfig(
name=exp_name,
description=f"Batch experiment: {model_type} with {feature_combo}",
model_type=model_type,
features=[FeatureType(feature_combo)],
model_params={"ngram_range": ngram_range},
test_size=test_size,
tags=tag_list,
)
experiments.append(config)
exp_count += 1
else:
exp_name = f"{base_name}_{model_type}_{feature_combo}_{test_size}"
config = ExperimentConfig(
name=exp_name,
description=f"Batch experiment: {model_type} with {feature_combo}",
model_type=model_type,
features=[FeatureType(feature_combo)],
test_size=test_size,
tags=tag_list,
)
experiments.append(config)
exp_count += 1
# Run experiments
experiment_ids = self.experiment_runner.run_experiment_batch(experiments)
st.success(f"Completed {len(experiment_ids)} batch experiments")
# Show summary
if experiment_ids:
comparison = self.experiment_runner.compare_experiments(experiment_ids)
st.write("**Batch Results Summary:**")
st.dataframe(
comparison[["name", "model_type", "test_accuracy"]],
use_container_width=True,
)
except Exception as e:
st.error(f"Error running batch experiments: {e}")
def run_baseline_experiments(self):
"""Run baseline experiments"""
with st.spinner("Running baseline experiments..."):
try:
builder = ExperimentBuilder()
experiments = builder.create_baseline_experiments()
experiment_ids = self.experiment_runner.run_experiment_batch(experiments)
st.success(f"Completed {len(experiment_ids)} baseline experiments")
# Show quick comparison
if experiment_ids:
comparison = self.experiment_runner.compare_experiments(experiment_ids)
st.write("**Results Summary:**")
st.dataframe(
comparison[["name", "model_type", "test_accuracy"]],
use_container_width=True,
)
except Exception as e:
st.error(f"Error running baseline experiments: {e}")
def run_ablation_study(self):
"""Run feature ablation study"""
with st.spinner("Running ablation study..."):
try:
builder = ExperimentBuilder()
experiments = builder.create_feature_ablation_study()
experiment_ids = self.experiment_runner.run_experiment_batch(experiments)
st.success(f"Completed {len(experiment_ids)} ablation experiments")
except Exception as e:
st.error(f"Error running ablation study: {e}")
def run_component_study(self):
"""Run name component study"""
with st.spinner("Running component study..."):
try:
builder = ExperimentBuilder()
experiments = builder.create_name_component_study()
experiment_ids = self.experiment_runner.run_experiment_batch(experiments)
st.success(f"Completed {len(experiment_ids)} component experiments")
except Exception as e:
st.error(f"Error running component study: {e}")
def run_province_study(self):
"""Run province-specific study"""
with st.spinner("Running province study..."):
try:
builder = ExperimentBuilder()
experiments = builder.create_province_specific_study()
experiment_ids = self.experiment_runner.run_experiment_batch(experiments)
st.success(f"Completed {len(experiment_ids)} province experiments")
except Exception as e:
st.error(f"Error running province study: {e}")
@@ -37,7 +37,7 @@ class LogReader:
# Parse log entries from the end
entries = []
-for line in reversed(lines[-count*2:]):  # Read more lines in case some don't match
+for line in reversed(lines[-count * 2:]):  # Read more lines in case some don't match
entry = self._parse_log_line(line.strip())
if entry:
entries.append(entry)
+373
@@ -0,0 +1,373 @@
"""Predictions interface for the Streamlit app"""
from datetime import datetime
from typing import Optional
import numpy as np
import pandas as pd
import plotly.express as px
import streamlit as st
from core.utils import get_data_file_path
from research.experiment.experiment_runner import ExperimentRunner
from research.experiment.experiment_tracker import ExperimentTracker
class Predictions:
"""Handles prediction interface"""
def __init__(self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner):
self.config = config
self.experiment_tracker = experiment_tracker
self.experiment_runner = experiment_runner
def index(self):
"""Main predictions page"""
st.header("Make Predictions")
# Load available models
experiments = self.experiment_tracker.list_experiments()
completed_experiments = [
e for e in experiments if e.status.value == "completed" and e.model_path
]
if not completed_experiments:
st.warning("No trained models available. Please run some experiments first.")
return
# Model selection
model_options = {
f"{exp.config.name} (Acc: {exp.test_metrics.get('accuracy', 0):.3f})": exp
for exp in completed_experiments
if exp.test_metrics
}
selected_model_name = st.selectbox("Select Model", list(model_options.keys()))
if not selected_model_name:
return
selected_experiment = model_options[selected_model_name]
# Prediction modes
prediction_mode = st.radio(
"Prediction Mode", ["Single Name", "Batch Upload", "Dataset Prediction"]
)
if prediction_mode == "Single Name":
self.show_single_prediction(selected_experiment)
elif prediction_mode == "Batch Upload":
self.show_batch_prediction(selected_experiment)
elif prediction_mode == "Dataset Prediction":
self.show_dataset_prediction(selected_experiment)
def show_single_prediction(self, experiment):
"""Show single name prediction interface"""
st.subheader("Single Name Prediction")
name_input = st.text_input("Enter a name:", placeholder="e.g., Jean Baptiste Mukendi")
if name_input and st.button("Predict Gender"):
try:
# Load the model
model = self.experiment_runner.load_experiment_model(experiment.experiment_id)
if model is None:
st.error("Failed to load model")
return
# Create a DataFrame with the input
input_df = self._prepare_single_input(name_input)
# Make prediction
prediction = model.predict(input_df)[0]
# Get prediction probability if available
confidence = self._get_prediction_confidence(model, input_df)
# Display results
self._display_single_prediction_results(prediction, confidence, experiment, name_input)
except Exception as e:
st.error(f"Error making prediction: {e}")
def _prepare_single_input(self, name_input: str) -> pd.DataFrame:
"""Prepare single name input for prediction"""
return pd.DataFrame(
{
"name": [name_input],
"words": [len(name_input.split())],
"length": [len(name_input.replace(" ", ""))],
"province": ["unknown"], # Default values
"identified_name": [None],
"identified_surname": [None],
"probable_native": [None],
"probable_surname": [None],
}
)
def _get_prediction_confidence(self, model, input_df: pd.DataFrame) -> Optional[float]:
"""Get prediction confidence if available"""
try:
probabilities = model.predict_proba(input_df)[0]
return max(probabilities)
except Exception:
return None
def _display_single_prediction_results(self, prediction: str, confidence: Optional[float],
experiment, name_input: str):
"""Display single prediction results"""
col1, col2 = st.columns(2)
with col1:
gender_label = "Female" if prediction == "f" else "Male"
st.success(f"**Predicted Gender:** {gender_label}")
with col2:
if confidence:
st.metric("Confidence", f"{confidence:.2%}")
# Additional info
st.info(f"Model used: {experiment.config.name}")
st.info(
f"Features used: {', '.join([f.value for f in experiment.config.features])}"
)
def show_batch_prediction(self, experiment):
"""Show batch prediction interface"""
st.subheader("Batch Prediction")
uploaded_file = st.file_uploader("Upload CSV file with names", type="csv")
if uploaded_file is not None:
try:
df = pd.read_csv(uploaded_file)
st.write("**Uploaded Data Preview:**")
st.dataframe(df.head(), use_container_width=True)
# Column selection
df = self._prepare_batch_data(df)
if st.button("Run Batch Prediction"):
self._run_batch_prediction(df, experiment)
except Exception as e:
st.error(f"Error processing file: {e}")
def _prepare_batch_data(self, df: pd.DataFrame) -> pd.DataFrame:
"""Prepare batch data for prediction"""
# Column selection
if "name" not in df.columns:
name_column = st.selectbox("Select the name column:", df.columns)
df = df.rename(columns={name_column: "name"})
# Add missing columns with defaults
required_columns = [
"words",
"length",
"province",
"identified_name",
"identified_surname",
"probable_native",
"probable_surname",
]
for col in required_columns:
if col not in df.columns:
if col == "words":
df[col] = df["name"].str.split().str.len()
elif col == "length":
df[col] = df["name"].str.replace(" ", "").str.len()
else:
df[col] = None
return df
def _run_batch_prediction(self, df: pd.DataFrame, experiment):
"""Run batch prediction and display results"""
with st.spinner("Making predictions..."):
# Load model
model = self.experiment_runner.load_experiment_model(experiment.experiment_id)
if model is None:
st.error("Failed to load model")
return
# Make predictions
predictions = model.predict(df)
df["predicted_gender"] = predictions
df["gender_label"] = df["predicted_gender"].map({"f": "Female", "m": "Male"})
# Try to get probabilities
try:
probabilities = model.predict_proba(df)
df["confidence"] = np.max(probabilities, axis=1)
except Exception:
df["confidence"] = None
st.success("Predictions completed!")
# Show results
self._display_batch_results(df)
def _display_batch_results(self, df: pd.DataFrame):
"""Display batch prediction results"""
result_columns = ["name", "gender_label", "predicted_gender"]
if "confidence" in df.columns:
result_columns.append("confidence")
st.dataframe(df[result_columns], use_container_width=True)
# Download results
csv = df.to_csv(index=False)
st.download_button(
label="Download Predictions",
data=csv,
file_name=f"predictions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
mime="text/csv",
)
# Summary statistics
self._display_batch_summary(df)
def _display_batch_summary(self, df: pd.DataFrame):
"""Display batch prediction summary"""
st.subheader("Prediction Summary")
gender_counts = df["gender_label"].value_counts()
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Total Predictions", len(df))
with col2:
st.metric("Female", gender_counts.get("Female", 0))
with col3:
st.metric("Male", gender_counts.get("Male", 0))
# Gender distribution chart
fig = px.pie(
values=gender_counts.values,
names=gender_counts.index,
title="Predicted Gender Distribution",
)
st.plotly_chart(fig, use_container_width=True)
def show_dataset_prediction(self, experiment):
"""Show dataset prediction interface"""
st.subheader("Dataset Prediction")
st.write("Apply the model to existing datasets")
# Dataset selection
dataset_options = {
"Featured Dataset": self.config.data.output_files["featured"],
"Evaluation Dataset": self.config.data.output_files["evaluation"],
}
selected_dataset = st.selectbox("Select Dataset", list(dataset_options.keys()))
file_path = get_data_file_path(dataset_options[selected_dataset], self.config)
if not file_path.exists():
st.warning(f"Dataset not found: {file_path}")
return
# Load and show dataset info
df = self._load_dataset(str(file_path))
if df.empty:
return
st.write(f"Dataset contains {len(df):,} records")
# Prediction options
col1, col2 = st.columns(2)
with col1:
sample_size = st.number_input(
"Sample size (0 = all data)", 0, len(df), min(1000, len(df))
)
with col2:
compare_with_actual = False
if "sex" in df.columns:
compare_with_actual = st.checkbox("Compare with actual labels", value=True)
if st.button("Run Dataset Prediction"):
self._run_dataset_prediction(df, experiment, sample_size, compare_with_actual)
def _load_dataset(self, file_path: str) -> pd.DataFrame:
"""Load dataset with error handling"""
try:
return pd.read_csv(file_path)
except Exception as e:
st.error(f"Error loading dataset: {e}")
return pd.DataFrame()
def _run_dataset_prediction(self, df: pd.DataFrame, experiment, sample_size: int,
compare_with_actual: bool):
"""Run dataset prediction and display results"""
with st.spinner("Running predictions..."):
# Sample data if requested
if sample_size > 0:
df_sample = df.sample(n=sample_size, random_state=42)
else:
df_sample = df
# Load model and make predictions
model = self.experiment_runner.load_experiment_model(experiment.experiment_id)
if model is None:
st.error("Failed to load model")
return
predictions = model.predict(df_sample)
df_sample["predicted_gender"] = predictions
# Show results
if compare_with_actual and "sex" in df_sample.columns:
self._display_dataset_comparison(df_sample)
else:
self._display_dataset_predictions(df_sample)
def _display_dataset_comparison(self, df_sample: pd.DataFrame):
"""Display dataset predictions with actual comparison"""
# Calculate accuracy
accuracy = (df_sample["sex"] == df_sample["predicted_gender"]).mean()
st.metric("Accuracy on Selected Data", f"{accuracy:.4f}")
# Confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(df_sample["sex"], df_sample["predicted_gender"])
fig = px.imshow(cm, text_auto=True, aspect="auto", title="Confusion Matrix")
st.plotly_chart(fig, use_container_width=True)
# Sample of correct and incorrect predictions
correct_mask = df_sample["sex"] == df_sample["predicted_gender"]
col1, col2 = st.columns(2)
with col1:
st.write("**Sample Correct Predictions**")
correct_sample = df_sample[correct_mask][["name", "sex", "predicted_gender"]].head(10)
st.dataframe(correct_sample, use_container_width=True)
with col2:
st.write("**Sample Incorrect Predictions**")
incorrect_sample = df_sample[~correct_mask][["name", "sex", "predicted_gender"]].head(10)
st.dataframe(incorrect_sample, use_container_width=True)
def _display_dataset_predictions(self, df_sample: pd.DataFrame):
"""Display dataset predictions without comparison"""
# Just show predictions
st.write("**Sample Predictions**")
sample_results = df_sample[["name", "predicted_gender"]].head(20)
st.dataframe(sample_results, use_container_width=True)
# Gender distribution
gender_counts = df_sample["predicted_gender"].value_counts()
fig = px.pie(
values=gender_counts.values,
names=gender_counts.index,
title="Predicted Gender Distribution",
)
st.plotly_chart(fig, use_container_width=True)
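_get_prediction_confidence leans on scikit-learn's predict_proba, which returns one row of class probabilities per sample; the maximum becomes the confidence shown in the UI. A self-contained sketch with a toy classifier standing in for the loaded experiment model:

    import numpy as np
    from sklearn.linear_model import LogisticRegression

    X = np.array([[0.0], [1.0], [2.0], [3.0]])
    y = np.array(["f", "f", "m", "m"])
    toy_model = LogisticRegression().fit(X, y)

    probabilities = toy_model.predict_proba(X[:1])[0]  # class probabilities for one sample
    confidence = max(probabilities)                    # the value rendered as a metric
    print(f"{confidence:.2%}")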
+332
@@ -0,0 +1,332 @@
from typing import List
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from research.experiment.experiment_runner import ExperimentRunner
from research.experiment.experiment_tracker import ExperimentTracker
class ResultsAnalysis:
"""Handles experiment results and analysis interface"""
def __init__(self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner):
self.config = config
self.experiment_tracker = experiment_tracker
self.experiment_runner = experiment_runner
def index(self):
"""Main results analysis page"""
st.header("Results & Analysis")
tab1, tab2, tab3 = st.tabs(["Experiment Comparison", "Performance Analysis", "Model Analysis"])
with tab1:
self.show_experiment_comparison()
with tab2:
self.show_performance_analysis()
with tab3:
self.show_model_analysis()
def show_experiment_comparison(self):
"""Show experiment comparison interface"""
st.subheader("Compare Experiments")
experiments = self.experiment_tracker.list_experiments()
completed_experiments = [e for e in experiments if e.status.value == "completed"]
if not completed_experiments:
st.warning("No completed experiments found.")
return
# Experiment selection
exp_options = {
f"{exp.config.name} ({exp.experiment_id[:8]})": exp.experiment_id
for exp in completed_experiments
}
selected_exp_names = st.multiselect(
"Select Experiments to Compare",
list(exp_options.keys()),
default=list(exp_options.keys())[: min(5, len(exp_options))],
)
if not selected_exp_names:
st.info("Please select experiments to compare.")
return
selected_exp_ids = [exp_options[name] for name in selected_exp_names]
# Generate comparison
comparison_df = self.experiment_runner.compare_experiments(selected_exp_ids)
if comparison_df.empty:
st.error("No data available for comparison.")
return
self._display_comparison_table(comparison_df)
self._display_comparison_charts(comparison_df)
def _display_comparison_table(self, comparison_df: pd.DataFrame):
"""Display comparison table"""
st.write("**Experiment Comparison Table**")
# Select columns to display
metric_columns = [
col for col in comparison_df.columns if col.startswith("test_") or col.startswith("cv_")
]
display_columns = ["name", "model_type", "features"] + metric_columns
available_columns = [col for col in display_columns if col in comparison_df.columns]
st.dataframe(comparison_df[available_columns], use_container_width=True)
def _display_comparison_charts(self, comparison_df: pd.DataFrame):
"""Display comparison charts"""
st.write("**Performance Comparison**")
if "test_accuracy" in comparison_df.columns:
fig = px.bar(
comparison_df,
x="name",
y="test_accuracy",
color="model_type",
title="Test Accuracy Comparison",
)
fig.update_layout(xaxis_tickangle=-45)
st.plotly_chart(fig, use_container_width=True)
# Metric comparison across multiple metrics
metric_columns = [
col for col in comparison_df.columns if col.startswith("test_") or col.startswith("cv_")
]
if len(metric_columns) > 1:
metric_to_plot = st.selectbox("Select Metric for Detailed Comparison", metric_columns)
if metric_to_plot in comparison_df.columns:
fig = px.bar(
comparison_df,
x="name",
y=metric_to_plot,
color="model_type",
title=f"{metric_to_plot.replace('_', ' ').title()} Comparison",
)
fig.update_layout(xaxis_tickangle=-45)
st.plotly_chart(fig, use_container_width=True)
def show_performance_analysis(self):
"""Show performance analysis across experiments"""
st.subheader("Performance Analysis")
experiments = self.experiment_tracker.list_experiments()
completed_experiments = [
e for e in experiments if e.status.value == "completed" and e.test_metrics
]
if not completed_experiments:
st.warning("No completed experiments with metrics found.")
return
# Prepare data for analysis
analysis_data = self._prepare_analysis_data(completed_experiments)
analysis_df = pd.DataFrame(analysis_data)
self._display_performance_trends(analysis_df)
self._display_model_comparison(analysis_df)
self._display_top_experiments(analysis_df)
def _prepare_analysis_data(self, completed_experiments: List) -> List[dict]:
"""Prepare data for performance analysis"""
analysis_data = []
for exp in completed_experiments:
row = {
"experiment_id": exp.experiment_id,
"name": exp.config.name,
"model_type": exp.config.model_type,
"feature_count": len(exp.config.features),
"features": ", ".join([f.value for f in exp.config.features]),
"train_size": exp.train_size,
"test_size": exp.test_size,
**exp.test_metrics,
}
analysis_data.append(row)
return analysis_data
def _display_performance_trends(self, analysis_df: pd.DataFrame):
"""Display performance trend charts"""
col1, col2 = st.columns(2)
with col1:
# Accuracy vs Training Size
if "accuracy" in analysis_df.columns and "train_size" in analysis_df.columns:
fig = px.scatter(
analysis_df,
x="train_size",
y="accuracy",
color="model_type",
hover_data=["name"],
title="Accuracy vs Training Size",
)
st.plotly_chart(fig, use_container_width=True)
with col2:
# Feature Count vs Performance
if "accuracy" in analysis_df.columns and "feature_count" in analysis_df.columns:
fig = px.scatter(
analysis_df,
x="feature_count",
y="accuracy",
color="model_type",
hover_data=["name"],
title="Accuracy vs Number of Features",
)
st.plotly_chart(fig, use_container_width=True)
def _display_model_comparison(self, analysis_df: pd.DataFrame):
"""Display model type comparison"""
if "accuracy" in analysis_df.columns:
model_performance = (
analysis_df.groupby("model_type")["accuracy"]
.agg(["mean", "std", "count"])
.reset_index()
)
fig = go.Figure()
fig.add_trace(
go.Bar(
x=model_performance["model_type"],
y=model_performance["mean"],
error_y=dict(type="data", array=model_performance["std"]),
name="Average Accuracy",
)
)
fig.update_layout(title="Average Accuracy by Model Type", yaxis_title="Accuracy")
st.plotly_chart(fig, use_container_width=True)
def _display_top_experiments(self, analysis_df: pd.DataFrame):
"""Display top performing experiments"""
st.subheader("Top Performing Experiments")
if "accuracy" in analysis_df.columns:
display_columns = ["name", "model_type", "features", "accuracy"]
# Add other metrics if available
for metric in ["precision", "recall", "f1"]:
if metric in analysis_df.columns:
display_columns.append(metric)
top_experiments = analysis_df.nlargest(5, "accuracy")[display_columns]
st.dataframe(top_experiments, use_container_width=True)
def show_model_analysis(self):
"""Show detailed model analysis"""
st.subheader("Model Analysis")
experiments = self.experiment_tracker.list_experiments()
completed_experiments = [e for e in experiments if e.status.value == "completed"]
if not completed_experiments:
st.warning("No completed experiments found.")
return
# Select experiment for detailed analysis
exp_options = {
f"{exp.config.name} ({exp.experiment_id[:8]})": exp for exp in completed_experiments
}
selected_exp_name = st.selectbox(
"Select Experiment for Detailed Analysis", list(exp_options.keys())
)
if not selected_exp_name:
return
selected_exp = exp_options[selected_exp_name]
self._display_experiment_details(selected_exp)
self._display_confusion_matrix(selected_exp)
self._display_feature_importance(selected_exp)
self._display_prediction_examples(selected_exp)
def _display_experiment_details(self, experiment):
"""Display experiment configuration and metrics"""
col1, col2 = st.columns(2)
with col1:
st.write("**Experiment Configuration**")
st.json(
{
"name": experiment.config.name,
"model_type": experiment.config.model_type,
"features": [f.value for f in experiment.config.features],
"model_params": experiment.config.model_params,
}
)
with col2:
st.write("**Performance Metrics**")
if experiment.test_metrics:
for metric, value in experiment.test_metrics.items():
st.metric(metric.title(), f"{value:.4f}")
def _display_confusion_matrix(self, experiment):
"""Display confusion matrix if available"""
if experiment.confusion_matrix:
st.write("**Confusion Matrix**")
cm = np.array(experiment.confusion_matrix)
fig = px.imshow(cm, text_auto=True, aspect="auto", title="Confusion Matrix")
st.plotly_chart(fig, use_container_width=True)
def _display_feature_importance(self, experiment):
"""Display feature importance if available"""
if experiment.feature_importance:
st.write("**Feature Importance**")
importance_data = sorted(
experiment.feature_importance.items(), key=lambda x: x[1], reverse=True
)[:20]
features, importances = zip(*importance_data)
fig = px.bar(
x=list(importances),
y=list(features),
orientation="h",
title="Top 20 Feature Importances",
)
fig.update_layout(height=600)
st.plotly_chart(fig, use_container_width=True)
def _display_prediction_examples(self, experiment):
"""Display prediction examples if available"""
if experiment.prediction_examples:
st.write("**Prediction Examples**")
examples_df = pd.DataFrame(experiment.prediction_examples)
# Separate correct and incorrect predictions
correct_examples = examples_df[examples_df["correct"]]
incorrect_examples = examples_df[~examples_df["correct"]]
col1, col2 = st.columns(2)
with col1:
st.write("**Correct Predictions**")
if not correct_examples.empty:
st.dataframe(
correct_examples[["name", "true_label", "predicted_label"]],
use_container_width=True,
)
with col2:
st.write("**Incorrect Predictions**")
if not incorrect_examples.empty:
st.dataframe(
incorrect_examples[["name", "true_label", "predicted_label"]],
use_container_width=True,
)
+2 -16
@@ -3,8 +3,8 @@ import argparse
import sys
from core.config.config_manager import ConfigManager
-from processing.monitoring.pipeline_monitor import PipelineMonitor
from processing.monitoring.data_analyzer import DatasetAnalyzer
+from processing.monitoring.pipeline_monitor import PipelineMonitor
def main():
@@ -112,29 +112,15 @@ def main():
return 1
completion_stats = analyzer.analyze_completion()
quality_stats = analyzer.analyze_quality()
print(f"\n=== Dataset Analysis: {args.file} ===")
print(f"Total rows: {completion_stats['total_rows']:,}")
-print(
-f"Annotated: {completion_stats['annotated_rows']:,} ({completion_stats['annotation_percentage']:.1f}%)"
-)
+print(f"Annotated: {completion_stats['annotated_rows']:,} ({completion_stats['annotation_percentage']:.1f}%)")
print(f"Unannotated: {completion_stats['unannotated_rows']:,}")
print(
f"Complete names: {completion_stats['complete_names']:,} ({completion_stats['completeness_percentage']:.1f}%)"
)
if "name_length" in quality_stats:
length_stats = quality_stats["name_length"]
print(f"\nName length statistics:")
print(f" Average: {length_stats['mean']:.1f} characters")
print(f" Range: {length_stats['min']}-{length_stats['max']} characters")
if "word_distribution" in quality_stats:
print(f"\nWord count distribution:")
for words, count in quality_stats["word_distribution"].items():
print(f" {words} words: {count:,} names")
elif args.command == "info":
checkpoint_info = monitor.count_checkpoint_files()
-28
@@ -50,31 +50,3 @@ class DatasetAnalyzer:
"complete_names": complete_names,
"completeness_percentage": (complete_names / total_rows * 100) if total_rows > 0 else 0,
}
def analyze_quality(self) -> Dict:
"""Analyze data quality metrics"""
if self.df is None:
return {}
quality_metrics = {}
# Missing values
missing_data = self.df.isnull().sum()
quality_metrics["missing_values"] = missing_data.to_dict()
# Name length distribution
if "name" in self.df.columns:
name_lengths = self.df["name"].str.len()
quality_metrics["name_length"] = {
"mean": name_lengths.mean(),
"median": name_lengths.median(),
"min": name_lengths.min(),
"max": name_lengths.max(),
}
# Word count distribution
if "words" in self.df.columns:
word_counts = self.df["words"].value_counts().sort_index()
quality_metrics["word_distribution"] = word_counts.to_dict()
return quality_metrics
+1 -1
@@ -39,7 +39,7 @@ class FeatureExtractionStep(PipelineStep):
@classmethod
def get_name_category(cls, word_count: int) -> NameCategory:
"""Determine name category based on word count"""
-if word_count <= 3:
+if word_count == 3:
return NameCategory.SIMPLE
else:
return NameCategory.COMPOSE
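This comparison change narrows the SIMPLE category: under word_count <= 3, one- and two-word names also counted as SIMPLE; under word_count == 3, only exactly three words qualify. A worked illustration, with plain strings standing in for the NameCategory members:

    for word_count in (1, 2, 3, 4):
        old = "SIMPLE" if word_count <= 3 else "COMPOSE"
        new = "SIMPLE" if word_count == 3 else "COMPOSE"
        print(word_count, old, new)  # 1 and 2 flip from SIMPLE to COMPOSE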
+33 -32
@@ -11,6 +11,7 @@ from core.utils.data_loader import DataLoader
from research.experiment import FeatureType, ExperimentConfig
from research.experiment.experiment_runner import ExperimentRunner
from research.experiment.experiment_tracker import ExperimentTracker
+from research.model_registry import MODEL_REGISTRY
class ModelTrainer:
@@ -21,25 +22,24 @@ class ModelTrainer:
self.data_loader = DataLoader(self.config)
self.experiment_runner = ExperimentRunner(self.config)
self.experiment_tracker = ExperimentTracker(self.config)
-self.logger = logging.getLogger(__name__)
# Setup model artifacts directory
self.models_dir = self.config.paths.models_dir
self.models_dir.mkdir(parents=True, exist_ok=True)
def train_single_model(
-self,
-model_name: str,
-model_type: str = "logistic_regression",
-features: List[str] = None,
-model_params: Dict[str, Any] = None,
-save_artifacts: bool = True,
+self,
+model_name: str,
+model_type: str = "logistic_regression",
+features: List[str] = None,
+model_params: Dict[str, Any] = None,
+save_artifacts: bool = True,
) -> str:
"""
Train a single model and save its artifacts.
Returns the experiment ID.
"""
self.logger.info(f"Training {model_type} model: {model_name}")
logging.info(f"Training {model_type} model: {model_name}")
if features is None:
features = ["full_name"]
@@ -60,10 +60,10 @@ class ModelTrainer:
experiment = self.experiment_tracker.get_experiment(experiment_id)
if experiment and experiment.test_metrics:
self.logger.info("Training completed successfully!")
self.logger.info(f" Experiment ID: {experiment_id}")
self.logger.info(f" Test Accuracy: {experiment.test_metrics.get('accuracy', 0):.4f}")
self.logger.info(f" Test F1-Score: {experiment.test_metrics.get('f1', 0):.4f}")
logging.info("Training completed successfully!")
logging.info(f"Experiment ID: {experiment_id}")
logging.info(f"Test Accuracy: {experiment.test_metrics.get('accuracy', 0):.4f}")
logging.info(f"Test F1-Score: {experiment.test_metrics.get('f1', 0):.4f}")
if save_artifacts:
self.save_model_artifacts(experiment_id)
@@ -71,12 +71,15 @@ class ModelTrainer:
return experiment_id
def train_multiple_models(
-self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True
+self,
+base_name: str,
+model_configs: List[Dict[str, Any]],
+save_all: bool = True
) -> List[str]:
"""
Train multiple models with different configurations.
"""
self.logger.info(f"Training {len(model_configs)} models...")
logging.info(f"Training {len(model_configs)} models...")
experiment_ids = []
@@ -94,10 +97,10 @@ class ModelTrainer:
experiment_ids.append(exp_id)
except Exception as e:
-self.logger.error(f"Failed to train {model_name}: {e}")
+logging.error(f"Failed to train {model_name}: {e}")
continue
self.logger.info(f"Completed training {len(experiment_ids)} models successfully")
logging.info(f"Completed training {len(experiment_ids)} models successfully")
return experiment_ids
def save_model_artifacts(self, experiment_id: str) -> Dict[str, str]:
@@ -145,7 +148,7 @@ class ModelTrainer:
df = self.data_loader.load_csv_complete(data_path)
# Generate learning curve
self.logger.info("Generating learning curve...")
logging.info("Generating learning curve...")
trained_model.generate_learning_curve(df, df[experiment.config.target_column])
# Plot and save learning curve
@@ -169,7 +172,7 @@ class ModelTrainer:
json.dump(trained_model.training_history, f, indent=2)
except Exception as e:
self.logger.warning(f"Could not generate learning curves: {e}")
logging.warning(f"Could not generate learning curves: {e}")
# Save artifacts metadata
metadata = {
@@ -193,17 +196,17 @@ class ModelTrainer:
with open(metadata_path, "w") as f:
json.dump(metadata, f, indent=2)
self.logger.info(f"Model artifacts saved to: {model_dir}")
self.logger.info(f" - Complete model: {model_path.name}")
self.logger.info(f" - Configuration: {config_path.name}")
self.logger.info(f" - Results: {results_path.name}")
self.logger.info(f" - Metadata: {metadata_path.name}")
logging.info(f"Model artifacts saved to: {model_dir}")
logging.info(f" - Complete model: {model_path.name}")
logging.info(f" - Configuration: {config_path.name}")
logging.info(f" - Results: {results_path.name}")
logging.info(f" - Metadata: {metadata_path.name}")
if learning_curve_path and learning_curve_path.exists():
self.logger.info(f" - Learning curve: {learning_curve_path.name}")
logging.info(f" - Learning curve: {learning_curve_path.name}")
if training_history_path and training_history_path.exists():
self.logger.info(f" - Training history: {training_history_path.name}")
logging.info(f" - Training history: {training_history_path.name}")
return {
"model_dir": str(model_dir),
@@ -231,16 +234,14 @@ class ModelTrainer:
metadata = json.load(f)
model_type = metadata["model_type"]
-from research.model_registry import MODEL_REGISTRY
model_class = MODEL_REGISTRY[model_type]
# Load the complete model
loaded_model = model_class.load(str(model_path))
self.logger.info(f"Loaded model: {metadata['model_name']}")
self.logger.info(f" Type: {model_type}")
self.logger.info(f" Accuracy: {metadata['test_accuracy']:.4f}")
logging.info(f"Loaded model: {metadata['model_name']}")
logging.info(f" Type: {model_type}")
logging.info(f" Accuracy: {metadata['test_accuracy']:.4f}")
return loaded_model
@@ -259,10 +260,10 @@ class ModelTrainer:
metadata = json.load(f)
models_data.append(metadata)
except Exception as e:
self.logger.warning(f"Could not read metadata for {model_dir.name}: {e}")
logging.warning(f"Could not read metadata for {model_dir.name}: {e}")
if not models_data:
self.logger.info("No saved models found.")
logging.info("No saved models found.")
return pd.DataFrame()
df = pd.DataFrame(models_data)
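A sketch of driving the refactored trainer directly. The import path appears in the training script later in this diff; the model name and parameter values are illustrative:

    from research.model_trainer import ModelTrainer

    trainer = ModelTrainer()
    experiment_id = trainer.train_single_model(
        model_name="baseline_lr",  # illustrative name
        model_type="logistic_regression",
        features=["full_name"],
        model_params={"ngram_range": [2, 5], "max_features": 10000},
    )
    print(trainer.list_saved_models())  # summary DataFrame of saved artifacts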
+1 -1
@@ -22,7 +22,7 @@ class LightGBMModel(TraditionalModel):
subsample=params.get("subsample", 0.8),
colsample_bytree=params.get("colsample_bytree", 0.8),
random_state=self.config.random_seed,
-verbose=-1,
+verbose=2,
)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
+3 -1
@@ -20,7 +20,9 @@ class LogisticRegressionModel(TraditionalModel):
)
classifier = LogisticRegression(
-max_iter=params.get("max_iter", 1000), random_state=self.config.random_seed
+max_iter=params.get("max_iter", 1000),
+random_state=self.config.random_seed,
+verbose=2
)
return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])
+1
@@ -18,6 +18,7 @@ class RandomForestModel(TraditionalModel):
n_estimators=params.get("n_estimators", 100),
max_depth=params.get("max_depth", None),
random_state=self.config.random_seed,
+verbose=2
)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
+1
@@ -25,6 +25,7 @@ class SVMModel(TraditionalModel):
gamma=params.get("gamma", "scale"),
probability=True, # Enable probability prediction
random_state=self.config.random_seed,
+verbose=2
)
return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])
+1
@@ -22,6 +22,7 @@ class XGBoostModel(TraditionalModel):
colsample_bytree=params.get("colsample_bytree", 0.8),
random_state=self.config.random_seed,
eval_metric="logloss",
+verbosity=2
)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
+4 -1
@@ -49,6 +49,7 @@ class NeuralNetworkModel(BaseModel):
# Now we can build the model with known vocab size
vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
logging.info(f"Vocabulary size: {vocab_size}")
# Get additional model parameters
max_len = self.config.model_params.get("max_len", 6)
@@ -58,16 +59,18 @@ class NeuralNetworkModel(BaseModel):
)
# Train the neural network
logging.info(f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features")
history = self.model.fit(
X_prepared,
y_encoded,
epochs=self.config.model_params.get("epochs", 10),
batch_size=self.config.model_params.get("batch_size", 64),
validation_split=0.1,
-verbose=1,
+verbose=2,
)
# Store training history
self.training_history = {
"accuracy": history.history["accuracy"],
"loss": history.history["loss"],
+2 -1
@@ -50,7 +50,8 @@ class TraditionalModel(BaseModel):
y_encoded = self.label_encoder.transform(y)
# Train model
-self.model.fit(X_prepared, y_encoded)
+logging.info(f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features")
+self.model.fit(X_prepared, y_encoded, verbose=2)
self.is_fitted = True
return self
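One caveat worth illustrating: when the wrapped model is a scikit-learn Pipeline, fit-time keyword arguments are routed to steps with the step__param naming convention, and estimator verbosity is normally set at construction rather than in fit(). A self-contained sketch of the convention, independent of this repo's classes:

    import numpy as np
    from sklearn.linear_model import SGDClassifier
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    X = np.array([[0.0], [1.0], [2.0], [3.0]])
    y = np.array([0, 0, 1, 1])

    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("classifier", SGDClassifier(verbose=2)),  # verbosity set at construction
    ])
    # A fit param must name its target step:
    pipe.fit(X, y, classifier__sample_weight=np.ones(len(y)))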
File mode changed: Regular → Executable
+9 -135
@@ -1,151 +1,25 @@
#!.venv/bin/python3
import logging
import argparse
from core.config import setup_logging, get_config
from research.model_trainer import ModelTrainer
def train_baseline_models():
"""
Quick function to train all baseline models and save artifacts.
"""
logger = logging.getLogger(__name__)
logger.info("Training Baseline Models with Artifact Saving")
trainer = ModelTrainer()
# Define baseline model configurations
baseline_configs = [
{
"model_type": "logistic_regression",
"features": ["full_name"],
"model_params": {"ngram_range": [2, 5], "max_features": 10000},
},
{
"model_type": "logistic_regression",
"features": ["native_name"],
"model_params": {"ngram_range": [2, 4], "max_features": 5000},
},
{
"model_type": "logistic_regression",
"features": ["surname"],
"model_params": {"ngram_range": [2, 4], "max_features": 5000},
},
{
"model_type": "random_forest",
"features": ["name_length", "word_count", "province"],
"model_params": {"n_estimators": 100, "max_depth": 10},
},
{
"model_type": "svm",
"features": ["full_name"],
"model_params": {"kernel": "rbf", "C": 1.0},
},
{"model_type": "naive_bayes", "features": ["full_name"], "model_params": {"alpha": 1.0}},
]
# Train all baseline models
experiment_ids = trainer.train_multiple_models("baseline", baseline_configs)
# Show summary
logger.info(f"\n Training Summary:")
for exp_id in experiment_ids:
experiment = trainer.experiment_tracker.get_experiment(exp_id)
if experiment:
acc = experiment.test_metrics.get("accuracy", 0)
logger.info(f" {experiment.config.name}: {acc:.4f} accuracy")
return experiment_ids
def train_neural_networks():
"""
Train neural network models with proper parameters.
"""
logging.info("Training Neural Network Models")
trainer = ModelTrainer()
neural_configs = [
{
"model_type": "lstm",
"features": ["full_name"],
"model_params": {
"embedding_dim": 64,
"lstm_units": 32,
"epochs": 10,
"batch_size": 64,
"max_len": 6,
},
},
{
"model_type": "cnn",
"features": ["full_name"],
"model_params": {
"embedding_dim": 64,
"filters": 64,
"kernel_size": 3,
"epochs": 10,
"batch_size": 64,
"max_len": 20, # Character level
},
},
{
"model_type": "transformer",
"features": ["full_name"],
"model_params": {
"embedding_dim": 64,
"transformer_num_heads": 2,
"epochs": 10,
"batch_size": 64,
"max_len": 6,
},
},
]
experiment_ids = trainer.train_multiple_models("neural_networks", neural_configs)
return experiment_ids
def main():
"""
Main training script with different options.
"""
setup_logging(get_config())
parser = argparse.ArgumentParser(description="Train DRC Names Models")
parser.add_argument(
"--mode",
choices=["baseline", "neural", "list"],
default="list",
help="Training mode",
)
parser.add_argument("--model-type", type=str, help="Specific model type to train")
parser.add_argument("--type", type=str, help="Specific model type to train")
parser.add_argument("--name", type=str, help="Model name")
args = parser.parse_args()
trainer = ModelTrainer()
if args.mode == "baseline":
train_baseline_models()
elif args.mode == "neural":
train_neural_networks()
elif args.mode == "list":
logging.info("📋 Saved Models:")
saved_models = trainer.list_saved_models()
if not saved_models.empty:
logging.info(saved_models.to_string(index=False))
else:
logging.info("No saved models found.")
elif args.model_type and args.name:
-# Train specific model
-trainer.train_single_model(
-model_name=args.name, model_type=args.model_type, features=["full_name"]
-)
+# Train specific model
+trainer.train_single_model(
+model_name=args.name,
+model_type=args.type,
+features=["full_name"]
+)
if __name__ == "__main__":
    main()