This commit is contained in:
2025-08-15 08:08:11 +02:00
parent 9601c5e44d
commit 7b652d6999
17 changed files with 28 additions and 60 deletions
+1 -1
View File
@@ -3,7 +3,7 @@ import argparse
import streamlit as st
from core.config import get_config, setup_config, PipelineConfig
from core.config import setup_config, PipelineConfig
from core.utils.data_loader import DataLoader
from interface.configuration import Configuration
from interface.dashboard import Dashboard
+3
View File
@@ -21,3 +21,6 @@ class ProjectPaths(BaseModel):
@field_validator("*", mode="before")
@classmethod
def convert_to_path(cls, v):
    """Coerce an incoming field value to ``pathlib.Path``.

    Registered for every field (``"*"``) in ``before`` mode. A value that
    is already a ``Path`` is returned unchanged; anything else is passed
    to the ``Path`` constructor.
    """
    # Explicit @classmethod (below @field_validator) is the form the
    # Pydantic documentation recommends for validators that take ``cls``.
    if isinstance(v, Path):
        return v
    return Path(v)
def get_data_path(self, filename: str) -> Path:
    """Return the location of ``filename`` inside ``self.data_dir``."""
    data_root = self.data_dir
    return data_root / filename
-15
View File
@@ -44,18 +44,3 @@ def ensure_directories(config: "PipelineConfig") -> None:
Path(directory).mkdir(parents=True, exist_ok=True)
logging.info("Ensured all required directories exist")
def get_data_file_path(filename: str, config: "PipelineConfig") -> Path:
    """Build the full path of a data file.

    Joins ``filename`` onto ``config.paths.data_dir``.
    """
    base_dir = config.paths.data_dir
    return base_dir / filename
def get_model_file_path(filename: str, config: "PipelineConfig") -> Path:
    """Build the full path of a model file.

    Joins ``filename`` onto ``config.paths.models_dir``.
    """
    base_dir = config.paths.models_dir
    return base_dir / filename
def get_output_file_path(filename: str, config: "PipelineConfig") -> Path:
    """Build the full path of an output file.

    Joins ``filename`` onto ``config.paths.outputs_dir``.
    """
    base_dir = config.paths.outputs_dir
    return base_dir / filename
+2 -2
View File
@@ -1,10 +1,10 @@
import pandas as pd
import streamlit as st
from core.utils import get_data_file_path
from core.utils.data_loader import OPTIMIZED_DTYPES
@st.cache_data
def load_dataset(file_path: str) -> pd.DataFrame:
try:
return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
@@ -25,7 +25,7 @@ class Dashboard:
# Load basic statistics
try:
data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
data_path = self.config.paths.get_data_path(self.config.data.output_files["featured"])
if data_path.exists():
df = load_dataset(str(data_path))
+2 -2
View File
@@ -4,10 +4,10 @@ import pandas as pd
import plotly.express as px
import streamlit as st
from core.utils import get_data_file_path
from core.utils.data_loader import OPTIMIZED_DTYPES
@st.cache_data
def load_dataset(file_path: str) -> pd.DataFrame:
try:
return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
@@ -31,7 +31,7 @@ class DataOverview:
}
selected_file = st.selectbox("Select Dataset", list(data_files.keys()))
file_path = get_data_file_path(data_files[selected_file], self.config)
file_path = self.config.paths.get_data_path(data_files[selected_file])
if not file_path.exists():
st.warning(f"Dataset not found: {file_path}")
+1
View File
@@ -6,6 +6,7 @@ from core.utils.data_loader import OPTIMIZED_DTYPES
from interface.log_reader import LogReader
@st.cache_data
def load_dataset(file_path: str) -> pd.DataFrame:
try:
return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
+1 -4
View File
@@ -1,5 +1,3 @@
"""Predictions interface for the Streamlit app"""
from datetime import datetime
from typing import Optional
@@ -8,7 +6,6 @@ import pandas as pd
import plotly.express as px
import streamlit as st
from core.utils import get_data_file_path
from core.utils.data_loader import OPTIMIZED_DTYPES
from research.experiment.experiment_runner import ExperimentRunner
from research.experiment.experiment_tracker import ExperimentTracker
@@ -268,7 +265,7 @@ class Predictions:
}
selected_dataset = st.selectbox("Select Dataset", list(dataset_options.keys()))
file_path = get_data_file_path(dataset_options[selected_dataset], self.config)
file_path = self.config.paths.get_data_path(dataset_options[selected_dataset])
if not file_path.exists():
st.warning(f"Dataset not found: {file_path}")
+2 -6
View File
@@ -5,7 +5,6 @@ import sys
import traceback
from core.config import setup_config
from core.utils import get_data_file_path
from core.utils.data_loader import DataLoader
from processing.batch.batch_config import BatchConfig
from processing.pipeline import Pipeline
@@ -47,7 +46,7 @@ def run_pipeline(config) -> int:
logging.info(f"Starting pipeline: {config.name} v{config.version}")
# Load input data
input_file_path = get_data_file_path(config.data.input_file, config)
input_file_path = config.paths.get_data_path(config.data.input_file)
if not input_file_path.exists():
logging.error(f"Input file not found: {input_file_path}")
return 1
@@ -60,8 +59,6 @@ def run_pipeline(config) -> int:
# Create and run pipeline
pipeline = create_pipeline(config)
logging.info("Starting pipeline execution")
data_splitter.split(pipeline.run(df))
# Show completion statistics
@@ -104,5 +101,4 @@ def main():
if __name__ == "__main__":
exit_code = main()
sys.exit(exit_code)
sys.exit(main())
+1 -2
View File
@@ -86,5 +86,4 @@ def main():
if __name__ == "__main__":
exit_code = main()
sys.exit(exit_code)
sys.exit(main())
+3 -6
View File
@@ -1,8 +1,8 @@
#!/usr/bin/env python3
import argparse
import logging
import sys
import os
import sys
import traceback
from pathlib import Path
@@ -48,21 +48,18 @@ def train(config: PipelineConfig):
def run_pipeline(config: PipelineConfig, reset: bool = False):
# Step 1: Feature engineering
if not reset and os.path.exists(config.paths.data_dir / config.data.output_files["engineered"]):
if not reset and os.path.exists(config.paths.get_data_path(config.data.output_files["engineered"])):
logging.info("Step 1: Feature engineering already done.")
else:
logging.info("Step 1: Running feature engineering")
feature(config)
# Step 2: Build dataset
if not reset and os.path.exists(config.paths.data_dir / config.data.output_files["ner_data"]):
if not reset and os.path.exists(config.paths.get_data_path(config.data.output_files["ner_data"])):
logging.info("Step 2: NER dataset already built.")
else:
logging.info("Step 2: Building NER dataset")
build(config)
# Step 3: Train model
logging.info("Step 3: Training NER Model")
train(config)
+4 -7
View File
@@ -9,7 +9,6 @@ from spacy.tokens import DocBin
from spacy.util import filter_spans
from core.config import PipelineConfig
from core.utils import get_data_file_path
from core.utils.data_loader import DataLoader
@@ -98,10 +97,8 @@ class NERDataBuilder:
return docs
def build(self) -> int:
input_filepath = get_data_file_path(
self.config.data.output_files["engineered"], self.config
)
df = self.data_loader.load_csv_complete(input_filepath)
filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
df = self.data_loader.load_csv_complete(filepath)
df = df[["name", "ner_tagged", "ner_entities"]]
# Filter early
@@ -139,8 +136,8 @@ class NERDataBuilder:
doc_bin = DocBin(docs=docs)
# Save
json_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_data"]
spacy_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_spacy"]
json_path = self.config.paths.get_data_path(self.config.data.output_files["ner_data"])
spacy_path = self.config.paths.get_data_path(self.config.data.output_files["ner_spacy"])
with open(json_path, "w", encoding="utf-8") as f:
json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
+3 -6
View File
@@ -7,7 +7,6 @@ import pandas as pd
from tqdm import tqdm
from core.config import PipelineConfig
from core.utils import get_data_file_path
from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader
from processing.ner.formats.connectors_format import ConnectorFormatter
from processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
@@ -55,7 +54,7 @@ class NEREngineering:
def load_data(self) -> pd.DataFrame:
"""Load and filter NER-tagged data from CSV file"""
filepath = get_data_file_path(self.config.data.output_files["featured"], self.config)
filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
df = self.data_loader.load_csv_complete(filepath)
# Filter only NER-tagged rows
@@ -66,10 +65,8 @@ class NEREngineering:
def compute(self) -> None:
logging.info("Applying feature engineering transformations...")
input_filepath = get_data_file_path(self.config.data.output_files["featured"], self.config)
output_filepath = get_data_file_path(
self.config.data.output_files["engineered"], self.config
)
input_filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
output_filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
df = self.data_loader.load_csv_complete(input_filepath)
ner_df = df[df["ner_tagged"] == 1].copy()
+1 -1
View File
@@ -127,7 +127,7 @@ class FeatureExtractionStep(PipelineStep):
def _process_simple_names(self, df: pd.DataFrame) -> None:
"""Process 3-word names efficiently with vectorized operations"""
mask = df["words"] == 3
mask = pd.Series(df["words"] == 3)
if not mask.any():
return
+2 -3
View File
@@ -10,7 +10,6 @@ from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from core.config import PipelineConfig
from core.utils import get_data_file_path
from core.utils.data_loader import DataLoader
from research.base_model import BaseModel
from research.experiment import ExperimentConfig, ExperimentStatus, calculate_metrics
@@ -36,8 +35,8 @@ class ExperimentRunner:
self.tracker.update_experiment(experiment_id, status=ExperimentStatus.RUNNING)
# Load data
data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
df = self.data_loader.load_csv_complete(data_path)
filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
df = self.data_loader.load_csv_complete(filepath)
# Apply data filters if specified
df = self._apply_data_filters(df, experiment_config)
+1 -2
View File
@@ -6,7 +6,6 @@ from typing import List, Dict, Any
import pandas as pd
from core.config import get_config
from core.utils import get_data_file_path
from core.utils.data_loader import DataLoader
from research.experiment import FeatureType, ExperimentConfig
from research.experiment.experiment_runner import ExperimentRunner
@@ -145,7 +144,7 @@ class ModelTrainer:
try:
# Load data for learning curve generation
data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
data_path = self.config.paths.get_data_path(self.config.data.output_files["featured"])
if data_path.exists():
df = self.data_loader.load_csv_complete(data_path)
-1
View File
@@ -72,7 +72,6 @@ class NeuralNetworkModel(BaseModel):
)
# Store training history
self.training_history = {
"accuracy": history.history["accuracy"],
"loss": history.history["loss"],
+1 -2
View File
@@ -112,5 +112,4 @@ def main():
if __name__ == "__main__":
exit_code = main()
sys.exit(exit_code)
sys.exit(main())