hotfixes
@@ -3,7 +3,7 @@ import argparse
 import streamlit as st
 
-from core.config import get_config, setup_config, PipelineConfig
+from core.config import setup_config, PipelineConfig
 from core.utils.data_loader import DataLoader
 from interface.configuration import Configuration
 from interface.dashboard import Dashboard
 
@@ -21,3 +21,6 @@ class ProjectPaths(BaseModel):
     @field_validator("*", mode="before")
     def convert_to_path(cls, v):
         return Path(v) if not isinstance(v, Path) else v
+
+    def get_data_path(self, filename: str) -> Path:
+        return self.data_dir / filename
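The hunk above adds the one new piece of API in this commit: ProjectPaths.get_data_path, which every other change migrates to. A minimal runnable sketch of how the validator and the helper interact, assuming Pydantic v2 and treating the data_dir default as illustrative:

    from pathlib import Path

    from pydantic import BaseModel, field_validator


    class ProjectPaths(BaseModel):
        data_dir: Path = Path("data")  # default here is an assumption, not from the diff

        @field_validator("*", mode="before")
        def convert_to_path(cls, v):
            # Coerce strings coming from config files into Path objects
            return Path(v) if not isinstance(v, Path) else v

        def get_data_path(self, filename: str) -> Path:
            return self.data_dir / filename


    paths = ProjectPaths(data_dir="data/processed")  # str is coerced by the validator
    print(paths.get_data_path("features.csv"))       # data/processed/features.csv

Because the method lives on the model, callers no longer need the full pipeline config just to join a filename onto data_dir.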
@@ -44,18 +44,3 @@ def ensure_directories(config: "PipelineConfig") -> None:
         Path(directory).mkdir(parents=True, exist_ok=True)
 
     logging.info("Ensured all required directories exist")
-
-
-def get_data_file_path(filename: str, config: "PipelineConfig") -> Path:
-    """Get full path for a data file"""
-    return config.paths.data_dir / filename
-
-
-def get_model_file_path(filename: str, config: "PipelineConfig") -> Path:
-    """Get full path for a model file"""
-    return config.paths.models_dir / filename
-
-
-def get_output_file_path(filename: str, config: "PipelineConfig") -> Path:
-    """Get full path for an output file"""
-    return config.paths.outputs_dir / filename
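The three module-level helpers are deleted because they only wrapped a path join while forcing callers to pass config twice, once for the filename and once as an argument. The call-site migration, in the shape repeated throughout the hunks below:

    # Before: free function, config threaded through redundantly
    input_path = get_data_file_path(config.data.input_file, config)

    # After: method on the config's own paths model
    input_path = config.paths.get_data_path(config.data.input_file)

Note that get_model_file_path and get_output_file_path are removed without a replacement shown in this diff; presumably their callers join config.paths.models_dir and config.paths.outputs_dir directly or gain analogous methods.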
@@ -1,10 +1,10 @@
 import pandas as pd
 import streamlit as st
 
-from core.utils import get_data_file_path
 from core.utils.data_loader import OPTIMIZED_DTYPES
 
+
 @st.cache_data
 def load_dataset(file_path: str) -> pd.DataFrame:
     try:
         return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
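@st.cache_data memoizes load_dataset on its arguments, so Streamlit reruns reuse the parsed DataFrame instead of re-reading the CSV; callers pass str(data_path) because the string makes a stable cache key. A self-contained sketch, with a placeholder dtype map standing in for the project's OPTIMIZED_DTYPES:

    import pandas as pd
    import streamlit as st

    # Placeholder for the project's real dtype map; narrow dtypes reduce memory
    OPTIMIZED_DTYPES = {"ner_tagged": "int8", "words": "int16"}


    @st.cache_data
    def load_dataset(file_path: str) -> pd.DataFrame:
        # Result is cached per file_path across script reruns
        return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)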
@@ -25,7 +25,7 @@ class Dashboard:
 
         # Load basic statistics
         try:
-            data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
+            data_path = self.config.paths.get_data_path(self.config.data.output_files["featured"])
             if data_path.exists():
                 df = load_dataset(str(data_path))
 
@@ -4,10 +4,10 @@ import pandas as pd
 import plotly.express as px
 import streamlit as st
 
-from core.utils import get_data_file_path
 from core.utils.data_loader import OPTIMIZED_DTYPES
 
+
 @st.cache_data
 def load_dataset(file_path: str) -> pd.DataFrame:
     try:
         return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
@@ -31,7 +31,7 @@ class DataOverview:
         }
 
         selected_file = st.selectbox("Select Dataset", list(data_files.keys()))
-        file_path = get_data_file_path(data_files[selected_file], self.config)
+        file_path = self.config.paths.get_data_path(data_files[selected_file])
 
         if not file_path.exists():
             st.warning(f"Dataset not found: {file_path}")
@@ -6,6 +6,7 @@ from core.utils.data_loader import OPTIMIZED_DTYPES
 from interface.log_reader import LogReader
 
+
 @st.cache_data
 def load_dataset(file_path: str) -> pd.DataFrame:
     try:
         return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
@@ -1,5 +1,3 @@
 """Predictions interface for the Streamlit app"""
 
-from datetime import datetime
-from typing import Optional
 
@@ -8,7 +6,6 @@ import pandas as pd
 import plotly.express as px
 import streamlit as st
 
-from core.utils import get_data_file_path
 from core.utils.data_loader import OPTIMIZED_DTYPES
 from research.experiment.experiment_runner import ExperimentRunner
 from research.experiment.experiment_tracker import ExperimentTracker
@@ -268,7 +265,7 @@ class Predictions:
         }
 
         selected_dataset = st.selectbox("Select Dataset", list(dataset_options.keys()))
-        file_path = get_data_file_path(dataset_options[selected_dataset], self.config)
+        file_path = self.config.paths.get_data_path(dataset_options[selected_dataset])
 
         if not file_path.exists():
             st.warning(f"Dataset not found: {file_path}")
@@ -5,7 +5,6 @@ import sys
 import traceback
 
 from core.config import setup_config
-from core.utils import get_data_file_path
 from core.utils.data_loader import DataLoader
 from processing.batch.batch_config import BatchConfig
 from processing.pipeline import Pipeline
@@ -47,7 +46,7 @@ def run_pipeline(config) -> int:
     logging.info(f"Starting pipeline: {config.name} v{config.version}")
 
     # Load input data
-    input_file_path = get_data_file_path(config.data.input_file, config)
+    input_file_path = config.paths.get_data_path(config.data.input_file)
     if not input_file_path.exists():
         logging.error(f"Input file not found: {input_file_path}")
         return 1
@@ -60,8 +59,6 @@ def run_pipeline(config) -> int:
 
     # Create and run pipeline
    pipeline = create_pipeline(config)
-
-    logging.info("Starting pipeline execution")
     data_splitter.split(pipeline.run(df))
 
     # Show completion statistics
@@ -104,5 +101,4 @@ def main():
 
 
 if __name__ == "__main__":
-    exit_code = main()
-    sys.exit(exit_code)
+    sys.exit(main())
@@ -86,5 +86,4 @@ def main():
 
 
 if __name__ == "__main__":
-    exit_code = main()
-    sys.exit(exit_code)
+    sys.exit(main())
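Both entry points collapse the temporary exit_code variable into the standard Python idiom: main() returns an int status and sys.exit forwards it to the shell. The resulting shape:

    import sys


    def main() -> int:
        # ... run the program; 0 means success, non-zero means failure
        return 0


    if __name__ == "__main__":
        sys.exit(main())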
@@ -1,8 +1,8 @@
 #!/usr/bin/env python3
 import argparse
 import logging
-import sys
 import os
+import sys
 import traceback
 from pathlib import Path
 
@@ -48,21 +48,18 @@ def train(config: PipelineConfig):
 
 
 def run_pipeline(config: PipelineConfig, reset: bool = False):
     # Step 1: Feature engineering
-    if not reset and os.path.exists(config.paths.data_dir / config.data.output_files["engineered"]):
+    if not reset and os.path.exists(config.paths.get_data_path(config.data.output_files["engineered"])):
         logging.info("Step 1: Feature engineering already done.")
     else:
         logging.info("Step 1: Running feature engineering")
         feature(config)
 
     # Step 2: Build dataset
-    if not reset and os.path.exists(config.paths.data_dir / config.data.output_files["ner_data"]):
+    if not reset and os.path.exists(config.paths.get_data_path(config.data.output_files["ner_data"])):
         logging.info("Step 2: NER dataset already built.")
     else:
         logging.info("Step 2: Building NER dataset")
         build(config)
 
     # Step 3: Train model
     logging.info("Step 3: Training NER Model")
     train(config)
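get_data_path returns a pathlib.Path, and os.path.exists accepts path-like objects, so the mixed style above works; the same checks could also stay entirely in the Path API. A sketch of the equivalent step-1 guard:

    engineered = config.paths.get_data_path(config.data.output_files["engineered"])
    if not reset and engineered.exists():  # Path.exists() in place of os.path.exists()
        logging.info("Step 1: Feature engineering already done.")
    else:
        logging.info("Step 1: Running feature engineering")
        feature(config)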
@@ -9,7 +9,6 @@ from spacy.tokens import DocBin
 from spacy.util import filter_spans
 
 from core.config import PipelineConfig
-from core.utils import get_data_file_path
 from core.utils.data_loader import DataLoader
 
 
||||
@@ -98,10 +97,8 @@ class NERDataBuilder:
|
||||
return docs
|
||||
|
||||
def build(self) -> int:
|
||||
input_filepath = get_data_file_path(
|
||||
self.config.data.output_files["engineered"], self.config
|
||||
)
|
||||
df = self.data_loader.load_csv_complete(input_filepath)
|
||||
filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
|
||||
df = self.data_loader.load_csv_complete(filepath)
|
||||
df = df[["name", "ner_tagged", "ner_entities"]]
|
||||
|
||||
# Filter early
|
||||
@@ -139,8 +136,8 @@ class NERDataBuilder:
         doc_bin = DocBin(docs=docs)
 
         # Save
-        json_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_data"]
-        spacy_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_spacy"]
+        json_path = self.config.paths.get_data_path(self.config.data.output_files["ner_data"])
+        spacy_path = self.config.paths.get_data_path(self.config.data.output_files["ner_spacy"])
 
         with open(json_path, "w", encoding="utf-8") as f:
             json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
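The builder persists the same training data in two formats: compact JSON for inspection and a spaCy DocBin for training. A sketch of the save step, assuming training_data is JSON-serializable and docs is the list of Doc objects built above; the to_disk call is the standard DocBin save and stands in for whatever the unshown code does with spacy_path:

    import json

    from spacy.tokens import DocBin

    doc_bin = DocBin(docs=docs)

    json_path = config.paths.get_data_path(config.data.output_files["ner_data"])
    spacy_path = config.paths.get_data_path(config.data.output_files["ner_spacy"])

    # Compact separators keep the JSON small; ensure_ascii=False preserves accents
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))

    doc_bin.to_disk(spacy_path)  # binary .spacy file consumed by spacy train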
@@ -7,7 +7,6 @@ import pandas as pd
 from tqdm import tqdm
 
 from core.config import PipelineConfig
-from core.utils import get_data_file_path
 from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader
 from processing.ner.formats.connectors_format import ConnectorFormatter
 from processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
@@ -55,7 +54,7 @@ class NEREngineering:
     def load_data(self) -> pd.DataFrame:
         """Load and filter NER-tagged data from CSV file"""
 
-        filepath = get_data_file_path(self.config.data.output_files["featured"], self.config)
+        filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
         df = self.data_loader.load_csv_complete(filepath)
 
         # Filter only NER-tagged rows
@@ -66,10 +65,8 @@ class NEREngineering:
 
     def compute(self) -> None:
         logging.info("Applying feature engineering transformations...")
-        input_filepath = get_data_file_path(self.config.data.output_files["featured"], self.config)
-        output_filepath = get_data_file_path(
-            self.config.data.output_files["engineered"], self.config
-        )
+        input_filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
+        output_filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
 
         df = self.data_loader.load_csv_complete(input_filepath)
         ner_df = df[df["ner_tagged"] == 1].copy()
@@ -127,7 +127,7 @@ class FeatureExtractionStep(PipelineStep):
 
     def _process_simple_names(self, df: pd.DataFrame) -> None:
         """Process 3-word names efficiently with vectorized operations"""
-        mask = df["words"] == 3
+        mask = pd.Series(df["words"] == 3)
 
         if not mask.any():
             return
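df["words"] == 3 already evaluates to a boolean pd.Series, so wrapping it in pd.Series(...) changes nothing at runtime; it reads as a hint for static type checkers that infer an ambiguous type for the comparison. Behavior, sketched:

    import pandas as pd

    df = pd.DataFrame({"words": [2, 3, 3, 5]})

    mask = pd.Series(df["words"] == 3)  # identical values to the bare comparison
    assert mask.equals(df["words"] == 3)
    assert mask.any()                   # there are rows with exactly 3 words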
@@ -10,7 +10,6 @@ from sklearn.metrics import confusion_matrix
 from sklearn.model_selection import train_test_split
 
 from core.config import PipelineConfig
-from core.utils import get_data_file_path
 from core.utils.data_loader import DataLoader
 from research.base_model import BaseModel
 from research.experiment import ExperimentConfig, ExperimentStatus, calculate_metrics
@@ -36,8 +35,8 @@ class ExperimentRunner:
         self.tracker.update_experiment(experiment_id, status=ExperimentStatus.RUNNING)
 
         # Load data
-        data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
-        df = self.data_loader.load_csv_complete(data_path)
+        filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
+        df = self.data_loader.load_csv_complete(filepath)
 
         # Apply data filters if specified
         df = self._apply_data_filters(df, experiment_config)
@@ -6,7 +6,6 @@ from typing import List, Dict, Any
 import pandas as pd
 
 from core.config import get_config
-from core.utils import get_data_file_path
 from core.utils.data_loader import DataLoader
 from research.experiment import FeatureType, ExperimentConfig
 from research.experiment.experiment_runner import ExperimentRunner
@@ -145,7 +144,7 @@ class ModelTrainer:
 
         try:
             # Load data for learning curve generation
-            data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
+            data_path = self.config.paths.get_data_path(self.config.data.output_files["featured"])
             if data_path.exists():
                 df = self.data_loader.load_csv_complete(data_path)
 
@@ -72,7 +72,6 @@ class NeuralNetworkModel(BaseModel):
         )
 
         # Store training history
-
         self.training_history = {
             "accuracy": history.history["accuracy"],
             "loss": history.history["loss"],
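The removed blank line is cosmetic; the surrounding code copies Keras's per-epoch metrics out of the History object returned by fit, whose history dict maps metric names to lists. A hedged sketch (model, data, and epoch count are illustrative; the "accuracy" key only exists if the model was compiled with metrics=["accuracy"]):

    # history = model.fit(...) as in the surrounding training code
    history = model.fit(X_train, y_train, epochs=10, validation_split=0.1)

    training_history = {
        "accuracy": history.history["accuracy"],  # one value per epoch
        "loss": history.history["loss"],
    }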