hotfixes
This commit is contained in:
@@ -3,7 +3,7 @@ import argparse
|
|||||||
|
|
||||||
import streamlit as st
|
import streamlit as st
|
||||||
|
|
||||||
from core.config import get_config, setup_config, PipelineConfig
|
from core.config import setup_config, PipelineConfig
|
||||||
from core.utils.data_loader import DataLoader
|
from core.utils.data_loader import DataLoader
|
||||||
from interface.configuration import Configuration
|
from interface.configuration import Configuration
|
||||||
from interface.dashboard import Dashboard
|
from interface.dashboard import Dashboard
|
||||||
|
|||||||
@@ -21,3 +21,6 @@ class ProjectPaths(BaseModel):
|
|||||||
@field_validator("*", mode="before")
|
@field_validator("*", mode="before")
|
||||||
def convert_to_path(cls, v):
|
def convert_to_path(cls, v):
|
||||||
return Path(v) if not isinstance(v, Path) else v
|
return Path(v) if not isinstance(v, Path) else v
|
||||||
|
|
||||||
|
def get_data_path(self, filename: str) -> Path:
|
||||||
|
return self.data_dir / filename
|
||||||
|
|||||||
@@ -44,18 +44,3 @@ def ensure_directories(config: "PipelineConfig") -> None:
|
|||||||
Path(directory).mkdir(parents=True, exist_ok=True)
|
Path(directory).mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
logging.info("Ensured all required directories exist")
|
logging.info("Ensured all required directories exist")
|
||||||
|
|
||||||
|
|
||||||
def get_data_file_path(filename: str, config: "PipelineConfig") -> Path:
|
|
||||||
"""Get full path for a data file"""
|
|
||||||
return config.paths.data_dir / filename
|
|
||||||
|
|
||||||
|
|
||||||
def get_model_file_path(filename: str, config: "PipelineConfig") -> Path:
|
|
||||||
"""Get full path for a model file"""
|
|
||||||
return config.paths.models_dir / filename
|
|
||||||
|
|
||||||
|
|
||||||
def get_output_file_path(filename: str, config: "PipelineConfig") -> Path:
|
|
||||||
"""Get full path for an output file"""
|
|
||||||
return config.paths.outputs_dir / filename
|
|
||||||
|
|||||||
@@ -1,10 +1,10 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import streamlit as st
|
import streamlit as st
|
||||||
|
|
||||||
from core.utils import get_data_file_path
|
|
||||||
from core.utils.data_loader import OPTIMIZED_DTYPES
|
from core.utils.data_loader import OPTIMIZED_DTYPES
|
||||||
|
|
||||||
|
|
||||||
|
@st.cache_data
|
||||||
def load_dataset(file_path: str) -> pd.DataFrame:
|
def load_dataset(file_path: str) -> pd.DataFrame:
|
||||||
try:
|
try:
|
||||||
return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
|
return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
|
||||||
@@ -25,7 +25,7 @@ class Dashboard:
|
|||||||
|
|
||||||
# Load basic statistics
|
# Load basic statistics
|
||||||
try:
|
try:
|
||||||
data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
|
data_path = self.config.paths.get_data_path(self.config.data.output_files["featured"])
|
||||||
if data_path.exists():
|
if data_path.exists():
|
||||||
df = load_dataset(str(data_path))
|
df = load_dataset(str(data_path))
|
||||||
|
|
||||||
|
|||||||
@@ -4,10 +4,10 @@ import pandas as pd
|
|||||||
import plotly.express as px
|
import plotly.express as px
|
||||||
import streamlit as st
|
import streamlit as st
|
||||||
|
|
||||||
from core.utils import get_data_file_path
|
|
||||||
from core.utils.data_loader import OPTIMIZED_DTYPES
|
from core.utils.data_loader import OPTIMIZED_DTYPES
|
||||||
|
|
||||||
|
|
||||||
|
@st.cache_data
|
||||||
def load_dataset(file_path: str) -> pd.DataFrame:
|
def load_dataset(file_path: str) -> pd.DataFrame:
|
||||||
try:
|
try:
|
||||||
return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
|
return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
|
||||||
@@ -31,7 +31,7 @@ class DataOverview:
|
|||||||
}
|
}
|
||||||
|
|
||||||
selected_file = st.selectbox("Select Dataset", list(data_files.keys()))
|
selected_file = st.selectbox("Select Dataset", list(data_files.keys()))
|
||||||
file_path = get_data_file_path(data_files[selected_file], self.config)
|
file_path = self.config.paths.get_data_path(data_files[selected_file])
|
||||||
|
|
||||||
if not file_path.exists():
|
if not file_path.exists():
|
||||||
st.warning(f"Dataset not found: {file_path}")
|
st.warning(f"Dataset not found: {file_path}")
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ from core.utils.data_loader import OPTIMIZED_DTYPES
|
|||||||
from interface.log_reader import LogReader
|
from interface.log_reader import LogReader
|
||||||
|
|
||||||
|
|
||||||
|
@st.cache_data
|
||||||
def load_dataset(file_path: str) -> pd.DataFrame:
|
def load_dataset(file_path: str) -> pd.DataFrame:
|
||||||
try:
|
try:
|
||||||
return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
|
return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
|
||||||
|
|||||||
@@ -1,5 +1,3 @@
|
|||||||
"""Predictions interface for the Streamlit app"""
|
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
@@ -8,7 +6,6 @@ import pandas as pd
|
|||||||
import plotly.express as px
|
import plotly.express as px
|
||||||
import streamlit as st
|
import streamlit as st
|
||||||
|
|
||||||
from core.utils import get_data_file_path
|
|
||||||
from core.utils.data_loader import OPTIMIZED_DTYPES
|
from core.utils.data_loader import OPTIMIZED_DTYPES
|
||||||
from research.experiment.experiment_runner import ExperimentRunner
|
from research.experiment.experiment_runner import ExperimentRunner
|
||||||
from research.experiment.experiment_tracker import ExperimentTracker
|
from research.experiment.experiment_tracker import ExperimentTracker
|
||||||
@@ -268,7 +265,7 @@ class Predictions:
|
|||||||
}
|
}
|
||||||
|
|
||||||
selected_dataset = st.selectbox("Select Dataset", list(dataset_options.keys()))
|
selected_dataset = st.selectbox("Select Dataset", list(dataset_options.keys()))
|
||||||
file_path = get_data_file_path(dataset_options[selected_dataset], self.config)
|
file_path = self.config.paths.get_data_path(dataset_options[selected_dataset])
|
||||||
|
|
||||||
if not file_path.exists():
|
if not file_path.exists():
|
||||||
st.warning(f"Dataset not found: {file_path}")
|
st.warning(f"Dataset not found: {file_path}")
|
||||||
|
|||||||
@@ -5,7 +5,6 @@ import sys
|
|||||||
import traceback
|
import traceback
|
||||||
|
|
||||||
from core.config import setup_config
|
from core.config import setup_config
|
||||||
from core.utils import get_data_file_path
|
|
||||||
from core.utils.data_loader import DataLoader
|
from core.utils.data_loader import DataLoader
|
||||||
from processing.batch.batch_config import BatchConfig
|
from processing.batch.batch_config import BatchConfig
|
||||||
from processing.pipeline import Pipeline
|
from processing.pipeline import Pipeline
|
||||||
@@ -47,7 +46,7 @@ def run_pipeline(config) -> int:
|
|||||||
logging.info(f"Starting pipeline: {config.name} v{config.version}")
|
logging.info(f"Starting pipeline: {config.name} v{config.version}")
|
||||||
|
|
||||||
# Load input data
|
# Load input data
|
||||||
input_file_path = get_data_file_path(config.data.input_file, config)
|
input_file_path = config.paths.get_data_path(config.data.input_file)
|
||||||
if not input_file_path.exists():
|
if not input_file_path.exists():
|
||||||
logging.error(f"Input file not found: {input_file_path}")
|
logging.error(f"Input file not found: {input_file_path}")
|
||||||
return 1
|
return 1
|
||||||
@@ -60,8 +59,6 @@ def run_pipeline(config) -> int:
|
|||||||
|
|
||||||
# Create and run pipeline
|
# Create and run pipeline
|
||||||
pipeline = create_pipeline(config)
|
pipeline = create_pipeline(config)
|
||||||
|
|
||||||
logging.info("Starting pipeline execution")
|
|
||||||
data_splitter.split(pipeline.run(df))
|
data_splitter.split(pipeline.run(df))
|
||||||
|
|
||||||
# Show completion statistics
|
# Show completion statistics
|
||||||
@@ -104,5 +101,4 @@ def main():
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
exit_code = main()
|
sys.exit(main())
|
||||||
sys.exit(exit_code)
|
|
||||||
|
|||||||
+1
-2
@@ -86,5 +86,4 @@ def main():
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
exit_code = main()
|
sys.exit(main())
|
||||||
sys.exit(exit_code)
|
|
||||||
|
|||||||
@@ -1,8 +1,8 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
import argparse
|
import argparse
|
||||||
import logging
|
import logging
|
||||||
import sys
|
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
import traceback
|
import traceback
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
@@ -48,21 +48,18 @@ def train(config: PipelineConfig):
|
|||||||
|
|
||||||
|
|
||||||
def run_pipeline(config: PipelineConfig, reset: bool = False):
|
def run_pipeline(config: PipelineConfig, reset: bool = False):
|
||||||
# Step 1: Feature engineering
|
if not reset and os.path.exists(config.paths.get_data_path(config.data.output_files["engineered"])):
|
||||||
if not reset and os.path.exists(config.paths.data_dir / config.data.output_files["engineered"]):
|
|
||||||
logging.info("Step 1: Feature engineering already done.")
|
logging.info("Step 1: Feature engineering already done.")
|
||||||
else:
|
else:
|
||||||
logging.info("Step 1: Running feature engineering")
|
logging.info("Step 1: Running feature engineering")
|
||||||
feature(config)
|
feature(config)
|
||||||
|
|
||||||
# Step 2: Build dataset
|
if not reset and os.path.exists(config.paths.get_data_path(config.data.output_files["ner_data"])):
|
||||||
if not reset and os.path.exists(config.paths.data_dir / config.data.output_files["ner_data"]):
|
|
||||||
logging.info("Step 2: NER dataset already built.")
|
logging.info("Step 2: NER dataset already built.")
|
||||||
else:
|
else:
|
||||||
logging.info("Step 2: Building NER dataset")
|
logging.info("Step 2: Building NER dataset")
|
||||||
build(config)
|
build(config)
|
||||||
|
|
||||||
# Step 3: Train model
|
|
||||||
logging.info("Step 3: Training NER Model")
|
logging.info("Step 3: Training NER Model")
|
||||||
train(config)
|
train(config)
|
||||||
|
|
||||||
|
|||||||
@@ -9,7 +9,6 @@ from spacy.tokens import DocBin
|
|||||||
from spacy.util import filter_spans
|
from spacy.util import filter_spans
|
||||||
|
|
||||||
from core.config import PipelineConfig
|
from core.config import PipelineConfig
|
||||||
from core.utils import get_data_file_path
|
|
||||||
from core.utils.data_loader import DataLoader
|
from core.utils.data_loader import DataLoader
|
||||||
|
|
||||||
|
|
||||||
@@ -98,10 +97,8 @@ class NERDataBuilder:
|
|||||||
return docs
|
return docs
|
||||||
|
|
||||||
def build(self) -> int:
|
def build(self) -> int:
|
||||||
input_filepath = get_data_file_path(
|
filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
|
||||||
self.config.data.output_files["engineered"], self.config
|
df = self.data_loader.load_csv_complete(filepath)
|
||||||
)
|
|
||||||
df = self.data_loader.load_csv_complete(input_filepath)
|
|
||||||
df = df[["name", "ner_tagged", "ner_entities"]]
|
df = df[["name", "ner_tagged", "ner_entities"]]
|
||||||
|
|
||||||
# Filter early
|
# Filter early
|
||||||
@@ -139,8 +136,8 @@ class NERDataBuilder:
|
|||||||
doc_bin = DocBin(docs=docs)
|
doc_bin = DocBin(docs=docs)
|
||||||
|
|
||||||
# Save
|
# Save
|
||||||
json_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_data"]
|
json_path = self.config.paths.get_data_path(self.config.data.output_files["ner_data"])
|
||||||
spacy_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_spacy"]
|
spacy_path = self.config.paths.get_data_path(self.config.data.output_files["ner_spacy"])
|
||||||
|
|
||||||
with open(json_path, "w", encoding="utf-8") as f:
|
with open(json_path, "w", encoding="utf-8") as f:
|
||||||
json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
|
json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ import pandas as pd
|
|||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from core.config import PipelineConfig
|
from core.config import PipelineConfig
|
||||||
from core.utils import get_data_file_path
|
|
||||||
from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader
|
from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader
|
||||||
from processing.ner.formats.connectors_format import ConnectorFormatter
|
from processing.ner.formats.connectors_format import ConnectorFormatter
|
||||||
from processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
|
from processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
|
||||||
@@ -55,7 +54,7 @@ class NEREngineering:
|
|||||||
def load_data(self) -> pd.DataFrame:
|
def load_data(self) -> pd.DataFrame:
|
||||||
"""Load and filter NER-tagged data from CSV file"""
|
"""Load and filter NER-tagged data from CSV file"""
|
||||||
|
|
||||||
filepath = get_data_file_path(self.config.data.output_files["featured"], self.config)
|
filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
|
||||||
df = self.data_loader.load_csv_complete(filepath)
|
df = self.data_loader.load_csv_complete(filepath)
|
||||||
|
|
||||||
# Filter only NER-tagged rows
|
# Filter only NER-tagged rows
|
||||||
@@ -66,10 +65,8 @@ class NEREngineering:
|
|||||||
|
|
||||||
def compute(self) -> None:
|
def compute(self) -> None:
|
||||||
logging.info("Applying feature engineering transformations...")
|
logging.info("Applying feature engineering transformations...")
|
||||||
input_filepath = get_data_file_path(self.config.data.output_files["featured"], self.config)
|
input_filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
|
||||||
output_filepath = get_data_file_path(
|
output_filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
|
||||||
self.config.data.output_files["engineered"], self.config
|
|
||||||
)
|
|
||||||
|
|
||||||
df = self.data_loader.load_csv_complete(input_filepath)
|
df = self.data_loader.load_csv_complete(input_filepath)
|
||||||
ner_df = df[df["ner_tagged"] == 1].copy()
|
ner_df = df[df["ner_tagged"] == 1].copy()
|
||||||
|
|||||||
@@ -127,7 +127,7 @@ class FeatureExtractionStep(PipelineStep):
|
|||||||
|
|
||||||
def _process_simple_names(self, df: pd.DataFrame) -> None:
|
def _process_simple_names(self, df: pd.DataFrame) -> None:
|
||||||
"""Process 3-word names efficiently with vectorized operations"""
|
"""Process 3-word names efficiently with vectorized operations"""
|
||||||
mask = df["words"] == 3
|
mask = pd.Series(df["words"] == 3)
|
||||||
|
|
||||||
if not mask.any():
|
if not mask.any():
|
||||||
return
|
return
|
||||||
|
|||||||
@@ -10,7 +10,6 @@ from sklearn.metrics import confusion_matrix
|
|||||||
from sklearn.model_selection import train_test_split
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
from core.config import PipelineConfig
|
from core.config import PipelineConfig
|
||||||
from core.utils import get_data_file_path
|
|
||||||
from core.utils.data_loader import DataLoader
|
from core.utils.data_loader import DataLoader
|
||||||
from research.base_model import BaseModel
|
from research.base_model import BaseModel
|
||||||
from research.experiment import ExperimentConfig, ExperimentStatus, calculate_metrics
|
from research.experiment import ExperimentConfig, ExperimentStatus, calculate_metrics
|
||||||
@@ -36,8 +35,8 @@ class ExperimentRunner:
|
|||||||
self.tracker.update_experiment(experiment_id, status=ExperimentStatus.RUNNING)
|
self.tracker.update_experiment(experiment_id, status=ExperimentStatus.RUNNING)
|
||||||
|
|
||||||
# Load data
|
# Load data
|
||||||
data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
|
filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
|
||||||
df = self.data_loader.load_csv_complete(data_path)
|
df = self.data_loader.load_csv_complete(filepath)
|
||||||
|
|
||||||
# Apply data filters if specified
|
# Apply data filters if specified
|
||||||
df = self._apply_data_filters(df, experiment_config)
|
df = self._apply_data_filters(df, experiment_config)
|
||||||
|
|||||||
@@ -6,7 +6,6 @@ from typing import List, Dict, Any
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from core.config import get_config
|
from core.config import get_config
|
||||||
from core.utils import get_data_file_path
|
|
||||||
from core.utils.data_loader import DataLoader
|
from core.utils.data_loader import DataLoader
|
||||||
from research.experiment import FeatureType, ExperimentConfig
|
from research.experiment import FeatureType, ExperimentConfig
|
||||||
from research.experiment.experiment_runner import ExperimentRunner
|
from research.experiment.experiment_runner import ExperimentRunner
|
||||||
@@ -145,7 +144,7 @@ class ModelTrainer:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
# Load data for learning curve generation
|
# Load data for learning curve generation
|
||||||
data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
|
data_path = self.config.paths.get_data_path(self.config.data.output_files["featured"])
|
||||||
if data_path.exists():
|
if data_path.exists():
|
||||||
df = self.data_loader.load_csv_complete(data_path)
|
df = self.data_loader.load_csv_complete(data_path)
|
||||||
|
|
||||||
|
|||||||
@@ -72,7 +72,6 @@ class NeuralNetworkModel(BaseModel):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Store training history
|
# Store training history
|
||||||
|
|
||||||
self.training_history = {
|
self.training_history = {
|
||||||
"accuracy": history.history["accuracy"],
|
"accuracy": history.history["accuracy"],
|
||||||
"loss": history.history["loss"],
|
"loss": history.history["loss"],
|
||||||
|
|||||||
Reference in New Issue
Block a user