feat: enhance logging and memory management across modules
This commit is contained in:
@@ -1,7 +1,9 @@
|
|||||||
#!.venv/bin/python3
|
#!.venv/bin/python3
|
||||||
|
import argparse
|
||||||
|
|
||||||
import streamlit as st
|
import streamlit as st
|
||||||
|
|
||||||
from core.config import get_config
|
from core.config import get_config, setup_config, PipelineConfig
|
||||||
from core.utils.data_loader import DataLoader
|
from core.utils.data_loader import DataLoader
|
||||||
from interface.configuration import Configuration
|
from interface.configuration import Configuration
|
||||||
from interface.dashboard import Dashboard
|
from interface.dashboard import Dashboard
|
||||||
@@ -23,17 +25,11 @@ st.set_page_config(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@st.cache_data
|
|
||||||
def load_config():
|
|
||||||
"""Load application configuration with unified setup"""
|
|
||||||
return get_config()
|
|
||||||
|
|
||||||
|
|
||||||
class StreamlitApp:
|
class StreamlitApp:
|
||||||
"""Main Streamlit application class"""
|
"""Main Streamlit application class"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, config: PipelineConfig):
|
||||||
self.config = load_config()
|
self.config = config
|
||||||
self.data_loader = DataLoader(self.config)
|
self.data_loader = DataLoader(self.config)
|
||||||
self.experiment_tracker = ExperimentTracker(self.config)
|
self.experiment_tracker = ExperimentTracker(self.config)
|
||||||
self.experiment_runner = ExperimentRunner(self.config)
|
self.experiment_runner = ExperimentRunner(self.config)
|
||||||
@@ -44,7 +40,9 @@ class StreamlitApp:
|
|||||||
self.data_overview = DataOverview(self.config)
|
self.data_overview = DataOverview(self.config)
|
||||||
self.data_processing = DataProcessing(self.config, self.pipeline_monitor)
|
self.data_processing = DataProcessing(self.config, self.pipeline_monitor)
|
||||||
self.experiments = Experiments(self.config, self.experiment_tracker, self.experiment_runner)
|
self.experiments = Experiments(self.config, self.experiment_tracker, self.experiment_runner)
|
||||||
self.results_analysis = ResultsAnalysis(self.config, self.experiment_tracker, self.experiment_runner)
|
self.results_analysis = ResultsAnalysis(
|
||||||
|
self.config, self.experiment_tracker, self.experiment_runner
|
||||||
|
)
|
||||||
self.predictions = Predictions(self.config, self.experiment_tracker, self.experiment_runner)
|
self.predictions = Predictions(self.config, self.experiment_tracker, self.experiment_runner)
|
||||||
self.configuration = Configuration(self.config)
|
self.configuration = Configuration(self.config)
|
||||||
|
|
||||||
@@ -86,8 +84,16 @@ class StreamlitApp:
|
|||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
"""Main application entry point"""
|
parser = argparse.ArgumentParser(
|
||||||
app = StreamlitApp()
|
description="DRC NERS Platform",
|
||||||
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
|
)
|
||||||
|
parser.add_argument("--config", type=str, help="Path to configuration file")
|
||||||
|
parser.add_argument("--env", type=str, default="development", help="Environment name")
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
config = setup_config(args.config, env=args.env)
|
||||||
|
app = StreamlitApp(config)
|
||||||
app.run()
|
app.run()
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -64,6 +64,7 @@ data:
|
|||||||
ner_spacy: "names_ner.spacy" # Output file for NER annotated data using spaCy format
|
ner_spacy: "names_ner.spacy" # Output file for NER annotated data using spaCy format
|
||||||
split_evaluation: false # Should the dataset be split into training and evaluation sets?
|
split_evaluation: false # Should the dataset be split into training and evaluation sets?
|
||||||
split_by_gender: true # Should the dataset be split by gender?
|
split_by_gender: true # Should the dataset be split by gender?
|
||||||
|
split_by_province: true # Should the dataset be split by province?
|
||||||
split_ner_data: true # Should the NER data be extracted and saved?
|
split_ner_data: true # Should the NER data be extracted and saved?
|
||||||
evaluation_fraction: 0.2 # Fraction of data to use for evaluation
|
evaluation_fraction: 0.2 # Fraction of data to use for evaluation
|
||||||
random_seed: 42 # Random seed for reproducibility
|
random_seed: 42 # Random seed for reproducibility
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
class NERConfig(BaseModel):
|
class NERConfig(BaseModel):
|
||||||
"""NER annotation configuration"""
|
"""NER annotation configuration"""
|
||||||
|
|
||||||
|
|||||||
@@ -12,13 +12,15 @@ class DataConfig(BaseModel):
|
|||||||
default_factory=lambda: {
|
default_factory=lambda: {
|
||||||
"featured": "names_featured.csv",
|
"featured": "names_featured.csv",
|
||||||
"evaluation": "names_evaluation.csv",
|
"evaluation": "names_evaluation.csv",
|
||||||
|
"engineered": "names_engineered.csv",
|
||||||
"males": "names_males.csv",
|
"males": "names_males.csv",
|
||||||
"females": "names_females.csv",
|
"females": "names_females.csv",
|
||||||
"ner_data": "names_ner.json",
|
"ner_data": "names_ner.json",
|
||||||
"ner_spacy": "names_ner.spacy"
|
"ner_spacy": "names_ner.spacy",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
split_evaluation: bool = False
|
split_evaluation: bool = False
|
||||||
|
split_by_province: bool = True
|
||||||
split_by_gender: bool = True
|
split_by_gender: bool = True
|
||||||
split_ner_data: bool = True
|
split_ner_data: bool = True
|
||||||
evaluation_fraction: float = 0.2
|
evaluation_fraction: float = 0.2
|
||||||
|
|||||||
+60
-16
@@ -1,17 +1,41 @@
|
|||||||
|
import gc
|
||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, Union, Iterator
|
from typing import Optional, Union, Iterator, Dict
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from core.config.pipeline_config import PipelineConfig
|
||||||
|
|
||||||
|
OPTIMIZED_DTYPES = {
|
||||||
|
# Numeric columns with appropriate bit-width
|
||||||
|
"year": "Int16", # Years fit in 16-bit integer
|
||||||
|
"words": "Int8", # Word counts typically < 128
|
||||||
|
"length": "Int16", # Name lengths fit in 16-bit
|
||||||
|
"annotated": "Int8", # Binary flag (0/1)
|
||||||
|
"ner_tagged": "Int8", # Binary flag (0/1)
|
||||||
|
# Categorical columns (memory efficient for repeated values)
|
||||||
|
"sex": "category",
|
||||||
|
"province": "category",
|
||||||
|
"region": "category",
|
||||||
|
"identified_category": "category",
|
||||||
|
"transformation_type": "category",
|
||||||
|
# String columns with proper string dtype
|
||||||
|
"name": "string",
|
||||||
|
"probable_native": "string",
|
||||||
|
"probable_surname": "string",
|
||||||
|
"identified_name": "string",
|
||||||
|
"identified_surname": "string",
|
||||||
|
"ner_entities": "string",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class DataLoader:
|
class DataLoader:
|
||||||
"""Reusable data loading utilities"""
|
"""Reusable data loading utilities"""
|
||||||
|
|
||||||
def __init__(self, config: PipelineConfig):
|
def __init__(self, config: PipelineConfig, custom_dtypes: Optional[Dict] = None):
|
||||||
self.config = config
|
self.config = config
|
||||||
|
self.dtypes = {**OPTIMIZED_DTYPES, **(custom_dtypes or {})}
|
||||||
|
|
||||||
def load_csv_chunked(
|
def load_csv_chunked(
|
||||||
self, filepath: Union[str, Path], chunk_size: Optional[int] = None
|
self, filepath: Union[str, Path], chunk_size: Optional[int] = None
|
||||||
@@ -19,19 +43,23 @@ class DataLoader:
|
|||||||
"""Load CSV file in chunks for memory efficiency"""
|
"""Load CSV file in chunks for memory efficiency"""
|
||||||
chunk_size = chunk_size or self.config.processing.chunk_size
|
chunk_size = chunk_size or self.config.processing.chunk_size
|
||||||
encodings = self.config.processing.encoding_options
|
encodings = self.config.processing.encoding_options
|
||||||
|
|
||||||
filepath = Path(filepath)
|
filepath = Path(filepath)
|
||||||
|
|
||||||
for encoding in encodings:
|
for encoding in encodings:
|
||||||
try:
|
try:
|
||||||
logging.info(f"Attempting to read {filepath} with encoding: {encoding}")
|
logging.info(f"Reading {filepath} with encoding: {encoding}")
|
||||||
|
|
||||||
|
# Read with optimal dtypes
|
||||||
chunk_iter = pd.read_csv(
|
chunk_iter = pd.read_csv(
|
||||||
filepath, encoding=encoding, chunksize=chunk_size, on_bad_lines="skip"
|
filepath,
|
||||||
|
encoding=encoding,
|
||||||
|
chunksize=chunk_size,
|
||||||
|
on_bad_lines="skip",
|
||||||
|
dtype=self.dtypes,
|
||||||
)
|
)
|
||||||
|
|
||||||
for i, chunk in enumerate(chunk_iter):
|
for i, chunk in enumerate(chunk_iter):
|
||||||
logging.debug(f"Processing chunk {i+1}")
|
logging.debug(f"Processing optimized chunk {i + 1}")
|
||||||
yield chunk
|
yield chunk
|
||||||
|
|
||||||
logging.info(f"Successfully read {filepath} with encoding: {encoding}")
|
logging.info(f"Successfully read {filepath} with encoding: {encoding}")
|
||||||
@@ -44,12 +72,20 @@ class DataLoader:
|
|||||||
raise ValueError(f"Unable to decode {filepath} with any encoding: {encodings}")
|
raise ValueError(f"Unable to decode {filepath} with any encoding: {encodings}")
|
||||||
|
|
||||||
def load_csv_complete(self, filepath: Union[str, Path]) -> pd.DataFrame:
|
def load_csv_complete(self, filepath: Union[str, Path]) -> pd.DataFrame:
|
||||||
"""Load complete CSV file into memory with size limiting and balancing"""
|
"""Load complete CSV with memory optimization"""
|
||||||
chunks = list(self.load_csv_chunked(filepath))
|
chunks = []
|
||||||
|
for chunk in self.load_csv_chunked(filepath):
|
||||||
|
chunks.append(chunk)
|
||||||
|
|
||||||
if not chunks:
|
if not chunks:
|
||||||
return pd.DataFrame()
|
return pd.DataFrame()
|
||||||
|
|
||||||
df = pd.concat(chunks, ignore_index=True)
|
logging.info(f"Concatenating {len(chunks)} optimized chunks")
|
||||||
|
df = pd.concat(chunks, ignore_index=True, copy=False)
|
||||||
|
|
||||||
|
# Cleanup chunks from memory
|
||||||
|
del chunks
|
||||||
|
gc.collect()
|
||||||
|
|
||||||
# Apply dataset size limiting if configured
|
# Apply dataset size limiting if configured
|
||||||
if self.config.data.max_dataset_size is not None:
|
if self.config.data.max_dataset_size is not None:
|
||||||
@@ -87,27 +123,35 @@ class DataLoader:
|
|||||||
balanced_samples = []
|
balanced_samples = []
|
||||||
|
|
||||||
for i, sex in enumerate(sex_values):
|
for i, sex in enumerate(sex_values):
|
||||||
sex_df = df[df["sex"] == sex]
|
# Build a boolean mask for this sex value and collect the matching index labels (df[sex_mask] still materializes a filtered frame)
|
||||||
|
sex_mask = df["sex"] == sex
|
||||||
|
sex_indices = df[sex_mask].index
|
||||||
|
|
||||||
# Distribute remaining samples to first categories
|
# Distribute remaining samples to first categories
|
||||||
current_samples = samples_per_sex + (1 if i < remaining_samples else 0)
|
current_samples = samples_per_sex + (1 if i < remaining_samples else 0)
|
||||||
current_samples = min(current_samples, len(sex_df))
|
current_samples = min(current_samples, len(sex_indices))
|
||||||
|
|
||||||
if current_samples > 0:
|
if current_samples > 0:
|
||||||
sample = sex_df.sample(n=current_samples, random_state=self.config.data.random_seed + i)
|
# Sample indices instead of DataFrame
|
||||||
balanced_samples.append(sample)
|
sampled_indices = pd.Series(sex_indices).sample(
|
||||||
|
n=current_samples, random_state=self.config.data.random_seed + i
|
||||||
|
)
|
||||||
|
balanced_samples.extend(sampled_indices.tolist())
|
||||||
logging.info(f"Sampled {current_samples} records for sex '{sex}'")
|
logging.info(f"Sampled {current_samples} records for sex '{sex}'")
|
||||||
|
|
||||||
if not balanced_samples:
|
if not balanced_samples:
|
||||||
logging.warning("No balanced samples could be created, using random sampling")
|
logging.warning("No balanced samples could be created, using random sampling")
|
||||||
return df.sample(n=max_size, random_state=self.config.data.random_seed)
|
return df.sample(n=max_size, random_state=self.config.data.random_seed)
|
||||||
|
|
||||||
result = pd.concat(balanced_samples, ignore_index=True)
|
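# Select the sampled rows and copy once. NOTE(review): balanced_samples holds index *labels* taken from .index, while iloc is positional - this is only correct when df has a default RangeIndex; confirm, or use loc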
|
||||||
|
result = df.iloc[balanced_samples].copy()
|
||||||
|
|
||||||
# Shuffle the final result
|
# Shuffle the final result
|
||||||
result = result.sample(frac=1, random_state=self.config.data.random_seed).reset_index(drop=True)
|
result = result.sample(frac=1, random_state=self.config.data.random_seed).reset_index(
|
||||||
|
drop=True
|
||||||
|
)
|
||||||
|
|
||||||
logging.info(f"Created balanced dataset with {len(result)} records from {len(df)} total records")
|
logging.info(f"Created balanced dataset with {len(result)} records from {len(df)} total")
|
||||||
return result
|
return result
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|||||||
@@ -1,3 +1 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -9,14 +9,9 @@ class RegionMapper:
|
|||||||
def __init__(self, mapping: Optional[Dict] = None):
|
def __init__(self, mapping: Optional[Dict] = None):
|
||||||
self.mapping = mapping or REGION_MAPPING
|
self.mapping = mapping or REGION_MAPPING
|
||||||
|
|
||||||
def map_region_to_province(self, region: str) -> str:
|
def map(self, series: pd.Series) -> pd.Series:
|
||||||
"""Map a region to its province"""
|
|
||||||
region_lower = str(region).lower().strip()
|
|
||||||
return self.mapping.get(region_lower, ("AUTRES", "AUTRES"))[1].lower()
|
|
||||||
|
|
||||||
def map_regions_vectorized(self, regions: pd.Series) -> pd.Series:
|
|
||||||
"""Vectorized region to province mapping"""
|
"""Vectorized region to province mapping"""
|
||||||
return regions.str.lower().map(
|
return series.str.lower().map(
|
||||||
lambda r: self.mapping.get(r, ("AUTRES", "AUTRES"))[1].lower()
|
lambda r: self.mapping.get(r, ("AUTRES", "AUTRES"))[1].lower()
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -34,6 +29,7 @@ class RegionMapper:
|
|||||||
"sud-kivu",
|
"sud-kivu",
|
||||||
"kasai-occidental",
|
"kasai-occidental",
|
||||||
"kasai-oriental",
|
"kasai-oriental",
|
||||||
|
"autres",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,72 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
NER Dataset Feature Engineering Script
|
|
||||||
Processes the names_featured.csv dataset to create position-independent variations
|
|
||||||
"""
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import os
|
|
||||||
|
|
||||||
from processing.ner.ner_engineering import NEREngineering
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
parser = argparse.ArgumentParser(description='Engineer NER dataset for position-independent learning')
|
|
||||||
parser.add_argument('--input', default='data/dataset/names_featured.csv', help='Input CSV file path')
|
|
||||||
parser.add_argument('--output', default='data/dataset/names_featured_engineered.csv', help='Output CSV file path')
|
|
||||||
parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility')
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
print("=== NER Dataset Feature Engineering ===")
|
|
||||||
print(f"Input file: {args.input}")
|
|
||||||
print(f"Output file: {args.output}")
|
|
||||||
print(f"Random seed: {args.seed}")
|
|
||||||
|
|
||||||
# Check if input file exists
|
|
||||||
if not os.path.exists(args.input):
|
|
||||||
print(f"Error: Input file {args.input} not found!")
|
|
||||||
return
|
|
||||||
|
|
||||||
# Initialize engineering class
|
|
||||||
engineering = NEREngineering()
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Load data with progress indication
|
|
||||||
print("\n1. Loading NER-tagged data...")
|
|
||||||
data = engineering.load_ner_data(args.input)
|
|
||||||
print(f" Dataset size: {len(data):,} rows")
|
|
||||||
|
|
||||||
# Show sample of original data
|
|
||||||
print("\n2. Sample original data:")
|
|
||||||
for i, row in data.head(3).iterrows():
|
|
||||||
print(f" {row['name']} -> Native: '{row['probable_native']}', Surname: '{row['probable_surname']}'")
|
|
||||||
|
|
||||||
# Apply transformations
|
|
||||||
print("\n3. Applying feature engineering transformations...")
|
|
||||||
engineered_data = engineering.engineer_dataset(data, random_seed=args.seed)
|
|
||||||
|
|
||||||
# Save results
|
|
||||||
print(f"\n4. Saving engineered dataset to {args.output}...")
|
|
||||||
engineering.save_engineered_dataset(engineered_data, args.output)
|
|
||||||
|
|
||||||
# Show statistics
|
|
||||||
print(f"\n=== RESULTS SUMMARY ===")
|
|
||||||
print(f"Original dataset: {len(data):,} rows")
|
|
||||||
print(f"Engineered dataset: {len(engineered_data):,} rows")
|
|
||||||
print(f"Transformation distribution:")
|
|
||||||
counts = engineered_data['transformation_type'].value_counts().sort_index()
|
|
||||||
for trans_type, count in counts.items():
|
|
||||||
percentage = (count / len(engineered_data)) * 100
|
|
||||||
print(f" {trans_type}: {count:,} rows ({percentage:.1f}%)")
|
|
||||||
|
|
||||||
print(f"\nDataset successfully engineered and saved!")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error during processing: {str(e)}")
|
|
||||||
import traceback
|
|
||||||
traceback.print_exc()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
@@ -2,11 +2,12 @@ import pandas as pd
|
|||||||
import streamlit as st
|
import streamlit as st
|
||||||
|
|
||||||
from core.utils import get_data_file_path
|
from core.utils import get_data_file_path
|
||||||
|
from core.utils.data_loader import OPTIMIZED_DTYPES
|
||||||
|
|
||||||
|
|
||||||
def load_dataset(file_path: str) -> pd.DataFrame:
|
def load_dataset(file_path: str) -> pd.DataFrame:
|
||||||
try:
|
try:
|
||||||
return pd.read_csv(file_path)
|
return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
st.error(f"Error loading dataset: {e}")
|
st.error(f"Error loading dataset: {e}")
|
||||||
return pd.DataFrame()
|
return pd.DataFrame()
|
||||||
|
|||||||
@@ -5,11 +5,12 @@ import plotly.express as px
|
|||||||
import streamlit as st
|
import streamlit as st
|
||||||
|
|
||||||
from core.utils import get_data_file_path
|
from core.utils import get_data_file_path
|
||||||
|
from core.utils.data_loader import OPTIMIZED_DTYPES
|
||||||
|
|
||||||
|
|
||||||
def load_dataset(file_path: str) -> pd.DataFrame:
|
def load_dataset(file_path: str) -> pd.DataFrame:
|
||||||
try:
|
try:
|
||||||
return pd.read_csv(file_path)
|
return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
st.error(f"Error loading dataset: {e}")
|
st.error(f"Error loading dataset: {e}")
|
||||||
return pd.DataFrame()
|
return pd.DataFrame()
|
||||||
|
|||||||
@@ -2,12 +2,13 @@ import pandas as pd
|
|||||||
import plotly.express as px
|
import plotly.express as px
|
||||||
import streamlit as st
|
import streamlit as st
|
||||||
|
|
||||||
|
from core.utils.data_loader import OPTIMIZED_DTYPES
|
||||||
from interface.log_reader import LogReader
|
from interface.log_reader import LogReader
|
||||||
|
|
||||||
|
|
||||||
def load_dataset(file_path: str) -> pd.DataFrame:
|
def load_dataset(file_path: str) -> pd.DataFrame:
|
||||||
try:
|
try:
|
||||||
return pd.read_csv(file_path)
|
return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
st.error(f"Error loading dataset: {e}")
|
st.error(f"Error loading dataset: {e}")
|
||||||
return pd.DataFrame()
|
return pd.DataFrame()
|
||||||
@@ -56,16 +57,12 @@ class DataProcessing:
|
|||||||
log_level_filter = st.selectbox(
|
log_level_filter = st.selectbox(
|
||||||
"Filter by Level",
|
"Filter by Level",
|
||||||
["All", "INFO", "WARNING", "ERROR", "DEBUG", "CRITICAL"],
|
["All", "INFO", "WARNING", "ERROR", "DEBUG", "CRITICAL"],
|
||||||
key="log_level_filter"
|
key="log_level_filter",
|
||||||
)
|
)
|
||||||
|
|
||||||
with col2:
|
with col2:
|
||||||
num_entries = st.number_input(
|
num_entries = st.number_input(
|
||||||
"Number of entries",
|
"Number of entries", min_value=5, max_value=50, value=10, key="num_log_entries"
|
||||||
min_value=5,
|
|
||||||
max_value=50,
|
|
||||||
value=10,
|
|
||||||
key="num_log_entries"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Get log entries based on filter
|
# Get log entries based on filter
|
||||||
@@ -77,13 +74,21 @@ class DataProcessing:
|
|||||||
if log_entries:
|
if log_entries:
|
||||||
for entry in log_entries:
|
for entry in log_entries:
|
||||||
if entry.level == "ERROR":
|
if entry.level == "ERROR":
|
||||||
st.error(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
|
st.error(
|
||||||
|
f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
|
||||||
|
)
|
||||||
elif entry.level == "WARNING":
|
elif entry.level == "WARNING":
|
||||||
st.warning(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
|
st.warning(
|
||||||
|
f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
|
||||||
|
)
|
||||||
elif entry.level == "INFO":
|
elif entry.level == "INFO":
|
||||||
st.info(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
|
st.info(
|
||||||
|
f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
st.text(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
|
st.text(
|
||||||
|
f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
|
||||||
|
)
|
||||||
|
|
||||||
# Show log statistics
|
# Show log statistics
|
||||||
st.subheader("Log Statistics")
|
st.subheader("Log Statistics")
|
||||||
@@ -93,16 +98,16 @@ class DataProcessing:
|
|||||||
col1, col2, col3, col4 = st.columns(4)
|
col1, col2, col3, col4 = st.columns(4)
|
||||||
|
|
||||||
with col1:
|
with col1:
|
||||||
st.metric("Total Lines", log_stats.get('total_lines', 0))
|
st.metric("Total Lines", log_stats.get("total_lines", 0))
|
||||||
with col2:
|
with col2:
|
||||||
st.metric("INFO", log_stats.get('INFO', 0))
|
st.metric("INFO", log_stats.get("INFO", 0))
|
||||||
with col3:
|
with col3:
|
||||||
st.metric("WARNING", log_stats.get('WARNING', 0))
|
st.metric("WARNING", log_stats.get("WARNING", 0))
|
||||||
with col4:
|
with col4:
|
||||||
st.metric("ERROR", log_stats.get('ERROR', 0))
|
st.metric("ERROR", log_stats.get("ERROR", 0))
|
||||||
|
|
||||||
# Log level distribution chart
|
# Log level distribution chart
|
||||||
levels = ['INFO', 'WARNING', 'ERROR', 'DEBUG', 'CRITICAL']
|
levels = ["INFO", "WARNING", "ERROR", "DEBUG", "CRITICAL"]
|
||||||
counts = [log_stats.get(level, 0) for level in levels]
|
counts = [log_stats.get(level, 0) for level in levels]
|
||||||
|
|
||||||
if sum(counts) > 0:
|
if sum(counts) > 0:
|
||||||
@@ -112,12 +117,12 @@ class DataProcessing:
|
|||||||
title="Log Entries by Level",
|
title="Log Entries by Level",
|
||||||
color=levels,
|
color=levels,
|
||||||
color_discrete_map={
|
color_discrete_map={
|
||||||
'INFO': 'blue',
|
"INFO": "blue",
|
||||||
'WARNING': 'orange',
|
"WARNING": "orange",
|
||||||
'ERROR': 'red',
|
"ERROR": "red",
|
||||||
'DEBUG': 'gray',
|
"DEBUG": "gray",
|
||||||
'CRITICAL': 'darkred'
|
"CRITICAL": "darkred",
|
||||||
}
|
},
|
||||||
)
|
)
|
||||||
st.plotly_chart(fig, use_container_width=True)
|
st.plotly_chart(fig, use_container_width=True)
|
||||||
else:
|
else:
|
||||||
|
|||||||
+50
-14
@@ -14,7 +14,9 @@ from research.model_registry import list_available_models
|
|||||||
class Experiments:
|
class Experiments:
|
||||||
"""Handles experiment management interface"""
|
"""Handles experiment management interface"""
|
||||||
|
|
||||||
def __init__(self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner):
|
def __init__(
|
||||||
|
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
|
||||||
|
):
|
||||||
self.config = config
|
self.config = config
|
||||||
self.experiment_tracker = experiment_tracker
|
self.experiment_tracker = experiment_tracker
|
||||||
self.experiment_runner = experiment_runner
|
self.experiment_runner = experiment_runner
|
||||||
@@ -41,13 +43,19 @@ class Experiments:
|
|||||||
col1, col2 = st.columns(2)
|
col1, col2 = st.columns(2)
|
||||||
|
|
||||||
with col1:
|
with col1:
|
||||||
exp_name = st.text_input("Experiment Name", placeholder="e.g., native_name_gender_prediction")
|
exp_name = st.text_input(
|
||||||
description = st.text_area("Description", placeholder="Brief description of the experiment")
|
"Experiment Name", placeholder="e.g., native_name_gender_prediction"
|
||||||
|
)
|
||||||
|
description = st.text_area(
|
||||||
|
"Description", placeholder="Brief description of the experiment"
|
||||||
|
)
|
||||||
model_type = st.selectbox("Model Type", list_available_models())
|
model_type = st.selectbox("Model Type", list_available_models())
|
||||||
|
|
||||||
# Feature selection
|
# Feature selection
|
||||||
feature_options = [f.value for f in FeatureType]
|
feature_options = [f.value for f in FeatureType]
|
||||||
selected_features = st.multiselect("Features to Use", feature_options, default=["full_name"])
|
selected_features = st.multiselect(
|
||||||
|
"Features to Use", feature_options, default=["full_name"]
|
||||||
|
)
|
||||||
|
|
||||||
with col2:
|
with col2:
|
||||||
# Model parameters
|
# Model parameters
|
||||||
@@ -74,7 +82,9 @@ class Experiments:
|
|||||||
test_size = st.slider("Test Set Size", 0.1, 0.5, 0.2)
|
test_size = st.slider("Test Set Size", 0.1, 0.5, 0.2)
|
||||||
cv_folds = st.number_input("Cross-Validation Folds", 3, 10, 5)
|
cv_folds = st.number_input("Cross-Validation Folds", 3, 10, 5)
|
||||||
|
|
||||||
tags = st.text_input("Tags (comma-separated)", placeholder="e.g., baseline, feature_study")
|
tags = st.text_input(
|
||||||
|
"Tags (comma-separated)", placeholder="e.g., baseline, feature_study"
|
||||||
|
)
|
||||||
|
|
||||||
# Advanced options
|
# Advanced options
|
||||||
with st.expander("Advanced Options"):
|
with st.expander("Advanced Options"):
|
||||||
@@ -92,14 +102,33 @@ class Experiments:
|
|||||||
|
|
||||||
if submitted:
|
if submitted:
|
||||||
self._handle_experiment_submission(
|
self._handle_experiment_submission(
|
||||||
exp_name, description, model_type, selected_features, model_params,
|
exp_name,
|
||||||
test_size, cv_folds, tags, filter_province, min_words, max_words
|
description,
|
||||||
|
model_type,
|
||||||
|
selected_features,
|
||||||
|
model_params,
|
||||||
|
test_size,
|
||||||
|
cv_folds,
|
||||||
|
tags,
|
||||||
|
filter_province,
|
||||||
|
min_words,
|
||||||
|
max_words,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _handle_experiment_submission(self, exp_name: str, description: str, model_type: str,
|
def _handle_experiment_submission(
|
||||||
selected_features: List[str], model_params: Dict[str, Any],
|
self,
|
||||||
test_size: float, cv_folds: int, tags: str,
|
exp_name: str,
|
||||||
filter_province: str, min_words: int, max_words: int):
|
description: str,
|
||||||
|
model_type: str,
|
||||||
|
selected_features: List[str],
|
||||||
|
model_params: Dict[str, Any],
|
||||||
|
test_size: float,
|
||||||
|
cv_folds: int,
|
||||||
|
tags: str,
|
||||||
|
filter_province: str,
|
||||||
|
min_words: int,
|
||||||
|
max_words: int,
|
||||||
|
):
|
||||||
"""Handle experiment form submission"""
|
"""Handle experiment form submission"""
|
||||||
if not exp_name:
|
if not exp_name:
|
||||||
st.error("Please provide an experiment name")
|
st.error("Please provide an experiment name")
|
||||||
@@ -183,7 +212,7 @@ class Experiments:
|
|||||||
# Display experiments
|
# Display experiments
|
||||||
for i, exp in enumerate(experiments):
|
for i, exp in enumerate(experiments):
|
||||||
with st.expander(
|
with st.expander(
|
||||||
f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}"
|
f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}"
|
||||||
):
|
):
|
||||||
self._display_experiment_details(exp, i)
|
self._display_experiment_details(exp, i)
|
||||||
|
|
||||||
@@ -268,8 +297,15 @@ class Experiments:
|
|||||||
base_name, model_types, ngram_ranges, feature_combinations, test_sizes, tags
|
base_name, model_types, ngram_ranges, feature_combinations, test_sizes, tags
|
||||||
)
|
)
|
||||||
|
|
||||||
def run_batch_experiments(self, base_name: str, model_types: List[str], ngram_ranges: str,
|
def run_batch_experiments(
|
||||||
feature_combinations: List[str], test_sizes: str, tags: str):
|
self,
|
||||||
|
base_name: str,
|
||||||
|
model_types: List[str],
|
||||||
|
ngram_ranges: str,
|
||||||
|
feature_combinations: List[str],
|
||||||
|
test_sizes: str,
|
||||||
|
tags: str,
|
||||||
|
):
|
||||||
"""Run batch experiments with parameter combinations"""
|
"""Run batch experiments with parameter combinations"""
|
||||||
with st.spinner("Running batch experiments..."):
|
with st.spinner("Running batch experiments..."):
|
||||||
try:
|
try:
|
||||||
|
|||||||
+16
-19
@@ -8,6 +8,7 @@ from typing import List, Dict, Optional
|
|||||||
@dataclass
|
@dataclass
|
||||||
class LogEntry:
|
class LogEntry:
|
||||||
"""Represents a single log entry."""
|
"""Represents a single log entry."""
|
||||||
|
|
||||||
timestamp: datetime
|
timestamp: datetime
|
||||||
logger: str
|
logger: str
|
||||||
level: str
|
level: str
|
||||||
@@ -23,7 +24,7 @@ class LogReader:
|
|||||||
self.log_file_path = Path(log_file_path)
|
self.log_file_path = Path(log_file_path)
|
||||||
# Pattern to match Python logging format: timestamp - logger - level - message
|
# Pattern to match Python logging format: timestamp - logger - level - message
|
||||||
self.log_pattern = re.compile(
|
self.log_pattern = re.compile(
|
||||||
r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) - (.+?) - (\w+) - (.+)'
|
r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) - (.+?) - (\w+) - (.+)"
|
||||||
)
|
)
|
||||||
|
|
||||||
def read_last_entries(self, count: int = 10) -> List[LogEntry]:
|
def read_last_entries(self, count: int = 10) -> List[LogEntry]:
|
||||||
@@ -32,12 +33,12 @@ class LogReader:
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with open(self.log_file_path, 'r', encoding='utf-8') as file:
|
with open(self.log_file_path, "r", encoding="utf-8") as file:
|
||||||
lines = file.readlines()
|
lines = file.readlines()
|
||||||
|
|
||||||
# Parse log entries from the end
|
# Parse log entries from the end
|
||||||
entries = []
|
entries = []
|
||||||
for line in reversed(lines[-count * 2:]): # Read more lines in case some don't match
|
for line in reversed(lines[-count * 2 :]): # Read more lines in case some don't match
|
||||||
entry = self._parse_log_line(line.strip())
|
entry = self._parse_log_line(line.strip())
|
||||||
if entry:
|
if entry:
|
||||||
entries.append(entry)
|
entries.append(entry)
|
||||||
@@ -57,7 +58,7 @@ class LogReader:
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with open(self.log_file_path, 'r', encoding='utf-8') as file:
|
with open(self.log_file_path, "r", encoding="utf-8") as file:
|
||||||
lines = file.readlines()
|
lines = file.readlines()
|
||||||
|
|
||||||
entries = []
|
entries = []
|
||||||
@@ -80,7 +81,7 @@ class LogReader:
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with open(self.log_file_path, 'r', encoding='utf-8') as file:
|
with open(self.log_file_path, "r", encoding="utf-8") as file:
|
||||||
lines = file.readlines()
|
lines = file.readlines()
|
||||||
|
|
||||||
entries = []
|
entries = []
|
||||||
@@ -107,16 +108,16 @@ class LogReader:
|
|||||||
return {}
|
return {}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with open(self.log_file_path, 'r', encoding='utf-8') as file:
|
with open(self.log_file_path, "r", encoding="utf-8") as file:
|
||||||
lines = file.readlines()
|
lines = file.readlines()
|
||||||
|
|
||||||
stats = {
|
stats = {
|
||||||
'total_lines': len(lines),
|
"total_lines": len(lines),
|
||||||
'INFO': 0,
|
"INFO": 0,
|
||||||
'WARNING': 0,
|
"WARNING": 0,
|
||||||
'ERROR': 0,
|
"ERROR": 0,
|
||||||
'DEBUG': 0,
|
"DEBUG": 0,
|
||||||
'CRITICAL': 0
|
"CRITICAL": 0,
|
||||||
}
|
}
|
||||||
|
|
||||||
for line in lines:
|
for line in lines:
|
||||||
@@ -143,14 +144,10 @@ class LogReader:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
timestamp_str, logger, level, message = match.groups()
|
timestamp_str, logger, level, message = match.groups()
|
||||||
timestamp = datetime.strptime(timestamp_str, '%Y-%m-%d %H:%M:%S,%f')
|
timestamp = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S,%f")
|
||||||
|
|
||||||
return LogEntry(
|
return LogEntry(
|
||||||
timestamp=timestamp,
|
timestamp=timestamp, logger=logger, level=level, message=message, raw_line=line
|
||||||
logger=logger,
|
|
||||||
level=level,
|
|
||||||
message=message,
|
|
||||||
raw_line=line
|
|
||||||
)
|
)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
return None
|
return None
|
||||||
@@ -168,7 +165,7 @@ class MultiLogReader:
|
|||||||
if not self.log_directory.exists():
|
if not self.log_directory.exists():
|
||||||
return []
|
return []
|
||||||
|
|
||||||
return list(self.log_directory.glob('*.log'))
|
return list(self.log_directory.glob("*.log"))
|
||||||
|
|
||||||
def read_from_all_files(self, count: int = 10) -> List[LogEntry]:
|
def read_from_all_files(self, count: int = 10) -> List[LogEntry]:
|
||||||
"""Read entries from all log files and merge them chronologically."""
|
"""Read entries from all log files and merge them chronologically."""
|
||||||
|
|||||||
+19
-12
@@ -9,6 +9,7 @@ import plotly.express as px
|
|||||||
import streamlit as st
|
import streamlit as st
|
||||||
|
|
||||||
from core.utils import get_data_file_path
|
from core.utils import get_data_file_path
|
||||||
|
from core.utils.data_loader import OPTIMIZED_DTYPES
|
||||||
from research.experiment.experiment_runner import ExperimentRunner
|
from research.experiment.experiment_runner import ExperimentRunner
|
||||||
from research.experiment.experiment_tracker import ExperimentTracker
|
from research.experiment.experiment_tracker import ExperimentTracker
|
||||||
|
|
||||||
@@ -16,7 +17,9 @@ from research.experiment.experiment_tracker import ExperimentTracker
|
|||||||
class Predictions:
|
class Predictions:
|
||||||
"""Handles prediction interface"""
|
"""Handles prediction interface"""
|
||||||
|
|
||||||
def __init__(self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner):
|
def __init__(
|
||||||
|
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
|
||||||
|
):
|
||||||
self.config = config
|
self.config = config
|
||||||
self.experiment_tracker = experiment_tracker
|
self.experiment_tracker = experiment_tracker
|
||||||
self.experiment_runner = experiment_runner
|
self.experiment_runner = experiment_runner
|
||||||
@@ -86,7 +89,9 @@ class Predictions:
|
|||||||
confidence = self._get_prediction_confidence(model, input_df)
|
confidence = self._get_prediction_confidence(model, input_df)
|
||||||
|
|
||||||
# Display results
|
# Display results
|
||||||
self._display_single_prediction_results(prediction, confidence, experiment, name_input)
|
self._display_single_prediction_results(
|
||||||
|
prediction, confidence, experiment, name_input
|
||||||
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
st.error(f"Error making prediction: {e}")
|
st.error(f"Error making prediction: {e}")
|
||||||
@@ -114,8 +119,9 @@ class Predictions:
|
|||||||
except:
|
except:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _display_single_prediction_results(self, prediction: str, confidence: Optional[float],
|
def _display_single_prediction_results(
|
||||||
experiment, name_input: str):
|
self, prediction: str, confidence: Optional[float], experiment, name_input: str
|
||||||
|
):
|
||||||
"""Display single prediction results"""
|
"""Display single prediction results"""
|
||||||
col1, col2 = st.columns(2)
|
col1, col2 = st.columns(2)
|
||||||
|
|
||||||
@@ -129,9 +135,7 @@ class Predictions:
|
|||||||
|
|
||||||
# Additional info
|
# Additional info
|
||||||
st.info(f"Model used: {experiment.config.name}")
|
st.info(f"Model used: {experiment.config.name}")
|
||||||
st.info(
|
st.info(f"Features used: {', '.join([f.value for f in experiment.config.features])}")
|
||||||
f"Features used: {', '.join([f.value for f in experiment.config.features])}"
|
|
||||||
)
|
|
||||||
|
|
||||||
def show_batch_prediction(self, experiment):
|
def show_batch_prediction(self, experiment):
|
||||||
"""Show batch prediction interface"""
|
"""Show batch prediction interface"""
|
||||||
@@ -141,7 +145,7 @@ class Predictions:
|
|||||||
|
|
||||||
if uploaded_file is not None:
|
if uploaded_file is not None:
|
||||||
try:
|
try:
|
||||||
df = pd.read_csv(uploaded_file)
|
df = pd.read_csv(uploaded_file, dtype=OPTIMIZED_DTYPES)
|
||||||
|
|
||||||
st.write("**Uploaded Data Preview:**")
|
st.write("**Uploaded Data Preview:**")
|
||||||
st.dataframe(df.head(), use_container_width=True)
|
st.dataframe(df.head(), use_container_width=True)
|
||||||
@@ -296,13 +300,14 @@ class Predictions:
|
|||||||
def _load_dataset(self, file_path: str) -> pd.DataFrame:
|
def _load_dataset(self, file_path: str) -> pd.DataFrame:
|
||||||
"""Load dataset with error handling"""
|
"""Load dataset with error handling"""
|
||||||
try:
|
try:
|
||||||
return pd.read_csv(file_path)
|
return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
st.error(f"Error loading dataset: {e}")
|
st.error(f"Error loading dataset: {e}")
|
||||||
return pd.DataFrame()
|
return pd.DataFrame()
|
||||||
|
|
||||||
def _run_dataset_prediction(self, df: pd.DataFrame, experiment, sample_size: int,
|
def _run_dataset_prediction(
|
||||||
compare_with_actual: bool):
|
self, df: pd.DataFrame, experiment, sample_size: int, compare_with_actual: bool
|
||||||
|
):
|
||||||
"""Run dataset prediction and display results"""
|
"""Run dataset prediction and display results"""
|
||||||
with st.spinner("Running predictions..."):
|
with st.spinner("Running predictions..."):
|
||||||
# Sample data if requested
|
# Sample data if requested
|
||||||
@@ -353,7 +358,9 @@ class Predictions:
|
|||||||
|
|
||||||
with col2:
|
with col2:
|
||||||
st.write("**Sample Incorrect Predictions**")
|
st.write("**Sample Incorrect Predictions**")
|
||||||
incorrect_sample = df_sample[~correct_mask][["name", "sex", "predicted_gender"]].head(10)
|
incorrect_sample = df_sample[~correct_mask][["name", "sex", "predicted_gender"]].head(
|
||||||
|
10
|
||||||
|
)
|
||||||
st.dataframe(incorrect_sample, use_container_width=True)
|
st.dataframe(incorrect_sample, use_container_width=True)
|
||||||
|
|
||||||
def _display_dataset_predictions(self, df_sample: pd.DataFrame):
|
def _display_dataset_predictions(self, df_sample: pd.DataFrame):
|
||||||
|
|||||||
@@ -13,7 +13,9 @@ from research.experiment.experiment_tracker import ExperimentTracker
|
|||||||
class ResultsAnalysis:
|
class ResultsAnalysis:
|
||||||
"""Handles experiment results and analysis interface"""
|
"""Handles experiment results and analysis interface"""
|
||||||
|
|
||||||
def __init__(self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner):
|
def __init__(
|
||||||
|
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
|
||||||
|
):
|
||||||
self.config = config
|
self.config = config
|
||||||
self.experiment_tracker = experiment_tracker
|
self.experiment_tracker = experiment_tracker
|
||||||
self.experiment_runner = experiment_runner
|
self.experiment_runner = experiment_runner
|
||||||
@@ -21,7 +23,9 @@ class ResultsAnalysis:
|
|||||||
def index(self):
|
def index(self):
|
||||||
"""Main results analysis page"""
|
"""Main results analysis page"""
|
||||||
st.header("Results & Analysis")
|
st.header("Results & Analysis")
|
||||||
tab1, tab2, tab3 = st.tabs(["Experiment Comparison", "Performance Analysis", "Model Analysis"])
|
tab1, tab2, tab3 = st.tabs(
|
||||||
|
["Experiment Comparison", "Performance Analysis", "Model Analysis"]
|
||||||
|
)
|
||||||
|
|
||||||
with tab1:
|
with tab1:
|
||||||
self.show_experiment_comparison()
|
self.show_experiment_comparison()
|
||||||
|
|||||||
@@ -8,13 +8,11 @@ from core.config import setup_config
|
|||||||
from core.utils import get_data_file_path
|
from core.utils import get_data_file_path
|
||||||
from core.utils.data_loader import DataLoader
|
from core.utils.data_loader import DataLoader
|
||||||
from processing.batch.batch_config import BatchConfig
|
from processing.batch.batch_config import BatchConfig
|
||||||
from processing.ner.ner_data_builder import NERDataBuilder
|
|
||||||
from processing.pipeline import Pipeline
|
from processing.pipeline import Pipeline
|
||||||
from processing.steps.data_cleaning_step import DataCleaningStep
|
from processing.steps.data_cleaning_step import DataCleaningStep
|
||||||
from processing.steps.data_splitting_step import DataSplittingStep
|
from processing.steps.data_splitting_step import DataSplittingStep
|
||||||
from processing.steps.feature_extraction_step import FeatureExtractionStep
|
from processing.steps.feature_extraction_step import FeatureExtractionStep
|
||||||
from processing.steps.llm_annotation_step import LLMAnnotationStep
|
from processing.steps.llm_annotation_step import LLMAnnotationStep
|
||||||
from processing.steps.ner_annotation_step import NERAnnotationStep
|
|
||||||
|
|
||||||
|
|
||||||
def create_pipeline(config) -> Pipeline:
|
def create_pipeline(config) -> Pipeline:
|
||||||
@@ -31,9 +29,8 @@ def create_pipeline(config) -> Pipeline:
|
|||||||
steps = [
|
steps = [
|
||||||
DataCleaningStep(config),
|
DataCleaningStep(config),
|
||||||
FeatureExtractionStep(config),
|
FeatureExtractionStep(config),
|
||||||
NERAnnotationStep(config),
|
# NERAnnotationStep(config),
|
||||||
LLMAnnotationStep(config),
|
LLMAnnotationStep(config),
|
||||||
DataSplittingStep(config),
|
|
||||||
]
|
]
|
||||||
|
|
||||||
for stage in config.stages:
|
for stage in config.stages:
|
||||||
@@ -56,6 +53,7 @@ def run_pipeline(config) -> int:
|
|||||||
return 1
|
return 1
|
||||||
|
|
||||||
data_loader = DataLoader(config)
|
data_loader = DataLoader(config)
|
||||||
|
data_splitter = DataSplittingStep(config)
|
||||||
logging.info(f"Loading data from {input_file_path}")
|
logging.info(f"Loading data from {input_file_path}")
|
||||||
df = data_loader.load_csv_complete(input_file_path)
|
df = data_loader.load_csv_complete(input_file_path)
|
||||||
logging.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")
|
logging.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")
|
||||||
@@ -64,13 +62,7 @@ def run_pipeline(config) -> int:
|
|||||||
pipeline = create_pipeline(config)
|
pipeline = create_pipeline(config)
|
||||||
|
|
||||||
logging.info("Starting pipeline execution")
|
logging.info("Starting pipeline execution")
|
||||||
result_df = pipeline.run(df)
|
data_splitter.split(pipeline.run(df))
|
||||||
|
|
||||||
# Save results using the splitting step
|
|
||||||
splitting_step = pipeline.steps[-1]
|
|
||||||
if isinstance(splitting_step, DataSplittingStep):
|
|
||||||
splitting_step.save_splits(result_df)
|
|
||||||
NERDataBuilder(config).build(result_df)
|
|
||||||
|
|
||||||
# Show completion statistics
|
# Show completion statistics
|
||||||
progress = pipeline.get_progress()
|
progress = pipeline.get_progress()
|
||||||
@@ -94,7 +86,7 @@ def run_pipeline(config) -> int:
|
|||||||
def main():
|
def main():
|
||||||
"""Main entry point with unified configuration loading"""
|
"""Main entry point with unified configuration loading"""
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="DRC Names Processing Pipeline",
|
description="DRC NERS Processing Pipeline",
|
||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
)
|
)
|
||||||
parser.add_argument("--config", type=str, help="Path to configuration file")
|
parser.add_argument("--config", type=str, help="Path to configuration file")
|
||||||
|
|||||||
+14
-13
@@ -9,39 +9,40 @@ from processing.monitoring.pipeline_monitor import PipelineMonitor
|
|||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
choices = ["data_cleaning", "feature_extraction", "ner_annotation", "llm_annotation", "data_splitting"]
|
choices = [
|
||||||
|
"data_cleaning",
|
||||||
|
"feature_extraction",
|
||||||
|
"ner_annotation",
|
||||||
|
"llm_annotation",
|
||||||
|
"data_splitting",
|
||||||
|
]
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description="Monitor and manage the DRC names processing pipeline")
|
parser = argparse.ArgumentParser(description="DRC NERS Processing Monitoring")
|
||||||
parser.add_argument("--config", type=Path, help="Path to configuration file")
|
parser.add_argument("--config", type=Path, help="Path to configuration file")
|
||||||
parser.add_argument("--env", type=str, default="development", help="Environment")
|
parser.add_argument("--env", type=str, default="development", help="Environment")
|
||||||
subparsers = parser.add_subparsers(dest="command", help="Available commands")
|
subparsers = parser.add_subparsers(dest="command", help="Available commands")
|
||||||
|
|
||||||
# Status command
|
|
||||||
subparsers.add_parser("status", help="Show pipeline status")
|
|
||||||
|
|
||||||
# Clean command
|
# Clean command
|
||||||
clean_parser = subparsers.add_parser("clean", help="Clean checkpoint files")
|
clean_parser = subparsers.add_parser("clean", help="Clean checkpoint files")
|
||||||
clean_parser.add_argument("--step", type=str, choices=choices, help="Specific step (default: all)")
|
clean_parser.add_argument("--step", type=str, choices=choices, help="default: all")
|
||||||
clean_parser.add_argument("--keep-last", type=int, default=1, help="Checkpoints to keep (default: 1)")
|
clean_parser.add_argument("--keep-last", type=int, default=1, help="(default: 1)")
|
||||||
clean_parser.add_argument("--force", action="store_true", help="Clean without confirmation")
|
clean_parser.add_argument("--force", action="store_true", help="Clean without confirmation")
|
||||||
|
|
||||||
# Reset command
|
# Reset command
|
||||||
reset_parser = subparsers.add_parser("reset", help="Reset pipeline step")
|
reset_parser = subparsers.add_parser("reset", help="Reset pipeline step")
|
||||||
reset_parser.add_argument("--step", type=str, choices=choices, help="Specific step (default: all)")
|
reset_parser.add_argument("--step", type=str, choices=choices, help="(default: all)")
|
||||||
reset_parser.add_argument("--all", action="store_true", help="Reset all steps")
|
reset_parser.add_argument("--all", action="store_true", help="Reset all steps")
|
||||||
reset_parser.add_argument("--force", action="store_true", help="Reset without confirmation")
|
reset_parser.add_argument("--force", action="store_true", help="Reset without confirmation")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if not args.command:
|
|
||||||
parser.print_help()
|
|
||||||
return 1
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
setup_config(config_path=args.config, env=args.env)
|
setup_config(config_path=args.config, env=args.env)
|
||||||
monitor = PipelineMonitor()
|
monitor = PipelineMonitor()
|
||||||
|
|
||||||
if args.command == "status":
|
if not args.command:
|
||||||
|
parser.print_help()
|
||||||
monitor.print_status(detailed=True)
|
monitor.print_status(detailed=True)
|
||||||
|
return 1
|
||||||
|
|
||||||
elif args.command == "clean":
|
elif args.command == "clean":
|
||||||
checkpoint_info = monitor.count_checkpoint_files()
|
checkpoint_info = monitor.count_checkpoint_files()
|
||||||
|
|||||||
@@ -2,51 +2,89 @@
|
|||||||
import argparse
|
import argparse
|
||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
|
import os
|
||||||
|
import traceback
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from core.config import setup_config
|
from core.config import setup_config, PipelineConfig
|
||||||
from processing.ner.ner_data_builder import NERDataBuilder
|
from processing.ner.ner_data_builder import NERDataBuilder
|
||||||
|
from processing.ner.ner_engineering import NEREngineering
|
||||||
from processing.ner.ner_name_model import NERNameModel
|
from processing.ner.ner_name_model import NERNameModel
|
||||||
|
|
||||||
|
|
||||||
def train(config_path=None, env="development"):
|
def feature(config: PipelineConfig):
|
||||||
|
"""Apply feature engineering to create position-independent NER dataset."""
|
||||||
|
NEREngineering(config).compute()
|
||||||
|
|
||||||
|
|
||||||
|
def build(config: PipelineConfig):
|
||||||
|
"""Build NER dataset using NERDataBuilder."""
|
||||||
|
NERDataBuilder(config).build()
|
||||||
|
|
||||||
|
|
||||||
|
def train(config: PipelineConfig):
|
||||||
"""Train the NER model."""
|
"""Train the NER model."""
|
||||||
try:
|
trainer = NERNameModel(config)
|
||||||
config = setup_config(config_path=config_path, env=env)
|
|
||||||
trainer = NERNameModel(config)
|
|
||||||
builder = NERDataBuilder(config)
|
|
||||||
|
|
||||||
data_path = Path(config.paths.data_dir) / config.data.output_files["ner_data"]
|
data_path = Path(config.paths.data_dir) / config.data.output_files["ner_data"]
|
||||||
if not data_path.exists():
|
if not data_path.exists():
|
||||||
builder.build()
|
logging.info("NER data not found. Building dataset first...")
|
||||||
|
build(config)
|
||||||
|
|
||||||
trainer.create_blank_model("fr")
|
trainer.create_blank_model("fr")
|
||||||
data = trainer.load_data(str(data_path))
|
data = trainer.load_data(str(data_path))
|
||||||
|
|
||||||
split_idx = int(len(data) * 0.8)
|
split_idx = int(len(data) * 0.9)
|
||||||
train_data, eval_data = data[:split_idx], data[split_idx:]
|
train_data, eval_data = data[:split_idx], data[split_idx:]
|
||||||
|
|
||||||
logging.info(f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}")
|
logging.info(f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}")
|
||||||
trainer.train(train_data, epochs=1, batch_size=config.processing.batch_size, dropout_rate=0.3)
|
trainer.train(
|
||||||
trainer.evaluate(eval_data)
|
data=train_data, epochs=1, batch_size=config.processing.batch_size, dropout_rate=0.3
|
||||||
|
)
|
||||||
|
trainer.evaluate(eval_data)
|
||||||
|
|
||||||
model_path = trainer.save()
|
model_path = trainer.save()
|
||||||
logging.info(f"Model saved to: {model_path}")
|
logging.info(f"Model saved to: {model_path}")
|
||||||
return 0
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logging.error(f"NER Training failed: {e}", exc_info=True)
|
def run_pipeline(config: PipelineConfig, reset: bool = False):
|
||||||
return 1
|
# Step 1: Feature engineering
|
||||||
|
if not reset and os.path.exists(config.paths.data_dir / config.data.output_files["engineered"]):
|
||||||
|
logging.info("Step 1: Feature engineering already done.")
|
||||||
|
else:
|
||||||
|
logging.info("Step 1: Running feature engineering")
|
||||||
|
feature(config)
|
||||||
|
|
||||||
|
# Step 2: Build dataset
|
||||||
|
if not reset and os.path.exists(config.paths.data_dir / config.data.output_files["ner_data"]):
|
||||||
|
logging.info("Step 2: NER dataset already built.")
|
||||||
|
else:
|
||||||
|
logging.info("Step 2: Building NER dataset")
|
||||||
|
build(config)
|
||||||
|
|
||||||
|
# Step 3: Train model
|
||||||
|
logging.info("Step 3: Training NER Model")
|
||||||
|
train(config)
|
||||||
|
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description="Train NER model for DRC names")
|
parser = argparse.ArgumentParser(description="NER model management for DRC names")
|
||||||
parser.add_argument("--config", type=str, help="Path to configuration file")
|
parser.add_argument("--config", type=str, help="Path to configuration file")
|
||||||
parser.add_argument("--env", type=str, default="development", help="Environment name")
|
parser.add_argument("--env", type=str, default="development", help="Environment name")
|
||||||
|
parser.add_argument("--reset", action="store_true", help="Reset all steps")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
sys.exit(train(config_path=args.config, env=args.env))
|
try:
|
||||||
|
config = setup_config(config_path=args.config, env=args.env)
|
||||||
|
return run_pipeline(config, args.reset)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Pipeline failed: {e}")
|
||||||
|
traceback.print_exc()
|
||||||
|
return 1
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
sys.exit(main())
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ from typing import Iterator
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from processing.batch.batch_config import BatchConfig
|
from processing.batch.batch_config import BatchConfig
|
||||||
|
from processing.batch.memory_monitor import MemoryMonitor
|
||||||
from processing.steps import PipelineStep
|
from processing.steps import PipelineStep
|
||||||
|
|
||||||
|
|
||||||
@@ -13,28 +14,36 @@ class BatchProcessor:
|
|||||||
|
|
||||||
def __init__(self, config: BatchConfig):
|
def __init__(self, config: BatchConfig):
|
||||||
self.config = config
|
self.config = config
|
||||||
|
self.memory_monitor = MemoryMonitor()
|
||||||
|
|
||||||
def create_batches(self, df: pd.DataFrame) -> Iterator[tuple[pd.DataFrame, int]]:
|
def create_batches(self, df: pd.DataFrame) -> Iterator[tuple[pd.DataFrame, int]]:
|
||||||
"""Create batches from DataFrame"""
|
"""Create batches from DataFrame without unnecessary copies"""
|
||||||
total_rows = len(df)
|
total_rows = len(df)
|
||||||
batch_size = self.config.batch_size
|
batch_size = self.config.batch_size
|
||||||
|
|
||||||
for i in range(0, total_rows, batch_size):
|
for i in range(0, total_rows, batch_size):
|
||||||
batch = df.iloc[i : i + batch_size].copy()
|
batch = df.iloc[i : i + batch_size]
|
||||||
batch_id = i // batch_size
|
batch_id = i // batch_size
|
||||||
yield batch, batch_id
|
yield batch, batch_id
|
||||||
|
|
||||||
def process_sequential(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
|
def process_sequential(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
|
||||||
"""Process batches sequentially"""
|
"""Memory-optimized sequential processing"""
|
||||||
results = []
|
results = []
|
||||||
|
memory_threshold_mb = 1000 # Clean memory when usage exceeds 1 GB
|
||||||
|
|
||||||
for batch, batch_id in self.create_batches(df):
|
for batch_num, (batch, batch_id) in enumerate(self.create_batches(df)):
|
||||||
if step.batch_exists(batch_id):
|
if step.batch_exists(batch_id):
|
||||||
logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
|
logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
|
||||||
processed_batch = step.load_batch(batch_id)
|
processed_batch = step.load_batch(batch_id)
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
processed_batch = step.process_batch(batch, batch_id)
|
# Only copy if the processing step requires mutation
|
||||||
|
if step.requires_batch_mutation:
|
||||||
|
batch_copy = batch.copy()
|
||||||
|
processed_batch = step.process_batch(batch_copy, batch_id)
|
||||||
|
else:
|
||||||
|
processed_batch = step.process_batch(batch, batch_id)
|
||||||
|
|
||||||
step.save_batch(processed_batch, batch_id)
|
step.save_batch(processed_batch, batch_id)
|
||||||
step.state.processed_batches += 1
|
step.state.processed_batches += 1
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -44,14 +53,32 @@ class BatchProcessor:
|
|||||||
|
|
||||||
results.append(processed_batch)
|
results.append(processed_batch)
|
||||||
|
|
||||||
|
# Memory management
|
||||||
|
if batch_num % self.config.checkpoint_interval == 0:
|
||||||
|
current_memory = self.memory_monitor.get_memory_usage_mb()
|
||||||
|
if current_memory > memory_threshold_mb:
|
||||||
|
logging.info(f"Memory cleanup triggered at {current_memory:.1f} MB")
|
||||||
|
self.memory_monitor.cleanup_memory()
|
||||||
|
|
||||||
# Save state periodically
|
# Save state periodically
|
||||||
if batch_id % self.config.checkpoint_interval == 0:
|
if batch_id % self.config.checkpoint_interval == 0:
|
||||||
step.save_state()
|
step.save_state()
|
||||||
|
|
||||||
return pd.concat(results, ignore_index=True) if results else pd.DataFrame()
|
# Final memory cleanup before concatenation
|
||||||
|
self.memory_monitor.cleanup_memory()
|
||||||
|
self.memory_monitor.log_memory_usage("before_concat")
|
||||||
|
|
||||||
|
result = self._safe_concat(results) if results else pd.DataFrame()
|
||||||
|
|
||||||
|
# Final cleanup
|
||||||
|
del results
|
||||||
|
self.memory_monitor.cleanup_memory()
|
||||||
|
self.memory_monitor.log_memory_usage("sequential_complete")
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
def process_concurrent(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
|
def process_concurrent(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
|
||||||
"""Process batches concurrently"""
|
"""Memory-optimized concurrent processing"""
|
||||||
executor_class = (
|
executor_class = (
|
||||||
ProcessPoolExecutor if self.config.use_multiprocessing else ThreadPoolExecutor
|
ProcessPoolExecutor if self.config.use_multiprocessing else ThreadPoolExecutor
|
||||||
)
|
)
|
||||||
@@ -65,7 +92,9 @@ class BatchProcessor:
|
|||||||
logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
|
logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
|
||||||
results[batch_id] = step.load_batch(batch_id)
|
results[batch_id] = step.load_batch(batch_id)
|
||||||
else:
|
else:
|
||||||
future = executor.submit(step.process_batch, batch, batch_id)
|
# Only copy if necessary for concurrent processing
|
||||||
|
batch_copy = batch.copy() if step.requires_batch_mutation else batch
|
||||||
|
future = executor.submit(step.process_batch, batch_copy, batch_id)
|
||||||
future_to_batch[future] = (batch_id, batch)
|
future_to_batch[future] = (batch_id, batch)
|
||||||
|
|
||||||
# Collect results as they complete
|
# Collect results as they complete
|
||||||
@@ -81,13 +110,24 @@ class BatchProcessor:
|
|||||||
logging.error(f"Failed to process batch {batch_id}: {e}")
|
logging.error(f"Failed to process batch {batch_id}: {e}")
|
||||||
step.state.failed_batches.append(batch_id)
|
step.state.failed_batches.append(batch_id)
|
||||||
|
|
||||||
# Reassemble results in order
|
# Memory-efficient reassembly
|
||||||
ordered_results = []
|
ordered_results = []
|
||||||
for batch_id in sorted(results.keys()):
|
for batch_id in sorted(results.keys()):
|
||||||
ordered_results.append(results[batch_id])
|
ordered_results.append(results[batch_id])
|
||||||
|
|
||||||
step.save_state()
|
step.save_state()
|
||||||
return pd.concat(ordered_results, ignore_index=True) if ordered_results else pd.DataFrame()
|
|
||||||
|
# Cleanup before concat
|
||||||
|
del results
|
||||||
|
self.memory_monitor.cleanup_memory()
|
||||||
|
|
||||||
|
result = self._safe_concat(ordered_results) if ordered_results else pd.DataFrame()
|
||||||
|
|
||||||
|
# Final cleanup
|
||||||
|
del ordered_results
|
||||||
|
self.memory_monitor.cleanup_memory()
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
def process(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
|
def process(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
|
||||||
"""Process data using the configured strategy"""
|
"""Process data using the configured strategy"""
|
||||||
@@ -95,8 +135,29 @@ class BatchProcessor:
|
|||||||
step.load_state()
|
step.load_state()
|
||||||
|
|
||||||
logging.info(f"Starting {step.name} with {step.state.total_batches} batches")
|
logging.info(f"Starting {step.name} with {step.state.total_batches} batches")
|
||||||
|
self.memory_monitor.log_memory_usage("process_start")
|
||||||
|
|
||||||
if self.config.max_workers == 1:
|
if self.config.max_workers == 1:
|
||||||
return self.process_sequential(step, df)
|
result = self.process_sequential(step, df)
|
||||||
else:
|
else:
|
||||||
return self.process_concurrent(step, df)
|
result = self.process_concurrent(step, df)
|
||||||
|
|
||||||
|
self.memory_monitor.log_memory_usage("process_complete")
|
||||||
|
return result
|
||||||
|
|
||||||
|
def _safe_concat(self, dfs: list) -> pd.DataFrame:
|
||||||
|
"""Memory-safe concatenation with monitoring"""
|
||||||
|
if not dfs:
|
||||||
|
return pd.DataFrame()
|
||||||
|
|
||||||
|
memory = self.memory_monitor.get_memory_usage_mb()
|
||||||
|
logging.info(f"Starting concat of {len(dfs)} DataFrames at {memory:.1f} MB")
|
||||||
|
|
||||||
|
# Use copy=False to avoid unnecessary copying during concat
|
||||||
|
result = pd.concat(dfs, ignore_index=True, copy=False)
|
||||||
|
|
||||||
|
# Monitor memory after concat
|
||||||
|
memory = self.memory_monitor.get_memory_usage_mb()
|
||||||
|
logging.info(f"Concat complete. Memory: {memory:.1f} MB")
|
||||||
|
|
||||||
|
return result
|
||||||
|
|||||||
@@ -0,0 +1,25 @@
|
|||||||
|
import gc
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import psutil
|
||||||
|
|
||||||
|
|
||||||
|
class MemoryMonitor:
|
||||||
|
"""Monitor and manage memory usage during batch processing"""
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_memory_usage_mb() -> float:
|
||||||
|
"""Get current memory usage in MB"""
|
||||||
|
process = psutil.Process()
|
||||||
|
return process.memory_info().rss / 1024 / 1024
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def cleanup_memory():
|
||||||
|
"""Force garbage collection"""
|
||||||
|
gc.collect()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def log_memory_usage(step_name: str):
|
||||||
|
"""Log current memory usage"""
|
||||||
|
memory_mb = MemoryMonitor.get_memory_usage_mb()
|
||||||
|
logging.info(f"Memory usage after {step_name}: {memory_mb:.1f} MB")
|
||||||
@@ -1,52 +0,0 @@
|
|||||||
import logging
|
|
||||||
from typing import Dict
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
|
|
||||||
class DatasetAnalyzer:
|
|
||||||
"""Analyze dataset statistics and quality"""
|
|
||||||
|
|
||||||
def __init__(self, filepath: str):
|
|
||||||
self.filepath = filepath
|
|
||||||
self.df = None
|
|
||||||
|
|
||||||
def load_data(self) -> bool:
|
|
||||||
"""Load dataset for analysis"""
|
|
||||||
try:
|
|
||||||
self.df = pd.read_csv(self.filepath)
|
|
||||||
return True
|
|
||||||
except Exception as e:
|
|
||||||
logging.error(f"Failed to load {self.filepath}: {e}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
def analyze_completion(self) -> Dict:
|
|
||||||
"""Analyze annotation completion status"""
|
|
||||||
if self.df is None:
|
|
||||||
return {}
|
|
||||||
|
|
||||||
total_rows = len(self.df)
|
|
||||||
|
|
||||||
# Check annotation status
|
|
||||||
if "annotated" in self.df.columns:
|
|
||||||
annotated_count = (self.df["annotated"] == 1).sum()
|
|
||||||
unannotated_count = (self.df["annotated"] == 0).sum()
|
|
||||||
else:
|
|
||||||
annotated_count = 0
|
|
||||||
unannotated_count = total_rows
|
|
||||||
|
|
||||||
# Analyze name completeness
|
|
||||||
complete_names = 0
|
|
||||||
if "identified_name" in self.df.columns and "identified_surname" in self.df.columns:
|
|
||||||
complete_names = (
|
|
||||||
(self.df["identified_name"].notna()) & (self.df["identified_surname"].notna())
|
|
||||||
).sum()
|
|
||||||
|
|
||||||
return {
|
|
||||||
"total_rows": total_rows,
|
|
||||||
"annotated_rows": annotated_count,
|
|
||||||
"unannotated_rows": unannotated_count,
|
|
||||||
"annotation_percentage": (annotated_count / total_rows * 100) if total_rows > 0 else 0,
|
|
||||||
"complete_names": complete_names,
|
|
||||||
"completeness_percentage": (complete_names / total_rows * 100) if total_rows > 0 else 0,
|
|
||||||
}
|
|
||||||
@@ -19,7 +19,13 @@ class PipelineMonitor:
|
|||||||
|
|
||||||
self.paths = paths
|
self.paths = paths
|
||||||
self.checkpoint_dir = paths.checkpoints_dir
|
self.checkpoint_dir = paths.checkpoints_dir
|
||||||
self.steps = ["data_cleaning", "feature_extraction", "ner_annotation", "llm_annotation", "data_splitting"]
|
self.steps = [
|
||||||
|
"data_cleaning",
|
||||||
|
"feature_extraction",
|
||||||
|
"ner_annotation",
|
||||||
|
"llm_annotation",
|
||||||
|
"data_splitting",
|
||||||
|
]
|
||||||
|
|
||||||
def get_step_status(self, step_name: str) -> Dict:
|
def get_step_status(self, step_name: str) -> Dict:
|
||||||
"""Get status of a specific pipeline step"""
|
"""Get status of a specific pipeline step"""
|
||||||
|
|||||||
@@ -13,10 +13,17 @@ class BaseNameFormatter(ABC):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, connectors: List[str] = None, additional_surnames: List[str] = None):
|
def __init__(self, connectors: List[str] = None, additional_surnames: List[str] = None):
|
||||||
self.connectors = connectors or ['wa', 'ya', 'ka', 'ba']
|
self.connectors = connectors or ["wa", "ya", "ka", "ba"]
|
||||||
self.additional_surnames = additional_surnames or [
|
self.additional_surnames = additional_surnames or [
|
||||||
'jean', 'paul', 'marie', 'joseph', 'pierre', 'claude',
|
"jean",
|
||||||
'andre', 'michel', 'robert'
|
"paul",
|
||||||
|
"marie",
|
||||||
|
"joseph",
|
||||||
|
"pierre",
|
||||||
|
"claude",
|
||||||
|
"andre",
|
||||||
|
"michel",
|
||||||
|
"robert",
|
||||||
]
|
]
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -26,7 +33,9 @@ class BaseNameFormatter(ABC):
|
|||||||
return []
|
return []
|
||||||
return native_str.strip().split()
|
return native_str.strip().split()
|
||||||
|
|
||||||
def create_ner_tags(self, text: str, native_parts: List[str], surname: str) -> List[Tuple[int, int, str]]:
|
def create_ner_tags(
|
||||||
|
self, text: str, native_parts: List[str], surname: str
|
||||||
|
) -> List[Tuple[int, int, str]]:
|
||||||
"""Create NER entity tags for transformed text"""
|
"""Create NER entity tags for transformed text"""
|
||||||
entities = []
|
entities = []
|
||||||
current_pos = 0
|
current_pos = 0
|
||||||
@@ -38,15 +47,15 @@ class BaseNameFormatter(ABC):
|
|||||||
|
|
||||||
# Determine tag based on word content
|
# Determine tag based on word content
|
||||||
if word in native_parts or any(connector in word for connector in self.connectors):
|
if word in native_parts or any(connector in word for connector in self.connectors):
|
||||||
tag = 'NATIVE'
|
tag = "NATIVE"
|
||||||
elif word == surname or word in self.additional_surnames:
|
elif word == surname or word in self.additional_surnames:
|
||||||
tag = 'SURNAME'
|
tag = "SURNAME"
|
||||||
else:
|
else:
|
||||||
# Check if it's a compound native word or new surname
|
# Check if it's a compound native word or new surname
|
||||||
if any(part in word for part in native_parts):
|
if any(part in word for part in native_parts):
|
||||||
tag = 'NATIVE'
|
tag = "NATIVE"
|
||||||
else:
|
else:
|
||||||
tag = 'SURNAME'
|
tag = "SURNAME"
|
||||||
|
|
||||||
entities.append((start_pos, end_pos, tag))
|
entities.append((start_pos, end_pos, tag))
|
||||||
current_pos = end_pos + 1 # +1 for space
|
current_pos = end_pos + 1 # +1 for space
|
||||||
@@ -54,15 +63,17 @@ class BaseNameFormatter(ABC):
|
|||||||
return entities
|
return entities
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def compute_numeric_features(cls, name: str) -> Dict:
    """Derive word count, character length and category for a transformed name.

    A falsy ``name`` counts as zero words and zero characters. The category
    is ``NameCategory.SIMPLE`` for exactly three words and
    ``NameCategory.COMPOSE`` otherwise; its ``.value`` is what gets returned.
    """
    if name:
        n_words = len(name.split())
        n_chars = len(name)
    else:
        n_words = 0
        n_chars = 0

    if n_words == 3:
        category = NameCategory.SIMPLE.value
    else:
        category = NameCategory.COMPOSE.value

    return {"words": n_words, "length": n_chars, "identified_category": category}
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
|
|||||||
@@ -8,8 +8,8 @@ from processing.ner.formats import BaseNameFormatter
|
|||||||
|
|
||||||
class ConnectorFormatter(BaseNameFormatter):
|
class ConnectorFormatter(BaseNameFormatter):
|
||||||
def transform(self, row: pd.Series) -> Dict:
|
def transform(self, row: pd.Series) -> Dict:
|
||||||
native_parts = self.parse_native_components(row['probable_native'])
|
native_parts = self.parse_native_components(row["probable_native"])
|
||||||
surname = row['probable_surname'] if pd.notna(row['probable_surname']) else ''
|
surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
|
||||||
connector = random.choice(self.connectors)
|
connector = random.choice(self.connectors)
|
||||||
|
|
||||||
# Connect native parts with a random connector
|
# Connect native parts with a random connector
|
||||||
@@ -17,20 +17,22 @@ class ConnectorFormatter(BaseNameFormatter):
|
|||||||
connected_native = f" {connector} ".join(native_parts)
|
connected_native = f" {connector} ".join(native_parts)
|
||||||
full_name = f"{connected_native} {surname}".strip()
|
full_name = f"{connected_native} {surname}".strip()
|
||||||
else:
|
else:
|
||||||
connected_native = f"{row['probable_native']} {connector} {row['probable_native']}".strip()
|
connected_native = (
|
||||||
|
f"{row['probable_native']} {connector} {row['probable_native']}".strip()
|
||||||
|
)
|
||||||
full_name = f"{connected_native} {surname}".strip()
|
full_name = f"{connected_native} {surname}".strip()
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'name': full_name,
|
"name": full_name,
|
||||||
'probable_native': connected_native,
|
"probable_native": connected_native,
|
||||||
'identify_name': connected_native,
|
"identified_name": connected_native,
|
||||||
'probable_surname': surname,
|
"probable_surname": surname,
|
||||||
'identify_surname': surname,
|
"identified_surname": surname,
|
||||||
'ner_entities': str(self.create_ner_tags(full_name, native_parts, surname)),
|
"ner_entities": str(self.create_ner_tags(full_name, native_parts, surname)),
|
||||||
'transformation_type': self.transformation_type,
|
"transformation_type": self.transformation_type,
|
||||||
**self.compute_derived_attributes(full_name)
|
**self.compute_numeric_features(full_name),
|
||||||
}
|
}
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def transformation_type(self) -> str:
|
def transformation_type(self) -> str:
|
||||||
return 'connector_added'
|
return "connector_added"
|
||||||
|
|||||||
@@ -8,8 +8,8 @@ from processing.ner.formats import BaseNameFormatter
|
|||||||
|
|
||||||
class ExtendedSurnameFormatter(BaseNameFormatter):
|
class ExtendedSurnameFormatter(BaseNameFormatter):
|
||||||
def transform(self, row: pd.Series) -> Dict:
|
def transform(self, row: pd.Series) -> Dict:
|
||||||
native_parts = self.parse_native_components(row['probable_native'])
|
native_parts = self.parse_native_components(row["probable_native"])
|
||||||
original_surname = row['probable_surname'] if pd.notna(row['probable_surname']) else ''
|
original_surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
|
||||||
|
|
||||||
# Add random additional surname
|
# Add random additional surname
|
||||||
additional_surname = random.choice(self.additional_surnames)
|
additional_surname = random.choice(self.additional_surnames)
|
||||||
@@ -17,16 +17,16 @@ class ExtendedSurnameFormatter(BaseNameFormatter):
|
|||||||
full_name = f"{row['probable_native']} {combined_surname}".strip()
|
full_name = f"{row['probable_native']} {combined_surname}".strip()
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'name': full_name,
|
"name": full_name,
|
||||||
'probable_native': row['probable_native'],
|
"probable_native": row["probable_native"],
|
||||||
'identify_name': row['probable_native'],
|
"identified_name": row["probable_native"],
|
||||||
'probable_surname': combined_surname,
|
"probable_surname": combined_surname,
|
||||||
'identity_surname': combined_surname,
|
"identified_surname": combined_surname,
|
||||||
'ner_entities': str(self.create_ner_tags(full_name, native_parts, combined_surname)),
|
"ner_entities": str(self.create_ner_tags(full_name, native_parts, combined_surname)),
|
||||||
'transformation_type': self.transformation_type,
|
"transformation_type": self.transformation_type,
|
||||||
**self.compute_derived_attributes(full_name)
|
**self.compute_numeric_features(full_name),
|
||||||
}
|
}
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def transformation_type(self) -> str:
|
def transformation_type(self) -> str:
|
||||||
return 'extended_surname'
|
return "extended_surname"
|
||||||
|
|||||||
@@ -7,22 +7,22 @@ from processing.ner.formats import BaseNameFormatter
|
|||||||
|
|
||||||
class NativeOnlyFormatter(BaseNameFormatter):
    """Formatter that keeps only the native components and drops the surname."""

    def transform(self, row: pd.Series) -> Dict:
        """Build the output record for ``row`` using ``probable_native`` alone.

        Both surname fields are emitted as empty strings and the NER tags are
        computed with an empty surname.
        """
        native = row["probable_native"]
        parts = self.parse_native_components(native)

        # The full name is exactly the native part.
        full_name = native
        tagged = str(self.create_ner_tags(full_name, parts, ""))

        record = {
            "name": full_name,
            "probable_native": native,
            "identified_name": native,
            "probable_surname": "",
            "identified_surname": "",
            "ner_entities": tagged,
            "transformation_type": self.transformation_type,
        }
        record.update(self.compute_numeric_features(full_name))
        return record

    @property
    def transformation_type(self) -> str:
        """Identifier written to the ``transformation_type`` field."""
        return "native_only"
|
||||||
|
|||||||
@@ -7,23 +7,23 @@ from processing.ner.formats import BaseNameFormatter
|
|||||||
|
|
||||||
class OriginalFormatter(BaseNameFormatter):
    """Formatter that keeps the name in "native components, then surname" order."""

    def transform(self, row: pd.Series) -> Dict:
        """Build the output record for ``row`` without reordering anything.

        A missing ``probable_surname`` (per ``pd.notna``) is treated as an
        empty string.
        """
        native = row["probable_native"]
        parts = self.parse_native_components(native)

        surname = ""
        if pd.notna(row["probable_surname"]):
            surname = row["probable_surname"]

        # Native components first, surname last.
        full_name = f"{native} {surname}".strip()
        tagged = str(self.create_ner_tags(full_name, parts, surname))

        record = {
            "name": full_name,
            "probable_native": native,
            "identified_name": native,
            "probable_surname": surname,
            "identified_surname": surname,
            "ner_entities": tagged,
            "transformation_type": self.transformation_type,
        }
        record.update(self.compute_numeric_features(full_name))
        return record

    @property
    def transformation_type(self) -> str:
        """Identifier written to the ``transformation_type`` field."""
        return "original"
|
||||||
|
|||||||
@@ -7,23 +7,23 @@ from processing.ner.formats import BaseNameFormatter
|
|||||||
|
|
||||||
class PositionFlippedFormatter(BaseNameFormatter):
    """Formatter that places the surname before the native components."""

    def transform(self, row: pd.Series) -> Dict:
        """Build the output record for ``row`` with surname and native swapped.

        A missing ``probable_surname`` (per ``pd.notna``) is treated as an
        empty string.
        """
        native = row["probable_native"]
        parts = self.parse_native_components(native)

        surname = ""
        if pd.notna(row["probable_surname"]):
            surname = row["probable_surname"]

        # Surname first, native components last.
        full_name = f"{surname} {native}".strip()
        tagged = str(self.create_ner_tags(full_name, parts, surname))

        record = {
            "name": full_name,
            "probable_native": native,
            "identified_name": native,
            "probable_surname": surname,
            "identified_surname": surname,
            "ner_entities": tagged,
            "transformation_type": self.transformation_type,
        }
        record.update(self.compute_numeric_features(full_name))
        return record

    @property
    def transformation_type(self) -> str:
        """Identifier written to the ``transformation_type`` field."""
        return "position_flipped"
|
||||||
|
|||||||
@@ -7,24 +7,24 @@ from processing.ner.formats import BaseNameFormatter
|
|||||||
|
|
||||||
class ReducedNativeFormatter(BaseNameFormatter):
    """Formatter that keeps only the first native component plus the surname."""

    def transform(self, row: pd.Series) -> Dict:
        """Build the output record for ``row`` with a shortened native part.

        When the native part has more than one component only the first is
        kept; otherwise ``probable_native`` is used unchanged. A missing
        ``probable_surname`` (per ``pd.notna``) is treated as an empty string.
        """
        parts = self.parse_native_components(row["probable_native"])

        surname = ""
        if pd.notna(row["probable_surname"]):
            surname = row["probable_surname"]

        if len(parts) > 1:
            reduced_native = parts[0]
        else:
            reduced_native = row["probable_native"]

        full_name = f"{reduced_native} {surname}".strip()
        # Only the retained component is passed on as a native part.
        tagged = str(self.create_ner_tags(full_name, [reduced_native], surname))

        record = {
            "name": full_name,
            "probable_native": reduced_native,
            "identified_name": reduced_native,
            "probable_surname": surname,
            "identified_surname": surname,
            "ner_entities": tagged,
            "transformation_type": self.transformation_type,
        }
        record.update(self.compute_numeric_features(full_name))
        return record

    @property
    def transformation_type(self) -> str:
        """Identifier written to the ``transformation_type`` field."""
        return "reduced_native"
|
||||||
|
|||||||
+122
-168
@@ -10,189 +10,143 @@ from spacy.util import filter_spans
|
|||||||
|
|
||||||
from core.config import PipelineConfig
|
from core.config import PipelineConfig
|
||||||
from core.utils import get_data_file_path
|
from core.utils import get_data_file_path
|
||||||
|
from core.utils.data_loader import DataLoader
|
||||||
|
|
||||||
|
|
||||||
class NERDataBuilder:
    """Turn the engineered names CSV into NER training data (JSON + spaCy DocBin)."""

    def __init__(self, config: "PipelineConfig"):
        """Keep the pipeline configuration and create the CSV loader.

        Args:
            config: Pipeline configuration; ``build`` reads
                ``config.data.output_files`` and ``config.paths.data_dir``.
        """
        self.config = config
        self.data_loader = DataLoader(config)

    @staticmethod
    def _clean_text(value) -> str:
        """Return ``value`` as a stripped string, or ``""`` when it is missing."""
        if isinstance(value, str):
            return value.strip()
        if value is None or pd.isna(value):
            return ""
        return str(value).strip()

    @staticmethod
    def _parse_entities(series: pd.Series) -> pd.Series:
        """Parse every entity string in ``series`` into ``(start, end, label)`` tuples.

        Accepted encodings are Python literals (``[(0, 6, 'NATIVE')]`` or
        ``[[0, 6, 'NATIVE']]``) and JSON objects
        (``[{"start": 0, "end": 6, "label": "NATIVE"}]``). Anything missing,
        empty or unparsable yields an empty list instead of raising, so one
        malformed row cannot abort the whole build.
        """

        def _as_triples(items):
            # Only list/tuple containers of 3-item entries are usable.
            if not isinstance(items, (list, tuple)):
                return []
            return [
                tuple(item)
                for item in items
                if isinstance(item, (list, tuple)) and len(item) == 3
            ]

        def _parse(raw):
            if raw is None:
                return []
            # Stringify first: a truthiness test would raise on pd.NA.
            text = str(raw).strip()
            if text in ("", "[]", "nan", "<NA>"):
                return []
            try:
                if text.startswith("[{") and text.endswith("}]"):
                    return [(e["start"], e["end"], e["label"]) for e in json.loads(text)]
                return _as_triples(ast.literal_eval(text))
            except (ValueError, SyntaxError, KeyError, TypeError):
                # json.JSONDecodeError is a ValueError; KeyError/TypeError cover
                # JSON entries that lack a key or are not objects.
                return []

        return series.map(_parse)

    @staticmethod
    def _validate_entities(texts: pd.Series, entities_series: pd.Series) -> pd.Series:
        """Drop invalid or overlapping entities, row by row.

        An entity is kept when it is a 3-item list/tuple whose start/end
        convert to ``int``, whose label is a ``str``, whose span lies inside
        the stripped text and whose covered text is not blank. Survivors are
        sorted by ``(start, end)`` and an entity that starts before the
        previously kept one ends is discarded.

        Returns:
            A Series of entity lists carrying the index of ``texts``.
        """

        def _validate(text, entities):
            text = NERDataBuilder._clean_text(text)
            if not entities or not text:
                return []

            valid = []
            for ent in entities:
                if not isinstance(ent, (list, tuple)) or len(ent) != 3:
                    continue
                start, end, label = ent
                try:
                    start, end = int(start), int(end)
                except (ValueError, TypeError):
                    continue
                if not isinstance(label, str):
                    continue
                if not (0 <= start < end <= len(text)):
                    continue
                if not text[start:end].strip():
                    continue
                valid.append((start, end, label))

            valid.sort(key=lambda item: (item[0], item[1]))

            # Spans are sorted by start, so comparing against the end of the
            # last kept span is enough to detect an overlap.
            kept, last_end = [], -1
            for start, end, label in valid:
                if start >= last_end:
                    kept.append((start, end, label))
                    last_end = end
            return kept

        validated = [_validate(text, ents) for text, ents in zip(texts, entities_series)]
        return pd.Series(validated, index=texts.index, dtype=object)

    @staticmethod
    def _create_docs(nlp, texts, entities):
        """Create one spaCy Doc per text with its entity spans attached.

        Each span is requested with ``alignment_mode="contract"`` and then
        ``"strict"``; spans that cannot be created are silently left out.
        """
        docs = []
        for text, ents in zip(texts, entities):
            doc = nlp(text)
            spans = []
            for start, end, label in ents:
                span = doc.char_span(
                    start, end, label=label, alignment_mode="contract"
                ) or doc.char_span(start, end, label=label, alignment_mode="strict")
                if span:
                    spans.append(span)
            doc.ents = filter_spans(spans)
            docs.append(doc)
        return docs

    def build(self) -> int:
        """Build and save the NER training set from the engineered CSV.

        Rows with ``ner_tagged == 1`` are parsed and validated; rows left
        without any valid entity are dropped. The result is written as JSON
        and as a spaCy ``DocBin`` under ``config.paths.data_dir``.

        Returns:
            0 on success, 1 when there is no NER-tagged row or no row
            survives validation.
        """
        input_filepath = get_data_file_path(
            self.config.data.output_files["engineered"], self.config
        )
        df = self.data_loader.load_csv_complete(input_filepath)

        # Filter early and release the full frame.
        ner_df = df.loc[df["ner_tagged"] == 1, ["name", "ner_entities"]]
        del df
        if ner_df.empty:
            logging.error("No NER tagged data found")
            return 1

        tagged_rows = len(ner_df)
        logging.info(f"Found {tagged_rows} NER tagged entries")
        nlp = spacy.blank("fr")

        # Normalize names once so validation, the JSON output and the spaCy
        # docs all use the same string and entity offsets stay aligned.
        names = ner_df["name"].map(self._clean_text)
        parsed_entities = self._parse_entities(ner_df["ner_entities"])
        del ner_df
        validated_entities = self._validate_entities(names, parsed_entities)

        # Drop rows with no valid entities.
        mask = validated_entities.map(bool)
        names = names.loc[mask]
        validated_entities = validated_entities.loc[mask]

        if names.empty:
            logging.error("No valid training examples after validation")
            return 1

        texts = names.tolist()
        entity_lists = validated_entities.tolist()
        training_data = [
            (text, {"entities": ents}) for text, ents in zip(texts, entity_lists)
        ]

        docs = self._create_docs(nlp, texts, entity_lists)
        doc_bin = DocBin(docs=docs)

        json_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_data"]
        spacy_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_spacy"]

        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
        doc_bin.to_disk(spacy_path)

        # "Skipped" is relative to the NER-tagged rows; untagged rows were
        # never candidates.
        logging.info(
            f"Processed: {len(training_data)}, Skipped: {tagged_rows - len(training_data)}"
        )
        logging.info(f"Saved NER JSON to {json_path}")
        logging.info(f"Saved NER spacy to {spacy_path}")
        return 0
|
||||||
|
|||||||
@@ -1,9 +1,14 @@
|
|||||||
import random
|
import random
|
||||||
from typing import List
|
from typing import List
|
||||||
|
import logging
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from core.config import PipelineConfig
|
||||||
|
from core.utils import get_data_file_path
|
||||||
|
from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader
|
||||||
from processing.ner.formats.connectors_format import ConnectorFormatter
|
from processing.ner.formats.connectors_format import ConnectorFormatter
|
||||||
from processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
|
from processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
|
||||||
from processing.ner.formats.native_only_format import NativeOnlyFormatter
|
from processing.ner.formats.native_only_format import NativeOnlyFormatter
|
||||||
@@ -18,50 +23,64 @@ class NEREngineering:
|
|||||||
and encourage sequence characteristic learning.
|
and encourage sequence characteristic learning.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config: PipelineConfig):
    """Set up the loader, the word lists, the RNG seeds and one formatter per transformation.

    Args:
        config: Pipeline configuration; ``config.data.random_seed`` seeds
            both ``random`` and ``numpy.random``.
    """
    self.config = config
    self.data_loader = DataLoader(config)
    self.connectors = ["wa", "ya", "ka", "ba", "la"]
    self.additional_surnames = [
        "jean",
        "paul",
        "marie",
        "joseph",
        "pierre",
        "claude",
        "andre",
        "michel",
        "robert",
    ]

    seed = self.config.data.random_seed
    random.seed(seed)
    np.random.seed(seed)

    # One formatter instance per transformation type, all sharing the same
    # connector and surname lists.
    formatter_classes = (
        ("original", OriginalFormatter),
        ("native_only", NativeOnlyFormatter),
        ("position_flipped", PositionFlippedFormatter),
        ("reduced_native", ReducedNativeFormatter),
        ("connector_added", ConnectorFormatter),
        ("extended_surname", ExtendedSurnameFormatter),
    )
    self.formatters = {
        key: formatter_cls(self.connectors, self.additional_surnames)
        for key, formatter_cls in formatter_classes
    }
|
||||||
|
|
||||||
@classmethod
|
def load_data(self) -> pd.DataFrame:
|
||||||
def load_ner_data(cls, filepath: str) -> pd.DataFrame:
|
|
||||||
"""Load and filter NER-tagged data from CSV file"""
|
"""Load and filter NER-tagged data from CSV file"""
|
||||||
df = pd.read_csv(filepath)
|
|
||||||
|
filepath = get_data_file_path(self.config.data.output_files["featured"], self.config)
|
||||||
|
df = self.data_loader.load_csv_complete(filepath)
|
||||||
|
|
||||||
# Filter only NER-tagged rows
|
# Filter only NER-tagged rows
|
||||||
ner_data = df[df['ner_tagged'] == 1].copy()
|
ner_data = df[df["ner_tagged"] == 1].copy()
|
||||||
print(f"Loaded {len(ner_data)} NER-tagged records from {len(df)} total records")
|
logging.info(f"Loaded {len(ner_data)} NER-tagged records from {len(df)} total records")
|
||||||
|
|
||||||
return ner_data
|
return ner_data
|
||||||
|
|
||||||
def engineer_dataset(self, df: pd.DataFrame, random_seed: int = 42) -> pd.DataFrame:
|
def compute(self) -> None:
|
||||||
"""
|
logging.info("Applying feature engineering transformations...")
|
||||||
Apply feature engineering transformations according to the specified rules:
|
input_filepath = get_data_file_path(self.config.data.output_files["featured"], self.config)
|
||||||
- First 25%: original format
|
output_filepath = get_data_file_path(
|
||||||
- Second 25%: remove surname
|
self.config.data.output_files["engineered"], self.config
|
||||||
- Third 25%: flip positions
|
)
|
||||||
- Fourth 10%: reduce native components
|
|
||||||
- Fifth 10%: add connectors
|
|
||||||
- Last 5%: extend surnames
|
|
||||||
"""
|
|
||||||
random.seed(random_seed)
|
|
||||||
np.random.seed(random_seed)
|
|
||||||
|
|
||||||
# Shuffle the dataset
|
df = self.data_loader.load_csv_complete(input_filepath)
|
||||||
df_shuffled = df.sample(frac=1, random_state=random_seed).reset_index(drop=True)
|
ner_df = df[df["ner_tagged"] == 1].copy()
|
||||||
total_rows = len(df_shuffled)
|
logging.info(f"Loaded {len(ner_df)} NER-tagged records from {len(df)} total records")
|
||||||
|
|
||||||
|
del df # No need to keep in memory
|
||||||
|
|
||||||
|
ner_df = ner_df.sample(frac=1, random_state=self.config.data.random_seed).reset_index(
|
||||||
|
drop=True
|
||||||
|
)
|
||||||
|
total_rows = len(ner_df)
|
||||||
|
|
||||||
# Calculate split points
|
# Calculate split points
|
||||||
split_25_1 = int(total_rows * 0.25)
|
split_25_1 = int(total_rows * 0.25)
|
||||||
@@ -71,37 +90,31 @@ class NEREngineering:
|
|||||||
split_10_2 = int(total_rows * 0.95)
|
split_10_2 = int(total_rows * 0.95)
|
||||||
|
|
||||||
# Define transformation groups
|
# Define transformation groups
|
||||||
transformation_groups = [
|
groups = [
|
||||||
(0, split_25_1, 'original'),
|
(0, split_25_1, "original"), # First 25%: original format
|
||||||
(split_25_1, split_25_2, 'native_only'),
|
(split_25_1, split_25_2, "native_only"), # Second 25%: remove surname
|
||||||
(split_25_2, split_25_3, 'position_flipped'),
|
(split_25_2, split_25_3, "position_flipped"), # Third 25%: flip positions
|
||||||
(split_25_3, split_10_1, 'reduced_native'),
|
(split_25_3, split_10_1, "reduced_native"), # Fourth 10%: reduce native components
|
||||||
(split_10_1, split_10_2, 'connector_added'),
|
(split_10_1, split_10_2, "connector_added"), # Fifth 10%: add connectors
|
||||||
(split_10_2, total_rows, 'extended_surname')
|
(split_10_2, total_rows, "extended_surname"), # Last 5%: extend surnames
|
||||||
]
|
]
|
||||||
|
|
||||||
print("Dataset splits:")
|
for start, end, trans_type in groups:
|
||||||
for start, end, trans_type in transformation_groups:
|
logging.info(f"Group {trans_type}: {start} to {end} ({end - start} rows)")
|
||||||
print(f"Group {trans_type}: {start} to {end} ({end - start} rows)")
|
|
||||||
|
|
||||||
# Process each group
|
# Process each group
|
||||||
engineered_rows = []
|
rows = []
|
||||||
for start, end, formatter_key in transformation_groups:
|
for start, end, formatter_key in groups:
|
||||||
formatter = self.formatters[formatter_key]
|
formatter = self.formatters[formatter_key]
|
||||||
|
|
||||||
for idx in range(start, end):
|
for idx in tqdm(range(start, end), desc=f"Processing {formatter_key}"):
|
||||||
row = df_shuffled.iloc[idx]
|
row = ner_df.iloc[idx]
|
||||||
transformed = formatter.transform(row)
|
transformed = formatter.transform(row)
|
||||||
|
|
||||||
# Keep original columns and add transformed ones
|
# Keep original columns and add transformed ones
|
||||||
new_row = row.to_dict()
|
new_row = row.to_dict()
|
||||||
new_row.update(transformed)
|
new_row.update(transformed)
|
||||||
engineered_rows.append(new_row)
|
rows.append(new_row)
|
||||||
|
|
||||||
return pd.DataFrame(engineered_rows)
|
self.data_loader.save_csv(pd.DataFrame(rows), output_filepath)
|
||||||
|
logging.info(f"Engineered dataset saved to {output_filepath}")
|
||||||
@classmethod
|
|
||||||
def save_engineered_dataset(cls, df: pd.DataFrame, output_path: str):
|
|
||||||
"""Save the engineered dataset to CSV file"""
|
|
||||||
df.to_csv(output_path, index=False)
|
|
||||||
print(f"Engineered dataset saved to {output_path}")
|
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ class NERNameModel:
|
|||||||
|
|
||||||
logging.info(f"Loading training data from {data_path}")
|
logging.info(f"Loading training data from {data_path}")
|
||||||
|
|
||||||
with open(data_path, 'r', encoding='utf-8') as f:
|
with open(data_path, "r", encoding="utf-8") as f:
|
||||||
raw_data = json.load(f)
|
raw_data = json.load(f)
|
||||||
|
|
||||||
# Validate and clean training data
|
# Validate and clean training data
|
||||||
@@ -58,7 +58,9 @@ class NERNameModel:
|
|||||||
for i, item in enumerate(raw_data):
|
for i, item in enumerate(raw_data):
|
||||||
try:
|
try:
|
||||||
if not isinstance(item, (list, tuple)) or len(item) != 2:
|
if not isinstance(item, (list, tuple)) or len(item) != 2:
|
||||||
logging.warning(f"Skipping invalid training example format at index {i}: {item}")
|
logging.warning(
|
||||||
|
f"Skipping invalid training example format at index {i}: {item}"
|
||||||
|
)
|
||||||
skipped_count += 1
|
skipped_count += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -83,20 +85,27 @@ class NERNameModel:
|
|||||||
# String format from tagger: "[(0, 6, 'NATIVE'), ...]"
|
# String format from tagger: "[(0, 6, 'NATIVE'), ...]"
|
||||||
try:
|
try:
|
||||||
import ast
|
import ast
|
||||||
|
|
||||||
entities = ast.literal_eval(entities_raw)
|
entities = ast.literal_eval(entities_raw)
|
||||||
if not isinstance(entities, list):
|
if not isinstance(entities, list):
|
||||||
logging.warning(f"Parsed entities is not a list at index {i}: {entities}")
|
logging.warning(
|
||||||
|
f"Parsed entities is not a list at index {i}: {entities}"
|
||||||
|
)
|
||||||
skipped_count += 1
|
skipped_count += 1
|
||||||
continue
|
continue
|
||||||
except (ValueError, SyntaxError) as e:
|
except (ValueError, SyntaxError) as e:
|
||||||
logging.warning(f"Failed to parse entity string at index {i}: {entities_raw} ({e})")
|
logging.warning(
|
||||||
|
f"Failed to parse entity string at index {i}: {entities_raw} ({e})"
|
||||||
|
)
|
||||||
skipped_count += 1
|
skipped_count += 1
|
||||||
continue
|
continue
|
||||||
elif isinstance(entities_raw, list):
|
elif isinstance(entities_raw, list):
|
||||||
# Already in list format
|
# Already in list format
|
||||||
entities = entities_raw
|
entities = entities_raw
|
||||||
else:
|
else:
|
||||||
logging.warning(f"Skipping invalid entities format at index {i}: {entities_raw}")
|
logging.warning(
|
||||||
|
f"Skipping invalid entities format at index {i}: {entities_raw}"
|
||||||
|
)
|
||||||
skipped_count += 1
|
skipped_count += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -110,16 +119,20 @@ class NERNameModel:
|
|||||||
start, end, label = entity
|
start, end, label = entity
|
||||||
|
|
||||||
# Validate entity components
|
# Validate entity components
|
||||||
if (not isinstance(start, int) or not isinstance(end, int) or
|
if (
|
||||||
not isinstance(label, str) or start >= end or
|
not isinstance(start, int)
|
||||||
start < 0 or end > len(text)):
|
or not isinstance(end, int)
|
||||||
|
or not isinstance(label, str)
|
||||||
|
or start >= end
|
||||||
|
or start < 0
|
||||||
|
or end > len(text)
|
||||||
|
):
|
||||||
logging.warning(f"Skipping invalid entity bounds in '{text}': {entity}")
|
logging.warning(f"Skipping invalid entity bounds in '{text}': {entity}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Check for overlaps with already validated entities
|
# Check for overlaps with already validated entities
|
||||||
has_overlap = any(
|
has_overlap = any(
|
||||||
start < v_end and end > v_start
|
start < v_end and end > v_start for v_start, v_end, _ in valid_entities
|
||||||
for v_start, v_end, _ in valid_entities
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if has_overlap:
|
if has_overlap:
|
||||||
@@ -128,8 +141,10 @@ class NERNameModel:
|
|||||||
|
|
||||||
# Validate that the span doesn't contain spaces (matching tagger validation)
|
# Validate that the span doesn't contain spaces (matching tagger validation)
|
||||||
span_text = text[start:end]
|
span_text = text[start:end]
|
||||||
if not span_text or span_text != span_text.strip() or ' ' in span_text:
|
if not span_text or span_text != span_text.strip() or " " in span_text:
|
||||||
logging.warning(f"Skipping entity with spaces in '{text}': {entity} -> '{span_text}'")
|
logging.warning(
|
||||||
|
f"Skipping entity with spaces in '{text}': {entity} -> '{span_text}'"
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
valid_entities.append((start, end, label))
|
valid_entities.append((start, end, label))
|
||||||
@@ -148,7 +163,9 @@ class NERNameModel:
|
|||||||
skipped_count += 1
|
skipped_count += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
logging.info(f"Loaded {len(valid_data)} valid training examples, skipped {skipped_count} invalid ones")
|
logging.info(
|
||||||
|
f"Loaded {len(valid_data)} valid training examples, skipped {skipped_count} invalid ones"
|
||||||
|
)
|
||||||
|
|
||||||
if not valid_data:
|
if not valid_data:
|
||||||
raise ValueError("No valid training examples found in the data")
|
raise ValueError("No valid training examples found in the data")
|
||||||
@@ -156,15 +173,17 @@ class NERNameModel:
|
|||||||
return valid_data
|
return valid_data
|
||||||
|
|
||||||
def train(
|
def train(
|
||||||
self,
|
self,
|
||||||
data: List[Tuple[str, Dict]],
|
data: List[Tuple[str, Dict]],
|
||||||
epochs: int = 5,
|
epochs: int = 5,
|
||||||
batch_size: int = 16,
|
batch_size: int = 16,
|
||||||
dropout_rate: float = 0.2,
|
dropout_rate: float = 0.2,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Train the NER model"""
|
"""Train the NER model"""
|
||||||
logging.info(f"Starting NER training with {len(data)} examples")
|
logging.info(f"Starting NER training with {len(data)} examples")
|
||||||
logging.info(f"Training parameters: epochs={epochs}, batch_size={batch_size}, dropout={dropout_rate}")
|
logging.info(
|
||||||
|
f"Training parameters: epochs={epochs}, batch_size={batch_size}, dropout={dropout_rate}"
|
||||||
|
)
|
||||||
|
|
||||||
if self.nlp is None:
|
if self.nlp is None:
|
||||||
raise ValueError("Model not initialized. Call create_blank_model() first.")
|
raise ValueError("Model not initialized. Call create_blank_model() first.")
|
||||||
@@ -184,16 +203,15 @@ class NERNameModel:
|
|||||||
doc = self.nlp.make_doc(text)
|
doc = self.nlp.make_doc(text)
|
||||||
example = Example.from_dict(doc, annotations)
|
example = Example.from_dict(doc, annotations)
|
||||||
examples.append(example)
|
examples.append(example)
|
||||||
logging.info(f"Training example: {text[:30]}... with entities {annotations.get('entities', [])}")
|
logging.info(
|
||||||
|
f"Training example: {text[:30]}... with entities {annotations.get('entities', [])}"
|
||||||
|
)
|
||||||
|
|
||||||
# Train in batches
|
# Train in batches
|
||||||
batches = minibatch(examples, size=batch_size)
|
batches = minibatch(examples, size=batch_size)
|
||||||
for batch in batches:
|
for batch in batches:
|
||||||
self.nlp.update(
|
self.nlp.update(
|
||||||
batch,
|
batch, losses=losses, drop=dropout_rate, sgd=self.nlp.create_optimizer()
|
||||||
losses=losses,
|
|
||||||
drop=dropout_rate,
|
|
||||||
sgd=self.nlp.create_optimizer()
|
|
||||||
)
|
)
|
||||||
logging.info(f"Training batch with {len(batch)} examples, current losses: {losses}")
|
logging.info(f"Training batch with {len(batch)} examples, current losses: {losses}")
|
||||||
|
|
||||||
@@ -208,7 +226,7 @@ class NERNameModel:
|
|||||||
"training_examples": len(data),
|
"training_examples": len(data),
|
||||||
"loss_history": losses_history,
|
"loss_history": losses_history,
|
||||||
"batch_size": batch_size,
|
"batch_size": batch_size,
|
||||||
"dropout_rate": dropout_rate
|
"dropout_rate": dropout_rate,
|
||||||
}
|
}
|
||||||
|
|
||||||
logging.info(f"Training completed. Final loss: {self.training_stats['final_loss']:.4f}")
|
logging.info(f"Training completed. Final loss: {self.training_stats['final_loss']:.4f}")
|
||||||
@@ -225,7 +243,10 @@ class NERNameModel:
|
|||||||
predicted_entities = 0
|
predicted_entities = 0
|
||||||
actual_entities = 0
|
actual_entities = 0
|
||||||
|
|
||||||
entity_stats = {"NATIVE": {"tp": 0, "fp": 0, "fn": 0}, "SURNAME": {"tp": 0, "fp": 0, "fn": 0}}
|
entity_stats = {
|
||||||
|
"NATIVE": {"tp": 0, "fp": 0, "fn": 0},
|
||||||
|
"SURNAME": {"tp": 0, "fp": 0, "fn": 0},
|
||||||
|
}
|
||||||
|
|
||||||
for text, annotations in test_data:
|
for text, annotations in test_data:
|
||||||
# Get actual entities
|
# Get actual entities
|
||||||
@@ -259,7 +280,9 @@ class NERNameModel:
|
|||||||
# Calculate overall metrics
|
# Calculate overall metrics
|
||||||
precision = correct_entities / predicted_entities if predicted_entities > 0 else 0
|
precision = correct_entities / predicted_entities if predicted_entities > 0 else 0
|
||||||
recall = correct_entities / actual_entities if actual_entities > 0 else 0
|
recall = correct_entities / actual_entities if actual_entities > 0 else 0
|
||||||
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
|
f1_score = (
|
||||||
|
2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
|
||||||
|
)
|
||||||
|
|
||||||
# Calculate per-label metrics
|
# Calculate per-label metrics
|
||||||
label_metrics = {}
|
label_metrics = {}
|
||||||
@@ -268,14 +291,16 @@ class NERNameModel:
|
|||||||
label_precision = tp / (tp + fp) if (tp + fp) > 0 else 0
|
label_precision = tp / (tp + fp) if (tp + fp) > 0 else 0
|
||||||
label_recall = tp / (tp + fn) if (tp + fn) > 0 else 0
|
label_recall = tp / (tp + fn) if (tp + fn) > 0 else 0
|
||||||
label_f1 = (
|
label_f1 = (
|
||||||
2 * (label_precision * label_recall) / (label_precision + label_recall)) \
|
(2 * (label_precision * label_recall) / (label_precision + label_recall))
|
||||||
if (label_precision + label_recall) > 0 else 0
|
if (label_precision + label_recall) > 0
|
||||||
|
else 0
|
||||||
|
)
|
||||||
|
|
||||||
label_metrics[label] = {
|
label_metrics[label] = {
|
||||||
"precision": label_precision,
|
"precision": label_precision,
|
||||||
"recall": label_recall,
|
"recall": label_recall,
|
||||||
"f1_score": label_f1,
|
"f1_score": label_f1,
|
||||||
"support": tp + fn
|
"support": tp + fn,
|
||||||
}
|
}
|
||||||
|
|
||||||
evaluation_results = {
|
evaluation_results = {
|
||||||
@@ -286,9 +311,9 @@ class NERNameModel:
|
|||||||
"total_examples": total_examples,
|
"total_examples": total_examples,
|
||||||
"correct_entities": correct_entities,
|
"correct_entities": correct_entities,
|
||||||
"predicted_entities": predicted_entities,
|
"predicted_entities": predicted_entities,
|
||||||
"actual_entities": actual_entities
|
"actual_entities": actual_entities,
|
||||||
},
|
},
|
||||||
"by_label": label_metrics
|
"by_label": label_metrics,
|
||||||
}
|
}
|
||||||
|
|
||||||
logging.info(f"NER Evaluation completed. Overall F1: {f1_score:.4f}")
|
logging.info(f"NER Evaluation completed. Overall F1: {f1_score:.4f}")
|
||||||
@@ -309,7 +334,7 @@ class NERNameModel:
|
|||||||
|
|
||||||
# Save training statistics
|
# Save training statistics
|
||||||
stats_path = model_dir / "training_stats.json"
|
stats_path = model_dir / "training_stats.json"
|
||||||
with open(stats_path, 'w', encoding='utf-8') as f:
|
with open(stats_path, "w", encoding="utf-8") as f:
|
||||||
json.dump(self.training_stats, f, indent=2)
|
json.dump(self.training_stats, f, indent=2)
|
||||||
|
|
||||||
logging.info(f"NER Model saved to {model_dir}")
|
logging.info(f"NER Model saved to {model_dir}")
|
||||||
@@ -328,7 +353,7 @@ class NERNameModel:
|
|||||||
# Load training statistics if available
|
# Load training statistics if available
|
||||||
stats_path = Path(model_path) / "training_stats.json"
|
stats_path = Path(model_path) / "training_stats.json"
|
||||||
if stats_path.exists():
|
if stats_path.exists():
|
||||||
with open(stats_path, 'r', encoding='utf-8') as f:
|
with open(stats_path, "r", encoding="utf-8") as f:
|
||||||
self.training_stats = json.load(f)
|
self.training_stats = json.load(f)
|
||||||
|
|
||||||
logging.info("NER Model loaded successfully")
|
logging.info("NER Model loaded successfully")
|
||||||
@@ -342,15 +367,14 @@ class NERNameModel:
|
|||||||
entities = []
|
entities = []
|
||||||
|
|
||||||
for ent in doc.ents:
|
for ent in doc.ents:
|
||||||
entities.append({
|
entities.append(
|
||||||
"text": ent.text,
|
{
|
||||||
"label": ent.label_,
|
"text": ent.text,
|
||||||
"start": ent.start_char,
|
"label": ent.label_,
|
||||||
"end": ent.end_char,
|
"start": ent.start_char,
|
||||||
"confidence": getattr(ent, 'score', None) # If confidence scores are available
|
"end": ent.end_char,
|
||||||
})
|
"confidence": getattr(ent, "score", None), # If confidence scores are available
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
return {
|
return {"text": text, "entities": entities}
|
||||||
"text": text,
|
|
||||||
"entities": entities
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -3,7 +3,9 @@ import logging
|
|||||||
|
|
||||||
|
|
||||||
class NERNameTagger:
|
class NERNameTagger:
|
||||||
def tag_name(self, name: str, probable_native: str, probable_surname: str) -> Union[Dict[str, Any], None]:
|
def tag_name(
|
||||||
|
self, name: str, probable_native: str, probable_surname: str
|
||||||
|
) -> Union[Dict[str, Any], None]:
|
||||||
"""Create a single NER training example using probable_native and probable_surname"""
|
"""Create a single NER training example using probable_native and probable_surname"""
|
||||||
if not name or not probable_native or not probable_surname:
|
if not name or not probable_native or not probable_surname:
|
||||||
return None
|
return None
|
||||||
@@ -56,9 +58,10 @@ class NERNameTagger:
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
# Check if this is a word boundary match and doesn't overlap
|
# Check if this is a word boundary match and doesn't overlap
|
||||||
if (self._is_word_boundary_match(name, pos, end_pos) and
|
if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
|
||||||
not has_overlap(pos, end_pos)):
|
pos, end_pos
|
||||||
entities.append((pos, end_pos, 'NATIVE'))
|
):
|
||||||
|
entities.append((pos, end_pos, "NATIVE"))
|
||||||
used_spans.append((pos, end_pos))
|
used_spans.append((pos, end_pos))
|
||||||
break # Only take the first non-overlapping occurrence
|
break # Only take the first non-overlapping occurrence
|
||||||
|
|
||||||
@@ -84,16 +87,19 @@ class NERNameTagger:
|
|||||||
start_pos = pos + 1
|
start_pos = pos + 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if (self._is_word_boundary_match(name, pos, end_pos) and
|
if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
|
||||||
not has_overlap(pos, end_pos)):
|
pos, end_pos
|
||||||
entities.append((pos, end_pos, 'SURNAME'))
|
):
|
||||||
|
entities.append((pos, end_pos, "SURNAME"))
|
||||||
used_spans.append((pos, end_pos))
|
used_spans.append((pos, end_pos))
|
||||||
break
|
break
|
||||||
|
|
||||||
start_pos = pos + 1
|
start_pos = pos + 1
|
||||||
|
|
||||||
if not entities:
|
if not entities:
|
||||||
logging.warning(f"No valid entities found for name: '{name}' with native: '{probable_native}' and surname: '{probable_surname}'")
|
logging.warning(
|
||||||
|
f"No valid entities found for name: '{name}' with native: '{probable_native}' and surname: '{probable_surname}'"
|
||||||
|
)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Sort entities by position and validate
|
# Sort entities by position and validate
|
||||||
@@ -104,7 +110,9 @@ class NERNameTagger:
|
|||||||
for start, end, label in entities:
|
for start, end, label in entities:
|
||||||
# Check bounds
|
# Check bounds
|
||||||
if not (0 <= start < end <= len(name)):
|
if not (0 <= start < end <= len(name)):
|
||||||
logging.warning(f"Invalid span bounds ({start}, {end}) for text length {len(name)}: '{name}'")
|
logging.warning(
|
||||||
|
f"Invalid span bounds ({start}, {end}) for text length {len(name)}: '{name}'"
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Check for overlaps with already validated entities
|
# Check for overlaps with already validated entities
|
||||||
@@ -114,8 +122,10 @@ class NERNameTagger:
|
|||||||
|
|
||||||
# CRITICAL VALIDATION: Check that the span contains only the expected word (no spaces)
|
# CRITICAL VALIDATION: Check that the span contains only the expected word (no spaces)
|
||||||
span_text = name[start:end]
|
span_text = name[start:end]
|
||||||
if not span_text or span_text != span_text.strip() or ' ' in span_text:
|
if not span_text or span_text != span_text.strip() or " " in span_text:
|
||||||
logging.warning(f"Span contains spaces or is empty ({start}, {end}) in '{name}': '{span_text}'")
|
logging.warning(
|
||||||
|
f"Span contains spaces or is empty ({start}, {end}) in '{name}': '{span_text}'"
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
validated_entities.append((start, end, label))
|
validated_entities.append((start, end, label))
|
||||||
@@ -129,7 +139,7 @@ class NERNameTagger:
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
"entities": entities_str,
|
"entities": entities_str,
|
||||||
"spans": validated_entities # Keep the original tuples for internal use
|
"spans": validated_entities, # Keep the original tuples for internal use
|
||||||
}
|
}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -154,6 +164,7 @@ class NERNameTagger:
|
|||||||
"""Validate that entity annotations are correct for a given name"""
|
"""Validate that entity annotations are correct for a given name"""
|
||||||
try:
|
try:
|
||||||
import ast
|
import ast
|
||||||
|
|
||||||
entities = ast.literal_eval(entities_str)
|
entities = ast.literal_eval(entities_str)
|
||||||
|
|
||||||
# Check for overlaps and valid bounds
|
# Check for overlaps and valid bounds
|
||||||
@@ -182,10 +193,11 @@ class NERNameTagger:
|
|||||||
@classmethod
|
@classmethod
|
||||||
def extract_entity_text(cls, name: str, entities_str: str) -> Dict[str, List[str]]:
|
def extract_entity_text(cls, name: str, entities_str: str) -> Dict[str, List[str]]:
|
||||||
"""Extract the actual text for each entity type"""
|
"""Extract the actual text for each entity type"""
|
||||||
result = {'NATIVE': [], 'SURNAME': []}
|
result = {"NATIVE": [], "SURNAME": []}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import ast
|
import ast
|
||||||
|
|
||||||
entities = ast.literal_eval(entities_str)
|
entities = ast.literal_eval(entities_str)
|
||||||
|
|
||||||
for start, end, label in entities:
|
for start, end, label in entities:
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ import pandas as pd
|
|||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from core.config.pipeline_config import PipelineConfig
|
||||||
|
from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader
|
||||||
from processing.batch.batch_config import BatchConfig
|
from processing.batch.batch_config import BatchConfig
|
||||||
|
|
||||||
|
|
||||||
@@ -37,10 +38,11 @@ class PipelineStep(ABC):
|
|||||||
"""Abstract base class for pipeline steps"""
|
"""Abstract base class for pipeline steps"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, name: str, pipeline_config: PipelineConfig, batch_config: Optional[BatchConfig] = None
|
self, name: str, pipeline_config: PipelineConfig, batch_config: Optional[BatchConfig] = None
|
||||||
):
|
):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.pipeline_config = pipeline_config
|
self.pipeline_config = pipeline_config
|
||||||
|
self.data_loader = DataLoader(pipeline_config)
|
||||||
|
|
||||||
# Use provided batch_config or create default from pipeline config
|
# Use provided batch_config or create default from pipeline config
|
||||||
if batch_config is None:
|
if batch_config is None:
|
||||||
@@ -53,6 +55,11 @@ class PipelineStep(ABC):
|
|||||||
self.batch_config = batch_config
|
self.batch_config = batch_config
|
||||||
self.state = PipelineState()
|
self.state = PipelineState()
|
||||||
|
|
||||||
|
@property
|
||||||
|
def requires_batch_mutation(self) -> bool:
|
||||||
|
"""Indicates if this step modifies the batch data"""
|
||||||
|
return False
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
|
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
|
||||||
"""Process a single batch of data"""
|
"""Process a single batch of data"""
|
||||||
@@ -108,12 +115,12 @@ class PipelineStep(ABC):
|
|||||||
def save_batch(self, batch: pd.DataFrame, batch_id: int):
|
def save_batch(self, batch: pd.DataFrame, batch_id: int):
|
||||||
"""Save processed batch to checkpoint"""
|
"""Save processed batch to checkpoint"""
|
||||||
checkpoint_path = self.get_checkpoint_path(batch_id)
|
checkpoint_path = self.get_checkpoint_path(batch_id)
|
||||||
batch.to_csv(checkpoint_path, index=False)
|
self.data_loader.save_csv(batch, checkpoint_path)
|
||||||
logging.info(f"Saved batch {batch_id} to {checkpoint_path}")
|
logging.info(f"Saved batch {batch_id} to {checkpoint_path}")
|
||||||
|
|
||||||
def load_batch(self, batch_id: int) -> Optional[pd.DataFrame]:
|
def load_batch(self, batch_id: int) -> Optional[pd.DataFrame]:
|
||||||
"""Load processed batch from checkpoint"""
|
"""Load processed batch from checkpoint"""
|
||||||
checkpoint_path = self.get_checkpoint_path(batch_id)
|
checkpoint_path = self.get_checkpoint_path(batch_id)
|
||||||
if os.path.exists(checkpoint_path):
|
if os.path.exists(checkpoint_path):
|
||||||
return pd.read_csv(checkpoint_path)
|
return self.data_loader.load_csv_complete(checkpoint_path)
|
||||||
return None
|
return None
|
||||||
|
|||||||
@@ -2,11 +2,10 @@ import numpy as np
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from core.config.pipeline_config import PipelineConfig
|
||||||
from processing.steps.feature_extraction_step import Gender
|
from core.utils.region_mapper import RegionMapper
|
||||||
from core.utils.data_loader import DataLoader
|
|
||||||
|
|
||||||
from processing.batch.batch_config import BatchConfig
|
from processing.batch.batch_config import BatchConfig
|
||||||
from processing.steps import PipelineStep
|
from processing.steps import PipelineStep
|
||||||
|
from processing.steps.feature_extraction_step import Gender
|
||||||
|
|
||||||
|
|
||||||
class DataSplittingStep(PipelineStep):
|
class DataSplittingStep(PipelineStep):
|
||||||
@@ -20,7 +19,6 @@ class DataSplittingStep(PipelineStep):
|
|||||||
use_multiprocessing=False,
|
use_multiprocessing=False,
|
||||||
)
|
)
|
||||||
super().__init__("data_splitting", pipeline_config, batch_config)
|
super().__init__("data_splitting", pipeline_config, batch_config)
|
||||||
self.data_loader = DataLoader(pipeline_config)
|
|
||||||
self.eval_indices = None
|
self.eval_indices = None
|
||||||
|
|
||||||
def determine_eval_indices(self, total_size: int) -> set:
|
def determine_eval_indices(self, total_size: int) -> set:
|
||||||
@@ -33,9 +31,9 @@ class DataSplittingStep(PipelineStep):
|
|||||||
|
|
||||||
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
|
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
|
||||||
"""Process batch for data splitting - no modification needed"""
|
"""Process batch for data splitting - no modification needed"""
|
||||||
return batch.copy()
|
return batch
|
||||||
|
|
||||||
def save_splits(self, df: pd.DataFrame) -> None:
|
def split(self, df: pd.DataFrame) -> None:
|
||||||
"""Save the split datasets based on configuration"""
|
"""Save the split datasets based on configuration"""
|
||||||
output_files = self.pipeline_config.data.output_files
|
output_files = self.pipeline_config.data.output_files
|
||||||
data_dir = self.pipeline_config.paths.data_dir
|
data_dir = self.pipeline_config.paths.data_dir
|
||||||
@@ -52,9 +50,14 @@ class DataSplittingStep(PipelineStep):
|
|||||||
else:
|
else:
|
||||||
self.data_loader.save_csv(df, data_dir / output_files["featured"])
|
self.data_loader.save_csv(df, data_dir / output_files["featured"])
|
||||||
|
|
||||||
|
if self.pipeline_config.data.split_by_province:
|
||||||
|
for province in RegionMapper.get_provinces():
|
||||||
|
df_region = df[df.province == province]
|
||||||
|
self.data_loader.save_csv(df_region, data_dir / "provinces" / f"{province}.csv")
|
||||||
|
|
||||||
if self.pipeline_config.data.split_by_gender:
|
if self.pipeline_config.data.split_by_gender:
|
||||||
df_males = df[df["sex"] == Gender.MALE.value]
|
df_males = df[df.sex == Gender.MALE.value]
|
||||||
df_females = df[df["sex"] == Gender.FEMALE.value]
|
df_females = df[df.sex == Gender.FEMALE.value]
|
||||||
|
|
||||||
self.data_loader.save_csv(df_males, data_dir / output_files["males"])
|
self.data_loader.save_csv(df_males, data_dir / output_files["males"])
|
||||||
self.data_loader.save_csv(df_females, data_dir / output_files["females"])
|
self.data_loader.save_csv(df_females, data_dir / output_files["females"])
|
||||||
|
|||||||
@@ -1,5 +1,7 @@
|
|||||||
|
import gc
|
||||||
import logging
|
import logging
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
|
from typing import Dict, Any
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
@@ -27,10 +29,15 @@ class FeatureExtractionStep(PipelineStep):
|
|||||||
self.region_mapper = RegionMapper()
|
self.region_mapper = RegionMapper()
|
||||||
self.name_tagger = NERNameTagger()
|
self.name_tagger = NERNameTagger()
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def requires_batch_mutation(cls) -> bool:
|
||||||
|
"""This step creates new columns, so mutation is required"""
|
||||||
|
return True
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def validate_gender(cls, gender: str) -> Gender:
|
def validate_gender(cls, gender: str) -> Gender:
|
||||||
"""Validate and normalize gender value"""
|
"""Validate and normalize gender value"""
|
||||||
gender_lower = gender.lower().strip()
|
gender_lower = str(gender).lower().strip()
|
||||||
if gender_lower in ["m", "male", "homme", "masculin"]:
|
if gender_lower in ["m", "male", "homme", "masculin"]:
|
||||||
return Gender.MALE
|
return Gender.MALE
|
||||||
elif gender_lower in ["f", "female", "femme", "féminin"]:
|
elif gender_lower in ["f", "female", "femme", "féminin"]:
|
||||||
@@ -41,68 +48,144 @@ class FeatureExtractionStep(PipelineStep):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def get_name_category(cls, word_count: int) -> NameCategory:
|
def get_name_category(cls, word_count: int) -> NameCategory:
|
||||||
"""Determine name category based on word count"""
|
"""Determine name category based on word count"""
|
||||||
if word_count == 3:
|
return NameCategory.SIMPLE if word_count == 3 else NameCategory.COMPOSE
|
||||||
return NameCategory.SIMPLE
|
|
||||||
else:
|
|
||||||
return NameCategory.COMPOSE
|
|
||||||
|
|
||||||
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
|
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
|
||||||
"""Extract features from names in batch"""
|
"""Extract features from names in batch"""
|
||||||
logging.info(f"Extracting features for batch {batch_id} with {len(batch)} rows")
|
logging.info(f"Extracting features for batch {batch_id} with {len(batch)} rows")
|
||||||
|
|
||||||
batch = batch.copy()
|
result = batch.copy()
|
||||||
|
numeric_features = self._compute_numeric_features(result["name"])
|
||||||
|
result = result.assign(**numeric_features)
|
||||||
|
|
||||||
# Basic features
|
# Initialize features columns with optimal dtypes
|
||||||
batch["words"] = batch["name"].str.count(" ") + 1
|
features_columns = self._initialize_features_columns(len(result))
|
||||||
batch["length"] = batch["name"].str.len()
|
result = result.assign(**features_columns)
|
||||||
|
|
||||||
# Handle year column
|
self._assign_probable_names(result)
|
||||||
if "year" in batch.columns:
|
self._process_simple_names(result)
|
||||||
batch["year"] = pd.to_numeric(batch["year"], errors="coerce").astype("Int64")
|
result["identified_category"] = self._assign_identified_category(result["words"])
|
||||||
|
|
||||||
# Initialize new columns
|
if "year" in result.columns:
|
||||||
batch["probable_native"] = None
|
result["year"] = pd.to_numeric(result["year"], errors="coerce").astype("Int16")
|
||||||
batch["probable_surname"] = None
|
|
||||||
batch["identified_name"] = None
|
|
||||||
batch["identified_surname"] = None
|
|
||||||
batch["ner_entities"] = None
|
|
||||||
batch["ner_tagged"] = 0
|
|
||||||
batch["annotated"] = 0
|
|
||||||
|
|
||||||
# Vectorized category assignment
|
if "region" in result.columns:
|
||||||
batch["identified_category"] = batch["words"].apply(
|
result["province"] = self.region_mapper.map(result["region"])
|
||||||
lambda x: self.get_name_category(x).value
|
result["province"] = result["province"].astype("category")
|
||||||
|
|
||||||
|
if "sex" in result.columns:
|
||||||
|
result["sex"] = self._normalize_gender(result["sex"])
|
||||||
|
|
||||||
|
# Apply final dtype optimizations
|
||||||
|
result = self._optimize_dtypes(result)
|
||||||
|
|
||||||
|
# Cleanup
|
||||||
|
del numeric_features, features_columns
|
||||||
|
if batch_id % 10 == 0: # Periodic cleanup
|
||||||
|
gc.collect()
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def _compute_numeric_features(cls, series: pd.Series) -> Dict[str, pd.Series]:
|
||||||
|
"""Calculate basic features in vectorized manner"""
|
||||||
|
return {
|
||||||
|
"words": (series.str.count(" ") + 1).astype("Int8"),
|
||||||
|
"length": series.str.len().astype("Int16"),
|
||||||
|
}
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def _initialize_features_columns(cls, size: int) -> Dict[str, Any]:
|
||||||
|
"""Initialize new columns with optimal dtypes"""
|
||||||
|
return {
|
||||||
|
"probable_native": pd.Series([None] * size, dtype="string"),
|
||||||
|
"probable_surname": pd.Series([None] * size, dtype="string"),
|
||||||
|
"identified_name": pd.Series([None] * size, dtype="string"),
|
||||||
|
"identified_surname": pd.Series([None] * size, dtype="string"),
|
||||||
|
"ner_entities": pd.Series([None] * size, dtype="string"),
|
||||||
|
"ner_tagged": pd.Series([0] * size, dtype="Int8"),
|
||||||
|
"annotated": pd.Series([0] * size, dtype="Int8"),
|
||||||
|
}
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def _assign_probable_names(cls, df: pd.DataFrame) -> None:
|
||||||
|
"""Assign probable native and surname names efficiently"""
|
||||||
|
|
||||||
|
name_splits = df["name"].str.split()
|
||||||
|
mask = name_splits.str.len() >= 2
|
||||||
|
|
||||||
|
df.loc[mask, "probable_native"] = name_splits[mask].apply(
|
||||||
|
lambda x: " ".join(x[:-1]) if isinstance(x, list) else None
|
||||||
|
)
|
||||||
|
df.loc[mask, "probable_surname"] = name_splits[mask].apply(
|
||||||
|
lambda x: x[-1] if isinstance(x, list) else None
|
||||||
)
|
)
|
||||||
|
|
||||||
# Assign probable_native and probable_surname for all names
|
def _assign_identified_category(self, series: pd.Series) -> pd.Series:
|
||||||
name_splits = batch["name"].str.split()
|
"""Assign identified category based on word count"""
|
||||||
batch["probable_native"] = name_splits.apply(
|
return series.map(lambda x: self.get_name_category(x).value).astype("category")
|
||||||
lambda x: " ".join(x[:-1]) if isinstance(x, list) and len(x) >= 2 else None
|
|
||||||
)
|
|
||||||
batch["probable_surname"] = name_splits.apply(
|
|
||||||
lambda x: x[-1] if isinstance(x, list) and len(x) >= 2 else None
|
|
||||||
)
|
|
||||||
|
|
||||||
# Auto-assign for 3-word names
|
def _process_simple_names(self, df: pd.DataFrame) -> None:
|
||||||
three_word_mask = batch["words"] == 3
|
"""Process 3-word names efficiently with vectorized operations"""
|
||||||
batch.loc[three_word_mask, "identified_name"] = batch.loc[three_word_mask, "probable_native"]
|
mask = df["words"] == 3
|
||||||
batch.loc[three_word_mask, "identified_surname"] = batch.loc[three_word_mask, "probable_surname"]
|
|
||||||
batch.loc[three_word_mask, "annotated"] = 1
|
|
||||||
|
|
||||||
# Tag names with NER entities
|
if not mask.any():
|
||||||
three_word_rows = batch[three_word_mask]
|
return
|
||||||
|
|
||||||
|
df.loc[mask, "identified_name"] = df.loc[mask, "probable_native"]
|
||||||
|
df.loc[mask, "identified_surname"] = df.loc[mask, "probable_surname"]
|
||||||
|
df.loc[mask, "annotated"] = 1
|
||||||
|
|
||||||
|
# NER tagging for 3-word names
|
||||||
|
three_word_rows = df[mask]
|
||||||
for idx, row in three_word_rows.iterrows():
|
for idx, row in three_word_rows.iterrows():
|
||||||
entity = self.name_tagger.tag_name(row['name'], row['identified_name'], row['identified_surname'])
|
try:
|
||||||
|
entity = self.name_tagger.tag_name(
|
||||||
|
row["name"], row["identified_name"], row["identified_surname"]
|
||||||
|
)
|
||||||
|
|
||||||
if entity:
|
if entity:
|
||||||
batch.at[idx, "ner_entities"] = entity["entities"]
|
df.at[idx, "ner_entities"] = str(entity["entities"])
|
||||||
batch.at[idx, "ner_tagged"] = 1
|
df.at[idx, "ner_tagged"] = 1
|
||||||
|
except Exception as e:
|
||||||
|
logging.warning(f"NER tagging failed for row {idx}: {e}")
|
||||||
|
|
||||||
# Map regions to provinces
|
def _normalize_gender(self, series: pd.Series) -> pd.Series:
|
||||||
batch["province"] = self.region_mapper.map_regions_vectorized(batch["region"])
|
gender_mapping = {
|
||||||
|
"m": "m",
|
||||||
|
"male": "m",
|
||||||
|
"homme": "m",
|
||||||
|
"masculin": "m",
|
||||||
|
"f": "f",
|
||||||
|
"female": "f",
|
||||||
|
"femme": "f",
|
||||||
|
"féminin": "f",
|
||||||
|
}
|
||||||
|
|
||||||
# Normalize gender
|
# Apply mapping with error handling
|
||||||
if "sex" in batch.columns:
|
normalized = series.astype(str).str.lower().str.strip().map(gender_mapping)
|
||||||
batch["sex"] = batch["sex"].apply(lambda x: self.validate_gender(str(x)).value)
|
return normalized.astype("category")
|
||||||
|
|
||||||
return batch
|
@classmethod
|
||||||
|
def _optimize_dtypes(cls, df: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
categories = ["province", "identified_category", "sex"]
|
||||||
|
|
||||||
|
for col in categories:
|
||||||
|
if col in df.columns and df[col].dtype != "category":
|
||||||
|
df[col] = df[col].astype("category")
|
||||||
|
|
||||||
|
# Ensure string columns are proper string dtype
|
||||||
|
string_cols = [
|
||||||
|
"name",
|
||||||
|
"probable_native",
|
||||||
|
"probable_surname",
|
||||||
|
"identified_name",
|
||||||
|
"identified_surname",
|
||||||
|
"ner_entities",
|
||||||
|
]
|
||||||
|
|
||||||
|
for col in string_cols:
|
||||||
|
if col in df.columns and df[col].dtype == "object":
|
||||||
|
df[col] = df[col].astype("string")
|
||||||
|
|
||||||
|
return df
|
||||||
|
|||||||
@@ -24,8 +24,7 @@ class LLMAnnotationStep(PipelineStep):
|
|||||||
batch_config = BatchConfig(
|
batch_config = BatchConfig(
|
||||||
batch_size=pipeline_config.processing.batch_size,
|
batch_size=pipeline_config.processing.batch_size,
|
||||||
max_workers=min(
|
max_workers=min(
|
||||||
self.llm_config.max_concurrent_requests,
|
self.llm_config.max_concurrent_requests, pipeline_config.processing.max_workers
|
||||||
pipeline_config.processing.max_workers
|
|
||||||
),
|
),
|
||||||
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
|
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
|
||||||
use_multiprocessing=pipeline_config.processing.use_multiprocessing,
|
use_multiprocessing=pipeline_config.processing.use_multiprocessing,
|
||||||
@@ -98,7 +97,7 @@ class LLMAnnotationStep(PipelineStep):
|
|||||||
|
|
||||||
# Exponential backoff with jitter
|
# Exponential backoff with jitter
|
||||||
if attempt < self.llm_config.retry_attempts - 1:
|
if attempt < self.llm_config.retry_attempts - 1:
|
||||||
wait_time = (2 ** attempt) + (time.time() % 1)
|
wait_time = (2**attempt) + (time.time() % 1)
|
||||||
time.sleep(min(wait_time, 10))
|
time.sleep(min(wait_time, 10))
|
||||||
|
|
||||||
self.failed_requests += 1
|
self.failed_requests += 1
|
||||||
@@ -156,6 +155,8 @@ class LLMAnnotationStep(PipelineStep):
|
|||||||
batch.loc[idx, "annotated"] = 0
|
batch.loc[idx, "annotated"] = 0
|
||||||
|
|
||||||
# Ensure proper data types
|
# Ensure proper data types
|
||||||
batch["annotated"] = pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
|
batch["annotated"] = (
|
||||||
|
pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
|
||||||
|
)
|
||||||
|
|
||||||
return batch
|
return batch
|
||||||
|
|||||||
@@ -6,8 +6,8 @@ from typing import Dict
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from core.config.pipeline_config import PipelineConfig
|
||||||
from processing.steps import PipelineStep, NameAnnotation
|
|
||||||
from processing.ner.ner_name_model import NERNameModel
|
from processing.ner.ner_name_model import NERNameModel
|
||||||
|
from processing.steps import PipelineStep, NameAnnotation
|
||||||
|
|
||||||
|
|
||||||
class NERAnnotationStep(PipelineStep):
|
class NERAnnotationStep(PipelineStep):
|
||||||
@@ -63,7 +63,7 @@ class NERAnnotationStep(PipelineStep):
|
|||||||
|
|
||||||
# Get NER predictions
|
# Get NER predictions
|
||||||
prediction = self.ner_trainer.predict(name.lower())
|
prediction = self.ner_trainer.predict(name.lower())
|
||||||
entities = prediction.get('entities', [])
|
entities = prediction.get("entities", [])
|
||||||
|
|
||||||
elapsed_time = time.time() - start_time
|
elapsed_time = time.time() - start_time
|
||||||
|
|
||||||
@@ -72,15 +72,15 @@ class NERAnnotationStep(PipelineStep):
|
|||||||
surname_parts = []
|
surname_parts = []
|
||||||
|
|
||||||
for entity in entities:
|
for entity in entities:
|
||||||
if entity['label'] == 'NATIVE':
|
if entity["label"] == "NATIVE":
|
||||||
native_parts.append(entity['text'])
|
native_parts.append(entity["text"])
|
||||||
elif entity['label'] == 'SURNAME':
|
elif entity["label"] == "SURNAME":
|
||||||
surname_parts.append(entity['text'])
|
surname_parts.append(entity["text"])
|
||||||
|
|
||||||
# Create annotation result in same format as LLM step
|
# Create annotation result in same format as LLM step
|
||||||
annotation = NameAnnotation(
|
annotation = NameAnnotation(
|
||||||
identified_name=" ".join(native_parts) if native_parts else None,
|
identified_name=" ".join(native_parts) if native_parts else None,
|
||||||
identified_surname=" ".join(surname_parts) if surname_parts else None
|
identified_surname=" ".join(surname_parts) if surname_parts else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
result = {
|
result = {
|
||||||
@@ -159,6 +159,8 @@ class NERAnnotationStep(PipelineStep):
|
|||||||
batch.loc[idx, "annotated"] = 0
|
batch.loc[idx, "annotated"] = 0
|
||||||
|
|
||||||
# Ensure proper data types
|
# Ensure proper data types
|
||||||
batch["annotated"] = pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
|
batch["annotated"] = (
|
||||||
|
pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
|
||||||
|
)
|
||||||
|
|
||||||
return batch
|
return batch
|
||||||
|
|||||||
@@ -224,9 +224,9 @@ class ExperimentRunner:
|
|||||||
model.learning_curve_data = model_data.get("learning_curve_data", {})
|
model.learning_curve_data = model_data.get("learning_curve_data", {})
|
||||||
|
|
||||||
# Restore vectorizers and encoders for models that use them (like XGBoost)
|
# Restore vectorizers and encoders for models that use them (like XGBoost)
|
||||||
if "vectorizers" in model_data and hasattr(model, 'vectorizers'):
|
if "vectorizers" in model_data and hasattr(model, "vectorizers"):
|
||||||
model.vectorizers = model_data["vectorizers"]
|
model.vectorizers = model_data["vectorizers"]
|
||||||
if "label_encoders" in model_data and hasattr(model, 'label_encoders'):
|
if "label_encoders" in model_data and hasattr(model, "label_encoders"):
|
||||||
model.label_encoders = model_data["label_encoders"]
|
model.label_encoders = model_data["label_encoders"]
|
||||||
|
|
||||||
return model
|
return model
|
||||||
@@ -237,7 +237,9 @@ class ExperimentRunner:
|
|||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def compare_experiments(self, experiment_ids: List[str], metric: str = "accuracy") -> pd.DataFrame:
|
def compare_experiments(
|
||||||
|
self, experiment_ids: List[str], metric: str = "accuracy"
|
||||||
|
) -> pd.DataFrame:
|
||||||
"""Compare experiments and return analysis"""
|
"""Compare experiments and return analysis"""
|
||||||
comparison_df = self.tracker.compare_experiments(experiment_ids)
|
comparison_df = self.tracker.compare_experiments(experiment_ids)
|
||||||
|
|
||||||
|
|||||||
@@ -28,13 +28,13 @@ class ModelTrainer:
|
|||||||
self.models_dir.mkdir(parents=True, exist_ok=True)
|
self.models_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
def train_single_model(
|
def train_single_model(
|
||||||
self,
|
self,
|
||||||
model_name: str,
|
model_name: str,
|
||||||
model_type: str = "logistic_regression",
|
model_type: str = "logistic_regression",
|
||||||
features: List[str] = None,
|
features: List[str] = None,
|
||||||
model_params: Dict[str, Any] = None,
|
model_params: Dict[str, Any] = None,
|
||||||
tags: List[str] = None,
|
tags: List[str] = None,
|
||||||
save_artifacts: bool = True,
|
save_artifacts: bool = True,
|
||||||
) -> str:
|
) -> str:
|
||||||
"""
|
"""
|
||||||
Train a single model and save its artifacts.
|
Train a single model and save its artifacts.
|
||||||
@@ -76,10 +76,7 @@ class ModelTrainer:
|
|||||||
return experiment_id
|
return experiment_id
|
||||||
|
|
||||||
def train_multiple_models(
|
def train_multiple_models(
|
||||||
self,
|
self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True
|
||||||
base_name: str,
|
|
||||||
model_configs: List[Dict[str, Any]],
|
|
||||||
save_all: bool = True
|
|
||||||
) -> List[str]:
|
) -> List[str]:
|
||||||
"""
|
"""
|
||||||
Train multiple models with different configurations.
|
Train multiple models with different configurations.
|
||||||
|
|||||||
@@ -50,14 +50,18 @@ class LightGBMModel(TraditionalModel):
|
|||||||
self.vectorizers[feature_key] = CountVectorizer(
|
self.vectorizers[feature_key] = CountVectorizer(
|
||||||
analyzer="char", ngram_range=(2, 3), max_features=50
|
analyzer="char", ngram_range=(2, 3), max_features=50
|
||||||
)
|
)
|
||||||
char_features = self.vectorizers[feature_key].fit_transform(
|
char_features = (
|
||||||
column.fillna("").astype(str)
|
self.vectorizers[feature_key]
|
||||||
).toarray()
|
.fit_transform(column.fillna("").astype(str))
|
||||||
|
.toarray()
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
# Subsequent times - use existing vectorizer
|
# Subsequent times - use existing vectorizer
|
||||||
char_features = self.vectorizers[feature_key].transform(
|
char_features = (
|
||||||
column.fillna("").astype(str)
|
self.vectorizers[feature_key]
|
||||||
).toarray()
|
.transform(column.fillna("").astype(str))
|
||||||
|
.toarray()
|
||||||
|
)
|
||||||
|
|
||||||
features.append(char_features)
|
features.append(char_features)
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -20,9 +20,7 @@ class LogisticRegressionModel(TraditionalModel):
|
|||||||
)
|
)
|
||||||
|
|
||||||
classifier = LogisticRegression(
|
classifier = LogisticRegression(
|
||||||
max_iter=params.get("max_iter", 1000),
|
max_iter=params.get("max_iter", 1000), random_state=self.config.random_seed, verbose=2
|
||||||
random_state=self.config.random_seed,
|
|
||||||
verbose=2
|
|
||||||
)
|
)
|
||||||
|
|
||||||
return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])
|
return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ class RandomForestModel(TraditionalModel):
|
|||||||
n_estimators=params.get("n_estimators", 100),
|
n_estimators=params.get("n_estimators", 100),
|
||||||
max_depth=params.get("max_depth", None),
|
max_depth=params.get("max_depth", None),
|
||||||
random_state=self.config.random_seed,
|
random_state=self.config.random_seed,
|
||||||
verbose=2
|
verbose=2,
|
||||||
)
|
)
|
||||||
|
|
||||||
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
|
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ class SVMModel(TraditionalModel):
|
|||||||
gamma=params.get("gamma", "scale"),
|
gamma=params.get("gamma", "scale"),
|
||||||
probability=True, # Enable probability prediction
|
probability=True, # Enable probability prediction
|
||||||
random_state=self.config.random_seed,
|
random_state=self.config.random_seed,
|
||||||
verbose=2
|
verbose=2,
|
||||||
)
|
)
|
||||||
|
|
||||||
return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])
|
return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ class XGBoostModel(TraditionalModel):
|
|||||||
colsample_bytree=params.get("colsample_bytree", 0.8),
|
colsample_bytree=params.get("colsample_bytree", 0.8),
|
||||||
random_state=self.config.random_seed,
|
random_state=self.config.random_seed,
|
||||||
eval_metric="logloss",
|
eval_metric="logloss",
|
||||||
verbosity=2
|
verbosity=2,
|
||||||
)
|
)
|
||||||
|
|
||||||
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
|
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
|
||||||
@@ -50,14 +50,18 @@ class XGBoostModel(TraditionalModel):
|
|||||||
self.vectorizers[feature_key] = CountVectorizer(
|
self.vectorizers[feature_key] = CountVectorizer(
|
||||||
analyzer="char", ngram_range=(2, 3), max_features=100
|
analyzer="char", ngram_range=(2, 3), max_features=100
|
||||||
)
|
)
|
||||||
char_features = self.vectorizers[feature_key].fit_transform(
|
char_features = (
|
||||||
column.fillna("").astype(str)
|
self.vectorizers[feature_key]
|
||||||
).toarray()
|
.fit_transform(column.fillna("").astype(str))
|
||||||
|
.toarray()
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
# Subsequent times - use existing vectorizer
|
# Subsequent times - use existing vectorizer
|
||||||
char_features = self.vectorizers[feature_key].transform(
|
char_features = (
|
||||||
column.fillna("").astype(str)
|
self.vectorizers[feature_key]
|
||||||
).toarray()
|
.transform(column.fillna("").astype(str))
|
||||||
|
.toarray()
|
||||||
|
)
|
||||||
|
|
||||||
features.append(char_features)
|
features.append(char_features)
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -59,7 +59,9 @@ class NeuralNetworkModel(BaseModel):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Train the neural network
|
# Train the neural network
|
||||||
logging.info(f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features")
|
logging.info(
|
||||||
|
f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features"
|
||||||
|
)
|
||||||
history = self.model.fit(
|
history = self.model.fit(
|
||||||
X_prepared,
|
X_prepared,
|
||||||
y_encoded,
|
y_encoded,
|
||||||
@@ -162,7 +164,11 @@ class NeuralNetworkModel(BaseModel):
|
|||||||
|
|
||||||
# Split data once for validation
|
# Split data once for validation
|
||||||
X_train_full, X_val, y_train_full, y_val = train_test_split(
|
X_train_full, X_val, y_train_full, y_val = train_test_split(
|
||||||
X_prepared, y_encoded, test_size=0.2, random_state=self.config.random_seed, stratify=y_encoded
|
X_prepared,
|
||||||
|
y_encoded,
|
||||||
|
test_size=0.2,
|
||||||
|
random_state=self.config.random_seed,
|
||||||
|
stratify=y_encoded,
|
||||||
)
|
)
|
||||||
|
|
||||||
for size in train_sizes:
|
for size in train_sizes:
|
||||||
|
|||||||
@@ -55,7 +55,9 @@ class TraditionalModel(BaseModel):
|
|||||||
logging.info(f"Fitting model with {X_prepared.shape[0]} samples (text features)")
|
logging.info(f"Fitting model with {X_prepared.shape[0]} samples (text features)")
|
||||||
else:
|
else:
|
||||||
# For numerical features
|
# For numerical features
|
||||||
logging.info(f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features")
|
logging.info(
|
||||||
|
f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features"
|
||||||
|
)
|
||||||
|
|
||||||
self.model.fit(X_prepared, y_encoded)
|
self.model.fit(X_prepared, y_encoded)
|
||||||
self.is_fitted = True
|
self.is_fitted = True
|
||||||
|
|||||||
@@ -3,8 +3,8 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
import traceback
|
import traceback
|
||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from core.config import setup_config
|
from core.config import setup_config
|
||||||
from research.model_trainer import ModelTrainer
|
from research.model_trainer import ModelTrainer
|
||||||
@@ -13,7 +13,7 @@ from research.model_trainer import ModelTrainer
|
|||||||
def load_research_templates(templates_path: str = "config/research_templates.yaml") -> dict:
|
def load_research_templates(templates_path: str = "config/research_templates.yaml") -> dict:
|
||||||
"""Load research templates from YAML file"""
|
"""Load research templates from YAML file"""
|
||||||
try:
|
try:
|
||||||
with open(templates_path, 'r') as file:
|
with open(templates_path, "r") as file:
|
||||||
return yaml.safe_load(file)
|
return yaml.safe_load(file)
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
logging.error(f"Templates file not found: {templates_path}")
|
logging.error(f"Templates file not found: {templates_path}")
|
||||||
@@ -30,13 +30,15 @@ def find_experiment_config(templates: dict, name: str, experiment_type: str) ->
|
|||||||
"baseline": "baseline_experiments",
|
"baseline": "baseline_experiments",
|
||||||
"advanced": "advanced_experiments",
|
"advanced": "advanced_experiments",
|
||||||
"feature_study": "feature_studies",
|
"feature_study": "feature_studies",
|
||||||
"tuning": "hyperparameter_tuning"
|
"tuning": "hyperparameter_tuning",
|
||||||
}
|
}
|
||||||
|
|
||||||
section_name = type_mapping.get(experiment_type)
|
section_name = type_mapping.get(experiment_type)
|
||||||
if not section_name:
|
if not section_name:
|
||||||
available_types = list(type_mapping.keys())
|
available_types = list(type_mapping.keys())
|
||||||
raise ValueError(f"Unknown experiment type '{experiment_type}'. Available types: {available_types}")
|
raise ValueError(
|
||||||
|
f"Unknown experiment type '{experiment_type}'. Available types: {available_types}"
|
||||||
|
)
|
||||||
|
|
||||||
if section_name not in templates:
|
if section_name not in templates:
|
||||||
raise ValueError(f"Section '{section_name}' not found in templates")
|
raise ValueError(f"Section '{section_name}' not found in templates")
|
||||||
@@ -47,16 +49,22 @@ def find_experiment_config(templates: dict, name: str, experiment_type: str) ->
|
|||||||
for experiment in experiments:
|
for experiment in experiments:
|
||||||
# Check if this is the experiment we're looking for
|
# Check if this is the experiment we're looking for
|
||||||
# Look for experiments that match the model type or contain the name
|
# Look for experiments that match the model type or contain the name
|
||||||
if (experiment.get("model_type") == name or
|
if (
|
||||||
name.lower() in experiment.get("name", "").lower() or
|
experiment.get("model_type") == name
|
||||||
f"baseline_{name}" == experiment.get("name") or
|
or name.lower() in experiment.get("name", "").lower()
|
||||||
f"advanced_{name}" == experiment.get("name")):
|
or f"baseline_{name}" == experiment.get("name")
|
||||||
|
or f"advanced_{name}" == experiment.get("name")
|
||||||
|
):
|
||||||
return experiment
|
return experiment
|
||||||
|
|
||||||
# If not found, list available experiments
|
# If not found, list available experiments
|
||||||
available_experiments = [exp.get("name", exp.get("model_type", "unknown")) for exp in experiments]
|
available_experiments = [
|
||||||
raise ValueError(f"Experiment '{name}' not found in '{experiment_type}' section. "
|
exp.get("name", exp.get("model_type", "unknown")) for exp in experiments
|
||||||
f"Available experiments: {available_experiments}")
|
]
|
||||||
|
raise ValueError(
|
||||||
|
f"Experiment '{name}' not found in '{experiment_type}' section. "
|
||||||
|
f"Available experiments: {available_experiments}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
@@ -91,7 +99,7 @@ def main():
|
|||||||
model_type=experiment_config.get("model_type"),
|
model_type=experiment_config.get("model_type"),
|
||||||
features=experiment_config.get("features"),
|
features=experiment_config.get("features"),
|
||||||
model_params=experiment_config.get("model_params", {}),
|
model_params=experiment_config.get("model_params", {}),
|
||||||
tags=experiment_config.get("tags", [])
|
tags=experiment_config.get("tags", []),
|
||||||
)
|
)
|
||||||
|
|
||||||
logging.info("Training completed successfully!")
|
logging.info("Training completed successfully!")
|
||||||
|
|||||||
Reference in New Issue
Block a user