feat: add osm data

This commit is contained in:
2025-09-21 16:23:44 +02:00
parent 63e23d6600
commit c1b502c878
39 changed files with 955 additions and 338 deletions
+35 -32
View File
@@ -62,7 +62,7 @@ stages:
**Running the Pipeline** **Running the Pipeline**
```bash ```bash
python main.py --env development python main.py --env production
``` ```
## NER Processing (Optional) ## NER Processing (Optional)
@@ -72,7 +72,7 @@ Its main objective is to accurately identify and tag the different components of
specifically distinguishing between the native part and the surname. specifically distinguishing between the native part and the surname.
```bash ```bash
python ner.py --env development python ner.py --env production
``` ```
Once you've built and trained the NER model, you can use it to annotate **COMPOSE** names in the original dataset Once you've built and trained the NER model, you can use it to annotate **COMPOSE** names in the original dataset
@@ -105,54 +105,54 @@ you can define model features, training parameters, and evaluation metrics in th
```bash ```bash
# bigru # bigru
python train.py --name="bigru" --type="baseline" --env="development" python train.py --name="bigru" --type="baseline" --env="production"
python train.py --name="bigru_native" --type="baseline" --env="development" python train.py --name="bigru_native" --type="baseline" --env="production"
python train.py --name="bigru_surname" --type="baseline" --env="development" python train.py --name="bigru_surname" --type="baseline" --env="production"
# cnn # cnn
python train.py --name="cnn" --type="baseline" --env="development" python train.py --name="cnn" --type="baseline" --env="production"
python train.py --name="cnn_native" --type="baseline" --env="development" python train.py --name="cnn_native" --type="baseline" --env="production"
python train.py --name="cnn_surname" --type="baseline" --env="development" python train.py --name="cnn_surname" --type="baseline" --env="production"
# lightgbm # lightgbm
python train.py --name="lightgbm" --type="baseline" --env="development" python train.py --name="lightgbm" --type="baseline" --env="production"
python train.py --name="lightgbm_native" --type="baseline" --env="development" python train.py --name="lightgbm_native" --type="baseline" --env="production"
python train.py --name="lightgbm_surname" --type="baseline" --env="development" python train.py --name="lightgbm_surname" --type="baseline" --env="production"
# logistic regression # logistic regression
python train.py --name="logistic_regression" --type="baseline" --env="development" python train.py --name="logistic_regression" --type="baseline" --env="production"
python train.py --name="logistic_regression_native" --type="baseline" --env="development" python train.py --name="logistic_regression_native" --type="baseline" --env="production"
python train.py --name="logistic_regression_surname" --type="baseline" --env="development" python train.py --name="logistic_regression_surname" --type="baseline" --env="production"
# lstm # lstm
python train.py --name="lstm" --type="baseline" --env="development" python train.py --name="lstm" --type="baseline" --env="production"
python train.py --name="lstm_native" --type="baseline" --env="development" python train.py --name="lstm_native" --type="baseline" --env="production"
python train.py --name="lstm_surname" --type="baseline" --env="development" python train.py --name="lstm_surname" --type="baseline" --env="production"
# random forest # random forest
python train.py --name="random_forest" --type="baseline" --env="development" python train.py --name="random_forest" --type="baseline" --env="production"
python train.py --name="random_forest_native" --type="baseline" --env="development" python train.py --name="random_forest_native" --type="baseline" --env="production"
python train.py --name="random_forest_surname" --type="baseline" --env="development" python train.py --name="random_forest_surname" --type="baseline" --env="production"
# svm # svm
python train.py --name="svm" --type="baseline" --env="development" python train.py --name="svm" --type="baseline" --env="production"
python train.py --name="svm_native" --type="baseline" --env="development" python train.py --name="svm_native" --type="baseline" --env="production"
python train.py --name="svm_surname" --type="baseline" --env="development" python train.py --name="svm_surname" --type="baseline" --env="production"
# naive bayes # naive bayes
python train.py --name="naive_bayes" --type="baseline" --env="development" python train.py --name="naive_bayes" --type="baseline" --env="production"
python train.py --name="naive_bayes_native" --type="baseline" --env="development" python train.py --name="naive_bayes_native" --type="baseline" --env="production"
python train.py --name="naive_bayes_surname" --type="baseline" --env="development" python train.py --name="naive_bayes_surname" --type="baseline" --env="production"
# transformer # transformer
python train.py --name="transformer" --type="baseline" --env="development" python train.py --name="transformer" --type="baseline" --env="production"
python train.py --name="transformer_native" --type="baseline" --env="development" python train.py --name="transformer_native" --type="baseline" --env="production"
python train.py --name="transformer_surname" --type="baseline" --env="development" python train.py --name="transformer_surname" --type="baseline" --env="production"
# xgboost # xgboost
python train.py --name="xgboost" --type="baseline" --env="development" python train.py --name="xgboost" --type="baseline" --env="production"
python train.py --name="xgboost_native" --type="baseline" --env="development" python train.py --name="xgboost_native" --type="baseline" --env="production"
python train.py --name="xgboost_surname" --type="baseline" --env="development" python train.py --name="xgboost_surname" --type="baseline" --env="production"
``` ```
## Web Interface ## Web Interface
@@ -171,3 +171,6 @@ streamlit run web/app.py
<a href="https://github.com/bernard-ng/drc-ners-nlp/graphs/contributors" title="show all contributors"> <a href="https://github.com/bernard-ng/drc-ners-nlp/graphs/contributors" title="show all contributors">
<img src="https://contrib.rocks/image?repo=bernard-ng/drc-ners-nlp" alt="contributors"/> <img src="https://contrib.rocks/image?repo=bernard-ng/drc-ners-nlp" alt="contributors"/>
</a> </a>
## Acknowledgements
- Map Visualization: [https://data.humdata.org/dataset/anciennes-provinces-rdc-old-provinces-drc](https://data.humdata.org/dataset/anciennes-provinces-rdc-old-provinces-drc)
+1
View File
@@ -11,6 +11,7 @@ processing:
# Pipeline stages # Pipeline stages
stages: stages:
- "data_cleaning" - "data_cleaning"
- "data_selection"
- "feature_extraction" - "feature_extraction"
#- "ner_annotation" #- "ner_annotation"
#- "llm_annotation" #- "llm_annotation"
+6 -5
View File
@@ -3,17 +3,18 @@ debug: false
# Processing settings # Processing settings
processing: processing:
batch_size: 10_000 batch_size: 100_000
max_workers: 8 max_workers: 4
checkpoint_interval: 10 checkpoint_interval: 10
use_multiprocessing: true use_multiprocessing: true
# Pipeline stages # Pipeline stages
stages: stages:
- "data_cleaning" - "data_cleaning"
- "data_selection"
- "feature_extraction" - "feature_extraction"
- "ner_annotation" # - "ner_annotation"
- "llm_annotation" # - "llm_annotation"
- "data_splitting" - "data_splitting"
# Production LLM settings # Production LLM settings
@@ -34,7 +35,7 @@ data:
# Production logging (less verbose) # Production logging (less verbose)
logging: logging:
level: "INFO" level: "INFO"
console_logging: false console_logging: true
file_logging: true file_logging: true
log_file: "pipeline.production.log" log_file: "pipeline.production.log"
max_log_size: 52428800 # 50MB max_log_size: 52428800 # 50MB
+6
View File
@@ -21,6 +21,7 @@ paths:
# List of stages in the processing pipeline # List of stages in the processing pipeline
stages: stages:
- "data_cleaning" # Data cleaning stage - "data_cleaning" # Data cleaning stage
- "data_selection" # Data selection stage - keep only required columns
- "feature_extraction" # Feature extraction stage - "feature_extraction" # Feature extraction stage
- "ner_annotation" # NER-based annotation stage - "ner_annotation" # NER-based annotation stage
- "llm_annotation" # LLM annotation stage (computational intensive) - "llm_annotation" # LLM annotation stage (computational intensive)
@@ -64,6 +65,11 @@ data:
females: "names_females.csv" # Output files for female names females: "names_females.csv" # Output files for female names
ner_data: "names_ner.json" # Output file for NER annotated data ner_data: "names_ner.json" # Output file for NER annotated data
ner_spacy: "names_ner.spacy" # Output file for NER annotated data using spaCy format ner_spacy: "names_ner.spacy" # Output file for NER annotated data using spaCy format
selected_columns: # Required columns for processing
- name
- sex
- region
- year
split_evaluation: false # Should the dataset be split into training and evaluation sets ? split_evaluation: false # Should the dataset be split into training and evaluation sets ?
split_by_gender: true # Should the dataset be split by gender ? split_by_gender: true # Should the dataset be split by gender ?
split_by_province: true # Should the dataset be split by province ? split_by_province: true # Should the dataset be split by province ?
+1
View File
@@ -19,6 +19,7 @@ class DataConfig(BaseModel):
"ner_spacy": "names_ner.spacy", "ner_spacy": "names_ner.spacy",
} }
) )
selected_columns: list[str] = field(default=["name", "sex", "region"])
split_evaluation: bool = False split_evaluation: bool = False
split_by_province: bool = True split_by_province: bool = True
split_by_gender: bool = True split_by_gender: bool = True
+2 -4
View File
@@ -8,12 +8,10 @@ class RegionMapper:
def __init__(self, mapping: Optional[Dict] = None): def __init__(self, mapping: Optional[Dict] = None):
self.mapping = mapping or REGION_MAPPING self.mapping = mapping or REGION_MAPPING
self.mapping = {k.lower(): v[1].upper() for k, v in self.mapping.items()}
def map(self, series: pd.Series) -> pd.Series: def map(self, series: pd.Series) -> pd.Series:
"""Vectorized region to province mapping""" return series.str.lower().map(self.mapping).fillna("AUTRES")
return series.str.lower().map(
lambda r: self.mapping.get(r, ("AUTRES", "AUTRES"))[1].lower()
)
@staticmethod @staticmethod
def get_provinces(): def get_provinces():
+2 -3
View File
@@ -30,9 +30,8 @@ class TextCleaner:
def clean_dataframe_text_columns(self, df: pd.DataFrame) -> pd.DataFrame: def clean_dataframe_text_columns(self, df: pd.DataFrame) -> pd.DataFrame:
"""Clean all text columns in a DataFrame""" """Clean all text columns in a DataFrame"""
df = df.copy() df = df.copy()
text_columns = df.select_dtypes(include="object").columns columns = df.select_dtypes(include=["object", "string"]).columns
for col in columns:
for col in text_columns:
df[col] = self.clean_text_series(df[col]) df[col] = self.clean_text_series(df[col])
return df return df
+3 -2
View File
@@ -9,9 +9,9 @@ from core.utils.data_loader import DataLoader
from processing.batch.batch_config import BatchConfig from processing.batch.batch_config import BatchConfig
from processing.pipeline import Pipeline from processing.pipeline import Pipeline
from processing.steps.data_cleaning_step import DataCleaningStep from processing.steps.data_cleaning_step import DataCleaningStep
from processing.steps.data_selection_step import DataSelectionStep
from processing.steps.data_splitting_step import DataSplittingStep from processing.steps.data_splitting_step import DataSplittingStep
from processing.steps.feature_extraction_step import FeatureExtractionStep from processing.steps.feature_extraction_step import FeatureExtractionStep
from processing.steps.llm_annotation_step import LLMAnnotationStep
def create_pipeline(config) -> Pipeline: def create_pipeline(config) -> Pipeline:
@@ -28,8 +28,9 @@ def create_pipeline(config) -> Pipeline:
steps = [ steps = [
DataCleaningStep(config), DataCleaningStep(config),
FeatureExtractionStep(config), FeatureExtractionStep(config),
DataSelectionStep(config),
# NERAnnotationStep(config), # NERAnnotationStep(config),
LLMAnnotationStep(config), # LLMAnnotationStep(config),
] ]
for stage in config.stages: for stage in config.stages:
+1
View File
@@ -11,6 +11,7 @@ from processing.monitoring.pipeline_monitor import PipelineMonitor
def main(): def main():
choices = [ choices = [
"data_cleaning", "data_cleaning",
"data_selection",
"feature_extraction", "feature_extraction",
"ner_annotation", "ner_annotation",
"llm_annotation", "llm_annotation",
+99 -93
View File
File diff suppressed because one or more lines are too long
+430 -98
View File
File diff suppressed because one or more lines are too long
+107
View File
@@ -0,0 +1,107 @@
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "# Qualitative Analysis",
"id": "d20715dd63f57364"
},
{
"cell_type": "code",
"id": "c93a55c8",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-21T13:34:50.973298Z",
"start_time": "2025-09-21T13:34:50.969142Z"
}
},
"source": [
"import pandas as pd\n",
"import geopandas as gpd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import sys\n",
"import os\n",
"\n",
"sys.path.append(os.path.abspath(\"..\"))\n",
"from core.utils.data_loader import DataLoader\n",
"from core.config.pipeline_config import PipelineConfig"
],
"outputs": [],
"execution_count": 3
},
{
"cell_type": "code",
"id": "c0b00261",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-21T13:34:51.002610Z",
"start_time": "2025-09-21T13:34:50.998586Z"
}
},
"source": [
"config = PipelineConfig(\n",
" paths={\n",
" \"root_dir\": \"../data\",\n",
" \"data_dir\": \"../data/dataset\",\n",
" \"models_dir\": \"../models\",\n",
" \"outputs_dir\": \"../data/processed\",\n",
" \"logs_dir\": \"../logs\",\n",
" \"configs_dir\": \"../configs\",\n",
" \"checkpoints_dir\": \"../checkpoints\"\n",
" }\n",
")\n",
"\n",
"loader = DataLoader(config)"
],
"outputs": [],
"execution_count": 4
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-21T13:35:27.430639Z",
"start_time": "2025-09-21T13:34:51.013412Z"
}
},
"cell_type": "code",
"outputs": [],
"execution_count": 5,
"source": [
"gdf = gpd.read_file(\"../osm/provinces.shp\")\n",
"gdf_proj = gdf.to_crs(epsg=32732)\n",
"gdf['centroid'] = gdf_proj.geometry.centroid.to_crs(gdf.crs)\n",
"\n",
"df = loader.load_csv_complete(config.paths.data_dir / \"names_featured.csv\")"
],
"id": "b38394ce38864379"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Exploration",
"id": "a1af5626d2a948d6"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
+107
View File
@@ -0,0 +1,107 @@
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "# Quantitative Analysis",
"id": "a605c0f92056a825"
},
{
"cell_type": "code",
"id": "c93a55c8",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-21T14:14:47.287549Z",
"start_time": "2025-09-21T14:14:47.279199Z"
}
},
"source": [
"import pandas as pd\n",
"import geopandas as gpd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import sys\n",
"import os\n",
"\n",
"sys.path.append(os.path.abspath(\"..\"))\n",
"from core.utils.data_loader import DataLoader\n",
"from core.config.pipeline_config import PipelineConfig"
],
"outputs": [],
"execution_count": 30
},
{
"cell_type": "code",
"id": "c0b00261",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-21T14:14:47.315980Z",
"start_time": "2025-09-21T14:14:47.308376Z"
}
},
"source": [
"config = PipelineConfig(\n",
" paths={\n",
" \"root_dir\": \"../data\",\n",
" \"data_dir\": \"../data/dataset\",\n",
" \"models_dir\": \"../models\",\n",
" \"outputs_dir\": \"../data/processed\",\n",
" \"logs_dir\": \"../logs\",\n",
" \"configs_dir\": \"../configs\",\n",
" \"checkpoints_dir\": \"../checkpoints\"\n",
" }\n",
")\n",
"\n",
"loader = DataLoader(config)"
],
"outputs": [],
"execution_count": 31
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-21T14:15:47.899044Z",
"start_time": "2025-09-21T14:14:47.339266Z"
}
},
"cell_type": "code",
"source": [
"gdf = gpd.read_file(\"../osm/provinces.shp\")\n",
"gdf_proj = gdf.to_crs(epsg=32732)\n",
"gdf['centroid'] = gdf_proj.geometry.centroid.to_crs(gdf.crs)\n",
"\n",
"df = loader.load_csv_complete(config.paths.data_dir / \"names_featured.csv\")"
],
"id": "b38394ce38864379",
"outputs": [],
"execution_count": 32
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Exploration",
"id": "a1af5626d2a948d6"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
+1
View File
@@ -0,0 +1 @@
UTF-8
BIN
View File
Binary file not shown.
+1
View File
@@ -0,0 +1 @@
GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]]
BIN
View File
Binary file not shown.
BIN
View File
Binary file not shown.
+3 -1
View File
@@ -211,7 +211,9 @@ class NameModel:
for batch in batches: for batch in batches:
batch_losses = {} batch_losses = {}
self.nlp.update(batch, losses=batch_losses, drop=dropout_rate, sgd=optimizer) self.nlp.update(batch, losses=batch_losses, drop=dropout_rate, sgd=optimizer)
logging.info(f"Training batch with {len(batch)} examples, current losses: {batch_losses}") logging.info(
f"Training batch with {len(batch)} examples, current losses: {batch_losses}"
)
# Accumulate into total losses dict # Accumulate into total losses dict
for k, v in batch_losses.items(): for k, v in batch_losses.items():
+4 -1
View File
@@ -49,6 +49,9 @@ class Pipeline:
"processed_batches": step.state.processed_batches, "processed_batches": step.state.processed_batches,
"total_batches": step.state.total_batches, "total_batches": step.state.total_batches,
"failed_batches": len(step.state.failed_batches), "failed_batches": len(step.state.failed_batches),
"completion_percentage": (step.state.processed_batches / max(1, step.state.total_batches)) * 100, "completion_percentage": (
step.state.processed_batches / max(1, step.state.total_batches)
)
* 100,
} }
return progress return progress
+43
View File
@@ -0,0 +1,43 @@
import logging
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from processing.steps import PipelineStep
class DataSelectionStep(PipelineStep):
    """Pipeline step that narrows each batch down to the configured columns.

    The list of columns to keep is read once, at construction time, from
    ``pipeline_config.data.selected_columns``.
    """

    def __init__(self, pipeline_config: PipelineConfig):
        """Register the step as ``"data_selection"`` and store the column list."""
        super().__init__("data_selection", pipeline_config)
        self.selected_columns = pipeline_config.data.selected_columns

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Return a copy of ``batch`` restricted to the configured columns.

        Args:
            batch: The DataFrame to filter.
            batch_id: Identifier of the batch; only used in log messages.

        Returns:
            A copy of ``batch`` holding the configured columns that are
            present in it (in configuration order), or an empty
            ``pd.DataFrame()`` when none of them is present.
        """
        logging.info(f"Selecting columns for batch {batch_id} with {len(batch)} rows")

        # Partition the configured columns by whether the batch contains them.
        present = []
        absent = []
        for column in self.selected_columns:
            if column in batch.columns:
                present.append(column)
            else:
                absent.append(column)

        # Missing columns are reported but do not abort the selection.
        if absent:
            logging.warning(f"Missing columns in batch {batch_id}: {absent}")

        if present:
            subset = batch[present].copy()
            logging.info(
                f"Selected {len(present)} columns for batch {batch_id}: {present}"
            )
            return subset

        # Nothing usable in this batch: log it and hand back an empty frame.
        logging.error(f"No required columns found in batch {batch_id}")
        return pd.DataFrame()

    @property
    def requires_batch_mutation(self) -> bool:
        """Always ``True``: the step returns a column-reduced batch."""
        return True
+5 -3
View File
@@ -72,7 +72,9 @@ class ExperimentBuilder:
f"Available experiments: {available_experiments}" f"Available experiments: {available_experiments}"
) )
def get_templates(self, templates_path: str = "research_templates.yaml") -> Dict[str, List[Dict]]: def get_templates(
self, templates_path: str = "research_templates.yaml"
) -> Dict[str, List[Dict]]:
"""Get all available experiments from templates organized by type""" """Get all available experiments from templates organized by type"""
templates = self.load_templates(templates_path) templates = self.load_templates(templates_path)
@@ -80,7 +82,7 @@ class ExperimentBuilder:
"baseline": templates.get("baseline_experiments", []), "baseline": templates.get("baseline_experiments", []),
"advanced": templates.get("advanced_experiments", []), "advanced": templates.get("advanced_experiments", []),
"feature_study": templates.get("feature_studies", []), "feature_study": templates.get("feature_studies", []),
"tuning": templates.get("hyperparameter_tuning", []) "tuning": templates.get("hyperparameter_tuning", []),
} }
@classmethod @classmethod
@@ -104,5 +106,5 @@ class ExperimentBuilder:
tags=template_config.get("tags", []), tags=template_config.get("tags", []),
test_size=template_config.get("test_size", 0.2), test_size=template_config.get("test_size", 0.2),
cross_validation_folds=template_config.get("cross_validation_folds", 5), cross_validation_folds=template_config.get("cross_validation_folds", 5),
train_data_filter=template_config.get("train_data_filter") train_data_filter=template_config.get("train_data_filter"),
) )
+3 -1
View File
@@ -82,7 +82,9 @@ class EnsembleModel(TraditionalModel):
# Soft voting averages probabilities (preferred when members are calibrated); # Soft voting averages probabilities (preferred when members are calibrated);
# hard voting uses majority class. Parallelize member predictions. # hard voting uses majority class. Parallelize member predictions.
voting_type = params.get("voting", "soft") # 'hard' or 'soft' voting_type = params.get("voting", "soft") # 'hard' or 'soft'
return VotingClassifier(estimators=estimators, voting=voting_type, n_jobs=params.get("n_jobs", -1)) return VotingClassifier(
estimators=estimators, voting=voting_type, n_jobs=params.get("n_jobs", -1)
)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray: def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_features = [] text_features = []
+3 -1
View File
@@ -55,7 +55,9 @@ class RandomForestModel(TraditionalModel):
encoder = self.label_encoders[feature_key] encoder = self.label_encoders[feature_key]
column_clean = column.fillna("unknown").astype(str) column_clean = column.fillna("unknown").astype(str)
known_classes = set(encoder.classes_) known_classes = set(encoder.classes_)
default_class = "unknown" if "unknown" in known_classes else encoder.classes_[0] default_class = (
"unknown" if "unknown" in known_classes else encoder.classes_[0]
)
column_mapped = column_clean.apply( column_mapped = column_clean.apply(
lambda value: value if value in known_classes else default_class lambda value: value if value in known_classes else default_class
) )
+3 -3
View File
@@ -36,9 +36,9 @@ class TransformerModel(NeuralNetworkModel):
# Add positional encoding # Add positional encoding
positions = tf.range(start=0, limit=params.get("max_len", 8), delta=1) positions = tf.range(start=0, limit=params.get("max_len", 8), delta=1)
pos_embedding = Embedding(input_dim=params.get("max_len", 8), output_dim=params.get("embedding_dim", 64))( pos_embedding = Embedding(
positions input_dim=params.get("max_len", 8), output_dim=params.get("embedding_dim", 64)
) )(positions)
x = x + pos_embedding x = x + pos_embedding
x = self._transformer_encoder(x, params) x = self._transformer_encoder(x, params)
+3 -1
View File
@@ -84,7 +84,9 @@ class NeuralNetworkModel(BaseModel):
def _collect_text_corpus(self, X: pd.DataFrame) -> List[str]: def _collect_text_corpus(self, X: pd.DataFrame) -> List[str]:
"""Combine configured textual features into one string per record.""" """Combine configured textual features into one string per record."""
column_names = [feature.value for feature in self.config.features if feature.value in X.columns] column_names = [
feature.value for feature in self.config.features if feature.value in X.columns
]
if not column_names: if not column_names:
raise ValueError("No configured text features found in the provided DataFrame.") raise ValueError("No configured text features found in the provided DataFrame.")
+3 -1
View File
@@ -50,7 +50,9 @@ class StreamlitApp:
@classmethod @classmethod
def run(cls): def run(cls):
st.title("🇨🇩 DRC NERS Platform") st.title("🇨🇩 DRC NERS Platform")
st.markdown("A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference") st.markdown(
"A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference"
)
st.markdown( st.markdown(
""" """
## Overview ## Overview
-1
View File
@@ -1,2 +1 @@
from .ner_testing import NERTesting from .ner_testing import NERTesting
+13 -10
View File
@@ -16,7 +16,7 @@ class Experiments:
self, self,
config: PipelineConfig, config: PipelineConfig,
experiment_tracker: ExperimentTracker, experiment_tracker: ExperimentTracker,
experiment_runner: ExperimentRunner experiment_runner: ExperimentRunner,
): ):
self.config = config self.config = config
self.experiment_tracker = experiment_tracker self.experiment_tracker = experiment_tracker
@@ -26,8 +26,7 @@ class Experiments:
def index(self): def index(self):
st.title("Experiments") st.title("Experiments")
tab1, tab2, tab3 = st.tabs( tab1, tab2, tab3 = st.tabs(["Templates", "Experiments", "Batch Experiments"])
["Templates", "Experiments", "Batch Experiments"])
with tab1: with tab1:
self.show_template_experiments() self.show_template_experiments()
@@ -56,14 +55,18 @@ class Experiments:
self._show_experiments_by_type(available_experiments["advanced"], "advanced") self._show_experiments_by_type(available_experiments["advanced"], "advanced")
with exp_tabs[2]: with exp_tabs[2]:
self._show_experiments_by_type(available_experiments["feature_study"], "feature_study") self._show_experiments_by_type(
available_experiments["feature_study"], "feature_study"
)
with exp_tabs[3]: with exp_tabs[3]:
self._show_experiments_by_type(available_experiments["tuning"], "tuning") self._show_experiments_by_type(available_experiments["tuning"], "tuning")
except Exception as e: except Exception as e:
st.error(f"Error loading experiment templates: {e}") st.error(f"Error loading experiment templates: {e}")
st.info("Make sure the research templates file exists at `config/research_templates.yaml`") st.info(
"Make sure the research templates file exists at `config/research_templates.yaml`"
)
def _show_experiments_by_type(self, experiments: List[Dict], experiment_type: str): def _show_experiments_by_type(self, experiments: List[Dict], experiment_type: str):
"""Show experiments for a specific type""" """Show experiments for a specific type"""
@@ -213,7 +216,7 @@ class Experiments:
experiment_types = st.multiselect( experiment_types = st.multiselect(
"Select Experiment Types", "Select Experiment Types",
["baseline", "advanced", "feature_study", "tuning"], ["baseline", "advanced", "feature_study", "tuning"],
default=["baseline"] default=["baseline"],
) )
if experiment_types: if experiment_types:
@@ -223,11 +226,11 @@ class Experiments:
experiments = available_experiments.get(exp_type, []) experiments = available_experiments.get(exp_type, [])
if experiments: if experiments:
st.write(f"**{exp_type.title()} Experiments:**") st.write(f"**{exp_type.title()} Experiments:**")
exp_names = [exp.get("name", f"Exp {i}") for i, exp in enumerate(experiments)] exp_names = [
exp.get("name", f"Exp {i}") for i, exp in enumerate(experiments)
]
selected_names = st.multiselect( selected_names = st.multiselect(
f"Select {exp_type} experiments", f"Select {exp_type} experiments", exp_names, key=f"select_{exp_type}"
exp_names,
key=f"select_{exp_type}"
) )
for name in selected_names: for name in selected_names:
+1 -1
View File
@@ -38,7 +38,7 @@ class LogReader:
# Parse log entries from the end # Parse log entries from the end
entries = [] entries = []
for line in reversed(lines[-count * 2:]): # Read more lines in case some don't match for line in reversed(lines[-count * 2 :]): # Read more lines in case some don't match
entry = self._parse_log_line(line.strip()) entry = self._parse_log_line(line.strip())
if entry: if entry:
entries.append(entry) entries.append(entry)
+20 -28
View File
@@ -33,7 +33,9 @@ class NERTesting:
# Load model # Load model
if not self.load_ner_model(): if not self.load_ner_model():
st.warning("NER model could not be loaded. Please ensure the model is trained and available.") st.warning(
"NER model could not be loaded. Please ensure the model is trained and available."
)
return return
# Display model information # Display model information
@@ -53,9 +55,11 @@ class NERTesting:
col1, col2, col3, col4 = st.columns(4) col1, col2, col3, col4 = st.columns(4)
with col1: with col1:
st.metric("Training Examples", f"{self.training_stats.get('training_examples', 0):,}") st.metric(
"Training Examples", f"{self.training_stats.get('training_examples', 0):,}"
)
with col2: with col2:
st.metric("Epochs", self.training_stats.get('epochs', 0)) st.metric("Epochs", self.training_stats.get("epochs", 0))
with col3: with col3:
st.metric("Final Loss", f"{self.training_stats.get('final_loss', 0):.2f}") st.metric("Final Loss", f"{self.training_stats.get('final_loss', 0):.2f}")
with col4: with col4:
@@ -64,7 +68,7 @@ class NERTesting:
def show_model_evaluation_info(self): def show_model_evaluation_info(self):
if self.evaluation_stats: if self.evaluation_stats:
col1, col2, col3 = st.columns(4) col1, col2, col3 = st.columns(4)
overall = self.evaluation_stats.get('overall', {}) overall = self.evaluation_stats.get("overall", {})
with col1: with col1:
st.metric("Overall Precision", f"{overall['precision']:.2f}") st.metric("Overall Precision", f"{overall['precision']:.2f}")
@@ -79,7 +83,7 @@ class NERTesting:
name_input = st.text_input( name_input = st.text_input(
"Name:", "Name:",
placeholder="e.g., Jean Baptiste Mukendi, Marie Kabamba Tshiala, Joseph Kasongo", placeholder="e.g., Jean Baptiste Mukendi, Marie Kabamba Tshiala, Joseph Kasongo",
help="Enter a full name or multiple names separated by spaces" help="Enter a full name or multiple names separated by spaces",
) )
if name_input.strip(): if name_input.strip():
if st.button("Analyze Name", type="primary"): if st.button("Analyze Name", type="primary"):
@@ -90,12 +94,12 @@ class NERTesting:
"Names:", "Names:",
placeholder="Jean Baptiste Mukendi\nMarie Kabamba Tshiala\nJoseph Kasongo\nGrace Mbuyi Kalala", placeholder="Jean Baptiste Mukendi\nMarie Kabamba Tshiala\nJoseph Kasongo\nGrace Mbuyi Kalala",
height=150, height=150,
help="Enter each name on a new line" help="Enter each name on a new line",
) )
if names_input.strip(): if names_input.strip():
if st.button("Analyze All Names", type="primary"): if st.button("Analyze All Names", type="primary"):
names = [name.strip() for name in names_input.split('\n') if name.strip()] names = [name.strip() for name in names_input.split("\n") if name.strip()]
for i, name in enumerate(names): for i, name in enumerate(names):
st.markdown(f"**Name {i+1}: {name}**") st.markdown(f"**Name {i+1}: {name}**")
self.analyze_and_display(name) self.analyze_and_display(name)
@@ -106,12 +110,12 @@ class NERTesting:
try: try:
result = self.ner_model.predict(text) result = self.ner_model.predict(text)
st.subheader("Analysis Results") st.subheader("Analysis Results")
entities = result.get('entities', []) entities = result.get("entities", [])
if entities: if entities:
self.show_visual_entities(text, entities) self.show_visual_entities(text, entities)
native_count = sum(1 for e in entities if e['label'] == 'NATIVE') native_count = sum(1 for e in entities if e["label"] == "NATIVE")
surname_count = sum(1 for e in entities if e['label'] == 'SURNAME') surname_count = sum(1 for e in entities if e["label"] == "SURNAME")
col1, col2, col3 = st.columns(3) col1, col2, col3 = st.columns(3)
with col1: with col1:
@@ -134,29 +138,17 @@ class NERTesting:
# Convert our entities format to spaCy format for displacy # Convert our entities format to spaCy format for displacy
ents = [] ents = []
for entity in entities: for entity in entities:
ents.append({ ents.append(
"start": entity['start'], {"start": entity["start"], "end": entity["end"], "label": entity["label"]}
"end": entity['end'], )
"label": entity['label']
})
# Create doc-like structure for displacy # Create doc-like structure for displacy
doc_data = { doc_data = {"text": text, "ents": ents, "title": None}
"text": text,
"ents": ents,
"title": None
}
# Custom colors for our labels # Custom colors for our labels
colors = { colors = {"NATIVE": "#74C0FC", "SURNAME": "#69DB7C"} # Light blue # Light green
"NATIVE": "#74C0FC", # Light blue
"SURNAME": "#69DB7C" # Light green
}
options = { options = {"colors": colors, "distance": 90}
"colors": colors,
"distance": 90
}
# Generate HTML visualization # Generate HTML visualization
html = displacy.render(doc_data, style="ent", manual=True, options=options) html = displacy.render(doc_data, style="ent", manual=True, options=options)