# drc-ners-nlp/engineer_ner_dataset.py

#!/usr/bin/env python3
"""
NER Dataset Feature Engineering Script
Processes the names_featured.csv dataset to create position-independent variations
"""

import argparse
import os
import sys
import traceback

from processing.ner.ner_engineering import NEREngineering


def main(argv=None) -> int:
    """Run the NER dataset feature-engineering pipeline from the command line.

    Parses the command-line options, loads the NER-tagged CSV through
    ``NEREngineering``, applies its transformations, saves the result and
    prints a per-transformation-type summary.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). When ``None`` argparse falls back to ``sys.argv[1:]``,
            so a plain ``main()`` call behaves as before.

    Returns:
        ``0`` when the dataset was engineered and saved; ``1`` when the input
        file does not exist or a processing step raised an exception.
    """
    parser = argparse.ArgumentParser(description='Engineer NER dataset for position-independent learning')
    parser.add_argument('--input', default='data/dataset/names_featured.csv', help='Input CSV file path')
    parser.add_argument('--output', default='data/dataset/names_featured_engineered.csv', help='Output CSV file path')
    parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility')

    args = parser.parse_args(argv)

    print("=== NER Dataset Feature Engineering ===")
    print(f"Input file: {args.input}")
    print(f"Output file: {args.output}")
    print(f"Random seed: {args.seed}")

    # Fail fast (and with a non-zero status) before any heavy work starts.
    if not os.path.exists(args.input):
        print(f"Error: Input file {args.input} not found!", file=sys.stderr)
        return 1

    # Initialize engineering class
    engineering = NEREngineering()

    try:
        # Load data with progress indication
        print("\n1. Loading NER-tagged data...")
        # NOTE(review): the result is used like a pandas DataFrame below
        # (head/iterrows/column access) -- confirm against NEREngineering.
        data = engineering.load_ner_data(args.input)
        print(f"   Dataset size: {len(data):,} rows")

        # Show sample of original data
        print("\n2. Sample original data:")
        for _, row in data.head(3).iterrows():
            print(f"   {row['name']} -> Native: '{row['probable_native']}', Surname: '{row['probable_surname']}'")

        # Apply transformations
        print("\n3. Applying feature engineering transformations...")
        engineered_data = engineering.engineer_dataset(data, random_seed=args.seed)

        # Save results
        print(f"\n4. Saving engineered dataset to {args.output}...")
        engineering.save_engineered_dataset(engineered_data, args.output)

        # Show statistics
        print("\n=== RESULTS SUMMARY ===")
        print(f"Original dataset: {len(data):,} rows")
        total_rows = len(engineered_data)
        print(f"Engineered dataset: {total_rows:,} rows")
        print("Transformation distribution:")
        counts = engineered_data['transformation_type'].value_counts().sort_index()
        # The loop body only runs when at least one row exists, so
        # total_rows is never zero at the division.
        for trans_type, count in counts.items():
            percentage = (count / total_rows) * 100
            print(f"  {trans_type}: {count:,} rows ({percentage:.1f}%)")

        print("\nDataset successfully engineered and saved!")

    except Exception as e:
        # Top-level boundary of the script: report the failure on stderr
        # (where the traceback also goes) and signal it via the exit status
        # instead of silently finishing with status 0.
        print(f"Error during processing: {e}", file=sys.stderr)
        traceback.print_exc()
        return 1

    return 0


if __name__ == "__main__":
    # Propagate main()'s status so shells and CI can detect a failed run.
    raise SystemExit(main())