Files
drc-ners-nlp/engineer_ner_dataset.py
T

73 lines
2.6 KiB
Python

#!/usr/bin/env python3
"""
NER Dataset Feature Engineering Script
Processes the names_featured.csv dataset to create position-independent variations
"""
import argparse
import os
from processing.ner.ner_engineering import NEREngineering
def main(argv=None):
    """Run the NER dataset feature-engineering pipeline from the command line.

    Parses ``--input``, ``--output`` and ``--seed``, loads the NER-tagged CSV
    through ``NEREngineering``, applies the engineering transformations, saves
    the result and prints a per-transformation summary.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). ``None`` (the default) lets argparse read ``sys.argv[1:]``,
            which is what happens when the script is run directly.

    Returns:
        int: ``0`` on success, ``1`` when the input file is missing or any
        step of the processing raised an exception.
    """
    import sys

    parser = argparse.ArgumentParser(
        description='Engineer NER dataset for position-independent learning'
    )
    parser.add_argument(
        '--input',
        default='data/dataset/names_featured.csv',
        help='Input CSV file path',
    )
    parser.add_argument(
        '--output',
        default='data/dataset/names_featured_engineered.csv',
        help='Output CSV file path',
    )
    parser.add_argument(
        '--seed', type=int, default=42, help='Random seed for reproducibility'
    )
    args = parser.parse_args(argv)

    print("=== NER Dataset Feature Engineering ===")
    print(f"Input file: {args.input}")
    print(f"Output file: {args.output}")
    print(f"Random seed: {args.seed}")

    # Fail fast (and visibly) before constructing the engineering class.
    if not os.path.exists(args.input):
        print(f"Error: Input file {args.input} not found!", file=sys.stderr)
        return 1

    engineering = NEREngineering()

    try:
        print("\n1. Loading NER-tagged data...")
        data = engineering.load_ner_data(args.input)
        print(f" Dataset size: {len(data):,} rows")

        # Show the first few rows so the operator can sanity-check the input.
        print("\n2. Sample original data:")
        for _, row in data.head(3).iterrows():
            print(f" {row['name']} -> Native: '{row['probable_native']}', Surname: '{row['probable_surname']}'")

        print("\n3. Applying feature engineering transformations...")
        engineered_data = engineering.engineer_dataset(data, random_seed=args.seed)

        print(f"\n4. Saving engineered dataset to {args.output}...")
        engineering.save_engineered_dataset(engineered_data, args.output)

        print("\n=== RESULTS SUMMARY ===")
        print(f"Original dataset: {len(data):,} rows")
        total_rows = len(engineered_data)
        print(f"Engineered dataset: {total_rows:,} rows")
        print("Transformation distribution:")
        counts = engineered_data['transformation_type'].value_counts().sort_index()
        # counts is empty when total_rows is 0, so the division below is safe.
        for trans_type, count in counts.items():
            percentage = (count / total_rows) * 100
            print(f" {trans_type}: {count:,} rows ({percentage:.1f}%)")

        print("\nDataset successfully engineered and saved!")
    except Exception as exc:
        # Top-level boundary: report the failure with its traceback and signal
        # it through the return value instead of ending as if nothing went wrong.
        import traceback

        print(f"Error during processing: {exc}", file=sys.stderr)
        traceback.print_exc()
        return 1

    return 0


if __name__ == "__main__":
    # Hand the status code to the interpreter so shells and CI see failures.
    raise SystemExit(main())