feat: implement NER dataset feature engineering with multiple transformation formats
This commit is contained in:
@@ -0,0 +1,72 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
NER Dataset Feature Engineering Script
|
||||
Processes the names_featured.csv dataset to create position-independent variations
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
|
||||
from processing.ner.ner_engineering import NEREngineering
|
||||
|
||||
|
||||
def main(argv=None) -> int:
    """Run the NER dataset feature-engineering pipeline from the command line.

    Loads the NER-tagged CSV, prints a short sample, applies the
    transformations provided by ``NEREngineering``, saves the result and
    prints a per-transformation row distribution.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). When ``None`` (the default), argparse reads
            ``sys.argv[1:]``.

    Returns:
        Process exit status: ``0`` on success, ``1`` when the input file
        does not exist or any pipeline step raises.
    """
    parser = argparse.ArgumentParser(description='Engineer NER dataset for position-independent learning')
    parser.add_argument('--input', default='data/dataset/names_featured.csv', help='Input CSV file path')
    parser.add_argument('--output', default='data/dataset/names_featured_engineered.csv', help='Output CSV file path')
    parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility')

    args = parser.parse_args(argv)

    print("=== NER Dataset Feature Engineering ===")
    print(f"Input file: {args.input}")
    print(f"Output file: {args.output}")
    print(f"Random seed: {args.seed}")

    # Fail fast (with a non-zero status) before constructing anything heavy.
    if not os.path.exists(args.input):
        print(f"Error: Input file {args.input} not found!")
        return 1

    engineering = NEREngineering()

    try:
        print("\n1. Loading NER-tagged data...")
        # NOTE(review): `data` is presumably a pandas DataFrame (it is used
        # via len(), .head() and .iterrows()) -- confirm in NEREngineering.
        data = engineering.load_ner_data(args.input)
        print(f"   Dataset size: {len(data):,} rows")

        print("\n2. Sample original data:")
        for _, row in data.head(3).iterrows():
            print(f"   {row['name']} -> Native: '{row['probable_native']}', Surname: '{row['probable_surname']}'")

        print("\n3. Applying feature engineering transformations...")
        engineered_data = engineering.engineer_dataset(data, random_seed=args.seed)

        print(f"\n4. Saving engineered dataset to {args.output}...")
        engineering.save_engineered_dataset(engineered_data, args.output)

        print("\n=== RESULTS SUMMARY ===")
        print(f"Original dataset: {len(data):,} rows")
        total_rows = len(engineered_data)
        print(f"Engineered dataset: {total_rows:,} rows")
        print("Transformation distribution:")
        counts = engineered_data['transformation_type'].value_counts().sort_index()
        for trans_type, count in counts.items():
            percentage = (count / total_rows) * 100
            print(f"  {trans_type}: {count:,} rows ({percentage:.1f}%)")

        print("\nDataset successfully engineered and saved!")

    except Exception as e:
        # Top-level boundary of the script: report the failure with a full
        # traceback and signal it through the exit status instead of
        # silently finishing with status 0.
        print(f"Error during processing: {str(e)}")
        import traceback
        traceback.print_exc()
        return 1

    return 0


if __name__ == "__main__":
    # Propagate main()'s status to the shell so failures are detectable.
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user