# Batch Processing and Performance

This notebook demonstrates efficient batch processing techniques for large datasets and provides performance optimization tips.

## Setup

Load libraries and create a larger sample dataset for demonstration.

In [None]:
import pandas as pd
import ethnicolr
import time
from pathlib import Path

# Load the sample names, falling back to a tiny built-in dataset so the
# notebook still runs when the CSV is not available.
data_path = Path('data/input-with-header.csv')

try:
    small_df = pd.read_csv(data_path)
    print(f"Loaded data from: {data_path}")
except FileNotFoundError:
    # Create sample data if file not found
    small_df = pd.DataFrame({
        'first_name': ['John', 'Maria', 'David', 'Sarah', 'Michael'],
        'last_name': ['Smith', 'Garcia', 'Johnson', 'Davis', 'Brown']
    })
    print("Using generated sample data")

print(f"Sample data shape: {small_df.shape}")
print("\nFirst few rows:")
# Only the last expression of a cell is rendered automatically; this preview
# sits mid-cell, so it has to be printed explicitly to appear at all.
print(small_df.head())

# Create a larger dataset for the batch processing demonstrations by
# repeating the small one 20 times; ignore_index gives a fresh 0..n-1 index.
large_df = pd.concat([small_df] * 20, ignore_index=True)
print(f"\nLarge dataset shape: {large_df.shape}")
print("Ready for batch processing demonstrations.")

## Performance Comparison

Let's compare the performance of different models on our dataset.

In [None]:
def time_prediction(func, df, *args, **kwargs):
    """Call ``func(df, *args, **kwargs)`` and measure how long the call takes.

    Args:
        func: Callable to time; it receives ``df`` as its first argument.
        df: Object passed to ``func`` as the first positional argument.
        *args: Extra positional arguments forwarded to ``func``.
        **kwargs: Extra keyword arguments forwarded to ``func``.

    Returns:
        tuple: ``(result, elapsed_seconds)`` where ``result`` is whatever
        ``func`` returned and ``elapsed_seconds`` is a non-negative float.
    """
    # perf_counter is monotonic and high-resolution, so the measured interval
    # cannot go negative or be distorted by system clock adjustments the way
    # a time.time() difference can.
    start_time = time.perf_counter()
    result = func(df, *args, **kwargs)
    elapsed = time.perf_counter() - start_time
    return result, elapsed

# Models to benchmark. Each entry maps a label to
# (prediction function, positional args passed after the DataFrame, keyword args).
models = {
    'census_lookup': (ethnicolr.census_ln, ['last_name'], {'year': 2010}),
    'census_lstm': (ethnicolr.pred_census_ln, ['last_name'], {}),
    'wiki_lastname': (ethnicolr.pred_wiki_ln, ['last_name'], {}),
    'florida_lstm': (ethnicolr.pred_fl_reg_ln, ['last_name'], {})
}

performance_results = []

for model_name, (func, args, kwargs) in models.items():
    print(f"\nTesting {model_name}...")
    result, duration = time_prediction(func, large_df, *args, **kwargs)

    # A very fast call can be reported as taking zero seconds on a coarse
    # clock; guard the division so the benchmark does not crash.
    rows_per_second = len(large_df) / duration if duration > 0 else float('inf')

    perf_data = {
        'model': model_name,
        'duration': round(duration, 2),
        'rows_per_second': round(rows_per_second, 0),
        'result_rows': result.shape[0],
        'result_cols': result.shape[1]
    }
    performance_results.append(perf_data)

    print(f"Duration: {duration:.2f} seconds")
    print(f"Speed: {rows_per_second:.0f} rows/second")

# Performance summary: one row per model, displayed as the cell output
perf_df = pd.DataFrame(performance_results).set_index('model')
print("\nPerformance Summary:")
perf_df[['duration', 'rows_per_second', 'result_rows', 'result_cols']]

## Chunked Processing

For very large datasets, processing in chunks can be more memory efficient.

First, let's define our chunked processing function:

In [None]:
def process_in_chunks(df, func, chunk_size=1000, *args, **kwargs):
    """Apply ``func`` to ``df`` one slice of rows at a time and stitch the results.

    Args:
        df: DataFrame to process.
        func: Callable invoked as ``func(chunk, *args, **kwargs)`` for each
            slice of rows; it must return a DataFrame.
        chunk_size: Maximum number of rows per slice. Must be at least 1.
        *args: Extra positional arguments forwarded to ``func``.
        **kwargs: Extra keyword arguments forwarded to ``func``.

    Returns:
        DataFrame: The per-chunk results concatenated with a fresh 0..n-1
        index. For an empty ``df``, an empty copy of ``df`` is returned and
        ``func`` is never called.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    if len(df) == 0:
        # There is nothing to hand to func, and pd.concat refuses an empty
        # list, so return an empty frame instead of raising.
        return df.iloc[0:0].copy().reset_index(drop=True)

    total_chunks = (len(df) - 1) // chunk_size + 1  # ceiling division
    results = []

    for chunk_number, start in enumerate(range(0, len(df), chunk_size), start=1):
        chunk = df.iloc[start:start + chunk_size]
        results.append(func(chunk, *args, **kwargs))

        if chunk_number % 5 == 0:  # Progress every 5 chunks
            print(f"Processed {chunk_number}/{total_chunks} chunks")

    return pd.concat(results, ignore_index=True)

In [None]:
# Example: Process in chunks of 250 rows
print("Processing Florida model in chunks of 250...")
# perf_counter is a monotonic, high-resolution clock, so the elapsed time is
# not affected by system clock adjustments during the run.
start_time = time.perf_counter()
chunked_result = process_in_chunks(
    large_df,
    ethnicolr.pred_fl_reg_ln,
    250,  # chunk_size as positional argument
    'last_name'  # forwarded positionally to the prediction function
)
chunked_duration = time.perf_counter() - start_time

print(f"\nChunked processing completed in {chunked_duration:.2f} seconds")
print(f"Result shape: {chunked_result.shape}")
# Last expression of the cell: rendered as the cell output
chunked_result[['last_name', 'race', 'asian', 'hispanic', 'nh_black', 'nh_white']].head()

## Handling Missing or Problematic Names

Real-world datasets often have missing values, special characters, or other data quality issues.

In [None]:
# Rows that receive deliberately messy values below; defined once so the
# "before" and "after" printouts inspect the same positions.
problem_indices = [5, 10, 15, 20, 25, 30]

# Create a dataset with some problematic entries. Take the head first so only
# 50 rows are copied rather than the whole frame.
problematic_df = large_df.head(50).copy()

# Add some missing values and problematic names
problematic_df.loc[5, 'last_name'] = None
problematic_df.loc[10, 'last_name'] = ''
problematic_df.loc[15, 'last_name'] = 'O\'Connor'  # Apostrophe
problematic_df.loc[20, 'last_name'] = 'García'  # Accented character
problematic_df.loc[25, 'last_name'] = '123'  # Numeric
problematic_df.loc[30, 'first_name'] = None

print("Sample problematic entries:")
print(problematic_df.iloc[problem_indices][['first_name', 'last_name']])

# Process with the Wikipedia full-name model (last name column, then first name column)
wiki_result = ethnicolr.pred_wiki_name(problematic_df, 'last_name', 'first_name')

print("\nProcessing results for problematic names:")
display_cols = ['first_name', 'last_name', 'race', '__name', 'processing_status']
# Some columns might not exist, so filter to available ones
available_cols = [col for col in display_cols if col in wiki_result.columns]
# NOTE(review): if the prediction call drops rows it cannot process, the
# result is shorter than the input and these positions may no longer line up
# with the rows edited above - confirm against the ethnicolr documentation.
# Skipping out-of-range positions at least avoids an IndexError.
valid_positions = [pos for pos in problem_indices if pos < len(wiki_result)]
print(wiki_result.iloc[valid_positions][available_cols])

## Data Quality Analysis

Analyze the quality and coverage of predictions across your dataset.

In [None]:
def _numeric_columns(frame, excluded):
    """Return the columns of ``frame`` not in ``excluded`` whose values all parse as numbers."""
    found = []
    for col in frame.columns:
        if col in excluded:
            continue
        try:
            pd.to_numeric(frame[col], errors='raise')
        except (ValueError, TypeError):
            continue
        found.append(col)
    return found


def _print_confidence_buckets(confidence):
    """Print how many values fall in the high / medium / low confidence bands."""
    high = confidence > 0.8
    medium = (confidence > 0.5) & (confidence <= 0.8)
    low = confidence <= 0.5
    print(f"High confidence (>0.8): {high.sum()} ({high.mean()*100:.1f}%)")
    print(f"Medium confidence (0.5-0.8): {medium.sum()} ({medium.mean()*100:.1f}%)")
    # The band includes 0.5 itself, so the label says <= rather than <
    print(f"Low confidence (<=0.5): {low.sum()} ({low.mean()*100:.1f}%)")


# Get predictions for quality analysis
census_pred = ethnicolr.pred_census_ln(large_df, 'last_name')
wiki_pred = ethnicolr.pred_wiki_ln(large_df, 'last_name')

# Calculate prediction confidence (use correct column names for each model)
# Census model columns: api, black, hispanic, white
census_pred['max_confidence'] = census_pred[['api', 'black', 'hispanic', 'white']].max(axis=1)

# Wikipedia model: find numeric probability columns only. Columns that came
# from the input frame are excluded as well, so a numeric input column (an id,
# an age, ...) is never mistaken for a class probability.
non_probability_cols = {'race', '__name', 'last_name', 'processing_status'} | set(large_df.columns)
numeric_cols = _numeric_columns(wiki_pred, non_probability_cols)

if len(numeric_cols) > 0:
    wiki_pred['max_confidence'] = wiki_pred[numeric_cols].max(axis=1)
else:
    wiki_pred['max_confidence'] = 0.5  # Default value if no numeric columns found

# Confidence distribution
print("Census Model Confidence Distribution:")
_print_confidence_buckets(census_pred['max_confidence'])

print("\nWikipedia Model Confidence Distribution:")
print(f"Found {len(numeric_cols)} numeric probability columns")
_print_confidence_buckets(wiki_pred['max_confidence'])

## Batch Processing Best Practices

### Performance Tips:
1. **Choose the right model**: Census lookup is fastest; the ML models are slower but more accurate
2. **Use chunking**: For datasets >10,000 rows, process in chunks to manage memory
3. **Clean data first**: Remove/handle missing values before processing
4. **Monitor confidence**: Low confidence predictions may need manual review

### Memory Management:
- Process in chunks of 500-2000 rows for large datasets
- Use only the columns you need in your input DataFrame
- Clear intermediate results when not needed

### Error Handling:
- Check for missing values in name columns
- Handle special characters and accents
- Validate results and flag low-confidence predictions

In [None]:
# Example production-ready batch processing function
def robust_batch_predict(df, name_col, model='census', chunk_size=1000, min_confidence=0.5):
    """Predict in chunks and attach a per-row confidence score and flag.

    Missing names are replaced with empty strings and all names are cast to
    ``str`` on a copy of ``df`` (the caller's frame is not modified), then the
    selected model is run through ``process_in_chunks``.

    Args:
        df: Input DataFrame containing ``name_col``.
        name_col: Name of the column holding the names to classify.
        model: One of ``'census'``, ``'wiki'`` or ``'florida'``.
        chunk_size: Number of rows handed to the model per chunk.
        min_confidence: Threshold; rows whose highest class probability is at
            least this value are flagged as high confidence.

    Returns:
        DataFrame: The model output with two extra columns,
        ``max_confidence`` (highest class probability per row) and
        ``high_confidence`` (``max_confidence >= min_confidence``). If no
        probability columns can be identified, ``max_confidence`` is set to
        0.5 and ``high_confidence`` to False for every row.

    Raises:
        ValueError: If ``name_col`` is not a column of ``df`` or ``model`` is
            not a supported name.
    """
    # Data validation
    if name_col not in df.columns:
        raise ValueError(f"Column '{name_col}' not found in DataFrame")

    # Choose prediction function and its probability columns. Done before the
    # data is copied so an unknown model fails fast.
    if model == 'census':
        pred_func = ethnicolr.pred_census_ln
        prob_cols = ['api', 'black', 'hispanic', 'white']
    elif model == 'wiki':
        pred_func = ethnicolr.pred_wiki_ln
        prob_cols = None  # Determined dynamically after prediction
    elif model == 'florida':
        pred_func = ethnicolr.pred_fl_reg_ln
        prob_cols = ['asian', 'hispanic', 'nh_black', 'nh_white']
    else:
        raise ValueError(f"Unknown model: {model}")

    # Clean data on a copy so the caller's frame stays untouched
    clean_df = df.copy()
    clean_df[name_col] = clean_df[name_col].fillna('').astype(str)

    # Process in chunks
    result = process_in_chunks(clean_df, pred_func, chunk_size, name_col)

    if prob_cols is None:
        # Wikipedia model: discover the probability columns from the output.
        # Every column that was already present in the input is skipped, so a
        # numeric input column is never treated as a class probability.
        non_probability_cols = {'race', '__name', name_col, 'processing_status'} | set(df.columns)
        prob_cols = []
        for col in result.columns:
            if col in non_probability_cols:
                continue
            try:
                pd.to_numeric(result[col], errors='raise')
            except (ValueError, TypeError):
                continue
            prob_cols.append(col)

    if prob_cols:
        result['max_confidence'] = result[prob_cols].max(axis=1)
        result['high_confidence'] = result['max_confidence'] >= min_confidence
    else:
        result['max_confidence'] = 0.5  # Default confidence
        result['high_confidence'] = False

    return result

# Demo run: classify the first 100 rows with the census model, 50 rows per
# chunk, flagging rows whose top probability reaches 0.6.
print("Running robust batch prediction...")
robust_result = robust_batch_predict(
    df=large_df.iloc[:100],
    name_col='last_name',
    model='census',
    chunk_size=50,
    min_confidence=0.6,
)

print(f"\nProcessed {len(robust_result)} names")
print(f"High confidence predictions: {robust_result['high_confidence'].sum()} ({robust_result['high_confidence'].mean()*100:.1f}%)")
print("\nSample results:")
# Final expression of the cell, so the notebook renders it as a table
robust_result.loc[:, ['last_name', 'race', 'max_confidence', 'high_confidence']].head(10)