{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Batch Processing and Performance\n", "\n", "This notebook demonstrates efficient batch processing techniques for large datasets and provides performance optimization tips." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup\n", "\n", "Load libraries and create a larger sample dataset for demonstration." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": "import pandas as pd\nimport ethnicolr\nimport time\nfrom pathlib import Path\n\n# Load sample data\ndata_path = Path('data/input-with-header.csv')\n\ntry:\n small_df = pd.read_csv(data_path)\n print(f\"Loaded data from: {data_path}\")\nexcept FileNotFoundError:\n # Create sample data if file not found\n small_df = pd.DataFrame({\n 'first_name': ['John', 'Maria', 'David', 'Sarah', 'Michael'],\n 'last_name': ['Smith', 'Garcia', 'Johnson', 'Davis', 'Brown']\n })\n print(\"Using generated sample data\")\n\nprint(f\"Sample data shape: {small_df.shape}\")\nprint(\"\\nFirst few rows:\")\nsmall_df.head()\n\n# Create a larger dataset for batch processing demonstration\n# Replicate the small dataset multiple times\nlarge_df = pd.concat([small_df] * 20, ignore_index=True)\nprint(f\"\\nLarge dataset shape: {large_df.shape}\")\nprint(\"Ready for batch processing demonstrations.\")" }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Performance Comparison\n", "\n", "Let's compare the performance of different models on our dataset." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": "def time_prediction(func, df, *args, **kwargs):\n \"\"\"Helper function to time predictions\"\"\"\n start_time = time.time()\n result = func(df, *args, **kwargs)\n end_time = time.time()\n return result, end_time - start_time\n\n# Test different models\nmodels = {\n 'census_lookup': (ethnicolr.census_ln, ['last_name'], {'year': 2010}),\n 'census_lstm': (ethnicolr.pred_census_ln, ['last_name'], {}),\n 'wiki_lastname': (ethnicolr.pred_wiki_ln, ['last_name'], {}),\n 'florida_lstm': (ethnicolr.pred_fl_reg_ln, ['last_name'], {})\n}\n\nperformance_results = []\n\nfor model_name, (func, args, kwargs) in models.items():\n print(f\"\\nTesting {model_name}...\")\n result, duration = time_prediction(func, large_df, *args, **kwargs)\n \n perf_data = {\n 'model': model_name,\n 'duration': round(duration, 2),\n 'rows_per_second': round(len(large_df) / duration, 0),\n 'result_rows': result.shape[0],\n 'result_cols': result.shape[1]\n }\n performance_results.append(perf_data)\n \n print(f\"Duration: {duration:.2f} seconds\")\n print(f\"Speed: {len(large_df) / duration:.0f} rows/second\")\n\n# Performance summary\nperf_df = pd.DataFrame(performance_results).set_index('model')\nprint(\"\\nPerformance Summary:\")\nperf_df[['duration', 'rows_per_second', 'result_rows', 'result_cols']]" }, { "cell_type": "markdown", "metadata": {}, "source": "## Chunked Processing\n\nFor very large datasets, processing in chunks can be more memory efficient.\n\nFirst, let's define our chunked processing function:" }, { "cell_type": "code", "source": "def process_in_chunks(df, func, chunk_size=1000, *args, **kwargs):\n \"\"\"Process dataframe in chunks to manage memory usage\"\"\"\n results = []\n total_chunks = (len(df) - 1) // chunk_size + 1\n \n for i in range(0, len(df), chunk_size):\n chunk = df.iloc[i:i + chunk_size]\n chunk_result = func(chunk, *args, **kwargs)\n results.append(chunk_result)\n \n if (i // chunk_size + 1) % 
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Chunked Processing\n", "\n", "For very large datasets, processing in chunks can be more memory-efficient.\n", "\n", "First, let's define our chunked processing function:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": "def process_in_chunks(df, func, chunk_size=1000, *args, **kwargs):\n    \"\"\"Process a dataframe in chunks to manage memory usage\"\"\"\n    results = []\n    total_chunks = (len(df) - 1) // chunk_size + 1\n\n    for i in range(0, len(df), chunk_size):\n        chunk = df.iloc[i:i + chunk_size]\n        chunk_result = func(chunk, *args, **kwargs)\n        results.append(chunk_result)\n\n        if (i // chunk_size + 1) % 5 == 0:  # Progress update every 5 chunks\n            print(f\"Processed {i // chunk_size + 1}/{total_chunks} chunks\")\n\n    return pd.concat(results, ignore_index=True)" },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": "# Example: Process in chunks of 250 rows\nprint(\"Processing Florida model in chunks of 250...\")\nstart_time = time.time()\nchunked_result = process_in_chunks(\n    large_df,\n    ethnicolr.pred_fl_reg_ln,\n    250,          # chunk_size as positional argument\n    'last_name'   # positional argument for the prediction function\n)\nchunked_duration = time.time() - start_time\n\nprint(f\"\\nChunked processing completed in {chunked_duration:.2f} seconds\")\nprint(f\"Result shape: {chunked_result.shape}\")\nchunked_result[['last_name', 'race', 'asian', 'hispanic', 'nh_black', 'nh_white']].head()" },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Handling Missing or Problematic Names\n", "\n", "Real-world datasets often have missing values, special characters, or other data quality issues." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": "# Create a dataset with some problematic entries\nproblematic_df = large_df.copy().head(50)\n\n# Add some missing values and problematic names\nproblematic_df.loc[5, 'last_name'] = None\nproblematic_df.loc[10, 'last_name'] = ''\nproblematic_df.loc[15, 'last_name'] = \"O'Connor\"  # Apostrophe\nproblematic_df.loc[20, 'last_name'] = 'García'  # Accented character\nproblematic_df.loc[25, 'last_name'] = '123'  # Numeric\nproblematic_df.loc[30, 'first_name'] = None\n\nprint(\"Sample problematic entries:\")\nprint(problematic_df.iloc[[5, 10, 15, 20, 25, 30]][['first_name', 'last_name']])\n\n# Run the Wikipedia full-name model on these entries to see how they come through.\n# Depending on the ethnicolr version, missing or empty names may be dropped or raise errors,\n# which is why the robust pipeline at the end of this notebook cleans names first.\nwiki_result = ethnicolr.pred_wiki_name(problematic_df, 'last_name', 'first_name')\n\nprint(\"\\nProcessing results for problematic names:\")\nproblem_indices = [5, 10, 15, 20, 25, 30]\ndisplay_cols = ['first_name', 'last_name', 'race', '__name']\n# Some columns may not exist in every ethnicolr version, so filter to the available ones\navailable_cols = [col for col in display_cols if col in wiki_result.columns]\nprint(wiki_result.iloc[problem_indices][available_cols])" },
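{ "cell_type": "markdown", "metadata": {}, "source": [ "### Pre-cleaning Names Before Prediction\n", "\n", "As the entries above show, raw name fields usually need some cleaning before they reach a model. The sketch below is one minimal approach, not part of ethnicolr itself: it strips accents via Unicode decomposition, trims whitespace, and drops rows left without a usable last name. Whether transliterating accented characters (e.g. 'García' to 'Garcia') is appropriate depends on your data and on the model you use." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": "import unicodedata\n\ndef clean_name(name):\n    \"\"\"Return a trimmed, accent-stripped version of a raw name value ('' for missing).\"\"\"\n    if pd.isna(name):\n        return ''\n    # Decompose accented characters and drop the combining marks ('García' -> 'Garcia')\n    decomposed = unicodedata.normalize('NFKD', str(name))\n    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')\n    return ascii_only.strip()\n\ncleaned_df = problematic_df.copy()\ncleaned_df['first_name'] = cleaned_df['first_name'].apply(clean_name)\ncleaned_df['last_name'] = cleaned_df['last_name'].apply(clean_name)\n\n# Keep only rows that still have a usable last name after cleaning\nusable_df = cleaned_df[cleaned_df['last_name'].str.len() > 0].reset_index(drop=True)\nprint(f\"Usable rows after cleaning: {len(usable_df)} of {len(cleaned_df)}\")\nusable_df.head()" },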
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Data Quality Analysis\n", "\n", "Analyze the quality and coverage of predictions across your dataset." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": "# Get predictions for quality analysis\ncensus_pred = ethnicolr.pred_census_ln(large_df, 'last_name')\nwiki_pred = ethnicolr.pred_wiki_ln(large_df, 'last_name')\n\n# Calculate prediction confidence (use the correct probability columns for each model)\n# Census model probability columns: api, black, hispanic, white\ncensus_pred['max_confidence'] = census_pred[['api', 'black', 'hispanic', 'white']].max(axis=1)\n\n# Wikipedia model: its probability columns vary by version, so detect the numeric columns\nnumeric_cols = []\nfor col in wiki_pred.columns:\n    if col not in ['race', '__name', 'last_name']:\n        try:\n            pd.to_numeric(wiki_pred[col], errors='raise')\n            numeric_cols.append(col)\n        except (ValueError, TypeError):\n            continue\n\nif len(numeric_cols) > 0:\n    wiki_pred['max_confidence'] = wiki_pred[numeric_cols].max(axis=1)\nelse:\n    wiki_pred['max_confidence'] = 0.5  # Fallback if no numeric columns are found\n\ndef confidence_summary(label, confidence):\n    \"\"\"Print how many predictions fall into the high / medium / low confidence bands.\"\"\"\n    high = confidence > 0.8\n    medium = (confidence > 0.5) & (confidence <= 0.8)\n    low = confidence <= 0.5\n    print(f\"{label} Confidence Distribution:\")\n    print(f\"High confidence (>0.8): {high.sum()} ({high.mean()*100:.1f}%)\")\n    print(f\"Medium confidence (0.5-0.8): {medium.sum()} ({medium.mean()*100:.1f}%)\")\n    print(f\"Low confidence (<=0.5): {low.sum()} ({low.mean()*100:.1f}%)\")\n\nconfidence_summary(\"Census Model\", census_pred['max_confidence'])\n\nprint(f\"\\nWikipedia model: found {len(numeric_cols)} numeric probability columns\")\nconfidence_summary(\"Wikipedia Model\", wiki_pred['max_confidence'])" },
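{ "cell_type": "markdown", "metadata": {}, "source": [ "### Flagging Low-Confidence Predictions for Review\n", "\n", "The confidence scores computed above translate directly into a review queue. The sketch below flags census-model predictions whose top probability falls below a threshold and writes them to a CSV for manual checking; the 0.6 cutoff and the `low_confidence_review.csv` filename are illustrative choices, not recommendations." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": "# Flag rows whose top predicted probability falls below a chosen threshold.\n# The 0.6 cutoff and the output filename are illustrative, not recommended defaults.\nreview_threshold = 0.6\nneeds_review = census_pred[census_pred['max_confidence'] < review_threshold]\n\nprint(f\"Rows flagged for manual review: {len(needs_review)} of {len(census_pred)}\")\n\n# Persist the flagged rows so they can be checked by hand later\nreview_path = Path('low_confidence_review.csv')\nneeds_review[['last_name', 'race', 'max_confidence']].to_csv(review_path, index=False)\nprint(f\"Wrote flagged rows to {review_path}\")" },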
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Batch Processing Best Practices\n", "\n", "### Performance Tips:\n", "1. **Choose the right model**: Census lookup is fastest; the LSTM models are slower but handle names outside the census surname list and return per-category probabilities\n", "2. **Use chunking**: For datasets >10,000 rows, process in chunks to manage memory\n", "3. **Clean data first**: Remove or handle missing values before processing\n", "4. **Monitor confidence**: Low-confidence predictions may need manual review (see the flagging example above)\n", "\n", "### Memory Management:\n", "- Process in chunks of 500-2000 rows for large datasets\n", "- Use only the columns you need in your input DataFrame\n", "- Clear intermediate results when not needed\n", "\n", "### Error Handling:\n", "- Check for missing values in name columns\n", "- Handle special characters and accents\n", "- Validate results and flag low-confidence predictions" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": "# Example production-ready batch processing function\ndef robust_batch_predict(df, name_col, model='census', chunk_size=1000, min_confidence=0.5):\n    \"\"\"\n    Robust batch prediction with input validation, chunking, and confidence flagging\n    \"\"\"\n    # Data validation\n    if name_col not in df.columns:\n        raise ValueError(f\"Column '{name_col}' not found in DataFrame\")\n\n    # Clean data: the prediction models expect non-null string names\n    clean_df = df.copy()\n    clean_df[name_col] = clean_df[name_col].fillna('').astype(str)\n\n    # Choose the prediction function and its probability columns\n    if model == 'census':\n        pred_func = ethnicolr.pred_census_ln\n        prob_cols = ['api', 'black', 'hispanic', 'white']\n    elif model == 'wiki':\n        pred_func = ethnicolr.pred_wiki_ln\n        prob_cols = None  # Determined dynamically after prediction\n    elif model == 'florida':\n        pred_func = ethnicolr.pred_fl_reg_ln\n        prob_cols = ['asian', 'hispanic', 'nh_black', 'nh_white']\n    else:\n        raise ValueError(f\"Unknown model: {model}\")\n\n    # Process in chunks\n    result = process_in_chunks(clean_df, pred_func, chunk_size, name_col)\n\n    # For the Wikipedia model, detect the numeric probability columns dynamically\n    if model == 'wiki':\n        prob_cols = []\n        for col in result.columns:\n            if col not in ['race', '__name', name_col]:\n                try:\n                    pd.to_numeric(result[col], errors='raise')\n                    prob_cols.append(col)\n                except (ValueError, TypeError):\n                    continue\n\n    # Confidence scoring and flagging\n    if prob_cols:\n        result['max_confidence'] = result[prob_cols].max(axis=1)\n        result['high_confidence'] = result['max_confidence'] >= min_confidence\n    else:\n        result['max_confidence'] = 0.5  # Fallback if no probability columns are found\n        result['high_confidence'] = False\n\n    return result\n\n# Example usage\nprint(\"Running robust batch prediction...\")\nrobust_result = robust_batch_predict(\n    large_df.head(100),\n    'last_name',\n    model='census',\n    chunk_size=50,\n    min_confidence=0.6\n)\n\nprint(f\"\\nProcessed {len(robust_result)} names\")\nprint(f\"High confidence predictions: {robust_result['high_confidence'].sum()} ({robust_result['high_confidence'].mean()*100:.1f}%)\")\nprint(\"\\nSample results:\")\nrobust_result[['last_name', 'race', 'max_confidence', 'high_confidence']].head(10)" }
], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.0" } }, "nbformat": 4, "nbformat_minor": 4 }