{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Batch Processing and Performance\n", "\n", "This notebook demonstrates efficient batch processing techniques for large datasets and provides performance optimization tips." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup\n", "\n", "Load libraries and create a larger sample dataset for demonstration." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": "import pandas as pd\nimport ethnicolr\nimport time\nfrom pathlib import Path\n\n# Load sample data\ndata_path = Path('data/input-with-header.csv')\n\ntry:\n small_df = pd.read_csv(data_path)\n print(f\"Loaded data from: {data_path}\")\nexcept FileNotFoundError:\n # Create sample data if file not found\n small_df = pd.DataFrame({\n 'first_name': ['John', 'Maria', 'David', 'Sarah', 'Michael'],\n 'last_name': ['Smith', 'Garcia', 'Johnson', 'Davis', 'Brown']\n })\n print(\"Using generated sample data\")\n\nprint(f\"Sample data shape: {small_df.shape}\")\nprint(\"\\nFirst few rows:\")\nsmall_df.head()\n\n# Create a larger dataset for batch processing demonstration\n# Replicate the small dataset multiple times\nlarge_df = pd.concat([small_df] * 20, ignore_index=True)\nprint(f\"\\nLarge dataset shape: {large_df.shape}\")\nprint(\"Ready for batch processing demonstrations.\")" }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Performance Comparison\n", "\n", "Let's compare the performance of different models on our dataset." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": "def time_prediction(func, df, *args, **kwargs):\n \"\"\"Helper function to time predictions\"\"\"\n start_time = time.time()\n result = func(df, *args, **kwargs)\n end_time = time.time()\n return result, end_time - start_time\n\n# Test different models\nmodels = {\n 'census_lookup': (ethnicolr.census_ln, ['last_name'], {'year': 2010}),\n 'census_lstm': (ethnicolr.pred_census_ln, ['last_name'], {}),\n 'wiki_lastname': (ethnicolr.pred_wiki_ln, ['last_name'], {}),\n 'florida_lstm': (ethnicolr.pred_fl_reg_ln, ['last_name'], {})\n}\n\nperformance_results = []\n\nfor model_name, (func, args, kwargs) in models.items():\n print(f\"\\nTesting {model_name}...\")\n result, duration = time_prediction(func, large_df, *args, **kwargs)\n \n perf_data = {\n 'model': model_name,\n 'duration': round(duration, 2),\n 'rows_per_second': round(len(large_df) / duration, 0),\n 'result_rows': result.shape[0],\n 'result_cols': result.shape[1]\n }\n performance_results.append(perf_data)\n \n print(f\"Duration: {duration:.2f} seconds\")\n print(f\"Speed: {len(large_df) / duration:.0f} rows/second\")\n\n# Performance summary\nperf_df = pd.DataFrame(performance_results).set_index('model')\nprint(\"\\nPerformance Summary:\")\nperf_df[['duration', 'rows_per_second', 'result_rows', 'result_cols']]" }, { "cell_type": "markdown", "metadata": {}, "source": "## Chunked Processing\n\nFor very large datasets, processing in chunks can be more memory efficient.\n\nFirst, let's define our chunked processing function:" }, { "cell_type": "code", "source": "def process_in_chunks(df, func, chunk_size=1000, *args, **kwargs):\n \"\"\"Process dataframe in chunks to manage memory usage\"\"\"\n results = []\n total_chunks = (len(df) - 1) // chunk_size + 1\n \n for i in range(0, len(df), chunk_size):\n chunk = df.iloc[i:i + chunk_size]\n chunk_result = func(chunk, *args, **kwargs)\n results.append(chunk_result)\n \n if (i // chunk_size + 1) % 
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Chunked Processing\n", "\n", "For very large datasets, processing in chunks can be more memory-efficient.\n", "\n", "First, let's define our chunked processing function:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": "def process_in_chunks(df, func, chunk_size=1000, *args, **kwargs):\n    \"\"\"Process a dataframe in chunks to manage memory usage\"\"\"\n    results = []\n    total_chunks = (len(df) - 1) // chunk_size + 1\n\n    for i in range(0, len(df), chunk_size):\n        chunk = df.iloc[i:i + chunk_size]\n        chunk_result = func(chunk, *args, **kwargs)\n        results.append(chunk_result)\n\n        if (i // chunk_size + 1) % 5 == 0:  # Progress update every 5 chunks\n            print(f\"Processed {i // chunk_size + 1}/{total_chunks} chunks\")\n\n    return pd.concat(results, ignore_index=True)" },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": "# Example: Process in chunks of 250 rows\nprint(\"Processing Florida model in chunks of 250...\")\nstart_time = time.time()\nchunked_result = process_in_chunks(\n    large_df,\n    ethnicolr.pred_fl_reg_ln,\n    250,          # chunk_size as positional argument\n    'last_name'   # positional argument for the prediction function\n)\nchunked_duration = time.time() - start_time\n\nprint(f\"\\nChunked processing completed in {chunked_duration:.2f} seconds\")\nprint(f\"Result shape: {chunked_result.shape}\")\nchunked_result[['last_name', 'race', 'asian', 'hispanic', 'nh_black', 'nh_white']].head()" },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Handling Missing or Problematic Names\n", "\n", "Real-world datasets often have missing values, special characters, or other data quality issues." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": "# Create a dataset with some problematic entries\nproblematic_df = large_df.copy().head(50)\n\n# Add some missing values and problematic names\nproblematic_df.loc[5, 'last_name'] = None\nproblematic_df.loc[10, 'last_name'] = ''\nproblematic_df.loc[15, 'last_name'] = \"O'Connor\"  # Apostrophe\nproblematic_df.loc[20, 'last_name'] = 'García'  # Accented character\nproblematic_df.loc[25, 'last_name'] = '123'  # Numeric\nproblematic_df.loc[30, 'first_name'] = None\n\nprint(\"Sample problematic entries:\")\nprint(problematic_df.iloc[[5, 10, 15, 20, 25, 30]][['first_name', 'last_name']])\n\n# Run the Wikipedia full-name model on these entries to see how they come through.\n# Depending on the ethnicolr version, missing or empty names may be dropped or raise errors,\n# which is why the robust pipeline at the end of this notebook cleans names first.\nwiki_result = ethnicolr.pred_wiki_name(problematic_df, 'last_name', 'first_name')\n\nprint(\"\\nProcessing results for problematic names:\")\nproblem_indices = [5, 10, 15, 20, 25, 30]\ndisplay_cols = ['first_name', 'last_name', 'race', '__name']\n# Some columns may not exist in every ethnicolr version, so filter to the available ones\navailable_cols = [col for col in display_cols if col in wiki_result.columns]\nprint(wiki_result.iloc[problem_indices][available_cols])" },
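{ "cell_type": "markdown", "metadata": {}, "source": [ "### Pre-cleaning Names Before Prediction\n", "\n", "As the entries above show, raw name fields usually need some cleaning before they reach a model. The sketch below is one minimal approach, not part of ethnicolr itself: it strips accents via Unicode decomposition, trims whitespace, and drops rows left without a usable last name. Whether transliterating accented characters (e.g. 'García' to 'Garcia') is appropriate depends on your data and on the model you use." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": "import unicodedata\n\ndef clean_name(name):\n    \"\"\"Return a trimmed, accent-stripped version of a raw name value ('' for missing).\"\"\"\n    if pd.isna(name):\n        return ''\n    # Decompose accented characters and drop the combining marks ('García' -> 'Garcia')\n    decomposed = unicodedata.normalize('NFKD', str(name))\n    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')\n    return ascii_only.strip()\n\ncleaned_df = problematic_df.copy()\ncleaned_df['first_name'] = cleaned_df['first_name'].apply(clean_name)\ncleaned_df['last_name'] = cleaned_df['last_name'].apply(clean_name)\n\n# Keep only rows that still have a usable last name after cleaning\nusable_df = cleaned_df[cleaned_df['last_name'].str.len() > 0].reset_index(drop=True)\nprint(f\"Usable rows after cleaning: {len(usable_df)} of {len(cleaned_df)}\")\nusable_df.head()" },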
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Data Quality Analysis\n", "\n", "Analyze the quality and coverage of predictions across your dataset." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": "# Get predictions for quality analysis\ncensus_pred = ethnicolr.pred_census_ln(large_df, 'last_name')\nwiki_pred = ethnicolr.pred_wiki_ln(large_df, 'last_name')\n\n# Calculate prediction confidence (use the correct probability columns for each model)\n# Census model probability columns: api, black, hispanic, white\ncensus_pred['max_confidence'] = census_pred[['api', 'black', 'hispanic', 'white']].max(axis=1)\n\n# Wikipedia model: its probability columns vary by version, so detect the numeric columns\nnumeric_cols = []\nfor col in wiki_pred.columns:\n    if col not in ['race', '__name', 'last_name']:\n        try:\n            pd.to_numeric(wiki_pred[col], errors='raise')\n            numeric_cols.append(col)\n        except (ValueError, TypeError):\n            continue\n\nif len(numeric_cols) > 0:\n    wiki_pred['max_confidence'] = wiki_pred[numeric_cols].max(axis=1)\nelse:\n    wiki_pred['max_confidence'] = 0.5  # Fallback if no numeric columns are found\n\ndef confidence_summary(label, confidence):\n    \"\"\"Print how many predictions fall into the high / medium / low confidence bands.\"\"\"\n    high = confidence > 0.8\n    medium = (confidence > 0.5) & (confidence <= 0.8)\n    low = confidence <= 0.5\n    print(f\"{label} Confidence Distribution:\")\n    print(f\"High confidence (>0.8): {high.sum()} ({high.mean()*100:.1f}%)\")\n    print(f\"Medium confidence (0.5-0.8): {medium.sum()} ({medium.mean()*100:.1f}%)\")\n    print(f\"Low confidence (<=0.5): {low.sum()} ({low.mean()*100:.1f}%)\")\n\nconfidence_summary(\"Census Model\", census_pred['max_confidence'])\n\nprint(f\"\\nWikipedia model: found {len(numeric_cols)} numeric probability columns\")\nconfidence_summary(\"Wikipedia Model\", wiki_pred['max_confidence'])" },
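{ "cell_type": "markdown", "metadata": {}, "source": [ "### Flagging Low-Confidence Predictions for Review\n", "\n", "The confidence scores computed above translate directly into a review queue. The sketch below flags census-model predictions whose top probability falls below a threshold and writes them to a CSV for manual checking; the 0.6 cutoff and the `low_confidence_review.csv` filename are illustrative choices, not recommendations." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": "# Flag rows whose top predicted probability falls below a chosen threshold.\n# The 0.6 cutoff and the output filename are illustrative, not recommended defaults.\nreview_threshold = 0.6\nneeds_review = census_pred[census_pred['max_confidence'] < review_threshold]\n\nprint(f\"Rows flagged for manual review: {len(needs_review)} of {len(census_pred)}\")\n\n# Persist the flagged rows so they can be checked by hand later\nreview_path = Path('low_confidence_review.csv')\nneeds_review[['last_name', 'race', 'max_confidence']].to_csv(review_path, index=False)\nprint(f\"Wrote flagged rows to {review_path}\")" },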
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Batch Processing Best Practices\n", "\n", "### Performance Tips:\n", "1. **Choose the right model**: Census lookup is fastest; the LSTM models are slower but handle names outside the census surname list and return per-category probabilities\n", "2. **Use chunking**: For datasets >10,000 rows, process in chunks to manage memory\n", "3. **Clean data first**: Remove or handle missing values before processing\n", "4. **Monitor confidence**: Low-confidence predictions may need manual review (see the flagging example above)\n", "\n", "### Memory Management:\n", "- Process in chunks of 500-2000 rows for large datasets\n", "- Use only the columns you need in your input DataFrame\n", "- Clear intermediate results when not needed\n", "\n", "### Error Handling:\n", "- Check for missing values in name columns\n", "- Handle special characters and accents\n", "- Validate results and flag low-confidence predictions" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": "# Example production-ready batch processing function\ndef robust_batch_predict(df, name_col, model='census', chunk_size=1000, min_confidence=0.5):\n    \"\"\"\n    Robust batch prediction with input validation, chunking, and confidence flagging\n    \"\"\"\n    # Data validation\n    if name_col not in df.columns:\n        raise ValueError(f\"Column '{name_col}' not found in DataFrame\")\n\n    # Clean data: the prediction models expect non-null string names\n    clean_df = df.copy()\n    clean_df[name_col] = clean_df[name_col].fillna('').astype(str)\n\n    # Choose the prediction function and its probability columns\n    if model == 'census':\n        pred_func = ethnicolr.pred_census_ln\n        prob_cols = ['api', 'black', 'hispanic', 'white']\n    elif model == 'wiki':\n        pred_func = ethnicolr.pred_wiki_ln\n        prob_cols = None  # Determined dynamically after prediction\n    elif model == 'florida':\n        pred_func = ethnicolr.pred_fl_reg_ln\n        prob_cols = ['asian', 'hispanic', 'nh_black', 'nh_white']\n    else:\n        raise ValueError(f\"Unknown model: {model}\")\n\n    # Process in chunks\n    result = process_in_chunks(clean_df, pred_func, chunk_size, name_col)\n\n    # For the Wikipedia model, detect the numeric probability columns dynamically\n    if model == 'wiki':\n        prob_cols = []\n        for col in result.columns:\n            if col not in ['race', '__name', name_col]:\n                try:\n                    pd.to_numeric(result[col], errors='raise')\n                    prob_cols.append(col)\n                except (ValueError, TypeError):\n                    continue\n\n    # Confidence scoring and flagging\n    if prob_cols:\n        result['max_confidence'] = result[prob_cols].max(axis=1)\n        result['high_confidence'] = result['max_confidence'] >= min_confidence\n    else:\n        result['max_confidence'] = 0.5  # Fallback if no probability columns are found\n        result['high_confidence'] = False\n\n    return result\n\n# Example usage\nprint(\"Running robust batch prediction...\")\nrobust_result = robust_batch_predict(\n    large_df.head(100),\n    'last_name',\n    model='census',\n    chunk_size=50,\n    min_confidence=0.6\n)\n\nprint(f\"\\nProcessed {len(robust_result)} names\")\nprint(f\"High confidence predictions: {robust_result['high_confidence'].sum()} ({robust_result['high_confidence'].mean()*100:.1f}%)\")\nprint(\"\\nSample results:\")\nrobust_result[['last_name', 'race', 'max_confidence', 'high_confidence']].head(10)" }
], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.0" } }, "nbformat": 4, "nbformat_minor": 4 }